multi-threaded extract program. Thanks to Rohit Gupta

2024-09-17 14:17:13 +03:00 · 2012-07-18 12:46:59 +01:00 · 2012-07-18 12:46:59 +01:00 · 7ae76dfe75
commit 7ae76dfe75
parent b609473645
3 changed files with 433 additions and 159 deletions
--- a/phrase-extract/Jamfile
+++ b/phrase-extract/Jamfile
@ -10,7 +10,7 @@ obj XmlTree.o : XmlTree.cpp : <include>. ;
 alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : <include>. ;
 alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : <include>. ;
-exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
+exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
 exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
--- a/phrase-extract/PhraseExtractionOptions.h
+++ b/phrase-extract/PhraseExtractionOptions.h
@ -0,0 +1,146 @@
 /***********************************************************************
  Moses - factored phrase-based language decoder
  Copyright (C) 2010 University of Edinburgh
  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.
  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.
  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 ***********************************************************************/
 /* Created by Rohit Gupta, CDAC, Mumbai, India on 18 July, 2012*/
 #pragma once
 #ifndef PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
 #define PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
 namespace MosesTraining
 {
 // Reordering model variants; extract.cpp maps the "msd", "mslr" and
 // "mono"/"monotonicity" suffixes of the --model argument onto these values.
 enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
 // Orientation classes used as the return type of the getOrient*Model
 // functions declared in extract.cpp.
 enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
 // Bundle of the command-line switches of the phrase extractor.
 //
 // The object is filled once through the init*() setters (extract.cpp does
 // this while parsing argv) and afterwards only queried through the is*()
 // getters. Every getter is const-qualified so that the options can be handed
 // around and read through a const reference.
 class PhraseExtractionOptions {
 public:
   // Maximum phrase length; fixed at construction time.
   const int maxPhraseLength;
 private:
   bool allModelsOutputFlag;
   bool wordModel;
   REO_MODEL_TYPE wordType;
   bool phraseModel;
   REO_MODEL_TYPE phraseType;
   bool hierModel;
   REO_MODEL_TYPE hierType;
   bool orientationFlag;
   bool translationFlag;
   bool sentenceIdFlag; // create extract file with sentence id
   bool onlyOutputSpanInfo;
   bool gzOutput;
 public:
   // Stores initmaxPhraseLength and sets the defaults: every flag is false
   // except translationFlag (true); every model type starts as REO_MSD.
   PhraseExtractionOptions(const int initmaxPhraseLength):
     maxPhraseLength(initmaxPhraseLength),
     allModelsOutputFlag(false),
     wordModel(false),
     wordType(REO_MSD),
     phraseModel(false),
     phraseType(REO_MSD),
     hierModel(false),
     hierType(REO_MSD),
     orientationFlag(false),
     translationFlag(true),
     sentenceIdFlag(false),
     onlyOutputSpanInfo(false),
     gzOutput(false) {}
   // ---- setters: each one overwrites exactly one option ----
   void initAllModelsOutputFlag(const bool initallModelsOutputFlag) {
     allModelsOutputFlag = initallModelsOutputFlag;
   }
   void initWordModel(const bool initwordModel) {
     wordModel = initwordModel;
   }
   void initWordType(REO_MODEL_TYPE initwordType) {
     wordType = initwordType;
   }
   void initPhraseModel(const bool initphraseModel) {
     phraseModel = initphraseModel;
   }
   void initPhraseType(REO_MODEL_TYPE initphraseType) {
     phraseType = initphraseType;
   }
   void initHierModel(const bool inithierModel) {
     hierModel = inithierModel;
   }
   void initHierType(REO_MODEL_TYPE inithierType) {
     hierType = inithierType;
   }
   void initOrientationFlag(const bool initorientationFlag) {
     orientationFlag = initorientationFlag;
   }
   void initTranslationFlag(const bool inittranslationFlag) {
     translationFlag = inittranslationFlag;
   }
   void initSentenceIdFlag(const bool initsentenceIdFlag) {
     sentenceIdFlag = initsentenceIdFlag;
   }
   void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo) {
     onlyOutputSpanInfo = initonlyOutputSpanInfo;
   }
   void initGzOutput(const bool initgzOutput) {
     gzOutput = initgzOutput;
   }
   // ---- getters: return the stored value, never modify the object ----
   bool isAllModelsOutputFlag() const {
     return allModelsOutputFlag;
   }
   bool isWordModel() const {
     return wordModel;
   }
   REO_MODEL_TYPE isWordType() const {
     return wordType;
   }
   bool isPhraseModel() const {
     return phraseModel;
   }
   REO_MODEL_TYPE isPhraseType() const {
     return phraseType;
   }
   bool isHierModel() const {
     return hierModel;
   }
   REO_MODEL_TYPE isHierType() const {
     return hierType;
   }
   bool isOrientationFlag() const {
     return orientationFlag;
   }
   bool isTranslationFlag() const {
     return translationFlag;
   }
   bool isSentenceIdFlag() const {
     return sentenceIdFlag;
   }
   bool isOnlyOutputSpanInfo() const {
     return onlyOutputSpanInfo;
   }
   bool isGzOutput() const {
     return gzOutput;
   }
 };
 }
 #endif
--- a/phrase-extract/extract.cpp
+++ b/phrase-extract/extract.cpp
@ -1,6 +1,7 @@
 /*
 * extract.cpp
- *
+ *	Modified by: Rohit Gupta CDAC, Mumbai, India
 *	on July 15, 2012 to implement parallel processing
 *      Modified by: Nadi Tomeh - LIMSI/CNRS
 *      Machine Translation Marathon 2010, Dublin
 */
@ -13,7 +14,7 @@
 #include <stdlib.h>
 #include <assert.h>
 #include <cstring>
-
+#include <sstream>
 #include <map>
 #include <set>
 #include <vector>
@ -23,14 +24,17 @@
 #include "tables-core.h"
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
-
+#include "../moses/src/ThreadPool.h"
 #include "../moses/src/OutputCollector.h"
 #include "PhraseExtractionOptions.h"
 using namespace std;
 using namespace MosesTraining;
-#define LINE_MAX_LENGTH 500000
+namespace MosesTraining {
 const long int LINE_MAX_LENGTH = 500000 ;
 namespace MosesTraining
 {
 // HPhraseVertex represents a point in the alignment matrix
 typedef pair <int, int> HPhraseVertex;
@ -46,58 +50,65 @@ typedef vector < HPhrase > HPhraseVector;
 // The key of the map is the English index and the value is a set of the source ones
 typedef map <int, set<int> > HSentenceVertices;
-enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
+  REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
 enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
 REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
                           int, int, int, int, int, int, int,
                           bool (*)(int, int), bool (*)(int, int));
-REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+  REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
                             int, int, int, int, int, int, int,
                             bool (*)(int, int), bool (*)(int, int),
                             const HSentenceVertices &, const HSentenceVertices &);
-REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+  REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
                           int, int, int, int, int, int, int,
                           bool (*)(int, int), bool (*)(int, int),
                           const HSentenceVertices &, const HSentenceVertices &,
                           const HSentenceVertices &, const HSentenceVertices &,
                           REO_POS);
-void insertVertex(HSentenceVertices &, int, int);
+  void insertVertex(HSentenceVertices &, int, int);
-void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
+  void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
                          int, int, int, int);
-string getOrientString(REO_POS, REO_MODEL_TYPE);
+  string getOrientString(REO_POS, REO_MODEL_TYPE);
-bool ge(int, int);
+  bool ge(int, int);
-bool le(int, int);
+  bool le(int, int);
-bool lt(int, int);
+  bool lt(int, int);
-void extractBase(SentenceAlignment &);
+  bool isAligned (SentenceAlignment &, int, int);
 void extract(SentenceAlignment &);
 void addPhrase(SentenceAlignment &, int, int, int, int, string &);
 bool isAligned (SentenceAlignment &, int, int);
 bool allModelsOutputFlag = false;
 bool wordModel = false;
 REO_MODEL_TYPE wordType = REO_MSD;
 bool phraseModel = false;
 REO_MODEL_TYPE phraseType = REO_MSD;
 bool hierModel = false;
 REO_MODEL_TYPE hierType = REO_MSD;
-Moses::OutputFileStream extractFile;
+}
-Moses::OutputFileStream extractFileInv;
+namespace MosesTraining{
-Moses::OutputFileStream extractFileOrientation;
+class ExtractTask : public Moses::Task{
-Moses::OutputFileStream extractFileSentenceId;
+        private:
-int maxPhraseLength;
+        size_t m_id;
-bool orientationFlag = false;
+        SentenceAlignment *m_sentence;
-bool translationFlag = true;
+        PhraseExtractionOptions &m_options;
-bool sentenceIdFlag = false; //create extract file with sentence id
+        Moses::OutputCollector* m_extractCollector;
-bool onlyOutputSpanInfo = false;
+        Moses::OutputCollector* m_extractCollectorInv;
-bool gzOutput = false;
+        Moses::OutputCollector* m_extractCollectorOrientation;
-
+        Moses::OutputCollector* m_extractCollectorSentenceId;
 public:
  ExtractTask(size_t id, SentenceAlignment *sentence,PhraseExtractionOptions &initoptions, Moses::OutputCollector *extractCollector, Moses::OutputCollector *extractCollectorInv,Moses::OutputCollector *extractCollectorOrientation,Moses::OutputCollector* extractCollectorSentenceId  ):
    m_id(id),
    m_sentence(sentence),
    m_options(initoptions),
    m_extractCollector(extractCollector),
    m_extractCollectorInv(extractCollectorInv),
    m_extractCollectorOrientation(extractCollectorOrientation),
    m_extractCollectorSentenceId(extractCollectorSentenceId) {}
  ~ExtractTask() { delete m_sentence; }
 void Run();
 private:
  vector< string > m_extractedPhrases;
  vector< string > m_extractedPhrasesInv;
  vector< string > m_extractedPhrasesOri;
  vector< string > m_extractedPhrasesSid;
  void extractBase(SentenceAlignment &);
  void extract(SentenceAlignment &);
  void addPhrase(SentenceAlignment &, int, int, int, int, string &);
  void writePhrasesToFile();
 };
 }
 int main(int argc, char* argv[])
@ -105,70 +116,83 @@ int main(int argc, char* argv[])
  cerr	<< "PhraseExtract v1.4, written by Philipp Koehn\n"
        << "phrase extraction from an aligned parallel corpus\n";
-  if (argc < 6) {
+#ifdef WITH_THREADS
-    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n";
+  int thread_count = 1;
 #endif
 if (argc < 6) {
    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
    #ifdef WITH_THREADS
    cerr<< "| --threads NUM ";
    #endif
    cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --SentenceId | --GZOutput ]\n";
    exit(1);
  }
-  char* &fileNameE = argv[1];
+
-  char* &fileNameF = argv[2];
+  Moses::OutputFileStream extractFile;
-  char* &fileNameA = argv[3];
+  Moses::OutputFileStream extractFileInv;
-  string fileNameExtract = string(argv[4]);
+  Moses::OutputFileStream extractFileOrientation;
-  maxPhraseLength = atoi(argv[5]);
+  Moses::OutputFileStream extractFileSentenceId;
  const char* const &fileNameE = argv[1];
  const char* const &fileNameF = argv[2];
  const char* const &fileNameA = argv[3];
  const string fileNameExtract = string(argv[4]);
  PhraseExtractionOptions options(atoi(argv[5]));
  for(int i=6; i<argc; i++) {
    if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
-      onlyOutputSpanInfo = true;
+      options.initOnlyOutputSpanInfo(true);
    } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
-      orientationFlag = true;
+      options.initOrientationFlag(true);
    } else if (strcmp(argv[i],"--NoTTable") == 0) {
-      translationFlag = false;
+      options.initTranslationFlag(false);
    } else if (strcmp(argv[i], "--SentenceId") == 0) {
-      sentenceIdFlag = true;  
+      options.initSentenceIdFlag(true);  
    } else if (strcmp(argv[i], "--GZOutput") == 0) {
-      gzOutput = true;  
+      options.initGzOutput(true);  
    } else if(strcmp(argv[i],"--model") == 0) {
      if (i+1 >= argc) {
        cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
        exit(1);
      }
-      char* modelParams = argv[++i];
+      char*  modelParams = argv[++i];
-      char* modelName = strtok(modelParams, "-");
+      char*  modelName = strtok(modelParams, "-");
-      char* modelType = strtok(NULL, "-");
+      char*  modelType = strtok(NULL, "-");
      REO_MODEL_TYPE intModelType;
      if(strcmp(modelName, "wbe") == 0) {
-        wordModel = true;
+        options.initWordModel(true);
        if(strcmp(modelType, "msd") == 0)
-          wordType = REO_MSD;
+          options.initWordType(REO_MSD);
        else if(strcmp(modelType, "mslr") == 0)
-          wordType = REO_MSLR;
+          options.initWordType(REO_MSLR);
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
-          wordType = REO_MONO;
+          options.initWordType(REO_MONO);
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "phrase") == 0) {
-        phraseModel = true;
+        options.initPhraseModel(true);
        if(strcmp(modelType, "msd") == 0)
-          phraseType = REO_MSD;
+          options.initPhraseType(REO_MSD);
        else if(strcmp(modelType, "mslr") == 0)
-          phraseType = REO_MSLR;
+          options.initPhraseType(REO_MSLR);
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
-          phraseType = REO_MONO;
+          options.initPhraseType(REO_MONO);
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "hier") == 0) {
-        hierModel = true;
+        options.initHierModel(true);
        if(strcmp(modelType, "msd") == 0)
-          hierType = REO_MSD;
+          options.initHierType(REO_MSD);
        else if(strcmp(modelType, "mslr") == 0)
-          hierType = REO_MSLR;
+          options.initHierType(REO_MSLR);
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
-          hierType = REO_MONO;
+          options.initHierType(REO_MONO);
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
@ -178,7 +202,21 @@ int main(int argc, char* argv[])
        exit(1);
      }
-      allModelsOutputFlag = true;
+      options.initAllModelsOutputFlag(true);
 #ifdef WITH_THREADS
    }else if (strcmp(argv[i],"-threads") == 0 ||
               strcmp(argv[i],"--threads") == 0 ||
               strcmp(argv[i],"--Threads") == 0) {
        if(argc>(i+1))thread_count = atoi(argv[++i]);
        else {cerr<<"extract: syntax error, NUM is missing for --threads NUM option"<<endl;
        exit(1);
        }
        if(thread_count==0){
                cerr<<"extract: error, NUM is missing for --threads NUM option or --threads 0 is given"<<endl;
                exit(1);
        }
     #endif
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
@ -187,9 +225,9 @@ int main(int argc, char* argv[])
  // default reordering model if no model selected
  // allows for the old syntax to be used
-  if(orientationFlag && !allModelsOutputFlag) {
+  if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) {
-    wordModel = true;
+    options.initWordModel(true);
-    wordType = REO_MSD;
+    options.initWordType(REO_MSD);
  }
  // open input files
@ -202,21 +240,32 @@ int main(int argc, char* argv[])
  istream *aFileP = &aFile;
  // open output files
-  if (translationFlag) {
+  if (options.isTranslationFlag()) {
-    string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
+    string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":"");
-    extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
+    extractFile.Open( (fileNameExtract + (options.isGzOutput()?".gz":"")).c_str());
    extractFileInv.Open(fileNameExtractInv.c_str());
  }
-  if (orientationFlag) {
+  if (options.isOrientationFlag()) {
-    string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
+    string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":"");
    extractFileOrientation.Open(fileNameExtractOrientation.c_str());
  }
-  if (sentenceIdFlag) {
+  if (options.isSentenceIdFlag()) {
-    string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
+    string fileNameExtractSentenceId = fileNameExtract + ".sid" + (options.isGzOutput()?".gz":"");
    extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
  }
    Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile);//r
    Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv);//r
    Moses::OutputCollector* extractCollectorOrientation = new Moses::OutputCollector(&extractFileOrientation);//r
    Moses::OutputCollector* extractCollectorSentenceId = new Moses::OutputCollector(&extractFileSentenceId); //r
 #ifdef WITH_THREADS
  // set up thread pool
     Moses::ThreadPool pool(thread_count);
     pool.SetQueueLimit(1000);
 #endif
  int i=0;
  while(true) {
    i++;
@ -228,32 +277,57 @@ int main(int argc, char* argv[])
    if (eFileP->eof()) break;
    SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
-    SentenceAlignment sentence;
+    SentenceAlignment *sentence=new SentenceAlignment;
-    // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
+	// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
    //az: output src, tgt, and alignment line
-    if (onlyOutputSpanInfo) {
+    if (options.isOnlyOutputSpanInfo()) {
      cout << "LOG: SRC: " << foreignString << endl;
      cout << "LOG: TGT: " << englishString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }
 	if (sentence->create( englishString, foreignString, alignmentString, i)) {
   	ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector , extractCollectorInv, extractCollectorOrientation, extractCollectorSentenceId);
 #ifdef WITH_THREADS
      if (thread_count == 1) {
        task->Run();
        delete task;
      }
      else {
        pool.Submit(task);
      }
 #else
      task->Run();
      delete task;
 #endif
    if (sentence.create( englishString, foreignString, alignmentString, i)) {
      extract(sentence);
    }
-    if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
+    if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }
 #ifdef WITH_THREADS
  // wait for all threads to finish
  pool.Stop(true);
 #endif
  eFile.Close();
  fFile.Close();
  aFile.Close();
      delete extractCollector;
      delete extractCollectorInv;
      delete extractCollectorOrientation;
      delete extractCollectorSentenceId;
  //az: only close if we actually opened it
-  if (!onlyOutputSpanInfo) {
+  if (!options.isOnlyOutputSpanInfo()) {
-    if (translationFlag) {
+    if (options.isTranslationFlag()) {
      extractFile.Close();
      extractFileInv.Close();
    }
-    if (orientationFlag) extractFileOrientation.Close();
+    if (options.isOrientationFlag()){ 
-    if (sentenceIdFlag) {
+	extractFileOrientation.Close();
 	}
    if (options.isSentenceIdFlag()) {
      extractFileSentenceId.Close();
    }
  }
@ -261,8 +335,17 @@ int main(int argc, char* argv[])
 namespace MosesTraining
 {
 void ExtractTask::Run() {
  extract(*m_sentence);
  writePhrasesToFile();
  m_extractedPhrases.clear();
  m_extractedPhrasesInv.clear();
  m_extractedPhrasesOri.clear();
  m_extractedPhrasesSid.clear();
-void extract(SentenceAlignment &sentence)
+}
 void ExtractTask::extract(SentenceAlignment &sentence)
 {
  int countE = sentence.target.size();
  int countF = sentence.source.size();
@ -281,14 +364,14 @@ void extract(SentenceAlignment &sentence)
  HSentenceVertices::const_iterator it;
-  bool relaxLimit = hierModel;
+  bool relaxLimit = m_options.isHierModel();
-  bool buildExtraStructure = phraseModel || hierModel;
+  bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel();
  // check alignments for target phrase startE...endE
  // loop over extracted phrases which are compatible with the word-alignments
  for(int startE=0; startE<countE; startE++) {
    for(int endE=startE;
-        (endE<countE && (relaxLimit || endE<startE+maxPhraseLength));
+        (endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
        endE++) {
      int minF = 9999;
@ -308,7 +391,7 @@ void extract(SentenceAlignment &sentence)
      }
      if (maxF >= 0 && // aligned to any source words at all
-          (relaxLimit || maxF-minF < maxPhraseLength)) { // source phrase within limits
+          (relaxLimit || maxF-minF < m_options.maxPhraseLength)) { // source phrase within limits
        // check if source words are aligned to out of bound target words
        bool out_of_bounds = false;
@ -323,17 +406,17 @@ void extract(SentenceAlignment &sentence)
          // start point of source phrase may retreat over unaligned
          for(int startF=minF;
              (startF>=0 &&
-               (relaxLimit || startF>maxF-maxPhraseLength) && // within length limit
+               (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
               (startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
              startF--)
            // end point of source phrase may advance over unaligned
            for(int endF=maxF;
                (endF<countF &&
-                 (relaxLimit || endF<startF+maxPhraseLength) && // within length limit
+                 (relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
                 (endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
                endF++) { // at this point we have extracted a phrase
              if(buildExtraStructure) { // phrase || hier
-                if(endE-startE < maxPhraseLength && endF-startF < maxPhraseLength) { // within limit
+                if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
                  inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
                                                   HPhraseVertex(endF,endE)));
                  insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
@ -343,16 +426,16 @@ void extract(SentenceAlignment &sentence)
                                       startF, startE, endF, endE);
              } else {
                string orientationInfo = "";
-                if(wordModel) {
+                if(m_options.isWordModel()) {
                  REO_POS wordPrevOrient, wordNextOrient;
                  bool connectedLeftTopP  = isAligned( sentence, startF-1, startE-1 );
                  bool connectedRightTopP = isAligned( sentence, endF+1,   startE-1 );
                  bool connectedLeftTopN  = isAligned( sentence, endF+1, endE+1 );
                  bool connectedRightTopN = isAligned( sentence, startF-1,   endE+1 );
-                  wordPrevOrient = getOrientWordModel(sentence, wordType, connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
+                  wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
-                  wordNextOrient = getOrientWordModel(sentence, wordType, connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
+                  wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
-                  orientationInfo += getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType);
+                  orientationInfo += getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType());
-                  if(allModelsOutputFlag)
+                  if(m_options.isAllModelsOutputFlag())
                    " | | ";
                }
                addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
@ -378,38 +461,38 @@ void extract(SentenceAlignment &sentence)
      bool connectedLeftTopN  = isAligned( sentence, endF+1, endE+1 );
      bool connectedRightTopN = isAligned( sentence, startF-1,   endE+1 );
-      if(wordModel) {
+      if(m_options.isWordModel()) {
-        wordPrevOrient = getOrientWordModel(sentence, wordType,
+        wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(),
                                            connectedLeftTopP, connectedRightTopP,
                                            startF, endF, startE, endE, countF, 0, 1,
                                            &ge, &lt);
-        wordNextOrient = getOrientWordModel(sentence, wordType,
+        wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(),
                                            connectedLeftTopN, connectedRightTopN,
                                            endF, startF, endE, startE, 0, countF, -1,
                                            &lt, &ge);
      }
-      if (phraseModel) {
+      if (m_options.isPhraseModel()) {
-        phrasePrevOrient = getOrientPhraseModel(sentence, phraseType,
+        phrasePrevOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
                                                connectedLeftTopP, connectedRightTopP,
                                                startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft);
-        phraseNextOrient = getOrientPhraseModel(sentence, phraseType,
+        phraseNextOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
                                                connectedLeftTopN, connectedRightTopN,
                                                endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight);
      } else {
        phrasePrevOrient = phraseNextOrient = UNKNOWN;
      }
-      if(hierModel) {
+      if(m_options.isHierModel()) {
-        hierPrevOrient = getOrientHierModel(sentence, hierType,
+        hierPrevOrient = getOrientHierModel(sentence, m_options.isHierType(),
                                            connectedLeftTopP, connectedRightTopP,
                                            startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
-        hierNextOrient = getOrientHierModel(sentence, hierType,
+        hierNextOrient = getOrientHierModel(sentence, m_options.isHierType(),
                                            connectedLeftTopN, connectedRightTopN,
                                            endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
      }
-      orientationInfo = ((wordModel)? getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType) : "") + " | " +
+      orientationInfo = ((m_options.isWordModel())? getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType()) : "") + " | " +
-                        ((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " +
+                        ((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
-                        ((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : "");
+                        ((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
      addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
    }
@ -617,94 +700,139 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
  return "";
 }
-void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
+void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
 {
  // Formats one extracted phrase pair into four per-phrase strings (forward
  // extract line, inverse extract line, orientation line, sentence-id line)
  // and appends them to m_extractedPhrases, m_extractedPhrasesInv,
  // m_extractedPhrasesOri and m_extractedPhrasesSid.
  //   sentence         supplies the source/target tokens, alignedToT and sentenceID
  //   startE..endE     inclusive target span
  //   startF..endF     inclusive source span
  //   orientationInfo  appended verbatim to the orientation line
  // In this diff, lines marked - belong to the old free function, which wrote
  // directly to the extractFile streams; lines marked + belong to the new
  // member function, which buffers into local string streams instead.
  // source
-  // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
+  //   // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
  	ostringstream outextractstr;
  	ostringstream outextractstrInv;
  	ostringstream outextractstrOrientation;
  	ostringstream outextractstrSentenceId;
  // Span-only mode: print the four span indices to stdout and return without
  // buffering anything.
-  if (onlyOutputSpanInfo) {
+  if (m_options.isOnlyOutputSpanInfo()) {
    cout << startF << " " << endF << " " << startE << " " << endE << endl;
    return;
  }
  // source tokens, each followed by a space, go to every enabled forward-style line
-  for(int fi=startF; fi<=endF; fi++) {
+for(int fi=startF; fi<=endF; fi++) {
-    if (translationFlag) extractFile << sentence.source[fi] << " ";
+    if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
-    if (orientationFlag) extractFileOrientation << sentence.source[fi] << " ";
+    if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
-    if (sentenceIdFlag) extractFileSentenceId << sentence.source[fi] << " ";
+    if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.source[fi] << " ";
  }
-  if (translationFlag) extractFile << "||| ";
+  if (m_options.isTranslationFlag()) outextractstr << "||| ";
-  if (orientationFlag) extractFileOrientation << "||| ";
+  if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
-  if (sentenceIdFlag) extractFileSentenceId << "||| ";
+  if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
  // target tokens; the inverse line starts with the target side
  for(int ei=startE; ei<=endE; ei++) {
-    if (translationFlag) extractFile << sentence.target[ei] << " ";
+    if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " ";
-    if (translationFlag) extractFileInv << sentence.target[ei] << " ";
+    if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " ";
-    if (orientationFlag) extractFileOrientation << sentence.target[ei] << " ";
+    if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " ";
-    if (sentenceIdFlag) extractFileSentenceId << sentence.target[ei] << " ";
+    if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.target[ei] << " ";
  }
-  if (translationFlag) extractFile << "|||";
+  if (m_options.isTranslationFlag()) outextractstr << "|||";
-  if (translationFlag) extractFileInv << "||| ";
+  if (m_options.isTranslationFlag()) outextractstrInv << "||| ";
-  if (orientationFlag) extractFileOrientation << "||| ";
+  if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
-  if (sentenceIdFlag) extractFileSentenceId << "||| ";
+  if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
  // source (for inverse)
  // NOTE(review): the next block carries no +/- marker but uses translationFlag
  // and extractFileInv, names that otherwise occur only on - lines in this
  // function. It looks like the removed counterpart of the m_options block
  // that follows it, with its marker lost -- confirm against the real patch.
  if (translationFlag) {
    for(int fi=startF; fi<=endF; fi++)
      extractFileInv << sentence.source[fi] << " ";
    extractFileInv << "|||";
  }
 if (m_options.isTranslationFlag()) {
    for(int fi=startF; fi<=endF; fi++)
      outextractstrInv << sentence.source[fi] << " ";
    outextractstrInv << "|||";
  }
  // alignment: points are written relative to the phrase start
  // (fi-startF, ei-startE); the inverse line gets the same points with the
  // two sides swapped
-  if (translationFlag) {
+ if (m_options.isTranslationFlag()) {
    for(int ei=startE; ei<=endE; ei++) {
-      for(size_t i=0; i<sentence.alignedToT[ei].size(); i++) {
+      for(unsigned int i=0; i<sentence.alignedToT[ei].size(); i++) {
        int fi = sentence.alignedToT[ei][i];
-        extractFile << " " << fi-startF << "-" << ei-startE;
+        outextractstr << " " << fi-startF << "-" << ei-startE;
-        extractFileInv << " " << ei-startE << "-" << fi-startF;
+        outextractstrInv << " " << ei-startE << "-" << fi-startF;
      }
    }
  }
-  if (orientationFlag)
+  if (m_options.isOrientationFlag())
-    extractFileOrientation << orientationInfo;
+    outextractstrOrientation << orientationInfo;
-  if (sentenceIdFlag) {
+  if (m_options.isSentenceIdFlag()) {
-    extractFileSentenceId << sentence.sentenceID;
+    outextractstrSentenceId << sentence.sentenceID;
  }
-  if (translationFlag) extractFile << "\n";
+
-  if (translationFlag) extractFileInv << "\n";
+ if (m_options.isTranslationFlag()) outextractstr << "\n";
-  if (orientationFlag) extractFileOrientation << "\n";
+  if (m_options.isTranslationFlag()) outextractstrInv << "\n";
-  if (sentenceIdFlag) extractFileSentenceId << "\n";
+  if (m_options.isOrientationFlag()) outextractstrOrientation << "\n";
  if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "\n";
  // All four strings are pushed regardless of which flags are set, so a
  // disabled output contributes an empty string to its vector.
    m_extractedPhrases.push_back(outextractstr.str());
    m_extractedPhrasesInv.push_back(outextractstrInv.str());
    m_extractedPhrasesOri.push_back(outextractstrOrientation.str());
    m_extractedPhrasesSid.push_back(outextractstrSentenceId.str());
 }
 /**
  * Hand the phrases buffered by this task to the four output collectors.
  *
  * Each of m_extractedPhrases, m_extractedPhrasesInv, m_extractedPhrasesOri
  * and m_extractedPhrasesSid is concatenated in insertion order, and the
  * resulting string is passed to the matching collector together with m_id.
  * All four collectors are written unconditionally, even when the
  * corresponding vector is empty.
  */
 void ExtractTask::writePhrasesToFile(){
    ostringstream outextractFile;
    ostringstream outextractFileInv;
    ostringstream outextractFileOrientation;
    ostringstream outextractFileSentenceId;
    // The strings are streamed directly rather than through data(): the
    // const char* overload stops at the first NUL byte and depends on data()
    // being NUL-terminated, which the standard only guarantees since C++11.
    for(vector<string>::const_iterator phrase=m_extractedPhrases.begin();phrase!=m_extractedPhrases.end();++phrase){
        outextractFile<<*phrase;
    }
    for(vector<string>::const_iterator phrase=m_extractedPhrasesInv.begin();phrase!=m_extractedPhrasesInv.end();++phrase){
        outextractFileInv<<*phrase;
    }
    for(vector<string>::const_iterator phrase=m_extractedPhrasesOri.begin();phrase!=m_extractedPhrasesOri.end();++phrase){
        outextractFileOrientation<<*phrase;
    }
    for(vector<string>::const_iterator phrase=m_extractedPhrasesSid.begin();phrase!=m_extractedPhrasesSid.end();++phrase){
        outextractFileSentenceId<<*phrase;
    }
      m_extractCollector->Write(m_id, outextractFile.str());
      m_extractCollectorInv->Write(m_id,outextractFileInv.str());
      m_extractCollectorOrientation->Write(m_id,outextractFileOrientation.str());
      m_extractCollectorSentenceId->Write(m_id,outextractFileSentenceId.str());
 }
 // if proper conditioning, we need the number of times a source phrase occurred
-void extractBase( SentenceAlignment &sentence )
+
 void ExtractTask::extractBase( SentenceAlignment &sentence )
 {
  // Enumerates every contiguous span of sentence.source that is at most
  // maxPhraseLength tokens long and writes it as one line: each token
  // followed by a space, then |||. The same enumeration over sentence.target
  // goes into the inverse buffer. Both buffers are then passed to the
  // collectors together with m_id.
  // In this diff, lines marked - wrote directly to extractFile and
  // extractFileInv; lines marked + write to the local string streams.
  // NOTE(review): writePhrasesToFile passes the same m_id to the same two
  // collectors. Whether a collector accepts two writes for one id cannot be
  // seen from here -- confirm.
    ostringstream outextractFile;
    ostringstream outextractFileInv;
  int countF = sentence.source.size();
  // source side: startF is the first token of the span, endF the last (inclusive)
  for(int startF=0; startF<countF; startF++) {
    for(int endF=startF;
-        (endF<countF && endF<startF+maxPhraseLength);
+        (endF<countF && endF<startF+m_options.maxPhraseLength);
        endF++) {
      for(int fi=startF; fi<=endF; fi++) {
-        extractFile << sentence.source[fi] << " ";
+         outextractFile << sentence.source[fi] << " ";
-      }
+	}
-      extractFile << "|||" << endl;
+      outextractFile << "|||" << endl;
    }
  }
  int countE = sentence.target.size();
  // target side: same enumeration, written to the inverse buffer
  for(int startE=0; startE<countE; startE++) {
    for(int endE=startE;
-        (endE<countE && endE<startE+maxPhraseLength);
+        (endE<countE && endE<startE+m_options.maxPhraseLength);
        endE++) {
      for(int ei=startE; ei<=endE; ei++) {
-        extractFileInv << sentence.target[ei] << " ";
+        outextractFileInv << sentence.target[ei] << " ";
      }
-      extractFileInv << "|||" << endl;
+      outextractFileInv << "|||" << endl;
    }
  }
    m_extractCollector->Write(m_id, outextractFile.str());
    m_extractCollectorInv->Write(m_id,outextractFileInv.str());
 }
 }