multi-threaded extract program. Thanks to Rohit Gupta

2024-09-17 14:17:13 +03:00 · 2012-07-18 12:46:59 +01:00 · 2012-07-18 12:46:59 +01:00 · 7ae76dfe75
commit 7ae76dfe75
parent b609473645
3 changed files with 433 additions and 159 deletions
--- a/phrase-extract/Jamfile
+++ b/phrase-extract/Jamfile
@ -10,7 +10,7 @@ obj XmlTree.o : XmlTree.cpp : <include>. ;
 alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : <include>. ;
 alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : <include>. ;

-exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
+exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;

 exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;

--- a/phrase-extract/PhraseExtractionOptions.h
+++ b/phrase-extract/PhraseExtractionOptions.h
@ -0,0 +1,146 @@
+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2010 University of Edinburgh
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+
+/* Created by Rohit Gupta, CDAC, Mumbai, India on 18 July, 2012*/
+
+#pragma once
+#ifndef PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
+#define PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
+
+namespace MosesTraining
+{
+enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
+enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
+
+
+/**
+ * Holds the settings chosen on the extract command line.
+ *
+ * maxPhraseLength is fixed at construction. Every other switch starts at the
+ * default assigned in the constructor and is changed through the matching
+ * init*() setter. All is*() accessors are const-qualified, so the options can
+ * be read through a const reference or pointer.
+ */
+class PhraseExtractionOptions {
+
+ public:
+  // Set once by the constructor; cannot be changed afterwards.
+  const int maxPhraseLength;
+ private:
+  bool allModelsOutputFlag;
+  bool wordModel;
+  REO_MODEL_TYPE wordType;
+  bool phraseModel;
+  REO_MODEL_TYPE phraseType;
+  bool hierModel;
+  REO_MODEL_TYPE hierType;
+  bool orientationFlag;
+  bool translationFlag;
+  bool sentenceIdFlag; //create extract file with sentence id
+  bool onlyOutputSpanInfo;
+  bool gzOutput;
+
+public:
+  /**
+   * @param initmaxPhraseLength value stored in maxPhraseLength.
+   *
+   * Defaults: every boolean switch is false except translationFlag (true);
+   * the three model types are REO_MSD.
+   */
+  PhraseExtractionOptions(const int initmaxPhraseLength):
+            maxPhraseLength(initmaxPhraseLength),
+            allModelsOutputFlag(false),
+            wordModel(false),
+            wordType(REO_MSD),
+            phraseModel(false),
+            phraseType(REO_MSD),
+            hierModel(false),
+            hierType(REO_MSD),
+            orientationFlag(false),
+            translationFlag(true),
+            sentenceIdFlag(false),
+            onlyOutputSpanInfo(false),
+            gzOutput(false){}
+
+    // ---- setters: each one overwrites exactly one stored option ----
+    void initAllModelsOutputFlag(const bool initallModelsOutputFlag){
+        allModelsOutputFlag=initallModelsOutputFlag;
+    }
+    void initWordModel(const bool initwordModel){
+        wordModel=initwordModel;
+    }
+    void initWordType(REO_MODEL_TYPE initwordType ){
+        wordType=initwordType;
+    }
+    void initPhraseModel(const bool initphraseModel ){
+        phraseModel=initphraseModel;
+    }
+    void initPhraseType(REO_MODEL_TYPE initphraseType){
+        phraseType=initphraseType;
+    }
+    void initHierModel(const bool inithierModel){
+        hierModel=inithierModel;
+    }
+    void initHierType(REO_MODEL_TYPE inithierType){
+        hierType=inithierType;
+    }
+    void initOrientationFlag(const bool initorientationFlag){
+        orientationFlag=initorientationFlag;
+    }
+    void initTranslationFlag(const bool inittranslationFlag){
+        translationFlag=inittranslationFlag;
+    }
+    void initSentenceIdFlag(const bool initsentenceIdFlag){
+        sentenceIdFlag=initsentenceIdFlag;
+    }
+    void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo){
+        onlyOutputSpanInfo= initonlyOutputSpanInfo;
+    }
+    void initGzOutput (const bool initgzOutput){
+        gzOutput= initgzOutput;
+    }
+
+    // ---- getters: return the stored value and never modify the object ----
+    bool isAllModelsOutputFlag() const {
+        return allModelsOutputFlag;
+    }
+    bool isWordModel() const {
+        return wordModel;
+    }
+    REO_MODEL_TYPE isWordType() const {
+        return wordType;
+    }
+    bool isPhraseModel() const {
+        return phraseModel;
+    }
+    REO_MODEL_TYPE isPhraseType() const {
+        return phraseType;
+    }
+    bool isHierModel() const {
+        return hierModel;
+    }
+    REO_MODEL_TYPE isHierType() const {
+        return hierType;
+    }
+    bool isOrientationFlag() const {
+        return orientationFlag;
+    }
+    bool isTranslationFlag() const {
+        return translationFlag;
+    }
+    bool isSentenceIdFlag() const {
+        return sentenceIdFlag;
+    }
+    bool isOnlyOutputSpanInfo() const {
+        return onlyOutputSpanInfo;
+    }
+    bool isGzOutput () const {
+        return gzOutput;
+    }
+};
+
+}
+
+#endif
--- a/phrase-extract/extract.cpp
+++ b/phrase-extract/extract.cpp
@ -1,6 +1,7 @@
 /*
 * extract.cpp
- *
+ *	Modified by: Rohit Gupta CDAC, Mumbai, India
+ *	on July 15, 2012 to implement parallel processing
 *      Modified by: Nadi Tomeh - LIMSI/CNRS
 *      Machine Translation Marathon 2010, Dublin
 */
@ -13,7 +14,7 @@
 #include <stdlib.h>
 #include <assert.h>
 #include <cstring>
-
+#include <sstream>
 #include <map>
 #include <set>
 #include <vector>
@ -23,14 +24,17 @@
 #include "tables-core.h"
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
-
+#include "../moses/src/ThreadPool.h"
+#include "../moses/src/OutputCollector.h"
+#include "PhraseExtractionOptions.h"
 using namespace std;
 using namespace MosesTraining;

-#define LINE_MAX_LENGTH 500000
+namespace MosesTraining {
+
+
+const long int LINE_MAX_LENGTH = 500000 ;

-namespace MosesTraining
-{

 // HPhraseVertex represents a point in the alignment matrix
 typedef pair <int, int> HPhraseVertex;
@ -46,9 +50,6 @@ typedef vector < HPhrase > HPhraseVector;
 // The key of the map is the English index and the value is a set of the source ones
 typedef map <int, set<int> > HSentenceVertices;

-enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
-enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
-
  REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
                           int, int, int, int, int, int, int,
                           bool (*)(int, int), bool (*)(int, int));
@ -72,32 +73,42 @@ bool ge(int, int);
  bool le(int, int);
  bool lt(int, int);

+  bool isAligned (SentenceAlignment &, int, int);
+
+
+}
+namespace MosesTraining{
+// One unit of extraction work for a single sentence pair. It derives from
+// Moses::Task; main() either calls Run() directly or submits the task to the
+// thread pool. Output is first buffered in the m_extractedPhrases* vectors and
+// then handed to the four OutputCollectors under the key m_id.
+class ExtractTask : public Moses::Task{
+        private:
+        // Key passed to every OutputCollector::Write call made by this task.
+        size_t m_id;
+        // Sentence pair to process; deleted in the destructor.
+        SentenceAlignment *m_sentence;
+        // Options object supplied by the caller; held by reference, not copied.
+        PhraseExtractionOptions &m_options;
+        // Collectors supplied by the caller; this class never deletes them.
+        Moses::OutputCollector* m_extractCollector;
+        Moses::OutputCollector* m_extractCollectorInv;
+        Moses::OutputCollector* m_extractCollectorOrientation;
+        Moses::OutputCollector* m_extractCollectorSentenceId;
+public:
+  // Stores all arguments as given. The task deletes `sentence` when it is
+  // destroyed, so the caller must not delete it as well.
+  ExtractTask(size_t id, SentenceAlignment *sentence,PhraseExtractionOptions &initoptions, Moses::OutputCollector *extractCollector, Moses::OutputCollector *extractCollectorInv,Moses::OutputCollector *extractCollectorOrientation,Moses::OutputCollector* extractCollectorSentenceId  ):
+    m_id(id),
+    m_sentence(sentence),
+    m_options(initoptions),
+    m_extractCollector(extractCollector),
+    m_extractCollectorInv(extractCollectorInv),
+    m_extractCollectorOrientation(extractCollectorOrientation),
+    m_extractCollectorSentenceId(extractCollectorSentenceId) {}
+  // NOTE(review): copying is not disabled although the destructor deletes
+  // m_sentence; a copied task would delete the sentence twice. No copy is
+  // visible in this diff - confirm before relying on it.
+  ~ExtractTask() { delete m_sentence; }
+void Run();
+private:
+  // Per-task output buffers, filled by addPhrase() and emptied by Run().
+  vector< string > m_extractedPhrases;
+  vector< string > m_extractedPhrasesInv;
+  vector< string > m_extractedPhrasesOri;
+  vector< string > m_extractedPhrasesSid;
  void extractBase(SentenceAlignment &);
  void extract(SentenceAlignment &);
  void addPhrase(SentenceAlignment &, int, int, int, int, string &);
-bool isAligned (SentenceAlignment &, int, int);
-
-bool allModelsOutputFlag = false;
-
-bool wordModel = false;
-REO_MODEL_TYPE wordType = REO_MSD;
-bool phraseModel = false;
-REO_MODEL_TYPE phraseType = REO_MSD;
-bool hierModel = false;
-REO_MODEL_TYPE hierType = REO_MSD;
-
-
-Moses::OutputFileStream extractFile;
-Moses::OutputFileStream extractFileInv;
-Moses::OutputFileStream extractFileOrientation;
-Moses::OutputFileStream extractFileSentenceId;
-int maxPhraseLength;
-bool orientationFlag = false;
-bool translationFlag = true;
-bool sentenceIdFlag = false; //create extract file with sentence id
-bool onlyOutputSpanInfo = false;
-bool gzOutput = false;
+  // Concatenates the buffered lines and passes them to the collectors.
+  void writePhrasesToFile();
  
+};
 }

 int main(int argc, char* argv[])
@ -105,27 +116,40 @@ int main(int argc, char* argv[])
  cerr	<< "PhraseExtract v1.4, written by Philipp Koehn\n"
        << "phrase extraction from an aligned parallel corpus\n";

+#ifdef WITH_THREADS
+  int thread_count = 1;
+#endif
 if (argc < 6) {
-    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n";
+    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
+    #ifdef WITH_THREADS
+
+    cerr<< "| --threads NUM ";
+    #endif
+    cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --SentenceId | --GZOutput ]\n";
    exit(1);
  }
-  char* &fileNameE = argv[1];
-  char* &fileNameF = argv[2];
-  char* &fileNameA = argv[3];
-  string fileNameExtract = string(argv[4]);
-  maxPhraseLength = atoi(argv[5]);
+
+  Moses::OutputFileStream extractFile;
+  Moses::OutputFileStream extractFileInv;
+  Moses::OutputFileStream extractFileOrientation;
+  Moses::OutputFileStream extractFileSentenceId;
+  const char* const &fileNameE = argv[1];
+  const char* const &fileNameF = argv[2];
+  const char* const &fileNameA = argv[3];
+  const string fileNameExtract = string(argv[4]);
+  PhraseExtractionOptions options(atoi(argv[5]));

  for(int i=6; i<argc; i++) {
    if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
-      onlyOutputSpanInfo = true;
+      options.initOnlyOutputSpanInfo(true);
    } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
-      orientationFlag = true;
+      options.initOrientationFlag(true);
    } else if (strcmp(argv[i],"--NoTTable") == 0) {
-      translationFlag = false;
+      options.initTranslationFlag(false);
    } else if (strcmp(argv[i], "--SentenceId") == 0) {
-      sentenceIdFlag = true;  
+      options.initSentenceIdFlag(true);  
    } else if (strcmp(argv[i], "--GZOutput") == 0) {
-      gzOutput = true;  
+      options.initGzOutput(true);  
    } else if(strcmp(argv[i],"--model") == 0) {
      if (i+1 >= argc) {
        cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
@ -138,37 +162,37 @@ int main(int argc, char* argv[])
      REO_MODEL_TYPE intModelType;

      if(strcmp(modelName, "wbe") == 0) {
-        wordModel = true;
+        options.initWordModel(true);
        if(strcmp(modelType, "msd") == 0)
-          wordType = REO_MSD;
+          options.initWordType(REO_MSD);
        else if(strcmp(modelType, "mslr") == 0)
-          wordType = REO_MSLR;
+          options.initWordType(REO_MSLR);
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
-          wordType = REO_MONO;
+          options.initWordType(REO_MONO);
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "phrase") == 0) {
-        phraseModel = true;
+        options.initPhraseModel(true);
        if(strcmp(modelType, "msd") == 0)
-          phraseType = REO_MSD;
+          options.initPhraseType(REO_MSD);
        else if(strcmp(modelType, "mslr") == 0)
-          phraseType = REO_MSLR;
+          options.initPhraseType(REO_MSLR);
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
-          phraseType = REO_MONO;
+          options.initPhraseType(REO_MONO);
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "hier") == 0) {
-        hierModel = true;
+        options.initHierModel(true);
        if(strcmp(modelType, "msd") == 0)
-          hierType = REO_MSD;
+          options.initHierType(REO_MSD);
        else if(strcmp(modelType, "mslr") == 0)
-          hierType = REO_MSLR;
+          options.initHierType(REO_MSLR);
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
-          hierType = REO_MONO;
+          options.initHierType(REO_MONO);
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
@ -178,7 +202,21 @@ int main(int argc, char* argv[])
        exit(1);
      }

-      allModelsOutputFlag = true;
+      options.initAllModelsOutputFlag(true);
+ #ifdef WITH_THREADS
+    }else if (strcmp(argv[i],"-threads") == 0 ||
+               strcmp(argv[i],"--threads") == 0 ||
+               strcmp(argv[i],"--Threads") == 0) {
+        if(argc>(i+1))thread_count = atoi(argv[++i]);
+        else {cerr<<"extract: syntax error, NUM is missing for --threads NUM option"<<endl;
+        exit(1);
+        }
+        if(thread_count==0){
+                cerr<<"extract: error, NUM is missing for --threads NUM option or --threads 0 is given"<<endl;
+                exit(1);
+        }
+     #endif
+
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
@ -187,9 +225,9 @@ int main(int argc, char* argv[])

  // default reordering model if no model selected
  // allows for the old syntax to be used
-  if(orientationFlag && !allModelsOutputFlag) {
-    wordModel = true;
-    wordType = REO_MSD;
+  if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) {
+    options.initWordModel(true);
+    options.initWordType(REO_MSD);
  }

  // open input files
@ -202,21 +240,32 @@ int main(int argc, char* argv[])
  istream *aFileP = &aFile;

  // open output files
-  if (translationFlag) {
-    string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
-    extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
+  if (options.isTranslationFlag()) {
+    string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":"");
+    extractFile.Open( (fileNameExtract + (options.isGzOutput()?".gz":"")).c_str());
    extractFileInv.Open(fileNameExtractInv.c_str());
  }
-  if (orientationFlag) {
-    string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
+  if (options.isOrientationFlag()) {
+    string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":"");
    extractFileOrientation.Open(fileNameExtractOrientation.c_str());
  }

-  if (sentenceIdFlag) {
-    string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
+  if (options.isSentenceIdFlag()) {
+    string fileNameExtractSentenceId = fileNameExtract + ".sid" + (options.isGzOutput()?".gz":"");
    extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
  }

+
+    Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile);//r
+    Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv);//r
+    Moses::OutputCollector* extractCollectorOrientation = new Moses::OutputCollector(&extractFileOrientation);//r
+    Moses::OutputCollector* extractCollectorSentenceId = new Moses::OutputCollector(&extractFileSentenceId); //r
+#ifdef WITH_THREADS
+  // set up thread pool
+     Moses::ThreadPool pool(thread_count);
+     pool.SetQueueLimit(1000);
+#endif
+
  int i=0;
  while(true) {
    i++;
@ -228,32 +277,57 @@ int main(int argc, char* argv[])
    if (eFileP->eof()) break;
    SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
-    SentenceAlignment sentence;
+    SentenceAlignment *sentence=new SentenceAlignment;
 	// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
    //az: output src, tgt, and alingment line
-    if (onlyOutputSpanInfo) {
+    if (options.isOnlyOutputSpanInfo()) {
      cout << "LOG: SRC: " << foreignString << endl;
      cout << "LOG: TGT: " << englishString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }
+	if (sentence->create( englishString, foreignString, alignmentString, i)) {
+   	ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector , extractCollectorInv, extractCollectorOrientation, extractCollectorSentenceId);
+#ifdef WITH_THREADS
+      if (thread_count == 1) {
+        task->Run();
+        delete task;
+      }
+      else {
+        pool.Submit(task);
+      }
+#else
+      task->Run();
+      delete task;
+#endif

-    if (sentence.create( englishString, foreignString, alignmentString, i)) {
-      extract(sentence);
    }
-    if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
+    if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }
+
+#ifdef WITH_THREADS
+  // wait for all threads to finish
+  pool.Stop(true);
+#endif
+
  eFile.Close();
  fFile.Close();
  aFile.Close();
+      delete extractCollector;
+      delete extractCollectorInv;
+      delete extractCollectorOrientation;
+      delete extractCollectorSentenceId;
  //az: only close if we actually opened it
-  if (!onlyOutputSpanInfo) {
-    if (translationFlag) {
+  if (!options.isOnlyOutputSpanInfo()) {
+    if (options.isTranslationFlag()) {
      extractFile.Close();
      extractFileInv.Close();
+      
    }
-    if (orientationFlag) extractFileOrientation.Close();
-    if (sentenceIdFlag) {
+    if (options.isOrientationFlag()){ 
+	extractFileOrientation.Close();
+	}
+    if (options.isSentenceIdFlag()) {
      extractFileSentenceId.Close();
    }
  }
@ -261,8 +335,17 @@ int main(int argc, char* argv[])

 namespace MosesTraining
 {
+// Task entry point: extracts phrases from the sentence held by this task,
+// hands the buffered output to the collectors via writePhrasesToFile(), and
+// then empties the four per-task buffers.
+void ExtractTask::Run() {
+  extract(*m_sentence);
+  writePhrasesToFile();
+  // The buffers have been passed on; drop their contents.
+  m_extractedPhrases.clear();
+  m_extractedPhrasesInv.clear();
+  m_extractedPhrasesOri.clear();
+  m_extractedPhrasesSid.clear();

-void extract(SentenceAlignment &sentence)
+}
+
+void ExtractTask::extract(SentenceAlignment &sentence)
 {
  int countE = sentence.target.size();
  int countF = sentence.source.size();
@ -281,14 +364,14 @@ void extract(SentenceAlignment &sentence)

  HSentenceVertices::const_iterator it;

-  bool relaxLimit = hierModel;
-  bool buildExtraStructure = phraseModel || hierModel;
+  bool relaxLimit = m_options.isHierModel();
+  bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel();

  // check alignments for target phrase startE...endE
  // loop over extracted phrases which are compatible with the word-alignments
  for(int startE=0; startE<countE; startE++) {
    for(int endE=startE;
-        (endE<countE && (relaxLimit || endE<startE+maxPhraseLength));
+        (endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
        endE++) {

      int minF = 9999;
@ -308,7 +391,7 @@ void extract(SentenceAlignment &sentence)
      }

      if (maxF >= 0 && // aligned to any source words at all
-          (relaxLimit || maxF-minF < maxPhraseLength)) { // source phrase within limits
+          (relaxLimit || maxF-minF < m_options.maxPhraseLength)) { // source phrase within limits

        // check if source words are aligned to out of bound target words
        bool out_of_bounds = false;
@ -323,17 +406,17 @@ void extract(SentenceAlignment &sentence)
          // start point of source phrase may retreat over unaligned
          for(int startF=minF;
              (startF>=0 &&
-               (relaxLimit || startF>maxF-maxPhraseLength) && // within length limit
+               (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
               (startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
              startF--)
            // end point of source phrase may advance over unaligned
            for(int endF=maxF;
                (endF<countF &&
-                 (relaxLimit || endF<startF+maxPhraseLength) && // within length limit
+                 (relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
                 (endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
                endF++) { // at this point we have extracted a phrase
              if(buildExtraStructure) { // phrase || hier
-                if(endE-startE < maxPhraseLength && endF-startF < maxPhraseLength) { // within limit
+                if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
                  inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
                                                   HPhraseVertex(endF,endE)));
                  insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
@ -343,16 +426,16 @@ void extract(SentenceAlignment &sentence)
                                       startF, startE, endF, endE);
              } else {
                string orientationInfo = "";
-                if(wordModel) {
+                if(m_options.isWordModel()) {
                  REO_POS wordPrevOrient, wordNextOrient;
                  bool connectedLeftTopP  = isAligned( sentence, startF-1, startE-1 );
                  bool connectedRightTopP = isAligned( sentence, endF+1,   startE-1 );
                  bool connectedLeftTopN  = isAligned( sentence, endF+1, endE+1 );
                  bool connectedRightTopN = isAligned( sentence, startF-1,   endE+1 );
-                  wordPrevOrient = getOrientWordModel(sentence, wordType, connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
-                  wordNextOrient = getOrientWordModel(sentence, wordType, connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
-                  orientationInfo += getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType);
-                  if(allModelsOutputFlag)
+                  wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
+                  wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
+                  orientationInfo += getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType());
+                  if(m_options.isAllModelsOutputFlag())
                    " | | ";
                }
                addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
@ -378,38 +461,38 @@ void extract(SentenceAlignment &sentence)
      bool connectedLeftTopN  = isAligned( sentence, endF+1, endE+1 );
      bool connectedRightTopN = isAligned( sentence, startF-1,   endE+1 );

-      if(wordModel) {
-        wordPrevOrient = getOrientWordModel(sentence, wordType,
+      if(m_options.isWordModel()) {
+        wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(),
                                            connectedLeftTopP, connectedRightTopP,
                                            startF, endF, startE, endE, countF, 0, 1,
                                            &ge, &lt);
-        wordNextOrient = getOrientWordModel(sentence, wordType,
+        wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(),
                                            connectedLeftTopN, connectedRightTopN,
                                            endF, startF, endE, startE, 0, countF, -1,
                                            &lt, &ge);
      }
-      if (phraseModel) {
-        phrasePrevOrient = getOrientPhraseModel(sentence, phraseType,
+      if (m_options.isPhraseModel()) {
+        phrasePrevOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
                                                connectedLeftTopP, connectedRightTopP,
                                                startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft);
-        phraseNextOrient = getOrientPhraseModel(sentence, phraseType,
+        phraseNextOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
                                                connectedLeftTopN, connectedRightTopN,
                                                endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight);
      } else {
        phrasePrevOrient = phraseNextOrient = UNKNOWN;
      }
-      if(hierModel) {
-        hierPrevOrient = getOrientHierModel(sentence, hierType,
+      if(m_options.isHierModel()) {
+        hierPrevOrient = getOrientHierModel(sentence, m_options.isHierType(),
                                            connectedLeftTopP, connectedRightTopP,
                                            startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
-        hierNextOrient = getOrientHierModel(sentence, hierType,
+        hierNextOrient = getOrientHierModel(sentence, m_options.isHierType(),
                                            connectedLeftTopN, connectedRightTopN,
                                            endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
      }

-      orientationInfo = ((wordModel)? getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType) : "") + " | " +
-                        ((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " +
-                        ((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : "");
+      orientationInfo = ((m_options.isWordModel())? getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType()) : "") + " | " +
+                        ((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
+                        ((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");

      addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
    }
@ -617,94 +700,139 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
  return "";
 }

-void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
+// Formats one extracted phrase pair (source span startF..endF, target span
+// startE..endE, both inclusive) and appends the result to the per-task
+// buffers instead of writing to the output files directly.
+//
+// Lines built, each only when its option flag is set:
+//   extract      : "source ||| target ||| alignment points"
+//   inverse      : "target ||| source ||| alignment points (swapped)"
+//   orientation  : "source ||| target ||| " followed by orientationInfo
+//   sentence id  : "source ||| target ||| " followed by sentence.sentenceID
+// One string is pushed onto every buffer per call; it is empty when the
+// corresponding flag is off.
+//
+// When isOnlyOutputSpanInfo() is set, the four span indices are printed to
+// cout and the function returns without buffering anything.
+void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
 {
  // source
-  // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
+  //   // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
+  	ostringstream outextractstr;
+  	ostringstream outextractstrInv;
+  	ostringstream outextractstrOrientation;
+  	ostringstream outextractstrSentenceId;

-  if (onlyOutputSpanInfo) {
+  if (m_options.isOnlyOutputSpanInfo()) {
    cout << startF << " " << endF << " " << startE << " " << endE << endl;
    return;
  }

 for(int fi=startF; fi<=endF; fi++) {
-    if (translationFlag) extractFile << sentence.source[fi] << " ";
-    if (orientationFlag) extractFileOrientation << sentence.source[fi] << " ";
-    if (sentenceIdFlag) extractFileSentenceId << sentence.source[fi] << " ";
+    if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
+    if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
+    if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.source[fi] << " ";
  }
-  if (translationFlag) extractFile << "||| ";
-  if (orientationFlag) extractFileOrientation << "||| ";
-  if (sentenceIdFlag) extractFileSentenceId << "||| ";
+  if (m_options.isTranslationFlag()) outextractstr << "||| ";
+  if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
+  if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";

  // target
  for(int ei=startE; ei<=endE; ei++) {
-    if (translationFlag) extractFile << sentence.target[ei] << " ";
-    if (translationFlag) extractFileInv << sentence.target[ei] << " ";
-    if (orientationFlag) extractFileOrientation << sentence.target[ei] << " ";
-    if (sentenceIdFlag) extractFileSentenceId << sentence.target[ei] << " ";
+    if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " ";
+    if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " ";
+    if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " ";
+    if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.target[ei] << " ";
  }
-  if (translationFlag) extractFile << "|||";
-  if (translationFlag) extractFileInv << "||| ";
-  if (orientationFlag) extractFileOrientation << "||| ";
-  if (sentenceIdFlag) extractFileSentenceId << "||| ";
+  if (m_options.isTranslationFlag()) outextractstr << "|||";
+  if (m_options.isTranslationFlag()) outextractstrInv << "||| ";
+  if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
+  if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";

  // source (for inverse)
-  if (translationFlag) {
+
+ if (m_options.isTranslationFlag()) {
    for(int fi=startF; fi<=endF; fi++)
-      extractFileInv << sentence.source[fi] << " ";
-    extractFileInv << "|||";
+      outextractstrInv << sentence.source[fi] << " ";
+    outextractstrInv << "|||";
  }
-
  // alignment
-  if (translationFlag) {
+  // Alignment points are written relative to the start of each phrase
+  // (fi-startF, ei-startE); the inverse line swaps the two indices.
+ if (m_options.isTranslationFlag()) {
    for(int ei=startE; ei<=endE; ei++) {
-      for(size_t i=0; i<sentence.alignedToT[ei].size(); i++) {
+      for(unsigned int i=0; i<sentence.alignedToT[ei].size(); i++) {
        int fi = sentence.alignedToT[ei][i];
-        extractFile << " " << fi-startF << "-" << ei-startE;
-        extractFileInv << " " << ei-startE << "-" << fi-startF;
+        outextractstr << " " << fi-startF << "-" << ei-startE;
+        outextractstrInv << " " << ei-startE << "-" << fi-startF;
      }
    }
  }

-  if (orientationFlag)
-    extractFileOrientation << orientationInfo;
+  if (m_options.isOrientationFlag())
+    outextractstrOrientation << orientationInfo;

-  if (sentenceIdFlag) {
-    extractFileSentenceId << sentence.sentenceID;
+  if (m_options.isSentenceIdFlag()) {
+    outextractstrSentenceId << sentence.sentenceID;
  }

-  if (translationFlag) extractFile << "\n";
-  if (translationFlag) extractFileInv << "\n";
-  if (orientationFlag) extractFileOrientation << "\n";
-  if (sentenceIdFlag) extractFileSentenceId << "\n";
+
+ if (m_options.isTranslationFlag()) outextractstr << "\n";
+  if (m_options.isTranslationFlag()) outextractstrInv << "\n";
+  if (m_options.isOrientationFlag()) outextractstrOrientation << "\n";
+  if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "\n";
+
+
+    // Buffer the finished lines; writePhrasesToFile() passes them on later.
+    m_extractedPhrases.push_back(outextractstr.str());
+    m_extractedPhrasesInv.push_back(outextractstrInv.str());
+    m_extractedPhrasesOri.push_back(outextractstrOrientation.str());
+    m_extractedPhrasesSid.push_back(outextractstrSentenceId.str());
+}
+
+
+// Concatenates the buffered phrase lines, in order, into a single string.
+// The std::string objects are appended directly, so the complete contents
+// are copied; this does not depend on data() being NUL-terminated (only
+// guaranteed since C++11) and does not stop at an embedded NUL character.
+static string joinExtractedPhrases(const vector<string> &phrases)
+{
+    string joined;
+    for(vector<string>::const_iterator phrase=phrases.begin();phrase!=phrases.end();++phrase){
+        joined += *phrase;
+    }
+    return joined;
+}
+
+// Hands everything buffered by addPhrase() to the four output collectors.
+// Each collector receives exactly one Write() call under the key m_id, with
+// the concatenation of the matching buffer; the string is empty when nothing
+// was buffered for that output.
+void ExtractTask::writePhrasesToFile(){
+
+      m_extractCollector->Write(m_id, joinExtractedPhrases(m_extractedPhrases));
+      m_extractCollectorInv->Write(m_id, joinExtractedPhrases(m_extractedPhrasesInv));
+      m_extractCollectorOrientation->Write(m_id, joinExtractedPhrases(m_extractedPhrasesOri));
+      m_extractCollectorSentenceId->Write(m_id, joinExtractedPhrases(m_extractedPhrasesSid));
 }

 // if proper conditioning, we need the number of times a source phrase occured
-void extractBase( SentenceAlignment &sentence )
+
+// Enumerates every contiguous source span and every contiguous target span
+// of at most m_options.maxPhraseLength words. Each span is written as its
+// words followed by "|||" on a line of its own; source spans go to
+// m_extractCollector and target spans to m_extractCollectorInv, both under
+// the key m_id.
+// NOTE(review): writePhrasesToFile() writes to the same collectors with the
+// same m_id. No call to extractBase() is visible in this diff - confirm how
+// OutputCollector treats a repeated id before calling both for one task.
+void ExtractTask::extractBase( SentenceAlignment &sentence )
 {
+    ostringstream outextractFile;
+    ostringstream outextractFileInv;
+
  int countF = sentence.source.size();
  for(int startF=0; startF<countF; startF++) {
    for(int endF=startF;
-        (endF<countF && endF<startF+maxPhraseLength);
+        (endF<countF && endF<startF+m_options.maxPhraseLength);
        endF++) {
      for(int fi=startF; fi<=endF; fi++) {
-        extractFile << sentence.source[fi] << " ";
+         outextractFile << sentence.source[fi] << " ";
 	}
-      extractFile << "|||" << endl;
+      outextractFile << "|||" << endl;
    }
  }

  int countE = sentence.target.size();
  for(int startE=0; startE<countE; startE++) {
    for(int endE=startE;
-        (endE<countE && endE<startE+maxPhraseLength);
+        (endE<countE && endE<startE+m_options.maxPhraseLength);
        endE++) {
      for(int ei=startE; ei<=endE; ei++) {
-        extractFileInv << sentence.target[ei] << " ";
+        outextractFileInv << sentence.target[ei] << " ";
      }
-      extractFileInv << "|||" << endl;
+      outextractFileInv << "|||" << endl;
    }
  }
+    // Pass the accumulated text to the collectors in one call each.
+    m_extractCollector->Write(m_id, outextractFile.str());
+    m_extractCollectorInv->Write(m_id,outextractFileInv.str());
+
 }

 }