multi-threaded extract program. Thanks to Rohit Gupta

This commit is contained in:
Hieu Hoang 2012-07-18 12:46:59 +01:00
parent b609473645
commit 7ae76dfe75
3 changed files with 433 additions and 159 deletions

View File

@ -10,7 +10,7 @@ obj XmlTree.o : XmlTree.cpp : <include>. ;
alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : <include>. ;
alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : <include>. ;
exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;

View File

@ -0,0 +1,146 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2010 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
/* Created by Rohit Gupta, CDAC, Mumbai, India on 18 July, 2012*/
#pragma once
#ifndef PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
#define PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
namespace MosesTraining
{
// Reordering model variants; chosen on the extract command line through the
// "msd", "mslr" and "mono" suffixes of the --model option.
enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
// Orientation values produced by the getOrient*Model functions.
enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};

/**
 * Bundle of settings for phrase extraction.
 *
 * The maximum phrase length is fixed at construction time; every other
 * setting starts with the default given in the constructor and can be
 * changed through the corresponding init*() setter. All is*() getters are
 * const, so the options can be read through a const object or a const
 * reference.
 */
class PhraseExtractionOptions {

public:
  // Upper bound on phrase length; immutable after construction.
  const int maxPhraseLength;

private:
  bool allModelsOutputFlag;
  bool wordModel;
  REO_MODEL_TYPE wordType;
  bool phraseModel;
  REO_MODEL_TYPE phraseType;
  bool hierModel;
  REO_MODEL_TYPE hierType;
  bool orientationFlag;
  bool translationFlag;
  bool sentenceIdFlag; //create extract file with sentence id
  bool onlyOutputSpanInfo;
  bool gzOutput;

public:
  /**
   * Creates the options with the given maximum phrase length.
   * Defaults: every flag is false except translationFlag (true); every
   * model type is REO_MSD.
   */
  PhraseExtractionOptions(const int initmaxPhraseLength):
    maxPhraseLength(initmaxPhraseLength),
    allModelsOutputFlag(false),
    wordModel(false),
    wordType(REO_MSD),
    phraseModel(false),
    phraseType(REO_MSD),
    hierModel(false),
    hierType(REO_MSD),
    orientationFlag(false),
    translationFlag(true),
    sentenceIdFlag(false),
    onlyOutputSpanInfo(false),
    gzOutput(false) {}

  // functions for initialization of options
  void initAllModelsOutputFlag(const bool initallModelsOutputFlag) {
    allModelsOutputFlag = initallModelsOutputFlag;
  }
  void initWordModel(const bool initwordModel) {
    wordModel = initwordModel;
  }
  void initWordType(REO_MODEL_TYPE initwordType) {
    wordType = initwordType;
  }
  void initPhraseModel(const bool initphraseModel) {
    phraseModel = initphraseModel;
  }
  void initPhraseType(REO_MODEL_TYPE initphraseType) {
    phraseType = initphraseType;
  }
  void initHierModel(const bool inithierModel) {
    hierModel = inithierModel;
  }
  void initHierType(REO_MODEL_TYPE inithierType) {
    hierType = inithierType;
  }
  void initOrientationFlag(const bool initorientationFlag) {
    orientationFlag = initorientationFlag;
  }
  void initTranslationFlag(const bool inittranslationFlag) {
    translationFlag = inittranslationFlag;
  }
  void initSentenceIdFlag(const bool initsentenceIdFlag) {
    sentenceIdFlag = initsentenceIdFlag;
  }
  void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo) {
    onlyOutputSpanInfo = initonlyOutputSpanInfo;
  }
  void initGzOutput(const bool initgzOutput) {
    gzOutput = initgzOutput;
  }

  // functions for getting values (read-only, hence const)
  bool isAllModelsOutputFlag() const {
    return allModelsOutputFlag;
  }
  bool isWordModel() const {
    return wordModel;
  }
  // Despite the "is" prefix this returns the model type, not a boolean.
  REO_MODEL_TYPE isWordType() const {
    return wordType;
  }
  bool isPhraseModel() const {
    return phraseModel;
  }
  // Despite the "is" prefix this returns the model type, not a boolean.
  REO_MODEL_TYPE isPhraseType() const {
    return phraseType;
  }
  bool isHierModel() const {
    return hierModel;
  }
  // Despite the "is" prefix this returns the model type, not a boolean.
  REO_MODEL_TYPE isHierType() const {
    return hierType;
  }
  bool isOrientationFlag() const {
    return orientationFlag;
  }
  bool isTranslationFlag() const {
    return translationFlag;
  }
  bool isSentenceIdFlag() const {
    return sentenceIdFlag;
  }
  bool isOnlyOutputSpanInfo() const {
    return onlyOutputSpanInfo;
  }
  bool isGzOutput() const {
    return gzOutput;
  }
};

}
#endif

View File

@ -1,6 +1,7 @@
/*
* extract.cpp
*
* Modified by: Rohit Gupta CDAC, Mumbai, India
* on July 15, 2012 to implement parallel processing
* Modified by: Nadi Tomeh - LIMSI/CNRS
* Machine Translation Marathon 2010, Dublin
*/
@ -13,7 +14,7 @@
#include <stdlib.h>
#include <assert.h>
#include <cstring>
#include <sstream>
#include <map>
#include <set>
#include <vector>
@ -23,14 +24,17 @@
#include "tables-core.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
#include "../moses/src/ThreadPool.h"
#include "../moses/src/OutputCollector.h"
#include "PhraseExtractionOptions.h"
using namespace std;
using namespace MosesTraining;
#define LINE_MAX_LENGTH 500000
namespace MosesTraining {
const long int LINE_MAX_LENGTH = 500000 ;
namespace MosesTraining
{
// HPhraseVertex represents a point in the alignment matrix
typedef pair <int, int> HPhraseVertex;
@ -46,58 +50,65 @@ typedef vector < HPhrase > HPhraseVector;
// The key of the map is the English index and the value is a set of the source ones
typedef map <int, set<int> > HSentenceVertices;
enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int));
REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int),
const HSentenceVertices &, const HSentenceVertices &);
REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int),
const HSentenceVertices &, const HSentenceVertices &,
const HSentenceVertices &, const HSentenceVertices &,
REO_POS);
void insertVertex(HSentenceVertices &, int, int);
void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
void insertVertex(HSentenceVertices &, int, int);
void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
int, int, int, int);
string getOrientString(REO_POS, REO_MODEL_TYPE);
string getOrientString(REO_POS, REO_MODEL_TYPE);
bool ge(int, int);
bool le(int, int);
bool lt(int, int);
bool ge(int, int);
bool le(int, int);
bool lt(int, int);
void extractBase(SentenceAlignment &);
void extract(SentenceAlignment &);
void addPhrase(SentenceAlignment &, int, int, int, int, string &);
bool isAligned (SentenceAlignment &, int, int);
bool allModelsOutputFlag = false;
bool wordModel = false;
REO_MODEL_TYPE wordType = REO_MSD;
bool phraseModel = false;
REO_MODEL_TYPE phraseType = REO_MSD;
bool hierModel = false;
REO_MODEL_TYPE hierType = REO_MSD;
bool isAligned (SentenceAlignment &, int, int);
Moses::OutputFileStream extractFile;
Moses::OutputFileStream extractFileInv;
Moses::OutputFileStream extractFileOrientation;
Moses::OutputFileStream extractFileSentenceId;
int maxPhraseLength;
bool orientationFlag = false;
bool translationFlag = true;
bool sentenceIdFlag = false; //create extract file with sentence id
bool onlyOutputSpanInfo = false;
bool gzOutput = false;
}
namespace MosesTraining{
class ExtractTask : public Moses::Task{
private:
size_t m_id;
SentenceAlignment *m_sentence;
PhraseExtractionOptions &m_options;
Moses::OutputCollector* m_extractCollector;
Moses::OutputCollector* m_extractCollectorInv;
Moses::OutputCollector* m_extractCollectorOrientation;
Moses::OutputCollector* m_extractCollectorSentenceId;
public:
ExtractTask(size_t id, SentenceAlignment *sentence,PhraseExtractionOptions &initoptions, Moses::OutputCollector *extractCollector, Moses::OutputCollector *extractCollectorInv,Moses::OutputCollector *extractCollectorOrientation,Moses::OutputCollector* extractCollectorSentenceId ):
m_id(id),
m_sentence(sentence),
m_options(initoptions),
m_extractCollector(extractCollector),
m_extractCollectorInv(extractCollectorInv),
m_extractCollectorOrientation(extractCollectorOrientation),
m_extractCollectorSentenceId(extractCollectorSentenceId) {}
~ExtractTask() { delete m_sentence; }
void Run();
private:
vector< string > m_extractedPhrases;
vector< string > m_extractedPhrasesInv;
vector< string > m_extractedPhrasesOri;
vector< string > m_extractedPhrasesSid;
void extractBase(SentenceAlignment &);
void extract(SentenceAlignment &);
void addPhrase(SentenceAlignment &, int, int, int, int, string &);
void writePhrasesToFile();
};
}
int main(int argc, char* argv[])
@ -105,70 +116,83 @@ int main(int argc, char* argv[])
cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
<< "phrase extraction from an aligned parallel corpus\n";
if (argc < 6) {
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n";
#ifdef WITH_THREADS
int thread_count = 1;
#endif
if (argc < 6) {
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
#ifdef WITH_THREADS
cerr<< "| --threads NUM ";
#endif
cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --SentenceId | --GZOutput ]\n";
exit(1);
}
char* &fileNameE = argv[1];
char* &fileNameF = argv[2];
char* &fileNameA = argv[3];
string fileNameExtract = string(argv[4]);
maxPhraseLength = atoi(argv[5]);
Moses::OutputFileStream extractFile;
Moses::OutputFileStream extractFileInv;
Moses::OutputFileStream extractFileOrientation;
Moses::OutputFileStream extractFileSentenceId;
const char* const &fileNameE = argv[1];
const char* const &fileNameF = argv[2];
const char* const &fileNameA = argv[3];
const string fileNameExtract = string(argv[4]);
PhraseExtractionOptions options(atoi(argv[5]));
for(int i=6; i<argc; i++) {
if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
onlyOutputSpanInfo = true;
options.initOnlyOutputSpanInfo(true);
} else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
orientationFlag = true;
options.initOrientationFlag(true);
} else if (strcmp(argv[i],"--NoTTable") == 0) {
translationFlag = false;
options.initTranslationFlag(false);
} else if (strcmp(argv[i], "--SentenceId") == 0) {
sentenceIdFlag = true;
options.initSentenceIdFlag(true);
} else if (strcmp(argv[i], "--GZOutput") == 0) {
gzOutput = true;
options.initGzOutput(true);
} else if(strcmp(argv[i],"--model") == 0) {
if (i+1 >= argc) {
cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
exit(1);
}
char* modelParams = argv[++i];
char* modelName = strtok(modelParams, "-");
char* modelType = strtok(NULL, "-");
char* modelParams = argv[++i];
char* modelName = strtok(modelParams, "-");
char* modelType = strtok(NULL, "-");
REO_MODEL_TYPE intModelType;
if(strcmp(modelName, "wbe") == 0) {
wordModel = true;
options.initWordModel(true);
if(strcmp(modelType, "msd") == 0)
wordType = REO_MSD;
options.initWordType(REO_MSD);
else if(strcmp(modelType, "mslr") == 0)
wordType = REO_MSLR;
options.initWordType(REO_MSLR);
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
wordType = REO_MONO;
options.initWordType(REO_MONO);
else {
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
}
} else if(strcmp(modelName, "phrase") == 0) {
phraseModel = true;
options.initPhraseModel(true);
if(strcmp(modelType, "msd") == 0)
phraseType = REO_MSD;
options.initPhraseType(REO_MSD);
else if(strcmp(modelType, "mslr") == 0)
phraseType = REO_MSLR;
options.initPhraseType(REO_MSLR);
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
phraseType = REO_MONO;
options.initPhraseType(REO_MONO);
else {
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
}
} else if(strcmp(modelName, "hier") == 0) {
hierModel = true;
options.initHierModel(true);
if(strcmp(modelType, "msd") == 0)
hierType = REO_MSD;
options.initHierType(REO_MSD);
else if(strcmp(modelType, "mslr") == 0)
hierType = REO_MSLR;
options.initHierType(REO_MSLR);
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
hierType = REO_MONO;
options.initHierType(REO_MONO);
else {
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
@ -178,7 +202,21 @@ int main(int argc, char* argv[])
exit(1);
}
allModelsOutputFlag = true;
options.initAllModelsOutputFlag(true);
#ifdef WITH_THREADS
}else if (strcmp(argv[i],"-threads") == 0 ||
strcmp(argv[i],"--threads") == 0 ||
strcmp(argv[i],"--Threads") == 0) {
if(argc>(i+1))thread_count = atoi(argv[++i]);
else {cerr<<"extract: syntax error, NUM is missing for --threads NUM option"<<endl;
exit(1);
}
if(thread_count==0){
cerr<<"extract: error, NUM is missing for --threads NUM option or --threads 0 is given"<<endl;
exit(1);
}
#endif
} else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
@ -187,9 +225,9 @@ int main(int argc, char* argv[])
// default reordering model if no model selected
// allows for the old syntax to be used
if(orientationFlag && !allModelsOutputFlag) {
wordModel = true;
wordType = REO_MSD;
if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) {
options.initWordModel(true);
options.initWordType(REO_MSD);
}
// open input files
@ -202,21 +240,32 @@ int main(int argc, char* argv[])
istream *aFileP = &aFile;
// open output files
if (translationFlag) {
string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
if (options.isTranslationFlag()) {
string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":"");
extractFile.Open( (fileNameExtract + (options.isGzOutput()?".gz":"")).c_str());
extractFileInv.Open(fileNameExtractInv.c_str());
}
if (orientationFlag) {
string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
if (options.isOrientationFlag()) {
string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":"");
extractFileOrientation.Open(fileNameExtractOrientation.c_str());
}
if (sentenceIdFlag) {
string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
if (options.isSentenceIdFlag()) {
string fileNameExtractSentenceId = fileNameExtract + ".sid" + (options.isGzOutput()?".gz":"");
extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
}
Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile);//r
Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv);//r
Moses::OutputCollector* extractCollectorOrientation = new Moses::OutputCollector(&extractFileOrientation);//r
Moses::OutputCollector* extractCollectorSentenceId = new Moses::OutputCollector(&extractFileSentenceId); //r
#ifdef WITH_THREADS
// set up thread pool
Moses::ThreadPool pool(thread_count);
pool.SetQueueLimit(1000);
#endif
int i=0;
while(true) {
i++;
@ -228,32 +277,57 @@ int main(int argc, char* argv[])
if (eFileP->eof()) break;
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
SentenceAlignment sentence;
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
SentenceAlignment *sentence=new SentenceAlignment;
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
//az: output src, tgt, and alignment line
if (onlyOutputSpanInfo) {
if (options.isOnlyOutputSpanInfo()) {
cout << "LOG: SRC: " << foreignString << endl;
cout << "LOG: TGT: " << englishString << endl;
cout << "LOG: ALT: " << alignmentString << endl;
cout << "LOG: PHRASES_BEGIN:" << endl;
}
if (sentence->create( englishString, foreignString, alignmentString, i)) {
ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector , extractCollectorInv, extractCollectorOrientation, extractCollectorSentenceId);
#ifdef WITH_THREADS
if (thread_count == 1) {
task->Run();
delete task;
}
else {
pool.Submit(task);
}
#else
task->Run();
delete task;
#endif
if (sentence.create( englishString, foreignString, alignmentString, i)) {
extract(sentence);
}
if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
}
#ifdef WITH_THREADS
// wait for all threads to finish
pool.Stop(true);
#endif
eFile.Close();
fFile.Close();
aFile.Close();
delete extractCollector;
delete extractCollectorInv;
delete extractCollectorOrientation;
delete extractCollectorSentenceId;
//az: only close if we actually opened it
if (!onlyOutputSpanInfo) {
if (translationFlag) {
if (!options.isOnlyOutputSpanInfo()) {
if (options.isTranslationFlag()) {
extractFile.Close();
extractFileInv.Close();
}
if (orientationFlag) extractFileOrientation.Close();
if (sentenceIdFlag) {
if (options.isOrientationFlag()){
extractFileOrientation.Close();
}
if (options.isSentenceIdFlag()) {
extractFileSentenceId.Close();
}
}
@ -261,8 +335,17 @@ int main(int argc, char* argv[])
namespace MosesTraining
{
void ExtractTask::Run() {
extract(*m_sentence);
writePhrasesToFile();
m_extractedPhrases.clear();
m_extractedPhrasesInv.clear();
m_extractedPhrasesOri.clear();
m_extractedPhrasesSid.clear();
void extract(SentenceAlignment &sentence)
}
void ExtractTask::extract(SentenceAlignment &sentence)
{
int countE = sentence.target.size();
int countF = sentence.source.size();
@ -281,14 +364,14 @@ void extract(SentenceAlignment &sentence)
HSentenceVertices::const_iterator it;
bool relaxLimit = hierModel;
bool buildExtraStructure = phraseModel || hierModel;
bool relaxLimit = m_options.isHierModel();
bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel();
// check alignments for target phrase startE...endE
// loop over extracted phrases which are compatible with the word-alignments
for(int startE=0; startE<countE; startE++) {
for(int endE=startE;
(endE<countE && (relaxLimit || endE<startE+maxPhraseLength));
(endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
endE++) {
int minF = 9999;
@ -308,7 +391,7 @@ void extract(SentenceAlignment &sentence)
}
if (maxF >= 0 && // aligned to any source words at all
(relaxLimit || maxF-minF < maxPhraseLength)) { // source phrase within limits
(relaxLimit || maxF-minF < m_options.maxPhraseLength)) { // source phrase within limits
// check if source words are aligned to out of bound target words
bool out_of_bounds = false;
@ -323,17 +406,17 @@ void extract(SentenceAlignment &sentence)
// start point of source phrase may retreat over unaligned
for(int startF=minF;
(startF>=0 &&
(relaxLimit || startF>maxF-maxPhraseLength) && // within length limit
(relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
(startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
startF--)
// end point of source phrase may advance over unaligned
for(int endF=maxF;
(endF<countF &&
(relaxLimit || endF<startF+maxPhraseLength) && // within length limit
(relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
(endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
endF++) { // at this point we have extracted a phrase
if(buildExtraStructure) { // phrase || hier
if(endE-startE < maxPhraseLength && endF-startF < maxPhraseLength) { // within limit
if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
HPhraseVertex(endF,endE)));
insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
@ -343,16 +426,16 @@ void extract(SentenceAlignment &sentence)
startF, startE, endF, endE);
} else {
string orientationInfo = "";
if(wordModel) {
if(m_options.isWordModel()) {
REO_POS wordPrevOrient, wordNextOrient;
bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 );
bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 );
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
wordPrevOrient = getOrientWordModel(sentence, wordType, connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
wordNextOrient = getOrientWordModel(sentence, wordType, connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
orientationInfo += getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType);
if(allModelsOutputFlag)
wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
orientationInfo += getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType());
if(m_options.isAllModelsOutputFlag())
" | | ";
}
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
@ -378,38 +461,38 @@ void extract(SentenceAlignment &sentence)
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
if(wordModel) {
wordPrevOrient = getOrientWordModel(sentence, wordType,
if(m_options.isWordModel()) {
wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(),
connectedLeftTopP, connectedRightTopP,
startF, endF, startE, endE, countF, 0, 1,
&ge, &lt);
wordNextOrient = getOrientWordModel(sentence, wordType,
wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(),
connectedLeftTopN, connectedRightTopN,
endF, startF, endE, startE, 0, countF, -1,
&lt, &ge);
}
if (phraseModel) {
phrasePrevOrient = getOrientPhraseModel(sentence, phraseType,
if (m_options.isPhraseModel()) {
phrasePrevOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
connectedLeftTopP, connectedRightTopP,
startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft);
phraseNextOrient = getOrientPhraseModel(sentence, phraseType,
phraseNextOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
connectedLeftTopN, connectedRightTopN,
endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight);
} else {
phrasePrevOrient = phraseNextOrient = UNKNOWN;
}
if(hierModel) {
hierPrevOrient = getOrientHierModel(sentence, hierType,
if(m_options.isHierModel()) {
hierPrevOrient = getOrientHierModel(sentence, m_options.isHierType(),
connectedLeftTopP, connectedRightTopP,
startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
hierNextOrient = getOrientHierModel(sentence, hierType,
hierNextOrient = getOrientHierModel(sentence, m_options.isHierType(),
connectedLeftTopN, connectedRightTopN,
endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
}
orientationInfo = ((wordModel)? getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType) : "") + " | " +
((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " +
((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : "");
orientationInfo = ((m_options.isWordModel())? getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType()) : "") + " | " +
((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
}
@ -617,94 +700,139 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
return "";
}
void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
{
// source
// cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
// // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
ostringstream outextractstr;
ostringstream outextractstrInv;
ostringstream outextractstrOrientation;
ostringstream outextractstrSentenceId;
if (onlyOutputSpanInfo) {
if (m_options.isOnlyOutputSpanInfo()) {
cout << startF << " " << endF << " " << startE << " " << endE << endl;
return;
}
for(int fi=startF; fi<=endF; fi++) {
if (translationFlag) extractFile << sentence.source[fi] << " ";
if (orientationFlag) extractFileOrientation << sentence.source[fi] << " ";
if (sentenceIdFlag) extractFileSentenceId << sentence.source[fi] << " ";
for(int fi=startF; fi<=endF; fi++) {
if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.source[fi] << " ";
}
if (translationFlag) extractFile << "||| ";
if (orientationFlag) extractFileOrientation << "||| ";
if (sentenceIdFlag) extractFileSentenceId << "||| ";
if (m_options.isTranslationFlag()) outextractstr << "||| ";
if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
// target
for(int ei=startE; ei<=endE; ei++) {
if (translationFlag) extractFile << sentence.target[ei] << " ";
if (translationFlag) extractFileInv << sentence.target[ei] << " ";
if (orientationFlag) extractFileOrientation << sentence.target[ei] << " ";
if (sentenceIdFlag) extractFileSentenceId << sentence.target[ei] << " ";
if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " ";
if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " ";
if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " ";
if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.target[ei] << " ";
}
if (translationFlag) extractFile << "|||";
if (translationFlag) extractFileInv << "||| ";
if (orientationFlag) extractFileOrientation << "||| ";
if (sentenceIdFlag) extractFileSentenceId << "||| ";
if (m_options.isTranslationFlag()) outextractstr << "|||";
if (m_options.isTranslationFlag()) outextractstrInv << "||| ";
if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
// source (for inverse)
if (translationFlag) {
for(int fi=startF; fi<=endF; fi++)
extractFileInv << sentence.source[fi] << " ";
extractFileInv << "|||";
}
if (m_options.isTranslationFlag()) {
for(int fi=startF; fi<=endF; fi++)
outextractstrInv << sentence.source[fi] << " ";
outextractstrInv << "|||";
}
// alignment
if (translationFlag) {
if (m_options.isTranslationFlag()) {
for(int ei=startE; ei<=endE; ei++) {
for(size_t i=0; i<sentence.alignedToT[ei].size(); i++) {
for(unsigned int i=0; i<sentence.alignedToT[ei].size(); i++) {
int fi = sentence.alignedToT[ei][i];
extractFile << " " << fi-startF << "-" << ei-startE;
extractFileInv << " " << ei-startE << "-" << fi-startF;
outextractstr << " " << fi-startF << "-" << ei-startE;
outextractstrInv << " " << ei-startE << "-" << fi-startF;
}
}
}
if (orientationFlag)
extractFileOrientation << orientationInfo;
if (m_options.isOrientationFlag())
outextractstrOrientation << orientationInfo;
if (sentenceIdFlag) {
extractFileSentenceId << sentence.sentenceID;
if (m_options.isSentenceIdFlag()) {
outextractstrSentenceId << sentence.sentenceID;
}
if (translationFlag) extractFile << "\n";
if (translationFlag) extractFileInv << "\n";
if (orientationFlag) extractFileOrientation << "\n";
if (sentenceIdFlag) extractFileSentenceId << "\n";
if (m_options.isTranslationFlag()) outextractstr << "\n";
if (m_options.isTranslationFlag()) outextractstrInv << "\n";
if (m_options.isOrientationFlag()) outextractstrOrientation << "\n";
if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "\n";
m_extractedPhrases.push_back(outextractstr.str());
m_extractedPhrasesInv.push_back(outextractstrInv.str());
m_extractedPhrasesOri.push_back(outextractstrOrientation.str());
m_extractedPhrasesSid.push_back(outextractstrSentenceId.str());
}
void ExtractTask::writePhrasesToFile(){
ostringstream outextractFile;
ostringstream outextractFileInv;
ostringstream outextractFileOrientation;
ostringstream outextractFileSentenceId;
for(vector<string>::const_iterator phrase=m_extractedPhrases.begin();phrase!=m_extractedPhrases.end();phrase++){
outextractFile<<phrase->data();
}
for(vector<string>::const_iterator phrase=m_extractedPhrasesInv.begin();phrase!=m_extractedPhrasesInv.end();phrase++){
outextractFileInv<<phrase->data();
}
for(vector<string>::const_iterator phrase=m_extractedPhrasesOri.begin();phrase!=m_extractedPhrasesOri.end();phrase++){
outextractFileOrientation<<phrase->data();
}
for(vector<string>::const_iterator phrase=m_extractedPhrasesSid.begin();phrase!=m_extractedPhrasesSid.end();phrase++){
outextractFileSentenceId<<phrase->data();
}
m_extractCollector->Write(m_id, outextractFile.str());
m_extractCollectorInv->Write(m_id,outextractFileInv.str());
m_extractCollectorOrientation->Write(m_id,outextractFileOrientation.str());
m_extractCollectorSentenceId->Write(m_id,outextractFileSentenceId.str());
}
// for proper conditioning, we need the number of times a source phrase occurred
void extractBase( SentenceAlignment &sentence )
void ExtractTask::extractBase( SentenceAlignment &sentence )
{
ostringstream outextractFile;
ostringstream outextractFileInv;
int countF = sentence.source.size();
for(int startF=0; startF<countF; startF++) {
for(int endF=startF;
(endF<countF && endF<startF+maxPhraseLength);
(endF<countF && endF<startF+m_options.maxPhraseLength);
endF++) {
for(int fi=startF; fi<=endF; fi++) {
extractFile << sentence.source[fi] << " ";
}
extractFile << "|||" << endl;
outextractFile << sentence.source[fi] << " ";
}
outextractFile << "|||" << endl;
}
}
int countE = sentence.target.size();
for(int startE=0; startE<countE; startE++) {
for(int endE=startE;
(endE<countE && endE<startE+maxPhraseLength);
(endE<countE && endE<startE+m_options.maxPhraseLength);
endE++) {
for(int ei=startE; ei<=endE; ei++) {
extractFileInv << sentence.target[ei] << " ";
outextractFileInv << sentence.target[ei] << " ";
}
extractFileInv << "|||" << endl;
outextractFileInv << "|||" << endl;
}
}
m_extractCollector->Write(m_id, outextractFile.str());
m_extractCollectorInv->Write(m_id,outextractFileInv.str());
}
}