mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-17 14:17:13 +03:00
multi-threaded extract program. Thanks to Rohit Gupta
This commit is contained in:
parent
b609473645
commit
7ae76dfe75
@ -10,7 +10,7 @@ obj XmlTree.o : XmlTree.cpp : <include>. ;
|
|||||||
alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : <include>. ;
|
alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : <include>. ;
|
||||||
alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : <include>. ;
|
alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : <include>. ;
|
||||||
|
|
||||||
exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
|
exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
|
||||||
|
|
||||||
exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
|
exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
|
||||||
|
|
||||||
|
146
phrase-extract/PhraseExtractionOptions.h
Normal file
146
phrase-extract/PhraseExtractionOptions.h
Normal file
@ -0,0 +1,146 @@
|
|||||||
|
/***********************************************************************
|
||||||
|
Moses - factored phrase-based language decoder
|
||||||
|
Copyright (C) 2010 University of Edinburgh
|
||||||
|
|
||||||
|
This library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
This library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with this library; if not, write to the Free Software
|
||||||
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
***********************************************************************/
|
||||||
|
|
||||||
|
/* Created by Rohit Gupta, CDAC, Mumbai, India on 18 July, 2012*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#ifndef PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
|
||||||
|
#define PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
|
||||||
|
|
||||||
|
namespace MosesTraining
{

/// Type of lexicalized reordering model. On the extract command line this is
/// the second half of `--model <name>-<type>` (msd, mslr or mono).
enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};

/// Orientation value returned by the getOrient*Model() helpers in extract.cpp.
enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};


/// Settings of one phrase-extraction run.
///
/// Bundles what used to be file-level globals in extract.cpp into a single
/// object that is handed by reference to every extraction task. All options
/// except the maximum phrase length start at their defaults and are changed
/// through the init*() setters; the is*() accessors are const so the object
/// can be read through a const reference.
///
/// NOTE(review): the setters are not synchronized. The visible caller finishes
/// configuring the object before any task is created - keep it that way.
class PhraseExtractionOptions {

public:
  /// Maximum phrase length; fixed at construction.
  const int maxPhraseLength;

private:
  bool allModelsOutputFlag;     // set once an explicit --model option was parsed
  bool wordModel;               // "wbe" reordering model requested
  REO_MODEL_TYPE wordType;
  bool phraseModel;             // "phrase" reordering model requested
  REO_MODEL_TYPE phraseType;
  bool hierModel;               // "hier" reordering model requested
  REO_MODEL_TYPE hierType;
  bool orientationFlag;         // write the orientation (".o") extract file
  bool translationFlag;         // write the extract and ".inv" files
  bool sentenceIdFlag;          // create extract file with sentence id
  bool onlyOutputSpanInfo;
  bool gzOutput;                // append ".gz" to the output file names

public:
  /// Creates an option set with the given maximum phrase length.
  /// Defaults: every flag is false except translationFlag (true), and every
  /// reordering model type is REO_MSD.
  PhraseExtractionOptions(const int initmaxPhraseLength):
    maxPhraseLength(initmaxPhraseLength),
    allModelsOutputFlag(false),
    wordModel(false),
    wordType(REO_MSD),
    phraseModel(false),
    phraseType(REO_MSD),
    hierModel(false),
    hierType(REO_MSD),
    orientationFlag(false),
    translationFlag(true),
    sentenceIdFlag(false),
    onlyOutputSpanInfo(false),
    gzOutput(false) {}

  // functions for initialization of options
  void initAllModelsOutputFlag(const bool initallModelsOutputFlag) {
    allModelsOutputFlag = initallModelsOutputFlag;
  }
  void initWordModel(const bool initwordModel) {
    wordModel = initwordModel;
  }
  void initWordType(REO_MODEL_TYPE initwordType) {
    wordType = initwordType;
  }
  void initPhraseModel(const bool initphraseModel) {
    phraseModel = initphraseModel;
  }
  void initPhraseType(REO_MODEL_TYPE initphraseType) {
    phraseType = initphraseType;
  }
  void initHierModel(const bool inithierModel) {
    hierModel = inithierModel;
  }
  void initHierType(REO_MODEL_TYPE inithierType) {
    hierType = inithierType;
  }
  void initOrientationFlag(const bool initorientationFlag) {
    orientationFlag = initorientationFlag;
  }
  void initTranslationFlag(const bool inittranslationFlag) {
    translationFlag = inittranslationFlag;
  }
  void initSentenceIdFlag(const bool initsentenceIdFlag) {
    sentenceIdFlag = initsentenceIdFlag;
  }
  void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo) {
    onlyOutputSpanInfo = initonlyOutputSpanInfo;
  }
  void initGzOutput(const bool initgzOutput) {
    gzOutput = initgzOutput;
  }

  // functions for getting values
  // (const-qualified: they only read state, so they must be usable on a
  // const object or through a const reference)
  bool isAllModelsOutputFlag() const {
    return allModelsOutputFlag;
  }
  bool isWordModel() const {
    return wordModel;
  }
  REO_MODEL_TYPE isWordType() const {
    return wordType;
  }
  bool isPhraseModel() const {
    return phraseModel;
  }
  REO_MODEL_TYPE isPhraseType() const {
    return phraseType;
  }
  bool isHierModel() const {
    return hierModel;
  }
  REO_MODEL_TYPE isHierType() const {
    return hierType;
  }
  bool isOrientationFlag() const {
    return orientationFlag;
  }
  bool isTranslationFlag() const {
    return translationFlag;
  }
  bool isSentenceIdFlag() const {
    return sentenceIdFlag;
  }
  bool isOnlyOutputSpanInfo() const {
    return onlyOutputSpanInfo;
  }
  bool isGzOutput() const {
    return gzOutput;
  }
};

}
|
||||||
|
|
||||||
|
#endif
|
@ -1,6 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* extract.cpp
|
* extract.cpp
|
||||||
*
|
* Modified by: Rohit Gupta CDAC, Mumbai, India
|
||||||
|
* on July 15, 2012 to implement parallel processing
|
||||||
* Modified by: Nadi Tomeh - LIMSI/CNRS
|
* Modified by: Nadi Tomeh - LIMSI/CNRS
|
||||||
* Machine Translation Marathon 2010, Dublin
|
* Machine Translation Marathon 2010, Dublin
|
||||||
*/
|
*/
|
||||||
@ -13,7 +14,7 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <sstream>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <set>
|
#include <set>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@ -23,14 +24,17 @@
|
|||||||
#include "tables-core.h"
|
#include "tables-core.h"
|
||||||
#include "InputFileStream.h"
|
#include "InputFileStream.h"
|
||||||
#include "OutputFileStream.h"
|
#include "OutputFileStream.h"
|
||||||
|
#include "../moses/src/ThreadPool.h"
|
||||||
|
#include "../moses/src/OutputCollector.h"
|
||||||
|
#include "PhraseExtractionOptions.h"
|
||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace MosesTraining;
|
using namespace MosesTraining;
|
||||||
|
|
||||||
#define LINE_MAX_LENGTH 500000
|
namespace MosesTraining {
|
||||||
|
|
||||||
|
|
||||||
|
const long int LINE_MAX_LENGTH = 500000 ;
|
||||||
|
|
||||||
namespace MosesTraining
|
|
||||||
{
|
|
||||||
|
|
||||||
// HPhraseVertex represents a point in the alignment matrix
|
// HPhraseVertex represents a point in the alignment matrix
|
||||||
typedef pair <int, int> HPhraseVertex;
|
typedef pair <int, int> HPhraseVertex;
|
||||||
@ -46,58 +50,65 @@ typedef vector < HPhrase > HPhraseVector;
|
|||||||
// The key of the map is the English index and the value is a set of the source ones
|
// The key of the map is the English index and the value is a set of the source ones
|
||||||
typedef map <int, set<int> > HSentenceVertices;
|
typedef map <int, set<int> > HSentenceVertices;
|
||||||
|
|
||||||
enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
|
REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
|
||||||
enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
|
|
||||||
|
|
||||||
REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
|
|
||||||
int, int, int, int, int, int, int,
|
int, int, int, int, int, int, int,
|
||||||
bool (*)(int, int), bool (*)(int, int));
|
bool (*)(int, int), bool (*)(int, int));
|
||||||
REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
|
REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
|
||||||
int, int, int, int, int, int, int,
|
int, int, int, int, int, int, int,
|
||||||
bool (*)(int, int), bool (*)(int, int),
|
bool (*)(int, int), bool (*)(int, int),
|
||||||
const HSentenceVertices &, const HSentenceVertices &);
|
const HSentenceVertices &, const HSentenceVertices &);
|
||||||
REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
|
REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
|
||||||
int, int, int, int, int, int, int,
|
int, int, int, int, int, int, int,
|
||||||
bool (*)(int, int), bool (*)(int, int),
|
bool (*)(int, int), bool (*)(int, int),
|
||||||
const HSentenceVertices &, const HSentenceVertices &,
|
const HSentenceVertices &, const HSentenceVertices &,
|
||||||
const HSentenceVertices &, const HSentenceVertices &,
|
const HSentenceVertices &, const HSentenceVertices &,
|
||||||
REO_POS);
|
REO_POS);
|
||||||
|
|
||||||
void insertVertex(HSentenceVertices &, int, int);
|
void insertVertex(HSentenceVertices &, int, int);
|
||||||
void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
|
void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
|
||||||
int, int, int, int);
|
int, int, int, int);
|
||||||
string getOrientString(REO_POS, REO_MODEL_TYPE);
|
string getOrientString(REO_POS, REO_MODEL_TYPE);
|
||||||
|
|
||||||
bool ge(int, int);
|
bool ge(int, int);
|
||||||
bool le(int, int);
|
bool le(int, int);
|
||||||
bool lt(int, int);
|
bool lt(int, int);
|
||||||
|
|
||||||
void extractBase(SentenceAlignment &);
|
bool isAligned (SentenceAlignment &, int, int);
|
||||||
void extract(SentenceAlignment &);
|
|
||||||
void addPhrase(SentenceAlignment &, int, int, int, int, string &);
|
|
||||||
bool isAligned (SentenceAlignment &, int, int);
|
|
||||||
|
|
||||||
bool allModelsOutputFlag = false;
|
|
||||||
|
|
||||||
bool wordModel = false;
|
|
||||||
REO_MODEL_TYPE wordType = REO_MSD;
|
|
||||||
bool phraseModel = false;
|
|
||||||
REO_MODEL_TYPE phraseType = REO_MSD;
|
|
||||||
bool hierModel = false;
|
|
||||||
REO_MODEL_TYPE hierType = REO_MSD;
|
|
||||||
|
|
||||||
|
|
||||||
Moses::OutputFileStream extractFile;
|
}
|
||||||
Moses::OutputFileStream extractFileInv;
|
namespace MosesTraining{
|
||||||
Moses::OutputFileStream extractFileOrientation;
|
class ExtractTask : public Moses::Task{
|
||||||
Moses::OutputFileStream extractFileSentenceId;
|
private:
|
||||||
int maxPhraseLength;
|
size_t m_id;
|
||||||
bool orientationFlag = false;
|
SentenceAlignment *m_sentence;
|
||||||
bool translationFlag = true;
|
PhraseExtractionOptions &m_options;
|
||||||
bool sentenceIdFlag = false; //create extract file with sentence id
|
Moses::OutputCollector* m_extractCollector;
|
||||||
bool onlyOutputSpanInfo = false;
|
Moses::OutputCollector* m_extractCollectorInv;
|
||||||
bool gzOutput = false;
|
Moses::OutputCollector* m_extractCollectorOrientation;
|
||||||
|
Moses::OutputCollector* m_extractCollectorSentenceId;
|
||||||
|
public:
|
||||||
|
ExtractTask(size_t id, SentenceAlignment *sentence,PhraseExtractionOptions &initoptions, Moses::OutputCollector *extractCollector, Moses::OutputCollector *extractCollectorInv,Moses::OutputCollector *extractCollectorOrientation,Moses::OutputCollector* extractCollectorSentenceId ):
|
||||||
|
m_id(id),
|
||||||
|
m_sentence(sentence),
|
||||||
|
m_options(initoptions),
|
||||||
|
m_extractCollector(extractCollector),
|
||||||
|
m_extractCollectorInv(extractCollectorInv),
|
||||||
|
m_extractCollectorOrientation(extractCollectorOrientation),
|
||||||
|
m_extractCollectorSentenceId(extractCollectorSentenceId) {}
|
||||||
|
~ExtractTask() { delete m_sentence; }
|
||||||
|
void Run();
|
||||||
|
private:
|
||||||
|
vector< string > m_extractedPhrases;
|
||||||
|
vector< string > m_extractedPhrasesInv;
|
||||||
|
vector< string > m_extractedPhrasesOri;
|
||||||
|
vector< string > m_extractedPhrasesSid;
|
||||||
|
void extractBase(SentenceAlignment &);
|
||||||
|
void extract(SentenceAlignment &);
|
||||||
|
void addPhrase(SentenceAlignment &, int, int, int, int, string &);
|
||||||
|
void writePhrasesToFile();
|
||||||
|
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char* argv[])
|
int main(int argc, char* argv[])
|
||||||
@ -105,70 +116,83 @@ int main(int argc, char* argv[])
|
|||||||
cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
|
cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
|
||||||
<< "phrase extraction from an aligned parallel corpus\n";
|
<< "phrase extraction from an aligned parallel corpus\n";
|
||||||
|
|
||||||
if (argc < 6) {
|
#ifdef WITH_THREADS
|
||||||
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n";
|
int thread_count = 1;
|
||||||
|
#endif
|
||||||
|
if (argc < 6) {
|
||||||
|
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
|
||||||
|
#ifdef WITH_THREADS
|
||||||
|
|
||||||
|
cerr<< "| --threads NUM ";
|
||||||
|
#endif
|
||||||
|
cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --SentenceId | --GZOutput ]\n";
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
char* &fileNameE = argv[1];
|
|
||||||
char* &fileNameF = argv[2];
|
Moses::OutputFileStream extractFile;
|
||||||
char* &fileNameA = argv[3];
|
Moses::OutputFileStream extractFileInv;
|
||||||
string fileNameExtract = string(argv[4]);
|
Moses::OutputFileStream extractFileOrientation;
|
||||||
maxPhraseLength = atoi(argv[5]);
|
Moses::OutputFileStream extractFileSentenceId;
|
||||||
|
const char* const &fileNameE = argv[1];
|
||||||
|
const char* const &fileNameF = argv[2];
|
||||||
|
const char* const &fileNameA = argv[3];
|
||||||
|
const string fileNameExtract = string(argv[4]);
|
||||||
|
PhraseExtractionOptions options(atoi(argv[5]));
|
||||||
|
|
||||||
for(int i=6; i<argc; i++) {
|
for(int i=6; i<argc; i++) {
|
||||||
if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
|
if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
|
||||||
onlyOutputSpanInfo = true;
|
options.initOnlyOutputSpanInfo(true);
|
||||||
} else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
|
} else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
|
||||||
orientationFlag = true;
|
options.initOrientationFlag(true);
|
||||||
} else if (strcmp(argv[i],"--NoTTable") == 0) {
|
} else if (strcmp(argv[i],"--NoTTable") == 0) {
|
||||||
translationFlag = false;
|
options.initTranslationFlag(false);
|
||||||
} else if (strcmp(argv[i], "--SentenceId") == 0) {
|
} else if (strcmp(argv[i], "--SentenceId") == 0) {
|
||||||
sentenceIdFlag = true;
|
options.initSentenceIdFlag(true);
|
||||||
} else if (strcmp(argv[i], "--GZOutput") == 0) {
|
} else if (strcmp(argv[i], "--GZOutput") == 0) {
|
||||||
gzOutput = true;
|
options.initGzOutput(true);
|
||||||
} else if(strcmp(argv[i],"--model") == 0) {
|
} else if(strcmp(argv[i],"--model") == 0) {
|
||||||
if (i+1 >= argc) {
|
if (i+1 >= argc) {
|
||||||
cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
|
cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
char* modelParams = argv[++i];
|
char* modelParams = argv[++i];
|
||||||
char* modelName = strtok(modelParams, "-");
|
char* modelName = strtok(modelParams, "-");
|
||||||
char* modelType = strtok(NULL, "-");
|
char* modelType = strtok(NULL, "-");
|
||||||
|
|
||||||
REO_MODEL_TYPE intModelType;
|
REO_MODEL_TYPE intModelType;
|
||||||
|
|
||||||
if(strcmp(modelName, "wbe") == 0) {
|
if(strcmp(modelName, "wbe") == 0) {
|
||||||
wordModel = true;
|
options.initWordModel(true);
|
||||||
if(strcmp(modelType, "msd") == 0)
|
if(strcmp(modelType, "msd") == 0)
|
||||||
wordType = REO_MSD;
|
options.initWordType(REO_MSD);
|
||||||
else if(strcmp(modelType, "mslr") == 0)
|
else if(strcmp(modelType, "mslr") == 0)
|
||||||
wordType = REO_MSLR;
|
options.initWordType(REO_MSLR);
|
||||||
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
|
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
|
||||||
wordType = REO_MONO;
|
options.initWordType(REO_MONO);
|
||||||
else {
|
else {
|
||||||
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
} else if(strcmp(modelName, "phrase") == 0) {
|
} else if(strcmp(modelName, "phrase") == 0) {
|
||||||
phraseModel = true;
|
options.initPhraseModel(true);
|
||||||
if(strcmp(modelType, "msd") == 0)
|
if(strcmp(modelType, "msd") == 0)
|
||||||
phraseType = REO_MSD;
|
options.initPhraseType(REO_MSD);
|
||||||
else if(strcmp(modelType, "mslr") == 0)
|
else if(strcmp(modelType, "mslr") == 0)
|
||||||
phraseType = REO_MSLR;
|
options.initPhraseType(REO_MSLR);
|
||||||
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
|
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
|
||||||
phraseType = REO_MONO;
|
options.initPhraseType(REO_MONO);
|
||||||
else {
|
else {
|
||||||
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
} else if(strcmp(modelName, "hier") == 0) {
|
} else if(strcmp(modelName, "hier") == 0) {
|
||||||
hierModel = true;
|
options.initHierModel(true);
|
||||||
if(strcmp(modelType, "msd") == 0)
|
if(strcmp(modelType, "msd") == 0)
|
||||||
hierType = REO_MSD;
|
options.initHierType(REO_MSD);
|
||||||
else if(strcmp(modelType, "mslr") == 0)
|
else if(strcmp(modelType, "mslr") == 0)
|
||||||
hierType = REO_MSLR;
|
options.initHierType(REO_MSLR);
|
||||||
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
|
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
|
||||||
hierType = REO_MONO;
|
options.initHierType(REO_MONO);
|
||||||
else {
|
else {
|
||||||
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
||||||
exit(1);
|
exit(1);
|
||||||
@ -178,7 +202,21 @@ int main(int argc, char* argv[])
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
allModelsOutputFlag = true;
|
options.initAllModelsOutputFlag(true);
|
||||||
|
#ifdef WITH_THREADS
|
||||||
|
}else if (strcmp(argv[i],"-threads") == 0 ||
|
||||||
|
strcmp(argv[i],"--threads") == 0 ||
|
||||||
|
strcmp(argv[i],"--Threads") == 0) {
|
||||||
|
if(argc>(i+1))thread_count = atoi(argv[++i]);
|
||||||
|
else {cerr<<"extract: syntax error, NUM is missing for --threads NUM option"<<endl;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if(thread_count==0){
|
||||||
|
cerr<<"extract: error, NUM is missing for --threads NUM option or --threads 0 is given"<<endl;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
|
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
|
||||||
exit(1);
|
exit(1);
|
||||||
@ -187,9 +225,9 @@ int main(int argc, char* argv[])
|
|||||||
|
|
||||||
// default reordering model if no model selected
|
// default reordering model if no model selected
|
||||||
// allows for the old syntax to be used
|
// allows for the old syntax to be used
|
||||||
if(orientationFlag && !allModelsOutputFlag) {
|
if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) {
|
||||||
wordModel = true;
|
options.initWordModel(true);
|
||||||
wordType = REO_MSD;
|
options.initWordType(REO_MSD);
|
||||||
}
|
}
|
||||||
|
|
||||||
// open input files
|
// open input files
|
||||||
@ -202,21 +240,32 @@ int main(int argc, char* argv[])
|
|||||||
istream *aFileP = &aFile;
|
istream *aFileP = &aFile;
|
||||||
|
|
||||||
// open output files
|
// open output files
|
||||||
if (translationFlag) {
|
if (options.isTranslationFlag()) {
|
||||||
string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
|
string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":"");
|
||||||
extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
|
extractFile.Open( (fileNameExtract + (options.isGzOutput()?".gz":"")).c_str());
|
||||||
extractFileInv.Open(fileNameExtractInv.c_str());
|
extractFileInv.Open(fileNameExtractInv.c_str());
|
||||||
}
|
}
|
||||||
if (orientationFlag) {
|
if (options.isOrientationFlag()) {
|
||||||
string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
|
string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":"");
|
||||||
extractFileOrientation.Open(fileNameExtractOrientation.c_str());
|
extractFileOrientation.Open(fileNameExtractOrientation.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sentenceIdFlag) {
|
if (options.isSentenceIdFlag()) {
|
||||||
string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
|
string fileNameExtractSentenceId = fileNameExtract + ".sid" + (options.isGzOutput()?".gz":"");
|
||||||
extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
|
extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile);//r
|
||||||
|
Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv);//r
|
||||||
|
Moses::OutputCollector* extractCollectorOrientation = new Moses::OutputCollector(&extractFileOrientation);//r
|
||||||
|
Moses::OutputCollector* extractCollectorSentenceId = new Moses::OutputCollector(&extractFileSentenceId); //r
|
||||||
|
#ifdef WITH_THREADS
|
||||||
|
// set up thread pool
|
||||||
|
Moses::ThreadPool pool(thread_count);
|
||||||
|
pool.SetQueueLimit(1000);
|
||||||
|
#endif
|
||||||
|
|
||||||
int i=0;
|
int i=0;
|
||||||
while(true) {
|
while(true) {
|
||||||
i++;
|
i++;
|
||||||
@ -228,32 +277,57 @@ int main(int argc, char* argv[])
|
|||||||
if (eFileP->eof()) break;
|
if (eFileP->eof()) break;
|
||||||
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
|
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||||
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
|
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||||
SentenceAlignment sentence;
|
SentenceAlignment *sentence=new SentenceAlignment;
|
||||||
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
|
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
|
||||||
//az: output src, tgt, and alingment line
|
//az: output src, tgt, and alingment line
|
||||||
if (onlyOutputSpanInfo) {
|
if (options.isOnlyOutputSpanInfo()) {
|
||||||
cout << "LOG: SRC: " << foreignString << endl;
|
cout << "LOG: SRC: " << foreignString << endl;
|
||||||
cout << "LOG: TGT: " << englishString << endl;
|
cout << "LOG: TGT: " << englishString << endl;
|
||||||
cout << "LOG: ALT: " << alignmentString << endl;
|
cout << "LOG: ALT: " << alignmentString << endl;
|
||||||
cout << "LOG: PHRASES_BEGIN:" << endl;
|
cout << "LOG: PHRASES_BEGIN:" << endl;
|
||||||
}
|
}
|
||||||
|
if (sentence->create( englishString, foreignString, alignmentString, i)) {
|
||||||
|
ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector , extractCollectorInv, extractCollectorOrientation, extractCollectorSentenceId);
|
||||||
|
#ifdef WITH_THREADS
|
||||||
|
if (thread_count == 1) {
|
||||||
|
task->Run();
|
||||||
|
delete task;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
pool.Submit(task);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
task->Run();
|
||||||
|
delete task;
|
||||||
|
#endif
|
||||||
|
|
||||||
if (sentence.create( englishString, foreignString, alignmentString, i)) {
|
|
||||||
extract(sentence);
|
|
||||||
}
|
}
|
||||||
if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
|
if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef WITH_THREADS
|
||||||
|
// wait for all threads to finish
|
||||||
|
pool.Stop(true);
|
||||||
|
#endif
|
||||||
|
|
||||||
eFile.Close();
|
eFile.Close();
|
||||||
fFile.Close();
|
fFile.Close();
|
||||||
aFile.Close();
|
aFile.Close();
|
||||||
|
delete extractCollector;
|
||||||
|
delete extractCollectorInv;
|
||||||
|
delete extractCollectorOrientation;
|
||||||
|
delete extractCollectorSentenceId;
|
||||||
//az: only close if we actually opened it
|
//az: only close if we actually opened it
|
||||||
if (!onlyOutputSpanInfo) {
|
if (!options.isOnlyOutputSpanInfo()) {
|
||||||
if (translationFlag) {
|
if (options.isTranslationFlag()) {
|
||||||
extractFile.Close();
|
extractFile.Close();
|
||||||
extractFileInv.Close();
|
extractFileInv.Close();
|
||||||
|
|
||||||
}
|
}
|
||||||
if (orientationFlag) extractFileOrientation.Close();
|
if (options.isOrientationFlag()){
|
||||||
if (sentenceIdFlag) {
|
extractFileOrientation.Close();
|
||||||
|
}
|
||||||
|
if (options.isSentenceIdFlag()) {
|
||||||
extractFileSentenceId.Close();
|
extractFileSentenceId.Close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -261,8 +335,17 @@ int main(int argc, char* argv[])
|
|||||||
|
|
||||||
namespace MosesTraining
|
namespace MosesTraining
|
||||||
{
|
{
|
||||||
|
void ExtractTask::Run() {
|
||||||
|
extract(*m_sentence);
|
||||||
|
writePhrasesToFile();
|
||||||
|
m_extractedPhrases.clear();
|
||||||
|
m_extractedPhrasesInv.clear();
|
||||||
|
m_extractedPhrasesOri.clear();
|
||||||
|
m_extractedPhrasesSid.clear();
|
||||||
|
|
||||||
void extract(SentenceAlignment &sentence)
|
}
|
||||||
|
|
||||||
|
void ExtractTask::extract(SentenceAlignment &sentence)
|
||||||
{
|
{
|
||||||
int countE = sentence.target.size();
|
int countE = sentence.target.size();
|
||||||
int countF = sentence.source.size();
|
int countF = sentence.source.size();
|
||||||
@ -281,14 +364,14 @@ void extract(SentenceAlignment &sentence)
|
|||||||
|
|
||||||
HSentenceVertices::const_iterator it;
|
HSentenceVertices::const_iterator it;
|
||||||
|
|
||||||
bool relaxLimit = hierModel;
|
bool relaxLimit = m_options.isHierModel();
|
||||||
bool buildExtraStructure = phraseModel || hierModel;
|
bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel();
|
||||||
|
|
||||||
// check alignments for target phrase startE...endE
|
// check alignments for target phrase startE...endE
|
||||||
// loop over extracted phrases which are compatible with the word-alignments
|
// loop over extracted phrases which are compatible with the word-alignments
|
||||||
for(int startE=0; startE<countE; startE++) {
|
for(int startE=0; startE<countE; startE++) {
|
||||||
for(int endE=startE;
|
for(int endE=startE;
|
||||||
(endE<countE && (relaxLimit || endE<startE+maxPhraseLength));
|
(endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
|
||||||
endE++) {
|
endE++) {
|
||||||
|
|
||||||
int minF = 9999;
|
int minF = 9999;
|
||||||
@ -308,7 +391,7 @@ void extract(SentenceAlignment &sentence)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (maxF >= 0 && // aligned to any source words at all
|
if (maxF >= 0 && // aligned to any source words at all
|
||||||
(relaxLimit || maxF-minF < maxPhraseLength)) { // source phrase within limits
|
(relaxLimit || maxF-minF < m_options.maxPhraseLength)) { // source phrase within limits
|
||||||
|
|
||||||
// check if source words are aligned to out of bound target words
|
// check if source words are aligned to out of bound target words
|
||||||
bool out_of_bounds = false;
|
bool out_of_bounds = false;
|
||||||
@ -323,17 +406,17 @@ void extract(SentenceAlignment &sentence)
|
|||||||
// start point of source phrase may retreat over unaligned
|
// start point of source phrase may retreat over unaligned
|
||||||
for(int startF=minF;
|
for(int startF=minF;
|
||||||
(startF>=0 &&
|
(startF>=0 &&
|
||||||
(relaxLimit || startF>maxF-maxPhraseLength) && // within length limit
|
(relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
|
||||||
(startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
|
(startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
|
||||||
startF--)
|
startF--)
|
||||||
// end point of source phrase may advance over unaligned
|
// end point of source phrase may advance over unaligned
|
||||||
for(int endF=maxF;
|
for(int endF=maxF;
|
||||||
(endF<countF &&
|
(endF<countF &&
|
||||||
(relaxLimit || endF<startF+maxPhraseLength) && // within length limit
|
(relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
|
||||||
(endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
|
(endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
|
||||||
endF++) { // at this point we have extracted a phrase
|
endF++) { // at this point we have extracted a phrase
|
||||||
if(buildExtraStructure) { // phrase || hier
|
if(buildExtraStructure) { // phrase || hier
|
||||||
if(endE-startE < maxPhraseLength && endF-startF < maxPhraseLength) { // within limit
|
if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
|
||||||
inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
|
inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
|
||||||
HPhraseVertex(endF,endE)));
|
HPhraseVertex(endF,endE)));
|
||||||
insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
|
insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
|
||||||
@ -343,16 +426,16 @@ void extract(SentenceAlignment &sentence)
|
|||||||
startF, startE, endF, endE);
|
startF, startE, endF, endE);
|
||||||
} else {
|
} else {
|
||||||
string orientationInfo = "";
|
string orientationInfo = "";
|
||||||
if(wordModel) {
|
if(m_options.isWordModel()) {
|
||||||
REO_POS wordPrevOrient, wordNextOrient;
|
REO_POS wordPrevOrient, wordNextOrient;
|
||||||
bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 );
|
bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 );
|
||||||
bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 );
|
bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 );
|
||||||
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
|
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
|
||||||
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
|
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
|
||||||
wordPrevOrient = getOrientWordModel(sentence, wordType, connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, <);
|
wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, <);
|
||||||
wordNextOrient = getOrientWordModel(sentence, wordType, connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, <, &ge);
|
wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, <, &ge);
|
||||||
orientationInfo += getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType);
|
orientationInfo += getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType());
|
||||||
if(allModelsOutputFlag)
|
if(m_options.isAllModelsOutputFlag())
|
||||||
" | | ";
|
" | | ";
|
||||||
}
|
}
|
||||||
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
|
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
|
||||||
@ -378,38 +461,38 @@ void extract(SentenceAlignment &sentence)
|
|||||||
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
|
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
|
||||||
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
|
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
|
||||||
|
|
||||||
if(wordModel) {
|
if(m_options.isWordModel()) {
|
||||||
wordPrevOrient = getOrientWordModel(sentence, wordType,
|
wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(),
|
||||||
connectedLeftTopP, connectedRightTopP,
|
connectedLeftTopP, connectedRightTopP,
|
||||||
startF, endF, startE, endE, countF, 0, 1,
|
startF, endF, startE, endE, countF, 0, 1,
|
||||||
&ge, <);
|
&ge, <);
|
||||||
wordNextOrient = getOrientWordModel(sentence, wordType,
|
wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(),
|
||||||
connectedLeftTopN, connectedRightTopN,
|
connectedLeftTopN, connectedRightTopN,
|
||||||
endF, startF, endE, startE, 0, countF, -1,
|
endF, startF, endE, startE, 0, countF, -1,
|
||||||
<, &ge);
|
<, &ge);
|
||||||
}
|
}
|
||||||
if (phraseModel) {
|
if (m_options.isPhraseModel()) {
|
||||||
phrasePrevOrient = getOrientPhraseModel(sentence, phraseType,
|
phrasePrevOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
|
||||||
connectedLeftTopP, connectedRightTopP,
|
connectedLeftTopP, connectedRightTopP,
|
||||||
startF, endF, startE, endE, countF-1, 0, 1, &ge, <, inBottomRight, inBottomLeft);
|
startF, endF, startE, endE, countF-1, 0, 1, &ge, <, inBottomRight, inBottomLeft);
|
||||||
phraseNextOrient = getOrientPhraseModel(sentence, phraseType,
|
phraseNextOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
|
||||||
connectedLeftTopN, connectedRightTopN,
|
connectedLeftTopN, connectedRightTopN,
|
||||||
endF, startF, endE, startE, 0, countF-1, -1, <, &ge, inBottomLeft, inBottomRight);
|
endF, startF, endE, startE, 0, countF-1, -1, <, &ge, inBottomLeft, inBottomRight);
|
||||||
} else {
|
} else {
|
||||||
phrasePrevOrient = phraseNextOrient = UNKNOWN;
|
phrasePrevOrient = phraseNextOrient = UNKNOWN;
|
||||||
}
|
}
|
||||||
if(hierModel) {
|
if(m_options.isHierModel()) {
|
||||||
hierPrevOrient = getOrientHierModel(sentence, hierType,
|
hierPrevOrient = getOrientHierModel(sentence, m_options.isHierType(),
|
||||||
connectedLeftTopP, connectedRightTopP,
|
connectedLeftTopP, connectedRightTopP,
|
||||||
startF, endF, startE, endE, countF-1, 0, 1, &ge, <, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
|
startF, endF, startE, endE, countF-1, 0, 1, &ge, <, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
|
||||||
hierNextOrient = getOrientHierModel(sentence, hierType,
|
hierNextOrient = getOrientHierModel(sentence, m_options.isHierType(),
|
||||||
connectedLeftTopN, connectedRightTopN,
|
connectedLeftTopN, connectedRightTopN,
|
||||||
endF, startF, endE, startE, 0, countF-1, -1, <, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
|
endF, startF, endE, startE, 0, countF-1, -1, <, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
|
||||||
}
|
}
|
||||||
|
|
||||||
orientationInfo = ((wordModel)? getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType) : "") + " | " +
|
orientationInfo = ((m_options.isWordModel())? getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType()) : "") + " | " +
|
||||||
((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " +
|
((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
|
||||||
((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : "");
|
((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
|
||||||
|
|
||||||
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
|
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
|
||||||
}
|
}
|
||||||
@ -617,94 +700,139 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
|
|||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
|
void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
|
||||||
{
|
{
|
||||||
// source
|
// source
|
||||||
// cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
|
// // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
|
||||||
|
ostringstream outextractstr;
|
||||||
|
ostringstream outextractstrInv;
|
||||||
|
ostringstream outextractstrOrientation;
|
||||||
|
ostringstream outextractstrSentenceId;
|
||||||
|
|
||||||
if (onlyOutputSpanInfo) {
|
if (m_options.isOnlyOutputSpanInfo()) {
|
||||||
cout << startF << " " << endF << " " << startE << " " << endE << endl;
|
cout << startF << " " << endF << " " << startE << " " << endE << endl;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int fi=startF; fi<=endF; fi++) {
|
for(int fi=startF; fi<=endF; fi++) {
|
||||||
if (translationFlag) extractFile << sentence.source[fi] << " ";
|
if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
|
||||||
if (orientationFlag) extractFileOrientation << sentence.source[fi] << " ";
|
if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
|
||||||
if (sentenceIdFlag) extractFileSentenceId << sentence.source[fi] << " ";
|
if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.source[fi] << " ";
|
||||||
}
|
}
|
||||||
if (translationFlag) extractFile << "||| ";
|
if (m_options.isTranslationFlag()) outextractstr << "||| ";
|
||||||
if (orientationFlag) extractFileOrientation << "||| ";
|
if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
|
||||||
if (sentenceIdFlag) extractFileSentenceId << "||| ";
|
if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
|
||||||
|
|
||||||
// target
|
// target
|
||||||
for(int ei=startE; ei<=endE; ei++) {
|
for(int ei=startE; ei<=endE; ei++) {
|
||||||
if (translationFlag) extractFile << sentence.target[ei] << " ";
|
if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " ";
|
||||||
if (translationFlag) extractFileInv << sentence.target[ei] << " ";
|
if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " ";
|
||||||
if (orientationFlag) extractFileOrientation << sentence.target[ei] << " ";
|
if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " ";
|
||||||
if (sentenceIdFlag) extractFileSentenceId << sentence.target[ei] << " ";
|
if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.target[ei] << " ";
|
||||||
}
|
}
|
||||||
if (translationFlag) extractFile << "|||";
|
if (m_options.isTranslationFlag()) outextractstr << "|||";
|
||||||
if (translationFlag) extractFileInv << "||| ";
|
if (m_options.isTranslationFlag()) outextractstrInv << "||| ";
|
||||||
if (orientationFlag) extractFileOrientation << "||| ";
|
if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
|
||||||
if (sentenceIdFlag) extractFileSentenceId << "||| ";
|
if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
|
||||||
|
|
||||||
// source (for inverse)
|
// source (for inverse)
|
||||||
if (translationFlag) {
|
|
||||||
for(int fi=startF; fi<=endF; fi++)
|
|
||||||
extractFileInv << sentence.source[fi] << " ";
|
|
||||||
extractFileInv << "|||";
|
|
||||||
}
|
|
||||||
|
|
||||||
|
if (m_options.isTranslationFlag()) {
|
||||||
|
for(int fi=startF; fi<=endF; fi++)
|
||||||
|
outextractstrInv << sentence.source[fi] << " ";
|
||||||
|
outextractstrInv << "|||";
|
||||||
|
}
|
||||||
// alignment
|
// alignment
|
||||||
if (translationFlag) {
|
if (m_options.isTranslationFlag()) {
|
||||||
for(int ei=startE; ei<=endE; ei++) {
|
for(int ei=startE; ei<=endE; ei++) {
|
||||||
for(size_t i=0; i<sentence.alignedToT[ei].size(); i++) {
|
for(unsigned int i=0; i<sentence.alignedToT[ei].size(); i++) {
|
||||||
int fi = sentence.alignedToT[ei][i];
|
int fi = sentence.alignedToT[ei][i];
|
||||||
extractFile << " " << fi-startF << "-" << ei-startE;
|
outextractstr << " " << fi-startF << "-" << ei-startE;
|
||||||
extractFileInv << " " << ei-startE << "-" << fi-startF;
|
outextractstrInv << " " << ei-startE << "-" << fi-startF;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (orientationFlag)
|
if (m_options.isOrientationFlag())
|
||||||
extractFileOrientation << orientationInfo;
|
outextractstrOrientation << orientationInfo;
|
||||||
|
|
||||||
if (sentenceIdFlag) {
|
if (m_options.isSentenceIdFlag()) {
|
||||||
extractFileSentenceId << sentence.sentenceID;
|
outextractstrSentenceId << sentence.sentenceID;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (translationFlag) extractFile << "\n";
|
|
||||||
if (translationFlag) extractFileInv << "\n";
|
if (m_options.isTranslationFlag()) outextractstr << "\n";
|
||||||
if (orientationFlag) extractFileOrientation << "\n";
|
if (m_options.isTranslationFlag()) outextractstrInv << "\n";
|
||||||
if (sentenceIdFlag) extractFileSentenceId << "\n";
|
if (m_options.isOrientationFlag()) outextractstrOrientation << "\n";
|
||||||
|
if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "\n";
|
||||||
|
|
||||||
|
|
||||||
|
m_extractedPhrases.push_back(outextractstr.str());
|
||||||
|
m_extractedPhrasesInv.push_back(outextractstrInv.str());
|
||||||
|
m_extractedPhrasesOri.push_back(outextractstrOrientation.str());
|
||||||
|
m_extractedPhrasesSid.push_back(outextractstrSentenceId.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void ExtractTask::writePhrasesToFile(){
|
||||||
|
|
||||||
|
ostringstream outextractFile;
|
||||||
|
ostringstream outextractFileInv;
|
||||||
|
ostringstream outextractFileOrientation;
|
||||||
|
ostringstream outextractFileSentenceId;
|
||||||
|
|
||||||
|
for(vector<string>::const_iterator phrase=m_extractedPhrases.begin();phrase!=m_extractedPhrases.end();phrase++){
|
||||||
|
outextractFile<<phrase->data();
|
||||||
|
}
|
||||||
|
for(vector<string>::const_iterator phrase=m_extractedPhrasesInv.begin();phrase!=m_extractedPhrasesInv.end();phrase++){
|
||||||
|
outextractFileInv<<phrase->data();
|
||||||
|
}
|
||||||
|
for(vector<string>::const_iterator phrase=m_extractedPhrasesOri.begin();phrase!=m_extractedPhrasesOri.end();phrase++){
|
||||||
|
outextractFileOrientation<<phrase->data();
|
||||||
|
}
|
||||||
|
for(vector<string>::const_iterator phrase=m_extractedPhrasesSid.begin();phrase!=m_extractedPhrasesSid.end();phrase++){
|
||||||
|
outextractFileSentenceId<<phrase->data();
|
||||||
|
}
|
||||||
|
|
||||||
|
m_extractCollector->Write(m_id, outextractFile.str());
|
||||||
|
m_extractCollectorInv->Write(m_id,outextractFileInv.str());
|
||||||
|
m_extractCollectorOrientation->Write(m_id,outextractFileOrientation.str());
|
||||||
|
m_extractCollectorSentenceId->Write(m_id,outextractFileSentenceId.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// if proper conditioning, we need the number of times a source phrase occured
|
// if proper conditioning, we need the number of times a source phrase occured
|
||||||
void extractBase( SentenceAlignment &sentence )
|
|
||||||
|
void ExtractTask::extractBase( SentenceAlignment &sentence )
|
||||||
{
|
{
|
||||||
|
ostringstream outextractFile;
|
||||||
|
ostringstream outextractFileInv;
|
||||||
|
|
||||||
int countF = sentence.source.size();
|
int countF = sentence.source.size();
|
||||||
for(int startF=0; startF<countF; startF++) {
|
for(int startF=0; startF<countF; startF++) {
|
||||||
for(int endF=startF;
|
for(int endF=startF;
|
||||||
(endF<countF && endF<startF+maxPhraseLength);
|
(endF<countF && endF<startF+m_options.maxPhraseLength);
|
||||||
endF++) {
|
endF++) {
|
||||||
for(int fi=startF; fi<=endF; fi++) {
|
for(int fi=startF; fi<=endF; fi++) {
|
||||||
extractFile << sentence.source[fi] << " ";
|
outextractFile << sentence.source[fi] << " ";
|
||||||
}
|
}
|
||||||
extractFile << "|||" << endl;
|
outextractFile << "|||" << endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int countE = sentence.target.size();
|
int countE = sentence.target.size();
|
||||||
for(int startE=0; startE<countE; startE++) {
|
for(int startE=0; startE<countE; startE++) {
|
||||||
for(int endE=startE;
|
for(int endE=startE;
|
||||||
(endE<countE && endE<startE+maxPhraseLength);
|
(endE<countE && endE<startE+m_options.maxPhraseLength);
|
||||||
endE++) {
|
endE++) {
|
||||||
for(int ei=startE; ei<=endE; ei++) {
|
for(int ei=startE; ei<=endE; ei++) {
|
||||||
extractFileInv << sentence.target[ei] << " ";
|
outextractFileInv << sentence.target[ei] << " ";
|
||||||
}
|
}
|
||||||
extractFileInv << "|||" << endl;
|
outextractFileInv << "|||" << endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
m_extractCollector->Write(m_id, outextractFile.str());
|
||||||
|
m_extractCollectorInv->Write(m_id,outextractFileInv.str());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user