mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-17 14:17:13 +03:00
multi-threaded extract program. Thanks to Rohit Gupta
This commit is contained in:
parent
b609473645
commit
7ae76dfe75
@ -10,7 +10,7 @@ obj XmlTree.o : XmlTree.cpp : <include>. ;
|
||||
alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : <include>. ;
|
||||
alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : <include>. ;
|
||||
|
||||
exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
|
||||
exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
|
||||
|
||||
exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
|
||||
|
||||
|
146
phrase-extract/PhraseExtractionOptions.h
Normal file
146
phrase-extract/PhraseExtractionOptions.h
Normal file
@ -0,0 +1,146 @@
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2010 University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
/* Created by Rohit Gupta, CDAC, Mumbai, India on 18 July, 2012*/
|
||||
|
||||
#pragma once
|
||||
#ifndef PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
|
||||
#define PHRASEEXTRACTIONOPTIONS_H_INCLUDED_
|
||||
|
||||
namespace MosesTraining
{

/// Reordering model variants. The extract tool's --model option picks one
/// through its "msd" / "mslr" / "mono" model-type suffix.
enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};

/// Orientation values produced by the reordering (orientation) models.
enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};

/**
 * Bundle of phrase-extraction settings.
 *
 * The maximum phrase length is fixed at construction. Every other option
 * starts at the default given in the constructor and can be changed with the
 * matching init*() setter. All is*() accessors are const-qualified, so the
 * options can be read through a const object or a const reference.
 */
class PhraseExtractionOptions {

public:
  /// Maximum phrase length; immutable after construction.
  const int maxPhraseLength;

private:
  bool allModelsOutputFlag;
  bool wordModel;
  REO_MODEL_TYPE wordType;
  bool phraseModel;
  REO_MODEL_TYPE phraseType;
  bool hierModel;
  REO_MODEL_TYPE hierType;
  bool orientationFlag;
  bool translationFlag;
  bool sentenceIdFlag; // create extract file with sentence id
  bool onlyOutputSpanInfo;
  bool gzOutput;

public:
  /**
   * Creates an option set with the given maximum phrase length.
   *
   * Defaults: every flag is false except translationFlag (true); every
   * reordering model type is REO_MSD.
   *
   * @param initmaxPhraseLength value stored in maxPhraseLength
   */
  PhraseExtractionOptions(const int initmaxPhraseLength):
    maxPhraseLength(initmaxPhraseLength),
    allModelsOutputFlag(false),
    wordModel(false),
    wordType(REO_MSD),
    phraseModel(false),
    phraseType(REO_MSD),
    hierModel(false),
    hierType(REO_MSD),
    orientationFlag(false),
    translationFlag(true),
    sentenceIdFlag(false),
    onlyOutputSpanInfo(false),
    gzOutput(false) {}

  // Setters: each one overwrites exactly one option.
  void initAllModelsOutputFlag(const bool initallModelsOutputFlag) {
    allModelsOutputFlag = initallModelsOutputFlag;
  }
  void initWordModel(const bool initwordModel) {
    wordModel = initwordModel;
  }
  void initWordType(REO_MODEL_TYPE initwordType) {
    wordType = initwordType;
  }
  void initPhraseModel(const bool initphraseModel) {
    phraseModel = initphraseModel;
  }
  void initPhraseType(REO_MODEL_TYPE initphraseType) {
    phraseType = initphraseType;
  }
  void initHierModel(const bool inithierModel) {
    hierModel = inithierModel;
  }
  void initHierType(REO_MODEL_TYPE inithierType) {
    hierType = inithierType;
  }
  void initOrientationFlag(const bool initorientationFlag) {
    orientationFlag = initorientationFlag;
  }
  void initTranslationFlag(const bool inittranslationFlag) {
    translationFlag = inittranslationFlag;
  }
  void initSentenceIdFlag(const bool initsentenceIdFlag) {
    sentenceIdFlag = initsentenceIdFlag;
  }
  void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo) {
    onlyOutputSpanInfo = initonlyOutputSpanInfo;
  }
  void initGzOutput(const bool initgzOutput) {
    gzOutput = initgzOutput;
  }

  // Accessors: read-only, hence const. Names keep the historical "is"
  // prefix even where the returned value is a model type rather than a bool.
  bool isAllModelsOutputFlag() const {
    return allModelsOutputFlag;
  }
  bool isWordModel() const {
    return wordModel;
  }
  REO_MODEL_TYPE isWordType() const {
    return wordType;
  }
  bool isPhraseModel() const {
    return phraseModel;
  }
  REO_MODEL_TYPE isPhraseType() const {
    return phraseType;
  }
  bool isHierModel() const {
    return hierModel;
  }
  REO_MODEL_TYPE isHierType() const {
    return hierType;
  }
  bool isOrientationFlag() const {
    return orientationFlag;
  }
  bool isTranslationFlag() const {
    return translationFlag;
  }
  bool isSentenceIdFlag() const {
    return sentenceIdFlag;
  }
  bool isOnlyOutputSpanInfo() const {
    return onlyOutputSpanInfo;
  }
  bool isGzOutput() const {
    return gzOutput;
  }
};

}
|
||||
|
||||
#endif
|
@ -1,6 +1,7 @@
|
||||
/*
|
||||
* extract.cpp
|
||||
*
|
||||
* Modified by: Rohit Gupta CDAC, Mumbai, India
|
||||
* on July 15, 2012 to implement parallel processing
|
||||
* Modified by: Nadi Tomeh - LIMSI/CNRS
|
||||
* Machine Translation Marathon 2010, Dublin
|
||||
*/
|
||||
@ -13,7 +14,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <cstring>
|
||||
|
||||
#include <sstream>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
@ -23,14 +24,17 @@
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "OutputFileStream.h"
|
||||
|
||||
#include "../moses/src/ThreadPool.h"
|
||||
#include "../moses/src/OutputCollector.h"
|
||||
#include "PhraseExtractionOptions.h"
|
||||
using namespace std;
|
||||
using namespace MosesTraining;
|
||||
|
||||
#define LINE_MAX_LENGTH 500000
|
||||
namespace MosesTraining {
|
||||
|
||||
|
||||
const long int LINE_MAX_LENGTH = 500000 ;
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
// HPhraseVertex represents a point in the alignment matrix
|
||||
typedef pair <int, int> HPhraseVertex;
|
||||
@ -46,9 +50,6 @@ typedef vector < HPhrase > HPhraseVector;
|
||||
// The key of the map is the English index and the value is a set of the source ones
|
||||
typedef map <int, set<int> > HSentenceVertices;
|
||||
|
||||
enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
|
||||
enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
|
||||
|
||||
REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
|
||||
int, int, int, int, int, int, int,
|
||||
bool (*)(int, int), bool (*)(int, int));
|
||||
@ -72,32 +73,42 @@ bool ge(int, int);
|
||||
bool le(int, int);
|
||||
bool lt(int, int);
|
||||
|
||||
bool isAligned (SentenceAlignment &, int, int);
|
||||
|
||||
|
||||
}
|
||||
namespace MosesTraining{
|
||||
class ExtractTask : public Moses::Task{
|
||||
private:
|
||||
size_t m_id;
|
||||
SentenceAlignment *m_sentence;
|
||||
PhraseExtractionOptions &m_options;
|
||||
Moses::OutputCollector* m_extractCollector;
|
||||
Moses::OutputCollector* m_extractCollectorInv;
|
||||
Moses::OutputCollector* m_extractCollectorOrientation;
|
||||
Moses::OutputCollector* m_extractCollectorSentenceId;
|
||||
public:
|
||||
ExtractTask(size_t id, SentenceAlignment *sentence,PhraseExtractionOptions &initoptions, Moses::OutputCollector *extractCollector, Moses::OutputCollector *extractCollectorInv,Moses::OutputCollector *extractCollectorOrientation,Moses::OutputCollector* extractCollectorSentenceId ):
|
||||
m_id(id),
|
||||
m_sentence(sentence),
|
||||
m_options(initoptions),
|
||||
m_extractCollector(extractCollector),
|
||||
m_extractCollectorInv(extractCollectorInv),
|
||||
m_extractCollectorOrientation(extractCollectorOrientation),
|
||||
m_extractCollectorSentenceId(extractCollectorSentenceId) {}
|
||||
~ExtractTask() { delete m_sentence; }
|
||||
void Run();
|
||||
private:
|
||||
vector< string > m_extractedPhrases;
|
||||
vector< string > m_extractedPhrasesInv;
|
||||
vector< string > m_extractedPhrasesOri;
|
||||
vector< string > m_extractedPhrasesSid;
|
||||
void extractBase(SentenceAlignment &);
|
||||
void extract(SentenceAlignment &);
|
||||
void addPhrase(SentenceAlignment &, int, int, int, int, string &);
|
||||
bool isAligned (SentenceAlignment &, int, int);
|
||||
|
||||
bool allModelsOutputFlag = false;
|
||||
|
||||
bool wordModel = false;
|
||||
REO_MODEL_TYPE wordType = REO_MSD;
|
||||
bool phraseModel = false;
|
||||
REO_MODEL_TYPE phraseType = REO_MSD;
|
||||
bool hierModel = false;
|
||||
REO_MODEL_TYPE hierType = REO_MSD;
|
||||
|
||||
|
||||
Moses::OutputFileStream extractFile;
|
||||
Moses::OutputFileStream extractFileInv;
|
||||
Moses::OutputFileStream extractFileOrientation;
|
||||
Moses::OutputFileStream extractFileSentenceId;
|
||||
int maxPhraseLength;
|
||||
bool orientationFlag = false;
|
||||
bool translationFlag = true;
|
||||
bool sentenceIdFlag = false; //create extract file with sentence id
|
||||
bool onlyOutputSpanInfo = false;
|
||||
bool gzOutput = false;
|
||||
void writePhrasesToFile();
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
@ -105,27 +116,40 @@ int main(int argc, char* argv[])
|
||||
cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
|
||||
<< "phrase extraction from an aligned parallel corpus\n";
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
int thread_count = 1;
|
||||
#endif
|
||||
if (argc < 6) {
|
||||
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n";
|
||||
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
|
||||
#ifdef WITH_THREADS
|
||||
|
||||
cerr<< "| --threads NUM ";
|
||||
#endif
|
||||
cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --SentenceId | --GZOutput ]\n";
|
||||
exit(1);
|
||||
}
|
||||
char* &fileNameE = argv[1];
|
||||
char* &fileNameF = argv[2];
|
||||
char* &fileNameA = argv[3];
|
||||
string fileNameExtract = string(argv[4]);
|
||||
maxPhraseLength = atoi(argv[5]);
|
||||
|
||||
Moses::OutputFileStream extractFile;
|
||||
Moses::OutputFileStream extractFileInv;
|
||||
Moses::OutputFileStream extractFileOrientation;
|
||||
Moses::OutputFileStream extractFileSentenceId;
|
||||
const char* const &fileNameE = argv[1];
|
||||
const char* const &fileNameF = argv[2];
|
||||
const char* const &fileNameA = argv[3];
|
||||
const string fileNameExtract = string(argv[4]);
|
||||
PhraseExtractionOptions options(atoi(argv[5]));
|
||||
|
||||
for(int i=6; i<argc; i++) {
|
||||
if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
|
||||
onlyOutputSpanInfo = true;
|
||||
options.initOnlyOutputSpanInfo(true);
|
||||
} else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
|
||||
orientationFlag = true;
|
||||
options.initOrientationFlag(true);
|
||||
} else if (strcmp(argv[i],"--NoTTable") == 0) {
|
||||
translationFlag = false;
|
||||
options.initTranslationFlag(false);
|
||||
} else if (strcmp(argv[i], "--SentenceId") == 0) {
|
||||
sentenceIdFlag = true;
|
||||
options.initSentenceIdFlag(true);
|
||||
} else if (strcmp(argv[i], "--GZOutput") == 0) {
|
||||
gzOutput = true;
|
||||
options.initGzOutput(true);
|
||||
} else if(strcmp(argv[i],"--model") == 0) {
|
||||
if (i+1 >= argc) {
|
||||
cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
|
||||
@ -138,37 +162,37 @@ int main(int argc, char* argv[])
|
||||
REO_MODEL_TYPE intModelType;
|
||||
|
||||
if(strcmp(modelName, "wbe") == 0) {
|
||||
wordModel = true;
|
||||
options.initWordModel(true);
|
||||
if(strcmp(modelType, "msd") == 0)
|
||||
wordType = REO_MSD;
|
||||
options.initWordType(REO_MSD);
|
||||
else if(strcmp(modelType, "mslr") == 0)
|
||||
wordType = REO_MSLR;
|
||||
options.initWordType(REO_MSLR);
|
||||
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
|
||||
wordType = REO_MONO;
|
||||
options.initWordType(REO_MONO);
|
||||
else {
|
||||
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
||||
exit(1);
|
||||
}
|
||||
} else if(strcmp(modelName, "phrase") == 0) {
|
||||
phraseModel = true;
|
||||
options.initPhraseModel(true);
|
||||
if(strcmp(modelType, "msd") == 0)
|
||||
phraseType = REO_MSD;
|
||||
options.initPhraseType(REO_MSD);
|
||||
else if(strcmp(modelType, "mslr") == 0)
|
||||
phraseType = REO_MSLR;
|
||||
options.initPhraseType(REO_MSLR);
|
||||
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
|
||||
phraseType = REO_MONO;
|
||||
options.initPhraseType(REO_MONO);
|
||||
else {
|
||||
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
||||
exit(1);
|
||||
}
|
||||
} else if(strcmp(modelName, "hier") == 0) {
|
||||
hierModel = true;
|
||||
options.initHierModel(true);
|
||||
if(strcmp(modelType, "msd") == 0)
|
||||
hierType = REO_MSD;
|
||||
options.initHierType(REO_MSD);
|
||||
else if(strcmp(modelType, "mslr") == 0)
|
||||
hierType = REO_MSLR;
|
||||
options.initHierType(REO_MSLR);
|
||||
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
|
||||
hierType = REO_MONO;
|
||||
options.initHierType(REO_MONO);
|
||||
else {
|
||||
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
||||
exit(1);
|
||||
@ -178,7 +202,21 @@ int main(int argc, char* argv[])
|
||||
exit(1);
|
||||
}
|
||||
|
||||
allModelsOutputFlag = true;
|
||||
options.initAllModelsOutputFlag(true);
|
||||
#ifdef WITH_THREADS
|
||||
}else if (strcmp(argv[i],"-threads") == 0 ||
|
||||
strcmp(argv[i],"--threads") == 0 ||
|
||||
strcmp(argv[i],"--Threads") == 0) {
|
||||
if(argc>(i+1))thread_count = atoi(argv[++i]);
|
||||
else {cerr<<"extract: syntax error, NUM is missing for --threads NUM option"<<endl;
|
||||
exit(1);
|
||||
}
|
||||
if(thread_count==0){
|
||||
cerr<<"extract: error, NUM is missing for --threads NUM option or --threads 0 is given"<<endl;
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
|
||||
} else {
|
||||
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
|
||||
exit(1);
|
||||
@ -187,9 +225,9 @@ int main(int argc, char* argv[])
|
||||
|
||||
// default reordering model if no model selected
|
||||
// allows for the old syntax to be used
|
||||
if(orientationFlag && !allModelsOutputFlag) {
|
||||
wordModel = true;
|
||||
wordType = REO_MSD;
|
||||
if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) {
|
||||
options.initWordModel(true);
|
||||
options.initWordType(REO_MSD);
|
||||
}
|
||||
|
||||
// open input files
|
||||
@ -202,21 +240,32 @@ int main(int argc, char* argv[])
|
||||
istream *aFileP = &aFile;
|
||||
|
||||
// open output files
|
||||
if (translationFlag) {
|
||||
string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
|
||||
extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
|
||||
if (options.isTranslationFlag()) {
|
||||
string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":"");
|
||||
extractFile.Open( (fileNameExtract + (options.isGzOutput()?".gz":"")).c_str());
|
||||
extractFileInv.Open(fileNameExtractInv.c_str());
|
||||
}
|
||||
if (orientationFlag) {
|
||||
string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
|
||||
if (options.isOrientationFlag()) {
|
||||
string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":"");
|
||||
extractFileOrientation.Open(fileNameExtractOrientation.c_str());
|
||||
}
|
||||
|
||||
if (sentenceIdFlag) {
|
||||
string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
|
||||
if (options.isSentenceIdFlag()) {
|
||||
string fileNameExtractSentenceId = fileNameExtract + ".sid" + (options.isGzOutput()?".gz":"");
|
||||
extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
|
||||
}
|
||||
|
||||
|
||||
Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile);//r
|
||||
Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv);//r
|
||||
Moses::OutputCollector* extractCollectorOrientation = new Moses::OutputCollector(&extractFileOrientation);//r
|
||||
Moses::OutputCollector* extractCollectorSentenceId = new Moses::OutputCollector(&extractFileSentenceId); //r
|
||||
#ifdef WITH_THREADS
|
||||
// set up thread pool
|
||||
Moses::ThreadPool pool(thread_count);
|
||||
pool.SetQueueLimit(1000);
|
||||
#endif
|
||||
|
||||
int i=0;
|
||||
while(true) {
|
||||
i++;
|
||||
@ -228,32 +277,57 @@ int main(int argc, char* argv[])
|
||||
if (eFileP->eof()) break;
|
||||
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
SentenceAlignment sentence;
|
||||
SentenceAlignment *sentence=new SentenceAlignment;
|
||||
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
|
||||
//az: output src, tgt, and alignment line
|
||||
if (onlyOutputSpanInfo) {
|
||||
if (options.isOnlyOutputSpanInfo()) {
|
||||
cout << "LOG: SRC: " << foreignString << endl;
|
||||
cout << "LOG: TGT: " << englishString << endl;
|
||||
cout << "LOG: ALT: " << alignmentString << endl;
|
||||
cout << "LOG: PHRASES_BEGIN:" << endl;
|
||||
}
|
||||
if (sentence->create( englishString, foreignString, alignmentString, i)) {
|
||||
ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector , extractCollectorInv, extractCollectorOrientation, extractCollectorSentenceId);
|
||||
#ifdef WITH_THREADS
|
||||
if (thread_count == 1) {
|
||||
task->Run();
|
||||
delete task;
|
||||
}
|
||||
else {
|
||||
pool.Submit(task);
|
||||
}
|
||||
#else
|
||||
task->Run();
|
||||
delete task;
|
||||
#endif
|
||||
|
||||
if (sentence.create( englishString, foreignString, alignmentString, i)) {
|
||||
extract(sentence);
|
||||
}
|
||||
if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
|
||||
if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
|
||||
}
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
// wait for all threads to finish
|
||||
pool.Stop(true);
|
||||
#endif
|
||||
|
||||
eFile.Close();
|
||||
fFile.Close();
|
||||
aFile.Close();
|
||||
delete extractCollector;
|
||||
delete extractCollectorInv;
|
||||
delete extractCollectorOrientation;
|
||||
delete extractCollectorSentenceId;
|
||||
//az: only close if we actually opened it
|
||||
if (!onlyOutputSpanInfo) {
|
||||
if (translationFlag) {
|
||||
if (!options.isOnlyOutputSpanInfo()) {
|
||||
if (options.isTranslationFlag()) {
|
||||
extractFile.Close();
|
||||
extractFileInv.Close();
|
||||
|
||||
}
|
||||
if (orientationFlag) extractFileOrientation.Close();
|
||||
if (sentenceIdFlag) {
|
||||
if (options.isOrientationFlag()){
|
||||
extractFileOrientation.Close();
|
||||
}
|
||||
if (options.isSentenceIdFlag()) {
|
||||
extractFileSentenceId.Close();
|
||||
}
|
||||
}
|
||||
@ -261,8 +335,17 @@ int main(int argc, char* argv[])
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
void ExtractTask::Run() {
|
||||
extract(*m_sentence);
|
||||
writePhrasesToFile();
|
||||
m_extractedPhrases.clear();
|
||||
m_extractedPhrasesInv.clear();
|
||||
m_extractedPhrasesOri.clear();
|
||||
m_extractedPhrasesSid.clear();
|
||||
|
||||
void extract(SentenceAlignment &sentence)
|
||||
}
|
||||
|
||||
void ExtractTask::extract(SentenceAlignment &sentence)
|
||||
{
|
||||
int countE = sentence.target.size();
|
||||
int countF = sentence.source.size();
|
||||
@ -281,14 +364,14 @@ void extract(SentenceAlignment &sentence)
|
||||
|
||||
HSentenceVertices::const_iterator it;
|
||||
|
||||
bool relaxLimit = hierModel;
|
||||
bool buildExtraStructure = phraseModel || hierModel;
|
||||
bool relaxLimit = m_options.isHierModel();
|
||||
bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel();
|
||||
|
||||
// check alignments for target phrase startE...endE
|
||||
// loop over extracted phrases which are compatible with the word-alignments
|
||||
for(int startE=0; startE<countE; startE++) {
|
||||
for(int endE=startE;
|
||||
(endE<countE && (relaxLimit || endE<startE+maxPhraseLength));
|
||||
(endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
|
||||
endE++) {
|
||||
|
||||
int minF = 9999;
|
||||
@ -308,7 +391,7 @@ void extract(SentenceAlignment &sentence)
|
||||
}
|
||||
|
||||
if (maxF >= 0 && // aligned to any source words at all
|
||||
(relaxLimit || maxF-minF < maxPhraseLength)) { // source phrase within limits
|
||||
(relaxLimit || maxF-minF < m_options.maxPhraseLength)) { // source phrase within limits
|
||||
|
||||
// check if source words are aligned to out of bound target words
|
||||
bool out_of_bounds = false;
|
||||
@ -323,17 +406,17 @@ void extract(SentenceAlignment &sentence)
|
||||
// start point of source phrase may retreat over unaligned
|
||||
for(int startF=minF;
|
||||
(startF>=0 &&
|
||||
(relaxLimit || startF>maxF-maxPhraseLength) && // within length limit
|
||||
(relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
|
||||
(startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
|
||||
startF--)
|
||||
// end point of source phrase may advance over unaligned
|
||||
for(int endF=maxF;
|
||||
(endF<countF &&
|
||||
(relaxLimit || endF<startF+maxPhraseLength) && // within length limit
|
||||
(relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
|
||||
(endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
|
||||
endF++) { // at this point we have extracted a phrase
|
||||
if(buildExtraStructure) { // phrase || hier
|
||||
if(endE-startE < maxPhraseLength && endF-startF < maxPhraseLength) { // within limit
|
||||
if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
|
||||
inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
|
||||
HPhraseVertex(endF,endE)));
|
||||
insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
|
||||
@ -343,16 +426,16 @@ void extract(SentenceAlignment &sentence)
|
||||
startF, startE, endF, endE);
|
||||
} else {
|
||||
string orientationInfo = "";
|
||||
if(wordModel) {
|
||||
if(m_options.isWordModel()) {
|
||||
REO_POS wordPrevOrient, wordNextOrient;
|
||||
bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 );
|
||||
bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 );
|
||||
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
|
||||
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
|
||||
wordPrevOrient = getOrientWordModel(sentence, wordType, connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, <);
|
||||
wordNextOrient = getOrientWordModel(sentence, wordType, connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, <, &ge);
|
||||
orientationInfo += getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType);
|
||||
if(allModelsOutputFlag)
|
||||
wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, <);
|
||||
wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, <, &ge);
|
||||
orientationInfo += getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType());
|
||||
if(m_options.isAllModelsOutputFlag())
|
||||
" | | ";
|
||||
}
|
||||
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
|
||||
@ -378,38 +461,38 @@ void extract(SentenceAlignment &sentence)
|
||||
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
|
||||
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
|
||||
|
||||
if(wordModel) {
|
||||
wordPrevOrient = getOrientWordModel(sentence, wordType,
|
||||
if(m_options.isWordModel()) {
|
||||
wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(),
|
||||
connectedLeftTopP, connectedRightTopP,
|
||||
startF, endF, startE, endE, countF, 0, 1,
|
||||
&ge, <);
|
||||
wordNextOrient = getOrientWordModel(sentence, wordType,
|
||||
wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(),
|
||||
connectedLeftTopN, connectedRightTopN,
|
||||
endF, startF, endE, startE, 0, countF, -1,
|
||||
<, &ge);
|
||||
}
|
||||
if (phraseModel) {
|
||||
phrasePrevOrient = getOrientPhraseModel(sentence, phraseType,
|
||||
if (m_options.isPhraseModel()) {
|
||||
phrasePrevOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
|
||||
connectedLeftTopP, connectedRightTopP,
|
||||
startF, endF, startE, endE, countF-1, 0, 1, &ge, <, inBottomRight, inBottomLeft);
|
||||
phraseNextOrient = getOrientPhraseModel(sentence, phraseType,
|
||||
phraseNextOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
|
||||
connectedLeftTopN, connectedRightTopN,
|
||||
endF, startF, endE, startE, 0, countF-1, -1, <, &ge, inBottomLeft, inBottomRight);
|
||||
} else {
|
||||
phrasePrevOrient = phraseNextOrient = UNKNOWN;
|
||||
}
|
||||
if(hierModel) {
|
||||
hierPrevOrient = getOrientHierModel(sentence, hierType,
|
||||
if(m_options.isHierModel()) {
|
||||
hierPrevOrient = getOrientHierModel(sentence, m_options.isHierType(),
|
||||
connectedLeftTopP, connectedRightTopP,
|
||||
startF, endF, startE, endE, countF-1, 0, 1, &ge, <, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
|
||||
hierNextOrient = getOrientHierModel(sentence, hierType,
|
||||
hierNextOrient = getOrientHierModel(sentence, m_options.isHierType(),
|
||||
connectedLeftTopN, connectedRightTopN,
|
||||
endF, startF, endE, startE, 0, countF-1, -1, <, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
|
||||
}
|
||||
|
||||
orientationInfo = ((wordModel)? getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType) : "") + " | " +
|
||||
((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " +
|
||||
((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : "");
|
||||
orientationInfo = ((m_options.isWordModel())? getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType()) : "") + " | " +
|
||||
((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
|
||||
((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
|
||||
|
||||
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
|
||||
}
|
||||
@ -617,94 +700,139 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
|
||||
return "";
|
||||
}
|
||||
|
||||
void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
|
||||
void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
|
||||
{
|
||||
// source
|
||||
// cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
|
||||
// // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
|
||||
ostringstream outextractstr;
|
||||
ostringstream outextractstrInv;
|
||||
ostringstream outextractstrOrientation;
|
||||
ostringstream outextractstrSentenceId;
|
||||
|
||||
if (onlyOutputSpanInfo) {
|
||||
if (m_options.isOnlyOutputSpanInfo()) {
|
||||
cout << startF << " " << endF << " " << startE << " " << endE << endl;
|
||||
return;
|
||||
}
|
||||
|
||||
for(int fi=startF; fi<=endF; fi++) {
|
||||
if (translationFlag) extractFile << sentence.source[fi] << " ";
|
||||
if (orientationFlag) extractFileOrientation << sentence.source[fi] << " ";
|
||||
if (sentenceIdFlag) extractFileSentenceId << sentence.source[fi] << " ";
|
||||
if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
|
||||
if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
|
||||
if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.source[fi] << " ";
|
||||
}
|
||||
if (translationFlag) extractFile << "||| ";
|
||||
if (orientationFlag) extractFileOrientation << "||| ";
|
||||
if (sentenceIdFlag) extractFileSentenceId << "||| ";
|
||||
if (m_options.isTranslationFlag()) outextractstr << "||| ";
|
||||
if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
|
||||
if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
|
||||
|
||||
// target
|
||||
for(int ei=startE; ei<=endE; ei++) {
|
||||
if (translationFlag) extractFile << sentence.target[ei] << " ";
|
||||
if (translationFlag) extractFileInv << sentence.target[ei] << " ";
|
||||
if (orientationFlag) extractFileOrientation << sentence.target[ei] << " ";
|
||||
if (sentenceIdFlag) extractFileSentenceId << sentence.target[ei] << " ";
|
||||
if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " ";
|
||||
if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " ";
|
||||
if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " ";
|
||||
if (m_options.isSentenceIdFlag()) outextractstrSentenceId << sentence.target[ei] << " ";
|
||||
}
|
||||
if (translationFlag) extractFile << "|||";
|
||||
if (translationFlag) extractFileInv << "||| ";
|
||||
if (orientationFlag) extractFileOrientation << "||| ";
|
||||
if (sentenceIdFlag) extractFileSentenceId << "||| ";
|
||||
if (m_options.isTranslationFlag()) outextractstr << "|||";
|
||||
if (m_options.isTranslationFlag()) outextractstrInv << "||| ";
|
||||
if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
|
||||
if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "||| ";
|
||||
|
||||
// source (for inverse)
|
||||
if (translationFlag) {
|
||||
|
||||
if (m_options.isTranslationFlag()) {
|
||||
for(int fi=startF; fi<=endF; fi++)
|
||||
extractFileInv << sentence.source[fi] << " ";
|
||||
extractFileInv << "|||";
|
||||
outextractstrInv << sentence.source[fi] << " ";
|
||||
outextractstrInv << "|||";
|
||||
}
|
||||
|
||||
// alignment
|
||||
if (translationFlag) {
|
||||
if (m_options.isTranslationFlag()) {
|
||||
for(int ei=startE; ei<=endE; ei++) {
|
||||
for(size_t i=0; i<sentence.alignedToT[ei].size(); i++) {
|
||||
for(unsigned int i=0; i<sentence.alignedToT[ei].size(); i++) {
|
||||
int fi = sentence.alignedToT[ei][i];
|
||||
extractFile << " " << fi-startF << "-" << ei-startE;
|
||||
extractFileInv << " " << ei-startE << "-" << fi-startF;
|
||||
outextractstr << " " << fi-startF << "-" << ei-startE;
|
||||
outextractstrInv << " " << ei-startE << "-" << fi-startF;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (orientationFlag)
|
||||
extractFileOrientation << orientationInfo;
|
||||
if (m_options.isOrientationFlag())
|
||||
outextractstrOrientation << orientationInfo;
|
||||
|
||||
if (sentenceIdFlag) {
|
||||
extractFileSentenceId << sentence.sentenceID;
|
||||
if (m_options.isSentenceIdFlag()) {
|
||||
outextractstrSentenceId << sentence.sentenceID;
|
||||
}
|
||||
|
||||
if (translationFlag) extractFile << "\n";
|
||||
if (translationFlag) extractFileInv << "\n";
|
||||
if (orientationFlag) extractFileOrientation << "\n";
|
||||
if (sentenceIdFlag) extractFileSentenceId << "\n";
|
||||
|
||||
if (m_options.isTranslationFlag()) outextractstr << "\n";
|
||||
if (m_options.isTranslationFlag()) outextractstrInv << "\n";
|
||||
if (m_options.isOrientationFlag()) outextractstrOrientation << "\n";
|
||||
if (m_options.isSentenceIdFlag()) outextractstrSentenceId << "\n";
|
||||
|
||||
|
||||
m_extractedPhrases.push_back(outextractstr.str());
|
||||
m_extractedPhrasesInv.push_back(outextractstrInv.str());
|
||||
m_extractedPhrasesOri.push_back(outextractstrOrientation.str());
|
||||
m_extractedPhrasesSid.push_back(outextractstrSentenceId.str());
|
||||
}
|
||||
|
||||
|
||||
void ExtractTask::writePhrasesToFile(){
|
||||
|
||||
ostringstream outextractFile;
|
||||
ostringstream outextractFileInv;
|
||||
ostringstream outextractFileOrientation;
|
||||
ostringstream outextractFileSentenceId;
|
||||
|
||||
for(vector<string>::const_iterator phrase=m_extractedPhrases.begin();phrase!=m_extractedPhrases.end();phrase++){
|
||||
outextractFile<<phrase->data();
|
||||
}
|
||||
for(vector<string>::const_iterator phrase=m_extractedPhrasesInv.begin();phrase!=m_extractedPhrasesInv.end();phrase++){
|
||||
outextractFileInv<<phrase->data();
|
||||
}
|
||||
for(vector<string>::const_iterator phrase=m_extractedPhrasesOri.begin();phrase!=m_extractedPhrasesOri.end();phrase++){
|
||||
outextractFileOrientation<<phrase->data();
|
||||
}
|
||||
for(vector<string>::const_iterator phrase=m_extractedPhrasesSid.begin();phrase!=m_extractedPhrasesSid.end();phrase++){
|
||||
outextractFileSentenceId<<phrase->data();
|
||||
}
|
||||
|
||||
m_extractCollector->Write(m_id, outextractFile.str());
|
||||
m_extractCollectorInv->Write(m_id,outextractFileInv.str());
|
||||
m_extractCollectorOrientation->Write(m_id,outextractFileOrientation.str());
|
||||
m_extractCollectorSentenceId->Write(m_id,outextractFileSentenceId.str());
|
||||
}
|
||||
|
||||
// if proper conditioning, we need the number of times a source phrase occurred
|
||||
// Enumerates every contiguous span of sentence.source, and then of
// sentence.target, whose length is at most the configured maximum phrase
// length. Each span is emitted as its tokens, each followed by a space,
// then "|||" and a line end.
//
// Source spans go to the forward extract output, target spans to the
// inverse extract output.
//
// NOTE(review): this span is a rendered diff in which two versions of the
// function appear line by line. The lines using the free function name,
// bare maxPhraseLength and the extractFile/extractFileInv streams look like
// the pre-change version; the lines using ExtractTask::, 
// m_options.maxPhraseLength and the outextractFile/outextractFileInv
// buffers look like their replacements -- confirm against the real file.
void extractBase( SentenceAlignment &sentence )
|
||||
|
||||
void ExtractTask::extractBase( SentenceAlignment &sentence )
|
||||
{
|
||||
// Local buffers that collect the whole output before it is handed to the collectors.
ostringstream outextractFile;
|
||||
ostringstream outextractFileInv;
|
||||
|
||||
// Source side: all spans [startF, endF] with endF < startF + max phrase length.
int countF = sentence.source.size();
|
||||
for(int startF=0; startF<countF; startF++) {
|
||||
for(int endF=startF;
|
||||
(endF<countF && endF<startF+maxPhraseLength);
|
||||
(endF<countF && endF<startF+m_options.maxPhraseLength);
|
||||
endF++) {
|
||||
for(int fi=startF; fi<=endF; fi++) {
|
||||
extractFile << sentence.source[fi] << " ";
|
||||
outextractFile << sentence.source[fi] << " ";
|
||||
}
|
||||
// Terminate the span with the "|||" delimiter and a line end.
extractFile << "|||" << endl;
|
||||
outextractFile << "|||" << endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Target side: same enumeration over sentence.target, written to the inverse output.
int countE = sentence.target.size();
|
||||
for(int startE=0; startE<countE; startE++) {
|
||||
for(int endE=startE;
|
||||
(endE<countE && endE<startE+maxPhraseLength);
|
||||
(endE<countE && endE<startE+m_options.maxPhraseLength);
|
||||
endE++) {
|
||||
for(int ei=startE; ei<=endE; ei++) {
|
||||
extractFileInv << sentence.target[ei] << " ";
|
||||
outextractFileInv << sentence.target[ei] << " ";
|
||||
}
|
||||
// Terminate the span with the "|||" delimiter and a line end.
extractFileInv << "|||" << endl;
|
||||
outextractFileInv << "|||" << endl;
|
||||
}
|
||||
}
|
||||
// Hand each buffered block to its collector, tagged with m_id.
m_extractCollector->Write(m_id, outextractFile.str());
|
||||
m_extractCollectorInv->Write(m_id,outextractFileInv.str());
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user