From b3a712baa08d4b48b91714da486bb5ebd7f51602 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 18 Dec 2013 18:40:23 +0000 Subject: [PATCH 01/48] output reordering only --- .../other-builds/extract-ordering/.cproject | 114 +++ .../other-builds/extract-ordering/.project | 34 + phrase-extract/extract-ordering-main.cpp | 736 ++++++++++++++++++ 3 files changed, 884 insertions(+) create mode 100644 contrib/other-builds/extract-ordering/.cproject create mode 100644 contrib/other-builds/extract-ordering/.project create mode 100644 phrase-extract/extract-ordering-main.cpp diff --git a/contrib/other-builds/extract-ordering/.cproject b/contrib/other-builds/extract-ordering/.cproject new file mode 100644 index 000000000..251c8b4ff --- /dev/null +++ b/contrib/other-builds/extract-ordering/.cproject @@ -0,0 +1,114 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/other-builds/extract-ordering/.project b/contrib/other-builds/extract-ordering/.project new file mode 100644 index 000000000..fcf758cab --- /dev/null +++ b/contrib/other-builds/extract-ordering/.project @@ -0,0 +1,34 @@ + + + extract-ordering + + + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + full,incremental, + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.core.ccnature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + + + extract-ordering-main.cpp + 1 + PARENT-3-PROJECT_LOC/phrase-extract/extract-ordering-main.cpp + + + diff --git a/phrase-extract/extract-ordering-main.cpp b/phrase-extract/extract-ordering-main.cpp new file mode 100644 index 000000000..f3d8a781a --- /dev/null +++ b/phrase-extract/extract-ordering-main.cpp @@ -0,0 +1,736 @@ +/* + * 
extract.cpp + * Modified by: Rohit Gupta CDAC, Mumbai, India + * on July 15, 2012 to implement parallel processing + * Modified by: Nadi Tomeh - LIMSI/CNRS + * Machine Translation Marathon 2010, Dublin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "SafeGetline.h" +#include "SentenceAlignment.h" +#include "tables-core.h" +#include "InputFileStream.h" +#include "OutputFileStream.h" +#include "PhraseExtractionOptions.h" + +using namespace std; +using namespace MosesTraining; + +namespace MosesTraining +{ + + +const long int LINE_MAX_LENGTH = 500000 ; + + +// HPhraseVertex represents a point in the alignment matrix +typedef pair HPhraseVertex; + +// Phrase represents a bi-phrase; each bi-phrase is defined by two points in the alignment matrix: +// bottom-left and top-right +typedef pair HPhrase; + +// HPhraseVector is a vector of HPhrases +typedef vector < HPhrase > HPhraseVector; + +// SentenceVertices represents, from all extracted phrases, all vertices that have the same positioning +// The key of the map is the English index and the value is a set of the source ones +typedef map > HSentenceVertices; + +REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, + int, int, int, int, int, int, int, + bool (*)(int, int), bool (*)(int, int)); +REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, + int, int, int, int, int, int, int, + bool (*)(int, int), bool (*)(int, int), + const HSentenceVertices &, const HSentenceVertices &); +REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool, + int, int, int, int, int, int, int, + bool (*)(int, int), bool (*)(int, int), + const HSentenceVertices &, const HSentenceVertices &, + const HSentenceVertices &, const HSentenceVertices &, + REO_POS); + +void insertVertex(HSentenceVertices &, int, int); +void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, 
HSentenceVertices &, HSentenceVertices &, + int, int, int, int); +string getOrientString(REO_POS, REO_MODEL_TYPE); + +bool ge(int, int); +bool le(int, int); +bool lt(int, int); + +bool isAligned (SentenceAlignment &, int, int); + +int sentenceOffset = 0; + +std::vector Tokenize(const std::string& str, + const std::string& delimiters = " \t"); + +bool flexScoreFlag = false; + +} + +namespace MosesTraining +{ + +class ExtractTask +{ +public: + ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFileOrientation) + :m_sentence(sentence), + m_options(initoptions), + m_extractFileOrientation(extractFileOrientation) + {} + void Run(); +private: + vector< string > m_extractedPhrasesOri; + void extract(SentenceAlignment &); + void addPhrase(SentenceAlignment &, int, int, int, int, string &); + void writePhrasesToFile(); + + SentenceAlignment &m_sentence; + const PhraseExtractionOptions &m_options; + Moses::OutputFileStream &m_extractFileOrientation; +}; +} + +int main(int argc, char* argv[]) +{ + cerr << "PhraseExtract v1.4, written by Philipp Koehn\n" + << "phrase extraction from an aligned parallel corpus\n"; + + if (argc < 6) { + cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] "; + cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n"; + exit(1); + } + + Moses::OutputFileStream extractFileOrientation; + const char* const &fileNameE = argv[1]; + const char* const &fileNameF = argv[2]; + const char* const &fileNameA = argv[3]; + const string fileNameExtract = string(argv[4]); + PhraseExtractionOptions options(atoi(argv[5])); + + for(int i=6; i= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') { + cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl; + exit(1); + } + sentenceOffset = atoi(argv[++i]); + } else if (strcmp(argv[i], 
"--GZOutput") == 0) { + options.initGzOutput(true); + } else if (strcmp(argv[i], "--InstanceWeights") == 0) { + if (i+1 >= argc) { + cerr << "extract: syntax error, used switch --InstanceWeights without file name" << endl; + exit(1); + } + options.initInstanceWeightsFile(argv[++i]); + } else if (strcmp(argv[i], "--Debug") == 0) { + options.debug = true; + } else if(strcmp(argv[i],"--model") == 0) { + if (i+1 >= argc) { + cerr << "extract: syntax error, no model's information provided to the option --model " << endl; + exit(1); + } + char* modelParams = argv[++i]; + char* modelName = strtok(modelParams, "-"); + char* modelType = strtok(NULL, "-"); + + // REO_MODEL_TYPE intModelType; + + if(strcmp(modelName, "wbe") == 0) { + options.initWordModel(true); + if(strcmp(modelType, "msd") == 0) + options.initWordType(REO_MSD); + else if(strcmp(modelType, "mslr") == 0) + options.initWordType(REO_MSLR); + else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) + options.initWordType(REO_MONO); + else { + cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; + exit(1); + } + } else if(strcmp(modelName, "phrase") == 0) { + options.initPhraseModel(true); + if(strcmp(modelType, "msd") == 0) + options.initPhraseType(REO_MSD); + else if(strcmp(modelType, "mslr") == 0) + options.initPhraseType(REO_MSLR); + else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) + options.initPhraseType(REO_MONO); + else { + cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; + exit(1); + } + } else if(strcmp(modelName, "hier") == 0) { + options.initHierModel(true); + if(strcmp(modelType, "msd") == 0) + options.initHierType(REO_MSD); + else if(strcmp(modelType, "mslr") == 0) + options.initHierType(REO_MSLR); + else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) + options.initHierType(REO_MONO); + else { + cerr << "extract: syntax error, unknown 
reordering model type: " << modelType << endl; + exit(1); + } + } else { + cerr << "extract: syntax error, unknown reordering model: " << modelName << endl; + exit(1); + } + + options.initAllModelsOutputFlag(true); + } else { + cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; + exit(1); + } + } + + // default reordering model if no model selected + // allows for the old syntax to be used + if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) { + options.initWordModel(true); + options.initWordType(REO_MSD); + } + + // open input files + Moses::InputFileStream eFile(fileNameE); + Moses::InputFileStream fFile(fileNameF); + Moses::InputFileStream aFile(fileNameA); + + istream *eFileP = &eFile; + istream *fFileP = &fFile; + istream *aFileP = &aFile; + + istream *iwFileP = NULL; + auto_ptr instanceWeightsFile; + if (options.getInstanceWeightsFile().length()) { + instanceWeightsFile.reset(new Moses::InputFileStream(options.getInstanceWeightsFile())); + iwFileP = instanceWeightsFile.get(); + } + + // open output files + if (options.isOrientationFlag()) { + string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":""); + extractFileOrientation.Open(fileNameExtractOrientation.c_str()); + } + + int i = sentenceOffset; + + while(true) { + i++; + if (i%10000 == 0) cerr << "." 
<< flush; + char englishString[LINE_MAX_LENGTH]; + char foreignString[LINE_MAX_LENGTH]; + char alignmentString[LINE_MAX_LENGTH]; + char weightString[LINE_MAX_LENGTH]; + SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__); + if (eFileP->eof()) break; + SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__); + SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__); + if (iwFileP) { + SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__); + } + SentenceAlignment sentence; + // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; + //az: output src, tgt, and alingment line + if (options.isOnlyOutputSpanInfo()) { + cout << "LOG: SRC: " << foreignString << endl; + cout << "LOG: TGT: " << englishString << endl; + cout << "LOG: ALT: " << alignmentString << endl; + cout << "LOG: PHRASES_BEGIN:" << endl; + } + if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) { + ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFileOrientation); + task->Run(); + delete task; + + } + if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases + } + + eFile.Close(); + fFile.Close(); + aFile.Close(); + + //az: only close if we actually opened it + if (!options.isOnlyOutputSpanInfo()) { + if (options.isOrientationFlag()) { + extractFileOrientation.Close(); + } + } +} + +namespace MosesTraining +{ +void ExtractTask::Run() +{ + extract(m_sentence); + writePhrasesToFile(); + m_extractedPhrasesOri.clear(); +} + +void ExtractTask::extract(SentenceAlignment &sentence) +{ + int countE = sentence.target.size(); + int countF = sentence.source.size(); + + HPhraseVector inboundPhrases; + + HSentenceVertices inTopLeft; + HSentenceVertices inTopRight; + HSentenceVertices inBottomLeft; + HSentenceVertices inBottomRight; + + HSentenceVertices outTopLeft; + HSentenceVertices outTopRight; + 
HSentenceVertices outBottomLeft; + HSentenceVertices outBottomRight; + + HSentenceVertices::const_iterator it; + + bool relaxLimit = m_options.isHierModel(); + bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel(); + + // check alignments for target phrase startE...endE + // loop over extracted phrases which are compatible with the word-alignments + for(int startE=0; startE usedF = sentence.alignedCountS; + for(int ei=startE; ei<=endE; ei++) { + for(size_t i=0; imaxF) { + maxF = fi; + } + usedF[ fi ]--; + } + } + + if (maxF >= 0 && // aligned to any source words at all + (relaxLimit || maxF-minF < m_options.maxPhraseLength)) { // source phrase within limits + + // check if source words are aligned to out of bound target words + bool out_of_bounds = false; + for(int fi=minF; fi<=maxF && !out_of_bounds; fi++) + if (usedF[fi]>0) { + // cout << "ouf of bounds: " << fi << "\n"; + out_of_bounds = true; + } + + // cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n"; + if (!out_of_bounds) { + // start point of source phrase may retreat over unaligned + for(int startF=minF; + (startF>=0 && + (relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit + (startF==minF || sentence.alignedCountS[startF]==0)); // unaligned + startF--) + // end point of source phrase may advance over unaligned + for(int endF=maxF; + (endFsecond.find(startF-unit) != it->second.end())) + return LEFT; + if(modelType == REO_MONO) + return UNKNOWN; + if((!connectedLeftTop && connectedRightTop) || + ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() && it->second.find(endF + unit) != it->second.end())) + return RIGHT; + if(modelType == REO_MSD) + return UNKNOWN; + connectedLeftTop = false; + for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) + if(connectedLeftTop = (it = inBottomRight.find(startE - unit)) != inBottomRight.end() && + it->second.find(indexF) != 
it->second.end()) + return DRIGHT; + connectedRightTop = false; + for(int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit) + if(connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() && + it->second.find(indexF) != it->second.end()) + return DLEFT; + return UNKNOWN; +} + +// to be called with countF-1 instead of countF +REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType, + bool connectedLeftTop, bool connectedRightTop, + int startF, int endF, int startE, int endE, int countF, int zero, int unit, + bool (*ge)(int, int), bool (*lt)(int, int), + const HSentenceVertices & inBottomRight, const HSentenceVertices & inBottomLeft, + const HSentenceVertices & outBottomRight, const HSentenceVertices & outBottomLeft, + REO_POS phraseOrient) +{ + + HSentenceVertices::const_iterator it; + + if(phraseOrient == LEFT || + (connectedLeftTop && !connectedRightTop) || + // (startE == 0 && startF == 0) || + //(startE == sentence.target.size()-1 && startF == sentence.source.size()-1) || + ((it = inBottomRight.find(startE - unit)) != inBottomRight.end() && + it->second.find(startF-unit) != it->second.end()) || + ((it = outBottomRight.find(startE - unit)) != outBottomRight.end() && + it->second.find(startF-unit) != it->second.end())) + return LEFT; + if(modelType == REO_MONO) + return UNKNOWN; + if(phraseOrient == RIGHT || + (!connectedLeftTop && connectedRightTop) || + ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() && + it->second.find(endF + unit) != it->second.end()) || + ((it = outBottomLeft.find(startE - unit)) != outBottomLeft.end() && + it->second.find(endF + unit) != it->second.end())) + return RIGHT; + if(modelType == REO_MSD) + return UNKNOWN; + if(phraseOrient != UNKNOWN) + return phraseOrient; + connectedLeftTop = false; + for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) { + if((connectedLeftTop = (it = 
inBottomRight.find(startE - unit)) != inBottomRight.end() && + it->second.find(indexF) != it->second.end()) || + (connectedLeftTop = (it = outBottomRight.find(startE - unit)) != outBottomRight.end() && + it->second.find(indexF) != it->second.end())) + return DRIGHT; + } + connectedRightTop = false; + for(int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit) { + if((connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() && + it->second.find(indexF) != it->second.end()) || + (connectedRightTop = (it = outBottomLeft.find(startE - unit)) != outBottomLeft.end() && + it->second.find(indexF) != it->second.end())) + return DLEFT; + } + return UNKNOWN; +} + +bool isAligned ( SentenceAlignment &sentence, int fi, int ei ) +{ + if (ei == -1 && fi == -1) + return true; + if (ei <= -1 || fi <= -1) + return false; + if ((size_t)ei == sentence.target.size() && (size_t)fi == sentence.source.size()) + return true; + if ((size_t)ei >= sentence.target.size() || (size_t)fi >= sentence.source.size()) + return false; + for(size_t i=0; i= second; +} + +bool le(int first, int second) +{ + return first <= second; +} + +bool lt(int first, int second) +{ + return first < second; +} + +void insertVertex( HSentenceVertices & corners, int x, int y ) +{ + set tmp; + tmp.insert(x); + pair< HSentenceVertices::iterator, bool > ret = corners.insert( pair > (y, tmp) ); + if(ret.second == false) { + ret.first->second.insert(x); + } +} + +void insertPhraseVertices( + HSentenceVertices & topLeft, + HSentenceVertices & topRight, + HSentenceVertices & bottomLeft, + HSentenceVertices & bottomRight, + int startF, int startE, int endF, int endE) +{ + + insertVertex(topLeft, startF, startE); + insertVertex(topRight, endF, startE); + insertVertex(bottomLeft, startF, endE); + insertVertex(bottomRight, endF, endE); +} + +string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType) +{ + switch(orient) { + case LEFT: + return "mono"; + break; + case 
RIGHT: + return "swap"; + break; + case DRIGHT: + return "dright"; + break; + case DLEFT: + return "dleft"; + break; + case UNKNOWN: + switch(modelType) { + case REO_MONO: + return "nomono"; + break; + case REO_MSD: + return "other"; + break; + case REO_MSLR: + return "dright"; + break; + } + break; + } + return ""; +} + +void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo) +{ + // source + // // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n"; + ostringstream outextractstrOrientation; + + if (m_options.isOnlyOutputSpanInfo()) { + cout << startF << " " << endF << " " << startE << " " << endE << endl; + return; + } + + if (m_options.debug) { + outextractstrOrientation << "sentenceID=" << sentence.sentenceID << " "; + } + + for(int fi=startF; fi<=endF; fi++) { + if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " "; + } + if (m_options.isOrientationFlag()) outextractstrOrientation << "||| "; + + // target + for(int ei=startE; ei<=endE; ei++) { + if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " "; + } + if (m_options.isOrientationFlag()) outextractstrOrientation << "||| "; + + // source (for inverse) + + if (m_options.isOrientationFlag()) + outextractstrOrientation << orientationInfo; + + if (m_options.getInstanceWeightsFile().length()) { + if (m_options.isOrientationFlag()) { + outextractstrOrientation << " ||| " << sentence.weightString; + } + } + + + if (m_options.isOrientationFlag()) outextractstrOrientation << "\n"; + + + m_extractedPhrasesOri.push_back(outextractstrOrientation.str()); +} + + +void ExtractTask::writePhrasesToFile() +{ + + ostringstream outextractFile; + ostringstream outextractFileInv; + ostringstream outextractFileOrientation; + + for(vector::const_iterator phrase=m_extractedPhrasesOri.begin(); phrase!=m_extractedPhrasesOri.end(); phrase++) { + 
outextractFileOrientation<data(); + } + m_extractFileOrientation << outextractFileOrientation.str(); +} + +/** tokenise input string to vector of string. each element has been separated by a character in the delimiters argument. + The separator can only be 1 character long. The default delimiters are space or tab +*/ +std::vector Tokenize(const std::string& str, + const std::string& delimiters) +{ + std::vector tokens; + // Skip delimiters at beginning. + std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); + // Find first "non-delimiter". + std::string::size_type pos = str.find_first_of(delimiters, lastPos); + + while (std::string::npos != pos || std::string::npos != lastPos) { + // Found a token, add it to the vector. + tokens.push_back(str.substr(lastPos, pos - lastPos)); + // Skip delimiters. Note the "not_of" + lastPos = str.find_first_not_of(delimiters, pos); + // Find next "non-delimiter" + pos = str.find_first_of(delimiters, lastPos); + } + + return tokens; +} + +} From ad0eda5c18c6d8597bd8cc45a79cf7038123d466 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 29 Dec 2013 23:14:08 +0000 Subject: [PATCH 02/48] minor change from UINT32 to UINT64 --- OnDiskPt/Vocab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OnDiskPt/Vocab.h b/OnDiskPt/Vocab.h index 2adbf16f2..56ec7c33b 100644 --- a/OnDiskPt/Vocab.h +++ b/OnDiskPt/Vocab.h @@ -46,7 +46,7 @@ public: } UINT64 AddVocabId(const std::string &str); UINT64 GetVocabId(const std::string &str, bool &found) const; - const std::string &GetString(UINT32 vocabId) const { + const std::string &GetString(UINT64 vocabId) const { return m_lookup[vocabId]; } From 63f6ea8fa7a42fbf1f895dc869e7fe6963c540e3 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 6 Jan 2014 11:55:22 +0000 Subject: [PATCH 03/48] eclipse --- .../other-builds/extract-ordering/.cproject | 29 +- .../other-builds/extract-ordering/.project | 40 +++ phrase-extract/extract-ordering-main.cpp | 270 +++++++++++++++++- 3 
files changed, 324 insertions(+), 15 deletions(-) diff --git a/contrib/other-builds/extract-ordering/.cproject b/contrib/other-builds/extract-ordering/.cproject index 251c8b4ff..048d4f200 100644 --- a/contrib/other-builds/extract-ordering/.cproject +++ b/contrib/other-builds/extract-ordering/.cproject @@ -18,19 +18,28 @@ - + - - + + @@ -63,15 +72,15 @@ - + - - diff --git a/contrib/other-builds/extract-ordering/.project b/contrib/other-builds/extract-ordering/.project index fcf758cab..f95b064b7 100644 --- a/contrib/other-builds/extract-ordering/.project +++ b/contrib/other-builds/extract-ordering/.project @@ -25,10 +25,50 @@ org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + InputFileStream.cpp + 1 + PARENT-3-PROJECT_LOC/phrase-extract/InputFileStream.cpp + + + InputFileStream.h + 1 + PARENT-3-PROJECT_LOC/phrase-extract/InputFileStream.h + + + OutputFileStream.cpp + 1 + PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.cpp + + + OutputFileStream.h + 1 + PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h + + + SentenceAlignment.cpp + 1 + PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.cpp + + + SentenceAlignment.h + 1 + PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.h + extract-ordering-main.cpp 1 PARENT-3-PROJECT_LOC/phrase-extract/extract-ordering-main.cpp + + tables-core.cpp + 1 + PARENT-3-PROJECT_LOC/phrase-extract/tables-core.cpp + + + tables-core.h + 1 + PARENT-3-PROJECT_LOC/phrase-extract/tables-core.h + diff --git a/phrase-extract/extract-ordering-main.cpp b/phrase-extract/extract-ordering-main.cpp index f3d8a781a..5d58028d6 100644 --- a/phrase-extract/extract-ordering-main.cpp +++ b/phrase-extract/extract-ordering-main.cpp @@ -90,21 +90,36 @@ namespace MosesTraining class ExtractTask { public: - ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFileOrientation) - :m_sentence(sentence), + ExtractTask(size_t id, SentenceAlignment 
&sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv,Moses::OutputFileStream &extractFileOrientation, Moses::OutputFileStream &extractFileContext, Moses::OutputFileStream &extractFileContextInv): + m_sentence(sentence), m_options(initoptions), - m_extractFileOrientation(extractFileOrientation) - {} + m_extractFile(extractFile), + m_extractFileInv(extractFileInv), + m_extractFileOrientation(extractFileOrientation), + m_extractFileContext(extractFileContext), + m_extractFileContextInv(extractFileContextInv) {} void Run(); private: + vector< string > m_extractedPhrases; + vector< string > m_extractedPhrasesInv; vector< string > m_extractedPhrasesOri; + vector< string > m_extractedPhrasesSid; + vector< string > m_extractedPhrasesContext; + vector< string > m_extractedPhrasesContextInv; + void extractBase(SentenceAlignment &); void extract(SentenceAlignment &); void addPhrase(SentenceAlignment &, int, int, int, int, string &); void writePhrasesToFile(); + bool checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF); + bool isPlaceholder(const string &word); SentenceAlignment &m_sentence; const PhraseExtractionOptions &m_options; + Moses::OutputFileStream &m_extractFile; + Moses::OutputFileStream &m_extractFileInv; Moses::OutputFileStream &m_extractFileOrientation; + Moses::OutputFileStream &m_extractFileContext; + Moses::OutputFileStream &m_extractFileContextInv; }; } @@ -119,7 +134,11 @@ int main(int argc, char* argv[]) exit(1); } + Moses::OutputFileStream extractFile; + Moses::OutputFileStream extractFileInv; Moses::OutputFileStream extractFileOrientation; + Moses::OutputFileStream extractFileContext; + Moses::OutputFileStream extractFileContextInv; const char* const &fileNameE = argv[1]; const char* const &fileNameF = argv[2]; const char* const &fileNameA = argv[3]; @@ -206,6 +225,10 @@ int main(int argc, char* argv[]) } 
options.initAllModelsOutputFlag(true); + } else if (strcmp(argv[i], "--Placeholders") == 0) { + ++i; + string str = argv[i]; + options.placeholders = Tokenize(str.c_str(), ","); } else { cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; exit(1); @@ -236,10 +259,21 @@ int main(int argc, char* argv[]) } // open output files + if (options.isTranslationFlag()) { + string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":""); + extractFile.Open( (fileNameExtract + (options.isGzOutput()?".gz":"")).c_str()); + extractFileInv.Open(fileNameExtractInv.c_str()); + } if (options.isOrientationFlag()) { string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":""); extractFileOrientation.Open(fileNameExtractOrientation.c_str()); } + if (options.isFlexScoreFlag()) { + string fileNameExtractContext = fileNameExtract + ".context" + (options.isGzOutput()?".gz":""); + string fileNameExtractContextInv = fileNameExtract + ".context.inv" + (options.isGzOutput()?".gz":""); + extractFileContext.Open(fileNameExtractContext.c_str()); + extractFileContextInv.Open(fileNameExtractContextInv.c_str()); + } int i = sentenceOffset; @@ -267,7 +301,10 @@ int main(int argc, char* argv[]) cout << "LOG: PHRASES_BEGIN:" << endl; } if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) { - ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFileOrientation); + if (options.placeholders.size()) { + sentence.invertAlignment(); + } + ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation, extractFileContext, extractFileContextInv); task->Run(); delete task; @@ -281,9 +318,19 @@ int main(int argc, char* argv[]) //az: only close if we actually opened it if (!options.isOnlyOutputSpanInfo()) { + if (options.isTranslationFlag()) { + extractFile.Close(); + extractFileInv.Close(); + + } if (options.isOrientationFlag()) { 
extractFileOrientation.Close(); } + + if (options.isFlexScoreFlag()) { + extractFileContext.Close(); + extractFileContextInv.Close(); + } } } @@ -293,7 +340,13 @@ void ExtractTask::Run() { extract(m_sentence); writePhrasesToFile(); + m_extractedPhrases.clear(); + m_extractedPhrasesInv.clear(); m_extractedPhrasesOri.clear(); + m_extractedPhrasesSid.clear(); + m_extractedPhrasesContext.clear(); + m_extractedPhrasesContextInv.clear(); + } void ExtractTask::extract(SentenceAlignment &sentence) @@ -655,6 +708,8 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, { // source // // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n"; + ostringstream outextractstr; + ostringstream outextractstrInv; ostringstream outextractstrOrientation; if (m_options.isOnlyOutputSpanInfo()) { @@ -662,36 +717,138 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, return; } + if (m_options.placeholders.size() && !checkPlaceholders(sentence, startE, endE, startF, endF)) { + return; + } + if (m_options.debug) { + outextractstr << "sentenceID=" << sentence.sentenceID << " "; + outextractstrInv << "sentenceID=" << sentence.sentenceID << " "; outextractstrOrientation << "sentenceID=" << sentence.sentenceID << " "; } for(int fi=startF; fi<=endF; fi++) { + if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " "; if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " "; } + if (m_options.isTranslationFlag()) outextractstr << "||| "; if (m_options.isOrientationFlag()) outextractstrOrientation << "||| "; // target for(int ei=startE; ei<=endE; ei++) { + if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " "; + if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " "; if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " "; } + if (m_options.isTranslationFlag()) 
outextractstr << "|||"; + if (m_options.isTranslationFlag()) outextractstrInv << "||| "; if (m_options.isOrientationFlag()) outextractstrOrientation << "||| "; // source (for inverse) + if (m_options.isTranslationFlag()) { + for(int fi=startF; fi<=endF; fi++) + outextractstrInv << sentence.source[fi] << " "; + outextractstrInv << "|||"; + } + + // alignment + if (m_options.isTranslationFlag()) { + for(int ei=startE; ei<=endE; ei++) { + for(unsigned int i=0; i"; + else outextractstrContext << sentence.source[startF-1]; + + outextractstrContextInv << " < "; + if (startE == 0) outextractstrContextInv << ""; + else outextractstrContextInv << sentence.target[startE-1]; + + // write context to right + outextractstrContextRight << "> "; + if (endF+1 == sentence.source.size()) outextractstrContextRight << ""; + else outextractstrContextRight << sentence.source[endF+1]; + + outextractstrContextRightInv << " > "; + if (endE+1 == sentence.target.size()) outextractstrContextRightInv << ""; + else outextractstrContextRightInv << sentence.target[endE+1]; + + outextractstrContext << "\n"; + outextractstrContextInv << "\n"; + outextractstrContextRight << "\n"; + outextractstrContextRightInv << "\n"; + + m_extractedPhrasesContext.push_back(outextractstrContext.str()); + m_extractedPhrasesContextInv.push_back(outextractstrContextInv.str()); + m_extractedPhrasesContext.push_back(outextractstrContextRight.str()); + m_extractedPhrasesContextInv.push_back(outextractstrContextRightInv.str()); + } + + if (m_options.isTranslationFlag()) outextractstr << "\n"; + if (m_options.isTranslationFlag()) outextractstrInv << "\n"; if (m_options.isOrientationFlag()) outextractstrOrientation << "\n"; + m_extractedPhrases.push_back(outextractstr.str()); + m_extractedPhrasesInv.push_back(outextractstrInv.str()); m_extractedPhrasesOri.push_back(outextractstrOrientation.str()); } @@ -702,13 +859,116 @@ void ExtractTask::writePhrasesToFile() ostringstream outextractFile; ostringstream outextractFileInv; 
ostringstream outextractFileOrientation; + ostringstream outextractFileContext; + ostringstream outextractFileContextInv; + for(vector::const_iterator phrase=m_extractedPhrases.begin(); phrase!=m_extractedPhrases.end(); phrase++) { + outextractFile<data(); + } + for(vector::const_iterator phrase=m_extractedPhrasesInv.begin(); phrase!=m_extractedPhrasesInv.end(); phrase++) { + outextractFileInv<data(); + } for(vector::const_iterator phrase=m_extractedPhrasesOri.begin(); phrase!=m_extractedPhrasesOri.end(); phrase++) { outextractFileOrientation<data(); } + for(vector::const_iterator phrase=m_extractedPhrasesContext.begin(); phrase!=m_extractedPhrasesContext.end(); phrase++) { + outextractFileContext<data(); + } + for(vector::const_iterator phrase=m_extractedPhrasesContextInv.begin(); phrase!=m_extractedPhrasesContextInv.end(); phrase++) { + outextractFileContextInv<data(); + } + + m_extractFile << outextractFile.str(); + m_extractFileInv << outextractFileInv.str(); m_extractFileOrientation << outextractFileOrientation.str(); + if (m_options.isFlexScoreFlag()) { + m_extractFileContext << outextractFileContext.str(); + m_extractFileContextInv << outextractFileContextInv.str(); + } } +// if proper conditioning, we need the number of times a source phrase occured + +void ExtractTask::extractBase( SentenceAlignment &sentence ) +{ + ostringstream outextractFile; + ostringstream outextractFileInv; + + int countF = sentence.source.size(); + for(int startF=0; startF Date: Mon, 6 Jan 2014 13:31:47 +0000 Subject: [PATCH 04/48] only output ordering extract --- phrase-extract/extract-ordering-main.cpp | 270 +---------------------- 1 file changed, 5 insertions(+), 265 deletions(-) diff --git a/phrase-extract/extract-ordering-main.cpp b/phrase-extract/extract-ordering-main.cpp index 5d58028d6..f3d8a781a 100644 --- a/phrase-extract/extract-ordering-main.cpp +++ b/phrase-extract/extract-ordering-main.cpp @@ -90,36 +90,21 @@ namespace MosesTraining class ExtractTask { public: - 
ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv,Moses::OutputFileStream &extractFileOrientation, Moses::OutputFileStream &extractFileContext, Moses::OutputFileStream &extractFileContextInv): - m_sentence(sentence), + ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFileOrientation) + :m_sentence(sentence), m_options(initoptions), - m_extractFile(extractFile), - m_extractFileInv(extractFileInv), - m_extractFileOrientation(extractFileOrientation), - m_extractFileContext(extractFileContext), - m_extractFileContextInv(extractFileContextInv) {} + m_extractFileOrientation(extractFileOrientation) + {} void Run(); private: - vector< string > m_extractedPhrases; - vector< string > m_extractedPhrasesInv; vector< string > m_extractedPhrasesOri; - vector< string > m_extractedPhrasesSid; - vector< string > m_extractedPhrasesContext; - vector< string > m_extractedPhrasesContextInv; - void extractBase(SentenceAlignment &); void extract(SentenceAlignment &); void addPhrase(SentenceAlignment &, int, int, int, int, string &); void writePhrasesToFile(); - bool checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF); - bool isPlaceholder(const string &word); SentenceAlignment &m_sentence; const PhraseExtractionOptions &m_options; - Moses::OutputFileStream &m_extractFile; - Moses::OutputFileStream &m_extractFileInv; Moses::OutputFileStream &m_extractFileOrientation; - Moses::OutputFileStream &m_extractFileContext; - Moses::OutputFileStream &m_extractFileContextInv; }; } @@ -134,11 +119,7 @@ int main(int argc, char* argv[]) exit(1); } - Moses::OutputFileStream extractFile; - Moses::OutputFileStream extractFileInv; Moses::OutputFileStream extractFileOrientation; - Moses::OutputFileStream extractFileContext; - Moses::OutputFileStream extractFileContextInv; 
const char* const &fileNameE = argv[1]; const char* const &fileNameF = argv[2]; const char* const &fileNameA = argv[3]; @@ -225,10 +206,6 @@ int main(int argc, char* argv[]) } options.initAllModelsOutputFlag(true); - } else if (strcmp(argv[i], "--Placeholders") == 0) { - ++i; - string str = argv[i]; - options.placeholders = Tokenize(str.c_str(), ","); } else { cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; exit(1); @@ -259,21 +236,10 @@ int main(int argc, char* argv[]) } // open output files - if (options.isTranslationFlag()) { - string fileNameExtractInv = fileNameExtract + ".inv" + (options.isGzOutput()?".gz":""); - extractFile.Open( (fileNameExtract + (options.isGzOutput()?".gz":"")).c_str()); - extractFileInv.Open(fileNameExtractInv.c_str()); - } if (options.isOrientationFlag()) { string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":""); extractFileOrientation.Open(fileNameExtractOrientation.c_str()); } - if (options.isFlexScoreFlag()) { - string fileNameExtractContext = fileNameExtract + ".context" + (options.isGzOutput()?".gz":""); - string fileNameExtractContextInv = fileNameExtract + ".context.inv" + (options.isGzOutput()?".gz":""); - extractFileContext.Open(fileNameExtractContext.c_str()); - extractFileContextInv.Open(fileNameExtractContextInv.c_str()); - } int i = sentenceOffset; @@ -301,10 +267,7 @@ int main(int argc, char* argv[]) cout << "LOG: PHRASES_BEGIN:" << endl; } if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) { - if (options.placeholders.size()) { - sentence.invertAlignment(); - } - ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation, extractFileContext, extractFileContextInv); + ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFileOrientation); task->Run(); delete task; @@ -318,19 +281,9 @@ int main(int argc, char* argv[]) //az: only close if we actually 
opened it if (!options.isOnlyOutputSpanInfo()) { - if (options.isTranslationFlag()) { - extractFile.Close(); - extractFileInv.Close(); - - } if (options.isOrientationFlag()) { extractFileOrientation.Close(); } - - if (options.isFlexScoreFlag()) { - extractFileContext.Close(); - extractFileContextInv.Close(); - } } } @@ -340,13 +293,7 @@ void ExtractTask::Run() { extract(m_sentence); writePhrasesToFile(); - m_extractedPhrases.clear(); - m_extractedPhrasesInv.clear(); m_extractedPhrasesOri.clear(); - m_extractedPhrasesSid.clear(); - m_extractedPhrasesContext.clear(); - m_extractedPhrasesContextInv.clear(); - } void ExtractTask::extract(SentenceAlignment &sentence) @@ -708,8 +655,6 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, { // source // // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n"; - ostringstream outextractstr; - ostringstream outextractstrInv; ostringstream outextractstrOrientation; if (m_options.isOnlyOutputSpanInfo()) { @@ -717,138 +662,36 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, return; } - if (m_options.placeholders.size() && !checkPlaceholders(sentence, startE, endE, startF, endF)) { - return; - } - if (m_options.debug) { - outextractstr << "sentenceID=" << sentence.sentenceID << " "; - outextractstrInv << "sentenceID=" << sentence.sentenceID << " "; outextractstrOrientation << "sentenceID=" << sentence.sentenceID << " "; } for(int fi=startF; fi<=endF; fi++) { - if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " "; if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " "; } - if (m_options.isTranslationFlag()) outextractstr << "||| "; if (m_options.isOrientationFlag()) outextractstrOrientation << "||| "; // target for(int ei=startE; ei<=endE; ei++) { - if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " "; - if (m_options.isTranslationFlag()) 
outextractstrInv << sentence.target[ei] << " "; if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " "; } - if (m_options.isTranslationFlag()) outextractstr << "|||"; - if (m_options.isTranslationFlag()) outextractstrInv << "||| "; if (m_options.isOrientationFlag()) outextractstrOrientation << "||| "; // source (for inverse) - if (m_options.isTranslationFlag()) { - for(int fi=startF; fi<=endF; fi++) - outextractstrInv << sentence.source[fi] << " "; - outextractstrInv << "|||"; - } - - // alignment - if (m_options.isTranslationFlag()) { - for(int ei=startE; ei<=endE; ei++) { - for(unsigned int i=0; i"; - else outextractstrContext << sentence.source[startF-1]; - - outextractstrContextInv << " < "; - if (startE == 0) outextractstrContextInv << ""; - else outextractstrContextInv << sentence.target[startE-1]; - - // write context to right - outextractstrContextRight << "> "; - if (endF+1 == sentence.source.size()) outextractstrContextRight << ""; - else outextractstrContextRight << sentence.source[endF+1]; - - outextractstrContextRightInv << " > "; - if (endE+1 == sentence.target.size()) outextractstrContextRightInv << ""; - else outextractstrContextRightInv << sentence.target[endE+1]; - - outextractstrContext << "\n"; - outextractstrContextInv << "\n"; - outextractstrContextRight << "\n"; - outextractstrContextRightInv << "\n"; - - m_extractedPhrasesContext.push_back(outextractstrContext.str()); - m_extractedPhrasesContextInv.push_back(outextractstrContextInv.str()); - m_extractedPhrasesContext.push_back(outextractstrContextRight.str()); - m_extractedPhrasesContextInv.push_back(outextractstrContextRightInv.str()); - } - - if (m_options.isTranslationFlag()) outextractstr << "\n"; - if (m_options.isTranslationFlag()) outextractstrInv << "\n"; if (m_options.isOrientationFlag()) outextractstrOrientation << "\n"; - m_extractedPhrases.push_back(outextractstr.str()); - m_extractedPhrasesInv.push_back(outextractstrInv.str()); 
m_extractedPhrasesOri.push_back(outextractstrOrientation.str()); } @@ -859,116 +702,13 @@ void ExtractTask::writePhrasesToFile() ostringstream outextractFile; ostringstream outextractFileInv; ostringstream outextractFileOrientation; - ostringstream outextractFileContext; - ostringstream outextractFileContextInv; - for(vector::const_iterator phrase=m_extractedPhrases.begin(); phrase!=m_extractedPhrases.end(); phrase++) { - outextractFile<data(); - } - for(vector::const_iterator phrase=m_extractedPhrasesInv.begin(); phrase!=m_extractedPhrasesInv.end(); phrase++) { - outextractFileInv<data(); - } for(vector::const_iterator phrase=m_extractedPhrasesOri.begin(); phrase!=m_extractedPhrasesOri.end(); phrase++) { outextractFileOrientation<data(); } - for(vector::const_iterator phrase=m_extractedPhrasesContext.begin(); phrase!=m_extractedPhrasesContext.end(); phrase++) { - outextractFileContext<data(); - } - for(vector::const_iterator phrase=m_extractedPhrasesContextInv.begin(); phrase!=m_extractedPhrasesContextInv.end(); phrase++) { - outextractFileContextInv<data(); - } - - m_extractFile << outextractFile.str(); - m_extractFileInv << outextractFileInv.str(); m_extractFileOrientation << outextractFileOrientation.str(); - if (m_options.isFlexScoreFlag()) { - m_extractFileContext << outextractFileContext.str(); - m_extractFileContextInv << outextractFileContextInv.str(); - } } -// if proper conditioning, we need the number of times a source phrase occured - -void ExtractTask::extractBase( SentenceAlignment &sentence ) -{ - ostringstream outextractFile; - ostringstream outextractFileInv; - - int countF = sentence.source.size(); - for(int startF=0; startF Date: Mon, 6 Jan 2014 16:31:21 +0000 Subject: [PATCH 05/48] only output ordering extract --- phrase-extract/extract-ordering-main.cpp | 100 ++--------------------- 1 file changed, 7 insertions(+), 93 deletions(-) diff --git a/phrase-extract/extract-ordering-main.cpp b/phrase-extract/extract-ordering-main.cpp index 
f3d8a781a..67c78f687 100644 --- a/phrase-extract/extract-ordering-main.cpp +++ b/phrase-extract/extract-ordering-main.cpp @@ -97,7 +97,6 @@ public: {} void Run(); private: - vector< string > m_extractedPhrasesOri; void extract(SentenceAlignment &); void addPhrase(SentenceAlignment &, int, int, int, int, string &); void writePhrasesToFile(); @@ -292,8 +291,6 @@ namespace MosesTraining void ExtractTask::Run() { extract(m_sentence); - writePhrasesToFile(); - m_extractedPhrasesOri.clear(); } void ExtractTask::extract(SentenceAlignment &sentence) @@ -397,57 +394,7 @@ void ExtractTask::extract(SentenceAlignment &sentence) } } - if(buildExtraStructure) { // phrase || hier - string orientationInfo = ""; - REO_POS wordPrevOrient, wordNextOrient, phrasePrevOrient, phraseNextOrient, hierPrevOrient, hierNextOrient; - for(size_t i = 0; i < inboundPhrases.size(); i++) { - int startF = inboundPhrases[i].first.first; - int startE = inboundPhrases[i].first.second; - int endF = inboundPhrases[i].second.first; - int endE = inboundPhrases[i].second.second; - - bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 ); - bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 ); - bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 ); - bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 ); - - if(m_options.isWordModel()) { - wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(), - connectedLeftTopP, connectedRightTopP, - startF, endF, startE, endE, countF, 0, 1, - &ge, <); - wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(), - connectedLeftTopN, connectedRightTopN, - endF, startF, endE, startE, 0, countF, -1, - <, &ge); - } - if (m_options.isPhraseModel()) { - phrasePrevOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(), - connectedLeftTopP, connectedRightTopP, - startF, endF, startE, endE, countF-1, 0, 1, &ge, <, inBottomRight, inBottomLeft); - phraseNextOrient = getOrientPhraseModel(sentence, 
m_options.isPhraseType(), - connectedLeftTopN, connectedRightTopN, - endF, startF, endE, startE, 0, countF-1, -1, <, &ge, inBottomLeft, inBottomRight); - } else { - phrasePrevOrient = phraseNextOrient = UNKNOWN; - } - if(m_options.isHierModel()) { - hierPrevOrient = getOrientHierModel(sentence, m_options.isHierType(), - connectedLeftTopP, connectedRightTopP, - startF, endF, startE, endE, countF-1, 0, 1, &ge, <, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient); - hierNextOrient = getOrientHierModel(sentence, m_options.isHierType(), - connectedLeftTopN, connectedRightTopN, - endF, startF, endE, startE, 0, countF-1, -1, <, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient); - } - - orientationInfo = ((m_options.isWordModel())? getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType()) : "") + " | " + - ((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " + - ((m_options.isHierModel())? 
getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : ""); - - addPhrase(sentence, startE, endE, startF, endF, orientationInfo); - } - } } REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType, @@ -653,62 +600,29 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType) void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo) { - // source - // // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n"; - ostringstream outextractstrOrientation; - if (m_options.isOnlyOutputSpanInfo()) { cout << startF << " " << endF << " " << startE << " " << endE << endl; return; } - if (m_options.debug) { - outextractstrOrientation << "sentenceID=" << sentence.sentenceID << " "; - } + m_extractFileOrientation << sentence.sentenceID << " ||| "; for(int fi=startF; fi<=endF; fi++) { - if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " "; + m_extractFileOrientation << sentence.source[fi] << " "; } - if (m_options.isOrientationFlag()) outextractstrOrientation << "||| "; + m_extractFileOrientation << "||| "; // target for(int ei=startE; ei<=endE; ei++) { - if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " "; + m_extractFileOrientation << sentence.target[ei] << " "; } - if (m_options.isOrientationFlag()) outextractstrOrientation << "||| "; + m_extractFileOrientation << "||| "; - // source (for inverse) - - if (m_options.isOrientationFlag()) - outextractstrOrientation << orientationInfo; - - if (m_options.getInstanceWeightsFile().length()) { - if (m_options.isOrientationFlag()) { - outextractstrOrientation << " ||| " << sentence.weightString; - } - } - - - if (m_options.isOrientationFlag()) outextractstrOrientation << "\n"; - - - m_extractedPhrasesOri.push_back(outextractstrOrientation.str()); + 
m_extractFileOrientation << orientationInfo; + m_extractFileOrientation << endl; } -void ExtractTask::writePhrasesToFile() -{ - - ostringstream outextractFile; - ostringstream outextractFileInv; - ostringstream outextractFileOrientation; - - for(vector::const_iterator phrase=m_extractedPhrasesOri.begin(); phrase!=m_extractedPhrasesOri.end(); phrase++) { - outextractFileOrientation<data(); - } - m_extractFileOrientation << outextractFileOrientation.str(); -} - /** tokenise input string to vector of string. each element has been separated by a character in the delimiters argument. The separator can only be 1 character long. The default delimiters are space or tab */ From ac5d6676f223283d6549c5f45dc3bac5d17506f3 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 6 Jan 2014 17:04:10 +0000 Subject: [PATCH 06/48] ordering extract in same format as my own --- phrase-extract/extract-ordering-main.cpp | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/phrase-extract/extract-ordering-main.cpp b/phrase-extract/extract-ordering-main.cpp index 67c78f687..e808f127e 100644 --- a/phrase-extract/extract-ordering-main.cpp +++ b/phrase-extract/extract-ordering-main.cpp @@ -606,19 +606,36 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, } m_extractFileOrientation << sentence.sentenceID << " ||| "; + m_extractFileOrientation << orientationInfo << " ||| "; + // start + m_extractFileOrientation << " "; + for(int fi=0; fi "; + + m_extractFileOrientation << "||| "; + + // target + /* for(int ei=startE; ei<=endE; ei++) { m_extractFileOrientation << sentence.target[ei] << " "; } - m_extractFileOrientation << "||| "; - - m_extractFileOrientation << orientationInfo; + */ m_extractFileOrientation << endl; } From abe0155f818f2b76910fff5ea8aad6a10ebf2461 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 6 Jan 2014 17:21:39 +0000 Subject: [PATCH 07/48] ordering extract in same format as my own --- 
phrase-extract/PhraseExtractionOptions.h | 5 ++++- phrase-extract/extract-ordering-main.cpp | 19 ++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h index 574b9afc1..8c277b7c6 100644 --- a/phrase-extract/PhraseExtractionOptions.h +++ b/phrase-extract/PhraseExtractionOptions.h @@ -33,7 +33,9 @@ class PhraseExtractionOptions { public: - const int maxPhraseLength; + int maxPhraseLength; + int minPhraseLength; + private: bool allModelsOutputFlag; bool wordModel; @@ -56,6 +58,7 @@ public: PhraseExtractionOptions(const int initmaxPhraseLength): maxPhraseLength(initmaxPhraseLength), + minPhraseLength(3), allModelsOutputFlag(false), wordModel(false), wordType(REO_MSD), diff --git a/phrase-extract/extract-ordering-main.cpp b/phrase-extract/extract-ordering-main.cpp index e808f127e..930a8016f 100644 --- a/phrase-extract/extract-ordering-main.cpp +++ b/phrase-extract/extract-ordering-main.cpp @@ -598,6 +598,20 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType) return ""; } +int getClass(const std::string &str) +{ + size_t pos = str.find("swap"); + if (pos == str.npos) { + return 0; + } + else if (pos == 0) { + return 1; + } + else { + return 2; + } +} + void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo) { if (m_options.isOnlyOutputSpanInfo()) { @@ -606,7 +620,10 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, } m_extractFileOrientation << sentence.sentenceID << " ||| "; - m_extractFileOrientation << orientationInfo << " ||| "; + m_extractFileOrientation << getClass(orientationInfo) << " ||| "; + + // position + m_extractFileOrientation << startF << " " << endF << " ||| "; // start m_extractFileOrientation << " "; From 35faa887e879db7c0e05457370dec02cc28ca204 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 6 Jan 2014 17:34:04 +0000 
Subject: [PATCH 08/48] add support for --MinPhraseLength --- phrase-extract/extract-ordering-main.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/phrase-extract/extract-ordering-main.cpp b/phrase-extract/extract-ordering-main.cpp index 930a8016f..bff7fbb65 100644 --- a/phrase-extract/extract-ordering-main.cpp +++ b/phrase-extract/extract-ordering-main.cpp @@ -151,7 +151,9 @@ int main(int argc, char* argv[]) } options.initInstanceWeightsFile(argv[++i]); } else if (strcmp(argv[i], "--Debug") == 0) { - options.debug = true; + options.debug = true; + } else if (strcmp(argv[i], "--MinPhraseLength") == 0) { + options.minPhraseLength = atoi(argv[++i]); } else if(strcmp(argv[i],"--model") == 0) { if (i+1 >= argc) { cerr << "extract: syntax error, no model's information provided to the option --model " << endl; @@ -361,6 +363,7 @@ void ExtractTask::extract(SentenceAlignment &sentence) for(int endF=maxF; (endF m_options.minPhraseLength) && // within length limit (endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned endF++) { // at this point we have extracted a phrase if(buildExtraStructure) { // phrase || hier From 584af0d0152d8115d4ede495ef8eabca1f997aff Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 6 Jan 2014 18:03:38 +0000 Subject: [PATCH 09/48] add support for --MinPhraseLength --- phrase-extract/PhraseExtractionOptions.h | 2 ++ phrase-extract/extract-ordering-main.cpp | 16 +++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h index 8c277b7c6..87712d6d3 100644 --- a/phrase-extract/PhraseExtractionOptions.h +++ b/phrase-extract/PhraseExtractionOptions.h @@ -35,6 +35,7 @@ class PhraseExtractionOptions public: int maxPhraseLength; int minPhraseLength; + std::string separator; private: bool allModelsOutputFlag; @@ -59,6 +60,7 @@ public: PhraseExtractionOptions(const int initmaxPhraseLength): 
maxPhraseLength(initmaxPhraseLength), minPhraseLength(3), + separator("|||"), allModelsOutputFlag(false), wordModel(false), wordType(REO_MSD), diff --git a/phrase-extract/extract-ordering-main.cpp b/phrase-extract/extract-ordering-main.cpp index bff7fbb65..104457b01 100644 --- a/phrase-extract/extract-ordering-main.cpp +++ b/phrase-extract/extract-ordering-main.cpp @@ -154,6 +154,8 @@ int main(int argc, char* argv[]) options.debug = true; } else if (strcmp(argv[i], "--MinPhraseLength") == 0) { options.minPhraseLength = atoi(argv[++i]); + } else if (strcmp(argv[i], "--Separator") == 0) { + options.separator = argv[++i]; } else if(strcmp(argv[i],"--model") == 0) { if (i+1 >= argc) { cerr << "extract: syntax error, no model's information provided to the option --model " << endl; @@ -622,24 +624,26 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, return; } - m_extractFileOrientation << sentence.sentenceID << " ||| "; - m_extractFileOrientation << getClass(orientationInfo) << " ||| "; + const string &sep = m_options.separator; + + m_extractFileOrientation << sentence.sentenceID << " " << sep << " "; + m_extractFileOrientation << getClass(orientationInfo) << " " << sep << " "; // position - m_extractFileOrientation << startF << " " << endF << " ||| "; + m_extractFileOrientation << startF << " " << endF << " " << sep << " "; // start m_extractFileOrientation << " "; for(int fi=0; fi "; - m_extractFileOrientation << "||| "; - // target /* From 9128c4d3b49429b1e9397e9c19b62c8f2f2c4581 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 15 Jan 2014 10:30:31 +0000 Subject: [PATCH 10/48] Manual source labelling for de-en --- contrib/other-builds/manual-label/DeEn.cpp | 62 ++++++++++++++ contrib/other-builds/manual-label/DeEn.h | 10 +++ .../manual-label/manual-label.cpp | 84 +++++++++++++++++++ 3 files changed, 156 insertions(+) create mode 100644 contrib/other-builds/manual-label/DeEn.cpp create mode 100644 
contrib/other-builds/manual-label/DeEn.h create mode 100644 contrib/other-builds/manual-label/manual-label.cpp diff --git a/contrib/other-builds/manual-label/DeEn.cpp b/contrib/other-builds/manual-label/DeEn.cpp new file mode 100644 index 000000000..2c94356a8 --- /dev/null +++ b/contrib/other-builds/manual-label/DeEn.cpp @@ -0,0 +1,62 @@ +#include +#include "DeEn.h" +#include "moses/Util.h" + +using namespace std; + +extern bool g_debug; + +bool IsA(const Phrase &source, int pos, int offset, int factor, const string &str) +{ + pos += offset; + if (pos >= source.size() || pos < 0) { + return false; + } + + const string &word = source[pos][factor]; + vector soughts = Moses::Tokenize(str, " "); + for (int i = 0; i < soughts.size(); ++i) { + string &sought = soughts[i]; + bool found = (word == sought); + if (found) { + return true; + } + } + return false; +} + +bool Contains(const Phrase &source, int start, int end, int factor, const string &str) +{ + for (int pos = start; pos <= end; ++pos) { + bool found = IsA(source, pos, 0, factor, str); + if (found) { + return true; + } + } + return false; +} + +void LabelDeEn(const Phrase &source, ostream &out) +{ + list > ranges; + + for (int start = 0; start < source.size(); ++start) { + for (int end = start; end < source.size(); ++end) { + if (IsA(source, start, -1, 1, "VAFIN") + && IsA(source, end, +1, 1, "VVINF VVPP") + && !Contains(source, start, end, 1, "VAFIN VVINF VVPP VVFIN")) { + cerr << start << " " << end << endl; + //ranges.push_back(pair(start, end); + } + else if ((start == 0 || IsA(source, start, -1, 1, "$,")) + && IsA(source, end, +1, 0, "zu") + && IsA(source, end, +2, 1, "VVINF") + && !Contains(source, start, end, 1, "$,")) { + cerr << start << " " << end << endl; + //ranges.push_back(pair(start, end); + } + } + } + + +} diff --git a/contrib/other-builds/manual-label/DeEn.h b/contrib/other-builds/manual-label/DeEn.h new file mode 100644 index 000000000..999c2dfbd --- /dev/null +++ 
b/contrib/other-builds/manual-label/DeEn.h @@ -0,0 +1,10 @@ +#pragma once + +#include +#include +#include + +typedef std::vector Word; +typedef std::vector Phrase; + +void LabelDeEn(const Phrase &source, std::ostream &out); diff --git a/contrib/other-builds/manual-label/manual-label.cpp b/contrib/other-builds/manual-label/manual-label.cpp new file mode 100644 index 000000000..904eb9f69 --- /dev/null +++ b/contrib/other-builds/manual-label/manual-label.cpp @@ -0,0 +1,84 @@ +#include +#include +#include +#include "moses/Util.h" +#include "DeEn.h" + +using namespace std; + +bool g_debug = false; + +Phrase Tokenize(const string &line); + +int main(int argc, char** argv) +{ + cerr << "Starting" << endl; + + namespace po = boost::program_options; + po::options_description desc("Options"); + desc.add_options() + ("help", "Print help messages") + ("add", "additional options") + ("source-language,s", po::value()->required(), "Source Language") + ("target-language,t", po::value()->required(), "Target Language"); + + po::variables_map vm; + try + { + po::store(po::parse_command_line(argc, argv, desc), + vm); // can throw + + /** --help option + */ + if ( vm.count("help") ) + { + std::cout << "Basic Command Line Parameter App" << std::endl + << desc << std::endl; + return EXIT_SUCCESS; + } + + po::notify(vm); // throws on error, so do after help in case + // there are any problems + } + catch(po::error& e) + { + std::cerr << "ERROR: " << e.what() << std::endl << std::endl; + std::cerr << desc << std::endl; + return EXIT_FAILURE; + } + + string sourceLang = vm["source-language"].as(); + string targetLang = vm["target-language"].as(); + cerr << sourceLang << " " << targetLang << endl; + + string line; + size_t lineNum = 1; + + while (getline(cin, line)) { + cerr << lineNum << ":" << line << endl; + Phrase source = Tokenize(line); + + LabelDeEn(source, cout); + + ++lineNum; + } + + + + cerr << "Finished" << endl; + return EXIT_SUCCESS; +} + +Phrase Tokenize(const string &line) +{ 
+ Phrase ret; + + vector toks = Moses::Tokenize(line); + for (size_t i = 0; i < toks.size(); ++i) { + Word word = Moses::Tokenize(toks[i], "|"); + ret.push_back(word); + } + + return ret; +} + From 8249d259f42ea6b45c91fcf8452b945103851412 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 15 Jan 2014 17:43:16 +0000 Subject: [PATCH 11/48] Manual source labelling for de-en --- contrib/other-builds/manual-label/DeEn.cpp | 34 ++++++++++++++++--- .../manual-label/manual-label.cpp | 6 +++- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/contrib/other-builds/manual-label/DeEn.cpp b/contrib/other-builds/manual-label/DeEn.cpp index 2c94356a8..e8cba272f 100644 --- a/contrib/other-builds/manual-label/DeEn.cpp +++ b/contrib/other-builds/manual-label/DeEn.cpp @@ -38,25 +38,49 @@ bool Contains(const Phrase &source, int start, int end, int factor, const string void LabelDeEn(const Phrase &source, ostream &out) { - list > ranges; + typedef pair Range; + typedef list Ranges; + Ranges ranges; + // find ranges to label for (int start = 0; start < source.size(); ++start) { for (int end = start; end < source.size(); ++end) { if (IsA(source, start, -1, 1, "VAFIN") && IsA(source, end, +1, 1, "VVINF VVPP") && !Contains(source, start, end, 1, "VAFIN VVINF VVPP VVFIN")) { - cerr << start << " " << end << endl; - //ranges.push_back(pair(start, end); + Range range(start, end); + ranges.push_back(range); } else if ((start == 0 || IsA(source, start, -1, 1, "$,")) && IsA(source, end, +1, 0, "zu") && IsA(source, end, +2, 1, "VVINF") && !Contains(source, start, end, 1, "$,")) { - cerr << start << " " << end << endl; - //ranges.push_back(pair(start, end); + Range range(start, end); + ranges.push_back(range); } } } + // output sentence, with labels + for (int pos = 0; pos < source.size(); ++pos) { + // output beginning of label + for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) { + const Range &range = *iter; + if (range.first == pos) { + out << " "; 
+ } + } + + const Word &word = source[pos]; + out << word[0] << " "; + + for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) { + const Range &range = *iter; + if (range.second == pos) { + out << " "; + } + } + } + out << endl; } diff --git a/contrib/other-builds/manual-label/manual-label.cpp b/contrib/other-builds/manual-label/manual-label.cpp index 904eb9f69..4500d2c84 100644 --- a/contrib/other-builds/manual-label/manual-label.cpp +++ b/contrib/other-builds/manual-label/manual-label.cpp @@ -55,7 +55,11 @@ int main(int argc, char** argv) size_t lineNum = 1; while (getline(cin, line)) { - cerr << lineNum << ":" << line << endl; + //cerr << lineNum << ":" << line << endl; + if (lineNum % 1000 == 0) { + cerr << lineNum << " "; + } + Phrase source = Tokenize(line); LabelDeEn(source, cout); From 54c65f751884a6d92fed92d6f213c9ff50d4d926 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 16 Jan 2014 17:16:39 +0000 Subject: [PATCH 12/48] constraint number of target symbols, for fairer inverse probability --- .../extract-mixed-syntax/.cproject | 125 ++++ .../extract-mixed-syntax/.project | 27 + .../extract-mixed-syntax/Global.cpp | 35 ++ .../extract-mixed-syntax/Global.h | 44 ++ .../extract-mixed-syntax/InputFileStream.cpp | 62 ++ .../extract-mixed-syntax/InputFileStream.h | 48 ++ .../extract-mixed-syntax/Lattice.cpp | 180 ++++++ .../extract-mixed-syntax/Lattice.h | 47 ++ .../extract-mixed-syntax/LatticeNode.cpp | 149 +++++ .../extract-mixed-syntax/LatticeNode.h | 77 +++ .../extract-mixed-syntax/Makefile | 12 + .../extract-mixed-syntax/Range.cpp | 74 +++ .../other-builds/extract-mixed-syntax/Range.h | 57 ++ .../extract-mixed-syntax/Rule.cpp | 553 ++++++++++++++++++ .../other-builds/extract-mixed-syntax/Rule.h | 94 +++ .../extract-mixed-syntax/RuleCollection.cpp | 94 +++ .../extract-mixed-syntax/RuleCollection.h | 53 ++ .../SentenceAlignment.cpp | 331 +++++++++++ .../extract-mixed-syntax/SentenceAlignment.h | 69 +++ 
.../extract-mixed-syntax/Symbol.cpp | 101 ++++ .../extract-mixed-syntax/Symbol.h | 36 ++ .../extract-mixed-syntax/SymbolSequence.cpp | 56 ++ .../extract-mixed-syntax/SymbolSequence.h | 42 ++ .../extract-mixed-syntax/SyntaxTree.cpp | 245 ++++++++ .../extract-mixed-syntax/SyntaxTree.h | 96 +++ .../extract-mixed-syntax/Tunnel.cpp | 38 ++ .../extract-mixed-syntax/Tunnel.h | 49 ++ .../extract-mixed-syntax/TunnelCollection.cpp | 70 +++ .../extract-mixed-syntax/TunnelCollection.h | 61 ++ .../extract-mixed-syntax/XmlTree.cpp | 344 +++++++++++ .../extract-mixed-syntax/XmlTree.h | 35 ++ .../extract-mixed-syntax/extract.cpp | 242 ++++++++ .../extract-mixed-syntax/extract.h | 39 ++ .../extract-mixed-syntax/gzfilebuf.h | 81 +++ .../extract-mixed-syntax/tables-core.cpp | 110 ++++ .../extract-mixed-syntax/tables-core.h | 72 +++ 36 files changed, 3848 insertions(+) create mode 100644 contrib/other-builds/extract-mixed-syntax/.cproject create mode 100644 contrib/other-builds/extract-mixed-syntax/.project create mode 100644 contrib/other-builds/extract-mixed-syntax/Global.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/Global.h create mode 100644 contrib/other-builds/extract-mixed-syntax/InputFileStream.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/InputFileStream.h create mode 100644 contrib/other-builds/extract-mixed-syntax/Lattice.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/Lattice.h create mode 100644 contrib/other-builds/extract-mixed-syntax/LatticeNode.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/LatticeNode.h create mode 100644 contrib/other-builds/extract-mixed-syntax/Makefile create mode 100644 contrib/other-builds/extract-mixed-syntax/Range.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/Range.h create mode 100644 contrib/other-builds/extract-mixed-syntax/Rule.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/Rule.h create mode 100644 
contrib/other-builds/extract-mixed-syntax/RuleCollection.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/RuleCollection.h create mode 100644 contrib/other-builds/extract-mixed-syntax/SentenceAlignment.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/SentenceAlignment.h create mode 100644 contrib/other-builds/extract-mixed-syntax/Symbol.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/Symbol.h create mode 100644 contrib/other-builds/extract-mixed-syntax/SymbolSequence.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/SymbolSequence.h create mode 100644 contrib/other-builds/extract-mixed-syntax/SyntaxTree.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/SyntaxTree.h create mode 100644 contrib/other-builds/extract-mixed-syntax/Tunnel.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/Tunnel.h create mode 100644 contrib/other-builds/extract-mixed-syntax/TunnelCollection.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/TunnelCollection.h create mode 100644 contrib/other-builds/extract-mixed-syntax/XmlTree.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/XmlTree.h create mode 100644 contrib/other-builds/extract-mixed-syntax/extract.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/extract.h create mode 100644 contrib/other-builds/extract-mixed-syntax/gzfilebuf.h create mode 100644 contrib/other-builds/extract-mixed-syntax/tables-core.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/tables-core.h diff --git a/contrib/other-builds/extract-mixed-syntax/.cproject b/contrib/other-builds/extract-mixed-syntax/.cproject new file mode 100644 index 000000000..345f4d2f5 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/.cproject @@ -0,0 +1,125 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/other-builds/extract-mixed-syntax/.project b/contrib/other-builds/extract-mixed-syntax/.project new file mode 100644 index 000000000..8f0f81f07 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/.project @@ -0,0 +1,27 @@ + + + extract-mixed-syntax + + + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + full,incremental, + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.core.ccnature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + diff --git a/contrib/other-builds/extract-mixed-syntax/Global.cpp b/contrib/other-builds/extract-mixed-syntax/Global.cpp new file mode 100644 index 000000000..5a851e2f2 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/Global.cpp @@ -0,0 +1,35 @@ +/* + * Global.cpp + * extract + * + * Created by Hieu Hoang on 01/02/2010. + * Copyright 2010 __MyCompanyName__. All rights reserved. 
+ * + */ + +#include "Global.h" + +bool g_debug = false; + +Global::Global() +: minHoleSpanSourceDefault(2) +, maxHoleSpanSourceDefault(7) +, minHoleSpanSourceSyntax(1) +, maxHoleSpanSourceSyntax(1000) +, maxUnaligned(5) + +, maxSymbolsSource(5) +, maxNonTerm(3) +, maxNonTermDefault(2) + +// int minHoleSize(1) +// int minSubPhraseSize(1) // minimum size of a remaining lexical phrase +, glueGrammarFlag(false) +, unknownWordLabelFlag(false) +//bool zipFiles(false) +, sourceSyntax(true) +, targetSyntax(false) +, mixed(true) +, uppermostOnly(true) +, allowDefaultNonTermEdge(true) +{} diff --git a/contrib/other-builds/extract-mixed-syntax/Global.h b/contrib/other-builds/extract-mixed-syntax/Global.h new file mode 100644 index 000000000..71da54695 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/Global.h @@ -0,0 +1,44 @@ +#pragma once +/* + * Global.h + * extract + * + * Created by Hieu Hoang on 01/02/2010. + * Copyright 2010 __MyCompanyName__. All rights reserved. + * + */ +#include +#include +#include + +class Global +{ +public: + int minHoleSpanSourceDefault; + int maxHoleSpanSourceDefault; + int minHoleSpanSourceSyntax; + int maxHoleSpanSourceSyntax; + + int maxSymbolsSource; + bool glueGrammarFlag; + bool unknownWordLabelFlag; + int maxNonTerm; + int maxNonTermDefault; + bool sourceSyntax; + bool targetSyntax; + bool mixed; + int maxUnaligned; + bool uppermostOnly; + bool allowDefaultNonTermEdge; + + Global(); + + Global(const Global&); + +}; + +extern bool g_debug; + +#define DEBUG_OUTPUT() void DebugOutput() const; + + diff --git a/contrib/other-builds/extract-mixed-syntax/InputFileStream.cpp b/contrib/other-builds/extract-mixed-syntax/InputFileStream.cpp new file mode 100644 index 000000000..b52d1f920 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/InputFileStream.cpp @@ -0,0 +1,62 @@ +// $Id: InputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $ + +/*********************************************************************** + Moses - 
factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include "InputFileStream.h" +#include "gzfilebuf.h" +#include + +using namespace std; + +namespace Moses +{ + InputFileStream::InputFileStream(const std::string &filePath) + : std::istream(NULL) + , m_streambuf(NULL) + { + if (filePath.size() > 3 && + filePath.substr(filePath.size() - 3, 3) == ".gz") + { + m_streambuf = new gzfilebuf(filePath.c_str()); + } else { + std::filebuf* fb = new std::filebuf(); + fb = fb->open(filePath.c_str(), std::ios::in); + if (! 
fb) { + cerr << "Can't read " << filePath.c_str() << endl; + exit(1); + } + m_streambuf = fb; + } + this->init(m_streambuf); + } + + InputFileStream::~InputFileStream() + { + delete m_streambuf; + m_streambuf = NULL; + } + + void InputFileStream::Close() + { + } + + +} + diff --git a/contrib/other-builds/extract-mixed-syntax/InputFileStream.h b/contrib/other-builds/extract-mixed-syntax/InputFileStream.h new file mode 100644 index 000000000..f10ec2164 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/InputFileStream.h @@ -0,0 +1,48 @@ +// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#ifndef moses_InputFileStream_h +#define moses_InputFileStream_h + +#include +#include +#include + +namespace Moses +{ + + /** Used in place of std::istream, can read zipped files if it ends in .gz + */ + class InputFileStream : public std::istream + { + protected: + std::streambuf *m_streambuf; + public: + + InputFileStream(const std::string &filePath); + ~InputFileStream(); + + void Close(); + }; + +} + +#endif diff --git a/contrib/other-builds/extract-mixed-syntax/Lattice.cpp b/contrib/other-builds/extract-mixed-syntax/Lattice.cpp new file mode 100644 index 000000000..8637fad8e --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/Lattice.cpp @@ -0,0 +1,180 @@ +/* + * Lattice.cpp + * extract + * + * Created by Hieu Hoang on 18/07/2010. + * Copyright 2010 __MyCompanyName__. All rights reserved. 
+ * + */ + +#include +#include "Lattice.h" +#include "LatticeNode.h" +#include "Tunnel.h" +#include "TunnelCollection.h" +#include "SyntaxTree.h" +#include "SentenceAlignment.h" +#include "tables-core.h" +#include "Rule.h" +#include "RuleCollection.h" + +using namespace std; + +Lattice::Lattice(size_t sourceSize) +:m_stacks(sourceSize + 1) +{ +} + +Lattice::~Lattice() +{ + std::vector::iterator iterStack; + for (iterStack = m_stacks.begin(); iterStack != m_stacks.end(); ++iterStack) + { + Stack &stack = *iterStack; + RemoveAllInColl(stack); + } +} + +void Lattice::CreateArcs(size_t startPos, const TunnelCollection &tunnelColl, const SentenceAlignment &sentence, const Global &global) +{ + // term + Stack &startStack = GetStack(startPos); + + LatticeNode *node = new LatticeNode(startPos, &sentence); + startStack.push_back(node); + + // non-term + for (size_t endPos = startPos + 1; endPos <= sentence.source.size(); ++endPos) + { + const TunnelList &tunnels = tunnelColl.GetTunnels(startPos, endPos - 1); + + TunnelList::const_iterator iterHole; + for (iterHole = tunnels.begin(); iterHole != tunnels.end(); ++iterHole) + { + const Tunnel &tunnel = *iterHole; + CreateArcsUsing1Hole(tunnel, sentence, global); + } + } +} + +void Lattice::CreateArcsUsing1Hole(const Tunnel &tunnel, const SentenceAlignment &sentence, const Global &global) +{ + size_t startPos = tunnel.GetRange(0).GetStartPos() + , endPos = tunnel.GetRange(0).GetEndPos(); + size_t numSymbols = tunnel.GetRange(0).GetWidth(); + assert(numSymbols > 0); + + Stack &startStack = GetStack(startPos); + + + // non-terms. 
cartesian product of source & target labels + assert(startPos == tunnel.GetRange(0).GetStartPos() && endPos == tunnel.GetRange(0).GetEndPos()); + size_t startT = tunnel.GetRange(1).GetStartPos() + ,endT = tunnel.GetRange(1).GetEndPos(); + + const SyntaxNodes &nodesS = sentence.sourceTree.GetNodes(startPos, endPos); + const SyntaxNodes &nodesT = sentence.targetTree.GetNodes(startT, endT ); + + SyntaxNodes::const_iterator iterS, iterT; + for (iterS = nodesS.begin(); iterS != nodesS.end(); ++iterS) + { + const SyntaxNode *syntaxNodeS = *iterS; + + for (iterT = nodesT.begin(); iterT != nodesT.end(); ++iterT) + { + const SyntaxNode *syntaxNodeT = *iterT; + + bool isSyntax = syntaxNodeS->IsSyntax() || syntaxNodeT->IsSyntax(); + size_t maxSourceNonTermSpan = isSyntax ? global.maxHoleSpanSourceSyntax : global.maxHoleSpanSourceDefault; + + if (maxSourceNonTermSpan >= endPos - startPos) + { + LatticeNode *node = new LatticeNode(tunnel, syntaxNodeS, syntaxNodeT); + startStack.push_back(node); + } + } + } +} + +Stack &Lattice::GetStack(size_t startPos) +{ + assert(startPos < m_stacks.size()); + return m_stacks[startPos]; +} + +const Stack &Lattice::GetStack(size_t startPos) const +{ + assert(startPos < m_stacks.size()); + return m_stacks[startPos]; +} + +void Lattice::CreateRules(size_t startPos, const SentenceAlignment &sentence, const Global &global) +{ + const Stack &startStack = GetStack(startPos); + + Stack::const_iterator iterStack; + for (iterStack = startStack.begin(); iterStack != startStack.end(); ++iterStack) + { + const LatticeNode *node = *iterStack; + Rule *initRule = new Rule(node); + + if (initRule->CanRecurse(global, sentence.GetTunnelCollection())) + { // may or maynot be valid, but can continue to build on this rule + initRule->CreateRules(m_rules, *this, sentence, global); + } + + if (initRule->IsValid(global, sentence.GetTunnelCollection())) + { // add to rule collection + m_rules.Add(global, initRule, sentence); + } + else + { + delete initRule; + } + + + 
} +} + +Stack Lattice::GetNonTermNode(const Range &sourceRange) const +{ + Stack ret; + size_t sourcePos = sourceRange.GetStartPos(); + + const Stack &origStack = GetStack(sourcePos); + Stack::const_iterator iter; + for (iter = origStack.begin(); iter != origStack.end(); ++iter) + { + LatticeNode *node = *iter; + const Range &nodeRangeS = node->GetSourceRange(); + + assert(nodeRangeS.GetStartPos() == sourceRange.GetStartPos()); + + if (! node->IsTerminal() && nodeRangeS.GetEndPos() == sourceRange.GetEndPos()) + { + ret.push_back(node); + } + } + + return ret; +} + +std::ostream& operator<<(std::ostream &out, const Lattice &obj) +{ + std::vector::const_iterator iter; + for (iter = obj.m_stacks.begin(); iter != obj.m_stacks.end(); ++iter) + { + const Stack &stack = *iter; + + Stack::const_iterator iterStack; + for (iterStack = stack.begin(); iterStack != stack.end(); ++iterStack) + { + const LatticeNode &node = **iterStack; + out << node << " "; + } + } + + return out; +} + + diff --git a/contrib/other-builds/extract-mixed-syntax/Lattice.h b/contrib/other-builds/extract-mixed-syntax/Lattice.h new file mode 100644 index 000000000..c88aa0844 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/Lattice.h @@ -0,0 +1,47 @@ +#pragma once +/* + * Lattice.h + * extract + * + * Created by Hieu Hoang on 18/07/2010. + * Copyright 2010 __MyCompanyName__. All rights reserved. 
+ *
+ */
+#include <iostream>
+#include <vector>
+#include "RuleCollection.h"
+
+class Global;
+class LatticeNode;
+class Tunnel;
+class TunnelCollection;
+class SentenceAlignment;
+
+// All lattice nodes whose source span starts at the same source position.
+// Element type restored from usage in Lattice.cpp (`LatticeNode *node = *iter`).
+typedef std::vector<LatticeNode*> Stack;
+
+// Lattice over the source sentence: one Stack of nodes per start position,
+// plus the collection of rules extracted from those nodes.
+class Lattice
+{
+	// Prints every node of every stack, separated by spaces.
+	friend std::ostream& operator<<(std::ostream&, const Lattice&);
+
+	// Indexed by source start position; the constructor sizes it to sourceSize + 1.
+	std::vector<Stack> m_stacks;
+	// Rules accumulated by CreateRules().
+	RuleCollection m_rules;
+
+	// Mutable access to the stack at startPos (asserts startPos is in range).
+	Stack &GetStack(size_t startPos);
+
+	// Adds one non-terminal node per (source label, target label) pair of the
+	// tunnel, subject to the maximum hole span configured in global.
+	void CreateArcsUsing1Hole(const Tunnel &tunnel, const SentenceAlignment &sentence, const Global &global);
+
+public:
+	// sourceSize: number of source words in the sentence.
+	Lattice(size_t sourceSize);
+	// Passes every stack to RemoveAllInColl().
+	// NOTE(review): presumably this deletes the nodes, i.e. the lattice owns
+	// them -- confirm; if so the class should not be copied.
+	~Lattice();
+
+	// Adds the terminal node at startPos and the non-terminal nodes for every
+	// tunnel whose source span starts at startPos.
+	void CreateArcs(size_t startPos, const TunnelCollection &tunnelColl, const SentenceAlignment &sentence, const Global &global);
+	// Builds rules that begin with each node stored at startPos; valid rules
+	// are added to the rule collection.
+	void CreateRules(size_t startPos, const SentenceAlignment &sentence, const Global &global);
+
+	// Read-only access to the stack at startPos (asserts startPos is in range).
+	const Stack &GetStack(size_t startPos) const;
+	const RuleCollection &GetRules() const
+	{ return m_rules; }
+
+	// Returns the non-terminal nodes whose source range equals sourceRange.
+	Stack GetNonTermNode(const Range &sourceRange) const;
+
+};
+
diff --git a/contrib/other-builds/extract-mixed-syntax/LatticeNode.cpp b/contrib/other-builds/extract-mixed-syntax/LatticeNode.cpp
new file mode 100644
index 000000000..8f0cbfc0f
--- /dev/null
+++ b/contrib/other-builds/extract-mixed-syntax/LatticeNode.cpp
@@ -0,0 +1,149 @@
+/*
+ * LatticeNode.cpp
+ * extract
+ *
+ * Created by Hieu Hoang on 18/07/2010.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ * + */ +#include +#include "LatticeNode.h" +#include "SyntaxTree.h" +#include "Tunnel.h" +#include "SentenceAlignment.h" +#include "SymbolSequence.h" + +size_t LatticeNode::s_count = 0; + +using namespace std; + +// for terms +LatticeNode::LatticeNode(size_t pos, const SentenceAlignment *sentence) +:m_tunnel(NULL) +,m_isTerminal(true) +,m_sourceTreeNode(NULL) +,m_targetTreeNode(NULL) +,m_sentence(sentence) +,m_sourceRange(pos, pos) +{ + s_count++; + //cerr << *this << endl; +} + +// for non-terms +LatticeNode::LatticeNode(const Tunnel &tunnel, const SyntaxNode *sourceTreeNode, const SyntaxNode *targetTreeNode) +:m_tunnel(&tunnel) +,m_isTerminal(false) +,m_sourceTreeNode(sourceTreeNode) +,m_targetTreeNode(targetTreeNode) +,m_sentence(NULL) +,m_sourceRange(tunnel.GetRange(0)) +{ + s_count++; + //cerr << *this << endl; +} + +bool LatticeNode::IsSyntax() const +{ + assert(!m_isTerminal); + bool ret = m_sourceTreeNode->IsSyntax() || m_targetTreeNode->IsSyntax(); + return ret; +} + +size_t LatticeNode::GetNumSymbols(size_t direction) const +{ + return 1; +} + +int LatticeNode::Compare(const LatticeNode &otherNode) const +{ + int ret = 0; + if (m_isTerminal != otherNode.m_isTerminal) + { + ret = m_isTerminal ? -1 : 1; + } + + // both term or non-term + else if (m_isTerminal) + { // term. compare source span + if (m_sourceRange.GetStartPos() == otherNode.m_sourceRange.GetStartPos()) + ret = 0; + else + ret = (m_sourceRange.GetStartPos() < otherNode.m_sourceRange.GetStartPos()) ? -1 : +1; + } + else + { // non-term. compare source span and BOTH label + assert(!m_isTerminal); + assert(!otherNode.m_isTerminal); + + if (m_sourceTreeNode->IsSyntax()) + { + ret = m_tunnel->Compare(*otherNode.m_tunnel, 0); + if (ret == 0 && m_sourceTreeNode->GetLabel() != otherNode.m_sourceTreeNode->GetLabel()) + { + ret = (m_sourceTreeNode->GetLabel() < otherNode.m_sourceTreeNode->GetLabel()) ? 
-1 : +1; + } + } + + if (ret == 0 && m_targetTreeNode->IsSyntax()) + { + ret = m_tunnel->Compare(*otherNode.m_tunnel, 1); + if (ret == 0 && m_targetTreeNode->GetLabel() != otherNode.m_targetTreeNode->GetLabel()) + { + ret = (m_targetTreeNode->GetLabel() < otherNode.m_targetTreeNode->GetLabel()) ? -1 : +1; + } + } + } + + return ret; +} + +void LatticeNode::CreateSymbols(size_t direction, SymbolSequence &symbols) const +{ + if (m_isTerminal) + { + /* + const std::vector &words = (direction == 0 ? m_sentence->source : m_sentence->target); + size_t startPos = m_tunnel.GetStart(direction) + ,endPos = m_tunnel.GetEnd(direction); + + for (size_t pos = startPos; pos <= endPos; ++pos) + { + Symbol symbol(words[pos], pos); + symbols.Add(symbol); + } + */ + } + else + { // output both + + Symbol symbol(m_sourceTreeNode->GetLabel(), m_targetTreeNode->GetLabel() + , m_tunnel->GetRange(0).GetStartPos(), m_tunnel->GetRange(0).GetEndPos() + , m_tunnel->GetRange(1).GetStartPos(), m_tunnel->GetRange(1).GetEndPos() + , m_sourceTreeNode->IsSyntax(), m_targetTreeNode->IsSyntax()); + + symbols.Add(symbol); + } + +} + +std::ostream& operator<<(std::ostream &out, const LatticeNode &obj) +{ + if (obj.m_isTerminal) + { + assert(obj.m_sourceRange.GetWidth() == 1); + size_t pos = obj.m_sourceRange.GetStartPos(); + + const SentenceAlignment &sentence = *obj.m_sentence; + out << obj.m_sourceRange << "=" << sentence.source[pos]; + } + else + { + assert(obj.m_tunnel); + out << obj.GetTunnel() << "=" << obj.m_sourceTreeNode->GetLabel() << obj.m_targetTreeNode->GetLabel() << " "; + } + + return out; +} + + diff --git a/contrib/other-builds/extract-mixed-syntax/LatticeNode.h b/contrib/other-builds/extract-mixed-syntax/LatticeNode.h new file mode 100644 index 000000000..73ea6a224 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/LatticeNode.h @@ -0,0 +1,77 @@ +#pragma once +/* + * LatticeNode.h + * extract + * + * Created by Hieu Hoang on 18/07/2010. + * Copyright 2010 __MyCompanyName__. 
All rights reserved. + * + */ +#include +#include +#include +#include "Range.h" + +class Tunnel; +class SyntaxNode; +class SentenceAlignment; +class SymbolSequence; + +class LatticeNode +{ + friend std::ostream& operator<<(std::ostream&, const LatticeNode&); + + bool m_isTerminal; + + // for terms & non-term + Range m_sourceRange; + + // non-terms. source range should be same as m_sourceRange + const Tunnel *m_tunnel; + +public: + static size_t s_count; + + + + const SyntaxNode *m_sourceTreeNode, *m_targetTreeNode; + const SentenceAlignment *m_sentence; + + // for terms + LatticeNode(size_t pos, const SentenceAlignment *sentence); + + // for non-terms + LatticeNode(const Tunnel &tunnel, const SyntaxNode *sourceTreeNode, const SyntaxNode *targetTreeNode); + + bool IsTerminal() const + { return m_isTerminal; } + + bool IsSyntax() const; + + size_t GetNumSymbols(size_t direction) const; + + std::string ToString() const; + + int Compare(const LatticeNode &otherNode) const; + + void CreateSymbols(size_t direction, SymbolSequence &symbols) const; + + const Tunnel &GetTunnel() const + { + assert(m_tunnel); + return *m_tunnel; + } + + const Range &GetSourceRange() const + { + return m_sourceRange; + } + const SyntaxNode &GetSyntaxNode(size_t direction) const + { + const SyntaxNode *node = direction == 0 ? 
m_sourceTreeNode : m_targetTreeNode; + assert(node); + return *node; + } + +}; + diff --git a/contrib/other-builds/extract-mixed-syntax/Makefile b/contrib/other-builds/extract-mixed-syntax/Makefile new file mode 100644 index 000000000..910c759ea --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/Makefile @@ -0,0 +1,12 @@ +all: extract + +clean: + rm -f *.o extract-mixed-syntax + +.cpp.o: + g++ -O6 -g -c $< + +extract: tables-core.o extract.o SyntaxTree.o XmlTree.o Tunnel.o Lattice.o LatticeNode.o SentenceAlignment.o Global.o InputFileStream.o TunnelCollection.o RuleCollection.o Rule.o Symbol.o SymbolSequence.o Range.o + + g++ tables-core.o extract.o SyntaxTree.o XmlTree.o Tunnel.o Lattice.o LatticeNode.o SentenceAlignment.o Global.o InputFileStream.o TunnelCollection.o RuleCollection.o Rule.o Symbol.o SymbolSequence.o Range.o -lz -o extract-mixed-syntax + diff --git a/contrib/other-builds/extract-mixed-syntax/Range.cpp b/contrib/other-builds/extract-mixed-syntax/Range.cpp new file mode 100644 index 000000000..a98ac278b --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/Range.cpp @@ -0,0 +1,74 @@ +/* + * Range.cpp + * extract + * + * Created by Hieu Hoang on 22/02/2011. + * Copyright 2011 __MyCompanyName__. All rights reserved. 
+ *
+ */
+
+#include "Range.h"
+
+using namespace std;
+
+// Sets this range to the smallest range covering both a and b.
+// A position equal to NOT_FOUND counts as "unset": the other operand's value
+// is taken as-is (so the result stays NOT_FOUND when both are unset).
+void Range::Merge(const Range &a, const Range &b)
+{
+	if (a.m_startPos == NOT_FOUND)
+	{ // a has no start: take b's, whatever it is
+		m_startPos = b.m_startPos;
+	}
+	else if (b.m_startPos == NOT_FOUND)
+	{ // only a has a start
+		m_startPos = a.m_startPos;
+	}
+	else
+	{
+		m_startPos = min(a.m_startPos, b.m_startPos);
+	}
+
+	if (a.m_endPos == NOT_FOUND)
+	{ // a has no end: take b's, whatever it is
+		m_endPos = b.m_endPos;
+	}
+	else if (b.m_endPos == NOT_FOUND)
+	{ // only a has an end
+		m_endPos = a.m_endPos;
+	}
+	else
+	{
+		m_endPos = max(a.m_endPos, b.m_endPos);
+	}
+
+
+}
+
+// Three-way comparison: orders by start position, then by end position.
+// Returns -1, 0 or +1.
+int Range::Compare(const Range &other) const
+{
+	if (m_startPos < other.m_startPos)
+		return -1;
+	else if (m_startPos > other.m_startPos)
+		return +1;
+	else if (m_endPos < other.m_endPos)
+		return -1;
+	else if (m_endPos > other.m_endPos)
+		return +1;
+
+	return 0;
+
+}
+
+// True when the two ranges share at least one position; both end points are
+// treated as inclusive.
+bool Range::Overlap(const Range &other) const
+{
+	if ( other.m_endPos < m_startPos || other.m_startPos > m_endPos)
+		return false;
+
+	return true;
+}
+
+// Writes the range as "[start-end]".
+std::ostream& operator<<(std::ostream &out, const Range &range)
+{
+	out << "[" << range.m_startPos << "-" << range.m_endPos << "]";
+	return out;
+}
+
+
diff --git a/contrib/other-builds/extract-mixed-syntax/Range.h b/contrib/other-builds/extract-mixed-syntax/Range.h
new file mode 100644
index 000000000..05d0c97c9
--- /dev/null
+++ b/contrib/other-builds/extract-mixed-syntax/Range.h
@@ -0,0 +1,57 @@
+/*
+ * Range.h
+ * extract
+ *
+ * Created by Hieu Hoang on 22/02/2011.
+ * Copyright 2011 __MyCompanyName__. All rights reserved.
+ *
+ */
+#pragma once
+#include <string>
+#include <iostream>
+#include <limits>
+
+// Sentinel for an unset position: the largest size_t value.
+#define NOT_FOUND std::numeric_limits<size_t>::max()
+
+// A span of positions [m_startPos, m_endPos]; both ends are inclusive
+// (see GetWidth). Either end may be NOT_FOUND, meaning "unset".
+class Range
+{
+	// Writes the range as "[start-end]".
+	friend std::ostream& operator<<(std::ostream&, const Range&);
+
+	size_t m_startPos, m_endPos;
+public:
+
+	// Creates a range with both ends unset.
+	Range()
+	:m_startPos(NOT_FOUND)
+	,m_endPos(NOT_FOUND)
+	{}
+
+	// Member-wise copy.
+	Range(const Range &copy)
+	:m_startPos(copy.m_startPos)
+	,m_endPos(copy.m_endPos)
+	{}
+
+	// Creates the range [startPos, endPos].
+	Range(size_t startPos, size_t endPos)
+	:m_startPos(startPos)
+	,m_endPos(endPos)
+	{}
+
+	size_t GetStartPos() const
+	{ return m_startPos; }
+	size_t GetEndPos() const
+	{ return m_endPos; }
+	// Number of positions covered, counting both ends.
+	// Note: unsigned arithmetic, so a default-constructed range (both ends
+	// NOT_FOUND) reports a width of 1; check for NOT_FOUND before relying on it.
+	size_t GetWidth() const
+	{ return m_endPos - m_startPos + 1; }
+
+	void SetStartPos(size_t startPos)
+	{ m_startPos = startPos; }
+	void SetEndPos(size_t endPos)
+	{ m_endPos = endPos; }
+
+	// Sets this range to the smallest range covering a and b; an end that is
+	// NOT_FOUND in one operand is taken from the other.
+	void Merge(const Range &a, const Range &b);
+
+	// Three-way comparison by start position, then end position (-1, 0, +1).
+	int Compare(const Range &other) const;
+
+	// True when the two ranges share at least one position.
+	bool Overlap(const Range &other) const;
+
+
+};
diff --git a/contrib/other-builds/extract-mixed-syntax/Rule.cpp b/contrib/other-builds/extract-mixed-syntax/Rule.cpp
new file mode 100644
index 000000000..38d197934
--- /dev/null
+++ b/contrib/other-builds/extract-mixed-syntax/Rule.cpp
@@ -0,0 +1,553 @@
+/*
+ * Rule.cpp
+ * extract
+ *
+ * Created by Hieu Hoang on 19/07/2010.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ * + */ +#include +#include +#include "Rule.h" +#include "Global.h" +#include "LatticeNode.h" +#include "Lattice.h" +#include "SentenceAlignment.h" +#include "Tunnel.h" +#include "TunnelCollection.h" +#include "RuleCollection.h" + +using namespace std; + +RuleElement::RuleElement(const RuleElement ©) +:m_latticeNode(copy.m_latticeNode) +,m_alignmentPos(copy.m_alignmentPos) +{ +} + + +Rule::Rule(const LatticeNode *latticeNode) +:m_lhs(NULL) +{ + RuleElement element(*latticeNode); + + m_coll.push_back(element); +} + +Rule::Rule(const Rule &prevRule, const LatticeNode *latticeNode) +:m_coll(prevRule.m_coll) +,m_lhs(NULL) +{ + RuleElement element(*latticeNode); + m_coll.push_back(element); +} + +Rule::Rule(const Global &global, bool &isValid, const Rule ©, const LatticeNode *lhs, const SentenceAlignment &sentence) +:m_coll(copy.m_coll) +,m_source(copy.m_source) +,m_target(copy.m_target) +,m_lhs(lhs) +{ + CreateSymbols(global, isValid, sentence); +} + +Rule::~Rule() +{ +} + +// helper for sort +struct CompareLatticeNodeTarget +{ + bool operator() (const RuleElement *a, const RuleElement *b) + { + const Range &rangeA = a->GetLatticeNode().GetTunnel().GetRange(1) + ,&rangeB = b->GetLatticeNode().GetTunnel().GetRange(1); + return rangeA.GetEndPos() < rangeB.GetEndPos(); + } +}; + +void Rule::CreateSymbols(const Global &global, bool &isValid, const SentenceAlignment &sentence) +{ + vector nonTerms; + + // source + for (size_t ind = 0; ind < m_coll.size(); ++ind) + { + RuleElement &element = m_coll[ind]; + const LatticeNode &node = element.GetLatticeNode(); + if (node.IsTerminal()) + { + size_t sourcePos = node.GetSourceRange().GetStartPos(); + const string &word = sentence.source[sourcePos]; + Symbol symbol(word, sourcePos); + m_source.Add(symbol); + } + else + { // non-term + const string &sourceWord = node.GetSyntaxNode(0).GetLabel(); + const string &targetWord = node.GetSyntaxNode(1).GetLabel(); + Symbol symbol(sourceWord, targetWord + , 
node.GetTunnel().GetRange(0).GetStartPos(), node.GetTunnel().GetRange(0).GetEndPos() + , node.GetTunnel().GetRange(1).GetStartPos(), node.GetTunnel().GetRange(1).GetEndPos() + , node.GetSyntaxNode(0).IsSyntax(), node.GetSyntaxNode(1).IsSyntax()); + m_source.Add(symbol); + + // store current pos within phrase + element.m_alignmentPos.first = ind; + + // for target symbols + nonTerms.push_back(&element); + } + + } + + // target + isValid = true; + + const Range &lhsTargetRange = m_lhs->GetTunnel().GetRange(1); + + // check spans of target non-terms + if (nonTerms.size()) + { + // sort non-term rules elements by target range + std::sort(nonTerms.begin(), nonTerms.end(), CompareLatticeNodeTarget()); + + const Range &first = nonTerms.front()->GetLatticeNode().GetTunnel().GetRange(1); + const Range &last = nonTerms.back()->GetLatticeNode().GetTunnel().GetRange(1); + + if (first.GetStartPos() < lhsTargetRange.GetStartPos() + || last.GetEndPos() > lhsTargetRange.GetEndPos()) + { + isValid = false; + } + } + + if (isValid) + { + size_t indNonTerm = 0; + RuleElement *currNonTermElement = indNonTerm < nonTerms.size() ? nonTerms[indNonTerm] : NULL; + for (size_t targetPos = lhsTargetRange.GetStartPos(); targetPos <= lhsTargetRange.GetEndPos(); ++targetPos) + { + if (currNonTermElement && targetPos == currNonTermElement->GetLatticeNode().GetTunnel().GetRange(1).GetStartPos()) + { // start of a non-term. 
print out non-terms & skip to the end + + const LatticeNode &node = currNonTermElement->GetLatticeNode(); + + const string &sourceWord = node.GetSyntaxNode(0).GetLabel(); + const string &targetWord = node.GetSyntaxNode(1).GetLabel(); + Symbol symbol(sourceWord, targetWord + , node.GetTunnel().GetRange(0).GetStartPos(), node.GetTunnel().GetRange(0).GetEndPos() + , node.GetTunnel().GetRange(1).GetStartPos(), node.GetTunnel().GetRange(1).GetEndPos() + , node.GetSyntaxNode(0).IsSyntax(), node.GetSyntaxNode(1).IsSyntax()); + m_target.Add(symbol); + + // store current pos within phrase + currNonTermElement->m_alignmentPos.second = m_target.GetSize() - 1; + + assert(currNonTermElement->m_alignmentPos.first != NOT_FOUND); + + targetPos = node.GetTunnel().GetRange(1).GetEndPos(); + indNonTerm++; + currNonTermElement = indNonTerm < nonTerms.size() ? nonTerms[indNonTerm] : NULL; + } + else + { // term + const string &word = sentence.target[targetPos]; + + Symbol symbol(word, targetPos); + m_target.Add(symbol); + + } + } + + assert(indNonTerm == nonTerms.size()); + + if (m_target.GetSize() > global.maxSymbolsSource) { + isValid = false; + cerr << "m_source=" << m_source.GetSize() << ":" << m_source << endl; + cerr << "m_target=" << m_target.GetSize() << ":" << m_target << endl; + } + } +} + +bool Rule::MoreDefaultNonTermThanTerm() const +{ + size_t numTerm = 0, numDefaultNonTerm = 0; + + CollType::const_iterator iter; + for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) + { + const RuleElement &element = *iter; + const LatticeNode &node = element.GetLatticeNode(); + if (node.IsTerminal()) + { + ++numTerm; + } + else if (!node.IsSyntax()) + { + ++numDefaultNonTerm; + } + } + + bool ret = numDefaultNonTerm > numTerm; + return ret; +} + +bool Rule::SourceHasEdgeDefaultNonTerm() const +{ + assert(m_coll.size()); + const LatticeNode &first = m_coll.front().GetLatticeNode(); + const LatticeNode &last = m_coll.back().GetLatticeNode(); + + // 1st + if (!first.IsTerminal() && 
!first.IsSyntax()) + { + return true; + } + if (!last.IsTerminal() && !last.IsSyntax()) + { + return true; + } + + return false; +} + +bool Rule::IsValid(const Global &global, const TunnelCollection &tunnelColl) const +{ + if (m_coll.size() == 1 && !m_coll[0].GetLatticeNode().IsTerminal()) // can't be only 1 non-terminal + { + return false; + } + + if (MoreDefaultNonTermThanTerm()) + { // must have at least as many terms as non-syntax non-terms + return false; + } + + if (!global.allowDefaultNonTermEdge && SourceHasEdgeDefaultNonTerm()) + { + return false; + } + + if (GetNumSymbols() > global.maxSymbolsSource) + { + return false; + } + + if (AdjacentDefaultNonTerms()) + { + return false; + } + + if (!IsHole(tunnelColl)) + { + return false; + } + + if (NonTermOverlap()) + { + return false; + } + + /* + std::pair spanS = GetSpan(0) + ,spanT= GetSpan(1); + + if (tunnelColl.NumUnalignedWord(0, spanS.first, spanS.second) >= global.maxUnaligned) + return false; + if (tunnelColl.NumUnalignedWord(1, spanT.first, spanT.second) >= global.maxUnaligned) + return false; + */ + + return true; +} + +bool Rule::NonTermOverlap() const +{ + vector ranges; + + CollType::const_iterator iter; + for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) + { + const RuleElement &element = *iter; + if (!element.GetLatticeNode().IsTerminal()) + { + const Range &range = element.GetLatticeNode().GetTunnel().GetRange(1); + ranges.push_back(range); + } + } + + vector::const_iterator outerIter; + for (outerIter = ranges.begin(); outerIter != ranges.end(); ++outerIter) + { + const Range &outer = *outerIter; + vector::const_iterator innerIter; + for (innerIter = outerIter + 1; innerIter != ranges.end(); ++innerIter) + { + const Range &inner = *innerIter; + if (outer.Overlap(inner)) + return true; + } + } + + return false; +} + +Range Rule::GetSourceRange() const +{ + assert(m_coll.size()); + const Range &first = m_coll.front().GetLatticeNode().GetSourceRange(); + const Range &last = 
m_coll.back().GetLatticeNode().GetSourceRange(); + + Range ret(first.GetStartPos(), last.GetEndPos()); + return ret; +} + + +bool Rule::IsHole(const TunnelCollection &tunnelColl) const +{ + const Range &spanS = GetSourceRange(); + const TunnelList &tunnels = tunnelColl.GetTunnels(spanS.GetStartPos(), spanS.GetEndPos()); + + bool ret = tunnels.size() > 0; + return ret; +} + + +bool Rule::CanRecurse(const Global &global, const TunnelCollection &tunnelColl) const +{ + if (GetNumSymbols() >= global.maxSymbolsSource) + return false; + if (AdjacentDefaultNonTerms()) + return false; + if (MaxNonTerm(global)) + return false; + if (NonTermOverlap()) + { + return false; + } + + const Range spanS = GetSourceRange(); + + if (tunnelColl.NumUnalignedWord(0, spanS.GetStartPos(), spanS.GetEndPos()) >= global.maxUnaligned) + return false; +// if (tunnelColl.NumUnalignedWord(1, spanT.first, spanT.second) >= global.maxUnaligned) +// return false; + + + return true; +} + +bool Rule::MaxNonTerm(const Global &global) const +{ + //cerr << *this << endl; + size_t numNonTerm = 0, numNonTermDefault = 0; + + CollType::const_iterator iter; + for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) + { + const LatticeNode *node = &(*iter).GetLatticeNode(); + if (!node->IsTerminal() ) + { + numNonTerm++; + if (!node->IsSyntax()) + { + numNonTermDefault++; + } + if (numNonTerm >= global.maxNonTerm || numNonTermDefault >= global.maxNonTermDefault) + return true; + } + } + + return false; +} + + +bool Rule::AdjacentDefaultNonTerms() const +{ + assert(m_coll.size() > 0); + + const LatticeNode *prevNode = &m_coll.front().GetLatticeNode(); + CollType::const_iterator iter; + for (iter = m_coll.begin() + 1; iter != m_coll.end(); ++iter) + { + const LatticeNode *node = &(*iter).GetLatticeNode(); + if (!prevNode->IsTerminal() && !node->IsTerminal() && !prevNode->IsSyntax() && !node->IsSyntax() ) + { + return true; + } + prevNode = node; + } + + return false; +} + + + +size_t Rule::GetNumSymbols() const 
+{ + size_t ret = m_coll.size(); + return ret; +} + +void Rule::CreateRules(RuleCollection &rules + , const Lattice &lattice + , const SentenceAlignment &sentence + , const Global &global) +{ + assert(m_coll.size() > 0); + const LatticeNode *latticeNode = &m_coll.back().GetLatticeNode(); + size_t endPos = latticeNode->GetSourceRange().GetEndPos() + 1; + + const Stack &stack = lattice.GetStack(endPos); + + Stack::const_iterator iter; + for (iter = stack.begin(); iter != stack.end(); ++iter) + { + const LatticeNode *newLatticeNode = *iter; + Rule *newRule = new Rule(*this, newLatticeNode); + //cerr << *newRule << endl; + + if (newRule->CanRecurse(global, sentence.GetTunnelCollection())) + { // may or maynot be valid, but can continue to build on this rule + newRule->CreateRules(rules, lattice, sentence, global); + } + + if (newRule->IsValid(global, sentence.GetTunnelCollection())) + { // add to rule collection + rules.Add(global, newRule, sentence); + } + else + { + delete newRule; + } + + } +} + +bool Rule::operator<(const Rule &compare) const +{ + /* + if (g_debug) + { + cerr << *this << endl << compare; + cerr << endl; + } + */ + + bool ret = Compare(compare) < 0; + + /* + if (g_debug) + { + cerr << *this << endl << compare << endl << ret << endl << endl; + } + */ + + return ret; +} + +int Rule::Compare(const Rule &compare) const +{ + //cerr << *this << endl << compare << endl; + assert(m_coll.size() > 0); + assert(m_source.GetSize() > 0); + assert(m_target.GetSize() > 0); + + int ret = 0; + + // compare each fragment + ret = m_source.Compare(compare.m_source); + if (ret != 0) + { + return ret; + } + + ret = m_target.Compare(compare.m_target); + if (ret != 0) + { + return ret; + } + + // compare lhs + const string &thisSourceLabel = m_lhs->GetSyntaxNode(0).GetLabel(); + const string &otherSourceLabel = compare.m_lhs->GetSyntaxNode(0).GetLabel(); + if (thisSourceLabel != otherSourceLabel) + { + ret = (thisSourceLabel < otherSourceLabel) ? 
-1 : +1; + return ret; + } + + const string &thisTargetLabel = m_lhs->GetSyntaxNode(1).GetLabel(); + const string &otherTargetLabel = compare.m_lhs->GetSyntaxNode(1).GetLabel(); + if (thisTargetLabel != otherTargetLabel) + { + ret = (thisTargetLabel < otherTargetLabel) ? -1 : +1; + return ret; + } + + assert(ret == 0); + return ret; +} + + +const LatticeNode &Rule::GetLatticeNode(size_t ind) const +{ + assert(ind < m_coll.size()); + return m_coll[ind].GetLatticeNode(); +} + +void Rule::DebugOutput() const +{ + std::stringstream strme; + strme << *this; + cerr << strme.str(); + +} + +std::ostream& operator<<(std::ostream &out, const Rule &obj) +{ + + stringstream strmeS, strmeT; + + std::vector::const_iterator iterSymbol; + for (iterSymbol = obj.m_source.begin(); iterSymbol != obj.m_source.end(); ++iterSymbol) + { + const Symbol &symbol = *iterSymbol; + strmeS << symbol << " "; + } + + for (iterSymbol = obj.m_target.begin(); iterSymbol != obj.m_target.end(); ++iterSymbol) + { + const Symbol &symbol = *iterSymbol; + strmeT << symbol << " "; + } + + // lhs + if (obj.m_lhs) + { + strmeS << obj.m_lhs->GetSyntaxNode(0).GetLabel(); + strmeT << obj.m_lhs->GetSyntaxNode(1).GetLabel(); + } + + out << strmeS.str() << " ||| " << strmeT.str() << " ||| "; + + // alignment + Rule::CollType::const_iterator iter; + for (iter = obj.m_coll.begin(); iter != obj.m_coll.end(); ++iter) + { + const RuleElement &element = *iter; + const LatticeNode &node = element.GetLatticeNode(); + bool isTerminal = node.IsTerminal(); + + if (!isTerminal) + { + out << element.m_alignmentPos.first << "-" << element.m_alignmentPos.second << " "; + } + } + + out << "||| 1"; + + return out; +} + diff --git a/contrib/other-builds/extract-mixed-syntax/Rule.h b/contrib/other-builds/extract-mixed-syntax/Rule.h new file mode 100644 index 000000000..bc74fa3f5 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/Rule.h @@ -0,0 +1,94 @@ +#pragma once +/* + * Rule.h + * extract + * + * Created by Hieu Hoang 
on 19/07/2010. + * Copyright 2010 __MyCompanyName__. All rights reserved. + * + */ +#include +#include +#include "LatticeNode.h" +#include "SymbolSequence.h" +#include "Global.h" + +class Lattice; +class SentenceAlignment; +class Global; +class RuleCollection; +class SyntaxNode; +class TunnelCollection; +class Range; + +class RuleElement +{ +protected: + const LatticeNode *m_latticeNode; +public: + std::pair m_alignmentPos; + + RuleElement(const RuleElement ©); + RuleElement(const LatticeNode &latticeNode) + :m_latticeNode(&latticeNode) + ,m_alignmentPos(NOT_FOUND, NOT_FOUND) + {} + + const LatticeNode &GetLatticeNode() const + { return *m_latticeNode; } + +}; + +class Rule +{ + friend std::ostream& operator<<(std::ostream &out, const Rule &obj); + +protected: + typedef std::vector CollType; + CollType m_coll; + + const LatticeNode *m_lhs; + SymbolSequence m_source, m_target; + + bool IsHole(const TunnelCollection &tunnelColl) const; + bool NonTermOverlap() const; + + const LatticeNode &GetLatticeNode(size_t ind) const; + void CreateSymbols(const Global &global, bool &isValid, const SentenceAlignment &sentence); + +public: + // init + Rule(const LatticeNode *latticeNode); + + // create new rule by appending node to prev rule + Rule(const Rule &prevRule, const LatticeNode *latticeNode); + + // create copy with lhs + Rule(const Global &global, bool &isValid, const Rule ©, const LatticeNode *lhs, const SentenceAlignment &sentence); + + // can continue to add to this rule + bool CanRecurse(const Global &global, const TunnelCollection &tunnelColl) const; + + virtual ~Rule(); + + // can add this to the set of rules + bool IsValid(const Global &global, const TunnelCollection &tunnelColl) const; + + size_t GetNumSymbols() const; + bool AdjacentDefaultNonTerms() const; + bool MaxNonTerm(const Global &global) const; + bool MoreDefaultNonTermThanTerm() const; + bool SourceHasEdgeDefaultNonTerm() const; + + void CreateRules(RuleCollection &rules + , const Lattice &lattice + , 
const SentenceAlignment &sentence + , const Global &global); + + int Compare(const Rule &compare) const; + bool operator<(const Rule &compare) const; + + Range GetSourceRange() const; + + DEBUG_OUTPUT(); +}; diff --git a/contrib/other-builds/extract-mixed-syntax/RuleCollection.cpp b/contrib/other-builds/extract-mixed-syntax/RuleCollection.cpp new file mode 100644 index 000000000..28b7adb1b --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/RuleCollection.cpp @@ -0,0 +1,94 @@ +/* + * RuleCollection.cpp + * extract + * + * Created by Hieu Hoang on 19/07/2010. + * Copyright 2010 __MyCompanyName__. All rights reserved. + * + */ +#include "RuleCollection.h" +#include "Rule.h" +#include "SentenceAlignment.h" +#include "tables-core.h" +#include "Lattice.h" +#include "SyntaxTree.h" + +using namespace std; + +RuleCollection::~RuleCollection() +{ + RemoveAllInColl(m_coll); +} + +void RuleCollection::Add(const Global &global, Rule *rule, const SentenceAlignment &sentence) +{ + Range spanS = rule->GetSourceRange(); + + // cartesian product of lhs + Stack nontermNodes = sentence.GetLattice().GetNonTermNode(spanS); + Stack::const_iterator iterStack; + for (iterStack = nontermNodes.begin(); iterStack != nontermNodes.end(); ++iterStack) + { + const LatticeNode &node = **iterStack; + assert(!node.IsTerminal()); + + bool isValid; + // create rules with LHS + //cerr << "old:" << *rule << endl; + Rule *newRule = new Rule(global, isValid, *rule, &node, sentence); + + if (!isValid) + { // lhs doesn't match non-term spans + delete newRule; + continue; + } + + /* + stringstream s; + s << *newRule; + if (s.str().find("Wiederaufnahme der [X] ||| resumption of the [X] ||| ||| 1") == 0) + { + cerr << "READY:" << *newRule << endl; + g_debug = true; + } + else { + g_debug = false; + } + */ + + typedef set::iterator Iterator; + pair ret = m_coll.insert(newRule); + + if (ret.second) + { + //cerr << "ACCEPTED:" << *newRule << endl; + //cerr << ""; + } + else + { + //cerr << "REJECTED:" 
<< *newRule << endl; + delete newRule; + } + + } + + delete rule; + +} + + +std::ostream& operator<<(std::ostream &out, const RuleCollection &obj) +{ + RuleCollection::CollType::const_iterator iter; + for (iter = obj.m_coll.begin(); iter != obj.m_coll.end(); ++iter) + { + const Rule &rule = **iter; + out << rule << endl; + } + + return out; +} + + + + diff --git a/contrib/other-builds/extract-mixed-syntax/RuleCollection.h b/contrib/other-builds/extract-mixed-syntax/RuleCollection.h new file mode 100644 index 000000000..75d55b864 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/RuleCollection.h @@ -0,0 +1,53 @@ +#pragma once +/* + * RuleCollection.h + * extract + * + * Created by Hieu Hoang on 19/07/2010. + * Copyright 2010 __MyCompanyName__. All rights reserved. + * + */ +#include +#include +#include "Rule.h" + +class SentenceAlignment; + +// helper for sort. Don't compare default non-terminals +struct CompareRule +{ + bool operator() (const Rule *a, const Rule *b) + { + /* + if (g_debug) + { + std::cerr << std::endl << (*a) << std::endl << (*b) << " "; + } + */ + bool ret = (*a) < (*b); + /* + if (g_debug) + { + std::cerr << ret << std::endl; + } + */ + return ret; + } +}; + + +class RuleCollection +{ + friend std::ostream& operator<<(std::ostream &out, const RuleCollection &obj); + +protected: + typedef std::set CollType; + CollType m_coll; + +public: + ~RuleCollection(); + void Add(const Global &global, Rule *rule, const SentenceAlignment &sentence); + size_t GetSize() const + { return m_coll.size(); } +}; + diff --git a/contrib/other-builds/extract-mixed-syntax/SentenceAlignment.cpp b/contrib/other-builds/extract-mixed-syntax/SentenceAlignment.cpp new file mode 100644 index 000000000..b13743bc1 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/SentenceAlignment.cpp @@ -0,0 +1,331 @@ +/* + * SentenceAlignment.cpp + * extract + * + * Created by Hieu Hoang on 19/01/2010. + * Copyright 2010 __MyCompanyName__. All rights reserved. 
+ * + */ +#include +#include +#include +#include "SentenceAlignment.h" +#include "XmlTree.h" +#include "tables-core.h" +#include "TunnelCollection.h" +#include "Lattice.h" +#include "LatticeNode.h" + +using namespace std; + +extern std::set< std::string > targetLabelCollection, sourceLabelCollection; +extern std::map< std::string, int > targetTopLabelCollection, sourceTopLabelCollection; + +SentenceAlignment::SentenceAlignment() +:m_tunnelCollection(NULL) +,m_lattice(NULL) +{} + +SentenceAlignment::~SentenceAlignment() +{ + delete m_tunnelCollection; + delete m_lattice; +} + +int SentenceAlignment::Create( const std::string &targetString, const std::string &sourceString, const std::string &alignmentString, int sentenceID, const Global &global ) +{ + + // tokenizing English (and potentially extract syntax spans) + if (global.targetSyntax) { + string targetStringCPP = string(targetString); + ProcessAndStripXMLTags( targetStringCPP, targetTree, targetLabelCollection , targetTopLabelCollection ); + target = tokenize( targetStringCPP.c_str() ); + // cerr << "E: " << targetStringCPP << endl; + } + else { + target = tokenize( targetString.c_str() ); + } + + // tokenizing source (and potentially extract syntax spans) + if (global.sourceSyntax) { + string sourceStringCPP = string(sourceString); + ProcessAndStripXMLTags( sourceStringCPP, sourceTree, sourceLabelCollection , sourceTopLabelCollection ); + source = tokenize( sourceStringCPP.c_str() ); + // cerr << "F: " << sourceStringCPP << endl; + } + else { + source = tokenize( sourceString.c_str() ); + } + + // check if sentences are empty + if (target.size() == 0 || source.size() == 0) { + cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words << end insentence " << sentenceID << endl; + cerr << "T: " << targetString << endl << "S: " << sourceString << endl; + return 0; + } + + // prepare data structures for alignments + for(int i=0; i dummy; + alignedToT.push_back( dummy ); + } + + 
//InitTightest(m_s2tTightest, source.size()); + //InitTightest(m_t2sTightest, target.size()); + + + // reading in alignments + vector alignmentSequence = tokenize( alignmentString.c_str() ); + for(int i=0; i= target.size() || s >= source.size()) { + cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n"; + cerr << "T: " << targetString << endl << "S: " << sourceString << endl; + return 0; + } + alignedToT[t].push_back( s ); + alignedCountS[s]++; + + //SetAlignment(s, t); + } + + bool mixed = global.mixed; + sourceTree.AddDefaultNonTerms(global.sourceSyntax, mixed, source.size()); + targetTree.AddDefaultNonTerms(global.targetSyntax, mixed, target.size()); + + //CalcTightestSpan(m_s2tTightest); + //CalcTightestSpan(m_t2sTightest); + + return 1; +} + +/* +void SentenceAlignment::InitTightest(Outer &tightest, size_t len) +{ + tightest.resize(len); + + for (size_t posOuter = 0; posOuter < len; ++posOuter) + { + Inner &inner = tightest[posOuter]; + size_t innerSize = len - posOuter; + inner.resize(innerSize); + + } +} + +void SentenceAlignment::CalcTightestSpan(Outer &tightest) +{ + size_t len = tightest.size(); + + for (size_t startPos = 0; startPos < len; ++startPos) + { + for (size_t endPos = startPos + 1; endPos < len; ++endPos) + { + const Range &prevRange = GetTightest(tightest, startPos, endPos - 1); + const Range &smallRange = GetTightest(tightest, endPos, endPos); + Range &newRange = GetTightest(tightest, startPos, endPos); + + newRange.Merge(prevRange, smallRange); + //cerr << "[" << startPos << "-" << endPos << "] --> [" << newRange.GetStartPos() << "-" << newRange.GetEndPos() << "]"; + } + } +} + +Range &SentenceAlignment::GetTightest(Outer &tightest, size_t startPos, size_t endPos) +{ + assert(endPos < tightest.size()); + assert(endPos >= startPos); + + Inner &inner = tightest[startPos]; + + size_t ind = endPos - startPos; + Range &ret = inner[ind]; + 
return ret; +} + +void SentenceAlignment::SetAlignment(size_t source, size_t target) +{ + SetAlignment(m_s2tTightest, source, target); + SetAlignment(m_t2sTightest, target, source); +} + +void SentenceAlignment::SetAlignment(Outer &tightest, size_t thisPos, size_t thatPos) +{ + + Range &range = GetTightest(tightest, thisPos, thisPos); + if (range.GetStartPos() == NOT_FOUND) + { // not yet set, do them both + assert(range.GetEndPos() == NOT_FOUND); + range.SetStartPos(thatPos); + range.SetEndPos(thatPos); + } + else + { + assert(range.GetEndPos() != NOT_FOUND); + range.SetStartPos( (range.GetStartPos() > thatPos) ? thatPos : range.GetStartPos() ); + range.SetEndPos( (range.GetEndPos() < thatPos) ? thatPos : range.GetEndPos() ); + } +} + */ + + +void SentenceAlignment::FindTunnels(const Global &global ) +{ + int countT = target.size(); + int countS = source.size(); + int maxSpan = max(global.maxHoleSpanSourceDefault, global.maxHoleSpanSourceSyntax); + + m_tunnelCollection = new TunnelCollection(countS); + + m_tunnelCollection->alignedCountS = alignedCountS; + m_tunnelCollection->alignedCountT.resize(alignedToT.size()); + for (size_t ind = 0; ind < alignedToT.size(); ind++) + { + m_tunnelCollection->alignedCountT[ind] = alignedToT[ind].size(); + } + + // phrase repository for creating hiero phrases + + // check alignments for target phrase startT...endT + for(int lengthT=1; + lengthT <= maxSpan && lengthT <= countT; + lengthT++) { + for(int startT=0; startT < countT-(lengthT-1); startT++) { + + // that's nice to have + int endT = startT + lengthT - 1; + + // if there is target side syntax, there has to be a node + if (global.targetSyntax && !targetTree.HasNode(startT,endT)) + continue; + + // find find aligned source words + // first: find minimum and maximum source word + int minS = 9999; + int maxS = -1; + vector< int > usedS = alignedCountS; + for(int ti=startT;ti<=endT;ti++) { + for(int i=0;imaxS) { maxS = si; } + usedS[ si ]--; + } + } + + // unaligned phrases 
are not allowed + if( maxS == -1 ) + continue; + + // source phrase has to be within limits + if( maxS-minS >= maxSpan ) + { + continue; + } + + // check if source words are aligned to out of bound target words + bool out_of_bounds = false; + for(int si=minS;si<=maxS && !out_of_bounds;si++) + { + if (usedS[si]>0) { + out_of_bounds = true; + } + } + + // if out of bound, you gotta go + if (out_of_bounds) + continue; + + if (m_tunnelCollection->NumUnalignedWord(1, startT, endT) >= global.maxUnaligned) + continue; + + // done with all the checks, lets go over all consistent phrase pairs + // start point of source phrase may retreat over unaligned + for(int startS=minS; + (startS>=0 && + startS>maxS - maxSpan && // within length limit + (startS==minS || alignedCountS[startS]==0)); // unaligned + startS--) + { + // end point of source phrase may advance over unaligned + for(int endS=maxS; + (endSNumUnalignedWord(0, startS, endS) >= global.maxUnaligned) + continue; + + // take note that this is a valid phrase alignment + m_tunnelCollection->Add(startS, endS, startT, endT); + } + } + } + } + + //cerr << *tunnelCollection << endl; + +} + +void SentenceAlignment::CreateLattice(const Global &global) +{ + size_t countS = source.size(); + m_lattice = new Lattice(countS); + + for (size_t startPos = 0; startPos < countS; ++startPos) + { + //cerr << "creating arcs for " << startPos << "="; + m_lattice->CreateArcs(startPos, *m_tunnelCollection, *this, global); + + //cerr << LatticeNode::s_count << endl; + } +} + +void SentenceAlignment::CreateRules(const Global &global) +{ + size_t countS = source.size(); + + for (size_t startPos = 0; startPos < countS; ++startPos) + { + //cerr << "creating rules for " << startPos << "\n"; + m_lattice->CreateRules(startPos, *this, global); + } +} + +void OutputSentenceStr(std::ostream &out, const std::vector &vec) +{ + for (size_t pos = 0; pos < vec.size(); ++pos) + { + out << vec[pos] << " "; + } +} + +std::ostream& operator<<(std::ostream &out, 
const SentenceAlignment &obj) +{ + OutputSentenceStr(out, obj.target); + out << " ==> "; + OutputSentenceStr(out, obj.source); + out << endl; + + out << *obj.m_tunnelCollection; + + if (obj.m_lattice) + out << endl << *obj.m_lattice; + + return out; +} + + + + diff --git a/contrib/other-builds/extract-mixed-syntax/SentenceAlignment.h b/contrib/other-builds/extract-mixed-syntax/SentenceAlignment.h new file mode 100644 index 000000000..a94941309 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/SentenceAlignment.h @@ -0,0 +1,69 @@ +#pragma once +/* + * SentenceAlignment.h + * extract + * + * Created by Hieu Hoang on 19/01/2010. + * Copyright 2010 __MyCompanyName__. All rights reserved. + * + */ +#include +#include +#include +#include "SyntaxTree.h" +#include "Global.h" +#include "Range.h" + +class TunnelCollection; +class Lattice; + +class SentenceAlignment +{ + friend std::ostream& operator<<(std::ostream&, const SentenceAlignment&); + +public: + std::vector target; + std::vector source; + std::vector alignedCountS; + std::vector< std::vector > alignedToT; + SyntaxTree sourceTree, targetTree; + + //typedef std::vector Inner; + //typedef std::vector Outer; + + //Outer m_s2tTightest, m_t2sTightest; + + SentenceAlignment(); + ~SentenceAlignment(); + int Create(const std::string &targetString, const std::string &sourceString, const std::string &alignmentString, int sentenceID, const Global &global); + // void clear() { delete(alignment); }; + void FindTunnels( const Global &global ) ; + + void CreateLattice(const Global &global); + void CreateRules(const Global &global); + + const TunnelCollection &GetTunnelCollection() const + { + assert(m_tunnelCollection); + return *m_tunnelCollection; + } + + const Lattice &GetLattice() const + { + assert(m_lattice); + return *m_lattice; + } + +protected: + TunnelCollection *m_tunnelCollection; + Lattice *m_lattice; + + /* + void CalcTightestSpan(Outer &tightest); + void InitTightest(Outer &tightest, size_t len); + Range 
&GetTightest(Outer &tightest, size_t startPos, size_t endPos);
+  void SetAlignment(size_t source, size_t target);
+  void SetAlignment(Outer &tightest, size_t thisPos, size_t thatPos);
+  */
+};
+
diff --git a/contrib/other-builds/extract-mixed-syntax/Symbol.cpp b/contrib/other-builds/extract-mixed-syntax/Symbol.cpp
new file mode 100644
index 000000000..0181dcaeb
--- /dev/null
+++ b/contrib/other-builds/extract-mixed-syntax/Symbol.cpp
@@ -0,0 +1,113 @@
+/*
+ * Symbol.cpp
+ * extract
+ *
+ * Created by Hieu Hoang on 21/07/2010.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+#include
+#include "Symbol.h"
+
+using namespace std;
+
+// Terminal symbol: records only the source position. The syntax flags are not
+// meaningful for a terminal but are zeroed so copies never read indeterminate bools.
+Symbol::Symbol(const std::string &label, size_t pos)
+:m_label(label)
+,m_isTerminal(true)
+,m_span(2)
+,m_isSourceSyntax(false)
+,m_isTargetSyntax(false)
+{
+  m_span[0].first = pos;
+}
+
+// Non-terminal symbol covering [startS, endS] on the source and [startT, endT] on the target.
+Symbol::Symbol(const std::string &labelS, const std::string &labelT
+      , size_t startS, size_t endS
+      , size_t startT, size_t endT
+      , bool isSourceSyntax, bool isTargetSyntax)
+:m_label(labelS)
+,m_labelT(labelT)
+,m_isTerminal(false)
+,m_span(2)
+,m_isSourceSyntax(isSourceSyntax)
+,m_isTargetSyntax(isTargetSyntax)
+{
+  m_span[0] = std::pair(startS, endS);
+  m_span[1] = std::pair(startT, endT);
+}
+
+// Compares one side (source or target) of two non-terminals; returns -1, 0 or +1.
+// A syntax non-terminal orders before a non-syntax one. Two syntax non-terminals
+// are ordered by span, then by label. Two non-syntax non-terminals compare equal.
+int CompareNonTerm(bool thisIsSyntax, bool otherIsSyntax
+      , const std::pair &thisSpan, const std::pair &otherSpan
+      , std::string thisLabel, std::string otherLabel)
+{
+  if (thisIsSyntax != otherIsSyntax)
+  { // 1 is [X] & the other is [NP] on the source
+    return thisIsSyntax ? -1 : +1;
+  }
+
+  assert(thisIsSyntax == otherIsSyntax);
+  if (thisIsSyntax)
+  { // compare span & label
+    if (thisSpan != otherSpan)
+      return thisSpan < otherSpan ? -1 : +1;
+    if (thisLabel != otherLabel)
+      return thisLabel < otherLabel ? -1 : +1;
+  }
+
+  return 0;
+}
+
+// Returns -1, 0 or +1. Terminals order before non-terminals; two terminals are ordered
+// by source position, then label; two non-terminals by source side, then target side.
+int Symbol::Compare(const Symbol &other) const
+{
+  if (m_isTerminal != other.m_isTerminal)
+    return m_isTerminal ? -1 : +1;
+
+  assert(m_isTerminal == other.m_isTerminal);
+  if (m_isTerminal)
+  { // compare labels & pos
+    if (m_span[0].first != other.m_span[0].first)
+      return (m_span[0].first < other.m_span[0].first) ? -1 : +1;
+
+    if (m_label != other.m_label)
+      return (m_label < other.m_label) ? -1 : +1;
+
+  }
+  else
+  { // non terms
+    int ret = CompareNonTerm(m_isSourceSyntax, other.m_isSourceSyntax
+                  ,m_span[0], other.m_span[0]
+                  ,m_label, other.m_label);
+    if (ret != 0)
+      return ret;
+
+    // target side: compare on the target label, not the source label a second time
+    ret = CompareNonTerm(m_isTargetSyntax, other.m_isTargetSyntax
+                  ,m_span[1], other.m_span[1]
+                  ,m_labelT, other.m_labelT);
+    if (ret != 0)
+      return ret;
+  }
+
+  return 0;
+}
+
+
+// Prints a terminal's label, or a non-terminal's source label immediately followed by its target label.
+std::ostream& operator<<(std::ostream &out, const Symbol &obj)
+{
+  if (obj.m_isTerminal)
+    out << obj.m_label;
+  else
+    out << obj.m_label + obj.m_labelT;
+
+  return out;
+}
+
diff --git a/contrib/other-builds/extract-mixed-syntax/Symbol.h b/contrib/other-builds/extract-mixed-syntax/Symbol.h
new file mode 100644
index 000000000..b79a705b2
--- /dev/null
+++ b/contrib/other-builds/extract-mixed-syntax/Symbol.h
@@ -0,0 +1,36 @@
+#pragma once
+
+/*
+ * Symbol.h
+ * extract
+ *
+ * Created by Hieu Hoang on 21/07/2010.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+#include
+#include
+#include
+
+class Symbol
+{
+  friend std::ostream& operator<<(std::ostream &out, const Symbol &obj);
+
+protected:
+  std::string m_label, m_labelT; // m_labelT only for non-term
+  std::vector > m_span;
+
+  bool m_isTerminal, m_isSourceSyntax, m_isTargetSyntax;
+public:
+  // for terminals
+  Symbol(const std::string &label, size_t pos);
+
+  // for non-terminals
+  Symbol(const std::string &labelS, const std::string &labelT
+      , size_t startS, size_t endS
+      , size_t startT, size_t endT
+      , bool isSourceSyntax, bool isTargetSyntax);
+
+  int Compare(const Symbol &other) const;
+
+};
diff --git a/contrib/other-builds/extract-mixed-syntax/SymbolSequence.cpp b/contrib/other-builds/extract-mixed-syntax/SymbolSequence.cpp
new file mode 100644
index 000000000..0cf19f664
--- /dev/null
+++ b/contrib/other-builds/extract-mixed-syntax/SymbolSequence.cpp
@@ -0,0 +1,59 @@
+/*
+ * SymbolSequence.cpp
+ * extract
+ *
+ * Created by Hieu Hoang on 21/07/2010.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+#include
+#include
+#include "SymbolSequence.h"
+
+using namespace std;
+
+// Returns -1, 0 or +1. A shorter sequence orders before a longer one; sequences of
+// equal length are ordered by the first pair of symbols whose Compare() is non-zero.
+int SymbolSequence::Compare(const SymbolSequence &other) const
+{
+  int ret = 0; // two empty sequences skip the loop below, so this must start out as "equal"
+  size_t thisSize = GetSize();
+  size_t otherSize = other.GetSize();
+  if (thisSize != otherSize)
+  {
+    ret = (thisSize < otherSize) ? -1 : +1;
+    return ret;
+  }
+  else
+  {
+    assert(thisSize == otherSize);
+    for (size_t ind = 0; ind < thisSize; ++ind)
+    {
+      const Symbol &thisSymbol = GetSymbol(ind);
+      const Symbol &otherSymbol = other.GetSymbol(ind);
+      ret = thisSymbol.Compare(otherSymbol);
+      if (ret != 0)
+      {
+        return ret;
+      }
+    }
+  }
+
+  assert(ret == 0);
+  return ret;
+}
+
+// Writes every symbol of the sequence followed by a single space.
+std::ostream& operator<<(std::ostream &out, const SymbolSequence &obj)
+{
+  SymbolSequence::CollType::const_iterator iterSymbol;
+  for (iterSymbol = obj.m_coll.begin(); iterSymbol != obj.m_coll.end(); ++iterSymbol)
+  {
+    const Symbol &symbol = *iterSymbol;
+    out << symbol << " ";
+  }
+
+  return out;
+}
+
+
diff --git a/contrib/other-builds/extract-mixed-syntax/SymbolSequence.h b/contrib/other-builds/extract-mixed-syntax/SymbolSequence.h
new file mode 100644
index 000000000..997c24205
--- /dev/null
+++ b/contrib/other-builds/extract-mixed-syntax/SymbolSequence.h
@@ -0,0 +1,42 @@
+#pragma once
+/*
+ * SymbolSequence.h
+ * extract
+ *
+ * Created by Hieu Hoang on 21/07/2010.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+#include
+#include
+#include "Symbol.h"
+
+class SymbolSequence // ordered list of Symbol objects, comparable as a whole via Compare()
+{
+  friend std::ostream& operator<<(std::ostream &out, const SymbolSequence &obj);
+
+protected:
+  typedef std::vector CollType;
+  CollType m_coll; // the symbols, in the order they were added
+
+public:
+  typedef CollType::iterator iterator;
+  typedef CollType::const_iterator const_iterator;
+  const_iterator begin() const { return m_coll.begin(); } // read-only iteration over the symbols
+  const_iterator end() const { return m_coll.end(); }
+
+  void Add(const Symbol &symbol) // appends a copy of symbol to the end of the sequence
+  {
+    m_coll.push_back(symbol);
+  }
+  size_t GetSize() const // number of symbols currently held
+  { return m_coll.size(); }
+  const Symbol &GetSymbol(size_t ind) const // unchecked access: ind must be < GetSize()
+  { return m_coll[ind]; }
+
+  void Clear() // removes every symbol
+  { m_coll.clear(); }
+
+  int Compare(const SymbolSequence &other) const; // negative, zero or positive; defined in SymbolSequence.cpp
+
+};
diff --git a/contrib/other-builds/extract-mixed-syntax/SyntaxTree.cpp b/contrib/other-builds/extract-mixed-syntax/SyntaxTree.cpp
new file mode 100644
index 000000000..a6ba3de7b
--- /dev/null
+++ b/contrib/other-builds/extract-mixed-syntax/SyntaxTree.cpp
@@ -0,0 +1,245 @@
+// $Id: SyntaxTree.cpp 1960 2008-12-15 12:52:38Z phkoehn $
+// vim:tabstop=2
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + + +#include +#include +#include "SyntaxTree.h" +//#include "extract.h" +#include "Global.h" + +//extern const Global g_debug; +extern const Global *g_global; + +using namespace std; + +bool SyntaxNode::IsSyntax() const +{ + bool ret = GetLabel() != "[X]"; + return ret; +} + +SyntaxTree::SyntaxTree() +:m_defaultLHS(0,0, "[X]") +{ + m_emptyNode.clear(); +} + +SyntaxTree::~SyntaxTree() +{ + // loop through all m_nodes, delete them + for(int i=0; iuppermostOnly) + { + nodesChart.push_back( newNode ); + //assert(!HasDuplicates(m_index[ startPos ][ endPos ])); + } + else + { + if (nodesChart.size() > 0) + { + assert(nodesChart.size() == 1); + //delete nodes[0]; + nodesChart.resize(0); + } + assert(nodesChart.size() == 0); + nodesChart.push_back( newNode ); + } +} + +ParentNodes SyntaxTree::Parse() { + ParentNodes parents; + + int size = m_index.size(); + + // looping through all spans of size >= 2 + for( int length=2; length<=size; length++ ) + { + for( int startPos = 0; startPos <= size-length; startPos++ ) + { + if (HasNode( startPos, startPos+length-1 )) + { + // processing one (parent) span + + //std::cerr << "# " << startPos << "-" << (startPos+length-1) << ":"; + SplitPoints splitPoints; + splitPoints.push_back( startPos ); + //std::cerr << " " << startPos; + + int first = 1; + int covered = 0; + while( covered < length ) + { + // find largest covering subspan (child) + // starting at last covered position + for( int midPos=length-first; midPos>covered; midPos-- ) + { + if( HasNode( startPos+covered, startPos+midPos-1 ) ) + { + covered = midPos; + splitPoints.push_back( startPos+covered ); + // std::cerr << " " << ( startPos+covered ); + first = 0; + } + } + } + // std::cerr 
<< std::endl; + parents.push_back( splitPoints ); + } + } + } + return parents; +} + +bool SyntaxTree::HasNode( int startPos, int endPos ) const +{ + return GetNodes( startPos, endPos).size() > 0; +} + +const SyntaxNodes &SyntaxTree::GetNodes( int startPos, int endPos ) const +{ + SyntaxTreeIndexIterator startIndex = m_index.find( startPos ); + if (startIndex == m_index.end() ) + return m_emptyNode; + + SyntaxTreeIndexIterator2 endIndex = startIndex->second.find( endPos ); + if (endIndex == startIndex->second.end()) + return m_emptyNode; + + return endIndex->second; +} + +// for printing out tree +std::string SyntaxTree::ToString() const +{ + std::stringstream out; + out << *this; + return out.str(); +} + +void SyntaxTree::AddDefaultNonTerms(size_t phraseSize) +{ + for (size_t startPos = 0; startPos <= phraseSize; ++startPos) + { + for (size_t endPos = startPos; endPos < phraseSize; ++endPos) + { + AddNode(startPos, endPos, "X"); + } + } +} + +void SyntaxTree::AddDefaultNonTerms(bool isSyntax, bool mixed, size_t phraseSize) +{ + if (isSyntax) + { + AddDefaultNonTerms(!mixed, phraseSize); + } + else + { // add X everywhere + AddDefaultNonTerms(phraseSize); + } +} + +void SyntaxTree::AddDefaultNonTerms(bool addEverywhere, size_t phraseSize) +{ + //cerr << "GetNumWords()=" << GetNumWords() << endl; + //assert(phraseSize == GetNumWords() || GetNumWords() == 1); // 1 if syntax sentence doesn't have any xml. 
TODO fix syntax tree obj + + for (size_t startPos = 0; startPos <= phraseSize; ++startPos) + { + for (size_t endPos = startPos; endPos <= phraseSize; ++endPos) + { + const SyntaxNodes &nodes = GetNodes(startPos, endPos); + if (!addEverywhere && nodes.size() > 0) + { // only add if no label + continue; + } + AddNode(startPos, endPos, "X"); + } + } +} + +const SyntaxNodes SyntaxTree::GetNodesForLHS( int startPos, int endPos ) const +{ + SyntaxNodes ret(GetNodes(startPos, endPos)); + + if (ret.size() == 0) + ret.push_back(&m_defaultLHS); + + return ret; +} + +std::ostream& operator<<(std::ostream& os, const SyntaxTree& t) +{ + int size = t.m_index.size(); + for(size_t length=1; length<=size; length++) + { + for(size_t space=0; spaceGetLabel() + "#######"; + + os << label.substr(0,7) << " "; + } + else + { + os << "------- "; + } + } + os << std::endl; + } + return os; +} + + diff --git a/contrib/other-builds/extract-mixed-syntax/SyntaxTree.h b/contrib/other-builds/extract-mixed-syntax/SyntaxTree.h new file mode 100644 index 000000000..50a73a369 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/SyntaxTree.h @@ -0,0 +1,96 @@ +#pragma once + +// $Id: SyntaxTree.h 1960 2008-12-15 12:52:38Z phkoehn $ +// vim:tabstop=2 + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2009 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include
+#include
+#include
+#include
+
+class SyntaxNode;
+
+typedef std::vector SyntaxNodes;
+
+class SyntaxNode { // one labelled span [m_start, m_end] of a sentence
+protected:
+  int m_start, m_end;
+  std::string m_label;
+  SyntaxNodes m_children;
+  SyntaxNode* m_parent;
+public:
+SyntaxNode( int startPos, int endPos, const std::string &label)
+  :m_start(startPos)
+  ,m_end(endPos)
+  ,m_label(label)
+  ,m_parent(0) {} // no parent is supplied here: start from null instead of an indeterminate pointer
+  int GetStart() const
+  { return m_start; }
+  int GetEnd() const
+  { return m_end; }
+  const std::string &GetLabel() const
+  { return m_label; }
+  bool IsSyntax() const; // true unless the label is the default "[X]" (see SyntaxTree.cpp)
+};
+
+
+typedef std::vector< int > SplitPoints;
+typedef std::vector< SplitPoints > ParentNodes;
+
+class SyntaxTree { // nodes of one sentence, indexed by start position and then end position
+protected:
+  SyntaxNodes m_nodes;
+  SyntaxNode* m_top; // NOTE(review): not initialised by SyntaxTree::SyntaxTree() in SyntaxTree.cpp
+  SyntaxNode m_defaultLHS;
+
+  typedef std::map< int, SyntaxNodes > SyntaxTreeIndex2;
+  typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2;
+  typedef std::map< int, SyntaxTreeIndex2 > SyntaxTreeIndex;
+  typedef SyntaxTreeIndex::const_iterator SyntaxTreeIndexIterator;
+  SyntaxTreeIndex m_index; // start position -> end position -> nodes covering that span
+  SyntaxNodes m_emptyNode; // returned by GetNodes() when a span has no node
+
+  friend std::ostream& operator<<(std::ostream&, const SyntaxTree&);
+
+public:
+  SyntaxTree();
+  ~SyntaxTree();
+
+  void AddNode( int startPos, int endPos, std::string label );
+  ParentNodes Parse();
+  bool HasNode( int startPos, int endPos ) const;
+  const SyntaxNodes &GetNodes( int startPos, int endPos ) const;
+  const SyntaxNodes &GetAllNodes() const { return m_nodes; } ;
+  size_t GetNumWords() const { return m_index.size(); }
+  std::string ToString() const;
+
+  void AddDefaultNonTerms(bool isSyntax, bool mixed, size_t phraseSize); // parameter names follow the definition in SyntaxTree.cpp
+  void AddDefaultNonTerms(bool addEverywhere, size_t phraseSize); // parameter names follow the definition in SyntaxTree.cpp
+
+  void AddDefaultNonTerms(size_t phraseSize);
+
+  const SyntaxNodes GetNodesForLHS( int startPos, int endPos ) const;
+
+};
+
+std::ostream& operator<<(std::ostream&, const SyntaxTree&);
+
diff --git a/contrib/other-builds/extract-mixed-syntax/Tunnel.cpp b/contrib/other-builds/extract-mixed-syntax/Tunnel.cpp
new file mode 100644
index 000000000..fc4846c34
--- /dev/null
+++ b/contrib/other-builds/extract-mixed-syntax/Tunnel.cpp
@@ -0,0 +1,41 @@
+/*
+ * Tunnel.cpp
+ * extract
+ *
+ * Created by Hieu Hoang on 19/01/2010.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+
+#include "Tunnel.h"
+
+
+// Orders tunnels by source range, falling back to the target range when Range::Compare reports 0.
+int Tunnel::Compare(const Tunnel &other) const
+{
+  int ret = m_sourceRange.Compare(other.m_sourceRange);
+
+  if (ret != 0)
+    return ret;
+
+  ret = m_targetRange.Compare(other.m_targetRange);
+
+  return ret;
+}
+
+// Compares one side only: the source ranges when direction is 0, otherwise the target ranges.
+int Tunnel::Compare(const Tunnel &other, size_t direction) const
+{
+  const Range &thisRange = (direction == 0) ? m_sourceRange : m_targetRange;
+  const Range &otherRange = (direction == 0) ? other.m_sourceRange : other.m_targetRange;
+
+  int ret = thisRange.Compare(otherRange);
+  return ret;
+}
+
+// Prints the tunnel as <source range>==><target range>.
+std::ostream& operator<<(std::ostream &out, const Tunnel &tunnel)
+{
+  out << tunnel.m_sourceRange << "==>" << tunnel.m_targetRange;
+  return out;
+}
diff --git a/contrib/other-builds/extract-mixed-syntax/Tunnel.h b/contrib/other-builds/extract-mixed-syntax/Tunnel.h
new file mode 100644
index 000000000..2659cca4a
--- /dev/null
+++ b/contrib/other-builds/extract-mixed-syntax/Tunnel.h
@@ -0,0 +1,49 @@
+#pragma once
+
+/*
+ * Tunnel.h
+ * extract
+ *
+ * Created by Hieu Hoang on 19/01/2010.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ * + */ +#include +#include +#include +#include +#include "Range.h" + + // for unaligned source terminal + +class Tunnel +{ + friend std::ostream& operator<<(std::ostream&, const Tunnel&); + +protected: + + Range m_sourceRange, m_targetRange; + +public: + Tunnel() + {} + + Tunnel(const Tunnel ©) + :m_sourceRange(copy.m_sourceRange) + ,m_targetRange(copy.m_targetRange) + {} + + Tunnel(const Range &sourceRange, const Range &targetRange) + :m_sourceRange(sourceRange) + ,m_targetRange(targetRange) + {} + + const Range &GetRange(size_t direction) const + { return (direction == 0) ? m_sourceRange : m_targetRange; } + + int Compare(const Tunnel &other) const; + int Compare(const Tunnel &other, size_t direction) const; +}; + +typedef std::vector TunnelList; + diff --git a/contrib/other-builds/extract-mixed-syntax/TunnelCollection.cpp b/contrib/other-builds/extract-mixed-syntax/TunnelCollection.cpp new file mode 100644 index 000000000..228cc3070 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/TunnelCollection.cpp @@ -0,0 +1,70 @@ +/* + * TunnelCollection.cpp + * extract + * + * Created by Hieu Hoang on 19/01/2010. + * Copyright 2010 __MyCompanyName__. All rights reserved. 
+ * + */ + +#include "TunnelCollection.h" +#include "Range.h" + +using namespace std; + +size_t TunnelCollection::NumUnalignedWord(size_t direction, size_t startPos, size_t endPos) const +{ + assert(startPos <= endPos); + + if (direction == 0) + assert(endPos < alignedCountS.size()); + else + assert(endPos < alignedCountT.size()); + + size_t ret = 0; + for (size_t ind = startPos; ind <= endPos; ++ind) + { + if (direction == 0 && alignedCountS[ind] == 0) + { + ret++; + } + else if (direction == 1 && alignedCountT[ind] == 0) + { + ret++; + } + + } + + return ret; +} + +void TunnelCollection::Add(int startS, int endS, int startT, int endT) +{ + // m_phraseExist[startS][endS - startS].push_back(Tunnel(startT, endT)); + m_coll[startS][endS - startS].push_back(Tunnel(Range(startS, endS), Range(startT, endT))); +} + + +std::ostream& operator<<(std::ostream &out, const TunnelCollection &TunnelCollection) +{ + size_t size = TunnelCollection.GetSize(); + + for (size_t startPos = 0; startPos < size; ++startPos) + { + for (size_t endPos = startPos; endPos < size; ++endPos) + { + const TunnelList &tunnelList = TunnelCollection.GetTunnels(startPos, endPos); + TunnelList::const_iterator iter; + for (iter = tunnelList.begin(); iter != tunnelList.end(); ++iter) + { + const Tunnel &tunnel = *iter; + out << tunnel << " "; + + } + } + } + + return out; +} + + diff --git a/contrib/other-builds/extract-mixed-syntax/TunnelCollection.h b/contrib/other-builds/extract-mixed-syntax/TunnelCollection.h new file mode 100644 index 000000000..547cbf814 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/TunnelCollection.h @@ -0,0 +1,61 @@ +#pragma once +/* + * TunnelCollection.h + * extract + * + * Created by Hieu Hoang on 19/01/2010. + * Copyright 2010 __MyCompanyName__. All rights reserved. 
+ * + */ +#include +#include "Tunnel.h" + +// reposity of extracted phrase pairs +// which are potential tunnels in larger phrase pairs +class TunnelCollection + { + friend std::ostream& operator<<(std::ostream&, const TunnelCollection&); + + protected: + std::vector< std::vector > m_coll; + // indexed by source pos. and source length + // maps to list of tunnels where are target pos + + public: + std::vector alignedCountS, alignedCountT; + + TunnelCollection(const TunnelCollection &); + + TunnelCollection(size_t size) + :m_coll(size) + { + // size is the length of the source sentence + for (size_t pos = 0; pos < size; ++pos) + { + // create empty tunnel lists + std::vector &endVec = m_coll[pos]; + endVec.resize(size - pos); + } + } + + void Add(int startS, int endS, int startT, int endT); + + //const TunnelList &GetTargetHoles(int startS, int endS) const + //{ + // const TunnelList &targetHoles = m_phraseExist[startS][endS - startS]; + // return targetHoles; + //} + const TunnelList &GetTunnels(int startS, int endS) const + { + const TunnelList &sourceHoles = m_coll[startS][endS - startS]; + return sourceHoles; + } + + const size_t GetSize() const + { return m_coll.size(); } + + size_t NumUnalignedWord(size_t direction, size_t startPos, size_t endPos) const; + + + }; + diff --git a/contrib/other-builds/extract-mixed-syntax/XmlTree.cpp b/contrib/other-builds/extract-mixed-syntax/XmlTree.cpp new file mode 100644 index 000000000..9145c8d1c --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/XmlTree.cpp @@ -0,0 +1,344 @@ +// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $ +// vim:tabstop=2 + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + 
version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include +#include +#include +#include +#include +#include "SyntaxTree.h" + +using namespace std; + + +inline std::vector Tokenize(const std::string& str, + const std::string& delimiters = " \t") +{ + std::vector tokens; + // Skip delimiters at beginning. + std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); + // Find first "non-delimiter". + std::string::size_type pos = str.find_first_of(delimiters, lastPos); + + while (std::string::npos != pos || std::string::npos != lastPos) + { + // Found a token, add it to the vector. + tokens.push_back(str.substr(lastPos, pos - lastPos)); + // Skip delimiters. 
Note the "not_of" + lastPos = str.find_first_not_of(delimiters, pos); + // Find next "non-delimiter" + pos = str.find_first_of(delimiters, lastPos); + } + + return tokens; +} + +const std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r") +{ + std::string res = str; + res.erase(str.find_last_not_of(dropChars)+1); + return res.erase(0, res.find_first_not_of(dropChars)); +} + +string ParseXmlTagAttribute(const string& tag,const string& attributeName){ + /*TODO deal with unescaping \"*/ + string tagOpen = attributeName + "=\""; + size_t contentsStart = tag.find(tagOpen); + if (contentsStart == string::npos) return ""; + contentsStart += tagOpen.size(); + size_t contentsEnd = tag.find_first_of('"',contentsStart+1); + if (contentsEnd == string::npos) { + cerr << "Malformed XML attribute: "<< tag; + return ""; + } + size_t possibleEnd; + while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != string::npos) { + contentsEnd = possibleEnd; + } + return tag.substr(contentsStart,contentsEnd-contentsStart); +} + +/** + * Remove "<" and ">" from XML tag + * + * \param str xml token to be stripped + */ +string TrimXml(const string& str) +{ + // too short to be xml token -> do nothing + if (str.size() < 2) return str; + + // strip first and last character + if (str[0] == '<' && str[str.size() - 1] == '>') + { + return str.substr(1, str.size() - 2); + } + // not an xml token -> do nothing + else { return str; } +} + +/** + * Check if the token is an XML tag, i.e. starts with "<" + * + * \param tag token to be checked + */ +bool isXmlTag(const string& tag) +{ + return tag[0] == '<'; +} + +/** + * Split up the input character string into tokens made up of + * either XML tags or text. + * example: this is a test . + * => (this ), (), ( is a ), (), ( test .) 
+ * + * \param str input string + */ +inline vector TokenizeXml(const string& str) +{ + string lbrack = "<"; + string rbrack = ">"; + vector tokens; // vector of tokens to be returned + string::size_type cpos = 0; // current position in string + string::size_type lpos = 0; // left start of xml tag + string::size_type rpos = 0; // right end of xml tag + + // walk thorugh the string (loop vver cpos) + while (cpos != str.size()) + { + // find the next opening "<" of an xml tag + lpos = str.find_first_of(lbrack, cpos); + if (lpos != string::npos) + { + // find the end of the xml tag + rpos = str.find_first_of(rbrack, lpos); + // sanity check: there has to be closing ">" + if (rpos == string::npos) + { + cerr << "ERROR: malformed XML: " << str << endl; + return tokens; + } + } + else // no more tags found + { + // add the rest as token + tokens.push_back(str.substr(cpos)); + break; + } + + // add stuff before xml tag as token, if there is any + if (lpos - cpos > 0) + tokens.push_back(str.substr(cpos, lpos - cpos)); + + // add xml tag as token + tokens.push_back(str.substr(lpos, rpos-lpos+1)); + cpos = rpos + 1; + } + return tokens; +} + +/** + * Process a sentence with xml annotation + * Xml tags may specifiy additional/replacing translation options + * and reordering constraints + * + * \param line in: sentence, out: sentence without the xml + * \param res vector with translation options specified by xml + * \param reorderingConstraint reordering constraint zones specified by xml + * \param walls reordering constraint walls specified by xml + */ +/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector + is so we can link things up afterwards. We can't create TranslationOptions as we + parse because we don't have the completed source parsed until after this function + removes all the markup from it (CreateFromString in Sentence::Read). 
+*/ +bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection ) { + //parse XML markup in translation line + + // no xml tag? we're done. + if (line.find_first_of('<') == string::npos) { return true; } + + // break up input into a vector of xml tags and text + // example: (this), (), (is a), (), (test .) + vector xmlTokens = TokenizeXml(line); + + // we need to store opened tags, until they are closed + // tags are stored as tripled (tagname, startpos, contents) + typedef pair< string, pair< size_t, string > > OpenedTag; + vector< OpenedTag > tagStack; // stack that contains active opened tags + + string cleanLine; // return string (text without xml) + size_t wordPos = 0; // position in sentence (in terms of number of words) + bool isLinked = false; + + // loop through the tokens + for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) + { + // not a xml tag, but regular text (may contain many words) + if(!isXmlTag(xmlTokens[xmlTokenPos])) + { + // add a space at boundary, if necessary + if (cleanLine.size()>0 && + cleanLine[cleanLine.size() - 1] != ' ' && + xmlTokens[xmlTokenPos][0] != ' ') + { + cleanLine += " "; + } + cleanLine += xmlTokens[xmlTokenPos]; // add to output + wordPos = Tokenize(cleanLine).size(); // count all the words + } + + // process xml tag + else + { + // *** get essential information about tag *** + + // strip extra boundary spaces and "<" and ">" + string tag = Trim(TrimXml(xmlTokens[xmlTokenPos])); + // cerr << "XML TAG IS: " << tag << std::endl; + + if (tag.size() == 0) + { + cerr << "ERROR: empty tag name: " << line << endl; + return false; + } + + // check if unary (e.g., "") + bool isUnary = ( tag[tag.size() - 1] == '/' ); + + // check if opening tag (e.g. 
"", not "")g + bool isClosed = ( tag[0] == '/' ); + bool isOpen = !isClosed; + + if (isClosed && isUnary) + { + cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl; + return false; + } + + if (isClosed) + tag = tag.substr(1); // remove "/" at the beginning + if (isUnary) + tag = tag.substr(0,tag.size()-1); // remove "/" at the end + + // find the tag name and contents + string::size_type endOfName = tag.find_first_of(' '); + string tagName = tag; + string tagContent = ""; + if (endOfName != string::npos) { + tagName = tag.substr(0,endOfName); + tagContent = tag.substr(endOfName+1); + } + + // *** process new tag *** + + if (isOpen || isUnary) + { + // put the tag on the tag stack + OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) ); + tagStack.push_back( openedTag ); + // cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl; + } + + // *** process completed tag *** + + if (isClosed || isUnary) + { + // pop last opened tag from stack; + if (tagStack.size() == 0) + { + cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl; + return false; + } + OpenedTag openedTag = tagStack.back(); + tagStack.pop_back(); + + // tag names have to match + if (openedTag.first != tagName) + { + cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl; + return false; + } + + // assemble remaining information about tag + size_t startPos = openedTag.second.first; + string tagContent = openedTag.second.second; + size_t endPos = wordPos; + + // span attribute overwrites position + string span = ParseXmlTagAttribute(tagContent,"span"); + if (! 
span.empty()) + { + vector ij = Tokenize(span, "-"); + if (ij.size() != 1 && ij.size() != 2) { + cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl; + return false; + } + startPos = atoi(ij[0].c_str()); + if (ij.size() == 1) endPos = startPos + 1; + else endPos = atoi(ij[1].c_str()) + 1; + } + + // cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl; + + if (startPos >= endPos) + { + cerr << "ERROR: tag " << tagName << " must span at least one word (" << startPos << "-" << endPos << "): " << line << endl; + return false; + } + + string label = ParseXmlTagAttribute(tagContent,"label"); + labelCollection.insert( label ); + + // report what we have processed so far + if (0) { + cerr << "XML TAG NAME IS: '" << tagName << "'" << endl; + cerr << "XML TAG LABEL IS: '" << label << "'" << endl; + cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl; + } + tree.AddNode( startPos, endPos-1, label ); + } + } + } + // we are done. 
check if there are tags that are still open + if (tagStack.size() > 0) + { + cerr << "ERROR: some opened tags were never closed: " << line << endl; + return false; + } + + // collect top labels + const SyntaxNodes &topNodes = tree.GetNodes( 0, wordPos-1 ); + for( SyntaxNodes::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) + { + const SyntaxNode *n = *node; + const string &label = n->GetLabel(); + if (topLabelCollection.find( label ) == topLabelCollection.end()) + topLabelCollection[ label ] = 0; + topLabelCollection[ label ]++; + } + + // return de-xml'ed sentence in line + line = cleanLine; + return true; +} diff --git a/contrib/other-builds/extract-mixed-syntax/XmlTree.h b/contrib/other-builds/extract-mixed-syntax/XmlTree.h new file mode 100644 index 000000000..cd54b8f17 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/XmlTree.h @@ -0,0 +1,35 @@ +#pragma once + +// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $ +// vim:tabstop=2 + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include +#include +#include +#include +#include "SyntaxTree.h" + +std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName); +std::string TrimXml(const std::string& str); +bool isXmlTag(const std::string& tag); +inline std::vector TokenizeXml(const std::string& str); +bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection ); diff --git a/contrib/other-builds/extract-mixed-syntax/extract.cpp b/contrib/other-builds/extract-mixed-syntax/extract.cpp new file mode 100644 index 000000000..f4360a94f --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/extract.cpp @@ -0,0 +1,242 @@ +// $Id: extract.cpp 2828 2010-02-01 16:07:58Z hieuhoang1972 $ +// vim:tabstop=2 + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2009 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include "extract.h" +#include "InputFileStream.h" +#include "Lattice.h" + +#ifdef WIN32 +// Include Visual Leak Detector +#include +#endif + +using namespace std; + +int main(int argc, char* argv[]) +{ + cerr << "Extract v2.0, written by Philipp Koehn\n" + << "rule extraction from an aligned parallel corpus\n"; + //time_t starttime = time(NULL); + + Global *global = new Global(); + g_global = global; + + + if (argc < 5) { + cerr << "syntax: extract-mixed-syntax corpus.target corpus.source corpus.align extract " + << " [ --Hierarchical | --Orientation" + << " | --GlueGrammar FILE | --UnknownWordLabel FILE" + << " | --OnlyDirect" + + << " | --MinHoleSpanSourceDefault[" << global->minHoleSpanSourceDefault << "]" + << " | --MaxHoleSpanSourceDefault[" << global->maxHoleSpanSourceDefault << "]" + << " | --MinHoleSpanSourceSyntax[" << global->minHoleSpanSourceSyntax << "]" + << " | --MaxHoleSpanSourceSyntax[" << global->maxHoleSpanSourceSyntax << "]" + + << " | --MaxSymbolsSource[" << global->maxSymbolsSource << "]" + << " | --MaxNonTerm[" << global->maxNonTerm << "]" + << " | --SourceSyntax | --TargetSyntax" + << " | --UppermostOnly[" << g_global->uppermostOnly << "]" + << endl; + exit(1); + } + char* &fileNameT = argv[1]; + char* &fileNameS = argv[2]; + char* &fileNameA = argv[3]; + string fileNameGlueGrammar; + string fileNameUnknownWordLabel; + string fileNameExtract = string(argv[4]); + + int optionInd = 5; + + for(int i=optionInd;iminHoleSpanSourceDefault = atoi(argv[++i]); + if (global->minHoleSpanSourceDefault < 1) { + cerr << "extract error: --minHoleSourceDefault should be at least 1" << endl; + exit(1); + } + } 
+ else if (strcmp(argv[i],"--MaxHoleSpanSourceDefault") == 0) { + global->maxHoleSpanSourceDefault = atoi(argv[++i]); + if (global->maxHoleSpanSourceDefault < 1) { + cerr << "extract error: --maxHoleSourceDefault should be at least 1" << endl; + exit(1); + } + } + else if (strcmp(argv[i],"--MinHoleSpanSourceSyntax") == 0) { + global->minHoleSpanSourceSyntax = atoi(argv[++i]); + if (global->minHoleSpanSourceSyntax < 1) { + cerr << "extract error: --minHoleSourceSyntax should be at least 1" << endl; + exit(1); + } + } + else if (strcmp(argv[i],"--UppermostOnly") == 0) { + global->uppermostOnly = atoi(argv[++i]); + } + else if (strcmp(argv[i],"--MaxHoleSpanSourceSyntax") == 0) { + global->maxHoleSpanSourceSyntax = atoi(argv[++i]); + if (global->maxHoleSpanSourceSyntax < 1) { + cerr << "extract error: --maxHoleSourceSyntax should be at least 1" << endl; + exit(1); + } + } + + // maximum number of words in hierarchical phrase + else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) { + global->maxSymbolsSource = atoi(argv[++i]); + if (global->maxSymbolsSource < 1) { + cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl; + exit(1); + } + } + // maximum number of non-terminals + else if (strcmp(argv[i],"--MaxNonTerm") == 0) { + global->maxNonTerm = atoi(argv[++i]); + if (global->maxNonTerm < 1) { + cerr << "extract error: --MaxNonTerm should be at least 1" << endl; + exit(1); + } + } + // allow consecutive non-terminals (X Y | X Y) + else if (strcmp(argv[i],"--TargetSyntax") == 0) { + global->targetSyntax = true; + } + else if (strcmp(argv[i],"--SourceSyntax") == 0) { + global->sourceSyntax = true; + } + // do not create many part00xx files! 
+ else if (strcmp(argv[i],"--NoFileLimit") == 0) { + // now default + } + else if (strcmp(argv[i],"--GlueGrammar") == 0) { + global->glueGrammarFlag = true; + if (++i >= argc) + { + cerr << "ERROR: Option --GlueGrammar requires a file name" << endl; + exit(0); + } + fileNameGlueGrammar = string(argv[i]); + cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl; + } + else if (strcmp(argv[i],"--UnknownWordLabel") == 0) { + global->unknownWordLabelFlag = true; + if (++i >= argc) + { + cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl; + exit(0); + } + fileNameUnknownWordLabel = string(argv[i]); + cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl; + } + // TODO: this should be a useful option + //else if (strcmp(argv[i],"--ZipFiles") == 0) { + // zipFiles = true; + //} + // if an source phrase is paired with two target phrases, then count(t|s) = 0.5 + else if (strcmp(argv[i],"--Mixed") == 0) { + global->mixed = true; + } + else if (strcmp(argv[i],"--AllowDefaultNonTermEdge") == 0) { + global->allowDefaultNonTermEdge = atoi(argv[++i]); + } + + else { + cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; + exit(1); + } + } + + // open input files + Moses::InputFileStream tFile(fileNameT); + Moses::InputFileStream sFile(fileNameS); + Moses::InputFileStream aFile(fileNameA); + + // open output files + string fileNameExtractInv = fileNameExtract + ".inv"; + string fileNameExtractOrientation = fileNameExtract + ".o"; + extractFile.open(fileNameExtract.c_str()); + + + // loop through all sentence pairs + int i=0; + while(true) { + i++; + + if (i % 1000 == 0) { + cerr << i << " " << flush; + } + + string targetString; + string sourceString; + string alignmentString; + + bool ok = getline(tFile, targetString); + if (!ok) + break; + getline(sFile, sourceString); + getline(aFile, alignmentString); + + //cerr << endl << targetString << endl << sourceString << endl << 
alignmentString << endl; + + //time_t currTime = time(NULL); + //cerr << "A " << (currTime - starttime) << endl; + + SentenceAlignment sentencePair; + if (sentencePair.Create( targetString, sourceString, alignmentString, i, *global )) + { + //cerr << sentence.sourceTree << endl; + //cerr << sentence.targetTree << endl; + + sentencePair.FindTunnels(*g_global); + //cerr << "C " << (time(NULL) - starttime) << endl; + //cerr << sentencePair << endl; + + sentencePair.CreateLattice(*g_global); + //cerr << "D " << (time(NULL) - starttime) << endl; + //cerr << sentencePair << endl; + + sentencePair.CreateRules(*g_global); + //cerr << "E " << (time(NULL) - starttime) << endl; + + //cerr << sentence.lattice->GetRules().GetSize() << endl; + extractFile << sentencePair.GetLattice().GetRules(); + } + + } + + tFile.Close(); + sFile.Close(); + aFile.Close(); + + delete global; +} + + + diff --git a/contrib/other-builds/extract-mixed-syntax/extract.h b/contrib/other-builds/extract-mixed-syntax/extract.h new file mode 100644 index 000000000..a3d2d618b --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/extract.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include "SyntaxTree.h" +#include "XmlTree.h" +#include "Tunnel.h" +#include "TunnelCollection.h" +#include "SentenceAlignment.h" +#include "Global.h" + +std::vector tokenize( const char [] ); + +#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \ + _IS.getline(_LINE, _SIZE, _DELIM); \ + if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \ + if (_IS.gcount() == _SIZE-1) { \ + cerr << "Line too long! Buffer overflow. 
Delete lines >=" \ + << _SIZE << " chars or raise LINE_MAX_LENGTH in phrase-extract/extract.cpp" \ + << endl; \ + exit(1); \ + } \ + } +#define LINE_MAX_LENGTH 1000000 + +const Global *g_global; + +std::ofstream extractFile; +std::ofstream extractFileInv; +std::ofstream extractFileOrientation; + +std::set< std::string > targetLabelCollection, sourceLabelCollection; +std::map< std::string, int > targetTopLabelCollection, sourceTopLabelCollection; diff --git a/contrib/other-builds/extract-mixed-syntax/gzfilebuf.h b/contrib/other-builds/extract-mixed-syntax/gzfilebuf.h new file mode 100644 index 000000000..885c661f0 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/gzfilebuf.h @@ -0,0 +1,81 @@ +#ifndef moses_gzfile_buf_h +#define moses_gzfile_buf_h + +#include +#include +#include + +class gzfilebuf : public std::streambuf { +public: + gzfilebuf(const char *filename) + { _gzf = gzopen(filename, "rb"); + setg (_buff+sizeof(int), // beginning of putback area + _buff+sizeof(int), // read position + _buff+sizeof(int)); // end position + } + ~gzfilebuf() { gzclose(_gzf); } +protected: + virtual int_type overflow (int_type c) { + throw; + } + + // write multiple characters + virtual + std::streamsize xsputn (const char* s, + std::streamsize num) { + throw; + } + + virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ){ throw; + } + + //read one character + virtual int_type underflow () { + // is read position before end of _buff? 
+ if (gptr() < egptr()) { + return traits_type::to_int_type(*gptr()); + } + + /* process size of putback area + * - use number of characters read + * - but at most four + */ + unsigned int numPutback = gptr() - eback(); + if (numPutback > sizeof(int)) { + numPutback = sizeof(int); + } + + /* copy up to four characters previously read into + * the putback _buff (area of first four characters) + */ + std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback, + numPutback); + + // read new characters + int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int)); + if (num <= 0) { + // ERROR or EOF + return EOF; + } + + // reset _buff pointers + setg (_buff+(sizeof(int)-numPutback), // beginning of putback area + _buff+sizeof(int), // read position + _buff+sizeof(int)+num); // end of buffer + + // return next character + return traits_type::to_int_type(*gptr()); + } + + std::streamsize xsgetn (char* s, + std::streamsize num) { + return gzread(_gzf,s,num); + } + +private: + gzFile _gzf; + static const unsigned int _buffsize = 1024; + char _buff[_buffsize]; +}; + +#endif diff --git a/contrib/other-builds/extract-mixed-syntax/tables-core.cpp b/contrib/other-builds/extract-mixed-syntax/tables-core.cpp new file mode 100644 index 000000000..c3c141b7f --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/tables-core.cpp @@ -0,0 +1,110 @@ +// $Id: tables-core.cpp 3131 2010-04-13 16:29:55Z pjwilliams $ +//#include "beammain.h" +//#include "SafeGetLine.h" +#include "tables-core.h" + +#define TABLE_LINE_MAX_LENGTH 1000 +#define UNKNOWNSTR "UNK" + +// as in beamdecoder/tables.cpp +vector tokenize( const char* input ) { + vector< string > token; + bool betweenWords = true; + int start=0; + int i=0; + for(; input[i] != '\0'; i++) { + bool isSpace = (input[i] == ' ' || input[i] == '\t'); + + if (!isSpace && betweenWords) { + start = i; + betweenWords = false; + } + else if (isSpace && !betweenWords) { + token.push_back( string( input+start, i-start ) ); + 
betweenWords = true; + } + } + if (!betweenWords) + token.push_back( string( input+start, i-start ) ); + return token; +} + +WORD_ID Vocabulary::storeIfNew( const WORD& word ) { + map::iterator i = lookup.find( word ); + + if( i != lookup.end() ) + return i->second; + + WORD_ID id = vocab.size(); + vocab.push_back( word ); + lookup[ word ] = id; + return id; +} + +WORD_ID Vocabulary::getWordID( const WORD& word ) { + map::iterator i = lookup.find( word ); + if( i == lookup.end() ) + return 0; + return i->second; +} + +PHRASE_ID PhraseTable::storeIfNew( const PHRASE& phrase ) { + map< PHRASE, PHRASE_ID >::iterator i = lookup.find( phrase ); + if( i != lookup.end() ) + return i->second; + + PHRASE_ID id = phraseTable.size(); + phraseTable.push_back( phrase ); + lookup[ phrase ] = id; + return id; +} + +PHRASE_ID PhraseTable::getPhraseID( const PHRASE& phrase ) { + map< PHRASE, PHRASE_ID >::iterator i = lookup.find( phrase ); + if( i == lookup.end() ) + return 0; + return i->second; +} + +void PhraseTable::clear() { + lookup.clear(); + phraseTable.clear(); +} + +void DTable::init() { + for(int i = -10; i<10; i++) + dtable[i] = -abs( i ); +} + +/* +void DTable::load( const string& fileName ) { + ifstream inFile; + inFile.open(fileName.c_str()); + istream *inFileP = &inFile; + + char line[TABLE_LINE_MAX_LENGTH]; + int i=0; + while(true) { + i++; + SAFE_GETLINE((*inFileP), line, TABLE_LINE_MAX_LENGTH, '\n', __FILE__); + if (inFileP->eof()) break; + + vector token = tokenize( line ); + if (token.size() < 2) { + cerr << "line " << i << " in " << fileName << " too short, skipping\n"; + continue; + } + + int d = atoi( token[0].c_str() ); + double prob = log( atof( token[1].c_str() ) ); + dtable[ d ] = prob; + } +} +*/ + +double DTable::get( int distortion ) { + if (dtable.find( distortion ) == dtable.end()) + return log( 0.00001 ); + return dtable[ distortion ]; +} + diff --git a/contrib/other-builds/extract-mixed-syntax/tables-core.h 
b/contrib/other-builds/extract-mixed-syntax/tables-core.h new file mode 100644 index 000000000..f039ced7e --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/tables-core.h @@ -0,0 +1,72 @@ +#pragma once +// $Id: tables-core.h 2416 2009-07-30 11:07:38Z hieuhoang1972 $ + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +#define TABLE_LINE_MAX_LENGTH 1000 +#define UNKNOWNSTR "UNK" + +vector tokenize( const char[] ); + +//! delete and remove every element of a collection object such as map, set, list etc +template +void RemoveAllInColl(COLL &coll) +{ + for (typename COLL::const_iterator iter = coll.begin() ; iter != coll.end() ; ++iter) + { + delete (*iter); + } + coll.clear(); +} + +typedef string WORD; +typedef unsigned int WORD_ID; + +class Vocabulary { + public: + map lookup; + vector< WORD > vocab; + WORD_ID storeIfNew( const WORD& ); + WORD_ID getWordID( const WORD& ); + inline WORD &getWord( WORD_ID id ) const { WORD &i = (WORD&) vocab[ id ]; return i; } +}; + +typedef vector< WORD_ID > PHRASE; +typedef unsigned int PHRASE_ID; + +class PhraseTable { + public: + map< PHRASE, PHRASE_ID > lookup; + vector< PHRASE > phraseTable; + PHRASE_ID storeIfNew( const PHRASE& ); + PHRASE_ID getPhraseID( const PHRASE& ); + void clear(); + inline PHRASE &getPhrase( const PHRASE_ID id ) { return phraseTable[ id ]; } +}; + +typedef vector< pair< PHRASE_ID, double > > PHRASEPROBVEC; + +class TTable { + public: + map< PHRASE_ID, vector< pair< PHRASE_ID, double > > > ttable; + map< PHRASE_ID, vector< pair< PHRASE_ID, vector< double > > > > ttableMulti; +}; + +class DTable { + public: + map< int, double > dtable; + void init(); + void load( const string& ); + double get( int ); +}; + + From 9555dc657d14215ee00293a8914a63dd1a9679cb Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 16 Jan 2014 17:18:51 +0000 Subject: [PATCH 13/48] change --MaxSymbolsSource to --MaxSymbols --- 
contrib/other-builds/extract-mixed-syntax/Global.cpp | 2 +- contrib/other-builds/extract-mixed-syntax/Global.h | 2 +- contrib/other-builds/extract-mixed-syntax/Rule.cpp | 10 +++++----- contrib/other-builds/extract-mixed-syntax/extract.cpp | 10 +++++----- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/contrib/other-builds/extract-mixed-syntax/Global.cpp b/contrib/other-builds/extract-mixed-syntax/Global.cpp index 5a851e2f2..bf7f912e7 100644 --- a/contrib/other-builds/extract-mixed-syntax/Global.cpp +++ b/contrib/other-builds/extract-mixed-syntax/Global.cpp @@ -18,7 +18,7 @@ Global::Global() , maxHoleSpanSourceSyntax(1000) , maxUnaligned(5) -, maxSymbolsSource(5) +, maxSymbols(5) , maxNonTerm(3) , maxNonTermDefault(2) diff --git a/contrib/other-builds/extract-mixed-syntax/Global.h b/contrib/other-builds/extract-mixed-syntax/Global.h index 71da54695..d96db6134 100644 --- a/contrib/other-builds/extract-mixed-syntax/Global.h +++ b/contrib/other-builds/extract-mixed-syntax/Global.h @@ -19,7 +19,7 @@ public: int minHoleSpanSourceSyntax; int maxHoleSpanSourceSyntax; - int maxSymbolsSource; + int maxSymbols; bool glueGrammarFlag; bool unknownWordLabelFlag; int maxNonTerm; diff --git a/contrib/other-builds/extract-mixed-syntax/Rule.cpp b/contrib/other-builds/extract-mixed-syntax/Rule.cpp index 38d197934..741bacf89 100644 --- a/contrib/other-builds/extract-mixed-syntax/Rule.cpp +++ b/contrib/other-builds/extract-mixed-syntax/Rule.cpp @@ -162,10 +162,10 @@ void Rule::CreateSymbols(const Global &global, bool &isValid, const SentenceAlig assert(indNonTerm == nonTerms.size()); - if (m_target.GetSize() > global.maxSymbolsSource) { + if (m_target.GetSize() > global.maxSymbols) { isValid = false; - cerr << "m_source=" << m_source.GetSize() << ":" << m_source << endl; - cerr << "m_target=" << m_target.GetSize() << ":" << m_target << endl; + //cerr << "m_source=" << m_source.GetSize() << ":" << m_source << endl; + //cerr << "m_target=" << m_target.GetSize() << ":" << 
m_target << endl; } } } @@ -229,7 +229,7 @@ bool Rule::IsValid(const Global &global, const TunnelCollection &tunnelColl) con return false; } - if (GetNumSymbols() > global.maxSymbolsSource) + if (GetNumSymbols() > global.maxSymbols) { return false; } @@ -316,7 +316,7 @@ bool Rule::IsHole(const TunnelCollection &tunnelColl) const bool Rule::CanRecurse(const Global &global, const TunnelCollection &tunnelColl) const { - if (GetNumSymbols() >= global.maxSymbolsSource) + if (GetNumSymbols() >= global.maxSymbols) return false; if (AdjacentDefaultNonTerms()) return false; diff --git a/contrib/other-builds/extract-mixed-syntax/extract.cpp b/contrib/other-builds/extract-mixed-syntax/extract.cpp index f4360a94f..05eeffcfe 100644 --- a/contrib/other-builds/extract-mixed-syntax/extract.cpp +++ b/contrib/other-builds/extract-mixed-syntax/extract.cpp @@ -58,7 +58,7 @@ int main(int argc, char* argv[]) << " | --MinHoleSpanSourceSyntax[" << global->minHoleSpanSourceSyntax << "]" << " | --MaxHoleSpanSourceSyntax[" << global->maxHoleSpanSourceSyntax << "]" - << " | --MaxSymbolsSource[" << global->maxSymbolsSource << "]" + << " | --MaxSymbols[" << global->maxSymbols<< "]" << " | --MaxNonTerm[" << global->maxNonTerm << "]" << " | --SourceSyntax | --TargetSyntax" << " | --UppermostOnly[" << g_global->uppermostOnly << "]" @@ -109,10 +109,10 @@ int main(int argc, char* argv[]) } // maximum number of words in hierarchical phrase - else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) { - global->maxSymbolsSource = atoi(argv[++i]); - if (global->maxSymbolsSource < 1) { - cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl; + else if (strcmp(argv[i],"--maxSymbols") == 0) { + global->maxSymbols = atoi(argv[++i]); + if (global->maxSymbols < 1) { + cerr << "extract error: --maxSymbols should be at least 1" << endl; exit(1); } } From 2cd6649e629a307061b78f0d939a66a49d06e15e Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 16 Jan 2014 19:13:44 +0000 Subject: [PATCH 14/48] 
output inverse extract --- .../extract-mixed-syntax/.cproject | 10 +- .../extract-mixed-syntax/Global.cpp | 2 + .../extract-mixed-syntax/Global.h | 3 +- .../extract-mixed-syntax/Lattice.cpp | 4 +- .../extract-mixed-syntax/Makefile | 5 +- .../extract-mixed-syntax/OutputFileStream.cpp | 79 ++++++++++ .../extract-mixed-syntax/OutputFileStream.h | 50 +++++++ .../extract-mixed-syntax/Rule.cpp | 135 ++++++++++++------ .../other-builds/extract-mixed-syntax/Rule.h | 6 +- .../extract-mixed-syntax/RuleCollection.cpp | 30 ++-- .../extract-mixed-syntax/RuleCollection.h | 6 +- .../extract-mixed-syntax/extract.cpp | 27 +++- .../extract-mixed-syntax/extract.h | 5 - 13 files changed, 283 insertions(+), 79 deletions(-) create mode 100644 contrib/other-builds/extract-mixed-syntax/OutputFileStream.cpp create mode 100644 contrib/other-builds/extract-mixed-syntax/OutputFileStream.h diff --git a/contrib/other-builds/extract-mixed-syntax/.cproject b/contrib/other-builds/extract-mixed-syntax/.cproject index 345f4d2f5..1cc09dda3 100644 --- a/contrib/other-builds/extract-mixed-syntax/.cproject +++ b/contrib/other-builds/extract-mixed-syntax/.cproject @@ -27,12 +27,19 @@ - + @@ -122,4 +129,5 @@ + diff --git a/contrib/other-builds/extract-mixed-syntax/Global.cpp b/contrib/other-builds/extract-mixed-syntax/Global.cpp index bf7f912e7..27aeb4b95 100644 --- a/contrib/other-builds/extract-mixed-syntax/Global.cpp +++ b/contrib/other-builds/extract-mixed-syntax/Global.cpp @@ -32,4 +32,6 @@ Global::Global() , mixed(true) , uppermostOnly(true) , allowDefaultNonTermEdge(true) +, gzOutput(false) + {} diff --git a/contrib/other-builds/extract-mixed-syntax/Global.h b/contrib/other-builds/extract-mixed-syntax/Global.h index d96db6134..41cdbf0ce 100644 --- a/contrib/other-builds/extract-mixed-syntax/Global.h +++ b/contrib/other-builds/extract-mixed-syntax/Global.h @@ -30,7 +30,8 @@ public: int maxUnaligned; bool uppermostOnly; bool allowDefaultNonTermEdge; - + bool gzOutput; + Global(); Global(const 
Global&); diff --git a/contrib/other-builds/extract-mixed-syntax/Lattice.cpp b/contrib/other-builds/extract-mixed-syntax/Lattice.cpp index 8637fad8e..2b9ebac6e 100644 --- a/contrib/other-builds/extract-mixed-syntax/Lattice.cpp +++ b/contrib/other-builds/extract-mixed-syntax/Lattice.cpp @@ -165,7 +165,7 @@ std::ostream& operator<<(std::ostream &out, const Lattice &obj) for (iter = obj.m_stacks.begin(); iter != obj.m_stacks.end(); ++iter) { const Stack &stack = *iter; - + Stack::const_iterator iterStack; for (iterStack = stack.begin(); iterStack != stack.end(); ++iterStack) { @@ -173,7 +173,7 @@ std::ostream& operator<<(std::ostream &out, const Lattice &obj) out << node << " "; } } - + return out; } diff --git a/contrib/other-builds/extract-mixed-syntax/Makefile b/contrib/other-builds/extract-mixed-syntax/Makefile index 910c759ea..b992b161f 100644 --- a/contrib/other-builds/extract-mixed-syntax/Makefile +++ b/contrib/other-builds/extract-mixed-syntax/Makefile @@ -6,7 +6,8 @@ clean: .cpp.o: g++ -O6 -g -c $< -extract: tables-core.o extract.o SyntaxTree.o XmlTree.o Tunnel.o Lattice.o LatticeNode.o SentenceAlignment.o Global.o InputFileStream.o TunnelCollection.o RuleCollection.o Rule.o Symbol.o SymbolSequence.o Range.o +extract: tables-core.o extract.o SyntaxTree.o XmlTree.o Tunnel.o Lattice.o LatticeNode.o SentenceAlignment.o Global.o InputFileStream.o TunnelCollection.o RuleCollection.o Rule.o Symbol.o SymbolSequence.o Range.o OutputFileStream.o + + g++ tables-core.o extract.o SyntaxTree.o XmlTree.o Tunnel.o Lattice.o LatticeNode.o SentenceAlignment.o Global.o InputFileStream.o TunnelCollection.o RuleCollection.o Rule.o Symbol.o SymbolSequence.o Range.o OutputFileStream.o -lz -lboost_iostreams-mt -o extract-mixed-syntax - g++ tables-core.o extract.o SyntaxTree.o XmlTree.o Tunnel.o Lattice.o LatticeNode.o SentenceAlignment.o Global.o InputFileStream.o TunnelCollection.o RuleCollection.o Rule.o Symbol.o SymbolSequence.o Range.o -lz -o extract-mixed-syntax diff --git 
a/contrib/other-builds/extract-mixed-syntax/OutputFileStream.cpp b/contrib/other-builds/extract-mixed-syntax/OutputFileStream.cpp new file mode 100644 index 000000000..a61ce1ab1 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/OutputFileStream.cpp @@ -0,0 +1,79 @@ +// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include +#include "OutputFileStream.h" +#include "gzfilebuf.h" + +using namespace std; + +namespace Moses +{ +OutputFileStream::OutputFileStream() + :boost::iostreams::filtering_ostream() + ,m_outFile(NULL) +{ +} + +OutputFileStream::OutputFileStream(const std::string &filePath) + : m_outFile(NULL) +{ + Open(filePath); +} + +OutputFileStream::~OutputFileStream() +{ + Close(); +} + +bool OutputFileStream::Open(const std::string &filePath) +{ + m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary); + if (m_outFile->fail()) { + return false; + } + + if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") { + this->push(boost::iostreams::gzip_compressor()); + } + this->push(*m_outFile); + + return true; +} + +void OutputFileStream::Close() +{ + if (m_outFile == NULL) { + return; + } + + this->flush(); + this->pop(); // file + + m_outFile->close(); + delete m_outFile; + m_outFile = NULL; + return; +} + + +} + diff --git a/contrib/other-builds/extract-mixed-syntax/OutputFileStream.h b/contrib/other-builds/extract-mixed-syntax/OutputFileStream.h new file mode 100644 index 000000000..f52e36d76 --- /dev/null +++ b/contrib/other-builds/extract-mixed-syntax/OutputFileStream.h @@ -0,0 +1,50 @@ +// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or 
(at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +namespace Moses +{ + +/** Used in place of std::istream, can read zipped files if it ends in .gz + */ +class OutputFileStream : public boost::iostreams::filtering_ostream +{ +protected: + std::ofstream *m_outFile; +public: + OutputFileStream(); + + OutputFileStream(const std::string &filePath); + virtual ~OutputFileStream(); + + bool Open(const std::string &filePath); + void Close(); +}; + +} + diff --git a/contrib/other-builds/extract-mixed-syntax/Rule.cpp b/contrib/other-builds/extract-mixed-syntax/Rule.cpp index 741bacf89..01970eac8 100644 --- a/contrib/other-builds/extract-mixed-syntax/Rule.cpp +++ b/contrib/other-builds/extract-mixed-syntax/Rule.cpp @@ -499,55 +499,96 @@ const LatticeNode &Rule::GetLatticeNode(size_t ind) const void Rule::DebugOutput() const { - std::stringstream strme; - strme << *this; - cerr << strme.str(); - + Output(cerr); } -std::ostream& operator<<(std::ostream &out, const Rule &obj) +void Rule::Output(std::ostream &out) const { - - stringstream strmeS, strmeT; - - std::vector::const_iterator iterSymbol; - for (iterSymbol = obj.m_source.begin(); iterSymbol != obj.m_source.end(); ++iterSymbol) - { - const Symbol &symbol = *iterSymbol; - strmeS << symbol << " "; - } - - for (iterSymbol = obj.m_target.begin(); iterSymbol != obj.m_target.end(); ++iterSymbol) - { - const Symbol &symbol = *iterSymbol; - 
strmeT << symbol << " "; - } - - // lhs - if (obj.m_lhs) - { - strmeS << obj.m_lhs->GetSyntaxNode(0).GetLabel(); - strmeT << obj.m_lhs->GetSyntaxNode(1).GetLabel(); - } - - out << strmeS.str() << " ||| " << strmeT.str() << " ||| "; - - // alignment - Rule::CollType::const_iterator iter; - for (iter = obj.m_coll.begin(); iter != obj.m_coll.end(); ++iter) - { - const RuleElement &element = *iter; - const LatticeNode &node = element.GetLatticeNode(); - bool isTerminal = node.IsTerminal(); - - if (!isTerminal) - { - out << element.m_alignmentPos.first << "-" << element.m_alignmentPos.second << " "; - } - } - - out << "||| 1"; - - return out; + + stringstream strmeS, strmeT; + + std::vector::const_iterator iterSymbol; + for (iterSymbol = m_source.begin(); iterSymbol != m_source.end(); ++iterSymbol) + { + const Symbol &symbol = *iterSymbol; + strmeS << symbol << " "; + } + + for (iterSymbol = m_target.begin(); iterSymbol != m_target.end(); ++iterSymbol) + { + const Symbol &symbol = *iterSymbol; + strmeT << symbol << " "; + } + + // lhs + if (m_lhs) + { + strmeS << m_lhs->GetSyntaxNode(0).GetLabel(); + strmeT << m_lhs->GetSyntaxNode(1).GetLabel(); + } + + out << strmeS.str() << " ||| " << strmeT.str() << " ||| "; + + // alignment + Rule::CollType::const_iterator iter; + for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) + { + const RuleElement &element = *iter; + const LatticeNode &node = element.GetLatticeNode(); + bool isTerminal = node.IsTerminal(); + + if (!isTerminal) + { + out << element.m_alignmentPos.first << "-" << element.m_alignmentPos.second << " "; + } + } + + out << "||| 1"; + } +void Rule::OutputInv(std::ostream &out) const +{ + stringstream strmeS, strmeT; + + std::vector::const_iterator iterSymbol; + for (iterSymbol = m_source.begin(); iterSymbol != m_source.end(); ++iterSymbol) + { + const Symbol &symbol = *iterSymbol; + strmeS << symbol << " "; + } + + for (iterSymbol = m_target.begin(); iterSymbol != m_target.end(); ++iterSymbol) + { + const 
Symbol &symbol = *iterSymbol; + strmeT << symbol << " "; + } + + // lhs + if (m_lhs) + { + strmeS << m_lhs->GetSyntaxNode(0).GetLabel(); + strmeT << m_lhs->GetSyntaxNode(1).GetLabel(); + } + + out << strmeT.str() << " ||| " << strmeS.str() << " ||| "; + + // alignment + Rule::CollType::const_iterator iter; + for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) + { + const RuleElement &element = *iter; + const LatticeNode &node = element.GetLatticeNode(); + bool isTerminal = node.IsTerminal(); + + if (!isTerminal) + { + out << element.m_alignmentPos.first << "-" << element.m_alignmentPos.second << " "; + } + } + + out << "||| 1"; + +} + + diff --git a/contrib/other-builds/extract-mixed-syntax/Rule.h b/contrib/other-builds/extract-mixed-syntax/Rule.h index bc74fa3f5..3574094fe 100644 --- a/contrib/other-builds/extract-mixed-syntax/Rule.h +++ b/contrib/other-builds/extract-mixed-syntax/Rule.h @@ -41,8 +41,6 @@ public: class Rule { - friend std::ostream& operator<<(std::ostream &out, const Rule &obj); - protected: typedef std::vector CollType; CollType m_coll; @@ -91,4 +89,8 @@ public: Range GetSourceRange() const; DEBUG_OUTPUT(); + + void Output(std::ostream &out) const; + void OutputInv(std::ostream &out) const; + }; diff --git a/contrib/other-builds/extract-mixed-syntax/RuleCollection.cpp b/contrib/other-builds/extract-mixed-syntax/RuleCollection.cpp index 28b7adb1b..8389a70cf 100644 --- a/contrib/other-builds/extract-mixed-syntax/RuleCollection.cpp +++ b/contrib/other-builds/extract-mixed-syntax/RuleCollection.cpp @@ -76,19 +76,27 @@ void RuleCollection::Add(const Global &global, Rule *rule, const SentenceAlignme } +void RuleCollection::Output(std::ostream &out) const +{ + RuleCollection::CollType::const_iterator iter; + for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) + { + const Rule &rule = **iter; + rule.Output(out); + out << endl; + } +} -std::ostream& operator<<(std::ostream &out, const RuleCollection &obj) -{ - 
RuleCollection::CollType::const_iterator iter; - for (iter = obj.m_coll.begin(); iter != obj.m_coll.end(); ++iter) - { - const Rule &rule = **iter; - out << rule << endl; - } - - return out; +void RuleCollection::OutputInv(std::ostream &out) const +{ + RuleCollection::CollType::const_iterator iter; + for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) + { + const Rule &rule = **iter; + rule.OutputInv(out); + out << endl; + } } - diff --git a/contrib/other-builds/extract-mixed-syntax/RuleCollection.h b/contrib/other-builds/extract-mixed-syntax/RuleCollection.h index 75d55b864..27d5d794a 100644 --- a/contrib/other-builds/extract-mixed-syntax/RuleCollection.h +++ b/contrib/other-builds/extract-mixed-syntax/RuleCollection.h @@ -38,8 +38,6 @@ struct CompareRule class RuleCollection { - friend std::ostream& operator<<(std::ostream &out, const RuleCollection &obj); - protected: typedef std::set CollType; CollType m_coll; @@ -49,5 +47,9 @@ public: void Add(const Global &global, Rule *rule, const SentenceAlignment &sentence); size_t GetSize() const { return m_coll.size(); } + + void Output(std::ostream &out) const; + void OutputInv(std::ostream &out) const; + }; diff --git a/contrib/other-builds/extract-mixed-syntax/extract.cpp b/contrib/other-builds/extract-mixed-syntax/extract.cpp index 05eeffcfe..8d6bba3d7 100644 --- a/contrib/other-builds/extract-mixed-syntax/extract.cpp +++ b/contrib/other-builds/extract-mixed-syntax/extract.cpp @@ -26,8 +26,10 @@ #include #include #include +#include #include "extract.h" #include "InputFileStream.h" +#include "OutputFileStream.h" #include "Lattice.h" #ifdef WIN32 @@ -165,8 +167,10 @@ int main(int argc, char* argv[]) } else if (strcmp(argv[i],"--AllowDefaultNonTermEdge") == 0) { global->allowDefaultNonTermEdge = atoi(argv[++i]); - } - + } else if (strcmp(argv[i], "--GZOutput") == 0) { + global->gzOutput = true; + } + else { cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; exit(1); @@ -180,8 +184,15 @@ 
int main(int argc, char* argv[]) // open output files string fileNameExtractInv = fileNameExtract + ".inv"; - string fileNameExtractOrientation = fileNameExtract + ".o"; - extractFile.open(fileNameExtract.c_str()); + if (global->gzOutput) { + fileNameExtract += ".gz"; + fileNameExtractInv += ".gz"; + } + + Moses::OutputFileStream extractFile; + Moses::OutputFileStream extractFileInv; + extractFile.Open(fileNameExtract.c_str()); + extractFileInv.Open(fileNameExtractInv.c_str()); // loop through all sentence pairs @@ -226,7 +237,8 @@ int main(int argc, char* argv[]) //cerr << "E " << (time(NULL) - starttime) << endl; //cerr << sentence.lattice->GetRules().GetSize() << endl; - extractFile << sentencePair.GetLattice().GetRules(); + sentencePair.GetLattice().GetRules().Output(extractFile); + sentencePair.GetLattice().GetRules().OutputInv(extractFileInv); } } @@ -234,7 +246,10 @@ int main(int argc, char* argv[]) tFile.Close(); sFile.Close(); aFile.Close(); - + + extractFile.Close(); + extractFileInv.Close(); + delete global; } diff --git a/contrib/other-builds/extract-mixed-syntax/extract.h b/contrib/other-builds/extract-mixed-syntax/extract.h index a3d2d618b..ac831f2d9 100644 --- a/contrib/other-builds/extract-mixed-syntax/extract.h +++ b/contrib/other-builds/extract-mixed-syntax/extract.h @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include "SyntaxTree.h" @@ -31,9 +30,5 @@ std::vector tokenize( const char [] ); const Global *g_global; -std::ofstream extractFile; -std::ofstream extractFileInv; -std::ofstream extractFileOrientation; - std::set< std::string > targetLabelCollection, sourceLabelCollection; std::map< std::string, int > targetTopLabelCollection, sourceTopLabelCollection; From fef94f27a7d91f8b744e6a83de42d0209ce27c31 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 16 Jan 2014 19:40:08 +0000 Subject: [PATCH 15/48] correct inverse alignment --- contrib/other-builds/extract-mixed-syntax/Rule.cpp | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/contrib/other-builds/extract-mixed-syntax/Rule.cpp b/contrib/other-builds/extract-mixed-syntax/Rule.cpp index 01970eac8..7cc7d3a6f 100644 --- a/contrib/other-builds/extract-mixed-syntax/Rule.cpp +++ b/contrib/other-builds/extract-mixed-syntax/Rule.cpp @@ -583,7 +583,7 @@ void Rule::OutputInv(std::ostream &out) const if (!isTerminal) { - out << element.m_alignmentPos.first << "-" << element.m_alignmentPos.second << " "; + out << element.m_alignmentPos.second << "-" << element.m_alignmentPos.first << " "; } } From 0410ab67ead3fba0f633fd97069988216cf4af96 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 16 Jan 2014 20:14:25 +0000 Subject: [PATCH 16/48] make it compatible with normal extract --- .../extract-mixed-syntax/extract.cpp | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/contrib/other-builds/extract-mixed-syntax/extract.cpp b/contrib/other-builds/extract-mixed-syntax/extract.cpp index 8d6bba3d7..d16d1eee2 100644 --- a/contrib/other-builds/extract-mixed-syntax/extract.cpp +++ b/contrib/other-builds/extract-mixed-syntax/extract.cpp @@ -47,7 +47,7 @@ int main(int argc, char* argv[]) Global *global = new Global(); g_global = global; - + int sentenceOffset = 0; if (argc < 5) { cerr << "syntax: extract-mixed-syntax corpus.target corpus.source corpus.align extract " @@ -167,16 +167,28 @@ int main(int argc, char* argv[]) } else if (strcmp(argv[i],"--AllowDefaultNonTermEdge") == 0) { global->allowDefaultNonTermEdge = atoi(argv[++i]); - } else if (strcmp(argv[i], "--GZOutput") == 0) { + } + else if (strcmp(argv[i], "--GZOutput") == 0) { global->gzOutput = true; } - + else if (strcmp(argv[i],"--MaxSpan") == 0) { + // ignore + ++i; + } + else if (strcmp(argv[i],"--SentenceOffset") == 0) { + if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') { + cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl; + exit(1); + } + sentenceOffset = atoi(argv[++i]); 
+ } else { cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; exit(1); } } + // open input files Moses::InputFileStream tFile(fileNameT); Moses::InputFileStream sFile(fileNameS); @@ -196,7 +208,7 @@ int main(int argc, char* argv[]) // loop through all sentence pairs - int i=0; + int i = sentenceOffset; while(true) { i++; @@ -240,7 +252,6 @@ int main(int argc, char* argv[]) sentencePair.GetLattice().GetRules().Output(extractFile); sentencePair.GetLattice().GetRules().OutputInv(extractFileInv); } - } tFile.Close(); From 0ff626a7161765d2c8ab178b6acaf21c2804e5c6 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 17 Jan 2014 15:43:51 +0000 Subject: [PATCH 17/48] -extract-command for train-model.perl to override the extract program to be used --- scripts/training/train-model.perl | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index d9ac65b7b..321b7c3f2 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -39,7 +39,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_ $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS, @_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE, @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_ALIGNMENT, - $_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $_INSTANCE_WEIGHTS_FILE, $_LMODEL_OOV_FEATURE, $_NUM_LATTICE_FEATURES, $IGNORE, $_FLEXIBILITY_SCORE); + $_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $_INSTANCE_WEIGHTS_FILE, $_LMODEL_OOV_FEATURE, $_NUM_LATTICE_FEATURES, $IGNORE, $_FLEXIBILITY_SCORE, $_EXTRACT_COMMAND); my $_BASELINE_CORPUS = ""; my $_CORES = 1; my $debug = 0; # debug this script, do not delete any files in debug mode @@ -140,6 +140,7 @@ $_HELP = 1 'lmodel-oov-feature' => \$_LMODEL_OOV_FEATURE, 'num-lattice-features=i' => \$_NUM_LATTICE_FEATURES, 'flexibility-score' => \$_FLEXIBILITY_SCORE, + 'extract-command' => \$_EXTRACT_COMMAND, 
); if ($_HELP) { @@ -303,11 +304,19 @@ my $__SORT_PARALLEL = ""; $__SORT_PARALLEL = "--parallel $_SORT_PARALLEL" if $_SORT_PARALLEL; # supporting scripts/binaries from this package -my $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/../bin/extract"; +my $PHRASE_EXTRACT; +if (defined($_EXTRACT_COMMAND) { + $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/../bin/$_EXTRACT_COMMAND"; +else { + $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/../bin/extract"; +} $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS $__SORT_PARALLEL\" $PHRASE_EXTRACT"; my $RULE_EXTRACT; -if (defined($_GHKM)) { +if (defined($_EXTRACT_COMMAND) { + $RULE_EXTRACT = "$SCRIPTS_ROOTDIR/../bin/$_EXTRACT_COMMAND"; +} +elsif (defined($_GHKM)) { $RULE_EXTRACT = "$SCRIPTS_ROOTDIR/../bin/extract-ghkm"; } else { From 1632194a9ecadd4f01f02a8cb13645b91a70c4fd Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 17 Jan 2014 17:08:34 +0000 Subject: [PATCH 18/48] syntax error --- scripts/training/train-model.perl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 321b7c3f2..bcf852eef 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -305,15 +305,16 @@ $__SORT_PARALLEL = "--parallel $_SORT_PARALLEL" if $_SORT_PARALLEL; # supporting scripts/binaries from this package my $PHRASE_EXTRACT; -if (defined($_EXTRACT_COMMAND) { +if (defined($_EXTRACT_COMMAND)) { $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/../bin/$_EXTRACT_COMMAND"; +} else { $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/../bin/extract"; } $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS $__SORT_PARALLEL\" $PHRASE_EXTRACT"; my $RULE_EXTRACT; -if (defined($_EXTRACT_COMMAND) { +if (defined($_EXTRACT_COMMAND)) { $RULE_EXTRACT = "$SCRIPTS_ROOTDIR/../bin/$_EXTRACT_COMMAND"; 
} elsif (defined($_GHKM)) { From ade7400fc7de4bf491663981f7c4ebb7847f2026 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 20 Jan 2014 15:42:17 +0000 Subject: [PATCH 19/48] minor error --- scripts/training/train-model.perl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index bcf852eef..2cc469cc7 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -140,7 +140,7 @@ $_HELP = 1 'lmodel-oov-feature' => \$_LMODEL_OOV_FEATURE, 'num-lattice-features=i' => \$_NUM_LATTICE_FEATURES, 'flexibility-score' => \$_FLEXIBILITY_SCORE, - 'extract-command' => \$_EXTRACT_COMMAND, + 'extract-command=s' => \$_EXTRACT_COMMAND, ); if ($_HELP) { From b03e035b5dd04b9b7a5e640dad0f8ae169bc8a77 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 20 Jan 2014 18:37:48 +0000 Subject: [PATCH 20/48] output glue rules --- .../extract-mixed-syntax/extract.cpp | 44 ++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/contrib/other-builds/extract-mixed-syntax/extract.cpp b/contrib/other-builds/extract-mixed-syntax/extract.cpp index d16d1eee2..334a3e124 100644 --- a/contrib/other-builds/extract-mixed-syntax/extract.cpp +++ b/contrib/other-builds/extract-mixed-syntax/extract.cpp @@ -39,6 +39,8 @@ using namespace std; +void writeGlueGrammar(const string &, Global &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection); + int main(int argc, char* argv[]) { cerr << "Extract v2.0, written by Philipp Koehn\n" @@ -261,8 +263,48 @@ int main(int argc, char* argv[]) extractFile.Close(); extractFileInv.Close(); - delete global; + if (global->glueGrammarFlag) { + writeGlueGrammar(fileNameGlueGrammar, *global, targetLabelCollection, targetTopLabelCollection); + } + + delete global; } +void writeGlueGrammar( const string & fileName, Global &options, set< string > &targetLabelCollection, map< string, int > 
&targetTopLabelCollection ) +{ + ofstream grammarFile; + grammarFile.open(fileName.c_str()); + if (!options.targetSyntax) { + grammarFile << " [X] ||| [S] ||| 1 ||| ||| 0" << endl + << "[X][S] [X] ||| [X][S] [S] ||| 1 ||| 0-0 ||| 0" << endl + << "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl; + } else { + // chose a top label that is not already a label + string topLabel = "QQQQQQ"; + for( unsigned int i=1; i<=topLabel.length(); i++) { + if(targetLabelCollection.find( topLabel.substr(0,i) ) == targetLabelCollection.end() ) { + topLabel = topLabel.substr(0,i); + break; + } + } + // basic rules + grammarFile << " [X] ||| [" << topLabel << "] ||| 1 ||| " << endl + << "[X][" << topLabel << "] [X] ||| [X][" << topLabel << "] [" << topLabel << "] ||| 1 ||| 0-0 " << endl; + + // top rules + for( map::const_iterator i = targetTopLabelCollection.begin(); + i != targetTopLabelCollection.end(); i++ ) { + grammarFile << " [X][" << i->first << "] [X] ||| [X][" << i->first << "] [" << topLabel << "] ||| 1 ||| 1-1" << endl; + } + + // glue rules + for( set::const_iterator i = targetLabelCollection.begin(); + i != targetLabelCollection.end(); i++ ) { + grammarFile << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1" << endl; + } + grammarFile << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 " << endl; // glue rule for unknown word... 
+ } + grammarFile.close(); +} From c537d940a9bdbead1b14cdd83f11e542f65892d7 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 20 Jan 2014 19:27:46 +0000 Subject: [PATCH 21/48] compile manual-label program --- contrib/other-builds/manual-label/.cproject | 124 ++++++++++++++++++++ contrib/other-builds/manual-label/.project | 27 +++++ contrib/other-builds/manual-label/DeEn.cpp | 8 +- contrib/other-builds/manual-label/Makefile | 13 ++ 4 files changed, 168 insertions(+), 4 deletions(-) create mode 100644 contrib/other-builds/manual-label/.cproject create mode 100644 contrib/other-builds/manual-label/.project create mode 100644 contrib/other-builds/manual-label/Makefile diff --git a/contrib/other-builds/manual-label/.cproject b/contrib/other-builds/manual-label/.cproject new file mode 100644 index 000000000..2efd96e70 --- /dev/null +++ b/contrib/other-builds/manual-label/.cproject @@ -0,0 +1,124 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/other-builds/manual-label/.project b/contrib/other-builds/manual-label/.project new file mode 100644 index 000000000..d0c1dba19 --- /dev/null +++ b/contrib/other-builds/manual-label/.project @@ -0,0 +1,27 @@ + + + manual-label + + + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + full,incremental, + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.core.ccnature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + diff --git a/contrib/other-builds/manual-label/DeEn.cpp b/contrib/other-builds/manual-label/DeEn.cpp index e8cba272f..8d44e1f1d 100644 --- a/contrib/other-builds/manual-label/DeEn.cpp +++ b/contrib/other-builds/manual-label/DeEn.cpp @@ -45,15 +45,15 @@ void 
LabelDeEn(const Phrase &source, ostream &out) // find ranges to label for (int start = 0; start < source.size(); ++start) { for (int end = start; end < source.size(); ++end) { - if (IsA(source, start, -1, 1, "VAFIN") - && IsA(source, end, +1, 1, "VVINF VVPP") - && !Contains(source, start, end, 1, "VAFIN VVINF VVPP VVFIN")) { + if (IsA(source, start, -1, 1, "vafin") + && IsA(source, end, +1, 1, "vvinf vvpp") + && !Contains(source, start, end, 1, "vafin vvinf vvpp vvfin")) { Range range(start, end); ranges.push_back(range); } else if ((start == 0 || IsA(source, start, -1, 1, "$,")) && IsA(source, end, +1, 0, "zu") - && IsA(source, end, +2, 1, "VVINF") + && IsA(source, end, +2, 1, "vvinf") && !Contains(source, start, end, 1, "$,")) { Range range(start, end); ranges.push_back(range); diff --git a/contrib/other-builds/manual-label/Makefile b/contrib/other-builds/manual-label/Makefile new file mode 100644 index 000000000..60ce975cd --- /dev/null +++ b/contrib/other-builds/manual-label/Makefile @@ -0,0 +1,13 @@ +all: manual-label + +clean: + rm -f *.o manual-label + +.cpp.o: + g++ -I../../../ -O6 -g -c $< + +manual-label: DeEn.o manual-label.o + + g++ DeEn.o manual-label.o -lz -lboost_program_options-mt -o manual-label + + From 27152ccce48d7d81996d03fa422a0cc8ac2e450d Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 20 Jan 2014 23:26:06 +0000 Subject: [PATCH 22/48] add source labeller to EMS --- contrib/other-builds/manual-label/DeEn.cpp | 2 +- scripts/ems/experiment.meta | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/contrib/other-builds/manual-label/DeEn.cpp b/contrib/other-builds/manual-label/DeEn.cpp index 8d44e1f1d..1f96b6d6f 100644 --- a/contrib/other-builds/manual-label/DeEn.cpp +++ b/contrib/other-builds/manual-label/DeEn.cpp @@ -67,7 +67,7 @@ void LabelDeEn(const Phrase &source, ostream &out) for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) { const Range &range = *iter; if (range.first == pos) { 
- out << " "; + out << " "; } } diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index b06ba5290..afd09fc7a 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -55,8 +55,16 @@ truecase template-if: input-truecaser IN.$input-extension OUT.$input-extension -model IN1.$input-extension template-if: output-truecaser IN.$output-extension OUT.$output-extension -model IN1.$output-extension parallelizable: yes + +source-label + in: truecased-stem + out: source-labelled + default-name: corpus/labelled + pass-unless: source-labeller + template-if: source-labeller IN.$input-extension OUT.$input-extension + lowercase - in: truecased-stem + in: source-labelled out: lowercased-stem default-name: corpus/lowercased pass-unless: input-lowercaser output-lowercaser From 47c0a72842e779fe8ded44a232902e69f73d8253 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 21 Jan 2014 16:09:01 +0000 Subject: [PATCH 23/48] add source labeller to EMS --- contrib/other-builds/manual-label/DeEn.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/other-builds/manual-label/DeEn.cpp b/contrib/other-builds/manual-label/DeEn.cpp index 1f96b6d6f..7ef9d495d 100644 --- a/contrib/other-builds/manual-label/DeEn.cpp +++ b/contrib/other-builds/manual-label/DeEn.cpp @@ -45,15 +45,15 @@ void LabelDeEn(const Phrase &source, ostream &out) // find ranges to label for (int start = 0; start < source.size(); ++start) { for (int end = start; end < source.size(); ++end) { - if (IsA(source, start, -1, 1, "vafin") - && IsA(source, end, +1, 1, "vvinf vvpp") - && !Contains(source, start, end, 1, "vafin vvinf vvpp vvfin")) { + if (IsA(source, start, -1, 1, "VAFIN") + && IsA(source, end, +1, 1, "VVINF VVPP") + && !Contains(source, start, end, 1, "VAFIN VVINF VVPP VVFIN")) { Range range(start, end); ranges.push_back(range); } else if ((start == 0 || IsA(source, start, -1, 1, "$,")) && IsA(source, end, +1, 0, "zu") - && IsA(source, end, +2, 1, 
"vvinf") + && IsA(source, end, +2, 1, "VVINF") && !Contains(source, start, end, 1, "$,")) { Range range(start, end); ranges.push_back(range); From 05de672bd85b9df3c1b5ceea7db782a628bd3983 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 21 Jan 2014 19:21:24 +0000 Subject: [PATCH 24/48] need to 'label' target side too --- scripts/ems/experiment.meta | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index afd09fc7a..660804c8e 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -62,6 +62,8 @@ source-label default-name: corpus/labelled pass-unless: source-labeller template-if: source-labeller IN.$input-extension OUT.$input-extension + template-if: cat IN.$output-extension OUT.$output-extension + parallelizable: yes lowercase in: source-labelled From 6a10f8ce715e69506aaa8581e3995f8b383c2841 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 23 Jan 2014 17:09:56 +0000 Subject: [PATCH 25/48] corrected phrase-table name / type mixup when creating filtering script --- scripts/ems/experiment.perl | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index ca3152364..212260226 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl -w # $Id: experiment.perl 1095 2009-11-16 18:19:49Z philipp $ @@ -2121,10 +2121,12 @@ sub get_config_tables { } # additional settings for factored models - my $ptCmd = $phrase_translation_table; - $ptCmd .= ":$ptImpl" if $ptImpl>0; - $ptCmd .= ":$numFF" if defined($numFF); - $cmd .= &get_table_name_settings("translation-factors","phrase-translation-table", $ptCmd); + $cmd .= &get_table_name_settings("translation-factors","phrase-translation-table", $phrase_translation_table); + $cmd = trim($cmd); + $cmd .= ":$ptImpl" if $ptImpl>0; + $cmd .= ":$numFF" if defined($numFF); + $cmd .= " "; + $cmd .= 
&get_table_name_settings("reordering-factors","reordering-table",$reordering_table) if $reordering_table; $cmd .= &get_table_name_settings("generation-factors","generation-table",$generation_table) if $generation_table; $cmd .= "-config $config "; From 878e7ab899e4c015b7567cd9d0b12ae64e9ab1fa Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 24 Jan 2014 16:21:47 +0000 Subject: [PATCH 26/48] source labelling for tuning set. More debugging message in filtering script --- scripts/ems/experiment.meta | 25 ++++++++++++++++---- scripts/training/filter-model-given-input.pl | 11 +++++++-- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 660804c8e..f5ceb454a 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -729,15 +729,32 @@ factorize-input-devtest ignore-unless: use-mira error: can't open error: incompatible number of words in factor + +source-label-input + in: factorized-input + out: source-labelled-input + default-name: tuning/input.labelled + pass-unless: source-labeller + template-if: source-labeller IN OUT + parallelizable: yes + +source-label-input-devtest + in: factorized-input-devtest + out: source-labelled-input-devtest + default-name: tuning/input.devtest.labelled + pass-unless: source-labeller + template-if: source-labeller IN OUT + parallelizable: yes + lowercase-input - in: factorized-input + in: source-labelled-input out: truecased-input default-name: tuning/input.lc pass-unless: input-lowercaser ignore-if: input-truecaser template: $input-lowercaser < IN > OUT lowercase-input-devtest - in: factorized-input-devtest + in: source-labelled-input-devtest out: truecased-input-devtest default-name: tuning/input.devtest.lc pass-unless: input-lowercaser @@ -745,14 +762,14 @@ lowercase-input-devtest ignore-if: input-truecaser template: $input-lowercaser < IN > OUT truecase-input - in: factorized-input TRUECASER:truecase-model + in: source-labelled-input 
TRUECASER:truecase-model out: truecased-input rerun-on-change: input-truecaser default-name: tuning/input.tc ignore-unless: input-truecaser template: $input-truecaser -model IN1.$input-extension < IN > OUT truecase-input-devtest - in: factorized-input-devtest TRUECASER:truecase-model + in: source-labelled-input-devtest TRUECASER:truecase-model out: truecased-input-devtest rerun-on-change: input-truecaser default-name: tuning/input.devtest.tc diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index 27baaa66d..dd0f2d56a 100755 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -295,6 +295,7 @@ if ($opt_filter) { } # filter files +print STDERR "Filtering files...\n"; for(my $i=0;$i<=$#TABLE;$i++) { my ($used,$total) = (0,0); my $file = $TABLE[$i]; @@ -308,7 +309,9 @@ for(my $i=0;$i<=$#TABLE;$i++) { $file .= ".gz"; } $mid_file .= ".gz" if $file =~ /\.gz$/; - safesystem("ln -s $file $mid_file"); + $cmd = "ln -s $file $mid_file"; + print STDERR "Executing: $cmd\n"; + safesystem($cmd); } else { $mid_file .= ".gz" @@ -331,7 +334,11 @@ for(my $i=0;$i<=$#TABLE;$i++) { my $tmp_input = $TMP_INPUT_FILENAME{$factors}; my $options = ""; $options .= "--min-non-initial-rule-count=$opt_min_non_initial_rule_count" if defined($opt_min_non_initial_rule_count); - open(PIPE,"$openstring $SCRIPTS_ROOTDIR/training/filter-rule-table.py $options $tmp_input |"); + + $cmd = "$openstring $SCRIPTS_ROOTDIR/training/filter-rule-table.py $options $tmp_input |"; + print STDERR "Executing: $cmd\n"; + + open(PIPE,$cmd); while (my $line = ) { print FILE_OUT $line } From dc3d5b8d3851b97cdd563206dfdd6e83cc80a834 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 24 Jan 2014 16:33:30 +0000 Subject: [PATCH 27/48] source labelling for test set. 
--- scripts/ems/experiment.meta | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index f5ceb454a..7a72b1f95 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -961,15 +961,24 @@ factorize-input pass-unless: TRAINING:input-factors error: can't open error: incompatible number of words in factor + +source-label-input + in: factorized-input + out: source-labelled-input + default-name: evaluation/input.labelled + pass-unless: source-labeller + template-if: source-labeller IN OUT + parallelizable: yes + lowercase-input - in: factorized-input + in: source-labelled-input out: truecased-input default-name: evaluation/input.lc pass-unless: input-lowercaser ignore-if: input-truecaser template: $input-lowercaser < IN > OUT truecase-input - in: factorized-input TRUECASER:truecase-model + in: source-labelled-input TRUECASER:truecase-model out: truecased-input default-name: evaluation/input.tc rerun-on-change: input-truecaser From b6d47733da3728631e7f242f4bc2031b8183695f Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 24 Jan 2014 18:09:55 +0000 Subject: [PATCH 28/48] remove duplicate spaces caused when XML are stripped --- scripts/generic/strip-xml.perl | 11 +++++++++++ scripts/training/filter-model-given-input.pl | 6 ++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/scripts/generic/strip-xml.perl b/scripts/generic/strip-xml.perl index 71ad8d4c6..9fc43d4d9 100755 --- a/scripts/generic/strip-xml.perl +++ b/scripts/generic/strip-xml.perl @@ -8,6 +8,7 @@ while (my $line = ) { my $len = length($line); my $inXML = 0; + my $prevSpace = 1; for (my $i = 0; $i < $len; ++$i) { my $c = substr($line, $i, 1); @@ -17,10 +18,20 @@ while (my $line = ) { elsif ($c eq ">") { --$inXML; } + elsif ($prevSpace == 1 && $c eq " ") + { # duplicate space. 
Do nothing + } elsif ($inXML == 0) { + if ($c eq " ") { + $prevSpace = 1; + } + else { + $prevSpace = 0; + } print $c; } } + print "\n"; } diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index dd0f2d56a..fdb1ad53f 100755 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/perl -w # $Id$ # Given a moses.ini file and an input text prepare minimized translation @@ -255,7 +255,9 @@ if ($opt_hierarchical) { open(FILEHANDLE,">$filename") or die "Can't open $filename for writing"; $TMP_INPUT_FILENAME{$key} = $filename; my @FACTOR = split(/,/, $key); - open(PIPE,"$SCRIPTS_ROOTDIR/training/reduce_combine.pl $input @FACTOR |"); + my $cmd = "$SCRIPTS_ROOTDIR/training/reduce_combine.pl $input @FACTOR |"; + print STDERR "Executing: $cmd\n"; + open(PIPE,$cmd); while (my $line = ) { print FILEHANDLE $line } From 9745924937e5bc49dff74014b764400cd9311a24 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sat, 25 Jan 2014 10:39:47 +0000 Subject: [PATCH 29/48] output list of available feature functions, separated by space, rather than lines --- moses/FF/Factory.cpp | 4 ++-- moses/Parameter.cpp | 1 + moses/SearchCubePruning.cpp | 5 +++++ moses/SearchNormal.cpp | 5 +++++ moses/StaticData.cpp | 2 ++ moses/StaticData.h | 3 +++ moses/WordsBitmap.cpp | 12 ++++++++++++ moses/WordsBitmap.h | 2 ++ 8 files changed, 32 insertions(+), 2 deletions(-) diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index b201b30c0..2f7fdc84f 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -242,9 +242,9 @@ void FeatureRegistry::PrintFF() const Map::const_iterator iter; for (iter = registry_.begin(); iter != registry_.end(); ++iter) { const string &ffName = iter->first; - std::cerr << ffName << std::endl; + std::cerr << ffName << " "; } - + std::cerr << std::endl; } } // namespace Moses diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp 
index 860eb8a78..1b4683dc0 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -202,6 +202,7 @@ Parameter::Parameter() AddParam("placeholder-factor", "Which source factor to use to store the original text for placeholders. The factor must not be used by a translation or gen model"); AddParam("no-cache", "Disable all phrase-table caching. Default = false (ie. enable caching)"); + AddParam("adjacent-only", "Only allow hypotheses which are adjacent to current derivation. ITG without block moves"); } diff --git a/moses/SearchCubePruning.cpp b/moses/SearchCubePruning.cpp index d97cb108c..49ca22645 100644 --- a/moses/SearchCubePruning.cpp +++ b/moses/SearchCubePruning.cpp @@ -250,6 +250,11 @@ bool SearchCubePruning::CheckDistortion(const WordsBitmap &hypoBitmap, const Wor return true; } + if (StaticData::Instance().AdjacentOnly() && + !hypoBitmap.IsAdjacent(range.GetStartPos(), range.GetEndPos())) { + return false; + } + bool leftMostEdge = (hypoFirstGapPos == startPos); // any length extension is okay if starting at left-most edge if (leftMostEdge) { diff --git a/moses/SearchNormal.cpp b/moses/SearchNormal.cpp index 596ede562..8ac0eca13 100644 --- a/moses/SearchNormal.cpp +++ b/moses/SearchNormal.cpp @@ -253,6 +253,11 @@ void SearchNormal::ExpandAllHypotheses(const Hypothesis &hypothesis, size_t star expectedScore += m_transOptColl.GetFutureScore().CalcFutureScore( hypothesis.GetWordsBitmap(), startPos, endPos ); } + if (StaticData::Instance().AdjacentOnly() && + !hypothesis.GetWordsBitmap().IsAdjacent(startPos, endPos)) { + return; + } + // loop through all translation options const TranslationOptionList &transOptList = m_transOptColl.GetTranslationOptionList(WordsRange(startPos, endPos)); TranslationOptionList::const_iterator iter; diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 7fea2d960..782144360 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -385,6 +385,8 @@ bool StaticData::LoadData(Parameter *parameter) 
SetBooleanParameter( &m_lmEnableOOVFeature, "lmodel-oov-feature", false); + SetBooleanParameter( &m_adjacentOnly, "adjacent-only", false); + // minimum Bayes risk decoding SetBooleanParameter( &m_mbr, "minimum-bayes-risk", false ); m_mbrSize = (m_parameter->GetParam("mbr-size").size() > 0) ? diff --git a/moses/StaticData.h b/moses/StaticData.h index 7fc33c601..def81afae 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -197,6 +197,7 @@ protected: FactorType m_placeHolderFactor; bool m_useLegacyPT; + bool m_adjacentOnly; FeatureRegistry m_registry; @@ -753,6 +754,8 @@ public: return &m_soft_matches_map_reverse; } + bool AdjacentOnly() const + { return m_adjacentOnly; } }; } diff --git a/moses/WordsBitmap.cpp b/moses/WordsBitmap.cpp index 801c90654..caddb9e98 100644 --- a/moses/WordsBitmap.cpp +++ b/moses/WordsBitmap.cpp @@ -63,6 +63,18 @@ int WordsBitmap::GetFutureCosts(int lastPos) const return sum; } +bool WordsBitmap::IsAdjacent(size_t startPos, size_t endPos) const +{ + size_t first = GetFirstGapPos(); + size_t last = GetLastGapPos(); + + if (startPos == last || endPos == first) { + return true; + } + + return false; +} + } diff --git a/moses/WordsBitmap.h b/moses/WordsBitmap.h index 6f0de7bcf..2deb7b661 100644 --- a/moses/WordsBitmap.h +++ b/moses/WordsBitmap.h @@ -132,6 +132,8 @@ public: return NOT_FOUND; } + bool IsAdjacent(size_t startPos, size_t endPos) const; + //! 
whether a word has been translated at a particular position bool GetValue(size_t pos) const { return m_bitmap[pos]; From b68a906fdd97c344ce7d6fa90a2db5a79c5853e6 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sat, 25 Jan 2014 10:58:00 +0000 Subject: [PATCH 30/48] output list of available feature functions, separated by space, rather than lines --- moses/WordsBitmap.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/moses/WordsBitmap.cpp b/moses/WordsBitmap.cpp index caddb9e98..0866846ed 100644 --- a/moses/WordsBitmap.cpp +++ b/moses/WordsBitmap.cpp @@ -65,6 +65,10 @@ int WordsBitmap::GetFutureCosts(int lastPos) const bool WordsBitmap::IsAdjacent(size_t startPos, size_t endPos) const { + if (GetNumWordsCovered() == 0) { + return true; + } + size_t first = GetFirstGapPos(); size_t last = GetLastGapPos(); From 14e02978fcbbe9ec6ed3faa232fcb3f30664f40e Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 27 Jan 2014 16:51:35 -0800 Subject: [PATCH 31/48] KenLM 5cc905bc2d214efa7de2db56a9a672b749a95591 Avoid unspecified behavior of mmap when a file is resized reported by Christian Hardmeier Fixes for Mavericks and a workaround for Boost's broken semaphore Clean clang compile (of kenlm) Merged some of 744376b3fbebc41c4a270bf549826d5eb9219ae0 but also undid some of it because it was just masking a fundaemntal problem with pread rather than working around windows limitations --- lm/bhiksha.cc | 15 +-- lm/bhiksha.hh | 9 +- lm/binary_format.cc | 250 ++++++++++++++++++++--------------- lm/binary_format.hh | 112 ++++++++-------- lm/builder/corpus_count.cc | 29 ++-- lm/builder/interpolate.cc | 8 +- lm/config.cc | 4 - lm/facade.hh | 6 +- lm/filter/arpa_io.hh | 4 - lm/filter/count_io.hh | 23 +--- lm/filter/filter_main.cc | 177 ++++++++++++------------- lm/filter/format.hh | 2 +- lm/filter/vocab.cc | 6 +- lm/model.cc | 84 +++++++----- lm/model.hh | 12 +- lm/model_test.cc | 8 +- lm/quantize.cc | 12 +- lm/quantize.hh | 5 +- lm/search_hashed.cc | 33 +++-- 
lm/search_hashed.hh | 12 +- lm/search_trie.cc | 17 +-- lm/search_trie.hh | 18 ++- lm/trie.hh | 9 -- lm/trie_sort.cc | 4 + lm/virtual_interface.hh | 6 +- lm/vocab.cc | 28 ++-- lm/vocab.hh | 12 +- util/Jamfile | 11 +- util/exception.cc | 8 ++ util/file.cc | 8 +- util/joint_sort.hh | 26 ++-- util/murmur_hash.cc | 7 + util/murmur_hash.hh | 4 + util/pcqueue.hh | 58 +++++++- util/pcqueue_test.cc | 20 +++ util/probing_hash_table.hh | 7 +- util/proxy_iterator.hh | 12 +- util/read_compressed_test.cc | 1 + util/sized_iterator.hh | 14 +- util/sized_iterator_test.cc | 16 +++ util/usage.cc | 8 ++ 41 files changed, 618 insertions(+), 487 deletions(-) create mode 100644 util/pcqueue_test.cc create mode 100644 util/sized_iterator_test.cc diff --git a/lm/bhiksha.cc b/lm/bhiksha.cc index 088ea98d4..c8a18dfda 100644 --- a/lm/bhiksha.cc +++ b/lm/bhiksha.cc @@ -1,4 +1,6 @@ #include "lm/bhiksha.hh" + +#include "lm/binary_format.hh" #include "lm/config.hh" #include "util/file.hh" #include "util/exception.hh" @@ -15,11 +17,11 @@ DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_ const uint8_t kArrayBhikshaVersion = 0; // TODO: put this in binary file header instead when I change the binary file format again. 
-void ArrayBhiksha::UpdateConfigFromBinary(int fd, Config &config) { - uint8_t version; - uint8_t configured_bits; - util::ReadOrThrow(fd, &version, 1); - util::ReadOrThrow(fd, &configured_bits, 1); +void ArrayBhiksha::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) { + uint8_t buffer[2]; + file.ReadForConfig(buffer, 2, offset); + uint8_t version = buffer[0]; + uint8_t configured_bits = buffer[1]; if (version != kArrayBhikshaVersion) UTIL_THROW(FormatLoadException, "This file has sorted array compression version " << (unsigned) version << " but the code expects version " << (unsigned)kArrayBhikshaVersion); config.pointer_bhiksha_bits = configured_bits; } @@ -87,9 +89,6 @@ void ArrayBhiksha::FinishedLoading(const Config &config) { *(head_write++) = config.pointer_bhiksha_bits; } -void ArrayBhiksha::LoadedBinary() { -} - } // namespace trie } // namespace ngram } // namespace lm diff --git a/lm/bhiksha.hh b/lm/bhiksha.hh index 8ff88654d..350571a6e 100644 --- a/lm/bhiksha.hh +++ b/lm/bhiksha.hh @@ -24,6 +24,7 @@ namespace lm { namespace ngram { struct Config; +class BinaryFormat; namespace trie { @@ -31,7 +32,7 @@ class DontBhiksha { public: static const ModelType kModelTypeAdd = static_cast(0); - static void UpdateConfigFromBinary(int /*fd*/, Config &/*config*/) {} + static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &/*config*/) {} static uint64_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; } @@ -53,8 +54,6 @@ class DontBhiksha { void FinishedLoading(const Config &/*config*/) {} - void LoadedBinary() {} - uint8_t InlineBits() const { return next_.bits; } private: @@ -65,7 +64,7 @@ class ArrayBhiksha { public: static const ModelType kModelTypeAdd = kArrayAdd; - static void UpdateConfigFromBinary(int fd, Config &config); + static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config); static uint64_t Size(uint64_t max_offset, uint64_t 
max_next, const Config &config); @@ -93,8 +92,6 @@ class ArrayBhiksha { void FinishedLoading(const Config &config); - void LoadedBinary(); - uint8_t InlineBits() const { return next_inline_.bits; } private: diff --git a/lm/binary_format.cc b/lm/binary_format.cc index bef51eb82..9c744b138 100644 --- a/lm/binary_format.cc +++ b/lm/binary_format.cc @@ -14,6 +14,9 @@ namespace lm { namespace ngram { + +const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"}; + namespace { const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version"; const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0"; @@ -58,8 +61,6 @@ struct Sanity { } }; -const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"}; - std::size_t TotalHeaderSize(unsigned char order) { return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order); } @@ -81,83 +82,6 @@ void WriteHeader(void *to, const Parameters ¶ms) { } // namespace -uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing) { - if (config.write_mmap) { - std::size_t total = TotalHeaderSize(order) + memory_size; - backing.file.reset(util::CreateOrThrow(config.write_mmap)); - if (config.write_method == Config::WRITE_MMAP) { - backing.vocab.reset(util::MapZeroedWrite(backing.file.get(), total), total, util::scoped_memory::MMAP_ALLOCATED); - } else { - util::ResizeOrThrow(backing.file.get(), 0); - util::MapAnonymous(total, backing.vocab); - } - strncpy(reinterpret_cast(backing.vocab.get()), kMagicIncomplete, TotalHeaderSize(order)); - return reinterpret_cast(backing.vocab.get()) + TotalHeaderSize(order); - } 
else { - util::MapAnonymous(memory_size, backing.vocab); - return reinterpret_cast(backing.vocab.get()); - } -} - -uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing) { - std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad; - if (config.write_mmap) { - // Grow the file to accomodate the search, using zeros. - try { - util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size); - } catch (util::ErrnoException &e) { - e << " for file " << config.write_mmap; - throw e; - } - - if (config.write_method == Config::WRITE_AFTER) { - util::MapAnonymous(memory_size, backing.search); - return reinterpret_cast(backing.search.get()); - } - // mmap it now. - // We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down. - std::size_t page_size = util::SizePage(); - std::size_t alignment_cruft = adjusted_vocab % page_size; - backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED); - return reinterpret_cast(backing.search.get()) + alignment_cruft; - } else { - util::MapAnonymous(memory_size, backing.search); - return reinterpret_cast(backing.search.get()); - } -} - -void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector &counts, std::size_t vocab_pad, Backing &backing) { - if (!config.write_mmap) return; - switch (config.write_method) { - case Config::WRITE_MMAP: - util::SyncOrThrow(backing.vocab.get(), backing.vocab.size()); - util::SyncOrThrow(backing.search.get(), backing.search.size()); - break; - case Config::WRITE_AFTER: - util::SeekOrThrow(backing.file.get(), 0); - util::WriteOrThrow(backing.file.get(), backing.vocab.get(), backing.vocab.size()); - util::SeekOrThrow(backing.file.get(), 
backing.vocab.size() + vocab_pad); - util::WriteOrThrow(backing.file.get(), backing.search.get(), backing.search.size()); - util::FSyncOrThrow(backing.file.get()); - break; - } - // header and vocab share the same mmap. The header is written here because we know the counts. - Parameters params = Parameters(); - params.counts = counts; - params.fixed.order = counts.size(); - params.fixed.probing_multiplier = config.probing_multiplier; - params.fixed.model_type = model_type; - params.fixed.has_vocabulary = config.include_vocab; - params.fixed.search_version = search_version; - WriteHeader(backing.vocab.get(), params); - if (config.write_method == Config::WRITE_AFTER) { - util::SeekOrThrow(backing.file.get(), 0); - util::WriteOrThrow(backing.file.get(), backing.vocab.get(), TotalHeaderSize(counts.size())); - } -} - -namespace detail { - bool IsBinaryFormat(int fd) { const uint64_t size = util::SizeFile(fd); if (size == util::kBadSize || (size <= static_cast(sizeof(Sanity)))) return false; @@ -209,44 +133,164 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet UTIL_THROW_IF(search_version != params.fixed.search_version, FormatLoadException, "The binary file has " << kModelNames[params.fixed.model_type] << " version " << params.fixed.search_version << " but this code expects " << kModelNames[params.fixed.model_type] << " version " << search_version); } -void SeekPastHeader(int fd, const Parameters ¶ms) { - util::SeekOrThrow(fd, TotalHeaderSize(params.counts.size())); +const std::size_t kInvalidSize = static_cast(-1); + +BinaryFormat::BinaryFormat(const Config &config) + : write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method), + header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_string_offset_(kInvalidOffset) {} + +void BinaryFormat::InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters ¶ms) { + file_.reset(fd); + write_mmap_ = NULL; // Ignore write 
requests; this is already in binary format. + ReadHeader(fd, params); + MatchCheck(model_type, search_version, params); + header_size_ = TotalHeaderSize(params.counts.size()); } -uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, uint64_t memory_size, Backing &backing) { - const uint64_t file_size = util::SizeFile(backing.file.get()); +void BinaryFormat::ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const { + assert(header_size_ != kInvalidSize); + util::PReadOrThrow(file_.get(), to, amount, offset_excluding_header + header_size_); +} + +void *BinaryFormat::LoadBinary(std::size_t size) { + assert(header_size_ != kInvalidSize); + const uint64_t file_size = util::SizeFile(file_.get()); // The header is smaller than a page, so we have to map the whole header as well. - std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size); - if (file_size != util::kBadSize && static_cast(file_size) < total_map) - UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map); + uint64_t total_map = static_cast(header_size_) + static_cast(size); + UTIL_THROW_IF(file_size != util::kBadSize && file_size < total_map, FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map); - util::MapRead(config.load_method, backing.file.get(), 0, total_map, backing.search); + util::MapRead(load_method_, file_.get(), 0, util::CheckOverflow(total_map), mapping_); - if (config.enumerate_vocab && !params.fixed.has_vocabulary) - UTIL_THROW(FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. 
You may need to rebuild the binary file with an updated version of build_binary."); - - // Seek to vocabulary words - util::SeekOrThrow(backing.file.get(), total_map); - return reinterpret_cast(backing.search.get()) + TotalHeaderSize(params.counts.size()); + vocab_string_offset_ = total_map; + return reinterpret_cast(mapping_.get()) + header_size_; } -void ComplainAboutARPA(const Config &config, ModelType model_type) { - if (config.write_mmap || !config.messages) return; - if (config.arpa_complain == Config::ALL) { - *config.messages << "Loading the LM will be faster if you build a binary file." << std::endl; - } else if (config.arpa_complain == Config::EXPENSIVE && - (model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) { - *config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl; +void *BinaryFormat::SetupJustVocab(std::size_t memory_size, uint8_t order) { + vocab_size_ = memory_size; + if (!write_mmap_) { + header_size_ = 0; + util::MapAnonymous(memory_size, memory_vocab_); + return reinterpret_cast(memory_vocab_.get()); + } + header_size_ = TotalHeaderSize(order); + std::size_t total = util::CheckOverflow(static_cast(header_size_) + static_cast(memory_size)); + file_.reset(util::CreateOrThrow(write_mmap_)); + // some gccs complain about uninitialized variables even though all enum values are covered. 
+ void *vocab_base = NULL; + switch (write_method_) { + case Config::WRITE_MMAP: + mapping_.reset(util::MapZeroedWrite(file_.get(), total), total, util::scoped_memory::MMAP_ALLOCATED); + vocab_base = mapping_.get(); + break; + case Config::WRITE_AFTER: + util::ResizeOrThrow(file_.get(), 0); + util::MapAnonymous(total, memory_vocab_); + vocab_base = memory_vocab_.get(); + break; + } + strncpy(reinterpret_cast(vocab_base), kMagicIncomplete, header_size_); + return reinterpret_cast(vocab_base) + header_size_; +} + +void *BinaryFormat::GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base) { + assert(vocab_size_ != kInvalidSize); + vocab_pad_ = vocab_pad; + std::size_t new_size = header_size_ + vocab_size_ + vocab_pad_ + memory_size; + vocab_string_offset_ = new_size; + if (!write_mmap_ || write_method_ == Config::WRITE_AFTER) { + util::MapAnonymous(memory_size, memory_search_); + assert(header_size_ == 0 || write_mmap_); + vocab_base = reinterpret_cast(memory_vocab_.get()) + header_size_; + return reinterpret_cast(memory_search_.get()); + } + + assert(write_method_ == Config::WRITE_MMAP); + // Also known as total size without vocab words. + // Grow the file to accomodate the search, using zeros. + // According to man mmap, behavior is undefined when the file is resized + // underneath a mmap that is not a multiple of the page size. So to be + // safe, we'll unmap it and map it again. + mapping_.reset(); + util::ResizeOrThrow(file_.get(), new_size); + void *ret; + MapFile(vocab_base, ret); + return ret; +} + +void BinaryFormat::WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base) { + // Checking Config's include_vocab is the responsibility of the caller. + assert(header_size_ != kInvalidSize && vocab_size_ != kInvalidSize); + if (!write_mmap_) { + // Unchanged base. 
+ vocab_base = reinterpret_cast(memory_vocab_.get()); + search_base = reinterpret_cast(memory_search_.get()); + return; + } + if (write_method_ == Config::WRITE_MMAP) { + mapping_.reset(); + } + util::SeekOrThrow(file_.get(), VocabStringReadingOffset()); + util::WriteOrThrow(file_.get(), &buffer[0], buffer.size()); + if (write_method_ == Config::WRITE_MMAP) { + MapFile(vocab_base, search_base); + } else { + vocab_base = reinterpret_cast(memory_vocab_.get()) + header_size_; + search_base = reinterpret_cast(memory_search_.get()); } } -} // namespace detail +void BinaryFormat::FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector &counts) { + if (!write_mmap_) return; + switch (write_method_) { + case Config::WRITE_MMAP: + util::SyncOrThrow(mapping_.get(), mapping_.size()); + break; + case Config::WRITE_AFTER: + util::SeekOrThrow(file_.get(), 0); + util::WriteOrThrow(file_.get(), memory_vocab_.get(), memory_vocab_.size()); + util::SeekOrThrow(file_.get(), header_size_ + vocab_size_ + vocab_pad_); + util::WriteOrThrow(file_.get(), memory_search_.get(), memory_search_.size()); + util::FSyncOrThrow(file_.get()); + break; + } + // header and vocab share the same mmap. 
+ Parameters params = Parameters(); + memset(¶ms, 0, sizeof(Parameters)); + params.counts = counts; + params.fixed.order = counts.size(); + params.fixed.probing_multiplier = config.probing_multiplier; + params.fixed.model_type = model_type; + params.fixed.has_vocabulary = config.include_vocab; + params.fixed.search_version = search_version; + switch (write_method_) { + case Config::WRITE_MMAP: + WriteHeader(mapping_.get(), params); + util::SyncOrThrow(mapping_.get(), mapping_.size()); + break; + case Config::WRITE_AFTER: + { + std::vector buffer(TotalHeaderSize(counts.size())); + WriteHeader(&buffer[0], params); + util::SeekOrThrow(file_.get(), 0); + util::WriteOrThrow(file_.get(), &buffer[0], buffer.size()); + } + break; + } +} + +void BinaryFormat::MapFile(void *&vocab_base, void *&search_base) { + mapping_.reset(util::MapOrThrow(vocab_string_offset_, true, util::kFileFlags, false, file_.get()), vocab_string_offset_, util::scoped_memory::MMAP_ALLOCATED); + vocab_base = reinterpret_cast(mapping_.get()) + header_size_; + search_base = reinterpret_cast(mapping_.get()) + header_size_ + vocab_size_ + vocab_pad_; +} bool RecognizeBinary(const char *file, ModelType &recognized) { util::scoped_fd fd(util::OpenReadOrThrow(file)); - if (!detail::IsBinaryFormat(fd.get())) return false; + if (!IsBinaryFormat(fd.get())) { + return false; + } Parameters params; - detail::ReadHeader(fd.get(), params); + ReadHeader(fd.get(), params); recognized = params.fixed.model_type; return true; } diff --git a/lm/binary_format.hh b/lm/binary_format.hh index bf699d5f4..f33f88d75 100644 --- a/lm/binary_format.hh +++ b/lm/binary_format.hh @@ -17,6 +17,8 @@ namespace lm { namespace ngram { +extern const char *kModelNames[6]; + /*Inspect a file to determine if it is a binary lm. If not, return false. * If so, return true and set recognized to the type. This is the only API in * this header designed for use by decoder authors. 
@@ -42,67 +44,63 @@ struct Parameters { std::vector counts; }; -struct Backing { - // File behind memory, if any. - util::scoped_fd file; - // Vocabulary lookup table. Not to be confused with the vocab words themselves. - util::scoped_memory vocab; - // Raw block of memory backing the language model data structures - util::scoped_memory search; +class BinaryFormat { + public: + explicit BinaryFormat(const Config &config); + + // Reading a binary file: + // Takes ownership of fd + void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters ¶ms); + // Used to read parts of the file to update the config object before figuring out full size. + void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const; + // Actually load the binary file and return a pointer to the beginning of the search area. + void *LoadBinary(std::size_t size); + + uint64_t VocabStringReadingOffset() const { + assert(vocab_string_offset_ != kInvalidOffset); + return vocab_string_offset_; + } + + // Writing a binary file or initializing in RAM from ARPA: + // Size for vocabulary. + void *SetupJustVocab(std::size_t memory_size, uint8_t order); + // Warning: can change the vocaulary base pointer. + void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base); + // Warning: can change vocabulary and search base addresses. + void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base); + // Write the header at the beginning of the file. + void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector &counts); + + private: + void MapFile(void *&vocab_base, void *&search_base); + + // Copied from configuration. + const Config::WriteMethod write_method_; + const char *write_mmap_; + util::LoadMethod load_method_; + + // File behind memory, if any. + util::scoped_fd file_; + + // If there is a file involved, a single mapping. 
+ util::scoped_memory mapping_; + + // If the data is only in memory, separately allocate each because the trie + // knows vocab's size before it knows search's size (because SRILM might + // have pruned). + util::scoped_memory memory_vocab_, memory_search_; + + // Memory ranges. Note that these may not be contiguous and may not all + // exist. + std::size_t header_size_, vocab_size_, vocab_pad_; + // aka end of search. + uint64_t vocab_string_offset_; + + static const uint64_t kInvalidOffset = (uint64_t)-1; }; -// Create just enough of a binary file to write vocabulary to it. -uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing); -// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin. -uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing); - -// Write header to binary file. This is done last to prevent incomplete files -// from loading. 
-void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector &counts, std::size_t vocab_pad, Backing &backing); - -namespace detail { - bool IsBinaryFormat(int fd); -void ReadHeader(int fd, Parameters ¶ms); - -void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters ¶ms); - -void SeekPastHeader(int fd, const Parameters ¶ms); - -uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, uint64_t memory_size, Backing &backing); - -void ComplainAboutARPA(const Config &config, ModelType model_type); - -} // namespace detail - -template void LoadLM(const char *file, const Config &config, To &to) { - Backing &backing = to.MutableBacking(); - backing.file.reset(util::OpenReadOrThrow(file)); - - try { - if (detail::IsBinaryFormat(backing.file.get())) { - Parameters params; - detail::ReadHeader(backing.file.get(), params); - detail::MatchCheck(To::kModelType, To::kVersion, params); - // Replace the run-time configured probing_multiplier with the one in the file. 
- Config new_config(config); - new_config.probing_multiplier = params.fixed.probing_multiplier; - detail::SeekPastHeader(backing.file.get(), params); - To::UpdateConfigFromBinary(backing.file.get(), params.counts, new_config); - uint64_t memory_size = To::Size(params.counts, new_config); - uint8_t *start = detail::SetupBinary(new_config, params, memory_size, backing); - to.InitializeFromBinary(start, params, new_config, backing.file.get()); - } else { - detail::ComplainAboutARPA(config, To::kModelType); - to.InitializeFromARPA(file, config); - } - } catch (util::Exception &e) { - e << " File: " << file; - throw; - } -} - } // namespace ngram } // namespace lm #endif // LM_BINARY_FORMAT__ diff --git a/lm/builder/corpus_count.cc b/lm/builder/corpus_count.cc index 6ad91dde7..ccc06efca 100644 --- a/lm/builder/corpus_count.cc +++ b/lm/builder/corpus_count.cc @@ -87,7 +87,7 @@ class VocabHandout { Table table_; std::size_t double_cutoff_; - + util::FakeOFStream word_list_; }; @@ -98,7 +98,7 @@ class DedupeHash : public std::unary_function { std::size_t operator()(const WordIndex *start) const { return util::MurmurHashNative(start, size_); } - + private: const std::size_t size_; }; @@ -106,11 +106,11 @@ class DedupeHash : public std::unary_function { class DedupeEquals : public std::binary_function { public: explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {} - + bool operator()(const WordIndex *first, const WordIndex *second) const { return !memcmp(first, second, size_); - } - + } + private: const std::size_t size_; }; @@ -131,7 +131,7 @@ typedef util::ProbingHashTable Dedupe; class Writer { public: - Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size) + Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size) : block_(position), gram_(block_->Get(), order), dedupe_invalid_(order, std::numeric_limits::max()), 
dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)), @@ -140,7 +140,7 @@ class Writer { dedupe_.Clear(); assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size); if (order == 1) { - // Add special words. AdjustCounts is responsible if order != 1. + // Add special words. AdjustCounts is responsible if order != 1. AddUnigramWord(kUNK); AddUnigramWord(kBOS); } @@ -170,16 +170,16 @@ class Writer { memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1)); return; } - // Complete the write. + // Complete the write. gram_.Count() = 1; - // Prepare the next n-gram. + // Prepare the next n-gram. if (reinterpret_cast(gram_.begin()) + gram_.TotalSize() != static_cast(block_->Get()) + block_size_) { NGram last(gram_); gram_.NextInMemory(); std::copy(last.begin() + 1, last.end(), gram_.begin()); return; } - // Block end. Need to store the context in a temporary buffer. + // Block end. Need to store the context in a temporary buffer. std::copy(gram_.begin() + 1, gram_.end(), buffer_.get()); dedupe_.Clear(); block_->SetValidSize(block_size_); @@ -207,7 +207,7 @@ class Writer { // Hash table combiner implementation. Dedupe dedupe_; - // Small buffer to hold existing ngrams when shifting across a block boundary. + // Small buffer to hold existing ngrams when shifting across a block boundary. 
boost::scoped_array buffer_; const std::size_t block_size_; @@ -223,7 +223,7 @@ std::size_t CorpusCount::VocabUsage(std::size_t vocab_estimate) { return VocabHandout::MemUsage(vocab_estimate); } -CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block) +CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block) : from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count), dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)), dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)) { @@ -240,7 +240,10 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) { uint64_t count = 0; bool delimiters[256]; memset(delimiters, 0, sizeof(delimiters)); - delimiters['\0'] = delimiters['\t'] = delimiters['\n'] = delimiters['\r'] = delimiters[' '] = true; + const char kDelimiterSet[] = "\0\t\n\r "; + for (const char *i = kDelimiterSet; i < kDelimiterSet + sizeof(kDelimiterSet); ++i) { + delimiters[static_cast(*i)] = true; + } try { while(true) { StringPiece line(from_.ReadLine()); diff --git a/lm/builder/interpolate.cc b/lm/builder/interpolate.cc index 52e69f02e..500268069 100644 --- a/lm/builder/interpolate.cc +++ b/lm/builder/interpolate.cc @@ -33,12 +33,12 @@ class Callback { pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1]; probs_[order_minus_1 + 1] = pay.complete.prob; pay.complete.prob = log10(pay.complete.prob); - // TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling. - if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS && backoffs_[order_minus_1].Get()) { // check valid pointer at tht end + // TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling. 
+ if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) { pay.complete.backoff = log10(*static_cast(backoffs_[order_minus_1].Get())); ++backoffs_[order_minus_1]; } else { - // Not a context. + // Not a context. pay.complete.backoff = 0.0; } } @@ -52,7 +52,7 @@ class Callback { }; } // namespace -Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs) +Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs) : uniform_prob_(1.0 / static_cast(unigram_count - 1)), backoffs_(backoffs) {} // perform order-wise interpolation diff --git a/lm/config.cc b/lm/config.cc index dc3365319..9520c41c8 100644 --- a/lm/config.cc +++ b/lm/config.cc @@ -11,11 +11,7 @@ Config::Config() : enumerate_vocab(NULL), unknown_missing(COMPLAIN), sentence_marker_missing(THROW_UP), -#if defined(_WIN32) || defined(_WIN64) - positive_log_probability(SILENT), -#else positive_log_probability(THROW_UP), -#endif unknown_missing_logprob(-100.0), probing_multiplier(1.5), building_memory(1073741824ULL), // 1 GB diff --git a/lm/facade.hh b/lm/facade.hh index 760e839e0..de1551f12 100644 --- a/lm/facade.hh +++ b/lm/facade.hh @@ -17,14 +17,14 @@ template class ModelFacade : publ typedef VocabularyT Vocabulary; /* Translate from void* to State */ - FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const { + FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const { return static_cast(this)->FullScore( *reinterpret_cast(in_state), new_word, *reinterpret_cast(out_state)); } - FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const { + FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const { return static_cast(this)->FullScoreForgotState( 
context_rbegin, context_rend, @@ -37,7 +37,7 @@ template class ModelFacade : publ return static_cast(this)->FullScore(in_state, new_word, out_state).prob; } - float Score(const void *in_state, const WordIndex new_word, void *out_state) const { + float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const { return static_cast(this)->Score( *reinterpret_cast(in_state), new_word, diff --git a/lm/filter/arpa_io.hh b/lm/filter/arpa_io.hh index 08e658666..602b5b31b 100644 --- a/lm/filter/arpa_io.hh +++ b/lm/filter/arpa_io.hh @@ -14,10 +14,6 @@ #include #include -#if !defined __MINGW32__ -#include -#endif - #include #include diff --git a/lm/filter/count_io.hh b/lm/filter/count_io.hh index 740b8d50e..d992026ff 100644 --- a/lm/filter/count_io.hh +++ b/lm/filter/count_io.hh @@ -5,27 +5,18 @@ #include #include -#if !defined __MINGW32__ -#include -#endif - +#include "util/fake_ofstream.hh" +#include "util/file.hh" #include "util/file_piece.hh" namespace lm { class CountOutput : boost::noncopyable { public: - explicit CountOutput(const char *name) : file_(name, std::ios::out) {} + explicit CountOutput(const char *name) : file_(util::CreateOrThrow(name)) {} void AddNGram(const StringPiece &line) { - if (!(file_ << line << '\n')) { -#if defined __MINGW32__ - std::cerr<<"Writing counts file failed"< void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { @@ -37,12 +28,12 @@ class CountOutput : boost::noncopyable { } private: - std::fstream file_; + util::FakeOFStream file_; }; class CountBatch { public: - explicit CountBatch(std::streamsize initial_read) + explicit CountBatch(std::streamsize initial_read) : initial_read_(initial_read) { buffer_.reserve(initial_read); } @@ -75,7 +66,7 @@ class CountBatch { private: std::streamsize initial_read_; - // This could have been a std::string but that's less happy with raw writes. + // This could have been a std::string but that's less happy with raw writes. 
std::vector buffer_; }; diff --git a/lm/filter/filter_main.cc b/lm/filter/filter_main.cc index f89ac4df3..82fdc1ef7 100644 --- a/lm/filter/filter_main.cc +++ b/lm/filter/filter_main.cc @@ -6,6 +6,7 @@ #endif #include "lm/filter/vocab.hh" #include "lm/filter/wrapper.hh" +#include "util/exception.hh" #include "util/file_piece.hh" #include @@ -57,7 +58,7 @@ typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} Fil typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format; struct Config { - Config() : + Config() : #ifndef NTHREAD batch_size(25000), threads(boost::thread::hardware_concurrency()), @@ -157,102 +158,96 @@ template void DispatchFilterModes(const Config &config, std::istr } // namespace lm int main(int argc, char *argv[]) { - if (argc < 4) { - lm::DisplayHelp(argv[0]); - return 1; - } - - // I used to have boost::program_options, but some users didn't want to compile boost. - lm::Config config; - config.mode = lm::MODE_UNSET; - for (int i = 1; i < argc - 2; ++i) { - const char *str = argv[i]; - if (!std::strcmp(str, "copy")) { - config.mode = lm::MODE_COPY; - } else if (!std::strcmp(str, "single")) { - config.mode = lm::MODE_SINGLE; - } else if (!std::strcmp(str, "multiple")) { - config.mode = lm::MODE_MULTIPLE; - } else if (!std::strcmp(str, "union")) { - config.mode = lm::MODE_UNION; - } else if (!std::strcmp(str, "phrase")) { - config.phrase = true; - } else if (!std::strcmp(str, "context")) { - config.context = true; - } else if (!std::strcmp(str, "arpa")) { - config.format = lm::FORMAT_ARPA; - } else if (!std::strcmp(str, "raw")) { - config.format = lm::FORMAT_COUNT; -#ifndef NTHREAD - } else if (!std::strncmp(str, "threads:", 8)) { - config.threads = boost::lexical_cast(str + 8); - if (!config.threads) { - std::cerr << "Specify at least one thread." 
<< std::endl; - return 1; - } - } else if (!std::strncmp(str, "batch_size:", 11)) { - config.batch_size = boost::lexical_cast(str + 11); - if (config.batch_size < 5000) { - std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl; - if (!config.batch_size) return 1; - } -#endif - } else { + try { + if (argc < 4) { lm::DisplayHelp(argv[0]); return 1; } - } - if (config.mode == lm::MODE_UNSET) { - lm::DisplayHelp(argv[0]); - return 1; - } - - if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) { - std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." << std::endl; - return 1; - } - - bool cmd_is_model = true; - const char *cmd_input = argv[argc - 2]; - if (!strncmp(cmd_input, "vocab:", 6)) { - cmd_is_model = false; - cmd_input += 6; - } else if (!strncmp(cmd_input, "model:", 6)) { - cmd_input += 6; - } else if (strchr(cmd_input, ':')) { -#if defined __MINGW32__ - std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl; - exit(1); -#else - errx(1, "Specify vocab: or model: before the input file name, not \"%s\"", cmd_input); -#endif // defined - } else { - std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl; - } - std::ifstream cmd_file; - std::istream *vocab; - if (cmd_is_model) { - vocab = &std::cin; - } else { - cmd_file.open(cmd_input, std::ios::in); - if (!cmd_file) { -#if defined __MINGW32__ - std::cerr << "Could not open input file " << cmd_input << std::endl; - exit(2); -#else - err(2, "Could not open input file %s", cmd_input); -#endif // defined + // I used to have boost::program_options, but some users didn't want to compile boost. 
+ lm::Config config; + config.mode = lm::MODE_UNSET; + for (int i = 1; i < argc - 2; ++i) { + const char *str = argv[i]; + if (!std::strcmp(str, "copy")) { + config.mode = lm::MODE_COPY; + } else if (!std::strcmp(str, "single")) { + config.mode = lm::MODE_SINGLE; + } else if (!std::strcmp(str, "multiple")) { + config.mode = lm::MODE_MULTIPLE; + } else if (!std::strcmp(str, "union")) { + config.mode = lm::MODE_UNION; + } else if (!std::strcmp(str, "phrase")) { + config.phrase = true; + } else if (!std::strcmp(str, "context")) { + config.context = true; + } else if (!std::strcmp(str, "arpa")) { + config.format = lm::FORMAT_ARPA; + } else if (!std::strcmp(str, "raw")) { + config.format = lm::FORMAT_COUNT; +#ifndef NTHREAD + } else if (!std::strncmp(str, "threads:", 8)) { + config.threads = boost::lexical_cast(str + 8); + if (!config.threads) { + std::cerr << "Specify at least one thread." << std::endl; + return 1; + } + } else if (!std::strncmp(str, "batch_size:", 11)) { + config.batch_size = boost::lexical_cast(str + 11); + if (config.batch_size < 5000) { + std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl; + if (!config.batch_size) return 1; + } +#endif + } else { + lm::DisplayHelp(argv[0]); + return 1; + } } - vocab = &cmd_file; - } - util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr); + if (config.mode == lm::MODE_UNSET) { + lm::DisplayHelp(argv[0]); + return 1; + } - if (config.format == lm::FORMAT_ARPA) { - lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); - } else if (config.format == lm::FORMAT_COUNT) { - lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); + if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) { + std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." 
<< std::endl; + return 1; + } + + bool cmd_is_model = true; + const char *cmd_input = argv[argc - 2]; + if (!strncmp(cmd_input, "vocab:", 6)) { + cmd_is_model = false; + cmd_input += 6; + } else if (!strncmp(cmd_input, "model:", 6)) { + cmd_input += 6; + } else if (strchr(cmd_input, ':')) { + std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl; + return 1; + } else { + std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl; + } + std::ifstream cmd_file; + std::istream *vocab; + if (cmd_is_model) { + vocab = &std::cin; + } else { + cmd_file.open(cmd_input, std::ios::in); + UTIL_THROW_IF(!cmd_file, util::ErrnoException, "Failed to open " << cmd_input); + vocab = &cmd_file; + } + + util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr); + + if (config.format == lm::FORMAT_ARPA) { + lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); + } else if (config.format == lm::FORMAT_COUNT) { + lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); + } + return 0; + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; } - return 0; } diff --git a/lm/filter/format.hh b/lm/filter/format.hh index 7f945b0d6..7d8c28dbc 100644 --- a/lm/filter/format.hh +++ b/lm/filter/format.hh @@ -1,5 +1,5 @@ #ifndef LM_FILTER_FORMAT_H__ -#define LM_FITLER_FORMAT_H__ +#define LM_FILTER_FORMAT_H__ #include "lm/filter/arpa_io.hh" #include "lm/filter/count_io.hh" diff --git a/lm/filter/vocab.cc b/lm/filter/vocab.cc index 7ed5d92fb..011ab5992 100644 --- a/lm/filter/vocab.cc +++ b/lm/filter/vocab.cc @@ -5,10 +5,6 @@ #include -#if !defined __MINGW32__ -#include -#endif - namespace lm { namespace vocab { @@ -34,7 +30,7 @@ bool IsLineEnd(std::istream &in) { }// namespace // Read space separated words in enter separated lines. These lines can be -// very long, so don't read an entire line at a time. 
+// very long, so don't read an entire line at a time. unsigned int ReadMultiple(std::istream &in, boost::unordered_map > &out) { in.exceptions(std::istream::badbit); unsigned int sentence = 0; diff --git a/lm/model.cc b/lm/model.cc index a26654a6f..a5a16bf8e 100644 --- a/lm/model.cc +++ b/lm/model.cc @@ -34,8 +34,47 @@ template void GenericModel(start - static_cast(base)) != goal_size) UTIL_THROW(FormatLoadException, "The data structures took " << (start - static_cast(base)) << " but Size says they should take " << goal_size); } -template GenericModel::GenericModel(const char *file, const Config &config) { - LoadLM(file, config, *this); +namespace { +void ComplainAboutARPA(const Config &config, ModelType model_type) { + if (config.write_mmap || !config.messages) return; + if (config.arpa_complain == Config::ALL) { + *config.messages << "Loading the LM will be faster if you build a binary file." << std::endl; + } else if (config.arpa_complain == Config::EXPENSIVE && + (model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) { + *config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl; + } +} + +void CheckCounts(const std::vector &counts) { + UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". 
" << KENLM_ORDER_MESSAGE); + if (sizeof(uint64_t) > sizeof(std::size_t)) { + for (std::vector::const_iterator i = counts.begin(); i != counts.end(); ++i) { + UTIL_THROW_IF(*i > static_cast(std::numeric_limits::max()), util::OverflowException, "This model has " << *i << " " << (i - counts.begin() + 1) << "-grams which is too many for 32-bit machines."); + } + } +} + +} // namespace + +template GenericModel::GenericModel(const char *file, const Config &init_config) : backing_(init_config) { + util::scoped_fd fd(util::OpenReadOrThrow(file)); + if (IsBinaryFormat(fd.get())) { + Parameters parameters; + int fd_shallow = fd.release(); + backing_.InitializeBinary(fd_shallow, kModelType, kVersion, parameters); + CheckCounts(parameters.counts); + + Config new_config(init_config); + new_config.probing_multiplier = parameters.fixed.probing_multiplier; + Search::UpdateConfigFromBinary(backing_, parameters.counts, VocabularyT::Size(parameters.counts[0], new_config), new_config); + UTIL_THROW_IF(new_config.enumerate_vocab && !parameters.fixed.has_vocabulary, FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary."); + + SetupMemory(backing_.LoadBinary(Size(parameters.counts, new_config)), parameters.counts, new_config); + vocab_.LoadedBinary(parameters.fixed.has_vocabulary, fd_shallow, new_config.enumerate_vocab, backing_.VocabStringReadingOffset()); + } else { + ComplainAboutARPA(init_config, kModelType); + InitializeFromARPA(fd.release(), file, init_config); + } // g++ prints warnings unless these are fully initialized. 
State begin_sentence = State(); @@ -50,27 +89,9 @@ template GenericModel::Ge P::Init(begin_sentence, null_context, vocab_, search_.Order()); } -namespace { -void CheckCounts(const std::vector &counts) { - UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE); - if (sizeof(uint64_t) > sizeof(std::size_t)) { - for (std::vector::const_iterator i = counts.begin(); i != counts.end(); ++i) { - UTIL_THROW_IF(*i > static_cast(std::numeric_limits::max()), util::OverflowException, "This model has " << *i << " " << (i - counts.begin() + 1) << "-grams which is too many for 32-bit machines."); - } - } -} -} // namespace - -template void GenericModel::InitializeFromBinary(void *start, const Parameters ¶ms, const Config &config, int fd) { - CheckCounts(params.counts); - SetupMemory(start, params.counts, config); - vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab); - search_.LoadedBinary(); -} - -template void GenericModel::InitializeFromARPA(const char *file, const Config &config) { - // Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any. - util::FilePiece f(backing_.file.release(), file, config.ProgressMessages()); +template void GenericModel::InitializeFromARPA(int fd, const char *file, const Config &config) { + // Backing file is the ARPA. + util::FilePiece f(fd, file, config.ProgressMessages()); try { std::vector counts; // File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_. 
@@ -81,13 +102,17 @@ template void GenericModel(search_rebase), counts, config); } else { vocab_.ConfigureEnumerate(config.enumerate_vocab, counts[0]); search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_); @@ -99,18 +124,13 @@ template void GenericModel void GenericModel::UpdateConfigFromBinary(int fd, const std::vector &counts, Config &config) { - util::AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config)); - Search::UpdateConfigFromBinary(fd, counts, config); -} - template FullScoreReturn GenericModel::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const { FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state); for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) { diff --git a/lm/model.hh b/lm/model.hh index c9c17c4b3..e75da93bf 100644 --- a/lm/model.hh +++ b/lm/model.hh @@ -104,10 +104,6 @@ template class GenericModel : public base::Mod } private: - friend void lm::ngram::LoadLM<>(const char *file, const Config &config, GenericModel &to); - - static void UpdateConfigFromBinary(int fd, const std::vector &counts, Config &config); - FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const; // Score bigrams and above. Do not include backoff. @@ -116,15 +112,11 @@ template class GenericModel : public base::Mod // Appears after Size in the cc file. 
void SetupMemory(void *start, const std::vector &counts, const Config &config); - void InitializeFromBinary(void *start, const Parameters ¶ms, const Config &config, int fd); - - void InitializeFromARPA(const char *file, const Config &config); + void InitializeFromARPA(int fd, const char *file, const Config &config); float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const; - Backing &MutableBacking() { return backing_; } - - Backing backing_; + BinaryFormat backing_; VocabularyT vocab_; diff --git a/lm/model_test.cc b/lm/model_test.cc index eb1590942..7005b05ea 100644 --- a/lm/model_test.cc +++ b/lm/model_test.cc @@ -360,10 +360,11 @@ BOOST_AUTO_TEST_CASE(quant_bhiksha_trie) { LoadingTest(); } -template void BinaryTest() { +template void BinaryTest(Config::WriteMethod write_method) { Config config; config.write_mmap = "test.binary"; config.messages = NULL; + config.write_method = write_method; ExpectEnumerateVocab enumerate; config.enumerate_vocab = &enumerate; @@ -406,6 +407,11 @@ template void BinaryTest() { unlink("test_nounk.binary"); } +template void BinaryTest() { + BinaryTest(Config::WRITE_MMAP); + BinaryTest(Config::WRITE_AFTER); +} + BOOST_AUTO_TEST_CASE(write_and_read_probing) { BinaryTest(); } diff --git a/lm/quantize.cc b/lm/quantize.cc index b58c3f3f6..273ea3989 100644 --- a/lm/quantize.cc +++ b/lm/quantize.cc @@ -38,13 +38,13 @@ const char kSeparatelyQuantizeVersion = 2; } // namespace -void SeparatelyQuantize::UpdateConfigFromBinary(int fd, const std::vector &/*counts*/, Config &config) { - char version; - util::ReadOrThrow(fd, &version, 1); - util::ReadOrThrow(fd, &config.prob_bits, 1); - util::ReadOrThrow(fd, &config.backoff_bits, 1); +void SeparatelyQuantize::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) { + unsigned char buffer[3]; + file.ReadForConfig(buffer, 3, offset); + char version = buffer[0]; + config.prob_bits = buffer[1]; + config.backoff_bits 
= buffer[2]; if (version != kSeparatelyQuantizeVersion) UTIL_THROW(FormatLoadException, "This file has quantization version " << (unsigned)version << " but the code expects version " << (unsigned)kSeparatelyQuantizeVersion); - util::AdvanceOrThrow(fd, -3); } void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) { diff --git a/lm/quantize.hh b/lm/quantize.hh index 8ce2378a7..9d3a2f439 100644 --- a/lm/quantize.hh +++ b/lm/quantize.hh @@ -18,12 +18,13 @@ namespace lm { namespace ngram { struct Config; +class BinaryFormat; /* Store values directly and don't quantize. */ class DontQuantize { public: static const ModelType kModelTypeAdd = static_cast(0); - static void UpdateConfigFromBinary(int, const std::vector &, Config &) {} + static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &) {} static uint64_t Size(uint8_t /*order*/, const Config &/*config*/) { return 0; } static uint8_t MiddleBits(const Config &/*config*/) { return 63; } static uint8_t LongestBits(const Config &/*config*/) { return 31; } @@ -136,7 +137,7 @@ class SeparatelyQuantize { public: static const ModelType kModelTypeAdd = kQuantAdd; - static void UpdateConfigFromBinary(int fd, const std::vector &counts, Config &config); + static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config); static uint64_t Size(uint8_t order, const Config &config) { uint64_t longest_table = (static_cast(1) << static_cast(config.prob_bits)) * sizeof(float); diff --git a/lm/search_hashed.cc b/lm/search_hashed.cc index 62275d277..354a56b46 100644 --- a/lm/search_hashed.cc +++ b/lm/search_hashed.cc @@ -204,9 +204,10 @@ template void ReadNGrams( namespace detail { template uint8_t *HashedSearch::SetupMemory(uint8_t *start, const std::vector &counts, const Config &config) { - std::size_t allocated = Unigram::Size(counts[0]); - unigram_ = Unigram(start, counts[0], allocated); - start += allocated; + unigram_ = Unigram(start, counts[0]); + 
start += Unigram::Size(counts[0]); + std::size_t allocated; + middle_.clear(); for (unsigned int n = 2; n < counts.size(); ++n) { allocated = Middle::Size(counts[n - 1], config.probing_multiplier); middle_.push_back(Middle(start, allocated)); @@ -218,9 +219,21 @@ template uint8_t *HashedSearch::SetupMemory(uint8_t *start, return start; } -template void HashedSearch::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector &counts, const Config &config, ProbingVocabulary &vocab, Backing &backing) { - // TODO: fix sorted. - SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), Size(counts, config), backing), counts, config); +/*template void HashedSearch::Relocate(uint8_t *start, const std::vector &counts, const Config &config) { + unigram_ = Unigram(start, counts[0]); + start += Unigram::Size(counts[0]); + for (unsigned int n = 2; n < counts.size(); ++n) { + middle[n-2].Relocate(start); + start += Middle::Size(counts[n - 1], config.probing_multiplier) + } + longest_.Relocate(start); +}*/ + +template void HashedSearch::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector &counts, const Config &config, ProbingVocabulary &vocab, BinaryFormat &backing) { + void *vocab_rebase; + void *search_base = backing.GrowForSearch(Size(counts, config), vocab.UnkCountChangePadding(), vocab_rebase); + vocab.Relocate(vocab_rebase); + SetupMemory(reinterpret_cast(search_base), counts, config); PositiveProbWarn warn(config.positive_log_probability); Read1Grams(f, counts[0], vocab, unigram_.Raw(), warn); @@ -277,14 +290,6 @@ template template void HashedSearch::ApplyBui ReadEnd(f); } -template void HashedSearch::LoadedBinary() { - unigram_.LoadedBinary(); - for (typename std::vector::iterator i = middle_.begin(); i != middle_.end(); ++i) { - i->LoadedBinary(); - } - longest_.LoadedBinary(); -} - template class HashedSearch; template class HashedSearch; diff --git a/lm/search_hashed.hh b/lm/search_hashed.hh index 
9d067bc2e..8193262b0 100644 --- a/lm/search_hashed.hh +++ b/lm/search_hashed.hh @@ -18,7 +18,7 @@ namespace util { class FilePiece; } namespace lm { namespace ngram { -struct Backing; +class BinaryFormat; class ProbingVocabulary; namespace detail { @@ -72,7 +72,7 @@ template class HashedSearch { static const unsigned int kVersion = 0; // TODO: move probing_multiplier here with next binary file format update. - static void UpdateConfigFromBinary(int, const std::vector &, Config &) {} + static void UpdateConfigFromBinary(const BinaryFormat &, const std::vector &, uint64_t, Config &) {} static uint64_t Size(const std::vector &counts, const Config &config) { uint64_t ret = Unigram::Size(counts[0]); @@ -84,9 +84,7 @@ template class HashedSearch { uint8_t *SetupMemory(uint8_t *start, const std::vector &counts, const Config &config); - void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector &counts, const Config &config, ProbingVocabulary &vocab, Backing &backing); - - void LoadedBinary(); + void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector &counts, const Config &config, ProbingVocabulary &vocab, BinaryFormat &backing); unsigned char Order() const { return middle_.size() + 2; @@ -148,7 +146,7 @@ template class HashedSearch { public: Unigram() {} - Unigram(void *start, uint64_t count, std::size_t /*allocated*/) : + Unigram(void *start, uint64_t count) : unigram_(static_cast(start)) #ifdef DEBUG , count_(count) @@ -168,8 +166,6 @@ template class HashedSearch { typename Value::Weights &Unknown() { return unigram_[0]; } - void LoadedBinary() {} - // For building. 
typename Value::Weights *Raw() { return unigram_; } diff --git a/lm/search_trie.cc b/lm/search_trie.cc index 27605e548..4a88194e8 100644 --- a/lm/search_trie.cc +++ b/lm/search_trie.cc @@ -459,7 +459,7 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c } // namespace -template void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) { +template void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing) { RecordReader inputs[KENLM_MAX_ORDER - 1]; RecordReader contexts[KENLM_MAX_ORDER - 1]; @@ -488,7 +488,10 @@ template void BuildTrie(SortedFiles &files, std::ve sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs); - out.SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), TrieSearch::Size(fixed_counts, config), backing), fixed_counts, config); + void *vocab_relocate; + void *search_base = backing.GrowForSearch(TrieSearch::Size(fixed_counts, config), vocab.UnkCountChangePadding(), vocab_relocate); + vocab.Relocate(vocab_relocate); + out.SetupMemory(reinterpret_cast(search_base), fixed_counts, config); for (unsigned char i = 2; i <= counts.size(); ++i) { inputs[i-2].Rewind(); @@ -571,15 +574,7 @@ template uint8_t *TrieSearch::Setup return start + Longest::Size(Quant::LongestBits(config), counts.back(), counts[0]); } -template void TrieSearch::LoadedBinary() { - unigram_.LoadedBinary(); - for (Middle *i = middle_begin_; i != middle_end_; ++i) { - i->LoadedBinary(); - } - longest_.LoadedBinary(); -} - -template void TrieSearch::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector &counts, const Config &config, SortedVocabulary &vocab, Backing &backing) { +template void TrieSearch::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector &counts, const Config &config, SortedVocabulary &vocab, 
BinaryFormat &backing) { std::string temporary_prefix; if (config.temporary_directory_prefix) { temporary_prefix = config.temporary_directory_prefix; diff --git a/lm/search_trie.hh b/lm/search_trie.hh index 763fd1a72..299262a5d 100644 --- a/lm/search_trie.hh +++ b/lm/search_trie.hh @@ -17,13 +17,13 @@ namespace lm { namespace ngram { -struct Backing; +class BinaryFormat; class SortedVocabulary; namespace trie { template class TrieSearch; class SortedFiles; -template void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing); +template void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing); template class TrieSearch { public: @@ -39,11 +39,11 @@ template class TrieSearch { static const unsigned int kVersion = 1; - static void UpdateConfigFromBinary(int fd, const std::vector &counts, Config &config) { - Quant::UpdateConfigFromBinary(fd, counts, config); - util::AdvanceOrThrow(fd, Quant::Size(counts.size(), config) + Unigram::Size(counts[0])); + static void UpdateConfigFromBinary(const BinaryFormat &file, const std::vector &counts, uint64_t offset, Config &config) { + Quant::UpdateConfigFromBinary(file, offset, config); // Currently the unigram pointers are not compresssed, so there will only be a header for order > 2. 
- if (counts.size() > 2) Bhiksha::UpdateConfigFromBinary(fd, config); + if (counts.size() > 2) + Bhiksha::UpdateConfigFromBinary(file, offset + Quant::Size(counts.size(), config) + Unigram::Size(counts[0]), config); } static uint64_t Size(const std::vector &counts, const Config &config) { @@ -60,9 +60,7 @@ template class TrieSearch { uint8_t *SetupMemory(uint8_t *start, const std::vector &counts, const Config &config); - void LoadedBinary(); - - void InitializeFromARPA(const char *file, util::FilePiece &f, std::vector &counts, const Config &config, SortedVocabulary &vocab, Backing &backing); + void InitializeFromARPA(const char *file, util::FilePiece &f, std::vector &counts, const Config &config, SortedVocabulary &vocab, BinaryFormat &backing); unsigned char Order() const { return middle_end_ - middle_begin_ + 2; @@ -103,7 +101,7 @@ template class TrieSearch { } private: - friend void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing); + friend void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing); // Middles are managed manually so we can delay construction and they don't have to be copyable. 
void FreeMiddles() { diff --git a/lm/trie.hh b/lm/trie.hh index 9ea3c5466..d858ab5e4 100644 --- a/lm/trie.hh +++ b/lm/trie.hh @@ -62,8 +62,6 @@ class Unigram { return unigram_; } - void LoadedBinary() {} - UnigramPointer Find(WordIndex word, NodeRange &next) const { UnigramValue *val = unigram_ + word; next.begin = val->next; @@ -108,8 +106,6 @@ template class BitPackedMiddle : public BitPacked { void FinishedLoading(uint64_t next_end, const Config &config); - void LoadedBinary() { bhiksha_.LoadedBinary(); } - util::BitAddress Find(WordIndex word, NodeRange &range, uint64_t &pointer) const; util::BitAddress ReadEntry(uint64_t pointer, NodeRange &range) { @@ -138,14 +134,9 @@ class BitPackedLongest : public BitPacked { BaseInit(base, max_vocab, quant_bits); } - void LoadedBinary() {} - util::BitAddress Insert(WordIndex word); util::BitAddress Find(WordIndex word, const NodeRange &node) const; - - private: - uint8_t quant_bits_; }; } // namespace trie diff --git a/lm/trie_sort.cc b/lm/trie_sort.cc index dc542bb32..126d43aba 100644 --- a/lm/trie_sort.cc +++ b/lm/trie_sort.cc @@ -50,6 +50,10 @@ class PartialViewProxy { const void *Data() const { return inner_.Data(); } void *Data() { return inner_.Data(); } + friend void swap(PartialViewProxy first, PartialViewProxy second) { + std::swap_ranges(reinterpret_cast(first.Data()), reinterpret_cast(first.Data()) + first.attention_size_, reinterpret_cast(second.Data())); + } + private: friend class util::ProxyIterator; diff --git a/lm/virtual_interface.hh b/lm/virtual_interface.hh index ff4a388e7..7a3e23796 100644 --- a/lm/virtual_interface.hh +++ b/lm/virtual_interface.hh @@ -125,13 +125,13 @@ class Model { void NullContextWrite(void *to) const { memcpy(to, null_context_memory_, StateSize()); } // Requires in_state != out_state - virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0; + virtual float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0; 
// Requires in_state != out_state - virtual FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0; + virtual FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0; // Prefer to use FullScore. The context words should be provided in reverse order. - virtual FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const = 0; + virtual FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const = 0; unsigned char Order() const { return order_; } diff --git a/lm/vocab.cc b/lm/vocab.cc index fd7f96dc4..7f0878f40 100644 --- a/lm/vocab.cc +++ b/lm/vocab.cc @@ -32,7 +32,8 @@ const uint64_t kUnknownHash = detail::HashForVocab("", 5); // Sadly some LMs have . const uint64_t kUnknownCapHash = detail::HashForVocab("", 5); -void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count) { +void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count, uint64_t offset) { + util::SeekOrThrow(fd, offset); // Check that we're at the right place by reading which is always first. 
char check_unk[6]; util::ReadOrThrow(fd, check_unk, 6); @@ -80,11 +81,6 @@ void WriteWordsWrapper::Add(WordIndex index, const StringPiece &str) { buffer_.push_back(0); } -void WriteWordsWrapper::Write(int fd, uint64_t start) { - util::SeekOrThrow(fd, start); - util::WriteOrThrow(fd, buffer_.data(), buffer_.size()); -} - SortedVocabulary::SortedVocabulary() : begin_(NULL), end_(NULL), enumerate_(NULL) {} uint64_t SortedVocabulary::Size(uint64_t entries, const Config &/*config*/) { @@ -100,6 +96,12 @@ void SortedVocabulary::SetupMemory(void *start, std::size_t allocated, std::size saw_unk_ = false; } +void SortedVocabulary::Relocate(void *new_start) { + std::size_t delta = end_ - begin_; + begin_ = reinterpret_cast(new_start) + 1; + end_ = begin_ + delta; +} + void SortedVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries) { enumerate_ = to; if (enumerate_) { @@ -147,11 +149,11 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) { bound_ = end_ - begin_ + 1; } -void SortedVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) { +void SortedVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset) { end_ = begin_ + *(reinterpret_cast(begin_) - 1); SetSpecial(Index(""), Index(""), 0); bound_ = end_ - begin_ + 1; - if (have_words) ReadWords(fd, to, bound_); + if (have_words) ReadWords(fd, to, bound_, offset); } namespace { @@ -179,6 +181,11 @@ void ProbingVocabulary::SetupMemory(void *start, std::size_t allocated, std::siz saw_unk_ = false; } +void ProbingVocabulary::Relocate(void *new_start) { + header_ = static_cast(new_start); + lookup_.Relocate(static_cast(new_start) + ALIGN8(sizeof(detail::ProbingVocabularyHeader))); +} + void ProbingVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t /*max_entries*/) { enumerate_ = to; if (enumerate_) { @@ -206,12 +213,11 @@ void ProbingVocabulary::InternalFinishedLoading() { SetSpecial(Index(""), Index(""), 0); } -void 
ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) { +void ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset) { UTIL_THROW_IF(header_->version != kProbingVocabularyVersion, FormatLoadException, "The binary file has probing version " << header_->version << " but the code expects version " << kProbingVocabularyVersion << ". Please rerun build_binary using the same version of the code."); - lookup_.LoadedBinary(); bound_ = header_->bound; SetSpecial(Index(""), Index(""), 0); - if (have_words) ReadWords(fd, to, bound_); + if (have_words) ReadWords(fd, to, bound_, offset); } void MissingUnknown(const Config &config) throw(SpecialWordMissingException) { diff --git a/lm/vocab.hh b/lm/vocab.hh index 226ae4385..074b74d86 100644 --- a/lm/vocab.hh +++ b/lm/vocab.hh @@ -36,7 +36,7 @@ class WriteWordsWrapper : public EnumerateVocab { void Add(WordIndex index, const StringPiece &str); - void Write(int fd, uint64_t start); + const std::string &Buffer() const { return buffer_; } private: EnumerateVocab *inner_; @@ -71,6 +71,8 @@ class SortedVocabulary : public base::Vocabulary { // Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway. 
void SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config); + void Relocate(void *new_start); + void ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries); WordIndex Insert(const StringPiece &str); @@ -83,15 +85,13 @@ class SortedVocabulary : public base::Vocabulary { bool SawUnk() const { return saw_unk_; } - void LoadedBinary(bool have_words, int fd, EnumerateVocab *to); + void LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset); private: uint64_t *begin_, *end_; WordIndex bound_; - WordIndex highest_value_; - bool saw_unk_; EnumerateVocab *enumerate_; @@ -140,6 +140,8 @@ class ProbingVocabulary : public base::Vocabulary { // Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway. void SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config); + void Relocate(void *new_start); + void ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries); WordIndex Insert(const StringPiece &str); @@ -152,7 +154,7 @@ class ProbingVocabulary : public base::Vocabulary { bool SawUnk() const { return saw_unk_; } - void LoadedBinary(bool have_words, int fd, EnumerateVocab *to); + void LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset); private: void InternalFinishedLoading(); diff --git a/util/Jamfile b/util/Jamfile index 910b30550..5ee5c1c10 100644 --- a/util/Jamfile +++ b/util/Jamfile @@ -23,11 +23,8 @@ fakelib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_pi import testing ; -unit-test bit_packing_test : bit_packing_test.cc kenutil /top//boost_unit_test_framework ; run file_piece_test.o kenutil /top//boost_unit_test_framework : : file_piece.cc ; -unit-test read_compressed_test : read_compressed_test.o kenutil /top//boost_unit_test_framework ; -unit-test joint_sort_test : joint_sort_test.cc kenutil /top//boost_unit_test_framework ; -unit-test 
probing_hash_table_test : probing_hash_table_test.cc kenutil /top//boost_unit_test_framework ; -unit-test sorted_uniform_test : sorted_uniform_test.cc kenutil /top//boost_unit_test_framework ; -unit-test tokenize_piece_test : tokenize_piece_test.cc kenutil /top//boost_unit_test_framework ; -unit-test multi_intersection_test : multi_intersection_test.cc kenutil /top//boost_unit_test_framework ; +for local t in [ glob *_test.cc : file_piece_test.cc read_compressed_test.cc ] { + local name = [ MATCH "(.*)\.cc" : $(t) ] ; + unit-test $(name) : $(t) kenutil /top//boost_unit_test_framework /top//boost_system ; +} diff --git a/util/exception.cc b/util/exception.cc index 557c39862..083bac20d 100644 --- a/util/exception.cc +++ b/util/exception.cc @@ -51,6 +51,11 @@ void Exception::SetLocation(const char *file, unsigned int line, const char *fun } namespace { +// At least one of these functions will not be called. +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#endif // The XOPEN version. const char *HandleStrerror(int ret, const char *buf) { if (!ret) return buf; @@ -61,6 +66,9 @@ const char *HandleStrerror(int ret, const char *buf) { const char *HandleStrerror(const char *ret, const char * /*buf*/) { return ret; } +#ifdef __clang__ +#pragma clang diagnostic pop +#endif } // namespace ErrnoException::ErrnoException() throw() : errno_(errno) { diff --git a/util/file.cc b/util/file.cc index 0b333e003..51eaf972f 100644 --- a/util/file.cc +++ b/util/file.cc @@ -20,6 +20,7 @@ #if defined __MINGW32__ #include #include +#warning "The file functions on MinGW have not been tested for file sizes above 2^31 - 1. 
Please read https://stackoverflow.com/questions/12539488/determine-64-bit-file-size-in-c-on-mingw-32-bit and fix" #elif defined(_WIN32) || defined(_WIN64) #include #include @@ -81,6 +82,7 @@ int CreateOrThrow(const char *name) { uint64_t SizeFile(int fd) { #if defined __MINGW32__ struct stat sb; + // Does this handle 64-bit? int ret = fstat(fd, &sb); if (ret == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize; return sb.st_size; @@ -109,6 +111,7 @@ uint64_t SizeOrThrow(int fd) { void ResizeOrThrow(int fd, uint64_t to) { #if defined __MINGW32__ + // Does this handle 64-bit? int ret = ftruncate #elif defined(_WIN32) || defined(_WIN64) errno_t ret = _chsize_s @@ -125,7 +128,7 @@ namespace { std::size_t GuardLarge(std::size_t size) { // The following operating systems have broken read/write/pread/pwrite that // only supports up to 2^31. -#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID) +#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID) || defined(__MINGW32__) return std::min(static_cast(static_cast(-1)), size); #else return size; @@ -172,7 +175,7 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) { void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) { uint8_t *to = static_cast(to_void); #if defined(_WIN32) || defined(_WIN64) - //UTIL_THROW(Exception, "This pread implementation for windows is broken. Please send me a patch that does not change the file pointer. Atomically. Or send me an implementation of pwrite that is allowed to change the file pointer but can be called concurrently with pread."); + UTIL_THROW(Exception, "This pread implementation for windows is broken. Please send me a patch that does not change the file pointer. Atomically. 
Or send me an implementation of pwrite that is allowed to change the file pointer but can be called concurrently with pread."); const std::size_t kMaxDWORD = static_cast(4294967295UL); #endif for (;size ;) { @@ -262,6 +265,7 @@ typedef CheckOffT::True IgnoredType; void InternalSeek(int fd, int64_t off, int whence) { if ( #if defined __MINGW32__ + // Does this handle 64-bit? (off_t)-1 == lseek(fd, off, whence) #elif defined(_WIN32) || defined(_WIN64) (__int64)-1 == _lseeki64(fd, off, whence) diff --git a/util/joint_sort.hh b/util/joint_sort.hh index 1b43ddcf4..13a52b67b 100644 --- a/util/joint_sort.hh +++ b/util/joint_sort.hh @@ -9,7 +9,6 @@ #include #include -#include namespace util { @@ -35,9 +34,10 @@ template class JointIter { return *this; } - void swap(const JointIter &other) { - std::swap(key_, other.key_); - std::swap(value_, other.value_); + friend void swap(JointIter &first, JointIter &second) { + using std::swap; + swap(first.key_, second.key_); + swap(first.value_, second.value_); } private: @@ -83,9 +83,11 @@ template class JointProxy { return *(inner_.key_); } - void swap(JointProxy &other) { - std::swap(*inner_.key_, *other.inner_.key_); - std::swap(*inner_.value_, *other.inner_.value_); + friend void swap(JointProxy first, JointProxy second) { + // Allow argument-dependent lookup. 
+ using std::swap; + swap(*first.inner_.key_, *second.inner_.key_); + swap(*first.inner_.value_, *second.inner_.value_); } private: @@ -138,14 +140,4 @@ template void JointSort(const KeyIter &key_begi } // namespace util -namespace std { -template void swap(util::detail::JointIter &left, util::detail::JointIter &right) { - left.swap(right); -} - -template void swap(util::detail::JointProxy &left, util::detail::JointProxy &right) { - left.swap(right); -} -} // namespace std - #endif // UTIL_JOINT_SORT__ diff --git a/util/murmur_hash.cc b/util/murmur_hash.cc index 4f519312d..189668c01 100644 --- a/util/murmur_hash.cc +++ b/util/murmur_hash.cc @@ -153,12 +153,19 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, uint64_t seed ) // Trick to test for 64-bit architecture at compile time. namespace { +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#endif template inline uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, uint64_t seed) { return MurmurHash64A(key, len, seed); } template <> inline uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, uint64_t seed) { return MurmurHash64B(key, len, seed); } +#ifdef __clang__ +#pragma clang diagnostic pop +#endif } // namespace uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed) { diff --git a/util/murmur_hash.hh b/util/murmur_hash.hh index ae7e88dec..4891833e9 100644 --- a/util/murmur_hash.hh +++ b/util/murmur_hash.hh @@ -5,8 +5,12 @@ namespace util { +// 64-bit machine version uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0); +// 32-bit machine version (not the same function as above) uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed = 0); +// Use the version for this arch. Because the values differ across +// architectures, really only use it for in-memory structures. 
uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0); } // namespace util diff --git a/util/pcqueue.hh b/util/pcqueue.hh index 3df8749b1..07e4146f5 100644 --- a/util/pcqueue.hh +++ b/util/pcqueue.hh @@ -1,6 +1,8 @@ #ifndef UTIL_PCQUEUE__ #define UTIL_PCQUEUE__ +#include "util/exception.hh" + #include #include #include @@ -8,20 +10,68 @@ #include +#ifdef __APPLE__ +#include +#include +#include +#include +#endif // __APPLE__ + namespace util { -inline void WaitSemaphore (boost::interprocess::interprocess_semaphore &on) { +/* OS X Maverick and Boost interprocess were doing "Function not implemented." + * So this is my own wrapper around the mach kernel APIs. + */ +#ifdef __APPLE__ + +#define MACH_CALL(call) UTIL_THROW_IF(KERN_SUCCESS != (call), Exception, "Mach call failure") + +class Semaphore { + public: + explicit Semaphore(int value) : task_(mach_task_self()) { + MACH_CALL(semaphore_create(task_, &back_, SYNC_POLICY_FIFO, value)); + } + + ~Semaphore() { + MACH_CALL(semaphore_destroy(task_, back_)); + } + + void wait() { + MACH_CALL(semaphore_wait(back_)); + } + + void post() { + MACH_CALL(semaphore_signal(back_)); + } + + private: + semaphore_t back_; + task_t task_; +}; + +inline void WaitSemaphore(Semaphore &semaphore) { + semaphore.wait(); +} + +#else +typedef boost::interprocess::interprocess_semaphore Semaphore; + +inline void WaitSemaphore (Semaphore &on) { while (1) { try { on.wait(); break; } catch (boost::interprocess::interprocess_exception &e) { - if (e.get_native_error() != EINTR) throw; + if (e.get_native_error() != EINTR) { + throw; + } } } } +#endif // __APPLE__ + /* Producer consumer queue safe for multiple producers and multiple consumers. * T must be default constructable and have operator=. * The value is copied twice for Consume(T &out) or three times for Consume(), @@ -82,9 +132,9 @@ template class PCQueue : boost::noncopyable { private: // Number of empty spaces in storage_. 
- boost::interprocess::interprocess_semaphore empty_; + Semaphore empty_; // Number of occupied spaces in storage_. - boost::interprocess::interprocess_semaphore used_; + Semaphore used_; boost::scoped_array storage_; diff --git a/util/pcqueue_test.cc b/util/pcqueue_test.cc new file mode 100644 index 000000000..22ed2c6f3 --- /dev/null +++ b/util/pcqueue_test.cc @@ -0,0 +1,20 @@ +#include "util/pcqueue.hh" + +#define BOOST_TEST_MODULE PCQueueTest +#include + +namespace util { +namespace { + +BOOST_AUTO_TEST_CASE(SingleThread) { + PCQueue queue(10); + for (int i = 0; i < 10; ++i) { + queue.Produce(i); + } + for (int i = 0; i < 10; ++i) { + BOOST_CHECK_EQUAL(i, queue.Consume()); + } +} + +} +} // namespace util diff --git a/util/probing_hash_table.hh b/util/probing_hash_table.hh index 9566028f5..38524806c 100644 --- a/util/probing_hash_table.hh +++ b/util/probing_hash_table.hh @@ -70,6 +70,11 @@ template (new_base); + end_ = begin_ + buckets_; + } + template MutableIterator Insert(const T &t) { #ifdef DEBUG assert(initialized_); @@ -98,8 +103,6 @@ template bool UnsafeMutableFind(const Key key, MutableIterator &out) { #ifdef DEBUG diff --git a/util/proxy_iterator.hh b/util/proxy_iterator.hh index 0ee1716f4..a2810a472 100644 --- a/util/proxy_iterator.hh +++ b/util/proxy_iterator.hh @@ -38,8 +38,8 @@ template class ProxyIterator { typedef std::random_access_iterator_tag iterator_category; typedef typename Proxy::value_type value_type; typedef std::ptrdiff_t difference_type; - typedef Proxy & reference; - typedef Proxy * pointer; + typedef Proxy reference; + typedef ProxyIterator * pointer; ProxyIterator() {} @@ -47,10 +47,10 @@ template class ProxyIterator { template ProxyIterator(const ProxyIterator &in) : p_(*in) {} explicit ProxyIterator(const Proxy &p) : p_(p) {} - // p_'s swap does value swapping, but here we want iterator swapping +/* // p_'s swap does value swapping, but here we want iterator swapping friend inline void swap(ProxyIterator &first, ProxyIterator 
&second) { swap(first.I(), second.I()); - } + }*/ // p_'s operator= does value copying, but here we want iterator copying. S &operator=(const S &other) { @@ -77,8 +77,8 @@ template class ProxyIterator { std::ptrdiff_t operator-(const S &other) const { return I() - other.I(); } - Proxy &operator*() { return p_; } - const Proxy &operator*() const { return p_; } + Proxy operator*() { return p_; } + const Proxy operator*() const { return p_; } Proxy *operator->() { return &p_; } const Proxy *operator->() const { return &p_; } Proxy operator[](std::ptrdiff_t amount) const { return *(*this + amount); } diff --git a/util/read_compressed_test.cc b/util/read_compressed_test.cc index 71b97b0f6..50450a025 100644 --- a/util/read_compressed_test.cc +++ b/util/read_compressed_test.cc @@ -17,6 +17,7 @@ #include #if !defined mkstemp +// TODO insecure int mkstemp(char * stemplate) { char *filename = mktemp(stemplate); diff --git a/util/sized_iterator.hh b/util/sized_iterator.hh index dce8f229a..a72657b50 100644 --- a/util/sized_iterator.hh +++ b/util/sized_iterator.hh @@ -36,7 +36,7 @@ class SizedInnerIterator { void *Data() { return ptr_; } std::size_t EntrySize() const { return size_; } - friend inline void swap(SizedInnerIterator &first, SizedInnerIterator &second) { + friend void swap(SizedInnerIterator &first, SizedInnerIterator &second) { std::swap(first.ptr_, second.ptr_); std::swap(first.size_, second.size_); } @@ -69,17 +69,7 @@ class SizedProxy { const void *Data() const { return inner_.Data(); } void *Data() { return inner_.Data(); } - /** - // TODO: this (deep) swap was recently added. why? if any std heap sort etc - // algs are using swap, that's going to be worse performance than using - // =. i'm not sure why we *want* a deep swap. if C++11 compilers are - // choosing between move constructor and swap, then we'd better implement a - // (deep) move constructor. 
it may also be that this is moot since i made - // ProxyIterator a reference and added a shallow ProxyIterator swap? (I - // need Ken or someone competent to judge whether that's correct also. - - // let me know at graehl@gmail.com - */ - friend void swap(SizedProxy &first, SizedProxy &second) { + friend void swap(SizedProxy first, SizedProxy second) { std::swap_ranges( static_cast(first.inner_.Data()), static_cast(first.inner_.Data()) + first.inner_.EntrySize(), diff --git a/util/sized_iterator_test.cc b/util/sized_iterator_test.cc new file mode 100644 index 000000000..c36bcb2d2 --- /dev/null +++ b/util/sized_iterator_test.cc @@ -0,0 +1,16 @@ +#include "util/sized_iterator.hh" + +#define BOOST_TEST_MODULE SizedIteratorTest +#include + +namespace util { namespace { + +BOOST_AUTO_TEST_CASE(swap_works) { + char str[2] = { 0, 1 }; + SizedProxy first(str, 1), second(str + 1, 1); + swap(first, second); + BOOST_CHECK_EQUAL(1, str[0]); + BOOST_CHECK_EQUAL(0, str[1]); +} + +}} // namespace anonymous util diff --git a/util/usage.cc b/util/usage.cc index 2f870d854..e68d7c7c1 100644 --- a/util/usage.cc +++ b/util/usage.cc @@ -66,6 +66,11 @@ Wall GetWall() { } #endif +// Some of these functions are only used on some platforms. 
+#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-function" +#endif // These all assume first > second double Subtract(time_t first, time_t second) { return difftime(first, second); @@ -87,6 +92,9 @@ double DoubleSec(const struct timespec &tv) { return static_cast(tv.tv_sec) + (static_cast(tv.tv_nsec) / 1000000000.0); } #endif +#ifdef __clang__ +#pragma clang diagnostic pop +#endif class RecordStart { public: From ffd62e994ecb88358b5f3aa835f84d441ec58c77 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 27 Jan 2014 22:25:43 -0800 Subject: [PATCH 32/48] Fix C++11 compilation error / Chris Dyer --- util/joint_sort.hh | 11 +++++++---- util/joint_sort_test.cc | 12 ++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/util/joint_sort.hh b/util/joint_sort.hh index 13a52b67b..b1ec48e26 100644 --- a/util/joint_sort.hh +++ b/util/joint_sort.hh @@ -40,6 +40,12 @@ template class JointIter { swap(first.value_, second.value_); } + void DeepSwap(JointIter &other) { + using std::swap; + swap(*key_, *other.key_); + swap(*value_, *other.value_); + } + private: friend class JointProxy; KeyIter key_; @@ -84,10 +90,7 @@ template class JointProxy { } friend void swap(JointProxy first, JointProxy second) { - // Allow argument-dependent lookup. 
- using std::swap; - swap(*first.inner_.key_, *second.inner_.key_); - swap(*first.inner_.value_, *second.inner_.value_); + first.Inner().DeepSwap(second.Inner()); } private: diff --git a/util/joint_sort_test.cc b/util/joint_sort_test.cc index 4dc859164..b24c602c9 100644 --- a/util/joint_sort_test.cc +++ b/util/joint_sort_test.cc @@ -47,4 +47,16 @@ BOOST_AUTO_TEST_CASE(char_int) { BOOST_CHECK_EQUAL(327, values[3]); } +BOOST_AUTO_TEST_CASE(swap_proxy) { + char keys[2] = {0, 1}; + int values[2] = {2, 3}; + detail::JointProxy first(keys, values); + detail::JointProxy second(keys + 1, values + 1); + swap(first, second); + BOOST_CHECK_EQUAL(1, keys[0]); + BOOST_CHECK_EQUAL(0, keys[1]); + BOOST_CHECK_EQUAL(3, values[0]); + BOOST_CHECK_EQUAL(2, values[1]); +} + }} // namespace anonymous util From 86ee3e15a441aec72eaebdd0389fa925da2316c7 Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Wed, 29 Jan 2014 18:37:42 +0000 Subject: [PATCH 33/48] new version of the `score` tool which is now capable of dealing with additional properties in an appropriate manner --- moses/Jamfile | 1 + .../{domain.cpp => DomainFeature.cpp} | 44 +- phrase-extract/{domain.h => DomainFeature.h} | 10 +- phrase-extract/ExtractionPhrasePair.cpp | 327 +++++++ phrase-extract/ExtractionPhrasePair.h | 162 ++++ phrase-extract/InternalStructFeature.cpp | 78 +- phrase-extract/InternalStructFeature.h | 33 +- phrase-extract/Jamfile | 12 +- phrase-extract/PhraseAlignment.cpp | 230 ----- phrase-extract/PhraseAlignment.h | 106 --- phrase-extract/ScoreFeature.cpp | 13 +- phrase-extract/ScoreFeature.h | 35 +- phrase-extract/ScoreFeatureTest.cpp | 17 +- phrase-extract/score-main.cpp | 806 ++++++++++-------- 14 files changed, 1050 insertions(+), 824 deletions(-) rename phrase-extract/{domain.cpp => DomainFeature.cpp} (85%) rename phrase-extract/{domain.h => DomainFeature.h} (93%) create mode 100644 phrase-extract/ExtractionPhrasePair.cpp create mode 100644 phrase-extract/ExtractionPhrasePair.h delete mode 100644 
phrase-extract/PhraseAlignment.cpp delete mode 100644 phrase-extract/PhraseAlignment.h diff --git a/moses/Jamfile b/moses/Jamfile index 76f115510..6af31edfa 100644 --- a/moses/Jamfile +++ b/moses/Jamfile @@ -12,6 +12,7 @@ if $(with-dlib) { alias headers : ../util//kenutil : : : $(max-factors) $(dlib) ; alias ThreadPool : ThreadPool.cpp ; +alias Util : Util.cpp Timer.cpp ; if [ option.get "with-synlm" : no : yes ] = yes { diff --git a/phrase-extract/domain.cpp b/phrase-extract/DomainFeature.cpp similarity index 85% rename from phrase-extract/domain.cpp rename to phrase-extract/DomainFeature.cpp index 67b4a13c3..2f99a8709 100644 --- a/phrase-extract/domain.cpp +++ b/phrase-extract/DomainFeature.cpp @@ -1,6 +1,5 @@ -// $Id$ -//#include "beammain.h" -#include "domain.h" +#include "DomainFeature.h" +#include "ExtractionPhrasePair.h" #include "tables-core.h" #include "InputFileStream.h" #include "SafeGetline.h" @@ -26,7 +25,7 @@ void Domain::load( const std::string &domainFileName ) int lineNumber; if (domainSpecLine.size() != 2 || ! 
sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) { - cerr << "ERROR: in domain specification line: '" << line << "'" << endl; + std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl; exit(1); } // store @@ -50,29 +49,34 @@ string Domain::getDomainOfSentence( int sentenceId ) const return "undefined"; } -DomainFeature::DomainFeature(const string& domainFile) +DomainFeature::DomainFeature(const string& domainFile) : m_propertyKey("domain") { //process domain file m_domain.load(domainFile); } +void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair, + float count, + int sentenceId) const +{ + std::string value = m_domain.getDomainOfSentence(sentenceId); + phrasePair.AddProperty(m_propertyKey, value, count); +} + void DomainFeature::add(const ScoreFeatureContext& context, std::vector& denseValues, std::map& sparseValues) const { - map< string, float > domainCount; - for(size_t i=0; isentenceId ); - if (domainCount.find( d ) == domainCount.end()) { - domainCount[d] = context.phrasePair[i]->count; - } else { - domainCount[d] += context.phrasePair[i]->count; - } - } - add(domainCount, context.count, context.maybeLog, denseValues, sparseValues); + const map *domainCount = context.phrasePair.GetProperty(m_propertyKey); + assert( domainCount != NULL ); + add(*domainCount, + context.phrasePair.GetCount(), + context.maybeLog, + denseValues, sparseValues); } -void SubsetDomainFeature::add(const map& domainCount,float count, +void SubsetDomainFeature::add(const map& domainCount, + float count, const MaybeLog& maybeLog, std::vector& denseValues, std::map& sparseValues) const @@ -152,7 +156,6 @@ void IndicatorDomainFeature::add(const map& domainCount,float coun denseValues.push_back(maybeLog(2.718)); } } - } void SparseIndicatorDomainFeature::add(const map& domainCount,float count, @@ -166,12 +169,5 @@ void SparseIndicatorDomainFeature::add(const map& domainCount,floa } } -bool DomainFeature::equals(const PhraseAlignment& lhs, 
const PhraseAlignment& rhs) const -{ - return m_domain.getDomainOfSentence(lhs.sentenceId) == - m_domain.getDomainOfSentence( rhs.sentenceId); -} - - } diff --git a/phrase-extract/domain.h b/phrase-extract/DomainFeature.h similarity index 93% rename from phrase-extract/domain.h rename to phrase-extract/DomainFeature.h index 279496e01..8ebc599e2 100644 --- a/phrase-extract/domain.h +++ b/phrase-extract/DomainFeature.h @@ -34,13 +34,17 @@ class DomainFeature : public ScoreFeature public: DomainFeature(const std::string& domainFile); - bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const; + + void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair, + float count, + int sentenceId) const; + void add(const ScoreFeatureContext& context, std::vector& denseValues, std::map& sparseValues) const; protected: - /** Overriden in subclass */ + /** Overridden in subclass */ virtual void add(const std::map& domainCounts, float count, const MaybeLog& maybeLog, std::vector& denseValues, @@ -49,6 +53,8 @@ protected: Domain m_domain; + const std::string m_propertyKey; + }; class SubsetDomainFeature : public DomainFeature diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp new file mode 100644 index 000000000..e2814f33c --- /dev/null +++ b/phrase-extract/ExtractionPhrasePair.cpp @@ -0,0 +1,327 @@ +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2009 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include +#include "ExtractionPhrasePair.h" +#include "SafeGetline.h" +#include "tables-core.h" +#include "score.h" +#include "moses/Util.h" + +#include + +using namespace std; + + +namespace MosesTraining { + + +extern Vocabulary vcbT; +extern Vocabulary vcbS; + +extern bool hierarchicalFlag; + + +ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource, + const PHRASE *phraseTarget, + ALIGNMENT *targetToSourceAlignment, + float count, float pcfgSum ) : + m_phraseSource(phraseSource), + m_phraseTarget(phraseTarget), + m_count(count), + m_pcfgSum(pcfgSum) +{ + assert(phraseSource.empty()); + assert(phraseTarget.empty()); + + m_count = count; + m_pcfgSum = pcfgSum; + + std::pair< std::map::iterator, bool > insertedAlignment = + m_targetToSourceAlignments.insert( std::pair(targetToSourceAlignment,count) ); + + m_lastTargetToSourceAlignment = insertedAlignment.first; + m_lastCount = m_count; + m_lastPcfgSum = m_pcfgSum; + + m_isValid = true; +} + + +ExtractionPhrasePair::~ExtractionPhrasePair( ) { + Clear(); +} + + +// return value: true if the given alignment was seen for the first time and thus will be stored, +// false if it was present already (the pointer may thus be deleted) +bool ExtractionPhrasePair::Add( ALIGNMENT *targetToSourceAlignment, + float count, float pcfgSum ) +{ + m_count += count; + m_pcfgSum += pcfgSum; + + m_lastCount = count; + m_lastPcfgSum = pcfgSum; + + std::map::iterator iter = m_lastTargetToSourceAlignment; + if ( *(iter->first) == *targetToSourceAlignment ) { + iter->second += count; + return false; + } else { + std::pair< std::map::iterator, bool > insertedAlignment = +
m_targetToSourceAlignments.insert( std::pair(targetToSourceAlignment,count) ); + if ( !insertedAlignment.second ) { + // the alignment already exists: increment count + insertedAlignment.first->second += count; + return false; + } + m_lastTargetToSourceAlignment = insertedAlignment.first; + } + + return true; +} + + +void ExtractionPhrasePair::IncrementPrevious( float count, float pcfgSum ) +{ + m_count += count; + m_pcfgSum += pcfgSum; + m_lastTargetToSourceAlignment->second += count; + // properties + for ( std::map >::iterator iter=m_properties.begin(); + iter !=m_properties.end(); ++iter ) { + LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second; + (*lastPropertyValue)->second += count; + } + + m_lastCount = count; + m_lastPcfgSum = pcfgSum; +} + + +// Check for lexical match +// and in case of SCFG rules for equal non-terminal alignment. +bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource, + const PHRASE *otherPhraseTarget, + ALIGNMENT *otherTargetToSourceAlignment ) const +{ + if (*otherPhraseTarget != *m_phraseTarget) { + return false; + } + if (*otherPhraseSource != *m_phraseSource) { + return false; + } + + return MatchesAlignment( otherTargetToSourceAlignment ); +} + +// Check for lexical match +// and in case of SCFG rules for equal non-terminal alignment. +// Set boolean indicators. +// (Note that we check in the order: source - target - alignment +// and do not touch the subsequent boolean indicators once a previous one has been set to false.)
+bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource, + const PHRASE *otherPhraseTarget, + ALIGNMENT *otherTargetToSourceAlignment, + bool &sourceMatch, + bool &targetMatch, + bool &alignmentMatch ) const +{ + if (*otherPhraseSource != *m_phraseSource) { + sourceMatch = false; + return false; + } else { + sourceMatch = true; + } + if (*otherPhraseTarget != *m_phraseTarget) { + targetMatch = false; + return false; + } else { + targetMatch = true; + } + if ( !MatchesAlignment(otherTargetToSourceAlignment) ) { + alignmentMatch = false; + return false; + } else { + alignmentMatch = true; + } + return true; +} + +// Check for equal non-terminal alignment in case of SCFG rules. +// Precondition: otherTargetToSourceAlignment has the same size as m_targetToSourceAlignments.begin()->first +bool ExtractionPhrasePair::MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const +{ + if (!hierarchicalFlag) return true; + + // all or none of the phrasePair's word alignment matrices match, so just pick one + const ALIGNMENT *thisTargetToSourceAlignment = m_targetToSourceAlignments.begin()->first; + + assert(m_phraseTarget->size() == thisTargetToSourceAlignment->size() + 1); + assert(thisTargetToSourceAlignment->size() == otherTargetToSourceAlignment->size()); + + // loop over all symbols but the left hand side of the rule + for (size_t i=0; isize()-1; ++i) { + if (isNonTerminal( vcbT.getWord( m_phraseTarget->at(i) ) )) { + size_t thisAlign = *(thisTargetToSourceAlignment->at(i).begin()); + size_t otherAlign = *(otherTargetToSourceAlignment->at(i).begin()); + + if (thisTargetToSourceAlignment->at(i).size() != 1 || + otherTargetToSourceAlignment->at(i).size() != 1 || + thisAlign != otherAlign) { + return false; + } + } + } + + return true; +} + +void ExtractionPhrasePair::Clear() +{ + delete m_phraseSource; + delete m_phraseTarget; + + m_count = 0.0f; + m_pcfgSum = 0.0f; + + for ( std::map::iterator iter=m_targetToSourceAlignments.begin(); + 
iter!=m_targetToSourceAlignments.end(); ++iter) { + delete iter->first; + } + m_targetToSourceAlignments.clear(); + + for ( std::map >::iterator iter=m_properties.begin(); + iter!=m_properties.end(); ++iter) { + delete (iter->second).second; + delete (iter->second).first; + } + m_properties.clear(); + + m_lastCount = 0.0f; + m_lastPcfgSum = 0.0f; + m_lastTargetToSourceAlignment = m_targetToSourceAlignments.begin(); + + m_isValid = false; +} + + +void ExtractionPhrasePair::AddProperties( const std::string &propertiesString, float count ) +{ + if (propertiesString.empty()) { + return; + } + + vector toks; + Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{"); + for (size_t i = 1; i < toks.size(); ++i) { + std::string &tok = toks[i]; + if (tok.empty()) { + continue; + } + size_t endPos = tok.rfind("}"); + tok = tok.substr(0, endPos - 1); + + vector keyValue = Moses::TokenizeFirstOnly(tok, " "); + assert(keyValue.size() == 2); + AddProperty(keyValue[0], keyValue[1], count); + } +} + + +const ALIGNMENT *ExtractionPhrasePair::FindBestAlignmentTargetToSource() const +{ + float bestAlignmentCount = -1; + + std::map::const_iterator bestAlignment = m_targetToSourceAlignments.end(); + + for (std::map::const_iterator iter=m_targetToSourceAlignments.begin(); + iter!=m_targetToSourceAlignments.end(); ++iter) { + if ( (iter->second > bestAlignmentCount) || + ( (iter->second == bestAlignmentCount) && + (*(iter->first) > *(bestAlignment->first)) ) ) { + bestAlignmentCount = iter->second; + bestAlignment = iter; + } + } + + if ( bestAlignment == m_targetToSourceAlignments.end()) { + return NULL; + } + + return bestAlignment->first; +} + + +const std::string *ExtractionPhrasePair::FindBestPropertyValue(const std::string &key) const +{ + float bestPropertyCount = -1; + + const PROPERTY_VALUES *allPropertyValues = GetProperty( key ); + if ( allPropertyValues == NULL ) { + return NULL; + } + + PROPERTY_VALUES::const_iterator bestPropertyValue = allPropertyValues->end(); + + 
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin(); + iter!=allPropertyValues->end(); ++iter) { + if ( (iter->second > bestPropertyCount) || + ( (iter->second == bestPropertyCount) && + (iter->first > bestPropertyValue->first) ) ) { + bestPropertyCount = iter->second; + bestPropertyValue = iter; + } + } + + if ( bestPropertyValue == allPropertyValues->end()) { + return NULL; + } + + return &(bestPropertyValue->first); +} + + +std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &key) const +{ + const PROPERTY_VALUES *allPropertyValues = GetProperty( key ); + + if ( allPropertyValues == NULL ) { + return ""; + } + + std::ostringstream oss; + for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin(); + iter!=allPropertyValues->end(); ++iter) { + if (iter!=allPropertyValues->begin()) { + oss << " "; + } + oss << iter->first; + oss << " "; + oss << iter->second; + } + + std::string allPropertyValuesString(oss.str()); + return allPropertyValuesString; +} + + +} + diff --git a/phrase-extract/ExtractionPhrasePair.h b/phrase-extract/ExtractionPhrasePair.h new file mode 100644 index 000000000..f04984391 --- /dev/null +++ b/phrase-extract/ExtractionPhrasePair.h @@ -0,0 +1,162 @@ +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2009 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once +#include "tables-core.h" + +#include +#include +#include + +namespace MosesTraining { + + +typedef std::vector< std::set > ALIGNMENT; + + +class ExtractionPhrasePair { + +protected: + + typedef std::map PROPERTY_VALUES; + typedef std::map::iterator LAST_PROPERTY_VALUE; + + + bool m_isValid; + + const PHRASE *m_phraseSource; + const PHRASE *m_phraseTarget; + + float m_count; + float m_pcfgSum; + + std::map m_targetToSourceAlignments; + std::map > m_properties; + + float m_lastCount; + float m_lastPcfgSum; + std::map::iterator m_lastTargetToSourceAlignment; + +public: + + ExtractionPhrasePair( const PHRASE *phraseSource, + const PHRASE *phraseTarget, + ALIGNMENT *targetToSourceAlignment, + float count, float pcfgSum ); + + ~ExtractionPhrasePair(); + + bool Add( ALIGNMENT *targetToSourceAlignment, + float count, float pcfgSum ); + + void IncrementPrevious( float count, float pcfgSum ); + + bool Matches( const PHRASE *otherPhraseSource, + const PHRASE *otherPhraseTarget, + ALIGNMENT *otherTargetToSourceAlignment ) const; + + bool Matches( const PHRASE *otherPhraseSource, + const PHRASE *otherPhraseTarget, + ALIGNMENT *otherTargetToSourceAlignment, + bool &sourceMatch, + bool &targetMatch, + bool &alignmentMatch ) const; + + bool MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const; + + void Clear(); + + bool IsValid() const { + return m_isValid; + } + + + const PHRASE *GetSource() const { + return m_phraseSource; + } + + const PHRASE *GetTarget() const { + return m_phraseTarget; + } + + float GetCount() const { + return m_count; + } + + float GetPcfgScore() const { + return m_pcfgSum; + } + + const size_t GetNumberOfProperties() const { + return 
m_properties.size(); + } + + const std::map *GetProperty( const std::string &key ) const { + std::map >::const_iterator iter; + iter = m_properties.find(key); + if (iter == m_properties.end()) { + return NULL; + } else { + return iter->second.first; + } + } + + const ALIGNMENT *FindBestAlignmentTargetToSource() const; + + const std::string *FindBestPropertyValue(const std::string &key) const; + + std::string CollectAllPropertyValues(const std::string &key) const; + + void AddProperties( const std::string &str, float count ); + + void AddProperty( const std::string &key, const std::string &value, float count ) + { + std::map >::iterator iter = m_properties.find(key); + if ( iter == m_properties.end() ) { + // key not found: insert property key and value + PROPERTY_VALUES *propertyValues = new PROPERTY_VALUES(); + std::pair insertedProperty = propertyValues->insert( std::pair(value,count) ); + LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first); + m_properties[key] = std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* >(propertyValues, lastPropertyValue); + } else { + LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second; + if ( (*lastPropertyValue)->first == value ) { // same property key-value pair has been seen right before + // property key-value pair exists already: add count + (*lastPropertyValue)->second += count; + } else { // need to check whether the property key-value pair has appeared before (insert if not) + // property key exists, but not in combination with this value: + // add new value with count + PROPERTY_VALUES *propertyValues = (iter->second).first; + std::pair insertedProperty = propertyValues->insert( std::pair(value,count) ); + if ( !insertedProperty.second ) { // property value for this key appeared before: add count + insertedProperty.first->second += count; + } + LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first); + delete (iter->second).second; + 
(iter->second).second = lastPropertyValue; + } + } + } + +}; + +} + diff --git a/phrase-extract/InternalStructFeature.cpp b/phrase-extract/InternalStructFeature.cpp index e0e9fd3e2..3757b0e43 100644 --- a/phrase-extract/InternalStructFeature.cpp +++ b/phrase-extract/InternalStructFeature.cpp @@ -1,50 +1,30 @@ #include "InternalStructFeature.h" +#include using namespace std; namespace MosesTraining { -InternalStructFeature::InternalStructFeature() - :m_type(0){ - //cout<<"InternalStructFeature: Construct "< if the dense score is the same - //-> if the sparse feature is set - // compare phrases? with the internalStrucutre string? - /** Return true if the two phrase pairs are equal from the point of this feature. Assume - that they already compare true according to PhraseAlignment.equals() - **/ - -/* if(lhs.ghkmParse==rhs.ghkmParse) - return true; - else - return false; -*/ - //return true; -} - void InternalStructFeature::add(const ScoreFeatureContext& context, - std::vector& denseValues, - std::map& sparseValues) const{ - for(size_t i=0; itreeFragment, denseValues, sparseValues); - } - + std::vector& denseValues, + std::map& sparseValues) const { + const std::map *allTrees = context.phrasePair.GetProperty("Tree"); // or would we rather want to take the most frequent one only?
+ for ( std::map::const_iterator iter=allTrees->begin(); + iter!=allTrees->end(); ++iter ) { + add(&(iter->first), iter->second, denseValues, sparseValues); + } } -void InternalStructFeatureDense::add(std::string *internalStruct, - std::vector& denseValues, - std::map& sparseValues) const{ +void InternalStructFeatureDense::add(const std::string *treeFragment, + float count, + std::vector& denseValues, + std::map& sparseValues) const { //cout<<"Dense: "<<*internalStruct<find("NP", start)) != string::npos) { - countNP++; + while((start = treeFragment->find("NP", start)) != string::npos) { + countNP += count; start+=2; //length of "NP" } //should add e^countNP so in the decoder I get log(e^countNP)=countNP -> but is log or ln? @@ -53,21 +33,21 @@ void InternalStructFeatureDense::add(std::string *internalStruct, } -void InternalStructFeatureSparse::add(std::string *internalStruct, - std::vector& denseValues, - std::map& sparseValues) const{ - //cout<<"Sparse: "<<*internalStruct<find("VBZ")!=std::string::npos) - sparseValues["NTVBZ"] = 1; - if(internalStruct->find("VBD")!=std::string::npos) - sparseValues["NTVBD"] = 1; - if(internalStruct->find("VBP")!=std::string::npos) - sparseValues["NTVBP"] = 1; - if(internalStruct->find("PP")!=std::string::npos) - sparseValues["NTPP"] = 1; - if(internalStruct->find("SBAR")!=std::string::npos) - sparseValues["NTSBAR"] = 1; - +void InternalStructFeatureSparse::add(const std::string *treeFragment, + float count, + std::vector& denseValues, + std::map& sparseValues) const { + //cout<<"Sparse: "<<*internalStruct<find("VBZ")!=std::string::npos) + sparseValues["NTVBZ"] += count; + if(treeFragment->find("VBD")!=std::string::npos) + sparseValues["NTVBD"] += count; + if(treeFragment->find("VBP")!=std::string::npos) + sparseValues["NTVBP"] += count; + if(treeFragment->find("PP")!=std::string::npos) + sparseValues["NTPP"] += count; + if(treeFragment->find("SBAR")!=std::string::npos) + sparseValues["NTSBAR"] += count; } diff --git 
a/phrase-extract/InternalStructFeature.h b/phrase-extract/InternalStructFeature.h index bd513a715..7a6efec1d 100644 --- a/phrase-extract/InternalStructFeature.h +++ b/phrase-extract/InternalStructFeature.h @@ -21,22 +21,19 @@ namespace MosesTraining class InternalStructFeature : public ScoreFeature { public: - InternalStructFeature(); - /** Return true if the two phrase pairs are equal from the point of this feature. Assume - that they already compare true according to PhraseAlignment.equals() - **/ - bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const; + InternalStructFeature() : m_type(0) {}; /** Add the values for this feature function. */ void add(const ScoreFeatureContext& context, - std::vector& denseValues, - std::map& sparseValues) const; + std::vector& denseValues, + std::map& sparseValues) const; protected: - /** Overriden in subclass */ - virtual void add(std::string *internalStruct, - std::vector& denseValues, - std::map& sparseValues) const = 0; + /** Overridden in subclass */ + virtual void add(const std::string *treeFragment, + float count, + std::vector& denseValues, + std::map& sparseValues) const = 0; int m_type; }; @@ -47,9 +44,10 @@ public: InternalStructFeatureDense() :InternalStructFeature(){m_type=1;} //std::cout<<"InternalStructFeatureDense: Construct "<& denseValues, - std::map& sparseValues) const; + virtual void add(const std::string *treeFragment, + float count, + std::vector& denseValues, + std::map& sparseValues) const; }; class InternalStructFeatureSparse : public InternalStructFeature @@ -58,9 +56,10 @@ public: InternalStructFeatureSparse() :InternalStructFeature(){m_type=2;}// std::cout<<"InternalStructFeatureSparse: Construct "<& denseValues, - std::map& sparseValues) const; + virtual void add(const std::string *treeFragment, + float count, + std::vector& denseValues, + std::map& sparseValues) const; }; } diff --git a/phrase-extract/Jamfile b/phrase-extract/Jamfile index e66ecb9b3..50fed2973 100644 --- 
a/phrase-extract/Jamfile +++ b/phrase-extract/Jamfile @@ -1,19 +1,19 @@ -local most-deps = [ glob *.cpp : PhraseAlignment.cpp *Test.cpp *-main.cpp ] ; +local most-deps = [ glob *.cpp : ExtractionPhrasePair.cpp *Test.cpp *-main.cpp ] ; #Build .o files with include path setting, reused. for local d in $(most-deps) { obj $(d:B).o : $(d) ; } #and stuff them into an alias. -alias deps : $(most-deps:B).o ..//z ..//boost_iostreams ../moses//ThreadPool ../util//kenutil ; +alias deps : $(most-deps:B).o ..//z ..//boost_iostreams ../moses//ThreadPool ../moses//Util ../util//kenutil ; -#PhraseAlignment.cpp requires that main define some global variables. +#ExtractionPhrasePair.cpp requires that main define some global variables. #Build the mains that do not need these global variables. for local m in [ glob *-main.cpp : score-main.cpp ] { exe [ MATCH "(.*)-main.cpp" : $(m) ] : $(m) deps ; } -#The side dishes that use PhraseAlignment.cpp -exe score : PhraseAlignment.cpp score-main.cpp deps ; +#The side dishes that use ExtractionPhrasePair.cpp +exe score : ExtractionPhrasePair.cpp score-main.cpp deps ; import testing ; -run ScoreFeatureTest.cpp PhraseAlignment.cpp deps ..//boost_unit_test_framework ..//boost_iostreams : : test.domain ; +run ScoreFeatureTest.cpp ExtractionPhrasePair.cpp deps ..//boost_unit_test_framework ..//boost_iostreams : : test.domain ; diff --git a/phrase-extract/PhraseAlignment.cpp b/phrase-extract/PhraseAlignment.cpp deleted file mode 100644 index 075e1b18c..000000000 --- a/phrase-extract/PhraseAlignment.cpp +++ /dev/null @@ -1,230 +0,0 @@ -/* - * PhraseAlignment.cpp - * extract - * - * Created by Hieu Hoang on 28/07/2010. - * Copyright 2010 __MyCompanyName__. All rights reserved. 
- * - */ - -#include -#include "PhraseAlignment.h" -#include "SafeGetline.h" -#include "tables-core.h" -#include "score.h" - -#include - -using namespace std; - -namespace MosesTraining -{ - -extern Vocabulary vcbT; -extern Vocabulary vcbS; - -extern bool hierarchicalFlag; - -//! convert string to variable of type T. Used to reading floats, int etc from files -template -inline T Scan(const std::string &input) -{ - std::stringstream stream(input); - T ret; - stream >> ret; - return ret; -} - - -//! speeded up version of above -template -inline void Scan(std::vector &output, const std::vector< std::string > &input) -{ - output.resize(input.size()); - for (size_t i = 0 ; i < input.size() ; i++) { - output[i] = Scan( input[i] ); - } -} - - -inline void Tokenize(std::vector &output - , const std::string& str - , const std::string& delimiters = " \t") -{ - // Skip delimiters at beginning. - std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); - // Find first "non-delimiter". - std::string::size_type pos = str.find_first_of(delimiters, lastPos); - - while (std::string::npos != pos || std::string::npos != lastPos) { - // Found a token, add it to the vector. - output.push_back(str.substr(lastPos, pos - lastPos)); - // Skip delimiters. 
Note the "not_of" - lastPos = str.find_first_not_of(delimiters, pos); - // Find next "non-delimiter" - pos = str.find_first_of(delimiters, lastPos); - } -} - -// speeded up version of above -template -inline void Tokenize( std::vector &output - , const std::string &input - , const std::string& delimiters = " \t") -{ - std::vector stringVector; - Tokenize(stringVector, input, delimiters); - return Scan(output, stringVector ); -} - -// read in a phrase pair and store it -void PhraseAlignment::create( char line[], int lineID, bool includeSentenceIdFlag ) -{ - assert(phraseS.empty()); - assert(phraseT.empty()); - treeFragment.clear(); - - vector< string > token = tokenize( line ); - int item = 1; - for (size_t j=0; j= phraseT.size() || (size_t)s >= phraseS.size()) { - cerr << "WARNING: phrase pair " << lineID - << " has alignment point (" << s << ", " << t - << ") out of bounds (" << phraseS.size() << ", " << phraseT.size() << ")\n"; - } else { - // first alignment point? -> initialize - createAlignVec(phraseS.size(), phraseT.size()); - - // add alignment point - alignedToT[t].insert( s ); - alignedToS[s].insert( t ); - } - } else if ( (item >= 4) && (token[j] == "Tree") ) { // check for information with a key field - ++j; - while ( (j < token.size() ) && (token[j] != "|||") ) { - treeFragment.append(" "); - treeFragment.append(token[j]); - ++j; - } - --j; - } else if (includeSentenceIdFlag && item == 4) { // optional sentence id - sscanf(token[j].c_str(), "%d", &sentenceId); - } else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count - sscanf(token[j].c_str(), "%f", &count); - } else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score - float pcfgScore = std::atof(token[j].c_str()); - pcfgSum = pcfgScore * count; - } - } - - createAlignVec(phraseS.size(), phraseT.size()); - - if (item + (includeSentenceIdFlag?-1:0) == 3) { - count = 1.0; - } - if (item < 3 || item > 6) { - cerr << "ERROR: faulty line " << lineID << ": " << line << endl; - 
} -} - -void PhraseAlignment::createAlignVec(size_t sourceSize, size_t targetSize) -{ - // in case of no align info. always need align info, even if blank - if (alignedToT.size() == 0) { - size_t numTgtSymbols = (hierarchicalFlag ? targetSize-1 : targetSize); - alignedToT.resize(numTgtSymbols); - } - - if (alignedToS.size() == 0) { - size_t numSrcSymbols = (hierarchicalFlag ? sourceSize-1 : sourceSize); - alignedToS.resize(numSrcSymbols); - } -} - -void PhraseAlignment::clear() -{ - phraseS.clear(); - phraseT.clear(); - alignedToT.clear(); - alignedToS.clear(); -} - -// check if two word alignments between a phrase pair are the same -bool PhraseAlignment::equals( const PhraseAlignment& other ) -{ - if (this == &other) return true; - if (other.GetTarget() != GetTarget()) return false; - if (other.GetSource() != GetSource()) return false; - if (other.alignedToT != alignedToT) return false; - if (other.alignedToS != alignedToS) return false; - return true; -} - -// check if two word alignments between a phrase pairs "match" -// i.e. 
they do not differ in the alignment of non-termimals -bool PhraseAlignment::match( const PhraseAlignment& other ) -{ - if (this == &other) return true; - if (other.GetTarget() != GetTarget()) return false; - if (other.GetSource() != GetSource()) return false; - if (!hierarchicalFlag) return true; - - assert(phraseT.size() == alignedToT.size() + 1); - assert(alignedToT.size() == other.alignedToT.size()); - - // loop over all words (note: 0 = left hand side of rule) - for(size_t i=0; i -#include - -namespace MosesTraining -{ - -// data structure for a single phrase pair -class PhraseAlignment -{ -protected: - PHRASE phraseS; - PHRASE phraseT; - - void createAlignVec(size_t sourceSize, size_t targetSize); - void addNTLength(const std::string &tok); -public: - float pcfgSum; - float count; - int sentenceId; - std::string domain; - std::string treeFragment; - - std::vector< std::set > alignedToT; - std::vector< std::set > alignedToS; - - void create( char*, int, bool ); - void clear(); - bool equals( const PhraseAlignment& ); - bool match( const PhraseAlignment& ); - - int Compare(const PhraseAlignment &compare) const; - inline bool operator<(const PhraseAlignment &compare) const { - return Compare(compare) < 0; - } - - const PHRASE &GetSource() const { - return phraseS; - } - const PHRASE &GetTarget() const { - return phraseT; - } -}; - -class PhraseAlignment; - -typedef std::vector PhraseAlignmentCollection; -//typedef std::vector PhrasePairGroup; - -class PhraseAlignmentCollectionOrderer -{ -public: - bool operator()(const PhraseAlignmentCollection &collA, const PhraseAlignmentCollection &collB) const { - assert(collA.size() > 0); - assert(collB.size() > 0); - - const PhraseAlignment &objA = *collA[0]; - const PhraseAlignment &objB = *collB[0]; - bool ret = objA < objB; - - return ret; - } -}; - - -//typedef std::set PhrasePairGroup; - -class PhrasePairGroup -{ -private: - typedef std::set Coll; - Coll m_coll; - - -public: - typedef Coll::iterator iterator; - typedef 
Coll::const_iterator const_iterator; - typedef std::vector SortedColl; - - std::pair insert ( const PhraseAlignmentCollection& obj ); - - const SortedColl &GetSortedColl() const { - return m_sortedColl; - } - size_t GetSize() const { - return m_coll.size(); - } - -private: - SortedColl m_sortedColl; - -}; - - -} - diff --git a/phrase-extract/ScoreFeature.cpp b/phrase-extract/ScoreFeature.cpp index f98759755..52157a8cb 100644 --- a/phrase-extract/ScoreFeature.cpp +++ b/phrase-extract/ScoreFeature.cpp @@ -18,7 +18,7 @@ ***********************************************************************/ #include "ScoreFeature.h" -#include "domain.h" +#include "DomainFeature.h" #include "InternalStructFeature.h" using namespace std; @@ -77,10 +77,10 @@ void ScoreFeatureManager::configure(const std::vector args) } sparseDomainAdded = true; m_includeSentenceId = true; - } else if(args[i] == "--GHKMFeatureSparse"){ + } else if(args[i] == "--TreeFeatureSparse"){ //MARIA m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureSparse())); - } else if(args[i] == "--GHKMFeatureDense"){ + } else if(args[i] == "--TreeFeatureDense"){ //MARIA m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureDense())); } else { @@ -91,12 +91,13 @@ void ScoreFeatureManager::configure(const std::vector args) } -bool ScoreFeatureManager::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const +void ScoreFeatureManager::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair, + float count, + int sentenceId) const { for (size_t i = 0; i < m_features.size(); ++i) { - if (!m_features[i]->equals(lhs,rhs)) return false; + m_features[i]->addPropertiesToPhrasePair(phrasePair, count, sentenceId); } - return true; } void ScoreFeatureManager::addFeatures(const ScoreFeatureContext& context, diff --git a/phrase-extract/ScoreFeature.h b/phrase-extract/ScoreFeature.h index 5823f21ce..926397e71 100644 --- a/phrase-extract/ScoreFeature.h +++ b/phrase-extract/ScoreFeature.h @@ -35,7 +35,7 @@ 
#include "util/exception.hh" -#include "PhraseAlignment.h" +#include "ExtractionPhrasePair.h" namespace MosesTraining { @@ -64,17 +64,14 @@ public: /** Passed to each feature to be used to calculate its values */ struct ScoreFeatureContext { ScoreFeatureContext( - const PhraseAlignmentCollection &thePhrasePair, - float theCount, /* Total counts of all phrase pairs*/ + const ExtractionPhrasePair &thePhrasePair, const MaybeLog& theMaybeLog ) : phrasePair(thePhrasePair), - count(theCount), maybeLog(theMaybeLog) { } - const PhraseAlignmentCollection& phrasePair; - float count; + const ExtractionPhrasePair &phrasePair; MaybeLog maybeLog; }; @@ -85,16 +82,19 @@ struct ScoreFeatureContext { class ScoreFeature { public: + + /** Some features might need to store properties in ExtractionPhrasePair, + * e.g. to pass along external information loaded by a feature + * which may distinguish several phrase occurrences based on sentence ID */ + virtual void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair, + float count, + int sentenceId) const {}; + /** Add the values for this feature function. */ virtual void add(const ScoreFeatureContext& context, std::vector& denseValues, std::map& sparseValues) const = 0; - /** Return true if the two phrase pairs are equal from the point of this feature. Assume - that they already compare true according to PhraseAlignment.equals() - **/ - virtual bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const = 0; - virtual ~ScoreFeature() {} }; @@ -112,17 +112,18 @@ public: /** Pass the unused command-line arguments to configure the extra features */ void configure(const std::vector args); + /** Some features might need to store properties in ExtractionPhrasePair, + * e.g. 
to pass along external information loaded by a feature + * which may distinguish several phrase occurrences based on sentence ID */ + void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair, + float count, + int sentenceId) const; + /** Add all the features */ void addFeatures(const ScoreFeatureContext& context, std::vector& denseValues, std::map& sparseValues) const; - /** - * Used to tell if the PhraseAlignment should be considered the same by all - * extended features. - **/ - bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const; - const std::vector& getFeatures() const { return m_features; } diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 65f1f4437..534ab177b 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -17,7 +17,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ -#include "domain.h" +#include "DomainFeature.h" #include "ScoreFeature.h" #include "tables-core.h" @@ -93,18 +93,3 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) (boost::assign::list_of("--SparseDomainSubset")("/dev/null")); } - -BOOST_AUTO_TEST_CASE(domain_equals) -{ - SubsetDomainFeature feature(DomainFileLocation()); - PhraseAlignment a1,a2,a3; - char buf1[] = "a ||| b ||| 0-0 ||| 1"; - char buf2[] = "a ||| b ||| 0-0 ||| 2"; - char buf3[] = "a ||| b ||| 0-0 ||| 3"; - a1.create(buf1, 0, true); //domain a - a2.create(buf2, 1, true); //domain c - a3.create(buf3, 2, true); //domain c - BOOST_CHECK(feature.equals(a2,a3)); - BOOST_CHECK(!feature.equals(a1,a3)); - BOOST_CHECK(!feature.equals(a1,a3)); -} diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index d0305fad2..65a12d176 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -21,22 +21,21 @@ #include #include #include -#include #include #include #include +#include #include 
+#include #include #include "SafeGetline.h" #include "ScoreFeature.h" #include "tables-core.h" -#include "domain.h" -#include "PhraseAlignment.h" +#include "ExtractionPhrasePair.h" #include "score.h" #include "InputFileStream.h" #include "OutputFileStream.h" -#include "InternalStructFeature.h" using namespace std; using namespace MosesTraining; @@ -61,111 +60,120 @@ int negLogProb = 1; bool lexFlag = true; bool unalignedFlag = false; bool unalignedFWFlag = false; -bool singletonFeature = false; bool crossedNonTerm = false; int countOfCounts[COC_MAX+1]; int totalDistinct = 0; float minCountHierarchical = 0; +std::map sourceLHSCounts; +std::map* > targetLHSAndSourceLHSJointCounts; + +std::set sourceLabelSet; +std::map sourceLabels; +std::vector sourceLabelsByIndex; Vocabulary vcbT; Vocabulary vcbS; } // namespace -vector tokenize( const char [] ); +std::vector tokenize( const char [] ); -void writeCountOfCounts( const string &fileNameCountOfCounts ); -void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLog); -const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair ); -const std::string &findBestTreeFragment(const PhraseAlignmentCollection &phrasePair ); -void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLog ); -double computeLexicalTranslation( const PHRASE &, const PHRASE &, const PhraseAlignment & ); -double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & ); -set functionWordList; +void processLine( std::string line, + int lineID, bool includeSentenceIdFlag, int &sentenceId, + PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment, + std::string &additionalPropertiesString, + float &count, float &pcfgSum ); +void writeCountOfCounts( 
const std::string &fileNameCountOfCounts ); +void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile, + const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb ); +void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog ); +double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ); +double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource ); +set functionWordList; void loadFunctionWords( const string &fileNameFunctionWords ); -double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & ); -void printSourcePhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &); -void printTargetPhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &); +double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ); +int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ); +void printSourcePhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out ); +void printTargetPhrase( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *targetToSourceAlignment, ostream &out ); +void invertAlignment( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment ); + int main(int argc, char* argv[]) { - cerr << "Score v2.0 written by Philipp Koehn\n" - << "scoring methods for extracted rules\n"; + std::cerr << "Score v2.1 -- " + << "scoring methods for extracted rules" << std::endl; ScoreFeatureManager featureManager; if (argc < 4) { - cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] 
[--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n"; - cerr << featureManager.usage() << endl; + std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl; + std::cerr << featureManager.usage() << std::endl; exit(1); } - string fileNameExtract = argv[1]; - string fileNameLex = argv[2]; - string fileNamePhraseTable = argv[3]; - string fileNameCountOfCounts; - char* fileNameFunctionWords = NULL; - vector featureArgs; //all unknown args passed to feature manager + std::string fileNameExtract = argv[1]; + std::string fileNameLex = argv[2]; + std::string fileNamePhraseTable = argv[3]; + std::string fileNameCountOfCounts; + std::string fileNameFunctionWords; + std::vector featureArgs; // all unknown args passed to feature manager for(int i=4; iOpen(fileNamePhraseTable); if (!success) { - cerr << "ERROR: could not open file phrase table file " - << fileNamePhraseTable << endl; + std::cerr << "ERROR: could not open file phrase table file " + << fileNamePhraseTable << std::endl; exit(1); } phraseTableFile = outputFile; } // loop through all extracted phrase translations - float lastCount = 0.0f; - float lastPcfgSum = 0.0f; - vector< PhraseAlignment > phrasePairsWithSameF; - bool isSingleton = true; - int i=0; - char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; + char line[LINE_MAX_LENGTH], lastLine[LINE_MAX_LENGTH]; lastLine[0] = '\0'; - PhraseAlignment *lastPhrasePair = NULL; - while(true) { - if 
(extractFileP.eof()) break; - if (++i % 100000 == 0) cerr << "." << flush; - SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); - if (extractFileP.eof()) break; + ExtractionPhrasePair *phrasePair = NULL; + std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource; + std::vector< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible + + int tmpSentenceId; + PHRASE *tmpPhraseSource, *tmpPhraseTarget; + ALIGNMENT *tmpTargetToSourceAlignment; + std::string tmpAdditionalPropertiesString; + float tmpCount=0.0f, tmpPcfgSum=0.0f; + + int i=0; + SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ ); + if ( !extractFileP.eof() ) { + ++i; + tmpPhraseSource = new PHRASE(); + tmpPhraseTarget = new PHRASE(); + tmpTargetToSourceAlignment = new ALIGNMENT(); + processLine( std::string(line), + i, featureManager.includeSentenceId(), tmpSentenceId, + tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment, + tmpAdditionalPropertiesString, + tmpCount, tmpPcfgSum); + phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget, + tmpTargetToSourceAlignment, + tmpCount, tmpPcfgSum ); + phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount ); + featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId ); + phrasePairsWithSameSource.push_back( phrasePair ); + if ( hierarchicalFlag ) { + phrasePairsWithSameSourceAndTarget.push_back( phrasePair ); + } + strcpy( lastLine, line ); + SAFE_GETLINE( (extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__ ); + } + + while ( !extractFileP.eof() ) { + + if ( ++i % 100000 == 0 ) { + std::cerr << "." << std::flush; + } // identical to last line? 
just add count if (strcmp(line,lastLine) == 0) { - lastPhrasePair->count += lastCount; - lastPhrasePair->pcfgSum += lastPcfgSum; + phrasePair->IncrementPrevious(tmpCount,tmpPcfgSum); + SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); continue; - } - strcpy( lastLine, line ); - - // create new phrase pair - PhraseAlignment phrasePair; - phrasePair.create( line, i, featureManager.includeSentenceId()); - lastCount = phrasePair.count; - lastPcfgSum = phrasePair.pcfgSum; - - // only differs in count? just add count - if (lastPhrasePair != NULL - && lastPhrasePair->equals( phrasePair ) - && featureManager.equals(*lastPhrasePair, phrasePair)) { - lastPhrasePair->count += phrasePair.count; - lastPhrasePair->pcfgSum += phrasePair.pcfgSum; - continue; - } - - // if new source phrase, process last batch - if (lastPhrasePair != NULL && - lastPhrasePair->GetSource() != phrasePair.GetSource()) { - processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton, featureManager, maybeLogProb ); - - phrasePairsWithSameF.clear(); - isSingleton = false; - lastPhrasePair = NULL; } else { - isSingleton = true; + strcpy( lastLine, line ); } - // add phrase pairs to list, it's now the last one - phrasePairsWithSameF.push_back( phrasePair ); - lastPhrasePair = &phrasePairsWithSameF.back(); + tmpPhraseSource = new PHRASE(); + tmpPhraseTarget = new PHRASE(); + tmpTargetToSourceAlignment = new ALIGNMENT(); + tmpAdditionalPropertiesString.clear(); + processLine( std::string(line), + i, featureManager.includeSentenceId(), tmpSentenceId, + tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment, + tmpAdditionalPropertiesString, + tmpCount, tmpPcfgSum); + + bool matchesPrevious = false; + bool sourceMatch = true; bool targetMatch = true; bool alignmentMatch = true; // be careful with these, + // ExtractionPhrasePair::Matches() checks them in order and does not continue with the others + // once the first of them has been found to have to be set to false + + if ( 
hierarchicalFlag ) { + for ( std::vector< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin(); + iter != phrasePairsWithSameSourceAndTarget.end(); ++iter ) { + if ( (*iter)->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment, + sourceMatch, targetMatch, alignmentMatch ) ) { + matchesPrevious = true; + phrasePair = (*iter); + break; + } + } + } else { + if ( phrasePair->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment, + sourceMatch, targetMatch, alignmentMatch ) ) { + matchesPrevious = true; + } + } + + if ( matchesPrevious ) { + delete tmpPhraseSource; + delete tmpPhraseTarget; + if ( !phrasePair->Add( tmpTargetToSourceAlignment, + tmpCount, tmpPcfgSum ) ) { + delete tmpTargetToSourceAlignment; + } + phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount ); + featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId ); + } else { + + if ( !phrasePairsWithSameSource.empty() && + !sourceMatch ) { + processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb ); + for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); + iter!=phrasePairsWithSameSource.end(); ++iter) { + delete *iter; + } + phrasePairsWithSameSource.clear(); + if ( hierarchicalFlag ) { + phrasePairsWithSameSourceAndTarget.clear(); + } + } + + if ( hierarchicalFlag ) { + if ( !phrasePairsWithSameSourceAndTarget.empty() && + !targetMatch ) { + phrasePairsWithSameSourceAndTarget.clear(); + } + } + + phrasePair = new ExtractionPhrasePair( tmpPhraseSource, tmpPhraseTarget, + tmpTargetToSourceAlignment, + tmpCount, tmpPcfgSum ); + phrasePair->AddProperties( tmpAdditionalPropertiesString, tmpCount ); + featureManager.addPropertiesToPhrasePair( *phrasePair, tmpCount, tmpSentenceId ); + phrasePairsWithSameSource.push_back(phrasePair); + + if ( hierarchicalFlag ) { + phrasePairsWithSameSourceAndTarget.push_back(phrasePair); + 
} + } + + SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); + } - processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton, featureManager, maybeLogProb ); + + processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb ); + for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); + iter!=phrasePairsWithSameSource.end(); ++iter) { + delete *iter; + } + phrasePairsWithSameSource.clear(); + phraseTableFile->flush(); - if (phraseTableFile != &cout) { + if (phraseTableFile != &std::cout) { delete phraseTableFile; } @@ -286,177 +382,134 @@ int main(int argc, char* argv[]) } } + +void processLine( std::string line, + int lineID, bool includeSentenceIdFlag, int &sentenceId, + PHRASE *phraseSource, PHRASE *phraseTarget, ALIGNMENT *targetToSourceAlignment, + std::string &additionalPropertiesString, + float &count, float &pcfgSum ) +{ + size_t foundAdditionalProperties = line.find("{{"); + if (foundAdditionalProperties != std::string::npos) { + additionalPropertiesString = line.substr(foundAdditionalProperties); + line = line.substr(0,foundAdditionalProperties); + } else { + additionalPropertiesString.clear(); + } + + phraseSource->clear(); + phraseTarget->clear(); + targetToSourceAlignment->clear(); + + std::vector token = tokenize( line.c_str() ); + int item = 1; + for ( size_t j=0; jpush_back( vcbS.storeIfNew( token[j] ) ); + } else if (item == 2) { // target phrase + phraseTarget->push_back( vcbT.storeIfNew( token[j] ) ); + } else if (item == 3) { // alignment + int s,t; + sscanf(token[j].c_str(), "%d-%d", &s, &t); + if ((size_t)t >= phraseTarget->size() || (size_t)s >= phraseSource->size()) { + std::cerr << "WARNING: phrase pair " << lineID + << " has alignment point (" << s << ", " << t << ")" + << " out of bounds (" << phraseSource->size() << ", " << phraseTarget->size() << ")" + << std::endl; + } else { + // first alignment point? 
-> initialize + if ( targetToSourceAlignment->size() == 0 ) { + size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size()); + targetToSourceAlignment->resize(numberOfTargetSymbols); + } + // add alignment point + targetToSourceAlignment->at(t).insert(s); + } + } else if (includeSentenceIdFlag && item == 4) { // optional sentence id + sscanf(token[j].c_str(), "%d", &sentenceId); + } else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count + sscanf(token[j].c_str(), "%f", &count); + } else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score + float pcfgScore = std::atof(token[j].c_str()); + pcfgSum = pcfgScore * count; + } + } + + if ( targetToSourceAlignment->size() == 0 ) { + size_t numberOfTargetSymbols = (hierarchicalFlag ? phraseTarget->size()-1 : phraseTarget->size()); + targetToSourceAlignment->resize(numberOfTargetSymbols); + } + + if (item + (includeSentenceIdFlag?-1:0) == 3) { + count = 1.0; + } + if (item < 3 || item > 6) { + std::cerr << "ERROR: faulty line " << lineID << ": " << line << endl; + } + +} + + void writeCountOfCounts( const string &fileNameCountOfCounts ) { // open file Moses::OutputFileStream countOfCountsFile; bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str()); if (!success) { - cerr << "ERROR: could not open count-of-counts file " - << fileNameCountOfCounts << endl; + std::cerr << "ERROR: could not open count-of-counts file " + << fileNameCountOfCounts << std::endl; return; } // Kneser-Ney needs the total number of phrase pairs - countOfCountsFile << totalDistinct << endl; + countOfCountsFile << totalDistinct << std::endl; // write out counts for(int i=1; i<=COC_MAX; i++) { - countOfCountsFile << countOfCounts[ i ] << endl; + countOfCountsFile << countOfCounts[ i ] << std::endl; } countOfCountsFile.Close(); } -void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& 
featureManager, const MaybeLog& maybeLogProb ) -{ - if (phrasePair.size() == 0) return; - // group phrase pairs based on alignments that matter - // (i.e. that re-arrange non-terminals) - PhrasePairGroup phrasePairGroup; +void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, ostream &phraseTableFile, + const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb ) +{ + if (phrasePairsWithSameSource.size() == 0) { + return; + } float totalSource = 0; - //cerr << "phrasePair.size() = " << phrasePair.size() << endl; + //std::cerr << "phrasePairs.size() = " << phrasePairs.size() << std::endl; // loop through phrase pairs - for(size_t i=0; i::const_iterator iter=phrasePairsWithSameSource.begin(); + iter!=phrasePairsWithSameSource.end(); ++iter) { // add to total count - PhraseAlignment &currPhrasePair = phrasePair[i]; - - totalSource += phrasePair[i].count; - - // check for matches - //cerr << "phrasePairGroup.size() = " << phrasePairGroup.size() << endl; - - PhraseAlignmentCollection phraseAlignColl; - phraseAlignColl.push_back(&currPhrasePair); - pair retInsert; - retInsert = phrasePairGroup.insert(phraseAlignColl); - if (!retInsert.second) { - // already exist. 
Add to that collection instead - PhraseAlignmentCollection &existingColl = const_cast(*retInsert.first); - existingColl.push_back(&currPhrasePair); - } - + totalSource += (*iter)->GetCount(); } // output the distinct phrase pairs, one at a time - const PhrasePairGroup::SortedColl &sortedColl = phrasePairGroup.GetSortedColl(); - PhrasePairGroup::SortedColl::const_iterator iter; - - for(iter = sortedColl.begin(); iter != sortedColl.end(); ++iter) { - const PhraseAlignmentCollection &group = **iter; - outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile, isSingleton, featureManager, maybeLogProb ); + for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); + iter!=phrasePairsWithSameSource.end(); ++iter) { + // add to total count + outputPhrasePair( **iter, totalSource, phrasePairsWithSameSource.size(), phraseTableFile, featureManager, maybeLogProb ); } - } -const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair ) -{ - float bestAlignmentCount = -1; - PhraseAlignment* bestAlignment = NULL; - - for(size_t i=0; icount > bestAlignmentCount) { - bestAlignmentCount = phrasePair[alignInd]->count; - bestAlignment = phrasePair[alignInd]; - } - } - - return *bestAlignment; -} - -const std::string &findBestTreeFragment(const PhraseAlignmentCollection &phrasePair ) -{ - float bestTreeFragmentCount = -1; - PhraseAlignment *bestTreeFragment = NULL; - - for(size_t i=0; icount > bestTreeFragmentCount) { - bestTreeFragmentCount = phrasePair[treeFragmentInd]->count; - bestTreeFragment = phrasePair[treeFragmentInd]; - } - } - - return bestTreeFragment->treeFragment; -} - -bool calcCrossedNonTerm(size_t sourcePos, size_t targetPos, const std::vector< std::set > &alignedToS) -{ - for (size_t currSource = 0; currSource < alignedToS.size(); ++currSource) { - if (currSource == sourcePos) { - // skip - } else { - const std::set &targetSet = alignedToS[currSource]; - std::set::const_iterator 
iter; - for (iter = targetSet.begin(); iter != targetSet.end(); ++iter) { - size_t currTarget = *iter; - - if ((currSource < sourcePos && currTarget > targetPos) - || (currSource > sourcePos && currTarget < targetPos) - ) { - return true; - } - } - - } - } - - return false; -} - -int calcCrossedNonTerm(const PHRASE &phraseS, const PhraseAlignment &bestAlignment) -{ - const std::vector< std::set > &alignedToS = bestAlignment.alignedToS; - - for (size_t sourcePos = 0; sourcePos < alignedToS.size(); ++sourcePos) { - const std::set &targetSet = alignedToS[sourcePos]; - - WORD_ID wordId = phraseS[sourcePos]; - const WORD &word = vcbS.getWord(wordId); - bool isNonTerm = isNonTerminal(word); - - if (isNonTerm) { - assert(targetSet.size() == 1); - size_t targetPos = *targetSet.begin(); - bool ret = calcCrossedNonTerm(sourcePos, targetPos, alignedToS); - if (ret) - return 1; - } - } - - return 0; -} - -void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager, +void outputPhrasePair(const ExtractionPhrasePair &phrasePair, + float totalCount, int distinctCount, + ostream &phraseTableFile, + const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb ) { - if (phrasePair.size() == 0) return; + assert(phrasePair.isValid()); - const PhraseAlignment &bestAlignment = findBestAlignment( phrasePair ); - - // compute count - float count = 0; - for(size_t i=0; icount; - } + const ALIGNMENT *bestAlignmentT2S = phrasePair.FindBestAlignmentTargetToSource(); + float count = phrasePair.GetCount(); map< string, float > domainCount; @@ -464,82 +517,74 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo if (goodTuringFlag || kneserNeyFlag) { totalDistinct++; int countInt = count + 0.99999; - if(countInt <= COC_MAX) + if (countInt <= COC_MAX) countOfCounts[ countInt ]++; } // compute PCFG score float pcfgScore = 0; if 
(pcfgFlag && !inverseFlag) { - float pcfgSum = 0; - for(size_t i=0; ipcfgSum; - } - pcfgScore = pcfgSum / count; + pcfgScore = phrasePair.GetPcfgScore() / count; } // output phrases - const PHRASE &phraseS = phrasePair[0]->GetSource(); - const PHRASE &phraseT = phrasePair[0]->GetTarget(); + const PHRASE *phraseSource = phrasePair.GetSource(); + const PHRASE *phraseTarget = phrasePair.GetTarget(); // do not output if hierarchical and count below threshold if (hierarchicalFlag && count < minCountHierarchical) { - for(size_t j=0; jsize()-1; ++j) { + if (isNonTerminal(vcbS.getWord( phraseSource->at(j) ))) return; } } // source phrase (unless inverse) - if (! inverseFlag) { - printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile); + if (!inverseFlag) { + printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile); phraseTableFile << " ||| "; } // target phrase - printTargetPhrase(phraseS, phraseT, bestAlignment, phraseTableFile); + printTargetPhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile); phraseTableFile << " ||| "; // source phrase (if inverse) if (inverseFlag) { - printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile); + printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile); phraseTableFile << " ||| "; } // lexical translation probability if (lexFlag) { - double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment); - phraseTableFile << maybeLogProb(lexScore ); + double lexScore = computeLexicalTranslation( phraseSource, phraseTarget, bestAlignmentT2S ); + phraseTableFile << maybeLogProb( lexScore ); } // unaligned word penalty if (unalignedFlag) { - double penalty = computeUnalignedPenalty( phraseS, phraseT, bestAlignment); - phraseTableFile << " " << maybeLogProb(penalty ); + double penalty = computeUnalignedPenalty( bestAlignmentT2S ); + phraseTableFile << " " << maybeLogProb( penalty ); } // unaligned function word penalty if (unalignedFWFlag) { - 
double penalty = computeUnalignedFWPenalty( phraseS, phraseT, bestAlignment); - phraseTableFile << " " << maybeLogProb(penalty ); - } - - if (singletonFeature) { - phraseTableFile << " " << (isSingleton ? 1 : 0); + double penalty = computeUnalignedFWPenalty( phraseTarget, bestAlignmentT2S ); + phraseTableFile << " " << maybeLogProb( penalty ); } if (crossedNonTerm && !inverseFlag) { - phraseTableFile << " " << calcCrossedNonTerm(phraseS, bestAlignment); + phraseTableFile << " " << calcCrossedNonTerm( phraseTarget, bestAlignmentT2S ); } // target-side PCFG score if (pcfgFlag && !inverseFlag) { - phraseTableFile << " " << maybeLogProb(pcfgScore ); + phraseTableFile << " " << maybeLogProb( pcfgScore ); } // extra features - ScoreFeatureContext context(phrasePair, count, maybeLogProb); - vector extraDense; + ScoreFeatureContext context(phrasePair, maybeLogProb); + std::vector extraDense; map extraSparse; featureManager.addFeatures(context, extraDense, extraSparse); for (size_t i = 0; i < extraDense.size(); ++i) { @@ -553,30 +598,28 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo phraseTableFile << " ||| "; - // alignment info for non-terminals - if (! inverseFlag) { - if (hierarchicalFlag) { - // always output alignment if hiero style, but only for non-terms - // (eh: output all alignments, needed for some feature functions) - assert(phraseT.size() == bestAlignment.alignedToT.size() + 1); + // output alignment info + if ( !inverseFlag ) { + if ( hierarchicalFlag ) { + // always output alignment if hiero style + assert(phraseTarget->size() == bestAlignmentT2S->size()+1); std::vector alignment; - for(size_t j = 0; j < phraseT.size() - 1; j++) { - if (isNonTerminal(vcbT.getWord( phraseT[j] ))) { - if (bestAlignment.alignedToT[ j ].size() != 1) { - cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." 
<< endl; + for ( size_t j = 0; j < phraseTarget->size() - 1; ++j ) { + if ( isNonTerminal(vcbT.getWord( phraseTarget->at(j) ))) { + if ( bestAlignmentT2S->at(j).size() != 1 ) { + std::cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << std::endl; phraseTableFile.flush(); - assert(bestAlignment.alignedToT[ j ].size() == 1); + assert(bestAlignmentT2S->at(j).size() == 1); } - int sourcePos = *(bestAlignment.alignedToT[ j ].begin()); + size_t sourcePos = *(bestAlignmentT2S->at(j).begin()); //phraseTableFile << sourcePos << "-" << j << " "; std::stringstream point; point << sourcePos << "-" << j; alignment.push_back(point.str()); } else { - set::iterator setIter; - for(setIter = (bestAlignment.alignedToT[j]).begin(); setIter != (bestAlignment.alignedToT[j]).end(); setIter++) { - int sourcePos = *setIter; - //phraseTableFile << sourcePos << "-" << j << " "; + for ( std::set::iterator setIter = (bestAlignmentT2S->at(j)).begin(); + setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) { + size_t sourcePos = *setIter; std::stringstream point; point << sourcePos << "-" << j; alignment.push_back(point.str()); @@ -590,40 +633,90 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo } } else if (wordAlignmentFlag) { // alignment info in pb model - for(size_t j=0; j &aligned = bestAlignment.alignedToT[j]; - for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) { - phraseTableFile << *p << "-" << j << " "; + for (size_t j = 0; j < bestAlignmentT2S->size(); ++j) { + for ( std::set::iterator setIter = (bestAlignmentT2S->at(j)).begin(); + setIter != (bestAlignmentT2S->at(j)).end(); ++setIter ) { + size_t sourcePos = *setIter; + phraseTableFile << sourcePos << "-" << j << " "; } } } } - // counts - phraseTableFile << " ||| " << totalCount << " " << count; if (kneserNeyFlag) phraseTableFile << " " << distinctCount; - // tree fragments - if 
(treeFragmentsFlag && !inverseFlag) { - const std::string &bestTreeFragment = findBestTreeFragment( phrasePair ); - if ( !bestTreeFragment.empty() ) - phraseTableFile << " ||| {{Tree " << bestTreeFragment << "}}"; + if ((treeFragmentsFlag) && + !inverseFlag) { + phraseTableFile << " |||"; } + // tree fragments + if (treeFragmentsFlag && !inverseFlag) { + const std::string *bestTreeFragment = phrasePair.FindBestPropertyValue("Tree"); + if (bestTreeFragment) { + phraseTableFile << " {{Tree " << *bestTreeFragment << "}}"; + } + } - phraseTableFile << endl; + phraseTableFile << std::endl; } -double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment ) + + +bool calcCrossedNonTerm( size_t targetPos, size_t sourcePos, const ALIGNMENT *alignmentTargetToSource ) +{ + for (size_t currTarget = 0; currTarget < alignmentTargetToSource->size(); ++currTarget) { + if (currTarget == targetPos) { + // skip + } else { + const std::set &sourceSet = alignmentTargetToSource->at(currTarget); + for (std::set::const_iterator iter = sourceSet.begin(); + iter != sourceSet.end(); ++iter) { + size_t currSource = *iter; + + if ((currTarget < targetPos && currSource > sourcePos) + || (currTarget > targetPos && currSource < sourcePos) + ) { + return true; + } + } + + } + } + + return false; +} + +int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ) +{ + assert(phraseTarget->size() >= alignmentTargetToSource->size() ); + + for (size_t targetPos = 0; targetPos < alignmentTargetToSource->size(); ++targetPos) { + + if ( isNonTerminal(vcbT.getWord( phraseTarget->at(targetPos) ))) { + const std::set &alignmentPoints = alignmentTargetToSource->at(targetPos); + assert( alignmentPoints.size() == 1 ); + size_t sourcePos = *alignmentPoints.begin(); + bool ret = calcCrossedNonTerm(targetPos, sourcePos, alignmentTargetToSource); + if (ret) + return 1; + } + } + + return 0; +} + + +double computeUnalignedPenalty( 
const ALIGNMENT *alignmentTargetToSource ) { // unaligned word counter double unaligned = 1.0; // only checking target words - source words are caught when computing inverse - for(size_t ti=0; ti & srcIndices = alignment.alignedToT[ ti ]; + for(size_t ti=0; tisize(); ++ti) { + const set< size_t > & srcIndices = alignmentTargetToSource->at(ti); if (srcIndices.empty()) { unaligned *= 2.718; } @@ -631,14 +724,15 @@ double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, co return unaligned; } -double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment ) + +double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ) { // unaligned word counter double unaligned = 1.0; // only checking target words - source words are caught when computing inverse - for(size_t ti=0; ti & srcIndices = alignment.alignedToT[ ti ]; - if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseT[ ti ] ) ) != functionWordList.end()) { + for(size_t ti=0; tisize(); ++ti) { + const set< size_t > & srcIndices = alignmentTargetToSource->at(ti); + if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseTarget->at(ti) ) ) != functionWordList.end()) { unaligned *= 2.718; } } @@ -647,11 +741,11 @@ double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, void loadFunctionWords( const string &fileName ) { - cerr << "Loading function word list from " << fileName; + std::cerr << "Loading function word list from " << fileName; ifstream inFile; inFile.open(fileName.c_str()); if (inFile.fail()) { - cerr << " - ERROR: could not open file\n"; + std::cerr << " - ERROR: could not open file" << std::endl; exit(1); } istream *inFileP = &inFile; @@ -660,32 +754,32 @@ void loadFunctionWords( const string &fileName ) while(true) { SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (inFileP->eof()) break; - vector token = tokenize( line ); 
+ std::vector token = tokenize( line ); if (token.size() > 0) functionWordList.insert( token[0] ); } - inFile.close(); - cerr << " - read " << functionWordList.size() << " function words\n"; + std::cerr << " - read " << functionWordList.size() << " function words" << std::endl; inFile.close(); } -double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment ) + +double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ) { // lexical translation probability double lexScore = 1.0; int null = vcbS.getWordID("NULL"); // all target words have to be explained - for(size_t ti=0; ti & srcIndices = alignment.alignedToT[ ti ]; + for(size_t ti=0; tisize(); ti++) { + const set< size_t > & srcIndices = alignmentTargetToSource->at(ti); if (srcIndices.empty()) { // explain unaligned word by NULL - lexScore *= lexTable.permissiveLookup( null, phraseT[ ti ] ); + lexScore *= lexTable.permissiveLookup( null, phraseTarget->at(ti) ); } else { // go through all the aligned words to compute average double thisWordScore = 0; for (set< size_t >::const_iterator p(srcIndices.begin()); p != srcIndices.end(); ++p) { - thisWordScore += lexTable.permissiveLookup( phraseS[ *p ], phraseT[ ti ] ); + thisWordScore += lexTable.permissiveLookup( phraseSource->at(*p), phraseTarget->at(ti) ); } lexScore *= thisWordScore / (double)srcIndices.size(); } @@ -693,13 +787,14 @@ double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, return lexScore; } + void LexicalTable::load( const string &fileName ) { - cerr << "Loading lexical translation table from " << fileName; + std::cerr << "Loading lexical translation table from " << fileName; ifstream inFile; inFile.open(fileName.c_str()); if (inFile.fail()) { - cerr << " - ERROR: could not open file\n"; + std::cerr << " - ERROR: could not open file" << std::endl; exit(1); } istream *inFileP = &inFile; @@ -709,15 
+804,15 @@ void LexicalTable::load( const string &fileName ) int i=0; while(true) { i++; - if (i%100000 == 0) cerr << "." << flush; + if (i%100000 == 0) std::cerr << "." << flush; SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (inFileP->eof()) break; - vector token = tokenize( line ); + std::vector token = tokenize( line ); if (token.size() != 3) { - cerr << "line " << i << " in " << fileName - << " has wrong number of tokens, skipping:\n" - << token.size() << " " << token[0] << " " << line << endl; + std::cerr << "line " << i << " in " << fileName + << " has wrong number of tokens, skipping:" << std::endl + << token.size() << " " << token[0] << " " << line << std::endl; continue; } @@ -726,55 +821,60 @@ void LexicalTable::load( const string &fileName ) WORD_ID wordS = vcbS.storeIfNew( token[1] ); ltable[ wordS ][ wordT ] = prob; } - cerr << endl; + std::cerr << std::endl; } -void printSourcePhrase(const PHRASE &phraseS, const PHRASE &phraseT, - const PhraseAlignment &bestAlignment, ostream &out) + +void printSourcePhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget, + const ALIGNMENT *targetToSourceAlignment, ostream &out) { + // get corresponding target non-terminal and output pair + ALIGNMENT *sourceToTargetAlignment = new ALIGNMENT(); + invertAlignment(phraseSource, phraseTarget, targetToSourceAlignment, sourceToTargetAlignment); // output source symbols, except root, in rule table format - for (std::size_t i = 0; i < phraseS.size()-1; ++i) { - const std::string &word = vcbS.getWord(phraseS[i]); + for (std::size_t i = 0; i < phraseSource->size()-1; ++i) { + const std::string &word = vcbS.getWord(phraseSource->at(i)); if (!unpairedExtractFormatFlag || !isNonTerminal(word)) { out << word << " "; continue; } - // get corresponding target non-terminal and output pair - std::set alignmentPoints = bestAlignment.alignedToS[i]; + const std::set &alignmentPoints = sourceToTargetAlignment->at(i); assert(alignmentPoints.size() == 1); - int j 
= *(alignmentPoints.begin()); + size_t j = *(alignmentPoints.begin()); if (inverseFlag) { - out << vcbT.getWord(phraseT[j]) << word << " "; + out << vcbT.getWord(phraseTarget->at(j)) << word << " "; } else { - out << word << vcbT.getWord(phraseT[j]) << " "; + out << word << vcbT.getWord(phraseTarget->at(j)) << " "; } } // output source root symbol if (conditionOnTargetLhsFlag && !inverseFlag) { out << "[X]"; } else { - out << vcbS.getWord(phraseS.back()); + out << vcbS.getWord(phraseSource->back()); } + delete sourceToTargetAlignment; } -void printTargetPhrase(const PHRASE &phraseS, const PHRASE &phraseT, - const PhraseAlignment &bestAlignment, ostream &out) + +void printTargetPhrase(const PHRASE *phraseSource, const PHRASE *phraseTarget, + const ALIGNMENT *targetToSourceAlignment, ostream &out) { // output target symbols, except root, in rule table format - for (std::size_t i = 0; i < phraseT.size()-1; ++i) { - const std::string &word = vcbT.getWord(phraseT[i]); + for (std::size_t i = 0; i < phraseTarget->size()-1; ++i) { + const std::string &word = vcbT.getWord(phraseTarget->at(i)); if (!unpairedExtractFormatFlag || !isNonTerminal(word)) { out << word << " "; continue; } // get corresponding source non-terminal and output pair - std::set alignmentPoints = bestAlignment.alignedToT[i]; + std::set alignmentPoints = targetToSourceAlignment->at(i); assert(alignmentPoints.size() == 1); int j = *(alignmentPoints.begin()); if (inverseFlag) { - out << word << vcbS.getWord(phraseS[j]) << " "; + out << word << vcbS.getWord(phraseSource->at(j)) << " "; } else { - out << vcbS.getWord(phraseS[j]) << word << " "; + out << vcbS.getWord(phraseSource->at(j)) << word << " "; } } // output target root symbol @@ -782,24 +882,28 @@ void printTargetPhrase(const PHRASE &phraseS, const PHRASE &phraseT, if (inverseFlag) { out << "[X]"; } else { - out << vcbS.getWord(phraseS.back()); + out << vcbS.getWord(phraseSource->back()); } } else { - out << vcbT.getWord(phraseT.back()); + out << 
vcbT.getWord(phraseTarget->back()); } } -std::pair PhrasePairGroup::insert ( const PhraseAlignmentCollection& obj ) -{ - std::pair ret = m_coll.insert(obj); - if (ret.second) { - // obj inserted. Also add to sorted vector - const PhraseAlignmentCollection &insertedObj = *ret.first; - m_sortedColl.push_back(&insertedObj); +void invertAlignment(const PHRASE *phraseSource, const PHRASE *phraseTarget, + const ALIGNMENT *inTargetToSourceAlignment, ALIGNMENT *outSourceToTargetAlignment) { +// typedef std::vector< std::set > ALIGNMENT; + + outSourceToTargetAlignment->clear(); + size_t numberOfSourceSymbols = (hierarchicalFlag ? phraseSource->size()-1 : phraseSource->size()); + outSourceToTargetAlignment->resize(numberOfSourceSymbols); + // add alignment point + for (size_t targetPosition = 0; targetPosition < inTargetToSourceAlignment->size(); ++targetPosition) { + for ( std::set::iterator setIter = (inTargetToSourceAlignment->at(targetPosition)).begin(); + setIter != (inTargetToSourceAlignment->at(targetPosition)).end(); ++setIter ) { + size_t sourcePosition = *setIter; + outSourceToTargetAlignment->at(sourcePosition).insert(targetPosition); + } } - - return ret; } - From d26fe4cc4dc8bc417fcdc91ffe05cd6dc8503f4c Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Wed, 29 Jan 2014 23:01:53 +0000 Subject: [PATCH 34/48] fix truecaser with XML input (didn't do anything depending on formatting/whitespace) --- scripts/recaser/train-truecaser.perl | 37 +++++++++++++++++++++++----- scripts/recaser/truecase.perl | 2 +- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl index f6cb7e85a..8a1ba4c76 100755 --- a/scripts/recaser/train-truecaser.perl +++ b/scripts/recaser/train-truecaser.perl @@ -25,13 +25,13 @@ open(CORPUS,$CORPUS) || die("ERROR: could not open '$CORPUS'"); binmode(CORPUS, ":utf8"); while() { chop; - my @WORD = split; + my ($WORD,$MARKUP) = split_xml($_); my $start = 0; - 
while($start<=$#WORD && defined($DELAYED_SENTENCE_START{$WORD[$start]})) { $start++; } + while($start<=$#$WORD && defined($DELAYED_SENTENCE_START{$$WORD[$start]})) { $start++; } my $firstWordOfSentence = 1; - for(my $i=$start;$i<=$#WORD;$i++) { - my $currentWord = $WORD[$i]; - if (! $firstWordOfSentence && defined($SENTENCE_END{$WORD[$i-1]})) { + for(my $i=$start;$i<=$#$WORD;$i++) { + my $currentWord = $$WORD[$i]; + if (! $firstWordOfSentence && defined($SENTENCE_END{$$WORD[$i-1]})) { $firstWordOfSentence = 1; } @@ -44,7 +44,7 @@ while() { if (lc($firstChar) eq $firstChar) { # if the first character is not upper case, count the token as full evidence (because if it's not capitalized, then there's no reason to be wary that the given casing is only due to being sentence-initial) $currentWordWeight = 1; - } elsif (scalar(@WORD) == 1) { + } elsif (scalar(@$WORD) == 1) { # if the first character is upper case, but the current token is the only token of the segment, then count the token as partial evidence (because the segment is presumably not a sentence and the token is therefore not the first word of a sentence and is possibly in its natural case) $currentWordWeight = 0.1; } @@ -77,3 +77,28 @@ foreach my $type (keys %CASING) { print MODEL "\n"; } close(MODEL); + + +# store away xml markup +sub split_xml { + my ($line) = @_; + my (@WORD,@MARKUP); + my $i = 0; + $MARKUP[0] = ""; + while($line =~ /\S/) { + if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { + $MARKUP[$i] .= $1." "; + $line = $2; + } + elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) { + $WORD[$i++] = $1; + $MARKUP[$i] = ""; + $line = $2; + } + else { + die("ERROR: huh? $line\n"); + } + } + chop($MARKUP[$#MARKUP]); + return (\@WORD,\@MARKUP); +} diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl index 517f5c7a1..a1340f3b6 100755 --- a/scripts/recaser/truecase.perl +++ b/scripts/recaser/truecase.perl @@ -74,7 +74,7 @@ sub split_xml { $MARKUP[$i] .= $1." 
"; $line = $2; } - elsif ($line =~ /^\s*(\S+)(.*)$/) { + elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) { $WORD[$i++] = $1; $MARKUP[$i] = ""; $line = $2; From 3a5b54ef191ca3f2edd245c9cb0a8690a1ea161c Mon Sep 17 00:00:00 2001 From: Ales Tamchyna Date: Thu, 30 Jan 2014 15:42:31 +0100 Subject: [PATCH 35/48] non-zero exit code when tmcombine.py fails --- contrib/tmcombine/tmcombine.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/contrib/tmcombine/tmcombine.py b/contrib/tmcombine/tmcombine.py index 57fd109fe..6b98b3acd 100755 --- a/contrib/tmcombine/tmcombine.py +++ b/contrib/tmcombine/tmcombine.py @@ -106,7 +106,7 @@ class Moses(): scores = line[2].split() if len(scores) Date: Thu, 30 Jan 2014 09:03:01 -0800 Subject: [PATCH 36/48] Update read_compressed to support concatenated gzip --- util/read_compressed.cc | 414 ++++++++++++++++------------------- util/read_compressed_test.cc | 5 + 2 files changed, 193 insertions(+), 226 deletions(-) diff --git a/util/read_compressed.cc b/util/read_compressed.cc index b62a6e833..5b87a6fba 100644 --- a/util/read_compressed.cc +++ b/util/read_compressed.cc @@ -49,6 +49,8 @@ class ReadBase { thunk.internal_.reset(with); } + ReadBase *Current(ReadCompressed &thunk) { return thunk.internal_.get(); } + static uint64_t &ReadCount(ReadCompressed &thunk) { return thunk.raw_amount_; } @@ -56,6 +58,8 @@ class ReadBase { namespace { +ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, std::size_t already_size, bool require_compressed); + // Completed file that other classes can thunk to. 
class Complete : public ReadBase { public: @@ -80,7 +84,7 @@ class Uncompressed : public ReadBase { class UncompressedWithHeader : public ReadBase { public: - UncompressedWithHeader(int fd, void *already_data, std::size_t already_size) : fd_(fd) { + UncompressedWithHeader(int fd, const void *already_data, std::size_t already_size) : fd_(fd) { assert(already_size); buf_.reset(malloc(already_size)); if (!buf_.get()) throw std::bad_alloc(); @@ -91,6 +95,7 @@ class UncompressedWithHeader : public ReadBase { std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { assert(buf_.get()); + assert(remain_ != end_); std::size_t sending = std::min(amount, end_ - remain_); memcpy(to, remain_, sending); remain_ += sending; @@ -108,23 +113,51 @@ class UncompressedWithHeader : public ReadBase { scoped_fd fd_; }; -#ifdef HAVE_ZLIB -class GZip : public ReadBase { - private: - static const std::size_t kInputBuffer = 16384; +static const std::size_t kInputBuffer = 16384; + +template class StreamCompressed : public ReadBase { public: - GZip(int fd, void *already_data, std::size_t already_size) - : file_(fd), in_buffer_(malloc(kInputBuffer)) { - if (!in_buffer_.get()) throw std::bad_alloc(); - assert(already_size < kInputBuffer); - if (already_size) { - memcpy(in_buffer_.get(), already_data, already_size); - stream_.next_in = static_cast(in_buffer_.get()); - stream_.avail_in = already_size; - stream_.avail_in += ReadOrEOF(file_.get(), static_cast(in_buffer_.get()) + already_size, kInputBuffer - already_size); - } else { - stream_.avail_in = 0; - } + StreamCompressed(int fd, const void *already_data, std::size_t already_size) + : file_(fd), + in_buffer_(MallocOrThrow(kInputBuffer)), + back_(memcpy(in_buffer_.get(), already_data, already_size), already_size) {} + + std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { + if (amount == 0) return 0; + back_.SetOutput(to, amount); + do { + if (!back_.Stream().avail_in) ReadInput(thunk); + if (!back_.Process()) 
{ + // reached end, at least for the compressed portion. + std::size_t ret = static_cast(static_cast(back_.Stream().next_out)) - static_cast(to); + ReplaceThis(ReadFactory(file_.release(), ReadCount(thunk), back_.Stream().next_in, back_.Stream().avail_in, true), thunk); + if (ret) return ret; + // We did not read anything this round, so clients might think EOF. Transfer responsibility to the next reader. + return Current(thunk)->Read(to, amount, thunk); + } + } while (back_.Stream().next_out == to); + return static_cast(static_cast(back_.Stream().next_out)) - static_cast(to); + } + + private: + void ReadInput(ReadCompressed &thunk) { + assert(!back_.Stream().avail_in); + std::size_t got = ReadOrEOF(file_.get(), in_buffer_.get(), kInputBuffer); + back_.SetInput(in_buffer_.get(), got); + ReadCount(thunk) += got; + } + + scoped_fd file_; + scoped_malloc in_buffer_; + + Compression back_; +}; + +#ifdef HAVE_ZLIB +class GZip { + public: + GZip(const void *base, std::size_t amount) { + SetInput(base, amount); stream_.zalloc = Z_NULL; stream_.zfree = Z_NULL; stream_.opaque = Z_NULL; @@ -141,227 +174,154 @@ class GZip : public ReadBase { } } - std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { - if (amount == 0) return 0; + void SetOutput(void *to, std::size_t amount) { stream_.next_out = static_cast(to); stream_.avail_out = std::min(std::numeric_limits::max(), amount); - do { - if (!stream_.avail_in) ReadInput(thunk); - int result = inflate(&stream_, 0); - switch (result) { - case Z_OK: - break; - case Z_STREAM_END: - { - std::size_t ret = static_cast(stream_.next_out) - static_cast(to); - ReplaceThis(new Complete(), thunk); - return ret; - } - case Z_ERRNO: - UTIL_THROW(ErrnoException, "zlib error"); - default: - UTIL_THROW(GZException, "zlib encountered " << (stream_.msg ? 
stream_.msg : "an error ") << " code " << result); - } - } while (stream_.next_out == to); - return static_cast(stream_.next_out) - static_cast(to); + } + + void SetInput(const void *base, std::size_t amount) { + assert(amount < static_cast(std::numeric_limits::max())); + stream_.next_in = const_cast(static_cast(base)); + stream_.avail_in = amount; + } + + const z_stream &Stream() const { return stream_; } + + bool Process() { + int result = inflate(&stream_, 0); + switch (result) { + case Z_OK: + return true; + case Z_STREAM_END: + return false; + case Z_ERRNO: + UTIL_THROW(ErrnoException, "zlib error"); + default: + UTIL_THROW(GZException, "zlib encountered " << (stream_.msg ? stream_.msg : "an error ") << " code " << result); + } } private: - void ReadInput(ReadCompressed &thunk) { - assert(!stream_.avail_in); - stream_.next_in = static_cast(in_buffer_.get()); - stream_.avail_in = ReadOrEOF(file_.get(), in_buffer_.get(), kInputBuffer); - ReadCount(thunk) += stream_.avail_in; - } - - scoped_fd file_; - scoped_malloc in_buffer_; z_stream stream_; }; #endif // HAVE_ZLIB -const uint8_t kBZMagic[3] = {'B', 'Z', 'h'}; - #ifdef HAVE_BZLIB -class BZip : public ReadBase { +class BZip { public: - BZip(int fd, void *already_data, std::size_t already_size) { - scoped_fd hold(fd); - closer_.reset(FDOpenReadOrThrow(hold)); - file_ = NULL; - Open(already_data, already_size); - } - - BZip(FILE *file, void *already_data, std::size_t already_size) { - closer_.reset(file); - file_ = NULL; - Open(already_data, already_size); + BZip(const void *base, std::size_t amount) { + memset(&stream_, 0, sizeof(stream_)); + SetInput(base, amount); + HandleError(BZ2_bzDecompressInit(&stream_, 0, 0)); } ~BZip() { - Close(file_); - } - - std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { - assert(file_); - int bzerror = BZ_OK; - int ret = BZ2_bzRead(&bzerror, file_, to, std::min(static_cast(INT_MAX), amount)); - long pos = ftell(closer_.get()); - if (pos != -1) 
ReadCount(thunk) = pos; - switch (bzerror) { - case BZ_STREAM_END: - /* bzip2 files can be concatenated by e.g. pbzip2. Annoyingly, the - * library doesn't handle this internally. This gets the trailing - * data, grows it up to magic as needed, validates the magic, and - * reopens. - */ - { - bzerror = BZ_OK; - void *trailing_data; - int trailing_size; - BZ2_bzReadGetUnused(&bzerror, file_, &trailing_data, &trailing_size); - UTIL_THROW_IF(bzerror != BZ_OK, BZException, "bzip2 error in BZ2_bzReadGetUnused " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror); - std::string trailing(static_cast(trailing_data), trailing_size); - Close(file_); - - if (trailing_size < (int)sizeof(kBZMagic)) { - trailing.resize(sizeof(kBZMagic)); - if (1 != fread(&trailing[trailing_size], sizeof(kBZMagic) - trailing_size, 1, closer_.get())) { - UTIL_THROW_IF(trailing_size, BZException, "File has trailing cruft"); - // Legitimate end of file. - ReplaceThis(new Complete(), thunk); - return ret; - } - } - UTIL_THROW_IF(memcmp(trailing.data(), kBZMagic, sizeof(kBZMagic)), BZException, "Trailing cruft is not another bzip2 stream"); - Open(&trailing[0], trailing.size()); - } - return ret; - case BZ_OK: - return ret; - default: - UTIL_THROW(BZException, "bzip2 error " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror); + try { + HandleError(BZ2_bzDecompressEnd(&stream_)); + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + abort(); } } + bool Process() { + int ret = BZ2_bzDecompress(&stream_); + if (ret == BZ_STREAM_END) return false; + HandleError(ret); + return true; + } + + void SetOutput(void *base, std::size_t amount) { + stream_.next_out = static_cast(base); + stream_.avail_out = std::min(std::numeric_limits::max(), amount); + } + + void SetInput(const void *base, std::size_t amount) { + stream_.next_in = const_cast(static_cast(base)); + stream_.avail_in = amount; + } + + const bz_stream &Stream() const { return stream_; } + private: - void Open(void 
*already_data, std::size_t already_size) { - assert(!file_); - int bzerror = BZ_OK; - file_ = BZ2_bzReadOpen(&bzerror, closer_.get(), 0, 0, already_data, already_size); - switch (bzerror) { + void HandleError(int value) { + switch(value) { case BZ_OK: return; case BZ_CONFIG_ERROR: - UTIL_THROW(BZException, "Looks like bzip2 was miscompiled."); + UTIL_THROW(BZException, "bzip2 seems to be miscompiled."); case BZ_PARAM_ERROR: - UTIL_THROW(BZException, "Parameter error"); - case BZ_IO_ERROR: - UTIL_THROW(BZException, "IO error reading file"); + UTIL_THROW(BZException, "bzip2 Parameter error"); + case BZ_DATA_ERROR: + UTIL_THROW(BZException, "bzip2 detected a corrupt file"); + case BZ_DATA_ERROR_MAGIC: + UTIL_THROW(BZException, "bzip2 detected bad magic bytes. Perhaps this was not a bzip2 file after all?"); case BZ_MEM_ERROR: throw std::bad_alloc(); default: - UTIL_THROW(BZException, "Unknown bzip2 error code " << bzerror); + UTIL_THROW(BZException, "Unknown bzip2 error code " << value); } - assert(file_); } - static void Close(BZFILE *&file) { - if (file == NULL) return; - int bzerror = BZ_OK; - BZ2_bzReadClose(&bzerror, file); - if (bzerror != BZ_OK) { - std::cerr << "bz2 readclose error number " << bzerror << std::endl; - abort(); - } - file = NULL; - } - - scoped_FILE closer_; - BZFILE *file_; + bz_stream stream_; }; #endif // HAVE_BZLIB #ifdef HAVE_XZLIB -class XZip : public ReadBase { - private: - static const std::size_t kInputBuffer = 16384; +class XZip { public: - XZip(int fd, void *already_data, std::size_t already_size) - : file_(fd), in_buffer_(malloc(kInputBuffer)), stream_(), action_(LZMA_RUN) { - if (!in_buffer_.get()) throw std::bad_alloc(); - assert(already_size < kInputBuffer); - if (already_size) { - memcpy(in_buffer_.get(), already_data, already_size); - stream_.next_in = static_cast(in_buffer_.get()); - stream_.avail_in = already_size; - stream_.avail_in += ReadOrEOF(file_.get(), static_cast(in_buffer_.get()) + already_size, kInputBuffer - 
already_size); - } else { - stream_.avail_in = 0; - } - stream_.allocator = NULL; - lzma_ret ret = lzma_stream_decoder(&stream_, UINT64_MAX, LZMA_CONCATENATED); - switch (ret) { - case LZMA_OK: - break; - case LZMA_MEM_ERROR: - UTIL_THROW(ErrnoException, "xz open error"); - default: - UTIL_THROW(XZException, "xz error code " << ret); - } + XZip(const void *base, std::size_t amount) + : stream_(), action_(LZMA_RUN) { + memset(&stream_, 0, sizeof(stream_)); + SetInput(base, amount); + HandleError(lzma_stream_decoder(&stream_, UINT64_MAX, 0)); } ~XZip() { lzma_end(&stream_); } - std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { - if (amount == 0) return 0; - stream_.next_out = static_cast(to); + void SetOutput(void *base, std::size_t amount) { + stream_.next_out = static_cast(base); stream_.avail_out = amount; - do { - if (!stream_.avail_in) ReadInput(thunk); - lzma_ret status = lzma_code(&stream_, action_); - switch (status) { - case LZMA_OK: - break; - case LZMA_STREAM_END: - UTIL_THROW_IF(action_ != LZMA_FINISH, XZException, "Input not finished yet."); - { - std::size_t ret = static_cast(stream_.next_out) - static_cast(to); - ReplaceThis(new Complete(), thunk); - return ret; - } - case LZMA_MEM_ERROR: - throw std::bad_alloc(); - case LZMA_FORMAT_ERROR: - UTIL_THROW(XZException, "xzlib says file format not recognized"); - case LZMA_OPTIONS_ERROR: - UTIL_THROW(XZException, "xzlib says unsupported compression options"); - case LZMA_DATA_ERROR: - UTIL_THROW(XZException, "xzlib says this file is corrupt"); - case LZMA_BUF_ERROR: - UTIL_THROW(XZException, "xzlib says unexpected end of input"); - default: - UTIL_THROW(XZException, "unrecognized xzlib error " << status); - } - } while (stream_.next_out == to); - return static_cast(stream_.next_out) - static_cast(to); + } + + void SetInput(const void *base, std::size_t amount) { + stream_.next_in = static_cast(base); + stream_.avail_in = amount; + if (!amount) action_ = LZMA_FINISH; + } + + const 
lzma_stream &Stream() const { return stream_; } + + bool Process() { + lzma_ret status = lzma_code(&stream_, action_); + if (status == LZMA_STREAM_END) return false; + HandleError(status); + return true; } private: - void ReadInput(ReadCompressed &thunk) { - assert(!stream_.avail_in); - stream_.next_in = static_cast(in_buffer_.get()); - stream_.avail_in = ReadOrEOF(file_.get(), in_buffer_.get(), kInputBuffer); - if (!stream_.avail_in) action_ = LZMA_FINISH; - ReadCount(thunk) += stream_.avail_in; + void HandleError(lzma_ret value) { + switch (value) { + case LZMA_OK: + return; + case LZMA_MEM_ERROR: + throw std::bad_alloc(); + case LZMA_FORMAT_ERROR: + UTIL_THROW(XZException, "xzlib says file format not recognized"); + case LZMA_OPTIONS_ERROR: + UTIL_THROW(XZException, "xzlib says unsupported compression options"); + case LZMA_DATA_ERROR: + UTIL_THROW(XZException, "xzlib says this file is corrupt"); + case LZMA_BUF_ERROR: + UTIL_THROW(XZException, "xzlib says unexpected end of input"); + default: + UTIL_THROW(XZException, "unrecognized xzlib error " << value); + } } - scoped_fd file_; - scoped_malloc in_buffer_; lzma_stream stream_; - lzma_action action_; }; #endif // HAVE_XZLIB @@ -384,66 +344,68 @@ class IStreamReader : public ReadBase { }; enum MagicResult { - UNKNOWN, GZIP, BZIP, XZIP + UTIL_UNKNOWN, UTIL_GZIP, UTIL_BZIP, UTIL_XZIP }; -MagicResult DetectMagic(const void *from_void) { +MagicResult DetectMagic(const void *from_void, std::size_t length) { const uint8_t *header = static_cast(from_void); - if (header[0] == 0x1f && header[1] == 0x8b) { - return GZIP; + if (length >= 2 && header[0] == 0x1f && header[1] == 0x8b) { + return UTIL_GZIP; } - if (!memcmp(header, kBZMagic, sizeof(kBZMagic))) { - return BZIP; + const uint8_t kBZMagic[3] = {'B', 'Z', 'h'}; + if (length >= sizeof(kBZMagic) && !memcmp(header, kBZMagic, sizeof(kBZMagic))) { + return UTIL_BZIP; } const uint8_t kXZMagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 }; - if (!memcmp(header, kXZMagic, 
sizeof(kXZMagic))) { - return XZIP; + if (length >= sizeof(kXZMagic) && !memcmp(header, kXZMagic, sizeof(kXZMagic))) { + return UTIL_XZIP; } - return UNKNOWN; + return UTIL_UNKNOWN; } -ReadBase *ReadFactory(int fd, uint64_t &raw_amount) { +ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, const std::size_t already_size, bool require_compressed) { scoped_fd hold(fd); - unsigned char header[ReadCompressed::kMagicSize]; - raw_amount = ReadOrEOF(fd, header, ReadCompressed::kMagicSize); - if (!raw_amount) - return new Uncompressed(hold.release()); - if (raw_amount != ReadCompressed::kMagicSize) - return new UncompressedWithHeader(hold.release(), header, raw_amount); - switch (DetectMagic(header)) { - case GZIP: + std::string header(reinterpret_cast(already_data), already_size); + if (header.size() < ReadCompressed::kMagicSize) { + std::size_t original = header.size(); + header.resize(ReadCompressed::kMagicSize); + std::size_t got = ReadOrEOF(fd, &header[original], ReadCompressed::kMagicSize - original); + raw_amount += got; + header.resize(original + got); + } + if (header.empty()) { + hold.release(); + return new Complete(); + } + switch (DetectMagic(&header[0], header.size())) { + case UTIL_GZIP: #ifdef HAVE_ZLIB - return new GZip(hold.release(), header, ReadCompressed::kMagicSize); + return new StreamCompressed(hold.release(), header.data(), header.size()); #else UTIL_THROW(CompressedException, "This looks like a gzip file but gzip support was not compiled in."); #endif - case BZIP: + case UTIL_BZIP: #ifdef HAVE_BZLIB - return new BZip(hold.release(), header, ReadCompressed::kMagicSize); + return new StreamCompressed(hold.release(), &header[0], header.size()); #else - UTIL_THROW(CompressedException, "This looks like a bzip file (it begins with BZ), but bzip support was not compiled in."); + UTIL_THROW(CompressedException, "This looks like a bzip file (it begins with BZh), but bzip support was not compiled in."); #endif - case XZIP: + case 
UTIL_XZIP: #ifdef HAVE_XZLIB - return new XZip(hold.release(), header, ReadCompressed::kMagicSize); + return new StreamCompressed(hold.release(), header.data(), header.size()); #else UTIL_THROW(CompressedException, "This looks like an xz file, but xz support was not compiled in."); #endif - case UNKNOWN: - break; + default: + UTIL_THROW_IF(require_compressed, CompressedException, "Uncompressed data detected after a compresssed file. This could be supported but usually indicates an error."); + return new UncompressedWithHeader(hold.release(), header.data(), header.size()); } - try { - SeekOrThrow(fd, 0); - } catch (const util::ErrnoException &e) { - return new UncompressedWithHeader(hold.release(), header, ReadCompressed::kMagicSize); - } - return new Uncompressed(hold.release()); } } // namespace bool ReadCompressed::DetectCompressedMagic(const void *from_void) { - return DetectMagic(from_void) != UNKNOWN; + return DetectMagic(from_void, kMagicSize) != UTIL_UNKNOWN; } ReadCompressed::ReadCompressed(int fd) { @@ -460,7 +422,7 @@ ReadCompressed::~ReadCompressed() {} void ReadCompressed::Reset(int fd) { internal_.reset(); - internal_.reset(ReadFactory(fd, raw_amount_)); + internal_.reset(ReadFactory(fd, raw_amount_, NULL, 0, false)); } void ReadCompressed::Reset(std::istream &in) { diff --git a/util/read_compressed_test.cc b/util/read_compressed_test.cc index 50450a025..301e8f4b4 100644 --- a/util/read_compressed_test.cc +++ b/util/read_compressed_test.cc @@ -113,6 +113,11 @@ BOOST_AUTO_TEST_CASE(ReadXZ) { } #endif +#ifdef HAVE_ZLIB +BOOST_AUTO_TEST_CASE(AppendGZ) { +} +#endif + BOOST_AUTO_TEST_CASE(IStream) { std::string name(WriteRandom()); std::fstream stream(name.c_str(), std::ios::in); From 29f02c597fcfff85fb01e539a3c8fae2a3883b7b Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 30 Jan 2014 15:55:25 -0800 Subject: [PATCH 37/48] Fix progress bar for compressed files --- util/read_compressed.cc | 1 + 1 file changed, 1 insertion(+) diff --git 
a/util/read_compressed.cc b/util/read_compressed.cc index 5b87a6fba..71ef0e251 100644 --- a/util/read_compressed.cc +++ b/util/read_compressed.cc @@ -421,6 +421,7 @@ ReadCompressed::ReadCompressed() {} ReadCompressed::~ReadCompressed() {} void ReadCompressed::Reset(int fd) { + raw_amount_ = 0; internal_.reset(); internal_.reset(ReadFactory(fd, raw_amount_, NULL, 0, false)); } From fbfd67f08b023ff5ef19dc34634d250f07c00563 Mon Sep 17 00:00:00 2001 From: Ian Johnson Date: Mon, 3 Feb 2014 11:41:32 +0000 Subject: [PATCH 38/48] Update README --- contrib/rpm/README | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/contrib/rpm/README b/contrib/rpm/README index 8ba7ef4da..8dd803ff3 100644 --- a/contrib/rpm/README +++ b/contrib/rpm/README @@ -12,12 +12,13 @@ Building the RPM SPEC file The first phase is to construct the RPM SPEC file in $HOME/rpmbuild. The build_source.sh script builds all the artefacts needed to build. This script needs the following information: - The Git repository from which an installer will be built, - - The branch in the Git repository to build, and + - The branch in the Git repository to build, + - The location of Boost on the build machine, and - The version of the installed Moses distribution. -For example, to build the RELEASE-1.0 branch in the mosesdecode repository (git://github.com/moses-smt/mosesdecoder.git): +For example, to build the RELEASE-1.0 branch in the mosesdecoder repository (git://github.com/moses-smt/mosesdecoder.git): -$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELASE-1.0 -v 1.0 +$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELASE-1.0 -v 1.0 -t /usr This builds the source tarballs in the $HOME/rpmbuild/SOURCES directory and the moses.spec file in $HOME/rpmbuild/SPECS. 
From 98fad4a08ad20546af99ceff8e90f98f11b547eb Mon Sep 17 00:00:00 2001 From: Ian Johnson Date: Mon, 3 Feb 2014 12:57:13 +0000 Subject: [PATCH 39/48] RPM installer builder builder (sic) now allows specification of Boost installation. --- contrib/rpm/build_source.sh | 10 +++++++--- contrib/rpm/rpmbuild/SPECS/moses.spec | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/contrib/rpm/build_source.sh b/contrib/rpm/build_source.sh index d0fac6a33..dad753639 100755 --- a/contrib/rpm/build_source.sh +++ b/contrib/rpm/build_source.sh @@ -1,11 +1,13 @@ #!/bin/bash BRANCH="master" +BOOST="/usr" declare -i NO_RPM_BUILD=0 declare -r RPM_VERSION_TAG="___RPM_VERSION__" +declare -r BOOST_TAG="___BOOST_LOCATION__" function usage() { - echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version]" + echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version] -t [Boost install: default ${BOOST}]" exit 1 } @@ -13,11 +15,12 @@ if [ $# -lt 4 ]; then usage fi -while getopts r:b:v:nh OPTION +while getopts r:b:t:v:nh OPTION do case "$OPTION" in r) REPO="${OPTARG}";; b) BRANCH="${OPTARG}";; + t) BOOST="${OPTARG}";; v) VERSION="${OPTARG}";; n) NO_RPM_BUILD=1;; [h\?]) usage;; @@ -53,7 +56,8 @@ if [ ${NO_RPM_BUILD} -eq 0 ]; then if [ ! -d ${HOME}/rpmbuild/SPECS ]; then mkdir -p ${HOME}/rpmbuild/SPECS fi - eval sed s/${RPM_VERSION_TAG}/${VERSION}/ ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec + ESC_BOOST=`echo ${BOOST} | gawk '{gsub(/\//, "\\\\/"); print}'` + eval sed -e \"s/${RPM_VERSION_TAG}/${VERSION}/\" -e \"s/${BOOST_TAG}/${ESC_BOOST}/\" ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec if [ ! 
-d ${HOME}/rpmbuild/SOURCES ]; then mkdir -p ${HOME}/rpmbuild/SOURCES fi diff --git a/contrib/rpm/rpmbuild/SPECS/moses.spec b/contrib/rpm/rpmbuild/SPECS/moses.spec index 0f4a6c6ec..f58668983 100644 --- a/contrib/rpm/rpmbuild/SPECS/moses.spec +++ b/contrib/rpm/rpmbuild/SPECS/moses.spec @@ -35,16 +35,18 @@ cd ../giza-pp make cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 %build -./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2 +./bjam --with-boost=___BOOST_LOCATION__ --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2 %install mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts cp -R bin $RPM_BUILD_ROOT/opt/moses +cp -R scripts/OSM $RPM_BUILD_ROOT/op/moses/scripts +cp -R scripts/Transliteration $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/fuzzy-match $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts -cp -R scripts/regression-testing $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/share $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts @@ -52,12 +54,14 @@ cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts %files %defattr(-,root,root) /opt/moses/bin/* +/opt/moses/scripts/OMS/* +/opt/moses/scripts/Transliteration/* /opt/moses/scripts/analysis/* /opt/moses/scripts/ems/* +/opt/moses/scripts/fuzzy-match/* /opt/moses/scripts/generic/* /opt/moses/scripts/other/* /opt/moses/scripts/recaser/* -/opt/moses/scripts/regression-testing/* /opt/moses/scripts/share/* 
/opt/moses/scripts/tokenizer/* /opt/moses/scripts/training/* From b972fcd767d076b78f5f4e0a968b9fddd1511bd4 Mon Sep 17 00:00:00 2001 From: Ian Johnson Date: Mon, 3 Feb 2014 13:17:58 +0000 Subject: [PATCH 40/48] Typo fixed --- contrib/rpm/rpmbuild/SPECS/moses.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/rpm/rpmbuild/SPECS/moses.spec b/contrib/rpm/rpmbuild/SPECS/moses.spec index f58668983..05e725af0 100644 --- a/contrib/rpm/rpmbuild/SPECS/moses.spec +++ b/contrib/rpm/rpmbuild/SPECS/moses.spec @@ -39,7 +39,7 @@ cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2 %install mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts cp -R bin $RPM_BUILD_ROOT/opt/moses -cp -R scripts/OSM $RPM_BUILD_ROOT/op/moses/scripts +cp -R scripts/OSM $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/Transliteration $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts From caf8018e0a9c22122559c3db741edbcce61fdc4a Mon Sep 17 00:00:00 2001 From: Ian Johnson Date: Mon, 3 Feb 2014 13:35:58 +0000 Subject: [PATCH 41/48] Fix another typo --- contrib/rpm/rpmbuild/SPECS/moses.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/rpm/rpmbuild/SPECS/moses.spec b/contrib/rpm/rpmbuild/SPECS/moses.spec index 05e725af0..8ee84f8dc 100644 --- a/contrib/rpm/rpmbuild/SPECS/moses.spec +++ b/contrib/rpm/rpmbuild/SPECS/moses.spec @@ -54,7 +54,7 @@ cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts %files %defattr(-,root,root) /opt/moses/bin/* -/opt/moses/scripts/OMS/* +/opt/moses/scripts/OSM/* /opt/moses/scripts/Transliteration/* /opt/moses/scripts/analysis/* /opt/moses/scripts/ems/* From 62d641900015cc9f05575f9a6f32eb5788a30906 Mon Sep 17 00:00:00 2001 From: Ian Johnson Date: Mon, 3 Feb 2014 14:00:51 +0000 Subject: [PATCH 42/48] Removed fuzzy-match from scripts for RPM build. 
--- contrib/rpm/rpmbuild/SPECS/moses.spec | 2 -- 1 file changed, 2 deletions(-) diff --git a/contrib/rpm/rpmbuild/SPECS/moses.spec b/contrib/rpm/rpmbuild/SPECS/moses.spec index 8ee84f8dc..9d7933d1f 100644 --- a/contrib/rpm/rpmbuild/SPECS/moses.spec +++ b/contrib/rpm/rpmbuild/SPECS/moses.spec @@ -43,7 +43,6 @@ cp -R scripts/OSM $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/Transliteration $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts -cp -R scripts/fuzzy-match $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts @@ -58,7 +57,6 @@ cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts /opt/moses/scripts/Transliteration/* /opt/moses/scripts/analysis/* /opt/moses/scripts/ems/* -/opt/moses/scripts/fuzzy-match/* /opt/moses/scripts/generic/* /opt/moses/scripts/other/* /opt/moses/scripts/recaser/* From 004a5d604e439de4df5fb454948d5053e15b13c1 Mon Sep 17 00:00:00 2001 From: Ian Johnson Date: Mon, 3 Feb 2014 14:05:40 +0000 Subject: [PATCH 43/48] RPM installer no longer dependent on Boost. --- contrib/rpm/rpmbuild/SPECS/moses.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/rpm/rpmbuild/SPECS/moses.spec b/contrib/rpm/rpmbuild/SPECS/moses.spec index 9d7933d1f..6d3be7cb8 100644 --- a/contrib/rpm/rpmbuild/SPECS/moses.spec +++ b/contrib/rpm/rpmbuild/SPECS/moses.spec @@ -8,7 +8,7 @@ License: LGPL Group: Development/Tools Vendor: Capita Translation and Interpreting Packager: Ian Johnson -Requires: boost >= 1.48, python >= 2.6, perl >= 5 +Requires: python >= 2.6, perl >= 5 BuildRoot: /home/ian/rpmbuild/builds/%{name}-%{version}-%{release} %description Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. 
All you need is a collection of translated texts (parallel corpus). An efficient search algorithm finds quickly the highest probability translation among the exponential number of choices. From 65811a032568931cf09656574edc4bd8e5cda1ce Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Mon, 3 Feb 2014 18:13:10 +0000 Subject: [PATCH 44/48] tree fragments: tiny issues with the extraction pipeline --- phrase-extract/extract-ghkm/ScfgRuleWriter.cpp | 3 ++- scripts/ems/experiment.perl | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp index b2cde6d64..bc8fd7233 100644 --- a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp +++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp @@ -166,8 +166,9 @@ void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out) void ScfgRuleWriter::Write(const ScfgRule &rule, const Subgraph &g) { Write(rule,false); - m_fwd << " Tree "; + m_fwd << " {{Tree "; g.PrintTree(m_fwd); + m_fwd << "}}"; m_fwd << std::endl; m_inv << std::endl; } diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 212260226..d53193f72 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1986,6 +1986,10 @@ sub define_training_extract_phrases { if (&get("TRAINING:use-ghkm")) { $cmd .= "-ghkm "; } + + if (&get("TRAINING:ghkm-tree-fragments")) { + $cmd .= "-ghkm-tree-fragments "; + } } my $extract_settings = &get("TRAINING:extract-settings"); @@ -2013,6 +2017,12 @@ sub define_training_build_ttable { $cmd .= "-no-word-alignment " if defined($word_alignment) && $word_alignment eq "no"; $cmd .= &define_domain_feature_score_option($domains) if &get("TRAINING:domain-features"); + + if (&get("TRAINING:hierarchical-rule-set")) { + if (&get("TRAINING:ghkm-tree-fragments")) { + $cmd .= "-ghkm-tree-fragments "; + } + } &create_step($step_id,$cmd); } From e40fabfad5b20e3aa70e08e8be7da18165e116d1 Mon Sep 17 
00:00:00 2001 From: Matthias Huck Date: Thu, 6 Feb 2014 19:46:32 +0000 Subject: [PATCH 45/48] fixed compile errors in debug mode --- phrase-extract/ExtractionPhrasePair.cpp | 4 ++-- phrase-extract/score-main.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp index e2814f33c..a975b4126 100644 --- a/phrase-extract/ExtractionPhrasePair.cpp +++ b/phrase-extract/ExtractionPhrasePair.cpp @@ -47,8 +47,8 @@ ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource, m_count(count), m_pcfgSum(pcfgSum) { - assert(phraseSource.empty()); - assert(phraseTarget.empty()); + assert(phraseSource->empty()); + assert(phraseTarget->empty()); m_count = count; m_pcfgSum = pcfgSum; diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index 65a12d176..a54433448 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -506,7 +506,7 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb ) { - assert(phrasePair.isValid()); + assert(phrasePair.IsValid()); const ALIGNMENT *bestAlignmentT2S = phrasePair.FindBestAlignmentTargetToSource(); float count = phrasePair.GetCount(); From d78778d1fcb984c0bf8e66d041c11c9050184432 Mon Sep 17 00:00:00 2001 From: Jun-ya NORIMATSU Date: Fri, 7 Feb 2014 15:04:35 +0900 Subject: [PATCH 46/48] prepare for the next version of DALM. 
--- moses/LM/Jamfile | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/moses/LM/Jamfile b/moses/LM/Jamfile index 8c155c5d9..4f964ddd8 100644 --- a/moses/LM/Jamfile +++ b/moses/LM/Jamfile @@ -94,9 +94,16 @@ if $(with-nplm) { local with-dalm = [ option.get "with-dalm" ] ; if $(with-dalm) { lib dalm : : $(with-dalm)/lib ; - lib MurmurHash3 : : $(with-dalm)/lib ; - obj DALM.o : DALMWrapper.cpp dalm MurmurHash3 ..//headers : $(with-dalm)/include $(with-dalm)/darts-clone ; - alias dalmALIAS : DALM.o dalm MurmurHash3 : : : LM_DALM ; + + if [ path.exists $(with-dalm)/lib/libMurmurHash3.a ] { + lib MurmurHash3 : : $(with-dalm)/lib ; + alias dalm-libs : dalm MurmurHash3 ; + } else { + alias dalm-libs : dalm ; + } + + obj DALM.o : DALMWrapper.cpp dalm-libs ..//headers : $(with-dalm)/include $(with-dalm)/darts-clone ; + alias dalmALIAS : DALM.o dalm-libs : : : LM_DALM ; dependencies += dalmALIAS ; lmmacros += LM_DALM ; } From ee06a0f6522d7cdc6bd834032c221e66ab70d841 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Sat, 8 Feb 2014 15:43:00 +0000 Subject: [PATCH 47/48] don't complain if input contains non-escaped '<' or '>', but is not XML --- scripts/recaser/train-truecaser.perl | 8 ++++++++ scripts/recaser/truecase.perl | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/scripts/recaser/train-truecaser.perl b/scripts/recaser/train-truecaser.perl index 8a1ba4c76..59a83ec91 100755 --- a/scripts/recaser/train-truecaser.perl +++ b/scripts/recaser/train-truecaser.perl @@ -86,15 +86,23 @@ sub split_xml { my $i = 0; $MARKUP[0] = ""; while($line =~ /\S/) { + # XML tag if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { $MARKUP[$i] .= $1." "; $line = $2; } + # non-XML text elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) { $WORD[$i++] = $1; $MARKUP[$i] = ""; $line = $2; } + # '<' or '>' occurs in word, but it's not an XML tag + elsif ($line =~ /^\s*(\S+)(.*)$/) { + $WORD[$i++] = $1; + $MARKUP[$i] = ""; + $line = $2; + } else { die("ERROR: huh? 
$line\n"); } diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl index a1340f3b6..22f402196 100755 --- a/scripts/recaser/truecase.perl +++ b/scripts/recaser/truecase.perl @@ -70,15 +70,23 @@ sub split_xml { my $i = 0; $MARKUP[0] = ""; while($line =~ /\S/) { + # XML tag if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { $MARKUP[$i] .= $1." "; $line = $2; } + # non-XML text elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) { $WORD[$i++] = $1; $MARKUP[$i] = ""; $line = $2; } + # '<' or '>' occurs in word, but it's not an XML tag + elsif ($line =~ /^\s*(\S+)(.*)$/) { + $WORD[$i++] = $1; + $MARKUP[$i] = ""; + $line = $2; + } else { die("ERROR: huh? $line\n"); } From 50cadc754f36ea763fef2efab9baaa301f434d07 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 11 Feb 2014 03:43:58 +0000 Subject: [PATCH 48/48] use boost::unordered_map for CacheColl. Marginally faster --- contrib/other-builds/OnDiskPt/.cproject | 8 ++++---- contrib/other-builds/mert_lib/.cproject | 4 ++-- contrib/other-builds/moses/.cproject | 8 ++++---- moses/TranslationModel/PhraseDictionary.cpp | 7 +++---- moses/TranslationModel/PhraseDictionary.h | 3 ++- .../TranslationModel/PhraseDictionaryTransliteration.cpp | 2 +- .../TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/contrib/other-builds/OnDiskPt/.cproject b/contrib/other-builds/OnDiskPt/.cproject index f551380fd..e32a5baea 100644 --- a/contrib/other-builds/OnDiskPt/.cproject +++ b/contrib/other-builds/OnDiskPt/.cproject @@ -11,12 +11,12 @@ - - + + @@ -72,13 +72,13 @@ - - + + diff --git a/contrib/other-builds/mert_lib/.cproject b/contrib/other-builds/mert_lib/.cproject index cc46823a0..463e992bd 100644 --- a/contrib/other-builds/mert_lib/.cproject +++ b/contrib/other-builds/mert_lib/.cproject @@ -11,11 +11,11 @@ - + @@ -64,11 +64,11 @@ - + diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject index ba645f3e7..862a1deb1 100644 --- 
a/contrib/other-builds/moses/.cproject +++ b/contrib/other-builds/moses/.cproject @@ -11,12 +11,12 @@ - - + + @@ -88,13 +88,13 @@ - - + + diff --git a/moses/TranslationModel/PhraseDictionary.cpp b/moses/TranslationModel/PhraseDictionary.cpp index ef91d520f..f42dc5245 100644 --- a/moses/TranslationModel/PhraseDictionary.cpp +++ b/moses/TranslationModel/PhraseDictionary.cpp @@ -58,8 +58,7 @@ const TargetPhraseCollection *PhraseDictionary::GetTargetPhraseCollectionLEGACY( size_t hash = hash_value(src); - std::map >::iterator iter; - + CacheColl::iterator iter; iter = cache.find(hash); if (iter == cache.end()) { @@ -179,7 +178,7 @@ void PhraseDictionary::ReduceCache() const // find cutoff for last used time priority_queue< clock_t > lastUsedTimes; - std::map >::iterator iter; + CacheColl::iterator iter; iter = cache.begin(); while( iter != cache.end() ) { lastUsedTimes.push( iter->second.second ); @@ -193,7 +192,7 @@ void PhraseDictionary::ReduceCache() const iter = cache.begin(); while( iter != cache.end() ) { if (iter->second.second < cutoffLastUsedTime) { - std::map >::iterator iterRemove = iter++; + CacheColl::iterator iterRemove = iter++; delete iterRemove->second.first; cache.erase(iterRemove); } else iter++; diff --git a/moses/TranslationModel/PhraseDictionary.h b/moses/TranslationModel/PhraseDictionary.h index d158d394b..c6639137a 100644 --- a/moses/TranslationModel/PhraseDictionary.h +++ b/moses/TranslationModel/PhraseDictionary.h @@ -30,6 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include #include #include +#include #ifdef WITH_THREADS #include @@ -54,7 +55,7 @@ class ChartCellCollectionBase; class ChartRuleLookupManager; class ChartParser; -class CacheColl : public std::map > +class CacheColl : public boost::unordered_map > { // 1st = hash of source phrase/ address of phrase-table node // 2nd = all translations diff --git a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp 
b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp index b3a1e0296..84ab07532 100644 --- a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp +++ b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp @@ -59,7 +59,7 @@ void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &input CacheColl &cache = GetCache(); - std::map >::iterator iter; + CacheColl::iterator iter; iter = cache.find(hash); if (iter != cache.end()) { diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp index 45b881765..fc3ffff06 100644 --- a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp +++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp @@ -165,7 +165,7 @@ const TargetPhraseCollection *PhraseDictionaryOnDisk::GetTargetPhraseCollection( CacheColl &cache = GetCache(); size_t hash = (size_t) ptNode->GetFilePos(); - std::map >::iterator iter; + CacheColl::iterator iter; iter = cache.find(hash);