From 0fd3ef226377f1a67c7d725ec8acb783c39977b9 Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Fri, 22 May 2015 14:31:23 +0000 Subject: [PATCH 001/286] Optimisation of bitvector - precalculate first gap pos --- moses/WordsBitmap.h | 54 ++++++++++++++++++++++++++++++--------- moses/WordsBitmapTest.cpp | 25 +++++++++++++++++- 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/moses/WordsBitmap.h b/moses/WordsBitmap.h index bf417cdd0..ca17f6ac0 100644 --- a/moses/WordsBitmap.h +++ b/moses/WordsBitmap.h @@ -43,8 +43,10 @@ class WordsBitmap protected: const size_t m_size; /**< number of words in sentence */ bool *m_bitmap; /**< ticks of words that have been done */ + size_t m_firstGap; /** Position of first gap, pre-calculated as it is consulted often */ WordsBitmap(); // not implemented + WordsBitmap& operator= (const WordsBitmap& other); //! set all elements to false void Initialize() { @@ -56,9 +58,39 @@ protected: //sets elements by vector void Initialize(const std::vector& vector) { size_t vector_size = vector.size(); + bool gapFound = false; for (size_t pos = 0 ; pos < m_size ; pos++) { if (pos < vector_size && vector[pos] == true) m_bitmap[pos] = true; - else m_bitmap[pos] = false; + else { + m_bitmap[pos] = false; + if (!gapFound) { + m_firstGap = pos; + gapFound = true; + } + } + } + if (!gapFound) m_firstGap = NOT_FOUND; + } + + /** Update the first gap, when bits are flipped */ + void UpdateFirstGap(size_t startPos, size_t endPos, bool value) { + if (value) { + //may remove gap + if (startPos <= m_firstGap && m_firstGap <= endPos) { + m_firstGap = NOT_FOUND; + for (size_t i = endPos + 1 ; i < m_size; ++i) { + if (!m_bitmap[i]) { + m_firstGap = i; + break; + } + } + } + + } else { + //setting positions to false, may add new gap + if (startPos < m_firstGap) { + m_firstGap = startPos; + } } } @@ -66,23 +98,24 @@ protected: public: //! create WordsBitmap of length size and initialise with vector WordsBitmap(size_t size, const std::vector& initialize_vector) - :m_size (size) { + :m_size (size), m_firstGap(0) { m_bitmap = (bool*) malloc(sizeof(bool) * size); Initialize(initialize_vector); } //! create WordsBitmap of length size and initialise WordsBitmap(size_t size) - :m_size (size) { + :m_size (size), m_firstGap(0) { m_bitmap = (bool*) malloc(sizeof(bool) * size); Initialize(); } //! deep copy WordsBitmap(const WordsBitmap ©) - :m_size (copy.m_size) { + :m_size (copy.m_size), m_firstGap(copy.m_firstGap) { m_bitmap = (bool*) malloc(sizeof(bool) * m_size); for (size_t pos = 0 ; pos < copy.m_size ; pos++) { m_bitmap[pos] = copy.GetValue(pos); } + m_firstGap = copy.m_firstGap; } ~WordsBitmap() { free(m_bitmap); @@ -99,13 +132,7 @@ public: //! position of 1st word not yet translated, or NOT_FOUND if everything already translated size_t GetFirstGapPos() const { - for (size_t pos = 0 ; pos < m_size ; pos++) { - if (!m_bitmap[pos]) { - return pos; - } - } - // no starting pos - return NOT_FOUND; + return m_firstGap; } @@ -141,12 +168,15 @@ public: //! set value at a particular position void SetValue( size_t pos, bool value ) { m_bitmap[pos] = value; + UpdateFirstGap(pos, pos, value); } //! 
set value between 2 positions, inclusive void SetValue( size_t startPos, size_t endPos, bool value ) { - for(size_t pos = startPos ; pos <= endPos ; pos++) + for(size_t pos = startPos ; pos <= endPos ; pos++) { m_bitmap[pos] = value; + } + UpdateFirstGap(startPos, endPos, value); } void diff --git a/moses/WordsBitmapTest.cpp b/moses/WordsBitmapTest.cpp index 3acd1351a..781aa2d62 100644 --- a/moses/WordsBitmapTest.cpp +++ b/moses/WordsBitmapTest.cpp @@ -52,8 +52,16 @@ BOOST_AUTO_TEST_CASE(initialise) } + bitvec[0] = true; + bitvec[1] = true; + WordsBitmap wbm3(7,bitvec); + BOOST_CHECK_EQUAL(wbm3.GetFirstGapPos(),4); + + WordsBitmap wbm4(4,bitvec); + BOOST_CHECK_EQUAL(wbm4.GetFirstGapPos(),NOT_FOUND); } + BOOST_AUTO_TEST_CASE(getset) { WordsBitmap wbm(6); @@ -62,6 +70,7 @@ BOOST_AUTO_TEST_CASE(getset) BOOST_CHECK_EQUAL(wbm.GetValue(2),false); wbm.SetValue(2,true); BOOST_CHECK_EQUAL(wbm.GetValue(2),true); + wbm.SetValue(1,3,true); BOOST_CHECK_EQUAL(wbm.GetValue(1),true); @@ -110,6 +119,20 @@ BOOST_AUTO_TEST_CASE(positions) BOOST_CHECK_EQUAL(wbm.GetLastGapPos(), 9); BOOST_CHECK_EQUAL(wbm.GetLastPos(), 7); + WordsRange wr(2,4); + wbm.SetValue(wr,true); + BOOST_CHECK_EQUAL(wbm.GetFirstGapPos(),5); + + WordsRange wr2(5,8); + wbm.SetValue(wr2,true); + BOOST_CHECK_EQUAL(wbm.GetFirstGapPos(),9); + + wbm.SetValue(9,true); + BOOST_CHECK_EQUAL(wbm.GetFirstGapPos(),NOT_FOUND); + + wbm.SetValue(wr,false); + BOOST_CHECK_EQUAL(wbm.GetFirstGapPos(),2); + WordsBitmap wbm2(2); wbm2.SetValue(0,true); wbm2.SetValue(1,true); @@ -120,8 +143,8 @@ BOOST_AUTO_TEST_CASE(positions) BOOST_CHECK_EQUAL(wbm3.GetLastGapPos(), 4); BOOST_CHECK_EQUAL(wbm3.GetLastPos(), NOT_FOUND); - } + BOOST_AUTO_TEST_SUITE_END() From 502e72ce91e749e3e24480bb0d2692c4bf6b0b83 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 24 May 2015 17:08:57 +0400 Subject: [PATCH 002/286] eclipse --- contrib/other-builds/extract-rules/.cproject | 9 +- contrib/other-builds/extract/.cproject | 9 +- contrib/other-builds/manual-label/.cproject | 132 ---------- contrib/other-builds/manual-label/.project | 27 --- contrib/other-builds/manual-label/DeEn.cpp | 46 ---- contrib/other-builds/manual-label/DeEn.h | 5 - .../manual-label/EnOpenNLPChunker.cpp | 202 ---------------- .../manual-label/EnOpenNLPChunker.h | 29 --- .../manual-label/EnPhrasalVerb.cpp | 226 ------------------ .../other-builds/manual-label/EnPhrasalVerb.h | 11 - .../manual-label/LabelByInitialLetter.cpp | 29 --- .../manual-label/LabelByInitialLetter.h | 6 - contrib/other-builds/manual-label/Main.cpp | 195 --------------- contrib/other-builds/manual-label/Main.h | 27 --- contrib/other-builds/manual-label/Makefile | 14 -- .../manual-label/manual-label.project | 131 ---------- contrib/other-builds/moses/.project | 10 - contrib/other-builds/score/.cproject | 1 - contrib/other-builds/server/.cproject | 5 +- 19 files changed, 12 insertions(+), 1102 deletions(-) delete mode 100644 contrib/other-builds/manual-label/.cproject delete mode 100644 contrib/other-builds/manual-label/.project delete mode 100644 contrib/other-builds/manual-label/DeEn.cpp delete mode 100644 contrib/other-builds/manual-label/DeEn.h delete mode 100644 contrib/other-builds/manual-label/EnOpenNLPChunker.cpp delete mode 100644 contrib/other-builds/manual-label/EnOpenNLPChunker.h delete mode 100644 contrib/other-builds/manual-label/EnPhrasalVerb.cpp delete mode 100644 contrib/other-builds/manual-label/EnPhrasalVerb.h delete mode 100644 contrib/other-builds/manual-label/LabelByInitialLetter.cpp delete mode 100644 
contrib/other-builds/manual-label/LabelByInitialLetter.h delete mode 100644 contrib/other-builds/manual-label/Main.cpp delete mode 100644 contrib/other-builds/manual-label/Main.h delete mode 100644 contrib/other-builds/manual-label/Makefile delete mode 100644 contrib/other-builds/manual-label/manual-label.project diff --git a/contrib/other-builds/extract-rules/.cproject b/contrib/other-builds/extract-rules/.cproject index e79f0f526..86e38979e 100644 --- a/contrib/other-builds/extract-rules/.cproject +++ b/contrib/other-builds/extract-rules/.cproject @@ -5,16 +5,16 @@ + - - + @@ -25,6 +25,7 @@ @@ -60,16 +61,16 @@ + - - + diff --git a/contrib/other-builds/extract/.cproject b/contrib/other-builds/extract/.cproject index 10701cb6e..4c80306be 100644 --- a/contrib/other-builds/extract/.cproject +++ b/contrib/other-builds/extract/.cproject @@ -5,16 +5,16 @@ + - - + @@ -25,6 +25,7 @@ @@ -61,16 +62,16 @@ + - - + diff --git a/contrib/other-builds/manual-label/.cproject b/contrib/other-builds/manual-label/.cproject deleted file mode 100644 index d9297a9fc..000000000 --- a/contrib/other-builds/manual-label/.cproject +++ /dev/null @@ -1,132 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/contrib/other-builds/manual-label/.project b/contrib/other-builds/manual-label/.project deleted file mode 100644 index d0c1dba19..000000000 --- a/contrib/other-builds/manual-label/.project +++ /dev/null @@ -1,27 +0,0 @@ - - - manual-label - - - - - - org.eclipse.cdt.managedbuilder.core.genmakebuilder - clean,full,incremental, - - - - - org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder - full,incremental, - - - - - - org.eclipse.cdt.core.cnature - org.eclipse.cdt.core.ccnature - org.eclipse.cdt.managedbuilder.core.managedBuildNature - org.eclipse.cdt.managedbuilder.core.ScannerConfigNature - - diff --git a/contrib/other-builds/manual-label/DeEn.cpp b/contrib/other-builds/manual-label/DeEn.cpp deleted file mode 100644 index ea2934c5a..000000000 --- a/contrib/other-builds/manual-label/DeEn.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include "DeEn.h" -#include "Main.h" -#include "moses/Util.h" - -using namespace std; - -extern bool g_debug; - -bool Contains(const Phrase &source, int start, int end, int factor, const string &str) -{ - for (int pos = start; pos <= end; ++pos) { - bool found = IsA(source, pos, 0, factor, str); - if (found) { - return true; - } - } - return false; -} - -void LabelDeEn(const Phrase &source, ostream &out) -{ - Ranges ranges; - - // find ranges to label - for (int start = 0; start < source.size(); ++start) { - for (int end = start; end < source.size(); ++end) { - if (IsA(source, start, -1, 1, "VAFIN") - && IsA(source, end, +1, 1, "VVINF VVPP") - && !Contains(source, start, end, 1, "VAFIN VVINF VVPP VVFIN")) { - Range range(start, end, "reorder-label"); - ranges.push_back(range); - } - else if ((start == 0 || IsA(source, start, -1, 1, "$,")) - && IsA(source, end, +1, 0, "zu") - && IsA(source, end, +2, 1, "VVINF") - && !Contains(source, start, end, 1, "$,")) { - Range range(start, end, "reorder-label"); - ranges.push_back(range); - } - } - } - - OutputWithLabels(source, ranges, out); -} - diff --git a/contrib/other-builds/manual-label/DeEn.h b/contrib/other-builds/manual-label/DeEn.h deleted file mode 100644 index c24ce0079..000000000 --- a/contrib/other-builds/manual-label/DeEn.h +++ 
/dev/null @@ -1,5 +0,0 @@ -#pragma once - -#include "Main.h" - -void LabelDeEn(const Phrase &source, std::ostream &out); diff --git a/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp b/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp deleted file mode 100644 index 538aa9746..000000000 --- a/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp +++ /dev/null @@ -1,202 +0,0 @@ -/* - * EnApacheChunker.cpp - * - * Created on: 28 Feb 2014 - * Author: hieu - */ -#include -#include -#include -#include -#include -#include -#include "EnOpenNLPChunker.h" -#include "moses/Util.h" - -using namespace std; -using namespace boost::algorithm; - -EnOpenNLPChunker::EnOpenNLPChunker(const std::string &openNLPPath) -:m_openNLPPath(openNLPPath) -{ - // TODO Auto-generated constructor stub - -} - -EnOpenNLPChunker::~EnOpenNLPChunker() { - // TODO Auto-generated destructor stub -} - -void EnOpenNLPChunker::Process(std::istream &in, std::ostream &out, const vector &filterList) -{ - const boost::filesystem::path - inPath = boost::filesystem::unique_path(), - outPath = boost::filesystem::unique_path(); - // read all input to a temp file - ofstream inFile(inPath.c_str()); - - string line; - while (getline(in, line)) { - Unescape(line); - inFile << line << endl; - } - inFile.close(); - - // execute chunker - string cmd = "cat " + inPath.native() + " | " - + m_openNLPPath + "/bin/opennlp POSTagger " - + m_openNLPPath + "/models/en-pos-maxent.bin | " - + m_openNLPPath + "/bin/opennlp ChunkerME " - + m_openNLPPath + "/models/en-chunker.bin > " - + outPath.native(); - //g << "Executing:" << cmd << endl; - int ret = system(cmd.c_str()); - - // read result of chunker and output as Moses xml trees - ifstream outFile(outPath.c_str()); - - size_t lineNum = 0; - while (getline(outFile, line)) { - //cerr << line << endl; - MosesReformat(line, out, filterList); - out << endl; - ++lineNum; - } - outFile.close(); - - // clean up temporary files - remove(inPath.c_str()); - remove(outPath.c_str()); -} - -void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out, const vector &filterList) -{ - //cerr << "REFORMATING:" << line << endl; - bool inLabel = false; - vector toks; - Moses::Tokenize(toks, line); - for (size_t i = 0; i < toks.size(); ++i) { - const string &tok = toks[i]; - - if (tok.substr(0, 1) == "[" && tok.substr(1,1) != "_") { - // start of chunk - string label = tok.substr(1); - if (UseLabel(label, filterList)) { - out << ""; - inLabel = true; - } - } - else if (ends_with(tok, "]")) { - // end of chunk - if (tok.size() > 1) { - if (tok.substr(1,1) == "_") { - // just a word that happens to be ] - vector factors; - Moses::Tokenize(factors, tok, "_"); - assert(factors.size() == 2); - - Escape(factors[0]); - out << factors[0] << " "; - } - else { - // a word and end of tree - string word = tok.substr(0, tok.size()-1); - - vector factors; - Moses::Tokenize(factors, word, "_"); - assert(factors.size() == 2); - - Escape(factors[0]); - out << factors[0] << " "; - } - - if (inLabel) { - out << " "; - inLabel = false; - } - } - else { - if (inLabel) { - out << " "; - inLabel = false; - } - } - - } - else { - // lexical item - vector factors; - Moses::Tokenize(factors, tok, "_"); - if (factors.size() == 2) { - Escape(factors[0]); - out << factors[0] << " "; - } - else if (factors.size() == 1) { - // word is _ - assert(tok.substr(0, 2) == "__"); - out << "_ "; - } - else { - throw "Unknown format:" + tok; - } - } - } -} - -std::string -replaceAll( std::string const& original, - std::string const& 
before, - std::string const& after ) -{ - std::string retval; - std::string::const_iterator end = original.end(); - std::string::const_iterator current = original.begin(); - std::string::const_iterator next = - std::search( current, end, before.begin(), before.end() ); - while ( next != end ) { - retval.append( current, next ); - retval.append( after ); - current = next + before.size(); - next = std::search( current, end, before.begin(), before.end() ); - } - retval.append( current, next ); - return retval; -} - -void EnOpenNLPChunker::Escape(string &line) -{ - line = replaceAll(line, "&", "&"); - line = replaceAll(line, "|", "|"); - line = replaceAll(line, "<", "<"); - line = replaceAll(line, ">", ">"); - line = replaceAll(line, "'", "'"); - line = replaceAll(line, "\"", """); - line = replaceAll(line, "[", "["); - line = replaceAll(line, "]", "]"); -} - -void EnOpenNLPChunker::Unescape(string &line) -{ - line = replaceAll(line, "|", "|"); - line = replaceAll(line, "<", "<"); - line = replaceAll(line, ">", ">"); - line = replaceAll(line, """, "\""); - line = replaceAll(line, "'", "'"); - line = replaceAll(line, "[", "["); - line = replaceAll(line, "]", "]"); - line = replaceAll(line, "&", "&"); -} - -bool EnOpenNLPChunker::UseLabel(const std::string &label, const std::vector &filterList) const -{ - if (filterList.size() == 0) { - return true; - } - - for (size_t i = 0; i < filterList.size(); ++i) { - if (label == filterList[i]) { - return true; - } - } - return false; -} diff --git a/contrib/other-builds/manual-label/EnOpenNLPChunker.h b/contrib/other-builds/manual-label/EnOpenNLPChunker.h deleted file mode 100644 index df9f90e42..000000000 --- a/contrib/other-builds/manual-label/EnOpenNLPChunker.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * EnApacheChunker.h - * - * Created on: 28 Feb 2014 - * Author: hieu - */ - -#pragma once - -#include -#include -#include - -class EnOpenNLPChunker { -public: - EnOpenNLPChunker(const std::string &openNLPPath); - virtual ~EnOpenNLPChunker(); - void Process(std::istream &in, std::ostream &out, const std::vector &filterList); -protected: - const std::string m_openNLPPath; - - void Escape(std::string &line); - void Unescape(std::string &line); - - void MosesReformat(const std::string &line, std::ostream &out, const std::vector &filterList); - - bool UseLabel(const std::string &label, const std::vector &filterList) const; -}; - diff --git a/contrib/other-builds/manual-label/EnPhrasalVerb.cpp b/contrib/other-builds/manual-label/EnPhrasalVerb.cpp deleted file mode 100644 index 4bee9b941..000000000 --- a/contrib/other-builds/manual-label/EnPhrasalVerb.cpp +++ /dev/null @@ -1,226 +0,0 @@ -#include -#include -#include -#include -#include "EnPhrasalVerb.h" -#include "moses/Util.h" - -using namespace std; - -void EnPhrasalVerb(const Phrase &source, int revision, ostream &out) -{ - Ranges ranges; - - // find ranges to label - for (int start = 0; start < source.size(); ++start) { - size_t end = std::numeric_limits::max(); - - if (IsA(source, start, 0, 0, "ask asked asking")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "back backed backing")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "blow blown blew")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "break broke broken")) { - end = Found(source, start, 0, "down up in"); - } - else if (IsA(source, start, 0, 0, "bring brought bringing")) { - end = Found(source, start, 0, "down up in"); - } - else if (IsA(source, 
start, 0, 0, "call called calling")) { - end = Found(source, start, 0, "back up off"); - } - else if (IsA(source, start, 0, 0, "check checked checking")) { - end = Found(source, start, 0, "out in"); - } - else if (IsA(source, start, 0, 0, "cheer cheered cheering")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "clean cleaned cleaning")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "cross crossed crossing")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "cut cutting")) { - end = Found(source, start, 0, "down off out"); - } - else if (IsA(source, start, 0, 0, "do did done")) { - end = Found(source, start, 0, "over up"); - } - else if (IsA(source, start, 0, 0, "drop dropped dropping")) { - end = Found(source, start, 0, "off"); - } - else if (IsA(source, start, 0, 0, "figure figured figuring")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "fill filled filling")) { - end = Found(source, start, 0, "in out up"); - } - else if (IsA(source, start, 0, 0, "find found finding")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "get got getting gotten")) { - end = Found(source, start, 0, "across over back"); - } - else if (IsA(source, start, 0, 0, "give given gave giving")) { - end = Found(source, start, 0, "away back out up"); - } - else if (IsA(source, start, 0, 0, "hand handed handing")) { - end = Found(source, start, 0, "down in over"); - } - else if (IsA(source, start, 0, 0, "hold held holding")) { - end = Found(source, start, 0, "back up"); - } - else if (IsA(source, start, 0, 0, "keep kept keeping")) { - end = Found(source, start, 0, "from up"); - } - else if (IsA(source, start, 0, 0, "let letting")) { - end = Found(source, start, 0, "down in"); - } - else if (IsA(source, start, 0, 0, "look looked looking")) { - end = Found(source, start, 0, "over up"); - } - else if (IsA(source, start, 0, 0, "make made making")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "mix mixed mixing")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "pass passed passing")) { - end = Found(source, start, 0, "out up"); - } - else if (IsA(source, start, 0, 0, "pay payed paying")) { - end = Found(source, start, 0, "back"); - } - else if (IsA(source, start, 0, 0, "pick picked picking")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "point pointed pointing")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "put putting")) { - end = Found(source, start, 0, "down off out together on"); - } - else if (IsA(source, start, 0, 0, "send sending")) { - end = Found(source, start, 0, "back"); - } - else if (IsA(source, start, 0, 0, "set setting")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "sort sorted sorting")) { - end = Found(source, start, 0, "out"); - } - else if (IsA(source, start, 0, 0, "switch switched switching")) { - end = Found(source, start, 0, "off on"); - } - else if (IsA(source, start, 0, 0, "take took taking")) { - end = Found(source, start, 0, "apart back off out"); - } - else if (IsA(source, start, 0, 0, "tear torn tearing")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "think thought thinking")) { - end = Found(source, start, 0, "over"); - } - else if (IsA(source, start, 0, 0, "thrown threw thrown throwing")) { - end = Found(source, 
start, 0, "away"); - } - else if (IsA(source, start, 0, 0, "turn turned turning")) { - end = Found(source, start, 0, "down off on"); - } - else if (IsA(source, start, 0, 0, "try tried trying")) { - end = Found(source, start, 0, "on out"); - } - else if (IsA(source, start, 0, 0, "use used using")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "warm warmed warming")) { - end = Found(source, start, 0, "up"); - } - else if (IsA(source, start, 0, 0, "work worked working")) { - end = Found(source, start, 0, "out"); - } - - // found range to label - if (end != std::numeric_limits::max() && - end > start + 1) { - bool add = true; - if (revision == 1 && Exist(source, - start + 1, - end - 1, - 1, - "VB VBD VBG VBN VBP VBZ")) { - // there's a verb in between - add = false; - } - - if (add) { - Range range(start + 1, end - 1, "reorder-label"); - ranges.push_back(range); - } - } - } - - OutputWithLabels(source, ranges, out); -} - -bool Exist(const Phrase &source, int start, int end, int factor, const std::string &str) -{ - vector soughts = Moses::Tokenize(str, " "); - for (size_t i = start; i <= end; ++i) { - const Word &word = source[i]; - bool found = Found(word, factor, soughts); - if (found) { - return true; - } - } - - return false; -} - -size_t Found(const Phrase &source, int pos, int factor, const std::string &str) -{ - const size_t MAX_RANGE = 10; - - vector soughts = Moses::Tokenize(str, " "); - vector puncts = Moses::Tokenize(". : , ;", " "); - - - size_t maxEnd = std::min(source.size(), (size_t) pos + MAX_RANGE); - for (size_t i = pos + 1; i < maxEnd; ++i) { - const Word &word = source[i]; - bool found; - - found = Found(word, factor, puncts); - if (found) { - return std::numeric_limits::max(); - } - - found = Found(word, factor, soughts); - if (found) { - return i; - } - } - - return std::numeric_limits::max(); -} - - -bool Found(const Word &word, int factor, const vector &soughts) -{ - const string &element = word[factor]; - for (size_t i = 0; i < soughts.size(); ++i) { - const string &sought = soughts[i]; - bool found = (element == sought); - if (found) { - return true; - } - } - return false; -} - - diff --git a/contrib/other-builds/manual-label/EnPhrasalVerb.h b/contrib/other-builds/manual-label/EnPhrasalVerb.h deleted file mode 100644 index 4cb5f7348..000000000 --- a/contrib/other-builds/manual-label/EnPhrasalVerb.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include "Main.h" - -// roll your own identification of phrasal verbs -void EnPhrasalVerb(const Phrase &source, int revision, std::ostream &out); - -bool Exist(const Phrase &source, int start, int end, int factor, const std::string &str); -size_t Found(const Phrase &source, int pos, int factor, const std::string &str); -bool Found(const Word &word, int factor, const std::vector &soughts); - diff --git a/contrib/other-builds/manual-label/LabelByInitialLetter.cpp b/contrib/other-builds/manual-label/LabelByInitialLetter.cpp deleted file mode 100644 index e4136a7ea..000000000 --- a/contrib/other-builds/manual-label/LabelByInitialLetter.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include "LabelByInitialLetter.h" -#include "Main.h" - -using namespace std; - -void LabelByInitialLetter(const Phrase &source, std::ostream &out) -{ - Ranges ranges; - - for (int start = 0; start < source.size(); ++start) { - const string &startWord = source[start][0]; - string startChar = startWord.substr(0,1); - - for (int end = start + 1; end < source.size(); ++end) { - const string &endWord = source[end][0]; - string endChar 
= endWord.substr(0,1); - - if (startChar == endChar) { - Range range(start, end, startChar + "-label"); - ranges.push_back(range); - } - } - } - - OutputWithLabels(source, ranges, out); - -} - - diff --git a/contrib/other-builds/manual-label/LabelByInitialLetter.h b/contrib/other-builds/manual-label/LabelByInitialLetter.h deleted file mode 100644 index ba8d34c19..000000000 --- a/contrib/other-builds/manual-label/LabelByInitialLetter.h +++ /dev/null @@ -1,6 +0,0 @@ -#pragma once - -#include "Main.h" - -void LabelByInitialLetter(const Phrase &source, std::ostream &out); - diff --git a/contrib/other-builds/manual-label/Main.cpp b/contrib/other-builds/manual-label/Main.cpp deleted file mode 100644 index 896f70590..000000000 --- a/contrib/other-builds/manual-label/Main.cpp +++ /dev/null @@ -1,195 +0,0 @@ -#include -#include -#include -#include "moses/Util.h" -#include "Main.h" -#include "DeEn.h" -#include "EnPhrasalVerb.h" -#include "EnOpenNLPChunker.h" -#include "LabelByInitialLetter.h" - -using namespace std; - -bool g_debug = false; - -Phrase Tokenize(const string &line); - -int main(int argc, char** argv) -{ - cerr << "Starting" << endl; - - namespace po = boost::program_options; - po::options_description desc("Options"); - desc.add_options() - ("help", "Print help messages") - - ("input,i", po::value(), "Input file. Otherwise it will read from standard in") - ("output,o", po::value(), "Output file. Otherwise it will print from standard out") - - ("source-language,s", po::value()->required(), "Source Language") - ("target-language,t", po::value()->required(), "Target Language") - ("revision,r", po::value()->default_value(0), "Revision") - ("filter", po::value(), "Only use labels from this comma-separated list") - - ("opennlp", po::value()->default_value(""), "Path to Apache OpenNLP toolkit") - - ; - - po::variables_map vm; - try - { - po::store(po::parse_command_line(argc, argv, desc), - vm); // can throw - - /** --help option - */ - if ( vm.count("help") ) - { - std::cout << "Basic Command Line Parameter App" << std::endl - << desc << std::endl; - return EXIT_SUCCESS; - } - - po::notify(vm); // throws on error, so do after help in case - // there are any problems - } - catch(po::error& e) - { - std::cerr << "ERROR: " << e.what() << std::endl << std::endl; - std::cerr << desc << std::endl; - return EXIT_FAILURE; - } - - istream *inStrm = &cin; - if (vm.count("input")) { - string inStr = vm["input"].as(); - cerr << "inStr=" << inStr << endl; - ifstream *inFile = new ifstream(inStr.c_str()); - inStrm = inFile; - } - - ostream *outStrm = &cout; - if (vm.count("output")) { - string outStr = vm["output"].as(); - cerr << "outStr=" << outStr << endl; - ostream *outFile = new ofstream(outStr.c_str()); - outStrm = outFile; - } - - vector filterList; - if (vm.count("filter")) { - string filter = vm["filter"].as(); - Moses::Tokenize(filterList, filter, ","); - } - - string sourceLang = vm["source-language"].as(); - string targetLang = vm["target-language"].as(); - int revision = vm["revision"].as(); - - cerr << sourceLang << " " << targetLang << " " << revision << endl; - - if (sourceLang == "en" && revision == 2) { - if (vm.count("opennlp") == 0) { - throw "Need path to openNLP toolkit"; - } - - string openNLPPath = vm["opennlp"].as(); - EnOpenNLPChunker chunker(openNLPPath); - chunker.Process(*inStrm, *outStrm, filterList); - } - else { - // process line-by-line - string line; - size_t lineNum = 1; - - while (getline(*inStrm, line)) { - //cerr << lineNum << ":" << line << endl; - if (lineNum % 1000 
== 0) { - cerr << lineNum << " "; - } - - Phrase source = Tokenize(line); - - if (revision == 600 ) { - LabelByInitialLetter(source, *outStrm); - } - else if (sourceLang == "de" && targetLang == "en") { - LabelDeEn(source, *outStrm); - } - else if (sourceLang == "en") { - if (revision == 0 || revision == 1) { - EnPhrasalVerb(source, revision, *outStrm); - } - else if (revision == 2) { - string openNLPPath = vm["opennlp-path"].as(); - EnOpenNLPChunker chunker(openNLPPath); - } - } - - ++lineNum; - } - } - - - cerr << "Finished" << endl; - return EXIT_SUCCESS; -} - -Phrase Tokenize(const string &line) -{ - Phrase ret; - - vector toks = Moses::Tokenize(line); - for (size_t i = 0; i < toks.size(); ++i) { - Word word = Moses::Tokenize(toks[i], "|"); - ret.push_back(word); - } - - return ret; -} - -bool IsA(const Phrase &source, int pos, int offset, int factor, const string &str) -{ - pos += offset; - if (pos >= source.size() || pos < 0) { - return false; - } - - const string &word = source[pos][factor]; - vector soughts = Moses::Tokenize(str, " "); - for (int i = 0; i < soughts.size(); ++i) { - string &sought = soughts[i]; - bool found = (word == sought); - if (found) { - return true; - } - } - return false; -} - - -void OutputWithLabels(const Phrase &source, const Ranges ranges, ostream &out) -{ - // output sentence, with labels - for (int pos = 0; pos < source.size(); ++pos) { - // output beginning of label - for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) { - const Range &range = *iter; - if (range.range.first == pos) { - out << " "; - } - } - - const Word &word = source[pos]; - out << word[0] << " "; - - for (Ranges::const_iterator iter = ranges.begin(); iter != ranges.end(); ++iter) { - const Range &range = *iter; - if (range.range.second == pos) { - out << " "; - } - } - } - out << endl; - -} diff --git a/contrib/other-builds/manual-label/Main.h b/contrib/other-builds/manual-label/Main.h deleted file mode 100644 index 036da0d45..000000000 --- a/contrib/other-builds/manual-label/Main.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -typedef std::vector Word; -typedef std::vector Phrase; - -struct Range -{ - Range(int start,int end, const std::string &l) - :range(start, end) - ,label(l) - {} - - std::pair range; - std::string label; -}; - -typedef std::list Ranges; - -bool IsA(const Phrase &source, int pos, int offset, int factor, const std::string &str); -void OutputWithLabels(const Phrase &source, const Ranges ranges, std::ostream &out); - - diff --git a/contrib/other-builds/manual-label/Makefile b/contrib/other-builds/manual-label/Makefile deleted file mode 100644 index f24d69dc7..000000000 --- a/contrib/other-builds/manual-label/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -all: manual-label - -clean: - rm -f *.o manual-label - -.cpp.o: - g++ -I../../../boost/include -I../../../ -O3 -g -c $< - -OBJECTS = DeEn.o EnOpenNLPChunker.o EnPhrasalVerb.o Main.o LabelByInitialLetter.o - -manual-label: $(OBJECTS) - g++ $(OBJECTS) -L../../../boost/lib64 -lz -lboost_program_options-mt -o manual-label - - diff --git a/contrib/other-builds/manual-label/manual-label.project b/contrib/other-builds/manual-label/manual-label.project deleted file mode 100644 index 5c678561a..000000000 --- a/contrib/other-builds/manual-label/manual-label.project +++ /dev/null @@ -1,131 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - None - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - None - - - - - - - - - - - - - - - - - diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index 7ed5723ea..389f71297 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -1140,16 +1140,6 @@ 1 PARENT-3-PROJECT_LOC/moses/FF/DynamicCacheBasedLanguageModel.h - - FF/ExternalFeature.cpp - 1 - PARENT-3-PROJECT_LOC/moses/FF/ExternalFeature.cpp - - - FF/ExternalFeature.h - 1 - PARENT-3-PROJECT_LOC/moses/FF/ExternalFeature.h - FF/FFState.cpp 1 diff --git a/contrib/other-builds/score/.cproject b/contrib/other-builds/score/.cproject index 78a5e13f9..d904122eb 100644 --- a/contrib/other-builds/score/.cproject +++ b/contrib/other-builds/score/.cproject @@ -59,7 +59,6 @@ - diff --git a/contrib/other-builds/server/.cproject b/contrib/other-builds/server/.cproject index 688221af6..78c5185f9 100644 --- a/contrib/other-builds/server/.cproject +++ b/contrib/other-builds/server/.cproject @@ -75,7 +75,6 @@ - @@ -159,10 +158,10 @@ - + - + From f024eede74e7c911abdff96cbdb376aa9101f589 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 3 Jun 2015 12:51:44 +0100 Subject: [PATCH 003/286] Added ca() as short replacement for approxOccurrenceCount() to tsa_tree_iterator. --- moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h index 053ff2445..afc71b5fd 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h @@ -155,7 +155,8 @@ namespace ugdiss } }; - double approxOccurrenceCount(int p=-1) const + double + ca(int p=-1) const // approximate occurrence count { assert(root); if (p < 0) p += lower.size(); @@ -167,6 +168,13 @@ namespace ugdiss return ret; } + inline + double + approxOccurrenceCount(int p=-1) const // deprecated, use ca() + { + return ca(); + } + size_t grow(Token const* t, Token const* stop) { while ((t != stop) && extend(*t)) t = t->next(); From debdd218995ce2b4582998af7b7f22b3c3a7af3f Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 3 Jun 2015 12:52:18 +0100 Subject: [PATCH 004/286] Optional initialization of SentenceBias. 
--- moses/TranslationModel/UG/mm/ug_sampling_bias.cc | 2 +- moses/TranslationModel/UG/mm/ug_sampling_bias.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index da408dfb3..0c81a1b91 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -152,7 +152,7 @@ namespace Moses : m_bias(bias) { } SentenceBias - ::SentenceBias(size_t const s) : m_bias(s) { } + ::SentenceBias(size_t const s, float const f) : m_bias(s,f) { } id_type SentenceBias diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index f540ddc76..f91a3c91b 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -72,7 +72,7 @@ namespace Moses std::vector m_bias; public: SentenceBias(std::vector const& bias); - SentenceBias(size_t const s); + SentenceBias(size_t const s, float const f = 0); id_type GetClass(id_type idx) const; From 0afe1398108c06e4ac23e80dfe709745a650de36 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 3 Jun 2015 12:55:58 +0100 Subject: [PATCH 005/286] Initial check-in. --- moses/TranslationModel/UG/fuzzy.cc | 177 +++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 moses/TranslationModel/UG/fuzzy.cc diff --git a/moses/TranslationModel/UG/fuzzy.cc b/moses/TranslationModel/UG/fuzzy.cc new file mode 100644 index 000000000..61f4dc898 --- /dev/null +++ b/moses/TranslationModel/UG/fuzzy.cc @@ -0,0 +1,177 @@ +// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*- +#include +#include +#include +#include "mm/ug_bitext.h" +#include "mm/tpt_typedefs.h" +#include "mm/ug_prime_sampling1.h" +#include "generic/sorting/VectorIndexSorter.h" +#include "generic/sorting/NBestList.h" +#include + +using namespace std; +using namespace Moses; +using namespace Moses::bitext; +namespace po=boost::program_options; +using namespace boost::algorithm; +typedef L2R_Token Token; +typedef mmBitext mmbitext; +typedef Bitext::tsa tsa; +typedef imTtrack imttrack; +typedef imTSA imtsa; + +string bname, bname1, bname2, L1, L2, Q1, Q2; +size_t maxhits; +void interpret_args(int ac, char* av[]); + +TokenIndex V1; +TokenIndex V2; +sptr > C1; +sptr > C2; +mmTSA I1; + +void +open_bitext() +{ + C1.reset(new mmTtrack); + if (L2.size()) + { + bname1 = bname + L1 + "."; + bname2 = bname + L2 + "."; + } + else if (L1.size()) + { + bname1 = bname; + bname2 = L1; + } + else bname1 = bname; + + if (bname2.size()) C2.reset(new mmTtrack); + + C1->open(bname1+"mct"); + I1.open(bname1+"sfa", C1); + V1.open(bname1+"tdx"); + V1.setDynamic(true); + + if (bname2.size()) + { + C2->open(bname2+"mct"); + V2.open(bname2+"tdx"); + } + +} + +sptr +read_input() +{ + sptr > > crp(new vector >); + crp->reserve(1000); + string line; + while (getline(cin,line)) + { + crp->push_back(vector()); + fill_token_seq(V1, line, crp->back()); + } + sptr ret(new imttrack (crp)); + return ret; +} + +sptr > > +nbest(TSA::tree_iterator const& r, vector const& hits, + vector& score, VectorIndexSorter& sorter, + size_t const nbest_size) +{ + typedef NBestList > nbest_list_t; + sptr ret(new nbest_list_t(nbest_size, sorter)); + bitvector mycheck(hits.size()); + tsa::ArrayEntry I(r.lower_bound(-1)); + char const* stop = r.upper_bound(-1); + while (I.next < stop) + { + r.root->readEntry(I.next,I); + if (mycheck[I.sid]) continue; + score[I.sid] = hits[I.sid] / 
r.root->getCorpus()->sntLen(I.sid); + ret->add(I.sid); + mycheck.set(I.sid); + } + return ret; +} + +int main(int argc, char* argv[]) +{ + interpret_args(argc, argv); + open_bitext(); + sptr icrp = read_input(); + imtsa newIdx(icrp,NULL); + sptr hits = prime_sampling1(I1, newIdx, 1000); + vector score(hits->size()); + VectorIndexSorter sorter(score); + for (size_t s = 0; s < icrp->size(); ++s) + { + size_t stop = icrp->sntLen(s); + Token const* t = icrp->sntStart(s); + cout << string(80,'-') << "\n" << toString(V1, t, stop) << endl; + for (size_t i = 0; i < stop; ++i) + { + TSA::tree_iterator r(&I1); + for (size_t k = i; k < stop && r.extend(t[k].id()); ++k) + { + if (r.ca() < 3) continue; + cout << "\n" << r.str(&V1) << " " << int(r.ca()) << endl; + if (r.ca() > 10000) continue; + sptr > > top; + top = nbest(r, *hits, score, sorter, 5); + for (size_t n = 0; n < top->size(); ++n) + { + cout << "[" << n << ": " << score[(*top)[n]] + << " (" << (*hits)[(*top)[n]] << "/" << C1->sntLen((*top)[n]) << ")]\n" + << toString(V1, C1->sntStart((*top)[n]), C1->sntLen((*top)[n])) << "\n"; + if (C2) cout << toString(V2, C2->sntStart((*top)[n]), C2->sntLen((*top)[n])) << "\n"; + cout << endl; + } + } + } + + } +} + +void +interpret_args(int ac, char* av[]) +{ + po::variables_map vm; + po::options_description o("Options"); + o.add_options() + + ("help,h", "print this message") + ("maxhits,n", po::value(&maxhits)->default_value(25), + "max. number of hits") + ("q1", po::value(&Q1), "query in L1") + ("q2", po::value(&Q2), "query in L2") + ; + + po::options_description h("Hidden Options"); + h.add_options() + ("bname", po::value(&bname), "base name of corpus") + ("L1", po::value(&L1), "L1 tag") + ("L2", po::value(&L2), "L2 tag") + ; + + h.add(o); + po::positional_options_description a; + a.add("bname",1); + a.add("L1",1); + a.add("L2",1); + + po::store(po::command_line_parser(ac,av) + .options(h) + .positional(a) + .run(),vm); + po::notify(vm); + if (vm.count("help")) + { + cout << "\nusage:\n\t" << av[0] + << " [options] [--q1=] [--q2=]" << endl; + cout << o << endl; + exit(0); + } +} From 83fa1b6a88a8490f819ca340169fe023a211eb9e Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 3 Jun 2015 12:59:32 +0100 Subject: [PATCH 006/286] Initial check-in. --- .../UG/mm/ug_prime_sampling1.h | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 moses/TranslationModel/UG/mm/ug_prime_sampling1.h diff --git a/moses/TranslationModel/UG/mm/ug_prime_sampling1.h b/moses/TranslationModel/UG/mm/ug_prime_sampling1.h new file mode 100644 index 000000000..a0c609bfc --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_prime_sampling1.h @@ -0,0 +1,93 @@ +// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*- +// Functions for "priming" sample selection for sampling phrase tables. 
+// Author: Ulrich Germann +#pragma once +#include "ug_bitext.h" +#includde "ug_sampling_bias.h" +#include +// #ifndef NO_MOSES +namespace Moses { +// #endif +namespace bitext +{ + +typedef L2R_Token Token; +typedef mmBitext mmbitext; +typedef Bitext::tsa tsa; +typedef imTtrack imttrack; +typedef imTSA imtsa; + +template +void +mark(typename TSA::tree_iterator const& r, SentenceBias>& hits) +{ + char const* stop = r.upper_bound(-1); + for (tsa::ArrayEntry I(r.lower_bound(-1)); I.next < stop;) + { + r.root->readEntry(I.next,I); + size_t slen = r.root->getCorpus()->sntLen(I.sid); + hits[I.sid] += 1./(r.ca() * slen); + } +} + +template +bool +process(typename TSA::tree_iterator& m, + typename TSA::tree_iterator& r, + typename std::vector & hits, + size_t const max_count=1000) +{ + if (m.down()) + { + do + { + if (r.extend(m.getToken(-1)->id())) + { + if (r.approxOccurrenceCount() > max_count) + // don't mark phrases that occur very often + process(m, r, hits, max_count); + else mark(r,hits); + r.up(); + } + else if (r.size() && r.size() == 1) // && r.ca() < max_count) + mark(r,hits); + } + while (m.over()); + m.up(); + } +} + +template +sptr +prime_sampling1(TSA const& refIdx, + TSA const& newIdx, + size_t const max_count) +{ + typename TSA::tree_iterator m(&newIdx); + typename TSA::tree_iterator r(&refIdx); + sptr > ret; + ret.reset(new SentenceBias(refIdx.getCorpus()->size(),0)); + process(m, r, *ret, max_count); + return ret; +} + +template +sptr +prime_sampling1(TokenIndex& V, TSA const& refIdx, + typename std::vector const& input, + size_t const max_count) +{ + sptr > > crp; + crp.reset(new typename std::vector >(input.size())); + for (size_t i = 0; i < input.size(); ++i) + fill_token_seq(V, input[i], (*crp)[i]); + sptr idoc(new imttrack(crp)); + imtsa newIdx(idoc,NULL); + return prime_sampling1(refIdx, newIdx, max_count); +} + +} // end of namespace bitext +// #ifndef NO_MOSES +} // end of namespace Moses +// #endif + From 5a56a5b4964a7ee566e610991fe667c713495111 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 16:20:08 +0100 Subject: [PATCH 007/286] Added target for forced relinking only (no forced recompilation); temporarily disabled tcmalloc. 
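An aside on the ug_prime_sampling1.h header introduced in PATCH 006 above: each sentence of the reference corpus is scored by accumulating 1 / (phrase frequency * sentence length) for every input phrase found in it, so short sentences containing rare matching phrases rank highest. Below is a minimal self-contained sketch of that scoring rule on toy counts; the names are illustrative only and not part of the project API.

#include <iostream>
#include <vector>

int main()
{
  // toy reference corpus: per-sentence lengths
  std::vector<double> sntLen = {12, 5, 30};
  std::vector<double> bias(sntLen.size(), 0);

  // suppose one input phrase occurs 4 times in the reference corpus (r.ca()),
  // once in sentence 1 and three times in sentence 2
  struct Hit { int sid; };
  std::vector<Hit> hits = {{1}, {2}, {2}, {2}};
  double ca = 4;
  for (const Hit& h : hits)
    bias[h.sid] += 1.0 / (ca * sntLen[h.sid]);  // same rule as mark() above

  for (size_t i = 0; i < bias.size(); ++i)
    std::cout << "sentence " << i << ": bias " << bias[i] << "\n";
  return 0;
}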
--- moses/TranslationModel/UG/Makefile | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/moses/TranslationModel/UG/Makefile b/moses/TranslationModel/UG/Makefile index ed1dead52..5ad555094 100644 --- a/moses/TranslationModel/UG/Makefile +++ b/moses/TranslationModel/UG/Makefile @@ -60,7 +60,8 @@ nil: # libraries required -LIBS = m z bz2 pthread dl ${BOOSTLIBS} +LIBS = m z bz2 pthread dl ${BOOSTLIBS} +#LIBS += tcmalloc BOOSTLIBS := thread system filesystem program_options iostreams BOOSTLIBS := $(addprefix boost_,${BOOSTLIBS}) ifdef ($(BOOSTLIBTAG),"") @@ -102,13 +103,13 @@ skip += spe-check-coverage3.cc skip += mmsapt.cpp skip += ug_stringdist.cc skip += ug_splice_arglist.cc -skip += ug_lexical_reordering.cc -skip += ug_sampling_bias.cc +# skip += ug_lexical_reordering.cc # objects from elsewhere in the moses tree that are needed extra = ${MOSES_ROOT}/util/exception.cc $(foreach f,$(skip),$(eval broken+=$(shell find -name $f))) +broken += $(wildcard ./mm/stashed/*) $(info SCANNING DIRECTORY TREE FOR FILES) find_cfiles = find -name '*.cc' -or -name '*.cpp' @@ -122,5 +123,11 @@ LIBOBJ = $(call cc2obj,$(filter-out $(PROGRAMS),$(CFILES) $(extra))) $(foreach f,$(CFILES) $(extra),$(eval $(call compile,$f))) $(foreach p,$(PROGRAMS),$(eval $(call build,$p))) +ifeq ($(filter relink,$(MAKECMDGOALS)),relink) +.PHONY: relink +$(foreach p,$(PROGRAMS),$(eval .PHONY: $(call cc2exe,$p))) +relink: $(filter-out relink,$(MAKECMDGOALS)) +endif + -include $(DEP) From 5cb1d95e098f490ca61560c2c6a39acd45c27f09 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 16:21:09 +0100 Subject: [PATCH 008/286] Added member function for retrieving nbest list items without sorting. --- .../TranslationModel/UG/generic/sorting/NBestList.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/moses/TranslationModel/UG/generic/sorting/NBestList.h b/moses/TranslationModel/UG/generic/sorting/NBestList.h index ae0e35a98..c9490729f 100644 --- a/moses/TranslationModel/UG/generic/sorting/NBestList.h +++ b/moses/TranslationModel/UG/generic/sorting/NBestList.h @@ -1,3 +1,4 @@ +// -*- mode: c++; tab-width: 2; indent-tabs-mode:nil; -*- #ifndef __n_best_list_h #define __n_best_list_h #include @@ -27,6 +28,7 @@ public: NBestList(size_t const max_size); bool add(THINGY const& item); THINGY const& operator[](int i) const; + THINGY const& get_unsorted(int i) const; size_t size() const { return m_heap.size(); } @@ -81,5 +83,15 @@ operator[](int i) const return m_list[m_order.at(i)]; } +template +THINGY const& +NBestList:: +get_unsorted(int i) const +{ + if (i < 0) i += m_heap.size(); + return m_list[m_heap.at(i)]; +} + + } #endif From 576c743aee6da2cabe066422575d367f6dc7b12c Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 16:22:03 +0100 Subject: [PATCH 009/286] Simplified #include. 
--- .../UG/generic/threading/ug_thread_safe_counter.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc b/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc index b4565f99d..e16b7f6e2 100644 --- a/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc +++ b/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc @@ -1,4 +1,7 @@ -#include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h" +// #include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h" +#include "ug_thread_safe_counter.h" +// obsolete once can be assumed to be available everywhere + namespace Moses { ThreadSafeCounter:: From 243a6a8b3b88462295687186343999a3a1229b94 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 16:23:00 +0100 Subject: [PATCH 010/286] Added #define for intrusive pointer. --- moses/TranslationModel/UG/mm/ug_typedefs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/moses/TranslationModel/UG/mm/ug_typedefs.h b/moses/TranslationModel/UG/mm/ug_typedefs.h index 0181bef9e..ba52cdbaa 100644 --- a/moses/TranslationModel/UG/mm/ug_typedefs.h +++ b/moses/TranslationModel/UG/mm/ug_typedefs.h @@ -31,6 +31,7 @@ namespace ugdiss } #define sptr boost::shared_ptr +#define iptr boost::intrusive_ptr #define scoptr boost::scoped_ptr #define rcast reinterpret_cast #endif From 47fa99b61b4d3fbedfcf22e12923ffdcb481e1b6 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 16:26:47 +0100 Subject: [PATCH 011/286] Added member function size() to LRU_Cache. --- moses/TranslationModel/UG/mm/ug_lru_cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mm/ug_lru_cache.h b/moses/TranslationModel/UG/mm/ug_lru_cache.h index 0000b194f..43e42b568 100644 --- a/moses/TranslationModel/UG/mm/ug_lru_cache.h +++ b/moses/TranslationModel/UG/mm/ug_lru_cache.h @@ -63,6 +63,7 @@ namespace lru_cache public: LRU_Cache(size_t capacity=1) : m_qfront(0), m_qback(0) { reserve(capacity); } size_t capacity() const { return m_recs.capacity(); } + size_t size() const { return m_idx.size(); } void reserve(size_t s) { m_recs.reserve(s); } sptr @@ -86,7 +87,6 @@ namespace lru_cache boost::lock_guard lock(m_lock); pair foo; foo = m_idx.insert(make_pair(key,m_recs.size())); - uint32_t p = foo.first->second; if (foo.second) // was not in the cache { From 1b4b3a510304b7b926b0e8f04c9035b5ff10ab6b Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 16:27:49 +0100 Subject: [PATCH 012/286] Mmsapt: btfix now instatiated via intrusive pointer ... to prevent deletion while Mmsapt is live. 
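The motivation is lifetime safety: every component that copies the boost::intrusive_ptr, in particular the standalone bitext samplers introduced later in this series, bumps a reference count stored in the bitext itself, so the object cannot be deleted while a copy is still held. The reference_counter base class added in PATCH 015 below supplies the two free functions boost::intrusive_ptr needs. A minimal sketch of the pattern, assuming only Boost; the plain int counter stands in for the thread-safe counter used in the real class.

#include <boost/intrusive_ptr.hpp>
#include <iostream>

struct Counted                      // stand-in for reference_counter / Bitext
{
  int refs;                         // the real class uses a thread-safe counter
  Counted() : refs(0) {}
  ~Counted() { std::cout << "destroyed\n"; }
};

void intrusive_ptr_add_ref(Counted* p) { ++p->refs; }
void intrusive_ptr_release(Counted* p) { if (--p->refs == 0) delete p; }

int main()
{
  boost::intrusive_ptr<Counted> owner(new Counted);   // e.g. Mmsapt::btfix
  {
    boost::intrusive_ptr<Counted> user = owner;       // e.g. a sampler holding the bitext
    std::cout << "refs while sampling: " << owner->refs << "\n";  // prints 2
  }                                                   // sampler done: back to 1
  return 0;                                           // owner released: object destroyed
}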
--- moses/TranslationModel/UG/mmsapt.cpp | 33 ++++++++++++++-------------- moses/TranslationModel/UG/mmsapt.h | 3 ++- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index f05c0d59b..787ac76db 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -72,6 +72,7 @@ namespace Moses Mmsapt:: Mmsapt(string const& line) : PhraseDictionary(line, false) + , btfix(new mmbitext) , m_bias_log(NULL) , m_bias_loglevel(0) , m_lr_func(NULL) @@ -288,7 +289,7 @@ namespace Moses Mmsapt:: load_bias(string const fname) { - m_bias = btfix.loadSentenceBias(fname); + m_bias = btfix->loadSentenceBias(fname); } void @@ -457,11 +458,11 @@ namespace Moses // corpus and one in-memory dynamic corpus boost::unique_lock lock(m_lock); - btfix.m_num_workers = this->m_workers; - btfix.open(m_bname, L1, L2); - btfix.setDefaultSampleSize(m_default_sample_size); + btfix->m_num_workers = this->m_workers; + btfix->open(m_bname, L1, L2); + btfix->setDefaultSampleSize(m_default_sample_size); - btdyn.reset(new imbitext(btfix.V1, btfix.V2, m_default_sample_size, m_workers)); + btdyn.reset(new imbitext(btfix->V1, btfix->V2, m_default_sample_size, m_workers)); if (m_bias_file.size()) load_bias(m_bias_file); @@ -530,7 +531,7 @@ namespace Moses else if (dyn) { PhrasePair zilch; zilch.init(); - TSA::tree_iterator m(btfix.I2.get(), dyn->start2, dyn->len2); + TSA::tree_iterator m(btfix->I2.get(), dyn->start2, dyn->len2); if (m.size() == dyn->len2) zilch.raw2 = m.approxOccurrenceCount(); pool += zilch; @@ -552,7 +553,7 @@ namespace Moses uint32_t len = fix ? fix->len2 : dyn->len2; for (uint32_t k = 0; k < len; ++k, x = x->next()) { - StringPiece wrd = (*(btfix.V2))[x->id()]; + StringPiece wrd = (*(btfix->V2))[x->id()]; Word w; w.CreateFromString(Output,ofactor,wrd,false); tp->AddWord(w); } @@ -608,7 +609,7 @@ namespace Moses { // map from Moses Phrase to internal id sequence vector sphrase; - fillIdSeq(src,input_factor,*(btfix.V1),sphrase); + fillIdSeq(src,input_factor,*(btfix->V1),sphrase); if (sphrase.size() == 0) return NULL; // Reserve a local copy of the dynamic bitext in its current form. /btdyn/ @@ -623,7 +624,7 @@ namespace Moses assert(dyn); // lookup phrases in both bitexts - TSA::tree_iterator mfix(btfix.I1.get(), &sphrase[0], sphrase.size()); + TSA::tree_iterator mfix(btfix->I1.get(), &sphrase[0], sphrase.size()); TSA::tree_iterator mdyn(dyn->I1.get()); if (dyn->I1.get()) for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i) @@ -665,7 +666,7 @@ namespace Moses // for btfix. 
sptr sfix,sdyn; - if (mfix.size() == sphrase.size()) sfix = btfix.lookup(ttask, mfix); + if (mfix.size() == sphrase.size()) sfix = btfix->lookup(ttask, mfix); if (mdyn.size() == sphrase.size()) sdyn = dyn->lookup(ttask, mdyn); vector > ppfix,ppdyn; @@ -704,7 +705,7 @@ namespace Moses BOOST_FOREACH(PhrasePair const& pp, ppfix) { if (&pp != &ppfix.front() && pp.joint <= 1) break; - pp.print(*m_bias_log,*btfix.V1, *btfix.V2, m_lr_func->GetModel()); + pp.print(*m_bias_log,*btfix->V1, *btfix->V2, m_lr_func->GetModel()); } } #endif @@ -788,7 +789,7 @@ namespace Moses context->bias_log = m_bias_log; } context->bias - = btfix.SetupDocumentBias(m_bias_server, context_words, m_bias_log); + = btfix->SetupDocumentBias(m_bias_server, context_words, m_bias_log); context->bias->loglevel = m_bias_loglevel; context->bias->log = m_bias_log; } @@ -827,12 +828,12 @@ namespace Moses { if (phrase.GetSize() == 0) return false; vector myphrase; - fillIdSeq(phrase,input_factor,*btfix.V1,myphrase); + fillIdSeq(phrase,input_factor,*btfix->V1,myphrase); - TSA::tree_iterator mfix(btfix.I1.get(),&myphrase[0],myphrase.size()); + TSA::tree_iterator mfix(btfix->I1.get(),&myphrase[0],myphrase.size()); if (mfix.size() == myphrase.size()) { - btfix.prep(ttask, mfix); + btfix->prep(ttask, mfix); // cerr << phrase << " " << mfix.approxOccurrenceCount() << endl; return true; } @@ -874,7 +875,7 @@ namespace Moses // Mmsapt // ::setupDocumentBias(map const& bias) const // { - // return btfix.SetupDocumentBias(bias); + // return btfix->SetupDocumentBias(bias); // } vector diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index 5f688cfd8..9b19eb356 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -5,6 +5,7 @@ #include #include +#include #include "moses/TypeDef.h" #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h" @@ -63,7 +64,7 @@ namespace Moses typedef PhraseScorer pscorer; private: // vector > shards; - mmbitext btfix; + iptr btfix; sptr btdyn; std::string m_bname, m_extra_data, m_bias_file,m_bias_server; std::string L1; From 8f4b2afe26cdb83706d4c54e19f783a1b1e69a6a Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 16:30:07 +0100 Subject: [PATCH 013/286] #include a few more things. --- moses/TranslationModel/UG/mm/ug_lexical_reordering.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/moses/TranslationModel/UG/mm/ug_lexical_reordering.h b/moses/TranslationModel/UG/mm/ug_lexical_reordering.h index 9c56e6cb5..965dfcc04 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_reordering.h +++ b/moses/TranslationModel/UG/mm/ug_lexical_reordering.h @@ -1,5 +1,7 @@ // -*- c++ -*- #pragma once +#include "ug_typedefs.h" +#include #include #ifndef NO_MOSES From e8ee56876e249e2b85ca3290bb0f5883d03fa7c0 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 17:24:53 +0100 Subject: [PATCH 014/286] Initial check-in. 
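The BitextSampler checked in below implements ranked sampling: it walks every occurrence of a source phrase, ranks the occurrences by the bias score of the sentence they occur in, and extracts phrase pairs only from the top N. A generic sketch of that keep-the-best-N step using only the standard library; the real code uses NBestList with a CandidateSorter, and the names below are made up for illustration.

#include <algorithm>
#include <iostream>
#include <vector>

struct Occurrence { int sid; int offset; };

int main()
{
  std::vector<double> bias = {0.1, 0.9, 0.4, 0.7};          // per-sentence scores
  std::vector<Occurrence> occ = {{0,3},{1,0},{2,5},{3,2},{1,7}};
  size_t N = 3;                                             // cf. m_samples

  // keep the N occurrences whose sentences have the highest bias
  std::partial_sort(occ.begin(), occ.begin() + N, occ.end(),
                    [&](const Occurrence& a, const Occurrence& b)
                    { return bias[a.sid] > bias[b.sid]; });
  occ.resize(N);

  for (const Occurrence& o : occ)                           // these would be fed to
    std::cout << "extract from sentence " << o.sid          // consider_sample()
              << " at offset " << o.offset
              << " (bias " << bias[o.sid] << ")\n";
  return 0;
}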
--- .../mm/ug_bitext_phrase_extraction_record.h | 25 ++ .../UG/mm/ug_bitext_sampler.h | 223 ++++++++++++++++++ 2 files changed, 248 insertions(+) create mode 100644 moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h create mode 100644 moses/TranslationModel/UG/mm/ug_bitext_sampler.h diff --git a/moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h b/moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h new file mode 100644 index 000000000..4393dcc60 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h @@ -0,0 +1,25 @@ +// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil -*- +#include +#include "ug_typedefs.h" + +namespace Moses +{ + namespace bitext + { + struct PhraseExtractionRecord + { + size_t const sid, start, stop; + bool const flip; // 'backward' lookup from L2 + size_t s1, s2, e1, e2; // soft and hard boundaries of target phrase + int po_fwd, po_bwd; // fwd and bwd phrase orientation + std::vector* aln; // local alignments + bitvector* full_aln; // full word alignment for sentence + + PhraseExtractionRecord(size_t const xsid, size_t const xstart, + size_t const xstop, bool const xflip, + std::vector* xaln, bitvector* xfull_aln = NULL) + : sid(xsid), start(xstart), stop(xstop), flip(xflip) + , aln(xaln), full_aln(xfull_aln) { } + }; + } +} diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h new file mode 100644 index 000000000..47716d7e6 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h @@ -0,0 +1,223 @@ +// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil -*- +#pragma once +#include +#include +#include +#include "ug_bitext.h" +#include "ug_bitext_pstats.h" +#include "ug_sampling_bias.h" +#include "ug_tsa_array_entry.h" +#include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h" +#include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h" +#include "moses/TranslationModel/UG/generic/sorting/NBestList.h" +namespace Moses +{ +namespace bitext +{ + + enum sampling_method { full_coverage, random_sampling, ranked_sampling }; + + typedef ugdiss::ttrack::Position TokenPosition; + class CandidateSorter + { + SamplingBias const& score; + public: + CandidateSorter(SamplingBias const& s) : score(s) {} + bool operator()(TokenPosition const& a, TokenPosition const& b) const + { return score[a.sid] > score[b.sid]; } + }; + + template + class + BitextSampler : public reference_counter + { + typedef Bitext bitext; + typedef TSA tsa; + typedef SamplingBias bias; + typedef typename Bitext::iter tsa_iter; + mutable boost::condition_variable m_ready; + mutable boost::mutex m_lock; + // const members + // sptr const m_bitext; // keep bitext alive while I am + // should be an + iptr const m_bitext; // keep bitext alive as long as I am + size_t const m_plen; // length of lookup phrase + bool const m_fwd; // forward or backward direction? 
+ sptr const m_root; // root of suffix array + char const* m_next; // current position + char const* m_stop; // end of search range + sampling_method const m_method; /* look at all / random sample / + * ranked samples */ + sptr const m_bias; // bias over candidates + size_t const m_samples; // how many samples at most + // non-const members + sptr m_stats; // destination for phrase stats + size_t m_ctr; // number of samples considered + float m_total_bias; // for random sampling with bias + bool m_finished; + void consider_sample(TokenPosition const& p); + size_t perform_ranked_sampling(); + + public: + BitextSampler(bitext* const bitext, typename bitext::iter const& phrase, + sptr const& bias, size_t const max_samples, + sampling_method const method); + ~BitextSampler(); + bool operator()(); // run sampling + sptr stats(); + bool done() const; + }; + + template + BitextSampler:: + BitextSampler(Bitext* const bitext, + typename bitext::iter const& phrase, + sptr const& bias, size_t const max_samples, + sampling_method const method) + : m_bitext(bitext) + , m_plen(phrase.size()) + , m_fwd(phrase.root == bitext->I1.get()) + , m_root(m_fwd ? bitext->I1 : bitext->I2) + , m_next(phrase.lower_bound(-1)) + , m_stop(phrase.upper_bound(-1)) + , m_method(method) + , m_bias(bias) + , m_samples(max_samples) + , m_ctr(0) + , m_total_bias(0) + , m_finished(false) + { + m_stats.reset(new pstats); + m_stats->raw_cnt = phrase.ca(); + m_stats->register_worker(); + } + + // Ranked sampling sorts all samples by score and then considers the top-ranked + // candidates for phrase extraction. + template + size_t + BitextSampler:: + perform_ranked_sampling() + { + if (m_next == m_stop) return m_ctr; + CandidateSorter sorter(*m_bias); + NBestList nbest(m_samples,sorter); + ugdiss::tsa::ArrayEntry I(m_next); + while (I.next < m_stop) + { + ++m_ctr; + nbest.add(m_root->readEntry(I.next,I)); + } + for (size_t i = 0; i < nbest.size(); ++i) + consider_sample(nbest.get_unsorted(i)); + cerr << m_ctr << " samples considered at " + << __FILE__ << ":" << __LINE__ << endl; + return m_ctr; + } + + template + void + BitextSampler:: + consider_sample(TokenPosition const& p) + { + vector aln; + bitvector full_aln(100*100); + PhraseExtractionRecord rec(p.sid, p.offset, p.offset + m_plen, + !m_fwd, &aln, &full_aln); + int docid = m_bias ? m_bias->GetClass(p.sid) : -1; + bool good = m_bitext->find_trg_phr_bounds(rec); + if (!good) + { // no good, probably because phrase is not coherent + m_stats->count_sample(docid, 0, rec.po_fwd, rec.po_bwd); + return; + } + + // all good: register this sample as valid + size_t num_pairs = (rec.s2 - rec.s1 + 1) * (rec.e2 - rec.e1 + 1); + m_stats->count_sample(docid, num_pairs, rec.po_fwd, rec.po_bwd); + + float sample_weight = 1./num_pairs; + Token const* o = (m_fwd ? m_bitext->T2 : m_bitext->T1)->sntStart(rec.sid); + + // adjust offsets in phrase-internal aligment + for (size_t k = 1; k < aln.size(); k += 2) aln[k] += rec.s2 - rec.s1; + + vector seen; seen.reserve(10); + // It is possible that the phrase extraction extracts the same + // phrase twice, e.g., when word a co-occurs with sequence b b b + // but is aligned only to the middle word. We can only count + // each phrase pair once per source phrase occurrence, or else + // run the risk of having more joint counts than marginal + // counts. + + for (size_t s = rec.s1; s <= rec.s2; ++s) + { + TSA const& I = m_fwd ? 
*m_bitext->I2 : *m_bitext->I1; + sptr b = I.find(o + s, rec.e1 - s); + UTIL_THROW_IF2(!b || b->size() < rec.e1 - s, "target phrase not found"); + + for (size_t i = rec.e1; i <= rec.e2; ++i) + { + uint64_t tpid = b->getPid(); + + // poor man's protection against over-counting + size_t s = 0; + while (s < seen.size() && seen[s] != tpid) ++s; + if (s < seen.size()) continue; + seen.push_back(tpid); + + size_t raw2 = b->approxOccurrenceCount(); + m_stats->add(tpid, sample_weight, aln, raw2, + rec.po_fwd, rec.po_bwd, docid); + bool ok = (i == rec.e2) || b->extend(o[i].id()); + UTIL_THROW_IF2(!ok, "Could not extend target phrase."); + } + if (s < rec.s2) // shift phrase-internal alignments + for (size_t k = 1; k < aln.size(); k += 2) + --aln[k]; + } + } + + template + bool + BitextSampler:: + operator()() + { + if (m_finished) return true; + boost::unique_lock lock(m_lock); + perform_ranked_sampling(); + m_finished = true; + m_ready.notify_all(); + return true; + } + + + template + bool + BitextSampler:: + done() const + { + return m_next == m_stop; + } + + template + sptr + BitextSampler:: + stats() + { + if (m_ctr == 0) (*this)(); + boost::unique_lock lock(m_lock); + while (!m_finished) + m_ready.wait(lock); + return m_stats; + } + + template + BitextSampler:: + ~BitextSampler() + { + cerr << "bye" << endl; + } + +} // end of namespace bitext +} // end of namespace Moses From 623eb7bb7771a2aaecce213509910f4f7a5c7eb7 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 21:13:26 +0100 Subject: [PATCH 015/286] Instantiation of btfix via boost::intrusive_ptr in Mmsapt. This is in preparation for distinct bitext samplers which need to ensure the lifetime of the bitext while sampling. --- .../UG/generic/threading/ug_ref_counter.h | 25 +++++++++++++++++++ moses/TranslationModel/UG/mm/ug_bitext.h | 3 ++- moses/TranslationModel/UG/mmsapt.cpp | 7 +++--- 3 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 moses/TranslationModel/UG/generic/threading/ug_ref_counter.h diff --git a/moses/TranslationModel/UG/generic/threading/ug_ref_counter.h b/moses/TranslationModel/UG/generic/threading/ug_ref_counter.h new file mode 100644 index 000000000..605cf2a32 --- /dev/null +++ b/moses/TranslationModel/UG/generic/threading/ug_ref_counter.h @@ -0,0 +1,25 @@ +#include "ug_thread_safe_counter.h" +#pragma once +// obsolete once intrusive_ref_counter is available everywhere + +namespace Moses { + + class reference_counter + { + public: + friend void intrusive_ptr_add_ref(reference_counter* p) + { + if (p) ++p->m_refcount; + } + friend void intrusive_ptr_release(reference_counter* p) + { + if (p && --p->m_refcount == 0) + delete p; + } + protected: + reference_counter() {} + virtual ~reference_counter() {}; + private: + mutable ThreadSafeCounter m_refcount; + }; +} diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 2d2afc3ca..201569fb0 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -64,6 +64,7 @@ #include "ug_lexical_reordering.h" #include "ug_sampling_bias.h" #include "ug_phrasepair.h" +#include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h" #define PSTATS_CACHE_THRESHOLD 50 @@ -92,7 +93,7 @@ namespace Moses { #endif template - class Bitext + class Bitext : public reference_counter { public: typedef TKN Token; diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 787ac76db..eb8e1fb89 100644 --- 
a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -7,6 +7,7 @@ #include "mmsapt.h" #include #include +#include #include #include #include @@ -509,7 +510,7 @@ namespace Moses if (fix) { BOOST_FOREACH(sptr const& ff, m_active_ff_fix) - (*ff)(btfix, *fix, &fvals); + (*ff)(*btfix, *fix, &fvals); } if (dyn) { @@ -541,7 +542,7 @@ namespace Moses if (fix) { BOOST_FOREACH(sptr const& ff, m_active_ff_common) - (*ff)(btfix, pool, &fvals); + (*ff)(*btfix, pool, &fvals); } else { @@ -673,7 +674,7 @@ namespace Moses PhrasePair::SortByTargetIdSeq sort_by_tgt_id; if (sfix) { - expand(mfix, btfix, *sfix, ppfix, m_bias_log); + expand(mfix, *btfix, *sfix, ppfix, m_bias_log); sort(ppfix.begin(), ppfix.end(),sort_by_tgt_id); } if (sdyn) From 704432cf0f58c86b88bf09d804081fe40d68a4f4 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 22:25:13 +0100 Subject: [PATCH 016/286] Bug fixes. --- moses/TranslationModel/UG/mm/ug_prime_sampling1.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_prime_sampling1.h b/moses/TranslationModel/UG/mm/ug_prime_sampling1.h index a0c609bfc..452c7fc4d 100644 --- a/moses/TranslationModel/UG/mm/ug_prime_sampling1.h +++ b/moses/TranslationModel/UG/mm/ug_prime_sampling1.h @@ -3,7 +3,7 @@ // Author: Ulrich Germann #pragma once #include "ug_bitext.h" -#includde "ug_sampling_bias.h" +#include "ug_sampling_bias.h" #include // #ifndef NO_MOSES namespace Moses { @@ -19,7 +19,7 @@ typedef imTSA imtsa; template void -mark(typename TSA::tree_iterator const& r, SentenceBias>& hits) +mark(typename TSA::tree_iterator const& r, SentenceBias& hits) { char const* stop = r.upper_bound(-1); for (tsa::ArrayEntry I(r.lower_bound(-1)); I.next < stop;) @@ -33,9 +33,8 @@ mark(typename TSA::tree_iterator const& r, SentenceBias>& hits) template bool process(typename TSA::tree_iterator& m, - typename TSA::tree_iterator& r, - typename std::vector & hits, - size_t const max_count=1000) + typename TSA::tree_iterator& r, + SentenceBias& hits, size_t const max_count=1000) { if (m.down()) { @@ -65,7 +64,7 @@ prime_sampling1(TSA const& refIdx, { typename TSA::tree_iterator m(&newIdx); typename TSA::tree_iterator r(&refIdx); - sptr > ret; + sptr ret; ret.reset(new SentenceBias(refIdx.getCorpus()->size(),0)); process(m, r, *ret, max_count); return ret; From 8a547ea82f65bc60ad7aafecbc1f7498eb98bec6 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 22:25:49 +0100 Subject: [PATCH 017/286] Added missing #include. --- moses/TranslationModel/UG/mm/ug_bitext_sampler.h | 1 + 1 file changed, 1 insertion(+) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h index 47716d7e6..7bc630ef5 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h @@ -7,6 +7,7 @@ #include "ug_bitext_pstats.h" #include "ug_sampling_bias.h" #include "ug_tsa_array_entry.h" +#include "ug_bitext_phrase_extraction_record.h" #include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h" #include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h" #include "moses/TranslationModel/UG/generic/sorting/NBestList.h" From c7fffab82ccc237e21a3f5d3d2f600a3ec58295c Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 22:27:10 +0100 Subject: [PATCH 018/286] Bug fixes. 
--- moses/TranslationModel/UG/mm/ug_sampling_bias.cc | 7 +++++-- moses/TranslationModel/UG/mm/ug_sampling_bias.h | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index 0c81a1b91..2f8a00ccb 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -1,8 +1,10 @@ #include "ug_sampling_bias.h" #include #include +#include "moses/Util.h" +#ifndef NO_MOSES #include "moses/Timer.h" - +#endif // #ifdef HAVE_CURLPP // #include // #include @@ -41,12 +43,13 @@ namespace Moses , m_bias(docname2docid.size(), 0) { // #ifdef HAVE_CURLPP +#ifndef NO_MOSES Timer timer; if (log) timer.start(NULL); std::string json = query_bias_server(server_url, text); init_from_json(json, docname2docid, log); if (log) *log << "Bias query took " << timer << " seconds." << std::endl; - // #endif +#endif } void diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index f91a3c91b..c7954ede0 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -5,7 +5,7 @@ #include #include #include -#include "moses/Util.h" +// #include "moses/Util.h" #include "ug_typedefs.h" namespace Moses { From 53752f70a7eacc0374f4ab59b96f7b2fa2e82b2b Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 22:28:02 +0100 Subject: [PATCH 019/286] Added member function find_trg_phr_bound(PhraseExtractionRecord& rec). --- moses/TranslationModel/UG/mm/ug_bitext.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 201569fb0..29e8c551c 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -64,6 +64,7 @@ #include "ug_lexical_reordering.h" #include "ug_sampling_bias.h" #include "ug_phrasepair.h" +#include "ug_phrase_extraction_record.h" #include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h" #define PSTATS_CACHE_THRESHOLD 50 @@ -146,6 +147,8 @@ namespace Moses { bitvector* full_alignment, // stores full word alignment for this sent. bool const flip) const; // flip source and target (reverse lookup) + bool find_trg_phr_bounds(PhraseExtractionRecord& rec) const; + // prep2 launches sampling and returns immediately. // lookup (below) waits for the job to finish before it returns sptr @@ -319,12 +322,22 @@ namespace Moses { void operator()(); }; + template + bool + Bitext:: + find_trg_phr_bounds(PhraseExtractionRecord& rec) const + { + return find_trg_phr_bounds(rec.sid, rec.start, rec.stop, + rec.s1, rec.s2, rec.e1, rec.e2, + rec.po_fwd, rec.po_bwd, + rec.aln, rec.full_aln, rec.flip); + } + template bool Bitext:: find_trg_phr_bounds - (size_t const sid, - size_t const start, size_t const stop, + (size_t const sid, size_t const start, size_t const stop, size_t & s1, size_t & s2, size_t & e1, size_t & e2, int & po_fwd, int & po_bwd, std::vector* core_alignment, bitvector* full_alignment, From 8ae28941070b81c0969eaf8a12ef8d5b2197873d Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 22:29:26 +0100 Subject: [PATCH 020/286] Initial check-in. 
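Two stand-alone test programs. test-iptr.cc is a minimal check of the new reference_counter base: an object held through a boost::intrusive_ptr stays alive until the last pointer to it goes away. Roughly (expected output in the comments):

    boost::intrusive_ptr<X> i(new X);  // constructor prints "hello"
    cout << "bla" << endl;             // prints "bla"
    // i leaves scope, the count drops to zero,
    // intrusive_ptr_release deletes the object -> "bye-bye"

test-ranked-phrase-lookup.cc drives the ranked-sampling pipeline end to end over a test corpus: prime_sampling1() builds a SentenceBias, BitextSampler collects pstats per source phrase, and expand() turns them into phrase pairs.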
--- moses/TranslationModel/UG/test-iptr.cc | 20 ++ .../UG/test-ranked-phrase-lookup.cc | 195 ++++++++++++++++++ 2 files changed, 215 insertions(+) create mode 100644 moses/TranslationModel/UG/test-iptr.cc create mode 100644 moses/TranslationModel/UG/test-ranked-phrase-lookup.cc diff --git a/moses/TranslationModel/UG/test-iptr.cc b/moses/TranslationModel/UG/test-iptr.cc new file mode 100644 index 000000000..128057144 --- /dev/null +++ b/moses/TranslationModel/UG/test-iptr.cc @@ -0,0 +1,20 @@ +// -*- c++ -*- +#include +#include +#include "generic/threading/ug_ref_counter.h" + +using namespace std; + +class X : public Moses::reference_counter +{ +public: + X() { cout << "hello" << endl; } + ~X() { cout << "bye-bye" << endl; } +}; + +int main() +{ + boost::intrusive_ptr i(new X); + // i.reset(); + cout << "bla" << endl; +} diff --git a/moses/TranslationModel/UG/test-ranked-phrase-lookup.cc b/moses/TranslationModel/UG/test-ranked-phrase-lookup.cc new file mode 100644 index 000000000..f8ae20a87 --- /dev/null +++ b/moses/TranslationModel/UG/test-ranked-phrase-lookup.cc @@ -0,0 +1,195 @@ +// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*- +#include +#include +#include +#include "mm/ug_bitext.h" +#include "mm/tpt_typedefs.h" +#include "mm/ug_prime_sampling1.h" +#include "mm/ug_bitext_sampler.h" +#include "mm/ug_phrasepair.h" +#include "mm/ug_lru_cache.h" +#include "generic/sorting/VectorIndexSorter.h" +#include "generic/sorting/NBestList.h" +#include +#include + +using namespace std; +using namespace Moses; +using namespace Moses::bitext; +namespace po=boost::program_options; +using namespace boost::algorithm; +typedef L2R_Token Token; +typedef mmBitext mmbitext; +typedef Bitext::tsa tsa; +// typedef TSA::tree_iterator iter; +typedef Bitext::iter iter; +typedef imTtrack imttrack; +typedef imTSA imtsa; + +string bname, bname1, bname2, ifile, L1, L2, Q1, Q2; +size_t maxhits; +void interpret_args(int ac, char* av[]); + +sptr +read_input(TokenIndex& V) +{ + sptr > > crp(new vector >); + crp->reserve(1000); + string line; + istream* in = &cin; + ifstream inputfile; + if (ifile.size()) + { + inputfile.open(ifile.c_str()); + in = & inputfile; + } + while (getline(*in,line)) + { + crp->push_back(vector()); + fill_token_seq(V, line, crp->back()); + } + sptr ret(new imttrack (crp)); + return ret; +} + +int main(int argc, char* argv[]) +{ + interpret_args(argc, argv); + iptr Bptr(new mmbitext); + mmbitext& B = *Bptr; + B.open(bname, L1, L2); + sptr icrp = read_input(*B.V1); + imtsa newIdx(icrp,NULL); + sptr bias = prime_sampling1(*B.I1, newIdx, 5000); + cerr << "primed" << endl; + typedef vector > pplist_t; +#define WITH_CACHE 1 +#if WITH_CACHE + // map > CACHE; + lru_cache::LRU_Cache CACHE(1000); +#endif + for (size_t s = 0; s < icrp->size(); ++s) + { + size_t stop = icrp->sntLen(s); + Token const* t = icrp->sntStart(s); + cout << string(80,'-') << "\n" << toString(*B.V1, t, stop) << endl; + for (size_t i = 0; i < stop; ++i) + { + iter r(B.I1.get()); + for (size_t k = i; k < stop && r.extend(t[k].id()); ++k) + { + // cerr << k << "/" << i << endl; + cout << "\n" << r.str(B.V1.get()) + << " [" << r.ca() << "]" << endl; + // sptr pplist; + sptr stats; +#if WITH_CACHE + // if (r.ca() > 1000) pplist = CACHE.get(r.getPid()); + stats = CACHE.get(r.getPid()); +#endif + vector > pplist; + if (!stats) + { + bitext::BitextSampler + sampler(&B, r, bias, 1000, ranked_sampling); + sampler(); + stats = sampler.stats(); + // pplist->resize(pplist->size()); +#if WITH_CACHE + + // if (r.ca() > 1000) + 
CACHE.set(r.getPid(), stats); +#endif + } + expand(r, B, *stats, pplist, NULL); + cout << pplist.size() << " " << sizeof(PhrasePair) << "; " + // << pstats::s_instance_count << " instances of pstats live. " + // << PhrasePair::s_instances << " instances of PhrasePair live." + << endl; + // BOOST_FOREACH(PhrasePair const& pp, *pplist) + // { + // if (pp.joint == 1) continue; + // cout << " " << setw(6) << pp.joint << " " + // << toString(*B.V2, pp.start2, pp.len2) << endl; + // } + } + } + } + // cout << pstats::s_instance_count << " instances of pstats live. " + // << PhrasePair::s_instances << " instances of PhrasePair live." + // << endl; +} + + // vector score(hits->size()); + // VectorIndexSorter sorter(score); + // for (size_t s = 0; s < icrp->size(); ++s) + // { + // size_t stop = icrp->sntLen(s); + // Token const* t = icrp->sntStart(s); + // cout << string(80,'-') << "\n" << toString(V1, t, stop) << endl; + // for (size_t i = 0; i < stop; ++i) + // { + // TSA::tree_iterator r(&I1); + // for (size_t k = i; k < stop && r.extend(t[k].id()); ++k) + // { + // if (r.ca() < 3) continue; + // cout << "\n" << r.str(&V1) << " " << int(r.ca()) << endl; + // if (r.ca() > 10000) continue; + // sptr > > top; + // top = nbest(r, *hits, score, sorter, 5); + // for (size_t n = 0; n < top->size(); ++n) + // { + // cout << "[" << n << ": " << score[(*top)[n]] + // << " (" << (*hits)[(*top)[n]] << "/" << C1->sntLen((*top)[n]) << ")]\n" + // << toString(V1, C1->sntStart((*top)[n]), C1->sntLen((*top)[n])) << "\n"; + // if (C2) cout << toString(V2, C2->sntStart((*top)[n]), C2->sntLen((*top)[n])) << "\n"; + // cout << endl; + // } + // } + // } + + // } + //} + +void +interpret_args(int ac, char* av[]) +{ + po::variables_map vm; + po::options_description o("Options"); + o.add_options() + + ("help,h", "print this message") + ("maxhits,n", po::value(&maxhits)->default_value(25), + "max. number of hits") + ("q1", po::value(&Q1), "query in L1") + ("q2", po::value(&Q2), "query in L2") + ; + + po::options_description h("Hidden Options"); + h.add_options() + ("bname", po::value(&bname), "base name of corpus") + ("L1", po::value(&L1), "L1 tag") + ("L2", po::value(&L2), "L2 tag") + ("ifile", po::value(&ifile), "input file") + ; + + h.add(o); + po::positional_options_description a; + a.add("bname",1); + a.add("L1",1); + a.add("L2",1); + a.add("ifile",1); + + po::store(po::command_line_parser(ac,av) + .options(h) + .positional(a) + .run(),vm); + po::notify(vm); + if (vm.count("help")) + { + cout << "\nusage:\n\t" << av[0] + << " [options] [--q1=] [--q2=]" << endl; + cout << o << endl; + exit(0); + } +} From f87f123366a3c27985fd04dd427d2acdf3f042eb Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 22:50:17 +0100 Subject: [PATCH 021/286] Added member function find_trg_phrase_bound(PhraseExtractionRecord& rec) to Bitext class. 
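The new overload wraps the long in/out parameter list of find_trg_phr_bounds() in a single PhraseExtractionRecord, which is what BitextSampler::consider_sample() already builds. A rough sketch of a call site (variable names are illustrative; the alignment vector's element type, uchar/unsigned char, is assumed):

    std::vector<unsigned char> aln;      // phrase-internal alignment (out)
    bitvector full_aln(100*100);         // full sentence alignment (out)
    PhraseExtractionRecord rec(sid, start, stop, /*flip=*/false, &aln, &full_aln);
    if (bt.find_trg_phr_bounds(rec))     // bt: some Bitext instance
      {
        // rec.s1/rec.s2 and rec.e1/rec.e2 now delimit the target phrase,
        // rec.po_fwd/rec.po_bwd hold the phrase orientations
      }

The record-based form simply forwards to the original twelve-argument version.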
--- moses/TranslationModel/UG/mm/ug_bitext.h | 67 ++++++++++++++---------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 29e8c551c..145535a95 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -35,6 +35,7 @@ #include "moses/TranslationModel/UG/generic/sampling/Sampling.h" #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h" #include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h" +#include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h" // #include "moses/FF/LexicalReordering/LexicalReorderingState.h" #include "moses/Util.h" @@ -97,6 +98,7 @@ namespace Moses { class Bitext : public reference_counter { public: + template friend class BitextSampler; typedef TKN Token; typedef typename TSA::tree_iterator iter; typedef typename std::vector > vec_ppair; @@ -136,18 +138,17 @@ namespace Moses { // points of the target phrase; if non-NULL, store word // alignments in *core_alignment. If /flip/, source phrase is // L2. + bool find_trg_phr_bounds(PhraseExtractionRecord& rec); bool find_trg_phr_bounds ( size_t const sid, // sentence to investigate size_t const start, // start of source phrase size_t const stop, // last position of source phrase - size_t & s1, size_t & s2, // beginning and end of target start + size_t & s1, size_t & s2, // beginning and end of target start size_t & e1, size_t & e2, // beginning and end of target end - int& po_fwd, int& po_bwd, // phrase orientations + int& po_fwd, int& po_bwd, // phrase orientations std::vector * core_alignment, // stores the core alignment - bitvector* full_alignment, // stores full word alignment for this sent. - bool const flip) const; // flip source and target (reverse lookup) - - bool find_trg_phr_bounds(PhraseExtractionRecord& rec) const; + bitvector* full_alignment, // stores full word alignment for this sent. + bool const flip) const; // flip source and target (reverse lookup) // prep2 launches sampling and returns immediately. // lookup (below) waits for the job to finish before it returns @@ -182,7 +183,6 @@ namespace Moses { void prep(ttasksptr const& ttask, iter const& phrase) const; #endif - void setDefaultSampleSize(size_t const max_samples); size_t getDefaultSampleSize() const; @@ -337,21 +337,24 @@ namespace Moses { bool Bitext:: find_trg_phr_bounds - (size_t const sid, size_t const start, size_t const stop, - size_t & s1, size_t & s2, size_t & e1, size_t & e2, - int & po_fwd, int & po_bwd, - std::vector* core_alignment, bitvector* full_alignment, - bool const flip) const + ( size_t const sid, // sentence to investigate + size_t const start, // start of source phrase + size_t const stop, // last position of source phrase + size_t & s1, size_t & s2, // beginning and end of target start + size_t & e1, size_t & e2, // beginning and end of target end + int& po_fwd, int& po_bwd, // phrase orientations + std::vector * core_alignment, // stores the core alignment + bitvector* full_alignment, // stores full word alignment for this sent. 
+ bool const flip) const // flip source and target (reverse lookup) { // if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl; - - // a word on the core_alignment: + // a word on the core_alignment (core_alignment): // - // since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1 + // Since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1 // < e2, respectively) are be definition unaligned, we store - // only the core alignment in *core_alignment it is up to the - // calling function to shift alignment points over for start - // positions of extracted phrases that start with a fringe word + // only the core alignment in *aln. It is up to the calling + // function to shift alignment points over for start positions + // of extracted phrases that start with a fringe word assert(T1); assert(T2); assert(Tx); @@ -378,19 +381,29 @@ namespace Moses { size_t lft = forbidden.size(); size_t rgt = 0; std::vector > aln1(slen1),aln2(slen2); + + // process word alignment for this sentence char const* p = Tx->sntStart(sid); char const* x = Tx->sntEnd(sid); - while (p < x) { - if (flip) { p = binread(p,trg); assert(p= slen1 || trg >= slen2), "Alignment range error at sentence " << sid << "!\n" - << src << "/" << slen1 << " " << - trg << "/" << slen2); - + << src << "/" << slen1 << " " << trg << "/" << slen2); + if (src < start || src >= stop) forbidden.set(trg); else @@ -422,8 +435,8 @@ namespace Moses { { BOOST_FOREACH(ushort x, aln1[i]) { - core_alignment->push_back(i-start); - core_alignment->push_back(x-lft); + core_alignment->push_back(i - start); + core_alignment->push_back(x - lft); } } // now determine fwd and bwd phrase orientation From d4234847cd46519614a229beb1a49e49ccc3be14 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 5 Jun 2015 22:51:58 +0100 Subject: [PATCH 022/286] Added #include. --- moses/TranslationModel/UG/mm/ug_lexical_reordering.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc b/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc index d0522c528..704aced63 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc +++ b/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc @@ -1,4 +1,5 @@ #include "ug_lexical_reordering.h" +#include "moses/Util.h" namespace Moses { namespace bitext From 56dee1d4acde732cd1bf7745493790e215bd71ff Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 6 Jun 2015 13:21:33 +0100 Subject: [PATCH 023/286] Bug fixes: missing #include and const declaration of find_trg_phr-bounds(). --- moses/TranslationModel/UG/mm/ug_bitext.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 145535a95..1b1c95c27 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -65,7 +65,7 @@ #include "ug_lexical_reordering.h" #include "ug_sampling_bias.h" #include "ug_phrasepair.h" -#include "ug_phrase_extraction_record.h" +#include "ug_bitext_phrase_extraction_record.h" #include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h" #define PSTATS_CACHE_THRESHOLD 50 @@ -138,7 +138,7 @@ namespace Moses { // points of the target phrase; if non-NULL, store word // alignments in *core_alignment. If /flip/, source phrase is // L2. 
- bool find_trg_phr_bounds(PhraseExtractionRecord& rec); + bool find_trg_phr_bounds(PhraseExtractionRecord& rec) const; bool find_trg_phr_bounds ( size_t const sid, // sentence to investigate size_t const start, // start of source phrase From 7a57ce4dc2efead161f880d668c51af5a0c81904 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 6 Jun 2015 13:22:04 +0100 Subject: [PATCH 024/286] Missing #pragma once. --- .../TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h | 1 + 1 file changed, 1 insertion(+) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h b/moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h index 4393dcc60..646875859 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h @@ -1,4 +1,5 @@ // -*- mode: c++; tab-width: 2; indent-tabs-mode: nil -*- +#pragma once #include #include "ug_typedefs.h" From 2b7137b5480d0d5d4582cd8d30c6695b7a789a7b Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 11:44:03 +0100 Subject: [PATCH 025/286] Changes to how the context scope is set. --- moses/ExportInterface.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp index 0ceeceec1..6ba83250e 100644 --- a/moses/ExportInterface.cpp +++ b/moses/ExportInterface.cpp @@ -220,14 +220,26 @@ batch_run() // main loop over set of input sentences boost::shared_ptr source; + + // global scope of caches, biases, etc., if any + boost::shared_ptr gscope; + if ((ioWrapper->GetLookAhead() + ioWrapper->GetLookBack() == 0) + || ioWrapper->GetLookAhead() == std::numeric_limits::max()) + gscope.reset(new ContextScope); + while ((source = ioWrapper->ReadInput()) != NULL) { IFVERBOSE(1) ResetUserTime(); FeatureFunction::CallChangeSource(source.get()); // set up task of translating one sentence - boost::shared_ptr - task = TranslationTask::create(source, ioWrapper); + boost::shared_ptr lscope; + if (gscope) lscope = gscope; + else lscope.reset(new ContextScope); + + boost::shared_ptr task; + task = TranslationTask::create(source, ioWrapper, lscope); + if (source->GetContext()) task->SetContextString(*source->GetContext()); else task->SetContextString(context_string); From b682a0663645ae5ccbf279ff72ac581d9ec28fb0 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 12:07:01 +0100 Subject: [PATCH 026/286] New member functions to expose buffered input and context window size. --- moses/IOWrapper.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/moses/IOWrapper.h b/moses/IOWrapper.h index f1bcefa92..eb2deb729 100644 --- a/moses/IOWrapper.h +++ b/moses/IOWrapper.h @@ -45,6 +45,7 @@ POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include "moses/TypeDef.h" #include "moses/Sentence.h" @@ -183,6 +184,21 @@ public: // post editing std::ifstream *spe_src, *spe_trg, *spe_aln; + std::list > const& GetPastInput() const { + return m_past_input; + } + + std::list > const& GetFutureInput() const { + return m_future_input; + } + size_t GetLookAhead() const { + return m_look_ahead; + } + + size_t GetLookBack() const { + return m_look_back; + } + private: template boost::shared_ptr @@ -212,10 +228,10 @@ BufferInput() return ret; ret = source; } - while (m_buffered_ahead < m_look_ahead) { source.reset(new itype); - if (!source->Read(*m_inputStream, *m_inputFactorOrder)) break; + if (!source->Read(*m_inputStream, *m_inputFactorOrder)) + break; m_future_input.push_back(source); m_buffered_ahead += source->GetSize(); } From 26e4cee9b18135400e1fa34e574af80f3c87d84b Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 12:08:27 +0100 Subject: [PATCH 027/286] New create function that includes scope. New member function to exposure IOWrapper. --- moses/TranslationTask.cpp | 12 ++++++++++++ moses/TranslationTask.h | 11 +++++++++++ 2 files changed, 23 insertions(+) diff --git a/moses/TranslationTask.cpp b/moses/TranslationTask.cpp index 61cdfc162..bf0228b14 100644 --- a/moses/TranslationTask.cpp +++ b/moses/TranslationTask.cpp @@ -61,6 +61,18 @@ TranslationTask return ret; } +boost::shared_ptr +TranslationTask +::create(boost::shared_ptr const& source, + boost::shared_ptr const& ioWrapper, + boost::shared_ptr const& scope) +{ + boost::shared_ptr ret(new TranslationTask(source, ioWrapper)); + ret->m_self = ret; + ret->m_scope = scope; + return ret; +} + TranslationTask ::TranslationTask(boost::shared_ptr const& source, boost::shared_ptr const& ioWrapper) diff --git a/moses/TranslationTask.h b/moses/TranslationTask.h index 2b75c47d5..500967538 100644 --- a/moses/TranslationTask.h +++ b/moses/TranslationTask.h @@ -92,6 +92,12 @@ public: create(boost::shared_ptr const& source, boost::shared_ptr const& ioWrapper); + static + boost::shared_ptr + create(boost::shared_ptr const& source, + boost::shared_ptr const& ioWrapper, + boost::shared_ptr const& scope); + ~TranslationTask(); /** Translate one sentence * gets called by main function implemented at end of this source file */ @@ -102,6 +108,11 @@ public: return m_source; } + boost::shared_ptr + GetIOWrapper() const { + return m_ioWrapper; + } + boost::shared_ptr SetupManager(SearchAlgorithm algo = DefaultSearchAlgorithm); From cd5530cb875cef5282f2f5bdee93b33327aaf5c0 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 12:10:48 +0100 Subject: [PATCH 028/286] Added option to make moses consider the entire document as context. 
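The context window specification understood by ContextParameters::init() now accepts "all" in addition to the numeric forms. As parsed below, the accepted values map to look_back / look_ahead roughly as follows:

    spec     look_back  look_ahead
    ""       0          0           (no additional context)
    "3"      3          3
    "-3"     3          0
    "+3"     0          3
    "+-3"    3          3           ("-+3" likewise)
    "all"    max        max         (treat the whole document as context)

With "all", look_back and look_ahead are set to std::numeric_limits<size_t>::max(), and batch_run() (see the earlier ExportInterface change) then shares one ContextScope across the entire input instead of creating one per sentence.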
--- moses/parameters/ContextParameters.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/moses/parameters/ContextParameters.cpp b/moses/parameters/ContextParameters.cpp index 76f529e67..7ee9323bd 100644 --- a/moses/parameters/ContextParameters.cpp +++ b/moses/parameters/ContextParameters.cpp @@ -6,7 +6,8 @@ namespace Moses ContextParameters:: ContextParameters() - : look_ahead(0), look_back(0) { } + : look_ahead(0), look_back(0) +{ } void ContextParameters:: @@ -19,11 +20,18 @@ init(Parameter& params) if (context_window == "") return; - + + if (context_window.substr(0,3) == "all") + { + look_back = look_ahead = std::numeric_limits::max(); + return; + } + size_t p = context_window.find_first_of("0123456789"); if (p == 0) look_back = look_ahead = atoi(context_window.c_str()); - if (p == 1) { + + if (p == 1) { if (context_window[0] == '-') look_back = atoi(context_window.substr(1).c_str()); else if (context_window[0] == '+') @@ -31,6 +39,7 @@ init(Parameter& params) else UTIL_THROW2("Invalid specification of context window."); } + if (p == 2) { if (context_window.substr(0,2) == "+-" || context_window.substr(0,2) == "-+") From d5b0ec7562b018c55144c78a8f3776de28d0ce63 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 12:20:25 +0100 Subject: [PATCH 029/286] Initial check-in. --- .../UG/generic/threading/ug_thread_pool.cc | 31 +++++++++++++++++++ .../UG/generic/threading/ug_thread_pool.h | 30 ++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 moses/TranslationModel/UG/generic/threading/ug_thread_pool.cc create mode 100644 moses/TranslationModel/UG/generic/threading/ug_thread_pool.h diff --git a/moses/TranslationModel/UG/generic/threading/ug_thread_pool.cc b/moses/TranslationModel/UG/generic/threading/ug_thread_pool.cc new file mode 100644 index 000000000..20f95a787 --- /dev/null +++ b/moses/TranslationModel/UG/generic/threading/ug_thread_pool.cc @@ -0,0 +1,31 @@ +#include "ug_thread_pool.h" +namespace ug { + +ThreadPool:: +ThreadPool(size_t const num_workers) + : m_service(), m_busywork(new boost::asio::io_service::work(m_service)) +{ + m_workers.reserve(num_workers); + for (size_t i = 0; i < num_workers; ++i) + { + // boost::shared_ptr t; + // t.reset(new boost::thread(boost::bind(&service_t::run, &m_service))); + boost::thread* t; + t = new boost::thread(boost::bind(&service_t::run, &m_service)); + m_pool.add_thread(t); + // m_workers.push_back(t); + } +} + +ThreadPool:: +~ThreadPool() +{ + m_busywork.reset(); + m_pool.join_all(); + m_service.stop(); +} + + + + +} diff --git a/moses/TranslationModel/UG/generic/threading/ug_thread_pool.h b/moses/TranslationModel/UG/generic/threading/ug_thread_pool.h new file mode 100644 index 000000000..41b0a71a0 --- /dev/null +++ b/moses/TranslationModel/UG/generic/threading/ug_thread_pool.h @@ -0,0 +1,30 @@ +// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil -*- +#pragma once +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace ug { +class ThreadPool +{ + typedef boost::asio::io_service service_t; + service_t m_service; + boost::thread_group m_pool; + boost::scoped_ptr m_busywork; + std::vector > m_workers; + +public: + ThreadPool(size_t const num_workers); + ~ThreadPool(); + + template + void add(callable& job) { m_service.post(job); } + +}; // end of class declaration ThreadPool +} // end of namespace ug From e8a4a9b10a5bd07097122f3e9e6529ac7a441b0b Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 13:45:51 +0100 Subject: 
[PATCH 030/286] New member function to expose mapping from sentence IDs to document ids. --- moses/TranslationModel/UG/mm/ug_bitext.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 1b1c95c27..396f2b7dd 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -206,6 +206,7 @@ namespace Moses { string docname(id_type const sid) const; + vector const* sid2did() const; }; #include "ug_bitext_agenda.h" @@ -221,6 +222,14 @@ namespace Moses { return ""; } + template + vector const* + Bitext:: + sid2did() const + { + return m_sid2docid.get(); + } + template sptr Bitext:: From 3c767fc333f5fe9101a6a4cf67afdc48ddcf8174 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 13:47:02 +0100 Subject: [PATCH 031/286] New field to store cumulative bias scores. --- moses/TranslationModel/UG/mm/ug_bitext_jstats.cc | 7 +++++-- moses/TranslationModel/UG/mm/ug_bitext_jstats.h | 4 +++- moses/TranslationModel/UG/mm/ug_phrasepair.h | 7 +++++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc index bcda9ebf3..9ff8b855a 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc @@ -6,6 +6,7 @@ namespace Moses uint32_t jstats::rcnt() const { return my_rcnt; } float jstats::wcnt() const { return my_wcnt; } + float jstats::bcnt() const { return my_bcnt; } uint32_t jstats::cnt2() const { return my_cnt2; } // What was that used for again? UG @@ -15,7 +16,7 @@ namespace Moses jstats:: jstats() - : my_rcnt(0), my_cnt2(0), my_wcnt(0) + : my_rcnt(0), my_cnt2(0), my_wcnt(0), my_bcnt(0) { for (int i = 0; i <= Moses::LRModel::NONE; ++i) ofwd[i] = obwd[i] = 0; @@ -27,6 +28,7 @@ namespace Moses { my_rcnt = other.rcnt(); my_wcnt = other.wcnt(); + my_bcnt = other.bcnt(); my_aln = other.aln(); indoc = other.indoc; for (int i = 0; i <= Moses::LRModel::NONE; i++) @@ -54,13 +56,14 @@ namespace Moses void jstats:: - add(float w, vector const& a, uint32_t const cnt2, + add(float w, float b, vector const& a, uint32_t const cnt2, uint32_t fwd_orient, uint32_t bwd_orient, int const docid) { boost::lock_guard lk(this->lock); my_cnt2 = cnt2; my_rcnt += 1; my_wcnt += w; + my_bcnt += b; if (a.size()) { size_t i = 0; diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h index dade27649..49ba0d810 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h @@ -18,6 +18,7 @@ namespace Moses uint32_t my_rcnt; // unweighted joint count uint32_t my_cnt2; // raw counts L2 float my_wcnt; // weighted joint count + float my_bcnt; // cumulative bias // to do: use a static alignment pattern store that stores each pattern only // once, so that we don't have to store so many alignment vectors @@ -33,9 +34,10 @@ namespace Moses uint32_t rcnt() const; // raw joint counts uint32_t cnt2() const; // raw target phrase occurrence count float wcnt() const; // weighted joint counts + float bcnt() const; // cumulative bias scores vector > > const & aln() const; - void add(float w, vector const& a, uint32_t const cnt2, + void add(float w, float b, vector const& a, uint32_t const cnt2, uint32_t fwd_orient, uint32_t bwd_orient, int const docid); void invalidate(); diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h 
b/moses/TranslationModel/UG/mm/ug_phrasepair.h index 7e565c2df..5247b7f01 100644 --- a/moses/TranslationModel/UG/mm/ug_phrasepair.h +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h @@ -28,7 +28,7 @@ namespace Moses float dfwd[Moses::LRModel::NONE+1]; // distortion counts // counts or probs? float dbwd[Moses::LRModel::NONE+1]; // distortion counts std::vector aln; - float score; + float score, cum_bias; bool inverse; std::vector indoc; PhrasePair() { }; @@ -96,6 +96,7 @@ namespace Moses good2 = 0; sample2 = 0; raw2 = 0; + cum_bias = 0; fvals.resize(numfeats); } @@ -109,6 +110,7 @@ namespace Moses start2 = x; len2 = len; raw2 = js.cnt2(); joint = js.rcnt(); + cum_bias = js.bcnt(); assert(js.aln().size()); if (js.aln().size()) aln = js.aln()[0].second; @@ -176,6 +178,7 @@ namespace Moses joint += o.joint; sample1 += o.sample1; sample2 += o.sample2; + cum_bias += o.cum_bias; // todo: add distortion counts return *this; } @@ -189,7 +192,7 @@ namespace Moses , raw1(o.raw1) , raw2(o.raw2) , sample1(o.sample1) , sample2(o.sample2) , good1(o.good1) , good2(o.good2) - , joint(o.joint) + , joint(o.joint) , cum_bias(o.cum_bias) , fvals(o.fvals) , aln(o.aln) , score(o.score) From f1de6775303b907a3409148f1d093afe9b8295c7 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 13:50:37 +0100 Subject: [PATCH 032/286] SentenceBias now has access to mapping from sentence IDs to document IDs. --- .../UG/mm/ug_sampling_bias.cc | 83 ++++++++++--------- .../TranslationModel/UG/mm/ug_sampling_bias.h | 30 +++---- 2 files changed, 61 insertions(+), 52 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index 2f8a00ccb..5bdbc8890 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -32,14 +32,26 @@ namespace Moses return c.content(); } // #endif + + SamplingBias:: + SamplingBias(std::vector const* sid2doc) + : m_sid2docid(sid2doc) + { } + int + SamplingBias:: + GetClass(id_type const idx) const + { + return m_sid2docid ? 
m_sid2docid->at(idx) : -1; + } + + DocumentBias:: DocumentBias - ::DocumentBias ( std::vector const& sid2doc, std::map const& docname2docid, std::string const& server_url, std::string const& text, std::ostream* log) - : m_sid2docid(sid2doc) + : SamplingBias(&sid2doc) , m_bias(docname2docid.size(), 0) { // #ifdef HAVE_CURLPP @@ -53,8 +65,8 @@ namespace Moses } void - DocumentBias - ::init_from_json + DocumentBias:: + init_from_json ( std::string const& json, std::map const& docname2docid, std::ostream* log) { // poor man's special purpose json parser for responses from the @@ -111,9 +123,9 @@ namespace Moses } void - DocumentBias - ::init(std::map const& biasmap, - std::map const& docname2docid) + DocumentBias:: + init(std::map const& biasmap, + std::map const& docname2docid) { typedef std::map::value_type doc_record; float total = 0; @@ -127,59 +139,56 @@ namespace Moses std::cerr << "BIAS " << d.first << " " << m_bias[d.second] << std::endl; } - id_type - DocumentBias - ::GetClass(id_type const idx) const - { - return m_sid2docid.at(idx); - } - float - DocumentBias - ::operator[](id_type const idx) const + DocumentBias:: + operator[](id_type const idx) const { - UTIL_THROW_IF2(idx >= m_sid2docid.size(), - "Out of bounds: " << idx << "/" << m_sid2docid.size()); - return m_bias[m_sid2docid[idx]]; + UTIL_THROW_IF2(idx >= m_sid2docid->size(), "Out of bounds: " + << idx << "/" << m_sid2docid->size()); + return m_bias[(*m_sid2docid)[idx]]; } size_t - DocumentBias - ::size() const - { return m_sid2docid.size(); } + DocumentBias:: + size() const + { return m_sid2docid->size(); } - SentenceBias - ::SentenceBias(std::vector const& bias) - : m_bias(bias) { } + SentenceBias:: + SentenceBias(std::vector const& bias, + std::vector const* sid2doc) + : SamplingBias(sid2doc) + , m_bias(bias) + { } - SentenceBias - ::SentenceBias(size_t const s, float const f) : m_bias(s,f) { } - - id_type - SentenceBias - ::GetClass(id_type idx) const { return idx; } + SentenceBias:: + SentenceBias(size_t const s, float const f, + std::vector const* sid2doc) + + : SamplingBias(sid2doc) + , m_bias(s,f) + { } float& - SentenceBias - ::operator[](id_type const idx) + SentenceBias:: + operator[](id_type const idx) { UTIL_THROW_IF2(idx >= m_bias.size(), "Out of bounds"); return m_bias[idx]; } float - SentenceBias - ::operator[](id_type const idx) const + SentenceBias:: + operator[](id_type const idx) const { UTIL_THROW_IF2(idx >= m_bias.size(), "Out of bounds"); return m_bias[idx]; } size_t - SentenceBias - ::size() const { return m_bias.size(); } + SentenceBias:: + size() const { return m_bias.size(); } } } diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index c7954ede0..bb20ef123 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -17,7 +17,10 @@ namespace Moses class SamplingBias { + protected: + std::vector const* m_sid2docid; public: + SamplingBias(std::vector const* sid2docid); int loglevel; std::ostream* log; virtual float @@ -27,38 +30,34 @@ namespace Moses virtual size_t size() const = 0; // number of classes - virtual id_type - GetClass(id_type const ID) const = 0; - // returns class of item ID + virtual int + GetClass(id_type const ID) const; + // returns class/document/domain id of item ID }; class DocumentBias : public SamplingBias { - std::vector const& m_sid2docid; std::vector m_bias; - public: - + DocumentBias(std::vector const& sid2doc, std::map const& docname2docid, - std::string 
const& server_url, std::string const& text, + std::string const& server_url, + std::string const& text, std::ostream* log); - + void init_from_json ( std::string const& json, std::map const& docname2docid, std::ostream* log ); - + void init ( std::map const& biasmap, std::map const& docname2docid); - id_type - GetClass(id_type const idx) const; - float operator[](id_type const idx) const; @@ -71,10 +70,11 @@ namespace Moses { std::vector m_bias; public: - SentenceBias(std::vector const& bias); - SentenceBias(size_t const s, float const f = 0); + SentenceBias(std::vector const& bias, + std::vector const* sid2docid = NULL); - id_type GetClass(id_type idx) const; + SentenceBias(size_t const s, float const f = 0, + std::vector const* sid2docid = NULL); float& operator[](id_type const idx); float operator[](id_type const idx) const; From 4fcb9b98f7bb4a71c7f020e9b149ddacce0f6378 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 13:51:54 +0100 Subject: [PATCH 033/286] Keeping track of cumulative bias scores. --- moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h index 5ff39312c..d24ec499a 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h @@ -87,7 +87,8 @@ Bitext::agenda seen.push_back(tpid); size_t raw2 = b->approxOccurrenceCount(); - j->stats->add(tpid, sample_weight, aln, raw2, + float bwgt = j->m_bias ? (*j->m_bias)[sid] : 1; + j->stats->add(tpid, sample_weight, bwgt, aln, raw2, po_fwd, po_bwd, docid); bool ok = (i == e2) || b->extend(o[i].id()); UTIL_THROW_IF2(!ok, "Could not extend target phrase."); From ff97627e3068e8a5d047702f34339e12c1b2bdb4 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 13:52:34 +0100 Subject: [PATCH 034/286] Update to emacs variables at top. --- moses/TranslationModel/UG/mm/ug_bitext_moses.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_moses.h b/moses/TranslationModel/UG/mm/ug_bitext_moses.h index 539a9166d..9f3db56cf 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_moses.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_moses.h @@ -1,4 +1,4 @@ -// -*- mode: c++; cc-style: moses-cc-style -*- +// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; cc-style: moses-cc-style -*- #pragma once #ifndef NO_MOSES namespace Moses { From ac99ec519fe8a121515e4cc09fac8a1314be5490 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 13:53:39 +0100 Subject: [PATCH 035/286] Have SentenceBias keep track of document ids. 
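prime_sampling1() now takes an optional sentence-to-document map and passes it on to the SentenceBias it builds, so that GetClass() can resolve a sampled sentence to its document and the per-document counts in pstats can be filled in. Rough usage (names as in the test driver):

    sptr<SentenceBias> bias = prime_sampling1(*B.I1, newIdx, 5000, B.sid2did());
    int docid = bias->GetClass(sid);   // -1 if no document map was attached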
--- moses/TranslationModel/UG/mm/ug_prime_sampling1.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_prime_sampling1.h b/moses/TranslationModel/UG/mm/ug_prime_sampling1.h index 452c7fc4d..4c39e57a2 100644 --- a/moses/TranslationModel/UG/mm/ug_prime_sampling1.h +++ b/moses/TranslationModel/UG/mm/ug_prime_sampling1.h @@ -31,7 +31,7 @@ mark(typename TSA::tree_iterator const& r, SentenceBias& hits) } template -bool +void process(typename TSA::tree_iterator& m, typename TSA::tree_iterator& r, SentenceBias& hits, size_t const max_count=1000) @@ -60,12 +60,13 @@ template sptr prime_sampling1(TSA const& refIdx, TSA const& newIdx, - size_t const max_count) + size_t const max_count, + std::vector const* sid2docid = NULL) { typename TSA::tree_iterator m(&newIdx); typename TSA::tree_iterator r(&refIdx); sptr ret; - ret.reset(new SentenceBias(refIdx.getCorpus()->size(),0)); + ret.reset(new SentenceBias(refIdx.getCorpus()->size(),0, sid2docid)); process(m, r, *ret, max_count); return ret; } From 69f15d0c5a4c384f1540283023d2a75b6734ae15 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 13:54:22 +0100 Subject: [PATCH 036/286] New member function wait that won't return until sampling is done. --- moses/TranslationModel/UG/mm/ug_bitext_pstats.cc | 14 ++++++++++++-- moses/TranslationModel/UG/mm/ug_bitext_pstats.h | 6 ++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc index 580d7669b..11e098f6d 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc @@ -1,3 +1,4 @@ +#include #include "ug_bitext_pstats.h" namespace Moses @@ -65,7 +66,7 @@ namespace Moses bool pstats:: - add(uint64_t pid, float const w, + add(uint64_t pid, float const w, float const b, vector const& a, uint32_t const cnt2, uint32_t fwd_o, @@ -73,7 +74,7 @@ namespace Moses { boost::lock_guard guard(this->lock); jstats& entry = this->trg[pid]; - entry.add(w, a, cnt2, fwd_o, bwd_o, docid); + entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid); if (this->good < entry.rcnt()) { UTIL_THROW(util::Exception, "more joint counts than good counts:" @@ -82,5 +83,14 @@ namespace Moses return true; } + void + pstats:: + wait() const + { + boost::unique_lock lock(this->lock); + while (this->in_progress) + this->ready.wait(lock); + } + } } diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h index 9a14e378b..b7cb142fe 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h @@ -21,8 +21,8 @@ namespace Moses #if UG_BITEXT_TRACK_ACTIVE_THREADS static ThreadSafeCounter active; #endif - boost::mutex lock; // for parallel gathering of stats - boost::condition_variable ready; // consumers can wait for me to be ready + mutable boost::mutex lock; // for parallel gathering of stats + mutable boost::condition_variable ready; // consumers can wait for me to be ready size_t raw_cnt; // (approximate) raw occurrence count size_t sample_cnt; // number of instances selected during sampling @@ -46,6 +46,7 @@ namespace Moses bool add(uint64_t const pid, // target phrase id float const w, // sample weight (1./(# of phrases extractable)) + float const b, // sample bias score alnvec const& a, // local alignment uint32_t const cnt2, // raw target phrase count uint32_t fwd_o, // fwd. 
phrase orientation @@ -57,6 +58,7 @@ namespace Moses size_t const num_pairs, // # of phrases extractable here int const po_fwd, // fwd phrase orientation int const po_bwd); // bwd phrase orientation + void wait() const; }; } From d34b107b91c2b91ef2fc7ee070f32a85b36c222e Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 14:00:31 +0100 Subject: [PATCH 037/286] Initial check-in. --- .../TranslationModel/UG/mm/ug_prep_phrases.h | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 moses/TranslationModel/UG/mm/ug_prep_phrases.h diff --git a/moses/TranslationModel/UG/mm/ug_prep_phrases.h b/moses/TranslationModel/UG/mm/ug_prep_phrases.h new file mode 100644 index 000000000..421d6f090 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_prep_phrases.h @@ -0,0 +1,83 @@ +// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*- +// Functions for multi-threaded pre-fetching of phrase table entries +// Author: Ulrich Germann + +#include "moses/TranslationModel/UG/generic/threading/ug_thread_pool.h" +#include "moses/thread_safe_container.h" +#include "ug_bitext.h" +#include "ug_lru_cache.h" + +namespace Moses { +namespace bitext { + +template // , typename BITEXT> +struct StatsCollector +{ + typedef lru_cache::LRU_Cache< uint64_t, pstats > hcache_t; + typedef ThreadSafeContainer > pcache_t; + typedef map > lcache_t; + iptr const> bitext; // underlying bitext + sampling_method method; // sampling method + size_t sample_size; // sample size + sptr bias; // sampling bias + hcache_t* hcache; // "history" cache + pcache_t* pcache; // permanent cache + size_t pcache_th; // threshold for adding items to pcache + sptr lcache; // local cache + ug::ThreadPool* tpool; // thread pool to run jobs on + + StatsCollector(iptr > xbitext, + sptr const xbias) + : method(ranked_sampling) + , sample_size(100) + , bias(xbias) + , hcache(NULL) + , pcache(NULL) + , pcache_th(10000) + , tpool(NULL) + { + bitext = xbitext; + } + + void + process(typename TSA::tree_iterator& m, + typename TSA::tree_iterator& r) + { + if (!lcache) lcache.reset(new lcache_t); + if (m.down()) + { + do + { + if (!r.extend(m.getToken(-1)->id())) continue; + this->process(m, r); + uint64_t pid = r.getPid(); + sptr stats; + if (hcache) stats = hcache->get(pid); + if (!stats && pcache) + { + sptr const* foo = pcache->get(pid); + if (foo) stats = *foo; + } + if (!stats) // need to sample + { + BitextSampler s(bitext.get(), r, bias, sample_size, method); + stats = s.stats(); + if (hcache) hcache->set(pid,stats); + if (pcache && r.ca() >= pcache_th) pcache->set(pid,stats); + if (tpool) tpool->add(s); + else s(); + } + (*lcache)[pid] = stats; + r.up(); + } + while (m.over()); + m.up(); + } + } +}; +} // end of namespace bitext +} // end of namespace Moses + +#if 0 +#endif + // r.up(); From 36c3f9dda8b1f89a4fc37d06217fae98c8933340 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 14:03:20 +0100 Subject: [PATCH 038/286] Work in progress. Bug fix (release pstats in deconstructor!). Various other changes. 
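Main changes to BitextSampler: a copy constructor (copies share the pstats object and register as an additional worker on it), and the destructor now calls m_stats->release(), so consumers blocked in pstats::wait() are eventually woken up. The copy constructor matters because ug::ThreadPool::add() posts a copy of the callable to the underlying io_service. Intended use, roughly as in ug_prep_phrases.h:

    BitextSampler<Token> s(bitext.get(), r, bias, sample_size, method);
    sptr<pstats> stats = s.stats();   // shared with every copy of s
    if (tpool) tpool->add(s);         // the pool runs its own copy
    else       s();
    ...
    stats->wait();                    // returns once sampling has finished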
--- .../UG/mm/ug_bitext_sampler.h | 96 ++++++++++++------- 1 file changed, 61 insertions(+), 35 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h index 7bc630ef5..22fd97056 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h @@ -41,7 +41,7 @@ namespace bitext // const members // sptr const m_bitext; // keep bitext alive while I am // should be an - iptr const m_bitext; // keep bitext alive as long as I am + iptr const m_bitext; // keep bitext alive as long as I am size_t const m_plen; // length of lookup phrase bool const m_fwd; // forward or backward direction? sptr const m_root; // root of suffix array @@ -60,7 +60,9 @@ namespace bitext size_t perform_ranked_sampling(); public: - BitextSampler(bitext* const bitext, typename bitext::iter const& phrase, + BitextSampler(BitextSampler const& other); + BitextSampler const& operator=(BitextSampler const& other); + BitextSampler(bitext const* const bitext, typename bitext::iter const& phrase, sptr const& bias, size_t const max_samples, sampling_method const method); ~BitextSampler(); @@ -71,7 +73,7 @@ namespace bitext template BitextSampler:: - BitextSampler(Bitext* const bitext, + BitextSampler(Bitext const* const bitext, typename bitext::iter const& phrase, sptr const& bias, size_t const max_samples, sampling_method const method) @@ -92,6 +94,30 @@ namespace bitext m_stats->raw_cnt = phrase.ca(); m_stats->register_worker(); } + + template + BitextSampler:: + BitextSampler(BitextSampler const& other) + : m_bitext(other.m_bitext) + , m_plen(other.m_plen) + , m_fwd(other.m_fwd) + , m_root(other.m_root) + , m_next(other.m_next) + , m_stop(other.m_stop) + , m_method(other.m_method) + , m_bias(other.m_bias) + , m_samples(other.m_samples) + { + // lock both instances + boost::unique_lock mylock(m_lock); + boost::unique_lock yrlock(other.m_lock); + // actually, BitextSamplers should only copied on job submission + m_stats = other.m_stats; + m_stats->register_worker(); + m_ctr = other.m_ctr; + m_total_bias = other.m_total_bias; + m_finished = other.m_finished; + } // Ranked sampling sorts all samples by score and then considers the top-ranked // candidates for phrase extraction. @@ -106,13 +132,13 @@ namespace bitext ugdiss::tsa::ArrayEntry I(m_next); while (I.next < m_stop) { - ++m_ctr; - nbest.add(m_root->readEntry(I.next,I)); + ++m_ctr; + nbest.add(m_root->readEntry(I.next,I)); } for (size_t i = 0; i < nbest.size(); ++i) consider_sample(nbest.get_unsorted(i)); - cerr << m_ctr << " samples considered at " - << __FILE__ << ":" << __LINE__ << endl; + // cerr << m_ctr << " samples considered at " + // << __FILE__ << ":" << __LINE__ << endl; return m_ctr; } @@ -153,29 +179,29 @@ namespace bitext for (size_t s = rec.s1; s <= rec.s2; ++s) { - TSA const& I = m_fwd ? *m_bitext->I2 : *m_bitext->I1; - sptr b = I.find(o + s, rec.e1 - s); - UTIL_THROW_IF2(!b || b->size() < rec.e1 - s, "target phrase not found"); + TSA const& I = m_fwd ? 
*m_bitext->I2 : *m_bitext->I1; + sptr b = I.find(o + s, rec.e1 - s); + UTIL_THROW_IF2(!b || b->size() < rec.e1 - s, "target phrase not found"); - for (size_t i = rec.e1; i <= rec.e2; ++i) - { - uint64_t tpid = b->getPid(); - - // poor man's protection against over-counting - size_t s = 0; - while (s < seen.size() && seen[s] != tpid) ++s; - if (s < seen.size()) continue; - seen.push_back(tpid); - - size_t raw2 = b->approxOccurrenceCount(); - m_stats->add(tpid, sample_weight, aln, raw2, - rec.po_fwd, rec.po_bwd, docid); - bool ok = (i == rec.e2) || b->extend(o[i].id()); - UTIL_THROW_IF2(!ok, "Could not extend target phrase."); - } - if (s < rec.s2) // shift phrase-internal alignments - for (size_t k = 1; k < aln.size(); k += 2) - --aln[k]; + for (size_t i = rec.e1; i <= rec.e2; ++i) + { + uint64_t tpid = b->getPid(); + + // poor man's protection against over-counting + size_t s = 0; + while (s < seen.size() && seen[s] != tpid) ++s; + if (s < seen.size()) continue; + seen.push_back(tpid); + + size_t raw2 = b->approxOccurrenceCount(); + m_stats->add(tpid, sample_weight, m_bias ? (*m_bias)[p.sid] : 1, + aln, raw2, rec.po_fwd, rec.po_bwd, docid); + bool ok = (i == rec.e2) || b->extend(o[i].id()); + UTIL_THROW_IF2(!ok, "Could not extend target phrase."); + } + if (s < rec.s2) // shift phrase-internal alignments + for (size_t k = 1; k < aln.size(); k += 2) + --aln[k]; } } @@ -206,18 +232,18 @@ namespace bitext BitextSampler:: stats() { - if (m_ctr == 0) (*this)(); - boost::unique_lock lock(m_lock); - while (!m_finished) - m_ready.wait(lock); + // if (m_ctr == 0) (*this)(); + // boost::unique_lock lock(m_lock); + // while (!m_finished) + // m_ready.wait(lock); return m_stats; } template BitextSampler:: ~BitextSampler() - { - cerr << "bye" << endl; + { + m_stats->release(); } } // end of namespace bitext From 78b0aab65ba80ebaee4e5308834b19f0307244d5 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 14:04:19 +0100 Subject: [PATCH 039/286] Work in progress. 
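The test driver now pre-fetches phrase statistics for the whole input before printing anything: a StatsCollector (ug_prep_phrases.h) walks the suffix array of the input over a ug::ThreadPool, and the main loop afterwards only waits for each phrase's pstats and expands them into phrase pairs, printed sorted by joint count. A rough sketch (names as in the new main(); template arguments approximate):

    ug::ThreadPool T(boost::thread::hardware_concurrency());
    StatsCollector<Token, tsa> collect(Bptr, bias);
    collect.tpool = &T;
    collect.process(m, r);                 // sample in parallel, fill collect.lcache
    ...
    sptr<pstats> stats = (*collect.lcache)[r.getPid()];
    stats->wait();                         // block until this phrase is done
    expand(r, B, *stats, pplist, NULL);    // turn counts into PhrasePairs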
--- .../UG/test-ranked-phrase-lookup.cc | 169 ++++++++++++------ 1 file changed, 117 insertions(+), 52 deletions(-) diff --git a/moses/TranslationModel/UG/test-ranked-phrase-lookup.cc b/moses/TranslationModel/UG/test-ranked-phrase-lookup.cc index f8ae20a87..dd0468d20 100644 --- a/moses/TranslationModel/UG/test-ranked-phrase-lookup.cc +++ b/moses/TranslationModel/UG/test-ranked-phrase-lookup.cc @@ -8,10 +8,13 @@ #include "mm/ug_bitext_sampler.h" #include "mm/ug_phrasepair.h" #include "mm/ug_lru_cache.h" +// #include "mm/memtrack.h" #include "generic/sorting/VectorIndexSorter.h" #include "generic/sorting/NBestList.h" #include #include +#include "moses/thread_safe_container.h" +#include "mm/ug_prep_phrases.h" using namespace std; using namespace Moses; @@ -28,6 +31,7 @@ typedef imTSA imtsa; string bname, bname1, bname2, ifile, L1, L2, Q1, Q2; size_t maxhits; +size_t cache_size; void interpret_args(int ac, char* av[]); sptr @@ -52,22 +56,45 @@ read_input(TokenIndex& V) return ret; } +typedef ThreadSafeContainer > permacache_t; + +void dump(iter& m, TokenIndex& V) +{ + if (m.down()) + { + do + { + // cout << m.str(&V) << endl; + dump(m,V); + } + while (m.over()); + m.up(); + } +} + int main(int argc, char* argv[]) { + typedef vector > pplist_t; interpret_args(argc, argv); - iptr Bptr(new mmbitext); - mmbitext& B = *Bptr; + iptr Bptr(new mmbitext); + mmbitext& B = *Bptr;// static_cast(Bptr.get()); B.open(bname, L1, L2); + B.V1->setDynamic(true); sptr icrp = read_input(*B.V1); imtsa newIdx(icrp,NULL); - sptr bias = prime_sampling1(*B.I1, newIdx, 5000); + sptr bias = prime_sampling1(*B.I1, newIdx, 5000, B.sid2did()); cerr << "primed" << endl; - typedef vector > pplist_t; -#define WITH_CACHE 1 -#if WITH_CACHE - // map > CACHE; - lru_cache::LRU_Cache CACHE(1000); -#endif + ug::ThreadPool T(boost::thread::hardware_concurrency()); + TSA::tree_iterator m(&newIdx); + // dump(m, *B.V1); + // exit(0); + TSA::tree_iterator r(B.I1.get()); + StatsCollector collect(Bptr, bias); + collect.tpool = &T; + collect.process(m, r); + + typedef PhrasePair::SortDescendingByJointCount sorter_t; + sorter_t sorter; for (size_t s = 0; s < icrp->size(); ++s) { size_t stop = icrp->sntLen(s); @@ -78,48 +105,84 @@ int main(int argc, char* argv[]) iter r(B.I1.get()); for (size_t k = i; k < stop && r.extend(t[k].id()); ++k) { - // cerr << k << "/" << i << endl; - cout << "\n" << r.str(B.V1.get()) - << " [" << r.ca() << "]" << endl; - // sptr pplist; - sptr stats; -#if WITH_CACHE - // if (r.ca() > 1000) pplist = CACHE.get(r.getPid()); - stats = CACHE.get(r.getPid()); -#endif - vector > pplist; - if (!stats) - { - bitext::BitextSampler - sampler(&B, r, bias, 1000, ranked_sampling); - sampler(); - stats = sampler.stats(); - // pplist->resize(pplist->size()); -#if WITH_CACHE - - // if (r.ca() > 1000) - CACHE.set(r.getPid(), stats); -#endif - } + sptr stats = (*collect.lcache)[r.getPid()]; + stats->wait(); + pplist_t pplist; expand(r, B, *stats, pplist, NULL); - cout << pplist.size() << " " << sizeof(PhrasePair) << "; " - // << pstats::s_instance_count << " instances of pstats live. " - // << PhrasePair::s_instances << " instances of PhrasePair live." 
- << endl; - // BOOST_FOREACH(PhrasePair const& pp, *pplist) - // { - // if (pp.joint == 1) continue; - // cout << " " << setw(6) << pp.joint << " " - // << toString(*B.V2, pp.start2, pp.len2) << endl; - // } - } + if (pplist.empty()) continue; + cout << "\n" << r.str(B.V1.get()) << " [" << r.ca() << "]" << endl; + VectorIndexSorter, sorter_t> viso(pplist, sorter); + sptr > ranked = viso.GetOrder(); + size_t ctr=0; + BOOST_FOREACH(size_t const i, *ranked) + { + PhrasePair const& pp = pplist[i]; + // if (pp.joint == 1) break; + cout << boost::format(" %6d %.5f | ") % pp.joint % pp.cum_bias + << toString(*B.V2, pp.start2, pp.len2) << endl; + if (++ctr == 5) break; + } + } } - } - // cout << pstats::s_instance_count << " instances of pstats live. " - // << PhrasePair::s_instances << " instances of PhrasePair live." - // << endl; + } } + +// permacache_t permacache; +// lru_cache::LRU_Cache CACHE(10000); + + +// for (size_t s = 0; s < icrp->size(); ++s) +// { +// size_t stop = icrp->sntLen(s); +// Token const* t = icrp->sntStart(s); +// cout << string(80,'-') << "\n" << toString(*B.V1, t, stop) << endl; +// for (size_t i = 0; i < stop; ++i) +// { +// iter r(B.I1.get()); +// for (size_t k = i; k < stop && r.extend(t[k].id()); ++k) +// { +// // cerr << k << "/" << i << endl; +// cout << "\n" << r.str(B.V1.get()) +// << " [" << r.ca() << "]" << endl; +// // sptr pplist; +// sptr stats; + +// if (cache_size) stats = CACHE.get(r.getPid()); + +// vector > pplist; +// if (!stats) +// { +// bitext::BitextSampler +// sampler(&B, r, bias, 1000, ranked_sampling); +// sampler(); +// stats = sampler.stats(); +// if (cache_size) CACHE.set(r.getPid(), stats); +// } +// expand(r, B, *stats, pplist, NULL); +// typedef PhrasePair::SortDescendingByJointCount sorter_t; +// sorter_t sorter; +// VectorIndexSorter, sorter_t> viso(pplist, sorter); +// sptr > ranked = viso.GetOrder(); +// size_t ctr=0; +// BOOST_FOREACH(size_t const i, *ranked) +// { +// PhrasePair const& pp = pplist[i]; +// // if (pp.joint == 1) break; +// cout << " " << setw(6) << pp.joint << " " +// << boost::format("%.5f ") % pp.cum_bias +// << toString(*B.V2, pp.start2, pp.len2) << endl; +// if (++ctr == 5) break; +// } +// } +// } +// } +// // MemTrack::TrackListMemoryUsage(); +// // cout << pstats::s_instance_count << " instances of pstats live. " +// // << PhrasePair::s_instances << " instances of PhrasePair live." +// // << endl; +// } + // vector score(hits->size()); // VectorIndexSorter sorter(score); // for (size_t s = 0; s < icrp->size(); ++s) @@ -159,10 +222,12 @@ interpret_args(int ac, char* av[]) o.add_options() ("help,h", "print this message") - ("maxhits,n", po::value(&maxhits)->default_value(25), - "max. number of hits") - ("q1", po::value(&Q1), "query in L1") - ("q2", po::value(&Q2), "query in L2") + ("cache,C", po::value(&cache_size)->default_value(0), + "cache size") + // ("maxhits,n", po::value(&maxhits)->default_value(25), + // "max. number of hits") + // ("q1", po::value(&Q1), "query in L1") + // ("q2", po::value(&Q2), "query in L2") ; po::options_description h("Hidden Options"); @@ -170,7 +235,7 @@ interpret_args(int ac, char* av[]) ("bname", po::value(&bname), "base name of corpus") ("L1", po::value(&L1), "L1 tag") ("L2", po::value(&L2), "L2 tag") - ("ifile", po::value(&ifile), "input file") + ("ifile,i", po::value(&ifile), "input file") ; h.add(o); From 5dc9d68d2de126ea2c048eed2cef8a85a0672db8 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 14:05:41 +0100 Subject: [PATCH 040/286] Initial check-in. 
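A minimal smoke test for ug::ThreadPool: twenty 'hello' functors are queued on a pool constructed with 10, presumably the number of worker threads, so the "hello #0" ... "hello #19" lines may come out interleaved and in any order. The functors are held in the jobs vector via boost::shared_ptr on the assumption that ThreadPool::add() keeps only a reference to the callable, so the objects have to outlive the add() call; the pool is assumed to drain its queue when it goes out of scope at the end of main().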
--- .../UG/test-boost-threadpool.cc | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 moses/TranslationModel/UG/test-boost-threadpool.cc diff --git a/moses/TranslationModel/UG/test-boost-threadpool.cc b/moses/TranslationModel/UG/test-boost-threadpool.cc new file mode 100644 index 000000000..3f3340af9 --- /dev/null +++ b/moses/TranslationModel/UG/test-boost-threadpool.cc @@ -0,0 +1,32 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include "moses/TranslationModel/UG/generic/threading/ug_thread_pool.h" +using namespace std; + +class hello +{ + size_t n; +public: + hello(size_t const x) : n(x) { } + void operator()() { cout << "hello #" << n << endl; } +}; + + +int main() +{ + ug::ThreadPool T(10); + vector > jobs; + for (size_t i = 0; i < 20; ++i) + { + boost::shared_ptr j(new hello(i)); + jobs.push_back(j); + T.add(*j); + } +} From 5e2e63f678eebbf98daf00c75702684d9ba94213 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 14:06:54 +0100 Subject: [PATCH 041/286] Integration of ranked sampling. --- moses/TranslationModel/UG/mmsapt.cpp | 113 ++++++++++++++++++++++++--- moses/TranslationModel/UG/mmsapt.h | 14 +++- 2 files changed, 115 insertions(+), 12 deletions(-) diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index eb8e1fb89..924f0197b 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -77,10 +77,12 @@ namespace Moses , m_bias_log(NULL) , m_bias_loglevel(0) , m_lr_func(NULL) + , bias_key(((char*)this)+3) , cache_key(((char*)this)+2) , context_key(((char*)this)+1) // , m_tpc_ctr(0) , ofactor(1,0) + , m_sampling_method(ranked_sampling) { init(line); setup_local_feature_functions(); @@ -232,6 +234,18 @@ namespace Moses if ((m = param.find("extra")) != param.end()) m_extra_data = m->second; + if ((m = param.find("method")) != param.end()) + { + if (m->second == "rank") + m_sampling_method = ranked_sampling; + else if (m->second == "random") + m_sampling_method = random_sampling; + else if (m->second == "full") + m_sampling_method = full_coverage; + else UTIL_THROW2("unrecognized specification 'method='" << m->second + << "' in line:\n" << line); + } + dflt = pair("tuneable","true"); m_tuneable = Scan(param.insert(dflt).first->second.c_str()); @@ -262,6 +276,7 @@ namespace Moses // known_parameters.push_back("limit"); // replaced by "table-limit" known_parameters.push_back("logcnt"); known_parameters.push_back("lr-func"); // associated lexical reordering function + known_parameters.push_back("method"); known_parameters.push_back("name"); known_parameters.push_back("num-features"); known_parameters.push_back("output-factor"); @@ -441,6 +456,7 @@ namespace Moses Mmsapt:: Load(bool with_checks) { + boost::unique_lock lock(m_lock); // load feature functions (i.e., load underlying data bases, if any) BOOST_FOREACH(sptr& ff, m_active_ff_fix) ff->load(); BOOST_FOREACH(sptr& ff, m_active_ff_dyn) ff->load(); @@ -455,9 +471,11 @@ namespace Moses << this->m_numScoreComponents << ")!\n";); } #endif + + m_thread_pool.reset(new ug::ThreadPool(max(m_workers,size_t(1)))); + // Load corpora. For the time being, we can have one memory-mapped static // corpus and one in-memory dynamic corpus - boost::unique_lock lock(m_lock); btfix->m_num_workers = this->m_workers; btfix->open(m_bname, L1, L2); @@ -667,7 +685,19 @@ namespace Moses // for btfix. 
sptr sfix,sdyn; - if (mfix.size() == sphrase.size()) sfix = btfix->lookup(ttask, mfix); + if (mfix.size() == sphrase.size()) + { + sptr context = scope->get(btfix.get()); + sptr const* foo = context->cache1->get(mfix.getPid()); + if (foo) { sfix = *foo; sfix->wait(); } + else + { + BitextSampler s(btfix.get(), mfix, context->bias, + m_default_sample_size, m_sampling_method); + s(); + sfix = s.stats(); + } + } if (mdyn.size() == sphrase.size()) sdyn = dyn->lookup(ttask, mdyn); vector > ppfix,ppdyn; @@ -764,18 +794,17 @@ namespace Moses ChartRuleLookupManager* Mmsapt:: CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &, - size_t UnclearWhatThisVariableIsSupposedToAccomplishBecauseNobodyBotheredToDocumentItInPhraseTableDotHButIllTakeThisAsAnOpportunityToComplyWithTheMosesConventionOfRidiculouslyLongVariableAndClassNames) + size_t ) { throw "CreateRuleLookupManager is currently not supported in Mmsapt!"; } void Mmsapt:: - InitializeForInput(ttasksptr const& ttask) + set_bias_via_server(ttasksptr const& ttask) { sptr const& scope = ttask->GetScope(); - sptr context - = scope->get(&btfix, true); + sptr context = scope->get(btfix.get(), true); if (m_bias_server.size() && context->bias == NULL) { // we need to create the bias boost::unique_lock lock(context->lock); @@ -784,8 +813,7 @@ namespace Moses { if (m_bias_log) { - *m_bias_log << HERE << endl - << "BIAS LOOKUP CONTEXT: " + *m_bias_log << HERE << endl << "BIAS LOOKUP CONTEXT: " << context_words << endl; context->bias_log = m_bias_log; } @@ -797,6 +825,60 @@ namespace Moses if (!context->cache1) context->cache1.reset(new pstats::cache_t); if (!context->cache2) context->cache2.reset(new pstats::cache_t); } + } + + void + Mmsapt:: + set_bias_for_ranking(ttasksptr const& ttask, iptr const> bt) + { // thinking ahead: shard-specific set-up, for multi-shard Mmsapts + + sptr const& scope = ttask->GetScope(); + if (!scope) return; + + sptr context = scope->get(bt.get(), true); + if (context->bias) return; + + if (!context->cache1) context->cache1.reset(new pstats::cache_t); + if (!context->cache2) context->cache2.reset(new pstats::cache_t); + + sptr iowrapper = ttask->GetIOWrapper(); + vector input; + input.reserve(iowrapper->GetPastInput().size() + + iowrapper->GetFutureInput().size()); + BOOST_FOREACH(sptr const& s, iowrapper->GetPastInput()) + input.push_back(s->ToString()); + BOOST_FOREACH(sptr const& s, iowrapper->GetFutureInput()) + input.push_back(s->ToString()); + + size_t N = 10 * m_default_sample_size; + context->bias = prime_sampling1(*bt->V1, *bt->I1, input, N); + + } + + // void + // Mmsapt:: + // set_bias_via_ranking(ttasksptr const& ttask) + // { + // sptr const& scope = ttask->GetScope(); + // if (!scope) return; + // sptr bias = scope->get(bias_key); + // // For the time being, let's assume that ranking is always primed + // // on the entire document and leave local priming for another day. + // if (bias) return; + // // + // } + + void + Mmsapt:: + InitializeForInput(ttasksptr const& ttask) + { + set_bias_for_ranking(ttask, this->btfix); + // to do: depending on method, set bias for ranking, via consulting the bias + // server, or none at al. 
+ + sptr const& scope = ttask->GetScope(); + sptr context = scope->get(btfix.get(), true); + boost::unique_lock mylock(m_lock); sptr localcache = scope->get(cache_key); if (!localcache) @@ -828,13 +910,24 @@ namespace Moses PrefixExists(ttasksptr const& ttask, Moses::Phrase const& phrase) const { if (phrase.GetSize() == 0) return false; - vector myphrase; + sptr const& scope = ttask->GetScope(); + + vector myphrase; fillIdSeq(phrase,input_factor,*btfix->V1,myphrase); TSA::tree_iterator mfix(btfix->I1.get(),&myphrase[0],myphrase.size()); if (mfix.size() == myphrase.size()) { - btfix->prep(ttask, mfix); + sptr context = scope->get(btfix.get(), true); + uint64_t pid = mfix.getPid(); + if (!context->cache1->get(pid)) + { + BitextSampler s(btfix.get(), mfix, context->bias, + m_default_sample_size, m_sampling_method); + if (*context->cache1->get(pid, s.stats()) == s.stats()) + m_thread_pool->add(s); + } + // btfix->prep(ttask, mfix); // cerr << phrase << " " << mfix.approxOccurrenceCount() << endl; return true; } diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index 9b19eb356..cfb379c78 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -11,6 +11,7 @@ #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h" #include "moses/TranslationModel/UG/generic/sampling/Sampling.h" #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h" +#include "moses/TranslationModel/UG/generic/threading/ug_thread_pool.h" #include "moses/TranslationModel/UG/mm/ug_mm_ttrack.h" #include "moses/TranslationModel/UG/mm/ug_mm_tsa.h" @@ -19,6 +20,8 @@ #include "moses/TranslationModel/UG/mm/ug_typedefs.h" #include "moses/TranslationModel/UG/mm/tpt_pickler.h" #include "moses/TranslationModel/UG/mm/ug_bitext.h" +#include "moses/TranslationModel/UG/mm/ug_prime_sampling1.h" +#include "moses/TranslationModel/UG/mm/ug_bitext_sampler.h" #include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h" #include "moses/TranslationModel/UG/TargetPhraseCollectionCache.h" @@ -82,7 +85,10 @@ namespace Moses int m_bias_loglevel; LexicalReordering* m_lr_func; // associated lexical reordering function string m_lr_func_name; // name of associated lexical reordering function + sampling_method m_sampling_method; // sampling method, see ug_bitext_sampler + boost::scoped_ptr m_thread_pool; public: + void* const bias_key; // for getting bias from ttask void* const cache_key; // for getting cache from ttask void* const context_key; // for context scope from ttask private: @@ -116,8 +122,8 @@ namespace Moses std::vector >* registry = NULL); // add feature function if specified - void - add_corpus_specific_features(std::vector >& ffvec); + // void + // add_corpus_specific_features(std::vector >& ffvec); // built-in feature functions // PScorePfwd calc_pfwd_fix, calc_pfwd_dyn; @@ -136,6 +142,10 @@ namespace Moses std::vector ofactor; void setup_local_feature_functions(); + void set_bias_via_server(ttasksptr const& ttask); + + void + set_bias_for_ranking(ttasksptr const& ttask, iptr const> bt); private: From 2f125eddc3adac883f406b575988b8694bf5a1fa Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 14:08:35 +0100 Subject: [PATCH 042/286] Bug fix. Readability. 
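The bug: the const overload of GetOrder() called get_order(*ret) instead of GetOrder(*ret). A minimal usage sketch follows (illustrative only, not part of this patch; rank_example is just a placeholder name, and it assumes that GetOrder() returns a boost::shared_ptr and that the index type defaults to size_t, as the surrounding code suggests):

    #include <functional>
    #include <vector>
    #include <boost/shared_ptr.hpp>
    #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"

    void rank_example()
    {
      std::vector<float> score;
      score.push_back(0.2f);
      score.push_back(0.9f);
      score.push_back(0.5f);
      std::greater<float> cmp;   // rank highest score first
      Moses::VectorIndexSorter<float, std::greater<float> > sorter(score, cmp);
      boost::shared_ptr<std::vector<size_t> > order = sorter.GetOrder();
      // expected *order: 1 2 0 -- indices sorted by descending score,
      // with ties broken by the smaller index
    }

The point of the class is to obtain a ranking over indices without reordering the underlying vector, which is what the ranked phrase lookup uses it for.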
--- .../UG/generic/sorting/VectorIndexSorter.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h b/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h index f224b3bae..c68b0d2e4 100644 --- a/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h +++ b/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h @@ -34,15 +34,17 @@ namespace Moses COMP const& Compare; VectorIndexSorter(std::vector const& v, COMP const& comp) - : m_vecref(v), Compare(comp) { - } + : m_vecref(v), Compare(comp) + { } VectorIndexSorter(std::vector const& v) - : m_vecref(v), m_comp(new COMP()), Compare(*m_comp) { - } + : m_vecref(v), m_comp(new COMP()), Compare(*m_comp) + { } - bool operator()(IDX_T const & a, IDX_T const & b) const { - bool fwd = Compare(m_vecref.at(a) ,m_vecref.at(b)); + bool + operator()(IDX_T const & a, IDX_T const & b) const + { + bool fwd = Compare(m_vecref.at(a), m_vecref.at(b)); bool bwd = Compare(m_vecref[b], m_vecref[a]); return (fwd == bwd ? a < b : fwd); } @@ -61,7 +63,7 @@ namespace Moses GetOrder() const { boost::shared_ptr > ret(new std::vector(m_vecref.size())); - get_order(*ret); + GetOrder(*ret); return ret; } From b2a3bd280e31b194b9cff3e44915f6344e959bef Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 14:10:48 +0100 Subject: [PATCH 043/286] Allow intrusive pointers to const objects. --- moses/TranslationModel/UG/generic/threading/ug_ref_counter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/moses/TranslationModel/UG/generic/threading/ug_ref_counter.h b/moses/TranslationModel/UG/generic/threading/ug_ref_counter.h index 605cf2a32..d885b54a5 100644 --- a/moses/TranslationModel/UG/generic/threading/ug_ref_counter.h +++ b/moses/TranslationModel/UG/generic/threading/ug_ref_counter.h @@ -7,11 +7,11 @@ namespace Moses { class reference_counter { public: - friend void intrusive_ptr_add_ref(reference_counter* p) + friend void intrusive_ptr_add_ref(reference_counter const* p) { if (p) ++p->m_refcount; } - friend void intrusive_ptr_release(reference_counter* p) + friend void intrusive_ptr_release(reference_counter const* p) { if (p && --p->m_refcount == 0) delete p; From 3a5acb56ccdd8fef7dff0fd71829cb2f65babd82 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Jun 2015 15:32:53 +0100 Subject: [PATCH 044/286] Added some logging messages. --- moses/TranslationModel/UG/mmsapt.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 924f0197b..69082bc0b 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -851,7 +851,9 @@ namespace Moses input.push_back(s->ToString()); size_t N = 10 * m_default_sample_size; + VERBOSE(1,"Priming bias for ranking. [" << HERE << "]" << endl); context->bias = prime_sampling1(*bt->V1, *bt->I1, input, N); + VERBOSE(1,"Done. [" << HERE << "]" << endl); } From 9d46c5efa1c1f2cc3981547eb1a105daaa141471 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 15 Jun 2015 14:20:45 +0100 Subject: [PATCH 045/286] Rearrangement of members to match initialization order. 
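Background: non-static data members are always initialized in the order in which they are declared in the class, not in the order they appear in a constructor's initializer list, so a mismatch triggers -Wreorder warnings and can silently read uninitialized members. A standalone illustration (not Moses code, just a toy example):

    #include <iostream>

    struct Example {
      int a;
      int b;
      // Members are initialized in declaration order (a, then b), regardless
      // of the order written in the initializer list, so a is built from an
      // uninitialized b here.  gcc and clang warn about this with -Wreorder.
      Example() : b(2), a(b + 1) {}
    };

    int main() {
      Example e;
      std::cout << e.a << std::endl;   // indeterminate value, not 3
      return 0;
    }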
--- moses/TranslationModel/UG/mm/ug_phrasepair.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h index 5247b7f01..f35c5df0f 100644 --- a/moses/TranslationModel/UG/mm/ug_phrasepair.h +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h @@ -24,11 +24,12 @@ namespace Moses uint32_t len2; uint64_t p1, p2; uint32_t raw1, raw2, sample1, sample2, good1, good2, joint; + float cum_bias; std::vector fvals; float dfwd[Moses::LRModel::NONE+1]; // distortion counts // counts or probs? float dbwd[Moses::LRModel::NONE+1]; // distortion counts std::vector aln; - float score, cum_bias; + float score; bool inverse; std::vector indoc; PhrasePair() { }; From a627fd3cc664b8521f220112a4791e0154f5cfae Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 15 Jun 2015 14:22:32 +0100 Subject: [PATCH 046/286] Bug fix: set_bias_for_ranking needs to lock. --- moses/TranslationModel/UG/mmsapt.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 69082bc0b..cb4f1c93c 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -78,11 +78,11 @@ namespace Moses , m_bias_loglevel(0) , m_lr_func(NULL) , bias_key(((char*)this)+3) + , m_sampling_method(ranked_sampling) , cache_key(((char*)this)+2) , context_key(((char*)this)+1) // , m_tpc_ctr(0) , ofactor(1,0) - , m_sampling_method(ranked_sampling) { init(line); setup_local_feature_functions(); @@ -836,6 +836,7 @@ namespace Moses if (!scope) return; sptr context = scope->get(bt.get(), true); + boost::unique_lock lock(context->lock); if (context->bias) return; if (!context->cache1) context->cache1.reset(new pstats::cache_t); From ad8114ddb0b7af26c74680a9657c5fa6f82cf1eb Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Mon, 15 Jun 2015 16:23:12 +0100 Subject: [PATCH 047/286] capitalisation --- scripts/ems/experiment.meta | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 62e38128c..9edeec460 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -166,14 +166,14 @@ prepare-bilingual-nplm ignore-unless: bilingual-lm rerun-on-change: TRAINING:corpus TRAINING:word-alignment template: $moses-script-dir/training/bilingual-lm/extract_training.py -c IN0 -e $output-extension -f $input-extension -a IN1.$TRAINING:alignment-symmetrization-method -w $working-dir/$bilingual-lm-workdir -n $order -m $source-window $bilingual-lm-settings - default-name: LM/bilingualLM_prep + default-name: lm/bilingualLM_prep train-bilingual-lm in: numberized_ngrams TRAINING:corpus out: binlm ignore-unless: bilingual-lm rerun-on-change: numberized_ngrams template: $moses-script-dir/training/bilingual-lm/train_nplm.py -w $working-dir/$bilingual-lm-workdir -c IN1 -r $working-dir/$nplm-output-dir -n $train_order $nplm-settings - default-name: LM/BilingualLM + default-name: lm/bilingualLM get-corpus in: get-corpus-script out: raw-corpus From 6c0f875385ffaf827139c9a9220a2c5bfd195178 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Tue, 16 Jun 2015 16:19:41 +0100 Subject: [PATCH 048/286] testing the waters for c++11 please adjust your compiler options or complain if you rely on a compiler that doesn't support c++11 yet. 
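Concretely, this switches the build to -std=c++0x and starts using two C++11 features in the code below: 'auto' for verbose iterator types (StaticData.cpp) and brace-init-lists for passing argument vectors (ScoreFeatureTest.cpp). A standalone illustration (not Moses code; configure() here is only a stand-in for the real ScoreFeatureManager::configure):

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    static void configure(const std::vector<std::string>& args) {
      std::cout << args.size() << " arguments" << std::endl;
    }

    int main() {
      configure({"--DomainRatio", "/dev/null"});       // brace-init-list argument
      std::map<std::string, std::vector<float> > weights;
      weights["lm"].push_back(0.5f);
      for (auto it = weights.begin(); it != weights.end(); ++it)  // 'auto' replaces
        std::cout << it->first << std::endl;                      // the long iterator type
      return 0;
    }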
--- Jamroot | 2 +- moses/StaticData.cpp | 3 +-- phrase-extract/ScoreFeatureTest.cpp | 20 ++++++++++---------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/Jamroot b/Jamroot index 119c6183e..a4957dfa2 100644 --- a/Jamroot +++ b/Jamroot @@ -108,7 +108,7 @@ external-lib z ; #lib dl : : static:static shared:shared ; #requirements += dl ; - +requirements += gcc:-std=c++0x ; if ! [ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] { if [ option.get "full-tcmalloc" : : "yes" ] { diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 8fb88c257..28d9f7831 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -1115,8 +1115,7 @@ void StaticData::LoadSparseWeightsFromConfig() } std::map > weights = m_parameter->GetAllWeights(); - std::map >::iterator iter; - for (iter = weights.begin(); iter != weights.end(); ++iter) { + for (auto iter = weights.begin(); iter != weights.end(); ++iter) { // this indicates that it is sparse feature if (featureNames.find(iter->first) == featureNames.end()) { UTIL_THROW_IF2(iter->second.size() != 1, "ERROR: only one weight per sparse feature allowed: " << iter->first); diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 93a452dad..2863122dd 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -53,16 +53,16 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")), + manager.configure({"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")), + manager.configure({"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")), + manager.configure({"--SparseDomainBlah","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--DomainSubset")), + manager.configure({"--DomainSubset"}), ScoreFeatureArgumentException); } @@ -84,16 +84,16 @@ static void checkDomainConfigured( BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - (boost::assign::list_of ("--DomainRatio")("/dev/null")); + ({"--DomainRatio","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--DomainIndicator")("/dev/null")); + ({"--DomainIndicator","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--DomainSubset")("/dev/null")); + ({"--DomainSubset","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainRatio")("/dev/null")); + ({"--SparseDomainRatio","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainIndicator")("/dev/null")); + ({"--SparseDomainIndicator","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainSubset")("/dev/null")); + ({"--SparseDomainSubset","/dev/null"}); } From 2a798c0b9f19e44c1a63c7c75f657ae15968c8d0 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Wed, 17 Jun 2015 00:00:42 +0100 Subject: [PATCH 049/286] daily automatic beautifier --- phrase-extract/ScoreFeatureTest.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git 
a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 2863122dd..51d4e1297 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -53,16 +53,16 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - manager.configure({"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), + manager.configure( {"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure({"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), + manager.configure( {"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure({"--SparseDomainBlah","/dev/null"}), + manager.configure( {"--SparseDomainBlah","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure({"--DomainSubset"}), + manager.configure( {"--DomainSubset"}), ScoreFeatureArgumentException); } @@ -84,16 +84,16 @@ static void checkDomainConfigured( BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - ({"--DomainRatio","/dev/null"}); + ( {"--DomainRatio","/dev/null"}); checkDomainConfigured - ({"--DomainIndicator","/dev/null"}); + ( {"--DomainIndicator","/dev/null"}); checkDomainConfigured - ({"--DomainSubset","/dev/null"}); + ( {"--DomainSubset","/dev/null"}); checkDomainConfigured - ({"--SparseDomainRatio","/dev/null"}); + ( {"--SparseDomainRatio","/dev/null"}); checkDomainConfigured - ({"--SparseDomainIndicator","/dev/null"}); + ( {"--SparseDomainIndicator","/dev/null"}); checkDomainConfigured - ({"--SparseDomainSubset","/dev/null"}); + ( {"--SparseDomainSubset","/dev/null"}); } From 42c5424c86bc2f7f79b70821169dc24433e04b28 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 17 Jun 2015 10:58:47 +0400 Subject: [PATCH 050/286] 1st casualty of c++11. 
clang 2.6 (latest c++ compiler on osx) doesn't support list of object init --- phrase-extract/ScoreFeatureTest.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 51d4e1297..93a452dad 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -53,16 +53,16 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - manager.configure( {"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), + manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), + manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--SparseDomainBlah","/dev/null"}), + manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--DomainSubset"}), + manager.configure(boost::assign::list_of("--DomainSubset")), ScoreFeatureArgumentException); } @@ -84,16 +84,16 @@ static void checkDomainConfigured( BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - ( {"--DomainRatio","/dev/null"}); + (boost::assign::list_of ("--DomainRatio")("/dev/null")); checkDomainConfigured - ( {"--DomainIndicator","/dev/null"}); + (boost::assign::list_of("--DomainIndicator")("/dev/null")); checkDomainConfigured - ( {"--DomainSubset","/dev/null"}); + (boost::assign::list_of("--DomainSubset")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainRatio","/dev/null"}); + (boost::assign::list_of("--SparseDomainRatio")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainIndicator","/dev/null"}); + (boost::assign::list_of("--SparseDomainIndicator")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainSubset","/dev/null"}); + (boost::assign::list_of("--SparseDomainSubset")("/dev/null")); } From 80f0f71d03b0348649835e674692938dc6862840 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 17 Jun 2015 11:25:27 +0400 Subject: [PATCH 051/286] Revert "1st casualty of c++11. clang 2.6 (latest c++ compiler on osx) doesn't support list of object init" This reverts commit 42c5424c86bc2f7f79b70821169dc24433e04b28. 
--- phrase-extract/ScoreFeatureTest.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 93a452dad..51d4e1297 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -53,16 +53,16 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")), + manager.configure( {"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")), + manager.configure( {"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")), + manager.configure( {"--SparseDomainBlah","/dev/null"}), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure(boost::assign::list_of("--DomainSubset")), + manager.configure( {"--DomainSubset"}), ScoreFeatureArgumentException); } @@ -84,16 +84,16 @@ static void checkDomainConfigured( BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - (boost::assign::list_of ("--DomainRatio")("/dev/null")); + ( {"--DomainRatio","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--DomainIndicator")("/dev/null")); + ( {"--DomainIndicator","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--DomainSubset")("/dev/null")); + ( {"--DomainSubset","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainRatio")("/dev/null")); + ( {"--SparseDomainRatio","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainIndicator")("/dev/null")); + ( {"--SparseDomainIndicator","/dev/null"}); checkDomainConfigured - (boost::assign::list_of("--SparseDomainSubset")("/dev/null")); + ( {"--SparseDomainSubset","/dev/null"}); } From 127b860c6a7b54daa9b8808006835410510241aa Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 17 Jun 2015 11:27:50 +0400 Subject: [PATCH 052/286] false alarm. clang does support object list init. Needed to enable c++11 for all toolsets --- Jamroot | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jamroot b/Jamroot index a4957dfa2..4f76ec3ba 100644 --- a/Jamroot +++ b/Jamroot @@ -108,7 +108,7 @@ external-lib z ; #lib dl : : static:static shared:shared ; #requirements += dl ; -requirements += gcc:-std=c++0x ; +requirements += -std=c++0x ; if ! 
[ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] { if [ option.get "full-tcmalloc" : : "yes" ] { From 7031992caa2bd850d2442ae99b697f01194046db Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 17 Jun 2015 11:42:46 +0400 Subject: [PATCH 053/286] use c++11 unordered set code --- phrase-extract/ScoreFeatureTest.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 51d4e1297..9497414be 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -25,7 +25,7 @@ #include #include -#include +#include using namespace MosesTraining; using namespace std; @@ -95,5 +95,15 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) ( {"--SparseDomainIndicator","/dev/null"}); checkDomainConfigured ( {"--SparseDomainSubset","/dev/null"}); + + unordered_set s; + s.insert(4); + s.insert(7); + s.insert(4); + s.insert(1); + + for (auto i: s) { + cerr << i << " "; + } } From 425118aa5d794a43a1aff6e692c4e90c7e0f800e Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Wed, 17 Jun 2015 09:32:29 +0100 Subject: [PATCH 054/286] bugfixes - working directory --- scripts/training/train-neurallm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py index 4f0e8bdaf..00da64986 100755 --- a/scripts/training/train-neurallm.py +++ b/scripts/training/train-neurallm.py @@ -187,12 +187,14 @@ def main(options): ret = subprocess.call(extraction_cmd) if ret: raise Exception("preparing neural LM failed") + options.validation_file = os.path.join( + options.working_dir, os.path.basename(options.validation_corpus)) else: options.validation_file = None - options.input_words_file = options.words_file - options.output_words_file = options.words_file + options.input_words_file = os.path.join(options.working_dir, options.words_file) + options.output_words_file = os.path.join(options.working_dir, options.words_file) options.input_vocab_size = options.vocab_size options.output_vocab_size = options.vocab_size From f29f67710e980db7f965b9b2e849b7c14dcf338d Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Thu, 18 Jun 2015 00:00:39 +0100 Subject: [PATCH 055/286] daily automatic beautifier --- phrase-extract/ScoreFeatureTest.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 9497414be..cc22f8630 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -95,15 +95,15 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) ( {"--SparseDomainIndicator","/dev/null"}); checkDomainConfigured ( {"--SparseDomainSubset","/dev/null"}); - + unordered_set s; s.insert(4); s.insert(7); s.insert(4); s.insert(1); - - for (auto i: s) { - cerr << i << " "; + +for (auto i: s) { + cerr << i << " "; } } From 90470e878d7ee150baafbb718ee6a402f641c9a5 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 19 Jun 2015 15:58:14 +0100 Subject: [PATCH 056/286] Fix some C++11-related compilation errors (clang) --- biconcor/Vocabulary.cpp | 4 ++-- moses/TranslationModel/RuleTable/LoaderFactory.cpp | 3 +-- phrase-extract/extract-mixed-syntax/Main.cpp | 7 ++----- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/biconcor/Vocabulary.cpp b/biconcor/Vocabulary.cpp index f0f07c97d..3879b451d 100644 --- a/biconcor/Vocabulary.cpp +++ b/biconcor/Vocabulary.cpp @@ -62,7 +62,7 @@ void 
Vocabulary::Save(const string& fileName ) const vcbFile.open( fileName.c_str(), ios::out | ios::ate | ios::trunc); if (!vcbFile) { - cerr << "Failed to open " << vcbFile << endl; + cerr << "Failed to open " << fileName << endl; exit(1); } @@ -81,7 +81,7 @@ void Vocabulary::Load(const string& fileName ) vcbFile.open(fileName.c_str()); if (!vcbFile) { - cerr << "no such file or directory: " << vcbFile << endl; + cerr << "no such file or directory: " << fileName << endl; exit(1); } diff --git a/moses/TranslationModel/RuleTable/LoaderFactory.cpp b/moses/TranslationModel/RuleTable/LoaderFactory.cpp index 66a39e3bd..5569f952c 100644 --- a/moses/TranslationModel/RuleTable/LoaderFactory.cpp +++ b/moses/TranslationModel/RuleTable/LoaderFactory.cpp @@ -40,9 +40,8 @@ std::auto_ptr RuleTableLoaderFactory::Create( { InputFileStream input(path); std::string line; - bool cont = std::getline(input, line); - if (cont) { + if (std::getline(input, line)) { std::vector tokens; Tokenize(tokens, line); if (tokens.size() == 1) { diff --git a/phrase-extract/extract-mixed-syntax/Main.cpp b/phrase-extract/extract-mixed-syntax/Main.cpp index 5d1b3e7f5..f011e6e8d 100644 --- a/phrase-extract/extract-mixed-syntax/Main.cpp +++ b/phrase-extract/extract-mixed-syntax/Main.cpp @@ -148,13 +148,10 @@ int main(int argc, char** argv) cerr << lineNum << " "; } - bool success; - success = getline(strmSource, lineSource); - if (!success) { + if (!getline(strmSource, lineSource)) { throw "Couldn't read source"; } - success = getline(strmAlignment, lineAlignment); - if (!success) { + if (!getline(strmAlignment, lineAlignment)) { throw "Couldn't read alignment"; } From 65bd46df65910d673ed49c9a7551d472e8127039 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 19 Jun 2015 21:50:01 +0100 Subject: [PATCH 057/286] Added feature with cumulative bias. 
--- moses/TranslationModel/UG/mmsapt.cpp | 5 ++- .../TranslationModel/UG/sapt_phrase_scorers.h | 1 + .../UG/sapt_pscore_cumulative_bias.h | 37 +++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index cb4f1c93c..f65a9c9ee 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -77,8 +77,8 @@ namespace Moses , m_bias_log(NULL) , m_bias_loglevel(0) , m_lr_func(NULL) - , bias_key(((char*)this)+3) , m_sampling_method(ranked_sampling) + , bias_key(((char*)this)+3) , cache_key(((char*)this)+2) , context_key(((char*)this)+1) // , m_tpc_ctr(0) @@ -200,6 +200,7 @@ namespace Moses param.insert(pair("coh", "0")); param.insert(pair("rare", "1")); param.insert(pair("prov", "1")); + param.insert(pair("cumb", "0")); poolCounts = true; @@ -269,6 +270,7 @@ namespace Moses known_parameters.push_back("cache"); known_parameters.push_back("coh"); known_parameters.push_back("config"); + known_parameters.push_back("cumb"); known_parameters.push_back("extra"); known_parameters.push_back("feature-sets"); known_parameters.push_back("input-factor"); @@ -425,6 +427,7 @@ namespace Moses check_ff > ("rare", &m_active_ff_common); check_ff >("unal", &m_active_ff_common); check_ff >("coh", &m_active_ff_common); + check_ff >("cumb", &m_active_ff_common); // for these ones either way is possible (specification ends with '+' // if corpus-specific diff --git a/moses/TranslationModel/UG/sapt_phrase_scorers.h b/moses/TranslationModel/UG/sapt_phrase_scorers.h index ace907d73..efdcaffaa 100644 --- a/moses/TranslationModel/UG/sapt_phrase_scorers.h +++ b/moses/TranslationModel/UG/sapt_phrase_scorers.h @@ -12,3 +12,4 @@ #include "sapt_pscore_coherence.h" // coherence feature: good/sample-size #include "sapt_pscore_phrasecount.h" // phrase count #include "sapt_pscore_wordcount.h" // word count +#include "sapt_pscore_cumulative_bias.h" // cumulative bias score diff --git a/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h b/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h new file mode 100644 index 000000000..0dff728d7 --- /dev/null +++ b/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h @@ -0,0 +1,37 @@ +// -*- c++ -*- +// Phrase scorer that records the aggregated bias score +// + +#include "sapt_pscore_base.h" +#include + +using namespace std; +namespace Moses { + namespace bitext { + + template + class + PScoreCumBias : public PhraseScorer + { + public: + PScoreCumBias(string const spec) + { + this->m_index = -1; + this->m_feature_names.push_back("cumb"); + this->m_num_feats = this->m_feature_names.size(); + } + + bool + isIntegerValued(int i) const { return false; } + + void + operator()(Bitext const& bt, + PhrasePair& pp, + vector * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + (*dest)[this->m_index] = log(pp.cum_bias); + } + }; + } // namespace bitext +} // namespace Moses From 1bd10e104ce5a8e51e7336ad5bbf1c61b56a0883 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sun, 21 Jun 2015 18:27:56 +0200 Subject: [PATCH 058/286] workaround/cleaning for weird copy-constructor behaviour with C++11 --- .../CompactPT/BlockHashIndex.cpp | 2 +- .../LexicalReorderingTableCreator.cpp | 9 +++-- .../CompactPT/MmapAllocator.h | 12 ++++--- .../CompactPT/PhraseTableCreator.cpp | 6 ++-- .../TranslationModel/CompactPT/StringVector.h | 35 +++++++++---------- 5 files changed, 33 insertions(+), 31 
deletions(-) diff --git a/moses/TranslationModel/CompactPT/BlockHashIndex.cpp b/moses/TranslationModel/CompactPT/BlockHashIndex.cpp index c90dcd6d9..27209f5bc 100644 --- a/moses/TranslationModel/CompactPT/BlockHashIndex.cpp +++ b/moses/TranslationModel/CompactPT/BlockHashIndex.cpp @@ -34,7 +34,7 @@ namespace Moses BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits, size_t threadsNum) : m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), - m_fileHandle(0), m_fileHandleStart(0), m_size(0), + m_fileHandle(0), m_fileHandleStart(0), m_landmarks(true), m_size(0), m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0), m_threadPool(threadsNum) { diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp index 9fe9eec30..8e9f4fa0a 100644 --- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp +++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp @@ -52,13 +52,12 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator( std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl; m_hash.BeginSave(m_outFile); - - + if(tempfilePath.size()) { MmapAllocator allocEncoded(util::FMakeTemp(tempfilePath)); m_encodedScores = new StringVector(allocEncoded); } else { - m_encodedScores = new StringVector(); + m_encodedScores = new StringVector(true); } EncodeScores(); @@ -68,12 +67,12 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator( std::cerr << "Pass 2/2: Compressing scores" << std::endl; - + if(tempfilePath.size()) { MmapAllocator allocCompressed(util::FMakeTemp(tempfilePath)); m_compressedScores = new StringVector(allocCompressed); } else { - m_compressedScores = new StringVector(); + m_compressedScores = new StringVector(true); } CompressScores(); diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h index 78084f883..0e04890bd 100644 --- a/moses/TranslationModel/CompactPT/MmapAllocator.h +++ b/moses/TranslationModel/CompactPT/MmapAllocator.h @@ -62,6 +62,9 @@ public: typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; + MmapAllocator(MmapAllocator &&) = delete; + MmapAllocator(const MmapAllocator &&) = delete; + MmapAllocator() throw() : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)), m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0), @@ -151,11 +154,12 @@ public: if(!m_fixed) { util::UnmapOrThrow(p, num * sizeof(T)); } else { - size_t map_offset = (m_data_offset / m_page_size) * m_page_size; - size_t relative_offset = m_data_offset - map_offset; - util::UnmapOrThrow((pointer)((char*)p - relative_offset), num * sizeof(T)); + const size_t map_offset = (m_data_offset / m_page_size) * m_page_size; + const size_t relative_offset = m_data_offset - map_offset; + const size_t adjusted_map_size = m_map_size + relative_offset; + + util::UnmapOrThrow((pointer)((char*)p - relative_offset), adjusted_map_size); } - } void construct (pointer p, const T& value) { diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp index ba1dfc578..d590ef9b3 100644 --- a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp +++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp @@ -130,7 +130,7 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath, MmapAllocator allocCompressed(util::FMakeTemp(tempfilePath)); m_compressedTargetPhrases = new 
StringVector(allocCompressed); } else { - m_compressedTargetPhrases = new StringVector(); + m_compressedTargetPhrases = new StringVector(true); } CompressTargetPhrases(); @@ -203,7 +203,7 @@ void PhraseTableCreator::Save() = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++) temp1[it->second] = it->first; std::sort(temp1.begin(), temp1.end()); - StringVector sourceSymbols; + StringVector sourceSymbols(true); for(std::vector::iterator it = temp1.begin(); it != temp1.end(); it++) sourceSymbols.push_back(*it); @@ -224,7 +224,7 @@ void PhraseTableCreator::Save() for(boost::unordered_map::iterator it = m_targetSymbolsMap.begin(); it != m_targetSymbolsMap.end(); it++) temp2[it->second] = it->first; - StringVector targetSymbols; + StringVector targetSymbols(true); for(std::vector::iterator it = temp2.begin(); it != temp2.end(); it++) targetSymbols.push_back(*it); diff --git a/moses/TranslationModel/CompactPT/StringVector.h b/moses/TranslationModel/CompactPT/StringVector.h index bb2bc11ef..3af970c41 100644 --- a/moses/TranslationModel/CompactPT/StringVector.h +++ b/moses/TranslationModel/CompactPT/StringVector.h @@ -147,8 +147,8 @@ public: typedef RangeIterator iterator; typedef StringIterator string_iterator; - StringVector(); - StringVector(Allocator alloc); + StringVector(bool allocate = false); + StringVector(Allocator& alloc); virtual ~StringVector() { delete m_charArray; @@ -203,13 +203,13 @@ public: m_memoryMapped = memoryMapped; size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool); - size += m_positions.load(in, m_memoryMapped); + size += m_positions.load(in, false); - size += loadCharArray(*m_charArray, in, m_memoryMapped); + size += loadCharArray(m_charArray, in, m_memoryMapped); return size; } - size_t loadCharArray(std::vector >& c, + size_t loadCharArray(std::vector >*& c, std::FILE* in, bool map = false) { // Can only be read into memory. Mapping not possible with std:allocator. assert(map == false); @@ -219,13 +219,13 @@ public: size_t valSize; byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t); - c.resize(valSize, 0); - byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); + c = new std::vector >(valSize, 0); + byteSize += std::fread(&(*c)[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); return byteSize; } - size_t loadCharArray(std::vector >& c, + size_t loadCharArray(std::vector >*& c, std::FILE* in, bool map = false) { size_t byteSize = 0; @@ -235,19 +235,17 @@ public: if(map == false) { // Read data into temporary file (default constructor of MmapAllocator) // and map memory onto temporary file. Can be resized. - - c.resize(valSize, 0); - byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); + c = new std::vector >(valSize, 0); + byteSize += std::fread(&(*c)[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); } else { // Map it directly on specified region of file "in" starting at valPos // with length valSize * sizeof(ValueT). Mapped region cannot be resized. 
size_t valPos = std::ftell(in); Allocator alloc(in, valPos); - std::vector > charArrayTemp(alloc); - charArrayTemp.resize(valSize, 0); - c.swap(charArrayTemp); - + c = new std::vector >(alloc); + c->resize(valSize, 0); + byteSize += valSize * sizeof(ValueT); } @@ -369,11 +367,12 @@ OStream& operator<<(OStream &os, ValueIteratorRange cr) // StringVector template class Allocator> -StringVector::StringVector() - : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector >()) { } +StringVector::StringVector(bool allocate) + : m_sorted(true), m_memoryMapped(false), + m_charArray(allocate ? new std::vector >() : 0) { } template class Allocator> -StringVector::StringVector(Allocator alloc) +StringVector::StringVector(Allocator &alloc) : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector >(alloc)) { } template class Allocator> From 0f943dd9c10acf4ac0cae5b642175d763594e4b1 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 21 Jun 2015 21:16:12 +0400 Subject: [PATCH 059/286] clang compile errors --- contrib/other-builds/all.workspace | 4 ++-- contrib/other-builds/moses/moses.project | 2 +- .../CompactPT/LexicalReorderingTableCompact.cpp | 4 ++-- moses/TranslationModel/CompactPT/PhraseDecoder.cpp | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/contrib/other-builds/all.workspace b/contrib/other-builds/all.workspace index 66dafe3d2..621bafdc2 100644 --- a/contrib/other-builds/all.workspace +++ b/contrib/other-builds/all.workspace @@ -6,10 +6,10 @@ - + - + diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project index 66e0b9bad..81072d667 100644 --- a/contrib/other-builds/moses/moses.project +++ b/contrib/other-builds/moses/moses.project @@ -814,7 +814,7 @@ - + diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp index fe475507c..cd71b1776 100644 --- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp +++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp @@ -78,9 +78,9 @@ GetScore(const Phrase& f, const Phrase& e, const Phrase& c) if(m_hash.GetSize() != index) { std::string scoresString; if(m_inMemory) - scoresString = m_scoresMemory[index]; + scoresString = m_scoresMemory[index].str(); else - scoresString = m_scoresMapped[index]; + scoresString = m_scoresMapped[index].str(); BitWrapper<> bitStream(scoresString); for(size_t i = 0; i < m_numScoreComponent; i++) diff --git a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp index 3cf2f010e..54e6815a1 100644 --- a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp +++ b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp @@ -224,9 +224,9 @@ TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase & // Retrieve compressed and encoded target phrase collection std::string encodedPhraseCollection; if(m_phraseDictionary.m_inMemory) - encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMemory[sourcePhraseId]; + encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMemory[sourcePhraseId].str(); else - encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMapped[sourcePhraseId]; + encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMapped[sourcePhraseId].str(); BitWrapper<> encodedBitStream(encodedPhraseCollection); if(m_coding == PREnc && bitsLeft) From 6151003c1362f7ba12e769c3dd69bf21992ac48e Mon Sep 17 00:00:00 2001 From: Marcin 
Junczys-Dowmunt Date: Sun, 21 Jun 2015 19:24:43 +0200 Subject: [PATCH 060/286] Remove C++11 oddities --- moses/TranslationModel/CompactPT/MmapAllocator.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h index 0e04890bd..1d0d06f77 100644 --- a/moses/TranslationModel/CompactPT/MmapAllocator.h +++ b/moses/TranslationModel/CompactPT/MmapAllocator.h @@ -61,9 +61,6 @@ public: typedef const T& const_reference; typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; - - MmapAllocator(MmapAllocator &&) = delete; - MmapAllocator(const MmapAllocator &&) = delete; MmapAllocator() throw() : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)), From e57ca5ec34c8723a73122b3e0963a1e8ff719a45 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Mon, 22 Jun 2015 00:00:43 +0100 Subject: [PATCH 061/286] daily automatic beautifier --- .../CompactPT/LexicalReorderingTableCreator.cpp | 4 ++-- moses/TranslationModel/CompactPT/MmapAllocator.h | 4 ++-- moses/TranslationModel/CompactPT/StringVector.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp index 8e9f4fa0a..4941d32ec 100644 --- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp +++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp @@ -52,7 +52,7 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator( std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl; m_hash.BeginSave(m_outFile); - + if(tempfilePath.size()) { MmapAllocator allocEncoded(util::FMakeTemp(tempfilePath)); m_encodedScores = new StringVector(allocEncoded); @@ -67,7 +67,7 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator( std::cerr << "Pass 2/2: Compressing scores" << std::endl; - + if(tempfilePath.size()) { MmapAllocator allocCompressed(util::FMakeTemp(tempfilePath)); m_compressedScores = new StringVector(allocCompressed); diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h index 1d0d06f77..72d0c1663 100644 --- a/moses/TranslationModel/CompactPT/MmapAllocator.h +++ b/moses/TranslationModel/CompactPT/MmapAllocator.h @@ -61,7 +61,7 @@ public: typedef const T& const_reference; typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; - + MmapAllocator() throw() : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)), m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0), @@ -154,7 +154,7 @@ public: const size_t map_offset = (m_data_offset / m_page_size) * m_page_size; const size_t relative_offset = m_data_offset - map_offset; const size_t adjusted_map_size = m_map_size + relative_offset; - + util::UnmapOrThrow((pointer)((char*)p - relative_offset), adjusted_map_size); } } diff --git a/moses/TranslationModel/CompactPT/StringVector.h b/moses/TranslationModel/CompactPT/StringVector.h index 3af970c41..aaec500f0 100644 --- a/moses/TranslationModel/CompactPT/StringVector.h +++ b/moses/TranslationModel/CompactPT/StringVector.h @@ -235,7 +235,7 @@ public: if(map == false) { // Read data into temporary file (default constructor of MmapAllocator) // and map memory onto temporary file. Can be resized. 
- c = new std::vector >(valSize, 0); + c = new std::vector >(valSize, 0); byteSize += std::fread(&(*c)[0], sizeof(ValueT), valSize, in) * sizeof(ValueT); } else { // Map it directly on specified region of file "in" starting at valPos @@ -245,7 +245,7 @@ public: Allocator alloc(in, valPos); c = new std::vector >(alloc); c->resize(valSize, 0); - + byteSize += valSize * sizeof(ValueT); } From 2a242afa346b70a6c8dc22522349300b6d28e563 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 22 Jun 2015 10:46:12 -0400 Subject: [PATCH 062/286] Didn't need header --- moses/IOWrapper.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/moses/IOWrapper.h b/moses/IOWrapper.h index f1bcefa92..e3057794f 100644 --- a/moses/IOWrapper.h +++ b/moses/IOWrapper.h @@ -61,8 +61,6 @@ POSSIBILITY OF SUCH DAMAGE. #include "moses/ChartKBestExtractor.h" #include "moses/Syntax/KBestExtractor.h" -#include "search/applied.hh" - #include namespace Moses From aaf8397d6f11c4039f06b3f6089c402e0cbea54b Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Tue, 23 Jun 2015 13:24:42 +0100 Subject: [PATCH 063/286] Make it possible for TranslationTasks to be accessed by the LM --- moses/BaseManager.cpp | 5 +++++ moses/BaseManager.h | 1 + moses/FF/StatefulFeatureFunction.h | 8 ++++++++ moses/Hypothesis.cpp | 8 +++++--- moses/LM/IRST.cpp | 2 +- moses/LM/IRST.h | 2 +- 6 files changed, 21 insertions(+), 5 deletions(-) diff --git a/moses/BaseManager.cpp b/moses/BaseManager.cpp index ce1a5c5d9..609a6e9f5 100644 --- a/moses/BaseManager.cpp +++ b/moses/BaseManager.cpp @@ -27,6 +27,11 @@ BaseManager::GetSource() const return m_source; } +const ttasksptr& +BaseManager::GetTtask() const { + return m_ttask.lock(); +} + void BaseManager:: OutputSearchGraphAsHypergraph(std::ostream& out) const diff --git a/moses/BaseManager.h b/moses/BaseManager.h index 422b61c1f..7367997ad 100644 --- a/moses/BaseManager.h +++ b/moses/BaseManager.h @@ -50,6 +50,7 @@ public: //! the input sentence being decoded const InputType& GetSource() const; + const ttasksptr& GetTtask() const; virtual void Decode() = 0; // outputs diff --git a/moses/FF/StatefulFeatureFunction.h b/moses/FF/StatefulFeatureFunction.h index c12f9516f..9baa4735d 100644 --- a/moses/FF/StatefulFeatureFunction.h +++ b/moses/FF/StatefulFeatureFunction.h @@ -37,6 +37,14 @@ public: const FFState* prev_state, ScoreComponentCollection* accumulator) const = 0; + virtual FFState* EvaluateWhenAppliedWithContext( + ttasksptr const& ttasks, + const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const { + return EvaluateWhenApplied(cur_hypo, prev_state, accumulator); + } + virtual FFState* EvaluateWhenApplied( const ChartHypothesis& /* cur_hypo */, int /* featureID - used to index the state in the previous hypotheses */, diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp index bc466664a..b722ae05f 100644 --- a/moses/Hypothesis.cpp +++ b/moses/Hypothesis.cpp @@ -230,9 +230,11 @@ EvaluateWhenApplied(StatefulFeatureFunction const& sfff, { const StaticData &staticData = StaticData::Instance(); if (! staticData.IsFeatureFunctionIgnored( sfff )) { - m_ffStates[state_idx] - = sfff.EvaluateWhenApplied - (*this, m_prevHypo ? m_prevHypo->m_ffStates[state_idx] : NULL, + Manager& manager = this->GetManager(); //Get the manager and the ttask + ttasksptr const& ttask = manager.GetTtask(); + + m_ffStates[state_idx] = sfff.EvaluateWhenAppliedWithContext + (ttask, *this, m_prevHypo ? 
m_prevHypo->m_ffStates[state_idx] : NULL, &m_currScoreBreakdown); } } diff --git a/moses/LM/IRST.cpp b/moses/LM/IRST.cpp index 4e4d66571..caa3f6a16 100644 --- a/moses/LM/IRST.cpp +++ b/moses/LM/IRST.cpp @@ -278,7 +278,7 @@ void LanguageModelIRST::CalcScore(const Phrase &phrase, float &fullScore, float fullScore = ngramScore + before_boundary; } -FFState* LanguageModelIRST::EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const +FFState* LanguageModelIRST::EvaluateWhenAppliedWithContext(ttasksptr const& ttasks, const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const { if (!hypo.GetCurrTargetLength()) { std::auto_ptr ret(new IRSTLMState(ps)); diff --git a/moses/LM/IRST.h b/moses/LM/IRST.h index e10751663..72ff84efd 100644 --- a/moses/LM/IRST.h +++ b/moses/LM/IRST.h @@ -93,7 +93,7 @@ public: virtual LMResult GetValue(const std::vector &contextFactor, State* finalState = NULL) const; - virtual FFState *EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const; + virtual FFState *EvaluateWhenAppliedWithContext(ttasksptr const& ttasks, const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const; virtual void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const; From 69c1308f01b0a0b51fb2ee1c38c04b175f96f631 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Tue, 23 Jun 2015 15:02:15 +0100 Subject: [PATCH 064/286] Add context_weights parameter to command line --- moses/ExportInterface.cpp | 9 +++++++++ moses/Parameter.cpp | 1 + moses/TranslationTask.cpp | 16 ++++++++++++++++ moses/TranslationTask.h | 4 ++++ 4 files changed, 30 insertions(+) diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp index c444e98c9..54121609b 100644 --- a/moses/ExportInterface.cpp +++ b/moses/ExportInterface.cpp @@ -215,6 +215,9 @@ batch_run() std::string context_string; params.SetParameter(context_string,"context-string",string("")); + std::string context_weights; + params.SetParameter(context_weights,"context-weights",string("")); + // main loop over set of input sentences boost::shared_ptr source; @@ -228,6 +231,12 @@ batch_run() task->SetContextString(*source->GetContext()); else task->SetContextString(context_string); + //if (source->GetContextWeights().isEmpty()) + // task->SetContextWeights(*source->GetContextWeights()); + /*else //The context_weights will never be passed to the config file.*/ + if (context_weights != "") { + task->SetContextWeights(context_weights); + } // Allow for (sentence-)context-specific processing prior to // decoding. This can be used, for example, for context-sensitive // phrase lookup. 
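The value accepted by the new --context-weights option is a colon-separated list of comma-separated document/weight pairs: SetContextWeights (added to TranslationTask.cpp further down in this patch) splits the string on ':' and each entry on ',', so a string such as "docA,0.9:docB,0.1" maps docA to 0.9 and docB to 0.1. The document names docA and docB are invented examples; what follows is a minimal standalone sketch of the same parsing, using only the standard library rather than Moses' Tokenize helper, not the code the patch itself installs:

#include <cstdlib>
#include <map>
#include <sstream>
#include <string>

// Parse "name,weight:name,weight" into a name -> weight map,
// mirroring what TranslationTask::SetContextWeights does in this patch.
std::map<std::string, float> ParseContextWeights(const std::string &spec)
{
  std::map<std::string, float> weights;
  std::istringstream entries(spec);
  std::string entry;
  while (std::getline(entries, entry, ':')) {
    std::string::size_type comma = entry.find(',');
    if (comma == std::string::npos) continue;   // skip malformed entries
    weights[entry.substr(0, comma)] =
        std::atof(entry.substr(comma + 1).c_str());
  }
  return weights;
}

// ParseContextWeights("docA,0.9:docB,0.1") yields { "docA": 0.9, "docB": 0.1 }

The sketch above drops entries that lack a comma; the patch itself assumes well-formed input and indexes key_and_value[0] and key_and_value[1] directly.
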
diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index cf8737e3b..4eaf419c4 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -243,6 +243,7 @@ Parameter::Parameter() AddParam(misc_opts,"feature", "All the feature functions should be here"); AddParam(misc_opts,"context-string", "A (tokenized) string containing context words for context-sensitive translation."); + AddParam(misc_opts,"context-weights", "A key-value map for context-sensitive translation."); AddParam(misc_opts,"context-window", "Context window (in words) for context-sensitive translation: {+|-|+-}."); diff --git a/moses/TranslationTask.cpp b/moses/TranslationTask.cpp index 61cdfc162..dd9fcbc52 100644 --- a/moses/TranslationTask.cpp +++ b/moses/TranslationTask.cpp @@ -30,6 +30,12 @@ TranslationTask return m_context_string; } +std::map const& +TranslationTask::GetContextWeights() const +{ + return m_context_weights; +} + void TranslationTask ::SetContextString(std::string const& context) @@ -37,6 +43,16 @@ TranslationTask m_context_string = context; } +void +TranslationTask +::SetContextWeights(std::string const& context_weights) +{ + std::vector tokens = Tokenize(context_weights,":"); + for (std::vector::iterator it = tokens.begin(); it != tokens.end(); it++) { + std::vector key_and_value = Tokenize(*it, ","); + m_context_weights.insert(std::pair(key_and_value[0], atof(key_and_value[1].c_str()))); + } +} boost::shared_ptr diff --git a/moses/TranslationTask.h b/moses/TranslationTask.h index 2b75c47d5..bf1add124 100644 --- a/moses/TranslationTask.h +++ b/moses/TranslationTask.h @@ -67,6 +67,7 @@ protected: // task stays alive till it's done with it. std::string m_context_string; + std::map m_context_weights; public: boost::shared_ptr @@ -115,6 +116,9 @@ public: std::string const& GetContextString() const; void SetContextString(std::string const& context); + std::map const& GetContextWeights() const; + void SetContextWeights(std::string const& context_weights); + protected: boost::shared_ptr m_source; boost::shared_ptr m_ioWrapper; From e50926abf66e5119c576661926f115e6c07ff3aa Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Tue, 23 Jun 2015 16:58:58 +0100 Subject: [PATCH 065/286] Enable the Suffix array to get context_weights from command line --- moses/TranslationModel/UG/mm/ug_bitext.h | 16 ++++++++++++++++ moses/TranslationModel/UG/mm/ug_sampling_bias.cc | 11 +++++++++++ moses/TranslationModel/UG/mm/ug_sampling_bias.h | 5 +++++ moses/TranslationModel/UG/mmsapt.cpp | 13 +++++++++++++ 4 files changed, 45 insertions(+) diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 2d2afc3ca..fc433669c 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -192,6 +192,8 @@ namespace Moses { sptr SetupDocumentBias(string const& bserver, string const& text, ostream* log) const; + sptr + SetupDocumentBias(map context_weights, ostream* log) const; void mark_match(Token const* start, Token const* end, iter const& m, @@ -433,6 +435,20 @@ namespace Moses { return ret; } + template + sptr + Bitext:: + SetupDocumentBias + ( map context_weights, ostream* log ) const + { + sptr ret; + UTIL_THROW_IF2(m_sid2docid == NULL, + "Document bias requested but no document map loaded."); + ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid, + context_weights, log)); + return ret; + } + template void Bitext:: diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index 
da408dfb3..2944f49e8 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -49,6 +49,17 @@ namespace Moses // #endif } + DocumentBias + ::DocumentBias(std::vector const& sid2doc, + std::map const& docname2docid, + std::map const& context_weights, + std::ostream* log) + : m_sid2docid(sid2doc) + , m_bias(docname2docid.size(), 0) + { + init(context_weights, docname2docid); + } + void DocumentBias ::init_from_json diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index f540ddc76..55eec3854 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -45,6 +45,11 @@ namespace Moses std::string const& server_url, std::string const& text, std::ostream* log); + DocumentBias(std::vector const& sid2doc, + std::map const& docname2docid, + std::map const& context_weights, + std::ostream* log); + void init_from_json ( std::string const& json, diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index f05c0d59b..3b04fe30d 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -794,6 +794,19 @@ namespace Moses } if (!context->cache1) context->cache1.reset(new pstats::cache_t); if (!context->cache2) context->cache2.reset(new pstats::cache_t); + } else if (ttask->GetContextWeights().empty()) { + if (m_bias_log) + { + *m_bias_log << HERE << endl + << "BIAS FROM MAP LOOKUP" << endl; + context->bias_log = m_bias_log; + } + context->bias + = btfix.SetupDocumentBias(ttask->GetContextWeights(), m_bias_log); + context->bias->loglevel = m_bias_loglevel; + context->bias->log = m_bias_log; + if (!context->cache1) context->cache1.reset(new pstats::cache_t); + if (!context->cache2) context->cache2.reset(new pstats::cache_t); } boost::unique_lock mylock(m_lock); sptr localcache = scope->get(cache_key); From 5a0168a6fae2a4e7a104aa9e99e7fbf17fc02c7d Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Tue, 23 Jun 2015 17:27:01 +0100 Subject: [PATCH 066/286] forgot to negate a condition --- moses/TranslationModel/UG/mmsapt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 3b04fe30d..5eb5c6785 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -794,7 +794,7 @@ namespace Moses } if (!context->cache1) context->cache1.reset(new pstats::cache_t); if (!context->cache2) context->cache2.reset(new pstats::cache_t); - } else if (ttask->GetContextWeights().empty()) { + } else if (!ttask->GetContextWeights().empty()) { if (m_bias_log) { *m_bias_log << HERE << endl From 0d34023aad0dbf28c28bcc17876b4016b5b1b3ea Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 24 Jun 2015 14:56:37 +0400 Subject: [PATCH 067/286] prune generation table --- misc/Jamfile | 4 ++- misc/pruneGeneration.cpp | 55 ++++++++++++++++++++++++++++++++++++++++ misc/pruneGeneration.h | 45 ++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 misc/pruneGeneration.cpp create mode 100644 misc/pruneGeneration.h diff --git a/misc/Jamfile b/misc/Jamfile index bfea14d58..46a18e253 100644 --- a/misc/Jamfile +++ b/misc/Jamfile @@ -14,6 +14,8 @@ exe 1-1-Extraction : 1-1-Extraction.cpp ..//boost_filesystem ../moses//moses ; exe prunePhraseTable : prunePhraseTable.cpp ..//boost_filesystem ../moses//moses ..//boost_program_options ; 
+exe pruneGeneration : pruneGeneration.cpp ..//boost_filesystem ../moses//moses ..//boost_program_options ; + local with-cmph = [ option.get "with-cmph" ] ; if $(with-cmph) { exe processPhraseTableMin : processPhraseTableMin.cpp ..//boost_filesystem ../moses//moses ; @@ -46,6 +48,6 @@ $(TOP)//boost_iostreams $(TOP)//boost_program_options ; -alias programs : 1-1-Extraction TMining generateSequences processLexicalTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable ; +alias programs : 1-1-Extraction TMining generateSequences processLexicalTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable pruneGeneration ; #processPhraseTable queryPhraseTable diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp new file mode 100644 index 000000000..45873a4ac --- /dev/null +++ b/misc/pruneGeneration.cpp @@ -0,0 +1,55 @@ +#include +#include +#include +#include +#include "pruneGeneration.h" + +using namespace std; + +int main(int argc, char **argv) +{ + cerr << "Starting" << endl; + int limit = atoi(argv[1]); + + vector records; + string prevInWord; + string line; + while (getline(cin, line)) { + vector toks; + Tokenize(toks, line); + assert(toks.size() == 4); + + if (prevInWord != toks[0]) { + Output(limit, records); + records.clear(); + } + + // add new record + float prob = atof(toks[2].c_str()); + records.push_back(Rec(prob, line)); + + prevInWord = toks[0]; + } + + // last + Output(limit, records); + records.clear(); + + cerr << "Finished" << endl; +} + +void Output(int limit, vector &records) +{ + Prune(limit, records); + + for (size_t i = 0; i < limit && i < records.size(); ++i) { + const Rec &rec = records[i]; + cout << rec.line << endl; + } +} + +void Prune(int limit, std::vector &records) +{ + std::sort(records.rbegin(), records.rend()); + +} diff --git a/misc/pruneGeneration.h b/misc/pruneGeneration.h new file mode 100644 index 000000000..693c5f149 --- /dev/null +++ b/misc/pruneGeneration.h @@ -0,0 +1,45 @@ +#pragma once +#include +#include + +class Rec +{ +public: + float prob; + std::string line; + + Rec(float aprob, const std::string &aline) + :prob(aprob) + ,line(aline) + {} + + inline bool operator< (const Rec &compare) const { + return prob < compare.prob; + } +}; + +//////////////////////////////////////////////////////////// + +void Output(int limit, std::vector &records); +void Prune(int limit, std::vector &records); + +//////////////////////////////////////////////////////////// +inline void Tokenize(std::vector &output + , const std::string& str + , const std::string& delimiters = " \t") +{ + // Skip delimiters at beginning. + std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); + // Find first "non-delimiter". + std::string::size_type pos = str.find_first_of(delimiters, lastPos); + + while (std::string::npos != pos || std::string::npos != lastPos) { + // Found a token, add it to the vector. + output.push_back(str.substr(lastPos, pos - lastPos)); + // Skip delimiters. 
Note the "not_of" + lastPos = str.find_first_not_of(delimiters, pos); + // Find next "non-delimiter" + pos = str.find_first_of(delimiters, lastPos); + } +} + From bac5c2e55c1b2454328bf18207b6d9633d2b9adf Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 24 Jun 2015 16:24:12 +0400 Subject: [PATCH 068/286] compile error with gcc --- misc/pruneGeneration.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index 45873a4ac..275d599df 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include "pruneGeneration.h" using namespace std; From 9936c9f264f95c02e47a6e987bea0e2026b78727 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 22 Jun 2015 10:46:12 -0400 Subject: [PATCH 069/286] Didn't need header --- moses/IOWrapper.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/moses/IOWrapper.h b/moses/IOWrapper.h index f1bcefa92..e3057794f 100644 --- a/moses/IOWrapper.h +++ b/moses/IOWrapper.h @@ -61,8 +61,6 @@ POSSIBILITY OF SUCH DAMAGE. #include "moses/ChartKBestExtractor.h" #include "moses/Syntax/KBestExtractor.h" -#include "search/applied.hh" - #include namespace Moses From d928340cd4a0a07fb8058a3a586cba2d4633c416 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 24 Jun 2015 14:34:27 +0100 Subject: [PATCH 070/286] Added context handling to TranslationRequest for moses server. --- moses/server/TranslationRequest.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index cad3696d1..3848f81ba 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -1,4 +1,5 @@ #include "TranslationRequest.h" +#include "moses/ContextScope.h" #include namespace MosesServer @@ -30,6 +31,7 @@ create(xmlrpc_c::paramList const& paramList, boost::shared_ptr ret; ret.reset(new TranslationRequest(paramList,cond, mut)); ret->m_self = ret; + ret->m_scope.reset(new Moses::ContextScope); return ret; } @@ -270,7 +272,10 @@ parse_request(std::map const& params) if (si != params.end()) m_nbestSize = xmlrpc_c::value_int(si->second); - + si = params.find("context"); + if (si != params.end()) { + m_context_string = xmlrpc_c::value_string(si->second); + } // // biased sampling for suffix-array-based sampling phrase table? 
// if ((si = params.find("bias")) != params.end()) // { From 555f91eb7ec79cc69e1b18889fd17217d3425389 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 24 Jun 2015 18:31:05 +0400 Subject: [PATCH 071/286] codelite --- contrib/other-builds/all.workspace | 5 +- .../other-builds/moses-cmd/moses-cmd.project | 14 +-- contrib/other-builds/moses/moses.project | 4 +- .../pruneGeneration/pruneGeneration.project | 97 +++++++++++++++++++ misc/pruneGeneration.cpp | 7 +- misc/pruneGeneration.h | 1 - 6 files changed, 111 insertions(+), 17 deletions(-) create mode 100644 contrib/other-builds/pruneGeneration/pruneGeneration.project diff --git a/contrib/other-builds/all.workspace b/contrib/other-builds/all.workspace index 621bafdc2..5a7eaf114 100644 --- a/contrib/other-builds/all.workspace +++ b/contrib/other-builds/all.workspace @@ -9,7 +9,8 @@ - + + @@ -23,6 +24,7 @@ + @@ -36,6 +38,7 @@ + diff --git a/contrib/other-builds/moses-cmd/moses-cmd.project b/contrib/other-builds/moses-cmd/moses-cmd.project index ac567ffce..44a0d621f 100644 --- a/contrib/other-builds/moses-cmd/moses-cmd.project +++ b/contrib/other-builds/moses-cmd/moses-cmd.project @@ -26,13 +26,6 @@ - - - - - - - @@ -150,4 +143,11 @@ + + + + + + + diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project index 81072d667..0fbd942c6 100644 --- a/contrib/other-builds/moses/moses.project +++ b/contrib/other-builds/moses/moses.project @@ -793,8 +793,6 @@ - - @@ -897,4 +895,6 @@ + + diff --git a/contrib/other-builds/pruneGeneration/pruneGeneration.project b/contrib/other-builds/pruneGeneration/pruneGeneration.project new file mode 100644 index 000000000..7060d55ea --- /dev/null +++ b/contrib/other-builds/pruneGeneration/pruneGeneration.project @@ -0,0 +1,97 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + None + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + None + + + + + + + + + + + + + + + + + + + diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index 275d599df..19ae2184f 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -42,7 +42,7 @@ int main(int argc, char **argv) void Output(int limit, vector &records) { - Prune(limit, records); + std::sort(records.rbegin(), records.rend()); for (size_t i = 0; i < limit && i < records.size(); ++i) { const Rec &rec = records[i]; @@ -50,8 +50,3 @@ void Output(int limit, vector &records) } } -void Prune(int limit, std::vector &records) -{ - std::sort(records.rbegin(), records.rend()); - -} diff --git a/misc/pruneGeneration.h b/misc/pruneGeneration.h index 693c5f149..470e607d4 100644 --- a/misc/pruneGeneration.h +++ b/misc/pruneGeneration.h @@ -21,7 +21,6 @@ public: //////////////////////////////////////////////////////////// void Output(int limit, std::vector &records); -void Prune(int limit, std::vector &records); //////////////////////////////////////////////////////////// inline void Tokenize(std::vector &output From dce0f33270bd6e169850a9337141c5af39f3f765 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 24 Jun 2015 18:35:59 +0400 Subject: [PATCH 072/286] prune generation table in ems --- scripts/ems/experiment.meta | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 9edeec460..4177f967e 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -739,6 +739,14 @@ build-generation-custom ignore-unless: AND generation-factors generation-corpus default-name: 
model/generation-table final-model: yes +generation-prune + in: generation-table + out: generation-table-pruned + rerun-on-change: TRAINING:prune-generation + pass-unless: AND TRAINING:prune-generation + default-name: model/generation-table-pruned + final-model: yes + template: $TRAINING:prune-generation < IN > OUT build-sparse in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus out: sparse @@ -747,7 +755,7 @@ build-sparse default-name: model/sparse-features template: $moses-script-dir/ems/support/build-sparse-features.perl IN $input-extension $output-extension OUT "$sparse-features" create-config - in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm + in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table-pruned sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm out: config ignore-if: use-hiero thot rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini mmsapt no-glue-grammar dont-tune-glue-grammar use-syntax-input-weight-feature From 78b2810cfe52d0a7246c4c376e32e4f1bc321577 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 24 Jun 2015 18:09:22 +0100 Subject: [PATCH 073/286] Allow context server to use ports other than 80. --- .../TranslationModel/UG/mm/ug_http_client.cc | 34 +++++++++++++------ moses/TranslationModel/UG/mm/ug_http_client.h | 10 ++++-- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_http_client.cc b/moses/TranslationModel/UG/mm/ug_http_client.cc index 1d6d70edb..1bbb93b23 100644 --- a/moses/TranslationModel/UG/mm/ug_http_client.cc +++ b/moses/TranslationModel/UG/mm/ug_http_client.cc @@ -7,28 +7,40 @@ std::string http_client::content() const { return m_content.str(); } http_client:: http_client(boost::asio::io_service& io_service, - const std::string& server, const std::string& path) + std::string const& server, + std::string const& port, + std::string const& path) : resolver_(io_service), socket_(io_service) { - init(server,path); + init(server, port, path); } - + http_client:: http_client(boost::asio::io_service& io_service, std::string url) : resolver_(io_service), socket_(io_service) { - size_t p = url.find("://"); - if (p < url.size()) url.erase(0,p+3); - p = url.find("/"); + std::string server; + std::string path = "/"; + std::string port = "http"; + size_t p = url.find("://"), q; if (p < url.size()) - init(url.substr(0,p),url.substr(p)); - else - init(url,"/"); + { + port = url.substr(0,p); + url.erase(0, p+3); + } + p = std::min(url.find_first_of(":/"), url.size()); + q = std::min(url.find("/"), url.size()); + if (p < url.size() && url[p] == ':') + port = url.substr(p,q-p); + server = url.substr(0,p); + if (q < url.size()) + path = url.substr(q); + init(server, port, path); } void http_client:: -init(std::string const& server, std::string const& path) +init(std::string const& server, std::string const& port, std::string const& path) { // Form the request. 
We specify the "Connection: close" header so // that the server will close the socket after transmitting the @@ -43,7 +55,7 @@ init(std::string const& server, std::string const& path) // Start an asynchronous resolve to translate the server and service names // into a list of endpoints. - tcp::resolver::query query(server, "http"); + tcp::resolver::query query(server, port); resolver_.async_resolve(query, boost::bind(&http_client::handle_resolve, this, boost::asio::placeholders::error, diff --git a/moses/TranslationModel/UG/mm/ug_http_client.h b/moses/TranslationModel/UG/mm/ug_http_client.h index 53ee258f9..825c0c37e 100644 --- a/moses/TranslationModel/UG/mm/ug_http_client.h +++ b/moses/TranslationModel/UG/mm/ug_http_client.h @@ -35,9 +35,15 @@ class http_client public: http_client(boost::asio::io_service& io_service, std::string url); http_client(boost::asio::io_service& io_service, - const std::string& server, const std::string& path); + std::string const& server, + std::string const& port, + std::string const& path); private: - void init(std::string const& server, std::string const& path); + + void init(std::string const& server, + std::string const& port, + std::string const& path); + void handle_resolve(const boost::system::error_code& err, tcp::resolver::iterator endpoint_iterator); void handle_connect(const boost::system::error_code& err, From 4ec69fbfdff104218db16c9c1ba8c8c381c331c3 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Thu, 25 Jun 2015 00:00:42 +0100 Subject: [PATCH 074/286] daily automatic beautifier --- misc/pruneGeneration.cpp | 10 +++++----- misc/pruneGeneration.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index 19ae2184f..8207e287f 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -12,7 +12,7 @@ int main(int argc, char **argv) { cerr << "Starting" << endl; int limit = atoi(argv[1]); - + vector records; string prevInWord; string line; @@ -20,12 +20,12 @@ int main(int argc, char **argv) vector toks; Tokenize(toks, line); assert(toks.size() == 4); - + if (prevInWord != toks[0]) { Output(limit, records); records.clear(); } - + // add new record float prob = atof(toks[2].c_str()); records.push_back(Rec(prob, line)); @@ -37,13 +37,13 @@ int main(int argc, char **argv) Output(limit, records); records.clear(); - cerr << "Finished" << endl; + cerr << "Finished" << endl; } void Output(int limit, vector &records) { std::sort(records.rbegin(), records.rend()); - + for (size_t i = 0; i < limit && i < records.size(); ++i) { const Rec &rec = records[i]; cout << rec.line << endl; diff --git a/misc/pruneGeneration.h b/misc/pruneGeneration.h index 470e607d4..dae5958f8 100644 --- a/misc/pruneGeneration.h +++ b/misc/pruneGeneration.h @@ -7,12 +7,12 @@ class Rec public: float prob; std::string line; - + Rec(float aprob, const std::string &aline) - :prob(aprob) - ,line(aline) + :prob(aprob) + ,line(aline) {} - + inline bool operator< (const Rec &compare) const { return prob < compare.prob; } From c80df1212ede1c8db39fbd5fe21f11d8f2ea60f7 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 25 Jun 2015 10:48:35 +0400 Subject: [PATCH 075/286] prune multiple files at once. 
Make up for failure in ems to give the full path of the gen table --- .../other-builds/OnDiskPt/OnDiskPt.project | 31 ++++++++++---- .../extract-mixed-syntax.project | 40 ++++++++++++++----- contrib/other-builds/extract/extract.project | 31 ++++++++++---- contrib/other-builds/lm/lm.project | 31 ++++++++++---- .../other-builds/moses-cmd/moses-cmd.project | 32 +++++++-------- .../pruneGeneration/pruneGeneration.project | 9 +++-- contrib/other-builds/score/score.project | 30 +++++++------- contrib/other-builds/search/search.project | 14 +++---- contrib/other-builds/util/util.project | 12 +++--- misc/pruneGeneration.cpp | 18 ++++++--- misc/pruneGeneration.h | 4 +- 11 files changed, 165 insertions(+), 87 deletions(-) diff --git a/contrib/other-builds/OnDiskPt/OnDiskPt.project b/contrib/other-builds/OnDiskPt/OnDiskPt.project index 06f80d233..3a89ec832 100644 --- a/contrib/other-builds/OnDiskPt/OnDiskPt.project +++ b/contrib/other-builds/OnDiskPt/OnDiskPt.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -27,6 +44,8 @@ + + @@ -40,9 +59,9 @@ - - - + + + @@ -72,7 +91,7 @@ - + @@ -110,7 +129,7 @@ - + @@ -118,6 +137,4 @@ - - diff --git a/contrib/other-builds/extract-mixed-syntax/extract-mixed-syntax.project b/contrib/other-builds/extract-mixed-syntax/extract-mixed-syntax.project index 83c652f8c..87d76689a 100644 --- a/contrib/other-builds/extract-mixed-syntax/extract-mixed-syntax.project +++ b/contrib/other-builds/extract-mixed-syntax/extract-mixed-syntax.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -43,6 +60,10 @@ + + + + @@ -56,13 +77,14 @@ - - - + + + - - + + + @@ -94,7 +116,7 @@ - + @@ -133,7 +155,7 @@ - + @@ -141,8 +163,4 @@ - - - - diff --git a/contrib/other-builds/extract/extract.project b/contrib/other-builds/extract/extract.project index ac74607f2..d86e89035 100644 --- a/contrib/other-builds/extract/extract.project +++ b/contrib/other-builds/extract/extract.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -13,6 +30,8 @@ + + @@ -26,11 +45,11 @@ - - + + - + @@ -60,7 +79,7 @@ - + @@ -99,7 +118,7 @@ - + @@ -107,6 +126,4 @@ - - diff --git a/contrib/other-builds/lm/lm.project b/contrib/other-builds/lm/lm.project index a184fe3d1..c30ebe533 100644 --- a/contrib/other-builds/lm/lm.project +++ b/contrib/other-builds/lm/lm.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -27,6 +44,8 @@ + + @@ -40,9 +59,9 @@ - - - + + + @@ -72,7 +91,7 @@ - + @@ -110,7 +129,7 @@ - + @@ -118,6 +137,4 @@ - - diff --git a/contrib/other-builds/moses-cmd/moses-cmd.project b/contrib/other-builds/moses-cmd/moses-cmd.project index 44a0d621f..5303ba7c7 100644 --- a/contrib/other-builds/moses-cmd/moses-cmd.project +++ b/contrib/other-builds/moses-cmd/moses-cmd.project @@ -26,6 +26,13 @@ + + + + + + + @@ -39,20 +46,20 @@ - - - + + + - - - - - - + + + + + + @@ -143,11 +150,4 @@ - - - - - - - diff --git a/contrib/other-builds/pruneGeneration/pruneGeneration.project b/contrib/other-builds/pruneGeneration/pruneGeneration.project index 7060d55ea..39109197a 100644 --- a/contrib/other-builds/pruneGeneration/pruneGeneration.project +++ b/contrib/other-builds/pruneGeneration/pruneGeneration.project @@ -2,6 +2,10 @@ + + + + @@ -15,6 +19,7 @@ + @@ -90,8 +95,4 @@ - - - - diff --git a/contrib/other-builds/score/score.project b/contrib/other-builds/score/score.project index c88df0e78..08e0b9414 100644 --- a/contrib/other-builds/score/score.project +++ b/contrib/other-builds/score/score.project @@ -19,6 +19,10 @@ + + + + @@ -32,17 +36,17 @@ - - - + + + - - - - - - + + + + + + @@ -86,7 +90,7 @@ - + @@ -125,7 +129,7 @@ - + @@ -133,8 +137,4 @@ - - - - diff --git 
a/contrib/other-builds/search/search.project b/contrib/other-builds/search/search.project index d96252a89..8be29fd1d 100644 --- a/contrib/other-builds/search/search.project +++ b/contrib/other-builds/search/search.project @@ -10,6 +10,8 @@ + + @@ -23,9 +25,9 @@ - - - + + + @@ -55,7 +57,7 @@ - + @@ -93,7 +95,7 @@ - + @@ -101,6 +103,4 @@ - - diff --git a/contrib/other-builds/util/util.project b/contrib/other-builds/util/util.project index 1006ddb52..4bb27306e 100644 --- a/contrib/other-builds/util/util.project +++ b/contrib/other-builds/util/util.project @@ -62,6 +62,8 @@ + + @@ -75,8 +77,8 @@ - - + + @@ -105,7 +107,7 @@ - + @@ -143,7 +145,7 @@ - + @@ -151,6 +153,4 @@ - - diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index 8207e287f..98b21530c 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -1,10 +1,10 @@ #include #include -#include #include #include #include #include "pruneGeneration.h" +#include "moses/InputFileStream.h" using namespace std; @@ -13,16 +13,23 @@ int main(int argc, char **argv) cerr << "Starting" << endl; int limit = atoi(argv[1]); + Process(limit, cin, cout); + + cerr << "Finished" << endl; +} + +void Process(int limit, istream &inStrme, ostream &outStrme) +{ vector records; string prevInWord; string line; - while (getline(cin, line)) { + while (getline(inStrme, line)) { vector toks; Tokenize(toks, line); assert(toks.size() == 4); if (prevInWord != toks[0]) { - Output(limit, records); + Output(outStrme, records, limit); records.clear(); } @@ -34,13 +41,12 @@ int main(int argc, char **argv) } // last - Output(limit, records); + Output(outStrme, records, limit); records.clear(); - cerr << "Finished" << endl; } -void Output(int limit, vector &records) +void Output(ostream &outStrme, vector &records, int limit) { std::sort(records.rbegin(), records.rend()); diff --git a/misc/pruneGeneration.h b/misc/pruneGeneration.h index dae5958f8..b22d09869 100644 --- a/misc/pruneGeneration.h +++ b/misc/pruneGeneration.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include class Rec { @@ -20,7 +21,8 @@ public: //////////////////////////////////////////////////////////// -void Output(int limit, std::vector &records); +void Process(int limit, std::istream &inStrme, std::ostream &outStrme); +void Output(std::ostream &outStrme, std::vector &records, int limit); //////////////////////////////////////////////////////////// inline void Tokenize(std::vector &output From 930dce10bff821431213441fa1c07c1195d916b9 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 25 Jun 2015 13:02:29 +0400 Subject: [PATCH 076/286] prune multiple files at once. 
Make up for failure in ems to give the full path of the gen table --- contrib/other-builds/moses/moses.project | 2 + .../pruneGeneration/pruneGeneration.project | 27 +++++- misc/pruneGeneration.cpp | 44 ++++++++- moses/OutputFileStream.cpp | 90 +++++++++++++++++++ moses/OutputFileStream.h | 81 +++++++++++++++++ 5 files changed, 241 insertions(+), 3 deletions(-) create mode 100644 moses/OutputFileStream.cpp create mode 100644 moses/OutputFileStream.h diff --git a/contrib/other-builds/moses/moses.project b/contrib/other-builds/moses/moses.project index 0fbd942c6..0ceb40723 100644 --- a/contrib/other-builds/moses/moses.project +++ b/contrib/other-builds/moses/moses.project @@ -775,6 +775,8 @@ + + diff --git a/contrib/other-builds/pruneGeneration/pruneGeneration.project b/contrib/other-builds/pruneGeneration/pruneGeneration.project index 39109197a..6f8a6adf5 100644 --- a/contrib/other-builds/pruneGeneration/pruneGeneration.project +++ b/contrib/other-builds/pruneGeneration/pruneGeneration.project @@ -1,5 +1,22 @@ + + + + + + + + @@ -20,8 +37,16 @@ + - + + + + + + + + diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index 98b21530c..e436263e9 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -3,8 +3,10 @@ #include #include #include +#include #include "pruneGeneration.h" #include "moses/InputFileStream.h" +#include "moses/OutputFileStream.h" using namespace std; @@ -12,8 +14,46 @@ int main(int argc, char **argv) { cerr << "Starting" << endl; int limit = atoi(argv[1]); + string inPathStem = argv[2]; + string outPathStem = argv[3]; - Process(limit, cin, cout); + namespace fs = boost::filesystem; + + //cerr << "inPathStem=" << inPathStem << endl; + fs::path p(inPathStem); + fs::path dir = p.parent_path(); + //cerr << "dir=" << dir << endl; + + fs::path fileStem = p.filename(); + string fileStemStr = fileStem.native(); + size_t fileStemStrSize = fileStemStr.size(); + //cerr << "fileStem=" << fileStemStr << endl; + + // loop thru each file in directory + fs::directory_iterator end_iter; + for( fs::directory_iterator dir_iter(dir) ; dir_iter != end_iter ; ++dir_iter) { + if (fs::is_regular_file(dir_iter->status())) { + fs::path currPath = *dir_iter; + string currPathStr = currPath.native(); + //cerr << "currPathStr=" << currPathStr << endl; + + fs::path currFile = currPath.filename(); + string currFileStr = currFile.native(); + + if (currFileStr.find(fileStemStr) == 0) { + // found gen table we need + //cerr << "found=" << currPathStr << endl; + string suffix = currFileStr.substr(fileStemStrSize, currFileStr.size() - fileStemStrSize); + string outPath = outPathStem + suffix; + cerr << "PRUNING " << currPathStr << " TO " << outPath << endl; + + Moses::InputFileStream inStrme(currPathStr); + Moses::OutputFileStream outStrme(outPath); + Process(limit, inStrme, outStrme); + + } + } + } cerr << "Finished" << endl; } @@ -52,7 +92,7 @@ void Output(ostream &outStrme, vector &records, int limit) for (size_t i = 0; i < limit && i < records.size(); ++i) { const Rec &rec = records[i]; - cout << rec.line << endl; + outStrme << rec.line << endl; } } diff --git a/moses/OutputFileStream.cpp b/moses/OutputFileStream.cpp new file mode 100644 index 000000000..d7874b06f --- /dev/null +++ b/moses/OutputFileStream.cpp @@ -0,0 +1,90 @@ +// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is 
free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include +#include +#include +#include "OutputFileStream.h" +#include "gzfilebuf.h" + +using namespace std; +using namespace boost::algorithm; + +namespace Moses +{ +OutputFileStream::OutputFileStream() + :boost::iostreams::filtering_ostream() + ,m_outFile(NULL) + ,m_open(false) +{ +} + +OutputFileStream::OutputFileStream(const std::string &filePath) + :m_outFile(NULL) + ,m_open(false) +{ + Open(filePath); +} + +OutputFileStream::~OutputFileStream() +{ + Close(); +} + +bool OutputFileStream::Open(const std::string &filePath) +{ + assert(!m_open); + if (filePath == std::string("-")) { + // Write to standard output. Leave m_outFile null. + this->push(std::cout); + } else { + m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary); + if (m_outFile->fail()) { + return false; + } + + if (ends_with(filePath, ".gz")) { + this->push(boost::iostreams::gzip_compressor()); + } + this->push(*m_outFile); + } + + m_open = true; + return true; +} + +void OutputFileStream::Close() +{ + if (!m_open) return; + this->flush(); + if (m_outFile) { + this->pop(); // file + + m_outFile->close(); + delete m_outFile; + m_outFile = NULL; + } + m_open = false; +} + + +} + diff --git a/moses/OutputFileStream.h b/moses/OutputFileStream.h new file mode 100644 index 000000000..b77741a73 --- /dev/null +++ b/moses/OutputFileStream.h @@ -0,0 +1,81 @@ +// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $ + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +namespace Moses +{ + +/** Version of std::ostream with transparent compression. + * + * Transparently compresses output when writing to a file whose name ends in + * ".gz". Or, writes to stdout instead of a file when given a filename + * consisting of just a dash ("-"). 
+ */ +class OutputFileStream : public boost::iostreams::filtering_ostream +{ +private: + /** File that needs flushing & closing when we close this stream. + * + * Is NULL when no file is opened, e.g. when writing to standard output. + */ + std::ofstream *m_outFile; + + /// Is this stream open? + bool m_open; + +public: + /** Create an unopened OutputFileStream. + * + * Until it's been opened, nothing can be done with this stream. + */ + OutputFileStream(); + + /// Create an OutputFileStream, and open it by calling Open(). + OutputFileStream(const std::string &filePath); + virtual ~OutputFileStream(); + + // TODO: Can we please just always throw an exception when this fails? + /** Open stream. + * + * If filePath is "-" (just a dash), this opens the stream for writing to + * standard output. Otherwise, it opens the given file. If the filename + * has the ".gz" suffix, output will be transparently compressed. + * + * Call Close() to close the file. + * + * Returns whether opening the file was successful. It may also throw an + * exception on failure. + */ + bool Open(const std::string &filePath); + + /// Flush and close stream. After this, the stream can be opened again. + void Close(); +}; + +} + From 47a488767e4944e5768bda70fe3271528e753ef5 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Thu, 25 Jun 2015 13:12:33 +0100 Subject: [PATCH 077/286] Enable the bias weights to be (re)set by the server. --- moses/LM/IRST.cpp | 4 ++++ moses/TranslationModel/UG/mm/ug_sampling_bias.cc | 5 +++++ moses/TranslationModel/UG/mm/ug_sampling_bias.h | 2 ++ moses/TranslationModel/UG/mmsapt.cpp | 3 +++ moses/TranslationTask.cpp | 7 +++++++ moses/TranslationTask.h | 2 ++ 6 files changed, 23 insertions(+) diff --git a/moses/LM/IRST.cpp b/moses/LM/IRST.cpp index caa3f6a16..f70530231 100644 --- a/moses/LM/IRST.cpp +++ b/moses/LM/IRST.cpp @@ -36,6 +36,7 @@ using namespace irstlm; #include "moses/Phrase.h" #include "moses/InputFileStream.h" #include "moses/StaticData.h" +#include "moses/TranslationTask.h" using namespace std; @@ -285,6 +286,9 @@ FFState* LanguageModelIRST::EvaluateWhenAppliedWithContext(ttasksptr const& ttas return ret.release(); } + //get the context_weight map here + std::map context_weight = ttasks->GetContextWeights(); + //[begin, end) in STL-like fashion. 
const int begin = (const int) hypo.GetCurrTargetWordsRange().GetStartPos(); const int end = (const int) hypo.GetCurrTargetWordsRange().GetEndPos() + 1; diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index 2944f49e8..7ac540045 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -60,6 +60,10 @@ namespace Moses init(context_weights, docname2docid); } + std::map& SamplingBias::getBiasMap() { + return m_bias_map; + } + void DocumentBias ::init_from_json @@ -96,6 +100,7 @@ namespace Moses << x.first << " " << x.second << std::endl; } } + m_bias_map = bias; init(bias, docname2docid); // using xmlrpc_parse_json didn't always work (parser errors) diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index 55eec3854..172bb60db 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -20,6 +20,8 @@ namespace Moses public: int loglevel; std::ostream* log; + std::map m_bias_map; //Map to store the biasmap as you get it from the server + std::map& getBiasMap(); virtual float operator[](id_type const ID) const = 0; // returns (unnormalized bias) for the class of item ID diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 5eb5c6785..f6f9fff50 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -791,6 +791,9 @@ namespace Moses = btfix.SetupDocumentBias(m_bias_server, context_words, m_bias_log); context->bias->loglevel = m_bias_loglevel; context->bias->log = m_bias_log; + //Reset the bias in the ttaskptr so that other functions + //so that other functions can utilize the biases; + ttask->ReSetContextWeights(context->bias->getBiasMap()); } if (!context->cache1) context->cache1.reset(new pstats::cache_t); if (!context->cache2) context->cache2.reset(new pstats::cache_t); diff --git a/moses/TranslationTask.cpp b/moses/TranslationTask.cpp index dd9fcbc52..0c6e6c69e 100644 --- a/moses/TranslationTask.cpp +++ b/moses/TranslationTask.cpp @@ -36,6 +36,13 @@ TranslationTask::GetContextWeights() const return m_context_weights; } +void +TranslationTask +::ReSetContextWeights(std::map const& new_weights) +{ + m_context_weights = new_weights; +} + void TranslationTask ::SetContextString(std::string const& context) diff --git a/moses/TranslationTask.h b/moses/TranslationTask.h index bf1add124..f37bad535 100644 --- a/moses/TranslationTask.h +++ b/moses/TranslationTask.h @@ -118,6 +118,8 @@ public: std::map const& GetContextWeights() const; void SetContextWeights(std::string const& context_weights); + void ReSetContextWeights(std::map const& new_weights); + protected: boost::shared_ptr m_source; From b83803203e94535aa4405df244ccbd32ab80ed34 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 25 Jun 2015 18:10:31 +0400 Subject: [PATCH 078/286] prune generation table in ems --- scripts/ems/experiment.meta | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 4177f967e..110ab39b7 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -746,7 +746,7 @@ generation-prune pass-unless: AND TRAINING:prune-generation default-name: model/generation-table-pruned final-model: yes - template: $TRAINING:prune-generation < IN > OUT + template: $TRAINING:prune-generation IN OUT build-sparse in: 
corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus out: sparse From 22cc22064c3cfcd6a762ebf8e597a3ed13642814 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 25 Jun 2015 15:17:26 +0100 Subject: [PATCH 079/286] Changed implementation of indocs (to keep track of which documents phrases come from) from vector to map. --- .../UG/mm/ug_bitext_agenda_job.h | 17 +++++++++++++---- .../TranslationModel/UG/mm/ug_bitext_jstats.cc | 2 +- moses/TranslationModel/UG/mm/ug_bitext_jstats.h | 3 ++- .../TranslationModel/UG/mm/ug_bitext_pstats.cc | 2 +- moses/TranslationModel/UG/mm/ug_bitext_pstats.h | 4 ++-- moses/TranslationModel/UG/mm/ug_phrasepair.h | 11 +++++++---- 6 files changed, 26 insertions(+), 13 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h index 0e0624351..36b9873e0 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h @@ -137,7 +137,10 @@ int Bitext::agenda::job float p = (*m_bias)[sid]; id_type docid = m_bias->GetClass(sid); - uint32_t k = docid < stats->indoc.size() ? stats->indoc[docid] : 0; + + // uint32_t k = docid < stats->indoc.size() ? stats->indoc[docid] : 0; + std::map::const_iterator m = stats->indoc.find(docid); + uint32_t k = m != stats->indoc.end() ? m->second : 0 ; // always consider candidates from dominating documents and // from documents that have not been considered at all yet @@ -159,11 +162,17 @@ int Bitext::agenda::job e = root->getCorpus()->sntEnd(sid); *log << docid << ":" << sid << " " << size_t(k) << "/" << N << " @" << p << " => " << d << " ["; - for (size_t i = 0; i < stats->indoc.size(); ++i) + for (std::map::const_iterator m = stats->indoc.begin(); + m != stats->indoc.end(); ++m) { - if (i) *log << " "; - *log << stats->indoc[i]; + if (m != stats->indoc.begin()) *log << " "; + *log << m->first << ":" << m->second; } + // for (size_t i = 0; i < stats->indoc.size(); ++i) + // { + // if (i) *log << " "; + // *log << stats->indoc[i]; + // } *log << "] "; for (; x < e; ++x) *log << (*m_bitext->V1)[x->id()] << " "; if (!ret) *log << "SKIP"; diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc index bcda9ebf3..517caf783 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc @@ -76,7 +76,7 @@ namespace Moses ++obwd[bwd_orient]; if (docid >= 0) { - while (int(indoc.size()) <= docid) indoc.push_back(0); + // while (int(indoc.size()) <= docid) indoc.push_back(0); ++indoc[docid]; } } diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h index dade27649..03b231487 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h @@ -27,7 +27,8 @@ namespace Moses uint32_t obwd[Moses::LRModel::NONE+1]; // backward distortion type counts public: - vector indoc; // counts origin of samples (for biased sampling) + std::map indoc; + // vector indoc; // counts origin of samples (for biased sampling) jstats(); jstats(jstats const& other); uint32_t rcnt() const; // raw joint counts diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc index 580d7669b..8702d9c50 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc @@ -58,7 +58,7 @@ namespace Moses ++obwd[po_bwd]; if (docid >= 0) 
{ - while (int(indoc.size()) <= docid) indoc.push_back(0); + // while (int(indoc.size()) <= docid) indoc.push_back(0); ++indoc[docid]; } } diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h index 9a14e378b..e5cf4ab26 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h @@ -33,8 +33,8 @@ namespace Moses uint32_t ofwd[Moses::LRModel::NONE+1]; // distribution of fwd phrase orientations uint32_t obwd[Moses::LRModel::NONE+1]; // distribution of bwd phrase orientations - std::vector indoc; // distribution over where samples came from - + // std::vector indoc; // distribution over where samples came from + std::map indoc; typedef std::map trg_map_t; trg_map_t trg; pstats(); diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h index 7e565c2df..7f03d89df 100644 --- a/moses/TranslationModel/UG/mm/ug_phrasepair.h +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h @@ -30,7 +30,8 @@ namespace Moses std::vector aln; float score; bool inverse; - std::vector indoc; + // std::vector indoc; + std::map indoc; PhrasePair() { }; PhrasePair(PhrasePair const& o); @@ -306,10 +307,12 @@ namespace Moses out << toString (V1, this->start1, this->len1) << " ::: " << toString (V2, this->start2, this->len2) << " " << this->joint << " ["; - for (size_t i = 0; i < this->indoc.size(); ++i) + // for (size_t i = 0; i < this->indoc.size(); ++i) + for (std::map::const_iterator m = indoc.begin(); + m != indoc.end(); ++m) { - if (i) out << " "; - out << this->indoc[i]; + if (m != indoc.begin()) out << " "; + out << m->first << ":" << m->second; } out << "] ["; vector lrscores; From afdc1b480ea81f424b35ac9e96c57815f847203a Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Thu, 25 Jun 2015 15:47:17 +0100 Subject: [PATCH 080/286] Break everything by trying to add ttasksptr to TargetPhrase --- moses/ChartParser.cpp | 4 ++-- moses/Syntax/F2S/GlueRuleSynthesizer.cpp | 4 +++- moses/Syntax/F2S/HyperTreeLoader.cpp | 4 +++- moses/Syntax/S2T/OovHandler-inl.h | 4 +++- moses/Syntax/S2T/RuleTrieLoader.cpp | 4 +++- moses/Syntax/T2S/GlueRuleSynthesizer.cpp | 4 +++- moses/Syntax/T2S/RuleTrieLoader.cpp | 4 +++- moses/TargetPhrase.cpp | 10 +++++++--- moses/TargetPhrase.h | 13 ++++++++++--- .../TranslationModel/BilingualDynSuffixArray.cpp | 4 +++- .../ChartRuleLookupManagerSkeleton.cpp | 4 +++- .../CompactPT/PhraseDictionaryCompact.cpp | 3 ++- .../PhraseDictionaryTransliteration.cpp | 4 +++- moses/TranslationModel/ProbingPT/ProbingPT.cpp | 4 +++- .../TranslationModel/RuleTable/LoaderCompact.cpp | 4 +++- .../TranslationModel/RuleTable/LoaderStandard.cpp | 4 +++- .../RuleTable/PhraseDictionaryFuzzyMatch.cpp | 2 +- moses/TranslationModel/SkeletonPT.cpp | 4 +++- moses/TranslationModel/UG/mmsapt.cpp | 15 ++++++++------- moses/TranslationModel/UG/mmsapt.h | 3 ++- 20 files changed, 71 insertions(+), 31 deletions(-) diff --git a/moses/ChartParser.cpp b/moses/ChartParser.cpp index 66e22a055..9e4f9ffaa 100644 --- a/moses/ChartParser.cpp +++ b/moses/ChartParser.cpp @@ -95,7 +95,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS"); // add to dictionary - TargetPhrase *targetPhrase = new TargetPhrase(firstPt); + TargetPhrase *targetPhrase = new TargetPhrase(m_ttask, firstPt); Word &targetWord = targetPhrase->AddWord(); targetWord.CreateUnknownWord(sourceWord); @@ -117,7 +117,7 @@ 
void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range // drop source word. create blank trans opt float unknownScore = FloorScore(-numeric_limits::infinity()); - TargetPhrase *targetPhrase = new TargetPhrase(firstPt); + TargetPhrase *targetPhrase = new TargetPhrase(m_ttask, firstPt); // loop const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS; diff --git a/moses/Syntax/F2S/GlueRuleSynthesizer.cpp b/moses/Syntax/F2S/GlueRuleSynthesizer.cpp index 09423f5d3..377e9a3c3 100644 --- a/moses/Syntax/F2S/GlueRuleSynthesizer.cpp +++ b/moses/Syntax/F2S/GlueRuleSynthesizer.cpp @@ -4,6 +4,7 @@ #include "moses/FF/UnknownWordPenaltyProducer.h" #include "moses/StaticData.h" +#include "moses/TranslationTask.h" namespace Moses { @@ -53,7 +54,8 @@ TargetPhrase *GlueRuleSynthesizer::SynthesizeTargetPhrase( const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance(); - TargetPhrase *targetPhrase = new TargetPhrase(); + const ttasksptr ttask = NULL; + TargetPhrase *targetPhrase = new TargetPhrase(ttask); std::ostringstream alignmentSS; for (std::size_t i = 0; i < e.tail.size(); ++i) { diff --git a/moses/Syntax/F2S/HyperTreeLoader.cpp b/moses/Syntax/F2S/HyperTreeLoader.cpp index 21d5b0447..b81ae1a19 100644 --- a/moses/Syntax/F2S/HyperTreeLoader.cpp +++ b/moses/Syntax/F2S/HyperTreeLoader.cpp @@ -19,6 +19,7 @@ #include "moses/ChartTranslationOptionList.h" #include "moses/FactorCollection.h" #include "moses/Syntax/RuleTableFF.h" +#include "moses/TranslationTask.h" #include "util/file_piece.hh" #include "util/string_piece.hh" #include "util/tokenize_piece.hh" @@ -109,7 +110,8 @@ bool HyperTreeLoader::Load(const std::vector &input, ExtractSourceTerminalSetFromHyperPath(sourceFragment, sourceTermSet); // Target-side - TargetPhrase *targetPhrase = new TargetPhrase(&ff); + const ttasksptr ttasks = NULL; + TargetPhrase *targetPhrase = new TargetPhrase(ttasks, &ff); Word *targetLHS = NULL; targetPhrase->CreateFromString(Output, output, targetString, &targetLHS); targetPhrase->SetTargetLHS(targetLHS); diff --git a/moses/Syntax/S2T/OovHandler-inl.h b/moses/Syntax/S2T/OovHandler-inl.h index 76eed861e..255691fd2 100644 --- a/moses/Syntax/S2T/OovHandler-inl.h +++ b/moses/Syntax/S2T/OovHandler-inl.h @@ -2,6 +2,7 @@ #include "moses/FF/UnknownWordPenaltyProducer.h" #include "moses/StaticData.h" +#include "moses/TranslationTask.h" namespace Moses { @@ -70,7 +71,8 @@ TargetPhrase *OovHandler::SynthesizeTargetPhrase( const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance(); - TargetPhrase *targetPhrase = new TargetPhrase(); + const ttasksptr ttask = NULL; + TargetPhrase *targetPhrase = new TargetPhrase(ttask); Word &targetWord = targetPhrase->AddWord(); targetWord.CreateUnknownWord(oov); diff --git a/moses/Syntax/S2T/RuleTrieLoader.cpp b/moses/Syntax/S2T/RuleTrieLoader.cpp index a88c0f5fe..e43b20790 100644 --- a/moses/Syntax/S2T/RuleTrieLoader.cpp +++ b/moses/Syntax/S2T/RuleTrieLoader.cpp @@ -19,6 +19,7 @@ #include "moses/ChartTranslationOptionList.h" #include "moses/FactorCollection.h" #include "moses/Syntax/RuleTableFF.h" +#include "moses/TranslationTask.h" #include "util/file_piece.hh" #include "util/string_piece.hh" #include "util/tokenize_piece.hh" @@ -100,7 +101,8 @@ bool RuleTrieLoader::Load(const std::vector &input, Word *targetLHS; // create target phrase obj - TargetPhrase *targetPhrase = new TargetPhrase(&ff); + const ttasksptr ttask = NULL; + TargetPhrase 
*targetPhrase = new TargetPhrase(ttask, &ff); targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS); // source Phrase sourcePhrase; diff --git a/moses/Syntax/T2S/GlueRuleSynthesizer.cpp b/moses/Syntax/T2S/GlueRuleSynthesizer.cpp index 9c6dd91ab..70abf97da 100644 --- a/moses/Syntax/T2S/GlueRuleSynthesizer.cpp +++ b/moses/Syntax/T2S/GlueRuleSynthesizer.cpp @@ -4,6 +4,7 @@ #include "moses/FF/UnknownWordPenaltyProducer.h" #include "moses/StaticData.h" +#include "moses/TranslationTask.h" namespace Moses { @@ -45,7 +46,8 @@ TargetPhrase *GlueRuleSynthesizer::SynthesizeTargetPhrase( const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance(); - TargetPhrase *targetPhrase = new TargetPhrase(); + const ttasksptr ttask = NULL; + TargetPhrase *targetPhrase = new TargetPhrase(ttask); std::ostringstream alignmentSS; for (std::size_t i = 0; i < node.children.size(); ++i) { diff --git a/moses/Syntax/T2S/RuleTrieLoader.cpp b/moses/Syntax/T2S/RuleTrieLoader.cpp index 81924f05d..70b73dc0e 100644 --- a/moses/Syntax/T2S/RuleTrieLoader.cpp +++ b/moses/Syntax/T2S/RuleTrieLoader.cpp @@ -19,6 +19,7 @@ #include "moses/ChartTranslationOptionList.h" #include "moses/FactorCollection.h" #include "moses/Syntax/RuleTableFF.h" +#include "moses/TranslationTask.h" #include "util/file_piece.hh" #include "util/string_piece.hh" #include "util/tokenize_piece.hh" @@ -105,7 +106,8 @@ bool RuleTrieLoader::Load(const std::vector &input, Word *targetLHS; // create target phrase obj - TargetPhrase *targetPhrase = new TargetPhrase(&ff); + const ttasksptr ttasks = NULL; + TargetPhrase *targetPhrase = new TargetPhrase(ttasks, &ff); // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS); targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS); // source diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index 4976375e9..3426a3093 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -39,7 +39,7 @@ using namespace std; namespace Moses { -TargetPhrase::TargetPhrase( std::string out_string, const PhraseDictionary *pt) +TargetPhrase::TargetPhrase(const ttasksptr ttasks, std::string out_string, const PhraseDictionary *pt) :Phrase(0) , m_fullScore(0.0) , m_futureScore(0.0) @@ -48,6 +48,7 @@ TargetPhrase::TargetPhrase( std::string out_string, const PhraseDictionary *pt) , m_lhsTarget(NULL) , m_ruleSource(NULL) , m_container(pt) + , m_ttasks(ttasks) { //ACAT @@ -58,7 +59,7 @@ TargetPhrase::TargetPhrase( std::string out_string, const PhraseDictionary *pt) NULL); } -TargetPhrase::TargetPhrase(const PhraseDictionary *pt) +TargetPhrase::TargetPhrase(const ttasksptr ttasks, const PhraseDictionary *pt) :Phrase() , m_fullScore(0.0) , m_futureScore(0.0) @@ -67,10 +68,11 @@ TargetPhrase::TargetPhrase(const PhraseDictionary *pt) , m_lhsTarget(NULL) , m_ruleSource(NULL) , m_container(pt) + , m_ttasks(ttasks) { } -TargetPhrase::TargetPhrase(const Phrase &phrase, const PhraseDictionary *pt) +TargetPhrase::TargetPhrase(const ttasksptr ttasks, const Phrase &phrase, const PhraseDictionary *pt) : Phrase(phrase) , m_fullScore(0.0) , m_futureScore(0.0) @@ -79,6 +81,7 @@ TargetPhrase::TargetPhrase(const Phrase &phrase, const PhraseDictionary *pt) , m_lhsTarget(NULL) , m_ruleSource(NULL) , m_container(pt) + , m_ttasks(ttasks) { } @@ -92,6 +95,7 @@ TargetPhrase::TargetPhrase(const TargetPhrase ©) , m_alignNonTerm(copy.m_alignNonTerm) , m_properties(copy.m_properties) , m_container(copy.m_container) + , 
m_ttasks(copy.m_ttasks) { if (copy.m_lhsTarget) { m_lhsTarget = new Word(*copy.m_lhsTarget); diff --git a/moses/TargetPhrase.h b/moses/TargetPhrase.h index 35b06c1c7..d60deaf42 100644 --- a/moses/TargetPhrase.h +++ b/moses/TargetPhrase.h @@ -30,6 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "ScoreComponentCollection.h" #include "AlignmentInfo.h" #include "AlignmentInfoCollection.h" +#include "moses/TranslationTask.h" #include "moses/PP/PhraseProperty.h" #include "util/string_piece.hh" @@ -79,14 +80,20 @@ private: const PhraseDictionary *m_container; mutable boost::unordered_map > m_data; + const ttasksptr m_ttasks; public: - TargetPhrase(const PhraseDictionary *pt = NULL); - TargetPhrase(std::string out_string, const PhraseDictionary *pt = NULL); + TargetPhrase(const ttasksptr ttasks, const PhraseDictionary *pt = NULL); + TargetPhrase(const ttasksptr ttasks, std::string out_string, const PhraseDictionary *pt = NULL); TargetPhrase(const TargetPhrase ©); - explicit TargetPhrase(const Phrase &targetPhrase, const PhraseDictionary *pt); + explicit TargetPhrase(const ttasksptr ttasks, const Phrase &targetPhrase, const PhraseDictionary *pt); ~TargetPhrase(); + //GetTtasks PTr + const ttasksptr getTtasksPtr(){ + return m_ttasks; + } + // 1st evaluate method. Called during loading of phrase table. void EvaluateInIsolation(const Phrase &source, const std::vector &ffs); diff --git a/moses/TranslationModel/BilingualDynSuffixArray.cpp b/moses/TranslationModel/BilingualDynSuffixArray.cpp index b0607b770..223e4e9ed 100644 --- a/moses/TranslationModel/BilingualDynSuffixArray.cpp +++ b/moses/TranslationModel/BilingualDynSuffixArray.cpp @@ -3,6 +3,7 @@ #include "moses/FactorCollection.h" #include "moses/StaticData.h" #include "moses/TargetPhrase.h" +#include "moses/TranslationTask.h" #include "moses/TranslationModel/UG/generic/sorting/NBestList.h" #include "moses/TranslationModel/UG/generic/sampling/Sampling.h" @@ -376,7 +377,8 @@ TargetPhrase* BilingualDynSuffixArray:: GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase, const PhraseDictionary *pt) const { - TargetPhrase* targetPhrase = new TargetPhrase(pt); + const ttasksptr ttask = NULL; + TargetPhrase* targetPhrase = new TargetPhrase(ttask, pt); for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words Word& word = m_trgVocab->GetWord( phrase.words[i]); UTIL_THROW_IF2(word == m_trgVocab->GetkOOVWord(), diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.cpp index 53011e5ac..5fca94fcf 100644 --- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.cpp +++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.cpp @@ -30,6 +30,7 @@ #include "moses/ChartCellCollection.h" #include "moses/TranslationModel/PhraseDictionaryMemory.h" #include "moses/TranslationModel/SkeletonPT.h" +#include "moses/TranslationTask.h" using namespace std; @@ -79,7 +80,8 @@ TargetPhrase *ChartRuleLookupManagerSkeleton::CreateTargetPhrase(const Word &sou string str = sourceWord.GetFactor(0)->GetString().as_string(); str = "ChartManagerSkeleton:" + str; - TargetPhrase *tp = new TargetPhrase(&m_skeletonPT); + const ttasksptr ttask = NULL; + TargetPhrase *tp = new TargetPhrase(ttask, &m_skeletonPT); Word &word = tp->AddWord(); word.CreateFromString(Output, m_skeletonPT.GetOutput(), str, false); diff --git a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp 
b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp index afed99057..1d4d08605 100644 --- a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp +++ b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp @@ -126,8 +126,9 @@ PhraseDictionaryCompact::GetTargetPhraseCollectionNonCacheLEGACY(const Phrase &s (m_tableLimit == 0 || tpv->size() < m_tableLimit) ? tpv->end() : tpv->begin() + m_tableLimit; NTH_ELEMENT4(tpv->begin(), nth, tpv->end(), CompareTargetPhrase()); + const ttasksptr ttask = NULL; for(TargetPhraseVector::iterator it = tpv->begin(); it != nth; it++) { - TargetPhrase *tp = new TargetPhrase(*it); + TargetPhrase *tp = new TargetPhrase(ttask, *it); phraseColl->Add(tp); } diff --git a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp index 03b69d0ad..947ea8e72 100644 --- a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp +++ b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp @@ -5,6 +5,7 @@ #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h" #include "moses/DecodeGraph.h" #include "moses/DecodeStep.h" +#include "moses/TranslationTask.h" #include "util/tempfile.hh" using namespace std; @@ -117,7 +118,8 @@ std::vector PhraseDictionaryTransliteration::CreateTargetPhrases( Tokenize(toks, line, "\t"); UTIL_THROW_IF2(toks.size() != 2, "Error in transliteration output file. Expecting word\tscore"); - TargetPhrase *tp = new TargetPhrase(this); + const ttasksptr ttask = NULL; + TargetPhrase *tp = new TargetPhrase(ttask, this); Word &word = tp->AddWord(); word.CreateFromString(Output, m_output, toks[0], false); diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp index 19b7e8795..b7a9725e3 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp @@ -1,6 +1,7 @@ // vim:tabstop=2 #include "ProbingPT.h" #include "moses/StaticData.h" +#include "moses/TranslationTask.h" #include "moses/FactorCollection.h" #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h" #include "quering.hh" @@ -152,7 +153,8 @@ TargetPhrase *ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase, const ta const std::vector &probingPhrase = probingTargetPhrase.target_phrase; size_t size = probingPhrase.size(); - TargetPhrase *tp = new TargetPhrase(this); + const ttasksptr ttask = NULL; + TargetPhrase *tp = new TargetPhrase(ttask, this); // words for (size_t i = 0; i < size; ++i) { diff --git a/moses/TranslationModel/RuleTable/LoaderCompact.cpp b/moses/TranslationModel/RuleTable/LoaderCompact.cpp index c947dfdc2..824d8bc0b 100644 --- a/moses/TranslationModel/RuleTable/LoaderCompact.cpp +++ b/moses/TranslationModel/RuleTable/LoaderCompact.cpp @@ -21,6 +21,7 @@ #include "moses/AlignmentInfoCollection.h" #include "moses/InputFileStream.h" +#include "moses/TranslationTask.h" #include "moses/Util.h" #include "moses/Word.h" #include "Trie.h" @@ -217,7 +218,8 @@ bool RuleTableLoaderCompact::LoadRuleSection( // The remaining columns are currently ignored. // Create and score target phrase. 
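// Note on the change just below: this loader populates the rule table independently of any
// particular translation request, so there is no TranslationTask in scope and the extra
// constructor argument can only be a null ttasksptr here; the same null-task placeholder
// appears at the other call sites touched by this patch that have no task available.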
- TargetPhrase *targetPhrase = new TargetPhrase(targetPhrasePhrase, &ruleTable); + const ttasksptr ttask = NULL; + TargetPhrase *targetPhrase = new TargetPhrase(ttask, targetPhrasePhrase, &ruleTable); targetPhrase->SetAlignNonTerm(alignNonTerm); targetPhrase->SetTargetLHS(targetLhs); diff --git a/moses/TranslationModel/RuleTable/LoaderStandard.cpp b/moses/TranslationModel/RuleTable/LoaderStandard.cpp index f9e6ac6fd..9febc4426 100644 --- a/moses/TranslationModel/RuleTable/LoaderStandard.cpp +++ b/moses/TranslationModel/RuleTable/LoaderStandard.cpp @@ -36,6 +36,7 @@ #include "moses/WordsRange.h" #include "moses/ChartTranslationOptionList.h" #include "moses/FactorCollection.h" +#include "moses/TranslationTask.h" #include "util/file_piece.hh" #include "util/string_piece.hh" #include "util/tokenize_piece.hh" @@ -217,7 +218,8 @@ bool RuleTableLoaderStandard::Load(FormatType format Word *targetLHS; // create target phrase obj - TargetPhrase *targetPhrase = new TargetPhrase(&ruleTable); + const ttasksptr ttask = NULL; + TargetPhrase *targetPhrase = new TargetPhrase(ttask, &ruleTable); targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS); // source Phrase sourcePhrase; diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp index 5f7ddf85d..450b4d65a 100644 --- a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp +++ b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp @@ -267,7 +267,7 @@ void PhraseDictionaryFuzzyMatch::InitializeForInput(ttasksptr const& ttask) sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, &sourceLHS); // create target phrase obj - TargetPhrase *targetPhrase = new TargetPhrase(this); + TargetPhrase *targetPhrase = new TargetPhrase(ttask, this); targetPhrase->CreateFromString(Output, m_output, targetPhraseString, &targetLHS); // rest of target phrase diff --git a/moses/TranslationModel/SkeletonPT.cpp b/moses/TranslationModel/SkeletonPT.cpp index 8e2b1daa3..d3683e761 100644 --- a/moses/TranslationModel/SkeletonPT.cpp +++ b/moses/TranslationModel/SkeletonPT.cpp @@ -1,5 +1,6 @@ // vim:tabstop=2 #include "SkeletonPT.h" +#include "TranslationTask.h" #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h" using namespace std; @@ -53,7 +54,8 @@ TargetPhrase *SkeletonPT::CreateTargetPhrase(const Phrase &sourcePhrase) const string str = sourcePhrase.GetWord(0).GetFactor(0)->GetString().as_string(); str = "SkeletonPT:" + str; - TargetPhrase *tp = new TargetPhrase(this); + const ttasksptr ttask = NULL; + TargetPhrase *tp = new TargetPhrase(ttask, this); Word &word = tp->AddWord(); word.CreateFromString(Output, m_output, str, false); diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index f6f9fff50..9b0e7e425 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -496,7 +496,8 @@ namespace Moses TargetPhrase* Mmsapt:: - mkTPhrase(Phrase const& src, + mkTPhrase(ttasksptr const& ttask, + Phrase const& src, PhrasePair* fix, PhrasePair* dyn, sptr > const& dynbt) const @@ -547,7 +548,7 @@ namespace Moses BOOST_FOREACH(sptr const& ff, m_active_ff_common) (*ff)(*dynbt, pool, &fvals); } - TargetPhrase* tp = new TargetPhrase(this); + TargetPhrase* tp = new TargetPhrase(ttask, this); Token const* x = fix ? fix->start2 : dyn->start2; uint32_t len = fix ? 
fix->len2 : dyn->len2; for (uint32_t k = 0; k < len; ++k, x = x->next()) @@ -687,12 +688,12 @@ namespace Moses while (i < ppfix.size() && k < ppdyn.size()) { int cmp = sorter.cmp(ppfix[i], ppdyn[k]); - if (cmp < 0) ret->Add(mkTPhrase(src,&ppfix[i++],NULL,dyn)); - else if (cmp == 0) ret->Add(mkTPhrase(src,&ppfix[i++],&ppdyn[k++],dyn)); - else ret->Add(mkTPhrase(src,NULL,&ppdyn[k++],dyn)); + if (cmp < 0) ret->Add(mkTPhrase(ttask, src,&ppfix[i++],NULL,dyn)); + else if (cmp == 0) ret->Add(mkTPhrase(ttask, src,&ppfix[i++],&ppdyn[k++],dyn)); + else ret->Add(mkTPhrase(ttask, src,NULL,&ppdyn[k++],dyn)); } - while (i < ppfix.size()) ret->Add(mkTPhrase(src,&ppfix[i++],NULL,dyn)); - while (k < ppdyn.size()) ret->Add(mkTPhrase(src,NULL,&ppdyn[k++],dyn)); + while (i < ppfix.size()) ret->Add(mkTPhrase(ttask, src,&ppfix[i++],NULL,dyn)); + while (k < ppdyn.size()) ret->Add(mkTPhrase(ttask, src,NULL,&ppdyn[k++],dyn)); if (m_tableLimit) ret->Prune(true, m_tableLimit); else ret->Prune(true,ret->GetSize()); diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index 5f688cfd8..81687cc50 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -149,7 +149,8 @@ namespace Moses mm2dtable_t COOCraw; TargetPhrase* - mkTPhrase(Phrase const& src, + mkTPhrase(ttasksptr const& ttask, + Phrase const& src, Moses::bitext::PhrasePair* fix, Moses::bitext::PhrasePair* dyn, sptr > const& dynbt) const; From a3ecd9f2a7d2b7a8de93f06f2fa912b1fe0f49e4 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Thu, 25 Jun 2015 15:47:39 +0100 Subject: [PATCH 081/286] Revert "Break everything by trying to add ttasksptr to TargetPhrase" and try an easier approach This reverts commit afdc1b480ea81f424b35ac9e96c57815f847203a. --- moses/ChartParser.cpp | 4 ++-- moses/Syntax/F2S/GlueRuleSynthesizer.cpp | 4 +--- moses/Syntax/F2S/HyperTreeLoader.cpp | 4 +--- moses/Syntax/S2T/OovHandler-inl.h | 4 +--- moses/Syntax/S2T/RuleTrieLoader.cpp | 4 +--- moses/Syntax/T2S/GlueRuleSynthesizer.cpp | 4 +--- moses/Syntax/T2S/RuleTrieLoader.cpp | 4 +--- moses/TargetPhrase.cpp | 10 +++------- moses/TargetPhrase.h | 13 +++---------- .../TranslationModel/BilingualDynSuffixArray.cpp | 4 +--- .../ChartRuleLookupManagerSkeleton.cpp | 4 +--- .../CompactPT/PhraseDictionaryCompact.cpp | 3 +-- .../PhraseDictionaryTransliteration.cpp | 4 +--- moses/TranslationModel/ProbingPT/ProbingPT.cpp | 4 +--- .../TranslationModel/RuleTable/LoaderCompact.cpp | 4 +--- .../TranslationModel/RuleTable/LoaderStandard.cpp | 4 +--- .../RuleTable/PhraseDictionaryFuzzyMatch.cpp | 2 +- moses/TranslationModel/SkeletonPT.cpp | 4 +--- moses/TranslationModel/UG/mmsapt.cpp | 15 +++++++-------- moses/TranslationModel/UG/mmsapt.h | 3 +-- 20 files changed, 31 insertions(+), 71 deletions(-) diff --git a/moses/ChartParser.cpp b/moses/ChartParser.cpp index 9e4f9ffaa..66e22a055 100644 --- a/moses/ChartParser.cpp +++ b/moses/ChartParser.cpp @@ -95,7 +95,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS"); // add to dictionary - TargetPhrase *targetPhrase = new TargetPhrase(m_ttask, firstPt); + TargetPhrase *targetPhrase = new TargetPhrase(firstPt); Word &targetWord = targetPhrase->AddWord(); targetWord.CreateUnknownWord(sourceWord); @@ -117,7 +117,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range // drop source word. 
create blank trans opt float unknownScore = FloorScore(-numeric_limits::infinity()); - TargetPhrase *targetPhrase = new TargetPhrase(m_ttask, firstPt); + TargetPhrase *targetPhrase = new TargetPhrase(firstPt); // loop const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS; diff --git a/moses/Syntax/F2S/GlueRuleSynthesizer.cpp b/moses/Syntax/F2S/GlueRuleSynthesizer.cpp index 377e9a3c3..09423f5d3 100644 --- a/moses/Syntax/F2S/GlueRuleSynthesizer.cpp +++ b/moses/Syntax/F2S/GlueRuleSynthesizer.cpp @@ -4,7 +4,6 @@ #include "moses/FF/UnknownWordPenaltyProducer.h" #include "moses/StaticData.h" -#include "moses/TranslationTask.h" namespace Moses { @@ -54,8 +53,7 @@ TargetPhrase *GlueRuleSynthesizer::SynthesizeTargetPhrase( const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance(); - const ttasksptr ttask = NULL; - TargetPhrase *targetPhrase = new TargetPhrase(ttask); + TargetPhrase *targetPhrase = new TargetPhrase(); std::ostringstream alignmentSS; for (std::size_t i = 0; i < e.tail.size(); ++i) { diff --git a/moses/Syntax/F2S/HyperTreeLoader.cpp b/moses/Syntax/F2S/HyperTreeLoader.cpp index b81ae1a19..21d5b0447 100644 --- a/moses/Syntax/F2S/HyperTreeLoader.cpp +++ b/moses/Syntax/F2S/HyperTreeLoader.cpp @@ -19,7 +19,6 @@ #include "moses/ChartTranslationOptionList.h" #include "moses/FactorCollection.h" #include "moses/Syntax/RuleTableFF.h" -#include "moses/TranslationTask.h" #include "util/file_piece.hh" #include "util/string_piece.hh" #include "util/tokenize_piece.hh" @@ -110,8 +109,7 @@ bool HyperTreeLoader::Load(const std::vector &input, ExtractSourceTerminalSetFromHyperPath(sourceFragment, sourceTermSet); // Target-side - const ttasksptr ttasks = NULL; - TargetPhrase *targetPhrase = new TargetPhrase(ttasks, &ff); + TargetPhrase *targetPhrase = new TargetPhrase(&ff); Word *targetLHS = NULL; targetPhrase->CreateFromString(Output, output, targetString, &targetLHS); targetPhrase->SetTargetLHS(targetLHS); diff --git a/moses/Syntax/S2T/OovHandler-inl.h b/moses/Syntax/S2T/OovHandler-inl.h index 255691fd2..76eed861e 100644 --- a/moses/Syntax/S2T/OovHandler-inl.h +++ b/moses/Syntax/S2T/OovHandler-inl.h @@ -2,7 +2,6 @@ #include "moses/FF/UnknownWordPenaltyProducer.h" #include "moses/StaticData.h" -#include "moses/TranslationTask.h" namespace Moses { @@ -71,8 +70,7 @@ TargetPhrase *OovHandler::SynthesizeTargetPhrase( const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance(); - const ttasksptr ttask = NULL; - TargetPhrase *targetPhrase = new TargetPhrase(ttask); + TargetPhrase *targetPhrase = new TargetPhrase(); Word &targetWord = targetPhrase->AddWord(); targetWord.CreateUnknownWord(oov); diff --git a/moses/Syntax/S2T/RuleTrieLoader.cpp b/moses/Syntax/S2T/RuleTrieLoader.cpp index e43b20790..a88c0f5fe 100644 --- a/moses/Syntax/S2T/RuleTrieLoader.cpp +++ b/moses/Syntax/S2T/RuleTrieLoader.cpp @@ -19,7 +19,6 @@ #include "moses/ChartTranslationOptionList.h" #include "moses/FactorCollection.h" #include "moses/Syntax/RuleTableFF.h" -#include "moses/TranslationTask.h" #include "util/file_piece.hh" #include "util/string_piece.hh" #include "util/tokenize_piece.hh" @@ -101,8 +100,7 @@ bool RuleTrieLoader::Load(const std::vector &input, Word *targetLHS; // create target phrase obj - const ttasksptr ttask = NULL; - TargetPhrase *targetPhrase = new TargetPhrase(ttask, &ff); + TargetPhrase *targetPhrase = new TargetPhrase(&ff); targetPhrase->CreateFromString(Output, output, 
targetPhraseString, &targetLHS); // source Phrase sourcePhrase; diff --git a/moses/Syntax/T2S/GlueRuleSynthesizer.cpp b/moses/Syntax/T2S/GlueRuleSynthesizer.cpp index 70abf97da..9c6dd91ab 100644 --- a/moses/Syntax/T2S/GlueRuleSynthesizer.cpp +++ b/moses/Syntax/T2S/GlueRuleSynthesizer.cpp @@ -4,7 +4,6 @@ #include "moses/FF/UnknownWordPenaltyProducer.h" #include "moses/StaticData.h" -#include "moses/TranslationTask.h" namespace Moses { @@ -46,8 +45,7 @@ TargetPhrase *GlueRuleSynthesizer::SynthesizeTargetPhrase( const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance(); - const ttasksptr ttask = NULL; - TargetPhrase *targetPhrase = new TargetPhrase(ttask); + TargetPhrase *targetPhrase = new TargetPhrase(); std::ostringstream alignmentSS; for (std::size_t i = 0; i < node.children.size(); ++i) { diff --git a/moses/Syntax/T2S/RuleTrieLoader.cpp b/moses/Syntax/T2S/RuleTrieLoader.cpp index 70b73dc0e..81924f05d 100644 --- a/moses/Syntax/T2S/RuleTrieLoader.cpp +++ b/moses/Syntax/T2S/RuleTrieLoader.cpp @@ -19,7 +19,6 @@ #include "moses/ChartTranslationOptionList.h" #include "moses/FactorCollection.h" #include "moses/Syntax/RuleTableFF.h" -#include "moses/TranslationTask.h" #include "util/file_piece.hh" #include "util/string_piece.hh" #include "util/tokenize_piece.hh" @@ -106,8 +105,7 @@ bool RuleTrieLoader::Load(const std::vector &input, Word *targetLHS; // create target phrase obj - const ttasksptr ttasks = NULL; - TargetPhrase *targetPhrase = new TargetPhrase(ttasks, &ff); + TargetPhrase *targetPhrase = new TargetPhrase(&ff); // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS); targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS); // source diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index 3426a3093..4976375e9 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -39,7 +39,7 @@ using namespace std; namespace Moses { -TargetPhrase::TargetPhrase(const ttasksptr ttasks, std::string out_string, const PhraseDictionary *pt) +TargetPhrase::TargetPhrase( std::string out_string, const PhraseDictionary *pt) :Phrase(0) , m_fullScore(0.0) , m_futureScore(0.0) @@ -48,7 +48,6 @@ TargetPhrase::TargetPhrase(const ttasksptr ttasks, std::string out_string, const , m_lhsTarget(NULL) , m_ruleSource(NULL) , m_container(pt) - , m_ttasks(ttasks) { //ACAT @@ -59,7 +58,7 @@ TargetPhrase::TargetPhrase(const ttasksptr ttasks, std::string out_string, const NULL); } -TargetPhrase::TargetPhrase(const ttasksptr ttasks, const PhraseDictionary *pt) +TargetPhrase::TargetPhrase(const PhraseDictionary *pt) :Phrase() , m_fullScore(0.0) , m_futureScore(0.0) @@ -68,11 +67,10 @@ TargetPhrase::TargetPhrase(const ttasksptr ttasks, const PhraseDictionary *pt) , m_lhsTarget(NULL) , m_ruleSource(NULL) , m_container(pt) - , m_ttasks(ttasks) { } -TargetPhrase::TargetPhrase(const ttasksptr ttasks, const Phrase &phrase, const PhraseDictionary *pt) +TargetPhrase::TargetPhrase(const Phrase &phrase, const PhraseDictionary *pt) : Phrase(phrase) , m_fullScore(0.0) , m_futureScore(0.0) @@ -81,7 +79,6 @@ TargetPhrase::TargetPhrase(const ttasksptr ttasks, const Phrase &phrase, const P , m_lhsTarget(NULL) , m_ruleSource(NULL) , m_container(pt) - , m_ttasks(ttasks) { } @@ -95,7 +92,6 @@ TargetPhrase::TargetPhrase(const TargetPhrase ©) , m_alignNonTerm(copy.m_alignNonTerm) , m_properties(copy.m_properties) , m_container(copy.m_container) - , m_ttasks(copy.m_ttasks) { if (copy.m_lhsTarget) { m_lhsTarget = new 
Word(*copy.m_lhsTarget); diff --git a/moses/TargetPhrase.h b/moses/TargetPhrase.h index d60deaf42..35b06c1c7 100644 --- a/moses/TargetPhrase.h +++ b/moses/TargetPhrase.h @@ -30,7 +30,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "ScoreComponentCollection.h" #include "AlignmentInfo.h" #include "AlignmentInfoCollection.h" -#include "moses/TranslationTask.h" #include "moses/PP/PhraseProperty.h" #include "util/string_piece.hh" @@ -80,20 +79,14 @@ private: const PhraseDictionary *m_container; mutable boost::unordered_map > m_data; - const ttasksptr m_ttasks; public: - TargetPhrase(const ttasksptr ttasks, const PhraseDictionary *pt = NULL); - TargetPhrase(const ttasksptr ttasks, std::string out_string, const PhraseDictionary *pt = NULL); + TargetPhrase(const PhraseDictionary *pt = NULL); + TargetPhrase(std::string out_string, const PhraseDictionary *pt = NULL); TargetPhrase(const TargetPhrase ©); - explicit TargetPhrase(const ttasksptr ttasks, const Phrase &targetPhrase, const PhraseDictionary *pt); + explicit TargetPhrase(const Phrase &targetPhrase, const PhraseDictionary *pt); ~TargetPhrase(); - //GetTtasks PTr - const ttasksptr getTtasksPtr(){ - return m_ttasks; - } - // 1st evaluate method. Called during loading of phrase table. void EvaluateInIsolation(const Phrase &source, const std::vector &ffs); diff --git a/moses/TranslationModel/BilingualDynSuffixArray.cpp b/moses/TranslationModel/BilingualDynSuffixArray.cpp index 223e4e9ed..b0607b770 100644 --- a/moses/TranslationModel/BilingualDynSuffixArray.cpp +++ b/moses/TranslationModel/BilingualDynSuffixArray.cpp @@ -3,7 +3,6 @@ #include "moses/FactorCollection.h" #include "moses/StaticData.h" #include "moses/TargetPhrase.h" -#include "moses/TranslationTask.h" #include "moses/TranslationModel/UG/generic/sorting/NBestList.h" #include "moses/TranslationModel/UG/generic/sampling/Sampling.h" @@ -377,8 +376,7 @@ TargetPhrase* BilingualDynSuffixArray:: GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase, const PhraseDictionary *pt) const { - const ttasksptr ttask = NULL; - TargetPhrase* targetPhrase = new TargetPhrase(ttask, pt); + TargetPhrase* targetPhrase = new TargetPhrase(pt); for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words Word& word = m_trgVocab->GetWord( phrase.words[i]); UTIL_THROW_IF2(word == m_trgVocab->GetkOOVWord(), diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.cpp index 5fca94fcf..53011e5ac 100644 --- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.cpp +++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.cpp @@ -30,7 +30,6 @@ #include "moses/ChartCellCollection.h" #include "moses/TranslationModel/PhraseDictionaryMemory.h" #include "moses/TranslationModel/SkeletonPT.h" -#include "moses/TranslationTask.h" using namespace std; @@ -80,8 +79,7 @@ TargetPhrase *ChartRuleLookupManagerSkeleton::CreateTargetPhrase(const Word &sou string str = sourceWord.GetFactor(0)->GetString().as_string(); str = "ChartManagerSkeleton:" + str; - const ttasksptr ttask = NULL; - TargetPhrase *tp = new TargetPhrase(ttask, &m_skeletonPT); + TargetPhrase *tp = new TargetPhrase(&m_skeletonPT); Word &word = tp->AddWord(); word.CreateFromString(Output, m_skeletonPT.GetOutput(), str, false); diff --git a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp index 
1d4d08605..afed99057 100644 --- a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp +++ b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp @@ -126,9 +126,8 @@ PhraseDictionaryCompact::GetTargetPhraseCollectionNonCacheLEGACY(const Phrase &s (m_tableLimit == 0 || tpv->size() < m_tableLimit) ? tpv->end() : tpv->begin() + m_tableLimit; NTH_ELEMENT4(tpv->begin(), nth, tpv->end(), CompareTargetPhrase()); - const ttasksptr ttask = NULL; for(TargetPhraseVector::iterator it = tpv->begin(); it != nth; it++) { - TargetPhrase *tp = new TargetPhrase(ttask, *it); + TargetPhrase *tp = new TargetPhrase(*it); phraseColl->Add(tp); } diff --git a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp index 947ea8e72..03b69d0ad 100644 --- a/moses/TranslationModel/PhraseDictionaryTransliteration.cpp +++ b/moses/TranslationModel/PhraseDictionaryTransliteration.cpp @@ -5,7 +5,6 @@ #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h" #include "moses/DecodeGraph.h" #include "moses/DecodeStep.h" -#include "moses/TranslationTask.h" #include "util/tempfile.hh" using namespace std; @@ -118,8 +117,7 @@ std::vector PhraseDictionaryTransliteration::CreateTargetPhrases( Tokenize(toks, line, "\t"); UTIL_THROW_IF2(toks.size() != 2, "Error in transliteration output file. Expecting word\tscore"); - const ttasksptr ttask = NULL; - TargetPhrase *tp = new TargetPhrase(ttask, this); + TargetPhrase *tp = new TargetPhrase(this); Word &word = tp->AddWord(); word.CreateFromString(Output, m_output, toks[0], false); diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp index b7a9725e3..19b7e8795 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp @@ -1,7 +1,6 @@ // vim:tabstop=2 #include "ProbingPT.h" #include "moses/StaticData.h" -#include "moses/TranslationTask.h" #include "moses/FactorCollection.h" #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h" #include "quering.hh" @@ -153,8 +152,7 @@ TargetPhrase *ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase, const ta const std::vector &probingPhrase = probingTargetPhrase.target_phrase; size_t size = probingPhrase.size(); - const ttasksptr ttask = NULL; - TargetPhrase *tp = new TargetPhrase(ttask, this); + TargetPhrase *tp = new TargetPhrase(this); // words for (size_t i = 0; i < size; ++i) { diff --git a/moses/TranslationModel/RuleTable/LoaderCompact.cpp b/moses/TranslationModel/RuleTable/LoaderCompact.cpp index 824d8bc0b..c947dfdc2 100644 --- a/moses/TranslationModel/RuleTable/LoaderCompact.cpp +++ b/moses/TranslationModel/RuleTable/LoaderCompact.cpp @@ -21,7 +21,6 @@ #include "moses/AlignmentInfoCollection.h" #include "moses/InputFileStream.h" -#include "moses/TranslationTask.h" #include "moses/Util.h" #include "moses/Word.h" #include "Trie.h" @@ -218,8 +217,7 @@ bool RuleTableLoaderCompact::LoadRuleSection( // The remaining columns are currently ignored. // Create and score target phrase. 
- const ttasksptr ttask = NULL; - TargetPhrase *targetPhrase = new TargetPhrase(ttask, targetPhrasePhrase, &ruleTable); + TargetPhrase *targetPhrase = new TargetPhrase(targetPhrasePhrase, &ruleTable); targetPhrase->SetAlignNonTerm(alignNonTerm); targetPhrase->SetTargetLHS(targetLhs); diff --git a/moses/TranslationModel/RuleTable/LoaderStandard.cpp b/moses/TranslationModel/RuleTable/LoaderStandard.cpp index 9febc4426..f9e6ac6fd 100644 --- a/moses/TranslationModel/RuleTable/LoaderStandard.cpp +++ b/moses/TranslationModel/RuleTable/LoaderStandard.cpp @@ -36,7 +36,6 @@ #include "moses/WordsRange.h" #include "moses/ChartTranslationOptionList.h" #include "moses/FactorCollection.h" -#include "moses/TranslationTask.h" #include "util/file_piece.hh" #include "util/string_piece.hh" #include "util/tokenize_piece.hh" @@ -218,8 +217,7 @@ bool RuleTableLoaderStandard::Load(FormatType format Word *targetLHS; // create target phrase obj - const ttasksptr ttask = NULL; - TargetPhrase *targetPhrase = new TargetPhrase(ttask, &ruleTable); + TargetPhrase *targetPhrase = new TargetPhrase(&ruleTable); targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS); // source Phrase sourcePhrase; diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp index 450b4d65a..5f7ddf85d 100644 --- a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp +++ b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp @@ -267,7 +267,7 @@ void PhraseDictionaryFuzzyMatch::InitializeForInput(ttasksptr const& ttask) sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, &sourceLHS); // create target phrase obj - TargetPhrase *targetPhrase = new TargetPhrase(ttask, this); + TargetPhrase *targetPhrase = new TargetPhrase(this); targetPhrase->CreateFromString(Output, m_output, targetPhraseString, &targetLHS); // rest of target phrase diff --git a/moses/TranslationModel/SkeletonPT.cpp b/moses/TranslationModel/SkeletonPT.cpp index d3683e761..8e2b1daa3 100644 --- a/moses/TranslationModel/SkeletonPT.cpp +++ b/moses/TranslationModel/SkeletonPT.cpp @@ -1,6 +1,5 @@ // vim:tabstop=2 #include "SkeletonPT.h" -#include "TranslationTask.h" #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h" using namespace std; @@ -54,8 +53,7 @@ TargetPhrase *SkeletonPT::CreateTargetPhrase(const Phrase &sourcePhrase) const string str = sourcePhrase.GetWord(0).GetFactor(0)->GetString().as_string(); str = "SkeletonPT:" + str; - const ttasksptr ttask = NULL; - TargetPhrase *tp = new TargetPhrase(ttask, this); + TargetPhrase *tp = new TargetPhrase(this); Word &word = tp->AddWord(); word.CreateFromString(Output, m_output, str, false); diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 9b0e7e425..f6f9fff50 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -496,8 +496,7 @@ namespace Moses TargetPhrase* Mmsapt:: - mkTPhrase(ttasksptr const& ttask, - Phrase const& src, + mkTPhrase(Phrase const& src, PhrasePair* fix, PhrasePair* dyn, sptr > const& dynbt) const @@ -548,7 +547,7 @@ namespace Moses BOOST_FOREACH(sptr const& ff, m_active_ff_common) (*ff)(*dynbt, pool, &fvals); } - TargetPhrase* tp = new TargetPhrase(ttask, this); + TargetPhrase* tp = new TargetPhrase(this); Token const* x = fix ? fix->start2 : dyn->start2; uint32_t len = fix ? 
fix->len2 : dyn->len2; for (uint32_t k = 0; k < len; ++k, x = x->next()) @@ -688,12 +687,12 @@ namespace Moses while (i < ppfix.size() && k < ppdyn.size()) { int cmp = sorter.cmp(ppfix[i], ppdyn[k]); - if (cmp < 0) ret->Add(mkTPhrase(ttask, src,&ppfix[i++],NULL,dyn)); - else if (cmp == 0) ret->Add(mkTPhrase(ttask, src,&ppfix[i++],&ppdyn[k++],dyn)); - else ret->Add(mkTPhrase(ttask, src,NULL,&ppdyn[k++],dyn)); + if (cmp < 0) ret->Add(mkTPhrase(src,&ppfix[i++],NULL,dyn)); + else if (cmp == 0) ret->Add(mkTPhrase(src,&ppfix[i++],&ppdyn[k++],dyn)); + else ret->Add(mkTPhrase(src,NULL,&ppdyn[k++],dyn)); } - while (i < ppfix.size()) ret->Add(mkTPhrase(ttask, src,&ppfix[i++],NULL,dyn)); - while (k < ppdyn.size()) ret->Add(mkTPhrase(ttask, src,NULL,&ppdyn[k++],dyn)); + while (i < ppfix.size()) ret->Add(mkTPhrase(src,&ppfix[i++],NULL,dyn)); + while (k < ppdyn.size()) ret->Add(mkTPhrase(src,NULL,&ppdyn[k++],dyn)); if (m_tableLimit) ret->Prune(true, m_tableLimit); else ret->Prune(true,ret->GetSize()); diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index 81687cc50..5f688cfd8 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -149,8 +149,7 @@ namespace Moses mm2dtable_t COOCraw; TargetPhrase* - mkTPhrase(ttasksptr const& ttask, - Phrase const& src, + mkTPhrase(Phrase const& src, Moses::bitext::PhrasePair* fix, Moses::bitext::PhrasePair* dyn, sptr > const& dynbt) const; From 943d814f798346a14a9dd011676aa3fe192f096d Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Thu, 25 Jun 2015 16:36:18 +0100 Subject: [PATCH 082/286] Duplicate constructors to include ttasksptr.Works --- moses/TargetPhrase.cpp | 51 ++++++++++++++++++++++++++++++++++++++++++ moses/TargetPhrase.h | 8 +++++++ 2 files changed, 59 insertions(+) diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index 4976375e9..207757884 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -32,6 +32,7 @@ #include "Util.h" #include "AlignmentInfoCollection.h" #include "InputPath.h" +#include "TranslationTask.h" #include "moses/TranslationModel/PhraseDictionary.h" #include @@ -48,6 +49,7 @@ TargetPhrase::TargetPhrase( std::string out_string, const PhraseDictionary *pt) , m_lhsTarget(NULL) , m_ruleSource(NULL) , m_container(pt) + , m_ttask(NULL) { //ACAT @@ -58,6 +60,52 @@ TargetPhrase::TargetPhrase( std::string out_string, const PhraseDictionary *pt) NULL); } +TargetPhrase::TargetPhrase(ttasksptr& ttask, std::string out_string, const PhraseDictionary *pt) + :Phrase(0) + , m_fullScore(0.0) + , m_futureScore(0.0) + , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) + , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) + , m_lhsTarget(NULL) + , m_ruleSource(NULL) + , m_container(pt) + , m_ttask(ttask) +{ + + //ACAT + const StaticData &staticData = StaticData::Instance(); + // XXX should this really be InputFactorOrder??? 
+ CreateFromString(Output, staticData.GetInputFactorOrder(), out_string, + // staticData.GetFactorDelimiter(), // eliminated [UG] + NULL); +} + +TargetPhrase::TargetPhrase(ttasksptr& ttask, const PhraseDictionary *pt) + :Phrase() + , m_fullScore(0.0) + , m_futureScore(0.0) + , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) + , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) + , m_lhsTarget(NULL) + , m_ruleSource(NULL) + , m_container(pt) + , m_ttask(ttask) +{ +} + +TargetPhrase::TargetPhrase(ttasksptr& ttask, const Phrase &phrase, const PhraseDictionary *pt) + : Phrase(phrase) + , m_fullScore(0.0) + , m_futureScore(0.0) + , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) + , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) + , m_lhsTarget(NULL) + , m_ruleSource(NULL) + , m_container(pt) + , m_ttask(ttask) +{ +} + TargetPhrase::TargetPhrase(const PhraseDictionary *pt) :Phrase() , m_fullScore(0.0) @@ -67,6 +115,7 @@ TargetPhrase::TargetPhrase(const PhraseDictionary *pt) , m_lhsTarget(NULL) , m_ruleSource(NULL) , m_container(pt) + , m_ttask(NULL) { } @@ -79,6 +128,7 @@ TargetPhrase::TargetPhrase(const Phrase &phrase, const PhraseDictionary *pt) , m_lhsTarget(NULL) , m_ruleSource(NULL) , m_container(pt) + , m_ttask(NULL) { } @@ -92,6 +142,7 @@ TargetPhrase::TargetPhrase(const TargetPhrase ©) , m_alignNonTerm(copy.m_alignNonTerm) , m_properties(copy.m_properties) , m_container(copy.m_container) + , m_ttask(copy.m_ttask) { if (copy.m_lhsTarget) { m_lhsTarget = new Word(*copy.m_lhsTarget); diff --git a/moses/TargetPhrase.h b/moses/TargetPhrase.h index 35b06c1c7..9527dcbb5 100644 --- a/moses/TargetPhrase.h +++ b/moses/TargetPhrase.h @@ -32,6 +32,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "AlignmentInfoCollection.h" #include "moses/PP/PhraseProperty.h" #include "util/string_piece.hh" +//#include "moses/TranslationTask.h" #include #include @@ -58,6 +59,7 @@ public: Scores const* GetExtraScores(FeatureFunction const* ff) const; void SetExtraScores(FeatureFunction const* ff, boost::shared_ptr const& scores); + ttasksptr m_ttask; private: ScoreCache_t m_cached_scores; @@ -85,6 +87,12 @@ public: TargetPhrase(std::string out_string, const PhraseDictionary *pt = NULL); TargetPhrase(const TargetPhrase ©); explicit TargetPhrase(const Phrase &targetPhrase, const PhraseDictionary *pt); + + /*ttasksptr version*/ + TargetPhrase(ttasksptr &ttask, const PhraseDictionary *pt = NULL); + TargetPhrase(ttasksptr &ttask, std::string out_string, const PhraseDictionary *pt = NULL); + explicit TargetPhrase(ttasksptr &ttask, const Phrase &targetPhrase, const PhraseDictionary *pt); + ~TargetPhrase(); // 1st evaluate method. Called during loading of phrase table. 
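The constructor duplication in the patch above reduces to a small ownership pattern: a target phrase may optionally remember the translation task it was produced for, so that task-scoped data stays reachable from scoring code without global state, while every existing call site keeps compiling against the old signatures. A minimal self-contained sketch of that pattern follows; the names are illustrative rather than the real Moses classes, and std::shared_ptr stands in for the ttasksptr typedef.

#include <memory>
#include <string>

// Stand-in for the per-request translation task that owns task-scoped data
// (context weights etc.); the name is illustrative, not the Moses class.
struct TaskStub {
  std::string contextWeights;
};
typedef std::shared_ptr<TaskStub> TaskPtr;  // plays the role of the ttasksptr typedef

class PhraseStub {
public:
  // Original constructor, kept so the many existing call sites still compile;
  // phrases built at model-load time simply carry no task.
  explicit PhraseStub(const std::string &text)
    : m_text(text) {}

  // Duplicated constructor for task-aware call sites, e.g. a suffix-array
  // phrase table creating phrases while serving one concrete request.
  PhraseStub(const TaskPtr &task, const std::string &text)
    : m_text(text), m_task(task) {}

  // cf. HasTtaskSPtr() / GetTtask() added later in this series
  bool HasTask() const { return m_task != nullptr; }
  const TaskPtr &GetTask() const { return m_task; }

private:
  std::string m_text;
  TaskPtr m_task;  // stays empty unless the task-aware constructor was used
};

// Loaders use the first form, decode-time code the second.
inline void sketchUsage() {
  PhraseStub loaded("built at start-up");                        // no task
  PhraseStub live(std::make_shared<TaskStub>(), "per request");  // task attached
}

Patches 083, 084 and 094 below follow essentially this shape: the suffix-array phrase table passes the task in, and the language-model code checks that a task is present before asking it for context.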
From fcc9bb1e60ddd3973eaccb978e30d2fdafe3e314 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Thu, 25 Jun 2015 16:52:14 +0100 Subject: [PATCH 083/286] when using the suffix array PT, set the ttask in the targetPhrase --- moses/TranslationModel/UG/mmsapt.cpp | 16 +++++++++------- moses/TranslationModel/UG/mmsapt.h | 3 ++- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index f6f9fff50..7b212c8df 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -496,7 +496,8 @@ namespace Moses TargetPhrase* Mmsapt:: - mkTPhrase(Phrase const& src, + mkTPhrase(ttasksptr const& ttask, + Phrase const& src, PhrasePair* fix, PhrasePair* dyn, sptr > const& dynbt) const @@ -547,7 +548,8 @@ namespace Moses BOOST_FOREACH(sptr const& ff, m_active_ff_common) (*ff)(*dynbt, pool, &fvals); } - TargetPhrase* tp = new TargetPhrase(this); + + TargetPhrase* tp = new TargetPhrase(const_cast(ttask), this); Token const* x = fix ? fix->start2 : dyn->start2; uint32_t len = fix ? fix->len2 : dyn->len2; for (uint32_t k = 0; k < len; ++k, x = x->next()) @@ -687,12 +689,12 @@ namespace Moses while (i < ppfix.size() && k < ppdyn.size()) { int cmp = sorter.cmp(ppfix[i], ppdyn[k]); - if (cmp < 0) ret->Add(mkTPhrase(src,&ppfix[i++],NULL,dyn)); - else if (cmp == 0) ret->Add(mkTPhrase(src,&ppfix[i++],&ppdyn[k++],dyn)); - else ret->Add(mkTPhrase(src,NULL,&ppdyn[k++],dyn)); + if (cmp < 0) ret->Add(mkTPhrase(ttask,src,&ppfix[i++],NULL,dyn)); + else if (cmp == 0) ret->Add(mkTPhrase(ttask,src,&ppfix[i++],&ppdyn[k++],dyn)); + else ret->Add(mkTPhrase(ttask,src,NULL,&ppdyn[k++],dyn)); } - while (i < ppfix.size()) ret->Add(mkTPhrase(src,&ppfix[i++],NULL,dyn)); - while (k < ppdyn.size()) ret->Add(mkTPhrase(src,NULL,&ppdyn[k++],dyn)); + while (i < ppfix.size()) ret->Add(mkTPhrase(ttask,src,&ppfix[i++],NULL,dyn)); + while (k < ppdyn.size()) ret->Add(mkTPhrase(ttask,src,NULL,&ppdyn[k++],dyn)); if (m_tableLimit) ret->Prune(true, m_tableLimit); else ret->Prune(true,ret->GetSize()); diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index 5f688cfd8..81687cc50 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -149,7 +149,8 @@ namespace Moses mm2dtable_t COOCraw; TargetPhrase* - mkTPhrase(Phrase const& src, + mkTPhrase(ttasksptr const& ttask, + Phrase const& src, Moses::bitext::PhrasePair* fix, Moses::bitext::PhrasePair* dyn, sptr > const& dynbt) const; From ea23c921b3603666e17f27b29ff86a9efb7fab62 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Thu, 25 Jun 2015 17:52:55 +0100 Subject: [PATCH 084/286] Make getScore be able to access ttasksptr. 
Only works with suffix array for now, other people can implement it in their pts --- moses/LM/Base.cpp | 3 ++- moses/LM/Base.h | 3 +++ moses/LM/IRST.cpp | 5 ++++- moses/LM/IRST.h | 2 +- moses/TargetPhrase.cpp | 4 ++++ moses/TargetPhrase.h | 1 + 6 files changed, 15 insertions(+), 3 deletions(-) diff --git a/moses/LM/Base.cpp b/moses/LM/Base.cpp index 76a6336c3..9e74b0d81 100644 --- a/moses/LM/Base.cpp +++ b/moses/LM/Base.cpp @@ -78,7 +78,8 @@ void LanguageModel::EvaluateInIsolation(const Phrase &source float fullScore, nGramScore; size_t oovCount; - CalcScore(targetPhrase, fullScore, nGramScore, oovCount); + CalcScoreWithContext(targetPhrase.GetTtask(), targetPhrase, fullScore, nGramScore, oovCount); + //CalcScore(targetPhrase, fullScore, nGramScore, oovCount); float estimateScore = fullScore - nGramScore; if (StaticData::Instance().GetLMEnableOOVFeature()) { diff --git a/moses/LM/Base.h b/moses/LM/Base.h index eb0a98ca1..cfe92c6ba 100644 --- a/moses/LM/Base.h +++ b/moses/LM/Base.h @@ -72,6 +72,9 @@ public: * \param oovCount number of LM OOVs */ virtual void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, std::size_t &oovCount) const = 0; + virtual void CalcScoreWithContext(ttasksptr const& ttask, const Phrase &phrase, float &fullScore, float &ngramScore, std::size_t &oovCount) const { + CalcScore(phrase, fullScore, ngramScore, oovCount); + } virtual void CalcScoreFromCache(const Phrase &phrase, float &fullScore, float &ngramScore, std::size_t &oovCount) const { } diff --git a/moses/LM/IRST.cpp b/moses/LM/IRST.cpp index f70530231..9448339aa 100644 --- a/moses/LM/IRST.cpp +++ b/moses/LM/IRST.cpp @@ -238,7 +238,7 @@ const FFState* LanguageModelIRST::EmptyHypothesisState(const InputType &/*input* return ret.release(); } -void LanguageModelIRST::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const +void LanguageModelIRST::CalcScoreWithContext(ttasksptr const& ttasks, const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const { fullScore = 0; ngramScore = 0; @@ -246,6 +246,9 @@ void LanguageModelIRST::CalcScore(const Phrase &phrase, float &fullScore, float if ( !phrase.GetSize() ) return; + //get the context_weight map here + std::map context_weight = ttasks->GetContextWeights(); + int _min = min(m_lmtb_size - 1, (int) phrase.GetSize()); int codes[m_lmtb_size]; diff --git a/moses/LM/IRST.h b/moses/LM/IRST.h index 72ff84efd..7dedde7b0 100644 --- a/moses/LM/IRST.h +++ b/moses/LM/IRST.h @@ -95,7 +95,7 @@ public: virtual FFState *EvaluateWhenAppliedWithContext(ttasksptr const& ttasks, const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const; - virtual void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const; + virtual void CalcScoreWithContext(ttasksptr const& ttasks, const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const; /* virtual FFState *EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const; diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index 207757884..9151f4bb9 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -174,6 +174,10 @@ void TargetPhrase::WriteToRulePB(hgmert::Rule* pb) const } #endif +const ttasksptr& TargetPhrase::GetTtask() const { + return m_ttask; +} + void TargetPhrase::EvaluateInIsolation(const Phrase &source) { const std::vector &ffs = FeatureFunction::GetFeatureFunctions(); diff --git a/moses/TargetPhrase.h 
b/moses/TargetPhrase.h index 9527dcbb5..63b8f01fd 100644 --- a/moses/TargetPhrase.h +++ b/moses/TargetPhrase.h @@ -92,6 +92,7 @@ public: TargetPhrase(ttasksptr &ttask, const PhraseDictionary *pt = NULL); TargetPhrase(ttasksptr &ttask, std::string out_string, const PhraseDictionary *pt = NULL); explicit TargetPhrase(ttasksptr &ttask, const Phrase &targetPhrase, const PhraseDictionary *pt); + const ttasksptr& GetTtask() const; ~TargetPhrase(); From 41a11dfe8ac9e7d01e950607afdd13492113e9d5 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 25 Jun 2015 18:20:03 +0100 Subject: [PATCH 085/286] Allow ports other than 80 as the server ports for the context bias server. --- .../TranslationModel/UG/mm/ug_http_client.cc | 11 ++- .../UG/mm/ug_sampling_bias.cc | 77 ++++++++++++++++--- .../TranslationModel/UG/mm/ug_sampling_bias.h | 1 + 3 files changed, 78 insertions(+), 11 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_http_client.cc b/moses/TranslationModel/UG/mm/ug_http_client.cc index 1bbb93b23..da8537910 100644 --- a/moses/TranslationModel/UG/mm/ug_http_client.cc +++ b/moses/TranslationModel/UG/mm/ug_http_client.cc @@ -1,4 +1,5 @@ #include "ug_http_client.h" +#include "moses/Util.h" namespace Moses { using boost::asio::ip::tcp; @@ -31,10 +32,16 @@ http_client(boost::asio::io_service& io_service, std::string url) p = std::min(url.find_first_of(":/"), url.size()); q = std::min(url.find("/"), url.size()); if (p < url.size() && url[p] == ':') - port = url.substr(p,q-p); + port = url.substr(p+1,q-p-1); server = url.substr(0,p); if (q < url.size()) path = url.substr(q); +#if 0 + std::cerr << HERE << std::endl; + std::cerr << "SERVER " << server << std::endl; + std::cerr << "PORT |" << port << "|" << std::endl; + std::cerr << "PATH " << path << std::endl; +#endif init(server, port, path); } @@ -55,7 +62,7 @@ init(std::string const& server, std::string const& port, std::string const& path // Start an asynchronous resolve to translate the server and service names // into a list of endpoints. 
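// Illustration of the port handling earlier in this patch, using a hypothetical URL
// "host.example.com:8080/bias?q=x" (':' at index 16, first '/' at index 21):
//   old: port = url.substr(p,   q-p)   -> ":8080"  (leading ':' kept in the service string)
//   new: port = url.substr(p+1, q-p-1) -> "8080"
// with server = url.substr(0, p) -> "host.example.com"
//  and path   = url.substr(q)    -> "/bias?q=x";
// the cleaned-up port string is what the resolver query below receives.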
- tcp::resolver::query query(server, port); + tcp::resolver::query query(server, port.c_str()); resolver_.async_resolve(query, boost::bind(&http_client::handle_resolve, this, boost::asio::placeholders::error, diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index da408dfb3..d54305997 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -2,7 +2,7 @@ #include #include #include "moses/Timer.h" - +// #include // #ifdef HAVE_CURLPP // #include // #include @@ -19,19 +19,77 @@ namespace Moses { using ugdiss::id_type; - // #ifdef WITH_MMT_BIAS_CLIENT - std::string - query_bias_server(std::string const& url, std::string const& text) + size_t ca_write_callback(void *ptr, size_t size, size_t nmemb, + std::string* response) { - std::string query = url+uri_encode(text); + char const* c = reinterpret_cast(ptr); + *response += std::string(c, size * nmemb); + return size * nmemb; + } + + std::string + query_bias_server(std::string const& server, std::string const& context) + { +#if 0 + std::string query = server + uri_encode(context); + std::string response; + + CURL* curl = curl_easy_init(); + UTIL_THROW_IF2(!curl, "Could not init curl."); + curl_easy_setopt(curl, CURLOPT_URL, query.c_str()); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, ca_write_callback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response); + CURLcode res = curl_easy_perform(curl); + curl_easy_cleanup(curl); + return response; +#else + std::string query = server+uri_encode(context); boost::asio::io_service io_service; Moses::http_client c(io_service, query); io_service.run(); - return c.content(); - } - // #endif - DocumentBias + std::string response = c.content(); + std::cerr << "SERVER RESPONSE: " << response << std::endl; + + return c.content(); +#endif + } + +// // #ifdef WITH_MMT_BIAS_CLIENT +// std::string +// query_bias_server(std::string const& url, std::string const& text) +// { +// #if 1 +// std::string query = url+uri_encode(text); +// boost::asio::io_service io_service; +// Moses::http_client c(io_service, query); +// io_service.run(); + +// std::string response = c.content(); +// std::cerr << "SERVER RESPONSE: " << response << std::endl; + +// return c.content(); +// #else +// return ""; +// #endif +// } +// // #endif + + + // std::string + // query_bias_server(std::string const& url, int const port, + // std::string const& context, + // std::string const& src_lang) + // { + // char* response + // = ca_get_context(url.c_str(), port, context.c_str(), src_lang.c_str()); + // UTIL_THROW_IF2(!response, "No response from server"); + // std::string json = response; + // free(response); + // return json; + // } + + DocumentBias ::DocumentBias ( std::vector const& sid2doc, std::map const& docname2docid, @@ -44,6 +102,7 @@ namespace Moses Timer timer; if (log) timer.start(NULL); std::string json = query_bias_server(server_url, text); + std::cerr << "SERVER RESPONSE " << json << std::endl; init_from_json(json, docname2docid, log); if (log) *log << "Bias query took " << timer << " seconds." 
<< std::endl; // #endif diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index f540ddc76..24d39689e 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -37,6 +37,7 @@ namespace Moses { std::vector const& m_sid2docid; std::vector m_bias; + // std::map m_bias; public: From faf7b51fb7ad8e382c751c832de74fda745a2f57 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Fri, 26 Jun 2015 00:01:00 +0100 Subject: [PATCH 086/286] daily automatic beautifier --- misc/pruneGeneration.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/misc/pruneGeneration.cpp b/misc/pruneGeneration.cpp index e436263e9..d58c10ebd 100644 --- a/misc/pruneGeneration.cpp +++ b/misc/pruneGeneration.cpp @@ -31,7 +31,7 @@ int main(int argc, char **argv) // loop thru each file in directory fs::directory_iterator end_iter; - for( fs::directory_iterator dir_iter(dir) ; dir_iter != end_iter ; ++dir_iter) { + for( fs::directory_iterator dir_iter(dir) ; dir_iter != end_iter ; ++dir_iter) { if (fs::is_regular_file(dir_iter->status())) { fs::path currPath = *dir_iter; string currPathStr = currPath.native(); @@ -46,15 +46,15 @@ int main(int argc, char **argv) string suffix = currFileStr.substr(fileStemStrSize, currFileStr.size() - fileStemStrSize); string outPath = outPathStem + suffix; cerr << "PRUNING " << currPathStr << " TO " << outPath << endl; - + Moses::InputFileStream inStrme(currPathStr); Moses::OutputFileStream outStrme(outPath); Process(limit, inStrme, outStrme); - + } } } - + cerr << "Finished" << endl; } From ca5485264196fbc79e4f478e1937e95c170645e8 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 26 Jun 2015 11:37:35 +0400 Subject: [PATCH 087/286] tighten up extract-parallel on osx. Can now use gsplit and bsd split --- scripts/generic/extract-parallel.perl | 38 ++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index 3240f24eb..226dbeb6e 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/env perl # # This file is part of moses. Its use is licensed under the GNU Lesser General # Public License version 2.1 or, at your option, any later version. @@ -15,8 +15,7 @@ sub systemCheck($); sub NumStr($); sub DigitStr($); sub CharStr($); - -my $is_osx = ($^O eq "darwin"); +sub GetSplitVersion($); my $alph = "abcdefghijklmnopqrstuvwxyz"; my @alph = (split(//,$alph)); @@ -42,7 +41,7 @@ my $baselineExtract; my $glueFile; my $phraseOrientation = 0; my $phraseOrientationPriorsFile; -my $splitCmdOption="-d"; +my $splitCmdOption = ""; my $GZIP_EXEC; if(`which pigz`) { @@ -53,6 +52,15 @@ else { } print STDERR "using $GZIP_EXEC \n"; +my $isBSDSplit = GetSplitVersion($splitCmd); +print STDERR "isBSDSplit=$isBSDSplit \n"; + +if ($isBSDSplit == 0) { + $splitCmdOption .= "-d"; +} + +my $gzOut = 0; + for (my $i = 8; $i < $#ARGV + 1; ++$i) { $makeTTable = 0 if $ARGV[$i] eq "--NoTTable"; @@ -73,11 +81,15 @@ for (my $i = 8; $i < $#ARGV + 1; ++$i) $phraseOrientationPriorsFile = $ARGV[++$i]; next; } - $splitCmdOption="",next if $ARGV[$i] eq "--NoNumericSuffix"; + if ($ARGV[$i] eq '--GZOutput') { + $gzOut = 1; + } $otherExtractArgs .= $ARGV[$i] ." 
"; } +die("Need to specify --GZOutput for parallel extract") if ($gzOut == 0); + my $cmd; my $TMPDIR=dirname($extract) ."/tmp.$$"; $cmd = "mkdir -p $TMPDIR; ls -l $TMPDIR"; @@ -272,7 +284,7 @@ if ($phraseOrientation && defined($phraseOrientationPriorsFile)) { # delete temporary files $cmd = "rm -rf $TMPDIR \n"; -`$cmd`; +systemCheck($cmd); print STDERR "Finished ".localtime() ."\n"; @@ -352,10 +364,22 @@ sub CharStr($) sub NumStr($) { my $i = shift; - if ($is_osx){ + if ($isBSDSplit){ return CharStr($i); }else{ return DigitStr($i); } } +sub GetSplitVersion($) +{ + my $splitCmd = shift; + my $retVal = system("$splitCmd -h"); + if ($retVal != 0) { + return 1; + } + else { + return 0; + } +} + From 57e213ed190a15ebfbc193e9eeb525813e92cc1a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 26 Jun 2015 12:18:21 +0400 Subject: [PATCH 088/286] tighten up extract-parallel on osx. Can now use gsplit and bsd split --- scripts/generic/extract-parallel.perl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index 226dbeb6e..2424c1bd2 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -374,7 +374,7 @@ sub NumStr($) sub GetSplitVersion($) { my $splitCmd = shift; - my $retVal = system("$splitCmd -h"); + my $retVal = system("$splitCmd --help"); if ($retVal != 0) { return 1; } From 82edbb98a7aa9186287f8f00dfcbbeb2906e7a5a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 28 Jun 2015 10:40:43 +0400 Subject: [PATCH 089/286] comments in ini file about default weights --- scripts/training/train-model.perl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index b693d774d..4d73ef4ee 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -2358,6 +2358,8 @@ sub create_ini { print INI "\n# dense weights for feature functions\n"; print INI "[weight]\n"; + print INI "# The default weights are NOT optimized for translation quality. You MUST tune the weights.\n"; + print INI "# Documentation for tuning is here: http://www.statmt.org/moses/?n=FactoredTraining.Tuning \n"; print INI "UnknownWordPenalty0= 1\n"; print INI "WordPenalty0= -1\n"; print INI "PhrasePenalty0= 0.2\n"; From f66beabf4f0dca33a6bbcc37072811e9017e19b5 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 28 Jun 2015 14:03:54 +0400 Subject: [PATCH 090/286] Generation error in EMS due to pruning. Lets see if this works. 
--- scripts/ems/experiment.meta | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 110ab39b7..ee6b188e8 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -743,7 +743,7 @@ generation-prune in: generation-table out: generation-table-pruned rerun-on-change: TRAINING:prune-generation - pass-unless: AND TRAINING:prune-generation + ignore-unless: AND TRAINING:prune-generation default-name: model/generation-table-pruned final-model: yes template: $TRAINING:prune-generation IN OUT From f7c3d00824e1664ba0cbfbc80ff94a82f3eb7561 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 28 Jun 2015 22:20:42 +0400 Subject: [PATCH 091/286] more testing of c++11 waters --- phrase-extract/ScoreFeatureTest.cpp | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index cc22f8630..0ed2f71e6 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -26,6 +26,7 @@ #include #include +#include using namespace MosesTraining; using namespace std; @@ -81,6 +82,16 @@ static void checkDomainConfigured( BOOST_CHECK(manager.includeSentenceId()); } +template +T adder(T v) { + return v; +} + +template +T adder(T first, Args... args) { + return first + adder(args...); +} + BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured @@ -102,8 +113,23 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) s.insert(4); s.insert(1); -for (auto i: s) { + for (auto i: s) { cerr << i << " "; } + + unordered_map m; + m["a"] = 4; + m["ba"] = 6; + m["aabc"] = 7; + + for (auto i: m) { + cerr << i.first << "=" << i.second << " "; + } + + long sum = adder(1, 2, 3, 8, 7); + + std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy"; + std::string ssum = adder(s1, s2, s3, s4); + } From fba4a3e24da01a01088c95c8c85f71d551ba4634 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Mon, 29 Jun 2015 00:00:54 +0100 Subject: [PATCH 092/286] daily automatic beautifier --- phrase-extract/ScoreFeatureTest.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 0ed2f71e6..94a5a0480 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -83,12 +83,14 @@ static void checkDomainConfigured( } template -T adder(T v) { +T adder(T v) +{ return v; } template -T adder(T first, Args... args) { +T adder(T first, Args... args) +{ return first + adder(args...); } @@ -113,7 +115,7 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) s.insert(4); s.insert(1); - for (auto i: s) { +for (auto i: s) { cerr << i << " "; } @@ -121,15 +123,15 @@ BOOST_AUTO_TEST_CASE(manager_config_domain) m["a"] = 4; m["ba"] = 6; m["aabc"] = 7; - - for (auto i: m) { + +for (auto i: m) { cerr << i.first << "=" << i.second << " "; } - + long sum = adder(1, 2, 3, 8, 7); - std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy"; - std::string ssum = adder(s1, s2, s3, s4); + std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy"; + std::string ssum = adder(s1, s2, s3, s4); } From 5e81e4b9c37bcfe4f7828ca16bb03c28cbf4f491 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Mon, 29 Jun 2015 12:23:53 +0700 Subject: [PATCH 093/286] Simplify unnecessarily complicated condition. 
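The change below is behaviour-preserving: when RecombineCompare() returns 0, (ret < 0) is already false, so the early-return branch adds nothing. Reduced to a stand-alone pair of functions (illustrative only, not the Moses sources):

// Before: fold the three-way comparison result into a bool in two steps.
bool hypoLessVerbose(int ret)
{
  if (ret != 0)
    return (ret < 0);
  return false;          // ret == 0  ->  not "less than"
}

// After: the same truth table in a single expression.
bool hypoLessSimple(int ret)
{
  return (ret < 0);      // ret == 0 still yields false
}

Both functions return identical values for every possible ret, which is why the comparator can collapse to a single return statement.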
--- moses/ChartHypothesisCollection.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/moses/ChartHypothesisCollection.h b/moses/ChartHypothesisCollection.h index 37cd907d9..b2464e151 100644 --- a/moses/ChartHypothesisCollection.h +++ b/moses/ChartHypothesisCollection.h @@ -52,11 +52,7 @@ public: // shouldn't be mixing hypos with different lhs assert(hypoA->GetTargetLHS() == hypoB->GetTargetLHS()); - int ret = hypoA->RecombineCompare(*hypoB); - if (ret != 0) - return (ret < 0); - - return false; + return (hypoA->RecombineCompare(*hypoB) < 0); } }; From 6aa6131b844e5e08b4644692fa48c4d85fd2cf81 Mon Sep 17 00:00:00 2001 From: Nicola Bertoldi Date: Mon, 29 Jun 2015 09:30:26 +0200 Subject: [PATCH 094/286] introduced flag for the existence of TtaskSptr in TargetPhrase --- moses/LM/Base.cpp | 6 +++++- moses/StaticData.cpp | 4 +++- moses/TargetPhrase.cpp | 23 +++++++++++++++-------- moses/TargetPhrase.h | 11 ++++++----- 4 files changed, 29 insertions(+), 15 deletions(-) diff --git a/moses/LM/Base.cpp b/moses/LM/Base.cpp index 9e74b0d81..8b69626ac 100644 --- a/moses/LM/Base.cpp +++ b/moses/LM/Base.cpp @@ -78,7 +78,11 @@ void LanguageModel::EvaluateInIsolation(const Phrase &source float fullScore, nGramScore; size_t oovCount; - CalcScoreWithContext(targetPhrase.GetTtask(), targetPhrase, fullScore, nGramScore, oovCount); + if (targetPhrase.HasTtaskSPtr()){ + CalcScoreWithContext(targetPhrase.GetTtask(), targetPhrase, fullScore, nGramScore, oovCount); + }else{ + CalcScore(targetPhrase, fullScore, nGramScore, oovCount); + } //CalcScore(targetPhrase, fullScore, nGramScore, oovCount); float estimateScore = fullScore - nGramScore; diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 28d9f7831..83da42a9e 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -1115,7 +1115,9 @@ void StaticData::LoadSparseWeightsFromConfig() } std::map > weights = m_parameter->GetAllWeights(); - for (auto iter = weights.begin(); iter != weights.end(); ++iter) { + std::map >::iterator iter; +// for (auto iter = weights.begin(); iter != weights.end(); ++iter) { + for (iter = weights.begin(); iter != weights.end(); ++iter) { // this indicates that it is sparse feature if (featureNames.find(iter->first) == featureNames.end()) { UTIL_THROW_IF2(iter->second.size() != 1, "ERROR: only one weight per sparse feature allowed: " << iter->first); diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index 9151f4bb9..893edca08 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -48,10 +48,9 @@ TargetPhrase::TargetPhrase( std::string out_string, const PhraseDictionary *pt) , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) , m_lhsTarget(NULL) , m_ruleSource(NULL) + , m_ttask_flag(false) , m_container(pt) - , m_ttask(NULL) { - //ACAT const StaticData &staticData = StaticData::Instance(); // XXX should this really be InputFactorOrder??? 
@@ -68,8 +67,9 @@ TargetPhrase::TargetPhrase(ttasksptr& ttask, std::string out_string, const Phras , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) , m_lhsTarget(NULL) , m_ruleSource(NULL) - , m_container(pt) , m_ttask(ttask) + , m_ttask_flag(true) + , m_container(pt) { //ACAT @@ -88,8 +88,9 @@ TargetPhrase::TargetPhrase(ttasksptr& ttask, const PhraseDictionary *pt) , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) , m_lhsTarget(NULL) , m_ruleSource(NULL) - , m_container(pt) , m_ttask(ttask) + , m_ttask_flag(true) + , m_container(pt) { } @@ -101,8 +102,9 @@ TargetPhrase::TargetPhrase(ttasksptr& ttask, const Phrase &phrase, const PhraseD , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) , m_lhsTarget(NULL) , m_ruleSource(NULL) - , m_container(pt) , m_ttask(ttask) + , m_ttask_flag(true) + , m_container(pt) { } @@ -114,8 +116,8 @@ TargetPhrase::TargetPhrase(const PhraseDictionary *pt) , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) , m_lhsTarget(NULL) , m_ruleSource(NULL) + , m_ttask_flag(false) , m_container(pt) - , m_ttask(NULL) { } @@ -127,8 +129,8 @@ TargetPhrase::TargetPhrase(const Phrase &phrase, const PhraseDictionary *pt) , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) , m_lhsTarget(NULL) , m_ruleSource(NULL) + , m_ttask_flag(false) , m_container(pt) - , m_ttask(NULL) { } @@ -141,8 +143,9 @@ TargetPhrase::TargetPhrase(const TargetPhrase ©) , m_alignTerm(copy.m_alignTerm) , m_alignNonTerm(copy.m_alignNonTerm) , m_properties(copy.m_properties) - , m_container(copy.m_container) , m_ttask(copy.m_ttask) + , m_ttask_flag(true) + , m_container(copy.m_container) { if (copy.m_lhsTarget) { m_lhsTarget = new Word(*copy.m_lhsTarget); @@ -174,6 +177,10 @@ void TargetPhrase::WriteToRulePB(hgmert::Rule* pb) const } #endif +bool TargetPhrase::HasTtaskSPtr() const { + return m_ttask_flag; +} + const ttasksptr& TargetPhrase::GetTtask() const { return m_ttask; } diff --git a/moses/TargetPhrase.h b/moses/TargetPhrase.h index 63b8f01fd..4e6b1278b 100644 --- a/moses/TargetPhrase.h +++ b/moses/TargetPhrase.h @@ -53,16 +53,16 @@ class PhraseDictionary; class TargetPhrase: public Phrase { public: - typedef std::map > - ScoreCache_t; + typedef std::map > ScoreCache_t; ScoreCache_t const& GetExtraScores() const; Scores const* GetExtraScores(FeatureFunction const* ff) const; - void SetExtraScores(FeatureFunction const* ff, - boost::shared_ptr const& scores); - ttasksptr m_ttask; + void SetExtraScores(FeatureFunction const* ff,boost::shared_ptr const& scores); + private: ScoreCache_t m_cached_scores; + ttasksptr m_ttask; + bool m_ttask_flag; private: friend std::ostream& operator<<(std::ostream&, const TargetPhrase&); @@ -93,6 +93,7 @@ public: TargetPhrase(ttasksptr &ttask, std::string out_string, const PhraseDictionary *pt = NULL); explicit TargetPhrase(ttasksptr &ttask, const Phrase &targetPhrase, const PhraseDictionary *pt); const ttasksptr& GetTtask() const; + bool HasTtaskSPtr() const; ~TargetPhrase(); From a374706bd4a995aa810b748f122b2d6279866088 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Wed, 1 Jul 2015 00:00:59 +0100 Subject: [PATCH 095/286] daily automatic beautifier --- moses/server/TranslationRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index 3848f81ba..bc2b5032b 100644 --- a/moses/server/TranslationRequest.cpp +++ 
b/moses/server/TranslationRequest.cpp @@ -1,5 +1,5 @@ #include "TranslationRequest.h" -#include "moses/ContextScope.h" +#include "moses/ContextScope.h" #include namespace MosesServer From 81f337bcd838a69bf0e275c8138b173427a17d02 Mon Sep 17 00:00:00 2001 From: hieu Date: Wed, 1 Jul 2015 09:42:07 +0400 Subject: [PATCH 096/286] rollback c++11 for now --- Jamroot | 2 +- moses/StaticData.cpp | 3 ++- phrase-extract/ScoreFeatureTest.cpp | 33 ++++++++++++++++------------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/Jamroot b/Jamroot index 4f76ec3ba..b3544274b 100644 --- a/Jamroot +++ b/Jamroot @@ -108,7 +108,7 @@ external-lib z ; #lib dl : : static:static shared:shared ; #requirements += dl ; -requirements += -std=c++0x ; +#requirements += -std=c++0x ; if ! [ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] { if [ option.get "full-tcmalloc" : : "yes" ] { diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 28d9f7831..281129a2e 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -1115,7 +1115,8 @@ void StaticData::LoadSparseWeightsFromConfig() } std::map > weights = m_parameter->GetAllWeights(); - for (auto iter = weights.begin(); iter != weights.end(); ++iter) { + std::map >::iterator iter; + for (iter = weights.begin(); iter != weights.end(); ++iter) { // this indicates that it is sparse feature if (featureNames.find(iter->first) == featureNames.end()) { UTIL_THROW_IF2(iter->second.size() != 1, "ERROR: only one weight per sparse feature allowed: " << iter->first); diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 94a5a0480..9537b970f 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -24,9 +24,10 @@ #define BOOST_TEST_MODULE MosesTrainingScoreFeature #include #include +#include -#include -#include +//#include +//#include using namespace MosesTraining; using namespace std; @@ -54,16 +55,16 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - manager.configure( {"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), + manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), + manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--SparseDomainBlah","/dev/null"}), + manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--DomainSubset"}), + manager.configure(boost::assign::list_of("--DomainSubset")), ScoreFeatureArgumentException); } @@ -97,25 +98,27 @@ T adder(T first, Args... 
args) BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - ( {"--DomainRatio","/dev/null"}); + (boost::assign::list_of("--DomainRatio")("/dev/null")); checkDomainConfigured - ( {"--DomainIndicator","/dev/null"}); + (boost::assign::list_of("--DomainIndicator")("/dev/null")); checkDomainConfigured - ( {"--DomainSubset","/dev/null"}); + (boost::assign::list_of("--DomainSubset")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainRatio","/dev/null"}); + (boost::assign::list_of("--SparseDomainRatio")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainIndicator","/dev/null"}); + (boost::assign::list_of("--SparseDomainIndicator")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainSubset","/dev/null"}); + (boost::assign::list_of("--SparseDomainSubset")("/dev/null")); + /* + // C++11 testing unordered_set s; s.insert(4); s.insert(7); s.insert(4); s.insert(1); -for (auto i: s) { + for (auto i: s) { cerr << i << " "; } @@ -124,7 +127,7 @@ for (auto i: s) { m["ba"] = 6; m["aabc"] = 7; -for (auto i: m) { + for (auto i: m) { cerr << i.first << "=" << i.second << " "; } @@ -132,6 +135,6 @@ for (auto i: m) { std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy"; std::string ssum = adder(s1, s2, s3, s4); - + */ } From 851a801c64eab2fc5ec77c6e8eed2cbf6a6a8bfe Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 1 Jul 2015 14:01:57 +0100 Subject: [PATCH 097/286] Bug fix in copy constructor of TargetPhrase: m_ttask_flag wasn't copied but always set to true. --- moses/TargetPhrase.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index 893edca08..c0e0b3e41 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -144,7 +144,7 @@ TargetPhrase::TargetPhrase(const TargetPhrase ©) , m_alignNonTerm(copy.m_alignNonTerm) , m_properties(copy.m_properties) , m_ttask(copy.m_ttask) - , m_ttask_flag(true) + , m_ttask_flag(copy.m_ttask_flag) , m_container(copy.m_container) { if (copy.m_lhsTarget) { From 106668d754ca66aa97a4a7ec1b6edf5a9964d688 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 1 Jul 2015 14:48:25 +0100 Subject: [PATCH 098/286] Changed m_ttask from ttasksptr to ttaskwptr in TargetPhrase ... .. to avoid blocking destruction of TranslationTask at the end of its lifetime. 
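For reference, the lifetime behaviour this relies on, as a minimal boost sketch (Task below is a made-up stand-in for TranslationTask, not Moses code): a weak_ptr never contributes to the reference count, and lock() returns an empty shared_ptr once the last real owner is gone, so a long-lived TargetPhrase can observe the task without keeping it alive.

    // Illustrative only: shared_ptr ownership vs. weak_ptr observation.
    #include <boost/shared_ptr.hpp>
    #include <boost/weak_ptr.hpp>
    #include <iostream>

    struct Task { ~Task() { std::cout << "Task destroyed\n"; } };

    int main() {
      boost::weak_ptr<Task> observer;             // does not own the Task
      {
        boost::shared_ptr<Task> owner(new Task);  // sole owner
        observer = owner;
        std::cout << "alive: " << (observer.lock() ? "yes" : "no") << "\n";
      }                                           // owner leaves scope; the Task is
                                                  // destroyed here because the weak_ptr
                                                  // adds no reference
      // lock() now yields an empty shared_ptr instead of prolonging the lifetime
      std::cout << "alive: " << (observer.lock() ? "yes" : "no") << "\n";
      return 0;
    }
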
--- moses/TargetPhrase.cpp | 2 +- moses/TargetPhrase.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index c0e0b3e41..fc7af9687 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -182,7 +182,7 @@ bool TargetPhrase::HasTtaskSPtr() const { } const ttasksptr& TargetPhrase::GetTtask() const { - return m_ttask; + return m_ttask.lock(); } void TargetPhrase::EvaluateInIsolation(const Phrase &source) diff --git a/moses/TargetPhrase.h b/moses/TargetPhrase.h index 4e6b1278b..460dcc33f 100644 --- a/moses/TargetPhrase.h +++ b/moses/TargetPhrase.h @@ -61,7 +61,7 @@ public: private: ScoreCache_t m_cached_scores; - ttasksptr m_ttask; + ttaskwptr m_ttask; bool m_ttask_flag; private: From 86292f2ce332013c187afd8046a9eeec2770561e Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Thu, 2 Jul 2015 00:01:16 +0100 Subject: [PATCH 099/286] daily automatic beautifier --- moses/StaticData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 281129a2e..8fb88c257 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -1115,7 +1115,7 @@ void StaticData::LoadSparseWeightsFromConfig() } std::map > weights = m_parameter->GetAllWeights(); - std::map >::iterator iter; + std::map >::iterator iter; for (iter = weights.begin(); iter != weights.end(); ++iter) { // this indicates that it is sparse feature if (featureNames.find(iter->first) == featureNames.end()) { From 515862ee1c1d89f604759cb5cefa7eeb452a3fba Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 2 Jul 2015 01:31:11 +0100 Subject: [PATCH 100/286] Reformatting for readability. --- util/probing_hash_table.hh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/util/probing_hash_table.hh b/util/probing_hash_table.hh index f32b64ea3..f4192577b 100644 --- a/util/probing_hash_table.hh +++ b/util/probing_hash_table.hh @@ -92,7 +92,8 @@ template GetKey()); if (equal_(got, t.GetKey())) { out = i; return true; } if (equal_(got, invalid_)) { - UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); + UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, + "Hash table with " << buckets_ << " buckets is full."); *i = t; out = i; return false; From e94921dc442033a87b7cf4d712db1dcfbc9a003f Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 2 Jul 2015 01:32:34 +0100 Subject: [PATCH 101/286] Removal of 'using namespace ...' from several header files. 
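A self-contained illustration of the failure mode that header-scope using-directives invite (toy types only, not Moses code): once a header dumps namespace std into every includer, any client name that collides with a standard name stops compiling, which is why the headers below now qualify std:: explicitly.

    // Illustrative only. With the using-directive below un-commented, the
    // unqualified name 'map' in main() would refer to both ::map and std::map
    // and the declaration of 'm' would fail to compile as ambiguous.
    #include <map>
    #include <string>

    // Imagine this line living in a widely included header:
    // using namespace std;             // the kind of line being removed

    // Client code that happens to define its own type called 'map':
    struct map { int docid; std::string docname; };

    int main() {
      map m;                            // unambiguously ::map only because the
      m.docid = 0;                      // using-directive stays out of header scope
      m.docname = "doc0";
      std::map<std::string, int> ids;   // the standard container, fully qualified
      ids[m.docname] = m.docid;
      return static_cast<int>(ids.size()) - 1;   // exit code 0
    }
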
--- .beautify-ignore | 1 + Jamroot | 2 +- moses/BaseManager.cpp | 2 +- moses/BaseManager.h | 2 +- moses/StaticData.cpp | 3 +- moses/TranslationModel/UG/bitext-find.cc | 27 ++++---- moses/TranslationModel/UG/mm/tpt_pickler.h | 12 ++-- moses/TranslationModel/UG/mm/tpt_tightindex.h | 10 +-- moses/TranslationModel/UG/mm/tpt_tokenindex.h | 68 +++++++++---------- moses/TranslationModel/UG/mm/ug_bitext.h | 65 ++++++++++-------- .../TranslationModel/UG/mm/ug_bitext_agenda.h | 4 +- .../UG/mm/ug_bitext_agenda_job.h | 10 +-- .../UG/mm/ug_bitext_agenda_worker.h | 12 ++-- .../UG/mm/ug_bitext_jstats.cc | 6 +- .../TranslationModel/UG/mm/ug_bitext_jstats.h | 20 +++--- .../UG/mm/ug_bitext_pstats.cc | 2 +- .../TranslationModel/UG/mm/ug_bitext_pstats.h | 2 +- .../UG/mm/ug_conll_bottom_up_token.h | 2 +- .../TranslationModel/UG/mm/ug_conll_record.cc | 2 +- .../TranslationModel/UG/mm/ug_conll_record.h | 4 +- .../TranslationModel/UG/mm/ug_corpus_token.cc | 2 +- .../TranslationModel/UG/mm/ug_corpus_token.h | 2 +- moses/TranslationModel/UG/mm/ug_deptree.h | 14 ++-- moses/TranslationModel/UG/mm/ug_im_bitext.cc | 4 +- moses/TranslationModel/UG/mm/ug_im_bitext.h | 6 +- moses/TranslationModel/UG/mm/ug_im_tsa.h | 44 ++++++------ moses/TranslationModel/UG/mm/ug_im_ttrack.h | 48 ++++++------- .../UG/mm/ug_lexical_phrase_scorer1.h | 26 +++---- .../UG/mm/ug_lexical_phrase_scorer2.h | 16 ++--- moses/TranslationModel/UG/mm/ug_lru_cache.h | 10 +-- moses/TranslationModel/UG/mm/ug_mm_2d_table.h | 40 +++++------ moses/TranslationModel/UG/mm/ug_mm_bitext.h | 4 +- moses/TranslationModel/UG/mm/ug_mm_tsa.h | 16 ++--- moses/TranslationModel/UG/mm/ug_mm_ttrack.h | 39 +++++------ moses/TranslationModel/UG/mm/ug_mmbitext.h | 22 +++--- moses/TranslationModel/UG/mm/ug_phrasepair.h | 17 +++-- .../TranslationModel/UG/mm/ug_sampling_bias.h | 2 +- moses/TranslationModel/UG/mm/ug_tsa_base.h | 60 ++++++++-------- .../UG/mm/ug_tsa_bitset_cache.h | 20 +++--- .../UG/mm/ug_tsa_tree_iterator.h | 66 +++++++++--------- moses/TranslationModel/UG/mm/ug_ttrack_base.h | 46 ++++++------- .../UG/mm/ug_ttrack_position.h | 6 +- moses/TranslationModel/UG/mm/ug_typedefs.h | 26 +++---- moses/TranslationModel/UG/mmsapt.cpp | 3 +- moses/TranslationModel/UG/mmsapt.h | 2 +- moses/TranslationModel/UG/sapt_pscore_lex1.h | 2 +- moses/TranslationModel/UG/sapt_pscore_pbwd.h | 2 +- moses/TranslationModel/UG/sapt_pscore_pfwd.h | 2 +- .../UG/sapt_pscore_unaligned.h | 2 +- moses/server/TranslationRequest.cpp | 2 +- phrase-extract/ScoreFeatureTest.cpp | 33 +++++---- 51 files changed, 432 insertions(+), 408 deletions(-) diff --git a/.beautify-ignore b/.beautify-ignore index b05524e1d..ef4c2b762 100644 --- a/.beautify-ignore +++ b/.beautify-ignore @@ -21,6 +21,7 @@ mingw/MosesGUI/icons_rc.py mingw/MosesGUI/Ui_credits.py mingw/MosesGUI/Ui_mainWindow.py moses/TranslationModel/UG +moses/server phrase-extract/pcfg-common phrase-extract/syntax-common randlm diff --git a/Jamroot b/Jamroot index 4f76ec3ba..b3544274b 100644 --- a/Jamroot +++ b/Jamroot @@ -108,7 +108,7 @@ external-lib z ; #lib dl : : static:static shared:shared ; #requirements += dl ; -requirements += -std=c++0x ; +#requirements += -std=c++0x ; if ! 
[ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] { if [ option.get "full-tcmalloc" : : "yes" ] { diff --git a/moses/BaseManager.cpp b/moses/BaseManager.cpp index 609a6e9f5..211da8c9b 100644 --- a/moses/BaseManager.cpp +++ b/moses/BaseManager.cpp @@ -27,7 +27,7 @@ BaseManager::GetSource() const return m_source; } -const ttasksptr& +const ttasksptr BaseManager::GetTtask() const { return m_ttask.lock(); } diff --git a/moses/BaseManager.h b/moses/BaseManager.h index 7367997ad..f4c7eeff2 100644 --- a/moses/BaseManager.h +++ b/moses/BaseManager.h @@ -50,7 +50,7 @@ public: //! the input sentence being decoded const InputType& GetSource() const; - const ttasksptr& GetTtask() const; + const ttasksptr GetTtask() const; virtual void Decode() = 0; // outputs diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 83da42a9e..281129a2e 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -1115,8 +1115,7 @@ void StaticData::LoadSparseWeightsFromConfig() } std::map > weights = m_parameter->GetAllWeights(); - std::map >::iterator iter; -// for (auto iter = weights.begin(); iter != weights.end(); ++iter) { + std::map >::iterator iter; for (iter = weights.begin(); iter != weights.end(); ++iter) { // this indicates that it is sparse feature if (featureNames.find(iter->first) == featureNames.end()) { diff --git a/moses/TranslationModel/UG/bitext-find.cc b/moses/TranslationModel/UG/bitext-find.cc index 18cc6e0fa..0e94464ba 100644 --- a/moses/TranslationModel/UG/bitext-find.cc +++ b/moses/TranslationModel/UG/bitext-find.cc @@ -85,24 +85,25 @@ int main(int argc, char* argv[]) ++k; size_t s1,s2,e1,e2; int po_fwd=-1,po_bwd=-1; - vector caln; - // cout << sid << " " << B.docname(sid) << endl; + std::vector caln; + // cout << sid << " " << B.docname(sid) << std::endl; if (!B.find_trg_phr_bounds(sid, off, off+m.size(), s1,s2,e1,e2,po_fwd,po_bwd, &caln, NULL, &m == &m2)) { - // cout << "alignment failure" << endl; + // cout << "alignment failure" << std::endl; } - cout << sid << " " << B.docname(sid) - << " dfwd=" << po_fwd << " dbwd=" << po_bwd - << "\n"; - write_sentence(*B.T1, sid, *B.V1, cout); cout << "\n"; - write_sentence(*B.T2, sid, *B.V2, cout); cout << "\n"; + std::cout << sid << " " << B.docname(sid) + << " dfwd=" << po_fwd << " dbwd=" << po_bwd + << "\n"; + + write_sentence(*B.T1, sid, *B.V1, std::cout); std::cout << "\n"; + write_sentence(*B.T2, sid, *B.V2, std::cout); std::cout << "\n"; B.write_yawat_alignment(sid, m1.size() ? &m1 : NULL, - m2.size() ? &m2 : NULL, cout); - cout << endl; + m2.size() ? &m2 : NULL, std::cout); + std::cout << std::endl; } } @@ -141,9 +142,9 @@ interpret_args(int ac, char* av[]) po::notify(vm); if (vm.count("help")) { - cout << "\nusage:\n\t" << av[0] - << " [options] [--q1=] [--q2=]" << endl; - cout << o << endl; + std::cout << "\nusage:\n\t" << av[0] + << " [options] [--q1=] [--q2=]" << std::endl; + std::cout << o << std::endl; exit(0); } } diff --git a/moses/TranslationModel/UG/mm/tpt_pickler.h b/moses/TranslationModel/UG/mm/tpt_pickler.h index 5ac71c16d..5ae033151 100644 --- a/moses/TranslationModel/UG/mm/tpt_pickler.h +++ b/moses/TranslationModel/UG/mm/tpt_pickler.h @@ -3,10 +3,10 @@ #ifndef __Pickler #define __Pickler -#include -#include -#include -#include +#include +#include +#include +#include #include "tpt_typedefs.h" #include "num_read_write.h" #include @@ -20,7 +20,7 @@ namespace ugdiss /** * The following functions write and read data in a compact binary * representation. 
Write and read errors can be checked directly - * on the ostream object after the function call, so no return value is + * on the std::ostream object after the function call, so no return value is * necessary.*/ void binwrite(std::ostream& out, char data); void binwrite(std::ostream& out, unsigned char data); @@ -165,7 +165,7 @@ namespace ugdiss binread(in,k); binread(in,v); data[k] = v; - // cerr << "* " << i << " " << k << " " << v << endl; + // cerr << "* " << i << " " << k << " " << v << std::endl; } } diff --git a/moses/TranslationModel/UG/mm/tpt_tightindex.h b/moses/TranslationModel/UG/mm/tpt_tightindex.h index 967215aeb..96e3614ad 100644 --- a/moses/TranslationModel/UG/mm/tpt_tightindex.h +++ b/moses/TranslationModel/UG/mm/tpt_tightindex.h @@ -12,7 +12,7 @@ #include "tpt_typedefs.h" // #include #include -using namespace std; +// // using namespace std; #ifndef uchar #endif @@ -29,7 +29,7 @@ namespace ugdiss { // void tightwritex(iostream& out, size_t data, bool flag); void - tightwrite(std::ostream& out, ::uint64_t data, bool flag); + tightwrite(std::ostream& out, uint64_t data, bool flag); filepos_type tightread(std::istream& in, std::ios::pos_type stop); @@ -91,7 +91,7 @@ namespace ugdiss tightread4(char const* start, char const* stop, uint32_t& dest); char const* - tightread8(char const* start, char const* stop, ::uint64_t& dest); + tightread8(char const* start, char const* stop, uint64_t& dest); template char const* @@ -102,13 +102,13 @@ namespace ugdiss if (sizeof(numType)==4) return tightread4(start,stop,reinterpret_cast(dest)); else if (sizeof(numType)==8) - return tightread8(start,stop,reinterpret_cast(dest)); + return tightread8(start,stop,reinterpret_cast(dest)); assert(0); return NULL; } // char const* -// tightread(char const* start, char const* stop, ::uint64_t& dest); +// tightread(char const* start, char const* stop, uint64_t& dest); // char const* // tightread(char const* start, char const* stop, filepos_type& dest); diff --git a/moses/TranslationModel/UG/mm/tpt_tokenindex.h b/moses/TranslationModel/UG/mm/tpt_tokenindex.h index 9f7c69b3e..2642bdd2f 100644 --- a/moses/TranslationModel/UG/mm/tpt_tokenindex.h +++ b/moses/TranslationModel/UG/mm/tpt_tokenindex.h @@ -20,7 +20,7 @@ #include #include -using namespace std; +// // using namespace std; namespace bio=boost::iostreams; namespace ugdiss @@ -28,9 +28,9 @@ namespace ugdiss class TokenIndex { /** Reverse index: maps from ID to char const* */ - mutable vector ridx; + mutable std::vector ridx; /** Label for the UNK token */ - string unkLabel; + std::string unkLabel; id_type unkId,numTokens; /// New 2013-09-02: thread-safe @@ -38,8 +38,8 @@ namespace ugdiss // NEW 2011-01-30: dynamic adding of unknown items bool dynamic; // dynamically assign a new word id to unknown items? - boost::shared_ptr > str2idExtra; - boost::shared_ptr > newWords; + boost::shared_ptr > str2idExtra; + boost::shared_ptr > newWords; // The use of pointers to external items is a bit of a bad hack // in terms of the semantic of TokenIndex const: since external items // are changed, the TokenIndex instance remains unchanged and const works, @@ -48,7 +48,7 @@ namespace ugdiss // thread-safe! 
public: - /** string->ID lookup works via binary search in a vector of Entry instances */ + /** string->ID lookup works via binary search in a std::vector of Entry instances */ class Entry { public: @@ -69,26 +69,26 @@ namespace ugdiss Entry const* startIdx; Entry const* endIdx; CompFunc comp; - TokenIndex(string unkToken="UNK"); - // TokenIndex(string fname,string unkToken="UNK",bool dyna=false); - void open(string fname,string unkToken="UNK",bool dyna=false); + TokenIndex(std::string unkToken="UNK"); + // TokenIndex(std::string fname,std::string unkToken="UNK",bool dyna=false); + void open(std::string fname,std::string unkToken="UNK",bool dyna=false); void close(); // id_type unkId,numTokens; id_type operator[](char const* w) const; - id_type operator[](string const& w) const; + id_type operator[](std::string const& w) const; char const* const operator[](id_type id) const; char const* const operator[](id_type id); - vector reverseIndex() const; + std::vector reverseIndex() const; - string toString(vector const& v); - string toString(vector const& v) const; + std::string toString(std::vector const& v); + std::string toString(std::vector const& v) const; - string toString(id_type const* start, id_type const* const stop); - string toString(id_type const* start, id_type const* const stop) const; + std::string toString(id_type const* start, id_type const* const stop); + std::string toString(id_type const* start, id_type const* const stop) const; - vector toIdSeq(string const& line) const; + std::vector toIdSeq(std::string const& line) const; - bool fillIdSeq(string const& line, vector & v) const; + bool fillIdSeq(std::string const& line, std::vector & v) const; void iniReverseIndex(); id_type getNumTokens() const; @@ -104,27 +104,27 @@ namespace ugdiss char const* const getUnkToken() const; - void write(string fname); // write TokenIndex to a new file + void write(std::string fname); // write TokenIndex to a new file bool isDynamic() const; bool setDynamic(bool onoff); - void setUnkLabel(string unk); + void setUnkLabel(std::string unk); }; void - write_tokenindex_to_disk(vector > const& tok, - string const& ofile, string const& unkToken); + write_tokenindex_to_disk(std::vector > const& tok, + std::string const& ofile, std::string const& unkToken); /** for sorting words by frequency */ class compWords { - string unk; + std::string unk; public: - compWords(string _unk) : unk(_unk) {}; + compWords(std::string _unk) : unk(_unk) {}; bool - operator()(pair const& A, - pair const& B) const + operator()(std::pair const& A, + std::pair const& B) const { if (A.first == unk) return false;// do we still need this special treatment? if (B.first == unk) return true; // do we still need this special treatment? @@ -136,27 +136,27 @@ namespace ugdiss template void - mkTokenIndex(string ofile,MYMAP const& M,string unkToken) + mkTokenIndex(std::string ofile,MYMAP const& M,std::string unkToken) { - // typedef pair IndexEntry; // offset and id - typedef pair Token; // token and id + // typedef std::pair IndexEntry; // offset and id + typedef std::pair Token; // token and id // first, sort the word list in decreasing order of frequency, so that we // can assign IDs in an encoding-efficient manner (high frequency. 
low ID) - vector > wcounts(M.size()); // for sorting by frequency + std::vector > wcounts(M.size()); // for sorting by frequency typedef typename MYMAP::const_iterator myIter; size_t z=0; for (myIter m = M.begin(); m != M.end(); m++) { - // cout << m->first << " " << m->second << endl; - wcounts[z++] = pair(m->first,m->second); + // cout << m->first << " " << m->second << std::endl; + wcounts[z++] = std::pair(m->first,m->second); } compWords compFunc(unkToken); sort(wcounts.begin(),wcounts.end(),compFunc); // Assign IDs ... - vector tok(wcounts.size()); + std::vector tok(wcounts.size()); for (size_t i = 0; i < wcounts.size(); i++) tok[i] = Token(wcounts[i].first,i); // and re-sort in alphabetical order @@ -166,9 +166,9 @@ namespace ugdiss template void - fill_token_seq(TokenIndex& V, string const& line, vector& dest) + fill_token_seq(TokenIndex& V, std::string const& line, std::vector& dest) { - istringstream buf(line); string w; + std::istringstream buf(line); std::string w; while (buf>>w) dest.push_back(Token(V[w])); } } diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index fc433669c..e14cc5d3d 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -71,10 +71,19 @@ namespace Moses { class Mmsapt; namespace bitext { - using namespace ugdiss; - + // using namespace ugdiss; + using ugdiss::bitvector; + using ugdiss::Ttrack; + using ugdiss::TSA; + using ugdiss::imTSA; + using ugdiss::mmTSA; + using ugdiss::L2R_Token; + using ugdiss::SimpleWordId; + using ugdiss::imTtrack; + using ugdiss::mmTtrack; + using ugdiss::binread; float lbop(size_t const tries, size_t const succ, float const confidence); - void write_bitvector(bitvector const& v, ostream& out); + void write_bitvector(bitvector const& v, std::ostream& out); #ifndef NO_MOSES struct @@ -86,7 +95,7 @@ namespace Moses { boost::shared_mutex lock; sptr bias; sptr cache1, cache2; - ostream* bias_log; + std::ostream* bias_log; ContextForQuery() : bias_log(NULL) { } }; #endif @@ -96,10 +105,10 @@ namespace Moses { { public: typedef TKN Token; - typedef typename TSA::tree_iterator iter; + typedef typename ugdiss::TSA::tree_iterator iter; typedef typename std::vector > vec_ppair; typedef typename lru_cache::LRU_Cache pplist_cache_t; - typedef TSA tsa; + typedef ugdiss::TSA tsa; friend class Moses::Mmsapt; protected: mutable boost::shared_mutex m_lock; // for thread-safe operation @@ -112,7 +121,7 @@ namespace Moses { size_t m_pstats_cache_threshold; // threshold for caching sampling results sptr m_cache1, m_cache2; // caches for sampling results - vector m_docname; + std::vector m_docname; map m_docname2docid; // maps from doc names to ids sptr > m_sid2docid; // maps from sentences to docs (ids) @@ -141,7 +150,7 @@ namespace Moses { size_t & s1, size_t & s2, // beginning and end of target start size_t & e1, size_t & e2, // beginning and end of target end int& po_fwd, int& po_bwd, // phrase orientations - std::vector * core_alignment, // stores the core alignment + std::vector * core_alignment, // stores the core alignment bitvector* full_alignment, // stores full word alignment for this sent. 
bool const flip) const; // flip source and target (reverse lookup) @@ -190,17 +199,17 @@ namespace Moses { loadSentenceBias(string const& fname) const; sptr - SetupDocumentBias(string const& bserver, string const& text, ostream* log) const; + SetupDocumentBias(string const& bserver, string const& text, std::ostream* log) const; sptr - SetupDocumentBias(map context_weights, ostream* log) const; + SetupDocumentBias(map context_weights, std::ostream* log) const; void mark_match(Token const* start, Token const* end, iter const& m, bitvector& check) const; void write_yawat_alignment - ( id_type const sid, iter const* m1, iter const* m2, ostream& out ) const; + ( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const; string docname(id_type const sid) const; @@ -229,7 +238,7 @@ namespace Moses { size_t i = 0; float v; while (in>>v) (*ret)[i++] = v; UTIL_THROW_IF2(i != T1->size(), - "Mismatch between bias vector size and corpus size at " + "Mismatch between bias std::vector size and corpus size at " << HERE); return ret; } @@ -239,8 +248,8 @@ namespace Moses { Bitext:: toString(uint64_t pid, int isL2) const { - ostringstream buf; - uint32_t sid,off,len; parse_pid(pid,sid,off,len); + std::ostringstream buf; + uint32_t sid,off,len; ugdiss::parse_pid(pid,sid,off,len); Token const* t = (isL2 ? T2 : T1)->sntStart(sid) + off; Token const* x = t + len; TokenIndex const& V = isL2 ? *V2 : *V1; @@ -328,10 +337,10 @@ namespace Moses { size_t const start, size_t const stop, size_t & s1, size_t & s2, size_t & e1, size_t & e2, int & po_fwd, int & po_bwd, - std::vector* core_alignment, bitvector* full_alignment, + std::vector* core_alignment, bitvector* full_alignment, bool const flip) const { - // if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl; + // if (core_alignment) cout << "HAVE CORE ALIGNMENT" << std::endl; // a word on the core_alignment: // @@ -425,7 +434,7 @@ namespace Moses { sptr Bitext:: SetupDocumentBias - ( string const& bserver, string const& text, ostream* log ) const + ( string const& bserver, string const& text, std::ostream* log ) const { sptr ret; UTIL_THROW_IF2(m_sid2docid == NULL, @@ -439,7 +448,7 @@ namespace Moses { sptr Bitext:: SetupDocumentBias - ( map context_weights, ostream* log ) const + ( map context_weights, std::ostream* log ) const { sptr ret; UTIL_THROW_IF2(m_sid2docid == NULL, @@ -541,12 +550,12 @@ namespace Moses { m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0); - // convert pstats entries to phrase pairs + // convert pstats entries to phrase std::pairs pstats::trg_map_t::iterator a; for (a = m_pstats->trg.begin(); a != m_pstats->trg.end(); ++a) { uint32_t sid,off,len; - parse_pid(a->first, sid, off, len); + ugdiss::parse_pid(a->first, sid, off, len); m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second); m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1), m_pp.joint); @@ -596,16 +605,16 @@ namespace Moses { void Bitext:: write_yawat_alignment - ( id_type const sid, iter const* m1, iter const* m2, ostream& out ) const + ( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const { - vector a1(T1->sntLen(sid),-1), a2(T2->sntLen(sid),-1); + std::vector a1(T1->sntLen(sid),-1), a2(T2->sntLen(sid),-1); bitvector f1(a1.size()), f2(a2.size()); if (m1) mark_match(T1->sntStart(sid), T1->sntEnd(sid), *m1, f1); if (m2) mark_match(T2->sntStart(sid), T2->sntEnd(sid), *m2, f2); - vector > agroups; - vector grouplabel; - pair ag; + std::vector > agroups; + std::vector grouplabel; + std::pair 
ag; ag.first.resize(a1.size()); ag.second.resize(a2.size()); char const* x = Tx->sntStart(sid); @@ -670,19 +679,19 @@ namespace Moses { void expand(typename Bitext::iter const& m, Bitext const& bt, pstats const& ps, - std::vector >& dest, ostream* log) + std::vector >& dest, std::ostream* log) { bool fwd = m.root == bt.I1.get(); dest.reserve(ps.trg.size()); PhrasePair pp; pp.init(m.getPid(), !fwd, m.getToken(0), m.size(), &ps, 0); // cout << HERE << " " - // << toString(*(fwd ? bt.V1 : bt.V2), pp.start1,pp.len1) << endl; + // << toString(*(fwd ? bt.V1 : bt.V2), pp.start1,pp.len1) << std::endl; pstats::trg_map_t::const_iterator a; for (a = ps.trg.begin(); a != ps.trg.end(); ++a) { uint32_t sid,off,len; - parse_pid(a->first, sid, off, len); + ugdiss::parse_pid(a->first, sid, off, len); pp.update(a->first, (fwd ? bt.T2 : bt.T1)->sntStart(sid)+off, len, a->second); dest.push_back(pp); diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h index d07fba6aa..72e6c8638 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h @@ -76,7 +76,7 @@ void Bitext } else ++i; } - // cerr << workers.size() << "/" << target << " active" << endl; + // cerr << workers.size() << "/" << target << " active" << std::endl; if (int(workers.size()) > target) this->doomed = workers.size() - target; else @@ -132,7 +132,7 @@ Bitext ::agenda ::get_job() { - // cerr << workers.size() << " workers on record" << endl; + // cerr << workers.size() << " workers on record" << std::endl; sptr ret; if (this->shutdown) return ret; boost::unique_lock lock(this->lock); diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h index 36b9873e0..5975edd6f 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h @@ -100,7 +100,7 @@ Bitext::agenda::job #if 0 cerr << ctr++ << " " << m.str(m_bitext->V1.get()) << " " << sid << "/" << root->getCorpusSize() - << " " << offset << " " << stop-x << endl; + << " " << offset << " " << stop-x << std::endl; #endif bias_total += (*m_bias)[sid]; ++stats->raw_cnt; @@ -109,7 +109,7 @@ Bitext::agenda::job #if UG_BITEXT_TRACK_ACTIVE_THREADS ++active; // if (active%5 == 0) - // cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl; + // cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << std::endl; #endif } @@ -130,10 +130,10 @@ int Bitext::agenda::job if (!m_bias) return 1; - using namespace boost::math; + // // using namespace boost::math; typedef boost::math::binomial_distribution<> binomial; - ostream* log = m_bias->loglevel > 1 ? m_bias->log : NULL; + std::ostream* log = m_bias->loglevel > 1 ? m_bias->log : NULL; float p = (*m_bias)[sid]; id_type docid = m_bias->GetClass(sid); @@ -177,7 +177,7 @@ int Bitext::agenda::job for (; x < e; ++x) *log << (*m_bitext->V1)[x->id()] << " "; if (!ret) *log << "SKIP"; else if (p < .5 && d > .9) *log << "FORCE"; - *log << endl; + *log << std::endl; } return (ret ? (p < .5 && d > .9) ? 
2 : 1 : 0); diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h index 5ff39312c..104b7acb5 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h @@ -17,9 +17,9 @@ Bitext::agenda // reduce the number of lock / unlock operations we need to do // during sampling. - uint64_t sid=0, offset=0; // sid and offset of source phrase - size_t s1=0, s2=0, e1=0, e2=0; // soft and hard boundaries of target phrase - vector aln; // stores phrase-pair-internal alignment + uint64_t sid=0, offset=0; // sid and offset of source phrase + size_t s1=0, s2=0, e1=0, e2=0; // soft and hard boundaries of target phrase + std::vector aln; // stores phrase-pair-internal alignment while(sptr j = ag.get_job()) { j->stats->register_worker(); @@ -53,7 +53,7 @@ Bitext::agenda Token const* eos = ag.bt.T2->sntEnd(sid); cerr << "[" << j->stats->good + 1 << "] "; while (t != eos) cerr << (*ag.bt.V2)[(t++)->id()] << " "; - cerr << "[" << docid << "]" << endl; + cerr << "[" << docid << "]" << std::endl; #endif float sample_weight = 1./num_pairs; @@ -62,11 +62,11 @@ Bitext::agenda // adjust offsets in phrase-internal aligment for (size_t k = 1; k < aln.size(); k += 2) aln[k] += s2 - s1; - vector seen; seen.reserve(10); + std::vector seen; seen.reserve(10); // It is possible that the phrase extraction extracts the same // phrase twice, e.g., when word a co-occurs with sequence b b b // but is aligned only to the middle word. We can only count - // each phrase pair once per source phrase occurrence, or else + // each phrase std::pair once per source phrase occurrence, or else // run the risk of having more joint counts than marginal // counts. diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc index 517caf783..dd6fe5f82 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc @@ -54,7 +54,7 @@ namespace Moses void jstats:: - add(float w, vector const& a, uint32_t const cnt2, + add(float w, std::vector const& a, uint32_t const cnt2, uint32_t fwd_orient, uint32_t bwd_orient, int const docid) { boost::lock_guard lk(this->lock); @@ -66,7 +66,7 @@ namespace Moses size_t i = 0; while (i < my_aln.size() && my_aln[i].second != a) ++i; if (i == my_aln.size()) - my_aln.push_back(pair >(1,a)); + my_aln.push_back(std::pair >(1,a)); else my_aln[i].first++; if (my_aln[i].first > my_aln[i/2].first) @@ -81,7 +81,7 @@ namespace Moses } } - vector > > const& + std::vector > > const& jstats:: aln() const { return my_aln; } diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h index 03b231487..8f5c55e04 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h @@ -1,5 +1,7 @@ // -*- c++ -*- #pragma once +#include +#include #include "ug_typedefs.h" #include "ug_lexical_reordering.h" #include @@ -8,9 +10,10 @@ namespace Moses { namespace bitext { - using namespace ugdiss; - // "joint" (i.e., phrase pair) statistics + // using namespace ugdiss; + + // "joint" (i.e., phrase std::pair) statistics class jstats { @@ -20,23 +23,24 @@ namespace Moses float my_wcnt; // weighted joint count // to do: use a static alignment pattern store that stores each pattern only - // once, so that we don't have to store so many alignment vectors - vector > > my_aln; // internal word alignment + // 
once, so that we don't have to store so many alignment std::vectors + std::vector > > my_aln; + // internal word alignment uint32_t ofwd[Moses::LRModel::NONE+1]; // forward distortion type counts uint32_t obwd[Moses::LRModel::NONE+1]; // backward distortion type counts public: std::map indoc; - // vector indoc; // counts origin of samples (for biased sampling) + // std::vector indoc; // counts origin of samples (for biased sampling) jstats(); jstats(jstats const& other); uint32_t rcnt() const; // raw joint counts uint32_t cnt2() const; // raw target phrase occurrence count float wcnt() const; // weighted joint counts - vector > > const & aln() const; - void add(float w, vector const& a, uint32_t const cnt2, + std::vector > > const & aln() const; + void add(float w, std::vector const& a, uint32_t const cnt2, uint32_t fwd_orient, uint32_t bwd_orient, int const docid); void invalidate(); @@ -46,7 +50,7 @@ namespace Moses uint32_t dcnt_bwd(PhraseOrientation const idx) const; void fill_lr_vec(Moses::LRModel::Direction const& dir, Moses::LRModel::ModelType const& mdl, - vector& v); + std::vector& v); }; } } diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc index 8702d9c50..ebd4a00d2 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc @@ -66,7 +66,7 @@ namespace Moses bool pstats:: add(uint64_t pid, float const w, - vector const& a, + std::vector const& a, uint32_t const cnt2, uint32_t fwd_o, uint32_t bwd_o, int const docid) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h index e5cf4ab26..ca4e80418 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h @@ -17,7 +17,7 @@ namespace Moses { typedef boost::unordered_map > map_t; typedef ThreadSafeContainer, map_t> cache_t; - typedef std::vector alnvec; + typedef std::vector alnvec; #if UG_BITEXT_TRACK_ACTIVE_THREADS static ThreadSafeCounter active; #endif diff --git a/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h b/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h index 89dc93ad1..29816a55d 100644 --- a/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h +++ b/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h @@ -7,7 +7,7 @@ #include "ug_typedefs.h" namespace ugdiss { - using namespace std; + // using namespace std; template class ConllBottomUpToken : public T diff --git a/moses/TranslationModel/UG/mm/ug_conll_record.cc b/moses/TranslationModel/UG/mm/ug_conll_record.cc index 5374c027c..c44a20b92 100644 --- a/moses/TranslationModel/UG/mm/ug_conll_record.cc +++ b/moses/TranslationModel/UG/mm/ug_conll_record.cc @@ -3,7 +3,7 @@ namespace ugdiss { Conll_Record Conll_Record:: - remap(vector const& m) const + remap(std::vector const& m) const { Conll_Record ret; ret.sform = m.size() > 0 && m[0] ? 
m[0][this->sform] : this->sform; diff --git a/moses/TranslationModel/UG/mm/ug_conll_record.h b/moses/TranslationModel/UG/mm/ug_conll_record.h index e52a4974b..c8663a166 100644 --- a/moses/TranslationModel/UG/mm/ug_conll_record.h +++ b/moses/TranslationModel/UG/mm/ug_conll_record.h @@ -5,7 +5,7 @@ namespace ugdiss { - using namespace std; + // using namespace std; class Conll_Record @@ -29,7 +29,7 @@ namespace ugdiss // virtual bool operator==(Conll_Record const& other) const; // virtual bool operator<(Conll_Record const& other) const; - Conll_Record remap(vector const& m) const; + Conll_Record remap(std::vector const& m) const; #if 0 /** constructor for conversion from CONLL-stype text format diff --git a/moses/TranslationModel/UG/mm/ug_corpus_token.cc b/moses/TranslationModel/UG/mm/ug_corpus_token.cc index 4be8cbd95..aa08ccc4e 100644 --- a/moses/TranslationModel/UG/mm/ug_corpus_token.cc +++ b/moses/TranslationModel/UG/mm/ug_corpus_token.cc @@ -35,7 +35,7 @@ namespace ugdiss id_type SimpleWordId:: - remap(vector const& m) const + remap(std::vector const& m) const { if (!m[0]) return theID; return m[0][theID]; diff --git a/moses/TranslationModel/UG/mm/ug_corpus_token.h b/moses/TranslationModel/UG/mm/ug_corpus_token.h index b9693cbf2..52ec41a40 100644 --- a/moses/TranslationModel/UG/mm/ug_corpus_token.h +++ b/moses/TranslationModel/UG/mm/ug_corpus_token.h @@ -27,7 +27,7 @@ namespace ugdiss id_type const& id() const; int cmp(SimpleWordId const& other) const; bool operator==(SimpleWordId const& other) const; - id_type remap(vector const& m) const; + id_type remap(std::vector const& m) const; }; /** Token class for suffix arrays */ diff --git a/moses/TranslationModel/UG/mm/ug_deptree.h b/moses/TranslationModel/UG/mm/ug_deptree.h index b28a4bbe8..1cd3f2a0c 100644 --- a/moses/TranslationModel/UG/mm/ug_deptree.h +++ b/moses/TranslationModel/UG/mm/ug_deptree.h @@ -15,22 +15,22 @@ #include "ug_conll_bottom_up_token.h" #include "ug_typedefs.h" -using namespace std; +// using namespace std; namespace ugdiss { - // Fills the vector v with pointers to the internal root r_x for the + // Fills the std::vector v with pointers to the internal root r_x for the // stretch [start,x] for all x: start <= x < stop. 
If the stretch // is incoherent, r_x is NULL template void - fill_L2R_roots(T const* start,T const* stop, vector& v) + fill_L2R_roots(T const* start,T const* stop, std::vector& v) { assert(stop>start); v.resize(stop-start); v[0] = start; bitvector isR(v.size()); - vector root(v.size()); + std::vector root(v.size()); isR.set(0); root[0] = start+start->parent; for (T const* x = start+1; x < stop; ++x) @@ -95,7 +95,7 @@ namespace ugdiss template T const* - findInternalRoot(vector const& v) + findInternalRoot(std::vector const& v) { T const* a = as(&(*v.begin())); T const* b = as(&(*v.end())); @@ -108,7 +108,7 @@ namespace ugdiss public: Conll_Record const* rec; // pointer to the record (see below) for this node DTNode* parent; // pointer to my parent - vector children; // children (in the order they appear in the sentence) + std::vector children; // children (in the order they appear in the sentence) DTNode(Conll_Record const* p); }; @@ -117,7 +117,7 @@ namespace ugdiss DependencyTree { public: - vector w; + std::vector w; DependencyTree(Conll_Record const* first, Conll_Record const* last); }; #endif diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.cc b/moses/TranslationModel/UG/mm/ug_im_bitext.cc index b411cc7dc..5efa3b8c4 100644 --- a/moses/TranslationModel/UG/mm/ug_im_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_im_bitext.cc @@ -38,8 +38,8 @@ namespace Moses { UTIL_THROW_IF2(c != '-', "[" << HERE << "] " << "Error in alignment information:\n" << a); - binwrite(obuf,row); - binwrite(obuf,col); + ugdiss::binwrite(obuf,row); + ugdiss::binwrite(obuf,col); } // important: DO NOT replace the two lines below this comment by // char const* x = obuf.str().c_str(), as the memory x is pointing diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.h b/moses/TranslationModel/UG/mm/ug_im_bitext.h index 63e44f1b9..9515ec98b 100644 --- a/moses/TranslationModel/UG/mm/ug_im_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_im_bitext.h @@ -25,12 +25,12 @@ namespace Moses imBitext(imBitext const& other); // sptr > - // add(vector const& s1, vector const& s2, vector & a); + // add(vector const& s1, std::vector const& s2, vector & a); sptr > add(vector const& s1, - vector const& s2, - vector const& a) const; + std::vector const& s2, + std::vector const& a) const; }; diff --git a/moses/TranslationModel/UG/mm/ug_im_tsa.h b/moses/TranslationModel/UG/mm/ug_im_tsa.h index e920d9f96..92e7f033c 100644 --- a/moses/TranslationModel/UG/mm/ug_im_tsa.h +++ b/moses/TranslationModel/UG/mm/ug_im_tsa.h @@ -20,8 +20,8 @@ namespace ugdiss { - using namespace std; - using namespace boost; + // using namespace std; + // using namespace boost; namespace bio=boost::iostreams; // template class imBitext; @@ -37,8 +37,8 @@ namespace ugdiss friend class tree_iterator; private: - vector sufa; // stores the actual array - vector index; /* top-level index into regions in sufa + std::vector sufa; // stores the actual array + std::vector index; /* top-level index into regions in sufa * (for faster access) */ private: char const* @@ -54,11 +54,11 @@ namespace ugdiss imTSA(); imTSA(boost::shared_ptr const> c, bdBitset const* filt, - ostream* log = NULL); + std::ostream* log = NULL); imTSA(imTSA const& prior, boost::shared_ptr const> const& crp, - vector const& newsids, size_t const vsize); + std::vector const& newsids, size_t const vsize); count_type sntCnt(char const* p, char const * const q) const; @@ -86,7 +86,7 @@ namespace ugdiss sanityCheck() const; void - save_as_mm_tsa(string fname) const; + save_as_mm_tsa(std::string fname) 
const; /// add a sentence to the database // shared_ptr > add(vector const& snt) const; @@ -140,7 +140,7 @@ namespace ugdiss // specified in filter template imTSA:: - imTSA(boost::shared_ptr const> c, bdBitset const* filter, ostream* log) + imTSA(boost::shared_ptr const> c, bdBitset const* filter, std::ostream* log) { assert(c); this->corpus = c; @@ -166,14 +166,14 @@ namespace ugdiss // alignment in the memory, using a ushort instead of a uint32_t might not // even make a difference. - vector wcnt; // word counts + std::vector wcnt; // word counts sufa.resize(c->count_tokens(wcnt,filter,slimit,log)); - if (log) *log << sufa.size() << "." << endl; + if (log) *log << sufa.size() << "." << std::endl; // exit(1); - // we use a second vector that keeps track for each ID of the current insertion + // we use a second std::vector that keeps track for each ID of the current insertion // position in the array - vector tmp(wcnt.size(),0); + std::vector tmp(wcnt.size(),0); for (size_t i = 1; i < wcnt.size(); ++i) tmp[i] = tmp[i-1] + wcnt[i-1]; @@ -198,14 +198,14 @@ namespace ugdiss } // Now sort the array - if (log) *log << "sorting ...." << endl; + if (log) *log << "sorting ...." << std::endl; index.resize(wcnt.size()+1,0); typename ttrack::Position::LESS > sorter(c.get()); for (size_t i = 0; i < wcnt.size(); i++) { if (log && wcnt[i] > 5000) *log << "sorting " << wcnt[i] - << " entries starting with id " << i << "." << endl; + << " entries starting with id " << i << "." << std::endl; index[i+1] = index[i]+wcnt[i]; assert(index[i+1]==tmp[i]); // sanity check if (wcnt[i]>1) @@ -217,7 +217,7 @@ namespace ugdiss this->indexSize = this->index.size(); #if 1 // Sanity check during code development. Can be removed once the thing is stable. - typename vector::iterator m = sufa.begin(); + typename std::vector::iterator m = sufa.begin(); for (size_t i = 0; i < wcnt.size(); i++) { for (size_t k = 0; k < wcnt[i]; ++k,++m) @@ -330,14 +330,14 @@ namespace ugdiss template void imTSA:: - save_as_mm_tsa(string fname) const + save_as_mm_tsa(std::string fname) const { - ofstream out(fname.c_str()); + std::ofstream out(fname.c_str()); filepos_type idxStart(0); id_type idxSize(index.size()); numwrite(out,idxStart); numwrite(out,idxSize); - vector mmIndex; + std::vector mmIndex; for (size_t i = 1; i < this->index.size(); i++) { mmIndex.push_back(out.tellp()); @@ -360,7 +360,7 @@ namespace ugdiss imTSA:: imTSA(imTSA const& prior, boost::shared_ptr const> const& crp, - vector const& newsids, size_t const vsize) + std::vector const& newsids, size_t const vsize) { typename ttrack::Position::LESS > sorter(crp.get()); @@ -369,7 +369,7 @@ namespace ugdiss size_t newToks = 0; BOOST_FOREACH(id_type sid, newsids) newToks += crp->sntLen(sid); - vector nidx(newToks); // new array entries + std::vector nidx(newToks); // new array entries size_t n = 0; BOOST_FOREACH(id_type sid, newsids) @@ -390,9 +390,9 @@ namespace ugdiss this->index.resize(vsize+1); size_t i = 0; - typename vector::iterator k = this->sufa.begin(); + typename std::vector::iterator k = this->sufa.begin(); // cerr << newToks << " new items at " - // << __FILE__ << ":" << __LINE__ << endl; + // << __FILE__ << ":" << __LINE__ << std::endl; for (size_t n = 0; n < nidx.size();) { id_type nid = crp->getToken(nidx[n])->id(); diff --git a/moses/TranslationModel/UG/mm/ug_im_ttrack.h b/moses/TranslationModel/UG/mm/ug_im_ttrack.h index 503a5546c..fd14c161f 100644 --- a/moses/TranslationModel/UG/mm/ug_im_ttrack.h +++ b/moses/TranslationModel/UG/mm/ug_im_ttrack.h @@ -28,8 +28,8 
@@ namespace ugdiss { - using namespace std; - using namespace boost; + // using namespace std; + // using namespace boost; namespace bio=boost::iostreams; template class imTSA; @@ -37,7 +37,8 @@ namespace ugdiss template typename boost::shared_ptr > - append(typename boost::shared_ptr > const & crp, vector const & snt); + append(typename boost::shared_ptr > const & crp, + std::vector const & snt); template class imTtrack : public Ttrack @@ -45,19 +46,20 @@ namespace ugdiss private: size_t numToks; - boost::shared_ptr > > myData; // pointer to corpus data + boost::shared_ptr > > myData; + // pointer to corpus data friend class imTSA; friend typename boost::shared_ptr > - append(typename boost::shared_ptr > const & crp, vector const & snt); + append(typename boost::shared_ptr > const & crp, std::vector const & snt); void m_check_token_count(); // debugging function public: - imTtrack(boost::shared_ptr > > const& d); - imTtrack(istream& in, TokenIndex& V, ostream* log = NULL); + imTtrack(boost::shared_ptr > > const& d); + imTtrack(std::istream& in, TokenIndex& V, std::ostream* log = NULL); imTtrack(size_t reserve = 0); // imTtrack(istream& in, Vocab& V); @@ -80,7 +82,7 @@ namespace ugdiss m_check_token_count() { // sanity check size_t check = 0; - BOOST_FOREACH(vector const& s, *myData) + BOOST_FOREACH(std::vector const& s, *myData) check += s.size(); UTIL_THROW_IF2(check != this->numToks, "[" << HERE << "]" << " Wrong token count after appending sentence!" @@ -131,28 +133,28 @@ namespace ugdiss template imTtrack:: - imTtrack(istream& in, TokenIndex& V, ostream* log) + imTtrack(std::istream& in, TokenIndex& V, std::ostream* log) : numToks(0) { - myData.reset(new vector >()); - string line,w; + myData.reset(new std::vector >()); + std::string line,w; size_t linectr=0; - boost::unordered_map H; + boost::unordered_map H; // for (id_type i = 0; i < V.knownVocabSize(); ++i) // H[V[i]] = i; while (getline(in,line)) { - // cout << line << endl; - myData->push_back(vector()); + // cout << line << std::endl; + myData->push_back(std::vector()); if (log && ++linectr%1000000==0) - *log << linectr/1000000 << "M lines of input processed" << endl; - istringstream buf(line); - // cout << line << endl; + *log << linectr/1000000 << "M lines of input processed" << std::endl; + std::istringstream buf(line); + // cout << line << std::endl; while (buf>>w) { myData->back().push_back(Token(V[w])); // cout << w << " " << myData->back().back().id() << " " - // << V[w] << endl; + // << V[w] << std::endl; } // myData->back().resize(myData->back().size(), Token(0)); numToks += myData->back().size(); @@ -164,17 +166,17 @@ namespace ugdiss imTtrack(size_t reserve) : numToks(0) { - myData.reset(new vector >()); + myData.reset(new std::vector >()); if (reserve) myData->reserve(reserve); } template imTtrack:: - imTtrack(boost::shared_ptr > > const& d) + imTtrack(boost::shared_ptr > > const& d) : numToks(0) { myData = d; - BOOST_FOREACH(vector const& v, *d) + BOOST_FOREACH(std::vector const& v, *d) numToks += v.size(); } @@ -186,7 +188,7 @@ namespace ugdiss id_type i; for (i = 0; i < myData->size(); ++i) { - vector const& v = (*myData)[i]; + std::vector const& v = (*myData)[i]; if (v.size() == 0) continue; if (&v.front() <= t && &v.back() >= t) break; @@ -197,7 +199,7 @@ namespace ugdiss /// add a sentence to the database template boost::shared_ptr > - append(boost::shared_ptr > const& crp, vector const & snt) + append(boost::shared_ptr > const& crp, std::vector const & snt) { #if 1 if (crp) crp->m_check_token_count(); diff 
--git a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h index 742e0dd4e..2b83c9f4e 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h +++ b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h @@ -11,7 +11,7 @@ #include #include "tpt_pickler.h" -using namespace std; +// using namespace std; namespace ugdiss { @@ -20,16 +20,16 @@ namespace ugdiss LexicalPhraseScorer1 { typedef boost::unordered_map inner_map_t; - vector L1_given_L2; - vector L2_given_L1; + std::vector L1_given_L2; + std::vector L2_given_L1; void load_lex (string const& fname, TokenIndex & V1, TokenIndex & V2, - vector & lex); + std::vector & lex); public: void open(string const& bname, string const& L1, string const& L2, TokenIndex & V1, TokenIndex & V2); void score(TKN const* snt1, size_t const s1, size_t const e1, TKN const* snt2, size_t const s2, size_t const e2, - vector aln, float & fwd_score, float& bwd_score); + std::vector aln, float & fwd_score, float& bwd_score); void score(TKN const* snt1, size_t const s1, size_t const e1, TKN const* snt2, size_t const s2, size_t const e2, char const* const aln_start, char const* const aln_end, @@ -42,10 +42,10 @@ namespace ugdiss void LexicalPhraseScorer1:: load_lex (string const& fname, TokenIndex & V1, TokenIndex & V2, - vector & lex) + std::vector & lex) { boost::iostreams::filtering_istream in; - cout << fname << endl; + cout << fname << std::endl; open_input_stream(fname,in); lex.resize(V1.ksize()); string w1,w2; float p; @@ -66,8 +66,8 @@ namespace ugdiss { string lex1 = bname+L1+"-"+L2+"."+L1+"-given-"+L2+".lex.gz"; string lex2 = bname+L1+"-"+L2+"."+L2+"-given-"+L1+".lex.gz"; - cout << lex1 << endl; - cout << lex2 << endl; + cout << lex1 << std::endl; + cout << lex2 << std::endl; load_lex(lex1,V1,V2,L1_given_L2); load_lex(lex2,V2,V1,L2_given_L1); } @@ -79,8 +79,8 @@ namespace ugdiss TKN const* snt2, size_t const s2, size_t const e2, vector aln, float & fwd_score, float& bwd_score) { - vector p1(e1,0), p2(e2,0); - vector c1(e1,0), c2(e2,0); + std::vector p1(e1,0), p2(e2,0); + std::vector c1(e1,0), c2(e2,0); size_t i1=0,i2=0; for (size_t k = 0; k < aln.size(); ++k) { @@ -126,8 +126,8 @@ namespace ugdiss char const* const aln_start, char const* const aln_end, float & fwd_score, float& bwd_score) { - vector p1(e1,0), p2(e2,0); - vector c1(e1,0), c2(e2,0); + std::vector p1(e1,0), p2(e2,0); + std::vector c1(e1,0), c2(e2,0); size_t i1=0,i2=0; for (char const* x = aln_start; x < aln_end;) { diff --git a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h index fdd0366df..6a87b4f69 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h +++ b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h @@ -14,7 +14,7 @@ #include "tpt_pickler.h" #include "ug_mm_2d_table.h" #include "util/exception.hh" -using namespace std; +// using namespace std; namespace ugdiss { @@ -22,7 +22,7 @@ namespace ugdiss class LexicalPhraseScorer2 { - vector ftag; + std::vector ftag; public: typedef mm2dTable table_t; table_t COOC; @@ -31,7 +31,7 @@ namespace ugdiss void score(TKN const* snt1, size_t const s1, size_t const e1, TKN const* snt2, size_t const s2, size_t const e2, - vector const & aln, float const alpha, + std::vector const & aln, float const alpha, float & fwd_score, float& bwd_score) const; void @@ -67,8 +67,8 @@ namespace ugdiss vector const & aln, float const alpha, float & fwd_score, float& bwd_score) const { - vector 
p1(e1,0), p2(e2,0); - vector c1(e1,0), c2(e2,0); + std::vector p1(e1,0), p2(e2,0); + std::vector c1(e1,0), c2(e2,0); size_t i1=0,i2=0; for (size_t k = 0; k < aln.size(); ++k) { @@ -113,7 +113,7 @@ namespace ugdiss cerr << "[" << s << "," << t << "] " << COOC.m1(s) << "/" << COOC[s][t] << "/" - << COOC.m2(t) << endl; + << COOC.m2(t) << std::endl; #endif return ret; } @@ -141,8 +141,8 @@ namespace ugdiss char const* const aln_start, char const* const aln_end, float const alpha, float & fwd_score, float& bwd_score) const { - vector p1(e1,0), p2(e2,0); - vector c1(e1,0), c2(e2,0); + std::vector p1(e1,0), p2(e2,0); + std::vector c1(e1,0), c2(e2,0); size_t i1=0,i2=0; for (char const* x = aln_start; x < aln_end;) { diff --git a/moses/TranslationModel/UG/mm/ug_lru_cache.h b/moses/TranslationModel/UG/mm/ug_lru_cache.h index 0000b194f..682880a69 100644 --- a/moses/TranslationModel/UG/mm/ug_lru_cache.h +++ b/moses/TranslationModel/UG/mm/ug_lru_cache.h @@ -14,14 +14,14 @@ namespace lru_cache { - using namespace std; - using namespace boost; + // using namespace std; + // using namespace boost; template class LRU_Cache { public: - typedef unordered_map map_t; + typedef boost::unordered_map map_t; private: struct Record { @@ -33,7 +33,7 @@ namespace lru_cache mutable boost::shared_mutex m_lock; uint32_t m_qfront, m_qback; - vector m_recs; + std::vector m_recs; map_t m_idx; void @@ -84,7 +84,7 @@ namespace lru_cache set(KEY const& key, sptr const& ptr) { boost::lock_guard lock(m_lock); - pair foo; + std::pair foo; foo = m_idx.insert(make_pair(key,m_recs.size())); uint32_t p = foo.first->second; diff --git a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h index 2455ca603..e2284382e 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h +++ b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h @@ -13,7 +13,7 @@ namespace bio=boost::iostreams; namespace ugdiss { - using namespace std; + // using namespace std; template class mm2dTable @@ -71,12 +71,12 @@ namespace ugdiss } - void open(string fname); + void open(std::string fname); void close(); Row operator[](ID key) const; - mm2dTable(string const fname="") { if (!fname.empty()) open(fname); }; + mm2dTable(std::string const fname="") { if (!fname.empty()) open(fname); }; ~mm2dTable() { file.reset(); }; }; @@ -110,25 +110,25 @@ namespace ugdiss template void mm2dTable:: - open(string fname) + open(std::string fname) { - // cout << "opening " << fname << " at " << __FILE__ << ":" << __LINE__ << endl; + // cout << "opening " << fname << " at " << __FILE__ << ":" << __LINE__ << std::endl; if (access(fname.c_str(),R_OK)) { - ostringstream msg; + std::ostringstream msg; msg << "[" << __FILE__ << ":" << __LINE__ <<"] FATAL ERROR: " - << "file '" << fname << " is not accessible." << endl; - string foo = msg.str(); + << "file '" << fname << " is not accessible." << std::endl; + std::string foo = msg.str(); UTIL_THROW(util::Exception,foo.c_str()); } file.reset(new bio::mapped_file_source()); file->open(fname); if (!file->is_open()) { - ostringstream msg; + std::ostringstream msg; msg << "[" << __FILE__ << ":" << __LINE__ <<"] FATAL ERROR: " - << "Opening file '" << fname << "' failed." << endl; - string foo = msg.str(); + << "Opening file '" << fname << "' failed." 
<< std::endl; + std::string foo = msg.str(); UTIL_THROW(util::Exception,foo.c_str()); } char const* p = file->data(); @@ -137,15 +137,15 @@ namespace ugdiss numRows = *reinterpret_cast(p); p += sizeof(id_type); numCols = *reinterpret_cast(p); p += sizeof(id_type); data = reinterpret_cast(p); - // cout << numRows << " rows; " << numCols << " columns " << endl; + // cout << numRows << " rows; " << numCols << " columns " << std::endl; M1 = reinterpret_cast(index+numRows+1); M2 = M1+numRows; // cout << "Table " << fname << " has " << numRows << " rows and " - // << numCols << " columns." << endl; + // << numCols << " columns." << std::endl; // cout << "File size is " << file.size()*1024 << " bytes; "; // cout << "M2 starts " << (reinterpret_cast(M2) - file.data()) - // << " bytes into the file" << endl; - // cout << M2[0] << endl; + // << " bytes into the file" << std::endl; + // cout << M2[0] << std::endl; } template< @@ -156,15 +156,15 @@ namespace ugdiss typename ICONT // inner container type > void - write_mm_2d_table(ostream& out, vector const& T, - vector const* m1 = NULL, - vector const* m2 = NULL) + write_mm_2d_table(std::ostream& out, std::vector const& T, + std::vector const* m1 = NULL, + std::vector const* m2 = NULL) { assert(T.size()); typedef typename ICONT::const_iterator iter; // compute marginals if necessary - vector m1x,m2x; + std::vector m1x,m2x; if (!m1) { m1x.resize(T.size(),INIT(0)); @@ -191,7 +191,7 @@ namespace ugdiss numwrite(out,id_type(m2->size())); // number of columns // write actual table - vector index; + std::vector index; size_t ctr =0; index.reserve(m1->size()+1); for (ID r = 0; r < ID(T.size()); ++r) diff --git a/moses/TranslationModel/UG/mm/ug_mm_bitext.h b/moses/TranslationModel/UG/mm/ug_mm_bitext.h index 4f93d4d3c..82a007a9d 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_mm_bitext.h @@ -35,7 +35,7 @@ namespace Moses // in the future, we might also allow listing documents with // sentence ranges. 
string buffer,docname; size_t a=0,b; - this->m_sid2docid.reset(new vector(this->T1->size())); + this->m_sid2docid.reset(new std::vector(this->T1->size())); while(getline(docmap,buffer)) { istringstream line(buffer); @@ -46,7 +46,7 @@ namespace Moses this->m_docname.push_back(docname); line >> b; #ifndef NO_MOSES - VERBOSE(1, "DOCUMENT MAP " << docname << " " << a << "-" << b+a << endl); + VERBOSE(1, "DOCUMENT MAP " << docname << " " << a << "-" << b+a << std::endl); #endif for (b += a; a < b; ++a) (*this->m_sid2docid)[a] = docid; diff --git a/moses/TranslationModel/UG/mm/ug_mm_tsa.h b/moses/TranslationModel/UG/mm/ug_mm_tsa.h index ff2d4c693..e73ff5a71 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_tsa.h +++ b/moses/TranslationModel/UG/mm/ug_mm_tsa.h @@ -19,7 +19,7 @@ namespace ugdiss { - using namespace std; + // using namespace std; namespace bio=boost::iostreams; template @@ -43,8 +43,8 @@ namespace ugdiss public: mmTSA(); - mmTSA(string fname, Ttrack const* c); - void open(string fname, typename boost::shared_ptr const> c); + mmTSA(std::string fname, Ttrack const* c); + void open(std::string fname, typename boost::shared_ptr const> c); count_type sntCnt(char const* p, char const * const q) const; @@ -109,7 +109,7 @@ namespace ugdiss template mmTSA:: - mmTSA(string fname, Ttrack const* c) + mmTSA(std::string fname, Ttrack const* c) { open(fname,c); } @@ -119,12 +119,12 @@ namespace ugdiss template void mmTSA:: - open(string fname, typename boost::shared_ptr const> c) + open(std::string fname, typename boost::shared_ptr const> c) { this->bsc.reset(new BitSetCache >(this)); if (access(fname.c_str(),F_OK)) { - ostringstream msg; + std::ostringstream msg; msg << "mmTSA<>::open: File '" << fname << "' does not exist."; throw std::runtime_error(msg.str().c_str()); } @@ -137,7 +137,7 @@ namespace ugdiss p = numread(p,idxOffset); p = numread(p,this->indexSize); - // cerr << fname << ": " << idxOffset << " " << this->indexSize << endl; + // cerr << fname << ": " << idxOffset << " " << this->indexSize << std::endl; this->startArray = p; this->index = reinterpret_cast(file.data()+idxOffset); @@ -243,7 +243,7 @@ namespace ugdiss { raw = 0; id_type sid; uint16_t off; - boost::dynamic_bitset check(this->corpus->size()); + boost::dynamic_bitset check(this->corpus->size()); while (p < q) { p = tightread(p,q,sid); diff --git a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h index bfee14e3e..91167822d 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h +++ b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h @@ -24,7 +24,7 @@ namespace ugdiss { - using namespace std; + // using namespace std; namespace bio=boost::iostreams; template @@ -42,7 +42,7 @@ namespace ugdiss * of more than four billion words) */ public: - mmTtrack(string fname); + mmTtrack(std::string fname); mmTtrack(); // return pointer to beginning of sentence @@ -58,20 +58,20 @@ namespace ugdiss size_t numTokens() const; // open an mmTtrack file - void open(string fname); + void open(std::string fname); // FUNCTIONS FOR BUILDING CORPUS TRACKS // write a blank file header at the beginning of a new ttrack file - void write_blank_file_header(ostream& out) const; + void write_blank_file_header(std::ostream& out) const; // write the sentence index /idx/ and fill the file header - void write_index_and_finalize(ostream& out, - vector const& idx, + void write_index_and_finalize(std::ostream& out, + std::vector const& idx, count_type tokenCount) const; // copy a contiguous sequence of sentences to another 
stream // return the number of tokens copied - id_type copySentences(ostream& trg, id_type start, id_type stop) const; + id_type copySentences(std::ostream& trg, id_type start, id_type stop) const; /** find the sentence id of a given token */ id_type findSid(TKN const* t) const; @@ -79,7 +79,7 @@ namespace ugdiss id_type findSid(id_type tokenOffset) const; /// re-assign ids based on the id maps in /f/ - void remap(string const fname, vector const & f) const; + void remap(std::string const fname, std::vector const & f) const; }; @@ -87,7 +87,7 @@ namespace ugdiss template void mmTtrack:: - remap(string const fname, vector const & f) const + remap(std::string const fname, std::vector const & f) const { bio::mapped_file myfile(fname); assert(myfile.is_open()); @@ -128,8 +128,9 @@ namespace ugdiss { if (sid >= this->numSent) { - cerr << "Fatal error: requested sentence #"<numSent <<")" << endl; + std::cerr << "Fatal error: requested sentence #" + << sid <<" is beyond corpus size (" + << this->numSent <<")" << std::endl; } assert(sid < this->numSent); return data+index[sid]; @@ -155,7 +156,7 @@ namespace ugdiss template mmTtrack:: - mmTtrack(string fname) + mmTtrack(std::string fname) { open(fname); } @@ -163,18 +164,18 @@ namespace ugdiss template void mmTtrack:: - open(string fname) + open(std::string fname) { if (access(fname.c_str(),F_OK)) { - ostringstream msg; + std::ostringstream msg; msg << "mmTtrack<>::open: File '" << fname << "' does not exist."; throw std::runtime_error(msg.str().c_str()); } file.open(fname); if (!file.is_open()) { - cerr << "Error opening file " << fname << endl; + std::cerr << "Error opening file " << fname << std::endl; assert(0); } filepos_type idxOffset; @@ -210,7 +211,7 @@ namespace ugdiss template void mmTtrack:: - write_blank_file_header(ostream& out) const + write_blank_file_header(std::ostream& out) const { numwrite(out,filepos_type(0)); // place holder for index start numwrite(out,id_type(0)); // place holder for index size @@ -220,8 +221,8 @@ namespace ugdiss template void mmTtrack:: - write_index_and_finalize(ostream& out, - vectorconst& idx, + write_index_and_finalize(std::ostream& out, + std::vectorconst& idx, id_type tokenCount) const { id_type idxSize = idx.size(); @@ -237,7 +238,7 @@ namespace ugdiss template id_type mmTtrack:: - copySentences(ostream& trg, id_type start, id_type stop) const + copySentences(std::ostream& trg, id_type start, id_type stop) const { assert(stop > start); TKN const* a = sntStart(start); diff --git a/moses/TranslationModel/UG/mm/ug_mmbitext.h b/moses/TranslationModel/UG/mm/ug_mmbitext.h index 3837abc59..04c54e60b 100644 --- a/moses/TranslationModel/UG/mm/ug_mmbitext.h +++ b/moses/TranslationModel/UG/mm/ug_mmbitext.h @@ -31,8 +31,8 @@ #include "ug_corpus_token.h" #include "tpt_pickler.h" -using namespace ugdiss; -using namespace std; +// using namespace ugdiss; +// using namespace std; namespace Moses { typedef L2R_Token Token; @@ -43,7 +43,7 @@ namespace Moses { public: typedef mmTSA::tree_iterator iter; class pstats; // one-sided phrase statistics - class jstats; // phrase pair ("joint") statistics + class jstats; // phrase std::pair ("joint") statistics class agenda { boost::mutex lock; @@ -51,7 +51,7 @@ namespace Moses { class job; class worker; list joblist; - vector > workers; + std::vector > workers; bool shutdown; size_t doomed; public: @@ -83,7 +83,7 @@ namespace Moses { find_trg_phr_bounds (size_t const sid, size_t const start, size_t const stop, size_t & s1, size_t & s2, size_t & e1, size_t & e2, - vector * 
core_alignment, bool const flip) const; + std::vector * core_alignment, bool const flip) const; boost::unordered_map > cache1,cache2; private: @@ -99,22 +99,22 @@ namespace Moses { void prep(iter const& phrase); }; - // "joint" (i.e., phrase pair) statistics + // "joint" (i.e., phrase std::pair) statistics class mmbitext:: jstats { uint32_t my_rcnt; // unweighted count float my_wcnt; // weighted count - vector > > my_aln; + std::vector > > my_aln; boost::mutex lock; public: jstats(); jstats(jstats const& other); uint32_t rcnt() const; float wcnt() const; - vector > > const & aln() const; - void add(float w, vector const& a); + std::vector > > const & aln() const; + void add(float w, std::vector const& a); }; // struct @@ -151,11 +151,11 @@ namespace Moses { size_t in_progress; // keeps track of how many threads are currently working on this boost::unordered_map trg; pstats(); - // vector nbest; + // std::vector nbest; // void select_nbest(size_t const N=10); void release(); void register_worker(); - void add(mmbitext::iter const& trg_phrase, float const w, vector const& a); + void add(mmbitext::iter const& trg_phrase, float const w, std::vector const& a); }; class diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h index 7f03d89df..42375da10 100644 --- a/moses/TranslationModel/UG/mm/ug_phrasepair.h +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h @@ -12,6 +12,9 @@ namespace Moses { namespace bitext { + + using ugdiss::TokenIndex; + template class PhrasePair @@ -27,7 +30,7 @@ namespace Moses std::vector fvals; float dfwd[Moses::LRModel::NONE+1]; // distortion counts // counts or probs? float dbwd[Moses::LRModel::NONE+1]; // distortion counts - std::vector aln; + std::vector aln; float score; bool inverse; // std::vector indoc; @@ -54,10 +57,10 @@ namespace Moses void fill_lr_vec(LRModel::Direction const& dir, LRModel::ModelType const& mdl, - vector& v) const; + std::vector& v) const; #ifndef NO_MOSES void - print(ostream& out, TokenIndex const& V1, TokenIndex const& V2, + print(std::ostream& out, TokenIndex const& V1, TokenIndex const& V2, LRModel const& LR) const; #endif @@ -271,7 +274,7 @@ namespace Moses PhrasePair ::fill_lr_vec(LRModel::Direction const& dir, LRModel::ModelType const& mdl, - vector& v) const + std::vector& v) const { // how many distinct scores do we have? size_t num_scores = (mdl == LRModel::MSLR ? 4 : mdl == LRModel::MSD ? 
3 : 2); @@ -301,7 +304,7 @@ namespace Moses template void PhrasePair - ::print(ostream& out, TokenIndex const& V1, TokenIndex const& V2, + ::print(std::ostream& out, TokenIndex const& V1, TokenIndex const& V2, LRModel const& LR) const { out << toString (V1, this->start1, this->len1) << " ::: " @@ -315,14 +318,14 @@ namespace Moses out << m->first << ":" << m->second; } out << "] ["; - vector lrscores; + std::vector lrscores; this->fill_lr_vec(LR.GetDirection(), LR.GetModelType(), lrscores); for (size_t i = 0; i < lrscores.size(); ++i) { if (i) out << " "; out << boost::format("%.2f") % exp(lrscores[i]); } - out << "]" << endl; + out << "]" << std::endl; #if 0 for (int i = 0; i <= Moses::LRModel::NONE; i++) { diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index 999d93704..247e59664 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -2,7 +2,7 @@ #pragma once #include -#include +#include #include #include #include "moses/Util.h" diff --git a/moses/TranslationModel/UG/mm/ug_tsa_base.h b/moses/TranslationModel/UG/mm/ug_tsa_base.h index 8a4117910..3eaf738ab 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_base.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_base.h @@ -21,8 +21,8 @@ namespace ugdiss { - using namespace std; - using namespace boost; + // using namespace std; + // using namespace boost; namespace bio=boost::iostreams; template @@ -56,7 +56,7 @@ namespace ugdiss typedef boost::shared_ptr bitset_pointer; typedef TKN Token; typedef BitSetCache > BSC_t; - /* to allow caching of bit vectors that are expensive to create on + /* to allow caching of bit std::vectors that are expensive to create on * the fly */ friend class TSA_tree_iterator; @@ -148,8 +148,8 @@ namespace ugdiss * [keyStart,keyStop) */ char const* - lower_bound(typename vector::const_iterator const& keyStart, - typename vector::const_iterator const& keyStop) const; + lower_bound(typename std::vector::const_iterator const& keyStart, + typename std::vector::const_iterator const& keyStop) const; char const* lower_bound(TKN const* keyStart, TKN const* keyStop) const; @@ -160,29 +160,29 @@ namespace ugdiss * [keyStart,keyStop) */ char const* - upper_bound(typename vector::const_iterator const& keyStart, - typename vector::const_iterator const& keyStop) const; + upper_bound(typename std::vector::const_iterator const& keyStart, + typename std::vector::const_iterator const& keyStop) const; char const* upper_bound(TKN const* keyStart, int keyLength) const; /** dump all suffixes in order to /out/ */ - void dump(ostream& out, TokenIndex const& T) const; + void dump(std::ostream& out, TokenIndex const& T) const; /** fill the dynamic bit set with true for all sentences that contain * /phrase/. * @return the raw number of occurrences. 
*/ count_type - fillBitSet(vector const& phrase, bdBitset& dest) const; + fillBitSet(std::vector const& phrase, bdBitset& dest) const; count_type fillBitSet(TKN const* key, size_t keyLen, bdBitset& dest) const; count_type setBits(char const* startRange, char const* endRange, - boost::dynamic_bitset& bs) const; + boost::dynamic_bitset& bs) const; void setTokenBits(char const* startRange, char const* endRange, size_t len, @@ -246,11 +246,11 @@ namespace ugdiss getCounts(char const* p, char const* const q, count_type& sids, count_type& raw) const = 0; - string + std::string suffixAt(char const* p, TokenIndex const* V=NULL, size_t maxlen=0) const; - string + std::string suffixAt(ArrayEntry const& I, TokenIndex const* V=NULL, size_t maxlen=0) const; @@ -269,18 +269,18 @@ namespace ugdiss next 16 bits: length of the phrase */ ::uint64_t - getSequenceId(typename vector::const_iterator const& pstart, - typename vector::const_iterator const& pstop) const; + getSequenceId(typename std::vector::const_iterator const& pstart, + typename std::vector::const_iterator const& pstop) const; ::uint64_t getSequenceId(TKN const* t, ushort plen) const; /** Return the phrase represented by phrase ID pid_ */ - string + std::string getSequence(::uint64_t pid, TokenIndex const& V) const; /** Return the phrase represented by phrase ID pid_ */ - vector + std::vector getSequence(::uint64_t pid) const; TKN const* @@ -308,7 +308,7 @@ namespace ugdiss bool findBranches(TKN const* base, bitvector const& terminals, - vector& dest) const; + std::vector& dest) const; double aveIndexEntrySize() const { @@ -356,7 +356,7 @@ namespace ugdiss template count_type TSA:: - fillBitSet(vector const& key, + fillBitSet(std::vector const& key, bitvector& bitset) const { if (!key.size()) return 0; @@ -555,8 +555,8 @@ namespace ugdiss template char const* TSA:: - lower_bound(typename vector::const_iterator const& keyStart, - typename vector::const_iterator const& keyStop) const + lower_bound(typename std::vector::const_iterator const& keyStart, + typename std::vector::const_iterator const& keyStop) const { TKN const* const a = &(*keyStart); TKN const* const z = &(*keyStop); @@ -597,8 +597,8 @@ namespace ugdiss template char const* TSA:: - upper_bound(typename vector::const_iterator const& keyStart, - typename vector::const_iterator const& keyStop) const + upper_bound(typename std::vector::const_iterator const& keyStart, + typename std::vector::const_iterator const& keyStop) const { TKN const* const a = &((TKN)*keyStart); TKN const* const z = &((TKN)*keyStop); @@ -631,7 +631,7 @@ namespace ugdiss { char const* lo = lower_bound(keyStart,keyLen); char const* up = upper_bound(keyStart,keyLen); - // cerr << up-lo << endl; + // cerr << up-lo << std::endl; return rawCnt(lo,up); } @@ -640,8 +640,8 @@ namespace ugdiss template ::uint64_t TSA:: - getSequenceId(typename vector::const_iterator const& pstart, - typename vector::const_iterator const& pstop) const + getSequenceId(typename std::vector::const_iterator const& pstart, + typename std::vector::const_iterator const& pstop) const { return getSequenceId(&(*pstart),pstop-pstart); } @@ -668,14 +668,14 @@ namespace ugdiss //--------------------------------------------------------------------------- template - vector + std::vector TSA:: getSequence(::uint64_t pid) const { size_t plen = pid % 65536; size_t offset = (pid >> 16) % 65536; TKN const* w = corpus->sntStart(pid >> 32)+offset; - vector ret(plen); + std::vector ret(plen); for (size_t i = 0; i < plen; i++, w = w->next()) { assert(w); @@ -685,11 
+685,11 @@ namespace ugdiss } template - string + std::string TSA:: getSequence(::uint64_t pid, TokenIndex const& V) const { - ostringstream buf; + std::ostringstream buf; TKN const* a = getSequenceStart(pid); buf << V[a->id()]; size_t len = getSequenceLength(pid); @@ -806,7 +806,7 @@ namespace ugdiss bool TSA:: findBranches(TKN const* base, bitvector const& terminals, - vector& dest) const + std::vector& dest) const { dest.assign(terminals.count(),tree_iterator(this)); for (size_t i = terminals.find_first(), k = 0; diff --git a/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h b/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h index d13449e36..ec8a499b2 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h @@ -9,24 +9,24 @@ #include #include #include -// A simple mechanism for caching bit vectors representing occurrences of token +// A simple mechanism for caching bit std::vectors representing occurrences of token // sequences in a corpus. Useful for very frequent items for which the bit -// vector is expensive to create on the fly. The variable threshold determines -// when bit vectors are cached and when they are created on the fly, using the +// std::vector is expensive to create on the fly. The variable threshold determines +// when bit std::vectors are cached and when they are created on the fly, using the // size of the range of entries in the TSA's index in bytes to determine -// whether or not to store the respective bit vector in the cache. +// whether or not to store the respective bit std::vector in the cache. namespace ugdiss { - using namespace std; + // using namespace std; template class BitSetCache { public: - typedef boost::dynamic_bitset BitSet; + typedef boost::dynamic_bitset BitSet; typedef boost::shared_ptr bsptr; - typedef map,bsptr> myMap; + typedef std::map,bsptr> myMap; typedef myMap::iterator myMapIter; private: TSA const* tsa; @@ -56,7 +56,7 @@ namespace ugdiss if (!lo) return ret; if (up-lo > threshold) { - pair k(lo,keyLen); + std::pair k(lo,keyLen); myMapIter m = cached1.find(k); if (m != cached1.end()) ret = m->second; @@ -83,9 +83,9 @@ namespace ugdiss if (!lo) return ret; if (up-lo > threshold) { - pair k(lo,keyLen); + std::pair k(lo,keyLen); // cout << "bla " << keyStart->id() << " " - // << cached2.size() << " " << up-lo << " " << k.second << endl; + // << cached2.size() << " " << up-lo << " " << k.second << std::endl; myMapIter m = cached2.find(k); if (m != cached2.end()) ret = m->second; diff --git a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h index 053ff2445..42be0e0a1 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h @@ -21,11 +21,11 @@ namespace ugdiss #define _DISPLAY_CHAIN // for debugging only template - void display(T const* x, string label) + void display(T const* x, std::string label) { - cout << label << ":"; - for (;x;x=next(x)) cout << " " << x->lemma; - cout << endl; + std::cout << label << ":"; + for (;x;x=next(x)) std::cout << " " << x->lemma; + std::cout << std::endl; } #endif @@ -47,11 +47,11 @@ namespace ugdiss TSA_tree_iterator { protected: - vector lower; - vector upper; + std::vector lower; + std::vector upper; // for debugging ... 
- void showBounds(ostream& out) const; + void showBounds(std::ostream& out) const; public: typedef TKN Token; @@ -76,7 +76,7 @@ namespace ugdiss bool full_match_only=true); TSA_tree_iterator(TSA const* s, TokenIndex const& V, - string const& key); + std::string const& key); char const* lower_bound(int p) const; char const* upper_bound(int p) const; @@ -96,7 +96,7 @@ namespace ugdiss virtual bool over(); virtual bool up(); - string str(TokenIndex const* V=NULL, int start=0, int stop=0) const; + std::string str(TokenIndex const* V=NULL, int start=0, int stop=0) const; // checks if the sentence [start,stop) contains the given sequence. bool match(Token const* start, Token const* stop) const; @@ -105,23 +105,23 @@ namespace ugdiss // fillBitSet: deprecated; use markSentences() instead count_type - fillBitSet(boost::dynamic_bitset& bitset) const; + fillBitSet(boost::dynamic_bitset& bitset) const; count_type markEndOfSequence(Token const* start, Token const* stop, - boost::dynamic_bitset& dest) const; + boost::dynamic_bitset& dest) const; count_type markSequence(Token const* start, Token const* stop, bitvector& dest) const; count_type - markSentences(boost::dynamic_bitset& bitset) const; + markSentences(boost::dynamic_bitset& bitset) const; count_type - markOccurrences(boost::dynamic_bitset& bitset, + markOccurrences(boost::dynamic_bitset& bitset, bool markOnlyStartPosition=false) const; count_type - markOccurrences(vector& dest) const; + markOccurrences(std::vector& dest) const; ::uint64_t getSequenceId() const; @@ -181,7 +181,7 @@ namespace ugdiss return this->size(); } - sptr > + sptr > randomSample(int level, size_t N) const; }; @@ -286,7 +286,7 @@ namespace ugdiss // display(root->corpus->getToken(U),"U1"); int x = root->corpus->cmp(U,L,lower.size()-1); - // cerr << "x=" << x << endl; + // cerr << "x=" << x << std::endl; if (x != 1) return false; lower.back() = upper.back(); @@ -359,10 +359,10 @@ namespace ugdiss TSA_tree_iterator:: TSA_tree_iterator(TSA const* s, TokenIndex const& V, - string const& key) + std::string const& key) : root(s) { - istringstream buf(key); string w; + std::istringstream buf(key); std::string w; while (buf >> w) { if (this->extend(V[w])) @@ -482,8 +482,8 @@ namespace ugdiss #if 0 tsa::ArrayEntry I; root->readEntry(lo,I); - cout << I.sid << " " << I.offset << endl; - cout << root->corpus->sntLen(I.sid) << endl; + cout << I.sid << " " << I.offset << std::endl; + cout << root->corpus->sntLen(I.sid) << std::endl; #endif hi = root->find_end(lo, hi, getToken(0), 1, 0); upper.push_back(hi); @@ -574,11 +574,11 @@ namespace ugdiss Token const* eos = root->corpus->sntEnd(A.sid); #endif if (p < 0) p += lower.size(); - // cerr << p << ". " << t->id() << endl; + // cerr << p << ". " << t->id() << std::endl; while (p-- > 0) { t = next(t); - // if (t) cerr << p << ". " << t->id() << endl; + // if (t) cerr << p << ". 
" << t->id() << std::endl; assert(t >= bos && t < eos); } return t; @@ -616,7 +616,7 @@ namespace ugdiss template count_type TSA_tree_iterator:: - fillBitSet(boost::dynamic_bitset& bitset) const + fillBitSet(boost::dynamic_bitset& bitset) const { return markSentences(bitset); } @@ -626,7 +626,7 @@ namespace ugdiss template count_type TSA_tree_iterator:: - markSentences(boost::dynamic_bitset& bitset) const + markSentences(boost::dynamic_bitset& bitset) const { assert(root && root->corpus); bitset.resize(root->corpus->size()); @@ -653,7 +653,7 @@ namespace ugdiss template count_type TSA_tree_iterator:: - markOccurrences(boost::dynamic_bitset& bitset, bool markOnlyStartPosition) const + markOccurrences(boost::dynamic_bitset& bitset, bool markOnlyStartPosition) const { assert(root && root->corpus); if (bitset.size() != root->corpus->numTokens()) @@ -669,7 +669,7 @@ namespace ugdiss template count_type TSA_tree_iterator:: - markOccurrences(vector& dest) const + markOccurrences(std::vector& dest) const { assert(root && root->corpus); assert(dest.size() == root->corpus->numTokens()); @@ -700,7 +700,7 @@ namespace ugdiss count_type TSA_tree_iterator:: markEndOfSequence(Token const* start, Token const* stop, - boost::dynamic_bitset& dest) const + boost::dynamic_bitset& dest) const { count_type matchCount=0; Token const* a = getToken(0); @@ -769,7 +769,7 @@ namespace ugdiss } template - string + std::string TSA_tree_iterator:: str(TokenIndex const* V, int start, int stop) const { @@ -779,7 +779,7 @@ namespace ugdiss assert(start>=0 && start < int(this->size())); assert(stop > 0 && stop <= int(this->size())); Token const* x = this->getToken(0); - ostringstream buf; + std::ostringstream buf; for (int i = start; i < stop; ++i, x = x->next()) { assert(x); @@ -802,7 +802,7 @@ namespace ugdiss assert(start>=0 && start < int(this->size())); assert(stop > 0 && stop <= int(this->size())); Token const* x = this->getToken(0); - ostringstream buf; + std::ostringstream buf; for (int i = start; i < stop; ++i, x = x->next()) { assert(x); @@ -899,15 +899,15 @@ namespace ugdiss /// randomly select up to N occurrences of the sequence template - sptr > + sptr > TSA_tree_iterator:: randomSample(int level, size_t N) const { if (level < 0) level += lower.size(); assert(level >=0); - sptr > - ret(new vector(N)); + sptr > + ret(new std::vector(N)); size_t m=0; // number of samples selected so far typename Token::ArrayEntry I(lower.at(level)); diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_base.h b/moses/TranslationModel/UG/mm/ug_ttrack_base.h index d087a9e58..9668bee0e 100644 --- a/moses/TranslationModel/UG/mm/ug_ttrack_base.h +++ b/moses/TranslationModel/UG/mm/ug_ttrack_base.h @@ -22,7 +22,7 @@ namespace ugdiss { - using namespace std; + // using namespace std; typedef boost::dynamic_bitset bdBitset; @@ -39,12 +39,12 @@ namespace ugdiss } template - string + std::string toString(TokenIndex const& V, Token const* x, size_t const len) { if (!len) return ""; UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!"); - ostringstream buf; + std::ostringstream buf; buf << V[x->id()]; size_t i = 1; for (x = x->next(); x && i < len; ++i, x = x->next()) @@ -100,10 +100,10 @@ namespace ugdiss endPos(id_type sid) const { return sntEnd(sid)-sntStart(0); } /** Don't use this unless you want a copy of the sentence */ - vector + std::vector operator[](id_type sid) const { - return vector(sntStart(sid),sntEnd(sid)); + return std::vector(sntStart(sid),sntEnd(sid)); } /** @return size of corpus in number of sentences */ @@ -114,9 
+114,9 @@ namespace ugdiss /** @return string representation of sentence /sid/ * Currently only defined for Ttrack */ - string str(id_type sid, TokenIndex const& T) const; + std::string str(id_type sid, TokenIndex const& T) const; - string pid2str(TokenIndex const* V, uint64_t pid) const; + std::string pid2str(TokenIndex const* V, uint64_t pid) const; // /** @return string representation of sentence /sid/ // * Currently only defined for Ttrack */ @@ -124,8 +124,8 @@ namespace ugdiss /** counts the tokens in the corpus; used for example in the construction of * token sequence arrays */ - count_type count_tokens(vector& cnt, bdBitset const* filter, - int lengthCutoff=0, ostream* log=NULL) const; + count_type count_tokens(std::vector& cnt, bdBitset const* filter, + int lengthCutoff=0, std::ostream* log=NULL) const; // static id_type toID(TKN const& t); @@ -171,8 +171,8 @@ namespace ugdiss template count_type Ttrack:: - count_tokens(vector& cnt, bdBitset const* filter, - int lengthCutoff, ostream* log) const + count_tokens(std::vector& cnt, bdBitset const* filter, + int lengthCutoff, std::ostream* log) const { bdBitset filter2; if (!filter) @@ -199,7 +199,7 @@ namespace ugdiss { if (log) *log << "WARNING: skipping sentence #" << sid - << " with more than 65536 tokens" << endl; + << " with more than 65536 tokens" << std::endl; expectedTotal -= stop-k; } else @@ -207,7 +207,7 @@ namespace ugdiss totalCount += stop-k; for (; k < stop; ++k) { - // cout << sid << " " << stop-k << " " << k->lemma << " " << k->id() << " " << sizeof(*k) << endl; + // cout << sid << " " << stop-k << " " << k->lemma << " " << k->id() << " " << sizeof(*k) << std::endl; id_type wid = k->id(); while (wid >= cnt.size()) cnt.push_back(0); cnt[wid]++; @@ -217,8 +217,8 @@ namespace ugdiss if (this->size() == filter->count()) { if (totalCount != expectedTotal) - cerr << "OOPS: expected " << expectedTotal - << " tokens but counted " << totalCount << endl; + std::cerr << "OOPS: expected " << expectedTotal + << " tokens but counted " << totalCount << std::endl; assert(totalCount == expectedTotal); } return totalCount; @@ -244,25 +244,25 @@ namespace ugdiss int ret=-1; #if 0 - cerr << "A: "; for (TKN const* x = a; x; x = next(x)) cerr << x->lemma << " "; cerr << endl; - cerr << "B: "; for (TKN const* x = b; x; x = next(x)) cerr << x->lemma << " "; cerr << endl; + cerr << "A: "; for (TKN const* x = a; x; x = next(x)) cerr << x->lemma << " "; cerr << std::endl; + cerr << "B: "; for (TKN const* x = b; x; x = next(x)) cerr << x->lemma << " "; cerr << std::endl; #endif while (a >= bosA && a < eosA) { - // cerr << keyLength << "a. " << (a ? a->lemma : 0) << " " << (b ? b->lemma : 0) << endl; + // cerr << keyLength << "a. " << (a ? a->lemma : 0) << " " << (b ? b->lemma : 0) << std::endl; if (*a < *b) { break; } // return -1; if (*a > *b) { ret = 2; break; } // return 2; a = next(a); b = next(b); - // cerr << keyLength << "b. " << (a ? a->lemma : 0) << " " << (b ? b->lemma : 0) << endl; + // cerr << keyLength << "b. " << (a ? a->lemma : 0) << " " << (b ? b->lemma : 0) << std::endl; if (--keyLength==0 || b < bosB || b >= eosB) { ret = (a < bosA || a >= eosA) ? 0 : 1; break; } } - // cerr << "RETURNING " << ret << endl; + // cerr << "RETURNING " << ret << std::endl; return ret; } @@ -312,7 +312,7 @@ namespace ugdiss { cout << t2->lemma << "." << int(t2->minpos) << " " << k->lemma << "." 
<< int(k->minpos) << " " - << t2->cmp(*k) << endl; + << t2->cmp(*k) << std::endl; } } #endif @@ -382,7 +382,7 @@ namespace ugdiss } template - string + std::string Ttrack:: pid2str(TokenIndex const* V, uint64_t pid) const { @@ -390,7 +390,7 @@ namespace ugdiss pid >>= 16; uint32_t off = pid % (1<<16); uint32_t sid = pid>>16; - ostringstream buf; + std::ostringstream buf; TKN const* t = sntStart(sid) + off; TKN const* stop = t + len; if (V) diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_position.h b/moses/TranslationModel/UG/mm/ug_ttrack_position.h index 6d473f263..09eb1508f 100644 --- a/moses/TranslationModel/UG/mm/ug_ttrack_position.h +++ b/moses/TranslationModel/UG/mm/ug_ttrack_position.h @@ -57,13 +57,13 @@ namespace ugdiss cout << "A: " << z->id(); for (z = next(z); z >= bosA && z < eosA; z = next(z)) cout << "-" << z->id(); - cout << endl; + cout << std::endl; z = b; cout << "B: " << z->id(); for (z = next(z); z >= bosB && z < eosB; z = next(z)) cout << "-" << z->id(); - cout << endl; + cout << std::endl; #endif while (*a == *b) { @@ -76,7 +76,7 @@ namespace ugdiss } int x = a->cmp(*b); - // cout << " " << (x < 0 ? "YES" : "NO") << endl; + // cout << " " << (x < 0 ? "YES" : "NO") << std::endl; assert (x != 0); return x < 0; diff --git a/moses/TranslationModel/UG/mm/ug_typedefs.h b/moses/TranslationModel/UG/mm/ug_typedefs.h index 0181bef9e..78d78efe4 100644 --- a/moses/TranslationModel/UG/mm/ug_typedefs.h +++ b/moses/TranslationModel/UG/mm/ug_typedefs.h @@ -10,24 +10,24 @@ #include "tpt_typedefs.h" namespace ugdiss { - using namespace std; + // using namespace std; typedef boost::dynamic_bitset bitvector; - typedef vector > flt_2d_table; - typedef vector flt_3d_table; - typedef vector flt_4d_table; + typedef std::vector > flt_2d_table; + typedef std::vector flt_3d_table; + typedef std::vector flt_4d_table; - typedef vector > ushort_2d_table; - typedef vector ushort_3d_table; - typedef vector ushort_4d_table; + typedef std::vector > ushort_2d_table; + typedef std::vector ushort_3d_table; + typedef std::vector ushort_4d_table; - typedef vector > short_2d_table; - typedef vector short_3d_table; - typedef vector short_4d_table; + typedef std::vector > short_2d_table; + typedef std::vector short_3d_table; + typedef std::vector short_4d_table; - typedef vector > int_2d_table; - typedef vector int_3d_table; - typedef vector int_4d_table; + typedef std::vector > int_2d_table; + typedef std::vector int_3d_table; + typedef std::vector int_4d_table; } #define sptr boost::shared_ptr diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 7b212c8df..132b720ba 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -296,6 +296,7 @@ namespace Moses load_extra_data(string bname, bool locking = true) { using namespace boost; + using namespace ugdiss; // TO DO: ADD CHECKS FOR ROBUSTNESS // - file existence? // - same number of lines? 
@@ -701,7 +702,7 @@ namespace Moses #if 1 if (m_bias_log && m_lr_func && m_bias_loglevel > 3) { - typename PhrasePair::SortDescendingByJointCount sorter; + PhrasePair::SortDescendingByJointCount sorter; sort(ppfix.begin(), ppfix.end(),sorter); BOOST_FOREACH(PhrasePair const& pp, ppfix) { diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index 81687cc50..82003a24f 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -145,7 +145,7 @@ namespace Moses std::vector > wlex21; // word translation lexicon (without counts, get these from calc_lex.COOC) - typedef mm2dTable mm2dtable_t; + typedef ugdiss::mm2dTable mm2dtable_t; mm2dtable_t COOCraw; TargetPhrase* diff --git a/moses/TranslationModel/UG/sapt_pscore_lex1.h b/moses/TranslationModel/UG/sapt_pscore_lex1.h index 76ca2a9a4..eb3b20c71 100644 --- a/moses/TranslationModel/UG/sapt_pscore_lex1.h +++ b/moses/TranslationModel/UG/sapt_pscore_lex1.h @@ -16,7 +16,7 @@ namespace Moses { float m_alpha; string m_lexfile; public: - LexicalPhraseScorer2 scorer; + ugdiss::LexicalPhraseScorer2 scorer; PScoreLex1(string const& alphaspec, string const& lexfile) { diff --git a/moses/TranslationModel/UG/sapt_pscore_pbwd.h b/moses/TranslationModel/UG/sapt_pscore_pbwd.h index 9366777ef..2cbe58209 100644 --- a/moses/TranslationModel/UG/sapt_pscore_pbwd.h +++ b/moses/TranslationModel/UG/sapt_pscore_pbwd.h @@ -27,7 +27,7 @@ namespace Moses { { if (x == '+') { --checksum; continue; } if (x != 'g' && x != 's' && x != 'r') continue; - string s = (format("pbwd-%c%.3f") % x % c).str(); + string s = (boost::format("pbwd-%c%.3f") % x % c).str(); this->m_feature_names.push_back(s); } this->m_num_feats = this->m_feature_names.size(); diff --git a/moses/TranslationModel/UG/sapt_pscore_pfwd.h b/moses/TranslationModel/UG/sapt_pscore_pfwd.h index c5de210a1..95956e861 100644 --- a/moses/TranslationModel/UG/sapt_pscore_pfwd.h +++ b/moses/TranslationModel/UG/sapt_pscore_pfwd.h @@ -28,7 +28,7 @@ namespace Moses { { if (x == '+') { --checksum; continue; } if (x != 'g' && x != 's' && x != 'r') continue; - string s = (format("pfwd-%c%.3f") % x % c).str(); + string s = (boost::format("pfwd-%c%.3f") % x % c).str(); this->m_feature_names.push_back(s); } this->m_num_feats = this->m_feature_names.size(); diff --git a/moses/TranslationModel/UG/sapt_pscore_unaligned.h b/moses/TranslationModel/UG/sapt_pscore_unaligned.h index 8dceb1ad0..3388c6b7d 100644 --- a/moses/TranslationModel/UG/sapt_pscore_unaligned.h +++ b/moses/TranslationModel/UG/sapt_pscore_unaligned.h @@ -12,7 +12,7 @@ namespace Moses { class PScoreUnaligned : public PhraseScorer { - typedef boost::dynamic_bitset bitvector; + typedef boost::dynamic_bitset bitvector; public: PScoreUnaligned(string const spec) { diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index 3848f81ba..bc2b5032b 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -1,5 +1,5 @@ #include "TranslationRequest.h" -#include "moses/ContextScope.h" +#include "moses/ContextScope.h" #include namespace MosesServer diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp index 94a5a0480..9537b970f 100644 --- a/phrase-extract/ScoreFeatureTest.cpp +++ b/phrase-extract/ScoreFeatureTest.cpp @@ -24,9 +24,10 @@ #define BOOST_TEST_MODULE MosesTrainingScoreFeature #include #include +#include -#include -#include +//#include +//#include using namespace MosesTraining; using namespace std; @@ -54,16 +55,16 
@@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except) //Check that configure rejects illegal domain arg combinations ScoreFeatureManager manager; BOOST_CHECK_THROW( - manager.configure( {"--DomainRatio","/dev/null","--DomainIndicator","/dev/null"}), + manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--SparseDomainSubset","/dev/null","--SparseDomainRatio","/dev/null"}), + manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--SparseDomainBlah","/dev/null"}), + manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")), ScoreFeatureArgumentException); BOOST_CHECK_THROW( - manager.configure( {"--DomainSubset"}), + manager.configure(boost::assign::list_of("--DomainSubset")), ScoreFeatureArgumentException); } @@ -97,25 +98,27 @@ T adder(T first, Args... args) BOOST_AUTO_TEST_CASE(manager_config_domain) { checkDomainConfigured - ( {"--DomainRatio","/dev/null"}); + (boost::assign::list_of("--DomainRatio")("/dev/null")); checkDomainConfigured - ( {"--DomainIndicator","/dev/null"}); + (boost::assign::list_of("--DomainIndicator")("/dev/null")); checkDomainConfigured - ( {"--DomainSubset","/dev/null"}); + (boost::assign::list_of("--DomainSubset")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainRatio","/dev/null"}); + (boost::assign::list_of("--SparseDomainRatio")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainIndicator","/dev/null"}); + (boost::assign::list_of("--SparseDomainIndicator")("/dev/null")); checkDomainConfigured - ( {"--SparseDomainSubset","/dev/null"}); + (boost::assign::list_of("--SparseDomainSubset")("/dev/null")); + /* + // C++11 testing unordered_set s; s.insert(4); s.insert(7); s.insert(4); s.insert(1); -for (auto i: s) { + for (auto i: s) { cerr << i << " "; } @@ -124,7 +127,7 @@ for (auto i: s) { m["ba"] = 6; m["aabc"] = 7; -for (auto i: m) { + for (auto i: m) { cerr << i.first << "=" << i.second << " "; } @@ -132,6 +135,6 @@ for (auto i: m) { std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy"; std::string ssum = adder(s1, s2, s3, s4); - + */ } From 070e0a83e186b752c3b2d9178169574fa617ac79 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Thu, 2 Jul 2015 16:57:17 +0100 Subject: [PATCH 102/286] LD_PRELOAD systax was wrong --- contrib/moses-speedtest/runtests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/moses-speedtest/runtests.py b/contrib/moses-speedtest/runtests.py index a043a26db..610e0f505 100644 --- a/contrib/moses-speedtest/runtests.py +++ b/contrib/moses-speedtest/runtests.py @@ -255,7 +255,7 @@ def execute_tests(testcase, cur_directory, config): subprocess.call([config.drop_caches], shell=True) #Create the command for executing moses: - whole_command = 'LD_PRELOAD ' + opt + time_command + testcase.command + whole_command = 'LD_PRELOAD=' + opt + time_command + testcase.command variant = 'ldpre_' + opt #test normal and cached @@ -282,7 +282,7 @@ def execute_tests(testcase, cur_directory, config): subprocess.call([config.drop_caches], shell=True) #Create the command for executing moses: - whole_command = 'LD_PRELOAD ' + opt + testcase.prof_command + whole_command = 'LD_PRELOAD=' + opt + testcase.prof_command variant = 'profile_ldpre_' + opt #test normal and cached From b05ca8cb807e28319ca5e6b370e086216004567c Mon Sep 17 00:00:00 2001 
From: Ulrich Germann Date: Thu, 2 Jul 2015 18:06:55 +0100 Subject: [PATCH 103/286] Fixes to make code compile on various versions of gcc. --- moses/TargetPhrase.cpp | 2 +- moses/TargetPhrase.h | 2 +- moses/TranslationModel/UG/mm/ug_mm_2d_table.h | 2 +- moses/TranslationModel/UG/mm/ug_mm_ttrack.h | 5 +++-- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index fc7af9687..251a9db7d 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -181,7 +181,7 @@ bool TargetPhrase::HasTtaskSPtr() const { return m_ttask_flag; } -const ttasksptr& TargetPhrase::GetTtask() const { +const ttasksptr TargetPhrase::GetTtask() const { return m_ttask.lock(); } diff --git a/moses/TargetPhrase.h b/moses/TargetPhrase.h index 460dcc33f..554ac2793 100644 --- a/moses/TargetPhrase.h +++ b/moses/TargetPhrase.h @@ -92,7 +92,7 @@ public: TargetPhrase(ttasksptr &ttask, const PhraseDictionary *pt = NULL); TargetPhrase(ttasksptr &ttask, std::string out_string, const PhraseDictionary *pt = NULL); explicit TargetPhrase(ttasksptr &ttask, const Phrase &targetPhrase, const PhraseDictionary *pt); - const ttasksptr& GetTtask() const; + ttasksptr const GetTtask() const; bool HasTtaskSPtr() const; ~TargetPhrase(); diff --git a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h index e2284382e..0ae16895b 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h +++ b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h @@ -103,7 +103,7 @@ namespace ugdiss operator[](ID key) const { if (start==stop) return INIT(0); - Cell const* c = lower_bound(start,stop,key); + Cell const* c = std::lower_bound(start,stop,key); return (c != stop && c->id == key ? c->val : INIT(0)); } diff --git a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h index 91167822d..b87d638b2 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h +++ b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h @@ -21,6 +21,7 @@ #include "ug_ttrack_base.h" #include "num_read_write.h" #include "ug_load_primer.h" +#include "ug_tsa_base.h" namespace ugdiss { @@ -193,7 +194,7 @@ namespace ugdiss findSid(TKN const* t) const { id_type tokenPos = t-data; - id_type const* p = upper_bound(index,index+this->numSent,tokenPos); + id_type const* p = std::upper_bound(index,index+this->numSent,tokenPos); assert(p>index); return p-index-1; } @@ -203,7 +204,7 @@ namespace ugdiss mmTtrack:: findSid(id_type tokenPos) const { - id_type const* p = upper_bound(index,index+this->numSent,tokenPos); + id_type const* p = std::upper_bound(index,index+this->numSent,tokenPos); assert(p>index); return p-index-1; } From 64ec34df5d427ead0e2281d45bae5891c0adb786 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 2 Jul 2015 23:49:00 +0100 Subject: [PATCH 104/286] Proper indentation with spaces (no tabs). 
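Background note on the GetTtask() signature change in PATCH 103 above: boost::weak_ptr::lock() returns a temporary shared_ptr, so the old "const ttasksptr&" return type handed callers a reference that dangled as soon as the temporary was destroyed; returning the shared_ptr by value is the safe signature. The following standalone sketch illustrates the point only -- Task and Holder are invented stand-ins, not the Moses classes touched by that patch.

// Illustrative sketch only -- not Moses code.
#include <boost/shared_ptr.hpp>
#include <boost/weak_ptr.hpp>
#include <iostream>

struct Task { int id; };

class Holder
{
  boost::weak_ptr<Task> m_task;
public:
  explicit Holder(boost::shared_ptr<Task> const& t) : m_task(t) {}

  // Returning by value is the safe signature: lock() materialises a temporary
  // shared_ptr, and the returned copy keeps the Task alive for as long as the
  // caller holds it (or is empty if the Task is already gone). Returning
  // 'boost::shared_ptr<Task> const&' here would bind to that temporary and dangle.
  boost::shared_ptr<Task> get() const { return m_task.lock(); }
};

int main()
{
  boost::shared_ptr<Task> t(new Task());
  t->id = 42;
  Holder h(t);
  boost::shared_ptr<Task> p = h.get();
  if (p) std::cout << "task " << p->id << " is still alive" << std::endl;
  return 0;
}

std::weak_ptr behaves the same way; boost is used here only because it matches the rest of the codebase.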
--- .../TranslationModel/UG/mm/ug_prep_phrases.h | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_prep_phrases.h b/moses/TranslationModel/UG/mm/ug_prep_phrases.h index 421d6f090..93a5ea82a 100644 --- a/moses/TranslationModel/UG/mm/ug_prep_phrases.h +++ b/moses/TranslationModel/UG/mm/ug_prep_phrases.h @@ -46,32 +46,32 @@ struct StatsCollector if (!lcache) lcache.reset(new lcache_t); if (m.down()) { - do - { - if (!r.extend(m.getToken(-1)->id())) continue; - this->process(m, r); - uint64_t pid = r.getPid(); - sptr stats; - if (hcache) stats = hcache->get(pid); - if (!stats && pcache) - { - sptr const* foo = pcache->get(pid); - if (foo) stats = *foo; - } - if (!stats) // need to sample - { - BitextSampler s(bitext.get(), r, bias, sample_size, method); - stats = s.stats(); - if (hcache) hcache->set(pid,stats); - if (pcache && r.ca() >= pcache_th) pcache->set(pid,stats); - if (tpool) tpool->add(s); - else s(); - } - (*lcache)[pid] = stats; - r.up(); - } - while (m.over()); - m.up(); + do + { + if (!r.extend(m.getToken(-1)->id())) continue; + this->process(m, r); + uint64_t pid = r.getPid(); + sptr stats; + if (hcache) stats = hcache->get(pid); + if (!stats && pcache) + { + sptr const* foo = pcache->get(pid); + if (foo) stats = *foo; + } + if (!stats) // need to sample + { + BitextSampler s(bitext.get(), r, bias, sample_size, method); + stats = s.stats(); + if (hcache) hcache->set(pid,stats); + if (pcache && r.ca() >= pcache_th) pcache->set(pid,stats); + if (tpool) tpool->add(s); + else s(); + } + (*lcache)[pid] = stats; + r.up(); + } + while (m.over()); + m.up(); } } }; From 1c25b29ebb05331b7ecfc016783d1ec19a5fc04e Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 2 Jul 2015 23:55:14 +0100 Subject: [PATCH 105/286] Show from which documents phrase translations were collected. --- .../UG/test-ranked-phrase-lookup.cc | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/moses/TranslationModel/UG/test-ranked-phrase-lookup.cc b/moses/TranslationModel/UG/test-ranked-phrase-lookup.cc index dd0468d20..613e46360 100644 --- a/moses/TranslationModel/UG/test-ranked-phrase-lookup.cc +++ b/moses/TranslationModel/UG/test-ranked-phrase-lookup.cc @@ -83,14 +83,14 @@ int main(int argc, char* argv[]) sptr icrp = read_input(*B.V1); imtsa newIdx(icrp,NULL); sptr bias = prime_sampling1(*B.I1, newIdx, 5000, B.sid2did()); - cerr << "primed" << endl; - ug::ThreadPool T(boost::thread::hardware_concurrency()); + cerr << "primed " << endl; + ug::ThreadPool T(1); // boost::thread::hardware_concurrency()); TSA::tree_iterator m(&newIdx); // dump(m, *B.V1); // exit(0); TSA::tree_iterator r(B.I1.get()); StatsCollector collect(Bptr, bias); - collect.tpool = &T; + // collect.tpool = &T; collect.process(m, r); typedef PhrasePair::SortDescendingByJointCount sorter_t; @@ -119,7 +119,14 @@ int main(int argc, char* argv[]) PhrasePair const& pp = pplist[i]; // if (pp.joint == 1) break; cout << boost::format(" %6d %.5f | ") % pp.joint % pp.cum_bias - << toString(*B.V2, pp.start2, pp.len2) << endl; + << toString(*B.V2, pp.start2, pp.len2) + << " ["; + for (size_t d = 0; d < pp.indoc.size(); ++d) + { + if (d) cout << ":"; + cout << pp.indoc[d]; + } + cout << "]" << endl; if (++ctr == 5) break; } } From f78bb4a6e93297d56408e085b9311e1c23aa548c Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 2 Jul 2015 23:56:53 +0100 Subject: [PATCH 106/286] Bigger K-best list to accommodate phrase extraction failure. 
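The change below makes ranked sampling over-fill its candidate list (20 * m_samples instead of m_samples) and lets consider_sample() report success or failure, so candidates whose target phrase cannot be extracted no longer shrink the effective sample. A minimal standalone sketch of that pattern follows; the Candidate type, the scores and the extract() predicate are invented stand-ins for the Moses internals (the bias-sorted NBestList and consider_sample()).

// Illustrative sketch only -- not Moses code.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

struct Candidate { int id; float score; };

struct ByScoreDescending {
  bool operator()(Candidate const& a, Candidate const& b) const
  { return a.score > b.score; }
};

// Stand-in for consider_sample(): extraction can fail, e.g. when the aligned
// target span is not a coherent phrase. The failure pattern here is invented.
bool extract(Candidate const& c) { return c.id % 3 != 0; }

int main()
{
  std::size_t const n_wanted = 5;    // plays the role of m_samples
  std::size_t const headroom = 20;   // same head-room factor as in the patch

  std::vector<Candidate> pool;
  for (int i = 0; i < 200; ++i) {
    Candidate c; c.id = i; c.score = 1.0f / float(1 + i);
    pool.push_back(c);
  }

  // Keep the top headroom * n_wanted candidates by score instead of exactly
  // n_wanted, so that extraction failures can be compensated for ...
  std::size_t keep = std::min(pool.size(), headroom * n_wanted);
  std::partial_sort(pool.begin(), pool.begin() + keep, pool.end(),
                    ByScoreDescending());

  // ... then walk them in rank order and count only successful extractions.
  std::vector<Candidate> sample;
  for (std::size_t i = 0; i < keep && sample.size() < n_wanted; ++i)
    if (extract(pool[i])) sample.push_back(pool[i]);

  std::cout << "collected " << sample.size() << " of " << n_wanted
            << " requested samples" << std::endl;
  return 0;
}

In the patch itself the factor 20 is simply head-room chosen to make it unlikely that failed extractions exhaust the list; the loop still stops as soon as m_samples successful extractions have been collected.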
--- .../UG/mm/ug_bitext_sampler.h | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h index 22fd97056..628c2059f 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h @@ -56,7 +56,7 @@ namespace bitext size_t m_ctr; // number of samples considered float m_total_bias; // for random sampling with bias bool m_finished; - void consider_sample(TokenPosition const& p); + bool consider_sample(TokenPosition const& p); size_t perform_ranked_sampling(); public: @@ -93,6 +93,7 @@ namespace bitext m_stats.reset(new pstats); m_stats->raw_cnt = phrase.ca(); m_stats->register_worker(); + // cerr << phrase.str(bitext->V1.get()) << " [" << HERE << "]" << endl; } template @@ -128,22 +129,24 @@ namespace bitext { if (m_next == m_stop) return m_ctr; CandidateSorter sorter(*m_bias); - NBestList nbest(m_samples,sorter); + // below: nbest size = 20 * m_samples to allow for failed phrase extraction + NBestList nbest(20*m_samples,sorter); ugdiss::tsa::ArrayEntry I(m_next); while (I.next < m_stop) { ++m_ctr; nbest.add(m_root->readEntry(I.next,I)); } - for (size_t i = 0; i < nbest.size(); ++i) - consider_sample(nbest.get_unsorted(i)); + size_t n = 0; + for (size_t i = 0; n < m_samples && i < nbest.size(); ++i) + if (consider_sample(nbest[i])) ++n;; // cerr << m_ctr << " samples considered at " // << __FILE__ << ":" << __LINE__ << endl; return m_ctr; } template - void + bool BitextSampler:: consider_sample(TokenPosition const& p) { @@ -152,11 +155,18 @@ namespace bitext PhraseExtractionRecord rec(p.sid, p.offset, p.offset + m_plen, !m_fwd, &aln, &full_aln); int docid = m_bias ? m_bias->GetClass(p.sid) : -1; + bool good = m_bitext->find_trg_phr_bounds(rec); + +#if 0 + cerr << p.sid << " " << docid << " " + << (good ? "OK " : "bad ") + << __FILE__ << ":" << __LINE__ << endl; +#endif if (!good) { // no good, probably because phrase is not coherent m_stats->count_sample(docid, 0, rec.po_fwd, rec.po_bwd); - return; + return false; } // all good: register this sample as valid @@ -203,6 +213,7 @@ namespace bitext for (size_t k = 1; k < aln.size(); k += 2) --aln[k]; } + return true; } template From b67a8a4fcb40d691c01c266a1fd4ba0ff50838a7 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Fri, 3 Jul 2015 00:00:41 +0100 Subject: [PATCH 107/286] daily automatic beautifier --- moses/WordsBitmapTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/WordsBitmapTest.cpp b/moses/WordsBitmapTest.cpp index fd69e8fe3..0a7bd7324 100644 --- a/moses/WordsBitmapTest.cpp +++ b/moses/WordsBitmapTest.cpp @@ -70,7 +70,7 @@ BOOST_AUTO_TEST_CASE(getset) BOOST_CHECK_EQUAL(wbm.GetValue(2),false); wbm.SetValue(2,true); BOOST_CHECK_EQUAL(wbm.GetValue(2),true); - + wbm.SetValue(1,3,true); BOOST_CHECK_EQUAL(wbm.GetValue(1),true); From 9dae3eb78520b7d574c8415a4fbebd84e37383bd Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 3 Jul 2015 15:11:21 +0100 Subject: [PATCH 108/286] Code cleanup. 
--- .../UG/mm/ug_sampling_bias.cc | 58 ------------------- 1 file changed, 58 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index f5e660456..15eb78343 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -19,30 +19,9 @@ namespace Moses { using ugdiss::id_type; - size_t ca_write_callback(void *ptr, size_t size, size_t nmemb, - std::string* response) - { - char const* c = reinterpret_cast(ptr); - *response += std::string(c, size * nmemb); - return size * nmemb; - } - std::string query_bias_server(std::string const& server, std::string const& context) { -#if 0 - std::string query = server + uri_encode(context); - std::string response; - - CURL* curl = curl_easy_init(); - UTIL_THROW_IF2(!curl, "Could not init curl."); - curl_easy_setopt(curl, CURLOPT_URL, query.c_str()); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, ca_write_callback); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response); - CURLcode res = curl_easy_perform(curl); - curl_easy_cleanup(curl); - return response; -#else std::string query = server+uri_encode(context); boost::asio::io_service io_service; Moses::http_client c(io_service, query); @@ -52,43 +31,8 @@ namespace Moses std::cerr << "SERVER RESPONSE: " << response << std::endl; return c.content(); -#endif } -// // #ifdef WITH_MMT_BIAS_CLIENT -// std::string -// query_bias_server(std::string const& url, std::string const& text) -// { -// #if 1 -// std::string query = url+uri_encode(text); -// boost::asio::io_service io_service; -// Moses::http_client c(io_service, query); -// io_service.run(); - -// std::string response = c.content(); -// std::cerr << "SERVER RESPONSE: " << response << std::endl; - -// return c.content(); -// #else -// return ""; -// #endif -// } -// // #endif - - - // std::string - // query_bias_server(std::string const& url, int const port, - // std::string const& context, - // std::string const& src_lang) - // { - // char* response - // = ca_get_context(url.c_str(), port, context.c_str(), src_lang.c_str()); - // UTIL_THROW_IF2(!response, "No response from server"); - // std::string json = response; - // free(response); - // return json; - // } - DocumentBias ::DocumentBias ( std::vector const& sid2doc, @@ -98,14 +42,12 @@ namespace Moses : m_sid2docid(sid2doc) , m_bias(docname2docid.size(), 0) { - // #ifdef HAVE_CURLPP Timer timer; if (log) timer.start(NULL); std::string json = query_bias_server(server_url, text); std::cerr << "SERVER RESPONSE " << json << std::endl; init_from_json(json, docname2docid, log); if (log) *log << "Bias query took " << timer << " seconds." << std::endl; - // #endif } DocumentBias From e1f31666c3c9b37ff299c7d12e3be1c1cd151f07 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 3 Jul 2015 17:20:27 +0100 Subject: [PATCH 109/286] Fixes to make things compile after merging with branch mmt-dev. 
--- moses/TranslationModel/UG/mm/ug_bitext.h | 2 +- .../UG/mm/ug_bitext_jstats.cc | 2 +- .../TranslationModel/UG/mm/ug_bitext_jstats.h | 8 +++--- .../mm/ug_bitext_phrase_extraction_record.h | 6 ++--- .../UG/mm/ug_bitext_sampler.h | 2 +- .../UG/mm/ug_sampling_bias.cc | 25 +++++++++---------- moses/TranslationModel/UG/mmsapt.cpp | 2 +- 7 files changed, 24 insertions(+), 23 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index b9178edd3..de56c429e 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -363,7 +363,7 @@ namespace Moses { size_t & s1, size_t & s2, // beginning and end of target start size_t & e1, size_t & e2, // beginning and end of target end int& po_fwd, int& po_bwd, // phrase orientations - std::vector * core_alignment, // stores the core alignment + std::vector * core_alignment, // stores the core alignment bitvector* full_alignment, // stores full word alignment for this sent. bool const flip) const // flip source and target (reverse lookup) { diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc index 803d41ac1..a5debcec8 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc @@ -56,7 +56,7 @@ namespace Moses void jstats:: - add(float w, float b, vector const& a, uint32_t const cnt2, + add(float w, float b, std::vector const& a, uint32_t const cnt2, uint32_t fwd_orient, uint32_t bwd_orient, int const docid) { boost::lock_guard lk(this->lock); diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h index bd6fcfcd1..2984c9293 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h @@ -42,9 +42,11 @@ namespace Moses float bcnt() const; // cumulative bias scores std::vector > > const & aln() const; - void add(float w, std::vector const& a, uint32_t const cnt2, - uint32_t fwd_orient, uint32_t bwd_orient, - int const docid); + + void + add(float w, float b, std::vector const& a, uint32_t const cnt2, + uint32_t fwd_orient, uint32_t bwd_orient, int const docid); + void invalidate(); void validate(); bool valid(); diff --git a/moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h b/moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h index 646875859..390ccbf5c 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h @@ -13,12 +13,12 @@ namespace Moses bool const flip; // 'backward' lookup from L2 size_t s1, s2, e1, e2; // soft and hard boundaries of target phrase int po_fwd, po_bwd; // fwd and bwd phrase orientation - std::vector* aln; // local alignments - bitvector* full_aln; // full word alignment for sentence + std::vector* aln; // local alignments + ugdiss::bitvector* full_aln; // full word alignment for sentence PhraseExtractionRecord(size_t const xsid, size_t const xstart, size_t const xstop, bool const xflip, - std::vector* xaln, bitvector* xfull_aln = NULL) + std::vector* xaln, ugdiss::bitvector* xfull_aln = NULL) : sid(xsid), start(xstart), stop(xstop), flip(xflip) , aln(xaln), full_aln(xfull_aln) { } }; diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h index 628c2059f..9333dd879 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h +++ 
b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h @@ -150,7 +150,7 @@ namespace bitext BitextSampler:: consider_sample(TokenPosition const& p) { - vector aln; + std::vector aln; bitvector full_aln(100*100); PhraseExtractionRecord rec(p.sid, p.offset, p.offset + m_plen, !m_fwd, &aln, &full_aln); diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index 60ba0314c..7bb2e2afe 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -49,16 +49,15 @@ namespace Moses } DocumentBias:: - DocumentBias - ( std::vector const& sid2doc, - std::map const& docname2docid, - std::string const& server_url, std::string const& text, - std::ostream* log) + DocumentBias(std::vector const& sid2doc, + std::map const& docname2docid, + std::string const& server_url, std::string const& text, + std::ostream* log) : SamplingBias(&sid2doc) , m_bias(docname2docid.size(), 0) { // #ifdef HAVE_CURLPP -#ifndef NO_MOSES + // #ifndef NO_MOSES Timer timer; if (log) timer.start(NULL); std::string json = query_bias_server(server_url, text); @@ -67,13 +66,13 @@ namespace Moses if (log) *log << "Bias query took " << timer << " seconds." << std::endl; } - DocumentBias - ::DocumentBias(std::vector const& sid2doc, - std::map const& docname2docid, - std::map const& context_weights, - std::ostream* log) - : m_sid2docid(sid2doc) - , m_bias(docname2docid.size(), 0) + DocumentBias:: + DocumentBias(std::vector const& sid2doc, + std::map const& docname2docid, + std::map const& context_weights, + std::ostream* log) + : SamplingBias(&sid2doc) + , m_bias(docname2docid.size(), 0) { init(context_weights, docname2docid); } diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index c2e973239..3fa76f4d8 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -841,7 +841,7 @@ namespace Moses context->bias_log = m_bias_log; } context->bias - = btfix.SetupDocumentBias(ttask->GetContextWeights(), m_bias_log); + = btfix->SetupDocumentBias(ttask->GetContextWeights(), m_bias_log); context->bias->loglevel = m_bias_loglevel; context->bias->log = m_bias_log; if (!context->cache1) context->cache1.reset(new pstats::cache_t); From 078c8f7fdbd0e80770155afd71b29c28de1bedcb Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Fri, 3 Jul 2015 17:51:00 +0100 Subject: [PATCH 110/286] Some fixes in the profiler config and added possibility to use google-profiler. Untested still, no documentaion --- contrib/moses-speedtest/runtests.py | 108 ++++++++++++++++++++++------ 1 file changed, 87 insertions(+), 21 deletions(-) diff --git a/contrib/moses-speedtest/runtests.py b/contrib/moses-speedtest/runtests.py index 610e0f505..f05506137 100644 --- a/contrib/moses-speedtest/runtests.py +++ b/contrib/moses-speedtest/runtests.py @@ -2,6 +2,7 @@ import os import subprocess import time +import shutil from argparse import ArgumentParser from testsuite_common import processLogLine @@ -26,16 +27,21 @@ def parse_cmd(): arguments = parser.parse_args() return arguments -def repoinit(testconfig, profiler=True): +def repoinit(testconfig, profiler=None): """Determines revision and sets up the repo. 
If given the profiler optional argument, wil init the profiler repo instead of the default one.""" revision = '' #Update the repo - if profiler: + if profiler == "gnu-profiler": if testconfig.repo_prof is not None: os.chdir(testconfig.repo_prof) else: raise ValueError('Profiling repo is not defined') + elif profiler == "google-profiler": + if testconfig.repo_gprof is not None: + os.chdir(testconfig.repo_gprof) + else: + raise ValueError('Profiling repo is not defined') else: os.chdir(testconfig.repo) #Checkout specific branch, else maintain main branch @@ -61,9 +67,10 @@ def repoinit(testconfig, profiler=True): class Configuration: """A simple class to hold all of the configuration constatns""" - def __init__(self, repo, drop_caches, tests, testlogs, basebranch, baserev, repo_prof=None): + def __init__(self, repo, drop_caches, tests, testlogs, basebranch, baserev, repo_prof=None, repo_gprof=None): self.repo = repo self.repo_prof = repo_prof + self.repo_gprof = repo_gprof self.drop_caches = drop_caches self.tests = tests self.testlogs = testlogs @@ -88,16 +95,17 @@ class Configuration: class Test: """A simple class to contain all information about tests""" - def __init__(self, name, command, ldopts, permutations, prof_command=None): + def __init__(self, name, command, ldopts, permutations, prof_command=None, gprof_command=None): self.name = name self.command = command self.prof_command = prof_command + self.gprof_command = gprof_command self.ldopts = ldopts.replace(' ', '').split(',') #Not tested yet self.permutations = permutations -def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None): +def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None, moses_gprof_repo=None): """Parses the config file""" - command, ldopts, prof_command = '', '', None + command, ldopts, prof_command, gprof_command = '', '', None, None permutations = [] fileopen = open(conffile, 'r') for line in fileopen: @@ -108,8 +116,10 @@ def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None): if opt == 'Command:': command = args.replace('\n', '') - if moses_prof is not None: # Get optional command for profiling + if moses_prof_repo is not None: # Get optional command for profiling prof_command = moses_prof_repo + '/bin/' + command + if moses_gprof_repo is not None: # Get optional command for google-perftools + gprof_command = moses_gprof_repo + '/bin' + command command = moses_repo + '/bin/' + command elif opt == 'LDPRE:': ldopts = args.replace('\n', '') @@ -118,14 +128,14 @@ def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None): else: raise ValueError('Unrecognized option ' + opt) #We use the testdir as the name. 
- testcase = Test(testdir, command, ldopts, permutations, prof_command) + testcase = Test(testdir, command, ldopts, permutations, prof_command, gprof_command) fileopen.close() return testcase def parse_testconfig(conffile): """Parses the config file for the whole testsuite.""" repo_path, drop_caches, tests_dir, testlog_dir = '', '', '', '' - basebranch, baserev, repo_prof_path = '', '', None + basebranch, baserev, repo_prof_path, repo_gprof_path = '', '', None, None fileopen = open(conffile, 'r') for line in fileopen: line = line.split('#')[0] # Discard comments @@ -146,10 +156,12 @@ def parse_testconfig(conffile): baserev = args.replace('\n', '') elif opt == 'MOSES_PROFILER_REPO:': # Optional repo_prof_path = args.replace('\n', '') + elif opt == 'MOSES_GOOGLE_PROFILER_REPO:': # Optional + repo_gprof_path = args.replace('\n', '') else: raise ValueError('Unrecognized option ' + opt) config = Configuration(repo_path, drop_caches, tests_dir, testlog_dir,\ - basebranch, baserev, repo_prof_path) + basebranch, baserev, repo_prof_path, repo_gprof_path) fileopen.close() return config @@ -160,7 +172,9 @@ def get_config(): config.additional_args(args.singletestdir, args.revision, args.branch) revision = repoinit(config) if config.repo_prof is not None: - repoinit(config, True) + repoinit(config, "gnu-profiler") + if config.repo_gprof is not None: + repoinit(config, "google-profiler") config.set_revision(revision) return config @@ -214,14 +228,25 @@ def write_gprof(command, name, variant, config): subprocess.call([gprof_command], shell=True) os.remove('gmon_path') # After we are done discard the gmon file -def execute_test(command, path, name, variant, config, profile=False): +def write_pprof(name, variant, config): + """Copies the google-perftools profiler output to the corresponding test directory""" + output_dir = config.testlogs + '/' + name + if not os.path.exists(output_dir): + os.makedirs(output_dir) + outputfile = output_dir + '/gprof_' + time.strftime("%d.%m.%Y_%H:%M:%S") + '_' + name + '_' + variant + shutil.move("/tmp/moses.prof", outputfile) + + +def execute_test(command, path, name, variant, config, profile=None): """Executes a testcase given a whole command, path to the test file output, name of the test and variant tested. 
Config is the global configuration""" subprocess.Popen([command], stdout=None, stderr=subprocess.PIPE, shell=True).communicate() - if not profile: + if profile is None: write_log(path, name + '_' + variant, config) - else: # Basically produce a gmon output + elif profile == "gnu-profiler": # Basically produce a gmon output write_gprof(command, name, variant, config) + elif profile == "google-profiler": + write_pprof(name, variant, config) def execute_tests(testcase, cur_directory, config): @@ -271,9 +296,9 @@ def execute_tests(testcase, cur_directory, config): if 'vanilla' in testcase.permutations: whole_command = testcase.prof_command - execute_test(whole_command, time_path, testcase.name, 'profile', config, True) + execute_test(whole_command, time_path, testcase.name, 'profile', config, "gnu-profiler") if 'cached' in testcase.permutations: - execute_test(whole_command, time_path, testcase.name, 'profile_cached', config, True) + execute_test(whole_command, time_path, testcase.name, 'profile_cached', config, "gnu-profiler") if 'ldpre' in testcase.permutations: for opt in testcase.ldopts: @@ -282,13 +307,42 @@ def execute_tests(testcase, cur_directory, config): subprocess.call([config.drop_caches], shell=True) #Create the command for executing moses: - whole_command = 'LD_PRELOAD=' + opt + testcase.prof_command + whole_command = 'LD_PRELOAD=' + opt + " " + testcase.prof_command variant = 'profile_ldpre_' + opt #test normal and cached - execute_test(whole_command, time_path, testcase.name, variant, config, True) + execute_test(whole_command, time_path, testcase.name, variant, config, "gnu-profiler") if 'cached' in testcase.permutations: - execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, True) + execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, "gnu-profiler") + + #Google-perftools profiler + if 'google-profiler' in testcase.permutations: + subprocess.call(['sync'], shell=True) # Drop caches first + subprocess.call([config.drop_caches], shell=True) + + #Create the command for executing moses + whole_command = "CPUPROFILE=/tmp/moses.prof " + testcase.gprof_command + + #test normal and cached + execute_test(whole_command, time_path, testcase.name, 'vanilla', config, 'google-profiler') + if 'cached' in testcase.permutations: + execute_test(whole_command, time_path, testcase.name, 'vanilla_cached', config, 'google-profiler') + + #Now perform LD_PRELOAD tests + if 'ldpre' in testcase.permutations: + for opt in testcase.ldopts: + #Clear caches + subprocess.call(['sync'], shell=True) + subprocess.call([config.drop_caches], shell=True) + + #Create the command for executing moses: + whole_command = 'LD_PRELOAD=' + opt + " " + whole_command + variant = 'ldpre_' + opt + + #test normal and cached + execute_test(whole_command, time_path, testcase.name, variant, config, 'google-profiler') + if 'cached' in testcase.permutations: + execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, 'google-profiler') # Go through all the test directories and executes tests @@ -340,10 +394,15 @@ if __name__ == '__main__': subprocess.call(['./previous.sh'], shell=True) #If profiler configuration exists also init it if BASECONFIG.repo_prof is not None: - repoinit(BASECONFIG, True) + repoinit(BASECONFIG, "gnu-profiler") os.chdir(BASECONFIG.repo_prof) subprocess.call(['./previous.sh'], shell=True) + if BASECONFIG.repo_gprof is not None: + repoinit(BASECONFIG, "google-profiler") + os.chdir(BASECONFIG.repo_gprof) + 
subprocess.call(['./previous.sh'], shell=True) + #Perform tests for directory in FIRSTTIME: cur_testcase = parse_configfile(BASECONFIG.tests + '/' + directory +\ @@ -353,7 +412,10 @@ if __name__ == '__main__': #Reset back the repository to the normal configuration repoinit(CONFIG) if BASECONFIG.repo_prof is not None: - repoinit(CONFIG, True) + repoinit(CONFIG, "gnu-profiler") + + if BASECONFIG.repo_gprof is not None: + repoinit(CONFIG, "google-profiler") #Builds moses os.chdir(CONFIG.repo) @@ -362,6 +424,10 @@ if __name__ == '__main__': os.chdir(CONFIG.repo_prof) subprocess.call(['./previous.sh'], shell=True) + if CONFIG.repo_gprof is not None: + os.chdir(CONFIG.repo_gprof) + subprocess.call(['./previous.sh'], shell=True) + if CONFIG.singletest: TESTCASE = parse_configfile(CONFIG.tests + '/' +\ CONFIG.singletest + '/config', CONFIG.singletest, CONFIG.repo) From 0fe0f78e921d316b3ab6a42e70ad0377e101e0e3 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Sat, 4 Jul 2015 00:00:44 +0100 Subject: [PATCH 111/286] daily automatic beautifier --- moses/BaseManager.cpp | 3 ++- moses/Hypothesis.cpp | 4 ++-- moses/LM/Base.cpp | 4 ++-- moses/TargetPhrase.cpp | 6 ++++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/moses/BaseManager.cpp b/moses/BaseManager.cpp index 609a6e9f5..52520d92c 100644 --- a/moses/BaseManager.cpp +++ b/moses/BaseManager.cpp @@ -28,7 +28,8 @@ BaseManager::GetSource() const } const ttasksptr& -BaseManager::GetTtask() const { +BaseManager::GetTtask() const +{ return m_ttask.lock(); } diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp index b722ae05f..59d53a61d 100644 --- a/moses/Hypothesis.cpp +++ b/moses/Hypothesis.cpp @@ -234,8 +234,8 @@ EvaluateWhenApplied(StatefulFeatureFunction const& sfff, ttasksptr const& ttask = manager.GetTtask(); m_ffStates[state_idx] = sfff.EvaluateWhenAppliedWithContext - (ttask, *this, m_prevHypo ? m_prevHypo->m_ffStates[state_idx] : NULL, - &m_currScoreBreakdown); + (ttask, *this, m_prevHypo ? m_prevHypo->m_ffStates[state_idx] : NULL, + &m_currScoreBreakdown); } } diff --git a/moses/LM/Base.cpp b/moses/LM/Base.cpp index 8b69626ac..b999214ac 100644 --- a/moses/LM/Base.cpp +++ b/moses/LM/Base.cpp @@ -78,9 +78,9 @@ void LanguageModel::EvaluateInIsolation(const Phrase &source float fullScore, nGramScore; size_t oovCount; - if (targetPhrase.HasTtaskSPtr()){ + if (targetPhrase.HasTtaskSPtr()) { CalcScoreWithContext(targetPhrase.GetTtask(), targetPhrase, fullScore, nGramScore, oovCount); - }else{ + } else { CalcScore(targetPhrase, fullScore, nGramScore, oovCount); } //CalcScore(targetPhrase, fullScore, nGramScore, oovCount); diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index 893edca08..21f185498 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -177,11 +177,13 @@ void TargetPhrase::WriteToRulePB(hgmert::Rule* pb) const } #endif -bool TargetPhrase::HasTtaskSPtr() const { +bool TargetPhrase::HasTtaskSPtr() const +{ return m_ttask_flag; } -const ttasksptr& TargetPhrase::GetTtask() const { +const ttasksptr& TargetPhrase::GetTtask() const +{ return m_ttask; } From 4dd2ea3117bdc156cde45d9d126e07a245c6381d Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 5 Jul 2015 13:08:57 +0100 Subject: [PATCH 112/286] Added random sampling to BitextSampler. 
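
Random sampling scans the occurrence list in order and accepts each occurrence with a coin flip: with a document bias, an occurrence's share of the sample budget is proportional to its bias weight; without one, every occurrence competes equally for the m_samples slots. A binomial check on the per-document counts additionally skips occurrences from documents that are already over-represented in the sample and forces under-represented ones. The threshold test is transcribed below as a standalone sketch; variable names follow the diff, std::mt19937 stands in for the boost::taus88 generator used in the patch, and the binomial pre-check is omitted:

```cpp
#include <cstddef>
#include <random>

// Standalone transcription of the flip_coin() threshold test (not project code).
bool accept_occurrence(double bias_sid,            // (*m_bias)[sid], bias of this sentence
                       double bias_total,          // m_bias_total, summed bias of all candidates
                       std::size_t options_total,  // candidate occurrences in total
                       std::size_t options_chosen, // m_stats->good, samples accepted so far
                       std::size_t options_left,   // candidates not yet inspected
                       std::size_t max_samples,    // m_samples
                       std::mt19937& rnd)
{
  // Draw an integer uniformly from [0, options_left).
  std::uniform_real_distribution<double> u(0.0, 1.0);
  std::size_t random_number = static_cast<std::size_t>(options_left * u(rnd));

  // With a usable bias, this occurrence's slice of the budget is proportional to its weight;
  // with no bias, or an all-zero one, every occurrence competes for max_samples slots.
  std::size_t threshold;
  if (bias_total > 0.0)
    threshold = static_cast<std::size_t>(bias_sid / bias_total * options_total * max_samples);
  else
    threshold = max_samples;

  return random_number + options_chosen < threshold;
}
```
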
--- .../UG/mm/ug_bitext_sampler.h | 557 +++++++++++------- moses/TranslationModel/UG/mmsapt.cpp | 34 +- 2 files changed, 356 insertions(+), 235 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h index 9333dd879..d3845415d 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h @@ -1,8 +1,14 @@ // -*- mode: c++; tab-width: 2; indent-tabs-mode: nil -*- #pragma once + +#include + +#include #include #include #include +#include + #include "ug_bitext.h" #include "ug_bitext_pstats.h" #include "ug_sampling_bias.h" @@ -16,246 +22,357 @@ namespace Moses namespace bitext { - enum sampling_method { full_coverage, random_sampling, ranked_sampling }; +enum sampling_method { full_coverage, random_sampling, ranked_sampling }; - typedef ugdiss::ttrack::Position TokenPosition; - class CandidateSorter - { - SamplingBias const& score; - public: - CandidateSorter(SamplingBias const& s) : score(s) {} - bool operator()(TokenPosition const& a, TokenPosition const& b) const - { return score[a.sid] > score[b.sid]; } - }; +typedef ugdiss::ttrack::Position TokenPosition; +class CandidateSorter +{ + SamplingBias const& score; +public: + CandidateSorter(SamplingBias const& s) : score(s) {} + bool operator()(TokenPosition const& a, TokenPosition const& b) const + { return score[a.sid] > score[b.sid]; } +}; - template - class - BitextSampler : public reference_counter - { - typedef Bitext bitext; - typedef TSA tsa; - typedef SamplingBias bias; - typedef typename Bitext::iter tsa_iter; - mutable boost::condition_variable m_ready; - mutable boost::mutex m_lock; - // const members - // sptr const m_bitext; // keep bitext alive while I am - // should be an - iptr const m_bitext; // keep bitext alive as long as I am - size_t const m_plen; // length of lookup phrase - bool const m_fwd; // forward or backward direction? - sptr const m_root; // root of suffix array - char const* m_next; // current position - char const* m_stop; // end of search range - sampling_method const m_method; /* look at all / random sample / - * ranked samples */ - sptr const m_bias; // bias over candidates - size_t const m_samples; // how many samples at most - // non-const members - sptr m_stats; // destination for phrase stats - size_t m_ctr; // number of samples considered - float m_total_bias; // for random sampling with bias - bool m_finished; - bool consider_sample(TokenPosition const& p); - size_t perform_ranked_sampling(); +template +class +BitextSampler : public reference_counter +{ + typedef Bitext bitext; + typedef TSA tsa; + typedef SamplingBias bias; + typedef typename Bitext::iter tsa_iter; + mutable boost::condition_variable m_ready; + mutable boost::mutex m_lock; + // const members + // sptr const m_bitext; // keep bitext alive while I am + // should be an + iptr const m_bitext; // keep bitext alive as long as I am + size_t const m_plen; // length of lookup phrase + bool const m_fwd; // forward or backward direction? 
+ sptr const m_root; // root of suffix array + char const* m_next; // current position + char const* m_stop; // end of search range + sampling_method const m_method; // look at all/random/ranked samples + sptr const m_bias; // bias over candidates + size_t const m_samples; // how many samples at most + // non-const members + sptr m_stats; // destination for phrase stats + size_t m_ctr; // number of samples considered + float m_total_bias; // for random sampling with bias + bool m_finished; + + boost::taus88 m_rnd; // every job has its own pseudo random generator + double m_rnddenom; // denominator for scaling random sampling + double m_bias_total; + + bool consider_sample(TokenPosition const& p); + size_t perform_ranked_sampling(); + size_t perform_random_sampling(); + + int check_sample_distribution(uint64_t const& sid, uint64_t const& offset); + bool flip_coin(ugdiss::id_type & sid, ushort & offset); - public: - BitextSampler(BitextSampler const& other); - BitextSampler const& operator=(BitextSampler const& other); - BitextSampler(bitext const* const bitext, typename bitext::iter const& phrase, - sptr const& bias, size_t const max_samples, - sampling_method const method); - ~BitextSampler(); - bool operator()(); // run sampling - sptr stats(); - bool done() const; - }; - - template - BitextSampler:: - BitextSampler(Bitext const* const bitext, - typename bitext::iter const& phrase, - sptr const& bias, size_t const max_samples, - sampling_method const method) - : m_bitext(bitext) - , m_plen(phrase.size()) - , m_fwd(phrase.root == bitext->I1.get()) - , m_root(m_fwd ? bitext->I1 : bitext->I2) - , m_next(phrase.lower_bound(-1)) - , m_stop(phrase.upper_bound(-1)) - , m_method(method) - , m_bias(bias) - , m_samples(max_samples) - , m_ctr(0) - , m_total_bias(0) - , m_finished(false) - { - m_stats.reset(new pstats); - m_stats->raw_cnt = phrase.ca(); - m_stats->register_worker(); - // cerr << phrase.str(bitext->V1.get()) << " [" << HERE << "]" << endl; - } - - template - BitextSampler:: - BitextSampler(BitextSampler const& other) - : m_bitext(other.m_bitext) - , m_plen(other.m_plen) - , m_fwd(other.m_fwd) - , m_root(other.m_root) - , m_next(other.m_next) - , m_stop(other.m_stop) - , m_method(other.m_method) - , m_bias(other.m_bias) - , m_samples(other.m_samples) - { - // lock both instances - boost::unique_lock mylock(m_lock); - boost::unique_lock yrlock(other.m_lock); - // actually, BitextSamplers should only copied on job submission - m_stats = other.m_stats; - m_stats->register_worker(); - m_ctr = other.m_ctr; - m_total_bias = other.m_total_bias; - m_finished = other.m_finished; - } +public: + BitextSampler(BitextSampler const& other); + BitextSampler const& operator=(BitextSampler const& other); + BitextSampler(bitext const* const bitext, typename bitext::iter const& phrase, + sptr const& bias, size_t const max_samples, + sampling_method const method); + ~BitextSampler(); + bool operator()(); // run sampling + sptr stats(); + bool done() const; +}; - // Ranked sampling sorts all samples by score and then considers the top-ranked - // candidates for phrase extraction. 
- template - size_t - BitextSampler:: - perform_ranked_sampling() - { - if (m_next == m_stop) return m_ctr; - CandidateSorter sorter(*m_bias); - // below: nbest size = 20 * m_samples to allow for failed phrase extraction - NBestList nbest(20*m_samples,sorter); - ugdiss::tsa::ArrayEntry I(m_next); - while (I.next < m_stop) - { - ++m_ctr; - nbest.add(m_root->readEntry(I.next,I)); - } - size_t n = 0; - for (size_t i = 0; n < m_samples && i < nbest.size(); ++i) - if (consider_sample(nbest[i])) ++n;; - // cerr << m_ctr << " samples considered at " - // << __FILE__ << ":" << __LINE__ << endl; - return m_ctr; - } - - template - bool - BitextSampler:: - consider_sample(TokenPosition const& p) - { - std::vector aln; - bitvector full_aln(100*100); - PhraseExtractionRecord rec(p.sid, p.offset, p.offset + m_plen, - !m_fwd, &aln, &full_aln); - int docid = m_bias ? m_bias->GetClass(p.sid) : -1; +template +int +BitextSampler:: +check_sample_distribution(uint64_t const& sid, uint64_t const& offset) +{ // ensure that the sampled distribution approximately matches the bias + // @return 0: SKIP this occurrence + // @return 1: consider this occurrence for sampling + // @return 2: include this occurrence in the sample by all means - bool good = m_bitext->find_trg_phr_bounds(rec); + typedef boost::math::binomial_distribution<> binomial; + + // std::ostream* log = m_bias->loglevel > 1 ? m_bias->log : NULL; + std::ostream* log = NULL; + + if (!m_bias) return 1; + + float p = (*m_bias)[sid]; + id_type docid = m_bias->GetClass(sid); + + std::map::const_iterator m = m_stats->indoc.find(docid); + uint32_t k = m != m_stats->indoc.end() ? m->second : 0 ; + + // always consider candidates from dominating documents and + // from documents that have not been considered at all yet + bool ret = (p > .5 || k == 0); + + if (ret && !log) return 1; + + uint32_t N = m_stats->good; // number of trials + float d = cdf(complement(binomial(N, p), k)); + // d: probability that samples contains k or more instances from doc #docid + ret = ret || d >= .05; #if 0 - cerr << p.sid << " " << docid << " " - << (good ? "OK " : "bad ") - << __FILE__ << ":" << __LINE__ << endl; + if (log) + { + Token const* t = m_root->getCorpus()->sntStart(sid)+offset; + Token const* x = t - min(offset,uint64_t(3)); + Token const* e = t + 4; + if (e > m_root->getCorpus()->sntEnd(sid)) + e = m_root->getCorpus()->sntEnd(sid); + *log << docid << ":" << sid << " " << size_t(k) << "/" << N + << " @" << p << " => " << d << " ["; + std::map::const_iterator m; + for (m = m_stats->indoc.begin(); m != m_stats->indoc.end(); ++m) + { + if (m != m_stats->indoc.begin()) *log << " "; + *log << m->first << ":" << m->second; + } + *log << "] "; + for (; x < e; ++x) *log << (*m_bitext->V1)[x->id()] << " "; + if (!ret) *log << "SKIP"; + else if (p < .5 && d > .9) *log << "FORCE"; + *log << std::endl; + } #endif - if (!good) - { // no good, probably because phrase is not coherent - m_stats->count_sample(docid, 0, rec.po_fwd, rec.po_bwd); - return false; - } + return (ret ? (p < .5 && d > .9) ? 2 : 1 : 0); +} + +template +bool +BitextSampler:: +flip_coin(ugdiss::id_type & sid, ushort & offset) +{ + int no_maybe_yes = m_bias ? check_sample_distribution(sid, offset) : 1; + if (no_maybe_yes == 0) return false; // no + if (no_maybe_yes > 1) return true; // yes + // ... 
maybe: flip a coin + size_t options_chosen = m_stats->good; + size_t options_total = max(m_stats->raw_cnt, m_ctr); + size_t options_left = (options_total - m_ctr); + size_t random_number = options_left * (m_rnd()/(m_rnd.max()+1.)); + size_t threshold; + if (m_bias_total) // we have a bias and there are candidates with non-zero prob + threshold = ((*m_bias)[sid]/m_bias_total * options_total * m_samples); + else // no bias, or all have prob 0 (can happen with a very opinionated bias) + threshold = m_samples; + return random_number + options_chosen < threshold; +} + + + + +template +BitextSampler:: +BitextSampler(Bitext const* const bitext, + typename bitext::iter const& phrase, + sptr const& bias, size_t const max_samples, + sampling_method const method) + : m_bitext(bitext) + , m_plen(phrase.size()) + , m_fwd(phrase.root == bitext->I1.get()) + , m_root(m_fwd ? bitext->I1 : bitext->I2) + , m_next(phrase.lower_bound(-1)) + , m_stop(phrase.upper_bound(-1)) + , m_method(method) + , m_bias(bias) + , m_samples(max_samples) + , m_ctr(0) + , m_total_bias(0) + , m_finished(false) +{ + m_stats.reset(new pstats); + m_stats->raw_cnt = phrase.ca(); + m_stats->register_worker(); + // cerr << phrase.str(bitext->V1.get()) << " [" << HERE << "]" << endl; +} + +template +BitextSampler:: +BitextSampler(BitextSampler const& other) + : m_bitext(other.m_bitext) + , m_plen(other.m_plen) + , m_fwd(other.m_fwd) + , m_root(other.m_root) + , m_next(other.m_next) + , m_stop(other.m_stop) + , m_method(other.m_method) + , m_bias(other.m_bias) + , m_samples(other.m_samples) +{ + // lock both instances + boost::unique_lock mylock(m_lock); + boost::unique_lock yrlock(other.m_lock); + // actually, BitextSamplers should only copied on job submission + m_stats = other.m_stats; + m_stats->register_worker(); + m_ctr = other.m_ctr; + m_total_bias = other.m_total_bias; + m_finished = other.m_finished; +} + +// Ranked sampling sorts all samples by score and then considers the top-ranked +// candidates for phrase extraction. +template +size_t +BitextSampler:: +perform_ranked_sampling() +{ + if (m_next == m_stop) return m_ctr; + CandidateSorter sorter(*m_bias); + // below: nbest size = 4 * m_samples to allow for failed phrase extraction + NBestList nbest(4*m_samples,sorter); + ugdiss::tsa::ArrayEntry I(m_next); + while (I.next < m_stop) + { + ++m_ctr; + nbest.add(m_root->readEntry(I.next,I)); + } + for (size_t i = 0; m_stats->good < m_samples && i < nbest.size(); ++i) + consider_sample(nbest[i]); + return m_ctr; +} + +// Ranked sampling sorts all samples by score and then considers the top-ranked +// candidates for phrase extraction. +template +size_t +BitextSampler:: +perform_random_sampling() +{ + if (m_next == m_stop) return m_ctr; + m_bias_total = 0; + if (m_bias) + { + m_stats->raw_cnt = 0; + for (ugdiss::tsa::ArrayEntry I(m_next); I.next < m_stop;) + { + m_root->readEntry(I.next,I); + ++m_stats->raw_cnt; + m_bias_total += (*m_bias)[I.sid]; + } + } + + ugdiss::tsa::ArrayEntry I(m_next); + while (m_stats->good < m_samples && I.next < m_stop) + { + ++m_ctr; + m_root->readEntry(I.next,I); + if (!flip_coin(I.sid, I.offset)) continue; + consider_sample(I); + } + return m_ctr; +} + +template +bool +BitextSampler:: +consider_sample(TokenPosition const& p) +{ + std::vector aln; + bitvector full_aln(100*100); + PhraseExtractionRecord + rec(p.sid, p.offset, p.offset + m_plen, !m_fwd, &aln, &full_aln); + int docid = m_bias ? 
m_bias->GetClass(p.sid) : -1; + if (!m_bitext->find_trg_phr_bounds(rec)) + { // no good, probably because phrase is not coherent + m_stats->count_sample(docid, 0, rec.po_fwd, rec.po_bwd); + return false; + } - // all good: register this sample as valid - size_t num_pairs = (rec.s2 - rec.s1 + 1) * (rec.e2 - rec.e1 + 1); - m_stats->count_sample(docid, num_pairs, rec.po_fwd, rec.po_bwd); + // all good: register this sample as valid + size_t num_pairs = (rec.s2 - rec.s1 + 1) * (rec.e2 - rec.e1 + 1); + m_stats->count_sample(docid, num_pairs, rec.po_fwd, rec.po_bwd); - float sample_weight = 1./num_pairs; - Token const* o = (m_fwd ? m_bitext->T2 : m_bitext->T1)->sntStart(rec.sid); + float sample_weight = 1./num_pairs; + Token const* o = (m_fwd ? m_bitext->T2 : m_bitext->T1)->sntStart(rec.sid); - // adjust offsets in phrase-internal aligment - for (size_t k = 1; k < aln.size(); k += 2) aln[k] += rec.s2 - rec.s1; + // adjust offsets in phrase-internal aligment + for (size_t k = 1; k < aln.size(); k += 2) + aln[k] += rec.s2 - rec.s1; - vector seen; seen.reserve(10); - // It is possible that the phrase extraction extracts the same - // phrase twice, e.g., when word a co-occurs with sequence b b b - // but is aligned only to the middle word. We can only count - // each phrase pair once per source phrase occurrence, or else - // run the risk of having more joint counts than marginal - // counts. + vector seen; seen.reserve(10); + // It is possible that the phrase extraction extracts the same + // phrase twice, e.g., when word a co-occurs with sequence b b b but + // is aligned only to the middle word. We can only count each phrase + // pair once per source phrase occurrence, or else run the risk of + // having more joint counts than marginal counts. - for (size_t s = rec.s1; s <= rec.s2; ++s) - { - TSA const& I = m_fwd ? *m_bitext->I2 : *m_bitext->I1; - sptr b = I.find(o + s, rec.e1 - s); - UTIL_THROW_IF2(!b || b->size() < rec.e1 - s, "target phrase not found"); + for (size_t s = rec.s1; s <= rec.s2; ++s) + { + TSA const& I = m_fwd ? *m_bitext->I2 : *m_bitext->I1; + sptr b = I.find(o + s, rec.e1 - s); + UTIL_THROW_IF2(!b || b->size() < rec.e1 - s, "target phrase not found"); - for (size_t i = rec.e1; i <= rec.e2; ++i) - { - uint64_t tpid = b->getPid(); - - // poor man's protection against over-counting - size_t s = 0; - while (s < seen.size() && seen[s] != tpid) ++s; - if (s < seen.size()) continue; - seen.push_back(tpid); - - size_t raw2 = b->approxOccurrenceCount(); - m_stats->add(tpid, sample_weight, m_bias ? (*m_bias)[p.sid] : 1, - aln, raw2, rec.po_fwd, rec.po_bwd, docid); - bool ok = (i == rec.e2) || b->extend(o[i].id()); - UTIL_THROW_IF2(!ok, "Could not extend target phrase."); - } - if (s < rec.s2) // shift phrase-internal alignments - for (size_t k = 1; k < aln.size(); k += 2) - --aln[k]; - } - return true; - } + for (size_t i = rec.e1; i <= rec.e2; ++i) + { + uint64_t tpid = b->getPid(); + if (find(seen.begin(), seen.end(), tpid) != seen.end()) + continue; // don't over-count + seen.push_back(tpid); + size_t raw2 = b->approxOccurrenceCount(); + m_stats->add(tpid, sample_weight, m_bias ? 
(*m_bias)[p.sid] : 1, + aln, raw2, rec.po_fwd, rec.po_bwd, docid); + bool ok = (i == rec.e2) || b->extend(o[i].id()); + UTIL_THROW_IF2(!ok, "Could not extend target phrase."); + } + if (s < rec.s2) // shift phrase-internal alignments + for (size_t k = 1; k < aln.size(); k += 2) + --aln[k]; + } + return true; +} - template - bool - BitextSampler:: - operator()() - { - if (m_finished) return true; - boost::unique_lock lock(m_lock); +template +bool +BitextSampler:: +operator()() +{ + if (m_finished) return true; + boost::unique_lock lock(m_lock); + if (m_method == ranked_sampling) perform_ranked_sampling(); - m_finished = true; - m_ready.notify_all(); - return true; - } + else if (m_method == random_sampling) + perform_random_sampling(); + else UTIL_THROW2("Unsupported sampling method."); + m_finished = true; + m_ready.notify_all(); + return true; +} - template - bool - BitextSampler:: - done() const - { - return m_next == m_stop; - } +template +bool +BitextSampler:: +done() const +{ + return m_next == m_stop; +} - template - sptr - BitextSampler:: - stats() - { - // if (m_ctr == 0) (*this)(); - // boost::unique_lock lock(m_lock); - // while (!m_finished) - // m_ready.wait(lock); - return m_stats; - } +template +sptr +BitextSampler:: +stats() +{ + // if (m_ctr == 0) (*this)(); + // boost::unique_lock lock(m_lock); + // while (!m_finished) + // m_ready.wait(lock); + return m_stats; +} - template - BitextSampler:: - ~BitextSampler() - { - m_stats->release(); - } +template +BitextSampler:: +~BitextSampler() +{ + m_stats->release(); +} } // end of namespace bitext } // end of namespace Moses diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 3fa76f4d8..651621f96 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -833,17 +833,19 @@ namespace Moses } if (!context->cache1) context->cache1.reset(new pstats::cache_t); if (!context->cache2) context->cache2.reset(new pstats::cache_t); - } else if (!ttask->GetContextWeights().empty()) { - if (m_bias_log) - { - *m_bias_log << HERE << endl - << "BIAS FROM MAP LOOKUP" << endl; - context->bias_log = m_bias_log; - } - context->bias - = btfix->SetupDocumentBias(ttask->GetContextWeights(), m_bias_log); - context->bias->loglevel = m_bias_loglevel; - context->bias->log = m_bias_log; + } + else if (!ttask->GetContextWeights().empty()) + { + if (m_bias_log) + { + *m_bias_log << HERE << endl + << "BIAS FROM MAP LOOKUP" << endl; + context->bias_log = m_bias_log; + } + context->bias + = btfix->SetupDocumentBias(ttask->GetContextWeights(), m_bias_log); + context->bias->loglevel = m_bias_loglevel; + context->bias->log = m_bias_log; if (!context->cache1) context->cache1.reset(new pstats::cache_t); if (!context->cache2) context->cache2.reset(new pstats::cache_t); } @@ -897,13 +899,15 @@ namespace Moses Mmsapt:: InitializeForInput(ttasksptr const& ttask) { - set_bias_for_ranking(ttask, this->btfix); - // to do: depending on method, set bias for ranking, via consulting the bias - // server, or none at al. 
- sptr const& scope = ttask->GetScope(); sptr context = scope->get(btfix.get(), true); + // set sampling bias, depending on sampling method specified + if (m_sampling_method == ranked_sampling) + set_bias_for_ranking(ttask, this->btfix); + else if (m_sampling_method == random_sampling) + set_bias_via_server(ttask); + boost::unique_lock mylock(m_lock); sptr localcache = scope->get(cache_key); if (!localcache) From 4f155f104d4643c1156867deb01131538ee22c25 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 6 Jul 2015 14:07:50 +0400 Subject: [PATCH 113/286] codelite --- .../other-builds/pruneGeneration/pruneGeneration.project | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/contrib/other-builds/pruneGeneration/pruneGeneration.project b/contrib/other-builds/pruneGeneration/pruneGeneration.project index 6f8a6adf5..70d26f418 100644 --- a/contrib/other-builds/pruneGeneration/pruneGeneration.project +++ b/contrib/other-builds/pruneGeneration/pruneGeneration.project @@ -1,6 +1,9 @@ + + + - - - @@ -44,8 +44,10 @@ + + From 44372d778763d2463024304c7ffe6e648f2956cf Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 6 Jul 2015 12:05:41 +0100 Subject: [PATCH 114/286] extract-ghkm: fix a couple of exception-related issues --- phrase-extract/extract-ghkm/Alignment.cpp | 4 +- phrase-extract/extract-ghkm/Exception.h | 46 --------------------- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 18 ++++---- 3 files changed, 11 insertions(+), 57 deletions(-) delete mode 100644 phrase-extract/extract-ghkm/Exception.h diff --git a/phrase-extract/extract-ghkm/Alignment.cpp b/phrase-extract/extract-ghkm/Alignment.cpp index 6f946fe5a..ba89a1594 100644 --- a/phrase-extract/extract-ghkm/Alignment.cpp +++ b/phrase-extract/extract-ghkm/Alignment.cpp @@ -19,7 +19,7 @@ #include "Alignment.h" -#include "Exception.h" +#include "syntax-common/exception.h" #include #include @@ -44,7 +44,7 @@ void ReadAlignment(const std::string &s, Alignment &a) } int src = std::atoi(s.substr(begin, end-begin).c_str()); if (end+1 == s.size()) { - throw Exception("Target index missing"); + throw Syntax::Exception("Target index missing"); } begin = end+1; diff --git a/phrase-extract/extract-ghkm/Exception.h b/phrase-extract/extract-ghkm/Exception.h deleted file mode 100644 index 99e1067f4..000000000 --- a/phrase-extract/extract-ghkm/Exception.h +++ /dev/null @@ -1,46 +0,0 @@ -/*********************************************************************** - Moses - statistical machine translation system - Copyright (C) 2006-2011 University of Edinburgh - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -***********************************************************************/ - -#pragma once -#ifndef EXTRACT_GHKM_EXCEPTION_H_ -#define EXTRACT_GHKM_EXCEPTION_H_ - -#include - -namespace MosesTraining -{ -namespace GHKM -{ - -class Exception -{ -public: - Exception(const char *msg) : m_msg(msg) {} - Exception(const std::string &msg) : m_msg(msg) {} - const std::string &GetMsg() const { - return m_msg; - } -private: - std::string m_msg; -}; - -} // namespace GHKM -} // namespace MosesTraining - -#endif diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index c2ee43767..a4e8afcd3 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -30,6 +30,7 @@ #include +#include "syntax-common/exception.h" #include "syntax-common/xml_tree_parser.h" #include "InputFileStream.h" @@ -43,7 +44,6 @@ #include "Alignment.h" #include "AlignmentGraph.h" -#include "Exception.h" #include "Node.h" #include "Options.h" #include "PhraseOrientation.h" @@ -160,11 +160,11 @@ int ExtractGHKM::Main(int argc, char *argv[]) try { targetParseTree = targetXmlTreeParser.Parse(targetLine); assert(targetParseTree.get()); - } catch (const Exception &e) { + } catch (const Syntax::Exception &e) { std::ostringstream oss; oss << "Failed to parse target XML tree at line " << lineNum; - if (!e.GetMsg().empty()) { - oss << ": " << e.GetMsg(); + if (!e.msg().empty()) { + oss << ": " << e.msg(); } Error(oss.str()); } @@ -178,11 +178,11 @@ int ExtractGHKM::Main(int argc, char *argv[]) try { sourceParseTree = sourceXmlTreeParser.Parse(sourceLine); assert(sourceParseTree.get()); - } catch (const Exception &e) { + } catch (const Syntax::Exception &e) { std::ostringstream oss; oss << "Failed to parse source XML tree at line " << lineNum; - if (!e.GetMsg().empty()) { - oss << ": " << e.GetMsg(); + if (!e.msg().empty()) { + oss << ": " << e.msg(); } Error(oss.str()); } @@ -192,10 +192,10 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Read word alignments. try { ReadAlignment(alignmentLine, alignment); - } catch (const Exception &e) { + } catch (const Syntax::Exception &e) { std::ostringstream oss; oss << "Failed to read alignment at line " << lineNum << ": "; - oss << e.GetMsg(); + oss << e.msg(); Error(oss.str()); } if (alignment.size() == 0) { From 540f9e9974d36192cf48e9cdb476fe8bd944b950 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Mon, 6 Jul 2015 14:19:59 +0100 Subject: [PATCH 115/286] Update documentation for google-profiler. Untested --- contrib/moses-speedtest/README.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/contrib/moses-speedtest/README.md b/contrib/moses-speedtest/README.md index 74b7d079e..5a8fe47aa 100644 --- a/contrib/moses-speedtest/README.md +++ b/contrib/moses-speedtest/README.md @@ -28,14 +28,16 @@ TEST_DIR: /home/moses-speedtest/phrase_tables/tests TEST_LOG_DIR: /home/moses-speedtest/phrase_tables/testlogs BASEBRANCH: RELEASE-2.1.1 MOSES_PROFILER_REPO: /home/moses-speedtest/moses-standard/mosesdecoder-variant-prof +MOSES_GOOGLE_PROFILER_REPO: /home/moses-speedtest/moses-standard/mosesdecoder-variant-gperftools The _MOSES\_REPO\_PATH_ is the place where you have set up and built moses. -The _DROP\_CACHES\_COMM_ is the command that would b eused to drop caches. 
It should run without needing root access. +The _DROP\_CACHES\_COMM_ is the command that would be used to drop caches. It should run without needing root access. _TEST\_DIR_ is the directory where all the tests will reside. _TEST\_LOG\_DIR_ is the directory where the performance logs will be gathered. It should be created before running the testsuite for the first time. _BASEBRANCH_ is the branch against which all new tests will be compared. It should normally be set to be the latest Moses stable release. _MOSES\_PROFILER\_REPO_ is a path to a moses repository set up and built with profiling enabled. Optional if you want to produce profiling results. +_MOSES\_GOOGLE\_PROFILER\_REPO is a path to moses repository set up with full tcmalloc and profiler, as well as shared link for use with gperftools. ### Creating tests In order to create a test one should go into the TEST_DIR and create a new folder. That folder will be used for the name of the test. @@ -45,7 +47,7 @@ An example such configuration file is **test\_config**
 Command: moses -f ... -i fff #Looks for the command in the /bin directory of the repo specified in the testsuite_config
 LDPRE: ldpreloads #Comma separated LD_LIBRARY_PATH:/, 
-Variants: vanilla, cached, ldpre, profile #Can't have cached without ldpre or vanilla
+Variants: vanilla, cached, ldpre, profile, google-profiler #Can't have cached without ldpre or vanilla
 
The _Command:_ line specifies the executable (which is looked up in the /bin directory of the repo.) and any arguments necessary. Before running the test, the script cds to the current test directory so you can use relative paths. @@ -67,6 +69,16 @@ cd mosesdecoder Afterwards for testcases which contain the **profile** keyword in **Variants** you will see a directory inside _TEST\_LOG\_DIR which contains the **gprof** output from every run. +#### Produce google profiler results. +If you want to produce profiler results together in some tests you need to specify the _MOSES\_GOOGLE\_PROFILER\_REPO in the config +```bash +git clone https://github.com/moses-smt/mosesdecoder.git mosesdecoder-google-profile +cd mosesdecoder +./bjam link=shared -j10 --full-tcmalloc- -with-cmph=/usr/include/ +``` + +Afterwards for testcases which contain the **google-profiler** keyword in **Variants** you will see a directory inside _TEST\_LOG\_DIR which contains the **google-profiler** output from every run. + ### Running tests. Running the tests is done through the **runtests.py** script. From e7228ec9fb09941593fa09329d421ca7b951f12e Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 6 Jul 2015 14:41:34 +0100 Subject: [PATCH 116/286] extract-ghkm: minor refactoring --- moses/FF/PhraseOrientationFeature.cpp | 70 +++++++++---------- moses/FF/PhraseOrientationFeature.h | 6 +- phrase-extract/extract-ghkm/Alignment.cpp | 5 +- phrase-extract/extract-ghkm/Alignment.h | 4 +- .../extract-ghkm/AlignmentGraph.cpp | 62 ++++++++++------ phrase-extract/extract-ghkm/AlignmentGraph.h | 4 ++ phrase-extract/extract-ghkm/ComposedRule.cpp | 3 + phrase-extract/extract-ghkm/ComposedRule.h | 3 + phrase-extract/extract-ghkm/ExtractGHKM.cpp | 13 ++-- phrase-extract/extract-ghkm/ExtractGHKM.h | 5 +- phrase-extract/extract-ghkm/Main.cpp | 2 +- phrase-extract/extract-ghkm/Node.cpp | 3 + phrase-extract/extract-ghkm/Node.h | 3 + phrase-extract/extract-ghkm/Options.h | 4 +- .../extract-ghkm/PhraseOrientation.cpp | 4 +- .../extract-ghkm/PhraseOrientation.h | 3 + phrase-extract/extract-ghkm/Rule.cpp | 3 + phrase-extract/extract-ghkm/Rule.h | 3 + phrase-extract/extract-ghkm/ScfgRule.cpp | 3 + phrase-extract/extract-ghkm/ScfgRule.h | 3 + .../extract-ghkm/ScfgRuleWriter.cpp | 3 + phrase-extract/extract-ghkm/ScfgRuleWriter.h | 4 +- phrase-extract/extract-ghkm/Span.cpp | 3 + phrase-extract/extract-ghkm/Span.h | 3 + phrase-extract/extract-ghkm/StsgRule.cpp | 3 + phrase-extract/extract-ghkm/StsgRule.h | 3 + .../extract-ghkm/StsgRuleWriter.cpp | 3 + phrase-extract/extract-ghkm/StsgRuleWriter.h | 3 + phrase-extract/extract-ghkm/Subgraph.cpp | 3 + phrase-extract/extract-ghkm/Subgraph.h | 4 +- 30 files changed, 167 insertions(+), 71 deletions(-) diff --git a/moses/FF/PhraseOrientationFeature.cpp b/moses/FF/PhraseOrientationFeature.cpp index fea8dafad..0865dcac5 100644 --- a/moses/FF/PhraseOrientationFeature.cpp +++ b/moses/FF/PhraseOrientationFeature.cpp @@ -134,7 +134,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source, if (targetPhrase.GetAlignNonTerm().GetSize() != 0) { // Initialize phrase orientation scoring object - MosesTraining::GHKM::PhraseOrientation phraseOrientation(source.GetSize(), targetPhrase.GetSize(), + MosesTraining::Syntax::GHKM::PhraseOrientation phraseOrientation(source.GetSize(), targetPhrase.GetSize(), targetPhrase.GetAlignTerm(), targetPhrase.GetAlignNonTerm()); PhraseOrientationFeature::ReoClassData* reoClassData = new PhraseOrientationFeature::ReoClassData(); @@ -150,7 +150,7 @@ void 
PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source, // LEFT-TO-RIGHT DIRECTION - MosesTraining::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::GHKM::PhraseOrientation::REO_DIR_L2R); + MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::Syntax::GHKM::PhraseOrientation::REO_DIR_L2R); if ( ((targetIndex == 0) || !phraseOrientation.TargetSpanIsAligned(0,targetIndex)) // boundary non-terminal in rule-initial position (left boundary) && (targetPhraseLHS != m_glueTargetLHS) ) { // and not glue rule @@ -170,7 +170,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source, if (reoClassData->firstNonTerminalPreviousSourceSpanIsAligned && reoClassData->firstNonTerminalFollowingSourceSpanIsAligned) { // discontinuous - l2rOrientation = MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT; + l2rOrientation = MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT; } else { reoClassData->firstNonTerminalIsBoundary = true; } @@ -180,7 +180,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source, // RIGHT-TO-LEFT DIRECTION - MosesTraining::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::GHKM::PhraseOrientation::REO_DIR_R2L); + MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = phraseOrientation.GetOrientationInfo(sourceIndex,sourceIndex,MosesTraining::Syntax::GHKM::PhraseOrientation::REO_DIR_R2L); if ( ((targetIndex == targetPhrase.GetSize()-1) || !phraseOrientation.TargetSpanIsAligned(targetIndex,targetPhrase.GetSize()-1)) // boundary non-terminal in rule-final position (right boundary) && (targetPhraseLHS != m_glueTargetLHS) ) { // and not glue rule @@ -200,7 +200,7 @@ void PhraseOrientationFeature::EvaluateInIsolation(const Phrase &source, if (reoClassData->lastNonTerminalPreviousSourceSpanIsAligned && reoClassData->lastNonTerminalFollowingSourceSpanIsAligned) { // discontinuous - r2lOrientation = MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT; + r2lOrientation = MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT; } else { reoClassData->lastNonTerminalIsBoundary = true; } @@ -335,25 +335,25 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied( // LEFT-TO-RIGHT DIRECTION - MosesTraining::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = reoClassData->nonTerminalReoClassL2R[nNT]; + MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS l2rOrientation = reoClassData->nonTerminalReoClassL2R[nNT]; IFFEATUREVERBOSE(2) { FEATUREVERBOSE(2, "l2rOrientation "); switch (l2rOrientation) { - case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT: + case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT: FEATUREVERBOSE2(2, "mono" << std::endl); break; - case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT: + case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT: FEATUREVERBOSE2(2, "swap" << std::endl); break; - case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT: + case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT: FEATUREVERBOSE2(2, "dleft" << std::endl); break; - case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: + case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: FEATUREVERBOSE2(2, "dright" << std::endl); break; - case 
MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: - // modelType == MosesTraining::GHKM::PhraseOrientation::REO_MSLR + case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: + // modelType == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_MSLR FEATUREVERBOSE2(2, "unknown->dleft" << std::endl); break; default: @@ -396,23 +396,23 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied( } else { - if ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { + if ( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { newScores[0] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilityMono()); // if sub-derivation has left-boundary non-terminal: // add recursive actual score of boundary non-terminal from subderivation LeftBoundaryL2RScoreRecursive(featureID, prevState, 0x1, newScores, accumulator); - } else if ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { + } else if ( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { newScores[1] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilitySwap()); // if sub-derivation has left-boundary non-terminal: // add recursive actual score of boundary non-terminal from subderivation LeftBoundaryL2RScoreRecursive(featureID, prevState, 0x2, newScores, accumulator); - } else if ( ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || - ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || - ( l2rOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { + } else if ( ( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || + ( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || + ( l2rOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { newScores[2] += TransformScore(orientationPhraseProperty->GetLeftToRightProbabilityDiscontinuous()); // if sub-derivation has left-boundary non-terminal: @@ -437,25 +437,25 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied( // RIGHT-TO-LEFT DIRECTION - MosesTraining::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = reoClassData->nonTerminalReoClassR2L[nNT]; + MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS r2lOrientation = reoClassData->nonTerminalReoClassR2L[nNT]; IFFEATUREVERBOSE(2) { FEATUREVERBOSE(2, "r2lOrientation "); switch (r2lOrientation) { - case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT: + case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT: FEATUREVERBOSE2(2, "mono" << std::endl); break; - case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT: + case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT: FEATUREVERBOSE2(2, "swap" << std::endl); break; - case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT: + case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT: FEATUREVERBOSE2(2, "dleft" << std::endl); break; - case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: + case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT: FEATUREVERBOSE2(2, "dright" << std::endl); break; - case MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: - // modelType == MosesTraining::GHKM::PhraseOrientation::REO_MSLR + case MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN: + // modelType == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_MSLR 
FEATUREVERBOSE2(2, "unknown->dleft" << std::endl); break; default: @@ -498,23 +498,23 @@ FFState* PhraseOrientationFeature::EvaluateWhenApplied( } else { - if ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { + if ( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { newScores[m_offsetR2LScores+0] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilityMono()); // if sub-derivation has right-boundary non-terminal: // add recursive actual score of boundary non-terminal from subderivation RightBoundaryR2LScoreRecursive(featureID, prevState, 0x1, newScores, accumulator); - } else if ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { + } else if ( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { newScores[m_offsetR2LScores+1] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilitySwap()); // if sub-derivation has right-boundary non-terminal: // add recursive actual score of boundary non-terminal from subderivation RightBoundaryR2LScoreRecursive(featureID, prevState, 0x2, newScores, accumulator); - } else if ( ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || - ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || - ( r2lOrientation == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { + } else if ( ( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || + ( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || + ( r2lOrientation == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { newScores[m_offsetR2LScores+2] += TransformScore(orientationPhraseProperty->GetRightToLeftProbabilityDiscontinuous()); // if sub-derivation has right-boundary non-terminal: @@ -862,17 +862,17 @@ void PhraseOrientationFeature::SparseNonTerminalR2LScore(const Factor* nonTermin } -const std::string* PhraseOrientationFeature::ToString(const MosesTraining::GHKM::PhraseOrientation::REO_CLASS o) const +const std::string* PhraseOrientationFeature::ToString(const MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS o) const { - if ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { + if ( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_LEFT ) { return &MORIENT; - } else if ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { + } else if ( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_RIGHT ) { return &SORIENT; - } else if ( ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || - ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || - ( o == MosesTraining::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { + } else if ( ( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DLEFT ) || + ( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_DRIGHT ) || + ( o == MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN ) ) { return &DORIENT; } else { diff --git a/moses/FF/PhraseOrientationFeature.h b/moses/FF/PhraseOrientationFeature.h index 7c429dd1c..ad5b5a15e 100644 --- a/moses/FF/PhraseOrientationFeature.h +++ b/moses/FF/PhraseOrientationFeature.h @@ -302,8 +302,8 @@ public: struct ReoClassData { public: - std::vector nonTerminalReoClassL2R; - std::vector nonTerminalReoClassR2L; + std::vector nonTerminalReoClassL2R; + std::vector nonTerminalReoClassR2L; bool firstNonTerminalIsBoundary; bool 
firstNonTerminalPreviousSourceSpanIsAligned; bool firstNonTerminalFollowingSourceSpanIsAligned; @@ -401,7 +401,7 @@ protected: ScoreComponentCollection* scoreBreakdown, const std::string* o) const; - const std::string* ToString(const MosesTraining::GHKM::PhraseOrientation::REO_CLASS o) const; + const std::string* ToString(const MosesTraining::Syntax::GHKM::PhraseOrientation::REO_CLASS o) const; static const std::string MORIENT; static const std::string SORIENT; diff --git a/phrase-extract/extract-ghkm/Alignment.cpp b/phrase-extract/extract-ghkm/Alignment.cpp index ba89a1594..9293a07cf 100644 --- a/phrase-extract/extract-ghkm/Alignment.cpp +++ b/phrase-extract/extract-ghkm/Alignment.cpp @@ -27,6 +27,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -44,7 +46,7 @@ void ReadAlignment(const std::string &s, Alignment &a) } int src = std::atoi(s.substr(begin, end-begin).c_str()); if (end+1 == s.size()) { - throw Syntax::Exception("Target index missing"); + throw Exception("Target index missing"); } begin = end+1; @@ -70,4 +72,5 @@ void FlipAlignment(Alignment &a) } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Alignment.h b/phrase-extract/extract-ghkm/Alignment.h index 154e1fc4f..da1279f8f 100644 --- a/phrase-extract/extract-ghkm/Alignment.h +++ b/phrase-extract/extract-ghkm/Alignment.h @@ -25,6 +25,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -35,5 +37,5 @@ void ReadAlignment(const std::string &, Alignment &); void FlipAlignment(Alignment &); } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining - diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 9dba71331..21708bdfc 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -34,6 +34,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -242,36 +244,24 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root) return p; } -// Finds the set of frontier nodes. The definition of a frontier node differs -// from Galley et al's (2004) in the following ways: -// -// 1. A node with an empty span is not a frontier node (this excludes -// unaligned target subtrees). -// 2. Target word nodes are not frontier nodes. -// 3. Source word nodes are not frontier nodes. -// 4. Unless the --AllowUnary option is used, a node is not a frontier node if -// it has the same span as its parent. +// Recursively constructs the set of frontier nodes for the tree (or subtree) +// rooted at the given node. void AlignmentGraph::ComputeFrontierSet(Node *root, const Options &options, std::set &frontierSet) const { - // Don't include word nodes or unaligned target subtrees. + // Non-tree nodes and unaligned target subtrees are not frontier nodes (and + // nor are their descendants). See the comment for the function + // AlignmentGraph::IsFrontierNode(). if (root->GetType() != TREE || root->GetSpan().empty()) { return; } - if (!SpansIntersect(root->GetComplementSpan(), Closure(root->GetSpan()))) { - // Unless unary rules are explicitly allowed, we use Chung et al's (2011) - // modified defintion of a frontier node to eliminate the production of - // non-lexical unary rules. 
- assert(root->GetParents().size() <= 1); - if (options.allowUnary - || root->GetParents().empty() - || root->GetParents()[0]->GetSpan() != root->GetSpan()) { - frontierSet.insert(root); - } + if (IsFrontierNode(*root, options)) { + frontierSet.insert(root); } + // Recursively check descendants. const std::vector &children = root->GetChildren(); for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { @@ -279,6 +269,37 @@ void AlignmentGraph::ComputeFrontierSet(Node *root, } } +// Determines whether the given node is a frontier node or not. The definition +// of a frontier node differs from Galley et al's (2004) in the following ways: +// +// 1. A node with an empty span is not a frontier node (this is to exclude +// unaligned target subtrees). +// 2. Target word nodes are not frontier nodes. +// 3. Source word nodes are not frontier nodes. +// 4. Unless the --AllowUnary option is used, a node is not a frontier node if +// it has the same span as its parent. +bool AlignmentGraph::IsFrontierNode(const Node &n, const Options &options) const +{ + // Don't include word nodes or unaligned target subtrees. + if (n.GetType() != TREE || n.GetSpan().empty()) { + return false; + } + // This is the original GHKM definition of a frontier node. + if (SpansIntersect(n.GetComplementSpan(), Closure(n.GetSpan()))) { + return false; + } + // Unless unary rules are explicitly allowed, we use Chung et al's (2011) + // modified defintion of a frontier node to eliminate the production of + // non-lexical unary rules. + assert(n.GetParents().size() <= 1); + if (!options.allowUnary && + !n.GetParents().empty() && + n.GetParents()[0]->GetSpan() == n.GetSpan()) { + return false; + } + return true; +} + void AlignmentGraph::CalcComplementSpans(Node *root) { Span compSpan; @@ -393,4 +414,5 @@ Node *AlignmentGraph::DetermineAttachmentPoint(int index) } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.h b/phrase-extract/extract-ghkm/AlignmentGraph.h index 032b946f0..be1182c16 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.h +++ b/phrase-extract/extract-ghkm/AlignmentGraph.h @@ -32,6 +32,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -64,6 +66,7 @@ private: Node *CopyParseTree(const SyntaxTree *); void ComputeFrontierSet(Node *, const Options &, std::set &) const; + bool IsFrontierNode(const Node &, const Options &) const; void CalcComplementSpans(Node *); void GetTargetTreeLeaves(Node *, std::vector &); void AttachUnalignedSourceWords(); @@ -78,6 +81,7 @@ private: }; } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/ComposedRule.cpp b/phrase-extract/extract-ghkm/ComposedRule.cpp index d322a255f..b4f6a6fcd 100644 --- a/phrase-extract/extract-ghkm/ComposedRule.cpp +++ b/phrase-extract/extract-ghkm/ComposedRule.cpp @@ -29,6 +29,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -128,4 +130,5 @@ Subgraph ComposedRule::CreateSubgraph() } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ComposedRule.h b/phrase-extract/extract-ghkm/ComposedRule.h index d456fd27c..9ff910293 100644 --- a/phrase-extract/extract-ghkm/ComposedRule.h +++ b/phrase-extract/extract-ghkm/ComposedRule.h @@ -28,6 +28,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -67,6 +69,7 @@ private: }; } // namespace GHKM +} // namespace Syntax } // 
namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index a4e8afcd3..8a415eb71 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -55,6 +55,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -131,8 +133,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::string sourceLine; std::string alignmentLine; Alignment alignment; - Syntax::XmlTreeParser targetXmlTreeParser; - Syntax::XmlTreeParser sourceXmlTreeParser; + XmlTreeParser targetXmlTreeParser; + XmlTreeParser sourceXmlTreeParser; ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options); StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options); size_t lineNum = options.sentenceOffset; @@ -160,7 +162,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) try { targetParseTree = targetXmlTreeParser.Parse(targetLine); assert(targetParseTree.get()); - } catch (const Syntax::Exception &e) { + } catch (const Exception &e) { std::ostringstream oss; oss << "Failed to parse target XML tree at line " << lineNum; if (!e.msg().empty()) { @@ -178,7 +180,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) try { sourceParseTree = sourceXmlTreeParser.Parse(sourceLine); assert(sourceParseTree.get()); - } catch (const Syntax::Exception &e) { + } catch (const Exception &e) { std::ostringstream oss; oss << "Failed to parse source XML tree at line " << lineNum; if (!e.msg().empty()) { @@ -192,7 +194,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Read word alignments. try { ReadAlignment(alignmentLine, alignment); - } catch (const Syntax::Exception &e) { + } catch (const Exception &e) { std::ostringstream oss; oss << "Failed to read alignment at line " << lineNum << ": "; oss << e.msg(); @@ -896,4 +898,5 @@ void ExtractGHKM::StripBitParLabels( } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.h b/phrase-extract/extract-ghkm/ExtractGHKM.h index 0d0fa8bf1..170de7ae9 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.h +++ b/phrase-extract/extract-ghkm/ExtractGHKM.h @@ -32,12 +32,14 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { struct Options; -class ExtractGHKM : public Syntax::Tool +class ExtractGHKM : public Tool { public: ExtractGHKM() : Tool("extract-ghkm") {} @@ -76,4 +78,5 @@ private: }; } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Main.cpp b/phrase-extract/extract-ghkm/Main.cpp index 64b3e0f00..f7a2173fb 100644 --- a/phrase-extract/extract-ghkm/Main.cpp +++ b/phrase-extract/extract-ghkm/Main.cpp @@ -21,6 +21,6 @@ int main(int argc, char *argv[]) { - MosesTraining::GHKM::ExtractGHKM tool; + MosesTraining::Syntax::GHKM::ExtractGHKM tool; return tool.Main(argc, argv); } diff --git a/phrase-extract/extract-ghkm/Node.cpp b/phrase-extract/extract-ghkm/Node.cpp index 384db3306..382fda996 100644 --- a/phrase-extract/extract-ghkm/Node.cpp +++ b/phrase-extract/extract-ghkm/Node.cpp @@ -23,6 +23,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -70,4 +72,5 @@ void Node::GetTargetWords(std::vector &targetWords) const } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Node.h b/phrase-extract/extract-ghkm/Node.h index 71a24b28e..81f4a46b9 100644 --- a/phrase-extract/extract-ghkm/Node.h +++ b/phrase-extract/extract-ghkm/Node.h @@ 
-30,6 +30,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -215,6 +217,7 @@ Node *Node::LowestCommonAncestor(InputIterator first, InputIterator last) } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/Options.h b/phrase-extract/extract-ghkm/Options.h index f694fb55c..429469883 100644 --- a/phrase-extract/extract-ghkm/Options.h +++ b/phrase-extract/extract-ghkm/Options.h @@ -23,6 +23,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -89,5 +91,5 @@ public: }; } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining - diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.cpp b/phrase-extract/extract-ghkm/PhraseOrientation.cpp index 57952d580..f07e19a46 100644 --- a/phrase-extract/extract-ghkm/PhraseOrientation.cpp +++ b/phrase-extract/extract-ghkm/PhraseOrientation.cpp @@ -28,6 +28,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -469,5 +471,5 @@ void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining - diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.h b/phrase-extract/extract-ghkm/PhraseOrientation.h index 572124e61..d956e2bc8 100644 --- a/phrase-extract/extract-ghkm/PhraseOrientation.h +++ b/phrase-extract/extract-ghkm/PhraseOrientation.h @@ -32,6 +32,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -120,4 +122,5 @@ private: }; } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Rule.cpp b/phrase-extract/extract-ghkm/Rule.cpp index 1b7207c3c..b4b59f8e3 100644 --- a/phrase-extract/extract-ghkm/Rule.cpp +++ b/phrase-extract/extract-ghkm/Rule.cpp @@ -5,6 +5,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -38,4 +40,5 @@ bool Rule::PartitionOrderComp(const Node *a, const Node *b) } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Rule.h b/phrase-extract/extract-ghkm/Rule.h index b87934735..5317be7c8 100644 --- a/phrase-extract/extract-ghkm/Rule.h +++ b/phrase-extract/extract-ghkm/Rule.h @@ -9,6 +9,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -54,6 +56,7 @@ protected: }; } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index 1a49c862e..e26b17a87 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -28,6 +28,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -197,4 +199,5 @@ void ScfgRule::UpdateSourceLabelCoocCounts(std::map< std::string, std::map Date: Tue, 7 Jul 2015 00:12:20 +0100 Subject: [PATCH 117/286] Added seeding of random generator to produce the same results across repeated runs of the decoder. 
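Illustration (not part of the patch): the reason a fixed seed gives repeatable results is that every BitextSampler now constructs its generator with the constant seed 0 (the added m_rnd(0) in the diff below), so each run draws exactly the same pseudo-random sequence and therefore selects the same samples. A minimal standalone sketch of the principle, shown with std::mt19937 from <random> purely for illustration rather than the boost::taus88 member the sampler actually holds:

    #include <cassert>
    #include <iostream>
    #include <random>

    int main() {
      // Two generators seeded with the same constant produce identical draws,
      // so any sampling decision based on them is reproducible across runs.
      std::mt19937 run1(0), run2(0);                   // fixed seed, as in m_rnd(0)
      std::uniform_int_distribution<int> pick(0, 999);
      for (int i = 0; i < 5; ++i) {
        int a = pick(run1);
        int b = pick(run2);
        assert(a == b);                                // same seed, same sequence
        std::cout << a << ' ';
      }
      std::cout << '\n';
      return 0;
    }

The trade-off is that repeated runs are no longer statistically independent of each other, which is acceptable because run-to-run comparability is the stated goal of the change.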
--- moses/TranslationModel/UG/mm/ug_bitext_sampler.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h index d3845415d..f987af79f 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h @@ -186,6 +186,8 @@ BitextSampler(Bitext const* const bitext, , m_ctr(0) , m_total_bias(0) , m_finished(false) + , m_rnd(0) + , m_rnd_denom(m_rnd.max() + 1) { m_stats.reset(new pstats); m_stats->raw_cnt = phrase.ca(); @@ -205,6 +207,8 @@ BitextSampler(BitextSampler const& other) , m_method(other.m_method) , m_bias(other.m_bias) , m_samples(other.m_samples) + , m_rnd(0) + , m_rnd_denom(m_rnd.max() + 1) { // lock both instances boost::unique_lock mylock(m_lock); From 8bdbfe583f44db9e44cb3c97755960ce9de27bb2 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Tue, 7 Jul 2015 00:12:56 +0100 Subject: [PATCH 118/286] 1. Added initialization of pstats cache on ContextForQuery. 2. Code cleanup: removed obsolete code. --- moses/TranslationModel/UG/mmsapt.cpp | 61 ++++------------------------ 1 file changed, 7 insertions(+), 54 deletions(-) diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 651621f96..703b60ec6 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -53,16 +53,6 @@ namespace Moses } } -#if 0 - Mmsapt:: - Mmsapt(string const& description, string const& line) - : PhraseDictionary(description,line), ofactor(1,0), m_bias_log(NULL) - , m_bias_loglevel(0) - { - this->init(line); - } -#endif - vector const& Mmsapt:: GetFeatureNames() const @@ -392,14 +382,6 @@ namespace Moses } } - // void - // Mmsapt:: - // add_corpus_specific_features(vector >& registry) - // { - // check_ff >("pbwd",m_lbop_conf,registry); - // check_ff >("logcnt",registry); - // } - void Mmsapt:: Load() @@ -655,12 +637,6 @@ namespace Moses for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i) mdyn.extend(sphrase[i]); -#if 0 - cerr << src << endl; - cerr << mfix.size() << ":" << mfix.getPid() << " " - << mdyn.size() << " " << mdyn.getPid() << endl; -#endif - if (mdyn.size() != sphrase.size() && mfix.size() != sphrase.size()) return NULL; // phrase not found in either bitext @@ -746,32 +722,6 @@ namespace Moses } } #endif - - -#if 0 - if (combine_pstats(src, - mfix.getPid(), sfix.get(), btfix, - mdyn.getPid(), sdyn.get(), *dyn, ret)) - { -#if 0 - sort(ret->begin(), ret->end(), CompareTargetPhrase()); - cout << "SOURCE PHRASE: " << src << endl; - size_t i = 0; - for (TargetPhraseCollection::iterator r = ret->begin(); r != ret->end(); ++r) - { - cout << ++i << " " << **r << endl; - FVector fv = (*r)->GetScoreBreakdown().CreateFVector(); - typedef pair item_t; - BOOST_FOREACH(item_t f, fv) - cout << f.first << ":" << f.second << " "; - cout << endl; - } -#endif - } -#endif - - // put the result in the cache and return - cache->add(phrasekey, ret); return ret; } @@ -831,8 +781,8 @@ namespace Moses //so that other functions can utilize the biases; ttask->ReSetContextWeights(context->bias->getBiasMap()); } - if (!context->cache1) context->cache1.reset(new pstats::cache_t); - if (!context->cache2) context->cache2.reset(new pstats::cache_t); + // if (!context->cache1) context->cache1.reset(new pstats::cache_t); + // if (!context->cache2) context->cache2.reset(new pstats::cache_t); } else if (!ttask->GetContextWeights().empty()) { @@ -846,9 +796,11 @@ namespace Moses = 
btfix->SetupDocumentBias(ttask->GetContextWeights(), m_bias_log); context->bias->loglevel = m_bias_loglevel; context->bias->log = m_bias_log; - if (!context->cache1) context->cache1.reset(new pstats::cache_t); - if (!context->cache2) context->cache2.reset(new pstats::cache_t); + // if (!context->cache1) context->cache1.reset(new pstats::cache_t); + // if (!context->cache2) context->cache2.reset(new pstats::cache_t); } + if (!context->cache1) context->cache1.reset(new pstats::cache_t); + if (!context->cache2) context->cache2.reset(new pstats::cache_t); } void @@ -907,6 +859,7 @@ namespace Moses set_bias_for_ranking(ttask, this->btfix); else if (m_sampling_method == random_sampling) set_bias_via_server(ttask); + else UTIL_THROW2("Unknown sampling method: " << m_sampling_method); boost::unique_lock mylock(m_lock); sptr localcache = scope->get(cache_key); From 36bab92fcfadc2d335fc4655c47fd5ba1347e9e9 Mon Sep 17 00:00:00 2001 From: XapaJIaMnu Date: Tue, 7 Jul 2015 11:14:50 +0100 Subject: [PATCH 119/286] Test and fix errors in the profiling part of speedtest and update documentation. Works. --- contrib/moses-speedtest/README.md | 8 ++++---- contrib/moses-speedtest/runtests.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/contrib/moses-speedtest/README.md b/contrib/moses-speedtest/README.md index 5a8fe47aa..cdf3eb652 100644 --- a/contrib/moses-speedtest/README.md +++ b/contrib/moses-speedtest/README.md @@ -63,21 +63,21 @@ The _Variants:_ line specifies what type of tests should we run. This particular If you want to produce profiler results together in some tests you need to specify the _MOSES\_PROFILER\_REPO_ in the config ```bash git clone https://github.com/moses-smt/mosesdecoder.git mosesdecoder-profile -cd mosesdecoder +cd mosesdecoder-profile ./bjam -j10 --with-cmph=/usr/include/ variant=profile ``` -Afterwards for testcases which contain the **profile** keyword in **Variants** you will see a directory inside _TEST\_LOG\_DIR which contains the **gprof** output from every run. +Afterwards for testcases which contain the **profile** keyword in **Variants** you will see a directory inside _TEST\_LOG\_DIR which contains the **gprof** output from every run (files ending in **\_profile**). #### Produce google profiler results. If you want to produce profiler results together in some tests you need to specify the _MOSES\_GOOGLE\_PROFILER\_REPO in the config ```bash git clone https://github.com/moses-smt/mosesdecoder.git mosesdecoder-google-profile cd mosesdecoder -./bjam link=shared -j10 --full-tcmalloc- -with-cmph=/usr/include/ +./bjam link=shared -j10 --full-tcmalloc --with-cmph=/usr/include/ ``` -Afterwards for testcases which contain the **google-profiler** keyword in **Variants** you will see a directory inside _TEST\_LOG\_DIR which contains the **google-profiler** output from every run. +Afterwards for testcases which contain the **google-profiler** keyword in **Variants** you will see a directory inside _TEST\_LOG\_DIR which contains the **google-profiler** output from every run (files prefixed with **pprof**). To analyze the output you need to use [pprof](http://google-perftools.googlecode.com/svn/trunk/doc/cpuprofile.html). ### Running tests. Running the tests is done through the **runtests.py** script. 
diff --git a/contrib/moses-speedtest/runtests.py b/contrib/moses-speedtest/runtests.py index f05506137..19d601d42 100644 --- a/contrib/moses-speedtest/runtests.py +++ b/contrib/moses-speedtest/runtests.py @@ -119,7 +119,7 @@ def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None, moses_ if moses_prof_repo is not None: # Get optional command for profiling prof_command = moses_prof_repo + '/bin/' + command if moses_gprof_repo is not None: # Get optional command for google-perftools - gprof_command = moses_gprof_repo + '/bin' + command + gprof_command = moses_gprof_repo + '/bin/' + command command = moses_repo + '/bin/' + command elif opt == 'LDPRE:': ldopts = args.replace('\n', '') @@ -226,14 +226,14 @@ def write_gprof(command, name, variant, config): executable_path = command.split(' ')[0] # Path to the moses binary gprof_command = 'gprof ' + executable_path + ' ' + gmon_path + ' > ' + outputfile subprocess.call([gprof_command], shell=True) - os.remove('gmon_path') # After we are done discard the gmon file + os.remove(gmon_path) # After we are done discard the gmon file def write_pprof(name, variant, config): """Copies the google-perftools profiler output to the corresponding test directory""" output_dir = config.testlogs + '/' + name if not os.path.exists(output_dir): os.makedirs(output_dir) - outputfile = output_dir + '/gprof_' + time.strftime("%d.%m.%Y_%H:%M:%S") + '_' + name + '_' + variant + outputfile = output_dir + '/pprof_' + time.strftime("%d.%m.%Y_%H:%M:%S") + '_' + name + '_' + variant shutil.move("/tmp/moses.prof", outputfile) @@ -373,7 +373,7 @@ if __name__ == '__main__': for logfile in os.listdir(CONFIG.testlogs): logfile_name = CONFIG.testlogs + '/' + logfile - if not check_for_basever(logfile_name, CONFIG.basebranch): + if os.path.isfile(logfile_name) and not check_for_basever(logfile_name, CONFIG.basebranch): logfile = logfile.replace('_vanilla', '') logfile = logfile.replace('_cached', '') logfile = logfile.replace('_ldpre', '') @@ -384,7 +384,7 @@ if __name__ == '__main__': #Create a new configuration for base version tests: BASECONFIG = Configuration(CONFIG.repo, CONFIG.drop_caches,\ CONFIG.tests, CONFIG.testlogs, CONFIG.basebranch,\ - CONFIG.baserev, CONFIG.repo_prof) + CONFIG.baserev, CONFIG.repo_prof, CONFIG.repo_gprof) BASECONFIG.additional_args(None, CONFIG.baserev, CONFIG.basebranch) #Set up the repository and get its revision: REVISION = repoinit(BASECONFIG) @@ -406,7 +406,7 @@ if __name__ == '__main__': #Perform tests for directory in FIRSTTIME: cur_testcase = parse_configfile(BASECONFIG.tests + '/' + directory +\ - '/config', directory, BASECONFIG.repo) + '/config', directory, BASECONFIG.repo, BASECONFIG.repo_prof, BASECONFIG.repo_gprof) execute_tests(cur_testcase, directory, BASECONFIG) #Reset back the repository to the normal configuration @@ -430,10 +430,10 @@ if __name__ == '__main__': if CONFIG.singletest: TESTCASE = parse_configfile(CONFIG.tests + '/' +\ - CONFIG.singletest + '/config', CONFIG.singletest, CONFIG.repo) + CONFIG.singletest + '/config', CONFIG.singletest, CONFIG.repo, CONFIG.repo_prof, CONFIG.repo_gprof) execute_tests(TESTCASE, CONFIG.singletest, CONFIG) else: for directory in ALL_DIR: cur_testcase = parse_configfile(CONFIG.tests + '/' + directory +\ - '/config', directory, CONFIG.repo) + '/config', directory, CONFIG.repo, CONFIG.repo_prof, CONFIG.repo_gprof) execute_tests(cur_testcase, directory, CONFIG) From 03e19dd915506bc0e3973d2bc8e74aac5f3c5900 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Tue, 7 Jul 2015 
20:16:41 +0100 Subject: [PATCH 120/286] Commented out m_rnd_denom. Not used. --- moses/TranslationModel/UG/mm/ug_bitext_sampler.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h index f987af79f..a069ef008 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h @@ -63,7 +63,7 @@ BitextSampler : public reference_counter bool m_finished; boost::taus88 m_rnd; // every job has its own pseudo random generator - double m_rnddenom; // denominator for scaling random sampling + // double m_rnd_denom; // denominator for scaling random sampling double m_bias_total; bool consider_sample(TokenPosition const& p); @@ -187,7 +187,7 @@ BitextSampler(Bitext const* const bitext, , m_total_bias(0) , m_finished(false) , m_rnd(0) - , m_rnd_denom(m_rnd.max() + 1) + // , m_rnd_denom(m_rnd.max() + 1) { m_stats.reset(new pstats); m_stats->raw_cnt = phrase.ca(); @@ -208,7 +208,7 @@ BitextSampler(BitextSampler const& other) , m_bias(other.m_bias) , m_samples(other.m_samples) , m_rnd(0) - , m_rnd_denom(m_rnd.max() + 1) + // , m_rnd_denom(m_rnd.max() + 1) { // lock both instances boost::unique_lock mylock(m_lock); From 37342f6231e4809b8b8dc3e54b3a3b44216e3c51 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 8 Jul 2015 00:14:22 +0100 Subject: [PATCH 121/286] Changed verbosity level to 2 for using default values for FF weights. --- moses/FF/Factory.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index 167e02370..74c54117e 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -165,8 +165,8 @@ FeatureFactory << "WARNING: Auto-initializing all weights for this FF to 1.0"); weights.assign(feature->GetNumScoreComponents(),1.0); } else { - TRACE_ERR("WARNING: No weights specified in config file for FF " - << featureName << ". Using default values supplied by FF."); + VERBOSE(2,"WARNING: No weights specified in config file for FF " + << featureName << ". 
Using default values supplied by FF."); } } UTIL_THROW_IF2(weights.size() != feature->GetNumScoreComponents(), From 3fdbb00904cd849d221a10848623923a79d8c4b3 Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Fri, 10 Jul 2015 15:44:24 +0100 Subject: [PATCH 122/286] Improvements to handling of bilingual LM in EMS --- scripts/ems/example/config.toy.bilinguallm | 18 ++-- scripts/ems/experiment.meta | 8 +- scripts/ems/experiment.perl | 85 +++++++++++++++++-- .../training/bilingual-lm/create_blm_ini.py | 52 ++++++++++++ scripts/training/train-model.perl | 4 +- 5 files changed, 145 insertions(+), 22 deletions(-) create mode 100755 scripts/training/bilingual-lm/create_blm_ini.py diff --git a/scripts/ems/example/config.toy.bilinguallm b/scripts/ems/example/config.toy.bilinguallm index cd6880f32..fbc85866e 100644 --- a/scripts/ems/example/config.toy.bilinguallm +++ b/scripts/ems/example/config.toy.bilinguallm @@ -194,21 +194,19 @@ raw-corpus = $toy-data/nc-5k.$output-extension [LM:bilingual-lm] #bilingual-lm -exclude-from-interpolation = true + +#required settings bilingual-lm = "yes" -bilingual-lm-workdir = "bilingual" -bilingual-lm-settings = "" order = "5" source-window = "4" +nplm-dir = "/mnt/gna0/rsennrich/tools/nplm-0.3-gpu-experimental/" -#actual training -train_order = "14" #this is equal to order + 2*source-window + 1 -nplm-output-dir = "nplm_out" -nplm-settings = "-l /mnt/gna0/rsennrich/tools/nplm-0.3-gpu-experimental/" +# Add extra settings for ngram extraction or nplm training +#bilingual-lm-settings = "" +#nplm-settings = "" -#Config file generation: -config-feature-line = "BilingualNPLM order=$order source_window=$source-window path=$working-dir/$nplm-output-dir/train.10k.model.nplm.10 source_vocab=$working-dir/$bilingual-lm-workdir/vocab.source target_vocab=$working-dir/$bilingual-lm-workdir/vocab.target" -config-weight-line = "BilingualNPLM0= 0.1" +# Defaults to 10 +#epochs = 2 ################################################################# # INTERPOLATING LANGUAGE MODELS diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index ee6b188e8..6113bcc61 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -160,20 +160,18 @@ train ignore-if: no-splitter-training [LM] multiple -prepare-bilingual-nplm +prepare-bilingual-lm in: TRAINING:corpus TRAINING:word-alignment out: numberized_ngrams ignore-unless: bilingual-lm rerun-on-change: TRAINING:corpus TRAINING:word-alignment - template: $moses-script-dir/training/bilingual-lm/extract_training.py -c IN0 -e $output-extension -f $input-extension -a IN1.$TRAINING:alignment-symmetrization-method -w $working-dir/$bilingual-lm-workdir -n $order -m $source-window $bilingual-lm-settings - default-name: lm/bilingualLM_prep + default-name: lm/blm train-bilingual-lm in: numberized_ngrams TRAINING:corpus out: binlm ignore-unless: bilingual-lm rerun-on-change: numberized_ngrams - template: $moses-script-dir/training/bilingual-lm/train_nplm.py -w $working-dir/$bilingual-lm-workdir -c IN1 -r $working-dir/$nplm-output-dir -n $train_order $nplm-settings - default-name: lm/bilingualLM + default-name: lm/blm get-corpus in: get-corpus-script out: raw-corpus diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index a3f5310a5..efdda5df9 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1135,6 +1135,12 @@ sub define_step { elsif ($DO_STEP[$i] =~ /^LM:(.+):train-randomized$/) { &define_lm_train_randomized($i,$1); } + elsif ($DO_STEP[$i] =~ /^LM:(.+):train-bilingual-lm$/) { + 
&define_lm_train_bilingual_lm($i,$1); + } + elsif ($DO_STEP[$i] =~ /^LM:(.+):prepare-bilingual-lm$/) { + &define_lm_prepare_bilingual_lm($i,$1); + } elsif ($DO_STEP[$i] eq 'TRAINING:prepare-data') { &define_training_prepare_data($i); } @@ -1777,6 +1783,69 @@ sub define_lm_train_randomized { &create_step($step_id,$cmd); } +sub define_lm_train_bilingual_lm { + my ($step_id,$set) = @_; + my ($working_dir, $ngrams, $corpus) = &get_output_and_input($step_id); + my $scripts = &check_backoff_and_get("LM:moses-script-dir"); + my $cmd = "$scripts/training/bilingual-lm/train_nplm.py -w $working_dir -c $corpus -r $working_dir"; + my $nplm_dir = &check_backoff_and_get("LM:$set:nplm-dir"); + $cmd .= " -l $nplm_dir"; + + my ($n, $m, $total_order) = &get_bilingual_lm_order($set); + $cmd .= " -n $total_order"; + + my $epochs = &get_bilingual_lm_epochs($set); + $cmd .= " -e $epochs" if defined($epochs); + + my $nplm_settings = backoff_and_get("LM:$set:nplm-settings"); + $cmd .= " $nplm_settings" if defined($nplm_settings); + + # Create the ini file + $cmd .= "\n"; + $cmd .= "$scripts/training/bilingual-lm/create_blm_ini.py -w $working_dir -n $n -m $m -x $set -e $epochs"; + + &create_step($step_id,$cmd); +} + +sub define_lm_prepare_bilingual_lm { + my ($step_id,$set) = @_; + my ($working_dir, $corpus, $align) = &get_output_and_input($step_id); + my $scripts = &check_backoff_and_get("LM:moses-script-dir"); + my $cmd = "$scripts/training/bilingual-lm/extract_training.py -w $working_dir -c $corpus"; + + my $input_extension = &check_backoff_and_get("GENERAL:input-extension"); + my $output_extension = &check_backoff_and_get("GENERAL:output-extension"); + $cmd .= " -e $output_extension -f $input_extension"; + + my $align_method = &check_backoff_and_get("TRAINING:alignment-symmetrization-method"); + $cmd .= " -a $align.$align_method"; + + my ($n, $m, $total_order) = &get_bilingual_lm_order($set); + $cmd .= " -n $n -m $m"; + + my $bilingual_settings = backoff_and_get("LM:$set:bilingual-lm-settings"); + $cmd .= " $bilingual_settings" if defined($bilingual_settings); + + + &create_step($step_id,$cmd); +} + +sub get_bilingual_lm_order { + my ($set) = @_; + my $order = &backoff_and_get("LM:$set:order"); + $order = 5 unless defined ($order); + my $source_window = &backoff_and_get("LM:$set:source-window"); + $source_window = 4 unless defined ($order); + return ($order, $source_window, $order + 2*$source_window+1); +} + +sub get_bilingual_lm_epochs { + my ($set) = @_; + my $epochs = &backoff_and_get("LM:$set:epochs"); + $epochs = 10 unless defined($epochs); + return $epochs; +} + sub define_lm_randomize { my ($step_id,$set_dummy) = @_; @@ -2548,7 +2617,8 @@ sub define_training_create_config { } # sparse lexical features provide additional content for config file - $cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features; + my @additional_ini_files; + push (@additional_ini_files, "$sparse_lexical_features.ini") if $sparse_lexical_features; my @LM_SETS = &get_sets("LM"); my %INTERPOLATED_AWAY; @@ -2599,8 +2669,9 @@ sub define_training_create_config { if (&get("LM:$set:config-feature-line") && &get("LM:$set:config-weight-line")) { $feature_lines .= &get("LM:$set:config-feature-line") . ";"; $weight_lines .= &get("LM:$set:config-weight-line") . 
";"; - } - else { + } elsif (&get("LM:$set:bilingual-lm")) { + push(@additional_ini_files, "$lm/blm.ini"); + } else { my $order = &check_backoff_and_get("LM:$set:order"); my $lm_file = "$lm"; @@ -2629,13 +2700,15 @@ sub define_training_create_config { } } - if (defined($feature_lines)) { + if ($feature_lines) { $cmd .= "-config-add-feature-lines \"$feature_lines\" "; } - if (defined($weight_lines)) { + if ($weight_lines) { $cmd .= "-config-add-weight-lines \"$weight_lines\" "; } + $cmd .= "-additional-ini-file " . join(":", @additional_ini_files); + &create_step($step_id,$cmd); } @@ -2795,7 +2868,7 @@ sub get_interpolated_lm_sets { my $count=0; my $icount=0; foreach my $set (@LM_SETS) { - next if (&get("LM:$set:exclude-from-interpolation")); + next if (&get("LM:$set:exclude-from-interpolation")) or (&get("LM:$set:bilingual-lm")); my $order = &check_backoff_and_get("LM:$set:order"); my $factor = 0; diff --git a/scripts/training/bilingual-lm/create_blm_ini.py b/scripts/training/bilingual-lm/create_blm_ini.py new file mode 100755 index 000000000..44099e9b3 --- /dev/null +++ b/scripts/training/bilingual-lm/create_blm_ini.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +import argparse +import os +import os.path +import sys + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-w", "--working-dir", dest="working_dir") + parser.add_argument("-n", "--target-context", dest="n") + parser.add_argument("-m", "--source-context", dest="m") + parser.add_argument("-i", "--ini_filename", dest="ini_filename") + parser.add_argument("-x", "--name", dest="name") + parser.add_argument("-e", "--epochs", dest="epochs") + + + parser.set_defaults( + working_dir="working", + n = "5", + m = "4", + ini_filename = "blm.ini", + name = "comb", + epochs = "10" + ) + + options = parser.parse_args() + + if not os.path.exists(options.working_dir): + os.makedirs(options.working_dir) + + # Bit of a hack, parse the working directory to get the name + name = os.path.basename(options.working_dir).split(".")[0].split("-")[-1] + + ini_filename = os.path.join(options.working_dir,options.ini_filename) + with open(ini_filename,"w") as ifh: + print>>ifh, "[feature]" + print>>ifh,"BilingualNPLM name=BLM%s order=%s source_window=%s path=%s/train.10k.model.nplm.%s source_vocab=%s/vocab.source target_vocab=%s/vocab.target" \ + % (options.name,options.n, options.m, options.working_dir, options.epochs, options.working_dir, options.working_dir) + print>>ifh + print>>ifh,"[weight]" + print>>ifh,"BLM%s= 0.1" % options.name + print>>ifh + + +if __name__ == "__main__": + main() + diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 4d73ef4ee..112e9d286 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -2343,7 +2343,9 @@ sub create_ini { } if ($_ADDITIONAL_INI_FILE) { print INI "\n# additional settings\n\n"; - print INI `cat $_ADDITIONAL_INI_FILE`; + for my $AIF (split (/:/, $_ADDITIONAL_INI_FILE)) { + print INI `cat $AIF`; + } } # feature functions and weights From cc5f12894498a72b3ff80f44df63ef8479d82861 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 11 Jul 2015 00:24:20 +0100 Subject: [PATCH 123/286] Allow 'ranked' as alias for sampling method 'rank'. 
--- moses/TranslationModel/UG/mmsapt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 703b60ec6..5a911f2b4 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -227,7 +227,7 @@ namespace Moses if ((m = param.find("method")) != param.end()) { - if (m->second == "rank") + if (m->second == "rank" || m->second == "ranked") m_sampling_method = ranked_sampling; else if (m->second == "random") m_sampling_method = random_sampling; From da117e7d3e5ecd79d0313cfc0ebc48b14a60d19a Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 13 Jul 2015 14:25:17 +0100 Subject: [PATCH 124/286] Bug fix related to m_ttask_flag in TargetPhrase. Also changed m_ttask to weak pointer in TargetPhrase. --- moses/TargetPhrase.cpp | 6 +++--- moses/TargetPhrase.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index 21f185498..e8cdb64b6 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -144,7 +144,7 @@ TargetPhrase::TargetPhrase(const TargetPhrase ©) , m_alignNonTerm(copy.m_alignNonTerm) , m_properties(copy.m_properties) , m_ttask(copy.m_ttask) - , m_ttask_flag(true) + , m_ttask_flag(copy.m_ttask_flag) , m_container(copy.m_container) { if (copy.m_lhsTarget) { @@ -182,9 +182,9 @@ bool TargetPhrase::HasTtaskSPtr() const return m_ttask_flag; } -const ttasksptr& TargetPhrase::GetTtask() const +const ttasksptr TargetPhrase::GetTtask() const { - return m_ttask; + return m_ttask.lock(); } void TargetPhrase::EvaluateInIsolation(const Phrase &source) diff --git a/moses/TargetPhrase.h b/moses/TargetPhrase.h index 4e6b1278b..56ed27af3 100644 --- a/moses/TargetPhrase.h +++ b/moses/TargetPhrase.h @@ -61,7 +61,7 @@ public: private: ScoreCache_t m_cached_scores; - ttasksptr m_ttask; + ttaskwptr m_ttask; bool m_ttask_flag; private: @@ -92,7 +92,7 @@ public: TargetPhrase(ttasksptr &ttask, const PhraseDictionary *pt = NULL); TargetPhrase(ttasksptr &ttask, std::string out_string, const PhraseDictionary *pt = NULL); explicit TargetPhrase(ttasksptr &ttask, const Phrase &targetPhrase, const PhraseDictionary *pt); - const ttasksptr& GetTtask() const; + const ttasksptr GetTtask() const; bool HasTtaskSPtr() const; ~TargetPhrase(); From 0abef8c581a8cab3075f610d366bed22c1c9664e Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 13 Jul 2015 17:50:14 +0100 Subject: [PATCH 125/286] Moved m_ttask and m_ttash_flag in initialization orderto avoid compiler warnings. 
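Background for the reorder below: non-static data members are always constructed in the order they are declared in the class, not in the order they appear in a constructor's initializer list. When the two orders differ, g++ and clang++ warn (GCC's -Wreorder, enabled by -Wall), which is most likely the warning this patch silences by listing m_ttask and m_ttask_flag in declaration order. A standalone sketch (not Moses code) of the situation:

    struct Example {
      bool m_flag;   // declared first  -> always constructed first
      int  m_value;  // declared second -> always constructed second

      // Warns with -Wreorder: m_value is listed before m_flag, but m_flag is
      // still constructed first, so the written order is misleading.
      explicit Example(int v) : m_value(v), m_flag(false) {}

      // No warning: initializer list matches the declaration order.
      Example() : m_flag(false), m_value(0) {}
    };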
--- moses/TargetPhrase.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index fc7af9687..8e95fc0aa 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -42,13 +42,13 @@ namespace Moses { TargetPhrase::TargetPhrase( std::string out_string, const PhraseDictionary *pt) :Phrase(0) + , m_ttask_flag(false) , m_fullScore(0.0) , m_futureScore(0.0) , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) , m_lhsTarget(NULL) , m_ruleSource(NULL) - , m_ttask_flag(false) , m_container(pt) { //ACAT @@ -61,14 +61,14 @@ TargetPhrase::TargetPhrase( std::string out_string, const PhraseDictionary *pt) TargetPhrase::TargetPhrase(ttasksptr& ttask, std::string out_string, const PhraseDictionary *pt) :Phrase(0) + , m_ttask(ttask) + , m_ttask_flag(true) , m_fullScore(0.0) , m_futureScore(0.0) , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) , m_lhsTarget(NULL) , m_ruleSource(NULL) - , m_ttask(ttask) - , m_ttask_flag(true) , m_container(pt) { From e94007c7f401be81184f197aa84b8080208ef93c Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 13 Jul 2015 17:51:44 +0100 Subject: [PATCH 126/286] Mmsapt can now handle factorized phrase tables with more than one factor. --- moses/TranslationModel/UG/mmsapt.cpp | 31 +++++++++++++++++----------- moses/TranslationModel/UG/mmsapt.h | 10 ++++++--- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 5a911f2b4..7ce29f1cc 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -21,14 +21,14 @@ namespace Moses using namespace boost; void - fillIdSeq(Phrase const& mophrase, size_t const ifactor, + fillIdSeq(Phrase const& mophrase, std::vector const& ifactors, TokenIndex const& V, vector& dest) { dest.resize(mophrase.GetSize()); for (size_t i = 0; i < mophrase.GetSize(); ++i) { - Factor const* f = mophrase.GetFactor(i,ifactor); - dest[i] = V[f->ToString()]; + // Factor const* f = mophrase.GetFactor(i,ifactor); + dest[i] = V[mophrase.GetWord(i).GetString(ifactors, false)]; // f->ToString()]; } } @@ -72,7 +72,8 @@ namespace Moses , cache_key(((char*)this)+2) , context_key(((char*)this)+1) // , m_tpc_ctr(0) - , ofactor(1,0) + // , m_ifactor(1,0) + // , m_ofactor(1,0) { init(line); setup_local_feature_functions(); @@ -149,12 +150,17 @@ namespace Moses // set defaults for all parameters if not specified so far pair dflt("input-factor","0"); - input_factor = atoi(param.insert(dflt).first->second.c_str()); - // shouldn't that be a string? 
- + string ifactors = param.insert(dflt).first->second; + size_t p = 0; + for (size_t q = ifactors.find(','); q < ifactors.size(); q = ifactors.find(',', p=q+1)) + m_ifactor.push_back(atoi(ifactors.substr(p, q-p).c_str())); + m_ifactor.push_back(atoi(ifactors.substr(p).c_str())); + dflt = pair ("output-factor","0"); - output_factor = atoi(param.insert(dflt).first->second.c_str()); - ofactor.assign(1,output_factor); + string ofactors = param.insert(dflt).first->second; + for (size_t q = ofactors.find(',', p=0); q < ifactors.size(); q = ifactors.find(',', p=q+1)) + m_ofactor.push_back(atoi(ifactors.substr(p, q-p).c_str())); + m_ofactor.push_back(atoi(ofactors.substr(p).c_str())); dflt = pair ("smooth",".01"); m_lbop_conf = atof(param.insert(dflt).first->second.c_str()); @@ -561,7 +567,8 @@ namespace Moses for (uint32_t k = 0; k < len; ++k, x = x->next()) { StringPiece wrd = (*(btfix->V2))[x->id()]; - Word w; w.CreateFromString(Output,ofactor,wrd,false); + Word w; + w.CreateFromString(Output, m_ofactor, wrd, false); tp->AddWord(w); } tp->SetAlignTerm(pool.aln); @@ -616,7 +623,7 @@ namespace Moses { // map from Moses Phrase to internal id sequence vector sphrase; - fillIdSeq(src,input_factor,*(btfix->V1),sphrase); + fillIdSeq(src, m_ifactor, *(btfix->V1), sphrase); if (sphrase.size() == 0) return NULL; // Reserve a local copy of the dynamic bitext in its current form. /btdyn/ @@ -895,7 +902,7 @@ namespace Moses sptr const& scope = ttask->GetScope(); vector myphrase; - fillIdSeq(phrase,input_factor,*btfix->V1,myphrase); + fillIdSeq(phrase, m_ifactor, *btfix->V1, myphrase); TSA::tree_iterator mfix(btfix->I1.get(),&myphrase[0],myphrase.size()); if (mfix.size() == myphrase.size()) diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index a717ccff8..77c955af5 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -95,8 +95,12 @@ namespace Moses boost::shared_ptr m_bias; // for global default bias boost::shared_ptr m_cache; // for global default bias size_t m_cache_size; // - size_t input_factor; // - size_t output_factor; // we can actually return entire Tokens! + // size_t input_factor; // + // size_t output_factor; // we can actually return entire Tokens! 
+ + std::vector m_input_factor; + std::vector m_output_factor; + // for display for human inspection (ttable dumps): std::vector m_feature_names; // names of features activated @@ -139,7 +143,7 @@ namespace Moses // for more complex operations on the cache bool withPbwd; bool poolCounts; - std::vector ofactor; + std::vector m_ifactor, m_ofactor; void setup_local_feature_functions(); void set_bias_via_server(ttasksptr const& ttask); From b13b9a36beeb4329d606c125b6404c393682b4f7 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Tue, 14 Jul 2015 00:00:43 +0100 Subject: [PATCH 127/286] daily automatic beautifier --- moses/BaseManager.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/BaseManager.cpp b/moses/BaseManager.cpp index e51f7e3a0..fc01d9144 100644 --- a/moses/BaseManager.cpp +++ b/moses/BaseManager.cpp @@ -28,7 +28,7 @@ BaseManager::GetSource() const } const ttasksptr -BaseManager::GetTtask() const +BaseManager::GetTtask() const { return m_ttask.lock(); } From 7e3050f7f219962e618d56f1a934b6a9cfc7036c Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Tue, 14 Jul 2015 05:27:03 -0400 Subject: [PATCH 128/286] allow saving of model from fast-align (for incremental use) --- scripts/ems/experiment.meta | 24 +++++-- scripts/ems/experiment.perl | 20 ++++-- scripts/ems/support/fast-align-in-parts.perl | 69 ++++++++++++++++++-- 3 files changed, 97 insertions(+), 16 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 6113bcc61..54773f184 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -521,14 +521,14 @@ fast-align in: prepared-data-fast-align out: fast-alignment rerun-on-change: fast-align-settings - ignore-if: fast-align-max-lines + ignore-if: fast-align-max-lines fast-align-save-model template: $external-bin-dir/fast_align -i IN $fast-align-settings > OUT default-name: fast-align fast-align-inverse in: prepared-data-fast-align out: fast-alignment-inverse rerun-on-change: fast-align-settings - ignore-if: fast-align-max-lines + ignore-if: fast-align-max-lines fast-align-save-model template: $external-bin-dir/fast_align -i IN -r $fast-align-settings > OUT default-name: fast-align-inverse fast-align-in-parts @@ -537,7 +537,7 @@ fast-align-in-parts rerun-on-change: fast-align-settings fast-align-max-lines ignore-unless: fast-align-max-lines tmp-name: training/tmp.fast-align - template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' > OUT + template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' -save-model '$?fast-align-save-model' -o OUT default-name: fast-align fast-align-in-parts-inverse in: prepared-data-fast-align @@ -545,8 +545,24 @@ fast-align-in-parts-inverse rerun-on-change: fast-align-settings fast-align-max-lines ignore-unless: fast-align-max-lines tmp-name: training/tmp.fast-align-inverse - template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -r -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' > OUT + template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -r -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' -save-model '$?fast-align-save-model' -o OUT + default-name: fast-align-inverse 
+fast-align-save-model + in: prepared-data-fast-align + out: fast-alignment + ignore-unless: fast-align-save-model + ignore-if: fast-align-max-lines default-name: fast-align + tmp-name: training/tmp.fast-align-inverse + template: $external-bin-dir/fast_align -i IN $fast-align-settings -p OUT.parameters > OUT 2> OUT.log +fast-align-save-model-inverse + in: prepared-data-fast-align + out: fast-alignment-inverse + ignore-unless: fast-align-save-model + ignore-if: fast-align-max-lines + default-name: fast-align-inverse + tmp-name: training/tmp.fast-align-inverse + template: $external-bin-dir/fast_align -r -i IN $fast-align-settings -p OUT.parameters > OUT 2> OUT.log symmetrize-fast-align in: fast-alignment fast-alignment-inverse corpus-mml-prefilter=OR=corpus out: word-alignment diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index efdda5df9..8846acc1c 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -3527,12 +3527,20 @@ sub define_template { } $cmd =~ s/VERSION/$VERSION/g; print "\tcmd is $cmd\n" if $VERBOSE; - while ($cmd =~ /^([\S\s]*)\$\{([^\s\/\"\']+)\}([\S\s]*)$/ || - $cmd =~ /^([\S\s]*)\$([^\s\/\"\']+)([\S\s]*)$/) { - my ($pre,$variable,$post) = ($1,$2,$3); - $cmd = $pre - . &check_backoff_and_get(&extend_local_name($module,$set,$variable)) - . $post; + + # replace variables + while ($cmd =~ /^([\S\s]*)\$(\??)\{([^\s\/\"\']+)\}([\S\s]*)$/ || + $cmd =~ /^([\S\s]*)\$(\??)([^\s\/\"\']+)([\S\s]*)$/) { + my ($pre,$optional,$variable,$post) = ($1,$2,$3,$4); + my $value; + if ($optional eq '?') { + $value = &backoff_and_get(&extend_local_name($module,$set,$variable)); + $value = "" unless $value; + } + else { + $value = &check_backoff_and_get(&extend_local_name($module,$set,$variable)); + } + $cmd = $pre.$value.$post; } # deal with pipelined commands diff --git a/scripts/ems/support/fast-align-in-parts.perl b/scripts/ems/support/fast-align-in-parts.perl index bc340a50f..4180f7308 100755 --- a/scripts/ems/support/fast-align-in-parts.perl +++ b/scripts/ems/support/fast-align-in-parts.perl @@ -12,28 +12,33 @@ use warnings; use strict; use Getopt::Long qw(:config pass_through no_ignore_case permute); -my ($BIN,$IN,$MAX_LINES,$SETTINGS,$REVERSE,$TMP); +my ($BIN,$IN,$OUT,$MAX_LINES,$SETTINGS,$REVERSE,$SAVE_MODEL,$TMP); GetOptions('bin=s' => \$BIN, 'i=s' => \$IN, + 'o=s' => \$OUT, 'max-lines=i' => \$MAX_LINES, 'settings=s' => \$SETTINGS, + 'save-model=s' => \$SAVE_MODEL, 'r' => \$REVERSE, 'tmp=s' => \$TMP, ) or exit(1); -die("ERROR - usage: fast-align-in-parts.perl -bin FAST_ALIGN_BIN -i PARALLEL_CORPUS -max-lines COUNT -settings CONFIG [-r] -tmp TMPDIR") - unless defined($BIN) && defined($IN) && defined($SETTINGS) && defined($TMP) && defined($MAX_LINES) +die("ERROR - usage: fast-align-in-parts.perl -bin FAST_ALIGN_BIN -i PARALLEL_CORPUS -max-lines COUNT -settings CONFIG [-r] -tmp TMPDIR [-save-model MODEL] -o ALIGNMENTS") + unless defined($BIN) && defined($IN) && defined($SETTINGS) && defined($TMP) + && defined($MAX_LINES) && defined($OUT) && $MAX_LINES > 0; die("ERROR - input file does not exist: $IN") unless -e $IN; die("ERROR - fast_align binary does not exist: $BIN") unless -e $BIN; +$SAVE_MODEL = defined($SAVE_MODEL) && $SAVE_MODEL && $SAVE_MODEL ne 'no'; chomp(my $line_count = `cat $IN | wc -l`); # not more than maximal number of lines -> just run it regulary if ($MAX_LINES > $line_count) { - my $cmd = "$BIN -i $IN $SETTINGS"; + my $cmd = "$BIN -i $IN $SETTINGS >$OUT"; $cmd .= " -r" if defined($REVERSE); + $cmd .= " -p $OUT.parameters 2> 
$OUT.log" if $SAVE_MODEL; safesystem($cmd) or die; exit(0); } @@ -56,6 +61,7 @@ foreach my $input_file (@INPUT_FILES) { # process part my $cmd = "$BIN -i $input_file $SETTINGS"; $cmd .= " -r" if defined($REVERSE); + $cmd .= " -p $output_file.parameters 2> $output_file.log" if $SAVE_MODEL; $cmd .= " >$output_file"; safesystem($cmd) or die; die("ERROR: no output produced from command $cmd") unless -e $output_file; @@ -67,12 +73,63 @@ foreach my $input_file (@INPUT_FILES) { } # join output -$cmd = "cat $TMP/aligned-*"; +$cmd = "cat $TMP/aligned-?? > $OUT"; safesystem($cmd) or die; -$cmd = "rm -r $TMP/* ; rmdir $TMP"; +# join model +&join_model(scalar @INPUT_FILES) if $SAVE_MODEL; +&join_log(scalar @INPUT_FILES) if $SAVE_MODEL; + +$cmd = "rm $TMP/* ; rmdir $TMP"; safesystem($cmd); +sub join_model { + my ($count) = @_; + open(CONCAT,"cat $TMP/aligned-*.parameters | LC_ALL=C sort -T $TMP -S 10%|"); + open(JOINED,">$OUT.parameters"); + my ($last_f,$last_e,$f,$e,$score,$merged_score); + while() { + ($f,$e,$score) = split; + if (!defined($last_f) || $f ne $last_f || $e ne $last_e) { + printf JOINED "%s %s %f\n",$last_f,$last_e,log($merged_score) if defined($last_f); + $last_f = $f; + $last_e = $e; + $merged_score = 0; + } + $merged_score += exp($score)/$count; + } + printf JOINED "%s %s %f\n",$f,$e,log($merged_score); + close(CONCAT); + close(JOINED); +} + +sub merge_entry { + my ($count,$f,$e,@SCORE) = @_; + my $score = 0; + foreach (@SCORE) { + $score += exp($_)/$count; + } + $score = log($score); + print JOINED "$f $e $score\n"; +} + +sub join_log { + my ($count) = @_; + open(CONCAT,"cat $TMP/aligned-*.log |"); + my ($length,$tension,$tension_count) = (0,0,0); + while() { + $length += $1 if /expected target length = source length \* ([\d\.]+)/; + $tension += $1 if /final tension: ([\d\.]+)/ and (++$tension_count % 3 == 0); + } + close(CONCAT); + $length /= $count; + $tension /= $count; + open(JOINED,">$OUT.log"); + print JOINED "expected target length = source length * $length\n"; + print JOINED " final tension: $tension\n"; + close(JOINED); +} + sub safesystem { print STDERR "Executing: @_\n"; system(@_); From 3c30210dad5acef8407f44c19232860b48436e1d Mon Sep 17 00:00:00 2001 From: David Madl Date: Tue, 14 Jul 2015 13:04:00 +0100 Subject: [PATCH 129/286] Fix 'Use of uninitialized value' error through explicit setting of 0s in hash. Fixes the following errors in bootstrap-hypothesis-difference-significance.pl on Perl v5.14.2: Use of uninitialized value $coocUpd in numeric gt (>) at /fs/lofn0/dmadl/software/mosesdecoder/scripts/analysis/bootstrap-hypothesis-difference-significance.pl line 317. Use of uninitialized value $b in numeric lt (<) at /fs/lofn0/dmadl/software/mosesdecoder/scripts/analysis/bootstrap-hypothesis-difference-significance.pl line 543. Use of uninitialized value $coocUpd in addition (+) at /fs/lofn0/dmadl/software/mosesdecoder/scripts/analysis/bootstrap-hypothesis-difference-significance.pl line 314. Use of uninitialized value $coocUpd in numeric gt (>) at /fs/lofn0/dmadl/software/mosesdecoder/scripts/analysis/bootstrap-hypothesis-difference-significance.pl line 317. Use of uninitialized value $a in numeric gt (>) at /fs/lofn0/dmadl/software/mosesdecoder/scripts/analysis/bootstrap-hypothesis-difference-significance.pl line 552. 
--- .../bootstrap-hypothesis-difference-significance.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl index 9a3f63d69..aa98e2c4f 100755 --- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl +++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl @@ -313,6 +313,9 @@ sub preEvalHypoSnt { #correct, total for my $ngram (keys %$hypNgramCounts) { + if (!exists $refNgramCounts->{$ngram}) { + $refNgramCounts->{$ngram} = 0; + } $coocUpd = min($hypNgramCounts->{$ngram}, $refNgramCounts->{$ngram}); $correctNgramCounts += $coocUpd; $totalNgramCounts += $hypNgramCounts->{$ngram}; @@ -514,6 +517,9 @@ sub groupNgramsMultiSrc { my $currNgramCounts = groupNgrams($ref->[$lineIdx], $order); for my $currNgram (keys %$currNgramCounts) { + if (!exists $result{$currNgram}) { + $result{$currNgram} = 0; + } $result{$currNgram} = max($result{$currNgram}, $currNgramCounts->{$currNgram}); } } From ca72105fdf4490e2e9da1b769d500a48445fdc3f Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Tue, 14 Jul 2015 13:16:11 +0100 Subject: [PATCH 130/286] fix ems regression --- scripts/ems/experiment.perl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 8846acc1c..fab1eb7b9 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -2707,7 +2707,9 @@ sub define_training_create_config { $cmd .= "-config-add-weight-lines \"$weight_lines\" "; } - $cmd .= "-additional-ini-file " . join(":", @additional_ini_files); + if (@additional_ini_files) { + $cmd .= "-additional-ini-file " . join(":", @additional_ini_files); + } &create_step($step_id,$cmd); } From 66ecf98cf7b5bd74a8ad187c59e5679a1f1dee04 Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Tue, 14 Jul 2015 11:01:22 -0400 Subject: [PATCH 131/286] minor bug fix --- scripts/ems/experiment.meta | 1 + scripts/ems/experiment.perl | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 54773f184..c2fc38260 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -775,6 +775,7 @@ create-config rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini mmsapt no-glue-grammar dont-tune-glue-grammar use-syntax-input-weight-feature default-name: model/moses.ini error: Unknown option + error: requires an argument final-model: yes binarize-config in: config diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index fab1eb7b9..e2a01d123 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -853,7 +853,7 @@ sub delete_output { `rm -r $file` if $EXECUTE; } # delete regular file that matches exactly - if (-e $file) { + elsif (-e $file) { print "\tdelete file $file\n"; `rm $file` if $EXECUTE; } @@ -864,14 +864,14 @@ sub delete_output { foreach (`ls $dir`) { chop; next unless substr($_,0,length($f)) eq $f; - if (-e "$dir/$_") { + if (-d "$dir/$_") { + print "\tdelete directory $file\n"; + `rm -r $dir/$_` if $EXECUTE; + } + elsif (-e "$dir/$_") { print "\tdelete file $dir/$_\n"; `rm $dir/$_` if $EXECUTE; } - else { - print "\tdelete directory $dir/$_\n"; - `rm -r $dir/$_` if $EXECUTE; - } } } From 
0ca2bcb28d2bd0658a252bcfafe0820bc1e49660 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Thu, 16 Jul 2015 15:51:16 +0700 Subject: [PATCH 132/286] End line after printing progress dots to stderr. --- phrase-extract/consolidate-main.cpp | 4 ++++ phrase-extract/extract-main.cpp | 4 ++++ phrase-extract/score-main.cpp | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index 5964bf686..732185eb3 100644 --- a/phrase-extract/consolidate-main.cpp +++ b/phrase-extract/consolidate-main.cpp @@ -259,6 +259,7 @@ void processFiles( const std::string& fileNameDirect, // loop through all extracted phrase translations int i=0; while(true) { + // Print progress dots to stderr. i++; if (i%100000 == 0) std::cerr << "." << std::flush; @@ -436,6 +437,9 @@ void processFiles( const std::string& fileNameDirect, fileDirect.Close(); fileIndirect.Close(); fileConsolidated.Close(); + + // We've been printing progress dots to stderr. End the line. + std::cerr << std::endl; } diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index 70d4cad35..328a33a25 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -283,6 +283,7 @@ int main(int argc, char* argv[]) string englishString, foreignString, alignmentString, weightString; while(getline(*eFileP, englishString)) { + // Print progress dots to stderr. i++; if (i%10000 == 0) cerr << "." << flush; @@ -337,6 +338,9 @@ int main(int argc, char* argv[]) extractFileContextInv.Close(); } } + + // We've been printing progress dots to stderr. End the line. + cerr << endl; } namespace MosesTraining diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index cf28f90b9..444087b2d 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -359,6 +359,7 @@ int main(int argc, char* argv[]) while ( getline(extractFile, line) ) { + // Print progress dots to stderr. if ( ++i % 100000 == 0 ) { std::cerr << "." << std::flush; } @@ -450,6 +451,9 @@ int main(int argc, char* argv[]) } + // We've been printing progress dots to stderr. End the line. + std::cerr << std::endl; + processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb ); for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); iter!=phrasePairsWithSameSource.end(); ++iter) { From d4508e984f9a9d224ee59df17fca80dc8fce0f7f Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Thu, 16 Jul 2015 16:48:20 +0700 Subject: [PATCH 133/286] Compare nullness in Hypothesis::RecombineCompare. The comparison code subtracted two pointers in the case where at least one was null, to get a signed int comparison result. But that subtraction is undefined, and the cast to int (dropping the most-significant bits!) made the outcome even less uncertain. In this patch I compare the nullness of the pointers instead, which should always return a well-defined -1, 0, or 1. --- moses/Hypothesis.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp index 59d53a61d..cc51d5cee 100644 --- a/moses/Hypothesis.cpp +++ b/moses/Hypothesis.cpp @@ -213,7 +213,8 @@ RecombineCompare(const Hypothesis &compare) const for (unsigned i = 0; i < m_ffStates.size(); ++i) { if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL) { - comp = m_ffStates[i] - compare.m_ffStates[i]; + // TODO: Can this situation actually occur? 
+ comp = int(m_ffStates[i] != NULL) - int(compare.m_ffStates[i] != NULL); } else { comp = m_ffStates[i]->Compare(*compare.m_ffStates[i]); } From 43300459b351b8f6edbde712c575750edafa3156 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Thu, 16 Jul 2015 17:56:20 +0700 Subject: [PATCH 134/286] Replace WordsBitmap malloc() with std::vector. It'd be great to use the specialized vector, or Boost's dynamic_bitset, for this. But gcc and clang don't have an optimized find() for vector, making it slower rather than faster. And dynamic_bitset doesn't have reverse searches or searches for false. --- moses/WordsBitmap.h | 124 ++++++++++++++++++-------------------------- 1 file changed, 51 insertions(+), 73 deletions(-) diff --git a/moses/WordsBitmap.h b/moses/WordsBitmap.h index ca17f6ac0..7bed93e2f 100644 --- a/moses/WordsBitmap.h +++ b/moses/WordsBitmap.h @@ -22,6 +22,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #ifndef moses_WordsBitmap_h #define moses_WordsBitmap_h +#include #include #include #include @@ -35,50 +36,29 @@ namespace Moses { typedef unsigned long WordsBitmapID; -/** vector of boolean used to represent whether a word has been translated or not -*/ +/** Vector of boolean to represent whether a word has been translated or not. + * + * Implemented using a vector of char. A vector of bool, or a Boost + * dynamic_bitset, could be much more efficient in theory but unfortunately + * algorithms like std::find() are not optimized for them. + */ class WordsBitmap { friend std::ostream& operator<<(std::ostream& out, const WordsBitmap& wordsBitmap); -protected: - const size_t m_size; /**< number of words in sentence */ - bool *m_bitmap; /**< ticks of words that have been done */ - size_t m_firstGap; /** Position of first gap, pre-calculated as it is consulted often */ +private: + std::vector m_bitmap; //! Ticks of words in sentence that have been done. + size_t m_firstGap; //! Cached position of first gap, or NOT_FOUND. WordsBitmap(); // not implemented WordsBitmap& operator= (const WordsBitmap& other); - //! set all elements to false - void Initialize() { - for (size_t pos = 0 ; pos < m_size ; pos++) { - m_bitmap[pos] = false; - } - } - - //sets elements by vector - void Initialize(const std::vector& vector) { - size_t vector_size = vector.size(); - bool gapFound = false; - for (size_t pos = 0 ; pos < m_size ; pos++) { - if (pos < vector_size && vector[pos] == true) m_bitmap[pos] = true; - else { - m_bitmap[pos] = false; - if (!gapFound) { - m_firstGap = pos; - gapFound = true; - } - } - } - if (!gapFound) m_firstGap = NOT_FOUND; - } - /** Update the first gap, when bits are flipped */ void UpdateFirstGap(size_t startPos, size_t endPos, bool value) { if (value) { //may remove gap if (startPos <= m_firstGap && m_firstGap <= endPos) { m_firstGap = NOT_FOUND; - for (size_t i = endPos + 1 ; i < m_size; ++i) { + for (size_t i = endPos + 1 ; i < m_bitmap.size(); ++i) { if (!m_bitmap[i]) { m_firstGap = i; break; @@ -96,38 +76,35 @@ protected: public: - //! create WordsBitmap of length size and initialise with vector - WordsBitmap(size_t size, const std::vector& initialize_vector) - :m_size (size), m_firstGap(0) { - m_bitmap = (bool*) malloc(sizeof(bool) * size); - Initialize(initialize_vector); + //! Create WordsBitmap of length size, and initialise with vector. + WordsBitmap(size_t size, const std::vector& initializer) + :m_bitmap(initializer.begin(), initializer.end()), m_firstGap(0) { + + // The initializer may not be of the same length. 
Change to the desired + // length. If we need to add any elements, initialize them to false. + m_bitmap.resize(size, false); + + // Find the first gap, and cache it. + std::vector::const_iterator first_gap = std::find( + m_bitmap.begin(), m_bitmap.end(), false); + m_firstGap = ( + (first_gap == m_bitmap.end()) ? + NOT_FOUND : first_gap - m_bitmap.begin()); } - //! create WordsBitmap of length size and initialise + + //! Create WordsBitmap of length size and initialise. WordsBitmap(size_t size) - :m_size (size), m_firstGap(0) { - m_bitmap = (bool*) malloc(sizeof(bool) * size); - Initialize(); + :m_bitmap(size, false), m_firstGap(0) { } - //! deep copy + + //! Deep copy. WordsBitmap(const WordsBitmap ©) - :m_size (copy.m_size), m_firstGap(copy.m_firstGap) { - m_bitmap = (bool*) malloc(sizeof(bool) * m_size); - for (size_t pos = 0 ; pos < copy.m_size ; pos++) { - m_bitmap[pos] = copy.GetValue(pos); - } - m_firstGap = copy.m_firstGap; + :m_bitmap(copy.m_bitmap), m_firstGap(copy.m_firstGap) { } - ~WordsBitmap() { - free(m_bitmap); - } - //! count of words translated + + //! Count of words translated. size_t GetNumWordsCovered() const { - size_t count = 0; - for (size_t pos = 0 ; pos < m_size ; pos++) { - if (m_bitmap[pos]) - count++; - } - return count; + return std::count(m_bitmap.begin(), m_bitmap.end(), true); } //! position of 1st word not yet translated, or NOT_FOUND if everything already translated @@ -138,7 +115,7 @@ public: //! position of last word not yet translated, or NOT_FOUND if everything already translated size_t GetLastGapPos() const { - for (int pos = (int) m_size - 1 ; pos >= 0 ; pos--) { + for (int pos = int(m_bitmap.size()) - 1 ; pos >= 0 ; pos--) { if (!m_bitmap[pos]) { return pos; } @@ -150,7 +127,7 @@ public: //! position of last translated word size_t GetLastPos() const { - for (int pos = (int) m_size - 1 ; pos >= 0 ; pos--) { + for (int pos = int(m_bitmap.size()) - 1 ; pos >= 0 ; pos--) { if (m_bitmap[pos]) { return pos; } @@ -163,7 +140,7 @@ public: //! whether a word has been translated at a particular position bool GetValue(size_t pos) const { - return m_bitmap[pos]; + return bool(m_bitmap[pos]); } //! set value at a particular position void SetValue( size_t pos, bool value ) { @@ -198,7 +175,7 @@ public: } //! number of elements size_t GetSize() const { - return m_size; + return m_bitmap.size(); } //! transitive comparison of WordsBitmap @@ -213,7 +190,8 @@ public: if (thisSize != compareSize) { return (thisSize < compareSize) ? -1 : 1; } - return std::memcmp(m_bitmap, compare.m_bitmap, thisSize * sizeof(bool)); + return std::memcmp( + &m_bitmap[0], &compare.m_bitmap[0], thisSize * sizeof(bool)); } bool operator< (const WordsBitmap &compare) const { @@ -229,20 +207,20 @@ public: } inline size_t GetEdgeToTheRightOf(size_t r) const { - if (r+1 == m_size) return r; - while (r+1 < m_size && !m_bitmap[r+1]) { - ++r; - } - return r; + if (r+1 == m_bitmap.size()) return r; + return ( + std::find(m_bitmap.begin() + r + 1, m_bitmap.end(), true) - + m_bitmap.begin() + ) - 1; } //! converts bitmap into an integer ID: it consists of two parts: the first 16 bit are the pattern between the first gap and the last word-1, the second 16 bit are the number of filled positions. 
enforces a sentence length limit of 65535 and a max distortion of 16 WordsBitmapID GetID() const { - assert(m_size < (1<<16)); + assert(m_bitmap.size() < (1<<16)); size_t start = GetFirstGapPos(); - if (start == NOT_FOUND) start = m_size; // nothing left + if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left size_t end = GetLastPos(); if (end == NOT_FOUND) end = 0; // nothing translated yet @@ -257,10 +235,10 @@ public: //! converts bitmap into an integer ID, with an additional span covered WordsBitmapID GetIDPlus( size_t startPos, size_t endPos ) const { - assert(m_size < (1<<16)); + assert(m_bitmap.size() < (1<<16)); size_t start = GetFirstGapPos(); - if (start == NOT_FOUND) start = m_size; // nothing left + if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left size_t end = GetLastPos(); if (end == NOT_FOUND) end = 0; // nothing translated yet @@ -284,8 +262,8 @@ public: // friend inline std::ostream& operator<<(std::ostream& out, const WordsBitmap& wordsBitmap) { - for (size_t i = 0 ; i < wordsBitmap.m_size ; i++) { - out << (wordsBitmap.GetValue(i) ? 1 : 0); + for (size_t i = 0 ; i < wordsBitmap.m_bitmap.size() ; i++) { + out << int(wordsBitmap.GetValue(i)); } return out; } From c83628a92b430a307518b92db7f24924a11c4df2 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Thu, 16 Jul 2015 14:54:00 +0100 Subject: [PATCH 135/286] Fix errors from multiline `` commands in transliteration Perl scripts Replace the backslash-newline sequence with backslash-backslash-newline in multiline backquote command strings. i.e. replace expressions like this: `some-command \ -option1 \ -option2`; with ones like this `some-command \\ -option1 \\ -option2`; If I understand this right, the shell converts a backslash-newline sequence to an empty string (i.e. it discards it), but Perl does not. Unless the backslash itself is escaped, using a backslash-newline in a Perl command string results in errors in most instances. By escaping the backslash, it gets passed through to the shell where it is interpreted as intended. 
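To make the fix concrete, here is a small self-contained Perl sketch; the command name and options are placeholders rather than anything from the transliteration scripts, and the strings are printed instead of executed so the effect is easy to inspect:

    use strict;
    use warnings;

    # Backticks interpolate like double-quoted strings. Writing "\\" yields one
    # literal backslash, so the shell receives a backslash-newline pair and
    # folds the two lines back into a single command line.
    my $fixed = "some-command \\
      -option1 -option2";

    # With only a single backslash, no usable line continuation reaches the
    # shell, and the second line gets run as a command of its own, which is
    # the source of the errors this patch removes.
    my $broken = "some-command \
      -option1 -option2";

    print "escaped backslash, as handed to the shell:\n$fixed\n\n";
    print "single backslash, as handed to the shell:\n$broken\n";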
--- .../post-decoding-transliteration.pl | 106 +++++++++--------- .../prepare-transliteration-phrase-table.pl | 36 +++--- .../train-transliteration-module.pl | 100 ++++++++--------- 3 files changed, 121 insertions(+), 121 deletions(-) diff --git a/scripts/Transliteration/post-decoding-transliteration.pl b/scripts/Transliteration/post-decoding-transliteration.pl index df840c709..c8e19fc7f 100755 --- a/scripts/Transliteration/post-decoding-transliteration.pl +++ b/scripts/Transliteration/post-decoding-transliteration.pl @@ -137,38 +137,38 @@ sub run_transliteration print "Filter Table\n"; - `$MOSES_SRC/scripts/training/train-model.perl \ - -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \ - -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ - -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ - -score-options '--KneserNey' \ - -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \ - -config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \ + `$MOSES_SRC/scripts/training/train-model.perl \\ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\ + -score-options '--KneserNey' \\ + -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \\ + -config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \\ -lm 0:3:$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini:8`; - `$MOSES_SRC/scripts/training/filter-model-given-input.pl \ - $TRANSLIT_MODEL/evaluation/$eval_file.filtered \ - $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \ - $TRANSLIT_MODEL/evaluation/$eval_file \ + `$MOSES_SRC/scripts/training/filter-model-given-input.pl \\ + $TRANSLIT_MODEL/evaluation/$eval_file.filtered \\ + $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \\ + $TRANSLIT_MODEL/evaluation/$eval_file \\ -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`; `rm $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini`; print "Apply Filter\n"; - `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \ - $TRANSLIT_MODEL/evaluation/$eval_file.filtered/moses.ini \ - $TRANSLIT_MODEL/model/moses.ini \ - $TRANSLIT_MODEL/tuning/moses.tuned.ini \ + `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \\ + $TRANSLIT_MODEL/evaluation/$eval_file.filtered/moses.ini \\ + $TRANSLIT_MODEL/model/moses.ini \\ + $TRANSLIT_MODEL/tuning/moses.tuned.ini \\ $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini`; my $drop_stderr = $VERBOSE ? 
"" : " 2>/dev/null"; - `$DECODER \ - -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \ - -threads 16 -drop-unknown -distortion-limit 0 \ - -n-best-list $TRANSLIT_MODEL/evaluation/$eval_file.op.nBest 1000 \ - distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini \ - < $TRANSLIT_MODEL/evaluation/$eval_file \ + `$DECODER \\ + -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \\ + -threads 16 -drop-unknown -distortion-limit 0 \\ + -n-best-list $TRANSLIT_MODEL/evaluation/$eval_file.op.nBest 1000 \\ + distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini \\ + < $TRANSLIT_MODEL/evaluation/$eval_file \\ > $TRANSLIT_MODEL/evaluation/$eval_file.op $drop_stderr`; } @@ -315,52 +315,52 @@ sub run_decoder `mkdir $corpus_dir/evaluation`; - `$MOSES_SRC/scripts/training/train-model.perl \ - -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \ - -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ - -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ - -lmodel-oov-feature "yes" -post-decoding-translit "yes" \ - -phrase-translation-table $corpus_dir/model/phrase-table \ + `$MOSES_SRC/scripts/training/train-model.perl \\ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\ + -lmodel-oov-feature "yes" -post-decoding-translit "yes" \\ + -phrase-translation-table $corpus_dir/model/phrase-table \\ -config $corpus_dir/model/moses.ini -lm 0:5:$LM_FILE:8`; `touch $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini`; - `$MOSES_SRC/scripts/training/train-model.perl \ - -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \ - -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ - -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ - -lmodel-oov-feature "yes" -post-decoding-translit "yes" \ - -phrase-translation-table $corpus_dir/model/phrase-table \ - -config $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \ + `$MOSES_SRC/scripts/training/train-model.perl \\ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\ + -lmodel-oov-feature "yes" -post-decoding-translit "yes" \\ + -phrase-translation-table $corpus_dir/model/phrase-table \\ + -config $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \\ -lm 0:3:$corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini:8`; - `$MOSES_SRC/scripts/training/filter-model-given-input.pl \ - $corpus_dir/evaluation/filtered \ - $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \ - $INPUT_FILE -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt \ + `$MOSES_SRC/scripts/training/filter-model-given-input.pl \\ + $corpus_dir/evaluation/filtered \\ + $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \\ + $INPUT_FILE -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt \\ 1 1 4 100 2"`; `rm $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini`; - `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables.perl \ - $corpus_dir/evaluation/filtered/moses.ini \ - < $corpus_dir/model/moses.ini \ + `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables.perl \\ + $corpus_dir/evaluation/filtered/moses.ini \\ + < $corpus_dir/model/moses.ini \\ > $corpus_dir/evaluation/moses.filtered.ini`; my $drop_stderr = $VERBOSE ? 
"" : " 2>/dev/null"; - `$DECODER \ - -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \ - -threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \ - -max-trans-opt-per-coverage 100 \ - -f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \ - < $INPUT_FILE \ + `$DECODER \\ + -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \\ + -threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \\ + -max-trans-opt-per-coverage 100 \\ + -f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \\ + < $INPUT_FILE \\ > $OUTPUT_FILE $drop_stderr`; - print "$DECODER \ - -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \ - -threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \ - -max-trans-opt-per-coverage 100 \ - -f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \ - < $INPUT_FILE \ + print "$DECODER \\ + -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \\ + -threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \\ + -max-trans-opt-per-coverage 100 \\ + -f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \\ + < $INPUT_FILE \\ > $OUTPUT_FILE $drop_stderr\n"; } diff --git a/scripts/Transliteration/prepare-transliteration-phrase-table.pl b/scripts/Transliteration/prepare-transliteration-phrase-table.pl index fd8b5a978..100ec5747 100755 --- a/scripts/Transliteration/prepare-transliteration-phrase-table.pl +++ b/scripts/Transliteration/prepare-transliteration-phrase-table.pl @@ -103,34 +103,34 @@ sub run_transliteration print STDERR "Filter Table\n"; - `$MOSES_SRC/scripts/training/train-model.perl \ - -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \ - -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ - -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ - -reordering msd-bidirectional-fe -score-options '--KneserNey' \ - -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \ - -reordering-table $TRANSLIT_MODEL/model/reordering-table \ - -config $eval_file.moses.table.ini \ + `$MOSES_SRC/scripts/training/train-model.perl \\ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\ + -reordering msd-bidirectional-fe -score-options '--KneserNey' \\ + -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \\ + -reordering-table $TRANSLIT_MODEL/model/reordering-table \\ + -config $eval_file.moses.table.ini \\ -lm 0:3:$eval_file.moses.table.ini:8`; - `$MOSES_SRC/scripts/training/filter-model-given-input.pl \ - $eval_file.filtered $eval_file.moses.table.ini $eval_file \ + `$MOSES_SRC/scripts/training/filter-model-given-input.pl \\ + $eval_file.filtered $eval_file.moses.table.ini $eval_file \\ -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`; `rm $eval_file.moses.table.ini`; print STDERR "Apply Filter\n"; - `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \ - $eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini \ + `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \\ + $eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini \\ $TRANSLIT_MODEL/tuning/moses.tuned.ini $eval_file.filtered.ini`; - `$MOSES_SRC/bin/moses \ - -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \ - -threads 16 -drop-unknown -distortion-limit 0 \ - -n-best-list $eval_file.op.nBest 50 \ - -f $eval_file.filtered.ini \ - < $eval_file \ + `$MOSES_SRC/bin/moses \\ + -search-algorithm 1 
-cube-pruning-pop-limit 5000 -s 5000 \\ + -threads 16 -drop-unknown -distortion-limit 0 \\ + -n-best-list $eval_file.op.nBest 50 \\ + -f $eval_file.filtered.ini \\ + < $eval_file \\ > $eval_file.op`; } diff --git a/scripts/Transliteration/train-transliteration-module.pl b/scripts/Transliteration/train-transliteration-module.pl index 817e2d815..d072719d1 100755 --- a/scripts/Transliteration/train-transliteration-module.pl +++ b/scripts/Transliteration/train-transliteration-module.pl @@ -118,80 +118,80 @@ sub learn_transliteration_model{ print "Align Corpus\n"; - `$MOSES_SRC_DIR/scripts/training/train-model.perl \ - -mgiza -mgiza-cpus 10 -dont-zip -last-step 1 \ - -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ - -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ - -score-options '--KneserNey' -corpus $OUT_DIR/training/corpus$t \ + `$MOSES_SRC_DIR/scripts/training/train-model.perl \\ + -mgiza -mgiza-cpus 10 -dont-zip -last-step 1 \\ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\ + -score-options '--KneserNey' -corpus $OUT_DIR/training/corpus$t \\ -corpus-dir $OUT_DIR/training/prepared`; - `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 \ - -dont-zip -first-step 2 -last-step 2 \ - -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ - -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ - -score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \ + `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 \\ + -dont-zip -first-step 2 -last-step 2 \\ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\ + -score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \\ -giza-e2f $OUT_DIR/training/giza -direction 2`; - `$MOSES_SRC_DIR/scripts/training/train-model.perl \ - -mgiza -mgiza-cpus 10 -dont-zip -first-step 2 -last-step 2 \ - -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ - -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ - -score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \ + `$MOSES_SRC_DIR/scripts/training/train-model.perl \\ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 2 -last-step 2 \\ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\ + -score-options '--KneserNey' -corpus-dir $OUT_DIR/training/prepared \\ -giza-f2e $OUT_DIR/training/giza-inverse -direction 1`; - `$MOSES_SRC_DIR/scripts/training/train-model.perl \ - -mgiza -mgiza-cpus 10 -dont-zip -first-step 3 -last-step 3 \ - -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ - -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ - -score-options '--KneserNey' -giza-e2f $OUT_DIR/training/giza \ - -giza-f2e $OUT_DIR/training/giza-inverse \ - -alignment-file $OUT_DIR/model/aligned \ + `$MOSES_SRC_DIR/scripts/training/train-model.perl \\ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 3 -last-step 3 \\ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\ + -score-options '--KneserNey' -giza-e2f $OUT_DIR/training/giza \\ + -giza-f2e $OUT_DIR/training/giza-inverse \\ + -alignment-file $OUT_DIR/model/aligned \\ -alignment-stem $OUT_DIR/model/aligned -alignment grow-diag-final-and`; print "Train Translation Models\n"; - `$MOSES_SRC_DIR/scripts/training/train-model.perl \ - -mgiza 
-mgiza-cpus 10 -dont-zip -first-step 4 -last-step 4 \ - -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ - -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ - -score-options '--KneserNey' -lexical-file $OUT_DIR/model/lex \ - -alignment-file $OUT_DIR/model/aligned \ - -alignment-stem $OUT_DIR/model/aligned \ + `$MOSES_SRC_DIR/scripts/training/train-model.perl \\ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 4 -last-step 4 \\ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\ + -score-options '--KneserNey' -lexical-file $OUT_DIR/model/lex \\ + -alignment-file $OUT_DIR/model/aligned \\ + -alignment-stem $OUT_DIR/model/aligned \\ -corpus $OUT_DIR/training/corpus$t`; - `$MOSES_SRC_DIR/scripts/training/train-model.perl \ - -mgiza -mgiza-cpus 10 -dont-zip -first-step 5 -last-step 5 \ - -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ - -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ - -score-options '--KneserNey' -alignment-file $OUT_DIR/model/aligned \ - -alignment-stem $OUT_DIR/model/aligned -extract-file \ + `$MOSES_SRC_DIR/scripts/training/train-model.perl \\ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 5 -last-step 5 \\ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\ + -score-options '--KneserNey' -alignment-file $OUT_DIR/model/aligned \\ + -alignment-stem $OUT_DIR/model/aligned -extract-file \\ $OUT_DIR/model/extract -corpus $OUT_DIR/training/corpus$t`; - `$MOSES_SRC_DIR/scripts/training/train-model.perl \ - -mgiza -mgiza-cpus 10 -dont-zip -first-step 6 -last-step 6 \ - -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ - -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ - -score-options '--KneserNey' -extract-file $OUT_DIR/model/extract \ - -lexical-file $OUT_DIR/model/lex -phrase-translation-table \ + `$MOSES_SRC_DIR/scripts/training/train-model.perl \\ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 6 -last-step 6 \\ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\ + -score-options '--KneserNey' -extract-file $OUT_DIR/model/extract \\ + -lexical-file $OUT_DIR/model/lex -phrase-translation-table \\ $OUT_DIR/model/phrase-table`; print "Train Language Models\n"; - `$SRILM_DIR/ngram-count \ - -order 5 -interpolate -kndiscount -addsmooth1 0.0 -unk \ + `$SRILM_DIR/ngram-count \\ + -order 5 -interpolate -kndiscount -addsmooth1 0.0 -unk \\ -text $OUT_DIR/lm/target -lm $OUT_DIR/lm/targetLM`; - `$MOSES_SRC_DIR/bin/build_binary \ + `$MOSES_SRC_DIR/bin/build_binary \\ $OUT_DIR/lm/targetLM $OUT_DIR/lm/targetLM.bin`; print "Create Config File\n"; - `$MOSES_SRC_DIR/scripts/training/train-model.perl \ - -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \ - -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \ - -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \ - -score-options '--KneserNey' \ - -phrase-translation-table $OUT_DIR/model/phrase-table \ + `$MOSES_SRC_DIR/scripts/training/train-model.perl \\ + -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\ + -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\ + -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\ + -score-options '--KneserNey' \\ + -phrase-translation-table $OUT_DIR/model/phrase-table \\ -config $OUT_DIR/model/moses.ini -lm 0:5:$OUT_DIR/lm/targetLM.bin:8`; } From b3baade7f022edbcea2969679a40616683f63523 Mon Sep 17 
00:00:00 2001 From: Hieu Hoang Date: Thu, 16 Jul 2015 19:55:16 +0100 Subject: [PATCH 136/286] changes to make static build work, and limit the number of concurrent connections to moses server. /Pidong Wang --- Jamroot | 2 +- contrib/server/mosesserver.cpp | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Jamroot b/Jamroot index b3544274b..30c979766 100644 --- a/Jamroot +++ b/Jamroot @@ -179,7 +179,7 @@ if [ option.get "with-icu" : : "yes" ] requirements += icui18n/shared ; requirements += -fPIC ; requirements += 64 ; - requirements += shared ; +# requirements += shared ; } if [ option.get "with-probing-pt" : : "yes" ] diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp index 337962aa6..8af1d82a3 100644 --- a/contrib/server/mosesserver.cpp +++ b/contrib/server/mosesserver.cpp @@ -740,20 +740,23 @@ int main(int argc, char** argv) myRegistry.addMethod("updater", updater); myRegistry.addMethod("optimize", optimizer); + /* CODE FOR old xmlrpc-c v. 1.32 or lower xmlrpc_c::serverAbyss myAbyssServer( myRegistry, port, // TCP port on which to listen logfile ); - /* doesn't work with xmlrpc-c v. 1.16.33 - ie very old lib on Ubuntu 12.04 + */ + + /* doesn't work with xmlrpc-c v. 1.16.33 - ie very old lib on Ubuntu 12.04 */ xmlrpc_c::serverAbyss myAbyssServer( xmlrpc_c::serverAbyss::constrOpt() - .registryPtr(&myRegistry) + .registryP(&myRegistry) .portNumber(port) // TCP port on which to listen .logFileName(logfile) .allowOrigin("*") + .maxConn((unsigned int)numThreads) ); - */ XVERBOSE(1,"Listening on port " << port << endl); if (isSerial) { From 1ffcd17ce5cbe106d622a219beb3ec9421faa24f Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Fri, 17 Jul 2015 00:00:42 +0100 Subject: [PATCH 137/286] daily automatic beautifier --- moses/WordsBitmap.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/moses/WordsBitmap.h b/moses/WordsBitmap.h index 7bed93e2f..af4cf06b6 100644 --- a/moses/WordsBitmap.h +++ b/moses/WordsBitmap.h @@ -86,10 +86,10 @@ public: // Find the first gap, and cache it. std::vector::const_iterator first_gap = std::find( - m_bitmap.begin(), m_bitmap.end(), false); + m_bitmap.begin(), m_bitmap.end(), false); m_firstGap = ( - (first_gap == m_bitmap.end()) ? - NOT_FOUND : first_gap - m_bitmap.begin()); + (first_gap == m_bitmap.end()) ? + NOT_FOUND : first_gap - m_bitmap.begin()); } //! Create WordsBitmap of length size and initialise. @@ -191,7 +191,7 @@ public: return (thisSize < compareSize) ? -1 : 1; } return std::memcmp( - &m_bitmap[0], &compare.m_bitmap[0], thisSize * sizeof(bool)); + &m_bitmap[0], &compare.m_bitmap[0], thisSize * sizeof(bool)); } bool operator< (const WordsBitmap &compare) const { @@ -209,9 +209,9 @@ public: inline size_t GetEdgeToTheRightOf(size_t r) const { if (r+1 == m_bitmap.size()) return r; return ( - std::find(m_bitmap.begin() + r + 1, m_bitmap.end(), true) - - m_bitmap.begin() - ) - 1; + std::find(m_bitmap.begin() + r + 1, m_bitmap.end(), true) - + m_bitmap.begin() + ) - 1; } From 342598af0c51f509aee4f68e0c6e8e7b66ae65bc Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Fri, 17 Jul 2015 19:17:38 +0700 Subject: [PATCH 138/286] Cosmetic. 
--- moses/WordsBitmap.cpp | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/moses/WordsBitmap.cpp b/moses/WordsBitmap.cpp index 4ac2625b9..17340ffac 100644 --- a/moses/WordsBitmap.cpp +++ b/moses/WordsBitmap.cpp @@ -28,20 +28,10 @@ TO_STRING_BODY(WordsBitmap); bool WordsBitmap::IsAdjacent(size_t startPos, size_t endPos) const { - if (GetNumWordsCovered() == 0) { - return true; - } - - size_t first = GetFirstGapPos(); - size_t last = GetLastGapPos(); - - if (startPos == last || endPos == first) { - return true; - } - - return false; + return + GetNumWordsCovered() == 0 || + startPos == GetFirstGapPos() || + endPos == GetLastGapPos(); } - } - From a64468a9919867174beba71962ada795a4ce12d3 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Fri, 17 Jul 2015 19:23:47 +0700 Subject: [PATCH 139/286] Document WordsBitmap performance considerations. --- moses/WordsBitmap.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/moses/WordsBitmap.h b/moses/WordsBitmap.h index af4cf06b6..c1dcb8acf 100644 --- a/moses/WordsBitmap.h +++ b/moses/WordsBitmap.h @@ -38,9 +38,14 @@ typedef unsigned long WordsBitmapID; /** Vector of boolean to represent whether a word has been translated or not. * - * Implemented using a vector of char. A vector of bool, or a Boost - * dynamic_bitset, could be much more efficient in theory but unfortunately - * algorithms like std::find() are not optimized for them. + * Implemented using a vector of char, which is usually the same representation + * for the elements that a C array of bool would use. A vector of bool, or a + * Boost dynamic_bitset, could be much more efficient in theory. Unfortunately + * algorithms like std::find() are not optimized for vector on gcc or + * clang, and dynamic_bitset lacks all the optimized search operations we want. + * Only benchmarking will tell what works best. Perhaps dynamic_bitset could + * still be a dramatic improvement, if we flip the meaning of the bits around + * so we can use its find_first() and find_next() for the most common searches. */ class WordsBitmap { From c1142741a159034eee3b1d10453498c09a77beb0 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 17 Jul 2015 14:10:50 +0100 Subject: [PATCH 140/286] relax-parse: write node attributes to output --- phrase-extract/relax-parse-main.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/phrase-extract/relax-parse-main.cpp b/phrase-extract/relax-parse-main.cpp index f7a2a271b..0ce211e2d 100644 --- a/phrase-extract/relax-parse-main.cpp +++ b/phrase-extract/relax-parse-main.cpp @@ -120,8 +120,13 @@ void store( SyntaxNodeCollection &tree, const vector< string > &words ) for( size_t i=0; istart << "-" << nodes[i]->end - << "\" label=\"" << nodes[i]->label - << "\"/>"; + << "\" label=\"" << nodes[i]->label << "\""; + for (SyntaxNode::AttributeMap::const_iterator + p = nodes[i]->attributes.begin(); + p != nodes[i]->attributes.end(); ++p) { + cout << " " << p->first << "=\"" << p->second << "\""; + } + cout << "/>"; } cout << endl; } From df50e454d2a472b37b0198df4e18d02eabb0d523 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 17 Jul 2015 14:32:30 +0100 Subject: [PATCH 141/286] Proper handling of moses parameters with double dash in create_config(). 
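For illustration, a small standalone Perl sketch of how the widened pattern classifies flag tokens; the option names and values below are hypothetical and not taken from mert-moses.pl:

    use strict;
    use warnings;

    my $flags = "-threads 4 --weight-overwrite dummy -mbr-scale -0.5";

    foreach my $token (split / /, $flags) {
        if ($token =~ /^\--?([^\d].*)$/) {
            # The old pattern /^\-([^\d].*)$/ also matched "--weight-overwrite",
            # but captured "-weight-overwrite" with the second dash attached, so
            # lookups on the bare parameter name would miss it. The optional
            # second dash now strips both spellings down to the bare name.
            print "parameter: $1\n";
        } else {
            # Values, including negative numbers such as -0.5, stay values.
            print "value: $token\n";
        }
    }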
--- scripts/training/mert-moses.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index c73e75a87..6490ec5a5 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -1510,7 +1510,7 @@ sub create_config { $___DECODER_FLAGS =~ s/^\s*|\s*$//; $___DECODER_FLAGS =~ s/\s+/ /; foreach (split(/ /, $___DECODER_FLAGS)) { - if (/^\-([^\d].*)$/) { + if (/^\--?([^\d].*)$/) { $parameter = $1; } else { my $value = $_; From ad4fdc59c240e21ab43abef51ec557a8a2a030e8 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 18 Jul 2015 19:02:47 +0100 Subject: [PATCH 142/286] Re-enabled actual caching (get always returned NULL). Moved point of locking in release() in an attempt to battle segfaults. --- .../UG/TargetPhraseCollectionCache.cc | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/moses/TranslationModel/UG/TargetPhraseCollectionCache.cc b/moses/TranslationModel/UG/TargetPhraseCollectionCache.cc index 1217b9711..54f7ffa5a 100644 --- a/moses/TranslationModel/UG/TargetPhraseCollectionCache.cc +++ b/moses/TranslationModel/UG/TargetPhraseCollectionCache.cc @@ -114,7 +114,8 @@ namespace Moses } encache(m->second); - return NULL; + // return NULL; + return m->second; } // TPCollCache::get(...) void @@ -136,6 +137,8 @@ namespace Moses { if (!ptr) return; + boost::upgrade_lock lock(m_cache_lock); + if (--ptr->refCount || ptr->idx >= 0) // tpc is still in use { ptr = NULL; @@ -151,7 +154,6 @@ namespace Moses << " at " << __FILE__ << ":" << __LINE__ << endl; #endif - boost::upgrade_lock lock(m_cache_lock); cache_t::iterator m = m_cache.find(ptr->key); if (m != m_cache.end() && m->second == ptr) { // the cache could have been updated with a new pointer @@ -161,7 +163,12 @@ namespace Moses boost::upgrade_to_unique_lock xlock(lock); m_cache.erase(m); } + + delete ptr; + + + ptr = NULL; } // TPCollCache::release(...) From e85f353898a6c68990d1f4d56644441126862f0f Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Fri, 17 Jul 2015 14:45:38 +0100 Subject: [PATCH 143/286] code simplification by removing language-specific, unused hack. 
--- moses/LM/RDLM.cpp | 50 +++++-------------- moses/LM/RDLM.h | 4 +- .../training/rdlm/extract_syntactic_ngrams.py | 20 ++------ scripts/training/rdlm/extract_vocab.py | 14 +----- 4 files changed, 18 insertions(+), 70 deletions(-) diff --git a/moses/LM/RDLM.cpp b/moses/LM/RDLM.cpp index 70fabbc6e..1e9f2b4d3 100644 --- a/moses/LM/RDLM.cpp +++ b/moses/LM/RDLM.cpp @@ -290,8 +290,8 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost } std::pair head_ids; - InternalTree* found = GetHead(root, back_pointers, head_ids); - if (found == NULL) { + bool found = GetHead(root, back_pointers, head_ids); + if (!found) { head_ids = std::make_pair(static_dummy_head, static_dummy_head); } @@ -516,7 +516,7 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost ancestor_labels.pop_back(); } -InternalTree* RDLM::GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair & IDs, InternalTree* head_ptr) const +bool RDLM::GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair & IDs) const { InternalTree *tree; @@ -528,51 +528,27 @@ InternalTree* RDLM::GetHead(InternalTree* root, const TreePointerMap & back_poin } if (m_binarized && tree->GetLabel()[0] == '^') { - head_ptr = GetHead(tree, back_pointers, IDs, head_ptr); - if (head_ptr != NULL && !m_isPTKVZ) { - return head_ptr; + bool found = GetHead(tree, back_pointers, IDs); + if (found) { + return true; } } // assumption (only true for dependency parse): each constituent has a preterminal label, and corresponding terminal is head // if constituent has multiple preterminals, first one is picked; if it has no preterminals, dummy_head is returned - else if (tree->GetLength() == 1 && tree->GetChildren()[0]->IsTerminal() && head_ptr == NULL) { - head_ptr = tree; - if (!m_isPTKVZ) { - GetIDs(head_ptr->GetChildren()[0]->GetLabel(), head_ptr->GetLabel(), IDs); - return head_ptr; - } - } - - // add PTKVZ to lemma of verb - else if (m_isPTKVZ && head_ptr && tree->GetLabel() == "avz") { - InternalTree *tree2; - for (std::vector::const_iterator it2 = tree->GetChildren().begin(); it2 != tree->GetChildren().end(); ++it2) { - if ((*it2)->IsLeafNT()) { - tree2 = back_pointers.find(it2->get())->second.get(); - } else { - tree2 = it2->get(); - } - if (tree2->GetLabel() == "PTKVZ" && tree2->GetLength() == 1 && tree2->GetChildren()[0]->IsTerminal()) { - std::string verb = tree2->GetChildren()[0]->GetLabel() + head_ptr->GetChildren()[0]->GetLabel(); - GetIDs(verb, head_ptr->GetLabel(), IDs); - return head_ptr; - } - } + else if (tree->GetLength() == 1 && tree->GetChildren()[0]->IsTerminal()) { + GetIDs(tree->GetChildren()[0]->GetLabel(), tree->GetLabel(), IDs); + return true; } } - if (head_ptr != NULL) { - GetIDs(head_ptr->GetChildren()[0]->GetLabel(), head_ptr->GetLabel(), IDs); - } - return head_ptr; + return false; } void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_label, std::vector & heads, std::vector & labels, std::vector & heads_output, std::vector & labels_output) const { std::pair child_ids; - InternalTree* found; size_t j = 0; // score start label (if enabled) for all nonterminal nodes (but not for terminal or preterminal nodes) @@ -616,8 +592,8 @@ void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & bac continue; } - found = GetHead(child, back_pointers, child_ids); - if (found == NULL) { + bool found = GetHead(child, back_pointers, 
child_ids); + if (!found) { child_ids = std::make_pair(static_dummy_head, static_dummy_head); } @@ -714,8 +690,6 @@ void RDLM::SetParameter(const std::string& key, const std::string& value) m_path_head_lm = value; } else if (key == "path_label_lm") { m_path_label_lm = value; - } else if (key == "ptkvz") { - m_isPTKVZ = Scan(value); } else if (key == "backoff") { m_isPretermBackoff = Scan(value); } else if (key == "context_up") { diff --git a/moses/LM/RDLM.h b/moses/LM/RDLM.h index 1b92ed7c9..c5480b6c4 100644 --- a/moses/LM/RDLM.h +++ b/moses/LM/RDLM.h @@ -68,7 +68,6 @@ class RDLM : public StatefulFeatureFunction std::string m_endTag; std::string m_path_head_lm; std::string m_path_label_lm; - bool m_isPTKVZ; bool m_isPretermBackoff; size_t m_context_left; size_t m_context_right; @@ -111,7 +110,6 @@ public: , m_startSymbol("SSTART") , m_endSymbol("SEND") , m_endTag("") - , m_isPTKVZ(false) , m_isPretermBackoff(true) , m_context_left(3) , m_context_right(0) @@ -133,7 +131,7 @@ public: } void Score(InternalTree* root, const TreePointerMap & back_pointers, boost::array &score, std::vector &ancestor_heads, std::vector &ancestor_labels, size_t &boundary_hash, int num_virtual = 0, int rescoring_levels = 0) const; - InternalTree* GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair & IDs, InternalTree * head_ptr=NULL) const; + bool GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair & IDs) const; void GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_labels, std::vector & heads, std::vector & labels, std::vector & heads_output, std::vector & labels_output) const; void GetIDs(const std::string & head, const std::string & preterminal, std::pair & IDs) const; void ScoreFile(std::string &path); //for debugging diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py index be4ed2335..406523691 100755 --- a/scripts/training/rdlm/extract_syntactic_ngrams.py +++ b/scripts/training/rdlm/extract_syntactic_ngrams.py @@ -89,11 +89,6 @@ def create_parser(): help=( "Sentence end symbol. 
Will be skipped during extraction " "(default: %(default)s)")) - parser.add_argument( - '--ptkvz', action='store_true', - help=( - "Special rule for German dependency trees: " - "concatenate separable verb prefix and verb.")) return parser @@ -107,22 +102,15 @@ def escape_text(s): return s -def get_head(xml, add_ptkvz): +def get_head(xml): """Deterministic heuristic to get head of subtree.""" head = None preterminal = None for child in xml: if not len(child): - if head is not None: - continue preterminal = child.get('label') head = escape_text(child.text.strip()) - - elif add_ptkvz and head and child.get('label') == 'avz': - for grandchild in child: - if grandchild.get('label') == 'PTKVZ': - head = escape_text(grandchild.text.strip()) + head - break + return head, preterminal return head, preterminal @@ -159,7 +147,7 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_labels = ( [vocab.get('', 0)] * options.up_context) - head, preterminal = get_head(xml, options.ptkvz) + head, preterminal = get_head(xml) if not head: head = '' preterminal = head @@ -222,7 +210,7 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, preterminal_child = head_child child_label = '' else: - head_child, preterminal_child = get_head(child, options.ptkvz) + head_child, preterminal_child = get_head(child) child_label = child.get('label') if head_child is None: diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py index 48e5215c3..70b8da612 100755 --- a/scripts/training/rdlm/extract_vocab.py +++ b/scripts/training/rdlm/extract_vocab.py @@ -46,11 +46,6 @@ def create_parser(): parser.add_argument( '--output', '-o', type=str, default='vocab', metavar='PREFIX', help="Output prefix (default: 'vocab')") - parser.add_argument( - '--ptkvz', action="store_true", - help=( - "Special rule for German dependency trees: attach separable " - "verb prefixes to verb.")) return parser @@ -70,16 +65,9 @@ def get_head(xml, args): preterminal = None for child in xml: if not len(child): - if head is not None: - continue preterminal = child.get('label') head = escape_text(child.text.strip()) - - elif args.ptkvz and head and child.get('label') == 'avz': - for grandchild in child: - if grandchild.get('label') == 'PTKVZ': - head = escape_text(grandchild.text.strip()) + head - break + return head, preterminal return head, preterminal From bec950cf728993ab6b7f846cc57f11d34d3f41f1 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Fri, 17 Jul 2015 17:27:31 +0100 Subject: [PATCH 144/286] support factors in InternalTree --- mert/HwcmScorer.h | 5 +- mert/InternalTree.cpp | 113 ++++++++++++++++++++++++ mert/InternalTree.h | 77 +++++++++++++++++ mert/Jamfile | 2 +- moses/FF/InternalTree.cpp | 137 ++++++------------------------ moses/FF/InternalTree.h | 50 +++-------- moses/FF/TreeStructureFeature.cpp | 27 +----- moses/FF/TreeStructureFeature.h | 10 ++- moses/LM/RDLM.cpp | 30 ++++--- moses/LM/RDLM.h | 36 +++++--- 10 files changed, 274 insertions(+), 213 deletions(-) create mode 100644 mert/InternalTree.cpp create mode 100644 mert/InternalTree.h diff --git a/mert/HwcmScorer.h b/mert/HwcmScorer.h index 16d563424..2e52f0be9 100644 --- a/mert/HwcmScorer.h +++ b/mert/HwcmScorer.h @@ -5,10 +5,7 @@ #include #include "StatisticsBasedScorer.h" -#include "moses/FF/InternalTree.h" - -using Moses::TreePointer; -using Moses::InternalTree; +#include "InternalTree.h" namespace MosesTuning { diff --git a/mert/InternalTree.cpp b/mert/InternalTree.cpp new file mode 100644 index 000000000..d82fbcc72 --- 
/dev/null +++ b/mert/InternalTree.cpp @@ -0,0 +1,113 @@ +#include "InternalTree.h" + +namespace MosesTuning +{ + +InternalTree::InternalTree(const std::string & line, const bool terminal): + m_isTerminal(terminal) + { + + size_t found = line.find_first_of("[] "); + + if (found == line.npos) { + m_value = line; + } + + else { + AddSubTree(line, 0); + } +} + +size_t InternalTree::AddSubTree(const std::string & line, size_t pos) { + + std::string value; + char token = 0; + + while (token != ']' && pos != std::string::npos) + { + size_t oldpos = pos; + pos = line.find_first_of("[] ", pos); + if (pos == std::string::npos) break; + token = line[pos]; + value = line.substr(oldpos,pos-oldpos); + + if (token == '[') { + if (m_value.size() > 0) { + m_children.push_back(boost::make_shared(value,false)); + pos = m_children.back()->AddSubTree(line, pos+1); + } + else { + if (value.size() > 0) { + m_value = value; + } + pos = AddSubTree(line, pos+1); + } + } + else if (token == ' ' || token == ']') { + if (value.size() > 0 && !(m_value.size() > 0)) { + m_value = value; + } + else if (value.size() > 0) { + m_isTerminal = false; + m_children.push_back(boost::make_shared(value,true)); + } + if (token == ' ') { + pos++; + } + } + + if (m_children.size() > 0) { + m_isTerminal = false; + } + } + + if (pos == std::string::npos) { + return line.size(); + } + return std::min(line.size(),pos+1); + +} + +std::string InternalTree::GetString(bool start) const { + + std::string ret = ""; + if (!start) { + ret += " "; + } + + if (!m_isTerminal) { + ret += "["; + } + + ret += m_value; + for (std::vector::const_iterator it = m_children.begin(); it != m_children.end(); ++it) + { + ret += (*it)->GetString(false); + } + + if (!m_isTerminal) { + ret += "]"; + } + return ret; + +} + + +void InternalTree::Combine(const std::vector &previous) { + + std::vector::iterator it; + bool found = false; + leafNT next_leafNT(this); + for (std::vector::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) { + found = next_leafNT(it); + if (found) { + *it = *it_prev; + } + else { + std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n"; + } + } +} + + +} diff --git a/mert/InternalTree.h b/mert/InternalTree.h new file mode 100644 index 000000000..f8416101c --- /dev/null +++ b/mert/InternalTree.h @@ -0,0 +1,77 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "util/generator.hh" +#include "util/exception.hh" + +namespace MosesTuning +{ + +class InternalTree; +typedef boost::shared_ptr TreePointer; +typedef int NTLabel; + +class InternalTree +{ +std::string m_value; +std::vector m_children; +bool m_isTerminal; +public: + InternalTree(const std::string & line, const bool terminal = false); + InternalTree(const InternalTree & tree): + m_value(tree.m_value), + m_isTerminal(tree.m_isTerminal) { + const std::vector & children = tree.m_children; + for (std::vector::const_iterator it = children.begin(); it != children.end(); it++) { + m_children.push_back(boost::make_shared(**it)); + } + } + size_t AddSubTree(const std::string & line, size_t start); + + std::string GetString(bool start = true) const; + void Combine(const std::vector &previous); + const std::string & GetLabel() const { + return m_value; + } + + size_t GetLength() const { + return m_children.size(); + } + std::vector & GetChildren() { + return m_children; + } + + bool IsTerminal() const { + return m_isTerminal; + } + + bool IsLeafNT() const { + return (!m_isTerminal && m_children.size() == 
0); + } +}; + +// Python-like generator that yields next nonterminal leaf on every call +$generator(leafNT) { + std::vector::iterator it; + InternalTree* tree; + leafNT(InternalTree* root = 0): tree(root) {} + $emit(std::vector::iterator) + for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) { + if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) { + $yield(it); + } + else if ((*it)->GetLength() > 0) { + if ((*it).get()) { // normal pointer to same object that TreePointer points to + $restart(tree = (*it).get()); + } + } + } + $stop; +}; + +} \ No newline at end of file diff --git a/mert/Jamfile b/mert/Jamfile index 7a8d98bae..e5adce76e 100644 --- a/mert/Jamfile +++ b/mert/Jamfile @@ -30,7 +30,7 @@ InterpolatedScorer.cpp Point.cpp PerScorer.cpp HwcmScorer.cpp -../moses/FF/InternalTree.cpp +InternalTree.cpp Scorer.cpp ScorerFactory.cpp Optimizer.cpp diff --git a/moses/FF/InternalTree.cpp b/moses/FF/InternalTree.cpp index 4a01ea1b2..c38fc5747 100644 --- a/moses/FF/InternalTree.cpp +++ b/moses/FF/InternalTree.cpp @@ -1,27 +1,24 @@ #include "InternalTree.h" +#include "moses/StaticData.h" namespace Moses { -InternalTree::InternalTree(const std::string & line, size_t start, size_t len, const bool terminal): - m_value_nt(0), - m_isTerminal(terminal) +InternalTree::InternalTree(const std::string & line, size_t start, size_t len, const bool nonterminal) { if (len > 0) { - m_value.assign(line, start, len); + m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), StringPiece(line).substr(start, len), nonterminal); } } -InternalTree::InternalTree(const std::string & line, const bool terminal): - m_value_nt(0), - m_isTerminal(terminal) +InternalTree::InternalTree(const std::string & line, const bool nonterminal) { size_t found = line.find_first_of("[] "); if (found == line.npos) { - m_value = line; + m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), line, nonterminal); } else { AddSubTree(line, 0); } @@ -32,6 +29,7 @@ size_t InternalTree::AddSubTree(const std::string & line, size_t pos) char token = 0; size_t len = 0; + bool has_value = false; while (token != ']' && pos != std::string::npos) { size_t oldpos = pos; @@ -41,30 +39,27 @@ size_t InternalTree::AddSubTree(const std::string & line, size_t pos) len = pos-oldpos; if (token == '[') { - if (!m_value.empty()) { - m_children.push_back(boost::make_shared(line, oldpos, len, false)); + if (has_value) { + m_children.push_back(boost::make_shared(line, oldpos, len, true)); pos = m_children.back()->AddSubTree(line, pos+1); } else { if (len > 0) { - m_value.assign(line, oldpos, len); + m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), StringPiece(line).substr(oldpos, len), false); + has_value = true; } pos = AddSubTree(line, pos+1); } } else if (token == ' ' || token == ']') { - if (len > 0 && m_value.empty()) { - m_value.assign(line, oldpos, len); + if (len > 0 && !has_value) { + m_value.CreateFromString(Output, StaticData::Instance().GetOutputFactorOrder(), StringPiece(line).substr(oldpos, len), true); + has_value = true; } else if (len > 0) { - m_isTerminal = false; - m_children.push_back(boost::make_shared(line, oldpos, len, true)); + m_children.push_back(boost::make_shared(line, oldpos, len, false)); } if (token == ' ') { pos++; } } - - if (!m_children.empty()) { - m_isTerminal = false; - } } if (pos == std::string::npos) { @@ -82,16 +77,16 @@ std::string InternalTree::GetString(bool start) const ret += " "; } - if (!m_isTerminal) { + if 
(!IsTerminal()) { ret += "["; } - ret += m_value; + ret += m_value.GetString(StaticData::Instance().GetOutputFactorOrder(), false); for (std::vector::const_iterator it = m_children.begin(); it != m_children.end(); ++it) { ret += (*it)->GetString(false); } - if (!m_isTerminal) { + if (!IsTerminal()) { ret += "]"; } return ret; @@ -120,13 +115,13 @@ void InternalTree::Unbinarize() { // nodes with virtual label cannot be unbinarized - if (m_value.empty() || m_value[0] == '^') { + if (m_value.GetString(0).empty() || m_value.GetString(0).as_string()[0] == '^') { return; } //if node has child that is virtual node, get unbinarized list of children for (std::vector::iterator it = m_children.begin(); it != m_children.end(); ++it) { - if (!(*it)->IsTerminal() && (*it)->GetLabel()[0] == '^') { + if (!(*it)->IsTerminal() && (*it)->GetLabel().GetString(0).as_string()[0] == '^') { std::vector new_children; GetUnbinarizedChildren(new_children); m_children = new_children; @@ -144,8 +139,8 @@ void InternalTree::Unbinarize() void InternalTree::GetUnbinarizedChildren(std::vector &ret) const { for (std::vector::const_iterator itx = m_children.begin(); itx != m_children.end(); ++itx) { - const std::string &label = (*itx)->GetLabel(); - if (!label.empty() && label[0] == '^') { + const StringPiece label = (*itx)->GetLabel().GetString(0); + if (!label.empty() && label.as_string()[0] == '^') { (*itx)->GetUnbinarizedChildren(ret); } else { ret.push_back(*itx); @@ -153,7 +148,7 @@ void InternalTree::GetUnbinarizedChildren(std::vector &ret) const } } -bool InternalTree::FlatSearch(const std::string & label, std::vector::const_iterator & it) const +bool InternalTree::FlatSearch(const Word & label, std::vector::const_iterator & it) const { for (it = m_children.begin(); it != m_children.end(); ++it) { if ((*it)->GetLabel() == label) { @@ -163,7 +158,7 @@ bool InternalTree::FlatSearch(const std::string & label, std::vector::const_iterator & it) const +bool InternalTree::RecursiveSearch(const Word & label, std::vector::const_iterator & it) const { for (it = m_children.begin(); it != m_children.end(); ++it) { if ((*it)->GetLabel() == label) { @@ -178,7 +173,7 @@ bool InternalTree::RecursiveSearch(const std::string & label, std::vector::const_iterator & it, InternalTree const* &parent) const +bool InternalTree::RecursiveSearch(const Word & label, std::vector::const_iterator & it, InternalTree const* &parent) const { for (it = m_children.begin(); it != m_children.end(); ++it) { if ((*it)->GetLabel() == label) { @@ -194,88 +189,4 @@ bool InternalTree::RecursiveSearch(const std::string & label, std::vector::const_iterator & it) const -{ - for (it = m_children.begin(); it != m_children.end(); ++it) { - if ((*it)->GetNTLabel() == label) { - return true; - } - } - return false; -} - -bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector::const_iterator & it) const -{ - for (it = m_children.begin(); it != m_children.end(); ++it) { - if ((*it)->GetNTLabel() == label) { - return true; - } - std::vector::const_iterator it2; - if ((*it)->RecursiveSearch(label, it2)) { - it = it2; - return true; - } - } - return false; -} - -bool InternalTree::RecursiveSearch(const NTLabel & label, std::vector::const_iterator & it, InternalTree const* &parent) const -{ - for (it = m_children.begin(); it != m_children.end(); ++it) { - if ((*it)->GetNTLabel() == label) { - parent = this; - return true; - } - std::vector::const_iterator it2; - if ((*it)->RecursiveSearch(label, it2, parent)) { - it = it2; - return true; - } - } - return 
false; -} - - -bool InternalTree::FlatSearch(const std::vector & labels, std::vector::const_iterator & it) const -{ - for (it = m_children.begin(); it != m_children.end(); ++it) { - if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) { - return true; - } - } - return false; -} - -bool InternalTree::RecursiveSearch(const std::vector & labels, std::vector::const_iterator & it) const -{ - for (it = m_children.begin(); it != m_children.end(); ++it) { - if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) { - return true; - } - std::vector::const_iterator it2; - if ((*it)->RecursiveSearch(labels, it2)) { - it = it2; - return true; - } - } - return false; -} - -bool InternalTree::RecursiveSearch(const std::vector & labels, std::vector::const_iterator & it, InternalTree const* &parent) const -{ - for (it = m_children.begin(); it != m_children.end(); ++it) { - if (std::binary_search(labels.begin(), labels.end(), (*it)->GetNTLabel())) { - parent = this; - return true; - } - std::vector::const_iterator it2; - if ((*it)->RecursiveSearch(labels, it2, parent)) { - it = it2; - return true; - } - } - return false; -} - } \ No newline at end of file diff --git a/moses/FF/InternalTree.h b/moses/FF/InternalTree.h index 8f982c6aa..a3db3487e 100644 --- a/moses/FF/InternalTree.h +++ b/moses/FF/InternalTree.h @@ -5,30 +5,28 @@ #include #include #include "FFState.h" +#include "moses/Word.h" #include #include #include "util/generator.hh" #include "util/exception.hh" +#include "util/string_piece.hh" namespace Moses { class InternalTree; typedef boost::shared_ptr TreePointer; -typedef int NTLabel; class InternalTree { - std::string m_value; - NTLabel m_value_nt; + Word m_value; std::vector m_children; - bool m_isTerminal; public: InternalTree(const std::string & line, size_t start, size_t len, const bool terminal); - InternalTree(const std::string & line, const bool terminal = false); + InternalTree(const std::string & line, const bool nonterminal = true); InternalTree(const InternalTree & tree): - m_value(tree.m_value), - m_isTerminal(tree.m_isTerminal) { + m_value(tree.m_value) { const std::vector & children = tree.m_children; for (std::vector::const_iterator it = children.begin(); it != children.end(); it++) { m_children.push_back(boost::make_shared(**it)); @@ -40,20 +38,10 @@ public: void Combine(const std::vector &previous); void Unbinarize(); void GetUnbinarizedChildren(std::vector &children) const; - const std::string & GetLabel() const { + const Word & GetLabel() const { return m_value; } - // optionally identify label by int instead of string; - // allows abstraction if multiple nonterminal strings should map to same label. - const NTLabel & GetNTLabel() const { - return m_value_nt; - } - - void SetNTLabel(NTLabel value) { - m_value_nt = value; - } - size_t GetLength() const { return m_children.size(); } @@ -62,38 +50,22 @@ public: } bool IsTerminal() const { - return m_isTerminal; + return !m_value.IsNonTerminal(); } bool IsLeafNT() const { - return (!m_isTerminal && m_children.size() == 0); + return (m_value.IsNonTerminal() && m_children.size() == 0); } // different methods to search a tree (either just direct children (FlatSearch) or all children (RecursiveSearch)) for constituents. // can be used for formulating syntax constraints. 
// if found, 'it' is iterator to first tree node that matches search string - bool FlatSearch(const std::string & label, std::vector::const_iterator & it) const; - bool RecursiveSearch(const std::string & label, std::vector::const_iterator & it) const; + bool FlatSearch(const Word & label, std::vector::const_iterator & it) const; + bool RecursiveSearch(const Word & label, std::vector::const_iterator & it) const; // if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node - bool RecursiveSearch(const std::string & label, std::vector::const_iterator & it, InternalTree const* &parent) const; - - // use NTLabel for search to reduce number of string comparisons / deal with synonymous labels - // if found, 'it' is iterator to first tree node that matches search string - bool FlatSearch(const NTLabel & label, std::vector::const_iterator & it) const; - bool RecursiveSearch(const NTLabel & label, std::vector::const_iterator & it) const; - - // if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node - bool RecursiveSearch(const NTLabel & label, std::vector::const_iterator & it, InternalTree const* &parent) const; - - // pass vector of possible labels to search - // if found, 'it' is iterator to first tree node that matches search string - bool FlatSearch(const std::vector & labels, std::vector::const_iterator & it) const; - bool RecursiveSearch(const std::vector & labels, std::vector::const_iterator & it) const; - - // if found, 'it' is iterator to first tree node that matches search string, and 'parent' to its parent node - bool RecursiveSearch(const std::vector & labels, std::vector::const_iterator & it, InternalTree const* &parent) const; + bool RecursiveSearch(const Word & label, std::vector::const_iterator & it, InternalTree const* &parent) const; // Python-like generator that yields next nonterminal leaf on every call $generator(leafNT) { diff --git a/moses/FF/TreeStructureFeature.cpp b/moses/FF/TreeStructureFeature.cpp index fc1fcdc5b..108c99143 100644 --- a/moses/FF/TreeStructureFeature.cpp +++ b/moses/FF/TreeStructureFeature.cpp @@ -13,33 +13,12 @@ void TreeStructureFeature::Load() // syntactic constraints can be hooked in here. m_constraints = NULL; - m_labelset = NULL; StaticData &staticData = StaticData::InstanceNonConst(); staticData.SetTreeStructure(this); } -// define NT labels (ints) that are mapped from strings for quicker comparison. 
-void TreeStructureFeature::AddNTLabels(TreePointer root) const -{ - std::string label = root->GetLabel(); - - if (root->IsTerminal()) { - return; - } - - std::map::const_iterator it = m_labelset->string_to_label.find(label); - if (it != m_labelset->string_to_label.end()) { - root->SetNTLabel(it->second); - } - - std::vector children = root->GetChildren(); - for (std::vector::const_iterator it2 = children.begin(); it2 != children.end(); ++it2) { - AddNTLabels(*it2); - } -} - FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hypo , int featureID /* used to index the state in the previous hypotheses */ , ScoreComponentCollection* accumulator) const @@ -48,10 +27,6 @@ FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hy const std::string *tree = property->GetValueString(); TreePointer mytree (boost::make_shared(*tree)); - if (m_labelset) { - AddNTLabels(mytree); - } - //get subtrees (in target order) std::vector previous_trees; for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) { @@ -70,7 +45,7 @@ FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hy } mytree->Combine(previous_trees); - bool full_sentence = (mytree->GetChildren().back()->GetLabel() == "" || (mytree->GetChildren().back()->GetLabel() == "SEND" && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == "")); + bool full_sentence = (mytree->GetChildren().back()->GetLabel() == m_send || (mytree->GetChildren().back()->GetLabel() == m_send_nt && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == m_send)); if (m_binarized && full_sentence) { mytree->Unbinarize(); } diff --git a/moses/FF/TreeStructureFeature.h b/moses/FF/TreeStructureFeature.h index ecb2ce7cb..cef87e7ee 100644 --- a/moses/FF/TreeStructureFeature.h +++ b/moses/FF/TreeStructureFeature.h @@ -4,6 +4,7 @@ #include #include "StatefulFeatureFunction.h" #include "FFState.h" +#include "moses/Word.h" #include "InternalTree.h" namespace Moses @@ -35,11 +36,18 @@ class TreeStructureFeature : public StatefulFeatureFunction SyntaxConstraints* m_constraints; LabelSet* m_labelset; bool m_binarized; + Word m_send; + Word m_send_nt; + public: TreeStructureFeature(const std::string &line) :StatefulFeatureFunction(0, line) , m_binarized(false) { ReadParameters(); + std::vector factors; + factors.push_back(0); + m_send.CreateFromString(Output, factors, "", false); + m_send_nt.CreateFromString(Output, factors, "SEND", true); } ~TreeStructureFeature() { delete m_constraints; @@ -49,8 +57,6 @@ public: return new TreeState(TreePointer()); } - void AddNTLabels(TreePointer root) const; - bool IsUseable(const FactorMask &mask) const { return true; } diff --git a/moses/LM/RDLM.cpp b/moses/LM/RDLM.cpp index 1e9f2b4d3..33bdc9c55 100644 --- a/moses/LM/RDLM.cpp +++ b/moses/LM/RDLM.cpp @@ -70,7 +70,7 @@ void RDLM::Load() static_label_null[i] = lm_label_base_instance_->lookup_input_word(numstr); } - static_dummy_head = lm_head_base_instance_->lookup_input_word(dummy_head); + static_dummy_head = lm_head_base_instance_->lookup_input_word(dummy_head.GetString(0).as_string()); static_start_head = lm_head_base_instance_->lookup_input_word(""); static_start_label = lm_head_base_instance_->lookup_input_word(""); @@ -211,7 +211,7 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost } // ignore virtual nodes (in binarization; except if it's the root) - if (m_binarized && root->GetLabel()[0] == '^' && !ancestor_heads.empty()) { + if (m_binarized && 
root->GetLabel().GetString(0).as_string()[0] == '^' && !ancestor_heads.empty()) { // recursion if (root->IsLeafNT() && m_context_up > 1 && ancestor_heads.size()) { root = back_pointers.find(root)->second.get(); @@ -241,9 +241,9 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost // root of tree: score without context if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) { std::vector ngram_head_null (static_head_null); - ngram_head_null.back() = lm_head->lookup_output_word(root->GetChildren()[0]->GetLabel()); + ngram_head_null.back() = lm_head->lookup_output_word(root->GetChildren()[0]->GetLabel().GetString(m_factorType).as_string()); if (m_isPretermBackoff && ngram_head_null.back() == 0) { - ngram_head_null.back() = lm_head->lookup_output_word(root->GetLabel()); + ngram_head_null.back() = lm_head->lookup_output_word(root->GetLabel().GetString(m_factorType).as_string()); } if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head) { std::vector::iterator it = ngram_head_null.begin(); @@ -296,7 +296,7 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost } size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size()); - const std::string & head_label = root->GetLabel(); + const std::string & head_label = root->GetLabel().GetString(0).as_string(); bool virtual_head = false; int reached_end = 0; int label_idx, label_idx_out; @@ -527,7 +527,7 @@ bool RDLM::GetHead(InternalTree* root, const TreePointerMap & back_pointers, std tree = it->get(); } - if (m_binarized && tree->GetLabel()[0] == '^') { + if (m_binarized && tree->GetLabel().GetString(0).as_string()[0] == '^') { bool found = GetHead(tree, back_pointers, IDs); if (found) { return true; @@ -597,8 +597,8 @@ void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & bac child_ids = std::make_pair(static_dummy_head, static_dummy_head); } - labels[j] = lm_head->lookup_input_word(child->GetLabel()); - labels_output[j] = lm_label->lookup_output_word(child->GetLabel()); + labels[j] = lm_head->lookup_input_word(child->GetLabel().GetString(0).as_string()); + labels_output[j] = lm_label->lookup_output_word(child->GetLabel().GetString(0).as_string()); heads[j] = child_ids.first; heads_output[j] = child_ids.second; j++; @@ -613,18 +613,18 @@ void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & bac } -void RDLM::GetIDs(const std::string & head, const std::string & preterminal, std::pair & IDs) const +void RDLM::GetIDs(const Word & head, const Word & preterminal, std::pair & IDs) const { - IDs.first = lm_head_base_instance_->lookup_input_word(head); + IDs.first = lm_head_base_instance_->lookup_input_word(head.GetString(m_factorType).as_string()); if (m_isPretermBackoff && IDs.first == 0) { - IDs.first = lm_head_base_instance_->lookup_input_word(preterminal); + IDs.first = lm_head_base_instance_->lookup_input_word(preterminal.GetString(0).as_string()); } if (m_sharedVocab) { IDs.second = IDs.first; } else { - IDs.second = lm_head_base_instance_->lookup_output_word(head); + IDs.second = lm_head_base_instance_->lookup_output_word(head.GetString(m_factorType).as_string()); if (m_isPretermBackoff && IDs.second == 0) { - IDs.second = lm_head_base_instance_->lookup_output_word(preterminal); + IDs.second = lm_head_base_instance_->lookup_output_word(preterminal.GetString(0).as_string()); } } } @@ -718,7 +718,9 @@ void RDLM::SetParameter(const std::string& 
key, const std::string& value) else UTIL_THROW(util::Exception, "Unknown value for argument " << key << "=" << value); } else if (key == "glue_symbol") { - m_glueSymbol = value; + m_glueSymbolString = value; + } else if (key == "factor") { + m_factorType = Scan(value); } else if (key == "cache_size") { m_cacheSize = Scan(value); } else { diff --git a/moses/LM/RDLM.h b/moses/LM/RDLM.h index c5480b6c4..3d8c62f7e 100644 --- a/moses/LM/RDLM.h +++ b/moses/LM/RDLM.h @@ -3,6 +3,7 @@ #include "moses/FF/StatefulFeatureFunction.h" #include "moses/FF/FFState.h" #include "moses/FF/InternalTree.h" +#include "moses/Word.h" #include #include @@ -61,11 +62,12 @@ class RDLM : public StatefulFeatureFunction nplm::neuralTM* lm_label_base_instance_; mutable boost::thread_specific_ptr lm_label_backend_; - std::string dummy_head; - std::string m_glueSymbol; - std::string m_startSymbol; - std::string m_endSymbol; - std::string m_endTag; + std::string m_glueSymbolString; + Word dummy_head; + Word m_glueSymbol; + Word m_startSymbol; + Word m_endSymbol; + Word m_endTag; std::string m_path_head_lm; std::string m_path_label_lm; bool m_isPretermBackoff; @@ -102,14 +104,12 @@ class RDLM : public StatefulFeatureFunction int static_stop_label_output; int static_start_label_output; + FactorType m_factorType; + public: RDLM(const std::string &line) : StatefulFeatureFunction(2, line) - , dummy_head("") - , m_glueSymbol("Q") - , m_startSymbol("SSTART") - , m_endSymbol("SEND") - , m_endTag("") + , m_glueSymbolString("Q") , m_isPretermBackoff(true) , m_context_left(3) , m_context_right(0) @@ -120,8 +120,16 @@ public: , m_normalizeLabelLM(false) , m_sharedVocab(false) , m_binarized(0) - , m_cacheSize(1000000) { + , m_cacheSize(1000000) + , m_factorType(0) { ReadParameters(); + std::vector factors; + factors.push_back(0); + dummy_head.CreateFromString(Output, factors, "", false); + m_glueSymbol.CreateFromString(Output, factors, m_glueSymbolString, true); + m_startSymbol.CreateFromString(Output, factors, "SSTART", true); + m_endSymbol.CreateFromString(Output, factors, "SEND", true); + m_endTag.CreateFromString(Output, factors, "", false); } ~RDLM(); @@ -133,7 +141,7 @@ public: void Score(InternalTree* root, const TreePointerMap & back_pointers, boost::array &score, std::vector &ancestor_heads, std::vector &ancestor_labels, size_t &boundary_hash, int num_virtual = 0, int rescoring_levels = 0) const; bool GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair & IDs) const; void GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_labels, std::vector & heads, std::vector & labels, std::vector & heads_output, std::vector & labels_output) const; - void GetIDs(const std::string & head, const std::string & preterminal, std::pair & IDs) const; + void GetIDs(const Word & head, const Word & preterminal, std::pair & IDs) const; void ScoreFile(std::string &path); //for debugging void PrintInfo(std::vector &ngram, nplm::neuralTM* lm) const; //for debugging @@ -190,7 +198,7 @@ public: _end = current->GetChildren().end(); iter = current->GetChildren().begin(); // expand virtual node - while (binarized && !(*iter)->GetLabel().empty() && (*iter)->GetLabel()[0] == '^') { + while (binarized && !(*iter)->GetLabel().GetString(0).empty() && (*iter)->GetLabel().GetString(0).data()[0] == '^') { stack.push_back(std::make_pair(current, iter)); // also go through trees or previous hypotheses to rescore nodes for which more context has 
become available if ((*iter)->IsLeafNT()) { @@ -227,7 +235,7 @@ public: } } // expand virtual node - while (binarized && !(*iter)->GetLabel().empty() && (*iter)->GetLabel()[0] == '^') { + while (binarized && !(*iter)->GetLabel().GetString(0).empty() && (*iter)->GetLabel().GetString(0).data()[0] == '^') { stack.push_back(std::make_pair(current, iter)); // also go through trees or previous hypotheses to rescore nodes for which more context has become available if ((*iter)->IsLeafNT()) { From 1b1bafb1e8091b7141029083947269b4c2318322 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Mon, 20 Jul 2015 10:43:23 +0100 Subject: [PATCH 145/286] ems: add option to factorize after truecase/split/etc. --- scripts/ems/experiment.meta | 39 ++++++++++++++++++++++++++++++++----- scripts/ems/experiment.perl | 6 +++--- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index c2fc38260..c4e9d3e77 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -61,6 +61,7 @@ factorize rerun-on-change: TRAINING:input-factors TRAINING:output-factors default-name: corpus/factored pass-unless: TRAINING:input-factors + pass-if: factorize-after-split parallelizable: yes error: can't open error: incompatible number of words in factor @@ -112,6 +113,15 @@ post-split-clean-syntax pass-unless: input-splitter output-splitter template: $moses-script-dir/training/clean-corpus-n.perl IN $input-extension $output-extension OUT 1 10000 OUT.lines-retained --ignore-xml error: there is a blank factor +post-split-factorize + in: clean-split-stem + out: post-split-factorized-stem + rerun-on-change: TRAINING:input-factors TRAINING:output-factors + default-name: corpus/split-factored + pass-unless: AND TRAINING:input-factors factorize-after-split + parallelizable: yes + error: can't open + error: incompatible number of words in factor [RECASING] single tokenize @@ -205,6 +215,7 @@ factorize out: factorized-corpus default-name: lm/factored pass-unless: factors + pass-if: factorize-after-split ignore-if: concatenate-files concatenate-files-split parallelizable: yes error: can't open @@ -236,8 +247,17 @@ split pass-unless: output-splitter ignore-if: concatenate-files concatenate-files-split template: $output-splitter -model IN1.$output-extension < IN > OUT -strip +post-split-factorize in: split-corpus + out: split-factorized-corpus + default-name: lm/split-factored + pass-unless: AND factors factorize-after-split + ignore-if: concatenate-files concatenate-files-split + parallelizable: yes + error: can't open + error: incompatible number of words in factor +strip + in: split-factorized-corpus out: stripped-corpus default-name: lm/stripped pass-unless: mock-output-parser-lm @@ -276,7 +296,7 @@ train-custom template: $custom-training -text IN -lm OUT final-model: yes train-custom-syntax - in: split-corpus + in: split-factorized-corpus out: binlm default-name: lm/custom-lm rerun-on-change: custom-training @@ -335,6 +355,7 @@ factorize-tuning out: factorized-tuning default-name: lm/interpolate-tuning.factored pass-unless: TRAINING:output-factors + pass-if: factorize-after-split parallelizable: yes error: can't open error: incompatible number of words in factor @@ -359,8 +380,16 @@ split-tuning default-name: lm/interpolate-tuning.split pass-unless: output-splitter template: $output-splitter -model IN1.$output-extension < IN > OUT +post-split-factorize-tuning + in: split-tuning + out: post-split-factorized-tuning + default-name: 
lm/interpolate-tuning.split-factored + pass-unless: AND TRAINING:output-factors factorize-after-split + parallelizable: yes + error: can't open + error: incompatible number of words in factor strip-tuning - in: split-tuning + in: post-split-factorized-tuning out: stripped-tuning default-name: lm/interpolate-tuning.stripped pass-unless: mock-output-parser-lm @@ -488,12 +517,12 @@ train-in-mono template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings [TRAINING] single consolidate - in: CORPUS:clean-split-stem + in: CORPUS:post-split-factorized-stem out: corpus default-name: corpus template: $moses-script-dir/ems/support/consolidate-training-data.perl $input-extension $output-extension OUT IN build-domains - in: CORPUS:clean-split-stem + in: CORPUS:post-split-factorized-stem out: domains default-name: model/domains ignore-unless: domain-features mml-filter-corpora diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index e2a01d123..e625b37bc 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1119,13 +1119,13 @@ sub define_step { next if $RE_USE[$i]; next if defined($PASS{$i}); next if &define_template($i); - if ($DO_STEP[$i] =~ /^CORPUS:(.+):factorize$/) { + if ($DO_STEP[$i] =~ /^CORPUS:(.+):(post-split-)?factorize$/) { &define_corpus_factorize($i); } elsif ($DO_STEP[$i] eq 'SPLITTER:train') { &define_splitter_train($i); } - elsif ($DO_STEP[$i] =~ /^LM:(.+):factorize$/) { + elsif ($DO_STEP[$i] =~ /^LM:(.+):(post-split-)?factorize$/) { &define_lm_factorize($i,$1); } elsif ($DO_STEP[$i] =~ /^LM:(.+):randomize$/ || @@ -1188,7 +1188,7 @@ sub define_step { elsif ($DO_STEP[$i] eq 'TRAINING:create-config' || $DO_STEP[$i] eq 'TRAINING:create-config-interpolated-lm') { &define_training_create_config($i); } - elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:factorize-tuning') { + elsif ($DO_STEP[$i] =~ /^INTERPOLATED-LM:(post-split-)?factorize-tuning$/) { &define_interpolated_lm_factorize_tuning($i); } elsif ($DO_STEP[$i] eq 'INTERPOLATED-LM:interpolate') { From 7b19d83d439173a31de64cd954c9b1df69613ed0 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Mon, 20 Jul 2015 10:45:18 +0100 Subject: [PATCH 146/286] xml support for combine-factors.pl --- scripts/training/combine_factors.pl | 60 +++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/scripts/training/combine_factors.pl b/scripts/training/combine_factors.pl index fcc9ab3f5..e6a0a5000 100755 --- a/scripts/training/combine_factors.pl +++ b/scripts/training/combine_factors.pl @@ -37,9 +37,7 @@ while (defined $_) { $nr++; print STDERR "." if $nr % 10000 == 0; print STDERR "($nr)" if $nr % 100000 == 0; - chomp; - s/\s+/ /g; s/^ //; s/ $//; - my @intokens = split / /; + my ($intokens,$MARKUP) = split_xml($_); # load lines of corresponding streams and ensure equal number of words my @lines_of_extratoks; foreach my $factor (0..$#streams) { @@ -49,14 +47,17 @@ while (defined $_) { chomp($line); $line =~ s/\s+/ /g; $line =~ s/^ //; $line =~ s/ $//; my @toks = split / /, $line; - die "Incompatible number of words in factor $factor on line $nr. ($#toks != $#intokens)" - if $#toks != $#intokens; + die "Incompatible number of words in factor $factor on line $nr. 
($#toks != $#$intokens)" + if $#toks != $#$intokens; $lines_of_extratoks[$factor] = \@toks; } # for every token, print the factors in the order as user wished - for(my $i=0; $i<=$#intokens; $i++) { - my $token = $intokens[$i]; + for(my $i=0; $i<=$#$intokens; $i++) { + print " " if $i && $$MARKUP[$i] eq ''; + print $$MARKUP[$i]; + + my $token = $$intokens[$i]; my @outtoken = (); push @outtoken, $token; # add the first one # print STDERR "Token: $token\n"; @@ -69,11 +70,56 @@ while (defined $_) { print " " if $i != 0; print join("|", @outtoken); } + print $$MARKUP[$#$MARKUP]; print "\n"; $_ = readline($firststream); } close $firststream; print STDERR "Done.\n"; +# store away xml markup +sub split_xml { + my ($line) = @_; + my (@WORD,@MARKUP); + my $i = 0; + $MARKUP[0] = ""; + while($line =~ /\S/) { + # XML tag + if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { + my $potential_xml = $1; + my $line_next = $2; + # exception for factor that is an XML tag + if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) { + $WORD[$i-1] .= $potential_xml; + if ($line_next =~ /^(\|+)(.*)$/) { + $WORD[$i-1] .= $1; + $line_next = $2; + } + } + else { + $MARKUP[$i] .= $potential_xml." "; + } + $line = $line_next; + } + # non-XML text + elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) { + $WORD[$i++] = $1; + $MARKUP[$i] = ""; + $line = $2; + } + # '<' or '>' occurs in word, but it's not an XML tag + elsif ($line =~ /^\s*(\S+)(.*)$/) { + $WORD[$i++] = $1; + $MARKUP[$i] = ""; + $line = $2; + } + else { + die("ERROR: huh? $line\n"); + } + } + chop($MARKUP[$#MARKUP]); + return (\@WORD,\@MARKUP); +} + From ec3413f7b54e8a498ea7fe1a63e9b35fdc40a7f3 Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Mon, 20 Jul 2015 11:40:53 -0400 Subject: [PATCH 147/286] allow customized prefix/suffix to mark unknown words --- moses/Hypothesis.cpp | 4 +++- moses/Manager.cpp | 4 +++- moses/Parameter.cpp | 2 ++ moses/Phrase.cpp | 7 +++++-- moses/StaticData.cpp | 2 ++ moses/StaticData.h | 8 ++++++++ 6 files changed, 23 insertions(+), 4 deletions(-) diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp index cc51d5cee..65d5944ce 100644 --- a/moses/Hypothesis.cpp +++ b/moses/Hypothesis.cpp @@ -586,7 +586,9 @@ OutputSurface(std::ostream &out, const Hypothesis &edge, //preface surface form with UNK if marking unknowns const Word &word = phrase.GetWord(pos); if(markUnknown && word.IsOOV()) { - out << "UNK" << *factor; + out << StaticData::Instance().GetUnknownWordPrefix() + << *factor + << StaticData::Instance().GetUnknownWordSuffix(); } else { out << *factor; } diff --git a/moses/Manager.cpp b/moses/Manager.cpp index 7580b4e6e..ec4f57739 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -1737,7 +1737,9 @@ void Manager::OutputSurface(std::ostream &out, const Hypothesis &edge, const std //preface surface form with UNK if marking unknowns const Word &word = phrase.GetWord(pos); if(markUnknown && word.IsOOV()) { - out << "UNK" << *factor; + out << StaticData::Instance().GetUnknownWordPrefix() + << *factor + << StaticData::Instance().GetUnknownWordSuffix(); } else { out << *factor; } diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index 4eaf419c4..4a0941521 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -141,6 +141,8 @@ Parameter::Parameter() po::options_description oov_opts("OOV Handling Options"); AddParam(oov_opts,"drop-unknown", "du", "drop unknown words instead of copying them"); AddParam(oov_opts,"mark-unknown", "mu", "mark unknown words in output"); + AddParam(oov_opts,"unknown-word-prefix", "prefix to unknwon word 
when marked (default: 'UNK')"); + AddParam(oov_opts,"unknown-word-suffix", "suffix to unknown word when marked (default: '')"); AddParam(oov_opts,"lmodel-oov-feature", "add language model oov feature, one per model"); AddParam(oov_opts,"output-unknowns", "Output the unknown (OOV) words to the given file, one line per sentence"); AddParam(oov_opts,"always-create-direct-transopt", "Always create a translation that translates the source word ad-verbatim"); diff --git a/moses/Phrase.cpp b/moses/Phrase.cpp index fe69ce008..7a9e847ba 100644 --- a/moses/Phrase.cpp +++ b/moses/Phrase.cpp @@ -119,10 +119,13 @@ std::string Phrase::GetStringRep(const vector factorsToPrint) const stringstream strme; for (size_t pos = 0 ; pos < GetSize() ; pos++) { - if(markUnknown && GetWord(pos).IsOOV()) { - strme << "UNK"; + if (markUnknown && GetWord(pos).IsOOV()) { + strme << StaticData::Instance().GetUnknownWordPrefix(); } strme << GetWord(pos).GetString(factorsToPrint, (pos != GetSize()-1)); + if (markUnknown && GetWord(pos).IsOOV()) { + strme << StaticData::Instance().GetUnknownWordSuffix(); + } } return strme.str(); diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 8fb88c257..1293f5d44 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -438,6 +438,8 @@ StaticData // unknown word processing m_parameter->SetParameter(m_dropUnknown, "drop-unknown", false ); m_parameter->SetParameter(m_markUnknown, "mark-unknown", false ); + m_parameter->SetParameter(m_unknownWordPrefix, "unknown-word-prefix", "UNK" ); + m_parameter->SetParameter(m_unknownWordSuffix, "unknown-word-suffix", "" ); m_parameter->SetParameter(m_lmEnableOOVFeature, "lmodel-oov-feature", false); diff --git a/moses/StaticData.h b/moses/StaticData.h index a93e67003..8128d0b97 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -114,6 +114,8 @@ protected: // bool m_labeledNBestList,m_nBestIncludesSegmentation; bool m_dropUnknown; //! false = treat unknown words as unknowns, and translate them as themselves; true = drop (ignore) them bool m_markUnknown; //!
false = treat unknown words as unknowns, and translate them as themselves; true = mark and (ignore) them + std::string m_unknownWordPrefix; + std::string m_unknownWordSuffix; bool m_wordDeletionEnabled; bool m_disableDiscarding; @@ -326,6 +328,12 @@ public: inline bool GetMarkUnknown() const { return m_markUnknown; } + inline std::string GetUnknownWordPrefix() const { + return m_unknownWordPrefix; + } + inline std::string GetUnknownWordSuffix() const { + return m_unknownWordSuffix; + } inline bool GetDisableDiscarding() const { return m_disableDiscarding; } From 6d0f482361b0b879496c5f0497cf1b1ca2c9d4c4 Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Mon, 20 Jul 2015 11:41:48 -0400 Subject: [PATCH 148/286] extended phrase lookup: print sentences, document id --- biconcor/SuffixArray.cpp | 265 +++++++++++++++++++++++++++++-------- biconcor/SuffixArray.h | 19 +++ biconcor/phrase-lookup.cpp | 77 +++++------ 3 files changed, 271 insertions(+), 90 deletions(-) diff --git a/biconcor/SuffixArray.cpp b/biconcor/SuffixArray.cpp index 9566ede99..9466b0e0f 100644 --- a/biconcor/SuffixArray.cpp +++ b/biconcor/SuffixArray.cpp @@ -21,6 +21,11 @@ SuffixArray::SuffixArray() m_wordInSentence(NULL), m_sentence(NULL), m_sentenceLength(NULL), + m_document(NULL), + m_documentName(NULL), + m_documentNameLength(0), + m_documentCount(0), + m_useDocument(false), m_vcb(), m_size(0), m_sentenceCount(0) { } @@ -32,6 +37,8 @@ SuffixArray::~SuffixArray() free(m_wordInSentence); free(m_sentence); free(m_sentenceLength); + free(m_document); + free(m_documentName); } void SuffixArray::Create(const string& fileName ) @@ -46,22 +53,32 @@ void SuffixArray::Create(const string& fileName ) textFile.open(fileName.c_str()); if (!textFile) { - cerr << "no such file or directory " << fileName << endl; + cerr << "Error: no such file or directory " << fileName << endl; exit(1); } + // first pass through data: get size istream *fileP = &textFile; m_size = 0; m_sentenceCount = 0; + m_documentCount = 0; while(!fileP->eof()) { SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n'); if (fileP->eof()) break; + if (m_useDocument && ProcessDocumentLine(line,0)) continue; vector< WORD_ID > words = m_vcb.Tokenize( line ); m_size += words.size() + 1; m_sentenceCount++; } textFile.close(); cerr << m_size << " words (incl. sentence boundaries)" << endl; + if (m_useDocument) { + cerr << m_documentCount << " documents" << endl; + if (m_documentCount == 0) { + cerr << "Error: no documents found, aborting." 
<< endl; + exit(1); + } + } // allocate memory m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size ); @@ -69,21 +86,31 @@ void SuffixArray::Create(const string& fileName ) m_wordInSentence = (char*) calloc( sizeof( char ), m_size ); m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size ); m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount ); - - // fill the array - int wordIndex = 0; - int sentenceId = 0; - textFile.open(fileName.c_str()); - - if (!textFile) { - cerr << "no such file or directory " << fileName << endl; - exit(1); + CheckAllocation(m_array != NULL, "m_array"); + CheckAllocation(m_index != NULL, "m_index"); + CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence"); + CheckAllocation(m_sentence != NULL, "m_sentence"); + CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength"); + if (m_useDocument) { + m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount ); + m_documentName = (INDEX*) calloc( sizeof( char ), m_documentCount ); + m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength ); + CheckAllocation(m_document != NULL, "m_document"); + CheckAllocation(m_documentName != NULL, "m_documentName"); + CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer"); } + // second pass through data: fill the arrays + int wordIndex = 0; + int sentenceId = 0; + m_documentNameLength = 0; // re-use as counter + m_documentCount = 0; // re-use as counter + textFile.open(fileName.c_str()); fileP = &textFile; while(!fileP->eof()) { SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n'); if (fileP->eof()) break; + if (m_useDocument && ProcessDocumentLine(line,sentenceId)) continue; vector< WORD_ID > words = m_vcb.Tokenize( line ); vector< WORD_ID >::const_iterator i; @@ -105,7 +132,7 @@ void SuffixArray::Create(const string& fileName ) m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size ); if (m_buffer == NULL) { - cerr << "cannot allocate memory to m_buffer" << endl; + cerr << "Error: cannot allocate memory to m_buffer" << endl; exit(1); } @@ -114,6 +141,45 @@ void SuffixArray::Create(const string& fileName ) cerr << "done sorting" << endl; } +// very specific code to deal with common crawl document ids +bool SuffixArray::ProcessDocumentLine( const char *line, const size_t sentenceId ) +{ + size_t i; + // first 32 characters are hex-hash + for(i=0; i<32; i++) { + if ((line[i] < '0' || line[i] > '9') && (line[i] < 'a' || line[i] > 'f')) { + return false; + } + } + if (line[i++] != ' ') return false; + + // second token is float + for (; line[i] != ' ' && line[i] != 0; i++) { + if (line[i] != '.' 
&& (line[i] < '0' || line[i] > '9')) { + return false; + } + } + i++; + + // last token is url (=name) + size_t startName = i; + for (; line[i] != ' ' && line[i] != 0; i++) {} + if (line[i] == ' ') return false; + size_t endName = i+1; // include '\0' + + // second pass: record name and sentence number + if (m_document != NULL) { + m_documentName[m_documentCount] = m_documentNameLength; + for(size_t i=startName; i0) cout << " "; + cout << phrase[i]; + } + cout << '\t'; + INDEX start = 0; + INDEX end = m_size-1; + INDEX mid = FindFirst( phrase, start, end ); + if (mid == m_size) { // no matches + cout << "0 matches" << endl; + return; + } + + INDEX firstMatch = FindLast( phrase, mid, start, -1 ); + INDEX lastMatch = FindLast( phrase, mid, end, 1 ); + + // loop through all matches + cout << (lastMatch-firstMatch+1) << " matches" << endl; + for(INDEX i=firstMatch; i<=lastMatch;i++) { + // get sentence information + INDEX pos = GetPosition( i ); + INDEX start = pos - GetWordInSentence( pos ); + char length = GetSentenceLength( GetSentence( pos ) ); + // print document name + if (m_useDocument) { + INDEX sentence = GetSentence( pos ); + INDEX document = GetDocument( sentence ); + PrintDocumentName( document ); + cout << '\t'; + } + // print sentence + for(char i=0; i0) cout << " "; + cout << GetWord( start + i ); + } + cout << endl; + } +} + +SuffixArray::INDEX SuffixArray::GetDocument( INDEX sentence ) const +{ + // binary search + INDEX min = 0; + INDEX max = m_documentCount-1; + if (sentence >= m_document[max]) { + return max; + } + while(true) { + INDEX mid = (min + max) / 2; + if (sentence >= m_document[mid] && sentence < m_document[mid+1]) { + return mid; + } + if (sentence < m_document[mid]) { + max = mid-1; + } + else { + min = mid+1; + } + } +} + void SuffixArray::Save(const string& fileName ) const { FILE *pFile = fopen ( fileName.c_str() , "w" ); - if (pFile == NULL) { - cerr << "Cannot open " << fileName << endl; - exit(1); - } + if (pFile == NULL) Error("cannot open",fileName); fwrite( &m_size, sizeof(INDEX), 1, pFile ); fwrite( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus @@ -288,6 +414,16 @@ void SuffixArray::Save(const string& fileName ) const fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile ); fwrite( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length + + char useDocument = m_useDocument; // not sure if that is needed + fwrite( &useDocument, sizeof(char), 1, pFile ); + if (m_useDocument) { + fwrite( &m_documentCount, sizeof(INDEX), 1, pFile ); + fwrite( m_document, sizeof(INDEX), m_documentCount, pFile ); + fwrite( m_documentName, sizeof(INDEX), m_documentCount, pFile ); + fwrite( &m_documentNameLength, sizeof(INDEX), 1, pFile ); + fwrite( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile ); + } fclose( pFile ); m_vcb.Save( fileName + ".src-vcb" ); @@ -296,56 +432,81 @@ void SuffixArray::Save(const string& fileName ) const void SuffixArray::Load(const string& fileName ) { FILE *pFile = fopen ( fileName.c_str() , "r" ); - if (pFile == NULL) { - cerr << "no such file or directory " << fileName << endl; - exit(1); - } + if (pFile == NULL) Error("no such file or directory", fileName); cerr << "loading from " << fileName << endl; - fread( &m_size, sizeof(INDEX), 1, pFile ); + fread( &m_size, sizeof(INDEX), 1, pFile ) + || Error("could not read m_size from", fileName); cerr << "words in corpus: " << m_size << endl; + m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size ); m_index = (INDEX*) calloc( sizeof( INDEX ), m_size ); 
m_wordInSentence = (char*) calloc( sizeof( char ), m_size ); m_sentence = (INDEX*) calloc( sizeof( INDEX ), m_size ); + CheckAllocation(m_array != NULL, "m_array"); + CheckAllocation(m_index != NULL, "m_index"); + CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence"); + CheckAllocation(m_sentence != NULL, "m_sentence"); + fread( m_array, sizeof(WORD_ID), m_size, pFile ) // corpus + || Error("could not read m_array from", fileName); + fread( m_index, sizeof(INDEX), m_size, pFile ) // suffix array + || Error("could not read m_index from", fileName); + fread( m_wordInSentence, sizeof(char), m_size, pFile) // word index + || Error("could not read m_wordInSentence from", fileName); + fread( m_sentence, sizeof(INDEX), m_size, pFile ) // sentence index + || Error("could not read m_sentence from", fileName); - if (m_array == NULL) { - cerr << "Error: cannot allocate memory to m_array" << endl; - exit(1); - } - - if (m_index == NULL) { - cerr << "Error: cannot allocate memory to m_index" << endl; - exit(1); - } - - if (m_wordInSentence == NULL) { - cerr << "Error: cannot allocate memory to m_wordInSentence" << endl; - exit(1); - } - - if (m_sentence == NULL) { - cerr << "Error: cannot allocate memory to m_sentence" << endl; - exit(1); - } - - fread( m_array, sizeof(WORD_ID), m_size, pFile ); // corpus - fread( m_index, sizeof(INDEX), m_size, pFile ); // suffix array - fread( m_wordInSentence, sizeof(char), m_size, pFile); // word index - fread( m_sentence, sizeof(INDEX), m_size, pFile); // sentence index - - fread( &m_sentenceCount, sizeof(INDEX), 1, pFile ); + fread( &m_sentenceCount, sizeof(INDEX), 1, pFile ) + || Error("could not read m_sentenceCount from", fileName); cerr << "sentences in corpus: " << m_sentenceCount << endl; - m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount ); - if (m_sentenceLength == NULL) { - cerr << "Error: cannot allocate memory to m_sentenceLength" << endl; - exit(1); + m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount ); + CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength"); + fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile) // sentence length + || Error("could not read m_sentenceLength from", fileName); + + if (m_useDocument) { // do not read it when you do not need it + char useDocument; + fread( &useDocument, sizeof(char), 1, pFile ) + || Error("could not read m_useDocument from", fileName); + if (!useDocument) { + cerr << "Error: stored suffix array does not have a document index\n"; + exit(1); + } + fread( &m_documentCount, sizeof(INDEX), 1, pFile ) + || Error("could not read m_documentCount from", fileName); + m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount ); + m_documentName = (INDEX*) calloc( sizeof( INDEX ), m_documentCount ); + CheckAllocation(m_document != NULL, "m_document"); + CheckAllocation(m_documentName != NULL, "m_documentName"); + fread( m_document, sizeof(INDEX), m_documentCount, pFile ) + || Error("could not read m_document from", fileName); + fread( m_documentName, sizeof(INDEX), m_documentCount, pFile ) + || Error("could not read m_documentName from", fileName); + fread( &m_documentNameLength, sizeof(INDEX), 1, pFile ) + || Error("could not read m_documentNameLength from", fileName); + m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength ); + CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer"); + fread( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile ) + || Error("could not read m_document from", fileName); } 
- fread( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length fclose( pFile ); m_vcb.Load( fileName + ".src-vcb" ); } + +void SuffixArray::CheckAllocation( bool check, const char *dataStructure ) const +{ + if (check) return; + cerr << "Error: could not allocate memory for " << dataStructure << endl; + exit(1); +} + +bool SuffixArray::Error( const char *message, const string &fileName) const +{ + cerr << "Error: " << message << " " << fileName << endl; + exit(1); + return true; // yeah, i know. +} diff --git a/biconcor/SuffixArray.h b/biconcor/SuffixArray.h index af7f5567e..f20702e41 100644 --- a/biconcor/SuffixArray.h +++ b/biconcor/SuffixArray.h @@ -15,6 +15,12 @@ private: INDEX *m_sentence; char *m_sentenceLength; WORD_ID m_endOfSentence; + INDEX *m_document; + INDEX *m_documentName; + char *m_documentNameBuffer; + size_t m_documentNameLength; + size_t m_documentCount; + bool m_useDocument; Vocabulary m_vcb; INDEX m_size; INDEX m_sentenceCount; @@ -28,6 +34,7 @@ public: ~SuffixArray(); void Create(const std::string& fileName ); + bool ProcessDocumentLine( const char* const, const size_t ); void Sort(INDEX start, INDEX end); int CompareIndex( INDEX a, INDEX b ) const; inline int CompareWord( WORD_ID a, WORD_ID b ) const; @@ -40,6 +47,7 @@ public: INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction ); int Match( const std::vector< WORD > &phrase, INDEX index ); void List( INDEX start, INDEX end ); + void PrintSentenceMatches( const std::vector< WORD > &phrase ); inline INDEX GetPosition( INDEX index ) const { return m_index[ index ]; } @@ -58,6 +66,17 @@ public: inline WORD GetWord( INDEX position ) const { return m_vcb.GetWord( m_array[position] ); } + void UseDocument() { + m_useDocument = true; + } + INDEX GetDocument( INDEX sentence ) const; + void PrintDocumentName( INDEX document ) { + for(INDEX i=m_documentName[ document ]; m_documentNameBuffer[i] != 0; i++) { + std::cout << m_documentNameBuffer[ i ]; + } + } void Save(const std::string& fileName ) const; void Load(const std::string& fileName ); + void CheckAllocation(bool, const char *dataStructure) const; + bool Error( const char* message, const std::string& fileName) const; }; diff --git a/biconcor/phrase-lookup.cpp b/biconcor/phrase-lookup.cpp index 60ab8db66..0b940a4e9 100644 --- a/biconcor/phrase-lookup.cpp +++ b/biconcor/phrase-lookup.cpp @@ -1,4 +1,5 @@ #include "SuffixArray.h" +#include "../util/tokenize.hh" #include using namespace std; @@ -13,10 +14,12 @@ int main(int argc, char* argv[]) string query; string fileNameSuffix; string fileNameSource; - int loadFlag = false; - int saveFlag = false; - int createFlag = false; - int queryFlag = false; + bool loadFlag = false; + bool saveFlag = false; + bool createFlag = false; + bool queryFlag = false; + bool querySentenceFlag = false; + int stdioFlag = false; // receive requests from STDIN, respond to STDOUT string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n"; while(1) { @@ -25,11 +28,14 @@ int main(int argc, char* argv[]) {"save", required_argument, 0, 's'}, {"create", required_argument, 0, 'c'}, {"query", required_argument, 0, 'q'}, + {"query-sentence", required_argument, 0, 'Q'}, + {"document", required_argument, 0, 'd'}, {"stdio", no_argument, 0, 'i'}, + {"stdio-sentence", no_argument, 0, 'I'}, {0, 0, 0, 0} }; int option_index = 0; - int c = getopt_long (argc, argv, "l:s:c:q:i", long_options, &option_index); + int c = getopt_long 
(argc, argv, "l:s:c:q:Q:iId", long_options, &option_index); if (c == -1) break; switch (c) { case 'l': @@ -48,17 +54,25 @@ int main(int argc, char* argv[]) query = string(optarg); queryFlag = true; break; + case 'Q': + query = string(optarg); + querySentenceFlag = true; + break; case 'i': stdioFlag = true; break; + case 'I': + stdioFlag = true; + querySentenceFlag = true; + break; + case 'd': + suffixArray.UseDocument(); + break; default: cerr << info; exit(1); } } - if (stdioFlag) { - queryFlag = true; - } // check if parameter settings are legal if (saveFlag && !createFlag) { @@ -74,7 +88,7 @@ int main(int argc, char* argv[]) exit(1); } - // do your thing + // get suffix array if (createFlag) { cerr << "will create\n"; cerr << "corpus is in " << fileNameSource << endl; @@ -88,16 +102,29 @@ int main(int argc, char* argv[]) cerr << "will load from " << fileNameSuffix << endl; suffixArray.Load( fileNameSuffix ); } + + // do something with it if (stdioFlag) { while(true) { string query; if (getline(cin, query, '\n').eof()) { return 0; } - cout << lookup( query ) << endl; + if (querySentenceFlag) { + vector< string > queryString = util::tokenize( query.c_str() ); + suffixArray.PrintSentenceMatches( queryString ); + } + else { + cout << lookup( query ) << endl; + } } - } else if (queryFlag) { + } + else if (queryFlag) { cout << lookup( query ) << endl; + } + else if (querySentenceFlag) { + vector< string > queryString = util::tokenize( query.c_str() ); + suffixArray.PrintSentenceMatches( queryString ); } return 0; } @@ -105,32 +132,6 @@ int main(int argc, char* argv[]) size_t lookup( string query ) { cerr << "query is " << query << endl; - vector< string > queryString = tokenize( query.c_str() ); + vector< string > queryString = util::tokenize( query.c_str() ); return suffixArray.Count( queryString ); } - -// Duplicate of definition in util/tokenize.hh. -// TODO: Can we de-duplicate this? At the time of writing biconcor does not -// use util at all. 
-vector tokenize(const char input[]) -{ - vector< string > token; - bool betweenWords = true; - int start=0; - int i; - for(i = 0; input[i] != '\0'; i++) { - const bool isSpace = (input[i] == ' ' || input[i] == '\t'); - - if (!isSpace && betweenWords) { - start = i; - betweenWords = false; - } else if (isSpace && !betweenWords) { - token.push_back( string( input+start, i-start ) ); - betweenWords = true; - } - } - if (!betweenWords) - token.push_back( string( input+start, i-start ) ); - return token; -} - From fcf2934a2f38902a749a0e91382419af096f1879 Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Mon, 20 Jul 2015 11:43:02 -0400 Subject: [PATCH 149/286] customized phrase table pruning step --- scripts/ems/experiment.meta | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index c4e9d3e77..f876acc25 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -659,7 +659,7 @@ build-biconcor final-model: yes build-suffix-array in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus - out: phrase-translation-table + out: sigtest-filter-phrase-translation-table default-name: model/suffix-array ignore-unless: suffix-array error: usage @@ -731,11 +731,18 @@ build-ttable final-model: yes build-mmsapt in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus - out: phrase-translation-table + out: sigtest-filter-phrase-translation-table ignore-unless: mmsapt default-name: model/phrase-table-mmsapt template: $moses-script-dir/training/build-mmsapt.perl --alignment IN.$alignment-symmetrization-method --corpus IN1 --f $input-extension --e $output-extension --dir OUT --settings '$mmsapt' final-model: yes +custom-phrase-table-pruning + in: phrase-translation-table + out: sigtest-filter-phrase-translation-table + ignore-unless: custom-phrase-table-pruning + ignore-if: mmsapt + template: $custom-phrase-table-pruning IN OUT + default-name: model/phrase-table-pruned sigtest-filter-suffix-array in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus out: sigtest-filter-suffix-array @@ -757,7 +764,7 @@ sigtest-filter-ttable out: sigtest-filter-phrase-translation-table default-name: model/phrase-table-sigtest-filter pass-unless: sigtest-filter - ignore-if: TRAINING:config + ignore-if: TRAINING:config custom-phrase-table-pruning final-model: yes sigtest-filter-reordering in: reordering-table sigtest-filter-suffix-array From 496f8c6d850ff98c4e6a8e5fa06ebe59d5d0a15b Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Mon, 20 Jul 2015 11:44:22 -0400 Subject: [PATCH 150/286] only extract reordering phrase pairs if use mmsapt phrase table --- scripts/ems/experiment.perl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index e625b37bc..2198c22a4 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -2906,6 +2906,7 @@ sub get_training_setting { my $pcfg = &get("TRAINING:use-pcfg-feature"); my $baseline_alignment = &get("TRAINING:baseline-alignment-model"); my $no_glue_grammar = &get("TRAINING:no-glue-grammar"); + my $mmsapt = &get("TRAINING:mmsapt"); my $xml = $source_syntax || $target_syntax; @@ -2930,6 +2931,7 @@ sub get_training_setting { $cmd .= "-parallel " if $parallel; $cmd .= "-pcfg " if $pcfg; $cmd .= "-baseline-alignment-model $baseline_alignment " if defined($baseline_alignment) && ($step == 1 || $step == 2); + $cmd .= "-mmsapt " 
if defined($mmsapt); # factored training if (&backoff_and_get("TRAINING:input-factors")) { From b4b30cff7a34d59f18868f5bbda96912ed40c56d Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Mon, 20 Jul 2015 11:45:23 -0400 Subject: [PATCH 151/286] fix some compile time warnings about unsigned / signed int --- .../fuzzy-match/FuzzyMatchWrapper.cpp | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp index 8cc2e3f57..89287ca91 100644 --- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp +++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp @@ -114,14 +114,14 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co // find match ranges in suffix array vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range; - for(size_t start=0; startGetSize()-1; vector< string > substring; bool stillMatched = true; vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart; //cerr << "start: " << start; - for(int word=start; stillMatched && wordGetPosition( i ); + size_t position = suffixArray->GetPosition( i ); // sentence length mismatch size_t sentence_id = suffixArray->GetSentence( position ); @@ -261,7 +261,7 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co // quick look: how many words are matched int words_matched = 0; - for(int m=0; m pruned = prune_matches( match, best_cost ); words_matched = 0; - for(int p=0; p best_cost) { @@ -323,7 +323,7 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co // do not try to find the best ... report multiple matches if (multiple_flag) { - for(int si=0; si position_vector; wordIndex[ input[i] ] = position_vector; @@ -799,7 +799,7 @@ void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translation input_word_hit = wordIndex.find( tm[t_pos] ); if (input_word_hit != wordIndex.end()) { vector< int > &position_vector = input_word_hit->second; - for(int j=0; j &match, int input_length, return input_length+tm_length; int this_best_cost = input_length + tm_length; - for(int i=0; i &match, int input_length, vector< Match > &first_match = multi_match[ first_level ]; vector< Match > &second_match = multi_match[ second_level ]; - for(int i1 = 0; i1 < first_match.size(); i1++) { - for(int i2 = 0; i2 < second_match.size(); i2++) { + for(size_t i1 = 0; i1 < first_match.size(); i1++) { + for(size_t i2 = 0; i2 < second_match.size(); i2++) { // do not combine the same pair twice if (first_level == second_level && i2 <= i1) { From 93238fa319adfaa0c429cb43e9fdb64ba62eaaec Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Mon, 20 Jul 2015 11:46:07 -0400 Subject: [PATCH 152/286] directly write gzipped file --- phrase-extract/lexical-reordering/Jamfile | 2 +- .../lexical-reordering/reordering_classes.cpp | 45 +++++-------------- .../lexical-reordering/reordering_classes.h | 4 +- phrase-extract/lexical-reordering/score.cpp | 5 +-- 4 files changed, 15 insertions(+), 41 deletions(-) diff --git a/phrase-extract/lexical-reordering/Jamfile b/phrase-extract/lexical-reordering/Jamfile index b010e44a1..dc988675d 100644 --- a/phrase-extract/lexical-reordering/Jamfile +++ b/phrase-extract/lexical-reordering/Jamfile @@ -1,2 +1,2 @@ -exe lexical-reordering-score : InputFileStream.cpp reordering_classes.cpp score.cpp ../../util//kenutil ../..//z ; +exe lexical-reordering-score : 
InputFileStream.cpp reordering_classes.cpp score.cpp ../OutputFileStream.cpp ../..//boost_iostreams ../..//boost_filesystem ../../util//kenutil ../..//z ; diff --git a/phrase-extract/lexical-reordering/reordering_classes.cpp b/phrase-extract/lexical-reordering/reordering_classes.cpp index b9490e447..304616e5c 100644 --- a/phrase-extract/lexical-reordering/reordering_classes.cpp +++ b/phrase-extract/lexical-reordering/reordering_classes.cpp @@ -277,7 +277,7 @@ void Model::score_fe(const string& f, const string& e) { if (!fe) //Make sure we do not do anything if it is not a fe model return; - fprintf(file,"%s ||| %s ||| ",f.c_str(),e.c_str()); + outputFile << f << " ||| " << e << " |||"; //condition on the previous phrase if (previous) { vector scores; @@ -288,9 +288,8 @@ void Model::score_fe(const string& f, const string& e) sum += scores[i]; } for(size_t i=0; i scores; @@ -323,9 +322,8 @@ void Model::score_f(const string& f) sum += scores[i]; } for(size_t i=0; i 0) { - gzwrite(gzfile, inbuffer, num_read); - } - fclose(file); - gzclose(gzfile); - - //Remove the unzipped file - remove(filename.c_str()); -} - void Model::split_config(const string& config, string& dir, string& lang, string& orient) { istringstream is(config); diff --git a/phrase-extract/lexical-reordering/reordering_classes.h b/phrase-extract/lexical-reordering/reordering_classes.h index 67e7daead..1e0f596c9 100644 --- a/phrase-extract/lexical-reordering/reordering_classes.h +++ b/phrase-extract/lexical-reordering/reordering_classes.h @@ -13,7 +13,7 @@ #include #include "util/string_piece.hh" - +#include "../OutputFileStream.h" enum ORIENTATION {MONO, SWAP, DRIGHT, DLEFT, OTHER, NOMONO}; @@ -122,8 +122,8 @@ private: ModelScore* modelscore; Scorer* scorer; - std::FILE* file; std::string filename; + Moses::OutputFileStream outputFile; bool fe; bool previous; diff --git a/phrase-extract/lexical-reordering/score.cpp b/phrase-extract/lexical-reordering/score.cpp index d404822b8..6b234b4c9 100644 --- a/phrase-extract/lexical-reordering/score.cpp +++ b/phrase-extract/lexical-reordering/score.cpp @@ -205,11 +205,10 @@ int main(int argc, char* argv[]) models[i]->score_f(f_current); } - //Zip all files + // delete model objects (and close files) for (size_t i=0; izipFile(); + delete models[i]; } - return 0; } From 777a88673d8a2e1d79e524ecfe3204461b6d9364 Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Mon, 20 Jul 2015 11:46:47 -0400 Subject: [PATCH 153/286] compress sort tmp files by default --- scripts/training/filter-model-given-input.pl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index a44d9c193..fc13ded31 100755 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -37,6 +37,12 @@ my $MAX_LENGTH = 10; # utilities my $ZCAT = "gzip -cd"; +# sometimes you just have to do the right thing without asking +my $sort_option = ""; +if (`echo 'youcandoit' | sort --compress-program gzip 2>/dev/null` =~ /youcandoit/) { + $sort_option = "--compress-program gzip "; +} + # get optional parameters my $opt_hierarchical = 0; my $binarizer = undef; @@ -410,13 +416,13 @@ for(my $i=0;$i<=$#TABLE;$i++) { # ... 
phrase translation model elsif ($binarizer =~ /processPhraseTableMin/) { #compact phrase table - my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $binarizer -in $mid_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted"; + my $cmd = "$catcmd $mid_file | LC_ALL=C sort $sort_option -T $tempdir | gzip - > $mid_file.sorted.gz && $binarizer -in $mid_file.sorted.gz -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted.gz"; safesystem($cmd) or die "Can't binarize"; } elsif ($binarizer =~ /CreateOnDiskPt/) { my $cmd = "$binarizer $mid_file $new_file.bin"; safesystem($cmd) or die "Can't binarize"; } else { - my $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file"; + my $cmd = "$catcmd $mid_file | LC_ALL=C sort $sort_option -T $tempdir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file"; safesystem($cmd) or die "Can't binarize"; } } @@ -431,7 +437,7 @@ for(my $i=0;$i<=$#TABLE;$i++) { $lexbin =~ s/PhraseTable/LexicalTable/; my $cmd; if ($lexbin =~ /processLexicalTableMin/) { - $cmd = "$catcmd $mid_file | LC_ALL=C sort -T $tempdir > $mid_file.sorted && $lexbin -in $mid_file.sorted -out $new_file -threads $threads && rm $mid_file.sorted"; + $cmd = "$catcmd $mid_file | LC_ALL=C sort $sort_option -T $tempdir | gzip - > $mid_file.sorted.gz && $lexbin -in $mid_file.sorted.gz -out $new_file -threads $threads && rm $mid_file.sorted.gz"; } else { $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options $cmd = "$lexbin -in $mid_file -out $new_file"; From 1a795f549e01751fa23a9caa04b04faa871c9de6 Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Mon, 20 Jul 2015 11:47:24 -0400 Subject: [PATCH 154/286] only extract reordering phrase pairs if use mmsapt phrase table --- scripts/training/train-model.perl | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 112e9d286..56cbf016c 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -1604,6 +1604,7 @@ sub extract_phrase { $cmd .= " --InstanceWeights $_INSTANCE_WEIGHTS_FILE " if defined $_INSTANCE_WEIGHTS_FILE; $cmd .= " --BaselineExtract $_BASELINE_EXTRACT" if defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT =~ /extract-parallel.perl/; $cmd .= " --FlexibilityScore" if $_FLEXIBILITY_SCORE; + $cmd .= " --NoTTable" if $_MMSAPT; map { die "File not found: $_" if ! 
-e $_ } ($alignment_file_e, $alignment_file_f, $alignment_file_a); print STDERR "$cmd\n"; @@ -1611,12 +1612,16 @@ sub extract_phrase { if (defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT !~ /extract-parallel.perl/) { print STDERR "merging with baseline extract from $_BASELINE_EXTRACT\n"; - safesystem("$ZCAT $_BASELINE_EXTRACT.gz $extract_file$suffix.gz | $GZIP_EXEC > $extract_file.gz"); - safesystem("$ZCAT $_BASELINE_EXTRACT.inv.gz $extract_file$suffix.inv.gz | $GZIP_EXEC > $extract_file.inv.gz"); + safesystem("$ZCAT $_BASELINE_EXTRACT.gz $extract_file$suffix.gz | $GZIP_EXEC > $extract_file.gz") + if -e "$extract_file$suffix.gz"; + safesystem("$ZCAT $_BASELINE_EXTRACT.inv.gz $extract_file$suffix.inv.gz | $GZIP_EXEC > $extract_file.inv.gz") + if -e "$extract_file$suffix.inv.gz"; safesystem("$ZCAT $_BASELINE_EXTRACT.o.gz $extract_file$suffix.o.gz | $GZIP_EXEC > $extract_file.o.gz") if -e "$extract_file$suffix.o.gz"; - safesystem("rm $extract_file$suffix.gz"); - safesystem("rm $extract_file$suffix.inv.gz"); + safesystem("rm $extract_file$suffix.gz") + if -e "$extract_file$suffix.gz"; + safesystem("rm $extract_file$suffix.inv.gz") + if -e "$extract_file$suffix.inv.gz"; safesystem("rm $extract_file$suffix.o.gz") if -e "$extract_file$suffix.o.gz"; } From c3424ce5411dd3d72ca5d18a611c520d17d0725e Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Tue, 21 Jul 2015 00:00:42 +0100 Subject: [PATCH 155/286] daily automatic beautifier --- biconcor/SuffixArray.cpp | 63 ++++++++------- biconcor/phrase-lookup.cpp | 11 +-- mert/InternalTree.cpp | 153 ++++++++++++++++++------------------- mert/InternalTree.h | 90 +++++++++++----------- 4 files changed, 155 insertions(+), 162 deletions(-) diff --git a/biconcor/SuffixArray.cpp b/biconcor/SuffixArray.cpp index 9466b0e0f..f98e40ed3 100644 --- a/biconcor/SuffixArray.cpp +++ b/biconcor/SuffixArray.cpp @@ -142,7 +142,7 @@ void SuffixArray::Create(const string& fileName ) } // very specific code to deal with common crawl document ids -bool SuffixArray::ProcessDocumentLine( const char *line, const size_t sentenceId ) +bool SuffixArray::ProcessDocumentLine( const char *line, const size_t sentenceId ) { size_t i; // first 32 characters are hex-hash @@ -158,7 +158,7 @@ bool SuffixArray::ProcessDocumentLine( const char *line, const size_t sentenceId if (line[i] != '.' 
&& (line[i] < '0' || line[i] > '9')) { return false; } - } + } i++; // last token is url (=name) @@ -337,7 +337,7 @@ void SuffixArray::List(INDEX start, INDEX end) } } -void SuffixArray::PrintSentenceMatches( const std::vector< WORD > &phrase ) +void SuffixArray::PrintSentenceMatches( const std::vector< WORD > &phrase ) { cout << "QUERY\t"; for(size_t i=0; i &phrase ) // loop through all matches cout << (lastMatch-firstMatch+1) << " matches" << endl; - for(INDEX i=firstMatch; i<=lastMatch;i++) { + for(INDEX i=firstMatch; i<=lastMatch; i++) { // get sentence information INDEX pos = GetPosition( i ); INDEX start = pos - GetWordInSentence( pos ); @@ -394,8 +394,7 @@ SuffixArray::INDEX SuffixArray::GetDocument( INDEX sentence ) const } if (sentence < m_document[mid]) { max = mid-1; - } - else { + } else { min = mid+1; } } @@ -416,13 +415,13 @@ void SuffixArray::Save(const string& fileName ) const fwrite( m_sentenceLength, sizeof(char), m_sentenceCount, pFile); // sentence length char useDocument = m_useDocument; // not sure if that is needed - fwrite( &useDocument, sizeof(char), 1, pFile ); + fwrite( &useDocument, sizeof(char), 1, pFile ); if (m_useDocument) { - fwrite( &m_documentCount, sizeof(INDEX), 1, pFile ); - fwrite( m_document, sizeof(INDEX), m_documentCount, pFile ); - fwrite( m_documentName, sizeof(INDEX), m_documentCount, pFile ); - fwrite( &m_documentNameLength, sizeof(INDEX), 1, pFile ); - fwrite( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile ); + fwrite( &m_documentCount, sizeof(INDEX), 1, pFile ); + fwrite( m_document, sizeof(INDEX), m_documentCount, pFile ); + fwrite( m_documentName, sizeof(INDEX), m_documentCount, pFile ); + fwrite( &m_documentNameLength, sizeof(INDEX), 1, pFile ); + fwrite( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile ); } fclose( pFile ); @@ -436,8 +435,8 @@ void SuffixArray::Load(const string& fileName ) cerr << "loading from " << fileName << endl; - fread( &m_size, sizeof(INDEX), 1, pFile ) - || Error("could not read m_size from", fileName); + fread( &m_size, sizeof(INDEX), 1, pFile ) + || Error("could not read m_size from", fileName); cerr << "words in corpus: " << m_size << endl; m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size ); @@ -449,47 +448,47 @@ void SuffixArray::Load(const string& fileName ) CheckAllocation(m_wordInSentence != NULL, "m_wordInSentence"); CheckAllocation(m_sentence != NULL, "m_sentence"); fread( m_array, sizeof(WORD_ID), m_size, pFile ) // corpus - || Error("could not read m_array from", fileName); + || Error("could not read m_array from", fileName); fread( m_index, sizeof(INDEX), m_size, pFile ) // suffix array - || Error("could not read m_index from", fileName); + || Error("could not read m_index from", fileName); fread( m_wordInSentence, sizeof(char), m_size, pFile) // word index - || Error("could not read m_wordInSentence from", fileName); + || Error("could not read m_wordInSentence from", fileName); fread( m_sentence, sizeof(INDEX), m_size, pFile ) // sentence index - || Error("could not read m_sentence from", fileName); + || Error("could not read m_sentence from", fileName); fread( &m_sentenceCount, sizeof(INDEX), 1, pFile ) - || Error("could not read m_sentenceCount from", fileName); + || Error("could not read m_sentenceCount from", fileName); cerr << "sentences in corpus: " << m_sentenceCount << endl; m_sentenceLength = (char*) calloc( sizeof( char ), m_sentenceCount ); CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength"); fread( m_sentenceLength, sizeof(char), 
m_sentenceCount, pFile) // sentence length - || Error("could not read m_sentenceLength from", fileName); + || Error("could not read m_sentenceLength from", fileName); if (m_useDocument) { // do not read it when you do not need it char useDocument; fread( &useDocument, sizeof(char), 1, pFile ) - || Error("could not read m_useDocument from", fileName); + || Error("could not read m_useDocument from", fileName); if (!useDocument) { cerr << "Error: stored suffix array does not have a document index\n"; exit(1); } - fread( &m_documentCount, sizeof(INDEX), 1, pFile ) - || Error("could not read m_documentCount from", fileName); + fread( &m_documentCount, sizeof(INDEX), 1, pFile ) + || Error("could not read m_documentCount from", fileName); m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount ); m_documentName = (INDEX*) calloc( sizeof( INDEX ), m_documentCount ); CheckAllocation(m_document != NULL, "m_document"); CheckAllocation(m_documentName != NULL, "m_documentName"); - fread( m_document, sizeof(INDEX), m_documentCount, pFile ) - || Error("could not read m_document from", fileName); - fread( m_documentName, sizeof(INDEX), m_documentCount, pFile ) - || Error("could not read m_documentName from", fileName); + fread( m_document, sizeof(INDEX), m_documentCount, pFile ) + || Error("could not read m_document from", fileName); + fread( m_documentName, sizeof(INDEX), m_documentCount, pFile ) + || Error("could not read m_documentName from", fileName); fread( &m_documentNameLength, sizeof(INDEX), 1, pFile ) - || Error("could not read m_documentNameLength from", fileName); + || Error("could not read m_documentNameLength from", fileName); m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength ); CheckAllocation(m_documentNameBuffer != NULL, "m_documentNameBuffer"); fread( m_documentNameBuffer, sizeof(char), m_documentNameLength, pFile ) - || Error("could not read m_document from", fileName); + || Error("could not read m_document from", fileName); } fclose( pFile ); @@ -497,16 +496,16 @@ void SuffixArray::Load(const string& fileName ) m_vcb.Load( fileName + ".src-vcb" ); } -void SuffixArray::CheckAllocation( bool check, const char *dataStructure ) const +void SuffixArray::CheckAllocation( bool check, const char *dataStructure ) const { if (check) return; cerr << "Error: could not allocate memory for " << dataStructure << endl; exit(1); } -bool SuffixArray::Error( const char *message, const string &fileName) const +bool SuffixArray::Error( const char *message, const string &fileName) const { cerr << "Error: " << message << " " << fileName << endl; exit(1); - return true; // yeah, i know. + return true; // yeah, i know. 
} diff --git a/biconcor/phrase-lookup.cpp b/biconcor/phrase-lookup.cpp index 0b940a4e9..84b17c095 100644 --- a/biconcor/phrase-lookup.cpp +++ b/biconcor/phrase-lookup.cpp @@ -19,7 +19,7 @@ int main(int argc, char* argv[]) bool createFlag = false; bool queryFlag = false; bool querySentenceFlag = false; - + int stdioFlag = false; // receive requests from STDIN, respond to STDOUT string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create corpus]\n\t[--query string]\n\t[--stdio]\n"; while(1) { @@ -113,16 +113,13 @@ int main(int argc, char* argv[]) if (querySentenceFlag) { vector< string > queryString = util::tokenize( query.c_str() ); suffixArray.PrintSentenceMatches( queryString ); - } - else { + } else { cout << lookup( query ) << endl; } } - } - else if (queryFlag) { + } else if (queryFlag) { cout << lookup( query ) << endl; - } - else if (querySentenceFlag) { + } else if (querySentenceFlag) { vector< string > queryString = util::tokenize( query.c_str() ); suffixArray.PrintSentenceMatches( queryString ); } diff --git a/mert/InternalTree.cpp b/mert/InternalTree.cpp index d82fbcc72..16f47bd9a 100644 --- a/mert/InternalTree.cpp +++ b/mert/InternalTree.cpp @@ -4,109 +4,106 @@ namespace MosesTuning { InternalTree::InternalTree(const std::string & line, const bool terminal): - m_isTerminal(terminal) - { + m_isTerminal(terminal) +{ - size_t found = line.find_first_of("[] "); + size_t found = line.find_first_of("[] "); - if (found == line.npos) { - m_value = line; - } + if (found == line.npos) { + m_value = line; + } - else { - AddSubTree(line, 0); - } + else { + AddSubTree(line, 0); + } } -size_t InternalTree::AddSubTree(const std::string & line, size_t pos) { +size_t InternalTree::AddSubTree(const std::string & line, size_t pos) +{ - std::string value; - char token = 0; + std::string value; + char token = 0; - while (token != ']' && pos != std::string::npos) - { - size_t oldpos = pos; - pos = line.find_first_of("[] ", pos); - if (pos == std::string::npos) break; - token = line[pos]; - value = line.substr(oldpos,pos-oldpos); + while (token != ']' && pos != std::string::npos) { + size_t oldpos = pos; + pos = line.find_first_of("[] ", pos); + if (pos == std::string::npos) break; + token = line[pos]; + value = line.substr(oldpos,pos-oldpos); - if (token == '[') { - if (m_value.size() > 0) { - m_children.push_back(boost::make_shared(value,false)); - pos = m_children.back()->AddSubTree(line, pos+1); - } - else { - if (value.size() > 0) { - m_value = value; - } - pos = AddSubTree(line, pos+1); - } - } - else if (token == ' ' || token == ']') { - if (value.size() > 0 && !(m_value.size() > 0)) { - m_value = value; - } - else if (value.size() > 0) { - m_isTerminal = false; - m_children.push_back(boost::make_shared(value,true)); - } - if (token == ' ') { - pos++; - } - } - - if (m_children.size() > 0) { - m_isTerminal = false; + if (token == '[') { + if (m_value.size() > 0) { + m_children.push_back(boost::make_shared(value,false)); + pos = m_children.back()->AddSubTree(line, pos+1); + } else { + if (value.size() > 0) { + m_value = value; } + pos = AddSubTree(line, pos+1); + } + } else if (token == ' ' || token == ']') { + if (value.size() > 0 && !(m_value.size() > 0)) { + m_value = value; + } else if (value.size() > 0) { + m_isTerminal = false; + m_children.push_back(boost::make_shared(value,true)); + } + if (token == ' ') { + pos++; + } } - if (pos == std::string::npos) { - return line.size(); + if (m_children.size() > 0) { + m_isTerminal = false; } - return 
std::min(line.size(),pos+1); + } + + if (pos == std::string::npos) { + return line.size(); + } + return std::min(line.size(),pos+1); } -std::string InternalTree::GetString(bool start) const { +std::string InternalTree::GetString(bool start) const +{ - std::string ret = ""; - if (!start) { - ret += " "; - } + std::string ret = ""; + if (!start) { + ret += " "; + } - if (!m_isTerminal) { - ret += "["; - } + if (!m_isTerminal) { + ret += "["; + } - ret += m_value; - for (std::vector::const_iterator it = m_children.begin(); it != m_children.end(); ++it) - { - ret += (*it)->GetString(false); - } + ret += m_value; + for (std::vector::const_iterator it = m_children.begin(); it != m_children.end(); ++it) { + ret += (*it)->GetString(false); + } - if (!m_isTerminal) { - ret += "]"; - } - return ret; + if (!m_isTerminal) { + ret += "]"; + } + return ret; } -void InternalTree::Combine(const std::vector &previous) { +void InternalTree::Combine(const std::vector &previous) +{ - std::vector::iterator it; - bool found = false; - leafNT next_leafNT(this); - for (std::vector::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) { - found = next_leafNT(it); - if (found) { - *it = *it_prev; - } - else { - std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n"; - } + std::vector::iterator it; + bool found = false; + leafNT next_leafNT(this); + for (std::vector::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) { + found = next_leafNT(it); + if (found) { + *it = *it_prev; + } else { + std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n"; } + } } diff --git a/mert/InternalTree.h b/mert/InternalTree.h index f8416101c..475bf9970 100644 --- a/mert/InternalTree.h +++ b/mert/InternalTree.h @@ -18,60 +18,60 @@ typedef int NTLabel; class InternalTree { -std::string m_value; -std::vector m_children; -bool m_isTerminal; + std::string m_value; + std::vector m_children; + bool m_isTerminal; public: - InternalTree(const std::string & line, const bool terminal = false); - InternalTree(const InternalTree & tree): - m_value(tree.m_value), - m_isTerminal(tree.m_isTerminal) { - const std::vector & children = tree.m_children; - for (std::vector::const_iterator it = children.begin(); it != children.end(); it++) { - m_children.push_back(boost::make_shared(**it)); - } - } - size_t AddSubTree(const std::string & line, size_t start); + InternalTree(const std::string & line, const bool terminal = false); + InternalTree(const InternalTree & tree): + m_value(tree.m_value), + m_isTerminal(tree.m_isTerminal) { + const std::vector & children = tree.m_children; + for (std::vector::const_iterator it = children.begin(); it != children.end(); it++) { + m_children.push_back(boost::make_shared(**it)); + } + } + size_t AddSubTree(const std::string & line, size_t start); - std::string GetString(bool start = true) const; - void Combine(const std::vector &previous); - const std::string & GetLabel() const { - return m_value; - } + std::string GetString(bool start = true) const; + void Combine(const std::vector &previous); + const std::string & GetLabel() const { + return m_value; + } - size_t GetLength() const { - return m_children.size(); - } - std::vector & GetChildren() { - return m_children; - } + size_t GetLength() const { + return m_children.size(); + } + std::vector & GetChildren() { + return m_children; + } - bool IsTerminal() const { - return m_isTerminal; - } + bool IsTerminal() const { + return m_isTerminal; + } - bool 
IsLeafNT() const { - return (!m_isTerminal && m_children.size() == 0); - } + bool IsLeafNT() const { + return (!m_isTerminal && m_children.size() == 0); + } }; // Python-like generator that yields next nonterminal leaf on every call -$generator(leafNT) { - std::vector::iterator it; - InternalTree* tree; - leafNT(InternalTree* root = 0): tree(root) {} - $emit(std::vector::iterator) - for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) { - if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) { - $yield(it); - } - else if ((*it)->GetLength() > 0) { - if ((*it).get()) { // normal pointer to same object that TreePointer points to - $restart(tree = (*it).get()); - } - } +$generator(leafNT) +{ + std::vector::iterator it; + InternalTree* tree; + leafNT(InternalTree* root = 0): tree(root) {} + $emit(std::vector::iterator) + for (it = tree->GetChildren().begin(); it !=tree->GetChildren().end(); ++it) { + if (!(*it)->IsTerminal() && (*it)->GetLength() == 0) { + $yield(it); + } else if ((*it)->GetLength() > 0) { + if ((*it).get()) { // normal pointer to same object that TreePointer points to + $restart(tree = (*it).get()); + } } - $stop; + } + $stop; }; } \ No newline at end of file From c989b8f34aec4e2ba07fd053cb70d944341ba6c1 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 21 Jul 2015 16:46:32 +0400 Subject: [PATCH 156/286] apply Pidong Wang's patch for server to new server code in moses/ --- contrib/server/Jamfile | 2 +- moses/ExportInterface.cpp | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/contrib/server/Jamfile b/contrib/server/Jamfile index 048e540b0..d6f9cdc13 100644 --- a/contrib/server/Jamfile +++ b/contrib/server/Jamfile @@ -13,7 +13,7 @@ with-xmlrpc-c = [ option.get "with-xmlrpc-c" ] ; if $(with-xmlrpc-c) { echo While building mosesserver ... ; echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ; - echo "!!! You are linking the XMLRPC-C library; Do NOT use v.1.25.29 !!!" ; + echo "!!! You are linking the XMLRPC-C library; Must be v.1.32 (September 2012) or higher !!!" ; echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" 
;
   build-moses-server = true ;

diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp
index 54121609b..f457d4879 100644
--- a/moses/ExportInterface.cpp
+++ b/moses/ExportInterface.cpp
@@ -167,8 +167,15 @@ run_as_server()
   myRegistry.addMethod("updater", updater);
   myRegistry.addMethod("optimize", optimizer);
 
-  xmlrpc_c::serverAbyss myAbyssServer(myRegistry, port, logfile);
-
+  xmlrpc_c::serverAbyss myAbyssServer(
+    xmlrpc_c::serverAbyss::constrOpt()
+    .registryP(&myRegistry)
+    .portNumber(port) // TCP port on which to listen
+    .logFileName(logfile)
+    .allowOrigin("*")
+    .maxConn((unsigned int)num_threads)
+  );
+
   XVERBOSE(1,"Listening on port " << port << endl);
   if (isSerial) {
     while(1) myAbyssServer.runOnce();

From 1b654743c3da76acefb9866d0828a7c2617a1555 Mon Sep 17 00:00:00 2001
From: Ulrich Germann
Date: Tue, 21 Jul 2015 15:30:16 +0100
Subject: [PATCH 157/286] added switch -fopenmp to compilation of BiLM_NPLM.o

---
 moses/LM/Jamfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/moses/LM/Jamfile b/moses/LM/Jamfile
index 6dac9179f..6ce50f5f0 100644
--- a/moses/LM/Jamfile
+++ b/moses/LM/Jamfile
@@ -89,7 +89,7 @@ local with-nplm = [ option.get "with-nplm" ] ;
 if $(with-nplm) {
   lib nplm : : $(with-nplm)/lib $(with-nplm)/lib64 ;
   obj NeuralLMWrapper.o : NeuralLMWrapper.cpp nplm ..//headers : $(with-nplm)/src $(with-nplm)/3rdparty/eigen ;
-  obj BiLM_NPLM.o : bilingual-lm/BiLM_NPLM.cpp nplm ..//headers : $(with-nplm)/src $(with-nplm)/3rdparty/eigen ;
+  obj BiLM_NPLM.o : bilingual-lm/BiLM_NPLM.cpp nplm ..//headers : $(with-nplm)/src $(with-nplm)/3rdparty/eigen -fopenmp ;
   obj RDLM.o : RDLM.cpp nplm ..//headers : $(with-nplm)/src $(with-nplm)/3rdparty/eigen ;
   alias neural : NeuralLMWrapper.o nplm : : : -fopenmp -fopenmp LM_NEURAL ;
   alias bilinguallm : BiLM_NPLM.o nplm : : : -fopenmp -fopenmp LM_NEURAL ;

From 80db5487bc575f6979e3412829384ef0b3535294 Mon Sep 17 00:00:00 2001
From: Ulrich Germann
Date: Tue, 21 Jul 2015 15:31:46 +0100
Subject: [PATCH 158/286] Fixed typo in comment.

---
 moses/TranslationModel/UG/mm/ug_mm_ttrack.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h
index 91167822d..dd8031ea6 100644
--- a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h
+++ b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h
@@ -54,7 +54,7 @@ namespace ugdiss
     // return size of corpus (in number of sentences)
     size_t size() const;
 
-    // return size of corpus (in number of sentences)
+    // return size of corpus (in number of tokens)
     size_t numTokens() const;
 
     // open an mmTtrack file

From 506e02bdeca6dcc2a5eeba5cfda09619a31591ff Mon Sep 17 00:00:00 2001
From: Ulrich Germann
Date: Tue, 21 Jul 2015 15:32:47 +0100
Subject: [PATCH 159/286] Added utility function len_from_pid().
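
The helper reads the phrase length directly off a 64-bit phrase id: it takes
pid modulo 2^16, i.e. it keeps the low 16 bits, which is the length component
that parse_pid() also extracts. A minimal stand-alone sketch of that
convention follows; it assumes the usual (sentence id, offset, length)
packing with the sentence id in the high 32 bits and the offset in bits
16-31, and the helper name and example values below are made up for
illustration only.

    #include <cassert>
    #include <cstdint>
    #include <cstddef>

    // Assumed pid layout: sentence id in bits 32..63, offset in bits 16..31,
    // phrase length in bits 0..15. len_from_pid() keeps only the low 16 bits.
    static std::size_t len_from_pid_sketch(std::uint64_t pid)
    {
      return pid % (std::uint64_t(1) << 16); // same as pid & 0xFFFF
    }

    int main()
    {
      // hypothetical pid: sentence 42, offset 7, phrase length 3
      std::uint64_t pid = (std::uint64_t(42) << 32) | (std::uint64_t(7) << 16) | 3;
      assert(len_from_pid_sketch(pid) == 3);
      return 0;
    }

Compared with calling parse_pid() and discarding the sentence id and offset,
the dedicated helper simply avoids the extra unpacking when only the length
is needed.
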
--- moses/TranslationModel/UG/mm/ug_ttrack_base.cc | 7 +++++++ moses/TranslationModel/UG/mm/ug_ttrack_base.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_base.cc b/moses/TranslationModel/UG/mm/ug_ttrack_base.cc index 60d20a5f9..e754539f6 100644 --- a/moses/TranslationModel/UG/mm/ug_ttrack_base.cc +++ b/moses/TranslationModel/UG/mm/ug_ttrack_base.cc @@ -36,6 +36,13 @@ namespace ugdiss return buf.str(); } + size_t + len_from_pid(uint64_t pid) + { + static uint64_t two16 = uint64_t(1)<<16; + return pid%two16; + } + #if 0 template<> string diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_base.h b/moses/TranslationModel/UG/mm/ug_ttrack_base.h index 9668bee0e..fbbc131ad 100644 --- a/moses/TranslationModel/UG/mm/ug_ttrack_base.h +++ b/moses/TranslationModel/UG/mm/ug_ttrack_base.h @@ -26,6 +26,8 @@ namespace ugdiss typedef boost::dynamic_bitset bdBitset; + size_t len_from_pid(uint64_t pid); + template void parse_pid(uint64_t const pid, sid_t & sid, From 5aaa8fcbfae98765144d5914e083dacaace5ad60 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Tue, 21 Jul 2015 15:35:08 +0100 Subject: [PATCH 160/286] 1. Fixed concurrency issue in context handling. 2. Added phrase table feature function PScoreLengthRatio. --- moses/ExportInterface.cpp | 50 +- moses/IOWrapper.cpp | 60 +- moses/IOWrapper.h | 8 +- moses/InputType.h | 6 +- moses/TranslationModel/UG/mmsapt.cpp | 525 +++++++++--------- moses/TranslationModel/UG/mmsapt.h | 2 +- .../TranslationModel/UG/sapt_phrase_scorers.h | 21 +- .../UG/sapt_pscore_length_ratio.h | 70 +++ moses/TranslationTask.cpp | 18 +- moses/TranslationTask.h | 9 +- moses/server/TranslationRequest.cpp | 3 +- 11 files changed, 432 insertions(+), 340 deletions(-) create mode 100644 moses/TranslationModel/UG/sapt_pscore_length_ratio.h diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp index ff24d8007..cf86aa876 100644 --- a/moses/ExportInterface.cpp +++ b/moses/ExportInterface.cpp @@ -212,23 +212,36 @@ batch_run() ThreadPool pool(staticData.ThreadCount()); #endif - std::string context_string; + // using context for adaptation: + // e.g., context words / strings from config file / cmd line + std::string context_string; params.SetParameter(context_string,"context-string",string("")); + // ... or weights for documents/domains from config file / cmd. line std::string context_weights; params.SetParameter(context_weights,"context-weights",string("")); - // main loop over set of input sentences + // ... or the surrounding context (--context-window ...) + size_t size_t_max = std::numeric_limits::max(); + bool use_context_window = ioWrapper->GetLookAhead() || ioWrapper->GetLookBack(); + bool use_context = use_context_window || context_string.size(); + bool use_sliding_context_window = (use_context_window + && ioWrapper->GetLookAhead() != size_t_max); - boost::shared_ptr source; + boost::shared_ptr > context_window; + boost::shared_ptr >* cw; + cw = use_context_window ? 
&context_window : NULL; + if (!cw && context_string.size()) + context_window.reset(new std::vector(1,context_string)); // global scope of caches, biases, etc., if any boost::shared_ptr gscope; - if ((ioWrapper->GetLookAhead() + ioWrapper->GetLookBack() == 0) - || ioWrapper->GetLookAhead() == std::numeric_limits::max()) + if (!use_sliding_context_window) gscope.reset(new ContextScope); - while ((source = ioWrapper->ReadInput()) != NULL) { + // main loop over set of input sentences + boost::shared_ptr source; + while ((source = ioWrapper->ReadInput(cw)) != NULL) { IFVERBOSE(1) ResetUserTime(); // set up task of translating one sentence @@ -236,19 +249,22 @@ batch_run() if (gscope) lscope = gscope; else lscope.reset(new ContextScope); - boost::shared_ptr task; + boost::shared_ptr task; task = TranslationTask::create(source, ioWrapper, lscope); - - if (source->GetContext()) - task->SetContextString(*source->GetContext()); - else task->SetContextString(context_string); - - //if (source->GetContextWeights().isEmpty()) - // task->SetContextWeights(*source->GetContextWeights()); - /*else //The context_weights will never be passed to the config file.*/ - if (context_weights != "") { + + if (cw) + { + if (context_string.size()) + context_window->push_back(context_string); + if(!use_sliding_context_window) + cw = NULL; + } + if (context_window) + task->SetContextWindow(context_window); + + if (context_weights != "") task->SetContextWeights(context_weights); - } + // Allow for (sentence-)context-specific processing prior to // decoding. This can be used, for example, for context-sensitive // phrase lookup. diff --git a/moses/IOWrapper.cpp b/moses/IOWrapper.cpp index 94287dd0b..65c3f20c8 100644 --- a/moses/IOWrapper.cpp +++ b/moses/IOWrapper.cpp @@ -35,6 +35,7 @@ POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include "moses/Syntax/KBestExtractor.h" #include "moses/Syntax/PVertex.h" @@ -297,7 +298,7 @@ GetBufferedInput() boost::shared_ptr IOWrapper:: -ReadInput() +ReadInput(boost::shared_ptr >* cw) { #ifdef WITH_THREADS boost::lock_guard lock(m_lock); @@ -305,48 +306,33 @@ ReadInput() boost::shared_ptr source = GetBufferedInput(); if (source) { source->SetTranslationId(m_currentLine++); - if (m_look_ahead || m_look_back) - this->set_context_for(*source); + + // when using a sliding context window, remove obsolete past input from buffer: + if (m_past_input.size() && m_look_back != std::numeric_limits::max()) + { + list >::iterator m = m_past_input.end(); + for (size_t cnt = 0; cnt < m_look_back && --m != m_past_input.begin();) + cnt += (*m)->GetSize(); + while (m_past_input.begin() != m) m_past_input.pop_front(); + } + + if (m_look_back) + m_past_input.push_back(source); } - m_past_input.push_back(source); + if (cw) *cw = GetCurrentContextWindow(); return source; } -void +boost::shared_ptr > IOWrapper:: -set_context_for(InputType& source) +GetCurrentContextWindow() const { - boost::shared_ptr context(new string); - list >::iterator m = m_past_input.end(); - // remove obsolete past input from buffer: - if (m_past_input.end() != m_past_input.begin()) { - for (size_t cnt = 0; cnt < m_look_back && --m != m_past_input.begin(); - cnt += (*m)->GetSize()); - while (m_past_input.begin() != m) m_past_input.pop_front(); - } - // cerr << string(80,'=') << endl; - if (m_past_input.size()) { - m = m_past_input.begin(); - *context += (*m)->ToString(); - // cerr << (*m)->ToString() << endl; - for (++m; m != m_past_input.end(); ++m) { - // cerr << "\n" << (*m)->ToString() << endl; - *context += string(" ") + (*m)->ToString(); - } - // cerr << string(80,'-') << endl; - } - // cerr << source.ToString() << endl; - if (m_future_input.size()) { - // cerr << string(80,'-') << endl; - for (m = m_future_input.begin(); m != m_future_input.end(); ++m) { - // if (m != m_future_input.begin()) cerr << "\n"; - // cerr << (*m)->ToString() << endl; - if (context->size()) *context += " "; - *context += (*m)->ToString(); - } - } - // cerr << string(80,'=') << endl; - if (context->size()) source.SetContext(context); + boost::shared_ptr > context(new std::vector); + BOOST_FOREACH(boost::shared_ptr const& i, m_past_input) + context->push_back(i->ToString()); + BOOST_FOREACH(boost::shared_ptr const& i, m_future_input) + context->push_back(i->ToString()); + return context; } diff --git a/moses/IOWrapper.h b/moses/IOWrapper.h index d97023193..98d558410 100644 --- a/moses/IOWrapper.h +++ b/moses/IOWrapper.h @@ -128,7 +128,9 @@ public: ~IOWrapper(); // Moses::InputType* GetInput(Moses::InputType *inputType); - boost::shared_ptr ReadInput(); + + boost::shared_ptr + ReadInput(boost::shared_ptr >* cw = NULL); Moses::OutputCollector *GetSingleBestOutputCollector() { return m_singleBestOutputCollector.get(); @@ -205,8 +207,8 @@ private: boost::shared_ptr GetBufferedInput(); - void - set_context_for(InputType& source); + boost::shared_ptr > + GetCurrentContextWindow() const; }; template diff --git a/moses/InputType.h b/moses/InputType.h index 655823a3f..f75c65a0f 100644 --- a/moses/InputType.h +++ b/moses/InputType.h @@ -58,7 +58,7 @@ protected: ReorderingConstraint m_reorderingConstraint; /**< limits on reordering specified either by "-mp" switch or xml tags */ std::string m_textType; std::string m_passthrough; - boost::shared_ptr m_context; + boost::shared_ptr > m_context; public: // used in 
-continue-partial-translation @@ -173,13 +173,13 @@ public: //! number of words in this sentence/confusion network virtual size_t GetSize() const =0; - virtual boost::shared_ptr const& + virtual boost::shared_ptr > const& GetContext() const { return m_context; } virtual void - SetContext(boost::shared_ptr const& ctx) { + SetContext(boost::shared_ptr > const& ctx) { m_context = ctx; } diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 7ce29f1cc..48bada2d9 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -1,3 +1,4 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // #ifdef HAVE_CURLPP // #include // #include @@ -22,13 +23,13 @@ namespace Moses void fillIdSeq(Phrase const& mophrase, std::vector const& ifactors, - TokenIndex const& V, vector& dest) + TokenIndex const& V, vector& dest) { dest.resize(mophrase.GetSize()); for (size_t i = 0; i < mophrase.GetSize(); ++i) { - // Factor const* f = mophrase.GetFactor(i,ifactor); - dest[i] = V[mophrase.GetWord(i).GetString(ifactors, false)]; // f->ToString()]; + // Factor const* f = mophrase.GetFactor(i,ifactor); + dest[i] = V[mophrase.GetWord(i).GetString(ifactors, false)]; // f->ToString()]; } } @@ -39,17 +40,17 @@ namespace Moses tokenizer > tokens(line,sep); BOOST_FOREACH(string const& t,tokens) { - size_t i = t.find_first_not_of(" ="); - size_t j = t.find_first_of(" =",i+1); - size_t k = t.find_first_not_of(" =",j+1); - UTIL_THROW_IF2(i == string::npos || k == string::npos, - "[" << HERE << "] " - << "Parameter specification error near '" - << t << "' in moses ini line\n" - << line); - assert(i != string::npos); - assert(k != string::npos); - param[t.substr(i,j)] = t.substr(k); + size_t i = t.find_first_not_of(" ="); + size_t j = t.find_first_of(" =",i+1); + size_t k = t.find_first_not_of(" =",j+1); + UTIL_THROW_IF2(i == string::npos || k == string::npos, + "[" << HERE << "] " + << "Parameter specification error near '" + << t << "' in moses ini line\n" + << line); + assert(i != string::npos); + assert(k != string::npos); + param[t.substr(i,j)] = t.substr(k); } } @@ -88,17 +89,17 @@ namespace Moses ifstream config(fname.c_str()); while (getline(config,line)) { - if (line[0] == '#') continue; - char_separator sep(" \t"); - tokenizer > tokens(line,sep); - tokenizer >::const_iterator t = tokens.begin(); - if (t == tokens.end()) continue; - string& foo = param[*t++]; - if (t == tokens.end() || foo.size()) continue; - // second condition: do not overwrite settings from the line in moses.ini - UTIL_THROW_IF2(*t++ != "=" || t == tokens.end(), - "Syntax error in Mmsapt config file '" << fname << "'."); - for (foo = *t++; t != tokens.end(); foo += " " + *t++); + if (line[0] == '#') continue; + char_separator sep(" \t"); + tokenizer > tokens(line,sep); + tokenizer >::const_iterator t = tokens.begin(); + if (t == tokens.end()) continue; + string& foo = param[*t++]; + if (t == tokens.end() || foo.size()) continue; + // second condition: do not overwrite settings from the line in moses.ini + UTIL_THROW_IF2(*t++ != "=" || t == tokens.end(), + "Syntax error in Mmsapt config file '" << fname << "'."); + for (foo = *t++; t != tokens.end(); foo += " " + *t++); } } @@ -110,9 +111,9 @@ namespace Moses ff->setIndex(m_feature_names.size()); for (int i = 0; i < ff->fcnt(); ++i) { - m_feature_names.push_back(ff->fname(i)); - m_is_logval.push_back(ff->isLogVal(i)); - m_is_integer.push_back(ff->isIntegerValued(i)); + m_feature_names.push_back(ff->fname(i)); + 
m_is_logval.push_back(ff->isLogVal(i)); + m_is_integer.push_back(ff->isIntegerValued(i)); } } @@ -133,12 +134,12 @@ namespace Moses m = param.find("base"); if (m != param.end()) { - m_bname = m->second; - m = param.find("path"); - UTIL_THROW_IF2((m != param.end() && m->second != m_bname), - "Conflicting aliases for path:\n" - << "path=" << string(m->second) << "\n" - << "base=" << m_bname.c_str() ); + m_bname = m->second; + m = param.find("path"); + UTIL_THROW_IF2((m != param.end() && m->second != m_bname), + "Conflicting aliases for path:\n" + << "path=" << string(m->second) << "\n" + << "base=" << m_bname.c_str() ); } else m_bname = param["path"]; L1 = param["L1"]; @@ -192,37 +193,38 @@ namespace Moses // Feature functions are initialized in function Load(); param.insert(pair("pfwd", "g")); param.insert(pair("pbwd", "g")); + param.insert(pair("lenrat", "1")); + param.insert(pair("rare", "1")); param.insert(pair("logcnt", "0")); param.insert(pair("coh", "0")); - param.insert(pair("rare", "1")); - param.insert(pair("prov", "1")); + param.insert(pair("prov", "0")); param.insert(pair("cumb", "0")); poolCounts = true; // this is for pre-comuted sentence-level bias; DEPRECATED! if ((m = param.find("bias")) != param.end()) - m_bias_file = m->second; + m_bias_file = m->second; if ((m = param.find("bias-server")) != param.end()) - m_bias_server = m->second; + m_bias_server = m->second; if (m_bias_loglevel) { - dflt = pair("bias-logfile","/dev/stderr"); - param.insert(dflt); + dflt = pair("bias-logfile","/dev/stderr"); + param.insert(dflt); } if ((m = param.find("bias-logfile")) != param.end()) { - m_bias_logfile = m->second; - if (m_bias_logfile == "/dev/stderr") - m_bias_log = &std::cerr; - else if (m_bias_logfile == "/dev/stdout") - m_bias_log = &std::cout; - else - { - m_bias_logger.reset(new ofstream(m_bias_logfile.c_str())); - m_bias_log = m_bias_logger.get(); - } + m_bias_logfile = m->second; + if (m_bias_logfile == "/dev/stderr") + m_bias_log = &std::cerr; + else if (m_bias_logfile == "/dev/stdout") + m_bias_log = &std::cout; + else + { + m_bias_logger.reset(new ofstream(m_bias_logfile.c_str())); + m_bias_log = m_bias_logger.get(); + } } if ((m = param.find("lr-func")) != param.end()) @@ -233,14 +235,14 @@ namespace Moses if ((m = param.find("method")) != param.end()) { - if (m->second == "rank" || m->second == "ranked") - m_sampling_method = ranked_sampling; - else if (m->second == "random") - m_sampling_method = random_sampling; - else if (m->second == "full") - m_sampling_method = full_coverage; - else UTIL_THROW2("unrecognized specification 'method='" << m->second - << "' in line:\n" << line); + if (m->second == "rank" || m->second == "ranked") + m_sampling_method = ranked_sampling; + else if (m->second == "random") + m_sampling_method = random_sampling; + else if (m->second == "full") + m_sampling_method = full_coverage; + else UTIL_THROW2("unrecognized specification 'method='" << m->second + << "' in line:\n" << line); } dflt = pair("tuneable","true"); @@ -270,6 +272,7 @@ namespace Moses known_parameters.push_back("extra"); known_parameters.push_back("feature-sets"); known_parameters.push_back("input-factor"); + known_parameters.push_back("lenrat"); known_parameters.push_back("lexalpha"); // known_parameters.push_back("limit"); // replaced by "table-limit" known_parameters.push_back("logcnt"); @@ -292,10 +295,10 @@ namespace Moses sort(known_parameters.begin(),known_parameters.end()); for (map::iterator m = param.begin(); m != param.end(); ++m) { - 
UTIL_THROW_IF2(!binary_search(known_parameters.begin(), - known_parameters.end(), m->first), - HERE << ": Unknown parameter specification for Mmsapt: " - << m->first); + UTIL_THROW_IF2(!binary_search(known_parameters.begin(), + known_parameters.end(), m->first), + HERE << ": Unknown parameter specification for Mmsapt: " + << m->first); } } @@ -344,20 +347,20 @@ namespace Moses if (spec == "" || spec == "0") return; if (registry) { - sptr ff(new fftype(spec)); - register_ff(ff, *registry); + sptr ff(new fftype(spec)); + register_ff(ff, *registry); } else if (spec[spec.size()-1] == '+') // corpus specific { - sptr ff(new fftype(spec)); - register_ff(ff, m_active_ff_fix); - ff.reset(new fftype(spec)); - register_ff(ff, m_active_ff_dyn); + sptr ff(new fftype(spec)); + register_ff(ff, m_active_ff_fix); + ff.reset(new fftype(spec)); + register_ff(ff, m_active_ff_dyn); } else { - sptr ff(new fftype(spec)); - register_ff(ff, m_active_ff_common); + sptr ff(new fftype(spec)); + register_ff(ff, m_active_ff_common); } } @@ -365,36 +368,36 @@ namespace Moses void Mmsapt:: check_ff(string const ffname, float const xtra, - vector >* registry) + vector >* registry) { string const& spec = param[ffname]; if (spec == "" || spec == "0") return; if (registry) { - sptr ff(new fftype(xtra,spec)); - register_ff(ff, *registry); + sptr ff(new fftype(xtra,spec)); + register_ff(ff, *registry); } else if (spec[spec.size()-1] == '+') // corpus specific { - sptr ff(new fftype(xtra,spec)); - register_ff(ff, m_active_ff_fix); - ff.reset(new fftype(xtra,spec)); - register_ff(ff, m_active_ff_dyn); + sptr ff(new fftype(xtra,spec)); + register_ff(ff, m_active_ff_fix); + ff.reset(new fftype(xtra,spec)); + register_ff(ff, m_active_ff_dyn); } else { - sptr ff(new fftype(xtra,spec)); - register_ff(ff, m_active_ff_common); + sptr ff(new fftype(xtra,spec)); + register_ff(ff, m_active_ff_common); } } - + void Mmsapt:: Load() { Load(true); } - + void Mmsapt ::setup_local_feature_functions() @@ -403,41 +406,42 @@ namespace Moses // load feature sets BOOST_FOREACH(string const& fsname, m_feature_set_names) { - // standard (default) feature set - if (fsname == "standard") - { - // lexical scores - string lexfile = m_bname + L1 + "-" + L2 + ".lex"; - sptr > - ff(new PScoreLex1(param["lex_alpha"],lexfile)); - register_ff(ff,m_active_ff_common); - - // these are always computed on pooled data - check_ff > ("rare", &m_active_ff_common); - check_ff >("unal", &m_active_ff_common); - check_ff >("coh", &m_active_ff_common); - check_ff >("cumb", &m_active_ff_common); - - // for these ones either way is possible (specification ends with '+' - // if corpus-specific - check_ff >("pfwd", m_lbop_conf); - check_ff >("pbwd", m_lbop_conf); - check_ff >("logcnt"); - - // These are always corpus-specific - check_ff >("prov", &m_active_ff_fix); - check_ff >("prov", &m_active_ff_dyn); - } - - // data source features (copies of phrase and word count specific to - // this translation model) - else if (fsname == "datasource") - { - sptr > ffpcnt(new PScorePC("pcnt")); - register_ff(ffpcnt,m_active_ff_common); - sptr > ffwcnt(new PScoreWC("wcnt")); - register_ff(ffwcnt,m_active_ff_common); - } + // standard (default) feature set + if (fsname == "standard") + { + // lexical scores + string lexfile = m_bname + L1 + "-" + L2 + ".lex"; + sptr > + ff(new PScoreLex1(param["lex_alpha"],lexfile)); + register_ff(ff,m_active_ff_common); + + // these are always computed on pooled data + check_ff > ("rare", &m_active_ff_common); + check_ff >("unal", &m_active_ff_common); + 
check_ff >("coh", &m_active_ff_common); + check_ff >("cumb", &m_active_ff_common); + check_ff > ("lenrat", &m_active_ff_common); + + // for these ones either way is possible (specification ends with '+' + // if corpus-specific + check_ff >("pfwd", m_lbop_conf); + check_ff >("pbwd", m_lbop_conf); + check_ff >("logcnt"); + + // These are always corpus-specific + check_ff >("prov", &m_active_ff_fix); + check_ff >("prov", &m_active_ff_dyn); + } + + // data source features (copies of phrase and word count specific to + // this translation model) + else if (fsname == "datasource") + { + sptr > ffpcnt(new PScorePC("pcnt")); + register_ff(ffpcnt,m_active_ff_common); + sptr > ffwcnt(new PScoreWC("wcnt")); + register_ff(ffwcnt,m_active_ff_common); + } } // cerr << "Features: " << Join("|",m_feature_names) << endl; this->m_numScoreComponents = this->m_feature_names.size(); @@ -456,11 +460,11 @@ namespace Moses #if 0 if (with_checks) { - UTIL_THROW_IF2(this->m_feature_names.size() != this->m_numScoreComponents, - "At " << HERE << ": number of feature values provided by " - << "Phrase table (" << this->m_feature_names.size() - << ") does not match number specified in Moses config file (" - << this->m_numScoreComponents << ")!\n";); + UTIL_THROW_IF2(this->m_feature_names.size() != this->m_numScoreComponents, + "At " << HERE << ": number of feature values provided by " + << "Phrase table (" << this->m_feature_names.size() + << ") does not match number specified in Moses config file (" + << this->m_numScoreComponents << ")!\n";); } #endif @@ -487,7 +491,7 @@ namespace Moses wlex21.resize(COOC.numCols); for (size_t r = 0; r < COOC.numRows; ++r) for (cell_t const* c = COOC[r].start; c < COOC[r].stop; ++c) - wlex21[c->id].push_back(r); + wlex21[c->id].push_back(r); COOCraw.open(m_bname + L1 + "-" + L2 + ".coc"); #endif assert(btdyn); @@ -510,55 +514,55 @@ namespace Moses Mmsapt:: mkTPhrase(ttasksptr const& ttask, Phrase const& src, - PhrasePair* fix, - PhrasePair* dyn, - sptr > const& dynbt) const + PhrasePair* fix, + PhrasePair* dyn, + sptr > const& dynbt) const { UTIL_THROW_IF2(!fix && !dyn, HERE << - ": Can't create target phrase from nothing."); + ": Can't create target phrase from nothing."); vector fvals(this->m_numScoreComponents); PhrasePair pool = fix ? *fix : *dyn; if (fix) { - BOOST_FOREACH(sptr const& ff, m_active_ff_fix) - (*ff)(*btfix, *fix, &fvals); + BOOST_FOREACH(sptr const& ff, m_active_ff_fix) + (*ff)(*btfix, *fix, &fvals); } if (dyn) { - BOOST_FOREACH(sptr const& ff, m_active_ff_dyn) - (*ff)(*dynbt, *dyn, &fvals); + BOOST_FOREACH(sptr const& ff, m_active_ff_dyn) + (*ff)(*dynbt, *dyn, &fvals); } if (fix && dyn) { pool += *dyn; } else if (fix) { - PhrasePair zilch; zilch.init(); - TSA::tree_iterator m(dynbt->I2.get(), fix->start2, fix->len2); - if (m.size() == fix->len2) - zilch.raw2 = m.approxOccurrenceCount(); - pool += zilch; - BOOST_FOREACH(sptr const& ff, m_active_ff_dyn) - (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals); + PhrasePair zilch; zilch.init(); + TSA::tree_iterator m(dynbt->I2.get(), fix->start2, fix->len2); + if (m.size() == fix->len2) + zilch.raw2 = m.approxOccurrenceCount(); + pool += zilch; + BOOST_FOREACH(sptr const& ff, m_active_ff_dyn) + (*ff)(*dynbt, ff->allowPooling() ? 
pool : zilch, &fvals); } else if (dyn) { - PhrasePair zilch; zilch.init(); - TSA::tree_iterator m(btfix->I2.get(), dyn->start2, dyn->len2); - if (m.size() == dyn->len2) - zilch.raw2 = m.approxOccurrenceCount(); - pool += zilch; - BOOST_FOREACH(sptr const& ff, m_active_ff_fix) - (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals); + PhrasePair zilch; zilch.init(); + TSA::tree_iterator m(btfix->I2.get(), dyn->start2, dyn->len2); + if (m.size() == dyn->len2) + zilch.raw2 = m.approxOccurrenceCount(); + pool += zilch; + BOOST_FOREACH(sptr const& ff, m_active_ff_fix) + (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals); } if (fix) { - BOOST_FOREACH(sptr const& ff, m_active_ff_common) - (*ff)(*btfix, pool, &fvals); + BOOST_FOREACH(sptr const& ff, m_active_ff_common) + (*ff)(*btfix, pool, &fvals); } else { - BOOST_FOREACH(sptr const& ff, m_active_ff_common) - (*ff)(*dynbt, pool, &fvals); + BOOST_FOREACH(sptr const& ff, m_active_ff_common) + (*ff)(*dynbt, pool, &fvals); } TargetPhrase* tp = new TargetPhrase(const_cast(ttask), this); @@ -566,10 +570,10 @@ namespace Moses uint32_t len = fix ? fix->len2 : dyn->len2; for (uint32_t k = 0; k < len; ++k, x = x->next()) { - StringPiece wrd = (*(btfix->V2))[x->id()]; - Word w; - w.CreateFromString(Output, m_ofactor, wrd, false); - tp->AddWord(w); + StringPiece wrd = (*(btfix->V2))[x->id()]; + Word w; + w.CreateFromString(Output, m_ofactor, wrd, false); + tp->AddWord(w); } tp->SetAlignTerm(pool.aln); tp->GetScoreBreakdown().Assign(this, fvals); @@ -577,11 +581,11 @@ namespace Moses if (m_lr_func) { - LRModel::ModelType mdl = m_lr_func->GetModel().GetModelType(); - LRModel::Direction dir = m_lr_func->GetModel().GetDirection(); - sptr scores(new Scores()); - pool.fill_lr_vec(dir, mdl, *scores); - tp->SetExtraScores(m_lr_func, scores); + LRModel::ModelType mdl = m_lr_func->GetModel().GetModelType(); + LRModel::Direction dir = m_lr_func->GetModel().GetDirection(); + sptr scores(new Scores()); + pool.fill_lr_vec(dir, mdl, *scores); + tp->SetExtraScores(m_lr_func, scores); } return tp; @@ -590,22 +594,22 @@ namespace Moses void Mmsapt:: GetTargetPhraseCollectionBatch(ttasksptr const& ttask, - const InputPathList &inputPathQueue) const + const InputPathList &inputPathQueue) const { InputPathList::const_iterator iter; for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) { - InputPath &inputPath = **iter; - const Phrase &phrase = inputPath.GetPhrase(); - PrefixExists(ttask, phrase); // launches parallel lookup + InputPath &inputPath = **iter; + const Phrase &phrase = inputPath.GetPhrase(); + PrefixExists(ttask, phrase); // launches parallel lookup } for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) { - InputPath &inputPath = **iter; - const Phrase &phrase = inputPath.GetPhrase(); - const TargetPhraseCollection *targetPhrases - = this->GetTargetPhraseCollectionLEGACY(ttask,phrase); - inputPath.SetTargetPhrases(*this, targetPhrases, NULL); + InputPath &inputPath = **iter; + const Phrase &phrase = inputPath.GetPhrase(); + const TargetPhraseCollection *targetPhrases + = this->GetTargetPhraseCollectionLEGACY(ttask,phrase); + inputPath.SetTargetPhrases(*this, targetPhrases, NULL); } } @@ -642,14 +646,14 @@ namespace Moses TSA::tree_iterator mdyn(dyn->I1.get()); if (dyn->I1.get()) for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i) - mdyn.extend(sphrase[i]); + mdyn.extend(sphrase[i]); if (mdyn.size() != sphrase.size() && mfix.size() != sphrase.size()) return NULL; // phrase not found in either bitext // do 
we have cached results for this phrase? uint64_t phrasekey = (mfix.size() == sphrase.size() - ? (mfix.getPid()<<1) : (mdyn.getPid()<<1)+1); + ? (mfix.getPid()<<1) : (mdyn.getPid()<<1)+1); // get context-specific cache of items previously looked up sptr const& scope = ttask->GetScope(); @@ -676,16 +680,16 @@ namespace Moses if (mfix.size() == sphrase.size()) { - sptr context = scope->get(btfix.get()); - sptr const* foo = context->cache1->get(mfix.getPid()); - if (foo) { sfix = *foo; sfix->wait(); } - else - { - BitextSampler s(btfix.get(), mfix, context->bias, - m_default_sample_size, m_sampling_method); - s(); - sfix = s.stats(); - } + sptr context = scope->get(btfix.get()); + sptr const* foo = context->cache1->get(mfix.getPid()); + if (foo) { sfix = *foo; sfix->wait(); } + else + { + BitextSampler s(btfix.get(), mfix, context->bias, + m_default_sample_size, m_sampling_method); + s(); + sfix = s.stats(); + } } if (mdyn.size() == sphrase.size()) sdyn = dyn->lookup(ttask, mdyn); @@ -693,13 +697,13 @@ namespace Moses PhrasePair::SortByTargetIdSeq sort_by_tgt_id; if (sfix) { - expand(mfix, *btfix, *sfix, ppfix, m_bias_log); - sort(ppfix.begin(), ppfix.end(),sort_by_tgt_id); + expand(mfix, *btfix, *sfix, ppfix, m_bias_log); + sort(ppfix.begin(), ppfix.end(),sort_by_tgt_id); } if (sdyn) { - expand(mdyn, *dyn, *sdyn, ppdyn, m_bias_log); - sort(ppdyn.begin(), ppdyn.end(),sort_by_tgt_id); + expand(mdyn, *dyn, *sdyn, ppdyn, m_bias_log); + sort(ppdyn.begin(), ppdyn.end(),sort_by_tgt_id); } // now we have two lists of Phrase Pairs, let's merge them ret = new TPCollWrapper(dyn->revision(), phrasekey); @@ -707,10 +711,10 @@ namespace Moses size_t i = 0; size_t k = 0; while (i < ppfix.size() && k < ppdyn.size()) { - int cmp = sorter.cmp(ppfix[i], ppdyn[k]); - if (cmp < 0) ret->Add(mkTPhrase(ttask,src,&ppfix[i++],NULL,dyn)); - else if (cmp == 0) ret->Add(mkTPhrase(ttask,src,&ppfix[i++],&ppdyn[k++],dyn)); - else ret->Add(mkTPhrase(ttask,src,NULL,&ppdyn[k++],dyn)); + int cmp = sorter.cmp(ppfix[i], ppdyn[k]); + if (cmp < 0) ret->Add(mkTPhrase(ttask,src,&ppfix[i++],NULL,dyn)); + else if (cmp == 0) ret->Add(mkTPhrase(ttask,src,&ppfix[i++],&ppdyn[k++],dyn)); + else ret->Add(mkTPhrase(ttask,src,NULL,&ppdyn[k++],dyn)); } while (i < ppfix.size()) ret->Add(mkTPhrase(ttask,src,&ppfix[i++],NULL,dyn)); while (k < ppdyn.size()) ret->Add(mkTPhrase(ttask,src,NULL,&ppdyn[k++],dyn)); @@ -720,13 +724,13 @@ namespace Moses #if 1 if (m_bias_log && m_lr_func && m_bias_loglevel > 3) { - PhrasePair::SortDescendingByJointCount sorter; - sort(ppfix.begin(), ppfix.end(),sorter); - BOOST_FOREACH(PhrasePair const& pp, ppfix) - { - if (&pp != &ppfix.front() && pp.joint <= 1) break; - pp.print(*m_bias_log,*btfix->V1, *btfix->V2, m_lr_func->GetModel()); - } + PhrasePair::SortDescendingByJointCount sorter; + sort(ppfix.begin(), ppfix.end(),sorter); + BOOST_FOREACH(PhrasePair const& pp, ppfix) + { + // if (&pp != &ppfix.front() && pp.joint <= 1) break; + pp.print(*m_bias_log,*btfix->V1, *btfix->V2, m_lr_func->GetModel()); + } } #endif cache->add(phrasekey, ret); @@ -757,7 +761,7 @@ namespace Moses ChartRuleLookupManager* Mmsapt:: CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &, - size_t ) + size_t ) { throw "CreateRuleLookupManager is currently not supported in Mmsapt!"; } @@ -768,41 +772,47 @@ namespace Moses { sptr const& scope = ttask->GetScope(); sptr context = scope->get(btfix.get(), true); - if (m_bias_server.size() && context->bias == NULL) + if (m_bias_server.size() && context->bias == NULL && 
ttask->GetContextWindow()) { // we need to create the bias - boost::unique_lock lock(context->lock); - string const& context_words = ttask->GetContextString(); - if (context_words.size()) - { - if (m_bias_log) - { - *m_bias_log << HERE << endl << "BIAS LOOKUP CONTEXT: " - << context_words << endl; - context->bias_log = m_bias_log; - } - context->bias - = btfix->SetupDocumentBias(m_bias_server, context_words, m_bias_log); - context->bias->loglevel = m_bias_loglevel; - context->bias->log = m_bias_log; + boost::unique_lock lock(context->lock); + // string const& context_words = ttask->GetContextString(); + string context_words; + BOOST_FOREACH(string const& line, *ttask->GetContextWindow()) + { + if (context_words.size()) context_words += " "; + context_words += line; + } + if (context_words.size()) + { + if (m_bias_log) + { + *m_bias_log << HERE << endl << "BIAS LOOKUP CONTEXT: " + << context_words << endl; + context->bias_log = m_bias_log; + } + context->bias + = btfix->SetupDocumentBias(m_bias_server, context_words, m_bias_log); + context->bias->loglevel = m_bias_loglevel; + context->bias->log = m_bias_log; //Reset the bias in the ttaskptr so that other functions //so that other functions can utilize the biases; ttask->ReSetContextWeights(context->bias->getBiasMap()); - } - // if (!context->cache1) context->cache1.reset(new pstats::cache_t); - // if (!context->cache2) context->cache2.reset(new pstats::cache_t); + } + // if (!context->cache1) context->cache1.reset(new pstats::cache_t); + // if (!context->cache2) context->cache2.reset(new pstats::cache_t); } else if (!ttask->GetContextWeights().empty()) { - if (m_bias_log) - { - *m_bias_log << HERE << endl - << "BIAS FROM MAP LOOKUP" << endl; - context->bias_log = m_bias_log; - } - context->bias - = btfix->SetupDocumentBias(ttask->GetContextWeights(), m_bias_log); - context->bias->loglevel = m_bias_loglevel; - context->bias->log = m_bias_log; + if (m_bias_log) + { + *m_bias_log << HERE << endl + << "BIAS FROM MAP LOOKUP" << endl; + context->bias_log = m_bias_log; + } + context->bias + = btfix->SetupDocumentBias(ttask->GetContextWeights(), m_bias_log); + context->bias->loglevel = m_bias_loglevel; + context->bias->log = m_bias_log; // if (!context->cache1) context->cache1.reset(new pstats::cache_t); // if (!context->cache2) context->cache2.reset(new pstats::cache_t); } @@ -817,7 +827,10 @@ namespace Moses sptr const& scope = ttask->GetScope(); if (!scope) return; - + + sptr const> input = ttask->GetContextWindow(); + if (!input) return; + sptr context = scope->get(bt.get(), true); boost::unique_lock lock(context->lock); if (context->bias) return; @@ -825,18 +838,18 @@ namespace Moses if (!context->cache1) context->cache1.reset(new pstats::cache_t); if (!context->cache2) context->cache2.reset(new pstats::cache_t); - sptr iowrapper = ttask->GetIOWrapper(); - vector input; - input.reserve(iowrapper->GetPastInput().size() + - iowrapper->GetFutureInput().size()); - BOOST_FOREACH(sptr const& s, iowrapper->GetPastInput()) - input.push_back(s->ToString()); - BOOST_FOREACH(sptr const& s, iowrapper->GetFutureInput()) - input.push_back(s->ToString()); + // sptr iowrapper = ttask->GetIOWrapper(); + // vector input; + // input.reserve(iowrapper->GetPastInput().size() + + // iowrapper->GetFutureInput().size()); + // BOOST_FOREACH(sptr const& s, iowrapper->GetPastInput()) + // input.push_back(s->ToString()); + // BOOST_FOREACH(sptr const& s, iowrapper->GetFutureInput()) + // input.push_back(s->ToString()); size_t N = 10 * m_default_sample_size; 
VERBOSE(1,"Priming bias for ranking. [" << HERE << "]" << endl); - context->bias = prime_sampling1(*bt->V1, *bt->I1, input, N); + context->bias = prime_sampling1(*bt->V1, *bt->I1, *input, N); VERBOSE(1,"Done. [" << HERE << "]" << endl); } @@ -872,18 +885,18 @@ namespace Moses sptr localcache = scope->get(cache_key); if (!localcache) { - if (context->bias) localcache.reset(new TPCollCache(m_cache_size)); - else localcache = m_cache; - scope->set(cache_key, localcache); + if (context->bias) localcache.reset(new TPCollCache(m_cache_size)); + else localcache = m_cache; + scope->set(cache_key, localcache); } if (m_lr_func_name.size() && m_lr_func == NULL) { - FeatureFunction* lr = &FeatureFunction::FindFeatureFunction(m_lr_func_name); - m_lr_func = dynamic_cast(lr); - UTIL_THROW_IF2(lr == NULL, "FF " << m_lr_func_name - << " does not seem to be a lexical reordering function!"); - // todo: verify that lr_func implements a hierarchical reordering model + FeatureFunction* lr = &FeatureFunction::FindFeatureFunction(m_lr_func_name); + m_lr_func = dynamic_cast(lr); + UTIL_THROW_IF2(lr == NULL, "FF " << m_lr_func_name + << " does not seem to be a lexical reordering function!"); + // todo: verify that lr_func implements a hierarchical reordering model } } @@ -907,18 +920,18 @@ namespace Moses TSA::tree_iterator mfix(btfix->I1.get(),&myphrase[0],myphrase.size()); if (mfix.size() == myphrase.size()) { - sptr context = scope->get(btfix.get(), true); - uint64_t pid = mfix.getPid(); - if (!context->cache1->get(pid)) - { - BitextSampler s(btfix.get(), mfix, context->bias, - m_default_sample_size, m_sampling_method); - if (*context->cache1->get(pid, s.stats()) == s.stats()) - m_thread_pool->add(s); - } - // btfix->prep(ttask, mfix); - // cerr << phrase << " " << mfix.approxOccurrenceCount() << endl; - return true; + sptr context = scope->get(btfix.get(), true); + uint64_t pid = mfix.getPid(); + if (!context->cache1->get(pid)) + { + BitextSampler s(btfix.get(), mfix, context->bias, + m_default_sample_size, m_sampling_method); + if (*context->cache1->get(pid, s.stats()) == s.stats()) + m_thread_pool->add(s); + } + // btfix->prep(ttask, mfix); + // cerr << phrase << " " << mfix.approxOccurrenceCount() << endl; + return true; } sptr > dyn; @@ -930,10 +943,10 @@ namespace Moses TSA::tree_iterator mdyn(dyn->I1.get()); if (dyn->I1.get()) { - for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i) - mdyn.extend(myphrase[i]); - // let's assume a uniform bias over the foreground corpus - if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn); + for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i) + mdyn.extend(myphrase[i]); + // let's assume a uniform bias over the foreground corpus + if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn); } return mdyn.size() == myphrase.size(); } diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index 77c955af5..bea1bcfc2 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // Sampling phrase table implementation based on memory-mapped suffix arrays. // Design and code by Ulrich Germann. 
#pragma once diff --git a/moses/TranslationModel/UG/sapt_phrase_scorers.h b/moses/TranslationModel/UG/sapt_phrase_scorers.h index efdcaffaa..fa2425c14 100644 --- a/moses/TranslationModel/UG/sapt_phrase_scorers.h +++ b/moses/TranslationModel/UG/sapt_phrase_scorers.h @@ -2,14 +2,15 @@ // Phrase scoring functions for suffix array-based phrase tables // written by Ulrich Germann #pragma once -#include "sapt_pscore_unaligned.h" // count # of unaligned words -#include "sapt_pscore_provenance.h" // reward for joint phrase occ. per corpus -#include "sapt_pscore_rareness.h" // penalty for rare occurrences (global?) -#include "sapt_pscore_logcnt.h" // logs of observed counts -#include "sapt_pscore_lex1.h" // plain vanilla Moses lexical scores -#include "sapt_pscore_pfwd.h" // fwd phrase prob -#include "sapt_pscore_pbwd.h" // bwd phrase prob -#include "sapt_pscore_coherence.h" // coherence feature: good/sample-size -#include "sapt_pscore_phrasecount.h" // phrase count -#include "sapt_pscore_wordcount.h" // word count +#include "sapt_pscore_unaligned.h" // count # of unaligned words +#include "sapt_pscore_provenance.h" // reward for joint phrase occ. per corpus +#include "sapt_pscore_rareness.h" // penalty for rare occurrences (global?) +#include "sapt_pscore_length_ratio.h" // model of phrase length ratio +#include "sapt_pscore_logcnt.h" // logs of observed counts +#include "sapt_pscore_lex1.h" // plain vanilla Moses lexical scores +#include "sapt_pscore_pfwd.h" // fwd phrase prob +#include "sapt_pscore_pbwd.h" // bwd phrase prob +#include "sapt_pscore_coherence.h" // coherence feature: good/sample-size +#include "sapt_pscore_phrasecount.h" // phrase count +#include "sapt_pscore_wordcount.h" // word count #include "sapt_pscore_cumulative_bias.h" // cumulative bias score diff --git a/moses/TranslationModel/UG/sapt_pscore_length_ratio.h b/moses/TranslationModel/UG/sapt_pscore_length_ratio.h new file mode 100644 index 000000000..ea1069c59 --- /dev/null +++ b/moses/TranslationModel/UG/sapt_pscore_length_ratio.h @@ -0,0 +1,70 @@ +// -*- mode: c++; intent-tabs-mode: nil; tab-width: 2 -*- +// Phrase scorer that considers the length ratio of the two phrases. +// Written by Ulrich Germann. +// +// Phrase pair generation is modeled as a bernully experiment with a biased coin: +// heads: produce a word in L1, tails: produce a word in L2 +// total number of coin tosses: len(phrase 1) + len(phrase 2) +// probability p(w from L1) = length(corpus 1) / (length(corpus 1) + length(corpus 2) +#pragma once + +#include "sapt_pscore_base.h" +#include +#include +#include "mm/ug_ttrack_base.h" +using namespace std; +namespace Moses { + namespace bitext { + + + // // return the probability that a phrase length ratio is as extrem as + // // or more extreme as alen:blen. 
Based on a binomial experiment with + // // (alen + blen) trials and the probability of producing ratio L1 tokens per + // // L2 token + // float + // length_ratio_prob(float const alen, float const blen, float const ratio) + // { + // if (alen + blen == 0) return 1; + // float p = 1./(1 + ratio); + // boost::math::binomial bino(alen+blen,p); + // if (blen/(alen+blen) < p) + // return cdf(bino,blen); + // else + // return cdf(complement(bino,blen - 1)); + // } + + template + class + PScoreLengthRatio : public PhraseScorer + { + public: + PScoreLengthRatio(std::string const& spec) + { + this->m_feature_names.push_back("lenrat"); + this->m_num_feats = this->m_feature_names.size(); + } + + bool + isIntegerValued(int i) const { return false; } + + void + operator()(Bitext const& bt, + PhrasePair& pp, + vector * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + float p = float(bt.T1->numTokens()); + p /= bt.T1->numTokens() + bt.T2->numTokens(); + float len1 = ugdiss::len_from_pid(pp.p1); + float len2 = ugdiss::len_from_pid(pp.p2); + + boost::math::binomial binomi(len1 + len2, p); + float& x = (*dest)[this->m_index]; + if (len2/(len1 + len2) < p) + x = log(boost::math::cdf(binomi,len2)); + else + x = log(boost::math::cdf(boost::math::complement(binomi,len2 - 1))); + } + }; + } // namespace bitext +} // namespace Moses diff --git a/moses/TranslationTask.cpp b/moses/TranslationTask.cpp index ad3e3bb56..30535eb3c 100644 --- a/moses/TranslationTask.cpp +++ b/moses/TranslationTask.cpp @@ -23,11 +23,11 @@ using namespace std; namespace Moses { -std::string const& -TranslationTask -::GetContextString() const +boost::shared_ptr > +TranslationTask:: +GetContextWindow() const { - return m_context_string; + return m_context; } std::map const& @@ -44,15 +44,15 @@ TranslationTask } void -TranslationTask -::SetContextString(std::string const& context) +TranslationTask:: +SetContextWindow(boost::shared_ptr > const& cw) { - m_context_string = context; + m_context = cw; } void -TranslationTask -::SetContextWeights(std::string const& context_weights) +TranslationTask:: +SetContextWeights(std::string const& context_weights) { std::vector tokens = Tokenize(context_weights,":"); for (std::vector::iterator it = tokens.begin(); it != tokens.end(); it++) { diff --git a/moses/TranslationTask.h b/moses/TranslationTask.h index 4cb817451..54733ac73 100644 --- a/moses/TranslationTask.h +++ b/moses/TranslationTask.h @@ -66,7 +66,7 @@ protected: // task is still live or not, or maintain a shared_ptr to ensure the // task stays alive till it's done with it. 
- std::string m_context_string; + boost::shared_ptr<std::vector<std::string> > m_context; std::map<std::string,float> m_context_weights; public: @@ -124,8 +124,11 @@ public: return m_scope; } - std::string const& GetContextString() const; - void SetContextString(std::string const& context); + boost::shared_ptr<std::vector<std::string> > + GetContextWindow() const; + + void + SetContextWindow(boost::shared_ptr<std::vector<std::string> > const& cw); std::map<std::string,float> const& GetContextWeights() const; void SetContextWeights(std::string const& context_weights); diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index bc2b5032b..5c23bbf13 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -274,7 +274,8 @@ parse_request(std::map<std::string, xmlrpc_c::value> const& params) si = params.find("context"); if (si != params.end()) { - m_context_string = xmlrpc_c::value_string(si->second); + string context = xmlrpc_c::value_string(si->second); + m_context.reset(new std::vector<std::string>(1,context)); } // // biased sampling for suffix-array-based sampling phrase table? // if ((si = params.find("bias")) != params.end()) From ce625656475ec2bd5a6eced22b791897fe399c75 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Wed, 22 Jul 2015 00:00:54 +0100 Subject: [PATCH 161/286] daily automatic beautifier --- moses/ExportInterface.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp index f457d4879..2b6db30d8 100644 --- a/moses/ExportInterface.cpp +++ b/moses/ExportInterface.cpp @@ -168,14 +168,14 @@ run_as_server() myRegistry.addMethod("optimize", optimizer); xmlrpc_c::serverAbyss myAbyssServer( - xmlrpc_c::serverAbyss::constrOpt() - .registryP(&myRegistry) - .portNumber(port) // TCP port on which to listen - .logFileName(logfile) - .allowOrigin("*") - .maxConn((unsigned int)num_threads) - ); - + xmlrpc_c::serverAbyss::constrOpt() - .registryP(&myRegistry) + .portNumber(port) // TCP port on which to listen + .logFileName(logfile) + .allowOrigin("*") + .maxConn((unsigned int)num_threads) + ); + XVERBOSE(1,"Listening on port " << port << endl); if (isSerial) { while(1) myAbyssServer.runOnce(); From 56da7122837ef8f2fa245739d0d5b39b03bcb7e9 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 23 Jul 2015 00:06:54 +0100 Subject: [PATCH 162/286] Bug fix: TargetPhrase::GetTtask() must return shared pointer, not reference to shared pointer.
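
The rationale for the GetTtask() change that follows: the member m_ttask is evidently a weak pointer (its lock() is called), and weak_ptr::lock() returns a temporary shared_ptr by value. Returning that temporary as a const reference hands callers a reference that dangles as soon as the return statement finishes. A minimal sketch of the failure mode and of the fix, using illustrative names (Task, Holder) rather than the actual Moses classes:

    #include <boost/shared_ptr.hpp>
    #include <boost/weak_ptr.hpp>

    struct Task { int id; };

    struct Holder {
      boost::weak_ptr<Task> m_task;

      // Buggy: lock() creates a temporary shared_ptr; binding it to the
      // returned reference leaves the caller with a dangling reference.
      const boost::shared_ptr<Task>& GetTaskRef() const { return m_task.lock(); }

      // Fixed: return by value, so the caller shares ownership and the
      // pointee stays alive at least as long as the returned pointer does.
      const boost::shared_ptr<Task> GetTask() const { return m_task.lock(); }
    };
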
--- moses/TargetPhrase.cpp | 2 +- moses/TargetPhrase.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index 8e95fc0aa..da1d9edae 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -181,7 +181,7 @@ bool TargetPhrase::HasTtaskSPtr() const { return m_ttask_flag; } -const ttasksptr& TargetPhrase::GetTtask() const { +const ttasksptr TargetPhrase::GetTtask() const { return m_ttask.lock(); } diff --git a/moses/TargetPhrase.h b/moses/TargetPhrase.h index 460dcc33f..56ed27af3 100644 --- a/moses/TargetPhrase.h +++ b/moses/TargetPhrase.h @@ -92,7 +92,7 @@ public: TargetPhrase(ttasksptr &ttask, const PhraseDictionary *pt = NULL); TargetPhrase(ttasksptr &ttask, std::string out_string, const PhraseDictionary *pt = NULL); explicit TargetPhrase(ttasksptr &ttask, const Phrase &targetPhrase, const PhraseDictionary *pt); - const ttasksptr& GetTtask() const; + const ttasksptr GetTtask() const; bool HasTtaskSPtr() const; ~TargetPhrase(); From 053037816bb3ec648af689d5ed7d4bd5151f7f7e Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 23 Jul 2015 00:08:33 +0100 Subject: [PATCH 163/286] Increased verbosity threshold for logging document map. --- moses/TranslationModel/UG/mm/ug_mm_bitext.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mm/ug_mm_bitext.h b/moses/TranslationModel/UG/mm/ug_mm_bitext.h index 82a007a9d..63feb9427 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_mm_bitext.h @@ -46,7 +46,7 @@ namespace Moses this->m_docname.push_back(docname); line >> b; #ifndef NO_MOSES - VERBOSE(1, "DOCUMENT MAP " << docname << " " << a << "-" << b+a << std::endl); + VERBOSE(2, "DOCUMENT MAP " << docname << " " << a << "-" << b+a << std::endl); #endif for (b += a; a < b; ++a) (*this->m_sid2docid)[a] = docid; From 09d0909e0fa16793156ce6cfe62f1b822edb2137 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 23 Jul 2015 00:10:52 +0100 Subject: [PATCH 164/286] Trying to make sampling more efficient for large document collections underlying the sampling phrase table. --- .../UG/mm/ug_sampling_bias.cc | 61 +++++++++++++------ 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index 7bb2e2afe..37e114ff1 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -31,6 +31,7 @@ namespace Moses // std::string response = c.content(); // std::cerr << "SERVER RESPONSE: " << response << std::endl; + UTIL_THROW_IF2(c.content().size() == 0, "No response from bias server!"); return c.content(); } @@ -52,29 +53,33 @@ namespace Moses DocumentBias(std::vector const& sid2doc, std::map const& docname2docid, std::string const& server_url, std::string const& text, - std::ostream* log) + std::ostream* _log) : SamplingBias(&sid2doc) - , m_bias(docname2docid.size(), 0) + // , m_bias(docname2docid.size(), 0) { - // #ifdef HAVE_CURLPP - // #ifndef NO_MOSES + this->log = _log; +#ifndef NO_MOSES Timer timer; - if (log) timer.start(NULL); + if (_log) timer.start(NULL); +#endif std::string json = query_bias_server(server_url, text); // std::cerr << "SERVER RESPONSE " << json << std::endl; init_from_json(json, docname2docid, log); - if (log) *log << "Bias query took " << timer << " seconds." << std::endl; +#ifndef NO_MOSES + if (_log) *_log << "Bias query took " << timer << " seconds." 
<< std::endl; +#endif } DocumentBias:: DocumentBias(std::vector const& sid2doc, std::map const& docname2docid, std::map const& context_weights, - std::ostream* log) + std::ostream* _log) : SamplingBias(&sid2doc) - , m_bias(docname2docid.size(), 0) + // , m_bias(docname2docid.size(), 0) { - init(context_weights, docname2docid); + this->log = _log; + init(context_weights, docname2docid); } std::map& SamplingBias::getBiasMap() { @@ -144,25 +149,41 @@ namespace Moses init(std::map const& biasmap, std::map const& docname2docid) { - typedef std::map::value_type doc_record; + typedef std::map::value_type bias_record; float total = 0; - BOOST_FOREACH(doc_record const& d, docname2docid) + BOOST_FOREACH(bias_record const& b, biasmap) { - std::map::const_iterator m = biasmap.find(d.first); - if (m != biasmap.end()) total += (m_bias[d.second] = m->second); - } - if (total) { BOOST_FOREACH(float& f, m_bias) f /= total; } - BOOST_FOREACH(doc_record const& d, docname2docid) - std::cerr << "BIAS " << d.first << " " << m_bias[d.second] << std::endl; + std::map::const_iterator m = docname2docid.find(b.first); + if (m != docname2docid.end()) + total += (m_bias[m->second] = b.second); + } + if (total) + { + typedef std::map::value_type item; + BOOST_FOREACH(item& i, m_bias) i.second /= total; + } + + if (log) + { + BOOST_FOREACH(bias_record const& b, biasmap) + { + std::map::const_iterator m = docname2docid.find(b.first); + if (m != docname2docid.end()) + *log << "BIAS " << b.first << " " << m_bias[m->second] << std::endl; + else + *log << "WARNING: bias reported for unknown document " << b.first << std::endl; + } + } } float DocumentBias:: operator[](id_type const idx) const { - UTIL_THROW_IF2(idx >= m_sid2docid->size(), "Out of bounds: " - << idx << "/" << m_sid2docid->size()); - return m_bias[(*m_sid2docid)[idx]]; + // UTIL_THROW_IF2(idx >= m_sid2docid->size(), "Out of bounds: " + // << idx << "/" << m_sid2docid->size()); + std::map::const_iterator m = m_bias.find((*m_sid2docid)[idx]); + return m != m_bias.end() ? m->second : 0; } size_t From f1bde0af05166626935c4ddfd12bf8a4b2f501b4 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 23 Jul 2015 00:12:00 +0100 Subject: [PATCH 165/286] Map instead of vector for bias map in SamplingBias. --- moses/TranslationModel/UG/mm/ug_sampling_bias.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index cb6861774..ad7735bc1 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -23,7 +23,8 @@ namespace Moses SamplingBias(std::vector const* sid2docid); int loglevel; std::ostream* log; - std::map m_bias_map; //Map to store the biasmap as you get it from the server + // Map to store the biasmap as you get it from the server: + std::map m_bias_map; std::map& getBiasMap(); virtual float operator[](id_type const ID) const = 0; @@ -40,7 +41,8 @@ namespace Moses class DocumentBias : public SamplingBias { - std::vector m_bias; + // std::vector m_bias; + std::map m_bias; public: DocumentBias(std::vector const& sid2doc, From 8e393c79ab58053a3738835e6941c1ef1ae21cb3 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 23 Jul 2015 00:12:34 +0100 Subject: [PATCH 166/286] Logging of priming time in ranked sampling. 
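
The two DocumentBias patches above replace the dense per-document vector of bias weights with a map keyed by document id: memory now scales with the number of documents the bias server actually reports rather than with the whole collection, and documents without a reported weight implicitly receive a bias of 0. The lookup pattern, reduced to its essentials (id_type is assumed here to stand for the unsigned integer id defined in tpt_typedefs.h):

    #include <map>

    typedef unsigned int id_type; // stand-in for the tpt_typedefs.h definition

    // Sparse document bias: only documents with a reported weight are stored.
    float bias_for(std::map<id_type, float> const& bias, id_type docid)
    {
      std::map<id_type, float>::const_iterator m = bias.find(docid);
      return m != bias.end() ? m->second : 0; // unknown document => zero bias
    }
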
--- moses/TranslationModel/UG/mmsapt.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 48bada2d9..a058d76dc 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -14,6 +14,7 @@ #include #include "util/exception.hh" #include +#include "util/usage.hh" namespace Moses { @@ -849,8 +850,11 @@ namespace Moses size_t N = 10 * m_default_sample_size; VERBOSE(1,"Priming bias for ranking. [" << HERE << "]" << endl); + + double t = util::WallTime(); context->bias = prime_sampling1(*bt->V1, *bt->I1, *input, N); - VERBOSE(1,"Done. [" << HERE << "]" << endl); + VERBOSE(1,"Priming took " << util::WallTime() - t << " sec. (wall) " + << "[" << HERE << "]" << endl); } From 8da4804631febe6603ea10dfdb10ad680bff6fc5 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 23 Jul 2015 00:13:19 +0100 Subject: [PATCH 167/286] Initial check-in. --- moses/TranslationModel/UG/test-domspec.cc | 126 ++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 moses/TranslationModel/UG/test-domspec.cc diff --git a/moses/TranslationModel/UG/test-domspec.cc b/moses/TranslationModel/UG/test-domspec.cc new file mode 100644 index 000000000..d4ab504d7 --- /dev/null +++ b/moses/TranslationModel/UG/test-domspec.cc @@ -0,0 +1,126 @@ +// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*- +// test domain specificity +// Written by Ulrich Germann + +#include +#include +#include +#include "mm/ug_bitext.h" +#include "mm/tpt_typedefs.h" +#include "mm/ug_prime_sampling1.h" +#include "mm/ug_bitext_sampler.h" +#include "mm/ug_phrasepair.h" +#include "mm/ug_lru_cache.h" +#include "generic/sorting/VectorIndexSorter.h" +#include "generic/sorting/NBestList.h" +#include +#include +#include "moses/thread_safe_container.h" +#include "mm/ug_prep_phrases.h" + +using namespace std; +using namespace Moses; +using namespace Moses::bitext; +namespace po=boost::program_options; +using namespace boost::algorithm; +typedef L2R_Token Token; +typedef mmBitext mmbitext; +typedef Bitext::tsa tsa; +typedef Bitext::iter iter; +typedef imTtrack imttrack; +typedef imTSA imtsa; +typedef vector > pplist_t; + +string bname, bname1, bname2, ifile, L1, L2, Q1, Q2; +size_t maxhits; +size_t cache_size; +void interpret_args(int ac, char* av[]); + +typedef PhrasePair::SortDescendingByJointCount sorter_t; +sorter_t sorter; + +void +show(Bitext const& B, iter const& m, pstats& stats) +{ + pplist_t pplist; + expand(m, B, stats, pplist, NULL); + if (pplist.empty()) return; + cout << "\n" << m.str(B.V1.get()) << " [" << m.ca() << "]" << endl; + VectorIndexSorter, sorter_t> viso(pplist, sorter); + sptr > ranked = viso.GetOrder(); + size_t ctr=0; + BOOST_FOREACH(size_t const i, *ranked) + { + PhrasePair const& pp = pplist[i]; + cout << boost::format(" %6d | ") % pp.joint + << toString(*B.V2, pp.start2, pp.len2) << endl; + typedef map::value_type entry_t; + BOOST_FOREACH(entry_t const& e, pp.indoc) + { + cout << float(pp.joint)/pp.raw1 * stats.indoc[e.first] + << "/" << e.second << "/" << stats.indoc[e.first] << endl; + } + } +} + + +void +process(Bitext const* bitext, TSA::tree_iterator& m) +{ + if (m.approxOccurrenceCount() <= 5000) return; + boost::shared_ptr nil; + Moses::bitext::sampling_method random = Moses::bitext::random_sampling; + Moses::bitext::BitextSampler s(bitext, m, nil, 10000, random); + s(); + show(*bitext, m, *s.stats()); + if (m.down()) + { + do { process(bitext, m); } while (m.over()); + 
m.up(); + } +} + +int main(int argc, char* argv[]) +{ + interpret_args(argc, argv); + mmbitext B; + B.open(bname, L1, L2); + TSA::tree_iterator m(B.I1.get()); + process(&B, m); +} + +void +interpret_args(int ac, char* av[]) +{ + po::variables_map vm; + po::options_description o("Options"); + o.add_options() + ("help,h", "print this message") + ; + + po::options_description h("Hidden Options"); + h.add_options() + ("bname", po::value(&bname), "base name of corpus") + ("L1", po::value(&L1), "L1 tag") + ("L2", po::value(&L2), "L2 tag") + ; + + h.add(o); + po::positional_options_description a; + a.add("bname",1); + a.add("L1",1); + a.add("L2",1); + + po::store(po::command_line_parser(ac,av) + .options(h) + .positional(a) + .run(),vm); + po::notify(vm); + if (vm.count("help")) + { + cout << "\nusage:\n\t" << av[0] + << " " << endl; + cout << o << endl; + exit(0); + } +} From e53ad4085942872f1c4ce75cb99afe66137e1e17 Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Thu, 23 Jul 2015 10:37:26 +0100 Subject: [PATCH 168/286] Support for nplm in ems --- scripts/ems/experiment.meta | 8 ++++- scripts/ems/experiment.perl | 34 +++++++++++++++++++- scripts/training/create_nplm_ini.py | 50 +++++++++++++++++++++++++++++ scripts/training/train-neurallm.py | 9 +++--- 4 files changed, 95 insertions(+), 6 deletions(-) create mode 100755 scripts/training/create_nplm_ini.py diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index f876acc25..1d38881b2 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -182,6 +182,12 @@ train-bilingual-lm ignore-unless: bilingual-lm rerun-on-change: numberized_ngrams default-name: lm/blm +train-nplm + in: stripped-corpus + out: binlm + ignore-unless: nplm + rerun-on-change: stripped-corpus + default-name: lm/nplm get-corpus in: get-corpus-script out: raw-corpus @@ -279,7 +285,7 @@ train in: stripped-corpus out: lm default-name: lm/lm - ignore-if: rlm-training custom-training bilingual-lm + ignore-if: rlm-training custom-training bilingual-lm nplm rerun-on-change: lm-training order settings template: $lm-training -order $order $settings -text IN -lm OUT error: cannot execute binary file diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 2198c22a4..8f70471c6 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1140,6 +1140,9 @@ sub define_step { } elsif ($DO_STEP[$i] =~ /^LM:(.+):prepare-bilingual-lm$/) { &define_lm_prepare_bilingual_lm($i,$1); + } + elsif ($DO_STEP[$i] =~ /^LM:(.+):train-nplm$/) { + &define_lm_train_nplm($i,$1); } elsif ($DO_STEP[$i] eq 'TRAINING:prepare-data') { &define_training_prepare_data($i); @@ -1830,6 +1833,32 @@ sub define_lm_prepare_bilingual_lm { &create_step($step_id,$cmd); } +sub define_lm_train_nplm { + my ($step_id,$set) = @_; + my ($working_dir, $corpus) = &get_output_and_input($step_id); + my $scripts = &check_backoff_and_get("LM:moses-script-dir"); + my $cmd = "$scripts/training/train-neurallm.py --mmap --working-dir $working_dir --corpus $corpus"; + my $nplm_dir = &check_backoff_and_get("LM:$set:nplm-dir"); + $cmd .= " --nplm-home $nplm_dir"; + + my $epochs = &backoff_and_get("LM:$set:epochs"); + $epochs = 2 unless defined($epochs); + $cmd .= " --epochs $epochs"; + + my $nplm_settings = backoff_and_get("LM:$set:nplm-settings"); + $cmd .= " $nplm_settings" if defined($nplm_settings); + + my $order = &backoff_and_get("LM:$set:order"); + $order = 5 unless defined($order); + $cmd .= " --order $order"; + + # Create the ini file + $cmd .= "\n"; + $cmd .= 
"$scripts/training/create_nplm_ini.py -w $working_dir -e $epochs -x $set -n $order"; + + &create_step($step_id,$cmd); +} + sub get_bilingual_lm_order { my ($set) = @_; my $order = &backoff_and_get("LM:$set:order"); @@ -2669,6 +2698,8 @@ sub define_training_create_config { if (&get("LM:$set:config-feature-line") && &get("LM:$set:config-weight-line")) { $feature_lines .= &get("LM:$set:config-feature-line") . ";"; $weight_lines .= &get("LM:$set:config-weight-line") . ";"; + } elsif (&get("LM:$set:nplm")) { + push(@additional_ini_files, "$lm/nplm.ini"); } elsif (&get("LM:$set:bilingual-lm")) { push(@additional_ini_files, "$lm/blm.ini"); } else { @@ -2870,7 +2901,8 @@ sub get_interpolated_lm_sets { my $count=0; my $icount=0; foreach my $set (@LM_SETS) { - next if (&get("LM:$set:exclude-from-interpolation")) or (&get("LM:$set:bilingual-lm")); + next if (&get("LM:$set:exclude-from-interpolation")) or (&get("LM:$set:bilingual-lm")) + or (&get("LM:$set:nplm")); my $order = &check_backoff_and_get("LM:$set:order"); my $factor = 0; diff --git a/scripts/training/create_nplm_ini.py b/scripts/training/create_nplm_ini.py new file mode 100755 index 000000000..557de511e --- /dev/null +++ b/scripts/training/create_nplm_ini.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +import argparse +import os +import os.path +import sys + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-w", "--working-dir", dest="working_dir") + parser.add_argument("-n", "--order", dest="n") + parser.add_argument("-i", "--ini_filename", dest="ini_filename") + parser.add_argument("-x", "--name", dest="name") + parser.add_argument("-e", "--epochs", dest="epochs") + parser.add_argument("-f", "--factor", dest="factor") + + + parser.set_defaults( + working_dir="working", + n = "5", + ini_filename = "nplm.ini", + name = "neural", + epochs = "10", + factor = "0" + ) + + options = parser.parse_args() + + if not os.path.exists(options.working_dir): + os.makedirs(options.working_dir) + + + ini_filename = os.path.join(options.working_dir,options.ini_filename) + with open(ini_filename,"w") as ifh: + print>>ifh, "[feature]" + print>>ifh,"NeuralLM factor=%s name=NPLM%s order=%s path=%s/train.model.nplm.%s" \ + % (options.factor,options.name, options.n, options.working_dir, options.epochs) + print>>ifh + print>>ifh,"[weight]" + print>>ifh,"NPLM%s= 0.1" % options.name + print>>ifh + + +if __name__ == "__main__": + main() + diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py index 00da64986..625fc69d0 100755 --- a/scripts/training/train-neurallm.py +++ b/scripts/training/train-neurallm.py @@ -122,10 +122,11 @@ def main(options): if options.output_dir is None: options.output_dir = options.working_dir - else: - # Create output dir if necessary - if not os.path.exists(options.output_dir): - os.makedirs(options.output_dir) + # Create dirs if necessary + if not os.path.exists(options.working_dir): + os.makedirs(options.working_dir) + if not os.path.exists(options.output_dir): + os.makedirs(options.output_dir) numberized_file = os.path.basename(options.corpus_stem) + '.numberized' train_file = numberized_file From 5fca4c67a1f5485ada79723782bbf4df930bfb2e Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Fri, 24 Jul 2015 19:26:16 +0100 Subject: [PATCH 169/286] minor maintenance of sparse features code (PhrasePairFeature, RulePairUnlexicalizedSource, 
SoftMatchingFeature, SourceWordDeletionFeature, WordTranslationFeature) --- moses/FF/PhrasePairFeature.cpp | 106 +++++++++++++---------- moses/FF/PhrasePairFeature.h | 19 ++-- moses/FF/RulePairUnlexicalizedSource.cpp | 5 +- moses/FF/SoftMatchingFeature.cpp | 9 +- moses/FF/SoftMatchingFeature.h | 1 + moses/FF/SourceWordDeletionFeature.cpp | 3 +- moses/FF/WordTranslationFeature.cpp | 3 +- 7 files changed, 87 insertions(+), 59 deletions(-) diff --git a/moses/FF/PhrasePairFeature.cpp b/moses/FF/PhrasePairFeature.cpp index bb806f8e7..1e343877c 100644 --- a/moses/FF/PhrasePairFeature.cpp +++ b/moses/FF/PhrasePairFeature.cpp @@ -16,21 +16,29 @@ namespace Moses PhrasePairFeature::PhrasePairFeature(const std::string &line) :StatelessFeatureFunction(0, line) + ,m_unrestricted(false) + ,m_simple(true) + ,m_sourceContext(false) + ,m_domainTrigger(false) + ,m_ignorePunctuation(false) { - std::cerr << "Initializing PhrasePairFeature.." << std::endl; + VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ..."); ReadParameters(); - if (m_simple == 1) std::cerr << "using simple phrase pairs.. "; - if (m_sourceContext == 1) std::cerr << "using source context.. "; - if (m_domainTrigger == 1) std::cerr << "using domain triggers.. "; + if (m_simple == 1) VERBOSE(1, " Using simple phrase pairs."); + if (m_sourceContext == 1) VERBOSE(1, " Using source context."); + if (m_domainTrigger == 1) VERBOSE(1, " Using domain triggers."); // compile a list of punctuation characters if (m_ignorePunctuation) { - std::cerr << "ignoring punctuation for triggers.. "; + VERBOSE(1, " Ignoring punctuation for triggers."); char punctuation[] = "\"'!?¿·()#_,.:;•&@‑/\\0123456789~="; - for (size_t i=0; i < sizeof(punctuation)-1; ++i) + for (size_t i=0; i < sizeof(punctuation)-1; ++i) { m_punctuationHash[punctuation[i]] = 1; + } } + + VERBOSE(1, " Done." 
<< std::endl); } void PhrasePairFeature::SetParameter(const std::string& key, const std::string& value) @@ -76,7 +84,7 @@ void PhrasePairFeature::Load() } inFileSource.close(); - } else { + } else if (!m_unrestricted) { // restricted source word vocabulary ifstream inFileSource(m_filePathSource.c_str()); UTIL_THROW_IF2(!inFileSource, "could not open file " << m_filePathSource); @@ -101,8 +109,6 @@ void PhrasePairFeature::Load() } inFileTarget.close();*/ - - m_unrestricted = false; } } @@ -114,25 +120,6 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input , ScoreComponentCollection *estimatedFutureScore) const { const Phrase& source = inputPath.GetPhrase(); - if (m_simple) { - ostringstream namestr; - namestr << "pp_"; - namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString(); - for (size_t i = 1; i < source.GetSize(); ++i) { - const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId); - namestr << ","; - namestr << sourceFactor->GetString(); - } - namestr << "~"; - namestr << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString(); - for (size_t i = 1; i < targetPhrase.GetSize(); ++i) { - const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId); - namestr << ","; - namestr << targetFactor->GetString(); - } - - scoreBreakdown.SparsePlusEquals(namestr.str(),1); - } if (m_domainTrigger) { const Sentence& isnt = static_cast(input); const bool use_topicid = isnt.GetUseTopicId(); @@ -140,18 +127,18 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input // compute pair ostringstream pair; - pair << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString(); + pair << ReplaceTilde( source.GetWord(0).GetFactor(m_sourceFactorId)->GetString() ); for (size_t i = 1; i < source.GetSize(); ++i) { const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId); - pair << ","; - pair << sourceFactor->GetString(); + pair << "~"; + pair << ReplaceTilde( sourceFactor->GetString() ); } - pair << "~"; - pair << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString(); + pair << "~~"; + pair << ReplaceTilde( targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString() ); for (size_t i = 1; i < targetPhrase.GetSize(); ++i) { const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId); - pair << ","; - pair << targetFactor->GetString(); + pair << "~"; + pair << ReplaceTilde( targetFactor->GetString() ); } if (use_topicid || use_topicid_prob) { @@ -159,7 +146,7 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input // use topicid as trigger const long topicid = isnt.GetTopicId(); stringstream feature; - feature << "pp_"; + feature << m_description << "_"; if (topicid == -1) feature << "unk"; else @@ -173,13 +160,13 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input const vector &topicid_prob = *(isnt.GetTopicIdAndProb()); if (atol(topicid_prob[0].c_str()) == -1) { stringstream feature; - feature << "pp_unk_"; + feature << m_description << "_unk_"; feature << pair.str(); scoreBreakdown.SparsePlusEquals(feature.str(), 1); } else { for (size_t i=0; i+1 < topicid_prob.size(); i+=2) { stringstream feature; - feature << "pp_"; + feature << m_description << "_"; feature << topicid_prob[i]; feature << "_"; feature << pair.str(); @@ -193,7 +180,7 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input for (set::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) { string 
sourceTrigger = *p; ostringstream namestr; - namestr << "pp_"; + namestr << m_description << "_"; namestr << sourceTrigger; namestr << "_"; namestr << pair.str(); @@ -221,21 +208,21 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input if (m_unrestricted || sourceTriggerExists) { ostringstream namestr; - namestr << "pp_"; + namestr << m_description << "_"; namestr << sourceTrigger; namestr << "~"; - namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString(); + namestr << ReplaceTilde( source.GetWord(0).GetFactor(m_sourceFactorId)->GetString() ); for (size_t i = 1; i < source.GetSize(); ++i) { const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId); - namestr << ","; - namestr << sourceFactor->GetString(); + namestr << "~"; + namestr << ReplaceTilde( sourceFactor->GetString() ); } - namestr << "~"; - namestr << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString(); + namestr << "~~"; + namestr << ReplaceTilde( targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString() ); for (size_t i = 1; i < targetPhrase.GetSize(); ++i) { const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId); - namestr << ","; - namestr << targetFactor->GetString(); + namestr << "~"; + namestr << ReplaceTilde( targetFactor->GetString() ); } scoreBreakdown.SparsePlusEquals(namestr.str(),1); @@ -244,6 +231,31 @@ void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input } } +void PhrasePairFeature::EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const +{ + if (m_simple) { + ostringstream namestr; + namestr << m_description << "_"; + namestr << ReplaceTilde( source.GetWord(0).GetFactor(m_sourceFactorId)->GetString() ); + for (size_t i = 1; i < source.GetSize(); ++i) { + const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId); + namestr << "~"; + namestr << ReplaceTilde( sourceFactor->GetString() ); + } + namestr << "~~"; + namestr << ReplaceTilde( targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString() ); + for (size_t i = 1; i < targetPhrase.GetSize(); ++i) { + const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId); + namestr << "~"; + namestr << ReplaceTilde( targetFactor->GetString() ); + } + scoreBreakdown.SparsePlusEquals(namestr.str(),1); + } +} + bool PhrasePairFeature::IsUseable(const FactorMask &mask) const { bool ret = mask[m_targetFactorId]; diff --git a/moses/FF/PhrasePairFeature.h b/moses/FF/PhrasePairFeature.h index ff22340e9..587de6676 100644 --- a/moses/FF/PhrasePairFeature.h +++ b/moses/FF/PhrasePairFeature.h @@ -1,5 +1,4 @@ -#ifndef moses_PhrasePairFeature_h -#define moses_PhrasePairFeature_h +#pragma once #include #include @@ -32,6 +31,17 @@ class PhrasePairFeature: public StatelessFeatureFunction CharHash m_punctuationHash; std::string m_filePathSource; + inline std::string ReplaceTilde(const StringPiece &str) const + { + std::string out = str.as_string(); + size_t pos = out.find('~'); + while ( pos != std::string::npos ) { + out.replace(pos,1,""); + pos = out.find('~',pos); + } + return out; + }; + public: PhrasePairFeature(const std::string &line); @@ -43,8 +53,7 @@ public: void EvaluateInIsolation(const Phrase &source , const TargetPhrase &targetPhrase , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const { - } + , ScoreComponentCollection &estimatedFutureScore) const; 
void EvaluateTranslationOptionListWithSourceContext(const InputType &input , const TranslationOptionList &translationOptionList) const { @@ -69,5 +78,3 @@ public: } - -#endif diff --git a/moses/FF/RulePairUnlexicalizedSource.cpp b/moses/FF/RulePairUnlexicalizedSource.cpp index f490a2b1a..d65810af8 100644 --- a/moses/FF/RulePairUnlexicalizedSource.cpp +++ b/moses/FF/RulePairUnlexicalizedSource.cpp @@ -12,7 +12,7 @@ namespace Moses { RulePairUnlexicalizedSource::RulePairUnlexicalizedSource(const std::string &line) - : StatelessFeatureFunction(0, line) + : StatelessFeatureFunction(1, line) , m_glueRules(false) , m_nonGlueRules(true) , m_glueTargetLHSStr("Q") @@ -81,6 +81,9 @@ void RulePairUnlexicalizedSource::EvaluateInIsolation(const Phrase &source } scoreBreakdown.PlusEquals(this, namestr.str(), 1); + if ( targetPhraseLHS != m_glueTargetLHS ) { + scoreBreakdown.PlusEquals(this, 1); + } } } diff --git a/moses/FF/SoftMatchingFeature.cpp b/moses/FF/SoftMatchingFeature.cpp index b2d8e7ea5..1bdfe70c2 100644 --- a/moses/FF/SoftMatchingFeature.cpp +++ b/moses/FF/SoftMatchingFeature.cpp @@ -13,6 +13,7 @@ namespace Moses SoftMatchingFeature::SoftMatchingFeature(const std::string &line) : StatelessFeatureFunction(0, line) , m_softMatches(moses_MaxNumNonterminals) + , m_scoreIdentical(true) { ReadParameters(); } @@ -26,6 +27,8 @@ void SoftMatchingFeature::SetParameter(const std::string& key, const std::string } else if (key == "path") { const std::string filePath = value; Load(filePath); + } else if (key == "score-identical") { + m_scoreIdentical = Scan(value); } else { UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value); } @@ -80,8 +83,10 @@ void SoftMatchingFeature::EvaluateWhenApplied(const ChartHypothesis& hypo, const ChartHypothesis* prevHypo = hypo.GetPrevHypo(nonTermInd); const Word& prevLHS = prevHypo->GetTargetLHS(); - const std::string &name = GetOrSetFeatureName(word, prevLHS); - accumulator->PlusEquals(this,name,1); + if ( (word != prevLHS) || m_scoreIdentical ) { + const std::string &name = GetOrSetFeatureName(word, prevLHS); + accumulator->PlusEquals(this,name,1); + } } } } diff --git a/moses/FF/SoftMatchingFeature.h b/moses/FF/SoftMatchingFeature.h index d524a1d07..7bdcc7717 100644 --- a/moses/FF/SoftMatchingFeature.h +++ b/moses/FF/SoftMatchingFeature.h @@ -55,6 +55,7 @@ public: private: mutable std::vector > m_softMatches; // map RHS of new rule to list of possible LHS of old rule (subtree) mutable std::vector > m_nameCache; + bool m_scoreIdentical; #ifdef WITH_THREADS //reader-writer lock diff --git a/moses/FF/SourceWordDeletionFeature.cpp b/moses/FF/SourceWordDeletionFeature.cpp index 91a3137b4..936a844f3 100644 --- a/moses/FF/SourceWordDeletionFeature.cpp +++ b/moses/FF/SourceWordDeletionFeature.cpp @@ -38,9 +38,8 @@ void SourceWordDeletionFeature::SetParameter(const std::string& key, const std:: void SourceWordDeletionFeature::Load() { - if (m_filename == "") { + if (m_filename.empty()) return; - } FEATUREVERBOSE(1, "Loading source word deletion word list from " << m_filename << std::endl); ifstream inFile(m_filename.c_str()); diff --git a/moses/FF/WordTranslationFeature.cpp b/moses/FF/WordTranslationFeature.cpp index 05883df12..1059e34de 100644 --- a/moses/FF/WordTranslationFeature.cpp +++ b/moses/FF/WordTranslationFeature.cpp @@ -110,7 +110,8 @@ void WordTranslationFeature::Load() } inFileSource.close(); - } else { + } else if (!m_filePathSource.empty() || !m_filePathTarget.empty()) { + return; // restricted source word vocabulary ifstream 
inFileSource(m_filePathSource.c_str()); UTIL_THROW_IF2(!inFileSource, "could not open file " << m_filePathSource); From 9e31bced9afa395bc470de2c9958044e357574ad Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Fri, 24 Jul 2015 19:42:15 +0100 Subject: [PATCH 170/286] MinCount parameter in score-main --- phrase-extract/score-main.cpp | 71 +++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index 444087b2d..a09bd0543 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -70,6 +71,7 @@ bool nonTermContextTarget = false; int countOfCounts[COC_MAX+1]; int totalDistinct = 0; +float minCount = 0; float minCountHierarchical = 0; bool phraseOrientationPriorsFlag = false; @@ -107,7 +109,7 @@ void writeLeftHandSideLabelCounts( const boost::unordered_map const std::string &fileNameLeftHandSideSourceLabelCounts, const std::string &fileNameLeftHandSideTargetSourceLabelCounts ); void writeLabelSet( const std::set &labelSet, const std::string &fileName ); -void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile, +void processPhrasePairs( std::list< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb ); void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, std::ostream &phraseTableFile, const ScoreFeatureManager &featureManager, const MaybeLog &maybeLog ); double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ); @@ -131,14 +133,28 @@ int main(int argc, char* argv[]) ScoreFeatureManager featureManager; if (argc < 4) { std::cerr << - "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] " - "[--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] " - "[--NoWordAlignment] [--UnalignedPenalty] " + "syntax: score extract lex phrase-table " + "[--Inverse] " + "[--Hierarchical] " + "[--LogProb] " + "[--NegLogProb] " + "[--NoLex] " + "[--GoodTuring] " + "[--KneserNey] " + "[--NoWordAlignment] " + "[--UnalignedPenalty] " "[--UnalignedFunctionWordPenalty function-word-file] " - "[--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] " - "[--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] " - "[--TargetPreferenceLabels] [--UnpairedExtractFormat] " - "[--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl; + "[--MinCountHierarchical count] " + "[--PartsOfSpeech] " + "[--PCFG] " + "[--TreeFragments] " + "[--SourceLabels] " + "[--SourceLabelCountsLHS] " + "[--TargetPreferenceLabels] " + "[--UnpairedExtractFormat] " + "[--ConditionOnTargetLHS] " + "[--CrossedNonTerm]" + << std::endl; std::cerr << featureManager.usage() << std::endl; exit(1); } @@ -235,9 +251,13 @@ int main(int argc, char* argv[]) logProbFlag = true; negLogProb = -1; std::cerr << "using negative log-probabilities" << std::endl; + } else if (strcmp(argv[i],"--MinCount") == 0) { + minCount = Moses::Scan( argv[++i] ); + std::cerr << "dropping all phrase pairs occurring less than " << minCount << " times" << std::endl; + minCount -= 0.00001; // account for rounding } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) { minCountHierarchical = Moses::Scan( argv[++i] ); - std::cerr << "dropping all phrase pairs occurring less than " << 
minCountHierarchical << " times" << std::endl; + std::cerr << "dropping all hierarchical phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl; minCountHierarchical -= 0.00001; // account for rounding } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) { crossedNonTerm = true; @@ -325,8 +345,8 @@ int main(int argc, char* argv[]) // loop through all extracted phrase translations std::string line, lastLine; ExtractionPhrasePair *phrasePair = NULL; - std::vector< ExtractionPhrasePair* > phrasePairsWithSameSource; - std::vector< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible + std::list< ExtractionPhrasePair* > phrasePairsWithSameSource; + std::list< ExtractionPhrasePair* > phrasePairsWithSameSourceAndTarget; // required for hierarchical rules only, as non-terminal alignments might make the phrases incompatible int tmpSentenceId; PHRASE *tmpPhraseSource, *tmpPhraseTarget; @@ -390,7 +410,7 @@ int main(int argc, char* argv[]) // once the first of them has been found to have to be set to false if ( hierarchicalFlag ) { - for ( std::vector< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin(); + for ( std::list< ExtractionPhrasePair* >::const_iterator iter = phrasePairsWithSameSourceAndTarget.begin(); iter != phrasePairsWithSameSourceAndTarget.end(); ++iter ) { if ( (*iter)->Matches( tmpPhraseSource, tmpPhraseTarget, tmpTargetToSourceAlignment, sourceMatch, targetMatch, alignmentMatch ) ) { @@ -420,7 +440,7 @@ int main(int argc, char* argv[]) if ( !phrasePairsWithSameSource.empty() && !sourceMatch ) { processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb ); - for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); + for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); iter!=phrasePairsWithSameSource.end(); ++iter) { delete *iter; } @@ -455,7 +475,7 @@ int main(int argc, char* argv[]) std::cerr << std::endl; processPhrasePairs( phrasePairsWithSameSource, *phraseTableFile, featureManager, maybeLogProb ); - for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); + for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); iter!=phrasePairsWithSameSource.end(); ++iter) { delete *iter; } @@ -656,7 +676,7 @@ void writeLabelSet( const std::set &labelSet, const std::string &fi } -void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile, +void processPhrasePairs( std::list< ExtractionPhrasePair* > &phrasePairsWithSameSource, std::ostream &phraseTableFile, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb ) { if (phrasePairsWithSameSource.size() == 0) { @@ -668,14 +688,14 @@ void processPhrasePairs( std::vector< ExtractionPhrasePair* > &phrasePairsWithSa //std::cerr << "phrasePairs.size() = " << phrasePairs.size() << std::endl; // loop through phrase pairs - for ( std::vector< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); + for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); iter!=phrasePairsWithSameSource.end(); ++iter) { // add to total count totalSource += (*iter)->GetCount(); } // output the distinct phrase pairs, one at a time - for ( std::vector< 
ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); + for ( std::list< ExtractionPhrasePair* >::const_iterator iter=phrasePairsWithSameSource.begin(); iter!=phrasePairsWithSameSource.end(); ++iter) { // add to total count outputPhrasePair( **iter, totalSource, phrasePairsWithSameSource.size(), phraseTableFile, featureManager, maybeLogProb ); @@ -704,16 +724,15 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair, countOfCounts[ countInt ]++; } - // compute PCFG score - float pcfgScore = 0; - if (pcfgFlag && !inverseFlag) { - pcfgScore = phrasePair.GetPcfgScore() / count; - } - // output phrases const PHRASE *phraseSource = phrasePair.GetSource(); const PHRASE *phraseTarget = phrasePair.GetTarget(); + // do not output if count below threshold + if (count < minCount) { + return; + } + // do not output if hierarchical and count below threshold if (hierarchicalFlag && count < minCountHierarchical) { for(size_t j=0; jsize()-1; ++j) { @@ -722,6 +741,12 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair, } } + // compute PCFG score + float pcfgScore = 0; + if (pcfgFlag && !inverseFlag) { + pcfgScore = phrasePair.GetPcfgScore() / count; + } + // source phrase (unless inverse) if (!inverseFlag) { printSourcePhrase(phraseSource, phraseTarget, bestAlignmentT2S, phraseTableFile); From 472529ade857a69e01f81cac6675fa7eeb9c2ba9 Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Fri, 24 Jul 2015 20:43:29 +0100 Subject: [PATCH 171/286] Moses::Scan too inefficient --- phrase-extract/consolidate-main.cpp | 21 +++++++++++---------- phrase-extract/score-main.cpp | 9 +++++---- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index 732185eb3..c9496f988 100644 --- a/phrase-extract/consolidate-main.cpp +++ b/phrase-extract/consolidate-main.cpp @@ -17,6 +17,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ +#include #include #include @@ -123,7 +124,7 @@ int main(int argc, char* argv[]) std::cerr << "include "<< (sparseCountBinFeatureFlag ? 
"sparse " : "") << "count bin feature:"; int prev = 0; while(i+1='0' && argv[i+1][0]<='9') { - int binCount = Moses::Scan(argv[++i]); + int binCount = std::atoi( argv[++i] ); countBin.push_back( binCount ); if (prev+1 == binCount) { std::cerr << " " << binCount; @@ -164,8 +165,8 @@ int main(int argc, char* argv[]) } pos = single_setting.find(":"); UTIL_THROW_IF2(pos == std::string::npos, "faulty MinScore setting '" << single_setting << "' in '" << argv[i] << "'"); - unsigned int field = Moses::Scan( single_setting.substr(0,pos) ); - float threshold = Moses::Scan( single_setting.substr(pos+1) ); + unsigned int field = std::atoll( single_setting.substr(0,pos).c_str() ); + float threshold = std::atof( single_setting.substr(pos+1).c_str() ); if (field == 0) { minScore0 = threshold; std::cerr << "setting minScore0 to " << threshold << std::endl; @@ -195,9 +196,9 @@ void loadCountOfCounts( const std::string& fileNameCountOfCounts ) std::string line; while (getline(fileCountOfCounts, line)) { if (totalCount < 0) - totalCount = Moses::Scan(line); // total number of distinct phrase pairs + totalCount = std::atof( line.c_str() ); // total number of distinct phrase pairs else - countOfCounts.push_back( Moses::Scan(line) ); + countOfCounts.push_back( std::atof( line.c_str() ) ); } fileCountOfCounts.Close(); @@ -286,13 +287,13 @@ void processFiles( const std::string& fileNameDirect, Moses::Tokenize( directCounts, itemDirect[4] ); std::vector indirectCounts; Moses::Tokenize( indirectCounts, itemIndirect[4] ); - float countF = Moses::Scan(directCounts[0]); - float countE = Moses::Scan(indirectCounts[0]); - float countEF = Moses::Scan(indirectCounts[1]); + float countF = std::atof( directCounts[0].c_str() ); + float countE = std::atof( indirectCounts[0].c_str() ); + float countEF = std::atof( indirectCounts[1].c_str() ); float n1_F, n1_E; if (kneserNeyFlag) { - n1_F = Moses::Scan(directCounts[2]); - n1_E = Moses::Scan(indirectCounts[2]); + n1_F = std::atof( directCounts[2].c_str() ); + n1_E = std::atof( indirectCounts[2].c_str() ); } // Good Turing discounting diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index a09bd0543..09cec8fbe 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -252,11 +253,11 @@ int main(int argc, char* argv[]) negLogProb = -1; std::cerr << "using negative log-probabilities" << std::endl; } else if (strcmp(argv[i],"--MinCount") == 0) { - minCount = Moses::Scan( argv[++i] ); + minCount = std::atof( argv[++i] ); std::cerr << "dropping all phrase pairs occurring less than " << minCount << " times" << std::endl; minCount -= 0.00001; // account for rounding } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) { - minCountHierarchical = Moses::Scan( argv[++i] ); + minCountHierarchical = std::atof( argv[++i] ); std::cerr << "dropping all hierarchical phrase pairs occurring less than " << minCountHierarchical << " times" << std::endl; minCountHierarchical -= 0.00001; // account for rounding } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) { @@ -570,7 +571,7 @@ void processLine( std::string line, } else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count sscanf(token[j].c_str(), "%f", &count); } else if (item + (includeSentenceIdFlag?-1:0) == 5) { // target syntax PCFG score - float pcfgScore = Moses::Scan( token[j] ); + float pcfgScore = std::atof( token[j].c_str() ); pcfgSum = pcfgScore * count; } } @@ -1196,7 +1197,7 @@ void LexicalTable::load( 
const std::string &fileName ) continue; } - double prob = Moses::Scan( token[2] ); + double prob = std::atof( token[2].c_str() ); WORD_ID wordT = vcbT.storeIfNew( token[0] ); WORD_ID wordS = vcbS.storeIfNew( token[1] ); ltable[ wordS ][ wordT ] = prob; From 21aaec0105ffc69c48a4c8977b965af3e05c7a04 Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Fri, 24 Jul 2015 21:01:13 +0100 Subject: [PATCH 172/286] Removed some duplicate code. Can we move all or parts of moses/Util to util/, and from the Moses namespace to the util namespace? There's quite some common functionality in it that is not only relevant to the decoder, but also to phrase extraction and possibly other parts of the toolkit. --- phrase-extract/extract-lex-main.cpp | 9 ++--- phrase-extract/extract-lex.h | 53 ----------------------------- 2 files changed, 5 insertions(+), 57 deletions(-) diff --git a/phrase-extract/extract-lex-main.cpp b/phrase-extract/extract-lex-main.cpp index f63015a6a..78182396d 100644 --- a/phrase-extract/extract-lex-main.cpp +++ b/phrase-extract/extract-lex-main.cpp @@ -4,6 +4,7 @@ #include #include "extract-lex.h" #include "InputFileStream.h" +#include "moses/Util.h" using namespace std; using namespace MosesTraining; @@ -53,9 +54,9 @@ int main(int argc, char* argv[]) assert(isAlign); vector toksTarget, toksSource, toksAlign; - Tokenize(toksTarget, lineTarget); - Tokenize(toksSource, lineSource); - Tokenize(toksAlign, lineAlign); + Moses::Tokenize(toksTarget, lineTarget); + Moses::Tokenize(toksSource, lineSource); + Moses::Tokenize(toksAlign, lineAlign); /* cerr << endl @@ -99,7 +100,7 @@ void ExtractLex::Process(vector &toksTarget, vector &toksSource, const string &alignTok = *iterAlign; vector alignPos; - Tokenize(alignPos, alignTok, "-"); + Moses::Tokenize(alignPos, alignTok, "-"); assert(alignPos.size() == 2); if (alignPos[0] >= toksSource.size()) { diff --git a/phrase-extract/extract-lex.h b/phrase-extract/extract-lex.h index 044a32cf8..1d49465c8 100644 --- a/phrase-extract/extract-lex.h +++ b/phrase-extract/extract-lex.h @@ -9,59 +9,6 @@ namespace MosesTraining { - -//! convert string to variable of type T. Used to reading floats, int etc from files -template -inline T Scan(const std::string &input) -{ - std::stringstream stream(input); - T ret; - stream >> ret; - return ret; -} - - -//! speeded up version of above -template -inline void Scan(std::vector &output, const std::vector< std::string > &input) -{ - output.resize(input.size()); - for (size_t i = 0 ; i < input.size() ; i++) { - output[i] = Scan( input[i] ); - } -} - - -inline void Tokenize(std::vector &output - , const std::string& str - , const std::string& delimiters = " \t") -{ - // Skip delimiters at beginning. - std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); - // Find first "non-delimiter". - std::string::size_type pos = str.find_first_of(delimiters, lastPos); - - while (std::string::npos != pos || std::string::npos != lastPos) { - // Found a token, add it to the vector. - output.push_back(str.substr(lastPos, pos - lastPos)); - // Skip delimiters. 
Note the "not_of" - lastPos = str.find_first_not_of(delimiters, pos); - // Find next "non-delimiter" - pos = str.find_first_of(delimiters, lastPos); - } -} - -// speeded up version of above -template -inline void Tokenize( std::vector &output - , const std::string &input - , const std::string& delimiters = " \t") -{ - std::vector stringVector; - Tokenize(stringVector, input, delimiters); - return Scan(output, stringVector ); -} - class WordCount { friend std::ostream& operator<<(std::ostream&, const WordCount&); From 31df9593523974331990c0338d6717060b6bcc03 Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Fri, 24 Jul 2015 21:51:34 +0100 Subject: [PATCH 173/286] Model1Feature --- moses/FF/Model1Feature.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/moses/FF/Model1Feature.cpp b/moses/FF/Model1Feature.cpp index 09cfd47ab..3bde70cfc 100644 --- a/moses/FF/Model1Feature.cpp +++ b/moses/FF/Model1Feature.cpp @@ -75,7 +75,7 @@ void Model1Vocabulary::Load(const std::string& fileName) ++i; std::vector tokens = Tokenize(line); UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens."); - unsigned id = Scan(tokens[0]); + unsigned id = std::atoll( tokens[0].c_str() ); if (! ( (id == 1) && (tokens[1] == "UNK") )) { const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is know and filter the model on loading? bool stored = Store(factor, id); @@ -86,7 +86,7 @@ void Model1Vocabulary::Load(const std::string& fileName) ++i; std::vector tokens = Tokenize(line); UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens."); - unsigned id = Scan(tokens[0]); + unsigned id = std::atoll( tokens[0].c_str() ); const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is know and filter the model on loading? 
bool stored = Store(factor, id); UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry."); @@ -105,11 +105,11 @@ void Model1LexicalTable::Load(const std::string &fileName, const Model1Vocabular ++i; std::vector tokens = Tokenize(line); UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens."); - unsigned idS = Scan(tokens[0]); - unsigned idT = Scan(tokens[1]); + unsigned idS = std::atoll( tokens[0].c_str() ); + unsigned idT = std::atoll( tokens[1].c_str() ); const Factor* wordS = vcbS.GetWord(idS); const Factor* wordT = vcbT.GetWord(idT); - float prob = Scan(tokens[2]); + float prob = std::atof( tokens[2].c_str() ); if ( (wordS != NULL) && (wordT != NULL) ) { m_ltable[ wordS ][ wordT ] = prob; } From b64af59af64d9c8d5eb7600e25e74f98c32a3008 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Sat, 25 Jul 2015 00:00:40 +0100 Subject: [PATCH 174/286] daily automatic beautifier --- moses/FF/PhrasePairFeature.h | 3 +-- phrase-extract/score-main.cpp | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/moses/FF/PhrasePairFeature.h b/moses/FF/PhrasePairFeature.h index 587de6676..4d0e76f9f 100644 --- a/moses/FF/PhrasePairFeature.h +++ b/moses/FF/PhrasePairFeature.h @@ -31,8 +31,7 @@ class PhrasePairFeature: public StatelessFeatureFunction CharHash m_punctuationHash; std::string m_filePathSource; - inline std::string ReplaceTilde(const StringPiece &str) const - { + inline std::string ReplaceTilde(const StringPiece &str) const { std::string out = str.as_string(); size_t pos = out.find('~'); while ( pos != std::string::npos ) { diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index 09cec8fbe..391330718 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -154,7 +154,7 @@ int main(int argc, char* argv[]) "[--TargetPreferenceLabels] " "[--UnpairedExtractFormat] " "[--ConditionOnTargetLHS] " - "[--CrossedNonTerm]" + "[--CrossedNonTerm]" << std::endl; std::cerr << featureManager.usage() << std::endl; exit(1); From a1652b4a972c92768993fd7ce0a705653d365495 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 26 Jul 2015 21:20:40 +0100 Subject: [PATCH 175/286] Fix typo. --- moses/TranslationModel/UG/sapt_pscore_length_ratio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/sapt_pscore_length_ratio.h b/moses/TranslationModel/UG/sapt_pscore_length_ratio.h index ea1069c59..8bf7a07bf 100644 --- a/moses/TranslationModel/UG/sapt_pscore_length_ratio.h +++ b/moses/TranslationModel/UG/sapt_pscore_length_ratio.h @@ -1,4 +1,4 @@ -// -*- mode: c++; intent-tabs-mode: nil; tab-width: 2 -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- // Phrase scorer that considers the length ratio of the two phrases. // Written by Ulrich Germann. 
// From f26e2008caf4adc970b0321064c4ad6d7c814caf Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 26 Jul 2015 21:21:19 +0100 Subject: [PATCH 176/286] work in progress --- moses/TranslationModel/UG/test-domspec.cc | 78 ++++++++++++++++++----- 1 file changed, 61 insertions(+), 17 deletions(-) diff --git a/moses/TranslationModel/UG/test-domspec.cc b/moses/TranslationModel/UG/test-domspec.cc index d4ab504d7..8d59bead5 100644 --- a/moses/TranslationModel/UG/test-domspec.cc +++ b/moses/TranslationModel/UG/test-domspec.cc @@ -5,6 +5,8 @@ #include #include #include +#include + #include "mm/ug_bitext.h" #include "mm/tpt_typedefs.h" #include "mm/ug_prime_sampling1.h" @@ -49,16 +51,49 @@ show(Bitext const& B, iter const& m, pstats& stats) VectorIndexSorter, sorter_t> viso(pplist, sorter); sptr > ranked = viso.GetOrder(); size_t ctr=0; + size_t cumul=0; BOOST_FOREACH(size_t const i, *ranked) { - PhrasePair const& pp = pplist[i]; - cout << boost::format(" %6d | ") % pp.joint - << toString(*B.V2, pp.start2, pp.len2) << endl; typedef map::value_type entry_t; + + PhrasePair const& pp = pplist[i]; + if (pp.joint < pp.good1 * .01) break; + size_t remarkable = 0; + float p = float(pp.joint)/pp.good1; BOOST_FOREACH(entry_t const& e, pp.indoc) { - cout << float(pp.joint)/pp.raw1 * stats.indoc[e.first] - << "/" << e.second << "/" << stats.indoc[e.first] << endl; + boost::math::binomial binomi(stats.indoc[e.first], p); + float x = boost::math::cdf(binomi, e.second); + float y = boost::math::cdf(boost::math::complement(binomi, e.second-1)); + if ((x > .01 && y > .01) || e.second < 5) continue; + remarkable += e.second; + // cout << p * stats.indoc[e.first] + // << "/" << e.second << "/" << stats.indoc[e.first] + // << " " << boost::math::cdf(binomi, e.second) + // << " " << boost::math::cdf(boost::math::complement(binomi, e.second-1)) + // << " " << toString(*B.V2, pp.start2, pp.len2) + // << endl; + } + if (remarkable*20 > pp.good1) + { + cout << boost::format(" %6d | ") % pp.joint + << toString(*B.V2, pp.start2, pp.len2) + << boost::format(" (%d: %.2f)") % cumul % (float(cumul)/pp.good1) + << endl; + BOOST_FOREACH(entry_t const& e, pp.indoc) + { + boost::math::binomial binomi(stats.indoc[e.first], p); + float x = boost::math::cdf(binomi, e.second); + float y = boost::math::cdf(boost::math::complement(binomi, e.second-1)); + if ((x > .001 && y > .001) || e.second < 20) continue; + cout << p * stats.indoc[e.first] + << "/" << e.second << "/" << stats.indoc[e.first] + << " " << boost::math::cdf(binomi, e.second) + << " " << boost::math::cdf(boost::math::complement + (binomi, e.second-1)) + << " " << toString(*B.V2, pp.start2, pp.len2) + << endl; + } } } } @@ -67,15 +102,23 @@ show(Bitext const& B, iter const& m, pstats& stats) void process(Bitext const* bitext, TSA::tree_iterator& m) { - if (m.approxOccurrenceCount() <= 5000) return; - boost::shared_ptr nil; - Moses::bitext::sampling_method random = Moses::bitext::random_sampling; - Moses::bitext::BitextSampler s(bitext, m, nil, 10000, random); - s(); - show(*bitext, m, *s.stats()); - if (m.down()) + static boost::shared_ptr nil(new SamplingBiasAlways(bitext->sid2did())); + static Moses::bitext::sampling_method random = Moses::bitext::random_sampling; + // if (m.down()) + if (m.extend((*bitext->V1)["job"])) { - do { process(bitext, m); } while (m.over()); + do + { + if (m.ca() >= 5000) + { + // cout << m.str(bitext->V1.get()) << " [" << m.ca() << "]" << endl; + Moses::bitext::BitextSampler s(bitext, m, nil, 10000, random); + s(); + show(*bitext, m, 
*s.stats()); + process(bitext, m); + } + } + while (m.over()); m.up(); } } @@ -83,10 +126,11 @@ process(Bitext const* bitext, TSA::tree_iterator& m) int main(int argc, char* argv[]) { interpret_args(argc, argv); - mmbitext B; - B.open(bname, L1, L2); - TSA::tree_iterator m(B.I1.get()); - process(&B, m); + iptr B(new mmbitext); + B->open(bname, L1, L2); + TSA::tree_iterator m(B->I1.get()); + // m.extend((*B.V1)["job"]); + process(B.get(), m); } void From 70a1c886142ed2e9cce716507bb68c9845f1650b Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 26 Jul 2015 21:23:13 +0100 Subject: [PATCH 177/286] New dummy bias that always returns 1. Purpose: to keep track of phrase counts per document. If no bias is given, no per-documents counts are stored. --- moses/TranslationModel/UG/mm/ug_sampling_bias.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index ad7735bc1..351d7be8c 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -91,5 +91,17 @@ namespace Moses }; + class + SamplingBiasAlways : public SamplingBias + { + public: + SamplingBiasAlways(std::vector const* sid2docid) + : SamplingBias(sid2docid) {} + + float operator[](id_type const idx) { return 1; } + float operator[](id_type const idx) const { return 1; } + size_t size() const { return 0; } + }; + } } From 2faa9e6fe4e48de265821ce026b824600cc89509 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Tue, 28 Jul 2015 14:23:23 +0100 Subject: [PATCH 178/286] Multi-threaded sorting when building suffix array. --- moses/TranslationModel/UG/mm/ug_im_tsa.h | 68 ++++++++++++++++++++---- 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_im_tsa.h b/moses/TranslationModel/UG/mm/ug_im_tsa.h index 92e7f033c..60cc8dc47 100644 --- a/moses/TranslationModel/UG/mm/ug_im_tsa.h +++ b/moses/TranslationModel/UG/mm/ug_im_tsa.h @@ -12,12 +12,16 @@ #include #include #include +#include #include "tpt_tightindex.h" #include "tpt_tokenindex.h" #include "ug_tsa_base.h" #include "tpt_pickler.h" +#include "moses/TranslationModel/UG/generic/threading/ug_thread_pool.h" +#include "util/usage.hh" + namespace ugdiss { // using namespace std; @@ -26,6 +30,33 @@ namespace ugdiss // template class imBitext; + + template + class TsaSorter + { + public: + typedef typename Ttrack::Position cpos; + typedef typename std::vector::iterator iter; + private: + SORTER m_sorter; + iter m_begin; + iter m_end; + public: + TsaSorter(SORTER sorter, iter& begin, iter& end) + : m_sorter(sorter), + m_begin(begin), + m_end(end) { } + + bool + operator()() + { + std::sort(m_begin, m_end, m_sorter); + return true; + } + + }; + + //----------------------------------------------------------------------- template class imTSA : public TSA @@ -52,9 +83,8 @@ namespace ugdiss public: imTSA(); - imTSA(boost::shared_ptr const> c, - bdBitset const* filt, - std::ostream* log = NULL); + imTSA(boost::shared_ptr const> c, bdBitset const* filt, + std::ostream* log = NULL, size_t threads = 0); imTSA(imTSA const& prior, boost::shared_ptr const> const& crp, @@ -140,8 +170,11 @@ namespace ugdiss // specified in filter template imTSA:: - imTSA(boost::shared_ptr const> c, bdBitset const* filter, std::ostream* log) + imTSA(boost::shared_ptr const> c, + bdBitset const* filter, std::ostream* log, size_t threads) { + if (threads == 0) + threads = boost::thread::hardware_concurrency(); assert(c); 
this->corpus = c; bdBitset filter2; @@ -198,19 +231,34 @@ namespace ugdiss } // Now sort the array - if (log) *log << "sorting ...." << std::endl; + if (log) *log << "sorting .... with " << threads << " threads." << std::endl; + double start_time = util::WallTime(); + boost::scoped_ptr tpool; + tpool.reset(new ug::ThreadPool(threads)); + index.resize(wcnt.size()+1,0); - typename ttrack::Position::LESS > sorter(c.get()); + typedef typename ttrack::Position::LESS > sorter_t; + sorter_t sorter(c.get()); for (size_t i = 0; i < wcnt.size(); i++) { - if (log && wcnt[i] > 5000) - *log << "sorting " << wcnt[i] - << " entries starting with id " << i << "." << std::endl; + // if (log && wcnt[i] > 5000) + // *log << "sorting " << wcnt[i] + // << " entries starting with id " << i << "." << std::endl; index[i+1] = index[i]+wcnt[i]; assert(index[i+1]==tmp[i]); // sanity check if (wcnt[i]>1) - sort(sufa.begin()+index[i],sufa.begin()+index[i+1],sorter); + { + typename std::vector::iterator b,e; + b = sufa.begin()+index[i]; + e = sufa.begin()+index[i+1]; + TsaSorter foo(sorter,b,e); + tpool->add(foo); + // sort(sufa.begin()+index[i],sufa.begin()+index[i+1],sorter); + } } + tpool.reset(); + if (log) *log << "Done sorting after " << util::WallTime() - start_time + << " seconds." << std::endl; this->startArray = reinterpret_cast(&(*sufa.begin())); this->endArray = reinterpret_cast(&(*sufa.end())); this->numTokens = sufa.size(); From d1cb249a7f263dbc5cf39b2805697e5ca8a7ad1f Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Tue, 28 Jul 2015 14:24:06 +0100 Subject: [PATCH 179/286] Removed building of cooccurrence table from mmlex-build. --- moses/TranslationModel/UG/mm/mmlex-build.cc | 32 +++++++++++---------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/moses/TranslationModel/UG/mm/mmlex-build.cc b/moses/TranslationModel/UG/mm/mmlex-build.cc index 1e7bee5cb..d79938de3 100644 --- a/moses/TranslationModel/UG/mm/mmlex-build.cc +++ b/moses/TranslationModel/UG/mm/mmlex-build.cc @@ -221,14 +221,14 @@ processSentence(id_type sid) Token const* e1 = T1.sntEnd(sid); Token const* s2 = T2.sntStart(sid); Token const* e2 = T2.sntEnd(sid); - vector cnt1(V1.ksize(),0); - vector cnt2(V2.ksize(),0); - for (Token const* x = s1; x < e1; ++x) - ++cnt1.at(x->id()); - for (Token const* x = s2; x < e2; ++x) - ++cnt2.at(x->id()); + // vector cnt1(V1.ksize(),0); + // vector cnt2(V2.ksize(),0); + // for (Token const* x = s1; x < e1; ++x) + // ++cnt1.at(x->id()); + // for (Token const* x = s2; x < e2; ++x) + // ++cnt2.at(x->id()); - boost::unordered_set seen; + // boost::unordered_set seen; bitvector check1(T1.sntLen(sid)); check1.set(); bitvector check2(T2.sntLen(sid)); check2.set(); @@ -236,7 +236,8 @@ processSentence(id_type sid) char const* p = Tx.sntStart(sid); char const* q = Tx.sntEnd(sid); ushort r,c; - // cout << sid << " " << q-p << endl; + if (verbose && sid % 1000000 == 0) + cerr << sid/1000000 << " M sentences processed" << endl; while (p < q) { p = binread(p,r); @@ -257,8 +258,8 @@ processSentence(id_type sid) wpair k(id1,id2); Count& cnt = CNT[k]; cnt.a++; - if (seen.insert(k).second) - cnt.c += cnt1[id1] * cnt2[id2]; + // if (seen.insert(k).second) + // cnt.c += cnt1[id1] * cnt2[id2]; } // count unaliged words for (size_t i = check1.find_first(); @@ -292,11 +293,11 @@ main(int argc, char* argv[]) // cerr << "done counting" << endl; ofstream aln_out,coc_out; if (oname.size()) aln_out.open(oname.c_str()); - if (cooc.size()) coc_out.open(cooc.c_str()); + // if (cooc.size()) coc_out.open(cooc.c_str()); 
writeTable(oname.size() ? &aln_out : NULL, cooc.size() ? &coc_out : NULL); if (oname.size()) aln_out.close(); - if (cooc.size()) coc_out.close(); + // if (cooc.size()) coc_out.close(); } void @@ -312,8 +313,8 @@ interpret_args(int ac, char* av[]) ("help,h", "print this message") ("cfg,f", po::value(&cfgFile),"config file") ("oname,o", po::value(&oname),"output file name") - ("cooc,c", po::value(&cooc), - "file name for raw co-occurrence counts") + // ("cooc,c", po::value(&cooc), + // "file name for raw co-occurrence counts") ("verbose,v", po::value(&verbose)->default_value(0)->implicit_value(1), "verbosity level") ("threads,t", po::value(&num_threads)->default_value(4), @@ -339,7 +340,8 @@ interpret_args(int ac, char* av[]) cout << o << endl; exit(0); } - num_threads = min(num_threads,24UL); + size_t num_cores = boost::thread::hardware_concurrency(); + num_threads = min(num_threads,num_cores); } From a968536176aa508fff827ef2a5765618b13132ad Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Tue, 28 Jul 2015 16:37:50 +0100 Subject: [PATCH 180/286] ems fix: pass-unless doesn't understand AND --- scripts/ems/experiment.meta | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 1d38881b2..555b4ed48 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -118,7 +118,7 @@ post-split-factorize out: post-split-factorized-stem rerun-on-change: TRAINING:input-factors TRAINING:output-factors default-name: corpus/split-factored - pass-unless: AND TRAINING:input-factors factorize-after-split + pass-unless: factorize-after-split parallelizable: yes error: can't open error: incompatible number of words in factor @@ -257,7 +257,8 @@ post-split-factorize in: split-corpus out: split-factorized-corpus default-name: lm/split-factored - pass-unless: AND factors factorize-after-split + rerun-on-change: TRAINING:input-factors TRAINING:output-factors + pass-unless: factorize-after-split ignore-if: concatenate-files concatenate-files-split parallelizable: yes error: can't open @@ -390,7 +391,8 @@ post-split-factorize-tuning in: split-tuning out: post-split-factorized-tuning default-name: lm/interpolate-tuning.split-factored - pass-unless: AND TRAINING:output-factors factorize-after-split + rerun-on-change: TRAINING:input-factors TRAINING:output-factors + pass-unless: factorize-after-split parallelizable: yes error: can't open error: incompatible number of words in factor From 2cda286a069e2c40be9223174265838847392307 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Tue, 28 Jul 2015 16:55:55 +0100 Subject: [PATCH 181/286] experiment.meta: re-run fast_align symmetrization if symmetrization type changes --- scripts/ems/experiment.meta | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 555b4ed48..8a508a946 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -604,6 +604,7 @@ symmetrize-fast-align in: fast-alignment fast-alignment-inverse corpus-mml-prefilter=OR=corpus out: word-alignment ignore-unless: fast-align-settings + rerun-on-change: alignment-symmetrization-method template: $moses-script-dir/ems/support/symmetrize-fast-align.perl IN IN1 IN2.$input-extension IN2.$output-extension OUT $alignment-symmetrization-method $moses-src-dir/bin/symal default-name: model/aligned prepare-data From 451016891e6e3757c743d329377ac797316da1ec Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Tue, 28 Jul 2015 19:35:27 +0100 Subject: 
[PATCH 182/286] No ranked sampling for the time being. --- .../UG/mm/ug_prime_sampling1.h | 75 ------------------- 1 file changed, 75 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_prime_sampling1.h b/moses/TranslationModel/UG/mm/ug_prime_sampling1.h index 4c39e57a2..661530cd4 100644 --- a/moses/TranslationModel/UG/mm/ug_prime_sampling1.h +++ b/moses/TranslationModel/UG/mm/ug_prime_sampling1.h @@ -11,81 +11,6 @@ namespace Moses { namespace bitext { -typedef L2R_Token Token; -typedef mmBitext mmbitext; -typedef Bitext::tsa tsa; -typedef imTtrack imttrack; -typedef imTSA imtsa; - -template -void -mark(typename TSA::tree_iterator const& r, SentenceBias& hits) -{ - char const* stop = r.upper_bound(-1); - for (tsa::ArrayEntry I(r.lower_bound(-1)); I.next < stop;) - { - r.root->readEntry(I.next,I); - size_t slen = r.root->getCorpus()->sntLen(I.sid); - hits[I.sid] += 1./(r.ca() * slen); - } -} - -template -void -process(typename TSA::tree_iterator& m, - typename TSA::tree_iterator& r, - SentenceBias& hits, size_t const max_count=1000) -{ - if (m.down()) - { - do - { - if (r.extend(m.getToken(-1)->id())) - { - if (r.approxOccurrenceCount() > max_count) - // don't mark phrases that occur very often - process(m, r, hits, max_count); - else mark(r,hits); - r.up(); - } - else if (r.size() && r.size() == 1) // && r.ca() < max_count) - mark(r,hits); - } - while (m.over()); - m.up(); - } -} - -template -sptr -prime_sampling1(TSA const& refIdx, - TSA const& newIdx, - size_t const max_count, - std::vector const* sid2docid = NULL) -{ - typename TSA::tree_iterator m(&newIdx); - typename TSA::tree_iterator r(&refIdx); - sptr ret; - ret.reset(new SentenceBias(refIdx.getCorpus()->size(),0, sid2docid)); - process(m, r, *ret, max_count); - return ret; -} - -template -sptr -prime_sampling1(TokenIndex& V, TSA const& refIdx, - typename std::vector const& input, - size_t const max_count) -{ - sptr > > crp; - crp.reset(new typename std::vector >(input.size())); - for (size_t i = 0; i < input.size(); ++i) - fill_token_seq(V, input[i], (*crp)[i]); - sptr idoc(new imttrack(crp)); - imtsa newIdx(idoc,NULL); - return prime_sampling1(refIdx, newIdx, max_count); -} - } // end of namespace bitext // #ifndef NO_MOSES } // end of namespace Moses From 6ab2d4d3eb1bcf4e1c76be6a89e69cc288af4c4d Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Tue, 28 Jul 2015 20:14:46 +0100 Subject: [PATCH 183/286] Removed ranking stuff prior to merge with master. --- .../UG/mm/ug_bitext_sampler.h | 3 +- .../UG/mm/ug_prime_sampling1.h | 18 ------ moses/TranslationModel/UG/mmsapt.cpp | 60 +------------------ moses/TranslationModel/UG/mmsapt.h | 5 +- 4 files changed, 7 insertions(+), 79 deletions(-) delete mode 100644 moses/TranslationModel/UG/mm/ug_prime_sampling1.h diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h index a069ef008..ffb389129 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h @@ -243,8 +243,7 @@ perform_ranked_sampling() return m_ctr; } -// Ranked sampling sorts all samples by score and then considers the top-ranked -// candidates for phrase extraction. 
+// Uniform sampling template size_t BitextSampler:: diff --git a/moses/TranslationModel/UG/mm/ug_prime_sampling1.h b/moses/TranslationModel/UG/mm/ug_prime_sampling1.h deleted file mode 100644 index 661530cd4..000000000 --- a/moses/TranslationModel/UG/mm/ug_prime_sampling1.h +++ /dev/null @@ -1,18 +0,0 @@ -// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*- -// Functions for "priming" sample selection for sampling phrase tables. -// Author: Ulrich Germann -#pragma once -#include "ug_bitext.h" -#include "ug_sampling_bias.h" -#include -// #ifndef NO_MOSES -namespace Moses { -// #endif -namespace bitext -{ - -} // end of namespace bitext -// #ifndef NO_MOSES -} // end of namespace Moses -// #endif - diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index a058d76dc..fb2dc147b 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -69,7 +69,7 @@ namespace Moses , m_bias_log(NULL) , m_bias_loglevel(0) , m_lr_func(NULL) - , m_sampling_method(ranked_sampling) + , m_sampling_method(random_sampling) , bias_key(((char*)this)+3) , cache_key(((char*)this)+2) , context_key(((char*)this)+1) @@ -236,9 +236,7 @@ namespace Moses if ((m = param.find("method")) != param.end()) { - if (m->second == "rank" || m->second == "ranked") - m_sampling_method = ranked_sampling; - else if (m->second == "random") + if (m->second == "random") m_sampling_method = random_sampling; else if (m->second == "full") m_sampling_method = full_coverage; @@ -821,56 +819,6 @@ namespace Moses if (!context->cache2) context->cache2.reset(new pstats::cache_t); } - void - Mmsapt:: - set_bias_for_ranking(ttasksptr const& ttask, iptr const> bt) - { // thinking ahead: shard-specific set-up, for multi-shard Mmsapts - - sptr const& scope = ttask->GetScope(); - if (!scope) return; - - sptr const> input = ttask->GetContextWindow(); - if (!input) return; - - sptr context = scope->get(bt.get(), true); - boost::unique_lock lock(context->lock); - if (context->bias) return; - - if (!context->cache1) context->cache1.reset(new pstats::cache_t); - if (!context->cache2) context->cache2.reset(new pstats::cache_t); - - // sptr iowrapper = ttask->GetIOWrapper(); - // vector input; - // input.reserve(iowrapper->GetPastInput().size() + - // iowrapper->GetFutureInput().size()); - // BOOST_FOREACH(sptr const& s, iowrapper->GetPastInput()) - // input.push_back(s->ToString()); - // BOOST_FOREACH(sptr const& s, iowrapper->GetFutureInput()) - // input.push_back(s->ToString()); - - size_t N = 10 * m_default_sample_size; - VERBOSE(1,"Priming bias for ranking. [" << HERE << "]" << endl); - - double t = util::WallTime(); - context->bias = prime_sampling1(*bt->V1, *bt->I1, *input, N); - VERBOSE(1,"Priming took " << util::WallTime() - t << " sec. (wall) " - << "[" << HERE << "]" << endl); - - } - - // void - // Mmsapt:: - // set_bias_via_ranking(ttasksptr const& ttask) - // { - // sptr const& scope = ttask->GetScope(); - // if (!scope) return; - // sptr bias = scope->get(bias_key); - // // For the time being, let's assume that ranking is always primed - // // on the entire document and leave local priming for another day. 
- // if (bias) return; - // // - // } - void Mmsapt:: InitializeForInput(ttasksptr const& ttask) @@ -879,9 +827,7 @@ namespace Moses sptr context = scope->get(btfix.get(), true); // set sampling bias, depending on sampling method specified - if (m_sampling_method == ranked_sampling) - set_bias_for_ranking(ttask, this->btfix); - else if (m_sampling_method == random_sampling) + if (m_sampling_method == random_sampling) set_bias_via_server(ttask); else UTIL_THROW2("Unknown sampling method: " << m_sampling_method); diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index bea1bcfc2..827722865 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -2,6 +2,7 @@ // Sampling phrase table implementation based on memory-mapped suffix arrays. // Design and code by Ulrich Germann. #pragma once +#define PROVIDES_RANKED_SAMPLING 0 #include #include @@ -20,7 +21,6 @@ #include "moses/TranslationModel/UG/mm/ug_typedefs.h" #include "moses/TranslationModel/UG/mm/tpt_pickler.h" #include "moses/TranslationModel/UG/mm/ug_bitext.h" -#include "moses/TranslationModel/UG/mm/ug_prime_sampling1.h" #include "moses/TranslationModel/UG/mm/ug_bitext_sampler.h" #include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h" @@ -148,9 +148,10 @@ namespace Moses void setup_local_feature_functions(); void set_bias_via_server(ttasksptr const& ttask); +#if PROVIDES_RANKED_SAMPLING void set_bias_for_ranking(ttasksptr const& ttask, iptr const> bt); - +#endif private: void read_config_file(std::string fname, std::map& param); From 0173512ddc5543727fe9ae115b35826f37754608 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Tue, 28 Jul 2015 23:27:59 +0100 Subject: [PATCH 184/286] separate xml version of combine_factors.pl xml version causes slow-down for users who use factors, but not xml (which are most) --- scripts/training/combine_factors.pl | 60 ++-------- scripts/training/combine_factors_syntax.pl | 125 +++++++++++++++++++++ 2 files changed, 132 insertions(+), 53 deletions(-) create mode 100755 scripts/training/combine_factors_syntax.pl diff --git a/scripts/training/combine_factors.pl b/scripts/training/combine_factors.pl index e6a0a5000..fcc9ab3f5 100755 --- a/scripts/training/combine_factors.pl +++ b/scripts/training/combine_factors.pl @@ -37,7 +37,9 @@ while (defined $_) { $nr++; print STDERR "." if $nr % 10000 == 0; print STDERR "($nr)" if $nr % 100000 == 0; - my ($intokens,$MARKUP) = split_xml($_); + chomp; + s/\s+/ /g; s/^ //; s/ $//; + my @intokens = split / /; # load lines of corresponding streams and ensure equal number of words my @lines_of_extratoks; foreach my $factor (0..$#streams) { @@ -47,17 +49,14 @@ while (defined $_) { chomp($line); $line =~ s/\s+/ /g; $line =~ s/^ //; $line =~ s/ $//; my @toks = split / /, $line; - die "Incompatible number of words in factor $factor on line $nr. ($#toks != $#$intokens)" - if $#toks != $#$intokens; + die "Incompatible number of words in factor $factor on line $nr. 
($#toks != $#intokens)" + if $#toks != $#intokens; $lines_of_extratoks[$factor] = \@toks; } # for every token, print the factors in the order as user wished - for(my $i=0; $i<=$#$intokens; $i++) { - print " " if $i && $$MARKUP[$i] eq ''; - print $$MARKUP[$i]; - - my $token = $$intokens[$i]; + for(my $i=0; $i<=$#intokens; $i++) { + my $token = $intokens[$i]; my @outtoken = (); push @outtoken, $token; # add the first one # print STDERR "Token: $token\n"; @@ -70,56 +69,11 @@ while (defined $_) { print " " if $i != 0; print join("|", @outtoken); } - print $$MARKUP[$#$MARKUP]; print "\n"; $_ = readline($firststream); } close $firststream; print STDERR "Done.\n"; -# store away xml markup -sub split_xml { - my ($line) = @_; - my (@WORD,@MARKUP); - my $i = 0; - $MARKUP[0] = ""; - while($line =~ /\S/) { - # XML tag - if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { - my $potential_xml = $1; - my $line_next = $2; - # exception for factor that is an XML tag - if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) { - $WORD[$i-1] .= $potential_xml; - if ($line_next =~ /^(\|+)(.*)$/) { - $WORD[$i-1] .= $1; - $line_next = $2; - } - } - else { - $MARKUP[$i] .= $potential_xml." "; - } - $line = $line_next; - } - # non-XML text - elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) { - $WORD[$i++] = $1; - $MARKUP[$i] = ""; - $line = $2; - } - # '<' or '>' occurs in word, but it's not an XML tag - elsif ($line =~ /^\s*(\S+)(.*)$/) { - $WORD[$i++] = $1; - $MARKUP[$i] = ""; - $line = $2; - } - else { - die("ERROR: huh? $line\n"); - } - } - chop($MARKUP[$#MARKUP]); - return (\@WORD,\@MARKUP); -} - diff --git a/scripts/training/combine_factors_syntax.pl b/scripts/training/combine_factors_syntax.pl new file mode 100755 index 000000000..99d6ae2b3 --- /dev/null +++ b/scripts/training/combine_factors_syntax.pl @@ -0,0 +1,125 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +# $Id$ +# given a list of files, combines them to a single corpus (sent to stdout) + +use strict; +use warnings; +use Getopt::Long; +use IO::File; +use File::Basename; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); +binmode(STDERR, ":utf8"); + +my @addfactors = @ARGV; +die "usage: combine_factors.pl corpusfile1 corpusfile2 ..." + if 0 == scalar @addfactors; + +my @streams = map { + my $fn = $_; + my $opn = ($fn =~ /\.gz$/ ? "zcat $fn |" : "$fn"); + my $stream = new IO::File; + $stream->open($opn) or die "Can't open '$opn'"; + binmode($stream, ":utf8"); + $stream; +} @addfactors; + +my $nr=0; +my $firststream = shift @streams; +shift @addfactors; # just to keep the lengths sync'ed +$_ = readline($firststream); +while (defined $_) { + $nr++; + print STDERR "." if $nr % 10000 == 0; + print STDERR "($nr)" if $nr % 100000 == 0; + my ($intokens,$MARKUP) = split_xml($_); + # load lines of corresponding streams and ensure equal number of words + my @lines_of_extratoks; + foreach my $factor (0..$#streams) { + my $line = readline($streams[$factor]); + die "Additional factor file $addfactors[$factor] contains too few sentences!" + if !defined $line; + chomp($line); + $line =~ s/\s+/ /g; $line =~ s/^ //; $line =~ s/ $//; + my @toks = split / /, $line; + die "Incompatible number of words in factor $factor on line $nr. 
($#toks != $#$intokens)" + if $#toks != $#$intokens; + $lines_of_extratoks[$factor] = \@toks; + } + + # for every token, print the factors in the order as user wished + for(my $i=0; $i<=$#$intokens; $i++) { + print "" if $i && $$MARKUP[$i] eq ''; + print $$MARKUP[$i]; + + my $token = $$intokens[$i]; + my @outtoken = (); + push @outtoken, $token; # add the first one + # print STDERR "Token: $token\n"; + foreach my $factor (0..$#streams) { + my $f = $lines_of_extratoks[$factor]->[$i]; + die "Missed factor value for word $i+1 on line $nr in $addfactors[$factor]" + if !defined $f || $f eq ""; + push @outtoken, $f; + } + print " " if $i != 0; + print join("|", @outtoken); + } + print $$MARKUP[$#$MARKUP]; + print "\n"; + $_ = readline($firststream); +} +close $firststream; +print STDERR "Done.\n"; + +# store away xml markup +sub split_xml { + my ($line) = @_; + my (@WORD,@MARKUP); + my $i = 0; + $MARKUP[0] = ""; + while($line =~ /\S/) { + # XML tag + if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { + my $potential_xml = $1; + my $line_next = $2; + # exception for factor that is an XML tag + if ($line =~ /^\S/ && scalar(@WORD)>0 && $WORD[$i-1] =~ /\|$/) { + $WORD[$i-1] .= $potential_xml; + if ($line_next =~ /^(\|+)(.*)$/) { + $WORD[$i-1] .= $1; + $line_next = $2; + } + } + else { + $MARKUP[$i] .= $potential_xml." "; + } + $line = $line_next; + } + # non-XML text + elsif ($line =~ /^\s*([^\s<>]+)(.*)$/) { + $WORD[$i++] = $1; + $MARKUP[$i] = ""; + $line = $2; + } + # '<' or '>' occurs in word, but it's not an XML tag + elsif ($line =~ /^\s*(\S+)(.*)$/) { + $WORD[$i++] = $1; + $MARKUP[$i] = ""; + $line = $2; + } + else { + die("ERROR: huh? $line\n"); + } + } + chop($MARKUP[$#MARKUP]); + return (\@WORD,\@MARKUP); +} + + + From 3a2116b2c973c16f247b3151fd4147cb90bd4642 Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Wed, 29 Jul 2015 09:35:19 +0100 Subject: [PATCH 185/286] add quotes so arguments don't get lost --- scripts/ems/experiment.meta | 4 ++-- scripts/training/binarize-model.perl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 8a508a946..b9adb83f9 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -826,9 +826,9 @@ binarize-config in: config out: bin-config pass-unless: binarize-all - rerun-on-change: config + rerun-on-change: config default-name: model/moses.bin.ini - template: $binarize-all IN OUT -Binarizer $ttable-binarizer + template: $binarize-all IN OUT -Binarizer "$ttable-binarizer" final-model: yes hiero-compile-source-suffix-array in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl index 0131d2222..7d0f7a951 100755 --- a/scripts/training/binarize-model.perl +++ b/scripts/training/binarize-model.perl @@ -40,7 +40,7 @@ my $hierarchical = ""; $hierarchical = "-Hierarchical" if $opt_hierarchical; my $targetdir = "$output_config.tables"; -safesystem("$RealBin/filter-model-given-input.pl $targetdir $input_config /dev/null $hierarchical -nofilter -Binarizer $binarizer") || die "binarising failed"; +safesystem("$RealBin/filter-model-given-input.pl $targetdir $input_config /dev/null $hierarchical -nofilter -Binarizer \"$binarizer\"") || die "binarising failed"; safesystem("rm -f $output_config; ln -s $targetdir/moses.ini $output_config") || die "failed to link new ini file"; #FIXME: Why isn't this in a module? 
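The quoting change above ("add quotes so arguments don't get lost") matters because -Binarizer is typically a multi-word command such as "CreateOnDiskPt 1 1 4 100 2": once the shell splits an unquoted value into words, a Getopt::Long 'Binarizer=s' option captures only the first word and the binarizer's own arguments are silently dropped. A minimal, hypothetical sketch (not part of the patch series) that emulates the shell's word splitting with Text::ParseWords:

#!/usr/bin/env perl
# Illustrative only: shows why the quotes added in the patch above are needed.
use strict;
use warnings;
use Getopt::Long qw(GetOptions);
use Text::ParseWords qw(shellwords);

my $binarizer = "CreateOnDiskPt 1 1 4 100 2";      # typical multi-word ttable-binarizer setting

for my $cmdline ("-Binarizer $binarizer",           # unquoted, as before the patch
                 "-Binarizer \"$binarizer\"") {     # quoted, as after the patch
    local @ARGV = shellwords($cmdline);             # emulate the shell's word splitting
    my $value = "";
    GetOptions('Binarizer=s' => \$value);
    print "$cmdline\n  -Binarizer received: '$value'\n";
}
# Unquoted: only 'CreateOnDiskPt' reaches the option; '1 1 4 100 2' is left behind in @ARGV.
# Quoted:   the full 'CreateOnDiskPt 1 1 4 100 2' command line survives intact.

With the quotes in place, filter-model-given-input.pl receives the complete binarizer command line instead of just its first token.
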
From 4992f620be35c27cfb6dae366eeea7355a95e819 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 29 Jul 2015 13:41:43 +0100 Subject: [PATCH 186/286] bjam: accept argument-less form of --no-xmlrpc-c option --- jam-files/xmlrpc-c.jam | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jam-files/xmlrpc-c.jam b/jam-files/xmlrpc-c.jam index d14810e6a..401bcc006 100644 --- a/jam-files/xmlrpc-c.jam +++ b/jam-files/xmlrpc-c.jam @@ -2,7 +2,7 @@ # xmlrpc-c library (including the abyss server) that is needed for # moses server functionality -if [ option.get "no-xmlrpc-c" ] +if [ option.get "no-xmlrpc-c" : : "yes" ] { rule xmlrpc ( what ? ) { } # never return anything } From ae9cd14948c5848ed36809d535611695fd482de2 Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Wed, 29 Jul 2015 10:44:57 -0400 Subject: [PATCH 187/286] fixes --- .../post-decoding-transliteration.pl | 144 +++++++++--------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/scripts/Transliteration/post-decoding-transliteration.pl b/scripts/Transliteration/post-decoding-transliteration.pl index c8e19fc7f..17446bf00 100755 --- a/scripts/Transliteration/post-decoding-transliteration.pl +++ b/scripts/Transliteration/post-decoding-transliteration.pl @@ -18,7 +18,7 @@ binmode(STDERR, ':utf8'); my $___FACTOR_DELIMITER = "|"; -my ($MOSES_SRC_DIR,$TRANSLIT_MODEL,$EVAL_DIR,$OUTPUT_FILE,$OUTPUT_FILE_NAME,$OOV_FILE, $OOV_FILE_NAME, $EXTERNAL_BIN_DIR, $LM_FILE, $INPUT_EXTENSION, $OUTPUT_EXTENSION, $INPUT_FILE,$VERBOSE,$DECODER); +my ($MOSES_SRC_DIR,$TRANSLIT_MODEL,$EVAL_DIR,$OUTPUT_FILE,$OUTPUT_FILE_NAME,$OOV_FILE, $OOV_FILE_NAME, $EXTERNAL_BIN_DIR, $LM_FILE, $INPUT_EXTENSION, $OUTPUT_EXTENSION, $INPUT_FILE,$VERBOSE,$DECODER,$TMP_DIR); die("ERROR: wrong syntax when invoking postDecodingTransliteration.perl") unless &GetOptions('moses-src-dir=s' => \$MOSES_SRC_DIR, 'external-bin-dir=s' => \$EXTERNAL_BIN_DIR, @@ -27,6 +27,7 @@ die("ERROR: wrong syntax when invoking postDecodingTransliteration.perl") 'output-extension=s' => \$OUTPUT_EXTENSION, 'decoder=s' => \$DECODER, 'oov-file=s' => \$OOV_FILE, + 'tmp-dir=s' => \$TMP_DIR, 'input-file=s' => \$INPUT_FILE, 'output-file=s' => \$OUTPUT_FILE, 'verbose' => \$VERBOSE, @@ -60,17 +61,19 @@ $EVAL_DIR = dirname($INPUT_FILE); $OUTPUT_FILE_NAME = basename ($OUTPUT_FILE); $OOV_FILE_NAME = basename ($OOV_FILE); -`mkdir $TRANSLIT_MODEL/evaluation`; -`cp $OOV_FILE $TRANSLIT_MODEL/evaluation/`; -my $translitFile = $TRANSLIT_MODEL . "/evaluation/" . 
$OOV_FILE_NAME; +$TMP_DIR = $OUTPUT_FILE.".tmp" unless defined($TMP_DIR); + +`mkdir -p $TMP_DIR/transliteration`; +`cp $OOV_FILE $TMP_DIR/transliteration`; +my $translitFile = "$TMP_DIR/transliteration/$OOV_FILE_NAME"; print "Preparing for Transliteration\n"; -prepare_for_transliteration ($OOV_FILE, $translitFile); +&prepare_for_transliteration ($OOV_FILE, $translitFile); print "Run Transliteration\n"; -run_transliteration ($MOSES_SRC_DIR , $EXTERNAL_BIN_DIR , $TRANSLIT_MODEL , $OOV_FILE_NAME); +&run_transliteration ($MOSES_SRC_DIR , $EXTERNAL_BIN_DIR , $TRANSLIT_MODEL , $OOV_FILE_NAME); print "Pick Best Transliteration\n"; -form_corpus ($translitFile , $translitFile.".op.nBest" , $EVAL_DIR); -run_decoder($MOSES_SRC_DIR, $EXTERNAL_BIN_DIR, $LM_FILE); +&form_corpus ($translitFile , $translitFile.".op.nBest" , $EVAL_DIR); +&run_decoder($MOSES_SRC_DIR, $EXTERNAL_BIN_DIR, $LM_FILE); ################### Read the UNK word file and prepare for Transliteration ############################### @@ -132,16 +135,18 @@ sub run_transliteration my $EXTERNAL_BIN_DIR = $list[1]; my $TRANSLIT_MODEL = $list[2]; my $eval_file = $list[3]; + print "run_transliteration($MOSES_SRC,$EXTERNAL_BIN_DIR,$TRANSLIT_MODEL,$eval_file)\n"; - `touch $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini`; + `touch $TMP_DIR/transliteration/$eval_file.moses.table.ini`; - print "Filter Table\n"; + print "Filter Table... ".`date`; +<<<<<<< Updated upstream `$MOSES_SRC/scripts/training/train-model.perl \\ - -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\ - -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\ - -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\ - -score-options '--KneserNey' \\ + -first-step 9 \\ + -external-bin-dir $EXTERNAL_BIN_DIR \\ + -f $INPUT_EXTENSION \\ + -e $OUTPUT_EXTENSION \\ -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \\ -config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \\ -lm 0:3:$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini:8`; @@ -151,11 +156,21 @@ sub run_transliteration $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \\ $TRANSLIT_MODEL/evaluation/$eval_file \\ -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`; +======= + my $cmd = "$MOSES_SRC/scripts/training/train-model.perl -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table -config $TMP_DIR/transliteration/$eval_file.moses.table.ini -lm 0:3:$TMP_DIR/transliteration/$eval_file.moses.table.ini:8"; + print $cmd."\n"; + `$cmd`; - `rm $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini`; + $cmd = "$MOSES_SRC/scripts/training/filter-model-given-input.pl $TMP_DIR/transliteration/$eval_file.filtered $TMP_DIR/transliteration/$eval_file.moses.table.ini $TMP_DIR/transliteration/$eval_file -Binarizer \"$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2\""; + print $cmd."\n"; + `$cmd`; +>>>>>>> Stashed changes + + `rm $TMP_DIR/transliteration/$eval_file.moses.table.ini`; print "Apply Filter\n"; +<<<<<<< Updated upstream `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \\ $TRANSLIT_MODEL/evaluation/$eval_file.filtered/moses.ini \\ $TRANSLIT_MODEL/model/moses.ini \\ @@ -170,7 +185,16 @@ sub run_transliteration distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini \\ < $TRANSLIT_MODEL/evaluation/$eval_file \\ > $TRANSLIT_MODEL/evaluation/$eval_file.op $drop_stderr`; +======= + $cmd = 
"$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl $TMP_DIR/transliteration/$eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini $TRANSLIT_MODEL/tuning/moses.tuned.ini $TMP_DIR/transliteration/$eval_file.filtered.ini"; + print $cmd."\n"; + `$cmd`; + my $drop_stderr = $VERBOSE ? "" : " 2>/dev/null"; + $cmd = "$DECODER -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads 16 -drop-unknown -distortion-limit 0 -n-best-list $TMP_DIR/transliteration/$eval_file.op.nBest 1000 distinct -f $TMP_DIR/transliteration/$eval_file.filtered.ini < $TMP_DIR/transliteration/$eval_file > $TMP_DIR/transliteration/$eval_file.op $drop_stderr"; + print $cmd."\n"; + `$cmd`; +>>>>>>> Stashed changes } ################### Read the output of Transliteration Model and Form Corpus ############################### @@ -191,10 +215,10 @@ sub form_corpus my @UNK; my %vocab; - `mkdir -p $EVAL_DIR/Transliteration-Module/$OUTPUT_FILE_NAME/model`; + `mkdir -p $TMP_DIR/retranslation/model`; my $antLog = exp(0.2); - my $phraseTable = $EVAL_DIR . "/Transliteration-Module/$OUTPUT_FILE_NAME/model/phrase-table"; + my $phraseTable = "$TMP_DIR/retranslation/model/phrase-table"; open MYFILE, "<:encoding(UTF-8)", $inp_file or die "Can't open $inp_file: $!\n"; open PT, ">:encoding(UTF-8)", $phraseTable or die "Can't open $phraseTable: $!\n"; @@ -301,66 +325,42 @@ sub run_decoder my @list = @_; my $MOSES_SRC = $list[0]; my $EXTERNAL_BIN_DIR = $list[1]; - my $corpus_dir = $EVAL_DIR . "/Transliteration-Module/$OUTPUT_FILE_NAME"; my $LM_FILE = $list[2]; my @words; - my $final_file = $EVAL_DIR . "/$OUTPUT_FILE_NAME"; - my $find = ".cleaned."; - my $replace = ".transliterated."; - if ($final_file !~ /$find/) { - $find = ".output."; - } - $final_file =~ s/$find/$replace/g; + `mkdir -p $TMP_DIR/retranslation/evaluation`; - `mkdir $corpus_dir/evaluation`; + print "Creating config file... ".`date`; + my $cmd = "$MOSES_SRC/scripts/training/train-model.perl " + ."-mgiza -mgiza-cpus 10 -dont-zip -first-step 9 " + ."-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION " + ."-e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 " + ."-lmodel-oov-feature \"yes\" -post-decoding-translit \"yes\" " + ."-phrase-translation-table $TMP_DIR/retranslation/model/phrase-table " + ."-config $TMP_DIR/retranslation/model/moses.ini -lm 0:5:$LM_FILE:8"; + print $cmd."\n"; + `$cmd`; - `$MOSES_SRC/scripts/training/train-model.perl \\ - -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\ - -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\ - -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\ - -lmodel-oov-feature "yes" -post-decoding-translit "yes" \\ - -phrase-translation-table $corpus_dir/model/phrase-table \\ - -config $corpus_dir/model/moses.ini -lm 0:5:$LM_FILE:8`; + print "Filtering transliteration phrase table... ".`date`; + $cmd = "$MOSES_SRC/scripts/training/filter-model-given-input.pl " + ."$TMP_DIR/retranslation/filtered " + ."$TMP_DIR/retranslation/model/moses.ini " + ."$INPUT_FILE -Binarizer \"$MOSES_SRC/bin/CreateOnDiskPt " + ."1 1 4 100 2\""; + print $cmd."\n"; + `$cmd`; - `touch $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini`; + print "Retranslating... ".`date`; + my $drop_stderr = $VERBOSE ? 
"" : " 2>/dev/null"; + $cmd = "$DECODER " + ."-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 " + ."-threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' " + ."-max-trans-opt-per-coverage 100 " + ."-f $TMP_DIR/retranslation/filtered/moses.ini -distortion-limit 0 " + ."< $INPUT_FILE " + ."> $OUTPUT_FILE $drop_stderr"; + print $cmd."\n"; + `$cmd`; - `$MOSES_SRC/scripts/training/train-model.perl \\ - -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 \\ - -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION \\ - -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 \\ - -lmodel-oov-feature "yes" -post-decoding-translit "yes" \\ - -phrase-translation-table $corpus_dir/model/phrase-table \\ - -config $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \\ - -lm 0:3:$corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini:8`; - - `$MOSES_SRC/scripts/training/filter-model-given-input.pl \\ - $corpus_dir/evaluation/filtered \\ - $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini \\ - $INPUT_FILE -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt \\ - 1 1 4 100 2"`; - - `rm $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini`; - - `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables.perl \\ - $corpus_dir/evaluation/filtered/moses.ini \\ - < $corpus_dir/model/moses.ini \\ - > $corpus_dir/evaluation/moses.filtered.ini`; - - my $drop_stderr = $VERBOSE ? "" : " 2>/dev/null"; - `$DECODER \\ - -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \\ - -threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \\ - -max-trans-opt-per-coverage 100 \\ - -f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \\ - < $INPUT_FILE \\ - > $OUTPUT_FILE $drop_stderr`; - - print "$DECODER \\ - -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \\ - -threads 16 -feature-overwrite 'TranslationModel0 table-limit=100' \\ - -max-trans-opt-per-coverage 100 \\ - -f $corpus_dir/evaluation/moses.filtered.ini -distortion-limit 0 \\ - < $INPUT_FILE \\ - > $OUTPUT_FILE $drop_stderr\n"; + print "Done. ".`date`; } From 837bcde69d13e5e28964b90d63fb931ee1a07d07 Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Wed, 29 Jul 2015 10:46:09 -0400 Subject: [PATCH 188/286] bug with legacy --- scripts/training/threshold-filter.perl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/threshold-filter.perl b/scripts/training/threshold-filter.perl index 0aed67d25..b594cda69 100755 --- a/scripts/training/threshold-filter.perl +++ b/scripts/training/threshold-filter.perl @@ -10,7 +10,7 @@ my %MIN_SCORE; # legacy: same threshold for direct and indirect phrase translation probabilities if ($ARGV[0] =~ /^[\d\.]+$/) { $MIN_SCORE{0} = $ARGV[0]; - $MIN_SCORE{2} = $ARGV[2]; + $MIN_SCORE{2} = $ARGV[0]; } # advanced: field:threshold,field:threshold # recommended use is "2:0.0001" From b29166e2fe6dd5d1818ad9b3c834b67425a93dc8 Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Wed, 29 Jul 2015 11:02:55 -0400 Subject: [PATCH 189/286] fix of fix --- .../post-decoding-transliteration.pl | 34 ------------------- 1 file changed, 34 deletions(-) diff --git a/scripts/Transliteration/post-decoding-transliteration.pl b/scripts/Transliteration/post-decoding-transliteration.pl index 17446bf00..54678c139 100755 --- a/scripts/Transliteration/post-decoding-transliteration.pl +++ b/scripts/Transliteration/post-decoding-transliteration.pl @@ -141,22 +141,6 @@ sub run_transliteration print "Filter Table... 
".`date`; -<<<<<<< Updated upstream - `$MOSES_SRC/scripts/training/train-model.perl \\ - -first-step 9 \\ - -external-bin-dir $EXTERNAL_BIN_DIR \\ - -f $INPUT_EXTENSION \\ - -e $OUTPUT_EXTENSION \\ - -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table \\ - -config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \\ - -lm 0:3:$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini:8`; - - `$MOSES_SRC/scripts/training/filter-model-given-input.pl \\ - $TRANSLIT_MODEL/evaluation/$eval_file.filtered \\ - $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini \\ - $TRANSLIT_MODEL/evaluation/$eval_file \\ - -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`; -======= my $cmd = "$MOSES_SRC/scripts/training/train-model.perl -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table -config $TMP_DIR/transliteration/$eval_file.moses.table.ini -lm 0:3:$TMP_DIR/transliteration/$eval_file.moses.table.ini:8"; print $cmd."\n"; `$cmd`; @@ -164,28 +148,11 @@ sub run_transliteration $cmd = "$MOSES_SRC/scripts/training/filter-model-given-input.pl $TMP_DIR/transliteration/$eval_file.filtered $TMP_DIR/transliteration/$eval_file.moses.table.ini $TMP_DIR/transliteration/$eval_file -Binarizer \"$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2\""; print $cmd."\n"; `$cmd`; ->>>>>>> Stashed changes `rm $TMP_DIR/transliteration/$eval_file.moses.table.ini`; print "Apply Filter\n"; -<<<<<<< Updated upstream - `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl \\ - $TRANSLIT_MODEL/evaluation/$eval_file.filtered/moses.ini \\ - $TRANSLIT_MODEL/model/moses.ini \\ - $TRANSLIT_MODEL/tuning/moses.tuned.ini \\ - $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini`; - - my $drop_stderr = $VERBOSE ? 
"" : " 2>/dev/null"; - `$DECODER \\ - -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \\ - -threads 16 -drop-unknown -distortion-limit 0 \\ - -n-best-list $TRANSLIT_MODEL/evaluation/$eval_file.op.nBest 1000 \\ - distinct -f $TRANSLIT_MODEL/evaluation/$eval_file.filtered.ini \\ - < $TRANSLIT_MODEL/evaluation/$eval_file \\ - > $TRANSLIT_MODEL/evaluation/$eval_file.op $drop_stderr`; -======= $cmd = "$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl $TMP_DIR/transliteration/$eval_file.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini $TRANSLIT_MODEL/tuning/moses.tuned.ini $TMP_DIR/transliteration/$eval_file.filtered.ini"; print $cmd."\n"; `$cmd`; @@ -194,7 +161,6 @@ sub run_transliteration $cmd = "$DECODER -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads 16 -drop-unknown -distortion-limit 0 -n-best-list $TMP_DIR/transliteration/$eval_file.op.nBest 1000 distinct -f $TMP_DIR/transliteration/$eval_file.filtered.ini < $TMP_DIR/transliteration/$eval_file > $TMP_DIR/transliteration/$eval_file.op $drop_stderr"; print $cmd."\n"; `$cmd`; ->>>>>>> Stashed changes } ################### Read the output of Transliteration Model and Form Corpus ############################### From 836ca8212a1da92e5d9dd10d40d3260db1284bbb Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Wed, 29 Jul 2015 11:03:24 -0400 Subject: [PATCH 190/286] better support of grid engine cluster --- scripts/ems/experiment.perl | 32 ++++++--- scripts/generic/moses-parallel.pl | 113 +++++++++++++++++++++++------- scripts/generic/qsub-wrapper.pl | 6 +- scripts/training/mert-moses.pl | 13 +++- 4 files changed, 125 insertions(+), 39 deletions(-) diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 8f70471c6..8a8b35d78 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1098,7 +1098,7 @@ sub draw_agenda_graph { print DOT "}\n"; close(DOT); my $graph_file = &steps_file("graph.$VERSION",$VERSION); - `dot -Tps $graph_file.dot >$graph_file.ps`; + `dot -Tps $graph_file.dot >$graph_file.ps 2>/dev/null`; `convert -alpha off $graph_file.ps $graph_file.png`; } @@ -1992,6 +1992,12 @@ sub define_tuning_tune { my $tune_inputtype = &backoff_and_get("TUNING:inputtype"); my $jobs = &backoff_and_get("TUNING:jobs"); my $decoder = &check_backoff_and_get("TUNING:decoder"); + my $cache_model = &backoff_and_get("GENERAL:cache-model"); + + if (defined($cache_model) && !($jobs && $jobs>1 && $CLUSTER)) { + $cmd .= "MOSES_INI=`$scripts/ems/support/cache-model.perl $config $cache_model`\n"; + $config = "\$MOSES_INI"; + } my $decoder_settings = &backoff_and_get("TUNING:decoder-settings"); $decoder_settings = "" unless $decoder_settings; @@ -2000,7 +2006,7 @@ sub define_tuning_tune { my $tuning_settings = &backoff_and_get("TUNING:tuning-settings"); $tuning_settings = "" unless $tuning_settings; - $cmd = "$tuning_script $input $reference $decoder $config --nbest $nbest_size --working-dir $tmp_dir --decoder-flags \"$decoder_settings\" --rootdir $scripts $tuning_settings --no-filter-phrase-table"; + $cmd .= "$tuning_script $input $reference $decoder $config --nbest $nbest_size --working-dir $tmp_dir --decoder-flags \"$decoder_settings\" --rootdir $scripts $tuning_settings --no-filter-phrase-table"; $cmd .= " --lambdas \"$lambda\"" if $lambda; $cmd .= " --continue" if $tune_continue; $cmd .= " --skip-decoder" if $skip_decoder; @@ -2009,6 +2015,7 @@ sub define_tuning_tune { my $qsub_args = &get_qsub_args($DO_STEP[$step_id]); $cmd .= " --queue-flags=\"$qsub_args\"" if 
($CLUSTER && $qsub_args); $cmd .= " --jobs $jobs" if $CLUSTER && $jobs && $jobs>1; + $cmd .= " --cache-model $cache_model" if $cache_model && $CLUSTER && $jobs && $jobs>1; my $tuning_dir = $tuned_config; $tuning_dir =~ s/\/[^\/]+$//; $cmd .= "\nmkdir -p $tuning_dir"; @@ -3190,12 +3197,6 @@ sub define_evaluation_decode { my $word_alignment = &backoff_and_get("TRAINING:include-word-alignment-in-rules"); my $post_decoding_transliteration = &get("TRAINING:post-decoding-transliteration"); - # If Transliteration Module is to be used as post-decoding step ... - if (defined($post_decoding_transliteration) && $post_decoding_transliteration eq "yes"){ - $settings .= " -output-unknowns $system_output.oov"; - } - - # specify additional output for analysis if (defined($report_precision_by_coverage) && $report_precision_by_coverage eq "yes") { $settings .= " -alignment-output-file $system_output.wa"; @@ -3224,8 +3225,16 @@ sub define_evaluation_decode { $input = $input_with_tags; } - # create command + # cache model on local disk my $cmd; + my $cache_model = &backoff_and_get("GENERAL:cache-model"); + if (defined($cache_model) && !($jobs && $jobs>1 && $CLUSTER)) { + my $scripts = &check_and_get("GENERAL:moses-script-dir"); + $cmd = "MOSES_INI=`$scripts/ems/support/cache-model.perl $config $cache_model`\n"; + $config = "\$MOSES_INI"; + } + + # create command my $nbest_size; $nbest_size = $nbest if $nbest; $nbest_size =~ s/[^\d]//g if $nbest; @@ -3241,6 +3250,7 @@ sub define_evaluation_decode { $cmd .= " -queue-parameters \"$qsub_args\"" if ($CLUSTER && $qsub_args); $cmd .= " -decoder $decoder"; $cmd .= " -config $config"; + $cmd .= " -cache-model $cache_model" if defined($cache_model); $cmd .= " -input-file $input"; $cmd .= " --jobs $jobs"; $cmd .= " -decoder-parameters \"$settings\" > $system_output"; @@ -3251,6 +3261,10 @@ sub define_evaluation_decode { $cmd .= " -n-best-list $system_output.best$nbest_size $nbest" if $nbest; } + # If Transliteration Module is to be used as post-decoding step ... + $cmd .= " -output-unknowns $system_output.oov" + if defined($post_decoding_transliteration) && $post_decoding_transliteration eq "yes"; + &create_step($step_id,$cmd); } diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl index 144b7d6b2..7d9d61658 100755 --- a/scripts/generic/moses-parallel.pl +++ b/scripts/generic/moses-parallel.pl @@ -20,6 +20,7 @@ use warnings; use strict; +use FindBin qw($RealBin); ####################### #Customizable parameters @@ -33,7 +34,7 @@ my $queueparameters=""; # etc. 
# look for the correct pwdcmd -my $pwdcmd = getPwdCmd(); +my $pwdcmd = &getPwdCmd(); my $workingdir = `$pwdcmd`; chomp $workingdir; my $tmpdir="$workingdir/tmp$$"; @@ -57,6 +58,7 @@ my $version=undef; my $help=0; my $dbg=0; my $jobs=4; +my $cache_model=undef; my $mosescmd="$ENV{MOSESBIN}/moses"; #decoder in use my $inputlist=undef; my $inputfile=undef; @@ -67,6 +69,9 @@ my $nbestfile=undef; my $oldnbestfile=undef; my $oldnbest=undef; my $nbestflag=0; +my $oovlist=undef; +my $oovfile=undef; +my $oovflag=0; my @wordgraphlist=(); my $wordgraphlist=undef; my $wordgraphfile=undef; @@ -94,6 +99,7 @@ sub init(){ 'help'=>\$help, 'debug'=>\$dbg, 'jobs=i'=>\$jobs, + 'cache-model=s'=>\$cache_model, 'decoder=s'=> \$mosescmd, 'robust=i' => \$robust, 'decoder-parameters=s'=> \$mosesparameters, @@ -105,6 +111,7 @@ sub init(){ 'n-best-size=i'=> \$oldnbest, 'output-search-graph|osg=s'=> \$searchgraphlist, 'output-word-graph|owg=s'=> \$wordgraphlist, + 'output-unknowns=s'=> \$oovlist, 'alignment-output-file=s'=> \$alifile, 'translation-details|T=s'=> \$detailsfile, 'qsub-prefix=s'=> \$qsubname, @@ -120,6 +127,8 @@ sub init(){ getWordGraphParameters(); + getOOVParameters(); + getLogParameters(); #print_parameters(); @@ -130,7 +139,7 @@ print STDERR "wordgraphflag:$wordgraphflag\n"; chomp($inputfile=`basename $inputlist`) if defined($inputlist); - $mosesparameters.="@ARGV -config $cfgfile -inputtype $inputtype"; + $mosesparameters.="@ARGV -inputtype $inputtype"; } @@ -162,6 +171,7 @@ sub usage(){ print STDERR "* -decoder Moses decoder to use\n"; print STDERR "* -i|inputfile|input-file the input text to translate\n"; print STDERR "* -jobs number of required jobs\n"; + print STDERR " -cache-model local directory for copying model files\n"; print STDERR " -logfile file where storing log files of all jobs\n"; print STDERR " -qsub-prefix name for sumbitte jobs\n"; print STDERR " -queue-parameters specific requirements for queue\n"; @@ -199,9 +209,11 @@ sub print_parameters(){ print STDERR "Configuration file: $cfgfile\n"; print STDERR "Decoder in use: $mosescmd\n"; print STDERR "Number of jobs:$jobs\n"; + print STDERR "Model cache directory: $cache_model\n" if ($cache_model); print STDERR "Nbest list: $nbestlist\n" if ($nbestflag); print STDERR "Output Search Graph: $searchgraphlist\n" if ($searchgraphflag); print STDERR "Output Word Graph: $wordgraphlist\n" if ($wordgraphflag); + print STDERR "Output OOV: $oovlist\n" if ($oovflag); print STDERR "LogFile:$logfile\n" if ($logflag); print STDERR "Qsub name: $qsubname\n"; print STDERR "Queue parameters: $queueparameters\n"; @@ -209,7 +221,7 @@ sub print_parameters(){ print STDERR "Inputtype: confusion network\n" if $inputtype == 1; print STDERR "Inputtype: lattices\n" if $inputtype == 2; - print STDERR "parameters directly passed to Moses: $mosesparameters\n"; + print STDERR "parameters directly passed to Moses: $mosesparameters -config $cfgfile\n"; } #get parameters for log file @@ -310,6 +322,19 @@ sub getWordGraphParameters(){ } } +sub getOOVParameters { + # only on command line + if ($oovlist) { + if ($oovlist eq "-") { + $oovfile = "oov"; + } + else { + chomp($oovfile = `basename $oovlist`); + } + $oovflag = 1; + } +} + ####################### #Script starts here @@ -436,7 +461,7 @@ grep(s/.+(\-\S+)$/$1/e,@idxlist); safesystem("mkdir -p $tmpdir") or die; -preparing_script(); +&preparing_script(); #launching process through the queue my @sgepids =(); @@ -483,18 +508,18 @@ while ($robust && scalar @idx_todo) { if ($old_sge) { # we need to implement our own waiting 
script my $syncscript = "${jobscript}.sync_workaround_script.sh"; - safesystem("echo 'date' > $syncscript") or kill_all_and_quit(); + safesystem("echo 'date' > $syncscript") or &kill_all_and_quit(); my $pwd = `$pwdcmd`; chomp $pwd; my $checkpointfile = "${jobscript}.sync_workaround_checkpoint"; # delete previous checkpoint, if left from previous runs - safesystem("\\rm -f $checkpointfile") or kill_all_and_quit(); + safesystem("\\rm -f $checkpointfile") or &kill_all_and_quit(); # start the 'hold' job, i.e. the job that will wait $cmd="qsub -cwd $queueparameters $hj -o $checkpointfile -e /dev/null -N $qsubname.W $syncscript 2> $qsubname.W.log"; - safesystem($cmd) or kill_all_and_quit(); + safesystem($cmd) or &kill_all_and_quit(); # and wait for checkpoint file to appear my $nr=0; @@ -504,15 +529,15 @@ while ($robust && scalar @idx_todo) { print STDERR "w" if $nr % 3 == 0; } print STDERR "End of waiting.\n"; - safesystem("\\rm -f $checkpointfile $syncscript") or kill_all_and_quit(); + safesystem("\\rm -f $checkpointfile $syncscript") or &kill_all_and_quit(); my $failure = 1; - my $nr = 0; + $nr = 0; while ($nr < 60 && $failure) { $nr ++; $failure=&check_exit_status(); if (!$failure) { - $failure = check_translation_old_sge(); + $failure = &check_translation_old_sge(); } last if !$failure; print STDERR "Extra wait ($nr) for possibly unfinished processes.\n"; @@ -521,55 +546,65 @@ while ($robust && scalar @idx_todo) { } else { # use the -sync option for qsub $cmd="qsub $queueparameters -sync y $hj -j y -o /dev/null -e /dev/null -N $qsubname.W -b y /bin/ls > $qsubname.W.log"; - safesystem($cmd) or kill_all_and_quit(); + safesystem($cmd) or &kill_all_and_quit(); $failure=&check_exit_status(); } - kill_all_and_quit() if $failure && !$robust; + &kill_all_and_quit() if $failure && !$robust; # check if some translations failed - my @idx_still_todo = check_translation(); + my @idx_still_todo = &check_translation(); if ($robust) { # if robust, redo crashed jobs if ((scalar @idx_still_todo) == (scalar @idxlist)) { # ... 
but not if all crashed print STDERR "everything crashed, not trying to resubmit jobs\n"; $robust = 0; - kill_all_and_quit(); + &kill_all_and_quit(); } @idx_todo = @idx_still_todo; } else { if (scalar (@idx_still_todo)) { print STDERR "some jobs crashed: ".join(" ",@idx_still_todo)."\n"; - kill_all_and_quit(); + &kill_all_and_quit(); } } } #concatenating translations and removing temporary files -concatenate_1best(); -concatenate_logs() if $logflag; -concatenate_ali() if defined $alifile; -concatenate_details() if defined $detailsfile; -concatenate_nbest() if $nbestflag; +&concatenate_1best(); +&concatenate_logs() if $logflag; +&concatenate_ali() if defined $alifile; +&concatenate_details() if defined $detailsfile; +&concatenate_nbest() if $nbestflag; safesystem("cat nbest$$ >> /dev/stdout") if $nbestlist[0] eq '-'; -concatenate_searchgraph() if $searchgraphflag; +&concatenate_searchgraph() if $searchgraphflag; safesystem("cat searchgraph$$ >> /dev/stdout") if $searchgraphlist eq '-'; -concatenate_wordgraph() if $wordgraphflag; +&concatenate_wordgraph() if $wordgraphflag; safesystem("cat wordgraph$$ >> /dev/stdout") if $wordgraphlist[0] eq '-'; -remove_temporary_files(); +&concatenate_oov() if $oovflag; +safesystem("cat oov$$ >> /dev/stdout") if $oovlist eq '-'; + +&remove_temporary_files(); #script creation sub preparing_script(){ my $currStartTranslationId = 0; + my $possibly_modified_cfgfile = $cfgfile; + my $cache_model_cmd = ""; + if ($cache_model) { + $cache_model_cmd = "MOSES_INI=`$RealBin/../ems/support/cache-model.perl $cfgfile $cache_model`\n"; + $possibly_modified_cfgfile = "\$MOSES_INI"; + } + foreach my $idx (@idxlist){ my $scriptheader=""; $scriptheader.="\#\! /bin/bash\n\n"; @@ -594,6 +629,10 @@ sub preparing_script(){ open (OUT, "> ${jobscript}${idx}.bash"); print OUT $scriptheader; + + # copy model files into local directory + print OUT $cache_model_cmd; + my $inputmethod = $feed_moses_via_stdin ? 
"<" : "-input-file"; my $tmpnbestlist=""; @@ -623,9 +662,14 @@ sub preparing_script(){ $tmpwordgraphlist="-output-word-graph $tmpdir/$wordgraphfile.$splitpfx$idx $wordgraphlist[1]"; } + my $tmpoovlist=""; + if ($oovflag){ + $tmpoovlist="-output-unknowns $tmpdir/$oovfile.$splitpfx$idx"; + } + my $tmpStartTranslationId = ""; # "-start-translation-id $currStartTranslationId"; - print OUT "$mosescmd $mosesparameters $tmpStartTranslationId $tmpalioutfile $tmpdetailsoutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n"; + print OUT "$mosescmd $mosesparameters -config $possibly_modified_cfgfile $tmpStartTranslationId $tmpalioutfile $tmpdetailsoutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpoovlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n"; print OUT "echo exit status \$\?\n\n"; if (defined $alifile){ @@ -644,11 +688,14 @@ sub preparing_script(){ print OUT "\\mv -f $tmpdir/${searchgraphfile}.$splitpfx$idx .\n\n"; print OUT "echo exit status \$\?\n\n"; } - if ($wordgraphflag){ print OUT "\\mv -f $tmpdir/${wordgraphfile}.$splitpfx$idx .\n\n"; print OUT "echo exit status \$\?\n\n"; } + if ($oovflag){ + print OUT "\\mv -f $tmpdir/${oovfile}.$splitpfx$idx .\n\n"; + print OUT "echo exit status \$\?\n\n"; + } print OUT "\\mv -f $tmpdir/${inputfile}.$splitpfx$idx.trans .\n\n"; print OUT "echo exit status \$\?\n\n"; @@ -840,6 +887,20 @@ sub concatenate_1best(){ } } +sub concatenate_oov(){ + my $outoov=$oovlist; + if ($oovlist eq '-'){ $outoov="oov$$"; } + open (OUT, "> $outoov"); + foreach my $idx (@idxlist){ + my @in=(); + open (IN, "${oovfile}.${splitpfx}${idx}"); + @in=; + print OUT "@in"; + close(IN); + } + close(OUT); +} + sub concatenate_logs(){ open (OUT, "> ${logfile}"); foreach my $idx (@idxlist){ @@ -978,6 +1039,7 @@ sub remove_temporary_files(){ if ($nbestflag){ unlink("${nbestfile}.${splitpfx}${idx}"); } if ($searchgraphflag){ unlink("${searchgraphfile}.${splitpfx}${idx}"); } if ($wordgraphflag){ unlink("${wordgraphfile}.${splitpfx}${idx}"); } + if ($oovfile){ unlink("${oovfile}.${splitpfx}${idx}"); } unlink("${jobscript}${idx}.bash"); unlink("${jobscript}${idx}.log"); unlink("$qsubname.W.log"); @@ -988,6 +1050,7 @@ sub remove_temporary_files(){ if ($nbestflag && $nbestlist[0] eq '-'){ unlink("${nbestfile}$$"); }; if ($searchgraphflag && $searchgraphlist eq '-'){ unlink("${searchgraphfile}$$"); }; if ($wordgraphflag && $wordgraphlist eq '-'){ unlink("${wordgraphfile}$$"); }; + if ($oovflag && $oovlist eq '-'){ unlink("oov$$"); }; } sub safesystem { diff --git a/scripts/generic/qsub-wrapper.pl b/scripts/generic/qsub-wrapper.pl index ef9938e07..c282b0600 100755 --- a/scripts/generic/qsub-wrapper.pl +++ b/scripts/generic/qsub-wrapper.pl @@ -14,7 +14,7 @@ use strict; my $queueparameters=""; # look for the correct pwdcmd -my $pwdcmd = getPwdCmd(); +my $pwdcmd = &getPwdCmd(); my $workingdir = `$pwdcmd`; chomp $workingdir; my $tmpdir="$workingdir/tmp$$"; @@ -109,14 +109,14 @@ else fi "; - if (defined $cmdout){ + if (defined($cmdout) && $cmdout ne "/dev/null") { print OUT "mv -f $tmpdir/cmdout$$ $cmdout || echo failed to preserve the log: $tmpdir/cmdout$$\n\n"; } else{ print OUT "rm -f $tmpdir/cmdout$$\n\n"; } - if (defined $cmderr){ + if (defined($cmderr) && $cmderr ne "/dev/null") { print OUT "mv -f $tmpdir/cmderr$$ $cmderr || echo failed to preserve the log: $tmpdir/cmderr$$\n\n"; } else{ diff --git a/scripts/training/mert-moses.pl 
b/scripts/training/mert-moses.pl index c73e75a87..7bb499cdf 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -81,6 +81,7 @@ my $___LATTICE_SAMPLES = 0; my $queue_flags = "-hard"; # extra parameters for parallelizer # the -l ws0ssmt was relevant only to JHU 2006 workshop my $___JOBS = undef; # if parallel, number of jobs to use (undef or <= 0 -> serial) +my $___CACHE_MODEL = undef; # if models need to be copied to local disk from NFS my $___DECODER_FLAGS = ""; # additional parametrs to pass to the decoder my $continue = 0; # should we try to continue from the last saved step? my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert) @@ -183,6 +184,7 @@ GetOptions( "lattice-samples=i" => \$___LATTICE_SAMPLES, "queue-flags=s" => \$queue_flags, "jobs=i" => \$___JOBS, + "cache-model=s" => \$___CACHE_MODEL, "decoder-flags=s" => \$___DECODER_FLAGS, "continue" => \$continue, "skip-decoder" => \$skip_decoder, @@ -245,6 +247,7 @@ Options: --nbest=100 ... how big nbestlist to generate --lattice-samples ... how many lattice samples (Chatterjee & Cancedda, emnlp 2010) --jobs=N ... set this to anything to run moses in parallel + --cache-model=STRING ... local directory into which copy model before running decoder --mosesparallelcmd=STR ... use a different script instead of moses-parallel --queue-flags=STRING ... anything you with to pass to qsub, eg. '-l ws06osssmt=true'. The default is: '-hard' @@ -1143,7 +1146,7 @@ if($___RETURN_BEST_DEV) { } my $cmd = "$mert_eval_cmd --reference " . join(",", @references) . " $mert_extract_args $candidate"; $cmd .= " -l $__REMOVE_SEGMENTATION" if defined( $__PROMIX_TRAINING); - safesystem("$cmd 2> /dev/null 1> $evalout"); + &submit_or_exec($cmd, $evalout, "/dev/null", 1); open my $fh, '<', $evalout or die "Can't read $evalout : $!"; my $bleu = <$fh>; chomp $bleu; @@ -1291,6 +1294,7 @@ sub run_decoder { die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA; $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG"; $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE); + $decoder_cmd .= " -cache-model $___CACHE_MODEL" if defined($___CACHE_MODEL); $decoder_cmd .= " -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE distinct\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out"; } else { my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE distinct"; @@ -1380,7 +1384,12 @@ sub get_featlist_from_moses { print STDERR "Using cached features list: $featlistfn\n"; } else { print STDERR "Asking moses for feature names and values from $___CONFIG\n"; - my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn"; + my $cmd; + if ($___CACHE_MODEL) { + $cmd = "MOSES_INI=`$SCRIPTS_ROOTDIR/ems/support/cache-model.perl $configfn $___CACHE_MODEL` && "; + $configfn = "\$MOSES_INI"; + } + $cmd .= "$___DECODER $___DECODER_FLAGS -config $configfn"; $cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE); $cmd .= " -show-weights"; print STDERR "Executing: $cmd\n"; From a5ee3c1b6deffdd96fe5d26b85c53ed52e8a50f9 Mon Sep 17 00:00:00 2001 From: Philipp Koehn Date: Wed, 29 Jul 2015 11:10:13 -0400 Subject: [PATCH 191/286] script to copy model files to local disk before running the decoder - useful for grid --- scripts/ems/support/cache-model.perl | 115 +++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) 
create mode 100755 scripts/ems/support/cache-model.perl diff --git a/scripts/ems/support/cache-model.perl b/scripts/ems/support/cache-model.perl new file mode 100755 index 000000000..f4c7e9ef6 --- /dev/null +++ b/scripts/ems/support/cache-model.perl @@ -0,0 +1,115 @@ +#!/usr/bin/perl -w +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +# utility script for deploying decode (may be within tune) jobs over a cluster +# with NFS-mounted drives. copy all the model files to local disk. + +use strict; + +die("ERROR: syntax is cache-model.perl moses.ini cache-dir") + unless scalar @ARGV == 2; +my ($CONFIG,$CACHE_DIR) = @ARGV; + +# create dir (if nor already there) +`mkdir -p $CACHE_DIR`; + +# name for new config file +my $cached_config = $CONFIG; +$cached_config =~ s/\//_/g; +$cached_config = "$CACHE_DIR/$cached_config"; + +# lock / already +while(-e "$cached_config.lock") { + sleep(10); +} +my $just_update_timestamps = (-e $cached_config); +`touch $cached_config.lock` unless $just_update_timestamps; + +# find files to cache (and produce new config) +open(OLD,$CONFIG) || die("ERROR: could not open config '$CONFIG'"); +open(NEW,">$cached_config") unless $just_update_timestamps; +while() { + if (/(PhraseDictionary.+ path=)(\S+)(.*)$/ || + /(LexicalReordering.+ path=)(\S+)(.*)$/ || + /(Generation.+ path=)(\S+)(.*)$/ || + /(OpSequenceModel.+ path=)(\S+)(.*)$/ || + /(KENLM.+ path=)(\S+)(.*)$/) { + my ($pre,$path,$post) = ($1,$2,$3); + my $new_path; + if (/^PhraseDictionaryCompact/) { + $new_path = &cache_file($path,".minphr"); + } + elsif (/^PhraseDictionaryBinary/) { + foreach my $suffix (".binphr.idx",".binphr.srctree.wa",".binphr.srcvoc",".binphr.tgtdata.wa",".binphr.tgtvoc") { + $new_path = &cache_file($path,$suffix); + } + } + elsif (/^LexicalReordering/ && -e "$path.minlexr") { + $new_path = &cache_file($path,".minlexr"); + } + elsif (/^LexicalReordering/ && -e "$path.binlexr.idx") { + foreach my $suffix (".binlexr.idx",".binlexr.srctree",".binlexr.tgtdata",".binlexr.voc0",".binlexr.voc1") { + $new_path = &cache_file($path,$suffix); + } + } + # some other files may need some more special handling + # but this works for me right now. feel free to add + else { + $new_path = &cache_file($path,""); + } + print NEW "$pre$new_path$post\n" unless $just_update_timestamps; + } + else { + print NEW $_ unless $just_update_timestamps; + } +} +close(NEW) unless $just_update_timestamps; +close(OLD); + +`rm $cached_config.lock` unless $just_update_timestamps; +print "$cached_config\n"; + +sub cache_file { + my ($path,$suffix) = @_; + + # add gzipped extension if that's what it is + if (! -e "$path$suffix" && -e "$path$suffix.gz") { + $suffix .= ".gz"; + } + + # file does not exist... nothing to do + if (! -e "$path$suffix") { + print STDERR "WARINING: $path$suffix does not exist - cannot be cached by cache-model.perl\n"; + return $path; + } + + # follow symbolic link + my $uniq_path = `readlink -f $path$suffix`; + chop($uniq_path); + + # create cached file name + my $cached_path = $uniq_path; + $cached_path = substr($cached_path,0,length($cached_path)-length($suffix)); + $cached_path =~ s/\//_/g; + $cached_path = "$CACHE_DIR/$cached_path"; + + # sleep if another process is copying right now... 
+ while(-e "$cached_path$suffix.lock") { + sleep(10); + } + # done if already there + if (-e "$cached_path$suffix") { + `touch $cached_path$suffix`; # update time stamp + return $cached_path; + } + + # okay, go for it + `touch $cached_path$suffix.lock`; + `cp $path$suffix $cached_path$suffix`; + `rm $cached_path$suffix.lock`; + + return $cached_path; +} + From 89d16a491a022d78466893ed78e53fb587864eba Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Fri, 31 Jul 2015 11:20:29 +0100 Subject: [PATCH 192/286] fix ems regression (concatenate-split step) --- scripts/ems/experiment.meta | 2 +- scripts/training/wrappers/parse-en-stanford.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index b9adb83f9..ffc71a3c0 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -259,7 +259,7 @@ post-split-factorize default-name: lm/split-factored rerun-on-change: TRAINING:input-factors TRAINING:output-factors pass-unless: factorize-after-split - ignore-if: concatenate-files concatenate-files-split + ignore-if: concatenate-files parallelizable: yes error: can't open error: incompatible number of words in factor diff --git a/scripts/training/wrappers/parse-en-stanford.py b/scripts/training/wrappers/parse-en-stanford.py index f77a2d92e..db406ff8c 100755 --- a/scripts/training/wrappers/parse-en-stanford.py +++ b/scripts/training/wrappers/parse-en-stanford.py @@ -71,7 +71,7 @@ def process_stanford(infile, javacmd, stanfordpath): '-textFile', '-', 'outFile', '-', ], - stdin=infile, stdout=PIPE, stderr=open('/dev/null', 'w')) + stdin=infile, stdout=PIPE) return stanford.stdout From 4359490e97e066d05de09188cfaabfe292f56075 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 31 Jul 2015 13:09:40 +0000 Subject: [PATCH 193/286] rm bin/moses_chart => rm -f bin/moses_chart. --- Jamroot | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jamroot b/Jamroot index 30c979766..2e8075916 100644 --- a/Jamroot +++ b/Jamroot @@ -301,5 +301,5 @@ if [ path.exists $(TOP)/dist ] && $(prefix) != dist { #local temp = [ _shell "bash source ./s.sh" ] ; local temp = [ _shell "mkdir -p $(TOP)/bin" ] ; -local temp = [ _shell "rm $(TOP)/bin/moses_chart" ] ; +local temp = [ _shell "rm -f $(TOP)/bin/moses_chart" ] ; local temp = [ _shell "cd $(TOP)/bin && ln -s moses moses_chart" ] ; From ecfc8d8b1a0b0dd385d9391624586176acd0f2bc Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 31 Jul 2015 17:06:31 +0100 Subject: [PATCH 194/286] Logging in query_bias_server(). 
--- .../TranslationModel/UG/mm/ug_sampling_bias.cc | 17 +++++++++++------ moses/TranslationModel/UG/mm/ug_sampling_bias.h | 5 ++++- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index 37e114ff1..7bfc488ec 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -22,17 +22,21 @@ namespace Moses using ugdiss::id_type; std::string - query_bias_server(std::string const& server, std::string const& context) + query_bias_server(std::string const& server, + std::string const& context, + ostream* log) { std::string query = server+uri_encode(context); boost::asio::io_service io_service; Moses::http_client c(io_service, query); io_service.run(); - - // std::string response = c.content(); - // std::cerr << "SERVER RESPONSE: " << response << std::endl; + + if (log) + { + std::string response = c.content(); + *log << "SERVER RESPONSE: " << response << std::endl; + } UTIL_THROW_IF2(c.content().size() == 0, "No response from bias server!"); - return c.content(); } // #endif @@ -62,7 +66,8 @@ namespace Moses Timer timer; if (_log) timer.start(NULL); #endif - std::string json = query_bias_server(server_url, text); + std::string json = query_bias_server(server_url, text, _log); + // std::cerr << "SERVER RESPONSE " << json << std::endl; init_from_json(json, docname2docid, log); #ifndef NO_MOSES diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index 351d7be8c..8e61dc81f 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -13,7 +13,10 @@ namespace Moses { using ugdiss::id_type; - std::string query_bias_server(std::string const& url, std::string const& text); + std::string + query_bias_server(std::string const& url, + std::string const& text, + ostream* log); class SamplingBias { From bfeb3bf66bab44dcb5321edbac4cd467e4d8c49e Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 31 Jul 2015 17:07:13 +0100 Subject: [PATCH 195/286] Reporting of context parameter in TranslationRequest constructor. --- moses/server/TranslationRequest.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index 5c23bbf13..72cd2b405 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -275,6 +275,7 @@ parse_request(std::map const& params) si = params.find("context"); if (si != params.end()) { string context = xmlrpc_c::value_string(si->second); + VERBOSE(1,"CONTEXT " << context); m_context.reset(new std::vector(1,context)); } // // biased sampling for suffix-array-based sampling phrase table? 
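Note on the "context" parameter reported above: it reaches parse_request() as one member of the XML-RPC parameter struct of the translate call. The sketch below shows what a client-side parameter map could look like; only the key "context" is taken from the code above, while the "text" key, the sample strings, and the function name are assumptions.

    #include <map>
    #include <string>
    #include <utility>
    #include <xmlrpc-c/base.hpp>

    void example_translate_params()
    {
      std::map<std::string, xmlrpc_c::value> params;
      params.insert(std::make_pair(std::string("text"),
          xmlrpc_c::value(xmlrpc_c::value_string("il fait beau"))));
      params.insert(std::make_pair(std::string("context"),
          xmlrpc_c::value(xmlrpc_c::value_string("weather forecast"))));
      xmlrpc_c::value_struct request(params);  // struct sent to the server
      (void) request;
      // parse_request() finds "context", prints "CONTEXT <...>" when the
      // verbosity level is at least 1, and keeps it in m_context so the
      // suffix-array phrase table can bias its sampling accordingly.
    }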
From f894dec0fd8d5b15eb16c35d3d2599338894ee9d Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 31 Jul 2015 22:28:45 +0400 Subject: [PATCH 196/286] multi-threaded decoding by default /Vincent Nguyen --- scripts/ems/example/config.basic | 4 ++-- scripts/ems/example/config.factored | 4 ++-- scripts/ems/example/config.hierarchical | 4 ++-- scripts/ems/example/config.syntax | 4 ++-- scripts/ems/example/config.toy | 4 ++-- scripts/ems/example/config.toy.bilinguallm | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/scripts/ems/example/config.basic b/scripts/ems/example/config.basic index 7d545121a..648091707 100644 --- a/scripts/ems/example/config.basic +++ b/scripts/ems/example/config.basic @@ -520,7 +520,7 @@ filter-settings = "" ### additional flags for the decoder # -decoder-settings = "" +decoder-settings = "-threads all" ### if tuning should be skipped, specify this here # and also point to a configuration file that contains @@ -589,7 +589,7 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl # "-drop-unknown" for dropping unknown source words # "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning # -decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" +decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads all" ### specify size of n-best list, if produced # diff --git a/scripts/ems/example/config.factored b/scripts/ems/example/config.factored index 6675b4e8c..586a90355 100644 --- a/scripts/ems/example/config.factored +++ b/scripts/ems/example/config.factored @@ -539,7 +539,7 @@ filter-settings = "" ### additional flags for the decoder # -decoder-settings = "" +decoder-settings = "-threads all" ### if tuning should be skipped, specify this here # and also point to a configuration file that contains @@ -608,7 +608,7 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl # "-drop-unknown" for dropping unknown source words # "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning # -decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" +decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads all" ### specify size of n-best list, if produced # diff --git a/scripts/ems/example/config.hierarchical b/scripts/ems/example/config.hierarchical index d9f54d153..5d2770a71 100644 --- a/scripts/ems/example/config.hierarchical +++ b/scripts/ems/example/config.hierarchical @@ -521,7 +521,7 @@ filter-settings = "" ### additional flags for the decoder # -decoder-settings = "" +decoder-settings = "-threads all" ### if tuning should be skipped, specify this here # and also point to a configuration file that contains @@ -590,7 +590,7 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl # "-drop-unknown" for dropping unknown source words # "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning # -#decoder-settings = "" +decoder-settings = "-threads all" ### specify size of n-best list, if produced # diff --git a/scripts/ems/example/config.syntax b/scripts/ems/example/config.syntax index 4de80b0e0..570e4c9ef 100644 --- a/scripts/ems/example/config.syntax +++ b/scripts/ems/example/config.syntax @@ -525,7 +525,7 @@ filter-settings = "" ### additional flags for the decoder # -decoder-settings = "" +decoder-settings = "-threads all" ### if tuning should be skipped, specify this here # and also point to a configuration file that contains @@ -594,7 +594,7 @@ trainer = 
$moses-script-dir/recaser/train-truecaser.perl # "-drop-unknown" for dropping unknown source words # "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning # -#decoder-settings = "" +decoder-settings = "-threads all" ### specify size of n-best list, if produced # diff --git a/scripts/ems/example/config.toy b/scripts/ems/example/config.toy index ddc6d65df..193b26f8e 100644 --- a/scripts/ems/example/config.toy +++ b/scripts/ems/example/config.toy @@ -503,7 +503,7 @@ filter-settings = "" ### additional flags for the decoder # -decoder-settings = "" +decoder-settings = "-threads all" ### if tuning should be skipped, specify this here # and also point to a configuration file that contains @@ -568,7 +568,7 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl # "-drop-unknown" for dropping unknown source words # "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning # -decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" +decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads all" ### specify size of n-best list, if produced # diff --git a/scripts/ems/example/config.toy.bilinguallm b/scripts/ems/example/config.toy.bilinguallm index fbc85866e..8f7e04029 100644 --- a/scripts/ems/example/config.toy.bilinguallm +++ b/scripts/ems/example/config.toy.bilinguallm @@ -519,7 +519,7 @@ filter-settings = "" ### additional flags for the decoder # -decoder-settings = "" +decoder-settings = "-threads all" ### if tuning should be skipped, specify this here # and also point to a configuration file that contains @@ -584,7 +584,7 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl # "-drop-unknown" for dropping unknown source words # "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning # -decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" +decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads all" ### specify size of n-best list, if produced # From 2090640d4d5675712511564779eecfe346420b9e Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 31 Jul 2015 23:16:02 +0000 Subject: [PATCH 197/286] xmlrpc-c server log file now defaults to /dev/null instead of empty string --- moses/ExportInterface.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp index d3933ae6a..3c9e334da 100644 --- a/moses/ExportInterface.cpp +++ b/moses/ExportInterface.cpp @@ -152,7 +152,7 @@ run_as_server() bool isSerial; params.SetParameter(isSerial, "serial", false); string logfile; - params.SetParameter(logfile, "server-log", string("")); + params.SetParameter(logfile, "server-log", string("/dev/null")); size_t num_threads; params.SetParameter(num_threads, "threads", size_t(10)); if (isSerial) VERBOSE(1,"Running server in serial mode." << endl); From 3439de634102224894ca53ccaff85a9b8641c6cb Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 31 Jul 2015 23:18:08 +0000 Subject: [PATCH 198/286] Bug fix in uri_encode. Logging of lookup in http_client constructor. 
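The uri_encode bug: printf-style "%x" does not zero-pad, so any byte in the range 0x01-0x0f was emitted as a single hex digit, e.g. "%a" for a newline instead of "%0a", which is not a valid percent-escape and can make the bias server reject or misparse the query. The fixed format "%02x" always yields two digits. A stand-alone illustration (not code from the tree):

    #include <cstdio>

    int main()
    {
      char buf[8];
      std::sprintf(buf, "%%%x", 0x0a);     // old format: buf == "%a"  (broken)
      std::printf("%s\n", buf);
      std::sprintf(buf, "%%%02x", 0x0a);   // new format: buf == "%0a" (valid)
      std::printf("%s\n", buf);
      return 0;
    }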
--- .../TranslationModel/UG/mm/ug_http_client.cc | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_http_client.cc b/moses/TranslationModel/UG/mm/ug_http_client.cc index da8537910..ff2a0a1c6 100644 --- a/moses/TranslationModel/UG/mm/ug_http_client.cc +++ b/moses/TranslationModel/UG/mm/ug_http_client.cc @@ -1,5 +1,7 @@ #include "ug_http_client.h" #include "moses/Util.h" +#include + namespace Moses { using boost::asio::ip::tcp; @@ -17,7 +19,7 @@ http_client(boost::asio::io_service& io_service, } http_client:: -http_client(boost::asio::io_service& io_service, std::string url) +http_client(boost::asio::io_service& io_service, std::string url, std::ostream* log) : resolver_(io_service), socket_(io_service) { std::string server; @@ -36,11 +38,14 @@ http_client(boost::asio::io_service& io_service, std::string url) server = url.substr(0,p); if (q < url.size()) path = url.substr(q); -#if 0 - std::cerr << HERE << std::endl; - std::cerr << "SERVER " << server << std::endl; - std::cerr << "PORT |" << port << "|" << std::endl; - std::cerr << "PATH " << path << std::endl; +#if 1 + if (log) + { + *log << HERE << std::endl; + *log << "SERVER " << server << std::endl; + *log << "PORT " << port << "" << std::endl; + *log << "PATH " << path << std::endl; + } #endif init(server, port, path); } @@ -204,12 +209,12 @@ uri_encode(std::string const& in) // cout << *c << " " << int(*c) << endl; if (*c == ' ') buf[i++] = '+'; else if (*c == '.' || *c == '~' || *c == '_' || *c == '-') buf[i++] = *c; - else if (*c < '0') i += sprintf(buf+i, "%%%x", int(*c)); + else if (*c < '0') i += sprintf(buf+i, "%%%02x", int(*c)); else if (*c <= '9') buf[i++] = *c; - else if (*c < 'A') i += sprintf(buf+i, "%%%x", int(*c)); + else if (*c < 'A') i += sprintf(buf+i, "%%%02x", int(*c)); else if (*c <= 'Z') buf[i++] = *c; - else if (*c < 'a') i += sprintf(buf+i, "%%%x", int(*c)); - else if (*c <= 'z') buf[i++] = *c; + else if (*c < 'a') i += sprintf(buf+i, "%%%02x", int(*c)); + else if (*c <= 'z') buf[i++] = *c; else i += sprintf(buf+i, "%%%x", int(*c)); } buf[i] = 0; From 362f26d5eba09cc8bdf1a40602e252863e78ccfd Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 31 Jul 2015 23:19:03 +0000 Subject: [PATCH 199/286] Logging in constructor of http_client. Member function to return server error message. --- moses/TranslationModel/UG/mm/ug_http_client.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mm/ug_http_client.h b/moses/TranslationModel/UG/mm/ug_http_client.h index 825c0c37e..ece6a58fe 100644 --- a/moses/TranslationModel/UG/mm/ug_http_client.h +++ b/moses/TranslationModel/UG/mm/ug_http_client.h @@ -33,7 +33,7 @@ class http_client std::ostringstream m_error; public: - http_client(boost::asio::io_service& io_service, std::string url); + http_client(boost::asio::io_service& io_service, std::string url, std::ostream* log); http_client(boost::asio::io_service& io_service, std::string const& server, std::string const& port, @@ -58,6 +58,7 @@ private: boost::asio::streambuf response_; public: std::string content() const; + std::string error_msg() const { return m_error.str(); } }; } From fd93df3b8f97d1173e430f94c0f0be2552cf58f7 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 31 Jul 2015 23:21:01 +0000 Subject: [PATCH 200/286] Added virtual deconstructor. 
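Rationale, shown with a stand-alone example using generic class names rather than the ones in the tree: when a derived object is destroyed through a pointer to its base class, the base class needs a virtual destructor, otherwise the behaviour is undefined and the derived destructor never runs.

    #include <cstdio>

    struct Scorer {                      // stands in for PhraseScorer & co.
      virtual void score() = 0;
      virtual ~Scorer() {}               // what this patch adds
    };

    struct ForwardScorer : Scorer {
      void score() {}
      ~ForwardScorer() { std::puts("~ForwardScorer ran"); }
    };

    int main()
    {
      Scorer* s = new ForwardScorer();
      s->score();
      delete s;    // well-defined, and ~ForwardScorer() runs, only because
      return 0;    // ~Scorer() is virtual
    }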
--- moses/TranslationModel/UG/sapt_pscore_base.h | 2 ++ moses/TranslationModel/UG/sapt_pscore_pbwd.h | 1 + moses/TranslationModel/UG/sapt_pscore_pfwd.h | 1 + moses/TranslationModel/UG/sapt_pscore_provenance.h | 1 + 4 files changed, 5 insertions(+) diff --git a/moses/TranslationModel/UG/sapt_pscore_base.h b/moses/TranslationModel/UG/sapt_pscore_base.h index 388c83d9b..8e8e3852d 100644 --- a/moses/TranslationModel/UG/sapt_pscore_base.h +++ b/moses/TranslationModel/UG/sapt_pscore_base.h @@ -22,6 +22,8 @@ namespace Moses { vector m_feature_names; public: + + virtual ~PhraseScorer() {} virtual void operator()(Bitext const& pt, diff --git a/moses/TranslationModel/UG/sapt_pscore_pbwd.h b/moses/TranslationModel/UG/sapt_pscore_pbwd.h index 2cbe58209..81a97f59f 100644 --- a/moses/TranslationModel/UG/sapt_pscore_pbwd.h +++ b/moses/TranslationModel/UG/sapt_pscore_pbwd.h @@ -17,6 +17,7 @@ namespace Moses { string denom; public: + virtual ~PScorePbwd(){}; PScorePbwd(float const c, string d) { this->m_index = -1; diff --git a/moses/TranslationModel/UG/sapt_pscore_pfwd.h b/moses/TranslationModel/UG/sapt_pscore_pfwd.h index 95956e861..23f7a2abd 100644 --- a/moses/TranslationModel/UG/sapt_pscore_pfwd.h +++ b/moses/TranslationModel/UG/sapt_pscore_pfwd.h @@ -18,6 +18,7 @@ namespace Moses { public: + virtual ~PScorePfwd(){}; PScorePfwd(float const c, string d) { this->m_index = -1; diff --git a/moses/TranslationModel/UG/sapt_pscore_provenance.h b/moses/TranslationModel/UG/sapt_pscore_provenance.h index ee7b08bda..388ee75ec 100644 --- a/moses/TranslationModel/UG/sapt_pscore_provenance.h +++ b/moses/TranslationModel/UG/sapt_pscore_provenance.h @@ -18,6 +18,7 @@ namespace Moses { { public: + virtual ~PScoreProvenance() {} PScoreProvenance(string const& spec) { this->m_tag = "prov"; From 51bc36d131128d103aeecda4204d75aa95411247 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 31 Jul 2015 23:21:34 +0000 Subject: [PATCH 201/286] Report bias server errors. 
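The behavioural change: an empty reply from the bias server no longer triggers UTIL_THROW_IF2; the error message is written to the log, if one was passed in, and an empty string is returned. Callers therefore see "" when the server is unreachable. A hypothetical caller illustrating the resulting contract (load_bias and its arguments are made up; query_bias_server and init_from_json are the functions touched by this series):

    #include <iostream>
    #include <string>
    #include "moses/TranslationModel/UG/mm/ug_sampling_bias.h"

    void load_bias(std::string const& server_url, std::string const& text,
                   std::ostream* log)
    {
      std::string json = Moses::query_bias_server(server_url, text, log);
      if (json.empty()) {
        // bias server down or silent: continue without a document bias
        // instead of aborting the whole translation request
        return;
      }
      // otherwise hand the JSON to init_from_json() as before
    }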
--- moses/TranslationModel/UG/mm/ug_sampling_bias.cc | 10 +++++++--- moses/TranslationModel/UG/mm/ug_sampling_bias.h | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index 7bfc488ec..c67b2cd09 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -24,11 +24,11 @@ namespace Moses std::string query_bias_server(std::string const& server, std::string const& context, - ostream* log) + std::ostream* log) { std::string query = server+uri_encode(context); boost::asio::io_service io_service; - Moses::http_client c(io_service, query); + Moses::http_client c(io_service, query, log); io_service.run(); if (log) @@ -36,7 +36,11 @@ namespace Moses std::string response = c.content(); *log << "SERVER RESPONSE: " << response << std::endl; } - UTIL_THROW_IF2(c.content().size() == 0, "No response from bias server!"); + if (c.content().size() == 0) + { + if (log) *log << "BIAS SERVER ERROR: " << c.error_msg() << std::endl; + // UTIL_THROW_IF2(c.content().size() == 0, "No response from bias server!"); + } return c.content(); } // #endif diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index 8e61dc81f..bbdadc62f 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -16,7 +16,7 @@ namespace Moses std::string query_bias_server(std::string const& url, std::string const& text, - ostream* log); + std::ostream* log); class SamplingBias { From bfd45fdfc33390495be176b478e0d346850931d6 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sat, 1 Aug 2015 11:35:47 +0400 Subject: [PATCH 202/286] don't use all threads --- scripts/ems/example/config.basic | 6 +++--- scripts/ems/example/config.factored | 6 +++--- scripts/ems/example/config.hierarchical | 6 +++--- scripts/ems/example/config.syntax | 6 +++--- scripts/ems/example/config.toy | 6 +++--- scripts/ems/example/config.toy.bilinguallm | 6 +++--- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/scripts/ems/example/config.basic b/scripts/ems/example/config.basic index 648091707..60a4f1a84 100644 --- a/scripts/ems/example/config.basic +++ b/scripts/ems/example/config.basic @@ -86,7 +86,7 @@ detruecaser = $moses-script-dir/recaser/detruecase.perl ### multi-core settings # when the generic parallelizer is used, the number of cores # specified here -cores = 8 +cores = 4 ################################################################# # PARALLEL CORPUS PREPARATION: @@ -520,7 +520,7 @@ filter-settings = "" ### additional flags for the decoder # -decoder-settings = "-threads all" +decoder-settings = "-threads $cores" ### if tuning should be skipped, specify this here # and also point to a configuration file that contains @@ -589,7 +589,7 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl # "-drop-unknown" for dropping unknown source words # "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning # -decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads all" +decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads $cores" ### specify size of n-best list, if produced # diff --git a/scripts/ems/example/config.factored b/scripts/ems/example/config.factored index 586a90355..dd4227ae9 100644 --- a/scripts/ems/example/config.factored +++ b/scripts/ems/example/config.factored @@ 
-86,7 +86,7 @@ detruecaser = $moses-script-dir/recaser/detruecase.perl ### multi-core settings # when the generic parallelizer is used, the number of cores # specified here -cores = 8 +cores = 4 ################################################################# # PARALLEL CORPUS PREPARATION: @@ -539,7 +539,7 @@ filter-settings = "" ### additional flags for the decoder # -decoder-settings = "-threads all" +decoder-settings = "-threads $cores" ### if tuning should be skipped, specify this here # and also point to a configuration file that contains @@ -608,7 +608,7 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl # "-drop-unknown" for dropping unknown source words # "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning # -decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads all" +decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads $cores" ### specify size of n-best list, if produced # diff --git a/scripts/ems/example/config.hierarchical b/scripts/ems/example/config.hierarchical index 5d2770a71..9f389710b 100644 --- a/scripts/ems/example/config.hierarchical +++ b/scripts/ems/example/config.hierarchical @@ -89,7 +89,7 @@ detruecaser = $moses-script-dir/recaser/detruecase.perl ### multi-core settings # when the generic parallelizer is used, the number of cores # specified here -cores = 8 +cores = 4 ################################################################# # PARALLEL CORPUS PREPARATION: @@ -521,7 +521,7 @@ filter-settings = "" ### additional flags for the decoder # -decoder-settings = "-threads all" +decoder-settings = "-threads $cores" ### if tuning should be skipped, specify this here # and also point to a configuration file that contains @@ -590,7 +590,7 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl # "-drop-unknown" for dropping unknown source words # "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning # -decoder-settings = "-threads all" +decoder-settings = "-threads $cores" ### specify size of n-best list, if produced # diff --git a/scripts/ems/example/config.syntax b/scripts/ems/example/config.syntax index 570e4c9ef..c6133784a 100644 --- a/scripts/ems/example/config.syntax +++ b/scripts/ems/example/config.syntax @@ -93,7 +93,7 @@ output-parser = "$moses-script-dir/training/wrappers/parse-en-collins.perl" ### multi-core settings # when the generic parallelizer is used, the number of cores # specified here -cores = 8 +cores = 4 ################################################################# # PARALLEL CORPUS PREPARATION: @@ -525,7 +525,7 @@ filter-settings = "" ### additional flags for the decoder # -decoder-settings = "-threads all" +decoder-settings = "-threads $cores" ### if tuning should be skipped, specify this here # and also point to a configuration file that contains @@ -594,7 +594,7 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl # "-drop-unknown" for dropping unknown source words # "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning # -decoder-settings = "-threads all" +decoder-settings = "-threads $cores" ### specify size of n-best list, if produced # diff --git a/scripts/ems/example/config.toy b/scripts/ems/example/config.toy index 193b26f8e..0274a7fc0 100644 --- a/scripts/ems/example/config.toy +++ b/scripts/ems/example/config.toy @@ -86,7 +86,7 @@ detruecaser = $moses-script-dir/recaser/detruecase.perl ### multi-core settings # when the generic parallelizer is used, the number of cores # specified 
here -cores = 8 +cores = 4 ################################################################# # PARALLEL CORPUS PREPARATION: @@ -503,7 +503,7 @@ filter-settings = "" ### additional flags for the decoder # -decoder-settings = "-threads all" +decoder-settings = "-threads $cores" ### if tuning should be skipped, specify this here # and also point to a configuration file that contains @@ -568,7 +568,7 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl # "-drop-unknown" for dropping unknown source words # "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning # -decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads all" +decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads $cores" ### specify size of n-best list, if produced # diff --git a/scripts/ems/example/config.toy.bilinguallm b/scripts/ems/example/config.toy.bilinguallm index 8f7e04029..4055b452c 100644 --- a/scripts/ems/example/config.toy.bilinguallm +++ b/scripts/ems/example/config.toy.bilinguallm @@ -86,7 +86,7 @@ detruecaser = $moses-script-dir/recaser/detruecase.perl ### multi-core settings # when the generic parallelizer is used, the number of cores # specified here -cores = 8 +cores = 4 ################################################################# # PARALLEL CORPUS PREPARATION: @@ -519,7 +519,7 @@ filter-settings = "" ### additional flags for the decoder # -decoder-settings = "-threads all" +decoder-settings = "-threads $cores" ### if tuning should be skipped, specify this here # and also point to a configuration file that contains @@ -584,7 +584,7 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl # "-drop-unknown" for dropping unknown source words # "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning # -decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads all" +decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads $cores" ### specify size of n-best list, if produced # From faaf0bdf874037a8b4607d3f3f38e1eb837eff19 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 1 Aug 2015 16:11:35 +0100 Subject: [PATCH 203/286] Reorganization of server options. Added options for sesssion cache. 
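The session timeout accepts the "d[[h[m[s]]]]" style spec parsed by parse_timespec() in the new ServerOptions.cpp below. Worked values, computed by hand from the parser as written:

    "30m"    ->  30*60           = 1800 seconds   (the default)
    "2h30m"  ->  2*3600 + 30*60  = 9000 seconds
    "1d"     ->  24*3600         = 86400 seconds
    "45s"    ->  45 seconds

Note that, as the loop is written, a bare number with no trailing unit (e.g. "90") is accumulated but never added to the total, so the plain-seconds case mentioned in the comment currently yields 0.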
--- .beautify-ignore | 1 + moses/ExportInterface.cpp | 36 +++++++++---------- moses/Parameter.cpp | 4 +++ moses/parameters/ServerOptions.cpp | 57 ++++++++++++++++++++++++++++++ moses/parameters/ServerOptions.h | 20 +++++++++++ 5 files changed, 98 insertions(+), 20 deletions(-) create mode 100644 moses/parameters/ServerOptions.cpp create mode 100644 moses/parameters/ServerOptions.h diff --git a/.beautify-ignore b/.beautify-ignore index ef4c2b762..15221c86a 100644 --- a/.beautify-ignore +++ b/.beautify-ignore @@ -22,6 +22,7 @@ mingw/MosesGUI/Ui_credits.py mingw/MosesGUI/Ui_mainWindow.py moses/TranslationModel/UG moses/server +moses/parameters phrase-extract/pcfg-common phrase-extract/syntax-common randlm diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp index 3c9e334da..a6cb97918 100644 --- a/moses/ExportInterface.cpp +++ b/moses/ExportInterface.cpp @@ -66,6 +66,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "server/Translator.h" #include "server/Optimizer.h" #include "server/Updater.h" +#include "moses/parameters/ServerOptions.h" #endif using namespace std; @@ -147,21 +148,15 @@ int run_as_server() { #ifdef HAVE_XMLRPC_C - int port; - params.SetParameter(port, "server-port", 8080); - bool isSerial; - params.SetParameter(isSerial, "serial", false); - string logfile; - params.SetParameter(logfile, "server-log", string("/dev/null")); - size_t num_threads; - params.SetParameter(num_threads, "threads", size_t(10)); - if (isSerial) VERBOSE(1,"Running server in serial mode." << endl); + ServerOptions sopts(params); + if (sopts.is_serial) VERBOSE(1,"Running server in serial mode." << endl); xmlrpc_c::registry myRegistry; - xmlrpc_c::methodPtr const translator(new MosesServer::Translator(num_threads)); - xmlrpc_c::methodPtr const updater(new MosesServer::Updater); - xmlrpc_c::methodPtr const optimizer(new MosesServer::Optimizer); + xmlrpc_c::methodPtr const + translator(new MosesServer::Translator(sopts.num_threads)), + updater(new MosesServer::Updater), + optimizer(new MosesServer::Optimizer); myRegistry.addMethod("translate", translator); myRegistry.addMethod("updater", updater); @@ -170,16 +165,18 @@ run_as_server() xmlrpc_c::serverAbyss myAbyssServer( xmlrpc_c::serverAbyss::constrOpt() .registryP(&myRegistry) - .portNumber(port) // TCP port on which to listen - .logFileName(logfile) + .portNumber(sopts.port) // TCP port on which to listen + .logFileName(sopts.logfile) .allowOrigin("*") - .maxConn((unsigned int)num_threads) + .maxConn(sopts.num_threads) ); - XVERBOSE(1,"Listening on port " << port << endl); - if (isSerial) { - while(1) myAbyssServer.runOnce(); - } else myAbyssServer.run(); + XVERBOSE(1,"Listening on port " << sopts.port << endl); + if (sopts.is_serial) + { + while(true) myAbyssServer.runOnce(); + } + else myAbyssServer.run(); std::cerr << "xmlrpc_c::serverAbyss.run() returned but should not." << std::endl; // #pragma message("BUILDING MOSES WITH SERVER SUPPORT") @@ -188,7 +185,6 @@ run_as_server() std::cerr << "Moses was compiled without server support." 
<< endl; #endif return 1; - } int diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index 4a0941521..31bba1c14 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -214,6 +214,10 @@ Parameter::Parameter() AddParam(server_opts,"server", "Run moses as a translation server."); AddParam(server_opts,"server-port", "Port for moses server"); AddParam(server_opts,"server-log", "Log destination for moses server"); + AddParam(server_opts,"session-timeout", + "Timeout for sessions, e.g. '2h30m' or 1d (=24h)"); + AddParam(server_opts,"session-cache-size", string("Max. number of sessions cached.") + +"Least recently used session is dumped first."); AddParam(server_opts,"serial", "Run server in serial mode, processing only one request at a time."); po::options_description irstlm_opts("IRSTLM Options"); diff --git a/moses/parameters/ServerOptions.cpp b/moses/parameters/ServerOptions.cpp new file mode 100644 index 000000000..2609f0378 --- /dev/null +++ b/moses/parameters/ServerOptions.cpp @@ -0,0 +1,57 @@ +// -*- mode: c++; cc-style: gnu -*- +#include "ServerOptions.h" +#include +#include +namespace Moses +{ + +// parse the session timeout specifciation for moses server +// Format is "d[[h[m[s]]]]". +// If none of 'dhms' is given, it is assumed that it's seconds. +// Specs can be combined, e.g. 2h30m, although it's probably nonsense +// to be so specific. +size_t +parse_timespec(std::string const& spec) +{ + size_t t = 0, timeout = 0; + BOOST_FOREACH(char const& c, spec) + { + if (c >= '0' && c <= '9') + { + t = t * 10 + c - '0'; + } + else + { + if (c == 'd') timeout = t * 24 * 3600; + else if (c == 'h') timeout += t * 3600; + else if (c == 'm') timeout += t * 60; + else if (c == 's') timeout += t; + else UTIL_THROW2("Can't parse specification '" << spec + << " at " << HERE); + t = 0; + } + } + return timeout; +} + +ServerOptions:: +ServerOptions(Parameter const& P) +{ + init(P); +} + +bool +ServerOptions:: +init(Parameter const& P) +{ + P.SetParameter(this->port, "server-port", 8080); + P.SetParameter(this->is_serial, "serial", false); + P.SetParameter(this->logfile, "server-log", std::string("/dev/null")); + P.SetParameter(this->num_threads, "threads", uint32_t(10)); + P.SetParameter(this->session_cache_size, "session-cache_size",25UL); + std::string timeout_spec; + P.SetParameter(timeout_spec, "session-timeout",std::string("30m")); + this->session_timeout = parse_timespec(timeout_spec); + return true; +} +} // namespace Moses diff --git a/moses/parameters/ServerOptions.h b/moses/parameters/ServerOptions.h new file mode 100644 index 000000000..e9889f8d4 --- /dev/null +++ b/moses/parameters/ServerOptions.h @@ -0,0 +1,20 @@ +// -*- mode: c++; cc-style: gnu -*- +#include +#include "moses/Parameter.h" +namespace Moses +{ + + struct + ServerOptions + { + int port; + bool is_serial; + std::string logfile; + uint32_t num_threads; + size_t session_timeout; + size_t session_cache_size; + bool init(Parameter const& param); + ServerOptions(Parameter const& param); + }; + +} From d94c2d210b2206e0180479d1219787d4f0ad639f Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 2 Aug 2015 06:33:32 +0400 Subject: [PATCH 204/286] #define name clashed with internal class name when using clang --- moses/TranslationModel/UG/mm/calc-coverage.cc | 4 +- moses/TranslationModel/UG/mm/custom-pt.cc | 2 +- .../UG/mm/test-dynamic-im-tsa.cc | 2 +- moses/TranslationModel/UG/mm/ug_bitext.cc | 4 +- moses/TranslationModel/UG/mm/ug_bitext.h | 70 +++++++++---------- .../TranslationModel/UG/mm/ug_bitext_agenda.h | 26 +++---- 
.../UG/mm/ug_bitext_agenda_job.h | 14 ++-- .../UG/mm/ug_bitext_agenda_worker.h | 4 +- .../TranslationModel/UG/mm/ug_bitext_moses.h | 18 ++--- .../TranslationModel/UG/mm/ug_bitext_pstats.h | 4 +- moses/TranslationModel/UG/mm/ug_im_bitext.cc | 4 +- moses/TranslationModel/UG/mm/ug_im_bitext.h | 26 +++---- moses/TranslationModel/UG/mm/ug_lru_cache.h | 10 +-- moses/TranslationModel/UG/mm/ug_mmbitext.cc | 20 +++--- moses/TranslationModel/UG/mm/ug_mmbitext.h | 14 ++-- moses/TranslationModel/UG/mm/ug_tsa_base.h | 8 +-- .../UG/mm/ug_tsa_tree_iterator.h | 6 +- moses/TranslationModel/UG/mm/ug_typedefs.h | 2 +- moses/TranslationModel/UG/mmsapt.cpp | 68 +++++++++--------- moses/TranslationModel/UG/mmsapt.h | 24 +++---- moses/TranslationModel/UG/mmsapt_align.cc | 14 ++-- .../UG/spe-check-coverage3.cc | 8 +-- moses/TranslationModel/UG/try-align.cc | 12 ++-- moses/TranslationModel/UG/try-align2.cc | 12 ++-- 24 files changed, 188 insertions(+), 188 deletions(-) diff --git a/moses/TranslationModel/UG/mm/calc-coverage.cc b/moses/TranslationModel/UG/mm/calc-coverage.cc index 83f67220d..4f02909b7 100644 --- a/moses/TranslationModel/UG/mm/calc-coverage.cc +++ b/moses/TranslationModel/UG/mm/calc-coverage.cc @@ -15,7 +15,7 @@ using namespace ugdiss; typedef L2R_Token Token; TokenIndex V; -sptr > > C(new vector >()); +SPTR > > C(new vector >()); void add_file(string fname) { @@ -34,7 +34,7 @@ main(int argc, char* argv[]) { V.setDynamic(true); add_file(argv[1]); - sptr > T(new imTtrack(C)); + SPTR > T(new imTtrack(C)); imTSA I(T,NULL,NULL); string line; while (getline(cin,line)) diff --git a/moses/TranslationModel/UG/mm/custom-pt.cc b/moses/TranslationModel/UG/mm/custom-pt.cc index 1a51aa8a4..44fc5d112 100644 --- a/moses/TranslationModel/UG/mm/custom-pt.cc +++ b/moses/TranslationModel/UG/mm/custom-pt.cc @@ -153,7 +153,7 @@ int main(int argc, char* argv[]) for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k) { uint64_t spid = m.getPid(); - sptr s = bt.lookup(m); + SPTR s = bt.lookup(m); for (size_t j = i; j <= k; ++j) cout << (*bt.V1)[snt[j]] << " "; cout << s->good << "/" diff --git a/moses/TranslationModel/UG/mm/test-dynamic-im-tsa.cc b/moses/TranslationModel/UG/mm/test-dynamic-im-tsa.cc index dd5f4c9b4..612f497a6 100644 --- a/moses/TranslationModel/UG/mm/test-dynamic-im-tsa.cc +++ b/moses/TranslationModel/UG/mm/test-dynamic-im-tsa.cc @@ -37,7 +37,7 @@ typedef L2R_Token L2R; int main() { - sptr > bt(new imBitext()); + SPTR > bt(new imBitext()); string s1,s2,aln; vector S1,S2,ALN; while (getline(cin,s1) && getline(cin,s2) && getline(cin,aln)) diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc index 809476aa9..fb75877ed 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext.cc @@ -45,8 +45,8 @@ namespace Moses snt_adder >:: snt_adder(vector const& s, TokenIndex& v, - sptr > >& t, - sptr > >& i) + SPTR > >& t, + SPTR > >& i) : snt(s), V(v), track(t), index(i) { } diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index e14cc5d3d..a6c231d7e 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -93,8 +93,8 @@ namespace Moses { // ttasksptr const m_ttask; // size_t max_samples; boost::shared_mutex lock; - sptr bias; - sptr cache1, cache2; + SPTR bias; + SPTR cache1, cache2; std::ostream* bias_log; ContextForQuery() : bias_log(NULL) { } }; @@ -114,29 +114,29 @@ namespace Moses { mutable boost::shared_mutex m_lock; // for 
thread-safe operation class agenda; // for parallel sampling see ug_bitext_agenda.h - mutable sptr ag; + mutable SPTR ag; size_t m_num_workers; // number of workers available to the agenda size_t m_default_sample_size; size_t m_pstats_cache_threshold; // threshold for caching sampling results - sptr m_cache1, m_cache2; // caches for sampling results + SPTR m_cache1, m_cache2; // caches for sampling results std::vector m_docname; map m_docname2docid; // maps from doc names to ids - sptr > m_sid2docid; // maps from sentences to docs (ids) + SPTR > m_sid2docid; // maps from sentences to docs (ids) mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2; // caches for unbiased sampling; biased sampling uses the caches that // are stored locally on the translation task public: - sptr > Tx; // word alignments - sptr > T1; // token track - sptr > T2; // token track - sptr V1; // vocab - sptr V2; // vocab - sptr > I1; // indices - sptr > I2; // indices + SPTR > Tx; // word alignments + SPTR > T1; // token track + SPTR > T2; // token track + SPTR V1; // vocab + SPTR V2; // vocab + SPTR > I1; // indices + SPTR > I2; // indices /// given the source phrase sid[start:stop] // find the possible start (s1 .. s2) and end (e1 .. e2) @@ -156,11 +156,11 @@ namespace Moses { // prep2 launches sampling and returns immediately. // lookup (below) waits for the job to finish before it returns - sptr + SPTR prep2(iter const& phrase, int max_sample = -1) const; #ifndef NO_MOSES - sptr + SPTR prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; #endif @@ -177,12 +177,12 @@ namespace Moses { virtual void open(string const base, string const L1, string const L2) = 0; - sptr + SPTR lookup(iter const& phrase, int max_sample = -1) const; void prep(iter const& phrase) const; #ifndef NO_MOSES - sptr + SPTR lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; void prep(ttasksptr const& ttask, iter const& phrase) const; #endif @@ -195,13 +195,13 @@ namespace Moses { virtual size_t revision() const { return 0; } - sptr + SPTR loadSentenceBias(string const& fname) const; - sptr + SPTR SetupDocumentBias(string const& bserver, string const& text, std::ostream* log) const; - sptr + SPTR SetupDocumentBias(map context_weights, std::ostream* log) const; void @@ -229,11 +229,11 @@ namespace Moses { } template - sptr + SPTR Bitext:: loadSentenceBias(string const& fname) const { - sptr ret(new SentenceBias(T1->size())); + SPTR ret(new SentenceBias(T1->size())); ifstream in(fname.c_str()); size_t i = 0; float v; while (in>>v) (*ret)[i++] = v; @@ -320,11 +320,11 @@ namespace Moses { typedef L2R_Token TKN; std::vector const & snt; TokenIndex & V; - sptr > & track; - sptr > & index; + SPTR > & track; + SPTR > & index; public: snt_adder(std::vector const& s, TokenIndex& v, - sptr >& t, sptr >& i); + SPTR >& t, SPTR >& i); void operator()(); }; @@ -431,12 +431,12 @@ namespace Moses { } template - sptr + SPTR Bitext:: SetupDocumentBias ( string const& bserver, string const& text, std::ostream* log ) const { - sptr ret; + SPTR ret; UTIL_THROW_IF2(m_sid2docid == NULL, "Document bias requested but no document map loaded."); ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid, @@ -445,12 +445,12 @@ namespace Moses { } template - sptr + SPTR Bitext:: SetupDocumentBias ( map context_weights, std::ostream* log ) const { - sptr ret; + SPTR ret; UTIL_THROW_IF2(m_sid2docid == NULL, "Document bias requested but no document map loaded."); ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid, @@ 
-473,14 +473,14 @@ namespace Moses { // and waits until the sampling is finished before it returns. // This allows sampling in the background template - sptr + SPTR Bitext ::prep2 (iter const& phrase, int max_sample) const { if (max_sample < 0) max_sample = m_default_sample_size; - sptr bias; - sptr cache; + SPTR bias; + SPTR cache; // - no caching for rare phrases and special requests (max_sample) // (still need to test what a good caching threshold is ...) // - use the task-specific cache when there is a sampling bias @@ -490,8 +490,8 @@ namespace Moses { cache = (phrase.root == I1.get() ? m_cache1 : m_cache2); } - sptr ret; - sptr const* cached; + SPTR ret; + SPTR const* cached; if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached) return *cached; @@ -513,7 +513,7 @@ namespace Moses { class pstats2pplist { Ttrack const& m_other; - sptr m_pstats; + SPTR m_pstats; std::vector >& m_pplist; typename PhrasePair::Scorer const* m_scorer; PhrasePair m_pp; @@ -526,7 +526,7 @@ namespace Moses { // CONSTRUCTOR pstats2pplist(typename TSA::tree_iterator const& m, Ttrack const& other, - sptr const& ps, + SPTR const& ps, std::vector >& dest, typename PhrasePair::Scorer const* scorer) : m_other(other) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h index 72e6c8638..5e899e0b4 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h @@ -14,8 +14,8 @@ public: class worker; private: boost::mutex lock; - std::list > joblist; - std::vector > workers; + std::list > joblist; + std::vector > workers; bool shutdown; size_t doomed; @@ -30,15 +30,15 @@ public: void add_workers(int n); - sptr + SPTR add_job(Bitext const* const theBitext, typename TSA::tree_iterator const& phrase, - size_t const max_samples, sptr const& bias); + size_t const max_samples, SPTR const& bias); // add_job(Bitext const* const theBitext, // typename TSA::tree_iterator const& phrase, // size_t const max_samples, SamplingBias const* const bias); - sptr + SPTR get_job(); }; @@ -82,23 +82,23 @@ void Bitext else while (int(workers.size()) < target) { - sptr w(new boost::thread(worker(*this))); + SPTR w(new boost::thread(worker(*this))); workers.push_back(w); } } template -sptr Bitext +SPTR Bitext ::agenda ::add_job(Bitext const* const theBitext, typename TSA::tree_iterator const& phrase, - size_t const max_samples, sptr const& bias) + size_t const max_samples, SPTR const& bias) { boost::unique_lock lk(this->lock); static boost::posix_time::time_duration nodelay(0,0,0,0); bool fwd = phrase.root == bt.I1.get(); - sptr j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2, + SPTR j(new job(theBitext, phrase, fwd ? 
bt.I1 : bt.I2, max_samples, fwd, bias)); j->stats->register_worker(); @@ -118,7 +118,7 @@ sptr Bitext --doomed; } else - workers[i++] = sptr(new boost::thread(worker(*this))); + workers[i++] = SPTR(new boost::thread(worker(*this))); } else ++i; } @@ -127,13 +127,13 @@ sptr Bitext } template -sptr::agenda::job> +SPTR::agenda::job> Bitext ::agenda ::get_job() { // cerr << workers.size() << " workers on record" << std::endl; - sptr ret; + SPTR ret; if (this->shutdown) return ret; boost::unique_lock lock(this->lock); if (this->doomed) @@ -142,7 +142,7 @@ Bitext return ret; } - typename list >::iterator j = joblist.begin(); + typename list >::iterator j = joblist.begin(); while (j != joblist.end()) { if ((*j)->done()) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h index 5975edd6f..b13222189 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h @@ -23,7 +23,7 @@ job public: size_t workers; // how many workers are working on this job? - sptr const> root; // root of the underlying suffix array + SPTR const> root; // root of the underlying suffix array char const* next; // next position to read from char const* stop; // end of index range size_t max_samples; // how many samples to extract at most @@ -32,8 +32,8 @@ public: */ size_t len; // phrase length bool fwd; // if true, source phrase is L1 - sptr stats; // stores statistics collected during sampling - sptr const m_bias; // sentence-level bias for sampling + SPTR stats; // stores statistics collected during sampling + SPTR const m_bias; // sentence-level bias for sampling float bias_total; bool nextSample(uint64_t & sid, uint64_t & offset); // select next occurrence @@ -45,8 +45,8 @@ public: bool done() const; job(Bitext const* const theBitext, typename TSA::tree_iterator const& m, - sptr > const& r, size_t maxsmpl, bool isfwd, - sptr const& bias); + SPTR > const& r, size_t maxsmpl, bool isfwd, + SPTR const& bias); ~job(); }; @@ -65,8 +65,8 @@ template Bitext::agenda::job ::job(Bitext const* const theBitext, typename TSA::tree_iterator const& m, - sptr > const& r, size_t maxsmpl, - bool isfwd, sptr const& bias) + SPTR > const& r, size_t maxsmpl, + bool isfwd, SPTR const& bias) : m_bitext(theBitext) , rnd(0) , rnddenom(rnd.max() + 1.) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h index 104b7acb5..694b3568c 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h @@ -20,7 +20,7 @@ Bitext::agenda uint64_t sid=0, offset=0; // sid and offset of source phrase size_t s1=0, s2=0, e1=0, e2=0; // soft and hard boundaries of target phrase std::vector aln; // stores phrase-pair-internal alignment - while(sptr j = ag.get_job()) + while(SPTR j = ag.get_job()) { j->stats->register_worker(); bitvector full_alignment(100*100); // Is full_alignment still needed??? @@ -73,7 +73,7 @@ Bitext::agenda for (size_t s = s1; s <= s2; ++s) { TSA const& I = j->fwd ? 
*ag.bt.I2 : *ag.bt.I1; - sptr b = I.find(o + s, e1 - s); + SPTR b = I.find(o + s, e1 - s); UTIL_THROW_IF2(!b || b->size() < e1-s, "target phrase not found"); for (size_t i = e1; i <= e2; ++i) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_moses.h b/moses/TranslationModel/UG/mm/ug_bitext_moses.h index 539a9166d..d55fc9265 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_moses.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_moses.h @@ -5,11 +5,11 @@ namespace Moses { namespace bitext { template -sptr +SPTR Bitext:: lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const { - sptr ret = prep2(ttask, phrase, max_sample); + SPTR ret = prep2(ttask, phrase, max_sample); UTIL_THROW_IF2(!ret, "Got NULL pointer where I expected a valid pointer."); // Why were we locking here? @@ -42,17 +42,17 @@ prep(ttasksptr const& ttask, iter const& phrase) const // and waits until the sampling is finished before it returns. // This allows sampling in the background template -sptr +SPTR Bitext ::prep2 ( ttasksptr const& ttask, iter const& phrase, int max_sample) const { if (max_sample < 0) max_sample = m_default_sample_size; - sptr bias; - sptr scope = ttask->GetScope(); - sptr context = scope->get(this); + SPTR bias; + SPTR scope = ttask->GetScope(); + SPTR context = scope->get(this); if (context) bias = context->bias; - sptr cache; + SPTR cache; // - no caching for rare phrases and special requests (max_sample) // (still need to test what a good caching threshold is ...) // - use the task-specific cache when there is a sampling bias @@ -63,8 +63,8 @@ Bitext ? (bias ? context->cache1 : m_cache1) : (bias ? context->cache2 : m_cache2)); } - sptr ret; - sptr const* cached; + SPTR ret; + SPTR const* cached; if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached) return *cached; diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h index ca4e80418..387f78f4e 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h @@ -15,8 +15,8 @@ namespace Moses struct pstats { - typedef boost::unordered_map > map_t; - typedef ThreadSafeContainer, map_t> cache_t; + typedef boost::unordered_map > map_t; + typedef ThreadSafeContainer, map_t> cache_t; typedef std::vector alnvec; #if UG_BITEXT_TRACK_ACTIVE_THREADS static ThreadSafeCounter active; diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.cc b/moses/TranslationModel/UG/mm/ug_im_bitext.cc index 5efa3b8c4..55603e1e2 100644 --- a/moses/TranslationModel/UG/mm/ug_im_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_im_bitext.cc @@ -6,7 +6,7 @@ namespace Moses { template<> - sptr > > + SPTR > > imBitext >:: add(vector const& s1, vector const& s2, @@ -19,7 +19,7 @@ namespace Moses size_t first_new_snt = this->T1 ? 
this->T1->size() : 0; #endif - sptr > ret; + SPTR > ret; { boost::unique_lock guard(m_lock); ret.reset(new imBitext(*this)); diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.h b/moses/TranslationModel/UG/mm/ug_im_bitext.h index 9515ec98b..ca7c75c77 100644 --- a/moses/TranslationModel/UG/mm/ug_im_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_im_bitext.h @@ -9,25 +9,25 @@ namespace Moses template class imBitext : public Bitext { - sptr > myTx; - sptr > myT1; - sptr > myT2; - sptr > myI1; - sptr > myI2; + SPTR > myTx; + SPTR > myT1; + SPTR > myT2; + SPTR > myI1; + SPTR > myI2; static ThreadSafeCounter my_revision; public: size_t revision() const { return my_revision; } void open(string const base, string const L1, string L2); - imBitext(sptr const& V1, - sptr const& V2, + imBitext(SPTR const& V1, + SPTR const& V2, size_t max_sample = 5000, size_t num_workers=4); imBitext(size_t max_sample = 5000, size_t num_workers=4); imBitext(imBitext const& other); - // sptr > + // SPTR > // add(vector const& s1, std::vector const& s2, vector & a); - sptr > + SPTR > add(vector const& s1, std::vector const& s2, std::vector const& a) const; @@ -53,8 +53,8 @@ namespace Moses template imBitext:: - imBitext(sptr const& v1, - sptr const& v2, + imBitext(SPTR const& v1, + SPTR const& v2, size_t max_sample, size_t num_workers) : Bitext(max_sample, num_workers) { @@ -89,14 +89,14 @@ namespace Moses } template<> - sptr > > + SPTR > > imBitext >:: add(vector const& s1, vector const& s2, vector const& aln) const; template - sptr > + SPTR > imBitext:: add(vector const& s1, vector const& s2, diff --git a/moses/TranslationModel/UG/mm/ug_lru_cache.h b/moses/TranslationModel/UG/mm/ug_lru_cache.h index 682880a69..75d99fea7 100644 --- a/moses/TranslationModel/UG/mm/ug_lru_cache.h +++ b/moses/TranslationModel/UG/mm/ug_lru_cache.h @@ -8,8 +8,8 @@ #include -#ifndef sptr -#define sptr boost::shared_ptr +#ifndef SPTR +#define SPTR boost::shared_ptr #endif namespace lru_cache @@ -65,14 +65,14 @@ namespace lru_cache size_t capacity() const { return m_recs.capacity(); } void reserve(size_t s) { m_recs.reserve(s); } - sptr + SPTR get(KEY const& key) { uint32_t p; { // brackets needed for lock scoping boost::shared_lock rlock(m_lock); typename map_t::const_iterator i = m_idx.find(key); - if (i == m_idx.end()) return sptr(); + if (i == m_idx.end()) return SPTR(); p = i->second; } boost::lock_guard guard(m_lock); @@ -81,7 +81,7 @@ namespace lru_cache } void - set(KEY const& key, sptr const& ptr) + set(KEY const& key, SPTR const& ptr) { boost::lock_guard lock(m_lock); std::pair foo; diff --git a/moses/TranslationModel/UG/mm/ug_mmbitext.cc b/moses/TranslationModel/UG/mm/ug_mmbitext.cc index 34e3f1b1e..ef797b5c1 100644 --- a/moses/TranslationModel/UG/mm/ug_mmbitext.cc +++ b/moses/TranslationModel/UG/mm/ug_mmbitext.cc @@ -147,7 +147,7 @@ // prep2(phrase); // } -// sptr +// SPTR // mmbitext:: // prep2(iter const& phrase) // { @@ -156,20 +156,20 @@ // ag = new agenda(*this); // ag->add_workers(20); // } -// typedef boost::unordered_map > pcache_t; +// typedef boost::unordered_map > pcache_t; // uint64_t pid = phrase.getPid(); // pcache_t & cache(phrase.root == &this->I1 ? 
cache1 : cache2); -// pcache_t::value_type entry(pid,sptr()); +// pcache_t::value_type entry(pid,SPTR()); // pair foo = cache.emplace(entry); // if (foo.second) foo.first->second = ag->add_job(phrase, 1000); // return foo.first->second; // } -// sptr +// SPTR // mmbitext:: // lookup(iter const& phrase) // { -// sptr ret = prep2(phrase); +// SPTR ret = prep2(phrase); // boost::unique_lock lock(ret->lock); // while (ret->in_progress) // ret->ready.wait(lock); @@ -184,7 +184,7 @@ // { // uint64_t sid=0, offset=0, len=0; // of the source phrase // bool fwd=false; // source phrase is L1 -// sptr stats; +// SPTR stats; // size_t s1=0, s2=0, e1=0, e2=0; // for (; ag.get_task(sid,offset,len,fwd,stats); ) // { @@ -260,7 +260,7 @@ // if (ag) delete ag; // } -// sptr +// SPTR // mmbitext:: // agenda:: // add_job(mmbitext::iter const& phrase, size_t const max_samples) @@ -286,7 +286,7 @@ // { // if (workers[i]->timed_join(nodelay)) // { -// workers[i] = sptr(new boost::thread(worker(*this))); +// workers[i] = SPTR(new boost::thread(worker(*this))); // } // } // } @@ -297,7 +297,7 @@ // mmbitext:: // agenda:: // get_task(uint64_t & sid, uint64_t & offset, uint64_t & len, -// bool & fwd, sptr & stats) +// bool & fwd, SPTR & stats) // { // boost::unique_lock lock(this->lock); // if (this->doomed || this->shutdown) @@ -385,7 +385,7 @@ // { // for (int i = 0; i < n; ++i) // { -// sptr w(new boost::thread(worker(*this))); +// SPTR w(new boost::thread(worker(*this))); // workers.push_back(w); // } // } diff --git a/moses/TranslationModel/UG/mm/ug_mmbitext.h b/moses/TranslationModel/UG/mm/ug_mmbitext.h index 04c54e60b..e07d92830 100644 --- a/moses/TranslationModel/UG/mm/ug_mmbitext.h +++ b/moses/TranslationModel/UG/mm/ug_mmbitext.h @@ -51,7 +51,7 @@ namespace Moses { class job; class worker; list joblist; - std::vector > workers; + std::vector > workers; bool shutdown; size_t doomed; public: @@ -59,10 +59,10 @@ namespace Moses { agenda(mmbitext const& bitext); ~agenda(); void add_workers(int n); - sptr add_job(mmbitext::iter const& phrase, + SPTR add_job(mmbitext::iter const& phrase, size_t const max_samples); bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len, - bool & fwd, sptr & stats); + bool & fwd, SPTR & stats); }; // stores the list of unfinished jobs; @@ -85,9 +85,9 @@ namespace Moses { size_t & s1, size_t & s2, size_t & e1, size_t & e2, std::vector * core_alignment, bool const flip) const; - boost::unordered_map > cache1,cache2; + boost::unordered_map > cache1,cache2; private: - sptr + SPTR prep2(iter const& phrase); public: mmbitext(); @@ -95,7 +95,7 @@ namespace Moses { void open(string const base, string const L1, string const L2); - sptr lookup(iter const& phrase); + SPTR lookup(iter const& phrase); void prep(iter const& phrase); }; @@ -182,7 +182,7 @@ namespace Moses { size_t ctr; size_t len; bool fwd; - sptr stats; + SPTR stats; bool step(uint64_t & sid, uint64_t & offset); }; diff --git a/moses/TranslationModel/UG/mm/ug_tsa_base.h b/moses/TranslationModel/UG/mm/ug_tsa_base.h index 3eaf738ab..37597348e 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_base.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_base.h @@ -317,11 +317,11 @@ namespace ugdiss public: // virtual - sptr > + SPTR > find(TKN const* start, size_t len) const { typedef TSA_tree_iterator iter; - sptr ret(new iter(this)); + SPTR ret(new iter(this)); size_t i = 0; while (i < len && ret->extend(start[i])) ++i; if (i < len) ret.reset(); @@ -333,12 +333,12 @@ namespace ugdiss // 
====================================================================== // template - // sptr > + // SPTR > // TSA:: // find(TOKEN const* start, size_t len) const // { // typedef TSA_tree_iterator iter; - // sptr ret(new iter(this)); + // SPTR ret(new iter(this)); // size_t i = 0; // while (i < len && ret->extend(start[i])) ++i; // if (i < len) ret.reset(); diff --git a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h index 42be0e0a1..19a54b676 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h @@ -181,7 +181,7 @@ namespace ugdiss return this->size(); } - sptr > + SPTR > randomSample(int level, size_t N) const; }; @@ -899,14 +899,14 @@ namespace ugdiss /// randomly select up to N occurrences of the sequence template - sptr > + SPTR > TSA_tree_iterator:: randomSample(int level, size_t N) const { if (level < 0) level += lower.size(); assert(level >=0); - sptr > + SPTR > ret(new std::vector(N)); size_t m=0; // number of samples selected so far diff --git a/moses/TranslationModel/UG/mm/ug_typedefs.h b/moses/TranslationModel/UG/mm/ug_typedefs.h index 78d78efe4..af50c8275 100644 --- a/moses/TranslationModel/UG/mm/ug_typedefs.h +++ b/moses/TranslationModel/UG/mm/ug_typedefs.h @@ -30,7 +30,7 @@ namespace ugdiss typedef std::vector int_4d_table; } -#define sptr boost::shared_ptr +#define SPTR boost::shared_ptr #define scoptr boost::scoped_ptr #define rcast reinterpret_cast #endif diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 132b720ba..d6f072e8f 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -109,7 +109,7 @@ namespace Moses void Mmsapt:: - register_ff(sptr const& ff, vector > & registry) + register_ff(SPTR const& ff, vector > & registry) { registry.push_back(ff); ff->setIndex(m_feature_names.size()); @@ -323,25 +323,25 @@ namespace Moses template void Mmsapt:: - check_ff(string const ffname, vector >* registry) + check_ff(string const ffname, vector >* registry) { string const& spec = param[ffname]; if (spec == "" || spec == "0") return; if (registry) { - sptr ff(new fftype(spec)); + SPTR ff(new fftype(spec)); register_ff(ff, *registry); } else if (spec[spec.size()-1] == '+') // corpus specific { - sptr ff(new fftype(spec)); + SPTR ff(new fftype(spec)); register_ff(ff, m_active_ff_fix); ff.reset(new fftype(spec)); register_ff(ff, m_active_ff_dyn); } else { - sptr ff(new fftype(spec)); + SPTR ff(new fftype(spec)); register_ff(ff, m_active_ff_common); } } @@ -350,32 +350,32 @@ namespace Moses void Mmsapt:: check_ff(string const ffname, float const xtra, - vector >* registry) + vector >* registry) { string const& spec = param[ffname]; if (spec == "" || spec == "0") return; if (registry) { - sptr ff(new fftype(xtra,spec)); + SPTR ff(new fftype(xtra,spec)); register_ff(ff, *registry); } else if (spec[spec.size()-1] == '+') // corpus specific { - sptr ff(new fftype(xtra,spec)); + SPTR ff(new fftype(xtra,spec)); register_ff(ff, m_active_ff_fix); ff.reset(new fftype(xtra,spec)); register_ff(ff, m_active_ff_dyn); } else { - sptr ff(new fftype(xtra,spec)); + SPTR ff(new fftype(xtra,spec)); register_ff(ff, m_active_ff_common); } } // void // Mmsapt:: - // add_corpus_specific_features(vector >& registry) + // add_corpus_specific_features(vector >& registry) // { // check_ff >("pbwd",m_lbop_conf,registry); // check_ff >("logcnt",registry); @@ -401,7 +401,7 @@ namespace Moses { // lexical 
scores string lexfile = m_bname + L1 + "-" + L2 + ".lex"; - sptr > + SPTR > ff(new PScoreLex1(param["lex_alpha"],lexfile)); register_ff(ff,m_active_ff_common); @@ -425,9 +425,9 @@ namespace Moses // this translation model) else if (fsname == "datasource") { - sptr > ffpcnt(new PScorePC("pcnt")); + SPTR > ffpcnt(new PScorePC("pcnt")); register_ff(ffpcnt,m_active_ff_common); - sptr > ffwcnt(new PScoreWC("wcnt")); + SPTR > ffwcnt(new PScoreWC("wcnt")); register_ff(ffwcnt,m_active_ff_common); } } @@ -441,9 +441,9 @@ namespace Moses Load(bool with_checks) { // load feature functions (i.e., load underlying data bases, if any) - BOOST_FOREACH(sptr& ff, m_active_ff_fix) ff->load(); - BOOST_FOREACH(sptr& ff, m_active_ff_dyn) ff->load(); - BOOST_FOREACH(sptr& ff, m_active_ff_common) ff->load(); + BOOST_FOREACH(SPTR& ff, m_active_ff_fix) ff->load(); + BOOST_FOREACH(SPTR& ff, m_active_ff_dyn) ff->load(); + BOOST_FOREACH(SPTR& ff, m_active_ff_common) ff->load(); #if 0 if (with_checks) { @@ -501,7 +501,7 @@ namespace Moses Phrase const& src, PhrasePair* fix, PhrasePair* dyn, - sptr > const& dynbt) const + SPTR > const& dynbt) const { UTIL_THROW_IF2(!fix && !dyn, HERE << ": Can't create target phrase from nothing."); @@ -509,12 +509,12 @@ namespace Moses PhrasePair pool = fix ? *fix : *dyn; if (fix) { - BOOST_FOREACH(sptr const& ff, m_active_ff_fix) + BOOST_FOREACH(SPTR const& ff, m_active_ff_fix) (*ff)(btfix, *fix, &fvals); } if (dyn) { - BOOST_FOREACH(sptr const& ff, m_active_ff_dyn) + BOOST_FOREACH(SPTR const& ff, m_active_ff_dyn) (*ff)(*dynbt, *dyn, &fvals); } @@ -526,7 +526,7 @@ namespace Moses if (m.size() == fix->len2) zilch.raw2 = m.approxOccurrenceCount(); pool += zilch; - BOOST_FOREACH(sptr const& ff, m_active_ff_dyn) + BOOST_FOREACH(SPTR const& ff, m_active_ff_dyn) (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals); } else if (dyn) @@ -536,17 +536,17 @@ namespace Moses if (m.size() == dyn->len2) zilch.raw2 = m.approxOccurrenceCount(); pool += zilch; - BOOST_FOREACH(sptr const& ff, m_active_ff_fix) + BOOST_FOREACH(SPTR const& ff, m_active_ff_fix) (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals); } if (fix) { - BOOST_FOREACH(sptr const& ff, m_active_ff_common) + BOOST_FOREACH(SPTR const& ff, m_active_ff_common) (*ff)(btfix, pool, &fvals); } else { - BOOST_FOREACH(sptr const& ff, m_active_ff_common) + BOOST_FOREACH(SPTR const& ff, m_active_ff_common) (*ff)(*dynbt, pool, &fvals); } @@ -567,7 +567,7 @@ namespace Moses { LRModel::ModelType mdl = m_lr_func->GetModel().GetModelType(); LRModel::Direction dir = m_lr_func->GetModel().GetDirection(); - sptr scores(new Scores()); + SPTR scores(new Scores()); pool.fill_lr_vec(dir, mdl, *scores); tp->SetExtraScores(m_lr_func, scores); } @@ -617,7 +617,7 @@ namespace Moses // Reserve a local copy of the dynamic bitext in its current form. /btdyn/ // is set to a new copy of the dynamic bitext every time a sentence pair // is added. /dyn/ keeps the old bitext around as long as we need it. - sptr > dyn; + SPTR > dyn; { // braces are needed for scoping mutex lock guard! boost::unique_lock guard(m_lock); assert(btdyn); @@ -646,8 +646,8 @@ namespace Moses ? 
(mfix.getPid()<<1) : (mdyn.getPid()<<1)+1); // get context-specific cache of items previously looked up - sptr const& scope = ttask->GetScope(); - sptr cache = scope->get(cache_key); + SPTR const& scope = ttask->GetScope(); + SPTR cache = scope->get(cache_key); if (!cache) cache = m_cache; TPCollWrapper* ret = cache->get(phrasekey, dyn->revision()); // TO DO: we should revise the revision mechanism: we take the length @@ -666,7 +666,7 @@ namespace Moses // TO DO: have Bitexts return lists of PhrasePairs instead of pstats // no need to expand pstats at every single lookup again, especially // for btfix. - sptr sfix,sdyn; + SPTR sfix,sdyn; if (mfix.size() == sphrase.size()) sfix = btfix.lookup(ttask, mfix); if (mdyn.size() == sphrase.size()) sdyn = dyn->lookup(ttask, mdyn); @@ -774,8 +774,8 @@ namespace Moses Mmsapt:: InitializeForInput(ttasksptr const& ttask) { - sptr const& scope = ttask->GetScope(); - sptr context + SPTR const& scope = ttask->GetScope(); + SPTR context = scope->get(&btfix, true); if (m_bias_server.size() && context->bias == NULL) { // we need to create the bias @@ -815,7 +815,7 @@ namespace Moses if (!context->cache2) context->cache2.reset(new pstats::cache_t); } boost::unique_lock mylock(m_lock); - sptr localcache = scope->get(cache_key); + SPTR localcache = scope->get(cache_key); if (!localcache) { if (context->bias) localcache.reset(new TPCollCache(m_cache_size)); @@ -856,7 +856,7 @@ namespace Moses return true; } - sptr > dyn; + SPTR > dyn; { // braces are needed for scoping lock! boost::unique_lock guard(m_lock); dyn = btdyn; @@ -877,7 +877,7 @@ namespace Moses Mmsapt ::Release(ttasksptr const& ttask, TargetPhraseCollection*& tpc) const { - sptr cache = ttask->GetScope()->get(cache_key); + SPTR cache = ttask->GetScope()->get(cache_key); TPCollWrapper* foo = static_cast(tpc); if (cache) cache->release(foo); tpc = NULL; @@ -889,7 +889,7 @@ namespace Moses string const& Mmsapt ::GetName() const { return m_name; } - // sptr + // SPTR // Mmsapt // ::setupDocumentBias(map const& bias) const // { diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index 82003a24f..fd920b14a 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -62,9 +62,9 @@ namespace Moses typedef TSA tsa; typedef PhraseScorer pscorer; private: - // vector > shards; + // vector > shards; mmbitext btfix; - sptr btdyn; + SPTR btdyn; std::string m_bname, m_extra_data, m_bias_file,m_bias_server; std::string L1; std::string L2; @@ -96,27 +96,27 @@ namespace Moses std::vector m_is_logval; // keeps track of which features are log valued std::vector m_is_integer; // keeps track of which features are integer valued - std::vector > m_active_ff_fix; // activated feature functions (fix) - std::vector > m_active_ff_dyn; // activated feature functions (dyn) - std::vector > m_active_ff_common; + std::vector > m_active_ff_fix; // activated feature functions (fix) + std::vector > m_active_ff_dyn; // activated feature functions (dyn) + std::vector > m_active_ff_common; // activated feature functions (dyn) void - register_ff(sptr const& ff, std::vector > & registry); + register_ff(SPTR const& ff, std::vector > & registry); template void - check_ff(std::string const ffname,std::vector >* registry = NULL); + check_ff(std::string const ffname,std::vector >* registry = NULL); // add feature function if specified template void check_ff(std::string const ffname, float const xtra, - std::vector >* registry = NULL); + std::vector >* registry = NULL); // add 
feature function if specified void - add_corpus_specific_features(std::vector >& ffvec); + add_corpus_specific_features(std::vector >& ffvec); // built-in feature functions // PScorePfwd calc_pfwd_fix, calc_pfwd_dyn; @@ -153,7 +153,7 @@ namespace Moses Phrase const& src, Moses::bitext::PhrasePair* fix, Moses::bitext::PhrasePair* dyn, - sptr > const& dynbt) const; + SPTR > const& dynbt) const; void process_pstats @@ -233,13 +233,13 @@ namespace Moses void CleanUpAfterSentenceProcessing(ttasksptr const& ttask); // align two new sentences - sptr > + SPTR > align(std::string const& src, std::string const& trg) const; std::vector const& GetFeatureNames() const; - sptr + SPTR setupDocumentBias(std::map const& bias) const; vector DefaultWeights() const; diff --git a/moses/TranslationModel/UG/mmsapt_align.cc b/moses/TranslationModel/UG/mmsapt_align.cc index 13d8387d2..ea00a6ef9 100644 --- a/moses/TranslationModel/UG/mmsapt_align.cc +++ b/moses/TranslationModel/UG/mmsapt_align.cc @@ -105,7 +105,7 @@ // vector s,t; // pidmap_t sspan2pid, tspan2pid; // span -> phrase ID // pid2span_t spid2span,tpid2span; -// vector > > spstats; +// vector > > spstats; // vector PP; // // position-independent phrase pair info @@ -115,7 +115,7 @@ // // maps from target start positions to PhraseAlnHyps starting at // // that position -// sptr getPstats(span const& sspan); +// SPTR getPstats(span const& sspan); // void fill_tspan_maps(); // void fill_sspan_maps(); // public: @@ -183,14 +183,14 @@ // } // } -// sptr +// SPTR // Alignment:: // getPstats(span const& sspan) // { // size_t k = sspan.second - sspan.first - 1; // if (k < spstats[sspan.first].size()) // return spstats[sspan.first][k]; -// else return sptr(); +// else return SPTR(); // } // void @@ -278,7 +278,7 @@ // if (!L->second.size()) continue; // should never happen anyway // int i = L->second[0].first; // int k = L->second[0].second - i -1; -// sptr ps = spstats[i][k]; +// SPTR ps = spstats[i][k]; // PhrasePair pp; pp.init(L->first,*ps, PT.m_numScoreComponents); // jStatsTable & J = ps->trg; // for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y) @@ -318,7 +318,7 @@ // return ret; // } -// sptr > +// SPTR > // Mmsapt:: // align(string const& src, string const& trg) const // { @@ -328,7 +328,7 @@ // VectorIndexSorter foo(A.PAH); // vector o; foo.GetOrder(o); // BOOST_FOREACH(int i, o) A.show(cout,A.PAH[i]); -// sptr > aln; +// SPTR > aln; // return aln; // } // } diff --git a/moses/TranslationModel/UG/spe-check-coverage3.cc b/moses/TranslationModel/UG/spe-check-coverage3.cc index a62daa7b8..62f078044 100644 --- a/moses/TranslationModel/UG/spe-check-coverage3.cc +++ b/moses/TranslationModel/UG/spe-check-coverage3.cc @@ -105,7 +105,7 @@ int main(int argc, char* argv[]) exit(1); } bg.open(argv[1],argv[2],argv[3]); - sptr fg(new imbitext(bg.V1,bg.V2)); + SPTR fg(new imbitext(bg.V1,bg.V2)); string base = argv[4]; if (*base.rbegin() != '.') base += '.'; string srcfile = base + argv[2]; @@ -124,10 +124,10 @@ int main(int argc, char* argv[]) // show_pair(sid); vector snt; fill_token_seq(*bg.V1,src[sid],snt); - vector > > > > FG,BG; + vector > > > > FG,BG; fg->lookup(snt,*fg->I1,FG,NULL,NULL,&bias,true); bg.lookup(snt,*bg.I1,BG,NULL,NULL,NULL,true); - set > > > seen; + set > > > seen; for (size_t i = 0; i < snt.size(); ++i) { Bitext::iter m0(fg->I1.get()); @@ -153,7 +153,7 @@ int main(int argc, char* argv[]) BOOST_FOREACH(PhrasePair const& pp, *FG[i][k]) { if (pp.joint < 2) continue; - sptr bgstats; + SPTR bgstats; jstats const* bgjstats = NULL; 
Bitext::iter m2(bg.I2.get(), pp.start2, pp.len2); if (m1.approxOccurrenceCount() > 5000 || diff --git a/moses/TranslationModel/UG/try-align.cc b/moses/TranslationModel/UG/try-align.cc index 60eabb9e7..c40fd2388 100644 --- a/moses/TranslationModel/UG/try-align.cc +++ b/moses/TranslationModel/UG/try-align.cc @@ -57,7 +57,7 @@ namespace stats struct SinglePhrase { - typedef map > cache_t; + typedef map > cache_t; uint64_t pid; // phrase id vector occs; // occurrences }; @@ -72,7 +72,7 @@ struct PhrasePair struct stats_t { - typedef map, sptr > cache_t; + typedef map, SPTR > cache_t; size_t m1,m2,j; float npmi; // normalized point-wise mutual information float pmi; // point-wise mutual information @@ -170,7 +170,7 @@ void lookup_phrases(vector const& snt, TokenIndex& V, ttrack_t const& T, tsa_t const& I, SinglePhrase::cache_t& cache, - vector > >& dest) + vector > >& dest) { dest.resize(snt.size()); for (size_t i = 0; i < snt.size(); ++i) @@ -181,7 +181,7 @@ lookup_phrases(vector const& snt, { if (m.approxOccurrenceCount() < 3) break; // if (k - i > 0) break; - sptr& o = cache[m.getPid()]; + SPTR& o = cache[m.getPid()]; if (!o) { o.reset(new SinglePhrase()); @@ -249,7 +249,7 @@ int main(int argc, char* argv[]) while (getline(cin,line1) and getline(cin,line2)) { cout << "\n" << line1 << "\n" << line2 << endl; - vector > > M1,M2; + vector > > M1,M2; vector snt1,snt2; V1.fillIdSeq(line1,snt1); V2.fillIdSeq(line2,snt2); @@ -282,7 +282,7 @@ int main(int argc, char* argv[]) for (size_t k2 = 0; k2 < M2[i2].size(); ++k2) { pp.e2 = i2 + k2 + 1; - sptr & s + SPTR & s = ppcache[make_pair(M1[i1][k1]->pid,M2[i2][k2]->pid)]; if (!s) { diff --git a/moses/TranslationModel/UG/try-align2.cc b/moses/TranslationModel/UG/try-align2.cc index a18ce8d92..129be4f4d 100644 --- a/moses/TranslationModel/UG/try-align2.cc +++ b/moses/TranslationModel/UG/try-align2.cc @@ -70,7 +70,7 @@ namespace stats struct SinglePhrase { - typedef map > cache_t; + typedef map > cache_t; uint64_t pid; // phrase id vector occs; // occurrences }; @@ -85,7 +85,7 @@ struct PhrasePair2 struct stats_t { - typedef map, sptr > cache_t; + typedef map, SPTR > cache_t; size_t m1,m2,j; float npmi; // normalized point-wise mutual information float pmi; // point-wise mutual information @@ -183,7 +183,7 @@ void lookup_phrases(vector const& snt, TokenIndex& V, ttrack_t const& T, tsa_t const& I, SinglePhrase::cache_t& cache, - vector > >& dest) + vector > >& dest) { dest.resize(snt.size()); for (size_t i = 0; i < snt.size(); ++i) @@ -194,7 +194,7 @@ lookup_phrases(vector const& snt, { if (m.approxOccurrenceCount() < 3) break; // if (k - i > 0) break; - sptr& o = cache[m.getPid()]; + SPTR& o = cache[m.getPid()]; if (!o) { o.reset(new SinglePhrase()); @@ -655,7 +655,7 @@ int main(int argc, char* argv[]) vector snt1,snt2; fill_token_seq(*BT.V1,line1,snt1); fill_token_seq(*BT.V2,line2,snt2); - vector > > > > pt1,pt2; + vector > > > > pt1,pt2; vector > pm1,pm2; BT.lookup(snt1,*BT.I1,pt1,&pm1,&scorer); BT.lookup(snt2,*BT.I2,pt2,&pm2,&scorer); @@ -672,7 +672,7 @@ int main(int argc, char* argv[]) for (ushort k = 0; k < pm2[i].size(); ++k) p2s2[pm2[i][k]].push_back(make_pair(i,i+k+1)); - boost::unordered_map > > > all1,all2; + boost::unordered_map > > > all1,all2; vector > pp_all; for (size_t i = 0; i < pt2.size(); ++i) for (size_t k = 0; k < pt2[i].size(); ++k) From b7f517bafa9cefe77c3d7cd136b3dd7a5aeae2da Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 2 Aug 2015 21:22:52 +0100 Subject: [PATCH 205/286] Restore readability. 
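The hunks below only re-indent moses/thread_safe_container.h, but the container's semantics are easy to misread from the diff alone, so here is a minimal usage sketch. It relies only on the member functions visible in the hunks; the <KEY, VAL> template argument order and the default constructor are assumed.

    #include "moses/thread_safe_container.h"
    #include <cassert>
    #include <string>

    void thread_safe_container_sketch()
    {
      Moses::ThreadSafeContainer<int, std::string> c; // template args assumed

      // set() takes a unique lock and, like std::map::insert, does NOT
      // overwrite an existing entry: it returns whatever is already stored.
      std::string const& a = c.set(1, "first");
      std::string const& b = c.set(1, "second");
      assert(a == "first" && b == "first");

      // get(key, default) also locks exclusively and INSERTS the default
      // value when the key is missing ...
      std::string const* p = c.get(2, "fallback");
      assert(p && *p == "fallback");

      // ... whereas the const overload only reads under a shared lock and
      // returns NULL for unknown keys.
      assert(c.get(3) == NULL);

      c.erase(1); // returns the number of entries removed (0 or 1)
    }
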
--- moses/thread_safe_container.h | 51 +++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/moses/thread_safe_container.h b/moses/thread_safe_container.h index 0045c73a9..e502677d6 100644 --- a/moses/thread_safe_container.h +++ b/moses/thread_safe_container.h @@ -23,7 +23,7 @@ namespace Moses template > class - ThreadSafeContainer +ThreadSafeContainer { protected: mutable boost::shared_mutex m_lock; @@ -49,28 +49,33 @@ public: : m_lock(lock), m_container(container), m_iter(iter) { } - entry_t const& operator->() { + entry_t const& operator->() + { UTIL_THROW_IF2(m_container == NULL, "This locking iterator is invalid " << "or has not been assigned."); return m_iter.operator->(); } - - // locking operators transfer the lock upon assignment and become invalid + + // locking operators transfer the lock upon assignment and become + // invalid locking_iterator const& - operator=(locking_iterator& other) { + operator=(locking_iterator& other) + { m_lock.swap(other.m_lock); m_iter = other.m_iter; other.m_iter = other.m_container.end(); } - + bool - operator==(const_iter_t const& other) { + operator==(const_iter_t const& other) + { return m_iter == other; } - + locking_iterator const& - operator++() { - ++m_iter; + operator++() + { + ++m_iter; return *this; } @@ -82,15 +87,21 @@ public: operator++(int); }; - const_iter_t const& end() const { + const_iter_t const& + end() const + { return m_container.end(); } - locking_iterator begin() const { + locking_iterator + begin() const + { return locking_iterator(m_lock, this, m_container.begin()); } - VAL const& set(KEY const& key, VAL const& val) { + VAL const& + set(KEY const& key, VAL const& val) + { boost::unique_lock< boost::shared_mutex > lock(m_lock); entry_t entry(key,val); iter_t foo = m_container.insert(entry).first; @@ -98,21 +109,27 @@ public: return foo->second; } - VAL const* get(KEY const& key, VAL const& default_val) { + VAL const* + get(KEY const& key, VAL const& default_val) + { boost::unique_lock< boost::shared_mutex > lock(m_lock); entry_t entry(key, default_val); iter_t foo = m_container.insert(entry).first; return &(foo->second); } - VAL const* get(KEY const& key) const { + VAL const* + get(KEY const& key) const + { boost::shared_lock< boost::shared_mutex > lock(m_lock); const_iter_t m = m_container.find(key); if (m == m_container.end()) return NULL; return &m->second; } - - size_t erase(KEY const& key) { + + size_t + erase(KEY const& key) + { boost::unique_lock< boost::shared_mutex > lock(m_lock); return m_container.erase(key); } From 511de3674e41c2773d951251e60b28d81b25edb0 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 2 Aug 2015 21:26:54 +0100 Subject: [PATCH 206/286] Reorganization of server options. 
--- .beautify-ignore | 2 ++ moses/ExportInterface.cpp | 2 +- moses/parameters/ServerOptions.h | 1 + moses/server/Translator.h | 13 +++++++++---- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.beautify-ignore b/.beautify-ignore index 15221c86a..b7eb51a20 100644 --- a/.beautify-ignore +++ b/.beautify-ignore @@ -23,6 +23,7 @@ mingw/MosesGUI/Ui_mainWindow.py moses/TranslationModel/UG moses/server moses/parameters +moses/thread_safe_container.h phrase-extract/pcfg-common phrase-extract/syntax-common randlm @@ -34,3 +35,4 @@ srilm util xmlrpc-c .git +util/ug_cache_with_timeout.h diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp index a6cb97918..705b1b3fb 100644 --- a/moses/ExportInterface.cpp +++ b/moses/ExportInterface.cpp @@ -154,7 +154,7 @@ run_as_server() xmlrpc_c::registry myRegistry; xmlrpc_c::methodPtr const - translator(new MosesServer::Translator(sopts.num_threads)), + translator(new MosesServer::Translator(sopts)), updater(new MosesServer::Updater), optimizer(new MosesServer::Optimizer); diff --git a/moses/parameters/ServerOptions.h b/moses/parameters/ServerOptions.h index e9889f8d4..aa5f47018 100644 --- a/moses/parameters/ServerOptions.h +++ b/moses/parameters/ServerOptions.h @@ -1,4 +1,5 @@ // -*- mode: c++; cc-style: gnu -*- +#pragma once #include #include "moses/Parameter.h" namespace Moses diff --git a/moses/server/Translator.h b/moses/server/Translator.h index 4a6f889e8..21288e542 100644 --- a/moses/server/Translator.h +++ b/moses/server/Translator.h @@ -2,6 +2,8 @@ #pragma once #include "moses/ThreadPool.h" +#include "moses/parameters/ServerOptions.h" +#include "session.h" #include #include #include @@ -11,16 +13,19 @@ namespace MosesServer { class - // MosesServer:: - Translator : public xmlrpc_c::method +Translator : public xmlrpc_c::method { + Moses::ServerOptions m_server_options; public: - Translator(size_t numThreads = 10); - + Translator(Moses::ServerOptions const& sopts); + void execute(xmlrpc_c::paramList const& paramList, xmlrpc_c::value * const retvalP); + + Session const& get_session(uint64_t session_id); private: Moses::ThreadPool m_threadPool; + SessionCache m_session_cache; }; } From 6c5c5f60dafb6a1e09a33ce5c99a6dad45fdecbd Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 2 Aug 2015 21:28:47 +0100 Subject: [PATCH 207/286] Initial check-in of translation session management in the server. 
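The new Session/SessionCache pair gives each client session its own long-lived Moses::ContextScope, so context attached to one request (weights, caches, sampling bias) can be picked up again by later requests instead of being rebuilt every time. By the convention noted in TranslationRequest.h below, a session id of 0 means "no session", 1 asks the server to open a new one, and any larger value resumes an existing session. A minimal sketch of the cache behaviour, using only the members added in this patch; the surrounding server wiring and build setup are assumed:

    #include "moses/server/session.h"

    void session_cache_sketch()
    {
      MosesServer::SessionCache cache;

      // An id that is not yet in the cache makes operator[] create a fresh
      // Session (with its own ContextScope) under a unique lock.
      MosesServer::Session const& fresh = cache[1];

      // Looking up the id of an existing session finds the same entry again
      // and merely refreshes its last_access timestamp, so the ContextScope
      // survives across requests.
      MosesServer::Session const& again = cache[fresh.id];
      bool const same_scope = (fresh.scope == again.scope); // shared_ptrs to one scope
      (void) same_scope;
    }

Keeping the scope inside the cache, rather than inside each request, is what lets TranslationRequest::create (below) hand the same ContextScope to every request that carries that session id.
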
--- moses/server/TranslationRequest.cpp | 59 ++++++++++++++++++----------- moses/server/TranslationRequest.h | 11 ++++-- moses/server/Translator.cpp | 17 +++++++-- moses/server/session.h | 59 +++++++++++++++++++++++++++++ 4 files changed, 116 insertions(+), 30 deletions(-) create mode 100644 moses/server/session.h diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index 72cd2b405..a73c31962 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -24,14 +24,17 @@ using Moses::Sentence; boost::shared_ptr TranslationRequest:: -create(xmlrpc_c::paramList const& paramList, +create(Translator& translator, xmlrpc_c::paramList const& paramList, boost::condition_variable& cond, boost::mutex& mut) { boost::shared_ptr ret; ret.reset(new TranslationRequest(paramList,cond, mut)); ret->m_self = ret; - ret->m_scope.reset(new Moses::ContextScope); + if (ret->m_session_id) + ret->m_scope = translator.get_session(ret->m_session_id).scope; + else + ret->m_scope.reset(new Moses::ContextScope); return ret; } @@ -224,6 +227,7 @@ TranslationRequest(xmlrpc_c::paramList const& paramList, boost::condition_variable& cond, boost::mutex& mut) : m_cond(cond), m_mutex(mut), m_done(false), m_paramList(paramList) , m_nbestSize(0) + , m_session_id(0) { } void @@ -242,6 +246,12 @@ parse_request(std::map const& params) m_source_string = xmlrpc_c::value_string(si->second); XVERBOSE(1,"Input: " << m_source_string << endl); + si = params.find("session_id"); + if (si != params.end()) + m_session_id = xmlrpc_c::value_int(si->second); + else + m_session_id = 0; + m_withAlignInfo = check(params, "align"); m_withWordAlignInfo = check(params, "word-align"); m_withGraphInfo = check(params, "sg"); @@ -251,33 +261,36 @@ parse_request(std::map const& params) m_withScoreBreakdown = check(params, "add-score-breakdown"); m_source.reset(new Sentence(0,m_source_string)); si = params.find("lambda"); - if (si != params.end()) { - // muMo = multiModel - xmlrpc_c::value_array muMoArray = xmlrpc_c::value_array(si->second); - vector muMoValVec(muMoArray.vectorValueValue()); - vector w(muMoValVec.size()); - for (size_t i = 0; i < muMoValVec.size(); ++i) - w[i] = xmlrpc_c::value_double(muMoValVec[i]); - if (w.size() && (si = params.find("model_name")) != params.end()) { - string const model_name = xmlrpc_c::value_string(si->second); - PhraseDictionaryMultiModel* pdmm - = (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name); - // Moses::PhraseDictionaryMultiModel* pdmm - // = FindPhraseDictionary(model_name); - pdmm->SetTemporaryMultiModelWeightsVector(w); + if (si != params.end()) + { + // muMo = multiModel + xmlrpc_c::value_array muMoArray = xmlrpc_c::value_array(si->second); + vector muMoValVec(muMoArray.vectorValueValue()); + vector w(muMoValVec.size()); + for (size_t i = 0; i < muMoValVec.size(); ++i) + w[i] = xmlrpc_c::value_double(muMoValVec[i]); + if (w.size() && (si = params.find("model_name")) != params.end()) + { + string const model_name = xmlrpc_c::value_string(si->second); + PhraseDictionaryMultiModel* pdmm + = (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name); + // Moses::PhraseDictionaryMultiModel* pdmm + // = FindPhraseDictionary(model_name); + pdmm->SetTemporaryMultiModelWeightsVector(w); + } } - } - + si = params.find("nbest"); if (si != params.end()) m_nbestSize = xmlrpc_c::value_int(si->second); si = params.find("context"); - if (si != params.end()) { - string context = xmlrpc_c::value_string(si->second); - VERBOSE(1,"CONTEXT " << 
context); - m_context.reset(new std::vector(1,context)); - } + if (si != params.end()) + { + string context = xmlrpc_c::value_string(si->second); + VERBOSE(1,"CONTEXT " << context); + m_context.reset(new std::vector(1,context)); + } // // biased sampling for suffix-array-based sampling phrase table? // if ((si = params.find("bias")) != params.end()) // { diff --git a/moses/server/TranslationRequest.h b/moses/server/TranslationRequest.h index d67e55e03..728b6c525 100644 --- a/moses/server/TranslationRequest.h +++ b/moses/server/TranslationRequest.h @@ -19,12 +19,14 @@ #include "moses/TreeInput.h" #include "moses/TranslationTask.h" #include - #include + +#include "Translator.h" + namespace MosesServer { class - TranslationRequest : public virtual Moses::TranslationTask +TranslationRequest : public virtual Moses::TranslationTask { boost::condition_variable& m_cond; boost::mutex& m_mutex; @@ -44,6 +46,8 @@ class bool m_withScoreBreakdown; size_t m_nbestSize; + uint64_t m_session_id; // 0 means none, 1 means new + void parse_request(); @@ -99,7 +103,8 @@ public: static boost::shared_ptr - create(xmlrpc_c::paramList const& paramList, + create(Translator& translator, + xmlrpc_c::paramList const& paramList, boost::condition_variable& cond, boost::mutex& mut); diff --git a/moses/server/Translator.cpp b/moses/server/Translator.cpp index be8920abd..b91385066 100644 --- a/moses/server/Translator.cpp +++ b/moses/server/Translator.cpp @@ -8,8 +8,9 @@ using namespace std; using namespace Moses; Translator:: -Translator(size_t numThreads) - : m_threadPool(numThreads) +Translator(Moses::ServerOptions const& sopts) + : m_threadPool(sopts.num_threads), + m_server_options(sopts) { // signature and help strings are documentation -- the client // can query this information with a system.methodSignature and @@ -25,8 +26,8 @@ execute(xmlrpc_c::paramList const& paramList, { boost::condition_variable cond; boost::mutex mut; - boost::shared_ptr task - = TranslationRequest::create(paramList,cond,mut); + boost::shared_ptr task; + task = TranslationRequest::create(*this, paramList,cond,mut); m_threadPool.Submit(task); boost::unique_lock lock(mut); while (!task->IsDone()) @@ -34,4 +35,12 @@ execute(xmlrpc_c::paramList const& paramList, *retvalP = xmlrpc_c::value_struct(task->GetRetData()); } +Session const& +Translator:: +get_session(uint64_t const id) +{ + return m_session_cache[id]; +} + + } diff --git a/moses/server/session.h b/moses/server/session.h new file mode 100644 index 000000000..b396d50ce --- /dev/null +++ b/moses/server/session.h @@ -0,0 +1,59 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include "moses/Util.h" +#include "moses/ContextScope.h" +#include +#include +#ifdef WITH_THREADS +#include +#include +#endif +namespace MosesServer{ + + struct Session + { + uint64_t const id; + time_t start_time; + time_t last_access; + boost::shared_ptr const scope; // stores local info + + Session(uint64_t const session_id) + : id(session_id), scope(new Moses::ContextScope) + { + last_access = start_time = time(NULL); + } + + bool is_new() const { return last_access == start_time; } + }; + + class SessionCache + { + mutable boost::shared_mutex m_lock; + uint64_t m_session_counter; + boost::unordered_map m_cache; + public: + + SessionCache() : m_session_counter(1) {} + + Session const& + operator[](uint32_t id) + { + boost::upgrade_lock lock(m_lock); + if (id) + { + boost::unordered_map::iterator m = m_cache.find(id); + if (m != m_cache.end()) + { + m->second.last_access = time(NULL); + 
return m->second; + } + } + boost::upgrade_to_unique_lock xlock(lock); + if (!id) id = ++m_session_counter; + std::pair foo(id, Session(id)); + return m_cache.insert(foo).first->second; + } + }; + + +} From 44df19aa3bcc87842aece6796e867cf44debcd8c Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 3 Aug 2015 00:29:34 +0100 Subject: [PATCH 208/286] Make translation sessions in the server actually work. Time-out yet to be implemented. --- moses/server/TranslationRequest.cpp | 21 +++++++++++++++------ moses/server/TranslationRequest.h | 3 ++- moses/server/Translator.cpp | 2 +- moses/server/session.h | 4 ++-- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index a73c31962..bba57fc9f 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -24,17 +24,14 @@ using Moses::Sentence; boost::shared_ptr TranslationRequest:: -create(Translator& translator, xmlrpc_c::paramList const& paramList, +create(Translator* translator, xmlrpc_c::paramList const& paramList, boost::condition_variable& cond, boost::mutex& mut) { boost::shared_ptr ret; ret.reset(new TranslationRequest(paramList,cond, mut)); ret->m_self = ret; - if (ret->m_session_id) - ret->m_scope = translator.get_session(ret->m_session_id).scope; - else - ret->m_scope.reset(new Moses::ContextScope); + ret->m_translator = translator; return ret; } @@ -43,6 +40,16 @@ TranslationRequest:: Run() { parse_request(m_paramList.getStruct(0)); + // cerr << "SESSION ID" << ret->m_session_id << endl; + if (m_session_id) + { + Session const& S = m_translator->get_session(m_session_id); + m_scope = S.scope; + m_session_id = S.id; + // cerr << "SESSION ID" << m_session_id << endl; + } + else + m_scope.reset(new Moses::ContextScope); Moses::StaticData const& SD = Moses::StaticData::Instance(); @@ -378,7 +385,9 @@ run_phrase_decoder() manager.Decode(); pack_hypothesis(manager.GetBestHypothesis(), "text", m_retData); - + if (m_session_id) + m_retData["session_id"] = xmlrpc_c::value_int(m_session_id); + if (m_withGraphInfo) insertGraphInfo(manager,m_retData); if (m_withTopts) insertTranslationOptions(manager,m_retData); if (m_nbestSize) outputNBest(manager, m_retData); diff --git a/moses/server/TranslationRequest.h b/moses/server/TranslationRequest.h index 728b6c525..866eca20e 100644 --- a/moses/server/TranslationRequest.h +++ b/moses/server/TranslationRequest.h @@ -36,6 +36,7 @@ TranslationRequest : public virtual Moses::TranslationTask std::map m_retData; std::map m_bias; // for biased sampling + Translator* m_translator; std::string m_source_string, m_target_string; bool m_withAlignInfo; bool m_withWordAlignInfo; @@ -103,7 +104,7 @@ public: static boost::shared_ptr - create(Translator& translator, + create(Translator* translator, xmlrpc_c::paramList const& paramList, boost::condition_variable& cond, boost::mutex& mut); diff --git a/moses/server/Translator.cpp b/moses/server/Translator.cpp index b91385066..7a159b611 100644 --- a/moses/server/Translator.cpp +++ b/moses/server/Translator.cpp @@ -27,7 +27,7 @@ execute(xmlrpc_c::paramList const& paramList, boost::condition_variable cond; boost::mutex mut; boost::shared_ptr task; - task = TranslationRequest::create(*this, paramList,cond,mut); + task = TranslationRequest::create(this, paramList,cond,mut); m_threadPool.Submit(task); boost::unique_lock lock(mut); while (!task->IsDone()) diff --git a/moses/server/session.h b/moses/server/session.h index b396d50ce..cc0135615 100644 --- 
a/moses/server/session.h +++ b/moses/server/session.h @@ -39,7 +39,7 @@ namespace MosesServer{ operator[](uint32_t id) { boost::upgrade_lock lock(m_lock); - if (id) + if (id > 1) { boost::unordered_map::iterator m = m_cache.find(id); if (m != m_cache.end()) @@ -49,7 +49,7 @@ namespace MosesServer{ } } boost::upgrade_to_unique_lock xlock(lock); - if (!id) id = ++m_session_counter; + if (id==1) id = ++m_session_counter; std::pair foo(id, Session(id)); return m_cache.insert(foo).first->second; } From 8e33b9a07d955d98445a6af75c17edd952501d69 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 3 Aug 2015 10:24:01 +0100 Subject: [PATCH 209/286] Bug fix: ensure a new session ID is assigned every time a session isn't found. --- moses/server/session.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/server/session.h b/moses/server/session.h index cc0135615..02c455f07 100644 --- a/moses/server/session.h +++ b/moses/server/session.h @@ -49,7 +49,7 @@ namespace MosesServer{ } } boost::upgrade_to_unique_lock xlock(lock); - if (id==1) id = ++m_session_counter; + id = ++m_session_counter; std::pair foo(id, Session(id)); return m_cache.insert(foo).first->second; } From a39544bbcb05dea2fbe255bda929e62661ea3383 Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Mon, 3 Aug 2015 16:46:57 +0100 Subject: [PATCH 210/286] fix inconsistency in the example --- scripts/ems/example/data/weight_bilinguallm.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ems/example/data/weight_bilinguallm.ini b/scripts/ems/example/data/weight_bilinguallm.ini index fbe26fc03..38cb3861b 100644 --- a/scripts/ems/example/data/weight_bilinguallm.ini +++ b/scripts/ems/example/data/weight_bilinguallm.ini @@ -10,5 +10,5 @@ TranslationModel0= 0.2 0.2 0.2 0.2 PhrasePenalty0= 0.2 LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3 LM0= 0.5 -BilingualNPLM0= 0.1 +BLMbilingual-lm= 0.1 From f808b3203040115ba135063bd3f6022d67602685 Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Mon, 3 Aug 2015 16:47:25 +0100 Subject: [PATCH 211/286] support version of nplm that picks best on heldout --- scripts/ems/experiment.meta | 2 +- scripts/ems/experiment.perl | 2 +- scripts/training/create_nplm_ini.py | 9 +++++++-- scripts/training/train-neurallm.py | 11 ++++++++--- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index b9adb83f9..75f932399 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -186,7 +186,7 @@ train-nplm in: stripped-corpus out: binlm ignore-unless: nplm - rerun-on-change: stripped-corpus + rerun-on-change: stripped-corpus nplm-settings default-name: lm/nplm get-corpus in: get-corpus-script diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 8a8b35d78..cb56704d8 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -2783,7 +2783,7 @@ sub define_interpolated_lm_interpolate { my $weight_list = ""; foreach my $id_set (@{$$ILM_SETS{$factor}{$order}}) { my ($id,$set) = split(/ /,$id_set,2); - $lm_list .= $LM[$id].","; + $lm_list .= $LM[$id]."," if $LM[$id]; if (defined($weights)) { die("ERROR: no interpolation weight set for $factor:$order:$set (factor:order:set)") unless defined($WEIGHT{"$factor:$order:$set"}); diff --git a/scripts/training/create_nplm_ini.py b/scripts/training/create_nplm_ini.py index 557de511e..ff86920af 100755 --- a/scripts/training/create_nplm_ini.py +++ b/scripts/training/create_nplm_ini.py @@ -35,10 +35,15 @@ def main(): 
ini_filename = os.path.join(options.working_dir,options.ini_filename) + path = "%s/train.model.nplm.%s" % (options.working_dir, "best") + if not os.path.exists(path): + path = "%s/train.model.nplm.%s" % (options.working_dir, options.epochs) + + with open(ini_filename,"w") as ifh: print>>ifh, "[feature]" - print>>ifh,"NeuralLM factor=%s name=NPLM%s order=%s path=%s/train.model.nplm.%s" \ - % (options.factor,options.name, options.n, options.working_dir, options.epochs) + print>>ifh,"NeuralLM factor=%s name=NPLM%s order=%s path=%s" \ + % (options.factor,options.name, options.n, path) print>>ifh print>>ifh,"[weight]" print>>ifh,"NPLM%s= 0.1" % options.name diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py index 625fc69d0..d86286f67 100755 --- a/scripts/training/train-neurallm.py +++ b/scripts/training/train-neurallm.py @@ -203,10 +203,15 @@ def main(options): train_nplm.main(options) sys.stderr.write('averaging null words\n') + output_model_file = os.path.join( + options.output_dir, + options.output_model + '.model.nplm.best') + if not os.path.exists(output_model_file): + output_model_file = os.path.join( + options.output_dir, + options.output_model + '.model.nplm.' + str(options.epochs)) average_options = averageNullEmbedding.parser.parse_args([ - '-i', os.path.join( - options.output_dir, - options.output_model + '.model.nplm.' + str(options.epochs)), + '-i', output_model_file , '-o', os.path.join( options.output_dir, options.output_model + '.model.nplm'), '-t', os.path.join(options.working_dir, numberized_file), From 4c3a6a3f3fb66e4f489ae264595f0a1792339ed8 Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Mon, 3 Aug 2015 21:19:08 +0100 Subject: [PATCH 212/286] remove dash --- scripts/ems/example/config.toy.bilinguallm | 2 +- scripts/ems/example/data/weight_bilinguallm.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ems/example/config.toy.bilinguallm b/scripts/ems/example/config.toy.bilinguallm index 4055b452c..7fda90933 100644 --- a/scripts/ems/example/config.toy.bilinguallm +++ b/scripts/ems/example/config.toy.bilinguallm @@ -192,7 +192,7 @@ raw-corpus = $toy-data/nc-5k.$output-extension # #lm = -[LM:bilingual-lm] +[LM:bilinguallm] #bilingual-lm #required settings diff --git a/scripts/ems/example/data/weight_bilinguallm.ini b/scripts/ems/example/data/weight_bilinguallm.ini index 38cb3861b..55153e035 100644 --- a/scripts/ems/example/data/weight_bilinguallm.ini +++ b/scripts/ems/example/data/weight_bilinguallm.ini @@ -10,5 +10,5 @@ TranslationModel0= 0.2 0.2 0.2 0.2 PhrasePenalty0= 0.2 LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3 LM0= 0.5 -BLMbilingual-lm= 0.1 +BLMbilinguallm= 0.1 From 8644a3bbf239a3e16ff8b288a5688c4c56f5e073 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 3 Aug 2015 22:00:26 +0100 Subject: [PATCH 213/286] Renamed for file name consistency. --- moses/server/{session.h => Session.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename moses/server/{session.h => Session.h} (100%) diff --git a/moses/server/session.h b/moses/server/Session.h similarity index 100% rename from moses/server/session.h rename to moses/server/Session.h From c5b193346c420b873ee77b2e22a8bc7e5dedcb2d Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Tue, 4 Aug 2015 01:58:28 +0100 Subject: [PATCH 214/286] More extensive filtering of compiler warnings with --filter-warnings. 
--- Jamroot | 1 + 1 file changed, 1 insertion(+) diff --git a/Jamroot b/Jamroot index 2e8075916..130ad5d88 100644 --- a/Jamroot +++ b/Jamroot @@ -134,6 +134,7 @@ if [ option.get "filter-warnings" : : "yes" ] { requirements += -Wno-unused-result ; requirements += -Wno-unused-variable ; requirements += -Wcomment ; + requirements += -Wstrict-aliasing ; } if [ option.get "debug-build" : : "yes" ] { From fc10ad4afb812200d7529256797a10438cfaf1f3 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Tue, 4 Aug 2015 01:59:28 +0100 Subject: [PATCH 215/286] Code reorganization for better session management in Moses server. Clients can now close sessions. --- moses/ExportInterface.cpp | 45 ++---------------- moses/Util.h | 2 +- moses/server/CloseSession.cpp | 30 ++++++++++++ moses/server/CloseSession.h | 24 ++++++++++ moses/server/Server.cpp | 72 +++++++++++++++++++++++++++++ moses/server/Server.h | 40 ++++++++++++++++ moses/server/Session.h | 12 +++++ moses/server/TranslationRequest.cpp | 7 +-- moses/server/Translator.cpp | 10 ++-- moses/server/Translator.h | 33 +++++++------ 10 files changed, 209 insertions(+), 66 deletions(-) create mode 100644 moses/server/CloseSession.cpp create mode 100644 moses/server/CloseSession.h create mode 100644 moses/server/Server.cpp create mode 100644 moses/server/Server.h diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp index 705b1b3fb..a884f8e3b 100644 --- a/moses/ExportInterface.cpp +++ b/moses/ExportInterface.cpp @@ -63,10 +63,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include #include #include -#include "server/Translator.h" -#include "server/Optimizer.h" -#include "server/Updater.h" -#include "moses/parameters/ServerOptions.h" +#include "server/Server.h" #endif using namespace std; @@ -147,44 +144,8 @@ Parameter params; int run_as_server() { -#ifdef HAVE_XMLRPC_C - ServerOptions sopts(params); - if (sopts.is_serial) VERBOSE(1,"Running server in serial mode." << endl); - - xmlrpc_c::registry myRegistry; - - xmlrpc_c::methodPtr const - translator(new MosesServer::Translator(sopts)), - updater(new MosesServer::Updater), - optimizer(new MosesServer::Optimizer); - - myRegistry.addMethod("translate", translator); - myRegistry.addMethod("updater", updater); - myRegistry.addMethod("optimize", optimizer); - - xmlrpc_c::serverAbyss myAbyssServer( - xmlrpc_c::serverAbyss::constrOpt() - .registryP(&myRegistry) - .portNumber(sopts.port) // TCP port on which to listen - .logFileName(sopts.logfile) - .allowOrigin("*") - .maxConn(sopts.num_threads) - ); - - XVERBOSE(1,"Listening on port " << sopts.port << endl); - if (sopts.is_serial) - { - while(true) myAbyssServer.runOnce(); - } - else myAbyssServer.run(); - - std::cerr << "xmlrpc_c::serverAbyss.run() returned but should not." << std::endl; - // #pragma message("BUILDING MOSES WITH SERVER SUPPORT") -#else - // #pragma message("BUILDING MOSES WITHOUT SERVER SUPPORT") - std::cerr << "Moses was compiled without server support." << endl; -#endif - return 1; + MosesServer::Server server(params); + return server.run(); // actually: don't return. 
see Server::run() } int diff --git a/moses/Util.h b/moses/Util.h index b6d4ef613..58152f7ce 100644 --- a/moses/Util.h +++ b/moses/Util.h @@ -71,7 +71,7 @@ namespace Moses #ifdef IFVERBOSE #undef IFVERBOSE #endif -#define IFVERBOSE(level) if (StaticData::Instance().GetVerboseLevel() >= level) +#define IFVERBOSE(level) if (Moses::StaticData::Instance().GetVerboseLevel() >= level) #define XVERBOSE(level,str) VERBOSE(level, "[" << HERE << "] " << str) #define HERE __FILE__ << ":" << __LINE__ #define FEATUREVERBOSE(level,str) FEATUREVERBOSE2(level, "[" << GetScoreProducerDescription() << "] " << str) diff --git a/moses/server/CloseSession.cpp b/moses/server/CloseSession.cpp new file mode 100644 index 000000000..5cc6175d3 --- /dev/null +++ b/moses/server/CloseSession.cpp @@ -0,0 +1,30 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: -*- +#include "CloseSession.h" +#include "Server.h" +#include "moses/StaticData.h" + +namespace MosesServer +{ + CloseSession:: + CloseSession(Server& server) + : m_server(server) + { } + + void + CloseSession:: + execute(xmlrpc_c::paramList const& paramList, + xmlrpc_c::value * const retvalP) + { + typedef std::map params_t; + paramList.verifyEnd(1); // ??? UG + params_t const& params = paramList.getStruct(0); + params_t::const_iterator si = params.find("session-id"); + if (si != params.end()) + { + uint64_t session_id = xmlrpc_c::value_int(si->second); + m_server.delete_session(session_id); + *retvalP = xmlrpc_c::value_string("Session closed"); + } + } + +} diff --git a/moses/server/CloseSession.h b/moses/server/CloseSession.h new file mode 100644 index 000000000..833781a68 --- /dev/null +++ b/moses/server/CloseSession.h @@ -0,0 +1,24 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: -*- +#pragma once +#include +#include +#include +#ifndef WITH_THREADS +#pragma message("COMPILING WITHOUT THREADS!") +#endif +namespace MosesServer +{ + class Server; + class + CloseSession : public xmlrpc_c::method + { + Server& m_server; + public: + CloseSession(Server& server); + + void execute(xmlrpc_c::paramList const& paramList, + xmlrpc_c::value * const retvalP); + + }; + +} diff --git a/moses/server/Server.cpp b/moses/server/Server.cpp new file mode 100644 index 000000000..fde116224 --- /dev/null +++ b/moses/server/Server.cpp @@ -0,0 +1,72 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "Server.h" + +namespace MosesServer +{ + Server:: + Server(Moses::Parameter& params) + : m_server_options(params), + m_updater(new Updater), + m_optimizer(new Optimizer), + m_translator(new Translator(*this)), + m_close_session(new CloseSession(*this)) + { + m_registry.addMethod("translate", m_translator); + m_registry.addMethod("updater", m_updater); + m_registry.addMethod("optimize", m_optimizer); + m_registry.addMethod("close_session", m_close_session); + } + + int + Server:: + run() + { +#ifdef HAVE_XMLRPC_C + xmlrpc_c::serverAbyss myAbyssServer + (xmlrpc_c::serverAbyss::constrOpt() + .registryP(&m_registry) + .portNumber(m_server_options.port) // TCP port on which to listen + .logFileName(m_server_options.logfile) + .allowOrigin("*") + .maxConn(m_server_options.num_threads)); + + XVERBOSE(1,"Listening on port " << m_server_options.port << endl); + if (m_server_options.is_serial) + { + VERBOSE(1,"Running server in serial mode." << endl); + while(true) myAbyssServer.runOnce(); + } + else myAbyssServer.run(); + + std::cerr << "xmlrpc_c::serverAbyss.run() returned but should not." 
<< std::endl; + // #pragma message("BUILDING MOSES WITH SERVER SUPPORT") +#else + // #pragma message("BUILDING MOSES WITHOUT SERVER SUPPORT") + std::cerr << "Moses was compiled without server support." << endl; +#endif + return 1; + } + + Moses::ServerOptions const& + Server:: + options() const + { + return m_server_options; + } + + Session const& + Server:: + get_session(uint64_t session_id) + { + return m_session_cache[session_id]; + } + + void + Server:: + delete_session(uint64_t const session_id) + { + return m_session_cache.erase(session_id); + } + + +} diff --git a/moses/server/Server.h b/moses/server/Server.h new file mode 100644 index 000000000..72cfa5640 --- /dev/null +++ b/moses/server/Server.h @@ -0,0 +1,40 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#ifdef HAVE_XMLRPC_C +#include +#include +#include +#include "Translator.h" +#include "Optimizer.h" +#include "Updater.h" +#include "CloseSession.h" +#include "Session.h" +#include "moses/parameters/ServerOptions.h" +#endif + +namespace MosesServer +{ + class Server + { + Moses::ServerOptions m_server_options; + SessionCache m_session_cache; + xmlrpc_c::registry m_registry; + xmlrpc_c::methodPtr const m_updater; + xmlrpc_c::methodPtr const m_optimizer; + xmlrpc_c::methodPtr const m_translator; + xmlrpc_c::methodPtr const m_close_session; + + public: + Server(Moses::Parameter& params); + + int run(); + void delete_session(uint64_t const session_id); + + Moses::ServerOptions const& + options() const; + + Session const& + get_session(uint64_t session_id); + + }; +} diff --git a/moses/server/Session.h b/moses/server/Session.h index 02c455f07..daba6d006 100644 --- a/moses/server/Session.h +++ b/moses/server/Session.h @@ -4,6 +4,7 @@ #include "moses/ContextScope.h" #include #include + #ifdef WITH_THREADS #include #include @@ -17,6 +18,8 @@ namespace MosesServer{ time_t last_access; boost::shared_ptr const scope; // stores local info + + Session(uint64_t const session_id) : id(session_id), scope(new Moses::ContextScope) { @@ -53,6 +56,15 @@ namespace MosesServer{ std::pair foo(id, Session(id)); return m_cache.insert(foo).first->second; } + + void + erase(uint32_t const id) + { + boost::unique_lock lock(m_lock); + m_cache.erase(id); + } + + }; diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index bba57fc9f..454ac8a65 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -39,7 +39,8 @@ void TranslationRequest:: Run() { - parse_request(m_paramList.getStruct(0)); + std::mapconst& params = m_paramList.getStruct(0); + parse_request(params); // cerr << "SESSION ID" << ret->m_session_id << endl; if (m_session_id) { @@ -253,7 +254,7 @@ parse_request(std::map const& params) m_source_string = xmlrpc_c::value_string(si->second); XVERBOSE(1,"Input: " << m_source_string << endl); - si = params.find("session_id"); + si = params.find("session-id"); if (si != params.end()) m_session_id = xmlrpc_c::value_int(si->second); else @@ -386,7 +387,7 @@ run_phrase_decoder() pack_hypothesis(manager.GetBestHypothesis(), "text", m_retData); if (m_session_id) - m_retData["session_id"] = xmlrpc_c::value_int(m_session_id); + m_retData["session-id"] = xmlrpc_c::value_int(m_session_id); if (m_withGraphInfo) insertGraphInfo(manager,m_retData); if (m_withTopts) insertTranslationOptions(manager,m_retData); diff --git a/moses/server/Translator.cpp b/moses/server/Translator.cpp index 7a159b611..42275e0fd 100644 --- a/moses/server/Translator.cpp +++ 
b/moses/server/Translator.cpp @@ -1,5 +1,6 @@ #include "Translator.h" #include "TranslationRequest.h" +#include "Server.h" namespace MosesServer { @@ -8,9 +9,9 @@ using namespace std; using namespace Moses; Translator:: -Translator(Moses::ServerOptions const& sopts) - : m_threadPool(sopts.num_threads), - m_server_options(sopts) +Translator(Server& server) + : m_threadPool(server.options().num_threads), + m_server(server) { // signature and help strings are documentation -- the client // can query this information with a system.methodSignature and @@ -39,8 +40,7 @@ Session const& Translator:: get_session(uint64_t const id) { - return m_session_cache[id]; + return m_server.get_session(id); } - } diff --git a/moses/server/Translator.h b/moses/server/Translator.h index 21288e542..8137f59bd 100644 --- a/moses/server/Translator.h +++ b/moses/server/Translator.h @@ -3,7 +3,7 @@ #include "moses/ThreadPool.h" #include "moses/parameters/ServerOptions.h" -#include "session.h" +#include "Session.h" #include #include #include @@ -12,20 +12,23 @@ #endif namespace MosesServer { -class -Translator : public xmlrpc_c::method -{ - Moses::ServerOptions m_server_options; -public: - Translator(Moses::ServerOptions const& sopts); - - void execute(xmlrpc_c::paramList const& paramList, - xmlrpc_c::value * const retvalP); - Session const& get_session(uint64_t session_id); -private: - Moses::ThreadPool m_threadPool; - SessionCache m_session_cache; -}; + class Server; + + class + Translator : public xmlrpc_c::method + { + Server& m_server; + // Moses::ServerOptions m_server_options; + public: + Translator(Server& server); + + void execute(xmlrpc_c::paramList const& paramList, + xmlrpc_c::value * const retvalP); + + Session const& get_session(uint64_t session_id); + private: + Moses::ThreadPool m_threadPool; + }; } From 3323a027278d099332a184971fc2c370ebceb2ca Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 5 Aug 2015 02:15:34 +0100 Subject: [PATCH 216/286] Option reorganisation in StaticData. 
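This commit gathers the scattered scalar members of StaticData (stack size, beam width, early-discarding threshold, cube-pruning limits, distortion limit, input type, XML brackets, n-best settings, context window, ...) into small per-concern structs -- SearchOptions, CubePruningOptions, NBestOptions, ReorderingOptions, ContextParameters, InputOptions -- collected in a single AllOptions object that is initialised once from the Parameter object. The old StaticData getters remain as thin wrappers over the new fields, and a free function is_syntax(SearchAlgorithm) takes over the algorithm test previously done inside StaticData::IsSyntax(algo). The snippet below is an illustrative sketch only, not part of the diff; it assumes this patch has been applied and shows the old and new ways of reading the same settings from client code:

    // Illustrative sketch, not part of this patch; assumes the reorganised
    // StaticData / AllOptions interface introduced below.
    #include <iostream>
    #include "moses/StaticData.h"

    using namespace Moses;

    void report_search_settings()
    {
      const StaticData &SD = StaticData::Instance();

      // Old style: one flat getter per setting on StaticData
      // (these remain available as thin wrappers after this patch).
      size_t stack_old = SD.GetMaxHypoStackSize();

      // New style: the same settings, grouped by concern in AllOptions.
      const AllOptions &opts = SD.options();
      std::cerr << "stack="             << opts.search.stack_size        // == stack_old
                << " pop-limit="        << opts.cube.pop_limit
                << " distortion-limit=" << opts.reordering.max_distortion
                << " syntax="           << is_syntax(opts.search.algo)   // was SD.IsSyntax(algo)
                << std::endl;
      (void) stack_old;
    }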
--- moses/ConfusionNet.cpp | 9 +- moses/FF/BleuScoreFeature.cpp | 2 +- moses/FF/HyperParameterAsWeight.cpp | 6 +- moses/Sentence.cpp | 6 +- moses/StaticData.cpp | 187 +++++------------------ moses/StaticData.h | 190 ++++++++---------------- moses/TranslationTask.cpp | 2 +- moses/TypeDef.h | 11 +- moses/parameters/AllOptions.cpp | 31 ++++ moses/parameters/AllOptions.h | 31 ++++ moses/parameters/BeamSearchOptions.h | 15 ++ moses/parameters/ContextParameters.cpp | 9 +- moses/parameters/ContextParameters.h | 2 +- moses/parameters/CubePruningOptions.cpp | 19 +++ moses/parameters/CubePruningOptions.h | 20 +++ moses/parameters/InputOptions.cpp | 65 ++++++++ moses/parameters/InputOptions.h | 25 ++++ moses/parameters/NBestOptions.h | 7 +- moses/parameters/ReorderingOptions.cpp | 21 +++ moses/parameters/ReorderingOptions.h | 20 +++ moses/parameters/SearchOptions.cpp | 39 +++++ moses/parameters/SearchOptions.h | 34 +++++ 22 files changed, 445 insertions(+), 306 deletions(-) create mode 100644 moses/parameters/AllOptions.cpp create mode 100644 moses/parameters/AllOptions.h create mode 100644 moses/parameters/BeamSearchOptions.h create mode 100644 moses/parameters/CubePruningOptions.cpp create mode 100644 moses/parameters/CubePruningOptions.h create mode 100644 moses/parameters/InputOptions.cpp create mode 100644 moses/parameters/InputOptions.h create mode 100644 moses/parameters/ReorderingOptions.cpp create mode 100644 moses/parameters/ReorderingOptions.h create mode 100644 moses/parameters/SearchOptions.cpp create mode 100644 moses/parameters/SearchOptions.h diff --git a/moses/ConfusionNet.cpp b/moses/ConfusionNet.cpp index 0c355fd94..d9f92cd47 100644 --- a/moses/ConfusionNet.cpp +++ b/moses/ConfusionNet.cpp @@ -1,3 +1,4 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- // $Id$ #include "ConfusionNet.h" @@ -65,10 +66,10 @@ ConfusionNet() : InputType() { stats.createOne(); - const StaticData& staticData = StaticData::Instance(); - if (staticData.IsSyntax()) { - m_defaultLabelSet.insert(StaticData::Instance().GetInputDefaultNonTerminal()); - } + const StaticData& SD = StaticData::Instance(); + if (is_syntax(SD.GetSearchAlgorithm())) { + m_defaultLabelSet.insert(SD.GetInputDefaultNonTerminal()); + } UTIL_THROW_IF2(&InputFeature::Instance() == NULL, "Input feature must be specified"); } diff --git a/moses/FF/BleuScoreFeature.cpp b/moses/FF/BleuScoreFeature.cpp index a98964386..626ff717d 100644 --- a/moses/FF/BleuScoreFeature.cpp +++ b/moses/FF/BleuScoreFeature.cpp @@ -27,7 +27,7 @@ int BleuScoreState::Compare(const FFState& o) const if (&o == this) return 0; - if (StaticData::Instance().IsSyntax()) + if (is_syntax(StaticData::Instance().GetSearchAlgorithm())) return 0; const BleuScoreState& other = dynamic_cast(o); diff --git a/moses/FF/HyperParameterAsWeight.cpp b/moses/FF/HyperParameterAsWeight.cpp index a2c068530..1548df158 100644 --- a/moses/FF/HyperParameterAsWeight.cpp +++ b/moses/FF/HyperParameterAsWeight.cpp @@ -19,9 +19,9 @@ HyperParameterAsWeight::HyperParameterAsWeight(const std::string &line) vector weights = staticData.GetWeights(this); - staticData.m_maxHypoStackSize = weights[0] * 1000; - staticData.m_beamWidth = weights[1] * 10; - + staticData.m_options.search.stack_size = weights[0] * 1000; + staticData.m_options.search.beam_width = weights[1] * 10; + } diff --git a/moses/Sentence.cpp b/moses/Sentence.cpp index e4dab8547..b5ee2a1b5 100644 --- a/moses/Sentence.cpp +++ b/moses/Sentence.cpp @@ -43,7 +43,7 @@ Sentence:: Sentence() : Phrase(0) , InputType() { const StaticData& SD 
= StaticData::Instance(); - if (SD.IsSyntax()) + if (is_syntax(SD.GetSearchAlgorithm())) m_defaultLabelSet.insert(SD.GetInputDefaultNonTerminal()); } @@ -153,7 +153,7 @@ aux_interpret_xml(std::string& line, std::vector & xmlWalls, using namespace std; if (SD.GetXmlInputType() != XmlPassThrough) { - int offset = SD.IsSyntax() ? 1 : 0; + int offset = is_syntax(SD.GetSearchAlgorithm()) ? 1 : 0; bool OK = ProcessAndStripXMLTags(line, m_xmlOptions, m_reorderingConstraint, xmlWalls, placeholders, offset, @@ -181,7 +181,7 @@ init(string line, std::vector const& factorOrder) aux_interpret_dlt(line); // some poorly documented cache-based stuff // if sentences is specified as "" - if (SD.IsPassthroughEnabled() || SD.IsPassthroughInNBestEnabled()) { + if (SD.IsPassthroughEnabled() || SD.options().nbest.include_passthrough) { string pthru = PassthroughSGML(line,"passthrough"); this->SetPassthroughInformation(pthru); } diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 1293f5d44..432c4ca98 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -1,3 +1,4 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- // $Id$ // vim:tabstop=2 @@ -62,17 +63,11 @@ StaticData StaticData::s_instance; StaticData::StaticData() : m_sourceStartPosMattersForRecombination(false) , m_requireSortingAfterSourceContext(false) - , m_inputType(SentenceInput) - // , m_onlyDistinctNBest(false) - // , m_needAlignmentInfo(false) , m_lmEnableOOVFeature(false) , m_isAlwaysCreateDirectTranslationOption(false) , m_currentWeightSetting("default") , m_treeStructure(NULL) { - m_xmlBrackets.first="<"; - m_xmlBrackets.second=">"; - // memory pools Phrase::InitializeMemPool(); } @@ -80,15 +75,6 @@ StaticData::StaticData() StaticData::~StaticData() { RemoveAllInColl(m_decodeGraphs); - - /* - const std::vector &producers = FeatureFunction::GetFeatureFunctions(); - for(size_t i=0;iSetParameter(m_inputType, "inputtype", SentenceInput); - - m_parameter->SetParameter(m_continuePartialTranslation, - "continue-partial-translation", false ); - - std::string s_it = "text input"; - if (m_inputType == 1) { - s_it = "confusion net"; - } - if (m_inputType == 2) { - s_it = "word lattice"; - } - if (m_inputType == 3) { - s_it = "tree"; - } - VERBOSE(2,"input type is: "<SetParameter(m_xmlInputType, "xml-input", XmlPassThrough); - - // specify XML tags opening and closing brackets for XML option - params = m_parameter->GetParam("xml-brackets"); - if (params && params->size()) { - std::vector brackets = Tokenize(params->at(0)); - if(brackets.size()!=2) { - cerr << "invalid xml-brackets value, must specify exactly 2 blank-delimited strings for XML tags opening and closing brackets" << endl; - exit(1); - } - m_xmlBrackets.first= brackets[0]; - m_xmlBrackets.second=brackets[1]; - VERBOSE(1,"XML tags opening and closing brackets for XML input are: " - << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl); - } - - m_parameter->SetParameter(m_defaultNonTermOnlyForEmptyRange, - "default-non-term-for-empty-range-only", false ); - -} - bool StaticData ::ini_output_options() @@ -193,7 +133,7 @@ StaticData m_parameter->SetParameter(m_recoverPath, "recover-input-path", false); - if (m_recoverPath && m_inputType == SentenceInput) { + if (m_recoverPath && m_options.input.input_type == SentenceInput) { TRACE_ERR("--recover-input-path should only be used with confusion net or word lattice input!\n"); m_recoverPath = false; } @@ -222,14 +162,10 @@ StaticData m_parameter->SetParameter( m_PrintID, "print-id", false ); 
m_parameter->SetParameter( m_PrintPassthroughInformation, "print-passthrough", false ); - // m_parameter->SetParameter( m_PrintPassthroughInformationInNBest, "print-passthrough-in-n-best", false ); // => now in BookkeepingOptions::init() // word graph params = m_parameter->GetParam("output-word-graph"); - if (params && params->size() == 2) - m_outputWordGraph = true; - else - m_outputWordGraph = false; + m_outputWordGraph = (params && params->size() == 2); // search graph params = m_parameter->GetParam("output-search-graph"); @@ -323,14 +259,6 @@ StaticData return true; } - -bool -StaticData -::ini_nbest_options() -{ - return m_nbest_options.init(*m_parameter); -} - void StaticData ::ini_compact_table_options() @@ -388,18 +316,6 @@ StaticData return true; } -void -StaticData -::ini_cube_pruning_options() -{ - m_parameter->SetParameter(m_cubePruningPopLimit, "cube-pruning-pop-limit", - DEFAULT_CUBE_PRUNING_POP_LIMIT); - m_parameter->SetParameter(m_cubePruningDiversity, "cube-pruning-diversity", - DEFAULT_CUBE_PRUNING_DIVERSITY); - m_parameter->SetParameter(m_cubePruningLazyScoring, "cube-pruning-lazy-scoring", - false); -} - void StaticData ::ini_factor_maps() @@ -449,52 +365,6 @@ StaticData m_parameter->SetParameter(m_isAlwaysCreateDirectTranslationOption, "always-create-direct-transopt", false ); } -void -StaticData -::ini_distortion_options() -{ - // reordering constraints - m_parameter->SetParameter(m_maxDistortion, "distortion-limit", -1); - - m_parameter->SetParameter(m_reorderingConstraint, "monotone-at-punctuation", false ); - - // early distortion cost - m_parameter->SetParameter(m_useEarlyDistortionCost, "early-distortion-cost", false ); - - - -} - -bool -StaticData -::ini_stack_decoding_options() -{ - const PARAM_VEC *params; - // settings for pruning - m_parameter->SetParameter(m_maxHypoStackSize, "stack", DEFAULT_MAX_HYPOSTACK_SIZE); - - m_minHypoStackDiversity = 0; - params = m_parameter->GetParam("stack-diversity"); - if (params && params->size()) { - if (m_maxDistortion > 15) { - std::cerr << "stack diversity > 0 is not allowed for distortion limits larger than 15"; - return false; - } - if (m_inputType == WordLatticeInput) { - std::cerr << "stack diversity > 0 is not allowed for lattice input"; - return false; - } - m_minHypoStackDiversity = Scan(params->at(0)); - } - - m_parameter->SetParameter(m_beamWidth, "beam-threshold", DEFAULT_BEAM_WIDTH); - m_beamWidth = TransformScore(m_beamWidth); - - m_parameter->SetParameter(m_earlyDiscardingThreshold, "early-discarding-threshold", DEFAULT_EARLY_DISCARDING_THRESHOLD); - m_earlyDiscardingThreshold = TransformScore(m_earlyDiscardingThreshold); - return true; -} - void StaticData ::ini_phrase_lookup_options() @@ -582,21 +452,15 @@ bool StaticData::LoadData(Parameter *parameter) m_parameter = parameter; const PARAM_VEC *params; + m_options.init(*parameter); - m_context_parameters.init(*parameter); - - // to cube or not to cube - m_parameter->SetParameter(m_searchAlgorithm, "search-algorithm", Normal); - - if (IsSyntax()) + if (is_syntax(m_options.search.algo)) LoadChartDecodingParameters(); // ORDER HERE MATTERS, SO DON'T CHANGE IT UNLESS YOU KNOW WHAT YOU ARE DOING! // input, output ini_factor_maps(); - ini_input_options(); m_bookkeeping_options.init(*parameter); - m_nbest_options.init(*parameter); // if (!ini_nbest_options()) return false; if (!ini_output_options()) return false; // threading etc. 
@@ -605,11 +469,8 @@ bool StaticData::LoadData(Parameter *parameter) // model loading ini_compact_table_options(); - // search - ini_distortion_options(); - if (!ini_stack_decoding_options()) return false; + ini_phrase_lookup_options(); - ini_cube_pruning_options(); ini_oov_options(); ini_mbr_options(); @@ -625,7 +486,7 @@ bool StaticData::LoadData(Parameter *parameter) || m_outputSearchGraphPB #endif || m_latticeSamplesFilePath.size()) { - m_nbest_options.enabled = true; + m_options.nbest.enabled = true; } // S2T decoder @@ -637,6 +498,24 @@ bool StaticData::LoadData(Parameter *parameter) m_parameter->SetParameter(m_placeHolderFactor, "placeholder-factor", NOT_FOUND); + + // sanity checks on parameters + // should eventually go into AllOptions. + if (m_options.search.stack_diversity) + { + if (m_options.reordering.max_distortion > 15) + { + std::cerr << "stack diversity > 0 is not allowed for distortion limits " + << "larger than 15"; + return false; + } + if (m_options.input.input_type == WordLatticeInput) + { + std::cerr << "stack diversity > 0 is not allowed for lattice input"; + return false; + } + } + // FEATURE FUNCTION INITIALIZATION HAPPENS HERE =============================== initialize_features(); @@ -644,7 +523,7 @@ bool StaticData::LoadData(Parameter *parameter) LoadFeatureFunctions(); LoadDecodeGraphs(); - + // sanity check that there are no weights without an associated FF if (!CheckWeights()) return false; @@ -846,7 +725,7 @@ void StaticData::LoadDecodeGraphsOld(const vector &mappingVector, const UTIL_THROW_IF2(decodeStep == NULL, "Null decode step"); if (m_decodeGraphs.size() < decodeGraphInd + 1) { DecodeGraph *decodeGraph; - if (IsSyntax()) { + if (is_syntax(m_options.search.algo)) { size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN; VERBOSE(1,"max-chart-span: " << maxChartSpans[decodeGraphInd] << endl); decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan); @@ -913,7 +792,7 @@ void StaticData::LoadDecodeGraphsNew(const std::vector &mappingVect UTIL_THROW_IF2(decodeStep == NULL, "Null decode step"); if (m_decodeGraphs.size() < decodeGraphInd + 1) { DecodeGraph *decodeGraph; - if (IsSyntax()) { + if (is_syntax(m_options.search.algo)) { size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN; VERBOSE(1,"max-chart-span: " << maxChartSpans[decodeGraphInd] << endl); decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan); @@ -1259,9 +1138,11 @@ StaticData const PARAM_VEC *params = m_parameter->GetParam("feature-name-overwrite"); if (params && params->size()) { - UTIL_THROW_IF2(params->size() != 1, "Only provide 1 line in the section [feature-name-overwrite]"); + UTIL_THROW_IF2(params->size() != 1, + "Only provide 1 line in the section [feature-name-overwrite]"); vector toks = Tokenize(params->at(0)); - UTIL_THROW_IF2(toks.size() % 2 != 0, "Format of -feature-name-overwrite must be [old-name new-name]*"); + UTIL_THROW_IF2(toks.size() % 2 != 0, + "Format of -feature-name-overwrite must be [old-name new-name]*"); for (size_t i = 0; i < toks.size(); i += 2) { const string &oldName = toks[i]; @@ -1272,8 +1153,8 @@ StaticData // FIXME Does this make sense for F2S? Perhaps it should be changed once // FIXME the pipeline uses RuleTable consistently. 
- if (m_searchAlgorithm == SyntaxS2T || m_searchAlgorithm == SyntaxT2S || - m_searchAlgorithm == SyntaxT2S_SCFG || m_searchAlgorithm == SyntaxF2S) { + if (m_options.search.algo == SyntaxS2T || m_options.search.algo == SyntaxT2S || + m_options.search.algo == SyntaxT2S_SCFG || m_options.search.algo == SyntaxF2S) { // Automatically override PhraseDictionary{Memory,Scope3}. This will // have to change if the FF parameters diverge too much in the future, // but for now it makes switching between the old and new decoders much diff --git a/moses/StaticData.h b/moses/StaticData.h index 8128d0b97..4aebe559d 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -44,6 +44,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "moses/FF/Factory.h" #include "moses/PP/Factory.h" +#include "moses/parameters/AllOptions.h" #include "moses/parameters/ContextParameters.h" #include "moses/parameters/NBestOptions.h" #include "moses/parameters/BookkeepingOptions.h" @@ -70,48 +71,26 @@ class StaticData friend class HyperParameterAsWeight; private: - static StaticData s_instance; + static StaticData s_instance; protected: Parameter *m_parameter; + AllOptions m_options; + BookkeepingOptions m_bookkeeping_options; - ContextParameters m_context_parameters; - - std::vector m_inputFactorOrder, m_outputFactorOrder; + std::vector m_inputFactorOrder, m_outputFactorOrder; mutable ScoreComponentCollection m_allWeights; std::vector m_decodeGraphs; - // Initial = 0 = can be used when creating poss trans - // Other = 1 = used to calculate LM score once all steps have been processed - float - m_beamWidth, - m_earlyDiscardingThreshold, - m_translationOptionThreshold, - m_wordDeletionWeight; - - - // PhraseTrans, Generation & LanguageModelScore has multiple weights. - int m_maxDistortion; - // do it differently from old pharaoh - // -ve = no limit on distortion - // 0 = no disortion (monotone in old pharaoh) - bool m_reorderingConstraint; //! use additional reordering constraints - bool m_useEarlyDistortionCost; - size_t m_maxHypoStackSize; //! hypothesis-stack size that triggers pruning - size_t m_minHypoStackDiversity; //! minimum number of hypothesis in stack for each source word coverage; - NBestOptions m_nbest_options; - BookkeepingOptions m_bookkeeping_options; - // size_t m_nBestSize; - // size_t m_nBestFactor; + float m_translationOptionThreshold; + float m_wordDeletionWeight; size_t m_latticeSamplesSize; size_t m_maxNoTransOptPerCoverage; size_t m_maxNoPartTransOpt; size_t m_maxPhraseLength; - // std::string m_nBestFilePath; std::string m_latticeSamplesFilePath; - // bool m_labeledNBestList,m_nBestIncludesSegmentation; bool m_dropUnknown; //! false = treat unknown words as unknowns, and translate them as themselves; true = drop (ignore) them bool m_markUnknown; //! 
false = treat unknown words as unknowns, and translate them as themselves; true = mark and (ignore) them std::string m_unknownWordPrefix; @@ -127,37 +106,23 @@ protected: bool m_outputHypoScore; bool m_requireSortingAfterSourceContext; - SearchAlgorithm m_searchAlgorithm; - InputTypeEnum m_inputType; - mutable size_t m_verboseLevel; bool m_reportSegmentation; bool m_reportSegmentationEnriched; bool m_reportAllFactors; - // bool m_reportAllFactorsNBest; std::string m_detailedTranslationReportingFilePath; std::string m_detailedTreeFragmentsTranslationReportingFilePath; - - //DIMw std::string m_detailedAllTranslationReportingFilePath; - // bool m_onlyDistinctNBest; bool m_PrintAlignmentInfo; - // bool m_needAlignmentInfo; // => BookkeepingOptions - // bool m_PrintAlignmentInfoNbest; - bool m_PrintID; bool m_PrintPassthroughInformation; - // bool m_PrintPassthroughInformationInNBest; std::string m_alignmentOutputFile; std::string m_factorDelimiter; //! by default, |, but it can be changed - XmlInputType m_xmlInputType; //! method for handling sentence XML input - std::pair m_xmlBrackets; //! strings to use as XML tags' opening and closing brackets. Default are "<" and ">" - bool m_mbr; //! use MBR decoder bool m_useLatticeMBR; //! use MBR decoder bool m_mira; // do mira training @@ -178,7 +143,6 @@ protected: size_t m_timeout_threshold; //! seconds after which time out is activated bool m_isAlwaysCreateDirectTranslationOption; - //! constructor. only the 1 static variable can be created bool m_outputWordGraph; //! whether to output word graph bool m_outputSearchGraph; //! whether to output search graph @@ -192,9 +156,6 @@ protected: bool m_includeLHSInSearchGraph; //! include LHS of rules in search graph std::string m_outputUnknownsFile; //! output unknowns in this file - size_t m_cubePruningPopLimit; - size_t m_cubePruningDiversity; - bool m_cubePruningLazyScoring; size_t m_ruleLimit; // Whether to load compact phrase table and reordering table into memory @@ -221,11 +182,10 @@ protected: bool m_useLegacyPT; bool m_defaultNonTermOnlyForEmptyRange; S2TParsingAlgorithm m_s2tParsingAlgorithm; - // bool m_printNBestTrees; FeatureRegistry m_registry; PhrasePropertyFactory m_phrasePropertyFactory; - + StaticData(); void LoadChartDecodingParameters(); @@ -238,7 +198,6 @@ protected: void NoCache(); - bool m_continuePartialTranslation; std::string m_binPath; // soft NT lookup for chart models @@ -246,26 +205,18 @@ protected: const StatefulFeatureFunction* m_treeStructure; - // number of nonterminal labels -// size_t m_nonTerminalSize; - - void ini_compact_table_options(); void ini_consensus_decoding_options(); - void ini_cube_pruning_options(); - void ini_distortion_options(); void ini_factor_maps(); void ini_input_options(); void ini_lm_options(); void ini_lmbr_options(); void ini_mbr_options(); void ini_mira_options(); - bool ini_nbest_options(); void ini_oov_options(); bool ini_output_options(); bool ini_performance_options(); void ini_phrase_lookup_options(); - bool ini_stack_decoding_options(); void ini_zombie_options(); void initialize_features(); @@ -274,6 +225,7 @@ public: bool IsAlwaysCreateDirectTranslationOption() const { return m_isAlwaysCreateDirectTranslationOption; } + //! destructor ~StaticData(); @@ -288,8 +240,8 @@ public: } /** delete current static instance and replace with another. 
- * Used by gui front end - */ + * Used by gui front end + */ #ifdef WIN32 static void Reset() { s_instance = StaticData(); @@ -309,7 +261,7 @@ public: const ContextParameters& GetContextParameters() const { - return m_context_parameters; + return m_options.context; } const std::vector &GetInputFactorOrder() const { @@ -349,54 +301,53 @@ public: bool IsWordDeletionEnabled() const { return m_wordDeletionEnabled; } + + AllOptions const& options() const { + return m_options; + } + + SearchOptions const& + searchOptions() const { + return m_options.search; + } + size_t GetMaxHypoStackSize() const { - return m_maxHypoStackSize; + return m_options.search.stack_size; } + size_t GetMinHypoStackDiversity() const { - return m_minHypoStackDiversity; - } - size_t GetCubePruningPopLimit() const { - return m_cubePruningPopLimit; - } - size_t GetCubePruningDiversity() const { - return m_cubePruningDiversity; - } - bool GetCubePruningLazyScoring() const { - return m_cubePruningLazyScoring; - } - size_t IsPathRecoveryEnabled() const { - return m_recoverPath; - } - bool IsIDEnabled() const { - return m_PrintID; - } - bool IsPassthroughEnabled() const { - return m_PrintPassthroughInformation; - } - bool IsPassthroughInNBestEnabled() const { - return m_nbest_options.include_passthrough; - // return m_PrintPassthroughInformationInNBest; - } - int GetMaxDistortion() const { - return m_maxDistortion; + return m_options.search.stack_diversity; } + + size_t GetCubePruningPopLimit() const { return m_options.cube.pop_limit; } + size_t GetCubePruningDiversity() const { return m_options.cube.diversity; } + size_t IsPathRecoveryEnabled() const { return m_recoverPath; } + bool GetCubePruningLazyScoring() const { return m_options.cube.lazy_scoring; } + bool IsIDEnabled() const { return m_PrintID; } + bool IsPassthroughEnabled() const { return m_PrintPassthroughInformation; } + + int GetMaxDistortion() const { return m_options.reordering.max_distortion; } + bool UseReorderingConstraint() const { - return m_reorderingConstraint; + return m_options.reordering.monotone_at_punct; } + + bool UseEarlyDistortionCost() const { + return m_options.reordering.use_early_distortion_cost; + } + float GetBeamWidth() const { - return m_beamWidth; + return m_options.search.beam_width; } float GetEarlyDiscardingThreshold() const { - return m_earlyDiscardingThreshold; + return m_options.search.early_discarding_threshold; } bool UseEarlyDiscarding() const { - return m_earlyDiscardingThreshold != -std::numeric_limits::infinity(); - } - bool UseEarlyDistortionCost() const { - return m_useEarlyDistortionCost; + return m_options.search.early_discarding_threshold + != -std::numeric_limits::infinity(); } float GetTranslationOptionThreshold() const { - return m_translationOptionThreshold; + return m_options.search.translationOptionThreshold; } size_t GetVerboseLevel() const { @@ -424,7 +375,7 @@ public: return m_reportAllFactors; } bool GetReportAllFactorsNBest() const { - return m_nbest_options.include_all_factors; + return m_options.nbest.include_all_factors; // return m_reportAllFactorsNBest; } bool IsDetailedTranslationReportingEnabled() const { @@ -445,7 +396,7 @@ public: return m_detailedTreeFragmentsTranslationReportingFilePath; } bool IsLabeledNBestList() const { - return m_nbest_options.include_feature_labels; + return m_options.nbest.include_feature_labels; // return m_labeledNBestList; } @@ -459,17 +410,17 @@ public: // for mert size_t GetNBestSize() const { - return m_nbest_options.nbest_size; + return m_options.nbest.nbest_size; // 
return m_nBestSize; } const std::string &GetNBestFilePath() const { - return m_nbest_options.output_file_path; + return m_options.nbest.output_file_path; // return m_nBestFilePath; } bool IsNBestEnabled() const { - return m_nbest_options.enabled; + return m_options.nbest.enabled; // return (!m_nBestFilePath.empty() || m_mbr || m_useLatticeMBR || m_mira || // m_outputSearchGraph || m_outputSearchGraphSLF || // m_outputSearchGraphHypergraph || m_useConsensusDecoding || @@ -488,7 +439,7 @@ public: } size_t GetNBestFactor() const { - return m_nbest_options.factor; + return m_options.nbest.factor; // return m_nBestFactor; } bool GetOutputWordGraph() const { @@ -497,28 +448,13 @@ public: //! Sets the global score vector weights for a given FeatureFunction. InputTypeEnum GetInputType() const { - return m_inputType; + return m_options.input.input_type; } SearchAlgorithm GetSearchAlgorithm() const { - return m_searchAlgorithm; + return m_options.search.algo; } - // bool IsSyntax() const { - // return m_searchAlgorithm == CYKPlus || - // m_searchAlgorithm == ChartIncremental || - // m_searchAlgorithm == SyntaxS2T || - // m_searchAlgorithm == SyntaxT2S || - // m_searchAlgorithm == SyntaxT2S_SCFG || - // m_searchAlgorithm == SyntaxF2S; - // } - - bool IsSyntax(SearchAlgorithm algo = DefaultSearchAlgorithm) const { - if (algo == DefaultSearchAlgorithm) - algo = m_searchAlgorithm; - return (algo == CYKPlus || algo == ChartIncremental || - algo == SyntaxS2T || algo == SyntaxT2S || - algo == SyntaxF2S || algo == SyntaxT2S_SCFG); - } + bool IsSyntax() const { return is_syntax(m_options.search.algo); } const ScoreComponentCollection& GetAllWeights() const { @@ -547,8 +483,7 @@ public: void SetWeights(const FeatureFunction* sp, const std::vector& weights); bool GetDistinctNBest() const { - return m_nbest_options.only_distinct; - // return m_onlyDistinctNBest; + return m_options.nbest.only_distinct; } const std::string& GetFactorDelimiter() const { return m_factorDelimiter; @@ -651,11 +586,11 @@ public: } XmlInputType GetXmlInputType() const { - return m_xmlInputType; + return m_options.input.xml_policy; } std::pair GetXmlBrackets() const { - return m_xmlBrackets; + return m_options.input.xml_brackets; } bool PrintTranslationOptions() const { @@ -692,7 +627,7 @@ public: } bool ContinuePartialTranslation() const { - return m_continuePartialTranslation; + return m_options.input.continue_partial_translation; } void ReLoadBleuScoreFeatureParameter(float weight); @@ -723,7 +658,7 @@ public: return m_PrintAlignmentInfo; } bool PrintAlignmentInfoInNbest() const { - return m_nbest_options.include_alignment_info; + return m_options.nbest.include_alignment_info; // return m_PrintAlignmentInfoNbest; } WordAlignmentSort GetWordAlignmentSort() const { @@ -731,7 +666,7 @@ public: } bool NBestIncludesSegmentation() const { - return m_nbest_options.include_segmentation; + return m_options.nbest.include_segmentation; // return m_nBestIncludesSegmentation; } @@ -873,8 +808,7 @@ public: } bool PrintNBestTrees() const { - return m_nbest_options.print_trees; - // return m_printNBestTrees; + return m_options.nbest.print_trees; } bool RequireSortingAfterSourceContext() const { diff --git a/moses/TranslationTask.cpp b/moses/TranslationTask.cpp index 30535eb3c..5bbd03769 100644 --- a/moses/TranslationTask.cpp +++ b/moses/TranslationTask.cpp @@ -114,7 +114,7 @@ TranslationTask StaticData const& staticData = StaticData::Instance(); if (algo == DefaultSearchAlgorithm) algo = staticData.GetSearchAlgorithm(); - if 
(!staticData.IsSyntax(algo)) + if (!is_syntax(algo)) manager.reset(new Manager(this->self())); // phrase-based else if (algo == SyntaxF2S || algo == SyntaxT2S) { diff --git a/moses/TypeDef.h b/moses/TypeDef.h index 366a9dc77..d8f881df4 100644 --- a/moses/TypeDef.h +++ b/moses/TypeDef.h @@ -1,3 +1,4 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh @@ -113,12 +114,12 @@ enum DistortionOrientationOptions { } enum InputTypeEnum { - SentenceInput = 0, + SentenceInput = 0, ConfusionNetworkInput = 1, - WordLatticeInput = 2, - TreeInputType = 3, - //,WordLatticeInput2 = 4, - TabbedSentenceInput = 5, + WordLatticeInput = 2, + TreeInputType = 3, + //,WordLatticeInput2 = 4, + TabbedSentenceInput = 5, ForestInputType = 6 }; diff --git a/moses/parameters/AllOptions.cpp b/moses/parameters/AllOptions.cpp new file mode 100644 index 000000000..c1ad39f5c --- /dev/null +++ b/moses/parameters/AllOptions.cpp @@ -0,0 +1,31 @@ +// -*- mode: c++; cc-style: gnu -*- +#include "AllOptions.h" + +namespace Moses +{ + AllOptions:: + AllOptions(Parameter const& param) + { + init(param); + } + + bool + AllOptions:: + init(Parameter const& param) + { + if (!search.init(param)) return false; + if (!cube.init(param)) return false; + if (!nbest.init(param)) return false; + if (!reordering.init(param)) return false; + if (!context.init(param)) return false; + if (!input.init(param)) return false; + return sanity_check(); + } + + bool + AllOptions:: + sanity_check() + { + return true; + } +} diff --git a/moses/parameters/AllOptions.h b/moses/parameters/AllOptions.h new file mode 100644 index 000000000..4c7bd25c5 --- /dev/null +++ b/moses/parameters/AllOptions.h @@ -0,0 +1,31 @@ +// -*- mode: c++; cc-style: gnu -*- +#pragma once +#include +#include "moses/Parameter.h" +#include "SearchOptions.h" +#include "CubePruningOptions.h" +#include "NBestOptions.h" +#include "ReorderingOptions.h" +#include "ContextParameters.h" +#include "InputOptions.h" + +namespace Moses +{ + struct + AllOptions + { + SearchOptions search; + CubePruningOptions cube; + NBestOptions nbest; + ReorderingOptions reordering; + ContextParameters context; + InputOptions input; + // StackOptions stack; + // BeamSearchOptions beam; + bool init(Parameter const& param); + bool sanity_check(); + AllOptions() {} + AllOptions(Parameter const& param); + }; + +} diff --git a/moses/parameters/BeamSearchOptions.h b/moses/parameters/BeamSearchOptions.h new file mode 100644 index 000000000..ce784968b --- /dev/null +++ b/moses/parameters/BeamSearchOptions.h @@ -0,0 +1,15 @@ +// -*- mode: c++; cc-style: gnu -*- +#pragma once +#include +#include "moses/Parameter.h" +namespace Moses +{ + + struct + BeamSearchOptions + { + bool init(Parameter const& param); + BeamSearchOptions(Parameter const& param); + }; + +} diff --git a/moses/parameters/ContextParameters.cpp b/moses/parameters/ContextParameters.cpp index 7ee9323bd..a06f98014 100644 --- a/moses/parameters/ContextParameters.cpp +++ b/moses/parameters/ContextParameters.cpp @@ -9,9 +9,9 @@ ContextParameters() : look_ahead(0), look_back(0) { } -void +bool ContextParameters:: -init(Parameter& params) +init(Parameter const& params) { look_back = look_ahead = 0; params.SetParameter(context_string, "context-string", std::string("")); @@ -19,12 +19,12 @@ init(Parameter& params) params.SetParameter(context_window, "context-window", std::string("")); if 
(context_window == "") - return; + return true; if (context_window.substr(0,3) == "all") { look_back = look_ahead = std::numeric_limits::max(); - return; + return true; } size_t p = context_window.find_first_of("0123456789"); @@ -47,5 +47,6 @@ init(Parameter& params) else UTIL_THROW2("Invalid specification of context window."); } + return true; } } diff --git a/moses/parameters/ContextParameters.h b/moses/parameters/ContextParameters.h index aff7783e4..ce140e422 100644 --- a/moses/parameters/ContextParameters.h +++ b/moses/parameters/ContextParameters.h @@ -12,7 +12,7 @@ class ContextParameters { public: ContextParameters(); - void init(Parameter& params); + bool init(Parameter const& params); size_t look_ahead; // # of words to look ahead for context-sensitive decoding size_t look_back; // # of works to look back for context-sensitive decoding std::string context_string; // fixed context string specified on command line diff --git a/moses/parameters/CubePruningOptions.cpp b/moses/parameters/CubePruningOptions.cpp new file mode 100644 index 000000000..0c2bc9b4c --- /dev/null +++ b/moses/parameters/CubePruningOptions.cpp @@ -0,0 +1,19 @@ +// -*- mode: c++; cc-style: gnu -*- +#include "CubePruningOptions.h" + +namespace Moses +{ + + bool + CubePruningOptions:: + init(Parameter const& param) + { + param.SetParameter(pop_limit, "cube-pruning-pop-limit", + DEFAULT_CUBE_PRUNING_POP_LIMIT); + param.SetParameter(diversity, "cube-pruning-diversity", + DEFAULT_CUBE_PRUNING_DIVERSITY); + param.SetParameter(lazy_scoring, "cube-pruning-lazy-scoring", false); + return true; + } + +} diff --git a/moses/parameters/CubePruningOptions.h b/moses/parameters/CubePruningOptions.h new file mode 100644 index 000000000..a0b8ed461 --- /dev/null +++ b/moses/parameters/CubePruningOptions.h @@ -0,0 +1,20 @@ +// -*- mode: c++; cc-style: gnu -*- +#pragma once +#include +#include "moses/Parameter.h" +namespace Moses +{ + + struct + CubePruningOptions + { + size_t pop_limit; + size_t diversity; + bool lazy_scoring; + + bool init(Parameter const& param); + CubePruningOptions(Parameter const& param); + CubePruningOptions() {}; + }; + +} diff --git a/moses/parameters/InputOptions.cpp b/moses/parameters/InputOptions.cpp new file mode 100644 index 000000000..206be4660 --- /dev/null +++ b/moses/parameters/InputOptions.cpp @@ -0,0 +1,65 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "InputOptions.h" +#include +#include +#include "moses/StaticData.h" + +namespace Moses { + + InputOptions:: + InputOptions() + { + xml_brackets.first = "<"; + xml_brackets.second = ">"; + input_type = SentenceInput; + } + + bool + InputOptions:: + init(Parameter const& param) + { + param.SetParameter(input_type, "inputtype", SentenceInput); + if (input_type == SentenceInput) + { VERBOSE(2, "input type is: text input"); } + else if (input_type == ConfusionNetworkInput) + { VERBOSE(2, "input type is: confusion net"); } + else if (input_type == WordLatticeInput) + { VERBOSE(2, "input type is: word lattice"); } + else if (input_type == TreeInputType) + { VERBOSE(2, "input type is: tree"); } + else if (input_type == TabbedSentenceInput) + { VERBOSE(2, "input type is: tabbed sentence"); } + else if (input_type == ForestInputType) + { VERBOSE(2, "input type is: forest"); } + + param.SetParameter(continue_partial_translation, + "continue-partial-translation", false); + param.SetParameter(default_non_term_only_for_empty_range, + "default-non-term-for-empty-range-only", false); + + param.SetParameter(xml_policy, "xml-input", 
XmlPassThrough); + + // specify XML tags opening and closing brackets for XML option + // Do we really want this to be configurable???? UG + const PARAM_VEC *pspec; + pspec = param.GetParam("xml-brackets"); + if (pspec && pspec->size()) + { + std::vector brackets = Tokenize(pspec->at(0)); + if(brackets.size()!=2) + { + std::cerr << "invalid xml-brackets value, " + << "must specify exactly 2 blank-delimited strings " + << "for XML tags opening and closing brackets" << std::endl; + exit(1); + } + xml_brackets.first= brackets[0]; + xml_brackets.second=brackets[1]; + VERBOSE(1,"XML tags opening and closing brackets for XML input are: " + << xml_brackets.first << " and " + << xml_brackets.second << std::endl); + } + return true; + } + +} diff --git a/moses/parameters/InputOptions.h b/moses/parameters/InputOptions.h new file mode 100644 index 000000000..c5379bfae --- /dev/null +++ b/moses/parameters/InputOptions.h @@ -0,0 +1,25 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "moses/Parameter.h" +#include +namespace Moses +{ + struct + InputOptions + { + bool continue_partial_translation; + bool default_non_term_only_for_empty_range; // whatever that means + InputTypeEnum input_type; + XmlInputType xml_policy; // pass through, ignore, exclusive, inclusive + + std::pair xml_brackets; + // strings to use as XML tags' opening and closing brackets. + // Default are "<" and ">" + + bool init(Parameter const& param); + InputOptions(); + }; + +} + diff --git a/moses/parameters/NBestOptions.h b/moses/parameters/NBestOptions.h index bc125c2b6..3a9364f5d 100644 --- a/moses/parameters/NBestOptions.h +++ b/moses/parameters/NBestOptions.h @@ -1,10 +1,11 @@ -// -*- mode: c++; cc-style: gnu -*- +// -*- mode: c++; cc-style: gnu; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once #include - namespace Moses { -struct NBestOptions { +struct NBestOptions +{ size_t nbest_size; size_t factor; bool enabled; diff --git a/moses/parameters/ReorderingOptions.cpp b/moses/parameters/ReorderingOptions.cpp new file mode 100644 index 000000000..016c4ab0d --- /dev/null +++ b/moses/parameters/ReorderingOptions.cpp @@ -0,0 +1,21 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "ReorderingOptions.h" + +namespace Moses { + + ReorderingOptions:: + ReorderingOptions(Parameter const& param) + { + init(param); + } + + bool + ReorderingOptions:: + init(Parameter const& param) + { + param.SetParameter(max_distortion, "distortion-limit", -1); + param.SetParameter(monotone_at_punct, "monotone-at-punctuation", false); + param.SetParameter(use_early_distortion_cost, "early-distortion-cost", false); + return true; + } +} diff --git a/moses/parameters/ReorderingOptions.h b/moses/parameters/ReorderingOptions.h new file mode 100644 index 000000000..e18c7deab --- /dev/null +++ b/moses/parameters/ReorderingOptions.h @@ -0,0 +1,20 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "moses/Parameter.h" +namespace Moses +{ + + struct + ReorderingOptions + { + int max_distortion; + bool monotone_at_punct; + bool use_early_distortion_cost; + bool init(Parameter const& param); + ReorderingOptions(Parameter const& param); + ReorderingOptions() {} + }; + +} + diff --git a/moses/parameters/SearchOptions.cpp b/moses/parameters/SearchOptions.cpp new file mode 100644 index 000000000..ee77f9479 --- /dev/null +++ b/moses/parameters/SearchOptions.cpp @@ -0,0 +1,39 @@ +// -*- mode: c++; cc-style: gnu -*- +#include "SearchOptions.h" + +namespace 
Moses +{ + SearchOptions:: + SearchOptions(Parameter const& param) + : stack_diversity(0) + { + init(param); + } + + bool + SearchOptions:: + init(Parameter const& param) + { + param.SetParameter(algo, "search-algorithm", Normal); + param.SetParameter(stack_size, "stack", DEFAULT_MAX_HYPOSTACK_SIZE); + param.SetParameter(stack_diversity, "stack-diversity", size_t(0)); + param.SetParameter(beam_width, "beam-threshold", DEFAULT_BEAM_WIDTH); + param.SetParameter(early_discarding_threshold, "early-discarding-threshold", + DEFAULT_EARLY_DISCARDING_THRESHOLD); + + // transformation to log of a few scores + beam_width = TransformScore(beam_width); + early_discarding_threshold = TransformScore(early_discarding_threshold); + return true; + } + + bool + is_syntax(SearchAlgorithm algo) + { + return (algo == CYKPlus || algo == ChartIncremental || + algo == SyntaxS2T || algo == SyntaxT2S || + algo == SyntaxF2S || algo == SyntaxT2S_SCFG); + } + + +} diff --git a/moses/parameters/SearchOptions.h b/moses/parameters/SearchOptions.h new file mode 100644 index 000000000..deff2cc3b --- /dev/null +++ b/moses/parameters/SearchOptions.h @@ -0,0 +1,34 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "moses/Parameter.h" +namespace Moses +{ + + bool is_syntax(SearchAlgorithm algo); + + struct + SearchOptions + { + SearchAlgorithm algo; + + // stack options + size_t stack_size; // maxHypoStackSize; + size_t stack_diversity; // minHypoStackDiversity; + + // beam search + float beam_width; + + // reordering options + // bool reorderingConstraint; //! use additional reordering constraints + // bool useEarlyDistortionCost; + + float early_discarding_threshold; + float translationOptionThreshold; + + bool init(Parameter const& param); + SearchOptions(Parameter const& param); + SearchOptions() {} + }; + +} From 6527c238754e561575e8408bf69598ed2c495547 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Wed, 5 Aug 2015 14:17:58 +0100 Subject: [PATCH 217/286] Updated emacs code formatting setting. 
--- moses/StaticData.h | 2 +- moses/parameters/AllOptions.cpp | 2 +- moses/parameters/AllOptions.h | 2 +- moses/parameters/BeamSearchOptions.h | 2 +- moses/parameters/BookkeepingOptions.h | 2 +- moses/parameters/ContextParameters.h | 2 +- moses/parameters/CubePruningOptions.h | 2 +- moses/parameters/NBestOptions.h | 2 +- moses/parameters/SearchOptions.cpp | 2 +- moses/parameters/ServerOptions.h | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/moses/StaticData.h b/moses/StaticData.h index 4aebe559d..1dea8a6d7 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- // $Id$ /*********************************************************************** diff --git a/moses/parameters/AllOptions.cpp b/moses/parameters/AllOptions.cpp index c1ad39f5c..d194ac08b 100644 --- a/moses/parameters/AllOptions.cpp +++ b/moses/parameters/AllOptions.cpp @@ -1,4 +1,4 @@ -// -*- mode: c++; cc-style: gnu -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #include "AllOptions.h" namespace Moses diff --git a/moses/parameters/AllOptions.h b/moses/parameters/AllOptions.h index 4c7bd25c5..54055e77b 100644 --- a/moses/parameters/AllOptions.h +++ b/moses/parameters/AllOptions.h @@ -1,4 +1,4 @@ -// -*- mode: c++; cc-style: gnu -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #pragma once #include #include "moses/Parameter.h" diff --git a/moses/parameters/BeamSearchOptions.h b/moses/parameters/BeamSearchOptions.h index ce784968b..85a8d5a64 100644 --- a/moses/parameters/BeamSearchOptions.h +++ b/moses/parameters/BeamSearchOptions.h @@ -1,4 +1,4 @@ -// -*- mode: c++; cc-style: gnu -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #pragma once #include #include "moses/Parameter.h" diff --git a/moses/parameters/BookkeepingOptions.h b/moses/parameters/BookkeepingOptions.h index 08bc1d59d..20cf5d4bd 100644 --- a/moses/parameters/BookkeepingOptions.h +++ b/moses/parameters/BookkeepingOptions.h @@ -1,4 +1,4 @@ -// -*- mode: c++; cc-style: gnu -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #include "moses/Parameter.h" // #include diff --git a/moses/parameters/ContextParameters.h b/moses/parameters/ContextParameters.h index ce140e422..280d3795e 100644 --- a/moses/parameters/ContextParameters.h +++ b/moses/parameters/ContextParameters.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #pragma once #include #include "moses/Parameter.h" diff --git a/moses/parameters/CubePruningOptions.h b/moses/parameters/CubePruningOptions.h index a0b8ed461..29959f4fe 100644 --- a/moses/parameters/CubePruningOptions.h +++ b/moses/parameters/CubePruningOptions.h @@ -1,4 +1,4 @@ -// -*- mode: c++; cc-style: gnu -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #pragma once #include #include "moses/Parameter.h" diff --git a/moses/parameters/NBestOptions.h b/moses/parameters/NBestOptions.h index 3a9364f5d..894c35d1c 100644 --- a/moses/parameters/NBestOptions.h +++ b/moses/parameters/NBestOptions.h @@ -1,4 +1,4 @@ -// -*- mode: c++; cc-style: gnu; indent-tabs-mode: nil; tab-width: 2 -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #pragma once #include namespace Moses diff --git a/moses/parameters/SearchOptions.cpp b/moses/parameters/SearchOptions.cpp index ee77f9479..c6d062913 100644 --- a/moses/parameters/SearchOptions.cpp +++ b/moses/parameters/SearchOptions.cpp @@ -1,4 +1,4 @@ -// -*- mode: c++; cc-style: gnu -*- +// -*- 
mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #include "SearchOptions.h" namespace Moses diff --git a/moses/parameters/ServerOptions.h b/moses/parameters/ServerOptions.h index aa5f47018..b66de7de9 100644 --- a/moses/parameters/ServerOptions.h +++ b/moses/parameters/ServerOptions.h @@ -1,4 +1,4 @@ -// -*- mode: c++; cc-style: gnu -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #pragma once #include #include "moses/Parameter.h" From a201a981191399edab749c4880b1ee3bea8b4197 Mon Sep 17 00:00:00 2001 From: Ales Tamchyna Date: Wed, 5 Aug 2015 16:33:52 +0200 Subject: [PATCH 218/286] fix compilation with VW (TODO I really should write a regression test for VW) --- moses/FF/VW/VW.h | 2 +- moses/FF/VW/VWFeatureBase.h | 1 + moses/FF/VW/VWFeatureSourceExternalFeatures.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/moses/FF/VW/VW.h b/moses/FF/VW/VW.h index 88ca9ccfa..ba5406df5 100644 --- a/moses/FF/VW/VW.h +++ b/moses/FF/VW/VW.h @@ -307,7 +307,7 @@ public: } virtual void InitializeForInput(ttasksptr const& ttask) { - InputType const& source = ttask->GetSource(); + InputType const& source = *(ttask->GetSource().get()); // tabbed sentence is assumed only in training if (! m_train) return; diff --git a/moses/FF/VW/VWFeatureBase.h b/moses/FF/VW/VWFeatureBase.h index d0f5ebebe..29b689af8 100644 --- a/moses/FF/VW/VWFeatureBase.h +++ b/moses/FF/VW/VWFeatureBase.h @@ -5,6 +5,7 @@ #include "vw/Classifier.h" #include "moses/TypeDef.h" +#include "moses/TranslationTask.h" #include "moses/Util.h" #include "moses/FF/StatelessFeatureFunction.h" diff --git a/moses/FF/VW/VWFeatureSourceExternalFeatures.h b/moses/FF/VW/VWFeatureSourceExternalFeatures.h index 4032714be..4596f7106 100644 --- a/moses/FF/VW/VWFeatureSourceExternalFeatures.h +++ b/moses/FF/VW/VWFeatureSourceExternalFeatures.h @@ -40,7 +40,7 @@ public: } virtual void InitializeForInput(ttasksptr const& ttask) { - InputType const& source = ttask->GetSource(); + InputType const& source = *(ttask->GetSource().get()); UTIL_THROW_IF2(source.GetType() != TabbedSentenceInput, "This feature function requires the TabbedSentence input type"); From 626a53e7dfe463f18b0e685cad0989283dc3748e Mon Sep 17 00:00:00 2001 From: "U-DESKTOP-ONHNTIV\\hieuh" Date: Wed, 5 Aug 2015 18:15:09 +0100 Subject: [PATCH 219/286] compile error on 4.9.3 on cygwin --- moses/FF/Model1Feature.cpp | 8 ++++---- phrase-extract/consolidate-main.cpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/moses/FF/Model1Feature.cpp b/moses/FF/Model1Feature.cpp index 3bde70cfc..7ea7cd651 100644 --- a/moses/FF/Model1Feature.cpp +++ b/moses/FF/Model1Feature.cpp @@ -75,7 +75,7 @@ void Model1Vocabulary::Load(const std::string& fileName) ++i; std::vector tokens = Tokenize(line); UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens."); - unsigned id = std::atoll( tokens[0].c_str() ); + unsigned id = atoll( tokens[0].c_str() ); if (! ( (id == 1) && (tokens[1] == "UNK") )) { const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is know and filter the model on loading? 
bool stored = Store(factor, id); @@ -86,7 +86,7 @@ void Model1Vocabulary::Load(const std::string& fileName) ++i; std::vector tokens = Tokenize(line); UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens."); - unsigned id = std::atoll( tokens[0].c_str() ); + unsigned id = atoll( tokens[0].c_str() ); const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is know and filter the model on loading? bool stored = Store(factor, id); UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry."); @@ -105,8 +105,8 @@ void Model1LexicalTable::Load(const std::string &fileName, const Model1Vocabular ++i; std::vector tokens = Tokenize(line); UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens."); - unsigned idS = std::atoll( tokens[0].c_str() ); - unsigned idT = std::atoll( tokens[1].c_str() ); + unsigned idS = atoll( tokens[0].c_str() ); + unsigned idT = atoll( tokens[1].c_str() ); const Factor* wordS = vcbS.GetWord(idS); const Factor* wordT = vcbT.GetWord(idT); float prob = std::atof( tokens[2].c_str() ); diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index c9496f988..0f276144b 100644 --- a/phrase-extract/consolidate-main.cpp +++ b/phrase-extract/consolidate-main.cpp @@ -165,7 +165,7 @@ int main(int argc, char* argv[]) } pos = single_setting.find(":"); UTIL_THROW_IF2(pos == std::string::npos, "faulty MinScore setting '" << single_setting << "' in '" << argv[i] << "'"); - unsigned int field = std::atoll( single_setting.substr(0,pos).c_str() ); + unsigned int field = atoll( single_setting.substr(0,pos).c_str() ); float threshold = std::atof( single_setting.substr(pos+1).c_str() ); if (field == 0) { minScore0 = threshold; From 3c682fa8b05af6bff1a09f420141795875cf9685 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 6 Aug 2015 13:42:03 +0400 Subject: [PATCH 220/286] =?UTF-8?q?performance=20issue=20using=20moses=20s?= =?UTF-8?q?erver.=20Related=20to=20email=20thread=20https://www.mail-archi?= =?UTF-8?q?ve.com/moses-support@mit.edu/msg12775.html=20Fix=20by=20Martin?= =?UTF-8?q?=20Baumg=C3=A4rtner?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- contrib/server/mosesserver.cpp | 54 +++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp index 8af1d82a3..f6b2deaa1 100644 --- a/contrib/server/mosesserver.cpp +++ b/contrib/server/mosesserver.cpp @@ -678,6 +678,14 @@ int main(int argc, char** argv) bool isSerial = false; size_t numThreads = 10; //for translation tasks + //Abyss server configuration: initial values reflect hard-coded default + //-> http://xmlrpc-c.sourceforge.net/doc/libxmlrpc_server_abyss.html#max_conn + size_t maxConn = 15; + size_t maxConnBacklog = 15; + size_t keepaliveTimeout = 15; + size_t keepaliveMaxConn = 30; + size_t timeout = 15; + for (int i = 0; i < argc; ++i) { if (!strcmp(argv[i],"--server-port")) { ++i; @@ -695,6 +703,46 @@ int main(int argc, char** argv) } else { logfile = argv[i]; } + } else if (!strcmp(argv[i],"--server-maxconn")) { + ++i; + if (i >= argc) { + cerr << "Error: Missing argument to --server-maxconn" << endl; + exit(1); + } else { + maxConn = atoi(argv[i]); + } + } else if (!strcmp(argv[i],"--server-maxconn-backlog")) { + ++i; + if (i >= argc) { + cerr << "Error: Missing argument to 
--server-maxconn-backlog" << endl; + exit(1); + } else { + maxConnBacklog = atoi(argv[i]); + } + } else if (!strcmp(argv[i],"--server-keepalive-timeout")) { + ++i; + if (i >= argc) { + cerr << "Error: Missing argument to --server-keepalive-timeout" << endl; + exit(1); + } else { + keepaliveTimeout = atoi(argv[i]); + } + } else if (!strcmp(argv[i],"--server-keepalive-maxconn")) { + ++i; + if (i >= argc) { + cerr << "Error: Missing argument to --server-keepalive-maxconn" << endl; + exit(1); + } else { + keepaliveMaxConn = atoi(argv[i]); + } + } else if (!strcmp(argv[i],"--server-timeout")) { + ++i; + if (i >= argc) { + cerr << "Error: Missing argument to --server-timeout" << endl; + exit(1); + } else { + timeout = atoi(argv[i]); + } } else if (!strcmp(argv[i], "--threads")) { ++i; if (i>=argc) { @@ -755,7 +803,11 @@ int main(int argc, char** argv) .portNumber(port) // TCP port on which to listen .logFileName(logfile) .allowOrigin("*") - .maxConn((unsigned int)numThreads) + .maxConn((unsigned int)maxConn) + .maxConnBacklog((unsigned int)maxConnBacklog) + .keepaliveTimeout((unsigned int)keepaliveTimeout) + .keepaliveMaxConn((unsigned int)keepaliveMaxConn) + .timeout((unsigned int)timeout) ); XVERBOSE(1,"Listening on port " << port << endl); From 776da79442b723da7dcded1b5de79dff05788b65 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 6 Aug 2015 17:42:44 +0400 Subject: [PATCH 221/286] delete old unused code --- moses/SearchNormalBatch.cpp | 212 ------------------------------------ moses/SearchNormalBatch.h | 44 -------- 2 files changed, 256 deletions(-) delete mode 100644 moses/SearchNormalBatch.cpp delete mode 100644 moses/SearchNormalBatch.h diff --git a/moses/SearchNormalBatch.cpp b/moses/SearchNormalBatch.cpp deleted file mode 100644 index 612a5deea..000000000 --- a/moses/SearchNormalBatch.cpp +++ /dev/null @@ -1,212 +0,0 @@ -#include "SearchNormalBatch.h" -#include "LM/Base.h" -#include "Manager.h" -#include "Hypothesis.h" -#include "util/exception.hh" - -//#include - -using namespace std; - -namespace Moses -{ -SearchNormalBatch::SearchNormalBatch(Manager& manager, const InputType &source, const TranslationOptionCollection &transOptColl) - :SearchNormal(manager, source, transOptColl) - ,m_batch_size(10000) -{ - m_max_stack_size = StaticData::Instance().GetMaxHypoStackSize(); - - // Split the feature functions into sets of stateless, stateful - // distributed lm, and stateful non-distributed. - const vector& ffs = - StatefulFeatureFunction::GetStatefulFeatureFunctions(); - for (unsigned i = 0; i < ffs.size(); ++i) { - if (ffs[i]->GetScoreProducerDescription() == "DLM_5gram") { // TODO WFT - m_dlm_ffs[i] = const_cast(static_cast(ffs[i])); - m_dlm_ffs[i]->SetFFStateIdx(i); - } else { - m_stateful_ffs[i] = const_cast(ffs[i]); - } - } - m_stateless_ffs = StatelessFeatureFunction::GetStatelessFeatureFunctions(); - -} - -SearchNormalBatch::~SearchNormalBatch() -{ -} - -/** - * Main decoder loop that translates a sentence by expanding - * hypotheses stack by stack, until the end of the sentence. 
- */ -void SearchNormalBatch::Decode() -{ - const StaticData &staticData = StaticData::Instance(); - SentenceStats &stats = m_manager.GetSentenceStats(); - - // initial seed hypothesis: nothing translated, no words produced - Hypothesis *hypo = Hypothesis::Create(m_manager,m_source, m_initialTransOpt); - m_hypoStackColl[0]->AddPrune(hypo); - - // go through each stack - std::vector < HypothesisStack* >::iterator iterStack; - for (iterStack = m_hypoStackColl.begin() ; iterStack != m_hypoStackColl.end() ; ++iterStack) { - // check if decoding ran out of time - double _elapsed_time = GetUserTime(); - if (_elapsed_time > staticData.GetTimeoutThreshold()) { - VERBOSE(1,"Decoding is out of time (" << _elapsed_time << "," << staticData.GetTimeoutThreshold() << ")" << std::endl); - interrupted_flag = 1; - return; - } - HypothesisStackNormal &sourceHypoColl = *static_cast(*iterStack); - - // the stack is pruned before processing (lazy pruning): - VERBOSE(3,"processing hypothesis from next stack"); - IFVERBOSE(2) { - stats.StartTimeStack(); - } - sourceHypoColl.PruneToSize(staticData.GetMaxHypoStackSize()); - VERBOSE(3,std::endl); - sourceHypoColl.CleanupArcList(); - IFVERBOSE(2) { - stats.StopTimeStack(); - } - - // go through each hypothesis on the stack and try to expand it - HypothesisStackNormal::const_iterator iterHypo; - for (iterHypo = sourceHypoColl.begin() ; iterHypo != sourceHypoColl.end() ; ++iterHypo) { - Hypothesis &hypothesis = **iterHypo; - ProcessOneHypothesis(hypothesis); // expand the hypothesis - } - EvalAndMergePartialHypos(); - - // some logging - IFVERBOSE(2) { - OutputHypoStackSize(); - } - - // this stack is fully expanded; - actual_hypoStack = &sourceHypoColl; - } - - EvalAndMergePartialHypos(); -} - -/** - * Expand one hypothesis with a translation option. - * this involves initial creation, scoring and adding it to the proper stack - * \param hypothesis hypothesis to be expanded upon - * \param transOpt translation option (phrase translation) - * that is applied to create the new hypothesis - * \param expectedScore base score for early discarding - * (base hypothesis score plus future score estimation) - */ - -void -SearchNormalBatch:: -ExpandHypothesis(const Hypothesis &hypothesis, - const TranslationOption &transOpt, float expectedScore) -{ - // Check if the number of partial hypotheses exceeds the batch size. - if (m_partial_hypos.size() >= m_batch_size) { - EvalAndMergePartialHypos(); - } - - const StaticData &staticData = StaticData::Instance(); - SentenceStats &stats = m_manager.GetSentenceStats(); - - Hypothesis *newHypo; - if (! staticData.UseEarlyDiscarding()) { - // simple build, no questions asked - IFVERBOSE(2) { - stats.StartTimeBuildHyp(); - } - newHypo = hypothesis.CreateNext(transOpt); - IFVERBOSE(2) { - stats.StopTimeBuildHyp(); - } - if (newHypo==NULL) return; - //newHypo->Evaluate(m_transOptColl.GetFutureScore()); - - // Issue DLM requests for new hypothesis and put into the list of - // partial hypotheses. - std::map::iterator dlm_iter; - for (dlm_iter = m_dlm_ffs.begin(); - dlm_iter != m_dlm_ffs.end(); - ++dlm_iter) { - const FFState* input_state = newHypo->GetPrevHypo() ? 
newHypo->GetPrevHypo()->GetFFState((*dlm_iter).first) : NULL; - (*dlm_iter).second->IssueRequestsFor(*newHypo, input_state); - } - m_partial_hypos.push_back(newHypo); - } else { - UTIL_THROW2("can't use early discarding with batch decoding!"); - } -} - -void SearchNormalBatch::EvalAndMergePartialHypos() -{ - std::vector::iterator partial_hypo_iter; - for (partial_hypo_iter = m_partial_hypos.begin(); - partial_hypo_iter != m_partial_hypos.end(); - ++partial_hypo_iter) { - Hypothesis* hypo = *partial_hypo_iter; - - // Evaluate with other ffs. - std::map::iterator sfff_iter; - for (sfff_iter = m_stateful_ffs.begin(); - sfff_iter != m_stateful_ffs.end(); - ++sfff_iter) { - const StatefulFeatureFunction &ff = *(sfff_iter->second); - int state_idx = sfff_iter->first; - hypo->EvaluateWhenApplied(ff, state_idx); - } - std::vector::iterator slff_iter; - for (slff_iter = m_stateless_ffs.begin(); - slff_iter != m_stateless_ffs.end(); - ++slff_iter) { - hypo->EvaluateWhenApplied(**slff_iter); - } - } - - // Wait for all requests from the distributed LM to come back. - std::map::iterator dlm_iter; - for (dlm_iter = m_dlm_ffs.begin(); - dlm_iter != m_dlm_ffs.end(); - ++dlm_iter) { - (*dlm_iter).second->sync(); - } - - // Incorporate the DLM scores into all hypotheses and put into their - // stacks. - for (partial_hypo_iter = m_partial_hypos.begin(); - partial_hypo_iter != m_partial_hypos.end(); - ++partial_hypo_iter) { - Hypothesis* hypo = *partial_hypo_iter; - - // Calculate DLM scores. - std::map::iterator dlm_iter; - for (dlm_iter = m_dlm_ffs.begin(); - dlm_iter != m_dlm_ffs.end(); - ++dlm_iter) { - LanguageModel &lm = *(dlm_iter->second); - hypo->EvaluateWhenApplied(lm, (*dlm_iter).first); - } - - // Put completed hypothesis onto its stack. - size_t wordsTranslated = hypo->GetWordsBitmap().GetNumWordsCovered(); - m_hypoStackColl[wordsTranslated]->AddPrune(hypo); - } - m_partial_hypos.clear(); - - std::vector < HypothesisStack* >::iterator stack_iter; - HypothesisStackNormal* stack; - for (stack_iter = m_hypoStackColl.begin(); - stack_iter != m_hypoStackColl.end(); - ++stack_iter) { - stack = static_cast(*stack_iter); - stack->PruneToSize(m_max_stack_size); - } -} - -} diff --git a/moses/SearchNormalBatch.h b/moses/SearchNormalBatch.h deleted file mode 100644 index 1e8acc579..000000000 --- a/moses/SearchNormalBatch.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef moses_SearchNormalBatch_h -#define moses_SearchNormalBatch_h - -#include "SearchNormal.h" -#include "SentenceStats.h" - -namespace Moses -{ - -class Manager; -class InputType; -class TranslationOptionCollection; - -/** Implements the phrase-based stack decoding algorithm (no cube pruning) with a twist... - * Language model requests are batched together, duplicate requests are removed, and requests are sent together. - * Useful for distributed LM where network latency is an issue. - */ -class SearchNormalBatch: public SearchNormal -{ -protected: - - // Added for asynclm decoding. 
- std::vector m_stateless_ffs; - std::map m_dlm_ffs; - std::map m_stateful_ffs; - std::vector m_partial_hypos; - uint32_t m_batch_size; - int m_max_stack_size; - - // functions for creating hypotheses - void ExpandHypothesis(const Hypothesis &hypothesis,const TranslationOption &transOpt, float expectedScore); - void EvalAndMergePartialHypos(); - -public: - SearchNormalBatch(Manager& manager, const InputType &source, const TranslationOptionCollection &transOptColl); - ~SearchNormalBatch(); - - void Decode(); - -}; - -} - -#endif From 83ef138961a9e4711ba9efecaa9f81eb701fcb3f Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 6 Aug 2015 18:16:02 +0400 Subject: [PATCH 222/286] delete old unused code --- moses/Search.cpp | 3 --- moses/TypeDef.h | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/moses/Search.cpp b/moses/Search.cpp index 8f947f622..d9b6d5af8 100644 --- a/moses/Search.cpp +++ b/moses/Search.cpp @@ -1,7 +1,6 @@ #include "Manager.h" #include "SearchCubePruning.h" #include "SearchNormal.h" -#include "SearchNormalBatch.h" #include "util/exception.hh" namespace Moses @@ -24,8 +23,6 @@ Search *Search::CreateSearch(Manager& manager, const InputType &source, return new SearchNormal(manager,source, transOptColl); case CubePruning: return new SearchCubePruning(manager, source, transOptColl); - case NormalBatch: - return new SearchNormalBatch(manager, source, transOptColl); default: UTIL_THROW2("ERROR: search. Aborting\n"); return NULL; diff --git a/moses/TypeDef.h b/moses/TypeDef.h index 366a9dc77..21eb09497 100644 --- a/moses/TypeDef.h +++ b/moses/TypeDef.h @@ -143,7 +143,7 @@ enum SearchAlgorithm { CubePruning = 1, //,CubeGrowing = 2 CYKPlus = 3, - NormalBatch = 4, + //NormalBatch = 4, ChartIncremental = 5, SyntaxS2T = 6, SyntaxT2S = 7, From 524109e2cabe5bd3ac693d3187e439885484a8c7 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 6 Aug 2015 00:51:02 +0100 Subject: [PATCH 223/286] Reorganisation of options. The purpose of this effort is to have options local to the individual translation task, so that they can be changed in the running system in a multi-threaded system. 
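The reorganisation is easiest to follow from the call sites in the diffs below, which now read e.g. options().search.stack_size, options().cube.pop_limit, options().nbest.nbest_size or options().reordering.max_distortion instead of the corresponding StaticData getters. As a rough orientation only (the authoritative definition is the new moses/parameters/AllOptions.h added by this patch; the member names below are simply the ones visible in the diffs), the aggregate presumably looks something like:

    // illustrative sketch, not the verbatim header
    struct AllOptions {
      SearchOptions      search;      // algo, stack_size, stack_diversity, beam_width, timeout, ...
      CubePruningOptions cube;        // pop_limit, diversity, lazy_scoring
      NBestOptions       nbest;       // enabled, nbest_size, factor, only_distinct, output_file_path, ...
      ReorderingOptions  reordering;  // max_distortion, use_early_distortion_cost
      ContextParameters  context;     // look_ahead, look_back
      void init(Parameter const& param);  // takes over much of the ini_*() code in StaticData::LoadData
    };

Managers reach this bundle through BaseManager::options(), i.e. via their translation task, which is what allows per-request option changes in a multi-threaded server.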
--- moses/BaseManager.cpp | 8 + moses/BaseManager.h | 3 +- moses/ChartCell.cpp | 4 +- moses/ChartHypothesis.cpp | 7 +- moses/ChartHypothesisCollection.cpp | 4 +- moses/ChartManager.cpp | 20 +-- moses/ChartParser.cpp | 3 +- moses/FF/ConstrainedDecoding.cpp | 6 +- moses/FF/HyperParameterAsWeight.cpp | 4 +- moses/Hypothesis.cpp | 4 +- moses/HypothesisStackCubePruning.cpp | 2 +- moses/HypothesisStackNormal.cpp | 2 +- moses/IOWrapper.cpp | 8 +- moses/Incremental.cpp | 13 +- moses/Manager.cpp | 25 +-- moses/RuleCube.cpp | 4 +- moses/RuleCubeQueue.cpp | 2 +- moses/ScoreComponentCollection.cpp | 2 +- moses/Search.cpp | 27 ++- moses/Search.h | 6 + moses/SearchCubePruning.cpp | 21 ++- moses/SearchNormal.cpp | 115 +++++++------ moses/SearchNormal.h | 31 +++- moses/SearchNormalBatch.cpp | 52 ++---- moses/Sentence.cpp | 2 +- moses/StaticData.cpp | 152 ++++++----------- moses/StaticData.h | 208 +++++++++--------------- moses/Syntax/F2S/Manager-inl.h | 6 +- moses/Syntax/Manager.cpp | 8 +- moses/Syntax/RuleTableFF.cpp | 8 +- moses/Syntax/S2T/Manager-inl.h | 6 +- moses/Syntax/T2S/Manager-inl.h | 6 +- moses/TranslationTask.cpp | 13 +- moses/TranslationTask.h | 3 +- moses/parameters/AllOptions.cpp | 31 ++++ moses/parameters/AllOptions.h | 31 ++++ moses/parameters/BeamSearchOptions.h | 15 ++ moses/parameters/ContextParameters.cpp | 9 +- moses/parameters/ContextParameters.h | 2 +- moses/parameters/CubePruningOptions.cpp | 19 +++ moses/parameters/CubePruningOptions.h | 20 +++ moses/parameters/InputOptions.cpp | 65 ++++++++ moses/parameters/InputOptions.h | 25 +++ moses/parameters/NBestOptions.h | 3 +- moses/parameters/ReorderingOptions.cpp | 21 +++ moses/parameters/ReorderingOptions.h | 20 +++ moses/parameters/SearchOptions.cpp | 50 ++++++ moses/parameters/SearchOptions.h | 44 +++++ regression-testing/run-single-test.perl | 2 +- 49 files changed, 709 insertions(+), 433 deletions(-) create mode 100644 moses/parameters/AllOptions.cpp create mode 100644 moses/parameters/AllOptions.h create mode 100644 moses/parameters/BeamSearchOptions.h create mode 100644 moses/parameters/CubePruningOptions.cpp create mode 100644 moses/parameters/CubePruningOptions.h create mode 100644 moses/parameters/InputOptions.cpp create mode 100644 moses/parameters/InputOptions.h create mode 100644 moses/parameters/ReorderingOptions.cpp create mode 100644 moses/parameters/ReorderingOptions.h create mode 100644 moses/parameters/SearchOptions.cpp create mode 100644 moses/parameters/SearchOptions.h diff --git a/moses/BaseManager.cpp b/moses/BaseManager.cpp index fc01d9144..bd35e2098 100644 --- a/moses/BaseManager.cpp +++ b/moses/BaseManager.cpp @@ -140,6 +140,14 @@ void BaseManager::WriteApplicationContext(std::ostream &out, } } +AllOptions const& +BaseManager:: +options() const +{ + return GetTtask()->options(); +} + + } // namespace diff --git a/moses/BaseManager.h b/moses/BaseManager.h index f4c7eeff2..ed9ccaffb 100644 --- a/moses/BaseManager.h +++ b/moses/BaseManager.h @@ -5,7 +5,7 @@ #include #include "ScoreComponentCollection.h" #include "InputType.h" - +#include "moses/parameters/AllOptions.h" namespace Moses { class ScoreComponentCollection; @@ -51,6 +51,7 @@ public: //! 
the input sentence being decoded const InputType& GetSource() const; const ttasksptr GetTtask() const; + AllOptions const& options() const; virtual void Decode() = 0; // outputs diff --git a/moses/ChartCell.cpp b/moses/ChartCell.cpp index c942375e2..02f132bae 100644 --- a/moses/ChartCell.cpp +++ b/moses/ChartCell.cpp @@ -53,7 +53,7 @@ ChartCell::ChartCell(size_t startPos, size_t endPos, ChartManager &manager) : ChartCellBase(startPos, endPos), m_manager(manager) { const StaticData &staticData = StaticData::Instance(); - m_nBestIsEnabled = staticData.IsNBestEnabled(); + m_nBestIsEnabled = staticData.options().nbest.enabled; } ChartCell::~ChartCell() {} @@ -100,7 +100,7 @@ void ChartCell::Decode(const ChartTranslationOptionList &transOptList } // pluck things out of queue and add to hypo collection - const size_t popLimit = staticData.GetCubePruningPopLimit(); + const size_t popLimit = staticData.options().cube.pop_limit; for (size_t numPops = 0; numPops < popLimit && !queue.IsEmpty(); ++numPops) { ChartHypothesis *hypo = queue.Pop(); AddHypothesis(hypo); diff --git a/moses/ChartHypothesis.cpp b/moses/ChartHypothesis.cpp index 0d62e33bf..96e7c4552 100644 --- a/moses/ChartHypothesis.cpp +++ b/moses/ChartHypothesis.cpp @@ -287,8 +287,11 @@ void ChartHypothesis::CleanupArcList() * so we'll keep all of arc list if nedd distinct n-best list */ const StaticData &staticData = StaticData::Instance(); - size_t nBestSize = staticData.GetNBestSize(); - bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphHypergraph(); + size_t nBestSize = staticData.options().nbest.nbest_size; + bool distinctNBest = (staticData.options().nbest.only_distinct + || staticData.UseMBR() + || staticData.GetOutputSearchGraph() + || staticData.GetOutputSearchGraphHypergraph()); if (!distinctNBest && m_arcList->size() > nBestSize) { // prune arc list only if there too many arcs diff --git a/moses/ChartHypothesisCollection.cpp b/moses/ChartHypothesisCollection.cpp index d53211f34..42717c261 100644 --- a/moses/ChartHypothesisCollection.cpp +++ b/moses/ChartHypothesisCollection.cpp @@ -38,8 +38,8 @@ ChartHypothesisCollection::ChartHypothesisCollection() const StaticData &staticData = StaticData::Instance(); m_beamWidth = staticData.GetBeamWidth(); - m_maxHypoStackSize = staticData.GetMaxHypoStackSize(); - m_nBestIsEnabled = staticData.IsNBestEnabled(); + m_maxHypoStackSize = staticData.options().search.stack_size; + m_nBestIsEnabled = staticData.options().nbest.enabled; m_bestScore = -std::numeric_limits::infinity(); } diff --git a/moses/ChartManager.cpp b/moses/ChartManager.cpp index d59b68cc8..b1870791b 100644 --- a/moses/ChartManager.cpp +++ b/moses/ChartManager.cpp @@ -207,7 +207,7 @@ void ChartManager::CalcNBest( // with 0 being 'unlimited.' This actually sets a large-ish limit in case // too many translations are identical. const StaticData &staticData = StaticData::Instance(); - const std::size_t nBestFactor = staticData.GetNBestFactor(); + const std::size_t nBestFactor = staticData.options().nbest.factor; std::size_t numDerivations = (nBestFactor == 0) ? n*1000 : n*nBestFactor; // Extract the derivations. 
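// Explanatory note on the hunk above (illustrative, not from the patch itself): the renamed
// option keeps the old over-generation behaviour. For example, a distinct n-best request of
// n = 100 with nbest.factor = 20 lets the extractor pull up to 100 * 20 = 2000 derivations
// before duplicates are filtered; with factor = 0 it falls back to the n * 1000 cap, i.e. 100000.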
@@ -318,13 +318,14 @@ void ChartManager::OutputBest(OutputCollector *collector) const void ChartManager::OutputNBest(OutputCollector *collector) const { const StaticData &staticData = StaticData::Instance(); - size_t nBestSize = staticData.GetNBestSize(); + size_t nBestSize = staticData.options().nbest.nbest_size; if (nBestSize > 0) { const size_t translationId = m_source.GetTranslationId(); - VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO " << staticData.GetNBestFilePath() << endl); + VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO " + << staticData.options().nbest.output_file_path << endl); std::vector > nBestList; - CalcNBest(nBestSize, nBestList,staticData.GetDistinctNBest()); + CalcNBest(nBestSize, nBestList,staticData.options().nbest.only_distinct); OutputNBestList(collector, nBestList, translationId); IFVERBOSE(2) { PrintUserTime("N-Best Hypotheses Generation Time:"); @@ -348,10 +349,9 @@ void ChartManager::OutputNBestList(OutputCollector *collector, FixPrecision(out); } - bool includeWordAlignment = - StaticData::Instance().PrintAlignmentInfoInNbest(); - - bool PrintNBestTrees = StaticData::Instance().PrintNBestTrees(); + NBestOptions const& nbo = StaticData::Instance().options().nbest; + bool includeWordAlignment = nbo.include_alignment_info; + bool PrintNBestTrees = nbo.print_trees; for (ChartKBestExtractor::KBestVec::const_iterator p = nBestList.begin(); p != nBestList.end(); ++p) { @@ -620,9 +620,9 @@ void ChartManager::OutputDetailedTranslationReport( if (staticData.IsDetailedAllTranslationReportingEnabled()) { const Sentence &sentence = dynamic_cast(m_source); - size_t nBestSize = staticData.GetNBestSize(); + size_t nBestSize = staticData.options().nbest.nbest_size; std::vector > nBestList; - CalcNBest(nBestSize, nBestList, staticData.GetDistinctNBest()); + CalcNBest(nBestSize, nBestList, staticData.options().nbest.nbest_size); OutputDetailedAllTranslationReport(collector, nBestList, sentence, translationId); } diff --git a/moses/ChartParser.cpp b/moses/ChartParser.cpp index 66e22a055..8c569cec9 100644 --- a/moses/ChartParser.cpp +++ b/moses/ChartParser.cpp @@ -106,7 +106,8 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range targetPhrase->SetTargetLHS(targetLHS); targetPhrase->SetAlignmentInfo("0-0"); targetPhrase->EvaluateInIsolation(*unksrc); - if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled() || staticData.PrintNBestTrees() || staticData.GetTreeStructure() != NULL) { + + if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled() || staticData.options().nbest.print_trees || staticData.GetTreeStructure() != NULL) { targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]"); } diff --git a/moses/FF/ConstrainedDecoding.cpp b/moses/FF/ConstrainedDecoding.cpp index 89ad2c9d6..6743a9085 100644 --- a/moses/FF/ConstrainedDecoding.cpp +++ b/moses/FF/ConstrainedDecoding.cpp @@ -43,8 +43,10 @@ ConstrainedDecoding::ConstrainedDecoding(const std::string &line) void ConstrainedDecoding::Load() { const StaticData &staticData = StaticData::Instance(); - bool addBeginEndWord = (staticData.GetSearchAlgorithm() == CYKPlus) || (staticData.GetSearchAlgorithm() == ChartIncremental); - + bool addBeginEndWord + = ((staticData.options().search.algo == CYKPlus) + || (staticData.options().search.algo == ChartIncremental)); + for(size_t i = 0; i < m_paths.size(); ++i) { InputFileStream constraintFile(m_paths[i]); std::string line; 
diff --git a/moses/FF/HyperParameterAsWeight.cpp b/moses/FF/HyperParameterAsWeight.cpp index a2c068530..e6303c2bb 100644 --- a/moses/FF/HyperParameterAsWeight.cpp +++ b/moses/FF/HyperParameterAsWeight.cpp @@ -19,8 +19,8 @@ HyperParameterAsWeight::HyperParameterAsWeight(const std::string &line) vector weights = staticData.GetWeights(this); - staticData.m_maxHypoStackSize = weights[0] * 1000; - staticData.m_beamWidth = weights[1] * 10; + staticData.options().search.stack_size = weights[0] * 1000; + staticData.options().search.beam_width = weights[1] * 10; } diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp index 65d5944ce..bc9ff0eb0 100644 --- a/moses/Hypothesis.cpp +++ b/moses/Hypothesis.cpp @@ -362,8 +362,8 @@ CleanupArcList() * so we'll keep all of arc list if nedd distinct n-best list */ const StaticData &staticData = StaticData::Instance(); - size_t nBestSize = staticData.GetNBestSize(); - bool distinctNBest = (staticData.GetDistinctNBest() || + size_t nBestSize = staticData.options().nbest.nbest_size; + bool distinctNBest = (staticData.options().nbest.only_distinct || staticData.GetLatticeSamplesSize() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || diff --git a/moses/HypothesisStackCubePruning.cpp b/moses/HypothesisStackCubePruning.cpp index c39aa8641..23fc2b01a 100644 --- a/moses/HypothesisStackCubePruning.cpp +++ b/moses/HypothesisStackCubePruning.cpp @@ -36,7 +36,7 @@ namespace Moses HypothesisStackCubePruning::HypothesisStackCubePruning(Manager& manager) : HypothesisStack(manager) { - m_nBestIsEnabled = StaticData::Instance().IsNBestEnabled(); + m_nBestIsEnabled = StaticData::Instance().options().nbest.enabled; m_bestScore = -std::numeric_limits::infinity(); m_worstScore = -std::numeric_limits::infinity(); } diff --git a/moses/HypothesisStackNormal.cpp b/moses/HypothesisStackNormal.cpp index 0992c5478..7c99528fc 100644 --- a/moses/HypothesisStackNormal.cpp +++ b/moses/HypothesisStackNormal.cpp @@ -36,7 +36,7 @@ namespace Moses HypothesisStackNormal::HypothesisStackNormal(Manager& manager) : HypothesisStack(manager) { - m_nBestIsEnabled = StaticData::Instance().IsNBestEnabled(); + m_nBestIsEnabled = StaticData::Instance().options().nbest.enabled; m_bestScore = -std::numeric_limits::infinity(); m_worstScore = -std::numeric_limits::infinity(); } diff --git a/moses/IOWrapper.cpp b/moses/IOWrapper.cpp index 65c3f20c8..b36fd0644 100644 --- a/moses/IOWrapper.cpp +++ b/moses/IOWrapper.cpp @@ -96,8 +96,8 @@ IOWrapper::IOWrapper() const StaticData &staticData = StaticData::Instance(); // context buffering for context-sensitive decoding - m_look_ahead = staticData.GetContextParameters().look_ahead; - m_look_back = staticData.GetContextParameters().look_back; + m_look_ahead = staticData.options().context.look_ahead; + m_look_back = staticData.options().context.look_back; m_inputType = staticData.GetInputType(); @@ -108,8 +108,8 @@ IOWrapper::IOWrapper() m_inputFactorOrder = &staticData.GetInputFactorOrder(); - size_t nBestSize = staticData.GetNBestSize(); - string nBestFilePath = staticData.GetNBestFilePath(); + size_t nBestSize = staticData.options().nbest.nbest_size; + string nBestFilePath = staticData.options().nbest.output_file_path; staticData.GetParameter().SetParameter(m_inputFilePath, "input-file", ""); if (m_inputFilePath.empty()) { diff --git a/moses/Incremental.cpp b/moses/Incremental.cpp index f5f5dde66..51fcb0bf5 100644 --- a/moses/Incremental.cpp +++ b/moses/Incremental.cpp @@ -208,7 +208,7 @@ Manager::Manager(ttasksptr const& ttask) : BaseManager(ttask) , 
cells_(m_source, ChartCellBaseFactory(), parser_) , parser_(ttask, cells_) - , n_best_(search::NBestConfig(StaticData::Instance().GetNBestSize())) + , n_best_(search::NBestConfig(StaticData::Instance().options().nbest.nbest_size)) { } Manager::~Manager() @@ -223,12 +223,17 @@ namespace const float log_10 = logf(10); } -template search::History Manager::PopulateBest(const Model &model, const std::vector &words, Best &out) +template +search::History +Manager:: +PopulateBest(const Model &model, const std::vector &words, Best &out) { const LanguageModel &abstract = LanguageModel::GetFirstLM(); const float oov_weight = abstract.OOVFeatureEnabled() ? abstract.GetOOVWeight() : 0.0; const StaticData &data = StaticData::Instance(); - search::Config config(abstract.GetWeight() * log_10, data.GetCubePruningPopLimit(), search::NBestConfig(data.GetNBestSize())); + size_t cpl = data.options().cube.pop_limit; + size_t nbs = data.options().nbest.nbest_size; + search::Config config(abstract.GetWeight() * log_10, cpl, search::NBestConfig(nbs)); search::Context context(config, model); size_t size = m_source.GetSize(); @@ -255,7 +260,7 @@ template search::History Manager::PopulateBest(const M template void Manager::LMCallback(const Model &model, const std::vector &words) { - std::size_t nbest = StaticData::Instance().GetNBestSize(); + std::size_t nbest = StaticData::Instance().options().nbest.nbest_size; if (nbest <= 1) { search::History ret = PopulateBest(model, words, single_best_); if (ret) { diff --git a/moses/Manager.cpp b/moses/Manager.cpp index ec4f57739..c16aaa407 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -71,7 +71,7 @@ Manager::Manager(ttasksptr const& ttask) m_transOptColl = source->CreateTranslationOptionCollection(ttask); const StaticData &staticData = StaticData::Instance(); - SearchAlgorithm searchAlgorithm = staticData.GetSearchAlgorithm(); + SearchAlgorithm searchAlgorithm = staticData.options().search.algo; m_search = Search::CreateSearch(*this, *source, searchAlgorithm, *m_transOptColl); @@ -264,7 +264,7 @@ void Manager::CalcNBest(size_t count, TrellisPathList &ret,bool onlyDistinct) co } // factor defines stopping point for distinct n-best list if too many candidates identical - size_t nBestFactor = StaticData::Instance().GetNBestFactor(); + size_t nBestFactor = StaticData::Instance().options().nbest.factor; if (nBestFactor < 1) nBestFactor = 1000; // 0 = unlimited // MAIN loop @@ -288,7 +288,7 @@ void Manager::CalcNBest(size_t count, TrellisPathList &ret,bool onlyDistinct) co if(onlyDistinct) { - const size_t nBestFactor = StaticData::Instance().GetNBestFactor(); + const size_t nBestFactor = StaticData::Instance().options().nbest.factor; if (nBestFactor > 0) contenders.Prune(count * nBestFactor); } else { @@ -1548,10 +1548,10 @@ void Manager::OutputBest(OutputCollector *collector) const // lattice MBR if (staticData.UseLatticeMBR()) { - if (staticData.IsNBestEnabled()) { + if (staticData.options().nbest.enabled) { //lattice mbr nbest vector solutions; - size_t n = min(nBestSize, staticData.GetNBestSize()); + size_t n = min(nBestSize, staticData.options().nbest.nbest_size); getLatticeMBRNBest(*this,nBestList,solutions,n); OutputLatticeMBRNBest(m_latticeNBestOut, solutions, translationId); } else { @@ -1609,14 +1609,16 @@ void Manager::OutputNBest(OutputCollector *collector) const long translationId = m_source.GetTranslationId(); if (staticData.UseLatticeMBR()) { - if (staticData.IsNBestEnabled()) { + if (staticData.options().nbest.enabled) { collector->Write(translationId, 
m_latticeNBestOut.str()); } } else { TrellisPathList nBestList; ostringstream out; - CalcNBest(staticData.GetNBestSize(), nBestList,staticData.GetDistinctNBest()); - OutputNBest(out, nBestList, staticData.GetOutputFactorOrder(), m_source.GetTranslationId(), + CalcNBest(staticData.options().nbest.nbest_size, nBestList, + staticData.options().nbest.only_distinct); + OutputNBest(out, nBestList, staticData.GetOutputFactorOrder(), + m_source.GetTranslationId(), staticData.GetReportSegmentation()); collector->Write(m_source.GetTranslationId(), out.str()); } @@ -1630,9 +1632,10 @@ void Manager::OutputNBest(std::ostream& out , char reportSegmentation) const { const StaticData &staticData = StaticData::Instance(); - bool reportAllFactors = staticData.GetReportAllFactorsNBest(); - bool includeSegmentation = staticData.NBestIncludesSegmentation(); - bool includeWordAlignment = staticData.PrintAlignmentInfoInNbest(); + NBestOptions const& nbo = staticData.options().nbest; + bool reportAllFactors = nbo.include_all_factors; + bool includeSegmentation = nbo.include_segmentation; + bool includeWordAlignment = nbo.include_alignment_info; TrellisPathList::const_iterator iter; for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) { diff --git a/moses/RuleCube.cpp b/moses/RuleCube.cpp index 3a33ba5e5..f05d95270 100644 --- a/moses/RuleCube.cpp +++ b/moses/RuleCube.cpp @@ -44,7 +44,7 @@ RuleCube::RuleCube(const ChartTranslationOptions &transOpt, { RuleCubeItem *item = new RuleCubeItem(transOpt, allChartCells); m_covered.insert(item); - if (StaticData::Instance().GetCubePruningLazyScoring()) { + if (StaticData::Instance().options().cube.lazy_scoring) { item->EstimateScore(); } else { item->CreateHypothesis(transOpt, manager); @@ -92,7 +92,7 @@ void RuleCube::CreateNeighbor(const RuleCubeItem &item, int dimensionIndex, if (!result.second) { delete newItem; // already seen it } else { - if (StaticData::Instance().GetCubePruningLazyScoring()) { + if (StaticData::Instance().options().cube.lazy_scoring) { newItem->EstimateScore(); } else { newItem->CreateHypothesis(m_transOpt, manager); diff --git a/moses/RuleCubeQueue.cpp b/moses/RuleCubeQueue.cpp index 89020a4e5..e4a13c528 100644 --- a/moses/RuleCubeQueue.cpp +++ b/moses/RuleCubeQueue.cpp @@ -50,7 +50,7 @@ ChartHypothesis *RuleCubeQueue::Pop() // pop the most promising item from the cube and get the corresponding // hypothesis RuleCubeItem *item = cube->Pop(m_manager); - if (StaticData::Instance().GetCubePruningLazyScoring()) { + if (StaticData::Instance().options().cube.lazy_scoring) { item->CreateHypothesis(cube->GetTranslationOption(), m_manager); } ChartHypothesis *hypo = item->ReleaseHypothesis(); diff --git a/moses/ScoreComponentCollection.cpp b/moses/ScoreComponentCollection.cpp index 31de139ea..4d79d5565 100644 --- a/moses/ScoreComponentCollection.cpp +++ b/moses/ScoreComponentCollection.cpp @@ -330,7 +330,7 @@ void ScoreComponentCollection::OutputFeatureScores( std::ostream& out , std::string &lastName ) const { const StaticData &staticData = StaticData::Instance(); - bool labeledOutput = staticData.IsLabeledNBestList(); + bool labeledOutput = staticData.options().nbest.include_feature_labels; // regular features (not sparse) if (ff->HasTuneableComponents()) { diff --git a/moses/Search.cpp b/moses/Search.cpp index 8f947f622..7412d996b 100644 --- a/moses/Search.cpp +++ b/moses/Search.cpp @@ -9,15 +9,20 @@ namespace Moses Search::Search(Manager& manager) : m_manager(manager) - ,m_inputPath() - ,m_initialTransOpt() + , m_inputPath() + , 
m_initialTransOpt() + , m_options(manager.options()) + , interrupted_flag(0) { m_initialTransOpt.SetInputPath(m_inputPath); } -Search *Search::CreateSearch(Manager& manager, const InputType &source, - SearchAlgorithm searchAlgorithm, const TranslationOptionCollection &transOptColl) +Search * +Search:: +CreateSearch(Manager& manager, const InputType &source, + SearchAlgorithm searchAlgorithm, + const TranslationOptionCollection &transOptColl) { switch(searchAlgorithm) { case Normal: @@ -32,4 +37,18 @@ Search *Search::CreateSearch(Manager& manager, const InputType &source, } } +bool +Search:: +out_of_time() +{ + int const& timelimit = m_options.search.timeout; + if (!timelimit) return false; + double elapsed_time = GetUserTime(); + if (elapsed_time <= timelimit) return false; + VERBOSE(1,"Decoding is out of time (" << elapsed_time << "," + << timelimit << ")" << std::endl); + interrupted_flag = 1; + return true; +} + } diff --git a/moses/Search.h b/moses/Search.h index 164cc33ef..7bafa3157 100644 --- a/moses/Search.h +++ b/moses/Search.h @@ -43,6 +43,12 @@ protected: Manager& m_manager; InputPath m_inputPath; // for initial hypo TranslationOption m_initialTransOpt; /**< used to seed 1st hypo */ + AllOptions const& m_options; + + /** flag indicating that decoder ran out of time (see switch -time-out) */ + size_t interrupted_flag; + + bool out_of_time(); }; } diff --git a/moses/SearchCubePruning.cpp b/moses/SearchCubePruning.cpp index 09994024b..f6b2b90a9 100644 --- a/moses/SearchCubePruning.cpp +++ b/moses/SearchCubePruning.cpp @@ -48,8 +48,8 @@ SearchCubePruning::SearchCubePruning(Manager& manager, const InputType &source, std::vector < HypothesisStackCubePruning >::iterator iterStack; for (size_t ind = 0 ; ind < m_hypoStackColl.size() ; ++ind) { HypothesisStackCubePruning *sourceHypoColl = new HypothesisStackCubePruning(m_manager); - sourceHypoColl->SetMaxHypoStackSize(staticData.GetMaxHypoStackSize()); - sourceHypoColl->SetBeamWidth(staticData.GetBeamWidth()); + sourceHypoColl->SetMaxHypoStackSize(m_options.search.stack_size); + sourceHypoColl->SetBeamWidth(m_options.search.beam_width); m_hypoStackColl[ind] = sourceHypoColl; } @@ -66,7 +66,8 @@ SearchCubePruning::~SearchCubePruning() */ void SearchCubePruning::Decode() { - const StaticData &staticData = StaticData::Instance(); + const StaticData &SD = StaticData::Instance(); + AllOptions const& opts = SD.options(); // initial seed hypothesis: nothing translated, no words produced Hypothesis *hypo = Hypothesis::Create(m_manager,m_source, m_initialTransOpt); @@ -77,20 +78,22 @@ void SearchCubePruning::Decode() firstStack.CleanupArcList(); CreateForwardTodos(firstStack); - const size_t PopLimit = StaticData::Instance().GetCubePruningPopLimit(); - VERBOSE(3,"Cube Pruning pop limit is " << PopLimit << std::endl) + const size_t PopLimit = StaticData::Instance().options().cube.pop_limit; + VERBOSE(3,"Cube Pruning pop limit is " << PopLimit << std::endl); - const size_t Diversity = StaticData::Instance().GetCubePruningDiversity(); + const size_t Diversity = StaticData::Instance().options().cube.diversity; VERBOSE(3,"Cube Pruning diversity is " << Diversity << std::endl) // go through each stack size_t stackNo = 1; + int timelimit = m_options.search.timeout; std::vector < HypothesisStack* >::iterator iterStack; for (iterStack = m_hypoStackColl.begin() + 1 ; iterStack != m_hypoStackColl.end() ; ++iterStack) { // check if decoding ran out of time double _elapsed_time = GetUserTime(); - if (_elapsed_time > staticData.GetTimeoutThreshold()) { - 
VERBOSE(1,"Decoding is out of time (" << _elapsed_time << "," << staticData.GetTimeoutThreshold() << ")" << std::endl); + if (timelimit && _elapsed_time > timelimit) { + VERBOSE(1,"Decoding is out of time (" << _elapsed_time << "," + << timelimit << ")" << std::endl); return; } HypothesisStackCubePruning &sourceHypoColl = *static_cast(*iterStack); @@ -144,7 +147,7 @@ void SearchCubePruning::Decode() IFVERBOSE(2) { m_manager.GetSentenceStats().StartTimeStack(); } - sourceHypoColl.PruneToSize(staticData.GetMaxHypoStackSize()); + sourceHypoColl.PruneToSize(m_options.search.stack_size); VERBOSE(3,std::endl); sourceHypoColl.CleanupArcList(); IFVERBOSE(2) { diff --git a/moses/SearchNormal.cpp b/moses/SearchNormal.cpp index 377a906dd..227c27479 100644 --- a/moses/SearchNormal.cpp +++ b/moses/SearchNormal.cpp @@ -15,15 +15,21 @@ namespace Moses * /param source input sentence * /param transOptColl collection of translation options to be used for this sentence */ -SearchNormal::SearchNormal(Manager& manager, const InputType &source, const TranslationOptionCollection &transOptColl) - :Search(manager) - ,m_source(source) - ,m_hypoStackColl(source.GetSize() + 1) - ,interrupted_flag(0) - ,m_transOptColl(transOptColl) +SearchNormal:: +SearchNormal(Manager& manager, const InputType &source, + const TranslationOptionCollection &transOptColl) + : Search(manager) + , m_source(source) + , m_hypoStackColl(source.GetSize() + 1) + , m_transOptColl(transOptColl) { VERBOSE(1, "Translating: " << m_source << endl); - const StaticData &staticData = StaticData::Instance(); + + // m_beam_width = manager.options().search.beam_width; + // m_stack_size = manager.options().search.stack_size; + // m_stack_diversity = manager.options().search.stack_diversity; + // m_timeout = manager.options().search.timeout; + // m_max_distortion = manager.options().reordering.max_distortion; // only if constraint decoding (having to match a specified output) // long sentenceID = source.GetTranslationId(); @@ -32,10 +38,9 @@ SearchNormal::SearchNormal(Manager& manager, const InputType &source, const Tran std::vector < HypothesisStackNormal >::iterator iterStack; for (size_t ind = 0 ; ind < m_hypoStackColl.size() ; ++ind) { HypothesisStackNormal *sourceHypoColl = new HypothesisStackNormal(m_manager); - sourceHypoColl->SetMaxHypoStackSize(staticData.GetMaxHypoStackSize(), - staticData.GetMinHypoStackDiversity()); - sourceHypoColl->SetBeamWidth(staticData.GetBeamWidth()); - + sourceHypoColl->SetMaxHypoStackSize(this->m_options.search.stack_size, + this->m_options.search.stack_diversity); + sourceHypoColl->SetBeamWidth(this->m_options.search.beam_width); m_hypoStackColl[ind] = sourceHypoColl; } } @@ -45,59 +50,49 @@ SearchNormal::~SearchNormal() RemoveAllInColl(m_hypoStackColl); } + +bool +SearchNormal:: +ProcessOneStack(HypothesisStack* hstack) +{ + if (this->out_of_time()) return false; + SentenceStats &stats = m_manager.GetSentenceStats(); + HypothesisStackNormal &sourceHypoColl + = *static_cast(hstack); + + // the stack is pruned before processing (lazy pruning): + VERBOSE(3,"processing hypothesis from next stack"); + IFVERBOSE(2) stats.StartTimeStack(); + sourceHypoColl.PruneToSize(m_options.search.stack_size); + VERBOSE(3,std::endl); + sourceHypoColl.CleanupArcList(); + IFVERBOSE(2) stats.StopTimeStack(); + + // go through each hypothesis on the stack and try to expand it + BOOST_FOREACH(Hypothesis* h, sourceHypoColl) + ProcessOneHypothesis(*h); + return true; +} + + /** * Main decoder loop that translates a sentence by expanding * 
hypotheses stack by stack, until the end of the sentence. */ void SearchNormal::Decode() { - const StaticData &staticData = StaticData::Instance(); SentenceStats &stats = m_manager.GetSentenceStats(); // initial seed hypothesis: nothing translated, no words produced - Hypothesis *hypo = Hypothesis::Create(m_manager,m_source, m_initialTransOpt); + Hypothesis *hypo = Hypothesis::Create(m_manager, m_source, m_initialTransOpt); m_hypoStackColl[0]->AddPrune(hypo); // go through each stack - std::vector < HypothesisStack* >::iterator iterStack; - for (iterStack = m_hypoStackColl.begin() ; iterStack != m_hypoStackColl.end() ; ++iterStack) { - // check if decoding ran out of time - double _elapsed_time = GetUserTime(); - if (_elapsed_time > staticData.GetTimeoutThreshold()) { - VERBOSE(1,"Decoding is out of time (" << _elapsed_time << "," << staticData.GetTimeoutThreshold() << ")" << std::endl); - interrupted_flag = 1; - return; - } - HypothesisStackNormal &sourceHypoColl = *static_cast(*iterStack); - - // the stack is pruned before processing (lazy pruning): - VERBOSE(3,"processing hypothesis from next stack"); - IFVERBOSE(2) { - stats.StartTimeStack(); - } - sourceHypoColl.PruneToSize(staticData.GetMaxHypoStackSize()); - VERBOSE(3,std::endl); - sourceHypoColl.CleanupArcList(); - IFVERBOSE(2) { - stats.StopTimeStack(); - } - - // go through each hypothesis on the stack and try to expand it - HypothesisStackNormal::const_iterator iterHypo; - for (iterHypo = sourceHypoColl.begin() ; iterHypo != sourceHypoColl.end() ; ++iterHypo) { - Hypothesis &hypothesis = **iterHypo; - ProcessOneHypothesis(hypothesis); // expand the hypothesis - } - // some logging - IFVERBOSE(2) { - OutputHypoStackSize(); - } - - // this stack is fully expanded; - actual_hypoStack = &sourceHypoColl; - + BOOST_FOREACH(HypothesisStack* hstack, m_hypoStackColl) { + if (!ProcessOneStack(hstack)) return; + IFVERBOSE(2) OutputHypoStackSize(); + actual_hypoStack = static_cast(hstack); } - //OutputHypoStack(); } @@ -111,8 +106,8 @@ SearchNormal:: ProcessOneHypothesis(const Hypothesis &hypothesis) { // since we check for reordering limits, its good to have that limit handy - int maxDistortion = StaticData::Instance().GetMaxDistortion(); - bool isWordLattice = StaticData::Instance().GetInputType() == WordLatticeInput; + // int maxDistortion = StaticData::Instance().GetMaxDistortion(); + bool isWordLattice = m_source.GetType() == WordLatticeInput; const WordsBitmap hypoBitmap = hypothesis.GetWordsBitmap(); const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos(); @@ -122,7 +117,7 @@ ProcessOneHypothesis(const Hypothesis &hypothesis) ReoConstraint = m_source.GetReorderingConstraint(); // no limit of reordering: only check for overlap - if (maxDistortion < 0) { + if (m_options.reordering.max_distortion < 0) { for (size_t startPos = hypoFirstGapPos ; startPos < sourceSize ; ++startPos) { TranslationOptionList const* tol; @@ -152,7 +147,7 @@ ProcessOneHypothesis(const Hypothesis &hypothesis) if(hypoBitmap.GetValue(startPos)) continue; size_t maxSize = sourceSize - startPos; - size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength(); + size_t maxSizePhrase = m_options.search.max_phrase_length; maxSize = (maxSize < maxSizePhrase) ? 
maxSize : maxSizePhrase; size_t closestLeft = hypoBitmap.GetEdgeToTheLeftOf(startPos); @@ -178,7 +173,7 @@ ProcessOneHypothesis(const Hypothesis &hypothesis) WordsRange currentStartRange(startPos, startPos); if(m_source.ComputeDistortionDistance(prevRange, currentStartRange) - > maxDistortion) + > m_options.reordering.max_distortion) continue; TranslationOptionList const* tol; @@ -227,7 +222,7 @@ ProcessOneHypothesis(const Hypothesis &hypothesis) WordsRange bestNextExtension(hypoFirstGapPos, hypoFirstGapPos); if (m_source.ComputeDistortionDistance(extRange, bestNextExtension) - > maxDistortion) continue; + > m_options.reordering.max_distortion) continue; // everything is fine, we're good to go ExpandAllHypotheses(hypothesis, startPos, endPos); @@ -251,7 +246,7 @@ ExpandAllHypotheses(const Hypothesis &hypothesis, size_t startPos, size_t endPos // early discarding: check if hypothesis is too bad to build // this idea is explained in (Moore&Quirk, MT Summit 2007) float expectedScore = 0.0f; - if (StaticData::Instance().UseEarlyDiscarding()) { + if (m_options.search.UseEarlyDiscarding()) { // expected score is based on score of current hypothesis expectedScore = hypothesis.GetScore(); @@ -286,7 +281,7 @@ void SearchNormal::ExpandHypothesis(const Hypothesis &hypothesis, const Translat SentenceStats &stats = m_manager.GetSentenceStats(); Hypothesis *newHypo; - if (! staticData.UseEarlyDiscarding()) { + if (! m_options.search.UseEarlyDiscarding()) { // simple build, no questions asked IFVERBOSE(2) { stats.StartTimeBuildHyp(); @@ -303,7 +298,7 @@ void SearchNormal::ExpandHypothesis(const Hypothesis &hypothesis, const Translat // worst possible score may have changed -> recompute size_t wordsTranslated = hypothesis.GetWordsBitmap().GetNumWordsCovered() + transOpt.GetSize(); float allowedScore = m_hypoStackColl[wordsTranslated]->GetWorstScore(); - if (staticData.GetMinHypoStackDiversity()) { + if (m_options.search.stack_diversity) { WordsBitmapID id = hypothesis.GetWordsBitmap().GetIDPlus(transOpt.GetStartPos(), transOpt.GetEndPos()); float allowedScoreForBitmap = m_hypoStackColl[wordsTranslated]->GetWorstScoreForBitmap( id ); allowedScore = std::min( allowedScore, allowedScoreForBitmap ); diff --git a/moses/SearchNormal.h b/moses/SearchNormal.h index 2d43187b6..2cba53ab8 100644 --- a/moses/SearchNormal.h +++ b/moses/SearchNormal.h @@ -14,23 +14,38 @@ class Manager; class InputType; class TranslationOptionCollection; -/** Functions and variables you need to decoder an input using the phrase-based decoder (NO cube-pruning) +/** Functions and variables you need to decoder an input using the + * phrase-based decoder (NO cube-pruning) * Instantiated by the Manager class */ class SearchNormal: public Search { protected: const InputType &m_source; - std::vector < HypothesisStack* > m_hypoStackColl; /**< stacks to store hypotheses (partial translations) */ + //! 
stacks to store hypotheses (partial translations) // no of elements = no of words in source + 1 - size_t interrupted_flag; /**< flag indicating that decoder ran out of time (see switch -time-out) */ - HypothesisStackNormal* actual_hypoStack; /**actual (full expanded) stack of hypotheses*/ - const TranslationOptionCollection &m_transOptColl; /**< pre-computed list of translation options for the phrases in this sentence */ + std::vector < HypothesisStack* > m_hypoStackColl; + + /** actual (full expanded) stack of hypotheses*/ + HypothesisStackNormal* actual_hypoStack; + + /** pre-computed list of translation options for the phrases in this sentence */ + const TranslationOptionCollection &m_transOptColl; // functions for creating hypotheses - void ProcessOneHypothesis(const Hypothesis &hypothesis); - void ExpandAllHypotheses(const Hypothesis &hypothesis, size_t startPos, size_t endPos); - virtual void ExpandHypothesis(const Hypothesis &hypothesis,const TranslationOption &transOpt, float expectedScore); + + virtual bool + ProcessOneStack(HypothesisStack* hstack); + + virtual void + ProcessOneHypothesis(const Hypothesis &hypothesis); + + virtual void + ExpandAllHypotheses(const Hypothesis &hypothesis, size_t startPos, size_t endPos); + + virtual void + ExpandHypothesis(const Hypothesis &hypothesis, const TranslationOption &transOpt, + float expectedScore); public: SearchNormal(Manager& manager, const InputType &source, const TranslationOptionCollection &transOptColl); diff --git a/moses/SearchNormalBatch.cpp b/moses/SearchNormalBatch.cpp index 612a5deea..191543b50 100644 --- a/moses/SearchNormalBatch.cpp +++ b/moses/SearchNormalBatch.cpp @@ -3,6 +3,7 @@ #include "Manager.h" #include "Hypothesis.h" #include "util/exception.hh" +#include //#include @@ -14,7 +15,7 @@ SearchNormalBatch::SearchNormalBatch(Manager& manager, const InputType &source, :SearchNormal(manager, source, transOptColl) ,m_batch_size(10000) { - m_max_stack_size = StaticData::Instance().GetMaxHypoStackSize(); + m_max_stack_size = m_options.search.stack_size; // Split the feature functions into sets of stateless, stateful // distributed lm, and stateful non-distributed. 
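// Explanatory sketch (illustrative, not from the patch itself) of the loop shape that both
// decoders share once ProcessOneStack() is factored out above: per-stack pruning, arc-list
// cleanup and hypothesis expansion live in SearchNormal, and ProcessOneStack() returns false
// when Search::out_of_time() trips, so Decode() reduces to roughly
//
//   BOOST_FOREACH(HypothesisStack* hstack, m_hypoStackColl) {
//     if (!ProcessOneStack(hstack)) return;     // prune, cleanup, expand each hypothesis
//     // SearchNormalBatch additionally calls EvalAndMergePartialHypos() here
//     IFVERBOSE(2) OutputHypoStackSize();
//     actual_hypoStack = static_cast<HypothesisStackNormal*>(hstack);
//   }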
@@ -50,47 +51,13 @@ void SearchNormalBatch::Decode() m_hypoStackColl[0]->AddPrune(hypo); // go through each stack - std::vector < HypothesisStack* >::iterator iterStack; - for (iterStack = m_hypoStackColl.begin() ; iterStack != m_hypoStackColl.end() ; ++iterStack) { - // check if decoding ran out of time - double _elapsed_time = GetUserTime(); - if (_elapsed_time > staticData.GetTimeoutThreshold()) { - VERBOSE(1,"Decoding is out of time (" << _elapsed_time << "," << staticData.GetTimeoutThreshold() << ")" << std::endl); - interrupted_flag = 1; - return; - } - HypothesisStackNormal &sourceHypoColl = *static_cast(*iterStack); - - // the stack is pruned before processing (lazy pruning): - VERBOSE(3,"processing hypothesis from next stack"); - IFVERBOSE(2) { - stats.StartTimeStack(); - } - sourceHypoColl.PruneToSize(staticData.GetMaxHypoStackSize()); - VERBOSE(3,std::endl); - sourceHypoColl.CleanupArcList(); - IFVERBOSE(2) { - stats.StopTimeStack(); - } - - // go through each hypothesis on the stack and try to expand it - HypothesisStackNormal::const_iterator iterHypo; - for (iterHypo = sourceHypoColl.begin() ; iterHypo != sourceHypoColl.end() ; ++iterHypo) { - Hypothesis &hypothesis = **iterHypo; - ProcessOneHypothesis(hypothesis); // expand the hypothesis - } - EvalAndMergePartialHypos(); - - // some logging - IFVERBOSE(2) { - OutputHypoStackSize(); - } - - // this stack is fully expanded; - actual_hypoStack = &sourceHypoColl; + BOOST_FOREACH(HypothesisStack* hstack, m_hypoStackColl) { + if (!ProcessOneStack(hstack)) return; + EvalAndMergePartialHypos(); // <= THAT is the difference to SearchNormal! + IFVERBOSE(2) OutputHypoStackSize(); + actual_hypoStack = static_cast(hstack); } - - EvalAndMergePartialHypos(); + EvalAndMergePartialHypos(); // <= THAT is the difference to SearchNormal! } /** @@ -106,7 +73,8 @@ void SearchNormalBatch::Decode() void SearchNormalBatch:: ExpandHypothesis(const Hypothesis &hypothesis, - const TranslationOption &transOpt, float expectedScore) + const TranslationOption &transOpt, + float expectedScore) { // Check if the number of partial hypotheses exceeds the batch size. 
if (m_partial_hypos.size() >= m_batch_size) { diff --git a/moses/Sentence.cpp b/moses/Sentence.cpp index e4dab8547..77dd2fa95 100644 --- a/moses/Sentence.cpp +++ b/moses/Sentence.cpp @@ -181,7 +181,7 @@ init(string line, std::vector const& factorOrder) aux_interpret_dlt(line); // some poorly documented cache-based stuff // if sentences is specified as "" - if (SD.IsPassthroughEnabled() || SD.IsPassthroughInNBestEnabled()) { + if (SD.IsPassthroughEnabled() || SD.options().nbest.include_passthrough) { string pthru = PassthroughSGML(line,"passthrough"); this->SetPassthroughInformation(pthru); } diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 1293f5d44..0acdd7b38 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -1,3 +1,4 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- // $Id$ // vim:tabstop=2 @@ -63,8 +64,6 @@ StaticData::StaticData() : m_sourceStartPosMattersForRecombination(false) , m_requireSortingAfterSourceContext(false) , m_inputType(SentenceInput) - // , m_onlyDistinctNBest(false) - // , m_needAlignmentInfo(false) , m_lmEnableOOVFeature(false) , m_isAlwaysCreateDirectTranslationOption(false) , m_currentWeightSetting("default") @@ -80,16 +79,6 @@ StaticData::StaticData() StaticData::~StaticData() { RemoveAllInColl(m_decodeGraphs); - - /* - const std::vector &producers = FeatureFunction::GetFeatureFunctions(); - for(size_t i=0;iSetParameter(m_outputHypoScore, "output-hypo-score", false ); - - //word-to-word alignment - // alignments m_parameter->SetParameter(m_PrintAlignmentInfo, "print-alignment-info", false ); - - // if (m_PrintAlignmentInfo) { // => now in BookkeepingOptions::init() - // m_needAlignmentInfo = true; - // } - m_parameter->SetParameter(m_wordAlignmentSort, "sort-word-alignment", NoSort); - - // if (m_PrintAlignmentInfoNbest) { // => now in BookkeepingOptions::init() - // m_needAlignmentInfo = true; - // } - params = m_parameter->GetParam("alignment-output-file"); if (params && params->size()) { m_alignmentOutputFile = Scan(params->at(0)); - // m_needAlignmentInfo = true; // => now in BookkeepingOptions::init() } m_parameter->SetParameter( m_PrintID, "print-id", false ); m_parameter->SetParameter( m_PrintPassthroughInformation, "print-passthrough", false ); - // m_parameter->SetParameter( m_PrintPassthroughInformationInNBest, "print-passthrough-in-n-best", false ); // => now in BookkeepingOptions::init() - // word graph params = m_parameter->GetParam("output-word-graph"); - if (params && params->size() == 2) - m_outputWordGraph = true; - else - m_outputWordGraph = false; + m_outputWordGraph = (params && params->size() == 2); - // search graph params = m_parameter->GetParam("output-search-graph"); if (params && params->size()) { if (params->size() != 1) { @@ -240,6 +209,7 @@ StaticData } m_outputSearchGraph = true; } + // ... 
in extended format else if (m_parameter->GetParam("output-search-graph-extended") && m_parameter->GetParam("output-search-graph-extended")->size()) { @@ -298,15 +268,14 @@ StaticData m_parameter->SetParameter(m_printAllDerivations , "print-all-derivations", false ); // additional output - m_parameter->SetParameter(m_detailedTranslationReportingFilePath, "translation-details", ""); - m_parameter->SetParameter(m_detailedTreeFragmentsTranslationReportingFilePath, "tree-translation-details", ""); - - //DIMw - m_parameter->SetParameter(m_detailedAllTranslationReportingFilePath, "translation-all-details", ""); - + m_parameter->SetParameter(m_detailedTranslationReportingFilePath, + "translation-details", ""); + m_parameter->SetParameter(m_detailedTreeFragmentsTranslationReportingFilePath, + "tree-translation-details", ""); + m_parameter->SetParameter(m_detailedAllTranslationReportingFilePath, + "translation-all-details", ""); m_parameter->SetParameter(m_startTranslationId, "start-translation-id", 0); - //lattice samples params = m_parameter->GetParam("lattice-samples"); if (params) { @@ -323,14 +292,6 @@ StaticData return true; } - -bool -StaticData -::ini_nbest_options() -{ - return m_nbest_options.init(*m_parameter); -} - void StaticData ::ini_compact_table_options() @@ -353,8 +314,8 @@ StaticData ::ini_performance_options() { const PARAM_VEC *params; - m_parameter->SetParameter(m_timeout_threshold, "time-out", -1); - m_timeout = (GetTimeoutThreshold() == (size_t)-1) ? false : true; + // m_parameter->SetParameter(m_timeout_threshold, "time-out", -1); + // m_timeout = (GetTimeoutThreshold() == (size_t)-1) ? false : true; m_threadCount = 1; params = m_parameter->GetParam("threads"); @@ -388,18 +349,6 @@ StaticData return true; } -void -StaticData -::ini_cube_pruning_options() -{ - m_parameter->SetParameter(m_cubePruningPopLimit, "cube-pruning-pop-limit", - DEFAULT_CUBE_PRUNING_POP_LIMIT); - m_parameter->SetParameter(m_cubePruningDiversity, "cube-pruning-diversity", - DEFAULT_CUBE_PRUNING_DIVERSITY); - m_parameter->SetParameter(m_cubePruningLazyScoring, "cube-pruning-lazy-scoring", - false); -} - void StaticData ::ini_factor_maps() @@ -453,45 +402,42 @@ void StaticData ::ini_distortion_options() { - // reordering constraints - m_parameter->SetParameter(m_maxDistortion, "distortion-limit", -1); - - m_parameter->SetParameter(m_reorderingConstraint, "monotone-at-punctuation", false ); - - // early distortion cost - m_parameter->SetParameter(m_useEarlyDistortionCost, "early-distortion-cost", false ); - + // // reordering constraints + // m_parameter->SetParameter(m_maxDistortion, "distortion-limit", -1); + // m_parameter->SetParameter(m_reorderingConstraint, "monotone-at-punctuation", false ); + // // early distortion cost + // m_parameter->SetParameter(m_useEarlyDistortionCost, "early-distortion-cost", false ); } bool StaticData ::ini_stack_decoding_options() { - const PARAM_VEC *params; - // settings for pruning - m_parameter->SetParameter(m_maxHypoStackSize, "stack", DEFAULT_MAX_HYPOSTACK_SIZE); + // const PARAM_VEC *params; + // // settings for pruning + // m_parameter->SetParameter(m_maxHypoStackSize, "stack", DEFAULT_MAX_HYPOSTACK_SIZE); - m_minHypoStackDiversity = 0; - params = m_parameter->GetParam("stack-diversity"); - if (params && params->size()) { - if (m_maxDistortion > 15) { - std::cerr << "stack diversity > 0 is not allowed for distortion limits larger than 15"; - return false; - } - if (m_inputType == WordLatticeInput) { - std::cerr << "stack diversity > 0 is not allowed for lattice 
input"; - return false; - } - m_minHypoStackDiversity = Scan(params->at(0)); - } + // m_minHypoStackDiversity = 0; + // params = m_parameter->GetParam("stack-diversity"); + // if (params && params->size()) { + // if (m_maxDistortion > 15) { + // std::cerr << "stack diversity > 0 is not allowed for distortion limits larger than 15"; + // return false; + // } + // if (m_inputType == WordLatticeInput) { + // std::cerr << "stack diversity > 0 is not allowed for lattice input"; + // return false; + // } + // m_minHypoStackDiversity = Scan(params->at(0)); + // } - m_parameter->SetParameter(m_beamWidth, "beam-threshold", DEFAULT_BEAM_WIDTH); - m_beamWidth = TransformScore(m_beamWidth); + // m_parameter->SetParameter(m_beamWidth, "beam-threshold", DEFAULT_BEAM_WIDTH); + // m_beamWidth = TransformScore(m_beamWidth); - m_parameter->SetParameter(m_earlyDiscardingThreshold, "early-discarding-threshold", DEFAULT_EARLY_DISCARDING_THRESHOLD); - m_earlyDiscardingThreshold = TransformScore(m_earlyDiscardingThreshold); + // m_parameter->SetParameter(m_earlyDiscardingThreshold, "early-discarding-threshold", DEFAULT_EARLY_DISCARDING_THRESHOLD); + // m_earlyDiscardingThreshold = TransformScore(m_earlyDiscardingThreshold); return true; } @@ -499,12 +445,12 @@ void StaticData ::ini_phrase_lookup_options() { - m_parameter->SetParameter(m_translationOptionThreshold, "translation-option-threshold", DEFAULT_TRANSLATION_OPTION_THRESHOLD); - m_translationOptionThreshold = TransformScore(m_translationOptionThreshold); + // m_parameter->SetParameter(m_translationOptionThreshold, "translation-option-threshold", DEFAULT_TRANSLATION_OPTION_THRESHOLD); + // m_translationOptionThreshold = TransformScore(m_translationOptionThreshold); - m_parameter->SetParameter(m_maxNoTransOptPerCoverage, "max-trans-opt-per-coverage", DEFAULT_MAX_TRANS_OPT_SIZE); - m_parameter->SetParameter(m_maxNoPartTransOpt, "max-partial-trans-opt", DEFAULT_MAX_PART_TRANS_OPT_SIZE); - m_parameter->SetParameter(m_maxPhraseLength, "max-phrase-length", DEFAULT_MAX_PHRASE_LENGTH); + // m_parameter->SetParameter(m_maxNoTransOptPerCoverage, "max-trans-opt-per-coverage", DEFAULT_MAX_TRANS_OPT_SIZE); + // m_parameter->SetParameter(m_maxNoPartTransOpt, "max-partial-trans-opt", DEFAULT_MAX_PART_TRANS_OPT_SIZE); + // m_parameter->SetParameter(m_maxPhraseLength, "max-phrase-length", DEFAULT_MAX_PHRASE_LENGTH); } @@ -583,10 +529,11 @@ bool StaticData::LoadData(Parameter *parameter) const PARAM_VEC *params; - m_context_parameters.init(*parameter); + m_options.init(*parameter); + // m_context_parameters.init(*parameter); // to cube or not to cube - m_parameter->SetParameter(m_searchAlgorithm, "search-algorithm", Normal); + // m_parameter->SetParameter(m_searchAlgorithm, "search-algorithm", Normal); if (IsSyntax()) LoadChartDecodingParameters(); @@ -596,7 +543,7 @@ bool StaticData::LoadData(Parameter *parameter) ini_factor_maps(); ini_input_options(); m_bookkeeping_options.init(*parameter); - m_nbest_options.init(*parameter); // if (!ini_nbest_options()) return false; + // m_nbest_options.init(*parameter); if (!ini_output_options()) return false; // threading etc. 
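// Orientation note (illustrative, not from the patch itself): the StaticData accessors touched
// here and in StaticData.h below are either removed or rewired to read from m_options.
// Representative correspondences, as used by the call-site changes earlier in the patch:
//   GetMaxHypoStackSize()     -> options().search.stack_size
//   GetBeamWidth()            -> options().search.beam_width
//   GetCubePruningPopLimit()  -> options().cube.pop_limit
//   GetNBestSize()            -> options().nbest.nbest_size
//   GetDistinctNBest()        -> options().nbest.only_distinct
//   GetMaxDistortion()        -> options().reordering.max_distortion
//   GetTimeoutThreshold()     -> options().search.timeout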
@@ -609,7 +556,7 @@ bool StaticData::LoadData(Parameter *parameter) ini_distortion_options(); if (!ini_stack_decoding_options()) return false; ini_phrase_lookup_options(); - ini_cube_pruning_options(); + // ini_cube_pruning_options(); ini_oov_options(); ini_mbr_options(); @@ -625,7 +572,7 @@ bool StaticData::LoadData(Parameter *parameter) || m_outputSearchGraphPB #endif || m_latticeSamplesFilePath.size()) { - m_nbest_options.enabled = true; + m_options.nbest.enabled = true; } // S2T decoder @@ -1272,8 +1219,9 @@ StaticData // FIXME Does this make sense for F2S? Perhaps it should be changed once // FIXME the pipeline uses RuleTable consistently. - if (m_searchAlgorithm == SyntaxS2T || m_searchAlgorithm == SyntaxT2S || - m_searchAlgorithm == SyntaxT2S_SCFG || m_searchAlgorithm == SyntaxF2S) { + SearchAlgorithm algo = m_options.search.algo; + if (algo == SyntaxS2T || algo == SyntaxT2S || + algo == SyntaxT2S_SCFG || algo == SyntaxF2S) { // Automatically override PhraseDictionary{Memory,Scope3}. This will // have to change if the FF parameters diverge too much in the future, // but for now it makes switching between the old and new decoders much diff --git a/moses/StaticData.h b/moses/StaticData.h index 8128d0b97..e20b9390a 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- // $Id$ /*********************************************************************** @@ -44,8 +44,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "moses/FF/Factory.h" #include "moses/PP/Factory.h" -#include "moses/parameters/ContextParameters.h" -#include "moses/parameters/NBestOptions.h" +#include "moses/parameters/AllOptions.h" #include "moses/parameters/BookkeepingOptions.h" namespace Moses @@ -70,11 +69,10 @@ class StaticData friend class HyperParameterAsWeight; private: - static StaticData s_instance; + static StaticData s_instance; protected: Parameter *m_parameter; - - ContextParameters m_context_parameters; + AllOptions m_options; std::vector m_inputFactorOrder, m_outputFactorOrder; mutable ScoreComponentCollection m_allWeights; @@ -84,34 +82,29 @@ protected: // Initial = 0 = can be used when creating poss trans // Other = 1 = used to calculate LM score once all steps have been processed float - m_beamWidth, - m_earlyDiscardingThreshold, - m_translationOptionThreshold, + // m_beamWidth, + // m_earlyDiscardingThreshold, + // m_translationOptionThreshold, m_wordDeletionWeight; // PhraseTrans, Generation & LanguageModelScore has multiple weights. - int m_maxDistortion; + // int m_maxDistortion; // do it differently from old pharaoh // -ve = no limit on distortion // 0 = no disortion (monotone in old pharaoh) bool m_reorderingConstraint; //! use additional reordering constraints - bool m_useEarlyDistortionCost; - size_t m_maxHypoStackSize; //! hypothesis-stack size that triggers pruning - size_t m_minHypoStackDiversity; //! minimum number of hypothesis in stack for each source word coverage; - NBestOptions m_nbest_options; + // bool m_useEarlyDistortionCost; + // size_t m_maxHypoStackSize; //! hypothesis-stack size that triggers pruning + // size_t m_minHypoStackDiversity; //! 
minimum number of hypothesis in stack for each source word coverage; BookkeepingOptions m_bookkeeping_options; - // size_t m_nBestSize; - // size_t m_nBestFactor; size_t m_latticeSamplesSize; - size_t m_maxNoTransOptPerCoverage; - size_t m_maxNoPartTransOpt; - size_t m_maxPhraseLength; + // size_t m_maxNoTransOptPerCoverage; + // size_t m_maxNoPartTransOpt; + // size_t m_maxPhraseLength; - // std::string m_nBestFilePath; std::string m_latticeSamplesFilePath; - // bool m_labeledNBestList,m_nBestIncludesSegmentation; bool m_dropUnknown; //! false = treat unknown words as unknowns, and translate them as themselves; true = drop (ignore) them bool m_markUnknown; //! false = treat unknown words as unknowns, and translate them as themselves; true = mark and (ignore) them std::string m_unknownWordPrefix; @@ -127,7 +120,7 @@ protected: bool m_outputHypoScore; bool m_requireSortingAfterSourceContext; - SearchAlgorithm m_searchAlgorithm; + // SearchAlgorithm m_searchAlgorithm; InputTypeEnum m_inputType; mutable size_t m_verboseLevel; @@ -135,21 +128,15 @@ protected: bool m_reportSegmentation; bool m_reportSegmentationEnriched; bool m_reportAllFactors; - // bool m_reportAllFactorsNBest; std::string m_detailedTranslationReportingFilePath; std::string m_detailedTreeFragmentsTranslationReportingFilePath; - //DIMw std::string m_detailedAllTranslationReportingFilePath; - // bool m_onlyDistinctNBest; bool m_PrintAlignmentInfo; - // bool m_needAlignmentInfo; // => BookkeepingOptions - // bool m_PrintAlignmentInfoNbest; bool m_PrintID; bool m_PrintPassthroughInformation; - // bool m_PrintPassthroughInformationInNBest; std::string m_alignmentOutputFile; @@ -174,8 +161,8 @@ protected: size_t m_lmcache_cleanup_threshold; //! number of translations after which LM claenup is performed (0=never, N=after N translations; default is 1) bool m_lmEnableOOVFeature; - bool m_timeout; //! use timeout - size_t m_timeout_threshold; //! seconds after which time out is activated + // bool m_timeout; //! use timeout + // size_t m_timeout_threshold; //! seconds after which time out is activated bool m_isAlwaysCreateDirectTranslationOption; //! constructor. only the 1 static variable can be created @@ -192,9 +179,6 @@ protected: bool m_includeLHSInSearchGraph; //! include LHS of rules in search graph std::string m_outputUnknownsFile; //! 
output unknowns in this file - size_t m_cubePruningPopLimit; - size_t m_cubePruningDiversity; - bool m_cubePruningLazyScoring; size_t m_ruleLimit; // Whether to load compact phrase table and reordering table into memory @@ -221,7 +205,6 @@ protected: bool m_useLegacyPT; bool m_defaultNonTermOnlyForEmptyRange; S2TParsingAlgorithm m_s2tParsingAlgorithm; - // bool m_printNBestTrees; FeatureRegistry m_registry; PhrasePropertyFactory m_phrasePropertyFactory; @@ -260,7 +243,6 @@ protected: void ini_lmbr_options(); void ini_mbr_options(); void ini_mira_options(); - bool ini_nbest_options(); void ini_oov_options(); bool ini_output_options(); bool ini_performance_options(); @@ -307,9 +289,14 @@ public: return *m_parameter; } - const ContextParameters& - GetContextParameters() const { - return m_context_parameters; + AllOptions const& + options() const { + return m_options; + } + + AllOptions& + options() { + return m_options; } const std::vector &GetInputFactorOrder() const { @@ -338,32 +325,24 @@ public: return m_disableDiscarding; } inline size_t GetMaxNoTransOptPerCoverage() const { - return m_maxNoTransOptPerCoverage; + return m_options.search.max_trans_opt_per_cov; } inline size_t GetMaxNoPartTransOpt() const { - return m_maxNoPartTransOpt; + return m_options.search.max_partial_trans_opt; } inline size_t GetMaxPhraseLength() const { - return m_maxPhraseLength; + return m_options.search.max_phrase_length; } bool IsWordDeletionEnabled() const { return m_wordDeletionEnabled; } - size_t GetMaxHypoStackSize() const { - return m_maxHypoStackSize; - } - size_t GetMinHypoStackDiversity() const { - return m_minHypoStackDiversity; - } - size_t GetCubePruningPopLimit() const { - return m_cubePruningPopLimit; - } - size_t GetCubePruningDiversity() const { - return m_cubePruningDiversity; - } - bool GetCubePruningLazyScoring() const { - return m_cubePruningLazyScoring; - } + // size_t GetMaxHypoStackSize() const { + // return m_options.search.stack_size; + // } + // size_t GetMinHypoStackDiversity() const { + // return m_options.search.stack_diversity; + // } + size_t IsPathRecoveryEnabled() const { return m_recoverPath; } @@ -373,30 +352,30 @@ public: bool IsPassthroughEnabled() const { return m_PrintPassthroughInformation; } - bool IsPassthroughInNBestEnabled() const { - return m_nbest_options.include_passthrough; - // return m_PrintPassthroughInformationInNBest; - } + int GetMaxDistortion() const { - return m_maxDistortion; + return m_options.reordering.max_distortion; } bool UseReorderingConstraint() const { return m_reorderingConstraint; } float GetBeamWidth() const { - return m_beamWidth; + return m_options.search.beam_width; } float GetEarlyDiscardingThreshold() const { - return m_earlyDiscardingThreshold; + return m_options.search.early_discarding_threshold; } + bool UseEarlyDiscarding() const { - return m_earlyDiscardingThreshold != -std::numeric_limits::infinity(); + return m_options.search.early_discarding_threshold + != -std::numeric_limits::infinity(); } bool UseEarlyDistortionCost() const { - return m_useEarlyDistortionCost; + return m_options.reordering.use_early_distortion_cost; + // return m_useEarlyDistortionCost; } float GetTranslationOptionThreshold() const { - return m_translationOptionThreshold; + return m_options.search.trans_opt_threshold; } size_t GetVerboseLevel() const { @@ -420,13 +399,11 @@ public: else std::cerr << "Warning: Invalid value for reportSegmentation (0 - 2)! 
Ignoring"; } + bool GetReportAllFactors() const { return m_reportAllFactors; } - bool GetReportAllFactorsNBest() const { - return m_nbest_options.include_all_factors; - // return m_reportAllFactorsNBest; - } + bool IsDetailedTranslationReportingEnabled() const { return !m_detailedTranslationReportingFilePath.empty(); } @@ -444,10 +421,10 @@ public: const std::string &GetDetailedTreeFragmentsTranslationReportingFilePath() const { return m_detailedTreeFragmentsTranslationReportingFilePath; } - bool IsLabeledNBestList() const { - return m_nbest_options.include_feature_labels; - // return m_labeledNBestList; - } + + // bool IsLabeledNBestList() const { + // return m_options.nbest.include_feature_labels; + // } bool UseMinphrInMemory() const { return m_minphrMemory; @@ -458,26 +435,17 @@ public: } // for mert - size_t GetNBestSize() const { - return m_nbest_options.nbest_size; - // return m_nBestSize; - } + // size_t GetNBestSize() const { + // return m_options.nbest.nbest_size; + // } - const std::string &GetNBestFilePath() const { - return m_nbest_options.output_file_path; - // return m_nBestFilePath; - } + // const std::string &GetNBestFilePath() const { + // return m_options.nbest.output_file_path; + // } - bool IsNBestEnabled() const { - return m_nbest_options.enabled; - // return (!m_nBestFilePath.empty() || m_mbr || m_useLatticeMBR || m_mira || - // m_outputSearchGraph || m_outputSearchGraphSLF || - // m_outputSearchGraphHypergraph || m_useConsensusDecoding || - // #ifdef HAVE_PROTOBUF - // m_outputSearchGraphPB || - // #endif - // !m_latticeSamplesFilePath.empty()); - } + // bool IsNBestEnabled() const { + // return m_options.nbest.enabled; + // } size_t GetLatticeSamplesSize() const { return m_latticeSamplesSize; @@ -487,10 +455,9 @@ public: return m_latticeSamplesFilePath; } - size_t GetNBestFactor() const { - return m_nbest_options.factor; - // return m_nBestFactor; - } + // size_t GetNBestFactor() const { + // return m_options.nbest.factor; + // } bool GetOutputWordGraph() const { return m_outputWordGraph; } @@ -499,22 +466,15 @@ public: InputTypeEnum GetInputType() const { return m_inputType; } - SearchAlgorithm GetSearchAlgorithm() const { - return m_searchAlgorithm; - } - // bool IsSyntax() const { - // return m_searchAlgorithm == CYKPlus || - // m_searchAlgorithm == ChartIncremental || - // m_searchAlgorithm == SyntaxS2T || - // m_searchAlgorithm == SyntaxT2S || - // m_searchAlgorithm == SyntaxT2S_SCFG || - // m_searchAlgorithm == SyntaxF2S; + // SearchAlgorithm GetSearchAlgorithm() const { + // return m_searchAlgorithm; // } bool IsSyntax(SearchAlgorithm algo = DefaultSearchAlgorithm) const { if (algo == DefaultSearchAlgorithm) - algo = m_searchAlgorithm; + algo = m_options.search.algo; + return (algo == CYKPlus || algo == ChartIncremental || algo == SyntaxS2T || algo == SyntaxT2S || algo == SyntaxF2S || algo == SyntaxT2S_SCFG); @@ -546,10 +506,9 @@ public: //Weights for feature with fixed number of values void SetWeights(const FeatureFunction* sp, const std::vector& weights); - bool GetDistinctNBest() const { - return m_nbest_options.only_distinct; - // return m_onlyDistinctNBest; - } + // bool GetDistinctNBest() const { + // return m_options.nbest.only_distinct; + // } const std::string& GetFactorDelimiter() const { return m_factorDelimiter; } @@ -603,12 +562,12 @@ public: return m_lmbrMapWeight; } - bool UseTimeout() const { - return m_timeout; - } - size_t GetTimeoutThreshold() const { - return m_timeout_threshold; - } + // bool UseTimeout() const { + // return m_timeout; + 
// } + // size_t GetTimeoutThreshold() const { + // return m_timeout_threshold; + // } size_t GetLMCacheCleanupThreshold() const { return m_lmcache_cleanup_threshold; @@ -722,19 +681,11 @@ public: bool PrintAlignmentInfo() const { return m_PrintAlignmentInfo; } - bool PrintAlignmentInfoInNbest() const { - return m_nbest_options.include_alignment_info; - // return m_PrintAlignmentInfoNbest; - } + WordAlignmentSort GetWordAlignmentSort() const { return m_wordAlignmentSort; } - bool NBestIncludesSegmentation() const { - return m_nbest_options.include_segmentation; - // return m_nBestIncludesSegmentation; - } - bool GetHasAlternateWeightSettings() const { return m_weightSetting.size() > 0; } @@ -872,11 +823,6 @@ public: return m_s2tParsingAlgorithm; } - bool PrintNBestTrees() const { - return m_nbest_options.print_trees; - // return m_printNBestTrees; - } - bool RequireSortingAfterSourceContext() const { return m_requireSortingAfterSourceContext; } diff --git a/moses/Syntax/F2S/Manager-inl.h b/moses/Syntax/F2S/Manager-inl.h index 55f85e888..077464208 100644 --- a/moses/Syntax/F2S/Manager-inl.h +++ b/moses/Syntax/F2S/Manager-inl.h @@ -59,9 +59,9 @@ void Manager::Decode() const StaticData &staticData = StaticData::Instance(); // Get various pruning-related constants. - const std::size_t popLimit = staticData.GetCubePruningPopLimit(); + const std::size_t popLimit = staticData.options().cube.pop_limit; const std::size_t ruleLimit = staticData.GetRuleLimit(); - const std::size_t stackLimit = staticData.GetMaxHypoStackSize(); + const std::size_t stackLimit = staticData.options().search.stack_size; // Initialize the stacks. InitializeStacks(); @@ -254,7 +254,7 @@ void Manager::ExtractKBest( // with 0 being 'unlimited.' This actually sets a large-ish limit in case // too many translations are identical. const StaticData &staticData = StaticData::Instance(); - const std::size_t nBestFactor = staticData.GetNBestFactor(); + const std::size_t nBestFactor = staticData.options().nbest.factor; std::size_t numDerivations = (nBestFactor == 0) ? k*1000 : k*nBestFactor; // Extract the derivations. 
diff --git a/moses/Syntax/Manager.cpp b/moses/Syntax/Manager.cpp index f84890927..f7df19d16 100644 --- a/moses/Syntax/Manager.cpp +++ b/moses/Syntax/Manager.cpp @@ -52,8 +52,8 @@ void Manager::OutputNBest(OutputCollector *collector) const long translationId = m_source.GetTranslationId(); KBestExtractor::KBestVec nBestList; - ExtractKBest(staticData.GetNBestSize(), nBestList, - staticData.GetDistinctNBest()); + ExtractKBest(staticData.options().nbest.nbest_size, nBestList, + staticData.options().nbest.only_distinct); OutputNBestList(collector, nBestList, translationId); } } @@ -90,8 +90,8 @@ void Manager::OutputNBestList(OutputCollector *collector, FixPrecision(out); } - bool includeWordAlignment = staticData.PrintAlignmentInfoInNbest(); - bool PrintNBestTrees = staticData.PrintNBestTrees(); + bool includeWordAlignment = staticData.options().nbest.include_alignment_info; + bool PrintNBestTrees = staticData.options().nbest.print_trees; // PrintNBestTrees(); for (KBestExtractor::KBestVec::const_iterator p = nBestList.begin(); p != nBestList.end(); ++p) { diff --git a/moses/Syntax/RuleTableFF.cpp b/moses/Syntax/RuleTableFF.cpp index dd24493f0..fd203b2fc 100644 --- a/moses/Syntax/RuleTableFF.cpp +++ b/moses/Syntax/RuleTableFF.cpp @@ -31,14 +31,14 @@ void RuleTableFF::Load() SetFeaturesToApply(); const StaticData &staticData = StaticData::Instance(); - if (staticData.GetSearchAlgorithm() == SyntaxF2S || - staticData.GetSearchAlgorithm() == SyntaxT2S) { + if (staticData.options().search.algo == SyntaxF2S || + staticData.options().search.algo == SyntaxT2S) { F2S::HyperTree *trie = new F2S::HyperTree(this); F2S::HyperTreeLoader loader; loader.Load(m_input, m_output, m_filePath, *this, *trie, m_sourceTerminalSet); m_table = trie; - } else if (staticData.GetSearchAlgorithm() == SyntaxS2T) { + } else if (staticData.options().search.algo == SyntaxS2T) { S2TParsingAlgorithm algorithm = staticData.GetS2TParsingAlgorithm(); if (algorithm == RecursiveCYKPlus) { S2T::RuleTrieCYKPlus *trie = new S2T::RuleTrieCYKPlus(this); @@ -53,7 +53,7 @@ void RuleTableFF::Load() } else { UTIL_THROW2("ERROR: unhandled S2T parsing algorithm"); } - } else if (staticData.GetSearchAlgorithm() == SyntaxT2S_SCFG) { + } else if (staticData.options().search.algo == SyntaxT2S_SCFG) { T2S::RuleTrie *trie = new T2S::RuleTrie(this); T2S::RuleTrieLoader loader; loader.Load(m_input, m_output, m_filePath, *this, *trie); diff --git a/moses/Syntax/S2T/Manager-inl.h b/moses/Syntax/S2T/Manager-inl.h index ef08752b6..4963ec788 100644 --- a/moses/Syntax/S2T/Manager-inl.h +++ b/moses/Syntax/S2T/Manager-inl.h @@ -162,9 +162,9 @@ void Manager::Decode() const StaticData &staticData = StaticData::Instance(); // Get various pruning-related constants. - const std::size_t popLimit = staticData.GetCubePruningPopLimit(); + const std::size_t popLimit = staticData.options().cube.pop_limit; const std::size_t ruleLimit = staticData.GetRuleLimit(); - const std::size_t stackLimit = staticData.GetMaxHypoStackSize(); + const std::size_t stackLimit = staticData.options().search.stack_size; // Initialise the PChart and SChart. InitializeCharts(); @@ -302,7 +302,7 @@ void Manager::ExtractKBest( // with 0 being 'unlimited.' This actually sets a large-ish limit in case // too many translations are identical. const StaticData &staticData = StaticData::Instance(); - const std::size_t nBestFactor = staticData.GetNBestFactor(); + const std::size_t nBestFactor = staticData.options().nbest.factor; std::size_t numDerivations = (nBestFactor == 0) ? 
k*1000 : k*nBestFactor; // Extract the derivations. diff --git a/moses/Syntax/T2S/Manager-inl.h b/moses/Syntax/T2S/Manager-inl.h index 90ecb35bf..344d804e7 100644 --- a/moses/Syntax/T2S/Manager-inl.h +++ b/moses/Syntax/T2S/Manager-inl.h @@ -96,9 +96,9 @@ void Manager::Decode() const StaticData &staticData = StaticData::Instance(); // Get various pruning-related constants. - const std::size_t popLimit = staticData.GetCubePruningPopLimit(); + const std::size_t popLimit = this->options().cube.pop_limit; const std::size_t ruleLimit = staticData.GetRuleLimit(); - const std::size_t stackLimit = staticData.GetMaxHypoStackSize(); + const std::size_t stackLimit = this->options().search.stack_size; // Initialize the stacks. InitializeStacks(); @@ -214,7 +214,7 @@ void Manager::ExtractKBest( // with 0 being 'unlimited.' This actually sets a large-ish limit in case // too many translations are identical. const StaticData &staticData = StaticData::Instance(); - const std::size_t nBestFactor = staticData.GetNBestFactor(); + const std::size_t nBestFactor = staticData.options().nbest.factor; std::size_t numDerivations = (nBestFactor == 0) ? k*1000 : k*nBestFactor; // Extract the derivations. diff --git a/moses/TranslationTask.cpp b/moses/TranslationTask.cpp index 30535eb3c..10d84aab9 100644 --- a/moses/TranslationTask.cpp +++ b/moses/TranslationTask.cpp @@ -100,7 +100,9 @@ TranslationTask ::TranslationTask(boost::shared_ptr const& source, boost::shared_ptr const& ioWrapper) : m_source(source) , m_ioWrapper(ioWrapper) -{ } +{ + m_options = StaticData::Instance().options(); +} TranslationTask::~TranslationTask() { } @@ -112,7 +114,7 @@ TranslationTask { boost::shared_ptr manager; StaticData const& staticData = StaticData::Instance(); - if (algo == DefaultSearchAlgorithm) algo = staticData.GetSearchAlgorithm(); + if (algo == DefaultSearchAlgorithm) algo = staticData.options().search.algo; if (!staticData.IsSyntax(algo)) manager.reset(new Manager(this->self())); // phrase-based @@ -154,6 +156,13 @@ TranslationTask return manager; } +AllOptions const& +TranslationTask:: +options() const +{ + return m_options; +} + void TranslationTask::Run() { UTIL_THROW_IF2(!m_source || !m_ioWrapper, diff --git a/moses/TranslationTask.h b/moses/TranslationTask.h index 54733ac73..987fd12ab 100644 --- a/moses/TranslationTask.h +++ b/moses/TranslationTask.h @@ -43,8 +43,8 @@ class TranslationTask : public Moses::Task operator=(TranslationTask const& other) { return *this; } - protected: + AllOptions m_options; boost::weak_ptr m_self; // weak ptr to myself boost::shared_ptr m_scope; // sores local info // pointer to ContextScope, which stores context-specific information @@ -134,6 +134,7 @@ public: void SetContextWeights(std::string const& context_weights); void ReSetContextWeights(std::map const& new_weights); + AllOptions const& options() const; protected: boost::shared_ptr m_source; diff --git a/moses/parameters/AllOptions.cpp b/moses/parameters/AllOptions.cpp new file mode 100644 index 000000000..d194ac08b --- /dev/null +++ b/moses/parameters/AllOptions.cpp @@ -0,0 +1,31 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "AllOptions.h" + +namespace Moses +{ + AllOptions:: + AllOptions(Parameter const& param) + { + init(param); + } + + bool + AllOptions:: + init(Parameter const& param) + { + if (!search.init(param)) return false; + if (!cube.init(param)) return false; + if (!nbest.init(param)) return false; + if (!reordering.init(param)) return false; + if (!context.init(param)) return false; + if 
(!input.init(param)) return false; + return sanity_check(); + } + + bool + AllOptions:: + sanity_check() + { + return true; + } +} diff --git a/moses/parameters/AllOptions.h b/moses/parameters/AllOptions.h new file mode 100644 index 000000000..54055e77b --- /dev/null +++ b/moses/parameters/AllOptions.h @@ -0,0 +1,31 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "moses/Parameter.h" +#include "SearchOptions.h" +#include "CubePruningOptions.h" +#include "NBestOptions.h" +#include "ReorderingOptions.h" +#include "ContextParameters.h" +#include "InputOptions.h" + +namespace Moses +{ + struct + AllOptions + { + SearchOptions search; + CubePruningOptions cube; + NBestOptions nbest; + ReorderingOptions reordering; + ContextParameters context; + InputOptions input; + // StackOptions stack; + // BeamSearchOptions beam; + bool init(Parameter const& param); + bool sanity_check(); + AllOptions() {} + AllOptions(Parameter const& param); + }; + +} diff --git a/moses/parameters/BeamSearchOptions.h b/moses/parameters/BeamSearchOptions.h new file mode 100644 index 000000000..85a8d5a64 --- /dev/null +++ b/moses/parameters/BeamSearchOptions.h @@ -0,0 +1,15 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "moses/Parameter.h" +namespace Moses +{ + + struct + BeamSearchOptions + { + bool init(Parameter const& param); + BeamSearchOptions(Parameter const& param); + }; + +} diff --git a/moses/parameters/ContextParameters.cpp b/moses/parameters/ContextParameters.cpp index 7ee9323bd..a06f98014 100644 --- a/moses/parameters/ContextParameters.cpp +++ b/moses/parameters/ContextParameters.cpp @@ -9,9 +9,9 @@ ContextParameters() : look_ahead(0), look_back(0) { } -void +bool ContextParameters:: -init(Parameter& params) +init(Parameter const& params) { look_back = look_ahead = 0; params.SetParameter(context_string, "context-string", std::string("")); @@ -19,12 +19,12 @@ init(Parameter& params) params.SetParameter(context_window, "context-window", std::string("")); if (context_window == "") - return; + return true; if (context_window.substr(0,3) == "all") { look_back = look_ahead = std::numeric_limits::max(); - return; + return true; } size_t p = context_window.find_first_of("0123456789"); @@ -47,5 +47,6 @@ init(Parameter& params) else UTIL_THROW2("Invalid specification of context window."); } + return true; } } diff --git a/moses/parameters/ContextParameters.h b/moses/parameters/ContextParameters.h index aff7783e4..ce140e422 100644 --- a/moses/parameters/ContextParameters.h +++ b/moses/parameters/ContextParameters.h @@ -12,7 +12,7 @@ class ContextParameters { public: ContextParameters(); - void init(Parameter& params); + bool init(Parameter const& params); size_t look_ahead; // # of words to look ahead for context-sensitive decoding size_t look_back; // # of works to look back for context-sensitive decoding std::string context_string; // fixed context string specified on command line diff --git a/moses/parameters/CubePruningOptions.cpp b/moses/parameters/CubePruningOptions.cpp new file mode 100644 index 000000000..0c2bc9b4c --- /dev/null +++ b/moses/parameters/CubePruningOptions.cpp @@ -0,0 +1,19 @@ +// -*- mode: c++; cc-style: gnu -*- +#include "CubePruningOptions.h" + +namespace Moses +{ + + bool + CubePruningOptions:: + init(Parameter const& param) + { + param.SetParameter(pop_limit, "cube-pruning-pop-limit", + DEFAULT_CUBE_PRUNING_POP_LIMIT); + param.SetParameter(diversity, "cube-pruning-diversity", + 
DEFAULT_CUBE_PRUNING_DIVERSITY); + param.SetParameter(lazy_scoring, "cube-pruning-lazy-scoring", false); + return true; + } + +} diff --git a/moses/parameters/CubePruningOptions.h b/moses/parameters/CubePruningOptions.h new file mode 100644 index 000000000..29959f4fe --- /dev/null +++ b/moses/parameters/CubePruningOptions.h @@ -0,0 +1,20 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "moses/Parameter.h" +namespace Moses +{ + + struct + CubePruningOptions + { + size_t pop_limit; + size_t diversity; + bool lazy_scoring; + + bool init(Parameter const& param); + CubePruningOptions(Parameter const& param); + CubePruningOptions() {}; + }; + +} diff --git a/moses/parameters/InputOptions.cpp b/moses/parameters/InputOptions.cpp new file mode 100644 index 000000000..206be4660 --- /dev/null +++ b/moses/parameters/InputOptions.cpp @@ -0,0 +1,65 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "InputOptions.h" +#include +#include +#include "moses/StaticData.h" + +namespace Moses { + + InputOptions:: + InputOptions() + { + xml_brackets.first = "<"; + xml_brackets.second = ">"; + input_type = SentenceInput; + } + + bool + InputOptions:: + init(Parameter const& param) + { + param.SetParameter(input_type, "inputtype", SentenceInput); + if (input_type == SentenceInput) + { VERBOSE(2, "input type is: text input"); } + else if (input_type == ConfusionNetworkInput) + { VERBOSE(2, "input type is: confusion net"); } + else if (input_type == WordLatticeInput) + { VERBOSE(2, "input type is: word lattice"); } + else if (input_type == TreeInputType) + { VERBOSE(2, "input type is: tree"); } + else if (input_type == TabbedSentenceInput) + { VERBOSE(2, "input type is: tabbed sentence"); } + else if (input_type == ForestInputType) + { VERBOSE(2, "input type is: forest"); } + + param.SetParameter(continue_partial_translation, + "continue-partial-translation", false); + param.SetParameter(default_non_term_only_for_empty_range, + "default-non-term-for-empty-range-only", false); + + param.SetParameter(xml_policy, "xml-input", XmlPassThrough); + + // specify XML tags opening and closing brackets for XML option + // Do we really want this to be configurable???? UG + const PARAM_VEC *pspec; + pspec = param.GetParam("xml-brackets"); + if (pspec && pspec->size()) + { + std::vector brackets = Tokenize(pspec->at(0)); + if(brackets.size()!=2) + { + std::cerr << "invalid xml-brackets value, " + << "must specify exactly 2 blank-delimited strings " + << "for XML tags opening and closing brackets" << std::endl; + exit(1); + } + xml_brackets.first= brackets[0]; + xml_brackets.second=brackets[1]; + VERBOSE(1,"XML tags opening and closing brackets for XML input are: " + << xml_brackets.first << " and " + << xml_brackets.second << std::endl); + } + return true; + } + +} diff --git a/moses/parameters/InputOptions.h b/moses/parameters/InputOptions.h new file mode 100644 index 000000000..c5379bfae --- /dev/null +++ b/moses/parameters/InputOptions.h @@ -0,0 +1,25 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "moses/Parameter.h" +#include +namespace Moses +{ + struct + InputOptions + { + bool continue_partial_translation; + bool default_non_term_only_for_empty_range; // whatever that means + InputTypeEnum input_type; + XmlInputType xml_policy; // pass through, ignore, exclusive, inclusive + + std::pair xml_brackets; + // strings to use as XML tags' opening and closing brackets. 
+ // Default are "<" and ">" + + bool init(Parameter const& param); + InputOptions(); + }; + +} + diff --git a/moses/parameters/NBestOptions.h b/moses/parameters/NBestOptions.h index bc125c2b6..f7274a30a 100644 --- a/moses/parameters/NBestOptions.h +++ b/moses/parameters/NBestOptions.h @@ -1,4 +1,5 @@ -// -*- mode: c++; cc-style: gnu -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once #include namespace Moses diff --git a/moses/parameters/ReorderingOptions.cpp b/moses/parameters/ReorderingOptions.cpp new file mode 100644 index 000000000..016c4ab0d --- /dev/null +++ b/moses/parameters/ReorderingOptions.cpp @@ -0,0 +1,21 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "ReorderingOptions.h" + +namespace Moses { + + ReorderingOptions:: + ReorderingOptions(Parameter const& param) + { + init(param); + } + + bool + ReorderingOptions:: + init(Parameter const& param) + { + param.SetParameter(max_distortion, "distortion-limit", -1); + param.SetParameter(monotone_at_punct, "monotone-at-punctuation", false); + param.SetParameter(use_early_distortion_cost, "early-distortion-cost", false); + return true; + } +} diff --git a/moses/parameters/ReorderingOptions.h b/moses/parameters/ReorderingOptions.h new file mode 100644 index 000000000..e18c7deab --- /dev/null +++ b/moses/parameters/ReorderingOptions.h @@ -0,0 +1,20 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "moses/Parameter.h" +namespace Moses +{ + + struct + ReorderingOptions + { + int max_distortion; + bool monotone_at_punct; + bool use_early_distortion_cost; + bool init(Parameter const& param); + ReorderingOptions(Parameter const& param); + ReorderingOptions() {} + }; + +} + diff --git a/moses/parameters/SearchOptions.cpp b/moses/parameters/SearchOptions.cpp new file mode 100644 index 000000000..995d0b83e --- /dev/null +++ b/moses/parameters/SearchOptions.cpp @@ -0,0 +1,50 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "SearchOptions.h" + +namespace Moses +{ + SearchOptions:: + SearchOptions(Parameter const& param) + : stack_diversity(0) + { + init(param); + } + + bool + SearchOptions:: + init(Parameter const& param) + { + param.SetParameter(algo, "search-algorithm", Normal); + param.SetParameter(stack_size, "stack", DEFAULT_MAX_HYPOSTACK_SIZE); + param.SetParameter(stack_diversity, "stack-diversity", size_t(0)); + param.SetParameter(beam_width, "beam-threshold", DEFAULT_BEAM_WIDTH); + param.SetParameter(early_discarding_threshold, "early-discarding-threshold", + DEFAULT_EARLY_DISCARDING_THRESHOLD); + param.SetParameter(timeout, "time-out", 0); + param.SetParameter(max_phrase_length, "max-phrase-length", + DEFAULT_MAX_PHRASE_LENGTH); + param.SetParameter(trans_opt_threshold, "translation-option-threshold", + DEFAULT_TRANSLATION_OPTION_THRESHOLD); + param.SetParameter(max_trans_opt_per_cov, "max-trans-opt-per-coverage", + DEFAULT_MAX_TRANS_OPT_SIZE); + param.SetParameter(max_partial_trans_opt, "max-partial-trans-opt", + DEFAULT_MAX_PART_TRANS_OPT_SIZE); + + + // transformation to log of a few scores + beam_width = TransformScore(beam_width); + trans_opt_threshold = TransformScore(trans_opt_threshold); + early_discarding_threshold = TransformScore(early_discarding_threshold); + return true; + } + + bool + is_syntax(SearchAlgorithm algo) + { + return (algo == CYKPlus || algo == ChartIncremental || + algo == SyntaxS2T || algo == SyntaxT2S || + algo == SyntaxF2S || algo == SyntaxT2S_SCFG); + } + + +} diff --git 
a/moses/parameters/SearchOptions.h b/moses/parameters/SearchOptions.h new file mode 100644 index 000000000..050eeb54e --- /dev/null +++ b/moses/parameters/SearchOptions.h @@ -0,0 +1,44 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "moses/Parameter.h" +namespace Moses +{ + + bool is_syntax(SearchAlgorithm algo); + + struct + SearchOptions + { + SearchAlgorithm algo; + + // stack decoding + size_t stack_size; // maxHypoStackSize; + size_t stack_diversity; // minHypoStackDiversity; + + size_t max_phrase_length; + size_t max_trans_opt_per_cov; + size_t max_partial_trans_opt; + // beam search + float beam_width; + + int timeout; + + // reordering options + // bool reorderingConstraint; //! use additional reordering constraints + // bool useEarlyDistortionCost; + + float early_discarding_threshold; + float trans_opt_threshold; + + bool init(Parameter const& param); + SearchOptions(Parameter const& param); + SearchOptions() {} + + bool UseEarlyDiscarding() const { + return early_discarding_threshold != -std::numeric_limits::infinity(); + } + + }; + +} diff --git a/regression-testing/run-single-test.perl b/regression-testing/run-single-test.perl index 232c2392d..37c48b495 100755 --- a/regression-testing/run-single-test.perl +++ b/regression-testing/run-single-test.perl @@ -57,7 +57,7 @@ die "Cannot locate input at $input" unless (-f $input); my $local_moses_ini = MosesRegressionTesting::get_localized_moses_ini($conf, $data_dir, $results_dir); my ($nbestfile,$nbestsize) = MosesRegressionTesting::get_nbestlist($conf); -if (defined($nbestsize) && $nbestsize > 0){ +if (defined($nbestsize) && $nbestsize > 0) { $NBEST=$nbestsize; } From 6c1d9e2431a1dbc270ea5ab401f47c2c32621103 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Thu, 6 Aug 2015 22:52:34 +0100 Subject: [PATCH 224/286] More reorganisation of options. 
--- moses-cmd/LatticeMBRGrid.cpp | 14 ++-- moses/ChartHypothesis.cpp | 2 +- moses/Hypothesis.cpp | 6 +- moses/LatticeMBR.cpp | 35 ++++++--- moses/Manager.cpp | 16 ++-- moses/StaticData.cpp | 118 ++++++++++++++-------------- moses/StaticData.h | 120 ++++++++++++++--------------- moses/mbr.cpp | 7 +- moses/parameters/AllOptions.cpp | 25 ++++++ moses/parameters/AllOptions.h | 7 ++ moses/parameters/LMBR_Options.cpp | 24 ++++++ moses/parameters/LMBR_Options.h | 25 ++++++ moses/parameters/MBR_Options.cpp | 16 ++++ moses/parameters/MBR_Options.h | 21 +++++ moses/parameters/SearchOptions.cpp | 2 + moses/parameters/SearchOptions.h | 2 + 16 files changed, 290 insertions(+), 150 deletions(-) create mode 100644 moses/parameters/LMBR_Options.cpp create mode 100644 moses/parameters/LMBR_Options.h create mode 100644 moses/parameters/MBR_Options.cpp create mode 100644 moses/parameters/MBR_Options.h diff --git a/moses-cmd/LatticeMBRGrid.cpp b/moses-cmd/LatticeMBRGrid.cpp index 0447a16fa..0ad338975 100644 --- a/moses-cmd/LatticeMBRGrid.cpp +++ b/moses-cmd/LatticeMBRGrid.cpp @@ -159,13 +159,15 @@ int main(int argc, char* argv[]) } StaticData& SD = const_cast(StaticData::Instance()); - SD.SetUseLatticeMBR(true); + LMBR_Options& lmbr = SD.options().lmbr; + MBR_Options& mbr = SD.options().mbr; + lmbr.enabled = true; boost::shared_ptr ioWrapper(new IOWrapper); if (!ioWrapper) { throw runtime_error("Failed to initialise IOWrapper"); } - size_t nBestSize = SD.GetMBRSize(); + size_t nBestSize = mbr.size; if (nBestSize <= 0) { throw new runtime_error("Non-positive size specified for n-best list"); @@ -187,13 +189,13 @@ int main(int argc, char* argv[]) manager.CalcNBest(nBestSize, nBestList,true); //grid search BOOST_FOREACH(float const& p, pgrid) { - SD.SetLatticeMBRPrecision(p); + lmbr.precision = p; BOOST_FOREACH(float const& r, rgrid) { - SD.SetLatticeMBRPRatio(r); + lmbr.ratio = r; BOOST_FOREACH(size_t const prune_i, prune_grid) { - SD.SetLatticeMBRPruningFactor(size_t(prune_i)); + lmbr.pruning_factor = prune_i; BOOST_FOREACH(float const& scale_i, scale_grid) { - SD.SetMBRScale(scale_i); + mbr.scale = scale_i; size_t lineCount = source->GetTranslationId(); cout << lineCount << " ||| " << p << " " << r << " " << size_t(prune_i) << " " << scale_i diff --git a/moses/ChartHypothesis.cpp b/moses/ChartHypothesis.cpp index 96e7c4552..d0d6bf625 100644 --- a/moses/ChartHypothesis.cpp +++ b/moses/ChartHypothesis.cpp @@ -289,7 +289,7 @@ void ChartHypothesis::CleanupArcList() const StaticData &staticData = StaticData::Instance(); size_t nBestSize = staticData.options().nbest.nbest_size; bool distinctNBest = (staticData.options().nbest.only_distinct - || staticData.UseMBR() + || staticData.options().mbr.enabled || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphHypergraph()); diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp index bc9ff0eb0..5a7833c9f 100644 --- a/moses/Hypothesis.cpp +++ b/moses/Hypothesis.cpp @@ -363,13 +363,13 @@ CleanupArcList() */ const StaticData &staticData = StaticData::Instance(); size_t nBestSize = staticData.options().nbest.nbest_size; - bool distinctNBest = (staticData.options().nbest.only_distinct || + bool distinctNBest = (m_manager.options().nbest.only_distinct || staticData.GetLatticeSamplesSize() || - staticData.UseMBR() || + m_manager.options().mbr.enabled || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || - staticData.UseLatticeMBR()); + m_manager.options().lmbr.enabled); if 
(!distinctNBest && m_arcList->size() > nBestSize * 5) { // prune arc list only if there too many arcs diff --git a/moses/LatticeMBR.cpp b/moses/LatticeMBR.cpp index b6fa14757..46fb16242 100644 --- a/moses/LatticeMBR.cpp +++ b/moses/LatticeMBR.cpp @@ -490,13 +490,18 @@ bool Edge::operator< (const Edge& compare ) const ostream& operator<< (ostream& out, const Edge& edge) { - out << "Head: " << edge.m_headNode->GetId() << ", Tail: " << edge.m_tailNode->GetId() << ", Score: " << edge.m_score << ", Phrase: " << edge.m_targetPhrase << endl; + out << "Head: " << edge.m_headNode->GetId() + << ", Tail: " << edge.m_tailNode->GetId() + << ", Score: " << edge.m_score + << ", Phrase: " << edge.m_targetPhrase << endl; return out; } bool ascendingCoverageCmp(const Hypothesis* a, const Hypothesis* b) { - return a->GetWordsBitmap().GetNumWordsCovered() < b->GetWordsBitmap().GetNumWordsCovered(); + return (a->GetWordsBitmap().GetNumWordsCovered() + < + b->GetWordsBitmap().GetNumWordsCovered()); } void getLatticeMBRNBest(const Manager& manager, const TrellisPathList& nBestList, @@ -509,15 +514,20 @@ void getLatticeMBRNBest(const Manager& manager, const TrellisPathList& nBestList std::map < const Hypothesis*, set > outgoingHyps; map > incomingEdges; vector< float> estimatedScores; - manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores); - pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), staticData.GetLatticeMBRPruningFactor(),staticData.GetMBRScale()); + manager.GetForwardBackwardSearchGraph(&connected, &connectedList, + &outgoingHyps, &estimatedScores); + LMBR_Options const& lmbr = manager.options().lmbr; + MBR_Options const& mbr = manager.options().mbr; + pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, + manager.GetBestHypothesis(), lmbr.pruning_factor, mbr.scale); calcNgramExpectations(connectedList, incomingEdges, ngramPosteriors,true); - vector mbrThetas = staticData.GetLatticeMBRThetas(); - float p = staticData.GetLatticeMBRPrecision(); - float r = staticData.GetLatticeMBRPRatio(); - float mapWeight = staticData.GetLatticeMBRMapWeight(); - if (mbrThetas.size() == 0) { //thetas not specified on the command line, use p and r instead + vector mbrThetas = lmbr.theta; + float p = lmbr.precision; + float r = lmbr.ratio; + float mapWeight = lmbr.map_weight; + if (mbrThetas.size() == 0) { + // thetas were not specified on the command line, so use p and r instead mbrThetas.push_back(-1); //Theta 0 mbrThetas.push_back(1/(bleu_order*p)); for (size_t i = 2; i <= bleu_order; ++i) { @@ -537,7 +547,7 @@ void getLatticeMBRNBest(const Manager& manager, const TrellisPathList& nBestList for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter, ++ctr) { const TrellisPath &path = **iter; solutions.push_back(LatticeMBRSolution(path,iter==nBestList.begin())); - solutions.back().CalcScore(ngramPosteriors,mbrThetas,mapWeight); + solutions.back().CalcScore(ngramPosteriors, mbrThetas, mapWeight); sort(solutions.begin(), solutions.end(), comparator); while (solutions.size() > n) { solutions.pop_back(); @@ -568,7 +578,10 @@ const TrellisPath doConsensusDecoding(const Manager& manager, const TrellisPathL map > incomingEdges; vector< float> estimatedScores; manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores); - pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), 
staticData.GetLatticeMBRPruningFactor(),staticData.GetMBRScale()); + LMBR_Options const& lmbr = manager.options().lmbr; + MBR_Options const& mbr = manager.options().mbr; + pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, + manager.GetBestHypothesis(), lmbr.pruning_factor, mbr.scale); calcNgramExpectations(connectedList, incomingEdges, ngramExpectations,false); //expected length is sum of expected unigram counts diff --git a/moses/Manager.cpp b/moses/Manager.cpp index c16aaa407..8d67dcced 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -1492,7 +1492,7 @@ void Manager::OutputBest(OutputCollector *collector) const // MAP decoding: best hypothesis const Hypothesis* bestHypo = NULL; - if (!staticData.UseMBR()) { + if (!options().mbr.enabled) { bestHypo = GetBestHypothesis(); if (bestHypo) { if (StaticData::Instance().GetOutputHypoScore()) { @@ -1534,7 +1534,7 @@ void Manager::OutputBest(OutputCollector *collector) const // MBR decoding (n-best MBR, lattice MBR, consensus) else { // we first need the n-best translations - size_t nBestSize = staticData.GetMBRSize(); + size_t nBestSize = options().mbr.size; if (nBestSize <= 0) { cerr << "ERROR: negative size for number of MBR candidate translations not allowed (option mbr-size)" << endl; exit(1); @@ -1547,11 +1547,11 @@ void Manager::OutputBest(OutputCollector *collector) const } // lattice MBR - if (staticData.UseLatticeMBR()) { + if (options().lmbr.enabled) { if (staticData.options().nbest.enabled) { //lattice mbr nbest vector solutions; - size_t n = min(nBestSize, staticData.options().nbest.nbest_size); + size_t n = min(nBestSize, options().nbest.nbest_size); getLatticeMBRNBest(*this,nBestList,solutions,n); OutputLatticeMBRNBest(m_latticeNBestOut, solutions, translationId); } else { @@ -1566,7 +1566,7 @@ void Manager::OutputBest(OutputCollector *collector) const } // consensus decoding - else if (staticData.UseConsensusDecoding()) { + else if (options().search.consensus) { const TrellisPath &conBestHypo = doConsensusDecoding(*this,nBestList); OutputBestHypo(conBestHypo, translationId, staticData.GetReportSegmentation(), @@ -1608,15 +1608,15 @@ void Manager::OutputNBest(OutputCollector *collector) const const StaticData &staticData = StaticData::Instance(); long translationId = m_source.GetTranslationId(); - if (staticData.UseLatticeMBR()) { + if (options().lmbr.enabled) { if (staticData.options().nbest.enabled) { collector->Write(translationId, m_latticeNBestOut.str()); } } else { TrellisPathList nBestList; ostringstream out; - CalcNBest(staticData.options().nbest.nbest_size, nBestList, - staticData.options().nbest.only_distinct); + CalcNBest(options().nbest.nbest_size, nBestList, + options().nbest.only_distinct); OutputNBest(out, nBestList, staticData.GetOutputFactorOrder(), m_source.GetTranslationId(), staticData.GetReportSegmentation()); diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 0acdd7b38..200f35a5a 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -463,64 +463,64 @@ StaticData } -void -StaticData -::ini_mbr_options() -{ - // minimum Bayes risk decoding - m_parameter->SetParameter(m_mbr, "minimum-bayes-risk", false ); - m_parameter->SetParameter(m_mbrSize, "mbr-size", 200); - m_parameter->SetParameter(m_mbrScale, "mbr-scale", 1.0f); -} +// void +// StaticData +// ::ini_mbr_options() +// { +// // minimum Bayes risk decoding +// m_parameter->SetParameter(m_mbr, "minimum-bayes-risk", false ); +// m_parameter->SetParameter(m_mbrSize, "mbr-size", 200); +// 
m_parameter->SetParameter(m_mbrScale, "mbr-scale", 1.0f); +// } -void -StaticData -::ini_lmbr_options() -{ - const PARAM_VEC *params; - //lattice mbr - m_parameter->SetParameter(m_useLatticeMBR, "lminimum-bayes-risk", false ); - if (m_useLatticeMBR && m_mbr) { - cerr << "Error: Cannot use both n-best mbr and lattice mbr together" << endl; - exit(1); - } - // lattice MBR - if (m_useLatticeMBR) m_mbr = true; +// void +// StaticData +// ::ini_lmbr_options() +// { +// const PARAM_VEC *params; +// //lattice mbr +// // m_parameter->SetParameter(m_useLatticeMBR, "lminimum-bayes-risk", false ); +// // if (m_useLatticeMBR && m_mbr) { +// // cerr << "Error: Cannot use both n-best mbr and lattice mbr together" << endl; +// // exit(1); +// // } +// // // lattice MBR +// // if (m_useLatticeMBR) m_mbr = true; - m_parameter->SetParameter(m_lmbrPruning, "lmbr-pruning-factor", 30); - m_parameter->SetParameter(m_lmbrPrecision, "lmbr-p", 0.8f); - m_parameter->SetParameter(m_lmbrPRatio, "lmbr-r", 0.6f); - m_parameter->SetParameter(m_lmbrMapWeight, "lmbr-map-weight", 0.0f); - m_parameter->SetParameter(m_useLatticeHypSetForLatticeMBR, "lattice-hypo-set", false ); +// m_parameter->SetParameter(m_lmbrPruning, "lmbr-pruning-factor", 30); +// m_parameter->SetParameter(m_lmbrPrecision, "lmbr-p", 0.8f); +// m_parameter->SetParameter(m_lmbrPRatio, "lmbr-r", 0.6f); +// m_parameter->SetParameter(m_lmbrMapWeight, "lmbr-map-weight", 0.0f); +// m_parameter->SetParameter(m_useLatticeHypSetForLatticeMBR, "lattice-hypo-set", false ); - params = m_parameter->GetParam("lmbr-thetas"); - if (params) { - m_lmbrThetas = Scan(*params); - } +// params = m_parameter->GetParam("lmbr-thetas"); +// if (params) { +// m_lmbrThetas = Scan(*params); +// } -} +// } -void -StaticData -::ini_consensus_decoding_options() -{ - //consensus decoding - m_parameter->SetParameter(m_useConsensusDecoding, "consensus-decoding", false ); - if (m_useConsensusDecoding && m_mbr) { - cerr<< "Error: Cannot use consensus decoding together with mbr" << endl; - exit(1); - } - if (m_useConsensusDecoding) m_mbr=true; -} +// void +// StaticData +// ::ini_consensus_decoding_options() +// { +// //consensus decoding +// m_parameter->SetParameter(m_useConsensusDecoding, "consensus-decoding", false ); +// if (m_useConsensusDecoding && m_mbr) { +// cerr<< "Error: Cannot use consensus decoding together with mbr" << endl; +// exit(1); +// } +// if (m_useConsensusDecoding) m_mbr=true; +// } -void -StaticData -::ini_mira_options() -{ - //mira training - m_parameter->SetParameter(m_mira, "mira", false ); -} +// void +// StaticData +// ::ini_mira_options() +// { +// //mira training +// m_parameter->SetParameter(m_mira, "mira", false ); +// } bool StaticData::LoadData(Parameter *parameter) { @@ -559,15 +559,19 @@ bool StaticData::LoadData(Parameter *parameter) // ini_cube_pruning_options(); ini_oov_options(); - ini_mbr_options(); - ini_lmbr_options(); - ini_consensus_decoding_options(); + // ini_mbr_options(); + // ini_lmbr_options(); + // ini_consensus_decoding_options(); - ini_mira_options(); + // ini_mira_options(); // set m_nbest_options.enabled = true if necessary: - if (m_mbr || m_useLatticeMBR || m_outputSearchGraph || m_outputSearchGraphSLF - || m_mira || m_outputSearchGraphHypergraph || m_useConsensusDecoding + if (m_options.mbr.enabled + || m_options.mira + || m_options.search.consensus + || m_outputSearchGraph + || m_outputSearchGraphSLF + || m_outputSearchGraphHypergraph #ifdef HAVE_PROTOBUF || m_outputSearchGraphPB #endif diff --git a/moses/StaticData.h 
b/moses/StaticData.h index e20b9390a..991bdb014 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -145,18 +145,18 @@ protected: XmlInputType m_xmlInputType; //! method for handling sentence XML input std::pair m_xmlBrackets; //! strings to use as XML tags' opening and closing brackets. Default are "<" and ">" - bool m_mbr; //! use MBR decoder - bool m_useLatticeMBR; //! use MBR decoder - bool m_mira; // do mira training - bool m_useConsensusDecoding; //! Use Consensus decoding (DeNero et al 2009) - size_t m_mbrSize; //! number of translation candidates considered - float m_mbrScale; //! scaling factor for computing marginal probability of candidate translation - size_t m_lmbrPruning; //! average number of nodes per word wanted in pruned lattice - std::vector m_lmbrThetas; //! theta(s) for lattice mbr calculation - bool m_useLatticeHypSetForLatticeMBR; //! to use nbest as hypothesis set during lattice MBR - float m_lmbrPrecision; //! unigram precision theta - see Tromble et al 08 for more details - float m_lmbrPRatio; //! decaying factor for ngram thetas - see Tromble et al 08 for more details - float m_lmbrMapWeight; //! Weight given to the map solution. See Kumar et al 09 for details + // bool m_mbr; //! use MBR decoder + // bool m_useLatticeMBR; //! use MBR decoder + // bool m_mira; // do mira training + // bool m_useConsensusDecoding; //! Use Consensus decoding (DeNero et al 2009) + // size_t m_mbrSize; //! number of translation candidates considered + // float m_mbrScale; //! scaling factor for computing marginal probability of candidate translation + // size_t m_lmbrPruning; //! average number of nodes per word wanted in pruned lattice + // std::vector m_lmbrThetas; //! theta(s) for lattice mbr calculation + // bool m_useLatticeHypSetForLatticeMBR; //! to use nbest as hypothesis set during lattice MBR + // float m_lmbrPrecision; //! unigram precision theta - see Tromble et al 08 for more details + // float m_lmbrPRatio; //! decaying factor for ngram thetas - see Tromble et al 08 for more details + // float m_lmbrMapWeight; //! Weight given to the map solution. See Kumar et al 09 for details size_t m_lmcache_cleanup_threshold; //! 
number of translations after which LM claenup is performed (0=never, N=after N translations; default is 1) bool m_lmEnableOOVFeature; @@ -512,55 +512,55 @@ public: const std::string& GetFactorDelimiter() const { return m_factorDelimiter; } - bool UseMBR() const { - return m_mbr; - } - bool UseLatticeMBR() const { - return m_useLatticeMBR ; - } - bool UseConsensusDecoding() const { - return m_useConsensusDecoding; - } - void SetUseLatticeMBR(bool flag) { - m_useLatticeMBR = flag; - } - size_t GetMBRSize() const { - return m_mbrSize; - } - float GetMBRScale() const { - return m_mbrScale; - } - void SetMBRScale(float scale) { - m_mbrScale = scale; - } - size_t GetLatticeMBRPruningFactor() const { - return m_lmbrPruning; - } - void SetLatticeMBRPruningFactor(size_t prune) { - m_lmbrPruning = prune; - } - const std::vector& GetLatticeMBRThetas() const { - return m_lmbrThetas; - } - bool UseLatticeHypSetForLatticeMBR() const { - return m_useLatticeHypSetForLatticeMBR; - } - float GetLatticeMBRPrecision() const { - return m_lmbrPrecision; - } - void SetLatticeMBRPrecision(float p) { - m_lmbrPrecision = p; - } - float GetLatticeMBRPRatio() const { - return m_lmbrPRatio; - } - void SetLatticeMBRPRatio(float r) { - m_lmbrPRatio = r; - } + // bool UseMBR() const { + // return m_mbr; + // } + // bool UseLatticeMBR() const { + // return m_useLatticeMBR ; + // } + // bool UseConsensusDecoding() const { + // return m_useConsensusDecoding; + // } + // void SetUseLatticeMBR(bool flag) { + // m_useLatticeMBR = flag; + // } + // size_t GetMBRSize() const { + // return m_mbrSize; + // } + // float GetMBRScale() const { + // return m_mbrScale; + // } + // void SetMBRScale(float scale) { + // m_mbrScale = scale; + // } + // size_t GetLatticeMBRPruningFactor() const { + // return m_lmbrPruning; + // } + // void SetLatticeMBRPruningFactor(size_t prune) { + // m_lmbrPruning = prune; + // } + // const std::vector& GetLatticeMBRThetas() const { + // return m_lmbrThetas; + // } + // bool UseLatticeHypSetForLatticeMBR() const { + // return m_useLatticeHypSetForLatticeMBR; + // } + // float GetLatticeMBRPrecision() const { + // return m_lmbrPrecision; + // } + // void SetLatticeMBRPrecision(float p) { + // m_lmbrPrecision = p; + // } + // float GetLatticeMBRPRatio() const { + // return m_lmbrPRatio; + // } + // void SetLatticeMBRPRatio(float r) { + // m_lmbrPRatio = r; + // } - float GetLatticeMBRMapWeight() const { - return m_lmbrMapWeight; - } + // float GetLatticeMBRMapWeight() const { + // return m_lmbrMapWeight; + // } // bool UseTimeout() const { // return m_timeout; diff --git a/moses/mbr.cpp b/moses/mbr.cpp index 66dac47f7..e49b1c5d9 100644 --- a/moses/mbr.cpp +++ b/moses/mbr.cpp @@ -92,7 +92,7 @@ float calculate_score(const vector< vector > & sents, int ref, in const TrellisPath doMBR(const TrellisPathList& nBestList) { float marginal = 0; - + float mbr_scale = StaticData::Instance().options().mbr.scale; vector joint_prob_vec; vector< vector > translations; float joint_prob; @@ -104,14 +104,13 @@ const TrellisPath doMBR(const TrellisPathList& nBestList) float maxScore = -1e20; for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) { const TrellisPath &path = **iter; - float score = StaticData::Instance().GetMBRScale() - * path.GetScoreBreakdown()->GetWeightedScore(); + float score = mbr_scale * path.GetScoreBreakdown()->GetWeightedScore(); if (maxScore < score) maxScore = score; } for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) { const TrellisPath &path = **iter; - joint_prob = 
UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown()->GetWeightedScore() - maxScore); + joint_prob = UntransformScore(mbr_scale * path.GetScoreBreakdown()->GetWeightedScore() - maxScore); marginal += joint_prob; joint_prob_vec.push_back(joint_prob); diff --git a/moses/parameters/AllOptions.cpp b/moses/parameters/AllOptions.cpp index d194ac08b..fd4417fca 100644 --- a/moses/parameters/AllOptions.cpp +++ b/moses/parameters/AllOptions.cpp @@ -19,6 +19,11 @@ namespace Moses if (!reordering.init(param)) return false; if (!context.init(param)) return false; if (!input.init(param)) return false; + if (!mbr.init(param)) return false; + if (!lmbr.init(param)) return false; + + param.SetParameter(mira, "mira", false); + return sanity_check(); } @@ -26,6 +31,26 @@ namespace Moses AllOptions:: sanity_check() { + using namespace std; + if (lmbr.enabled) + { + if (mbr.enabled) + { + cerr << "Error: Cannot use both n-best mbr and lattice mbr together" << endl; + return false; + } + mbr.enabled = true; + } + if (search.consensus) + { + if (mbr.enabled) + { + cerr << "Error: Cannot use consensus decoding together with mbr" << endl; + return false; + } + mbr.enabled = true; + } + return true; } } diff --git a/moses/parameters/AllOptions.h b/moses/parameters/AllOptions.h index 54055e77b..9fa3bc514 100644 --- a/moses/parameters/AllOptions.h +++ b/moses/parameters/AllOptions.h @@ -8,6 +8,8 @@ #include "ReorderingOptions.h" #include "ContextParameters.h" #include "InputOptions.h" +#include "MBR_Options.h" +#include "LMBR_Options.h" namespace Moses { @@ -20,6 +22,11 @@ namespace Moses ReorderingOptions reordering; ContextParameters context; InputOptions input; + MBR_Options mbr; + LMBR_Options lmbr; + + bool mira; + // StackOptions stack; // BeamSearchOptions beam; bool init(Parameter const& param); diff --git a/moses/parameters/LMBR_Options.cpp b/moses/parameters/LMBR_Options.cpp new file mode 100644 index 000000000..808be5f18 --- /dev/null +++ b/moses/parameters/LMBR_Options.cpp @@ -0,0 +1,24 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "LMBR_Options.h" + +namespace Moses { + + bool + LMBR_Options:: + init(Parameter const& param) + { + param.SetParameter(enabled, "lminimum-bayes-risk", false); + + param.SetParameter(ratio, "lmbr-r", 0.6f); + param.SetParameter(precision, "lmbr-p", 0.8f); + param.SetParameter(map_weight, "lmbr-map-weight", 0.0f); + param.SetParameter(pruning_factor, "lmbr-pruning-factor", size_t(30)); + param.SetParameter(use_lattice_hyp_set, "lattice-hypo-set", false); + + PARAM_VEC const* params = param.GetParam("lmbr-thetas"); + if (params) theta = Scan(*params); + + return true; + } + +} diff --git a/moses/parameters/LMBR_Options.h b/moses/parameters/LMBR_Options.h new file mode 100644 index 000000000..29dee3c2e --- /dev/null +++ b/moses/parameters/LMBR_Options.h @@ -0,0 +1,25 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include +#include "moses/Parameter.h" +namespace Moses +{ + + // Options for mimum bayes risk decoding + struct + LMBR_Options + { + bool enabled; + bool use_lattice_hyp_set; //! to use nbest as hypothesis set during lattice MBR + float precision; //! unigram precision theta - see Tromble et al 08 for more details + float ratio; //! decaying factor for ngram thetas - see Tromble et al 08 + float map_weight; //! Weight given to the map solution. See Kumar et al 09 + size_t pruning_factor; //! average number of nodes per word wanted in pruned lattice + std::vector theta; //! 
theta(s) for lattice mbr calculation + bool init(Parameter const& param); + LMBR_Options() {} + }; + +} + diff --git a/moses/parameters/MBR_Options.cpp b/moses/parameters/MBR_Options.cpp new file mode 100644 index 000000000..9a9fbc966 --- /dev/null +++ b/moses/parameters/MBR_Options.cpp @@ -0,0 +1,16 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "MBR_Options.h" + +namespace Moses { + +bool +MBR_Options:: +init(Parameter const& param) +{ + param.SetParameter(enabled, "minimum-bayes-risk", false); + param.SetParameter(size, "mbr-size", 200); + param.SetParameter(scale, "mbr-scale", 1.0f); + return true; +} + +} diff --git a/moses/parameters/MBR_Options.h b/moses/parameters/MBR_Options.h new file mode 100644 index 000000000..56d5b34c1 --- /dev/null +++ b/moses/parameters/MBR_Options.h @@ -0,0 +1,21 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "moses/Parameter.h" +namespace Moses +{ + + // Options for mimum bayes risk decoding + struct + MBR_Options + { + bool enabled; + size_t size; //! number of translation candidates considered + float scale; /*! scaling factor for computing marginal probability + * of candidate translation */ + bool init(Parameter const& param); + MBR_Options() {} + }; + +} + diff --git a/moses/parameters/SearchOptions.cpp b/moses/parameters/SearchOptions.cpp index 995d0b83e..7f9c2e748 100644 --- a/moses/parameters/SearchOptions.cpp +++ b/moses/parameters/SearchOptions.cpp @@ -31,6 +31,8 @@ namespace Moses DEFAULT_MAX_PART_TRANS_OPT_SIZE); + param.SetParameter(consensus, "consensus-decoding", false); + // transformation to log of a few scores beam_width = TransformScore(beam_width); trans_opt_threshold = TransformScore(trans_opt_threshold); diff --git a/moses/parameters/SearchOptions.h b/moses/parameters/SearchOptions.h index 050eeb54e..1df9d034b 100644 --- a/moses/parameters/SearchOptions.h +++ b/moses/parameters/SearchOptions.h @@ -24,6 +24,8 @@ namespace Moses int timeout; + bool consensus; //! Use Consensus decoding (DeNero et al 2009) + // reordering options // bool reorderingConstraint; //! use additional reordering constraints // bool useEarlyDistortionCost; From a2eacb9d3d73c1dd6b57b3e5175a12bbb9d6fdb4 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 7 Aug 2015 01:45:29 +0100 Subject: [PATCH 225/286] Construct OutputCollector directly from string(s). --- moses/OutputCollector.h | 42 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/moses/OutputCollector.h b/moses/OutputCollector.h index 4ca0f5ac1..71bc90369 100644 --- a/moses/OutputCollector.h +++ b/moses/OutputCollector.h @@ -42,9 +42,45 @@ namespace Moses class OutputCollector { public: - OutputCollector(std::ostream* outStream= &std::cout, std::ostream* debugStream=&std::cerr) : - m_nextOutput(0),m_outStream(outStream),m_debugStream(debugStream), - m_isHoldingOutputStream(false), m_isHoldingDebugStream(false) {} + OutputCollector(std::ostream* outStream= &std::cout, + std::ostream* debugStream=&std::cerr) + : m_nextOutput(0) + , m_outStream(outStream) + , m_debugStream(debugStream) + , m_isHoldingOutputStream(false) + , m_isHoldingDebugStream(false) {} + + OutputCollector(std::string xout, std::string xerr = "") + : m_nextOutput(0) + { + // TO DO open magic streams instead of regular ofstreams! 
[UG] + + if (xout == "/dev/stderr") { + m_outStream = &std::cerr; + m_isHoldingOutputStream = false; + } else if (xout.size() && xout != "/dev/stdout" && xout != "-") { + m_outStream = new std::ofstream(xout.c_str()); + UTIL_THROW_IF2(!m_outputStream->good(), "Failed to open output file" + << xout << std::endl); + m_isHoldingOutputStream = true; + } else { + m_outStream = &std::cout; + m_isHoldingOutputStream = false; + } + + if (xerr == "/dev/stdout") { + m_debugStream = &std::cout; + m_isHoldingDebugStream = false; + } else if (xerr.size() && xerr != "/dev/stderr") { + m_debugStream = new std::ofstream(xerr.c_str()); + UTIL_THROW_IF2(!m_debugStream->good(), "Failed to open debug stream" + << xerr << std::endl); + m_isHoldingDebugStream = true; + } else { + m_debugStream = &std::cerr; + m_isHoldingDebugStream = false; + } + } ~OutputCollector() { if (m_isHoldingOutputStream) From f19c8abcc2971ad154a72af1d29c4c23a8eb6202 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 7 Aug 2015 01:46:50 +0100 Subject: [PATCH 226/286] Updated Emacs code formatting parameters. --- moses/parameters/BookkeepingOptions.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/moses/parameters/BookkeepingOptions.h b/moses/parameters/BookkeepingOptions.h index 08bc1d59d..331f0b83c 100644 --- a/moses/parameters/BookkeepingOptions.h +++ b/moses/parameters/BookkeepingOptions.h @@ -1,15 +1,13 @@ -// -*- mode: c++; cc-style: gnu -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #include "moses/Parameter.h" -// #include - namespace Moses { -struct BookkeepingOptions { - bool need_alignment_info; - bool init(Parameter const& param); -}; - + struct BookkeepingOptions { + bool need_alignment_info; + bool init(Parameter const& param); + }; + } From 4ad6f2a5a6fc860bd889edff23e9b05c1a38a83e Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 7 Aug 2015 02:22:45 +0100 Subject: [PATCH 227/286] Added Emacs code formatting parameters. --- moses/ReorderingConstraint.h | 1 + 1 file changed, 1 insertion(+) diff --git a/moses/ReorderingConstraint.h b/moses/ReorderingConstraint.h index 05d0c2b87..bc87e7e95 100644 --- a/moses/ReorderingConstraint.h +++ b/moses/ReorderingConstraint.h @@ -1,3 +1,4 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- // $Id$ // vim:tabstop=2 From d11906abcdc43c6e5d41ff75d99b91c654a49daf Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 7 Aug 2015 03:01:24 +0100 Subject: [PATCH 228/286] Code simplification in IOWrapper. 
--- moses/IOWrapper.cpp | 133 ++++++++++++---------------------------- moses/IOWrapper.h | 10 +-- moses/OutputCollector.h | 9 +-- 3 files changed, 49 insertions(+), 103 deletions(-) diff --git a/moses/IOWrapper.cpp b/moses/IOWrapper.cpp index b36fd0644..eabf0c8c2 100644 --- a/moses/IOWrapper.cpp +++ b/moses/IOWrapper.cpp @@ -79,12 +79,12 @@ namespace Moses IOWrapper::IOWrapper() : m_nBestStream(NULL) - , m_outputWordGraphStream(NULL) - , m_outputSearchGraphStream(NULL) - , m_detailedTranslationReportingStream(NULL) - , m_unknownsStream(NULL) - , m_alignmentInfoStream(NULL) - , m_latticeSamplesStream(NULL) + // , m_outputWordGraphStream(NULL) + // , m_outputSearchGraphStream(NULL) + // , m_detailedTranslationReportingStream(NULL) + // , m_unknownsStream(NULL) + // , m_alignmentInfoStream(NULL) + // , m_latticeSamplesStream(NULL) , m_surpressSingleBestOutput(false) , m_look_ahead(0) , m_look_back(0) @@ -94,6 +94,7 @@ IOWrapper::IOWrapper() , spe_aln(NULL) { const StaticData &staticData = StaticData::Instance(); + Parameter const& P = staticData.GetParameter(); // context buffering for context-sensitive decoding m_look_ahead = staticData.options().context.look_ahead; @@ -122,98 +123,41 @@ IOWrapper::IOWrapper() } if (nBestSize > 0) { - if (nBestFilePath == "-" || nBestFilePath == "/dev/stdout") { - m_nBestStream = &std::cout; - m_nBestOutputCollector.reset(new Moses::OutputCollector(&std::cout)); + m_nBestOutputCollector.reset(new Moses::OutputCollector(nBestFilePath)); + if (m_nBestOutputCollector->OutputIsCout()) { m_surpressSingleBestOutput = true; - } else { - std::ofstream *file = new std::ofstream; - file->open(nBestFilePath.c_str()); - m_nBestStream = file; - - m_nBestOutputCollector.reset(new Moses::OutputCollector(file)); - //m_nBestOutputCollector->HoldOutputStream(); } } - // search graph output - if (staticData.GetOutputSearchGraph()) { - string fileName; - if (staticData.GetOutputSearchGraphExtended()) { - staticData.GetParameter().SetParameter(fileName, "output-search-graph-extended", ""); - } else { - staticData.GetParameter().SetParameter(fileName, "output-search-graph", ""); - } - std::ofstream *file = new std::ofstream; - m_outputSearchGraphStream = file; - file->open(fileName.c_str()); - } + std::string path; + P.SetParameter(path, "output-search-graph-extended", ""); + if (!path.size()) P.SetParameter(path, "output-search-graph", ""); + if (path.size()) m_searchGraphOutputCollector.reset(new OutputCollector(path)); - if (!staticData.GetOutputUnknownsFile().empty()) { - m_unknownsStream = new std::ofstream(staticData.GetOutputUnknownsFile().c_str()); - m_unknownsCollector.reset(new Moses::OutputCollector(m_unknownsStream)); - UTIL_THROW_IF2(!m_unknownsStream->good(), - "File for unknowns words could not be opened: " << - staticData.GetOutputUnknownsFile()); - } + P.SetParameter(path, "output-unknowns", ""); + if (path.size()) m_unknownsCollector.reset(new OutputCollector(path)); - if (!staticData.GetAlignmentOutputFile().empty()) { - m_alignmentInfoStream = new std::ofstream(staticData.GetAlignmentOutputFile().c_str()); - m_alignmentInfoCollector.reset(new Moses::OutputCollector(m_alignmentInfoStream)); - UTIL_THROW_IF2(!m_alignmentInfoStream->good(), - "File for alignment output could not be opened: " << staticData.GetAlignmentOutputFile()); - } - - if (staticData.GetOutputSearchGraph()) { - string fileName; - staticData.GetParameter().SetParameter(fileName, "output-search-graph", ""); - - std::ofstream *file = new std::ofstream; - m_outputSearchGraphStream = file; - 
file->open(fileName.c_str()); - m_searchGraphOutputCollector.reset(new Moses::OutputCollector(m_outputSearchGraphStream)); - } - - // detailed translation reporting - if (staticData.IsDetailedTranslationReportingEnabled()) { - const std::string &path = staticData.GetDetailedTranslationReportingFilePath(); - m_detailedTranslationReportingStream = new std::ofstream(path.c_str()); - m_detailedTranslationCollector.reset(new Moses::OutputCollector(m_detailedTranslationReportingStream)); - } - - if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) { - const std::string &path = staticData.GetDetailedTreeFragmentsTranslationReportingFilePath(); - m_detailedTreeFragmentsTranslationReportingStream = new std::ofstream(path.c_str()); - m_detailTreeFragmentsOutputCollector.reset(new Moses::OutputCollector(m_detailedTreeFragmentsTranslationReportingStream)); - } - - // wordgraph output - if (staticData.GetOutputWordGraph()) { - string fileName; - staticData.GetParameter().SetParameter(fileName, "output-word-graph", ""); - - std::ofstream *file = new std::ofstream; - m_outputWordGraphStream = file; - file->open(fileName.c_str()); - m_wordGraphCollector.reset(new OutputCollector(m_outputWordGraphStream)); - } + P.SetParameter(path, "alignment-output-file", ""); + if (path.size()) m_alignmentInfoCollector.reset(new OutputCollector(path)); + P.SetParameter(path, "translation-details", ""); + if (path.size()) m_detailedTranslationCollector.reset(new OutputCollector(path)); + + P.SetParameter(path, "tree-translation-details", ""); + if (path.size()) m_detailTreeFragmentsOutputCollector.reset(new OutputCollector(path)); + + P.SetParameter(path, "output-word-graph", ""); + if (path.size()) m_wordGraphCollector.reset(new OutputCollector(path)); + size_t latticeSamplesSize = staticData.GetLatticeSamplesSize(); string latticeSamplesFile = staticData.GetLatticeSamplesFilePath(); if (latticeSamplesSize) { - if (latticeSamplesFile == "-" || latticeSamplesFile == "/dev/stdout") { - m_latticeSamplesCollector.reset(new OutputCollector()); + m_latticeSamplesCollector.reset(new OutputCollector(latticeSamplesFile)); + if (m_latticeSamplesCollector->OutputIsCout()) { m_surpressSingleBestOutput = true; - } else { - m_latticeSamplesStream = new ofstream(latticeSamplesFile.c_str()); - if (!m_latticeSamplesStream->good()) { - TRACE_ERR("ERROR: Failed to open " << latticeSamplesFile << " for lattice samples" << endl); - exit(1); - } - m_latticeSamplesCollector.reset(new OutputCollector(m_latticeSamplesStream)); } } - + if (!m_surpressSingleBestOutput) { m_singleBestOutputCollector.reset(new Moses::OutputCollector(&std::cout)); } @@ -236,6 +180,7 @@ IOWrapper::IOWrapper() << "' for hypergraph output!"); fmt += string("%d.") + extension; + // input streams for simulated post-editing if (staticData.GetParameter().GetParam("spe-src")) { spe_src = new ifstream(staticData.GetParameter().GetParam("spe-src")->at(0).c_str()); spe_trg = new ifstream(staticData.GetParameter().GetParam("spe-trg")->at(0).c_str()); @@ -247,17 +192,17 @@ IOWrapper::~IOWrapper() { if (m_inputFile != NULL) delete m_inputFile; - if (m_nBestStream != NULL && !m_surpressSingleBestOutput) { + // if (m_nBestStream != NULL && !m_surpressSingleBestOutput) { // outputting n-best to file, rather than stdout. 
need to close file and delete obj - delete m_nBestStream; - } + // delete m_nBestStream; + // } - delete m_detailedTranslationReportingStream; - delete m_alignmentInfoStream; - delete m_unknownsStream; - delete m_outputSearchGraphStream; - delete m_outputWordGraphStream; - delete m_latticeSamplesStream; + // delete m_detailedTranslationReportingStream; + // delete m_alignmentInfoStream; + // delete m_unknownsStream; + // delete m_outputSearchGraphStream; + // delete m_outputWordGraphStream; + // delete m_latticeSamplesStream; } // InputType* diff --git a/moses/IOWrapper.h b/moses/IOWrapper.h index 98d558410..ee90042b0 100644 --- a/moses/IOWrapper.h +++ b/moses/IOWrapper.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- // $Id$ /*********************************************************************** @@ -86,11 +86,11 @@ protected: Moses::InputFileStream *m_inputFile; std::istream *m_inputStream; std::ostream *m_nBestStream; - std::ostream *m_outputWordGraphStream; - std::ostream *m_outputSearchGraphStream; - std::ostream *m_detailedTranslationReportingStream; + // std::ostream *m_outputWordGraphStream; + // std::auto_ptr m_outputSearchGraphStream; + // std::ostream *m_detailedTranslationReportingStream; std::ostream *m_unknownsStream; - std::ostream *m_detailedTreeFragmentsTranslationReportingStream; + // std::ostream *m_detailedTreeFragmentsTranslationReportingStream; std::ofstream *m_alignmentInfoStream; std::ofstream *m_latticeSamplesStream; diff --git a/moses/OutputCollector.h b/moses/OutputCollector.h index 71bc90369..d66c16f20 100644 --- a/moses/OutputCollector.h +++ b/moses/OutputCollector.h @@ -33,7 +33,8 @@ #include #include #include - +#include "Util.h" +#include "util/exception.hh" namespace Moses { /** @@ -60,8 +61,8 @@ public: m_isHoldingOutputStream = false; } else if (xout.size() && xout != "/dev/stdout" && xout != "-") { m_outStream = new std::ofstream(xout.c_str()); - UTIL_THROW_IF2(!m_outputStream->good(), "Failed to open output file" - << xout << std::endl); + UTIL_THROW_IF2(!m_outStream->good(), "Failed to open output file" + << xout); m_isHoldingOutputStream = true; } else { m_outStream = &std::cout; @@ -74,7 +75,7 @@ public: } else if (xerr.size() && xerr != "/dev/stderr") { m_debugStream = new std::ofstream(xerr.c_str()); UTIL_THROW_IF2(!m_debugStream->good(), "Failed to open debug stream" - << xerr << std::endl); + << xerr); m_isHoldingDebugStream = true; } else { m_debugStream = &std::cerr; From e85aa7e9327d294f44c6654bb9ee95e15e3f7d0c Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 7 Aug 2015 12:27:42 +0100 Subject: [PATCH 229/286] Post-merge bug fixes and cleanup. 
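ConfusionNet and BleuScoreFeature now ask StaticData directly whether the
decoder runs in syntax mode (SD.IsSyntax()) instead of passing the search
algorithm to the free function is_syntax(). The accessor itself is not part of
this patch; as an illustration only, it is assumed to be a thin wrapper of
roughly this form (the option field name is an assumption):

    // illustration only, not a hunk of this patch; field name assumed
    bool StaticData::IsSyntax() const {
      return is_syntax(m_options.search.algo);
    }

StaticData.cpp also drops the ini_*_options() helpers whose bodies had already
been commented out; those settings are now read by the options structs
(SearchOptions, MBR_Options, LMBR_Options) initialised through
m_options.init(*parameter).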
--- moses/ConfusionNet.cpp | 2 +- moses/FF/BleuScoreFeature.cpp | 2 +- moses/StaticData.cpp | 130 ---------------------------------- 3 files changed, 2 insertions(+), 132 deletions(-) diff --git a/moses/ConfusionNet.cpp b/moses/ConfusionNet.cpp index d9f92cd47..225f3034f 100644 --- a/moses/ConfusionNet.cpp +++ b/moses/ConfusionNet.cpp @@ -67,7 +67,7 @@ ConfusionNet() : InputType() stats.createOne(); const StaticData& SD = StaticData::Instance(); - if (is_syntax(SD.GetSearchAlgorithm())) { + if (SD.IsSyntax()) { m_defaultLabelSet.insert(SD.GetInputDefaultNonTerminal()); } UTIL_THROW_IF2(&InputFeature::Instance() == NULL, "Input feature must be specified"); diff --git a/moses/FF/BleuScoreFeature.cpp b/moses/FF/BleuScoreFeature.cpp index 626ff717d..a98964386 100644 --- a/moses/FF/BleuScoreFeature.cpp +++ b/moses/FF/BleuScoreFeature.cpp @@ -27,7 +27,7 @@ int BleuScoreState::Compare(const FFState& o) const if (&o == this) return 0; - if (is_syntax(StaticData::Instance().GetSearchAlgorithm())) + if (StaticData::Instance().IsSyntax()) return 0; const BleuScoreState& other = dynamic_cast(o); diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index 3750e83a6..d8ab06e09 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -397,62 +397,6 @@ StaticData m_parameter->SetParameter(m_isAlwaysCreateDirectTranslationOption, "always-create-direct-transopt", false ); } -void -StaticData -::ini_distortion_options() -{ - // // reordering constraints - // m_parameter->SetParameter(m_maxDistortion, "distortion-limit", -1); - - // m_parameter->SetParameter(m_reorderingConstraint, "monotone-at-punctuation", false ); - - // // early distortion cost - // m_parameter->SetParameter(m_useEarlyDistortionCost, "early-distortion-cost", false ); -} - -bool -StaticData -::ini_stack_decoding_options() -{ - // const PARAM_VEC *params; - // // settings for pruning - // m_parameter->SetParameter(m_maxHypoStackSize, "stack", DEFAULT_MAX_HYPOSTACK_SIZE); - - // m_minHypoStackDiversity = 0; - // params = m_parameter->GetParam("stack-diversity"); - // if (params && params->size()) { - // if (m_maxDistortion > 15) { - // std::cerr << "stack diversity > 0 is not allowed for distortion limits larger than 15"; - // return false; - // } - // if (m_inputType == WordLatticeInput) { - // std::cerr << "stack diversity > 0 is not allowed for lattice input"; - // return false; - // } - // m_minHypoStackDiversity = Scan(params->at(0)); - // } - - // m_parameter->SetParameter(m_beamWidth, "beam-threshold", DEFAULT_BEAM_WIDTH); - // m_beamWidth = TransformScore(m_beamWidth); - - // m_parameter->SetParameter(m_earlyDiscardingThreshold, "early-discarding-threshold", DEFAULT_EARLY_DISCARDING_THRESHOLD); - // m_earlyDiscardingThreshold = TransformScore(m_earlyDiscardingThreshold); - return true; -} - -void -StaticData -::ini_phrase_lookup_options() -{ - // m_parameter->SetParameter(m_translationOptionThreshold, "translation-option-threshold", DEFAULT_TRANSLATION_OPTION_THRESHOLD); - // m_translationOptionThreshold = TransformScore(m_translationOptionThreshold); - - // m_parameter->SetParameter(m_maxNoTransOptPerCoverage, "max-trans-opt-per-coverage", DEFAULT_MAX_TRANS_OPT_SIZE); - // m_parameter->SetParameter(m_maxNoPartTransOpt, "max-partial-trans-opt", DEFAULT_MAX_PART_TRANS_OPT_SIZE); - // m_parameter->SetParameter(m_maxPhraseLength, "max-phrase-length", DEFAULT_MAX_PHRASE_LENGTH); - -} - void StaticData ::ini_zombie_options() @@ -462,65 +406,6 @@ StaticData } -// void -// StaticData -// ::ini_mbr_options() -// { -// // minimum 
Bayes risk decoding -// m_parameter->SetParameter(m_mbr, "minimum-bayes-risk", false ); -// m_parameter->SetParameter(m_mbrSize, "mbr-size", 200); -// m_parameter->SetParameter(m_mbrScale, "mbr-scale", 1.0f); -// } - - -// void -// StaticData -// ::ini_lmbr_options() -// { -// const PARAM_VEC *params; -// //lattice mbr -// // m_parameter->SetParameter(m_useLatticeMBR, "lminimum-bayes-risk", false ); -// // if (m_useLatticeMBR && m_mbr) { -// // cerr << "Error: Cannot use both n-best mbr and lattice mbr together" << endl; -// // exit(1); -// // } -// // // lattice MBR -// // if (m_useLatticeMBR) m_mbr = true; - -// m_parameter->SetParameter(m_lmbrPruning, "lmbr-pruning-factor", 30); -// m_parameter->SetParameter(m_lmbrPrecision, "lmbr-p", 0.8f); -// m_parameter->SetParameter(m_lmbrPRatio, "lmbr-r", 0.6f); -// m_parameter->SetParameter(m_lmbrMapWeight, "lmbr-map-weight", 0.0f); -// m_parameter->SetParameter(m_useLatticeHypSetForLatticeMBR, "lattice-hypo-set", false ); - -// params = m_parameter->GetParam("lmbr-thetas"); -// if (params) { -// m_lmbrThetas = Scan(*params); -// } - -// } - -// void -// StaticData -// ::ini_consensus_decoding_options() -// { -// //consensus decoding -// m_parameter->SetParameter(m_useConsensusDecoding, "consensus-decoding", false ); -// if (m_useConsensusDecoding && m_mbr) { -// cerr<< "Error: Cannot use consensus decoding together with mbr" << endl; -// exit(1); -// } -// if (m_useConsensusDecoding) m_mbr=true; -// } - -// void -// StaticData -// ::ini_mira_options() -// { -// //mira training -// m_parameter->SetParameter(m_mira, "mira", false ); -// } - bool StaticData::LoadData(Parameter *parameter) { ResetUserTime(); @@ -529,10 +414,6 @@ bool StaticData::LoadData(Parameter *parameter) const PARAM_VEC *params; m_options.init(*parameter); - // m_context_parameters.init(*parameter); - - // to cube or not to cube - // m_parameter->SetParameter(m_searchAlgorithm, "search-algorithm", Normal); if (IsSyntax()) LoadChartDecodingParameters(); @@ -542,7 +423,6 @@ bool StaticData::LoadData(Parameter *parameter) ini_factor_maps(); ini_input_options(); m_bookkeeping_options.init(*parameter); - // m_nbest_options.init(*parameter); if (!ini_output_options()) return false; // threading etc. 
@@ -552,17 +432,7 @@ bool StaticData::LoadData(Parameter *parameter) ini_compact_table_options(); // search - ini_distortion_options(); - if (!ini_stack_decoding_options()) return false; - ini_phrase_lookup_options(); - // ini_cube_pruning_options(); - ini_oov_options(); - // ini_mbr_options(); - // ini_lmbr_options(); - // ini_consensus_decoding_options(); - - // ini_mira_options(); // set m_nbest_options.enabled = true if necessary: if (m_options.mbr.enabled From a07eb65118a6eec894e5d100cdd6f41f08039877 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 7 Aug 2015 13:35:02 +0100 Subject: [PATCH 230/286] sptr<> -> SPTR<> in preparation for merge with legacy master --- moses/TranslationModel/UG/mm/calc-coverage.cc | 4 +- moses/TranslationModel/UG/mm/custom-pt.cc | 2 +- .../UG/mm/test-dynamic-im-tsa.cc | 2 +- moses/TranslationModel/UG/mm/ug_bitext.cc | 4 +- moses/TranslationModel/UG/mm/ug_bitext.h | 70 ++++++++--------- .../TranslationModel/UG/mm/ug_bitext_agenda.h | 26 +++---- .../UG/mm/ug_bitext_agenda_job.h | 14 ++-- .../UG/mm/ug_bitext_agenda_worker.h | 4 +- .../TranslationModel/UG/mm/ug_bitext_moses.h | 18 ++--- .../TranslationModel/UG/mm/ug_bitext_pstats.h | 4 +- .../UG/mm/ug_bitext_sampler.h | 18 ++--- moses/TranslationModel/UG/mm/ug_im_bitext.cc | 4 +- moses/TranslationModel/UG/mm/ug_im_bitext.h | 26 +++---- moses/TranslationModel/UG/mm/ug_lru_cache.h | 6 +- moses/TranslationModel/UG/mm/ug_mmbitext.cc | 20 ++--- moses/TranslationModel/UG/mm/ug_mmbitext.h | 14 ++-- .../TranslationModel/UG/mm/ug_prep_phrases.h | 14 ++-- moses/TranslationModel/UG/mm/ug_tsa_base.h | 8 +- .../UG/mm/ug_tsa_tree_iterator.h | 6 +- moses/TranslationModel/UG/mm/ug_typedefs.h | 2 +- moses/TranslationModel/UG/mmsapt.cpp | 78 +++++++++---------- 21 files changed, 172 insertions(+), 172 deletions(-) diff --git a/moses/TranslationModel/UG/mm/calc-coverage.cc b/moses/TranslationModel/UG/mm/calc-coverage.cc index 83f67220d..4f02909b7 100644 --- a/moses/TranslationModel/UG/mm/calc-coverage.cc +++ b/moses/TranslationModel/UG/mm/calc-coverage.cc @@ -15,7 +15,7 @@ using namespace ugdiss; typedef L2R_Token Token; TokenIndex V; -sptr > > C(new vector >()); +SPTR > > C(new vector >()); void add_file(string fname) { @@ -34,7 +34,7 @@ main(int argc, char* argv[]) { V.setDynamic(true); add_file(argv[1]); - sptr > T(new imTtrack(C)); + SPTR > T(new imTtrack(C)); imTSA I(T,NULL,NULL); string line; while (getline(cin,line)) diff --git a/moses/TranslationModel/UG/mm/custom-pt.cc b/moses/TranslationModel/UG/mm/custom-pt.cc index 1a51aa8a4..44fc5d112 100644 --- a/moses/TranslationModel/UG/mm/custom-pt.cc +++ b/moses/TranslationModel/UG/mm/custom-pt.cc @@ -153,7 +153,7 @@ int main(int argc, char* argv[]) for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k) { uint64_t spid = m.getPid(); - sptr s = bt.lookup(m); + SPTR s = bt.lookup(m); for (size_t j = i; j <= k; ++j) cout << (*bt.V1)[snt[j]] << " "; cout << s->good << "/" diff --git a/moses/TranslationModel/UG/mm/test-dynamic-im-tsa.cc b/moses/TranslationModel/UG/mm/test-dynamic-im-tsa.cc index dd5f4c9b4..612f497a6 100644 --- a/moses/TranslationModel/UG/mm/test-dynamic-im-tsa.cc +++ b/moses/TranslationModel/UG/mm/test-dynamic-im-tsa.cc @@ -37,7 +37,7 @@ typedef L2R_Token L2R; int main() { - sptr > bt(new imBitext()); + SPTR > bt(new imBitext()); string s1,s2,aln; vector S1,S2,ALN; while (getline(cin,s1) && getline(cin,s2) && getline(cin,aln)) diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc index 809476aa9..fb75877ed 
100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext.cc @@ -45,8 +45,8 @@ namespace Moses snt_adder >:: snt_adder(vector const& s, TokenIndex& v, - sptr > >& t, - sptr > >& i) + SPTR > >& t, + SPTR > >& i) : snt(s), V(v), track(t), index(i) { } diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index de56c429e..11b08a276 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -96,8 +96,8 @@ namespace Moses { // ttasksptr const m_ttask; // size_t max_samples; boost::shared_mutex lock; - sptr bias; - sptr cache1, cache2; + SPTR bias; + SPTR cache1, cache2; std::ostream* bias_log; ContextForQuery() : bias_log(NULL) { } }; @@ -118,29 +118,29 @@ namespace Moses { mutable boost::shared_mutex m_lock; // for thread-safe operation class agenda; // for parallel sampling see ug_bitext_agenda.h - mutable sptr ag; + mutable SPTR ag; size_t m_num_workers; // number of workers available to the agenda size_t m_default_sample_size; size_t m_pstats_cache_threshold; // threshold for caching sampling results - sptr m_cache1, m_cache2; // caches for sampling results + SPTR m_cache1, m_cache2; // caches for sampling results std::vector m_docname; map m_docname2docid; // maps from doc names to ids - sptr > m_sid2docid; // maps from sentences to docs (ids) + SPTR > m_sid2docid; // maps from sentences to docs (ids) mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2; // caches for unbiased sampling; biased sampling uses the caches that // are stored locally on the translation task public: - sptr > Tx; // word alignments - sptr > T1; // token track - sptr > T2; // token track - sptr V1; // vocab - sptr V2; // vocab - sptr > I1; // indices - sptr > I2; // indices + SPTR > Tx; // word alignments + SPTR > T1; // token track + SPTR > T2; // token track + SPTR V1; // vocab + SPTR V2; // vocab + SPTR > I1; // indices + SPTR > I2; // indices /// given the source phrase sid[start:stop] // find the possible start (s1 .. s2) and end (e1 .. e2) @@ -161,11 +161,11 @@ namespace Moses { // prep2 launches sampling and returns immediately. 
// lookup (below) waits for the job to finish before it returns - sptr + SPTR prep2(iter const& phrase, int max_sample = -1) const; #ifndef NO_MOSES - sptr + SPTR prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; #endif @@ -182,12 +182,12 @@ namespace Moses { virtual void open(string const base, string const L1, string const L2) = 0; - sptr + SPTR lookup(iter const& phrase, int max_sample = -1) const; void prep(iter const& phrase) const; #ifndef NO_MOSES - sptr + SPTR lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; void prep(ttasksptr const& ttask, iter const& phrase) const; #endif @@ -199,13 +199,13 @@ namespace Moses { virtual size_t revision() const { return 0; } - sptr + SPTR loadSentenceBias(string const& fname) const; - sptr + SPTR SetupDocumentBias(string const& bserver, string const& text, std::ostream* log) const; - sptr + SPTR SetupDocumentBias(map context_weights, std::ostream* log) const; void @@ -242,11 +242,11 @@ namespace Moses { } template - sptr + SPTR Bitext:: loadSentenceBias(string const& fname) const { - sptr ret(new SentenceBias(T1->size())); + SPTR ret(new SentenceBias(T1->size())); ifstream in(fname.c_str()); size_t i = 0; float v; while (in>>v) (*ret)[i++] = v; @@ -333,11 +333,11 @@ namespace Moses { typedef L2R_Token TKN; std::vector const & snt; TokenIndex & V; - sptr > & track; - sptr > & index; + SPTR > & track; + SPTR > & index; public: snt_adder(std::vector const& s, TokenIndex& v, - sptr >& t, sptr >& i); + SPTR >& t, SPTR >& i); void operator()(); }; @@ -467,12 +467,12 @@ namespace Moses { } template - sptr + SPTR Bitext:: SetupDocumentBias ( string const& bserver, string const& text, std::ostream* log ) const { - sptr ret; + SPTR ret; UTIL_THROW_IF2(m_sid2docid == NULL, "Document bias requested but no document map loaded."); ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid, @@ -481,12 +481,12 @@ namespace Moses { } template - sptr + SPTR Bitext:: SetupDocumentBias ( std::map context_weights, std::ostream* log ) const { - sptr ret; + SPTR ret; UTIL_THROW_IF2(m_sid2docid == NULL, "Document bias requested but no document map loaded."); ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid, @@ -509,14 +509,14 @@ namespace Moses { // and waits until the sampling is finished before it returns. // This allows sampling in the background template - sptr + SPTR Bitext ::prep2 (iter const& phrase, int max_sample) const { if (max_sample < 0) max_sample = m_default_sample_size; - sptr bias; - sptr cache; + SPTR bias; + SPTR cache; // - no caching for rare phrases and special requests (max_sample) // (still need to test what a good caching threshold is ...) // - use the task-specific cache when there is a sampling bias @@ -526,8 +526,8 @@ namespace Moses { cache = (phrase.root == I1.get() ? 
m_cache1 : m_cache2); } - sptr ret; - sptr const* cached; + SPTR ret; + SPTR const* cached; if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached) return *cached; @@ -549,7 +549,7 @@ namespace Moses { class pstats2pplist { Ttrack const& m_other; - sptr m_pstats; + SPTR m_pstats; std::vector >& m_pplist; typename PhrasePair::Scorer const* m_scorer; PhrasePair m_pp; @@ -562,7 +562,7 @@ namespace Moses { // CONSTRUCTOR pstats2pplist(typename TSA::tree_iterator const& m, Ttrack const& other, - sptr const& ps, + SPTR const& ps, std::vector >& dest, typename PhrasePair::Scorer const* scorer) : m_other(other) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h index 72e6c8638..5e899e0b4 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h @@ -14,8 +14,8 @@ public: class worker; private: boost::mutex lock; - std::list > joblist; - std::vector > workers; + std::list > joblist; + std::vector > workers; bool shutdown; size_t doomed; @@ -30,15 +30,15 @@ public: void add_workers(int n); - sptr + SPTR add_job(Bitext const* const theBitext, typename TSA::tree_iterator const& phrase, - size_t const max_samples, sptr const& bias); + size_t const max_samples, SPTR const& bias); // add_job(Bitext const* const theBitext, // typename TSA::tree_iterator const& phrase, // size_t const max_samples, SamplingBias const* const bias); - sptr + SPTR get_job(); }; @@ -82,23 +82,23 @@ void Bitext else while (int(workers.size()) < target) { - sptr w(new boost::thread(worker(*this))); + SPTR w(new boost::thread(worker(*this))); workers.push_back(w); } } template -sptr Bitext +SPTR Bitext ::agenda ::add_job(Bitext const* const theBitext, typename TSA::tree_iterator const& phrase, - size_t const max_samples, sptr const& bias) + size_t const max_samples, SPTR const& bias) { boost::unique_lock lk(this->lock); static boost::posix_time::time_duration nodelay(0,0,0,0); bool fwd = phrase.root == bt.I1.get(); - sptr j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2, + SPTR j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2, max_samples, fwd, bias)); j->stats->register_worker(); @@ -118,7 +118,7 @@ sptr Bitext --doomed; } else - workers[i++] = sptr(new boost::thread(worker(*this))); + workers[i++] = SPTR(new boost::thread(worker(*this))); } else ++i; } @@ -127,13 +127,13 @@ sptr Bitext } template -sptr::agenda::job> +SPTR::agenda::job> Bitext ::agenda ::get_job() { // cerr << workers.size() << " workers on record" << std::endl; - sptr ret; + SPTR ret; if (this->shutdown) return ret; boost::unique_lock lock(this->lock); if (this->doomed) @@ -142,7 +142,7 @@ Bitext return ret; } - typename list >::iterator j = joblist.begin(); + typename list >::iterator j = joblist.begin(); while (j != joblist.end()) { if ((*j)->done()) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h index 5975edd6f..b13222189 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h @@ -23,7 +23,7 @@ job public: size_t workers; // how many workers are working on this job? 
- sptr const> root; // root of the underlying suffix array + SPTR const> root; // root of the underlying suffix array char const* next; // next position to read from char const* stop; // end of index range size_t max_samples; // how many samples to extract at most @@ -32,8 +32,8 @@ public: */ size_t len; // phrase length bool fwd; // if true, source phrase is L1 - sptr stats; // stores statistics collected during sampling - sptr const m_bias; // sentence-level bias for sampling + SPTR stats; // stores statistics collected during sampling + SPTR const m_bias; // sentence-level bias for sampling float bias_total; bool nextSample(uint64_t & sid, uint64_t & offset); // select next occurrence @@ -45,8 +45,8 @@ public: bool done() const; job(Bitext const* const theBitext, typename TSA::tree_iterator const& m, - sptr > const& r, size_t maxsmpl, bool isfwd, - sptr const& bias); + SPTR > const& r, size_t maxsmpl, bool isfwd, + SPTR const& bias); ~job(); }; @@ -65,8 +65,8 @@ template Bitext::agenda::job ::job(Bitext const* const theBitext, typename TSA::tree_iterator const& m, - sptr > const& r, size_t maxsmpl, - bool isfwd, sptr const& bias) + SPTR > const& r, size_t maxsmpl, + bool isfwd, SPTR const& bias) : m_bitext(theBitext) , rnd(0) , rnddenom(rnd.max() + 1.) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h index 656adfe63..97bdfd784 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h @@ -20,7 +20,7 @@ Bitext::agenda uint64_t sid=0, offset=0; // sid and offset of source phrase size_t s1=0, s2=0, e1=0, e2=0; // soft and hard boundaries of target phrase std::vector aln; // stores phrase-pair-internal alignment - while(sptr j = ag.get_job()) + while(SPTR j = ag.get_job()) { j->stats->register_worker(); bitvector full_alignment(100*100); // Is full_alignment still needed??? @@ -73,7 +73,7 @@ Bitext::agenda for (size_t s = s1; s <= s2; ++s) { TSA const& I = j->fwd ? *ag.bt.I2 : *ag.bt.I1; - sptr b = I.find(o + s, e1 - s); + SPTR b = I.find(o + s, e1 - s); UTIL_THROW_IF2(!b || b->size() < e1-s, "target phrase not found"); for (size_t i = e1; i <= e2; ++i) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_moses.h b/moses/TranslationModel/UG/mm/ug_bitext_moses.h index 9f3db56cf..100c5e1e6 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_moses.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_moses.h @@ -5,11 +5,11 @@ namespace Moses { namespace bitext { template -sptr +SPTR Bitext:: lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const { - sptr ret = prep2(ttask, phrase, max_sample); + SPTR ret = prep2(ttask, phrase, max_sample); UTIL_THROW_IF2(!ret, "Got NULL pointer where I expected a valid pointer."); // Why were we locking here? @@ -42,17 +42,17 @@ prep(ttasksptr const& ttask, iter const& phrase) const // and waits until the sampling is finished before it returns. // This allows sampling in the background template -sptr +SPTR Bitext ::prep2 ( ttasksptr const& ttask, iter const& phrase, int max_sample) const { if (max_sample < 0) max_sample = m_default_sample_size; - sptr bias; - sptr scope = ttask->GetScope(); - sptr context = scope->get(this); + SPTR bias; + SPTR scope = ttask->GetScope(); + SPTR context = scope->get(this); if (context) bias = context->bias; - sptr cache; + SPTR cache; // - no caching for rare phrases and special requests (max_sample) // (still need to test what a good caching threshold is ...) 
// - use the task-specific cache when there is a sampling bias @@ -63,8 +63,8 @@ Bitext ? (bias ? context->cache1 : m_cache1) : (bias ? context->cache2 : m_cache2)); } - sptr ret; - sptr const* cached; + SPTR ret; + SPTR const* cached; if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached) return *cached; diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h index 6821c9e4e..583cd4d73 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h @@ -15,8 +15,8 @@ namespace Moses struct pstats { - typedef boost::unordered_map > map_t; - typedef ThreadSafeContainer, map_t> cache_t; + typedef boost::unordered_map > map_t; + typedef ThreadSafeContainer, map_t> cache_t; typedef std::vector alnvec; #if UG_BITEXT_TRACK_ACTIVE_THREADS static ThreadSafeCounter active; diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h index ffb389129..52115a4ee 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h @@ -45,19 +45,19 @@ BitextSampler : public reference_counter mutable boost::condition_variable m_ready; mutable boost::mutex m_lock; // const members - // sptr const m_bitext; // keep bitext alive while I am + // SPTR const m_bitext; // keep bitext alive while I am // should be an iptr const m_bitext; // keep bitext alive as long as I am size_t const m_plen; // length of lookup phrase bool const m_fwd; // forward or backward direction? - sptr const m_root; // root of suffix array + SPTR const m_root; // root of suffix array char const* m_next; // current position char const* m_stop; // end of search range sampling_method const m_method; // look at all/random/ranked samples - sptr const m_bias; // bias over candidates + SPTR const m_bias; // bias over candidates size_t const m_samples; // how many samples at most // non-const members - sptr m_stats; // destination for phrase stats + SPTR m_stats; // destination for phrase stats size_t m_ctr; // number of samples considered float m_total_bias; // for random sampling with bias bool m_finished; @@ -77,11 +77,11 @@ public: BitextSampler(BitextSampler const& other); BitextSampler const& operator=(BitextSampler const& other); BitextSampler(bitext const* const bitext, typename bitext::iter const& phrase, - sptr const& bias, size_t const max_samples, + SPTR const& bias, size_t const max_samples, sampling_method const method); ~BitextSampler(); bool operator()(); // run sampling - sptr stats(); + SPTR stats(); bool done() const; }; @@ -172,7 +172,7 @@ template BitextSampler:: BitextSampler(Bitext const* const bitext, typename bitext::iter const& phrase, - sptr const& bias, size_t const max_samples, + SPTR const& bias, size_t const max_samples, sampling_method const method) : m_bitext(bitext) , m_plen(phrase.size()) @@ -310,7 +310,7 @@ consider_sample(TokenPosition const& p) for (size_t s = rec.s1; s <= rec.s2; ++s) { TSA const& I = m_fwd ? 
*m_bitext->I2 : *m_bitext->I1; - sptr b = I.find(o + s, rec.e1 - s); + SPTR b = I.find(o + s, rec.e1 - s); UTIL_THROW_IF2(!b || b->size() < rec.e1 - s, "target phrase not found"); for (size_t i = rec.e1; i <= rec.e2; ++i) @@ -359,7 +359,7 @@ done() const } template -sptr +SPTR BitextSampler:: stats() { diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.cc b/moses/TranslationModel/UG/mm/ug_im_bitext.cc index 5efa3b8c4..55603e1e2 100644 --- a/moses/TranslationModel/UG/mm/ug_im_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_im_bitext.cc @@ -6,7 +6,7 @@ namespace Moses { template<> - sptr > > + SPTR > > imBitext >:: add(vector const& s1, vector const& s2, @@ -19,7 +19,7 @@ namespace Moses size_t first_new_snt = this->T1 ? this->T1->size() : 0; #endif - sptr > ret; + SPTR > ret; { boost::unique_lock guard(m_lock); ret.reset(new imBitext(*this)); diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.h b/moses/TranslationModel/UG/mm/ug_im_bitext.h index 9515ec98b..ca7c75c77 100644 --- a/moses/TranslationModel/UG/mm/ug_im_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_im_bitext.h @@ -9,25 +9,25 @@ namespace Moses template class imBitext : public Bitext { - sptr > myTx; - sptr > myT1; - sptr > myT2; - sptr > myI1; - sptr > myI2; + SPTR > myTx; + SPTR > myT1; + SPTR > myT2; + SPTR > myI1; + SPTR > myI2; static ThreadSafeCounter my_revision; public: size_t revision() const { return my_revision; } void open(string const base, string const L1, string L2); - imBitext(sptr const& V1, - sptr const& V2, + imBitext(SPTR const& V1, + SPTR const& V2, size_t max_sample = 5000, size_t num_workers=4); imBitext(size_t max_sample = 5000, size_t num_workers=4); imBitext(imBitext const& other); - // sptr > + // SPTR > // add(vector const& s1, std::vector const& s2, vector & a); - sptr > + SPTR > add(vector const& s1, std::vector const& s2, std::vector const& a) const; @@ -53,8 +53,8 @@ namespace Moses template imBitext:: - imBitext(sptr const& v1, - sptr const& v2, + imBitext(SPTR const& v1, + SPTR const& v2, size_t max_sample, size_t num_workers) : Bitext(max_sample, num_workers) { @@ -89,14 +89,14 @@ namespace Moses } template<> - sptr > > + SPTR > > imBitext >:: add(vector const& s1, vector const& s2, vector const& aln) const; template - sptr > + SPTR > imBitext:: add(vector const& s1, vector const& s2, diff --git a/moses/TranslationModel/UG/mm/ug_lru_cache.h b/moses/TranslationModel/UG/mm/ug_lru_cache.h index ca4f6c4f3..3919b2ad5 100644 --- a/moses/TranslationModel/UG/mm/ug_lru_cache.h +++ b/moses/TranslationModel/UG/mm/ug_lru_cache.h @@ -66,14 +66,14 @@ namespace lru_cache size_t size() const { return m_idx.size(); } void reserve(size_t s) { m_recs.reserve(s); } - sptr + SPTR get(KEY const& key) { uint32_t p; { // brackets needed for lock scoping boost::shared_lock rlock(m_lock); typename map_t::const_iterator i = m_idx.find(key); - if (i == m_idx.end()) return sptr(); + if (i == m_idx.end()) return SPTR(); p = i->second; } boost::lock_guard guard(m_lock); @@ -82,7 +82,7 @@ namespace lru_cache } void - set(KEY const& key, sptr const& ptr) + set(KEY const& key, SPTR const& ptr) { boost::lock_guard lock(m_lock); std::pair foo; diff --git a/moses/TranslationModel/UG/mm/ug_mmbitext.cc b/moses/TranslationModel/UG/mm/ug_mmbitext.cc index 34e3f1b1e..ef797b5c1 100644 --- a/moses/TranslationModel/UG/mm/ug_mmbitext.cc +++ b/moses/TranslationModel/UG/mm/ug_mmbitext.cc @@ -147,7 +147,7 @@ // prep2(phrase); // } -// sptr +// SPTR // mmbitext:: // prep2(iter const& phrase) // { @@ -156,20 +156,20 @@ // ag = new 
agenda(*this); // ag->add_workers(20); // } -// typedef boost::unordered_map > pcache_t; +// typedef boost::unordered_map > pcache_t; // uint64_t pid = phrase.getPid(); // pcache_t & cache(phrase.root == &this->I1 ? cache1 : cache2); -// pcache_t::value_type entry(pid,sptr()); +// pcache_t::value_type entry(pid,SPTR()); // pair foo = cache.emplace(entry); // if (foo.second) foo.first->second = ag->add_job(phrase, 1000); // return foo.first->second; // } -// sptr +// SPTR // mmbitext:: // lookup(iter const& phrase) // { -// sptr ret = prep2(phrase); +// SPTR ret = prep2(phrase); // boost::unique_lock lock(ret->lock); // while (ret->in_progress) // ret->ready.wait(lock); @@ -184,7 +184,7 @@ // { // uint64_t sid=0, offset=0, len=0; // of the source phrase // bool fwd=false; // source phrase is L1 -// sptr stats; +// SPTR stats; // size_t s1=0, s2=0, e1=0, e2=0; // for (; ag.get_task(sid,offset,len,fwd,stats); ) // { @@ -260,7 +260,7 @@ // if (ag) delete ag; // } -// sptr +// SPTR // mmbitext:: // agenda:: // add_job(mmbitext::iter const& phrase, size_t const max_samples) @@ -286,7 +286,7 @@ // { // if (workers[i]->timed_join(nodelay)) // { -// workers[i] = sptr(new boost::thread(worker(*this))); +// workers[i] = SPTR(new boost::thread(worker(*this))); // } // } // } @@ -297,7 +297,7 @@ // mmbitext:: // agenda:: // get_task(uint64_t & sid, uint64_t & offset, uint64_t & len, -// bool & fwd, sptr & stats) +// bool & fwd, SPTR & stats) // { // boost::unique_lock lock(this->lock); // if (this->doomed || this->shutdown) @@ -385,7 +385,7 @@ // { // for (int i = 0; i < n; ++i) // { -// sptr w(new boost::thread(worker(*this))); +// SPTR w(new boost::thread(worker(*this))); // workers.push_back(w); // } // } diff --git a/moses/TranslationModel/UG/mm/ug_mmbitext.h b/moses/TranslationModel/UG/mm/ug_mmbitext.h index 04c54e60b..e07d92830 100644 --- a/moses/TranslationModel/UG/mm/ug_mmbitext.h +++ b/moses/TranslationModel/UG/mm/ug_mmbitext.h @@ -51,7 +51,7 @@ namespace Moses { class job; class worker; list joblist; - std::vector > workers; + std::vector > workers; bool shutdown; size_t doomed; public: @@ -59,10 +59,10 @@ namespace Moses { agenda(mmbitext const& bitext); ~agenda(); void add_workers(int n); - sptr add_job(mmbitext::iter const& phrase, + SPTR add_job(mmbitext::iter const& phrase, size_t const max_samples); bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len, - bool & fwd, sptr & stats); + bool & fwd, SPTR & stats); }; // stores the list of unfinished jobs; @@ -85,9 +85,9 @@ namespace Moses { size_t & s1, size_t & s2, size_t & e1, size_t & e2, std::vector * core_alignment, bool const flip) const; - boost::unordered_map > cache1,cache2; + boost::unordered_map > cache1,cache2; private: - sptr + SPTR prep2(iter const& phrase); public: mmbitext(); @@ -95,7 +95,7 @@ namespace Moses { void open(string const base, string const L1, string const L2); - sptr lookup(iter const& phrase); + SPTR lookup(iter const& phrase); void prep(iter const& phrase); }; @@ -182,7 +182,7 @@ namespace Moses { size_t ctr; size_t len; bool fwd; - sptr stats; + SPTR stats; bool step(uint64_t & sid, uint64_t & offset); }; diff --git a/moses/TranslationModel/UG/mm/ug_prep_phrases.h b/moses/TranslationModel/UG/mm/ug_prep_phrases.h index 93a5ea82a..25ba4b8f7 100644 --- a/moses/TranslationModel/UG/mm/ug_prep_phrases.h +++ b/moses/TranslationModel/UG/mm/ug_prep_phrases.h @@ -14,20 +14,20 @@ template // , typename BITEXT> struct StatsCollector { typedef lru_cache::LRU_Cache< uint64_t, pstats > hcache_t; - typedef 
ThreadSafeContainer > pcache_t; - typedef map > lcache_t; + typedef ThreadSafeContainer > pcache_t; + typedef map > lcache_t; iptr const> bitext; // underlying bitext sampling_method method; // sampling method size_t sample_size; // sample size - sptr bias; // sampling bias + SPTR bias; // sampling bias hcache_t* hcache; // "history" cache pcache_t* pcache; // permanent cache size_t pcache_th; // threshold for adding items to pcache - sptr lcache; // local cache + SPTR lcache; // local cache ug::ThreadPool* tpool; // thread pool to run jobs on StatsCollector(iptr > xbitext, - sptr const xbias) + SPTR const xbias) : method(ranked_sampling) , sample_size(100) , bias(xbias) @@ -51,11 +51,11 @@ struct StatsCollector if (!r.extend(m.getToken(-1)->id())) continue; this->process(m, r); uint64_t pid = r.getPid(); - sptr stats; + SPTR stats; if (hcache) stats = hcache->get(pid); if (!stats && pcache) { - sptr const* foo = pcache->get(pid); + SPTR const* foo = pcache->get(pid); if (foo) stats = *foo; } if (!stats) // need to sample diff --git a/moses/TranslationModel/UG/mm/ug_tsa_base.h b/moses/TranslationModel/UG/mm/ug_tsa_base.h index 3eaf738ab..37597348e 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_base.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_base.h @@ -317,11 +317,11 @@ namespace ugdiss public: // virtual - sptr > + SPTR > find(TKN const* start, size_t len) const { typedef TSA_tree_iterator iter; - sptr ret(new iter(this)); + SPTR ret(new iter(this)); size_t i = 0; while (i < len && ret->extend(start[i])) ++i; if (i < len) ret.reset(); @@ -333,12 +333,12 @@ namespace ugdiss // ====================================================================== // template - // sptr > + // SPTR > // TSA:: // find(TOKEN const* start, size_t len) const // { // typedef TSA_tree_iterator iter; - // sptr ret(new iter(this)); + // SPTR ret(new iter(this)); // size_t i = 0; // while (i < len && ret->extend(start[i])) ++i; // if (i < len) ret.reset(); diff --git a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h index 993fb4b5e..634342a23 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h @@ -189,7 +189,7 @@ namespace ugdiss return this->size(); } - sptr > + SPTR > randomSample(int level, size_t N) const; }; @@ -907,14 +907,14 @@ namespace ugdiss /// randomly select up to N occurrences of the sequence template - sptr > + SPTR > TSA_tree_iterator:: randomSample(int level, size_t N) const { if (level < 0) level += lower.size(); assert(level >=0); - sptr > + SPTR > ret(new std::vector(N)); size_t m=0; // number of samples selected so far diff --git a/moses/TranslationModel/UG/mm/ug_typedefs.h b/moses/TranslationModel/UG/mm/ug_typedefs.h index 1f4a2e1e7..1c35825a0 100644 --- a/moses/TranslationModel/UG/mm/ug_typedefs.h +++ b/moses/TranslationModel/UG/mm/ug_typedefs.h @@ -30,7 +30,7 @@ namespace ugdiss typedef std::vector int_4d_table; } -#define sptr boost::shared_ptr +#define SPTR boost::shared_ptr #define iptr boost::intrusive_ptr #define scoptr boost::scoped_ptr #define rcast reinterpret_cast diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index fb2dc147b..8592ab879 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -106,7 +106,7 @@ namespace Moses void Mmsapt:: - register_ff(sptr const& ff, vector > & registry) + register_ff(SPTR const& ff, vector > & registry) { registry.push_back(ff); 
ff->setIndex(m_feature_names.size()); @@ -340,25 +340,25 @@ namespace Moses template void Mmsapt:: - check_ff(string const ffname, vector >* registry) + check_ff(string const ffname, vector >* registry) { string const& spec = param[ffname]; if (spec == "" || spec == "0") return; if (registry) { - sptr ff(new fftype(spec)); + SPTR ff(new fftype(spec)); register_ff(ff, *registry); } else if (spec[spec.size()-1] == '+') // corpus specific { - sptr ff(new fftype(spec)); + SPTR ff(new fftype(spec)); register_ff(ff, m_active_ff_fix); ff.reset(new fftype(spec)); register_ff(ff, m_active_ff_dyn); } else { - sptr ff(new fftype(spec)); + SPTR ff(new fftype(spec)); register_ff(ff, m_active_ff_common); } } @@ -367,25 +367,25 @@ namespace Moses void Mmsapt:: check_ff(string const ffname, float const xtra, - vector >* registry) + vector >* registry) { string const& spec = param[ffname]; if (spec == "" || spec == "0") return; if (registry) { - sptr ff(new fftype(xtra,spec)); + SPTR ff(new fftype(xtra,spec)); register_ff(ff, *registry); } else if (spec[spec.size()-1] == '+') // corpus specific { - sptr ff(new fftype(xtra,spec)); + SPTR ff(new fftype(xtra,spec)); register_ff(ff, m_active_ff_fix); ff.reset(new fftype(xtra,spec)); register_ff(ff, m_active_ff_dyn); } else { - sptr ff(new fftype(xtra,spec)); + SPTR ff(new fftype(xtra,spec)); register_ff(ff, m_active_ff_common); } } @@ -410,7 +410,7 @@ namespace Moses { // lexical scores string lexfile = m_bname + L1 + "-" + L2 + ".lex"; - sptr > + SPTR > ff(new PScoreLex1(param["lex_alpha"],lexfile)); register_ff(ff,m_active_ff_common); @@ -436,9 +436,9 @@ namespace Moses // this translation model) else if (fsname == "datasource") { - sptr > ffpcnt(new PScorePC("pcnt")); + SPTR > ffpcnt(new PScorePC("pcnt")); register_ff(ffpcnt,m_active_ff_common); - sptr > ffwcnt(new PScoreWC("wcnt")); + SPTR > ffwcnt(new PScoreWC("wcnt")); register_ff(ffwcnt,m_active_ff_common); } } @@ -453,9 +453,9 @@ namespace Moses { boost::unique_lock lock(m_lock); // load feature functions (i.e., load underlying data bases, if any) - BOOST_FOREACH(sptr& ff, m_active_ff_fix) ff->load(); - BOOST_FOREACH(sptr& ff, m_active_ff_dyn) ff->load(); - BOOST_FOREACH(sptr& ff, m_active_ff_common) ff->load(); + BOOST_FOREACH(SPTR& ff, m_active_ff_fix) ff->load(); + BOOST_FOREACH(SPTR& ff, m_active_ff_dyn) ff->load(); + BOOST_FOREACH(SPTR& ff, m_active_ff_common) ff->load(); #if 0 if (with_checks) { @@ -515,7 +515,7 @@ namespace Moses Phrase const& src, PhrasePair* fix, PhrasePair* dyn, - sptr > const& dynbt) const + SPTR > const& dynbt) const { UTIL_THROW_IF2(!fix && !dyn, HERE << ": Can't create target phrase from nothing."); @@ -523,12 +523,12 @@ namespace Moses PhrasePair pool = fix ? *fix : *dyn; if (fix) { - BOOST_FOREACH(sptr const& ff, m_active_ff_fix) + BOOST_FOREACH(SPTR const& ff, m_active_ff_fix) (*ff)(*btfix, *fix, &fvals); } if (dyn) { - BOOST_FOREACH(sptr const& ff, m_active_ff_dyn) + BOOST_FOREACH(SPTR const& ff, m_active_ff_dyn) (*ff)(*dynbt, *dyn, &fvals); } @@ -540,7 +540,7 @@ namespace Moses if (m.size() == fix->len2) zilch.raw2 = m.approxOccurrenceCount(); pool += zilch; - BOOST_FOREACH(sptr const& ff, m_active_ff_dyn) + BOOST_FOREACH(SPTR const& ff, m_active_ff_dyn) (*ff)(*dynbt, ff->allowPooling() ? 
pool : zilch, &fvals); } else if (dyn) @@ -550,17 +550,17 @@ namespace Moses if (m.size() == dyn->len2) zilch.raw2 = m.approxOccurrenceCount(); pool += zilch; - BOOST_FOREACH(sptr const& ff, m_active_ff_fix) + BOOST_FOREACH(SPTR const& ff, m_active_ff_fix) (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals); } if (fix) { - BOOST_FOREACH(sptr const& ff, m_active_ff_common) + BOOST_FOREACH(SPTR const& ff, m_active_ff_common) (*ff)(*btfix, pool, &fvals); } else { - BOOST_FOREACH(sptr const& ff, m_active_ff_common) + BOOST_FOREACH(SPTR const& ff, m_active_ff_common) (*ff)(*dynbt, pool, &fvals); } @@ -582,7 +582,7 @@ namespace Moses { LRModel::ModelType mdl = m_lr_func->GetModel().GetModelType(); LRModel::Direction dir = m_lr_func->GetModel().GetDirection(); - sptr scores(new Scores()); + SPTR scores(new Scores()); pool.fill_lr_vec(dir, mdl, *scores); tp->SetExtraScores(m_lr_func, scores); } @@ -632,7 +632,7 @@ namespace Moses // Reserve a local copy of the dynamic bitext in its current form. /btdyn/ // is set to a new copy of the dynamic bitext every time a sentence pair // is added. /dyn/ keeps the old bitext around as long as we need it. - sptr > dyn; + SPTR > dyn; { // braces are needed for scoping mutex lock guard! boost::unique_lock guard(m_lock); assert(btdyn); @@ -655,8 +655,8 @@ namespace Moses ? (mfix.getPid()<<1) : (mdyn.getPid()<<1)+1); // get context-specific cache of items previously looked up - sptr const& scope = ttask->GetScope(); - sptr cache = scope->get(cache_key); + SPTR const& scope = ttask->GetScope(); + SPTR cache = scope->get(cache_key); if (!cache) cache = m_cache; TPCollWrapper* ret = cache->get(phrasekey, dyn->revision()); // TO DO: we should revise the revision mechanism: we take the length @@ -675,12 +675,12 @@ namespace Moses // TO DO: have Bitexts return lists of PhrasePairs instead of pstats // no need to expand pstats at every single lookup again, especially // for btfix. 
- sptr sfix,sdyn; + SPTR sfix,sdyn; if (mfix.size() == sphrase.size()) { - sptr context = scope->get(btfix.get()); - sptr const* foo = context->cache1->get(mfix.getPid()); + SPTR context = scope->get(btfix.get()); + SPTR const* foo = context->cache1->get(mfix.getPid()); if (foo) { sfix = *foo; sfix->wait(); } else { @@ -769,8 +769,8 @@ namespace Moses Mmsapt:: set_bias_via_server(ttasksptr const& ttask) { - sptr const& scope = ttask->GetScope(); - sptr context = scope->get(btfix.get(), true); + SPTR const& scope = ttask->GetScope(); + SPTR context = scope->get(btfix.get(), true); if (m_bias_server.size() && context->bias == NULL && ttask->GetContextWindow()) { // we need to create the bias boost::unique_lock lock(context->lock); @@ -823,8 +823,8 @@ namespace Moses Mmsapt:: InitializeForInput(ttasksptr const& ttask) { - sptr const& scope = ttask->GetScope(); - sptr context = scope->get(btfix.get(), true); + SPTR const& scope = ttask->GetScope(); + SPTR context = scope->get(btfix.get(), true); // set sampling bias, depending on sampling method specified if (m_sampling_method == random_sampling) @@ -832,7 +832,7 @@ namespace Moses else UTIL_THROW2("Unknown sampling method: " << m_sampling_method); boost::unique_lock mylock(m_lock); - sptr localcache = scope->get(cache_key); + SPTR localcache = scope->get(cache_key); if (!localcache) { if (context->bias) localcache.reset(new TPCollCache(m_cache_size)); @@ -862,7 +862,7 @@ namespace Moses PrefixExists(ttasksptr const& ttask, Moses::Phrase const& phrase) const { if (phrase.GetSize() == 0) return false; - sptr const& scope = ttask->GetScope(); + SPTR const& scope = ttask->GetScope(); vector myphrase; fillIdSeq(phrase, m_ifactor, *btfix->V1, myphrase); @@ -870,7 +870,7 @@ namespace Moses TSA::tree_iterator mfix(btfix->I1.get(),&myphrase[0],myphrase.size()); if (mfix.size() == myphrase.size()) { - sptr context = scope->get(btfix.get(), true); + SPTR context = scope->get(btfix.get(), true); uint64_t pid = mfix.getPid(); if (!context->cache1->get(pid)) { @@ -884,7 +884,7 @@ namespace Moses return true; } - sptr > dyn; + SPTR > dyn; { // braces are needed for scoping lock! boost::unique_lock guard(m_lock); dyn = btdyn; @@ -905,7 +905,7 @@ namespace Moses Mmsapt ::Release(ttasksptr const& ttask, TargetPhraseCollection*& tpc) const { - sptr cache = ttask->GetScope()->get(cache_key); + SPTR cache = ttask->GetScope()->get(cache_key); TPCollWrapper* foo = static_cast(tpc); if (cache) cache->release(foo); tpc = NULL; @@ -917,7 +917,7 @@ namespace Moses string const& Mmsapt ::GetName() const { return m_name; } - // sptr + // SPTR // Mmsapt // ::setupDocumentBias(map const& bias) const // { From fd4e946ddc1b8eccf9253b9cadca3e92ab6f6be5 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 7 Aug 2015 19:14:21 +0100 Subject: [PATCH 231/286] Emacs code formatting settings. 
--- moses/ExportInterface.cpp | 1 + moses/ExportInterface.h | 1 + 2 files changed, 2 insertions(+) diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp index a884f8e3b..3cf868cdd 100644 --- a/moses/ExportInterface.cpp +++ b/moses/ExportInterface.cpp @@ -1,3 +1,4 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- // $Id: ExportInterface.cpp 3045 2010-04-05 13:07:29Z hieuhoang1972 $ /*********************************************************************** diff --git a/moses/ExportInterface.h b/moses/ExportInterface.h index 03a8b1f1c..7ed6c0838 100644 --- a/moses/ExportInterface.h +++ b/moses/ExportInterface.h @@ -1,3 +1,4 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #pragma once // $Id$ From fcd0c17af3e192cc3b119ca90679e34754af3ce1 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 7 Aug 2015 19:16:27 +0100 Subject: [PATCH 232/286] Choice of map type in pstats. --- moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h | 10 ++++------ moses/TranslationModel/UG/mm/ug_bitext_pstats.h | 6 +++--- moses/TranslationModel/UG/mm/ug_bitext_sampler.h | 4 ++-- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h index b13222189..d6e3959d5 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h @@ -139,7 +139,8 @@ int Bitext::agenda::job id_type docid = m_bias->GetClass(sid); // uint32_t k = docid < stats->indoc.size() ? stats->indoc[docid] : 0; - std::map::const_iterator m = stats->indoc.find(docid); + typedef pstats::indoc_map_t::const_iterator id_iter; + id_iter m = stats->indoc.find(docid); uint32_t k = m != stats->indoc.end() ? m->second : 0 ; // always consider candidates from dominating documents and @@ -162,8 +163,7 @@ int Bitext::agenda::job e = root->getCorpus()->sntEnd(sid); *log << docid << ":" << sid << " " << size_t(k) << "/" << N << " @" << p << " => " << d << " ["; - for (std::map::const_iterator m = stats->indoc.begin(); - m != stats->indoc.end(); ++m) + for (id_iter m = stats->indoc.begin(); m != stats->indoc.end(); ++m) { if (m != stats->indoc.begin()) *log << " "; *log << m->first << ":" << m->second; @@ -208,9 +208,7 @@ bool Bitext::agenda::job ::step(uint64_t & sid, uint64_t & offset) { // caller must lock! if (next == stop) return false; - UTIL_THROW_IF2 - ( next > stop, "Fatal error at " << HERE << ". How did that happen?" ); - // boost::lock_guard jguard(lock); // caller must lock! 
+ UTIL_THROW_IF2(next > stop, "Fatal error at " << HERE << "."); next = root->readSid(next, stop, sid); next = root->readOffset(next, stop, offset); ++ctr; diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h index 583cd4d73..4be82234c 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h @@ -18,6 +18,8 @@ namespace Moses typedef boost::unordered_map > map_t; typedef ThreadSafeContainer, map_t> cache_t; typedef std::vector alnvec; + typedef boost::unordered_map trg_map_t; + typedef boost::unordered_map indoc_map_t; #if UG_BITEXT_TRACK_ACTIVE_THREADS static ThreadSafeCounter active; #endif @@ -33,9 +35,7 @@ namespace Moses uint32_t ofwd[Moses::LRModel::NONE+1]; // distribution of fwd phrase orientations uint32_t obwd[Moses::LRModel::NONE+1]; // distribution of bwd phrase orientations - // std::vector indoc; // distribution over where samples came from - std::map indoc; - typedef std::map trg_map_t; + indoc_map_t indoc; trg_map_t trg; pstats(); ~pstats(); diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h index 52115a4ee..6c08f9d48 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h @@ -104,7 +104,7 @@ check_sample_distribution(uint64_t const& sid, uint64_t const& offset) float p = (*m_bias)[sid]; id_type docid = m_bias->GetClass(sid); - std::map::const_iterator m = m_stats->indoc.find(docid); + pstats::indoc_map_t::const_iterator m = m_stats->indoc.find(docid); uint32_t k = m != m_stats->indoc.end() ? m->second : 0 ; // always consider candidates from dominating documents and @@ -128,7 +128,7 @@ check_sample_distribution(uint64_t const& sid, uint64_t const& offset) e = m_root->getCorpus()->sntEnd(sid); *log << docid << ":" << sid << " " << size_t(k) << "/" << N << " @" << p << " => " << d << " ["; - std::map::const_iterator m; + pstats::indoc_map_t::const_iterator m; for (m = m_stats->indoc.begin(); m != m_stats->indoc.end(); ++m) { if (m != m_stats->indoc.begin()) *log << " "; From c4fc7e6610c5dd7ea94984aea63cfb1c88eac0c9 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 7 Aug 2015 19:17:37 +0100 Subject: [PATCH 233/286] Update some options via xmlrpc_c calls. 
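Add an update() method, guarded by HAVE_XMLRPC_C, to AllOptions and to the
sub-option structs it delegates to (search, cube pruning, n-best, reordering,
context, input, MBR, LMBR), so that an XML-RPC request can override selected
decoder options per request rather than relying only on the values parsed
from moses.ini. Minimal usage sketch; the helper name is illustrative, the
map's template arguments (std::string, xmlrpc_c::value) are inferred from the
server code, and the real wiring in moses/server/TranslationRequest.cpp is
not shown in full here:

    #ifdef HAVE_XMLRPC_C
    #include <map>
    #include <string>
    #include <stdexcept>
    #include <xmlrpc-c/base.hpp>
    #include "moses/StaticData.h"
    #include "moses/parameters/AllOptions.h"

    // Illustration only.
    Moses::AllOptions
    options_for_request(std::map<std::string, xmlrpc_c::value> const& params)
    {
      // Start from the global defaults ...
      Moses::AllOptions opts = Moses::StaticData::Instance().options();
      // ... then apply per-request overrides such as "mbr-size" or
      // "consensus-decoding"; update() re-runs sanity_check() and returns
      // false if the resulting combination is inconsistent.
      if (!opts.update(params))
        throw std::runtime_error("inconsistent option settings in request");
      return opts;
    }
    #endif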
--- moses/parameters/AllOptions.cpp | 19 ++++++ moses/parameters/AllOptions.h | 9 ++- moses/parameters/BeamSearchOptions.h | 3 +- moses/parameters/BookkeepingOptions.h | 5 +- moses/parameters/ContextParameters.h | 3 +- moses/parameters/CubePruningOptions.cpp | 35 +++++++++- moses/parameters/CubePruningOptions.h | 8 ++- moses/parameters/InputOptions.h | 4 +- moses/parameters/LMBR_Options.h | 3 +- moses/parameters/LookupOptions.h | 17 +++++ moses/parameters/MBR_Options.h | 3 +- moses/parameters/NBestOptions.h | 3 +- moses/parameters/OptionsBaseClass.cpp | 15 +++++ moses/parameters/OptionsBaseClass.h | 17 +++++ moses/parameters/ReorderingOptions.h | 3 +- moses/parameters/ReportingOptions.cpp | 90 +++++++++++++++++++++++++ moses/parameters/ReportingOptions.h | 42 ++++++++++++ moses/parameters/SearchOptions.cpp | 39 ++++++++++- moses/parameters/SearchOptions.h | 11 ++- moses/server/TranslationRequest.cpp | 2 + 20 files changed, 316 insertions(+), 15 deletions(-) create mode 100644 moses/parameters/LookupOptions.h create mode 100644 moses/parameters/OptionsBaseClass.cpp create mode 100644 moses/parameters/OptionsBaseClass.h create mode 100644 moses/parameters/ReportingOptions.cpp create mode 100644 moses/parameters/ReportingOptions.h diff --git a/moses/parameters/AllOptions.cpp b/moses/parameters/AllOptions.cpp index fd4417fca..11a9e3cb8 100644 --- a/moses/parameters/AllOptions.cpp +++ b/moses/parameters/AllOptions.cpp @@ -53,4 +53,23 @@ namespace Moses return true; } + +#ifdef HAVE_XMLRPC_C + bool + AllOptions:: + update(std::mapconst& param) + { + if (!search.update(param)) return false; + if (!cube.update(param)) return false; + if (!nbest.update(param)) return false; + if (!reordering.update(param)) return false; + if (!context.update(param)) return false; + if (!input.update(param)) return false; + if (!mbr.update(param)) return false; + if (!lmbr.update(param)) return false; + return sanity_check(); + } +#endif + + } diff --git a/moses/parameters/AllOptions.h b/moses/parameters/AllOptions.h index 9fa3bc514..5f9949a76 100644 --- a/moses/parameters/AllOptions.h +++ b/moses/parameters/AllOptions.h @@ -2,6 +2,7 @@ #pragma once #include #include "moses/Parameter.h" +#include "OptionsBaseClass.h" #include "SearchOptions.h" #include "CubePruningOptions.h" #include "NBestOptions.h" @@ -10,11 +11,10 @@ #include "InputOptions.h" #include "MBR_Options.h" #include "LMBR_Options.h" - namespace Moses { struct - AllOptions + AllOptions : public OptionsBaseClass { SearchOptions search; CubePruningOptions cube; @@ -33,6 +33,11 @@ namespace Moses bool sanity_check(); AllOptions() {} AllOptions(Parameter const& param); + +#ifdef HAVE_XMLRPC_C + bool update(std::mapconst& param); +#endif + }; } diff --git a/moses/parameters/BeamSearchOptions.h b/moses/parameters/BeamSearchOptions.h index 85a8d5a64..90fc00803 100644 --- a/moses/parameters/BeamSearchOptions.h +++ b/moses/parameters/BeamSearchOptions.h @@ -2,11 +2,12 @@ #pragma once #include #include "moses/Parameter.h" +#include "OptionsBaseClass.h" namespace Moses { struct - BeamSearchOptions + BeamSearchOptions : public OptionsBaseClass { bool init(Parameter const& param); BeamSearchOptions(Parameter const& param); diff --git a/moses/parameters/BookkeepingOptions.h b/moses/parameters/BookkeepingOptions.h index 331f0b83c..0fd046766 100644 --- a/moses/parameters/BookkeepingOptions.h +++ b/moses/parameters/BookkeepingOptions.h @@ -1,9 +1,12 @@ // -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #include "moses/Parameter.h" +#include "OptionsBaseClass.h" + 
namespace Moses { - struct BookkeepingOptions { + struct BookkeepingOptions : public OptionsBaseClass + { bool need_alignment_info; bool init(Parameter const& param); }; diff --git a/moses/parameters/ContextParameters.h b/moses/parameters/ContextParameters.h index 280d3795e..6bd8fe738 100644 --- a/moses/parameters/ContextParameters.h +++ b/moses/parameters/ContextParameters.h @@ -4,11 +4,12 @@ #include "moses/Parameter.h" #include "moses/TypeDef.h" #include "moses/Util.h" +#include "OptionsBaseClass.h" namespace Moses { -class ContextParameters +class ContextParameters : public OptionsBaseClass { public: ContextParameters(); diff --git a/moses/parameters/CubePruningOptions.cpp b/moses/parameters/CubePruningOptions.cpp index 0c2bc9b4c..a8710c681 100644 --- a/moses/parameters/CubePruningOptions.cpp +++ b/moses/parameters/CubePruningOptions.cpp @@ -1,4 +1,4 @@ -// -*- mode: c++; cc-style: gnu -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #include "CubePruningOptions.h" namespace Moses @@ -16,4 +16,37 @@ namespace Moses return true; } +#ifdef HAVE_XMLRPC_C + bool + CubePruningOptions:: + update(std::mapconst& params) + { + typedef std::map params_t; + + params_t::const_iterator si = params.find("cube-pruning-pop-limit"); + if (si != params.end()) pop_limit = xmlrpc_c::value_int(si->second); + + si = params.find("cube-pruning-diversity"); + if (si != params.end()) diversity = xmlrpc_c::value_int(si->second); + + si = params.find("cube-pruning-lazy-scoring"); + if (si != params.end()) + { + std::string spec = xmlrpc_c::value_string(si->second); + if (spec == "true" or spec == "on" or spec == "1") + lazy_scoring = true; + else if (spec == "false" or spec == "off" or spec == "0") + lazy_scoring = false; + else + { + char const* msg + = "Error parsing specification for cube-pruning-lazy-scoring"; + xmlrpc_c::fault(msg, xmlrpc_c::fault::CODE_PARSE); + } + } + return true; + } +#endif + + } diff --git a/moses/parameters/CubePruningOptions.h b/moses/parameters/CubePruningOptions.h index 29959f4fe..5d27be6a9 100644 --- a/moses/parameters/CubePruningOptions.h +++ b/moses/parameters/CubePruningOptions.h @@ -2,11 +2,12 @@ #pragma once #include #include "moses/Parameter.h" +#include "OptionsBaseClass.h" namespace Moses { struct - CubePruningOptions + CubePruningOptions : public OptionsBaseClass { size_t pop_limit; size_t diversity; @@ -15,6 +16,11 @@ namespace Moses bool init(Parameter const& param); CubePruningOptions(Parameter const& param); CubePruningOptions() {}; + +#ifdef HAVE_XMLRPC_C + bool + update(std::mapconst& params); +#endif }; } diff --git a/moses/parameters/InputOptions.h b/moses/parameters/InputOptions.h index c5379bfae..f81ea06eb 100644 --- a/moses/parameters/InputOptions.h +++ b/moses/parameters/InputOptions.h @@ -3,10 +3,12 @@ #include #include "moses/Parameter.h" #include +#include "OptionsBaseClass.h" + namespace Moses { struct - InputOptions + InputOptions : public OptionsBaseClass { bool continue_partial_translation; bool default_non_term_only_for_empty_range; // whatever that means diff --git a/moses/parameters/LMBR_Options.h b/moses/parameters/LMBR_Options.h index 29dee3c2e..54fd0fcd0 100644 --- a/moses/parameters/LMBR_Options.h +++ b/moses/parameters/LMBR_Options.h @@ -3,12 +3,13 @@ #include #include #include "moses/Parameter.h" +#include "OptionsBaseClass.h" namespace Moses { // Options for mimum bayes risk decoding struct - LMBR_Options + LMBR_Options : public OptionsBaseClass { bool enabled; bool use_lattice_hyp_set; //! 
to use nbest as hypothesis set during lattice MBR diff --git a/moses/parameters/LookupOptions.h b/moses/parameters/LookupOptions.h new file mode 100644 index 000000000..3332f951d --- /dev/null +++ b/moses/parameters/LookupOptions.h @@ -0,0 +1,17 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "moses/Parameter.h" +#include "OptionsBaseClass.h" +namespace Moses +{ + + struct + LookupOptions : public OptionsBaseClass + { + bool init(Parameter const& param); + ReorderingOptions() {} + }; + +} + diff --git a/moses/parameters/MBR_Options.h b/moses/parameters/MBR_Options.h index 56d5b34c1..0462ebc0f 100644 --- a/moses/parameters/MBR_Options.h +++ b/moses/parameters/MBR_Options.h @@ -2,12 +2,13 @@ #pragma once #include #include "moses/Parameter.h" +#include "OptionsBaseClass.h" namespace Moses { // Options for mimum bayes risk decoding struct - MBR_Options + MBR_Options : public OptionsBaseClass { bool enabled; size_t size; //! number of translation candidates considered diff --git a/moses/parameters/NBestOptions.h b/moses/parameters/NBestOptions.h index 894c35d1c..61e3c9806 100644 --- a/moses/parameters/NBestOptions.h +++ b/moses/parameters/NBestOptions.h @@ -1,10 +1,11 @@ // -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #pragma once #include +#include "OptionsBaseClass.h" namespace Moses { -struct NBestOptions +struct NBestOptions : public OptionsBaseClass { size_t nbest_size; size_t factor; diff --git a/moses/parameters/OptionsBaseClass.cpp b/moses/parameters/OptionsBaseClass.cpp new file mode 100644 index 000000000..e0b23babf --- /dev/null +++ b/moses/parameters/OptionsBaseClass.cpp @@ -0,0 +1,15 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#include "OptionsBaseClass.h" + +namespace Moses +{ +#ifdef HAVE_XMLRPC_C + bool + OptionsBaseClass:: + update(std::mapconst& params) + { + return true; + } +#endif + +} diff --git a/moses/parameters/OptionsBaseClass.h b/moses/parameters/OptionsBaseClass.h new file mode 100644 index 000000000..cb62467cf --- /dev/null +++ b/moses/parameters/OptionsBaseClass.h @@ -0,0 +1,17 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#ifdef HAVE_XMLRPC_C +#include +#endif +#include +#include +namespace Moses +{ + struct OptionsBaseClass + { +#ifdef HAVE_XMLRPC_C + virtual bool + update(std::mapconst& params); +#endif + }; +} diff --git a/moses/parameters/ReorderingOptions.h b/moses/parameters/ReorderingOptions.h index e18c7deab..f10fc4973 100644 --- a/moses/parameters/ReorderingOptions.h +++ b/moses/parameters/ReorderingOptions.h @@ -2,11 +2,12 @@ #pragma once #include #include "moses/Parameter.h" +#include "OptionsBaseClass.h" namespace Moses { struct - ReorderingOptions + ReorderingOptions : public OptionsBaseClass { int max_distortion; bool monotone_at_punct; diff --git a/moses/parameters/ReportingOptions.cpp b/moses/parameters/ReportingOptions.cpp new file mode 100644 index 000000000..25ae2f779 --- /dev/null +++ b/moses/parameters/ReportingOptions.cpp @@ -0,0 +1,90 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#if 0 +#include "ReportingOptions.h" +#include "moses/Parameter.h" + +namespace Moses { + using namespace std; + bool + ReportingOptions:: + init(Parameter const& param) + { + PARAM_VEC const* params; + + param.SetParameter(segmentation, "report-segmentation", false ); + param.SetParameter(segmentation_enriched, "report-segmentation-enriched", false); + param.SetParameter(all_factors, "report-all-factors", false ); + + // print ... 
+ param.SetParameter(id, "print-id", false ); + param.SetParameter(aln_info, "print-alignment-info", false); + param.SetParameter(passthrough, "print-passthrough", false ); + + param.SetParameter(detailed_transrep_filepath, "translation-details", ""); + param.SetParameter(detailed_tree_transrep_filepath, + "tree-translation-details", ""); + param.SetParameter(detailed_all_transrep_filepath, + "translation-all-details", ""); + + // output search graph + param.SetParameter(output, + "translation-all-details", ""); + + + + param.SetParameter(sort_word_alignment, "sort-word-alignment", NoSort); + + + // Is there a reason why we can't use SetParameter here? [UG] + = param.GetParam("alignment-output-file"); + if (params && params->size()) { + m_alignmentOutputFile = Scan(params->at(0)); + } + + params = param.GetParam("output-word-graph"); + output_word_graph = (params && params->size() == 2); + + // bizarre code ahead! Why do we need to do the checks here? + // as adapted from StaticData.cpp + params = param.GetParam("output-search-graph"); + if (params && params->size()) { + if (params->size() != 1) { + std::cerr << "ERROR: wrong format for switch -output-search-graph file"; + return false; + } + output_search_graph = true; + } + else if (m_parameter->GetParam("output-search-graph-extended") && + m_parameter->GetParam("output-search-graph-extended")->size()) { + if (m_parameter->GetParam("output-search-graph-extended")->size() != 1) { + std::cerr << "ERROR: wrong format for switch -output-search-graph-extended file"; + return false; + } + output_search_graph = true; + m_outputSearchGraphExtended = true; + } else { + m_outputSearchGraph = false; + } + + params = m_parameter->GetParam("output-search-graph-slf"); + output_search_graph_slf = params && params->size(); + params = m_parameter->GetParam("output-search-graph-hypergraph"); + output_search_graph_hypergraph = params && params->size(); + +#ifdef HAVE_PROTOBUF + params = m_parameter->GetParam("output-search-graph-pb"); + if (params && params->size()) { + if (params->size() != 1) { + cerr << "ERROR: wrong format for switch -output-search-graph-pb path"; + return false; + } + m_outputSearchGraphPB = true; + } else + m_outputSearchGraphPB = false; +#endif + + + return true; + } +} +#endif diff --git a/moses/parameters/ReportingOptions.h b/moses/parameters/ReportingOptions.h new file mode 100644 index 000000000..343bd58ec --- /dev/null +++ b/moses/parameters/ReportingOptions.h @@ -0,0 +1,42 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- +#pragma once +#include +#include "moses/Parameter.h" +namespace Moses +{ + + struct + ReportingOptions + { + + WordAlignmentSort sort_word_alignment; // 0: no, 1: target order + + + bool segmentation; // m_reportSegmentation; + bool segmentation_enriched; // m_reportSegmentationEnriched; + bool all_factors; // m_reportAllFactors; + + bool output_word_graph; + bool output_search_graph; + bool output_search_graph_extended; + bool output_search_graph_slf; + bool output_search_graph_hypergraph; + bool output_search_graph_protobuf; + + // print .. 
+ bool aln_info; // m_PrintAlignmentInfo; + bool id; // m_PrintID; + bool passthrough; // m_PrintPassthroughInformation; + + // transrep = translation reporting + std::string detailed_transrep_filepath; + std::string detailed_tree_transrep_filepath; + std::string detailed_all_transrep_filepath; + + std::string aln_output_file; // m_alignmentOutputFile; + + bool init(Parameter const& param); + }; + +} + diff --git a/moses/parameters/SearchOptions.cpp b/moses/parameters/SearchOptions.cpp index 7f9c2e748..b75b0f15b 100644 --- a/moses/parameters/SearchOptions.cpp +++ b/moses/parameters/SearchOptions.cpp @@ -30,7 +30,6 @@ namespace Moses param.SetParameter(max_partial_trans_opt, "max-partial-trans-opt", DEFAULT_MAX_PART_TRANS_OPT_SIZE); - param.SetParameter(consensus, "consensus-decoding", false); // transformation to log of a few scores @@ -48,5 +47,43 @@ namespace Moses algo == SyntaxF2S || algo == SyntaxT2S_SCFG); } +#ifdef HAVE_XMLRPC_C + bool + SearchOptions:: + update(std::mapconst& params) + { + typedef std::map params_t; + + params_t::const_iterator si = params.find("search-algoritm"); + if (si != params.end()) + { + // use named parameters + std::string spec = xmlrpc_c::value_string(si->second); + if (spec == "normal" || spec == "0") algo = Normal; + else if (spec == "cube" || spec == "1") algo = CubePruning; + else throw xmlrpc_c::fault("Unsupported search algorithm", + xmlrpc_c::fault::CODE_PARSE); + } + + si = params.find("stack"); + if (si != params.end()) stack_size = xmlrpc_c::value_int(si->second); + + si = params.find("stack-diversity"); + if (si != params.end()) stack_diversity = xmlrpc_c::value_int(si->second); + + si = params.find("beam-threshold"); + if (si != params.end()) beam_width = xmlrpc_c::value_double(si->second); + + si = params.find("time-out"); + if (si != params.end()) timeout = xmlrpc_c::value_int(si->second); + + si = params.find("max-phrase-length"); + if (si != params.end()) max_phrase_length = xmlrpc_c::value_int(si->second); + + return true; + } +#endif + + } diff --git a/moses/parameters/SearchOptions.h b/moses/parameters/SearchOptions.h index 1df9d034b..2b2f39a65 100644 --- a/moses/parameters/SearchOptions.h +++ b/moses/parameters/SearchOptions.h @@ -2,13 +2,14 @@ #pragma once #include #include "moses/Parameter.h" +#include "OptionsBaseClass.h" namespace Moses { bool is_syntax(SearchAlgorithm algo); struct - SearchOptions + SearchOptions : public OptionsBaseClass { SearchAlgorithm algo; @@ -37,10 +38,16 @@ namespace Moses SearchOptions(Parameter const& param); SearchOptions() {} - bool UseEarlyDiscarding() const { + bool + UseEarlyDiscarding() const { return early_discarding_threshold != -std::numeric_limits::infinity(); } +#ifdef HAVE_XMLRPC_C + bool + update(std::mapconst& params); +#endif + }; } diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index 454ac8a65..b05330d2b 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -246,6 +246,8 @@ parse_request(std::map const& params) // params_t const params = m_paramList.getStruct(0); m_paramList.verifyEnd(1); // ??? UG + m_options.update(params); + // source text must be given, or we don't know what to translate typedef std::map params_t; params_t::const_iterator si = params.find("text"); From 1d38ca917716c7d8a18703c0ad8aa9c7738cd745 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Fri, 7 Aug 2015 19:19:04 +0100 Subject: [PATCH 234/286] Bug fix: BOOST_FOREACH seems to cause problems. 
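The fix replaces BOOST_FOREACH over the hypothesis stacks with explicit
const_iterator loops and, in SearchCubePruning::Decode(), folds the inline
time-limit check into the Search::out_of_time() helper. A reduced sketch of
the replacement pattern follows; the vector of pointers is only a stand-in
for HypothesisStackNormal, whose iterators likewise yield Hypothesis*.

    // Illustration only, not code from this patch.
    #include <vector>

    struct Hypothesis { /* ... */ };
    typedef std::vector<Hypothesis*> HypoColl;

    void ProcessAll(HypoColl const& sourceHypoColl) {
      // was: BOOST_FOREACH(Hypothesis* h, sourceHypoColl) ProcessOneHypothesis(*h);
      HypoColl::const_iterator h;
      for (h = sourceHypoColl.begin(); h != sourceHypoColl.end(); ++h) {
        // ProcessOneHypothesis(**h);  // dereference the iterator, then the pointer
      }
    }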
--- moses/SearchCubePruning.cpp | 36 +++++++++++++++++++----------------- moses/SearchNormal.cpp | 6 ++++-- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/moses/SearchCubePruning.cpp b/moses/SearchCubePruning.cpp index f6b2b90a9..0c8b418f0 100644 --- a/moses/SearchCubePruning.cpp +++ b/moses/SearchCubePruning.cpp @@ -4,7 +4,7 @@ #include "StaticData.h" #include "InputType.h" #include "TranslationOptionCollection.h" - +#include using namespace std; namespace Moses @@ -37,11 +37,13 @@ public: } }; -SearchCubePruning::SearchCubePruning(Manager& manager, const InputType &source, const TranslationOptionCollection &transOptColl) - :Search(manager) - ,m_source(source) - ,m_hypoStackColl(source.GetSize() + 1) - ,m_transOptColl(transOptColl) +SearchCubePruning:: +SearchCubePruning(Manager& manager, const InputType &source, + const TranslationOptionCollection &transOptColl) + : Search(manager) + , m_source(source) + , m_hypoStackColl(source.GetSize() + 1) + , m_transOptColl(transOptColl) { const StaticData &staticData = StaticData::Instance(); @@ -72,7 +74,8 @@ void SearchCubePruning::Decode() // initial seed hypothesis: nothing translated, no words produced Hypothesis *hypo = Hypothesis::Create(m_manager,m_source, m_initialTransOpt); - HypothesisStackCubePruning &firstStack = *static_cast(m_hypoStackColl.front()); + HypothesisStackCubePruning &firstStack + = *static_cast(m_hypoStackColl.front()); firstStack.AddInitial(hypo); // Call this here because the loop below starts at the second stack. firstStack.CleanupArcList(); @@ -89,17 +92,16 @@ void SearchCubePruning::Decode() int timelimit = m_options.search.timeout; std::vector < HypothesisStack* >::iterator iterStack; for (iterStack = m_hypoStackColl.begin() + 1 ; iterStack != m_hypoStackColl.end() ; ++iterStack) { - // check if decoding ran out of time - double _elapsed_time = GetUserTime(); - if (timelimit && _elapsed_time > timelimit) { - VERBOSE(1,"Decoding is out of time (" << _elapsed_time << "," - << timelimit << ")" << std::endl); - return; - } - HypothesisStackCubePruning &sourceHypoColl = *static_cast(*iterStack); + // BOOST_FOREACH(HypothesisStack* hstack, m_hypoStackColl) { + if (this->out_of_time()) return; - // priority queue which has a single entry for each bitmap container, sorted by score of top hyp - std::priority_queue< BitmapContainer*, std::vector< BitmapContainer* >, BitmapContainerOrderer> BCQueue; + HypothesisStackCubePruning &sourceHypoColl + = *static_cast(*iterStack); + + // priority queue which has a single entry for each bitmap + // container, sorted by score of top hyp + std::priority_queue < BitmapContainer*, std::vector< BitmapContainer* >, + BitmapContainerOrderer > BCQueue; _BMType::const_iterator bmIter; const _BMType &accessor = sourceHypoColl.GetBitmapAccessor(); diff --git a/moses/SearchNormal.cpp b/moses/SearchNormal.cpp index 227c27479..e7983a93e 100644 --- a/moses/SearchNormal.cpp +++ b/moses/SearchNormal.cpp @@ -69,8 +69,10 @@ ProcessOneStack(HypothesisStack* hstack) IFVERBOSE(2) stats.StopTimeStack(); // go through each hypothesis on the stack and try to expand it - BOOST_FOREACH(Hypothesis* h, sourceHypoColl) - ProcessOneHypothesis(*h); + // BOOST_FOREACH(Hypothesis* h, sourceHypoColl) + HypothesisStackNormal::const_iterator h; + for (h = sourceHypoColl.begin(); h != sourceHypoColl.end(); ++h) + ProcessOneHypothesis(**h); return true; } From 21aa5af640946c0f00c717a0774d05441a4b6902 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Sat, 8 Aug 2015 00:00:45 +0100 Subject: [PATCH 235/286] 
daily automatic beautifier --- moses-cmd/LatticeMBRGrid.cpp | 6 +-- moses/ChartHypothesis.cpp | 8 ++-- moses/ChartManager.cpp | 4 +- moses/ChartParser.cpp | 2 +- moses/ConfusionNet.cpp | 4 +- moses/ExportInterface.cpp | 35 +++++++++-------- moses/FF/ConstrainedDecoding.cpp | 8 ++-- moses/FF/Factory.cpp | 2 +- moses/FF/HyperParameterAsWeight.cpp | 2 +- moses/HypothesisStackNormal.cpp | 2 +- moses/IOWrapper.cpp | 47 +++++++++++------------ moses/IOWrapper.h | 10 ++--- moses/Incremental.cpp | 4 +- moses/LatticeMBR.cpp | 26 ++++++------- moses/Manager.cpp | 6 +-- moses/OutputCollector.h | 59 ++++++++++++++--------------- moses/Parameter.cpp | 6 +-- moses/RuleCube.cpp | 2 +- moses/Search.cpp | 8 ++-- moses/Search.h | 2 +- moses/SearchCubePruning.cpp | 4 +- moses/SearchNormal.cpp | 24 ++++++------ moses/SearchNormal.h | 18 ++++----- moses/StaticData.cpp | 18 ++++----- moses/StaticData.h | 16 ++++---- moses/Syntax/F2S/Manager-inl.h | 2 +- moses/TargetPhrase.cpp | 3 +- moses/TranslationTask.cpp | 6 +-- moses/TranslationTask.h | 6 +-- 29 files changed, 169 insertions(+), 171 deletions(-) diff --git a/moses-cmd/LatticeMBRGrid.cpp b/moses-cmd/LatticeMBRGrid.cpp index 0ad338975..a0c0a7852 100644 --- a/moses-cmd/LatticeMBRGrid.cpp +++ b/moses-cmd/LatticeMBRGrid.cpp @@ -191,11 +191,11 @@ int main(int argc, char* argv[]) BOOST_FOREACH(float const& p, pgrid) { lmbr.precision = p; BOOST_FOREACH(float const& r, rgrid) { - lmbr.ratio = r; + lmbr.ratio = r; BOOST_FOREACH(size_t const prune_i, prune_grid) { - lmbr.pruning_factor = prune_i; + lmbr.pruning_factor = prune_i; BOOST_FOREACH(float const& scale_i, scale_grid) { - mbr.scale = scale_i; + mbr.scale = scale_i; size_t lineCount = source->GetTranslationId(); cout << lineCount << " ||| " << p << " " << r << " " << size_t(prune_i) << " " << scale_i diff --git a/moses/ChartHypothesis.cpp b/moses/ChartHypothesis.cpp index d0d6bf625..806222029 100644 --- a/moses/ChartHypothesis.cpp +++ b/moses/ChartHypothesis.cpp @@ -288,10 +288,10 @@ void ChartHypothesis::CleanupArcList() */ const StaticData &staticData = StaticData::Instance(); size_t nBestSize = staticData.options().nbest.nbest_size; - bool distinctNBest = (staticData.options().nbest.only_distinct - || staticData.options().mbr.enabled - || staticData.GetOutputSearchGraph() - || staticData.GetOutputSearchGraphHypergraph()); + bool distinctNBest = (staticData.options().nbest.only_distinct + || staticData.options().mbr.enabled + || staticData.GetOutputSearchGraph() + || staticData.GetOutputSearchGraphHypergraph()); if (!distinctNBest && m_arcList->size() > nBestSize) { // prune arc list only if there too many arcs diff --git a/moses/ChartManager.cpp b/moses/ChartManager.cpp index b1870791b..296fd9768 100644 --- a/moses/ChartManager.cpp +++ b/moses/ChartManager.cpp @@ -322,8 +322,8 @@ void ChartManager::OutputNBest(OutputCollector *collector) const if (nBestSize > 0) { const size_t translationId = m_source.GetTranslationId(); - VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO " - << staticData.options().nbest.output_file_path << endl); + VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO " + << staticData.options().nbest.output_file_path << endl); std::vector > nBestList; CalcNBest(nBestSize, nBestList,staticData.options().nbest.only_distinct); OutputNBestList(collector, nBestList, translationId); diff --git a/moses/ChartParser.cpp b/moses/ChartParser.cpp index 8c569cec9..98ff9bd00 100644 --- a/moses/ChartParser.cpp +++ b/moses/ChartParser.cpp @@ -106,7 +106,7 @@ void 
ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range targetPhrase->SetTargetLHS(targetLHS); targetPhrase->SetAlignmentInfo("0-0"); targetPhrase->EvaluateInIsolation(*unksrc); - + if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled() || staticData.options().nbest.print_trees || staticData.GetTreeStructure() != NULL) { targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]"); } diff --git a/moses/ConfusionNet.cpp b/moses/ConfusionNet.cpp index 225f3034f..16db99c44 100644 --- a/moses/ConfusionNet.cpp +++ b/moses/ConfusionNet.cpp @@ -68,8 +68,8 @@ ConfusionNet() : InputType() const StaticData& SD = StaticData::Instance(); if (SD.IsSyntax()) { - m_defaultLabelSet.insert(SD.GetInputDefaultNonTerminal()); - } + m_defaultLabelSet.insert(SD.GetInputDefaultNonTerminal()); + } UTIL_THROW_IF2(&InputFeature::Instance() == NULL, "Input feature must be specified"); } diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp index a884f8e3b..66ff8b436 100644 --- a/moses/ExportInterface.cpp +++ b/moses/ExportInterface.cpp @@ -177,8 +177,8 @@ batch_run() #endif // using context for adaptation: - // e.g., context words / strings from config file / cmd line - std::string context_string; + // e.g., context words / strings from config file / cmd line + std::string context_string; params.SetParameter(context_string,"context-string",string("")); // ... or weights for documents/domains from config file / cmd. line @@ -189,18 +189,18 @@ batch_run() size_t size_t_max = std::numeric_limits::max(); bool use_context_window = ioWrapper->GetLookAhead() || ioWrapper->GetLookBack(); bool use_context = use_context_window || context_string.size(); - bool use_sliding_context_window = (use_context_window - && ioWrapper->GetLookAhead() != size_t_max); + bool use_sliding_context_window = (use_context_window + && ioWrapper->GetLookAhead() != size_t_max); boost::shared_ptr > context_window; boost::shared_ptr >* cw; cw = use_context_window ? 
&context_window : NULL; - if (!cw && context_string.size()) + if (!cw && context_string.size()) context_window.reset(new std::vector(1,context_string)); // global scope of caches, biases, etc., if any boost::shared_ptr gscope; - if (!use_sliding_context_window) + if (!use_sliding_context_window) gscope.reset(new ContextScope); // main loop over set of input sentences @@ -212,21 +212,20 @@ batch_run() boost::shared_ptr lscope; if (gscope) lscope = gscope; else lscope.reset(new ContextScope); - - boost::shared_ptr task; + + boost::shared_ptr task; task = TranslationTask::create(source, ioWrapper, lscope); - - if (cw) - { - if (context_string.size()) - context_window->push_back(context_string); - if(!use_sliding_context_window) - cw = NULL; - } + + if (cw) { + if (context_string.size()) + context_window->push_back(context_string); + if(!use_sliding_context_window) + cw = NULL; + } if (context_window) task->SetContextWindow(context_window); - - if (context_weights != "") + + if (context_weights != "") task->SetContextWeights(context_weights); // Allow for (sentence-)context-specific processing prior to diff --git a/moses/FF/ConstrainedDecoding.cpp b/moses/FF/ConstrainedDecoding.cpp index 6743a9085..5485c401a 100644 --- a/moses/FF/ConstrainedDecoding.cpp +++ b/moses/FF/ConstrainedDecoding.cpp @@ -43,10 +43,10 @@ ConstrainedDecoding::ConstrainedDecoding(const std::string &line) void ConstrainedDecoding::Load() { const StaticData &staticData = StaticData::Instance(); - bool addBeginEndWord - = ((staticData.options().search.algo == CYKPlus) - || (staticData.options().search.algo == ChartIncremental)); - + bool addBeginEndWord + = ((staticData.options().search.algo == CYKPlus) + || (staticData.options().search.algo == ChartIncremental)); + for(size_t i = 0; i < m_paths.size(); ++i) { InputFileStream constraintFile(m_paths[i]); std::string line; diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index 74c54117e..0042651bf 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -166,7 +166,7 @@ FeatureFactory weights.assign(feature->GetNumScoreComponents(),1.0); } else { VERBOSE(2,"WARNING: No weights specified in config file for FF " - << featureName << ". Using default values supplied by FF."); + << featureName << ". 
Using default values supplied by FF."); } } UTIL_THROW_IF2(weights.size() != feature->GetNumScoreComponents(), diff --git a/moses/FF/HyperParameterAsWeight.cpp b/moses/FF/HyperParameterAsWeight.cpp index 1548df158..37516af52 100644 --- a/moses/FF/HyperParameterAsWeight.cpp +++ b/moses/FF/HyperParameterAsWeight.cpp @@ -21,7 +21,7 @@ HyperParameterAsWeight::HyperParameterAsWeight(const std::string &line) staticData.m_options.search.stack_size = weights[0] * 1000; staticData.m_options.search.beam_width = weights[1] * 10; - + } diff --git a/moses/HypothesisStackNormal.cpp b/moses/HypothesisStackNormal.cpp index 7c99528fc..e72c803b7 100644 --- a/moses/HypothesisStackNormal.cpp +++ b/moses/HypothesisStackNormal.cpp @@ -36,7 +36,7 @@ namespace Moses HypothesisStackNormal::HypothesisStackNormal(Manager& manager) : HypothesisStack(manager) { - m_nBestIsEnabled = StaticData::Instance().options().nbest.enabled; + m_nBestIsEnabled = StaticData::Instance().options().nbest.enabled; m_bestScore = -std::numeric_limits::infinity(); m_worstScore = -std::numeric_limits::infinity(); } diff --git a/moses/IOWrapper.cpp b/moses/IOWrapper.cpp index eabf0c8c2..1c54ec619 100644 --- a/moses/IOWrapper.cpp +++ b/moses/IOWrapper.cpp @@ -79,12 +79,12 @@ namespace Moses IOWrapper::IOWrapper() : m_nBestStream(NULL) - // , m_outputWordGraphStream(NULL) - // , m_outputSearchGraphStream(NULL) - // , m_detailedTranslationReportingStream(NULL) - // , m_unknownsStream(NULL) - // , m_alignmentInfoStream(NULL) - // , m_latticeSamplesStream(NULL) + // , m_outputWordGraphStream(NULL) + // , m_outputSearchGraphStream(NULL) + // , m_detailedTranslationReportingStream(NULL) + // , m_unknownsStream(NULL) + // , m_alignmentInfoStream(NULL) + // , m_latticeSamplesStream(NULL) , m_surpressSingleBestOutput(false) , m_look_ahead(0) , m_look_back(0) @@ -142,13 +142,13 @@ IOWrapper::IOWrapper() P.SetParameter(path, "translation-details", ""); if (path.size()) m_detailedTranslationCollector.reset(new OutputCollector(path)); - + P.SetParameter(path, "tree-translation-details", ""); if (path.size()) m_detailTreeFragmentsOutputCollector.reset(new OutputCollector(path)); - + P.SetParameter(path, "output-word-graph", ""); if (path.size()) m_wordGraphCollector.reset(new OutputCollector(path)); - + size_t latticeSamplesSize = staticData.GetLatticeSamplesSize(); string latticeSamplesFile = staticData.GetLatticeSamplesFilePath(); if (latticeSamplesSize) { @@ -157,7 +157,7 @@ IOWrapper::IOWrapper() m_surpressSingleBestOutput = true; } } - + if (!m_surpressSingleBestOutput) { m_singleBestOutputCollector.reset(new Moses::OutputCollector(&std::cout)); } @@ -193,8 +193,8 @@ IOWrapper::~IOWrapper() if (m_inputFile != NULL) delete m_inputFile; // if (m_nBestStream != NULL && !m_surpressSingleBestOutput) { - // outputting n-best to file, rather than stdout. need to close file and delete obj - // delete m_nBestStream; + // outputting n-best to file, rather than stdout. 
need to close file and delete obj + // delete m_nBestStream; // } // delete m_detailedTranslationReportingStream; @@ -251,16 +251,15 @@ ReadInput(boost::shared_ptr >* cw) boost::shared_ptr source = GetBufferedInput(); if (source) { source->SetTranslationId(m_currentLine++); - + // when using a sliding context window, remove obsolete past input from buffer: - if (m_past_input.size() && m_look_back != std::numeric_limits::max()) - { - list >::iterator m = m_past_input.end(); - for (size_t cnt = 0; cnt < m_look_back && --m != m_past_input.begin();) - cnt += (*m)->GetSize(); - while (m_past_input.begin() != m) m_past_input.pop_front(); - } - + if (m_past_input.size() && m_look_back != std::numeric_limits::max()) { + list >::iterator m = m_past_input.end(); + for (size_t cnt = 0; cnt < m_look_back && --m != m_past_input.begin();) + cnt += (*m)->GetSize(); + while (m_past_input.begin() != m) m_past_input.pop_front(); + } + if (m_look_back) m_past_input.push_back(source); } @@ -268,15 +267,15 @@ ReadInput(boost::shared_ptr >* cw) return source; } -boost::shared_ptr > +boost::shared_ptr > IOWrapper:: GetCurrentContextWindow() const { boost::shared_ptr > context(new std::vector); BOOST_FOREACH(boost::shared_ptr const& i, m_past_input) - context->push_back(i->ToString()); + context->push_back(i->ToString()); BOOST_FOREACH(boost::shared_ptr const& i, m_future_input) - context->push_back(i->ToString()); + context->push_back(i->ToString()); return context; } diff --git a/moses/IOWrapper.h b/moses/IOWrapper.h index ee90042b0..c55793329 100644 --- a/moses/IOWrapper.h +++ b/moses/IOWrapper.h @@ -129,7 +129,7 @@ public: // Moses::InputType* GetInput(Moses::InputType *inputType); - boost::shared_ptr + boost::shared_ptr ReadInput(boost::shared_ptr >* cw = NULL); Moses::OutputCollector *GetSingleBestOutputCollector() { @@ -184,7 +184,7 @@ public: // post editing std::ifstream *spe_src, *spe_trg, *spe_aln; - std::list > const& GetPastInput() const { + std::list > const& GetPastInput() const { return m_past_input; } @@ -198,7 +198,7 @@ public: size_t GetLookBack() const { return m_look_back; } - + private: template boost::shared_ptr @@ -207,7 +207,7 @@ private: boost::shared_ptr GetBufferedInput(); - boost::shared_ptr > + boost::shared_ptr > GetCurrentContextWindow() const; }; @@ -230,7 +230,7 @@ BufferInput() } while (m_buffered_ahead < m_look_ahead) { source.reset(new itype); - if (!source->Read(*m_inputStream, *m_inputFactorOrder)) + if (!source->Read(*m_inputStream, *m_inputFactorOrder)) break; m_future_input.push_back(source); m_buffered_ahead += source->GetSize(); diff --git a/moses/Incremental.cpp b/moses/Incremental.cpp index 51fcb0bf5..d1eb3b532 100644 --- a/moses/Incremental.cpp +++ b/moses/Incremental.cpp @@ -223,8 +223,8 @@ namespace const float log_10 = logf(10); } -template -search::History +template +search::History Manager:: PopulateBest(const Model &model, const std::vector &words, Best &out) { diff --git a/moses/LatticeMBR.cpp b/moses/LatticeMBR.cpp index 46fb16242..8dffda1b1 100644 --- a/moses/LatticeMBR.cpp +++ b/moses/LatticeMBR.cpp @@ -490,18 +490,18 @@ bool Edge::operator< (const Edge& compare ) const ostream& operator<< (ostream& out, const Edge& edge) { - out << "Head: " << edge.m_headNode->GetId() - << ", Tail: " << edge.m_tailNode->GetId() - << ", Score: " << edge.m_score + out << "Head: " << edge.m_headNode->GetId() + << ", Tail: " << edge.m_tailNode->GetId() + << ", Score: " << edge.m_score << ", Phrase: " << edge.m_targetPhrase << endl; return out; } bool ascendingCoverageCmp(const 
Hypothesis* a, const Hypothesis* b) { - return (a->GetWordsBitmap().GetNumWordsCovered() - < - b->GetWordsBitmap().GetNumWordsCovered()); + return (a->GetWordsBitmap().GetNumWordsCovered() + < + b->GetWordsBitmap().GetNumWordsCovered()); } void getLatticeMBRNBest(const Manager& manager, const TrellisPathList& nBestList, @@ -514,19 +514,19 @@ void getLatticeMBRNBest(const Manager& manager, const TrellisPathList& nBestList std::map < const Hypothesis*, set > outgoingHyps; map > incomingEdges; vector< float> estimatedScores; - manager.GetForwardBackwardSearchGraph(&connected, &connectedList, - &outgoingHyps, &estimatedScores); + manager.GetForwardBackwardSearchGraph(&connected, &connectedList, + &outgoingHyps, &estimatedScores); LMBR_Options const& lmbr = manager.options().lmbr; MBR_Options const& mbr = manager.options().mbr; - pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, - manager.GetBestHypothesis(), lmbr.pruning_factor, mbr.scale); + pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, + manager.GetBestHypothesis(), lmbr.pruning_factor, mbr.scale); calcNgramExpectations(connectedList, incomingEdges, ngramPosteriors,true); vector mbrThetas = lmbr.theta; float p = lmbr.precision; float r = lmbr.ratio; float mapWeight = lmbr.map_weight; - if (mbrThetas.size() == 0) { + if (mbrThetas.size() == 0) { // thetas were not specified on the command line, so use p and r instead mbrThetas.push_back(-1); //Theta 0 mbrThetas.push_back(1/(bleu_order*p)); @@ -580,8 +580,8 @@ const TrellisPath doConsensusDecoding(const Manager& manager, const TrellisPathL manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores); LMBR_Options const& lmbr = manager.options().lmbr; MBR_Options const& mbr = manager.options().mbr; - pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, - manager.GetBestHypothesis(), lmbr.pruning_factor, mbr.scale); + pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, + manager.GetBestHypothesis(), lmbr.pruning_factor, mbr.scale); calcNgramExpectations(connectedList, incomingEdges, ngramExpectations,false); //expected length is sum of expected unigram counts diff --git a/moses/Manager.cpp b/moses/Manager.cpp index 8d67dcced..9a678d99d 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -1616,9 +1616,9 @@ void Manager::OutputNBest(OutputCollector *collector) const TrellisPathList nBestList; ostringstream out; CalcNBest(options().nbest.nbest_size, nBestList, - options().nbest.only_distinct); - OutputNBest(out, nBestList, staticData.GetOutputFactorOrder(), - m_source.GetTranslationId(), + options().nbest.only_distinct); + OutputNBest(out, nBestList, staticData.GetOutputFactorOrder(), + m_source.GetTranslationId(), staticData.GetReportSegmentation()); collector->Write(m_source.GetTranslationId(), out.str()); } diff --git a/moses/OutputCollector.h b/moses/OutputCollector.h index d66c16f20..0d6f37472 100644 --- a/moses/OutputCollector.h +++ b/moses/OutputCollector.h @@ -43,8 +43,8 @@ namespace Moses class OutputCollector { public: - OutputCollector(std::ostream* outStream= &std::cout, - std::ostream* debugStream=&std::cerr) + OutputCollector(std::ostream* outStream= &std::cout, + std::ostream* debugStream=&std::cerr) : m_nextOutput(0) , m_outStream(outStream) , m_debugStream(debugStream) @@ -52,37 +52,36 @@ public: , m_isHoldingDebugStream(false) {} OutputCollector(std::string xout, std::string xerr = "") - : m_nextOutput(0) - { - // TO DO open magic 
streams instead of regular ofstreams! [UG] + : m_nextOutput(0) { + // TO DO open magic streams instead of regular ofstreams! [UG] - if (xout == "/dev/stderr") { - m_outStream = &std::cerr; - m_isHoldingOutputStream = false; - } else if (xout.size() && xout != "/dev/stdout" && xout != "-") { - m_outStream = new std::ofstream(xout.c_str()); - UTIL_THROW_IF2(!m_outStream->good(), "Failed to open output file" - << xout); - m_isHoldingOutputStream = true; - } else { - m_outStream = &std::cout; - m_isHoldingOutputStream = false; - } - - if (xerr == "/dev/stdout") { - m_debugStream = &std::cout; - m_isHoldingDebugStream = false; - } else if (xerr.size() && xerr != "/dev/stderr") { - m_debugStream = new std::ofstream(xerr.c_str()); - UTIL_THROW_IF2(!m_debugStream->good(), "Failed to open debug stream" - << xerr); - m_isHoldingDebugStream = true; - } else { - m_debugStream = &std::cerr; - m_isHoldingDebugStream = false; - } + if (xout == "/dev/stderr") { + m_outStream = &std::cerr; + m_isHoldingOutputStream = false; + } else if (xout.size() && xout != "/dev/stdout" && xout != "-") { + m_outStream = new std::ofstream(xout.c_str()); + UTIL_THROW_IF2(!m_outStream->good(), "Failed to open output file" + << xout); + m_isHoldingOutputStream = true; + } else { + m_outStream = &std::cout; + m_isHoldingOutputStream = false; } + if (xerr == "/dev/stdout") { + m_debugStream = &std::cout; + m_isHoldingDebugStream = false; + } else if (xerr.size() && xerr != "/dev/stderr") { + m_debugStream = new std::ofstream(xerr.c_str()); + UTIL_THROW_IF2(!m_debugStream->good(), "Failed to open debug stream" + << xerr); + m_isHoldingDebugStream = true; + } else { + m_debugStream = &std::cerr; + m_isHoldingDebugStream = false; + } + } + ~OutputCollector() { if (m_isHoldingOutputStream) delete m_outStream; diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index 31bba1c14..d0b6d6374 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -214,10 +214,10 @@ Parameter::Parameter() AddParam(server_opts,"server", "Run moses as a translation server."); AddParam(server_opts,"server-port", "Port for moses server"); AddParam(server_opts,"server-log", "Log destination for moses server"); - AddParam(server_opts,"session-timeout", - "Timeout for sessions, e.g. '2h30m' or 1d (=24h)"); + AddParam(server_opts,"session-timeout", + "Timeout for sessions, e.g. '2h30m' or 1d (=24h)"); AddParam(server_opts,"session-cache-size", string("Max. 
number of sessions cached.") - +"Least recently used session is dumped first."); + +"Least recently used session is dumped first."); AddParam(server_opts,"serial", "Run server in serial mode, processing only one request at a time."); po::options_description irstlm_opts("IRSTLM Options"); diff --git a/moses/RuleCube.cpp b/moses/RuleCube.cpp index f05d95270..7ded39d54 100644 --- a/moses/RuleCube.cpp +++ b/moses/RuleCube.cpp @@ -44,7 +44,7 @@ RuleCube::RuleCube(const ChartTranslationOptions &transOpt, { RuleCubeItem *item = new RuleCubeItem(transOpt, allChartCells); m_covered.insert(item); - if (StaticData::Instance().options().cube.lazy_scoring) { + if (StaticData::Instance().options().cube.lazy_scoring) { item->EstimateScore(); } else { item->CreateHypothesis(transOpt, manager); diff --git a/moses/Search.cpp b/moses/Search.cpp index 908090dd4..010cc6b08 100644 --- a/moses/Search.cpp +++ b/moses/Search.cpp @@ -20,8 +20,8 @@ Search::Search(Manager& manager) Search * Search:: CreateSearch(Manager& manager, const InputType &source, - SearchAlgorithm searchAlgorithm, - const TranslationOptionCollection &transOptColl) + SearchAlgorithm searchAlgorithm, + const TranslationOptionCollection &transOptColl) { switch(searchAlgorithm) { case Normal: @@ -42,8 +42,8 @@ out_of_time() if (!timelimit) return false; double elapsed_time = GetUserTime(); if (elapsed_time <= timelimit) return false; - VERBOSE(1,"Decoding is out of time (" << elapsed_time << "," - << timelimit << ")" << std::endl); + VERBOSE(1,"Decoding is out of time (" << elapsed_time << "," + << timelimit << ")" << std::endl); interrupted_flag = 1; return true; } diff --git a/moses/Search.h b/moses/Search.h index 7bafa3157..7d33535e0 100644 --- a/moses/Search.h +++ b/moses/Search.h @@ -46,7 +46,7 @@ protected: AllOptions const& m_options; /** flag indicating that decoder ran out of time (see switch -time-out) */ - size_t interrupted_flag; + size_t interrupted_flag; bool out_of_time(); }; diff --git a/moses/SearchCubePruning.cpp b/moses/SearchCubePruning.cpp index f6b2b90a9..0de30e424 100644 --- a/moses/SearchCubePruning.cpp +++ b/moses/SearchCubePruning.cpp @@ -92,8 +92,8 @@ void SearchCubePruning::Decode() // check if decoding ran out of time double _elapsed_time = GetUserTime(); if (timelimit && _elapsed_time > timelimit) { - VERBOSE(1,"Decoding is out of time (" << _elapsed_time << "," - << timelimit << ")" << std::endl); + VERBOSE(1,"Decoding is out of time (" << _elapsed_time << "," + << timelimit << ")" << std::endl); return; } HypothesisStackCubePruning &sourceHypoColl = *static_cast(*iterStack); diff --git a/moses/SearchNormal.cpp b/moses/SearchNormal.cpp index 227c27479..52b56a30a 100644 --- a/moses/SearchNormal.cpp +++ b/moses/SearchNormal.cpp @@ -16,8 +16,8 @@ namespace Moses * /param transOptColl collection of translation options to be used for this sentence */ SearchNormal:: -SearchNormal(Manager& manager, const InputType &source, - const TranslationOptionCollection &transOptColl) +SearchNormal(Manager& manager, const InputType &source, + const TranslationOptionCollection &transOptColl) : Search(manager) , m_source(source) , m_hypoStackColl(source.GetSize() + 1) @@ -38,8 +38,8 @@ SearchNormal(Manager& manager, const InputType &source, std::vector < HypothesisStackNormal >::iterator iterStack; for (size_t ind = 0 ; ind < m_hypoStackColl.size() ; ++ind) { HypothesisStackNormal *sourceHypoColl = new HypothesisStackNormal(m_manager); - sourceHypoColl->SetMaxHypoStackSize(this->m_options.search.stack_size, - 
this->m_options.search.stack_diversity); + sourceHypoColl->SetMaxHypoStackSize(this->m_options.search.stack_size, + this->m_options.search.stack_diversity); sourceHypoColl->SetBeamWidth(this->m_options.search.beam_width); m_hypoStackColl[ind] = sourceHypoColl; } @@ -51,26 +51,26 @@ SearchNormal::~SearchNormal() } -bool +bool SearchNormal:: ProcessOneStack(HypothesisStack* hstack) { if (this->out_of_time()) return false; SentenceStats &stats = m_manager.GetSentenceStats(); - HypothesisStackNormal &sourceHypoColl - = *static_cast(hstack); + HypothesisStackNormal &sourceHypoColl + = *static_cast(hstack); // the stack is pruned before processing (lazy pruning): VERBOSE(3,"processing hypothesis from next stack"); - IFVERBOSE(2) stats.StartTimeStack(); + IFVERBOSE(2) stats.StartTimeStack(); sourceHypoColl.PruneToSize(m_options.search.stack_size); VERBOSE(3,std::endl); sourceHypoColl.CleanupArcList(); - IFVERBOSE(2) stats.StopTimeStack(); + IFVERBOSE(2) stats.StopTimeStack(); // go through each hypothesis on the stack and try to expand it - BOOST_FOREACH(Hypothesis* h, sourceHypoColl) - ProcessOneHypothesis(*h); + BOOST_FOREACH(Hypothesis* h, sourceHypoColl) + ProcessOneHypothesis(*h); return true; } @@ -90,7 +90,7 @@ void SearchNormal::Decode() // go through each stack BOOST_FOREACH(HypothesisStack* hstack, m_hypoStackColl) { if (!ProcessOneStack(hstack)) return; - IFVERBOSE(2) OutputHypoStackSize(); + IFVERBOSE(2) OutputHypoStackSize(); actual_hypoStack = static_cast(hstack); } } diff --git a/moses/SearchNormal.h b/moses/SearchNormal.h index 2cba53ab8..aa005a5e1 100644 --- a/moses/SearchNormal.h +++ b/moses/SearchNormal.h @@ -22,30 +22,30 @@ class SearchNormal: public Search { protected: const InputType &m_source; - //! stacks to store hypotheses (partial translations) + //! 
stacks to store hypotheses (partial translations) // no of elements = no of words in source + 1 - std::vector < HypothesisStack* > m_hypoStackColl; + std::vector < HypothesisStack* > m_hypoStackColl; /** actual (full expanded) stack of hypotheses*/ - HypothesisStackNormal* actual_hypoStack; + HypothesisStackNormal* actual_hypoStack; /** pre-computed list of translation options for the phrases in this sentence */ - const TranslationOptionCollection &m_transOptColl; + const TranslationOptionCollection &m_transOptColl; // functions for creating hypotheses virtual bool ProcessOneStack(HypothesisStack* hstack); - virtual void + virtual void ProcessOneHypothesis(const Hypothesis &hypothesis); - virtual void + virtual void ExpandAllHypotheses(const Hypothesis &hypothesis, size_t startPos, size_t endPos); - virtual void - ExpandHypothesis(const Hypothesis &hypothesis, const TranslationOption &transOpt, - float expectedScore); + virtual void + ExpandHypothesis(const Hypothesis &hypothesis, const TranslationOption &transOpt, + float expectedScore); public: SearchNormal(Manager& manager, const InputType &source, const TranslationOptionCollection &transOptColl); diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index d8ab06e09..53b60e850 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -267,12 +267,12 @@ StaticData m_parameter->SetParameter(m_printAllDerivations , "print-all-derivations", false ); // additional output - m_parameter->SetParameter(m_detailedTranslationReportingFilePath, - "translation-details", ""); + m_parameter->SetParameter(m_detailedTranslationReportingFilePath, + "translation-details", ""); m_parameter->SetParameter(m_detailedTreeFragmentsTranslationReportingFilePath, - "tree-translation-details", ""); - m_parameter->SetParameter(m_detailedAllTranslationReportingFilePath, - "translation-all-details", ""); + "tree-translation-details", ""); + m_parameter->SetParameter(m_detailedAllTranslationReportingFilePath, + "translation-all-details", ""); m_parameter->SetParameter(m_startTranslationId, "start-translation-id", 0); //lattice samples @@ -435,12 +435,12 @@ bool StaticData::LoadData(Parameter *parameter) ini_oov_options(); // set m_nbest_options.enabled = true if necessary: - if (m_options.mbr.enabled - || m_options.mira + if (m_options.mbr.enabled + || m_options.mira || m_options.search.consensus - || m_outputSearchGraph + || m_outputSearchGraph || m_outputSearchGraphSLF - || m_outputSearchGraphHypergraph + || m_outputSearchGraphHypergraph #ifdef HAVE_PROTOBUF || m_outputSearchGraphPB #endif diff --git a/moses/StaticData.h b/moses/StaticData.h index 991bdb014..eec992cee 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -290,13 +290,13 @@ public: } AllOptions const& - options() const { - return m_options; + options() const { + return m_options; } - AllOptions& - options() { - return m_options; + AllOptions& + options() { + return m_options; } const std::vector &GetInputFactorOrder() const { @@ -367,8 +367,8 @@ public: } bool UseEarlyDiscarding() const { - return m_options.search.early_discarding_threshold - != -std::numeric_limits::infinity(); + return m_options.search.early_discarding_threshold + != -std::numeric_limits::infinity(); } bool UseEarlyDistortionCost() const { return m_options.reordering.use_early_distortion_cost; @@ -474,7 +474,7 @@ public: bool IsSyntax(SearchAlgorithm algo = DefaultSearchAlgorithm) const { if (algo == DefaultSearchAlgorithm) algo = m_options.search.algo; - + return (algo == CYKPlus || algo == ChartIncremental || algo 
== SyntaxS2T || algo == SyntaxT2S || algo == SyntaxF2S || algo == SyntaxT2S_SCFG); diff --git a/moses/Syntax/F2S/Manager-inl.h b/moses/Syntax/F2S/Manager-inl.h index 077464208..5c1b41295 100644 --- a/moses/Syntax/F2S/Manager-inl.h +++ b/moses/Syntax/F2S/Manager-inl.h @@ -254,7 +254,7 @@ void Manager::ExtractKBest( // with 0 being 'unlimited.' This actually sets a large-ish limit in case // too many translations are identical. const StaticData &staticData = StaticData::Instance(); - const std::size_t nBestFactor = staticData.options().nbest.factor; + const std::size_t nBestFactor = staticData.options().nbest.factor; std::size_t numDerivations = (nBestFactor == 0) ? k*1000 : k*nBestFactor; // Extract the derivations. diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index 3bd6e6d7d..450deeb81 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -182,7 +182,8 @@ bool TargetPhrase::HasTtaskSPtr() const return m_ttask_flag; } -const ttasksptr TargetPhrase::GetTtask() const { +const ttasksptr TargetPhrase::GetTtask() const +{ return m_ttask.lock(); } diff --git a/moses/TranslationTask.cpp b/moses/TranslationTask.cpp index 3262ee2de..df77b60c7 100644 --- a/moses/TranslationTask.cpp +++ b/moses/TranslationTask.cpp @@ -23,7 +23,7 @@ using namespace std; namespace Moses { -boost::shared_ptr > +boost::shared_ptr > TranslationTask:: GetContextWindow() const { @@ -88,7 +88,7 @@ boost::shared_ptr TranslationTask ::create(boost::shared_ptr const& source, boost::shared_ptr const& ioWrapper, - boost::shared_ptr const& scope) + boost::shared_ptr const& scope) { boost::shared_ptr ret(new TranslationTask(source, ioWrapper)); ret->m_self = ret; @@ -100,7 +100,7 @@ TranslationTask ::TranslationTask(boost::shared_ptr const& source, boost::shared_ptr const& ioWrapper) : m_source(source) , m_ioWrapper(ioWrapper) -{ +{ m_options = StaticData::Instance().options(); } diff --git a/moses/TranslationTask.h b/moses/TranslationTask.h index 987fd12ab..211ade610 100644 --- a/moses/TranslationTask.h +++ b/moses/TranslationTask.h @@ -97,7 +97,7 @@ public: boost::shared_ptr create(boost::shared_ptr const& source, boost::shared_ptr const& ioWrapper, - boost::shared_ptr const& scope); + boost::shared_ptr const& scope); ~TranslationTask(); /** Translate one sentence @@ -124,10 +124,10 @@ public: return m_scope; } - boost::shared_ptr > + boost::shared_ptr > GetContextWindow() const; - void + void SetContextWindow(boost::shared_ptr > const& cw); std::map const& GetContextWeights() const; From ac862945bbf856d10c561951823a2321b35dba30 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 8 Aug 2015 12:57:05 +0100 Subject: [PATCH 236/286] Bug fix: endl => std::endl. 
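Server.cpp used the unqualified endl, which does not resolve there
(presumably no using-directive for namespace std is in scope in that file);
qualifying it as std::endl fixes the build. A stand-alone illustration of
the failure mode, not taken from the patch:

    #include <iostream>

    void greet() {
      std::cout << "hello" << std::endl;  // fine: fully qualified
      // std::cout << "hello" << endl;    // error: 'endl' was not declared in this scope
    }

The same commit comments out an unused SentenceStats reference in
SearchNormal::Decode() and reorders the TargetPhrase constructor's
initializer list, moving m_ttask and m_ttask_flag to the front, presumably
to match the member declaration order.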
--- moses/SearchNormal.cpp | 2 +- moses/TargetPhrase.cpp | 4 ++-- moses/server/Server.cpp | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/moses/SearchNormal.cpp b/moses/SearchNormal.cpp index 52b56a30a..85acb7632 100644 --- a/moses/SearchNormal.cpp +++ b/moses/SearchNormal.cpp @@ -81,7 +81,7 @@ ProcessOneStack(HypothesisStack* hstack) */ void SearchNormal::Decode() { - SentenceStats &stats = m_manager.GetSentenceStats(); + // SentenceStats &stats = m_manager.GetSentenceStats(); // initial seed hypothesis: nothing translated, no words produced Hypothesis *hypo = Hypothesis::Create(m_manager, m_source, m_initialTransOpt); diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index 450deeb81..9f27e1be8 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -82,14 +82,14 @@ TargetPhrase::TargetPhrase(ttasksptr& ttask, std::string out_string, const Phras TargetPhrase::TargetPhrase(ttasksptr& ttask, const PhraseDictionary *pt) :Phrase() + , m_ttask(ttask) + , m_ttask_flag(true) , m_fullScore(0.0) , m_futureScore(0.0) , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo()) , m_lhsTarget(NULL) , m_ruleSource(NULL) - , m_ttask(ttask) - , m_ttask_flag(true) , m_container(pt) { } diff --git a/moses/server/Server.cpp b/moses/server/Server.cpp index fde116224..f203942c6 100644 --- a/moses/server/Server.cpp +++ b/moses/server/Server.cpp @@ -30,11 +30,11 @@ namespace MosesServer .allowOrigin("*") .maxConn(m_server_options.num_threads)); - XVERBOSE(1,"Listening on port " << m_server_options.port << endl); + XVERBOSE(1,"Listening on port " << m_server_options.port << std::endl); if (m_server_options.is_serial) { - VERBOSE(1,"Running server in serial mode." << endl); - while(true) myAbyssServer.runOnce(); + VERBOSE(1,"Running server in serial mode." << std::endl); + while(true) myAbyssServer.runOnce(); } else myAbyssServer.run(); @@ -42,7 +42,7 @@ namespace MosesServer // #pragma message("BUILDING MOSES WITH SERVER SUPPORT") #else // #pragma message("BUILDING MOSES WITHOUT SERVER SUPPORT") - std::cerr << "Moses was compiled without server support." << endl; + std::cerr << "Moses was compiled without server support." << std::endl; #endif return 1; } From 243148b8487c37e79f2a8653ae5e5f7ffb07ab39 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sat, 8 Aug 2015 16:27:56 +0400 Subject: [PATCH 237/286] compile error without xmlrpc-c lib --- moses/ExportInterface.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp index 66ff8b436..3de8387f7 100644 --- a/moses/ExportInterface.cpp +++ b/moses/ExportInterface.cpp @@ -144,8 +144,10 @@ Parameter params; int run_as_server() { +#ifdef HAVE_XMLRPC_C MosesServer::Server server(params); return server.run(); // actually: don't return. 
see Server::run() +#endif } int @@ -322,7 +324,7 @@ int decoder_main(int argc, char** argv) } if (params.GetParam("server")) - return run_as_server(); + return run_as_server(); else return batch_run(); From 0045a749806febaa3ee645aa47e89c6e3f830cef Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Sun, 9 Aug 2015 00:00:45 +0100 Subject: [PATCH 238/286] daily automatic beautifier --- moses/ExportInterface.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp index 3de8387f7..5fa56c021 100644 --- a/moses/ExportInterface.cpp +++ b/moses/ExportInterface.cpp @@ -324,7 +324,7 @@ int decoder_main(int argc, char** argv) } if (params.GetParam("server")) - return run_as_server(); + return run_as_server(); else return batch_run(); From 5f81db9745c0f26eae9960c7f4848bdd344a1678 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 9 Aug 2015 20:50:06 +0100 Subject: [PATCH 239/286] Updated Emacs code formatting instructions. --- moses/FF/LexicalReordering/LexicalReordering.h | 2 +- moses/FF/LexicalReordering/LexicalReorderingState.h | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/moses/FF/LexicalReordering/LexicalReordering.h b/moses/FF/LexicalReordering/LexicalReordering.h index fcbc2af1b..ebf5cb3ca 100644 --- a/moses/FF/LexicalReordering/LexicalReordering.h +++ b/moses/FF/LexicalReordering/LexicalReordering.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #pragma once #include diff --git a/moses/FF/LexicalReordering/LexicalReorderingState.h b/moses/FF/LexicalReordering/LexicalReorderingState.h index 19904ae32..96b226a4e 100644 --- a/moses/FF/LexicalReordering/LexicalReorderingState.h +++ b/moses/FF/LexicalReordering/LexicalReorderingState.h @@ -1,6 +1,5 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #pragma once - #include #include @@ -12,7 +11,6 @@ #include "moses/WordsBitmap.h" #include "moses/TranslationOption.h" #include "moses/FF/FFState.h" - #include "ReorderingStack.h" namespace Moses From f20c4cbbc0363d01291b4cb70c43c19b7e1893b0 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 9 Aug 2015 20:51:21 +0100 Subject: [PATCH 240/286] Namespace refactoring in mmsapt. 
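The mmsapt code under moses/TranslationModel/UG moves out of the
Moses::bitext namespace into a namespace of its own, sapt, and the tools
built on top of it swap their using-directives accordingly (see
bitext-find.cc and count-ptable-features.cc below). The commit also touches
the Emacs mode lines and re-indents NBestList.h along the way. The shape of
the change, with a made-up class name standing in for everything that moved:

    // Simplified illustration, not code from the patch.
    namespace sapt {                        // formerly under Moses::bitext
      struct SomeBitextClass { /* ... */ }; // placeholder for the real classes
    }

    // a client such as bitext-find.cc:
    using namespace sapt;                   // was: using namespace Moses::bitext;

    int main() {
      SomeBitextClass x;                    // now found in sapt
      (void)x;
      return 0;
    }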
---
 moses/TranslationModel/UG/bitext-find.cc | 3 +-
 .../UG/count-ptable-features.cc | 2 +-
 .../generic/program_options/ug_get_options.h | 2 +-
 .../UG/generic/sorting/NBestList.h | 154 +-
 moses/TranslationModel/UG/mm/calc-coverage.cc | 2 +-
 moses/TranslationModel/UG/mm/mam2symal.cc | 8 +-
 moses/TranslationModel/UG/mm/mam_verify.cc | 5 +-
 moses/TranslationModel/UG/mm/mmlex-build.cc | 27 +-
 moses/TranslationModel/UG/mm/mmlex-lookup.cc | 1 +
 moses/TranslationModel/UG/mm/mtt-build.cc | 25 +-
 .../TranslationModel/UG/mm/mtt-count-words.cc | 2 +
 moses/TranslationModel/UG/mm/mtt-demo1.cc | 4 +-
 moses/TranslationModel/UG/mm/mtt-dump.cc | 4 +-
 .../TranslationModel/UG/mm/num_read_write.cc | 2 +-
 moses/TranslationModel/UG/mm/num_read_write.h | 61 +-
 moses/TranslationModel/UG/mm/symal2mam.cc | 29 +-
 moses/TranslationModel/UG/mm/tpt_pickler.cc | 9 +-
 moses/TranslationModel/UG/mm/tpt_pickler.h | 7 +-
 .../TranslationModel/UG/mm/tpt_tightindex.cc | 43 +-
 moses/TranslationModel/UG/mm/tpt_tightindex.h | 4 +-
 .../TranslationModel/UG/mm/tpt_tokenindex.cc | 5 +-
 moses/TranslationModel/UG/mm/tpt_tokenindex.h | 14 +-
 moses/TranslationModel/UG/mm/tpt_typedefs.h | 4 +-
 moses/TranslationModel/UG/mm/ug_bitext.cc | 330 +++--
 moses/TranslationModel/UG/mm/ug_bitext.h | 1260 ++++++++---------
 .../TranslationModel/UG/mm/ug_bitext_agenda.h | 20 +-
 .../UG/mm/ug_bitext_agenda_job.h | 18 +-
 .../UG/mm/ug_bitext_agenda_worker.h | 1 +
 .../UG/mm/ug_bitext_jstats.cc | 166 ++-
 .../TranslationModel/UG/mm/ug_bitext_jstats.h | 84 +-
 .../TranslationModel/UG/mm/ug_bitext_moses.h | 6 +-
 .../mm/ug_bitext_phrase_extraction_record.h | 35 +-
 .../UG/mm/ug_bitext_pstats.cc | 179 ++-
 .../TranslationModel/UG/mm/ug_bitext_pstats.h | 88 +-
 .../UG/mm/ug_bitext_sampler.h | 31 +-
 .../UG/mm/ug_conll_bottom_up_token.h | 4 +-
 .../TranslationModel/UG/mm/ug_conll_record.cc | 3 +-
 .../TranslationModel/UG/mm/ug_conll_record.h | 8 +-
 .../TranslationModel/UG/mm/ug_corpus_token.cc | 6 +-
 .../TranslationModel/UG/mm/ug_corpus_token.h | 12 +-
 moses/TranslationModel/UG/mm/ug_deptree.cc | 3 +-
 moses/TranslationModel/UG/mm/ug_deptree.h | 4 +-
 moses/TranslationModel/UG/mm/ug_im_bitext.cc | 153 +-
 moses/TranslationModel/UG/mm/ug_im_bitext.h | 232 ++-
 moses/TranslationModel/UG/mm/ug_im_tsa.h | 28 +-
 moses/TranslationModel/UG/mm/ug_im_ttrack.h | 6 +-
 .../UG/mm/ug_lexical_phrase_scorer2.h | 18 +-
 .../UG/mm/ug_lexical_reordering.cc | 258 ++--
 .../UG/mm/ug_lexical_reordering.h | 22 +-
 moses/TranslationModel/UG/mm/ug_mm_2d_table.h | 10 +-
 moses/TranslationModel/UG/mm/ug_mm_bitext.h | 142 +-
 moses/TranslationModel/UG/mm/ug_mm_tsa.h | 24 +-
 moses/TranslationModel/UG/mm/ug_mm_ttrack.h | 37 +-
 moses/TranslationModel/UG/mm/ug_phrasepair.cc | 7 +-
 moses/TranslationModel/UG/mm/ug_phrasepair.h | 600 ++++----
 .../TranslationModel/UG/mm/ug_prep_phrases.h | 4 +-
 .../UG/mm/ug_sampling_bias.cc | 442 +++---
 .../TranslationModel/UG/mm/ug_sampling_bias.h | 164 ++-
 .../UG/mm/ug_tsa_array_entry.cc | 3 +-
 .../UG/mm/ug_tsa_array_entry.h | 51 +-
 moses/TranslationModel/UG/mm/ug_tsa_base.h | 4 +-
 .../UG/mm/ug_tsa_bitset_cache.h | 4 +-
 .../UG/mm/ug_tsa_tree_iterator.h | 4 +-
 .../TranslationModel/UG/mm/ug_ttrack_base.cc | 3 +-
 moses/TranslationModel/UG/mm/ug_ttrack_base.h | 15 +-
 .../UG/mm/ug_ttrack_position.cc | 5 +-
 .../UG/mm/ug_ttrack_position.h | 5 +-
 moses/TranslationModel/UG/mm/ug_typedefs.h | 8 +-
 moses/TranslationModel/UG/mmsapt.cpp | 43 +-
 moses/TranslationModel/UG/mmsapt.h | 53 +-
 moses/TranslationModel/UG/mmsapt_align.cc | 2 +-
 .../UG/ptable-describe-features.cc | 3 +-
moses/TranslationModel/UG/ptable-lookup.cc | 2 +- .../TranslationModel/UG/sapt_phrase_scorers.h | 2 +- moses/TranslationModel/UG/sapt_pscore_base.h | 37 +- .../UG/sapt_pscore_coherence.h | 43 +- .../UG/sapt_pscore_cumulative_bias.h | 52 +- .../UG/sapt_pscore_length_ratio.h | 101 +- moses/TranslationModel/UG/sapt_pscore_lex1.h | 116 +- .../TranslationModel/UG/sapt_pscore_logcnt.h | 101 +- moses/TranslationModel/UG/sapt_pscore_pbwd.h | 93 +- moses/TranslationModel/UG/sapt_pscore_pfwd.h | 115 +- .../UG/sapt_pscore_phrasecount.h | 43 +- .../UG/sapt_pscore_provenance.h | 62 +- .../UG/sapt_pscore_rareness.h | 53 +- .../UG/sapt_pscore_unaligned.h | 109 +- .../UG/sapt_pscore_wordcount.h | 43 +- moses/TranslationModel/UG/sim-pe.cc | 2 +- .../TranslationModel/UG/spe-check-coverage.cc | 2 +- .../UG/spe-check-coverage2.cc | 2 +- .../UG/spe-check-coverage3.cc | 2 +- moses/TranslationModel/UG/try-align.cc | 56 +- 92 files changed, 2949 insertions(+), 3112 deletions(-) diff --git a/moses/TranslationModel/UG/bitext-find.cc b/moses/TranslationModel/UG/bitext-find.cc index 0e94464ba..d02ce0710 100644 --- a/moses/TranslationModel/UG/bitext-find.cc +++ b/moses/TranslationModel/UG/bitext-find.cc @@ -4,7 +4,8 @@ using namespace std; using namespace Moses; -using namespace Moses::bitext; +using namespace sapt; + namespace po=boost::program_options; typedef L2R_Token Token; typedef mmBitext mmbitext; diff --git a/moses/TranslationModel/UG/count-ptable-features.cc b/moses/TranslationModel/UG/count-ptable-features.cc index 4c9022075..bb5d263e7 100644 --- a/moses/TranslationModel/UG/count-ptable-features.cc +++ b/moses/TranslationModel/UG/count-ptable-features.cc @@ -8,7 +8,7 @@ #include using namespace Moses; -using namespace bitext; +using namespace sapt; using namespace std; using namespace boost; diff --git a/moses/TranslationModel/UG/generic/program_options/ug_get_options.h b/moses/TranslationModel/UG/generic/program_options/ug_get_options.h index 636b11302..aeb41b254 100644 --- a/moses/TranslationModel/UG/generic/program_options/ug_get_options.h +++ b/moses/TranslationModel/UG/generic/program_options/ug_get_options.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // (c) 2009 Ulrich Germann // boilerplate code to declutter my usual interpret_args() routine #ifndef __ug_get_options_h diff --git a/moses/TranslationModel/UG/generic/sorting/NBestList.h b/moses/TranslationModel/UG/generic/sorting/NBestList.h index c9490729f..12fd57900 100644 --- a/moses/TranslationModel/UG/generic/sorting/NBestList.h +++ b/moses/TranslationModel/UG/generic/sorting/NBestList.h @@ -12,86 +12,90 @@ namespace Moses { -using namespace std; -template -class + template + class NBestList -{ - vector m_heap; - vector m_list; - VectorIndexSorter m_better; - mutable vector m_order; - mutable bool m_changed; -public: - NBestList(size_t const max_size, CMP const& cmp); - NBestList(size_t const max_size); - bool add(THINGY const& item); - THINGY const& operator[](int i) const; - THINGY const& get_unsorted(int i) const; - size_t size() const { - return m_heap.size(); + { + vector m_heap; + vector m_list; + VectorIndexSorter m_better; + mutable vector m_order; + mutable bool m_changed; + public: + NBestList(size_t const max_size, CMP const& cmp); + NBestList(size_t const max_size); + bool add(THINGY const& item); + THINGY const& operator[](int i) const; + THINGY const& get_unsorted(int i) const; + size_t size() const { + return m_heap.size(); + } + }; + + template + NBestList:: + NBestList(size_t const 
max_size, CMP const& cmp) + : m_better(m_list, cmp), m_changed(false) + { + m_heap.reserve(max_size); } -}; - -template -NBestList:: -NBestList(size_t const max_size, CMP const& cmp) - : m_better(m_list, cmp), m_changed(false) -{ - m_heap.reserve(max_size); -} - -template -NBestList:: -NBestList(size_t const max_size) - : m_better(m_heap), m_changed(false) -{ - m_heap.reserve(max_size); -} - -template -bool -NBestList:: -add(THINGY const& item) -{ - if (m_heap.size() == m_heap.capacity()) { - if (m_better.Compare(item, m_list[m_heap.at(0)])) { - pop_heap(m_heap.begin(),m_heap.end(),m_better); - m_list[m_heap.back()] = item; - } else return false; - } else { - m_list.push_back(item); - m_heap.push_back(m_heap.size()); + + template + NBestList:: + NBestList(size_t const max_size) + : m_better(m_heap), m_changed(false) + { + m_heap.reserve(max_size); } - push_heap(m_heap.begin(),m_heap.end(),m_better); - return m_changed = true; -} - -template -THINGY const& -NBestList:: -operator[](int i) const -{ - if (m_changed) { - m_order.assign(m_heap.begin(),m_heap.end()); - for (size_t k = m_heap.size(); k != 0; --k) - pop_heap(m_order.begin(), m_order.begin()+k,m_better); - m_changed = false; + + template + bool + NBestList:: + add(THINGY const& item) + { + if (m_heap.size() == m_heap.capacity()) + { + if (m_better.Compare(item, m_list[m_heap.at(0)])) + { + pop_heap(m_heap.begin(),m_heap.end(),m_better); + m_list[m_heap.back()] = item; + } + else return false; + } + else + { + m_list.push_back(item); + m_heap.push_back(m_heap.size()); + } + push_heap(m_heap.begin(),m_heap.end(),m_better); + return m_changed = true; } - if (i < 0) i += m_order.size(); - return m_list[m_order.at(i)]; -} - -template -THINGY const& -NBestList:: -get_unsorted(int i) const -{ - if (i < 0) i += m_heap.size(); - return m_list[m_heap.at(i)]; -} - + template + THINGY const& + NBestList:: + operator[](int i) const + { + if (m_changed) + { + m_order.assign(m_heap.begin(),m_heap.end()); + for (size_t k = m_heap.size(); k != 0; --k) + pop_heap(m_order.begin(), m_order.begin()+k,m_better); + m_changed = false; + } + if (i < 0) i += m_order.size(); + return m_list[m_order.at(i)]; + } + + template + THINGY const& + NBestList:: + get_unsorted(int i) const + { + if (i < 0) i += m_heap.size(); + return m_list[m_heap.at(i)]; + } + } #endif diff --git a/moses/TranslationModel/UG/mm/calc-coverage.cc b/moses/TranslationModel/UG/mm/calc-coverage.cc index 4f02909b7..43ce60fde 100644 --- a/moses/TranslationModel/UG/mm/calc-coverage.cc +++ b/moses/TranslationModel/UG/mm/calc-coverage.cc @@ -11,7 +11,7 @@ // using namespace Moses; using namespace ugdiss; - +using namespace sapt; typedef L2R_Token Token; TokenIndex V; diff --git a/moses/TranslationModel/UG/mm/mam2symal.cc b/moses/TranslationModel/UG/mm/mam2symal.cc index eb5034aab..1bf1b7235 100644 --- a/moses/TranslationModel/UG/mm/mam2symal.cc +++ b/moses/TranslationModel/UG/mm/mam2symal.cc @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // (c) 2008-2010 Ulrich Germann #include #include @@ -11,7 +11,7 @@ #include "tpt_pickler.h" using namespace std; -using namespace ugdiss; +using namespace sapt; namespace po = boost::program_options; string mamfile; @@ -68,8 +68,8 @@ printRangeMAM(size_t start, size_t stop) ushort s,t; while (p < q) { - p = binread(p,s); - p = binread(p,t); + p = tpt::binread(p,s); + p = tpt::binread(p,t); cout << s << "-" << t << " "; } cout << endl; diff --git a/moses/TranslationModel/UG/mm/mam_verify.cc 
b/moses/TranslationModel/UG/mm/mam_verify.cc index 798baa947..f14cee213 100644 --- a/moses/TranslationModel/UG/mm/mam_verify.cc +++ b/moses/TranslationModel/UG/mm/mam_verify.cc @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // (c) 2008-2010 Ulrich Germann #include #include @@ -11,7 +11,8 @@ #include "tpt_pickler.h" using namespace std; -using namespace ugdiss; +using namespace tpt; +using namespace sapt; namespace po = boost::program_options; typedef L2R_Token Token; diff --git a/moses/TranslationModel/UG/mm/mmlex-build.cc b/moses/TranslationModel/UG/mm/mmlex-build.cc index d79938de3..0a79c5994 100644 --- a/moses/TranslationModel/UG/mm/mmlex-build.cc +++ b/moses/TranslationModel/UG/mm/mmlex-build.cc @@ -30,6 +30,7 @@ #include "ug_corpus_token.h" using namespace std; +using namespace sapt; using namespace ugdiss; using namespace boost::math; @@ -119,9 +120,9 @@ void writeTableHeader(ostream& out) { filepos_type idxOffset=0; - numwrite(out,idxOffset); // blank for the time being - numwrite(out,id_type(V1.ksize())); - numwrite(out,id_type(V2.ksize())); + tpt::numwrite(out,idxOffset); // blank for the time being + tpt::numwrite(out,id_type(V1.ksize())); + tpt::numwrite(out,id_type(V2.ksize())); } void writeTable(ostream* aln_out, ostream* coc_out) @@ -174,16 +175,16 @@ void writeTable(ostream* aln_out, ostream* coc_out) if (aln_out) { ++CellCountA; - numwrite(*aln_out,id2); - numwrite(*aln_out,aln); + tpt::numwrite(*aln_out,id2); + tpt::numwrite(*aln_out,aln); m1a[id1] += aln; m2a[id2] += aln; } if (coc_out && coc) { ++CellCountC; - numwrite(*coc_out,id2); - numwrite(*coc_out,coc); + tpt::numwrite(*coc_out,id2); + tpt::numwrite(*coc_out,coc); m1c[id1] += coc; m2c[id2] += coc; } @@ -195,21 +196,21 @@ void writeTable(ostream* aln_out, ostream* coc_out) { filepos_type idxOffsetA = aln_out->tellp(); BOOST_FOREACH(id_type foo, idxa) - numwrite(*aln_out,foo); + tpt::numwrite(*aln_out,foo); aln_out->write(reinterpret_cast(&m1a[0]),m1a.size()*4); aln_out->write(reinterpret_cast(&m2a[0]),m2a.size()*4); aln_out->seekp(0); - numwrite(*aln_out,idxOffsetA); + tpt::numwrite(*aln_out,idxOffsetA); } if (coc_out) { filepos_type idxOffsetC = coc_out->tellp(); BOOST_FOREACH(id_type foo, idxc) - numwrite(*coc_out,foo); + tpt::numwrite(*coc_out,foo); coc_out->write(reinterpret_cast(&m1c[0]),m1c.size()*4); coc_out->write(reinterpret_cast(&m2c[0]),m2c.size()*4); coc_out->seekp(0); - numwrite(*coc_out,idxOffsetC); + tpt::numwrite(*coc_out,idxOffsetC); } } @@ -240,8 +241,8 @@ processSentence(id_type sid) cerr << sid/1000000 << " M sentences processed" << endl; while (p < q) { - p = binread(p,r); - p = binread(p,c); + p = tpt::binread(p,r); + p = tpt::binread(p,c); // cout << sid << " " << r << "-" << c << endl; UTIL_THROW_IF2(r >= check1.size(), "out of bounds at line " << sid); UTIL_THROW_IF2(c >= check2.size(), "out of bounds at line " << sid); diff --git a/moses/TranslationModel/UG/mm/mmlex-lookup.cc b/moses/TranslationModel/UG/mm/mmlex-lookup.cc index 3ba9ef492..f7cc019e3 100644 --- a/moses/TranslationModel/UG/mm/mmlex-lookup.cc +++ b/moses/TranslationModel/UG/mm/mmlex-lookup.cc @@ -29,6 +29,7 @@ #include "ug_corpus_token.h" using namespace std; +using namespace sapt; using namespace ugdiss; using namespace boost::math; diff --git a/moses/TranslationModel/UG/mm/mtt-build.cc b/moses/TranslationModel/UG/mm/mtt-build.cc index a61cbac3f..61f5e4c98 100644 --- a/moses/TranslationModel/UG/mm/mtt-build.cc +++ b/moses/TranslationModel/UG/mm/mtt-build.cc @@ -30,7 +30,7 @@ #include 
"moses/TranslationModel/UG/mm/ug_im_tsa.h" using namespace std; -using namespace ugdiss; +using namespace sapt; using namespace Moses; using namespace boost; using namespace boost::algorithm; @@ -178,7 +178,7 @@ process_plain_input(ostream& out, vector & s_index) s_index.push_back(totalWords); while (buf>>w) { - numwrite(out,get_id(SF,w)); + tpt::numwrite(out,get_id(SF,w)); ++totalWords; } } @@ -226,9 +226,9 @@ numberize() ofstream out(tmpFile.c_str()); filepos_type startIdx=0; id_type idxSize=0,totalWords=0; - numwrite(out,startIdx); // place holder, to be filled at the end - numwrite(out,idxSize); // place holder, to be filled at the end - numwrite(out,totalWords); // place holder, to be filled at the end + tpt::numwrite(out,startIdx); // place holder, to be filled at the end + tpt::numwrite(out,idxSize); // place holder, to be filled at the end + tpt::numwrite(out,totalWords); // place holder, to be filled at the end vector s_index, p_index; @@ -248,12 +248,13 @@ numberize() cerr << endl << "Writing index ... (" << index->size() << " chunks) "; startIdx = out.tellp(); - for (size_t i = 0; i < index->size(); i++) numwrite(out,(*index)[i]); + for (size_t i = 0; i < index->size(); i++) + tpt::numwrite(out,(*index)[i]); out.seekp(0); idxSize = index->size(); - numwrite(out, startIdx); - numwrite(out, idxSize - 1); - numwrite(out, totalWords); + tpt::numwrite(out, startIdx); + tpt::numwrite(out, idxSize - 1); + tpt::numwrite(out, totalWords); out.close(); if (!quiet) cerr << "done" << endl; return totalWords; @@ -291,9 +292,9 @@ void remap() id_type totalWords, idxSize; boost::iostreams::mapped_file mtt(tmpFile); char const* p = mtt.data(); - p = numread(p,idxOffset); - p = numread(p,idxSize); - p = numread(p,totalWords); + p = tpt::numread(p,idxOffset); + p = tpt::numread(p,idxSize); + p = tpt::numread(p,totalWords); if (is_conll) { vector sf(SF.totalVocabSize(), 0); diff --git a/moses/TranslationModel/UG/mm/mtt-count-words.cc b/moses/TranslationModel/UG/mm/mtt-count-words.cc index 223ba2090..c28f6876e 100644 --- a/moses/TranslationModel/UG/mm/mtt-count-words.cc +++ b/moses/TranslationModel/UG/mm/mtt-count-words.cc @@ -1,3 +1,4 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // count words in a memory-mapped corpus #include "ug_mm_ttrack.h" #include "tpt_tokenindex.h" @@ -17,6 +18,7 @@ #include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h" using namespace std; +using namespace sapt; using namespace ugdiss; using namespace Moses; typedef L2R_Token Token; diff --git a/moses/TranslationModel/UG/mm/mtt-demo1.cc b/moses/TranslationModel/UG/mm/mtt-demo1.cc index d3506fa0f..a30bc3980 100644 --- a/moses/TranslationModel/UG/mm/mtt-demo1.cc +++ b/moses/TranslationModel/UG/mm/mtt-demo1.cc @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // Demo program for use of single-track suffix arrays #include @@ -14,7 +14,7 @@ using namespace Moses; using namespace std; using namespace boost; -using namespace ugdiss; +using namespace sapt; typedef L2R_Token < SimpleWordId > Token; int main(int argc, char* argv[]) { diff --git a/moses/TranslationModel/UG/mm/mtt-dump.cc b/moses/TranslationModel/UG/mm/mtt-dump.cc index eea1bb400..b1d68e702 100644 --- a/moses/TranslationModel/UG/mm/mtt-dump.cc +++ b/moses/TranslationModel/UG/mm/mtt-dump.cc @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // (c) 2008-2010 Ulrich Germann #include #include @@ -10,7 +10,7 @@ #include "ug_corpus_token.h" using namespace 
std; -using namespace ugdiss; +using namespace sapt; namespace po = boost::program_options; string bname,mtt,mct; diff --git a/moses/TranslationModel/UG/mm/num_read_write.cc b/moses/TranslationModel/UG/mm/num_read_write.cc index 5c281d9dd..5ca67e577 100644 --- a/moses/TranslationModel/UG/mm/num_read_write.cc +++ b/moses/TranslationModel/UG/mm/num_read_write.cc @@ -1,5 +1,5 @@ #include "num_read_write.h" -namespace ugdiss { +namespace tpt { typedef unsigned char uchar; void diff --git a/moses/TranslationModel/UG/mm/num_read_write.h b/moses/TranslationModel/UG/mm/num_read_write.h index f83e1c982..0b058fb8a 100644 --- a/moses/TranslationModel/UG/mm/num_read_write.h +++ b/moses/TranslationModel/UG/mm/num_read_write.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // (c) 2006,2007,2008 Ulrich Germann // #ifndef __num_read_write_hh // #define __num_read_write_hh @@ -9,7 +9,7 @@ // #include // #include "tpt_typedefs.h" -namespace ugdiss { +namespace tpt { void numwrite(std::ostream& out, uint16_t const& x); void numwrite(std::ostream& out, uint32_t const& x); @@ -19,60 +19,5 @@ namespace ugdiss { char const* numread(char const* src, uint32_t & x); char const* numread(char const* src, uint64_t & x); -// template -// void -// numwrite(std::ostream& out, uintNumber const& x) -// { -// uchar const* c = reinterpret_cast(&x); -// for (size_t i = 0; i < sizeof(x); ++i) -// out.write(c -// #if __BYTE_ORDER == __BIG_ENDIAN -// uintNumber y; -// switch (sizeof(uintNumber)) -// { -// case 2: y = bswap_16(x); break; -// case 4: y = bswap_32(x); break; -// case 8: y = bswap_64(x); break; -// default: y = x; -// } -// out.write(reinterpret_cast(&y),sizeof(y)); -// #else -// out.write(reinterpret_cast(&x),sizeof(x)); -// #endif -// } - -// template -// void -// numread(std::istream& in, uintNumber& x) -// { -// in.read(reinterpret_cast(&x),sizeof(uintNumber)); -// #if __BYTE_ORDER == __BIG_ENDIAN -// switch (sizeof(uintNumber)) -// { -// case 2: x = bswap_16(x); break; -// case 4: x = bswap_32(x); break; -// case 8: x = bswap_64(x); break; -// default: break; -// } -// #endif -// } - -// template -// char const* -// numread(char const* src, uintNumber& x) -// { -// // ATTENTION: THIS NEEDS TO BE VERIFIED FOR BIG-ENDIAN MACHINES!!! 
-// x = *reinterpret_cast(src); -// #if __BYTE_ORDER == __BIG_ENDIAN -// switch (sizeof(uintNumber)) -// { -// case 2: x = bswap_16(x); break; -// case 4: x = bswap_32(x); break; -// case 8: x = bswap_64(x); break; -// default: break; -// } -// #endif -// return src+sizeof(uintNumber); -// } } // end of namespace ugdiss -//#endif + diff --git a/moses/TranslationModel/UG/mm/symal2mam.cc b/moses/TranslationModel/UG/mm/symal2mam.cc index 6d0af57b0..a3ae87fb7 100644 --- a/moses/TranslationModel/UG/mm/symal2mam.cc +++ b/moses/TranslationModel/UG/mm/symal2mam.cc @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // program to convert GIZA-style alignments into memory-mapped format // (c) 2010 Ulrich Germann @@ -31,6 +31,7 @@ using namespace std; using namespace ugdiss; +using namespace sapt; ofstream t1out,t2out,mam; int len1=0,len2=0; @@ -119,8 +120,8 @@ procSymalLine(string const& line, ostream& out) } assert(len1 == 0 || a& idx, id_type numTok) id_type offset = sizeof(filepos_type)+2*sizeof(id_type); filepos_type idxStart = out.tellp(); for (vector::iterator i = idx.begin(); i != idx.end(); ++i) - numwrite(out,*i-offset); + tpt::numwrite(out,*i-offset); out.seekp(0); - numwrite(out,idxStart); - numwrite(out,id_type(idx.size()-1)); - numwrite(out,numTok); + tpt::numwrite(out,idxStart); + tpt::numwrite(out,id_type(idx.size()-1)); + tpt::numwrite(out,numTok); out.close(); } @@ -144,11 +145,11 @@ finalize(ofstream& out, vector const& idx, id_type tokenCount) id_type idxSize = idx.size(); filepos_type idxStart = out.tellp(); for (size_t i = 0; i < idx.size(); ++i) - numwrite(out,idx[i]); + tpt::numwrite(out,idx[i]); out.seekp(0); - numwrite(out,idxStart); - numwrite(out,idxSize-1); - numwrite(out,tokenCount); + tpt::numwrite(out,idxStart); + tpt::numwrite(out,idxSize-1); + tpt::numwrite(out,tokenCount); out.close(); } @@ -268,9 +269,9 @@ void initialize(ofstream& out, string const& fname) { out.open(fname.c_str()); - numwrite(out,filepos_type(0)); // place holder for index start - numwrite(out,id_type(0)); // place holder for index size - numwrite(out,id_type(0)); // place holder for token count + tpt::numwrite(out,filepos_type(0)); // place holder for index start + tpt::numwrite(out,id_type(0)); // place holder for index size + tpt::numwrite(out,id_type(0)); // place holder for token count } int main(int argc, char* argv[]) diff --git a/moses/TranslationModel/UG/mm/tpt_pickler.cc b/moses/TranslationModel/UG/mm/tpt_pickler.cc index 353e5b901..9fbf4388c 100644 --- a/moses/TranslationModel/UG/mm/tpt_pickler.cc +++ b/moses/TranslationModel/UG/mm/tpt_pickler.cc @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // (c) 2006,2007,2008 Ulrich Germann #include "tpt_pickler.h" @@ -9,9 +9,8 @@ #define stat64 stat #endif -namespace ugdiss +namespace tpt { - using namespace std; uint64_t getFileSize(const std::string& fname) @@ -235,7 +234,7 @@ namespace ugdiss binwrite(std::ostream& out, std::string const& s) { size_t len = s.size(); - ugdiss::binwrite(out,len); + binwrite(out,len); out.write(s.c_str(),len); } @@ -243,7 +242,7 @@ namespace ugdiss binread(std::istream& in, std::string& s) { size_t len; - ugdiss::binread(in,len); + binread(in,len); if (!in) return; char buf[len+1]; in.read(buf,len); diff --git a/moses/TranslationModel/UG/mm/tpt_pickler.h b/moses/TranslationModel/UG/mm/tpt_pickler.h index 5ae033151..5a42ced01 100644 --- a/moses/TranslationModel/UG/mm/tpt_pickler.h +++ b/moses/TranslationModel/UG/mm/tpt_pickler.h @@ 
-1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // (c) 2006,2007,2008 Ulrich Germann #ifndef __Pickler #define __Pickler @@ -11,7 +11,7 @@ #include "num_read_write.h" #include -namespace ugdiss +namespace tpt { /// Utility method placed here for lack of a better place /// @return the size of file fname. @@ -209,6 +209,5 @@ namespace ugdiss return binread(p,*buf); } - -} // end namespace ugdiss +} // end namespace sapt #endif diff --git a/moses/TranslationModel/UG/mm/tpt_tightindex.cc b/moses/TranslationModel/UG/mm/tpt_tightindex.cc index 72cf0c183..984399592 100644 --- a/moses/TranslationModel/UG/mm/tpt_tightindex.cc +++ b/moses/TranslationModel/UG/mm/tpt_tightindex.cc @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // (c) 2007,2008 Ulrich Germann /* Functions for writing indices tightly (use only the bytes you need). @@ -20,48 +20,9 @@ #include #include "tpt_tightindex.h" -namespace ugdiss +namespace tpt { -// std::string bitpattern(unsigned int s) -// { -// std::ostringstream out; -// size_t bit=1; -// for (size_t i = 31; i > 0; i--) -// out << (s&(bit< #include @@ -9,9 +9,10 @@ #include #include "tpt_tokenindex.h" +#include "ug_typedefs.h" using namespace std; -namespace ugdiss +namespace sapt { TokenIndex:: diff --git a/moses/TranslationModel/UG/mm/tpt_tokenindex.h b/moses/TranslationModel/UG/mm/tpt_tokenindex.h index 2642bdd2f..dac196e04 100644 --- a/moses/TranslationModel/UG/mm/tpt_tokenindex.h +++ b/moses/TranslationModel/UG/mm/tpt_tokenindex.h @@ -1,8 +1,9 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // TO DO (12.01.2011): // -// - Vocab items should be stored in order of ids, so that we can determine their length -// by taking computing V[id+1] - V[id] instead of using strlen. +// - Vocab items should be stored in order of ids, so that we can +// determine their length by taking computing V[id+1] - V[id] +// instead of using strlen. // // (c) 2007,2008 Ulrich Germann @@ -23,10 +24,11 @@ // // using namespace std; namespace bio=boost::iostreams; -namespace ugdiss +namespace sapt { class TokenIndex { + typedef tpt::id_type id_type; /** Reverse index: maps from ID to char const* */ mutable std::vector ridx; /** Label for the UNK token */ @@ -38,7 +40,7 @@ namespace ugdiss // NEW 2011-01-30: dynamic adding of unknown items bool dynamic; // dynamically assign a new word id to unknown items? 
- boost::shared_ptr > str2idExtra; + boost::shared_ptr > str2idExtra; boost::shared_ptr > newWords; // The use of pointers to external items is a bit of a bad hack // in terms of the semantic of TokenIndex const: since external items @@ -53,7 +55,7 @@ namespace ugdiss { public: uint32_t offset; - id_type id; + id_type id; }; /** Comparison function object used for Entry instances */ diff --git a/moses/TranslationModel/UG/mm/tpt_typedefs.h b/moses/TranslationModel/UG/mm/tpt_typedefs.h index d2d2932de..092a62ceb 100644 --- a/moses/TranslationModel/UG/mm/tpt_typedefs.h +++ b/moses/TranslationModel/UG/mm/tpt_typedefs.h @@ -1,11 +1,11 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // Basic type definitions for code related to tightly packed tries // (c) 2006-2012 Ulrich Germann #ifndef __tpt_typedefs_h #define __tpt_typedefs_h #include -namespace ugdiss +namespace tpt { typedef uint32_t id_type; typedef uint32_t count_type; diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc index fb75877ed..0857cc21f 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext.cc @@ -1,177 +1,171 @@ -//-*- c++ -*- +//-*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #include "ug_bitext.h" #include #include -using namespace ugdiss; -using namespace std; -namespace Moses +namespace sapt { - namespace bitext + + float + lbop(size_t const tries, size_t const succ, float const confidence) { - - float - lbop(size_t const tries, size_t const succ, float const confidence) - { - return (confidence == 0 - ? float(succ)/tries - : (boost::math::binomial_distribution<>:: - find_lower_bound_on_p(tries, succ, confidence))); - } - - - // template<> - void - snt_adder >:: - operator()() - { - typedef L2R_Token tkn; - vector sids; sids.reserve(snt.size()); - BOOST_FOREACH(string const& foo, snt) - { - sids.push_back(track ? 
track->size() : 0); - istringstream buf(foo); - string w; - vector s; s.reserve(100); - while (buf >> w) s.push_back(tkn(V[w])); - track = append(track,s); - } - if (index) - index.reset(new imTSA(*index,track,sids,V.tsize())); - else - index.reset(new imTSA(track,NULL,NULL)); - } - - snt_adder >:: - snt_adder(vector const& s, TokenIndex& v, - SPTR > >& t, - SPTR > >& i) - : snt(s), V(v), track(t), index(i) - { } - - bool - expand_phrase_pair - (vector >& a1, - vector >& a2, - ushort const s2, // next word on in target side - ushort const L1, ushort const R1, // limits of previous phrase - ushort & s1, ushort & e1, ushort& e2) // start/end src; end trg - { - if (a2[s2].size() == 0) - { - cout << __FILE__ << ":" << __LINE__ << endl; - return false; - } - bitvector done1(a1.size()); - bitvector done2(a2.size()); - vector > agenda; - // x.first: side (1 or 2) - // x.second: word position - agenda.reserve(a1.size() + a2.size()); - agenda.push_back(pair(2,s2)); - e2 = s2; - s1 = e1 = a2[s2].front(); - if (s1 >= L1 && s1 < R1) - { - cout << __FILE__ << ":" << __LINE__ << endl; - return false; - } - agenda.push_back(pair(2,s2)); - while (agenda.size()) - { - ushort side = agenda.back().first; - ushort p = agenda.back().second; - agenda.pop_back(); - if (side == 1) - { - done1.set(p); - BOOST_FOREACH(ushort i, a1[p]) - { - if (i < s2) - { - // cout << __FILE__ << ":" << __LINE__ << endl; - return false; - } - if (done2[i]) continue; - for (;e2 <= i;++e2) - if (!done2[e2]) - agenda.push_back(pair(2,e2)); - } - } - else - { - done2.set(p); - BOOST_FOREACH(ushort i, a2[p]) - { - if ((e1 < L1 && i >= L1) || - (s1 >= R1 && i < R1) || - (i >= L1 && i < R1)) - { - // cout << __FILE__ << ":" << __LINE__ << " " - // << L1 << "-" << R1 << " " << i << " " - // << s1 << "-" << e1<< endl; - return false; - } - - if (e1 < i) - { - for (; e1 <= i; ++e1) - if (!done1[e1]) - agenda.push_back(pair(1,e1)); - } - else if (s1 > i) - { - for (; i <= s1; ++i) - if (!done1[i]) - agenda.push_back(pair(1,i)); - } - } - } - } - ++e1; - ++e2; - return true; - } - - void - print_amatrix(vector > a1, uint32_t len2, - ushort b1, ushort e1, ushort b2, ushort e2) - { - vector M(a1.size(),bitvector(len2)); - for (ushort j = 0; j < a1.size(); ++j) - { - BOOST_FOREACH(ushort k, a1[j]) - M[j].set(k); - } - cout << b1 << "-" << e1 << " " << b2 << "-" << e2 << endl; - cout << " "; - for (size_t c = 0; c < len2;++c) - cout << c%10; - cout << endl; - for (size_t r = 0; r < M.size(); ++r) - { - cout << setw(3) << r << " "; - for (size_t c = 0; c < M[r].size(); ++c) - { - if ((b1 <= r) && (r < e1) && b2 <= c && c < e2) - cout << (M[r][c] ? 'x' : '-'); - else cout << (M[r][c] ? 'o' : '.'); - } - cout << endl; - } - cout << string(90,'-') << endl; - } - - void - write_bitvector(bitvector const& v, ostream& out) - { - for (size_t i = v.find_first(); i < v.size();) - { - out << i; - if ((i = v.find_next(i)) < v.size()) out << ","; - } - } - + return (confidence == 0 + ? float(succ)/tries + : (boost::math::binomial_distribution<>:: + find_lower_bound_on_p(tries, succ, confidence))); } + + void + snt_adder >:: + operator()() + { + typedef L2R_Token tkn; + std::vector sids; sids.reserve(snt.size()); + BOOST_FOREACH(std::string const& foo, snt) + { + sids.push_back(track ? 
track->size() : 0); + std::istringstream buf(foo); + std::string w; + std::vector s; s.reserve(100); + while (buf >> w) s.push_back(tkn(V[w])); + track = append(track,s); + } + if (index) + index.reset(new imTSA(*index,track,sids,V.tsize())); + else + index.reset(new imTSA(track,NULL,NULL)); + } + + snt_adder >:: + snt_adder(std::vector const& s, TokenIndex& v, + SPTR > >& t, + SPTR > >& i) + : snt(s), V(v), track(t), index(i) + { } + + bool + expand_phrase_pair + (std::vector >& a1, + std::vector >& a2, + ushort const s2, // next word on in target side + ushort const L1, ushort const R1, // limits of previous phrase + ushort & s1, ushort & e1, ushort& e2) // start/end src; end trg + { + if (a2[s2].size() == 0) + { + std::cout << __FILE__ << ":" << __LINE__ << std::endl; + return false; + } + bitvector done1(a1.size()); + bitvector done2(a2.size()); + std::vector > agenda; + // x.first: side (1 or 2) + // x.second: word position + agenda.reserve(a1.size() + a2.size()); + agenda.push_back(std::pair(2,s2)); + e2 = s2; + s1 = e1 = a2[s2].front(); + if (s1 >= L1 && s1 < R1) + { + std::cout << __FILE__ << ":" << __LINE__ << std::endl; + return false; + } + agenda.push_back(std::pair(2,s2)); + while (agenda.size()) + { + ushort side = agenda.back().first; + ushort p = agenda.back().second; + agenda.pop_back(); + if (side == 1) + { + done1.set(p); + BOOST_FOREACH(ushort i, a1[p]) + { + if (i < s2) + { + // cout << __FILE__ << ":" << __LINE__ << endl; + return false; + } + if (done2[i]) continue; + for (;e2 <= i;++e2) + if (!done2[e2]) + agenda.push_back(std::pair(2,e2)); + } + } + else + { + done2.set(p); + BOOST_FOREACH(ushort i, a2[p]) + { + if ((e1 < L1 && i >= L1) || + (s1 >= R1 && i < R1) || + (i >= L1 && i < R1)) + { + // cout << __FILE__ << ":" << __LINE__ << " " + // << L1 << "-" << R1 << " " << i << " " + // << s1 << "-" << e1<< endl; + return false; + } + + if (e1 < i) + { + for (; e1 <= i; ++e1) + if (!done1[e1]) + agenda.push_back(std::pair(1,e1)); + } + else if (s1 > i) + { + for (; i <= s1; ++i) + if (!done1[i]) + agenda.push_back(std::pair(1,i)); + } + } + } + } + ++e1; + ++e2; + return true; + } + + void + print_amatrix(std::vector > a1, uint32_t len2, + ushort b1, ushort e1, ushort b2, ushort e2) + { + using namespace std; + std::vector M(a1.size(),bitvector(len2)); + for (ushort j = 0; j < a1.size(); ++j) + { + BOOST_FOREACH(ushort k, a1[j]) + M[j].set(k); + } + cout << b1 << "-" << e1 << " " << b2 << "-" << e2 << endl; + cout << " "; + for (size_t c = 0; c < len2;++c) + cout << c%10; + cout << endl; + for (size_t r = 0; r < M.size(); ++r) + { + cout << setw(3) << r << " "; + for (size_t c = 0; c < M[r].size(); ++c) + { + if ((b1 <= r) && (r < e1) && b2 <= c && c < e2) + cout << (M[r][c] ? 'x' : '-'); + else cout << (M[r][c] ? 'o' : '.'); + } + cout << endl; + } + cout << std::string(90,'-') << endl; + } + + void + write_bitvector(bitvector const& v, std::ostream& out) + { + for (size_t i = v.find_first(); i < v.size();) + { + out << i; + if ((i = v.find_next(i)) < v.size()) out << ","; + } + } + } diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 11b08a276..990b7cd8a 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -1,4 +1,4 @@ -//-*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #pragma once // Implementations of word-aligned bitext. 
// Written by Ulrich Germann @@ -70,293 +70,80 @@ #define PSTATS_CACHE_THRESHOLD 50 -namespace Moses { - class Mmsapt; - namespace bitext +namespace Moses { class Mmsapt; } +namespace sapt +{ + using Moses::ttasksptr; + using Moses::ttaskwptr; + using tpt::binread; + using tpt::binwrite; + // using namespace ugdiss; + float lbop(size_t const tries, size_t const succ, float const confidence); + void write_bitvector(bitvector const& v, std::ostream& out); + +#ifndef NO_MOSES + struct + ContextForQuery { - // using namespace ugdiss; - using ugdiss::bitvector; - using ugdiss::Ttrack; - using ugdiss::TSA; - using ugdiss::imTSA; - using ugdiss::mmTSA; - using ugdiss::L2R_Token; - using ugdiss::SimpleWordId; - using ugdiss::imTtrack; - using ugdiss::mmTtrack; - using ugdiss::binread; - float lbop(size_t const tries, size_t const succ, float const confidence); - void write_bitvector(bitvector const& v, std::ostream& out); - -#ifndef NO_MOSES - struct - ContextForQuery - { - // needs to be made thread-safe - // ttasksptr const m_ttask; - // size_t max_samples; - boost::shared_mutex lock; - SPTR bias; - SPTR cache1, cache2; - std::ostream* bias_log; - ContextForQuery() : bias_log(NULL) { } - }; + // needs to be made thread-safe + // ttasksptr const m_ttask; + // size_t max_samples; + boost::shared_mutex lock; + SPTR bias; + SPTR cache1, cache2; + std::ostream* bias_log; + ContextForQuery() : bias_log(NULL) { } + }; #endif - template - class Bitext : public reference_counter - { - public: - template friend class BitextSampler; - typedef TKN Token; - typedef typename ugdiss::TSA::tree_iterator iter; - typedef typename std::vector > vec_ppair; - typedef typename lru_cache::LRU_Cache pplist_cache_t; - typedef ugdiss::TSA tsa; - friend class Moses::Mmsapt; - protected: - mutable boost::shared_mutex m_lock; // for thread-safe operation + template class BitextSampler; + + template + class Bitext : public Moses::reference_counter + { + public: + template friend class BitextSampler; + typedef TKN Token; + typedef typename TSA::tree_iterator iter; + typedef typename std::vector > vec_ppair; + typedef typename lru_cache::LRU_Cache pplist_cache_t; + typedef TSA tsa; + friend class Moses::Mmsapt; + protected: + mutable boost::shared_mutex m_lock; // for thread-safe operation - class agenda; // for parallel sampling see ug_bitext_agenda.h - mutable SPTR ag; - size_t m_num_workers; // number of workers available to the agenda + class agenda; // for parallel sampling see ug_bitext_agenda.h + mutable SPTR ag; + size_t m_num_workers; // number of workers available to the agenda - size_t m_default_sample_size; - size_t m_pstats_cache_threshold; // threshold for caching sampling results - SPTR m_cache1, m_cache2; // caches for sampling results + size_t m_default_sample_size; + size_t m_pstats_cache_threshold; // threshold for caching sampling results + SPTR m_cache1, m_cache2; // caches for sampling results - std::vector m_docname; - map m_docname2docid; // maps from doc names to ids - SPTR > m_sid2docid; // maps from sentences to docs (ids) + std::vector m_docname; + std::map m_docname2docid; // maps from doc names to ids + SPTR > m_sid2docid; // maps from sentences to docs (ids) - mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2; - // caches for unbiased sampling; biased sampling uses the caches that - // are stored locally on the translation task + mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2; + // caches for unbiased sampling; biased sampling uses the caches that + // are stored locally on the 
translation task - public: - SPTR > Tx; // word alignments - SPTR > T1; // token track - SPTR > T2; // token track - SPTR V1; // vocab - SPTR V2; // vocab - SPTR > I1; // indices - SPTR > I2; // indices + public: + SPTR > Tx; // word alignments + SPTR > T1; // token track + SPTR > T2; // token track + SPTR V1; // vocab + SPTR V2; // vocab + SPTR > I1; // indices + SPTR > I2; // indices - /// given the source phrase sid[start:stop] - // find the possible start (s1 .. s2) and end (e1 .. e2) - // points of the target phrase; if non-NULL, store word - // alignments in *core_alignment. If /flip/, source phrase is - // L2. - bool find_trg_phr_bounds(PhraseExtractionRecord& rec) const; - bool find_trg_phr_bounds - ( size_t const sid, // sentence to investigate - size_t const start, // start of source phrase - size_t const stop, // last position of source phrase - size_t & s1, size_t & s2, // beginning and end of target start - size_t & e1, size_t & e2, // beginning and end of target end - int& po_fwd, int& po_bwd, // phrase orientations - std::vector * core_alignment, // stores the core alignment - bitvector* full_alignment, // stores full word alignment for this sent. - bool const flip) const; // flip source and target (reverse lookup) - - // prep2 launches sampling and returns immediately. - // lookup (below) waits for the job to finish before it returns - SPTR - prep2(iter const& phrase, int max_sample = -1) const; - -#ifndef NO_MOSES - SPTR - prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; -#endif - - public: - Bitext(size_t const max_sample = 1000, size_t const xnum_workers = 16); - - Bitext(Ttrack* const t1, Ttrack* const t2, - Ttrack* const tx, - TokenIndex* const v1, TokenIndex* const v2, - TSA* const i1, TSA* const i2, - size_t const max_sample=1000, - size_t const xnum_workers=16); - - virtual void - open(string const base, string const L1, string const L2) = 0; - - SPTR - lookup(iter const& phrase, int max_sample = -1) const; - void prep(iter const& phrase) const; - -#ifndef NO_MOSES - SPTR - lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; - void prep(ttasksptr const& ttask, iter const& phrase) const; -#endif - - void setDefaultSampleSize(size_t const max_samples); - size_t getDefaultSampleSize() const; - - string toString(uint64_t pid, int isL2) const; - - virtual size_t revision() const { return 0; } - - SPTR - loadSentenceBias(string const& fname) const; - - SPTR - SetupDocumentBias(string const& bserver, string const& text, std::ostream* log) const; - - SPTR - SetupDocumentBias(map context_weights, std::ostream* log) const; - - void - mark_match(Token const* start, Token const* end, iter const& m, - bitvector& check) const; - void - write_yawat_alignment - ( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const; - - string docname(id_type const sid) const; - - vector const* sid2did() const; - }; - -#include "ug_bitext_agenda.h" - - template - string - Bitext:: - docname(id_type const sid) const - { - if (sid < m_sid2docid->size() && (*m_sid2docid)[sid] < m_docname.size()) - return m_docname[(*m_sid2docid)[sid]]; - else - return ""; - } - - template - vector const* - Bitext:: - sid2did() const - { - return m_sid2docid.get(); - } - - template - SPTR - Bitext:: - loadSentenceBias(string const& fname) const - { - SPTR ret(new SentenceBias(T1->size())); - ifstream in(fname.c_str()); - size_t i = 0; - float v; while (in>>v) (*ret)[i++] = v; - UTIL_THROW_IF2(i != T1->size(), - "Mismatch between bias vector 
size and corpus size at " - << HERE); - return ret; - } - - template - string - Bitext:: - toString(uint64_t pid, int isL2) const - { - std::ostringstream buf; - uint32_t sid,off,len; ugdiss::parse_pid(pid,sid,off,len); - Token const* t = (isL2 ? T2 : T1)->sntStart(sid) + off; - Token const* x = t + len; - TokenIndex const& V = isL2 ? *V2 : *V1; - while (t < x) - { - buf << V[t->id()]; - if (++t < x) buf << " "; - } - return buf.str(); - } - - template - size_t - Bitext:: - getDefaultSampleSize() const - { - return m_default_sample_size; - } - template - void - Bitext:: - setDefaultSampleSize(size_t const max_samples) - { - boost::unique_lock guard(m_lock); - if (max_samples != m_default_sample_size) - { - m_cache1.reset(new pstats::cache_t); - m_cache2.reset(new pstats::cache_t); - m_default_sample_size = max_samples; - } - } - - template - Bitext:: - Bitext(size_t const max_sample, size_t const xnum_workers) - : m_num_workers(xnum_workers) - , m_default_sample_size(max_sample) - , m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD) - , m_cache1(new pstats::cache_t) - , m_cache2(new pstats::cache_t) - { } - - template - Bitext:: - Bitext(Ttrack* const t1, - Ttrack* const t2, - Ttrack* const tx, - TokenIndex* const v1, - TokenIndex* const v2, - TSA* const i1, - TSA* const i2, - size_t const max_sample, - size_t const xnum_workers) - : m_num_workers(xnum_workers) - , m_default_sample_size(max_sample) - , m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD) - , m_cache1(new pstats::cache_t) - , m_cache2(new pstats::cache_t) - , Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2) - { } - - template class snt_adder; - template<> class snt_adder >; - - template<> - class snt_adder > - { - typedef L2R_Token TKN; - std::vector const & snt; - TokenIndex & V; - SPTR > & track; - SPTR > & index; - public: - snt_adder(std::vector const& s, TokenIndex& v, - SPTR >& t, SPTR >& i); - - void operator()(); - }; - - template - bool - Bitext:: - find_trg_phr_bounds(PhraseExtractionRecord& rec) const - { - return find_trg_phr_bounds(rec.sid, rec.start, rec.stop, - rec.s1, rec.s2, rec.e1, rec.e2, - rec.po_fwd, rec.po_bwd, - rec.aln, rec.full_aln, rec.flip); - } - - template - bool - Bitext:: - find_trg_phr_bounds + /// given the source phrase sid[start:stop] + // find the possible start (s1 .. s2) and end (e1 .. e2) + // points of the target phrase; if non-NULL, store word + // alignments in *core_alignment. If /flip/, source phrase is + // L2. + bool find_trg_phr_bounds(PhraseExtractionRecord& rec) const; + bool find_trg_phr_bounds ( size_t const sid, // sentence to investigate size_t const start, // start of source phrase size_t const stop, // last position of source phrase @@ -365,377 +152,588 @@ namespace Moses { int& po_fwd, int& po_bwd, // phrase orientations std::vector * core_alignment, // stores the core alignment bitvector* full_alignment, // stores full word alignment for this sent. - bool const flip) const // flip source and target (reverse lookup) - { - // if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl; - // a word on the core_alignment (core_alignment): - // - // Since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1 - // < e2, respectively) are be definition unaligned, we store - // only the core alignment in *aln. 
It is up to the calling - // function to shift alignment points over for start positions - // of extracted phrases that start with a fringe word - assert(T1); - assert(T2); - assert(Tx); + bool const flip) const; // flip source and target (reverse lookup) - size_t slen1,slen2; - if (flip) - { - slen1 = T2->sntLen(sid); - slen2 = T1->sntLen(sid); - } - else - { - slen1 = T1->sntLen(sid); - slen2 = T2->sntLen(sid); - } - bitvector forbidden(slen2); - if (full_alignment) - { - if (slen1*slen2 > full_alignment->size()) - full_alignment->resize(slen1*slen2*2); - full_alignment->reset(); - } - size_t src,trg; - size_t lft = forbidden.size(); - size_t rgt = 0; - std::vector > aln1(slen1),aln2(slen2); - - // process word alignment for this sentence - char const* p = Tx->sntStart(sid); - char const* x = Tx->sntEnd(sid); - while (p < x) - { - if (flip) - { - p = binread(p,trg); - assert(p= slen1 || trg >= slen2), - "Alignment range error at sentence " << sid << "!\n" - << src << "/" << slen1 << " " << trg << "/" << slen2); - - if (src < start || src >= stop) - forbidden.set(trg); - else - { - lft = min(lft,trg); - rgt = max(rgt,trg); - } - if (core_alignment) - { - aln1[src].push_back(trg); - aln2[trg].push_back(src); - } - if (full_alignment) - full_alignment->set(src*slen2 + trg); - } - - for (size_t i = lft; i <= rgt; ++i) - if (forbidden[i]) - return false; - - s2 = lft; for (s1 = s2; s1 && !forbidden[s1-1]; --s1); - e1 = rgt+1; for (e2 = e1; e2 < forbidden.size() && !forbidden[e2]; ++e2); - - if (lft > rgt) return false; - if (core_alignment) - { - core_alignment->clear(); - for (size_t i = start; i < stop; ++i) - { - BOOST_FOREACH(ushort x, aln1[i]) - { - core_alignment->push_back(i - start); - core_alignment->push_back(x - lft); - } - } - // now determine fwd and bwd phrase orientation - po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2); - po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2); - } - return lft <= rgt; - } - - template - SPTR - Bitext:: - SetupDocumentBias - ( string const& bserver, string const& text, std::ostream* log ) const - { - SPTR ret; - UTIL_THROW_IF2(m_sid2docid == NULL, - "Document bias requested but no document map loaded."); - ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid, - bserver, text, log)); - return ret; - } - - template - SPTR - Bitext:: - SetupDocumentBias - ( std::map context_weights, std::ostream* log ) const - { - SPTR ret; - UTIL_THROW_IF2(m_sid2docid == NULL, - "Document bias requested but no document map loaded."); - ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid, - context_weights, log)); - return ret; - } - - template - void - Bitext:: - prep(iter const& phrase) const - { - prep2(phrase, m_default_sample_size); - } - - - - // prep2 schedules a phrase for sampling, and returns immediately - // the member function lookup retrieves the respective pstats instance - // and waits until the sampling is finished before it returns. - // This allows sampling in the background - template + // prep2 launches sampling and returns immediately. + // lookup (below) waits for the job to finish before it returns SPTR - Bitext - ::prep2 - (iter const& phrase, int max_sample) const - { - if (max_sample < 0) max_sample = m_default_sample_size; - SPTR bias; - SPTR cache; - // - no caching for rare phrases and special requests (max_sample) - // (still need to test what a good caching threshold is ...) 
- // - use the task-specific cache when there is a sampling bias - if (max_sample == int(m_default_sample_size) - && phrase.approxOccurrenceCount() > m_pstats_cache_threshold) - { - cache = (phrase.root == I1.get() ? m_cache1 : m_cache2); - } + prep2(iter const& phrase, int max_sample = -1) const; - SPTR ret; - SPTR const* cached; +#ifndef NO_MOSES + SPTR + prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; +#endif - if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached) - return *cached; - boost::unique_lock guard(m_lock); - if (!ag) - { - ag.reset(new agenda(*this)); - if (m_num_workers > 1) - ag->add_workers(m_num_workers); - } - ret = ag->add_job(this, phrase, max_sample, bias); - if (cache) cache->set(phrase.getPid(),ret); - UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job."); - return ret; - } + public: + Bitext(size_t const max_sample = 1000, size_t const xnum_workers = 16); - // worker for scoring and sorting phrase table entries in parallel - template - class pstats2pplist - { - Ttrack const& m_other; - SPTR m_pstats; - std::vector >& m_pplist; - typename PhrasePair::Scorer const* m_scorer; - PhrasePair m_pp; - Token const* m_token; - size_t m_len; - uint64_t m_pid1; - bool m_is_inverse; - public: + Bitext(Ttrack* const t1, Ttrack* const t2, + Ttrack* const tx, + TokenIndex* const v1, TokenIndex* const v2, + TSA* const i1, TSA* const i2, + size_t const max_sample=1000, + size_t const xnum_workers=16); - // CONSTRUCTOR - pstats2pplist(typename TSA::tree_iterator const& m, - Ttrack const& other, - SPTR const& ps, - std::vector >& dest, - typename PhrasePair::Scorer const* scorer) - : m_other(other) - , m_pstats(ps) - , m_pplist(dest) - , m_scorer(scorer) - , m_token(m.getToken(0)) - , m_len(m.size()) - , m_pid1(m.getPid()) - , m_is_inverse(false) - { } + virtual void + open(std::string const base, std::string const L1, std::string const L2) = 0; - // WORKER - void - operator()() - { - // wait till all statistics have been collected - boost::unique_lock lock(m_pstats->lock); - while (m_pstats->in_progress) - m_pstats->ready.wait(lock); + SPTR + lookup(iter const& phrase, int max_sample = -1) const; - m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0); + void prep(iter const& phrase) const; - // convert pstats entries to phrase pairs - pstats::trg_map_t::iterator a; - for (a = m_pstats->trg.begin(); a != m_pstats->trg.end(); ++a) - { - uint32_t sid,off,len; - ugdiss::parse_pid(a->first, sid, off, len); - m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second); - m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1), - m_pp.joint); - size_t J = m_pp.joint<<7; // hard coded threshold of 1/128 - if (m_pp.good1 > J || m_pp.good2 > J) continue; - if (m_scorer) - { - (*m_scorer)(m_pp); - } - m_pplist.push_back(m_pp); - } - greater > sorter; - if (m_scorer) sort(m_pplist.begin(), m_pplist.end(),sorter); - } - }; +#ifndef NO_MOSES + SPTR + lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const; + + void prep(ttasksptr const& ttask, iter const& phrase) const; +#endif + + void setDefaultSampleSize(size_t const max_samples); + size_t getDefaultSampleSize() const; + + std::string toString(uint64_t pid, int isL2) const; + + virtual size_t revision() const { return 0; } + + SPTR + loadSentenceBias(std::string const& fname) const; + + SPTR + SetupDocumentBias(std::string const& bserver, std::string const& text, + std::ostream* log) const; + + SPTR + SetupDocumentBias(std::map 
context_weights, + std::ostream* log) const; - template void - Bitext - ::mark_match(Token const* start, Token const* end, - iter const& m, bitvector& check) const - { - check.resize(end-start); - check.reset(); - Token const* x = m.getToken(0); - for (Token const* s = start; s < end; ++s) - { - if (s->id() != x->id()) continue; - Token const* a = x; - Token const* b = s; - size_t i = 0; - while (a && b && a->id() == b->id() && i < m.size()) - { - ++i; - a = a->next(); - b = b->next(); - } - if (i == m.size()) - { - b = s; - while (i-- > 0) { check.set(b-start); b = b->next(); } - } - } - } - - template + mark_match(Token const* start, Token const* end, iter const& m, + bitvector& check) const; void - Bitext:: write_yawat_alignment - ( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const - { - std::vector a1(T1->sntLen(sid),-1), a2(T2->sntLen(sid),-1); - bitvector f1(a1.size()), f2(a2.size()); - if (m1) mark_match(T1->sntStart(sid), T1->sntEnd(sid), *m1, f1); - if (m2) mark_match(T2->sntStart(sid), T2->sntEnd(sid), *m2, f2); + ( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const; - std::vector > agroups; - std::vector grouplabel; - std::pair ag; - ag.first.resize(a1.size()); - ag.second.resize(a2.size()); - char const* x = Tx->sntStart(sid); - size_t a, b; - while (x < Tx->sntEnd(sid)) - { - x = binread(x,a); - x = binread(x,b); - if (a1.at(a) < 0 && a2.at(b) < 0) - { - a1[a] = a2[b] = agroups.size(); - ag.first.reset(); - ag.second.reset(); - ag.first.set(a); - ag.second.set(b); - agroups.push_back(ag); - grouplabel.push_back(f1[a] || f2[b] ? "infocusbi" : "unspec"); - } - else if (a1.at(a) < 0) - { - a1[a] = a2[b]; - agroups[a2[b]].first.set(a); - if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi"; - } - else if (a2.at(b) < 0) - { - a2[b] = a1[a]; - agroups[a1[a]].second.set(b); - if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi"; - } - else - { - agroups[a1[a]].first |= agroups[a2[b]].first; - agroups[a1[a]].second |= agroups[a2[b]].second; - a2[b] = a1[a]; - if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi"; - } - } + std::string docname(id_type const sid) const; - for (a = 0; a < a1.size(); ++a) - { - if (a1[a] < 0) - { - if (f1[a]) out << a << "::" << "infocusmono "; - continue; - } - bitvector const& A = agroups[a1[a]].first; - bitvector const& B = agroups[a1[a]].second; - if (A.find_first() < a) continue; - write_bitvector(A,out); out << ":"; - write_bitvector(B,out); out << ":"; - out << grouplabel[a1[a]] << " "; - } - for (b = 0; b < a2.size(); ++b) - { - if (a2[b] < 0 && f2[b]) - out << "::" << "infocusmono "; - } - } + std::vector const* sid2did() const; + }; - template + #include "ug_bitext_agenda.h" + + template + std::string + Bitext:: + docname(id_type const sid) const + { + if (sid < m_sid2docid->size() && (*m_sid2docid)[sid] < m_docname.size()) + return m_docname[(*m_sid2docid)[sid]]; + else + return ""; + } + + template + std::vector const* + Bitext:: + sid2did() const + { + return m_sid2docid.get(); + } + + template + SPTR + Bitext:: + loadSentenceBias(std::string const& fname) const + { + SPTR ret(new SentenceBias(T1->size())); + std::ifstream in(fname.c_str()); + size_t i = 0; + float v; while (in>>v) (*ret)[i++] = v; + UTIL_THROW_IF2(i != T1->size(), + "Mismatch between bias vector size and corpus size at " + << HERE); + return ret; + } + + template + std::string + Bitext:: + toString(uint64_t pid, int isL2) const + { + std::ostringstream buf; + uint32_t sid,off,len; parse_pid(pid,sid,off,len); + Token const* t = 
(isL2 ? T2 : T1)->sntStart(sid) + off; + Token const* x = t + len; + TokenIndex const& V = isL2 ? *V2 : *V1; + while (t < x) + { + buf << V[t->id()]; + if (++t < x) buf << " "; + } + return buf.str(); + } + + template + size_t + Bitext:: + getDefaultSampleSize() const + { + return m_default_sample_size; + } + template + void + Bitext:: + setDefaultSampleSize(size_t const max_samples) + { + boost::unique_lock guard(m_lock); + if (max_samples != m_default_sample_size) + { + m_cache1.reset(new pstats::cache_t); + m_cache2.reset(new pstats::cache_t); + m_default_sample_size = max_samples; + } + } + + template + Bitext:: + Bitext(size_t const max_sample, size_t const xnum_workers) + : m_num_workers(xnum_workers) + , m_default_sample_size(max_sample) + , m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD) + , m_cache1(new pstats::cache_t) + , m_cache2(new pstats::cache_t) + { } + + template + Bitext:: + Bitext(Ttrack* const t1, + Ttrack* const t2, + Ttrack* const tx, + TokenIndex* const v1, + TokenIndex* const v2, + TSA* const i1, + TSA* const i2, + size_t const max_sample, + size_t const xnum_workers) + : m_num_workers(xnum_workers) + , m_default_sample_size(max_sample) + , m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD) + , m_cache1(new pstats::cache_t) + , m_cache2(new pstats::cache_t) + , Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2) + { } + + template class snt_adder; + template<> class snt_adder >; + + template<> + class snt_adder > + { + typedef L2R_Token TKN; + std::vector const & snt; + TokenIndex & V; + SPTR > & track; + SPTR > & index; + public: + snt_adder(std::vector const& s, TokenIndex& v, + SPTR >& t, SPTR >& i); + + void operator()(); + }; + + template + bool + Bitext:: + find_trg_phr_bounds(PhraseExtractionRecord& rec) const + { + return find_trg_phr_bounds(rec.sid, rec.start, rec.stop, + rec.s1, rec.s2, rec.e1, rec.e2, + rec.po_fwd, rec.po_bwd, + rec.aln, rec.full_aln, rec.flip); + } + + template + bool + Bitext:: + find_trg_phr_bounds + ( size_t const sid, // sentence to investigate + size_t const start, // start of source phrase + size_t const stop, // last position of source phrase + size_t & s1, size_t & s2, // beginning and end of target start + size_t & e1, size_t & e2, // beginning and end of target end + int& po_fwd, int& po_bwd, // phrase orientations + std::vector * core_alignment, // stores the core alignment + bitvector* full_alignment, // stores full word alignment for this sent. + bool const flip) const // flip source and target (reverse lookup) + { + // if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl; + // a word on the core_alignment (core_alignment): + // + // Since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1 + // < e2, respectively) are be definition unaligned, we store + // only the core alignment in *aln. 
It is up to the calling + // function to shift alignment points over for start positions + // of extracted phrases that start with a fringe word + assert(T1); + assert(T2); + assert(Tx); + + size_t slen1,slen2; + if (flip) + { + slen1 = T2->sntLen(sid); + slen2 = T1->sntLen(sid); + } + else + { + slen1 = T1->sntLen(sid); + slen2 = T2->sntLen(sid); + } + bitvector forbidden(slen2); + if (full_alignment) + { + if (slen1*slen2 > full_alignment->size()) + full_alignment->resize(slen1*slen2*2); + full_alignment->reset(); + } + size_t src,trg; + size_t lft = forbidden.size(); + size_t rgt = 0; + std::vector > aln1(slen1),aln2(slen2); + + // process word alignment for this sentence + char const* p = Tx->sntStart(sid); + char const* x = Tx->sntEnd(sid); + while (p < x) + { + if (flip) + { + p = binread(p,trg); + assert(p= slen1 || trg >= slen2), + "Alignment range error at sentence " << sid << "!\n" + << src << "/" << slen1 << " " << trg << "/" << slen2); + + if (src < start || src >= stop) + forbidden.set(trg); + else + { + lft = std::min(lft,trg); + rgt = std::max(rgt,trg); + } + if (core_alignment) + { + aln1[src].push_back(trg); + aln2[trg].push_back(src); + } + if (full_alignment) + full_alignment->set(src*slen2 + trg); + } + + for (size_t i = lft; i <= rgt; ++i) + if (forbidden[i]) + return false; + + s2 = lft; for (s1 = s2; s1 && !forbidden[s1-1]; --s1); + e1 = rgt+1; for (e2 = e1; e2 < forbidden.size() && !forbidden[e2]; ++e2); + + if (lft > rgt) return false; + if (core_alignment) + { + core_alignment->clear(); + for (size_t i = start; i < stop; ++i) + { + BOOST_FOREACH(ushort x, aln1[i]) + { + core_alignment->push_back(i - start); + core_alignment->push_back(x - lft); + } + } + // now determine fwd and bwd phrase orientation + po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2); + po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2); + } + return lft <= rgt; + } + + template + SPTR + Bitext:: + SetupDocumentBias + ( std::string const& bserver, std::string const& text, std::ostream* log ) const + { + SPTR ret; + UTIL_THROW_IF2(m_sid2docid == NULL, + "Document bias requested but no document map loaded."); + ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid, + bserver, text, log)); + return ret; + } + + template + SPTR + Bitext:: + SetupDocumentBias + ( std::map context_weights, std::ostream* log ) const + { + SPTR ret; + UTIL_THROW_IF2(m_sid2docid == NULL, + "Document bias requested but no document map loaded."); + ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid, + context_weights, log)); + return ret; + } + + template + void + Bitext:: + prep(iter const& phrase) const + { + prep2(phrase, m_default_sample_size); + } + + + + // prep2 schedules a phrase for sampling, and returns immediately + // the member function lookup retrieves the respective pstats instance + // and waits until the sampling is finished before it returns. + // This allows sampling in the background + template + SPTR + Bitext + ::prep2 + (iter const& phrase, int max_sample) const + { + if (max_sample < 0) max_sample = m_default_sample_size; + SPTR bias; + SPTR cache; + // - no caching for rare phrases and special requests (max_sample) + // (still need to test what a good caching threshold is ...) + // - use the task-specific cache when there is a sampling bias + if (max_sample == int(m_default_sample_size) + && phrase.approxOccurrenceCount() > m_pstats_cache_threshold) + { + cache = (phrase.root == I1.get() ? 
m_cache1 : m_cache2); + } + + SPTR ret; + SPTR const* cached; + + if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached) + return *cached; + boost::unique_lock guard(m_lock); + if (!ag) + { + ag.reset(new agenda(*this)); + if (m_num_workers > 1) + ag->add_workers(m_num_workers); + } + ret = ag->add_job(this, phrase, max_sample, bias); + if (cache) cache->set(phrase.getPid(),ret); + UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job."); + return ret; + } + + // worker for scoring and sorting phrase table entries in parallel + template + class pstats2pplist + { + Ttrack const& m_other; + SPTR m_pstats; + std::vector >& m_pplist; + typename PhrasePair::Scorer const* m_scorer; + PhrasePair m_pp; + Token const* m_token; + size_t m_len; + uint64_t m_pid1; + bool m_is_inverse; + public: + + // CONSTRUCTOR + pstats2pplist(typename TSA::tree_iterator const& m, + Ttrack const& other, + SPTR const& ps, + std::vector >& dest, + typename PhrasePair::Scorer const* scorer) + : m_other(other) + , m_pstats(ps) + , m_pplist(dest) + , m_scorer(scorer) + , m_token(m.getToken(0)) + , m_len(m.size()) + , m_pid1(m.getPid()) + , m_is_inverse(false) + { } + + // WORKER void - expand(typename Bitext::iter const& m, - Bitext const& bt, pstats const& ps, - std::vector >& dest, std::ostream* log) + operator()() { - bool fwd = m.root == bt.I1.get(); - dest.reserve(ps.trg.size()); - PhrasePair pp; - pp.init(m.getPid(), !fwd, m.getToken(0), m.size(), &ps, 0); - // cout << HERE << " " - // << toString(*(fwd ? bt.V1 : bt.V2), pp.start1,pp.len1) << std::endl; - pstats::trg_map_t::const_iterator a; - for (a = ps.trg.begin(); a != ps.trg.end(); ++a) - { - uint32_t sid,off,len; - ugdiss::parse_pid(a->first, sid, off, len); - pp.update(a->first, (fwd ? bt.T2 : bt.T1)->sntStart(sid)+off, - len, a->second); - dest.push_back(pp); - } - } + // wait till all statistics have been collected + boost::unique_lock lock(m_pstats->lock); + while (m_pstats->in_progress) + m_pstats->ready.wait(lock); - } // end of namespace bitext -} // end of namespace moses + m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0); + + // convert pstats entries to phrase pairs + pstats::trg_map_t::iterator a; + for (a = m_pstats->trg.begin(); a != m_pstats->trg.end(); ++a) + { + uint32_t sid,off,len; + parse_pid(a->first, sid, off, len); + m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second); + m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1), + m_pp.joint); + size_t J = m_pp.joint<<7; // hard coded threshold of 1/128 + if (m_pp.good1 > J || m_pp.good2 > J) continue; + if (m_scorer) + { + (*m_scorer)(m_pp); + } + m_pplist.push_back(m_pp); + } + std::greater > sorter; + if (m_scorer) sort(m_pplist.begin(), m_pplist.end(),sorter); + } + }; + + template + void + Bitext + ::mark_match(Token const* start, Token const* end, + iter const& m, bitvector& check) const + { + check.resize(end-start); + check.reset(); + Token const* x = m.getToken(0); + for (Token const* s = start; s < end; ++s) + { + if (s->id() != x->id()) continue; + Token const* a = x; + Token const* b = s; + size_t i = 0; + while (a && b && a->id() == b->id() && i < m.size()) + { + ++i; + a = a->next(); + b = b->next(); + } + if (i == m.size()) + { + b = s; + while (i-- > 0) { check.set(b-start); b = b->next(); } + } + } + } + + template + void + Bitext:: + write_yawat_alignment + ( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const + { + std::vector a1(T1->sntLen(sid),-1), a2(T2->sntLen(sid),-1); + bitvector 
f1(a1.size()), f2(a2.size()); + if (m1) mark_match(T1->sntStart(sid), T1->sntEnd(sid), *m1, f1); + if (m2) mark_match(T2->sntStart(sid), T2->sntEnd(sid), *m2, f2); + + std::vector > agroups; + std::vector grouplabel; + std::pair ag; + ag.first.resize(a1.size()); + ag.second.resize(a2.size()); + char const* x = Tx->sntStart(sid); + size_t a, b; + while (x < Tx->sntEnd(sid)) + { + x = binread(x,a); + x = binread(x,b); + if (a1.at(a) < 0 && a2.at(b) < 0) + { + a1[a] = a2[b] = agroups.size(); + ag.first.reset(); + ag.second.reset(); + ag.first.set(a); + ag.second.set(b); + agroups.push_back(ag); + grouplabel.push_back(f1[a] || f2[b] ? "infocusbi" : "unspec"); + } + else if (a1.at(a) < 0) + { + a1[a] = a2[b]; + agroups[a2[b]].first.set(a); + if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi"; + } + else if (a2.at(b) < 0) + { + a2[b] = a1[a]; + agroups[a1[a]].second.set(b); + if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi"; + } + else + { + agroups[a1[a]].first |= agroups[a2[b]].first; + agroups[a1[a]].second |= agroups[a2[b]].second; + a2[b] = a1[a]; + if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi"; + } + } + + for (a = 0; a < a1.size(); ++a) + { + if (a1[a] < 0) + { + if (f1[a]) out << a << "::" << "infocusmono "; + continue; + } + bitvector const& A = agroups[a1[a]].first; + bitvector const& B = agroups[a1[a]].second; + if (A.find_first() < a) continue; + write_bitvector(A,out); out << ":"; + write_bitvector(B,out); out << ":"; + out << grouplabel[a1[a]] << " "; + } + for (b = 0; b < a2.size(); ++b) + { + if (a2[b] < 0 && f2[b]) + out << "::" << "infocusmono "; + } + } + + template + void + expand(typename Bitext::iter const& m, + Bitext const& bt, pstats const& ps, + std::vector >& dest, std::ostream* log) + { + bool fwd = m.root == bt.I1.get(); + dest.reserve(ps.trg.size()); + PhrasePair pp; + pp.init(m.getPid(), !fwd, m.getToken(0), m.size(), &ps, 0); + // cout << HERE << " " + // << toString(*(fwd ? bt.V1 : bt.V2), pp.start1,pp.len1) << std::endl; + pstats::trg_map_t::const_iterator a; + for (a = ps.trg.begin(); a != ps.trg.end(); ++a) + { + uint32_t sid,off,len; + parse_pid(a->first, sid, off, len); + pp.update(a->first, (fwd ? bt.T2 : bt.T1)->sntStart(sid)+off, + len, a->second); + dest.push_back(pp); + } + } + +} // end of namespace sapt #include "ug_im_bitext.h" #include "ug_mm_bitext.h" diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h index 5e899e0b4..bc038bd03 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // to be included from ug_bitext.h // The agenda handles parallel sampling. 
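Note on the packed phrase ids decoded above: expand() and toString() both call parse_pid() to split a 64-bit pid back into (sentence id, offset, length). The following is a minimal, self-contained sketch of that packing scheme, assuming a 16-bit length / 16-bit offset / 32-bit sentence-id split; verify against the actual parse_pid() definition in this module before relying on it. Not part of this patch.

#include <cassert>
#include <stdint.h>

// Hypothetical helpers mirroring the assumed pid layout:
// [ 32 bits sentence id | 16 bits offset | 16 bits length ]
inline uint64_t make_pid(uint32_t sid, uint32_t off, uint32_t len)
{
  return (uint64_t(sid) << 32) | (uint64_t(off & 0xffff) << 16) | (len & 0xffff);
}

inline void split_pid(uint64_t pid, uint32_t& sid, uint32_t& off, uint32_t& len)
{
  len = pid & 0xffff;          // low 16 bits
  off = (pid >> 16) & 0xffff;  // next 16 bits
  sid = uint32_t(pid >> 32);   // high 32 bits
}

int main()
{
  uint32_t sid, off, len;
  split_pid(make_pid(4711, 7, 3), sid, off, len);
  assert(sid == 4711 && off == 7 && len == 3);
  return 0;
}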
@@ -64,16 +64,16 @@ void Bitext static boost::posix_time::time_duration nodelay(0,0,0,0); boost::lock_guard guard(this->lock); - int target = max(1, int(n + workers.size() - this->doomed)); + int target = std::max(1, int(n + workers.size() - this->doomed)); // house keeping: remove all workers that have finished for (size_t i = 0; i < workers.size(); ) { if (workers[i]->timed_join(nodelay)) - { - if (i + 1 < workers.size()) - workers[i].swap(workers.back()); - workers.pop_back(); - } + { + if (i + 1 < workers.size()) + workers[i].swap(workers.back()); + workers.pop_back(); + } else ++i; } // cerr << workers.size() << "/" << target << " active" << std::endl; @@ -82,8 +82,8 @@ void Bitext else while (int(workers.size()) < target) { - SPTR w(new boost::thread(worker(*this))); - workers.push_back(w); + SPTR w(new boost::thread(worker(*this))); + workers.push_back(w); } } @@ -142,7 +142,7 @@ Bitext return ret; } - typename list >::iterator j = joblist.begin(); + typename std::list >::iterator j = joblist.begin(); while (j != joblist.end()) { if ((*j)->done()) diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h index d6e3959d5..52d52fc7f 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // class declaration of template class Bitxt::agenda::job // to be included by ug_bitext.h // todo: add check to enforce this @@ -157,17 +157,17 @@ int Bitext::agenda::job if (log) { Token const* t = root->getCorpus()->sntStart(sid)+offset; - Token const* x = t - min(offset,uint64_t(3)); + Token const* x = t - std::min(offset,uint64_t(3)); Token const* e = t+4; if (e > root->getCorpus()->sntEnd(sid)) - e = root->getCorpus()->sntEnd(sid); + e = root->getCorpus()->sntEnd(sid); *log << docid << ":" << sid << " " << size_t(k) << "/" << N - << " @" << p << " => " << d << " ["; + << " @" << p << " => " << d << " ["; for (id_iter m = stats->indoc.begin(); m != stats->indoc.end(); ++m) - { - if (m != stats->indoc.begin()) *log << " "; - *log << m->first << ":" << m->second; - } + { + if (m != stats->indoc.begin()) *log << " "; + *log << m->first << ":" << m->second; + } // for (size_t i = 0; i < stats->indoc.size(); ++i) // { // if (i) *log << " "; @@ -192,7 +192,7 @@ bool Bitext::agenda::job if (no_maybe_yes > 1) return true; // yes // ... 
maybe: flip a coin size_t options_chosen = stats->good; - size_t options_total = max(stats->raw_cnt, this->ctr); + size_t options_total = std::max(stats->raw_cnt, this->ctr); size_t options_left = (options_total - this->ctr); size_t random_number = options_left * (rnd()/(rnd.max()+1.)); size_t threshold; diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h index 97bdfd784..5dc05de11 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h @@ -1,3 +1,4 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // to be included from ug_bitext_agenda.h template diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc index a5debcec8..97a7203a9 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc @@ -1,93 +1,91 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #include "ug_bitext_jstats.h" -namespace Moses +namespace sapt { - namespace bitext + + uint32_t jstats::rcnt() const { return my_rcnt; } + float jstats::wcnt() const { return my_wcnt; } + float jstats::bcnt() const { return my_bcnt; } + uint32_t jstats::cnt2() const { return my_cnt2; } + + // What was that used for again? UG + bool jstats::valid() { return my_wcnt >= 0; } + void jstats::validate() { if (my_wcnt < 0) my_wcnt *= -1; } + void jstats::invalidate() { if (my_wcnt > 0) my_wcnt *= -1; } + + jstats:: + jstats() + : my_rcnt(0), my_cnt2(0), my_wcnt(0), my_bcnt(0) { + for (int i = 0; i <= Moses::LRModel::NONE; ++i) + ofwd[i] = obwd[i] = 0; + my_aln.reserve(1); + } - uint32_t jstats::rcnt() const { return my_rcnt; } - float jstats::wcnt() const { return my_wcnt; } - float jstats::bcnt() const { return my_bcnt; } - uint32_t jstats::cnt2() const { return my_cnt2; } + jstats:: + jstats(jstats const& other) + { + my_rcnt = other.rcnt(); + my_wcnt = other.wcnt(); + my_bcnt = other.bcnt(); + my_aln = other.aln(); + indoc = other.indoc; + for (int i = 0; i <= Moses::LRModel::NONE; i++) + { + ofwd[i] = other.ofwd[i]; + obwd[i] = other.obwd[i]; + } + } - // What was that used for again? 
UG - bool jstats::valid() { return my_wcnt >= 0; } - void jstats::validate() { if (my_wcnt < 0) my_wcnt *= -1; } - void jstats::invalidate() { if (my_wcnt > 0) my_wcnt *= -1; } + uint32_t + jstats:: + dcnt_fwd(PhraseOrientation const idx) const + { + assert(idx <= Moses::LRModel::NONE); + return ofwd[idx]; + } - jstats:: - jstats() - : my_rcnt(0), my_cnt2(0), my_wcnt(0), my_bcnt(0) - { - for (int i = 0; i <= Moses::LRModel::NONE; ++i) - ofwd[i] = obwd[i] = 0; - my_aln.reserve(1); - } + uint32_t + jstats:: + dcnt_bwd(PhraseOrientation const idx) const + { + assert(idx <= Moses::LRModel::NONE); + return obwd[idx]; + } - jstats:: - jstats(jstats const& other) - { - my_rcnt = other.rcnt(); - my_wcnt = other.wcnt(); - my_bcnt = other.bcnt(); - my_aln = other.aln(); - indoc = other.indoc; - for (int i = 0; i <= Moses::LRModel::NONE; i++) - { - ofwd[i] = other.ofwd[i]; - obwd[i] = other.obwd[i]; - } - } + void + jstats:: + add(float w, float b, std::vector const& a, uint32_t const cnt2, + uint32_t fwd_orient, uint32_t bwd_orient, int const docid) + { + boost::lock_guard lk(this->lock); + my_cnt2 = cnt2; + my_rcnt += 1; + my_wcnt += w; + my_bcnt += b; + if (a.size()) + { + size_t i = 0; + while (i < my_aln.size() && my_aln[i].second != a) ++i; + if (i == my_aln.size()) + my_aln.push_back(std::pair >(1,a)); + else + my_aln[i].first++; + if (my_aln[i].first > my_aln[i/2].first) + push_heap(my_aln.begin(),my_aln.begin()+i+1); + } + ++ofwd[fwd_orient]; + ++obwd[bwd_orient]; + if (docid >= 0) + { + // while (int(indoc.size()) <= docid) indoc.push_back(0); + ++indoc[docid]; + } + } - uint32_t - jstats:: - dcnt_fwd(PhraseOrientation const idx) const - { - assert(idx <= Moses::LRModel::NONE); - return ofwd[idx]; - } + std::vector > > const& + jstats:: + aln() const + { return my_aln; } - uint32_t - jstats:: - dcnt_bwd(PhraseOrientation const idx) const - { - assert(idx <= Moses::LRModel::NONE); - return obwd[idx]; - } - - void - jstats:: - add(float w, float b, std::vector const& a, uint32_t const cnt2, - uint32_t fwd_orient, uint32_t bwd_orient, int const docid) - { - boost::lock_guard lk(this->lock); - my_cnt2 = cnt2; - my_rcnt += 1; - my_wcnt += w; - my_bcnt += b; - if (a.size()) - { - size_t i = 0; - while (i < my_aln.size() && my_aln[i].second != a) ++i; - if (i == my_aln.size()) - my_aln.push_back(std::pair >(1,a)); - else - my_aln[i].first++; - if (my_aln[i].first > my_aln[i/2].first) - push_heap(my_aln.begin(),my_aln.begin()+i+1); - } - ++ofwd[fwd_orient]; - ++obwd[bwd_orient]; - if (docid >= 0) - { - // while (int(indoc.size()) <= docid) indoc.push_back(0); - ++indoc[docid]; - } - } - - std::vector > > const& - jstats:: - aln() const - { return my_aln; } - - } // namespace bitext -} // namespace Moses +} // namespace sapt diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h index 2984c9293..f7b1910cf 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #pragma once #include #include @@ -6,55 +6,51 @@ #include "ug_lexical_reordering.h" #include -namespace Moses +namespace sapt { - namespace bitext + + // "joint" (i.e., phrase std::pair) statistics + class + jstats { + boost::mutex lock; + uint32_t my_rcnt; // unweighted joint count + uint32_t my_cnt2; // raw counts L2 + float my_wcnt; // weighted joint count + float my_bcnt; // cumulative bias - // using namespace ugdiss; + // to do: use a 
static alignment pattern store that stores each pattern only + // once, so that we don't have to store so many alignment std::vectors + std::vector > > my_aln; + // internal word alignment - // "joint" (i.e., phrase std::pair) statistics - class - jstats - { - boost::mutex lock; - uint32_t my_rcnt; // unweighted joint count - uint32_t my_cnt2; // raw counts L2 - float my_wcnt; // weighted joint count - float my_bcnt; // cumulative bias + uint32_t ofwd[Moses::LRModel::NONE+1]; // forward distortion type counts + uint32_t obwd[Moses::LRModel::NONE+1]; // backward distortion type counts - // to do: use a static alignment pattern store that stores each pattern only - // once, so that we don't have to store so many alignment std::vectors - std::vector > > my_aln; - // internal word alignment + public: + std::map indoc; + // std::vector indoc; // counts origin of samples (for biased sampling) + jstats(); + jstats(jstats const& other); + uint32_t rcnt() const; // raw joint counts + uint32_t cnt2() const; // raw target phrase occurrence count + float wcnt() const; // weighted joint counts + float bcnt() const; // cumulative bias scores - uint32_t ofwd[Moses::LRModel::NONE+1]; // forward distortion type counts - uint32_t obwd[Moses::LRModel::NONE+1]; // backward distortion type counts + std::vector > > const & aln() const; - public: - std::map indoc; - // std::vector indoc; // counts origin of samples (for biased sampling) - jstats(); - jstats(jstats const& other); - uint32_t rcnt() const; // raw joint counts - uint32_t cnt2() const; // raw target phrase occurrence count - float wcnt() const; // weighted joint counts - float bcnt() const; // cumulative bias scores + void + add(float w, float b, std::vector const& a, uint32_t const cnt2, + uint32_t fwd_orient, uint32_t bwd_orient, int const docid); - std::vector > > const & aln() const; - - void - add(float w, float b, std::vector const& a, uint32_t const cnt2, - uint32_t fwd_orient, uint32_t bwd_orient, int const docid); - - void invalidate(); - void validate(); - bool valid(); - uint32_t dcnt_fwd(PhraseOrientation const idx) const; - uint32_t dcnt_bwd(PhraseOrientation const idx) const; - void fill_lr_vec(Moses::LRModel::Direction const& dir, - Moses::LRModel::ModelType const& mdl, - std::vector& v); - }; - } + void invalidate(); + void validate(); + bool valid(); + uint32_t dcnt_fwd(PhraseOrientation const idx) const; + uint32_t dcnt_bwd(PhraseOrientation const idx) const; + void fill_lr_vec(Moses::LRModel::Direction const& dir, + Moses::LRModel::ModelType const& mdl, + std::vector& v); + }; } + diff --git a/moses/TranslationModel/UG/mm/ug_bitext_moses.h b/moses/TranslationModel/UG/mm/ug_bitext_moses.h index 100c5e1e6..c04d87bfd 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_moses.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_moses.h @@ -1,8 +1,7 @@ // -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; cc-style: moses-cc-style -*- #pragma once #ifndef NO_MOSES -namespace Moses { -namespace bitext { +namespace sapt { template SPTR @@ -49,7 +48,7 @@ Bitext { if (max_sample < 0) max_sample = m_default_sample_size; SPTR bias; - SPTR scope = ttask->GetScope(); + SPTR scope = ttask->GetScope(); SPTR context = scope->get(this); if (context) bias = context->bias; SPTR cache; @@ -83,6 +82,5 @@ Bitext -} } #endif diff --git a/moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h b/moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h index 390ccbf5c..e4417df93 100644 --- 
a/moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_phrase_extraction_record.h @@ -3,24 +3,23 @@ #include #include "ug_typedefs.h" -namespace Moses +namespace sapt { - namespace bitext + struct PhraseExtractionRecord { - struct PhraseExtractionRecord - { - size_t const sid, start, stop; - bool const flip; // 'backward' lookup from L2 - size_t s1, s2, e1, e2; // soft and hard boundaries of target phrase - int po_fwd, po_bwd; // fwd and bwd phrase orientation - std::vector* aln; // local alignments - ugdiss::bitvector* full_aln; // full word alignment for sentence - - PhraseExtractionRecord(size_t const xsid, size_t const xstart, - size_t const xstop, bool const xflip, - std::vector* xaln, ugdiss::bitvector* xfull_aln = NULL) - : sid(xsid), start(xstart), stop(xstop), flip(xflip) - , aln(xaln), full_aln(xfull_aln) { } - }; - } + size_t const sid, start, stop; + bool const flip; // 'backward' lookup from L2 + size_t s1, s2, e1, e2; // soft and hard boundaries of target phrase + int po_fwd, po_bwd; // fwd and bwd phrase orientation + std::vector* aln; // local alignments + bitvector* full_aln; // full word alignment for sentence + + PhraseExtractionRecord(size_t const xsid, size_t const xstart, + size_t const xstop, bool const xflip, + std::vector* xaln, + bitvector* xfull_aln = NULL) + : sid(xsid), start(xstart), stop(xstop), flip(xflip) + , aln(xaln), full_aln(xfull_aln) { } + }; } + diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc index 594f37b96..6a5cf036a 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc @@ -1,96 +1,95 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #include #include "ug_bitext_pstats.h" -namespace Moses +namespace sapt { - namespace bitext + +#if UG_BITEXT_TRACK_ACTIVE_THREADS + ThreadSafeCounter pstats::active; +#endif + + pstats:: + pstats() : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0) { - -#if UG_BITEXT_TRACK_ACTIVE_THREADS - ThreadSafeCounter pstats::active; -#endif - - pstats:: - pstats() : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0) - { - for (int i = 0; i <= Moses::LRModel::NONE; ++i) - ofwd[i] = obwd[i] = 0; - } - - pstats:: - ~pstats() - { -#if UG_BITEXT_TRACK_ACTIVE_THREADS - // counter may not exist any more at destruction time, so try ... catch - try { --active; } catch (...) 
{} -#endif - } - - void - pstats:: - register_worker() - { - this->lock.lock(); - ++this->in_progress; - this->lock.unlock(); - } - - void - pstats:: - release() - { - this->lock.lock(); - if (this->in_progress-- == 1) // last one - >we're done - this->ready.notify_all(); - this->lock.unlock(); - } - - void - pstats - ::count_sample(int const docid, size_t const num_pairs, - int const po_fwd, int const po_bwd) - { - boost::lock_guard guard(lock); - ++sample_cnt; - if (num_pairs == 0) return; - ++good; - sum_pairs += num_pairs; - ++ofwd[po_fwd]; - ++obwd[po_bwd]; - if (docid >= 0) - { - // while (int(indoc.size()) <= docid) indoc.push_back(0); - ++indoc[docid]; - } - } - - bool - pstats:: - add(uint64_t pid, float const w, float const b, - std::vector const& a, - uint32_t const cnt2, - uint32_t fwd_o, - uint32_t bwd_o, int const docid) - { - boost::lock_guard guard(this->lock); - jstats& entry = this->trg[pid]; - entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid); - if (this->good < entry.rcnt()) - { - UTIL_THROW(util::Exception, "more joint counts than good counts:" - << entry.rcnt() << "/" << this->good << "!"); - } - return true; - } - - void - pstats:: - wait() const - { - boost::unique_lock lock(this->lock); - while (this->in_progress) - this->ready.wait(lock); - } - + for (int i = 0; i <= Moses::LRModel::NONE; ++i) + ofwd[i] = obwd[i] = 0; } -} + + pstats:: + ~pstats() + { +#if UG_BITEXT_TRACK_ACTIVE_THREADS + // counter may not exist any more at destruction time, so try ... catch + try { --active; } catch (...) {} +#endif + } + + void + pstats:: + register_worker() + { + this->lock.lock(); + ++this->in_progress; + this->lock.unlock(); + } + + void + pstats:: + release() + { + this->lock.lock(); + if (this->in_progress-- == 1) // last one - >we're done + this->ready.notify_all(); + this->lock.unlock(); + } + + void + pstats + ::count_sample(int const docid, size_t const num_pairs, + int const po_fwd, int const po_bwd) + { + boost::lock_guard guard(lock); + ++sample_cnt; + if (num_pairs == 0) return; + ++good; + sum_pairs += num_pairs; + ++ofwd[po_fwd]; + ++obwd[po_bwd]; + if (docid >= 0) + { + // while (int(indoc.size()) <= docid) indoc.push_back(0); + ++indoc[docid]; + } + } + + bool + pstats:: + add(uint64_t pid, float const w, float const b, + std::vector const& a, + uint32_t const cnt2, + uint32_t fwd_o, + uint32_t bwd_o, int const docid) + { + boost::lock_guard guard(this->lock); + jstats& entry = this->trg[pid]; + entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid); + if (this->good < entry.rcnt()) + { + UTIL_THROW(util::Exception, "more joint counts than good counts:" + << entry.rcnt() << "/" << this->good << "!"); + } + return true; + } + + void + pstats:: + wait() const + { + boost::unique_lock lock(this->lock); + while (this->in_progress) + this->ready.wait(lock); + } + +} // end of namespace sapt + diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h index 4be82234c..4e40a26d1 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #pragma once #include @@ -8,58 +8,56 @@ #include "ug_bitext_jstats.h" #include "moses/thread_safe_container.h" -namespace Moses +namespace sapt { - namespace bitext + struct + pstats { - struct - pstats - { - typedef boost::unordered_map > map_t; - typedef ThreadSafeContainer, map_t> cache_t; - typedef std::vector alnvec; - typedef boost::unordered_map 
trg_map_t; - typedef boost::unordered_map indoc_map_t; + typedef boost::unordered_map > map_t; + typedef Moses::ThreadSafeContainer, map_t> cache_t; + typedef std::vector alnvec; + typedef boost::unordered_map trg_map_t; + typedef boost::unordered_map indoc_map_t; #if UG_BITEXT_TRACK_ACTIVE_THREADS - static ThreadSafeCounter active; + static ThreadSafeCounter active; #endif - mutable boost::mutex lock; // for parallel gathering of stats - mutable boost::condition_variable ready; // consumers can wait for me to be ready + mutable boost::mutex lock; // for parallel gathering of stats + mutable boost::condition_variable ready; // consumers can wait for me to be ready - size_t raw_cnt; // (approximate) raw occurrence count - size_t sample_cnt; // number of instances selected during sampling - size_t good; // number of selected instances with valid word alignments - size_t sum_pairs; // total number of target phrases extracted (can be > raw_cnt) - size_t in_progress; // how many threads are currently working on this? + size_t raw_cnt; // (approximate) raw occurrence count + size_t sample_cnt; // number of instances selected during sampling + size_t good; // number of selected instances with valid word alignments + size_t sum_pairs; // total number of target phrases extracted (can be > raw_cnt) + size_t in_progress; // how many threads are currently working on this? - uint32_t ofwd[Moses::LRModel::NONE+1]; // distribution of fwd phrase orientations - uint32_t obwd[Moses::LRModel::NONE+1]; // distribution of bwd phrase orientations + uint32_t ofwd[Moses::LRModel::NONE+1]; // distribution of fwd phrase orientations + uint32_t obwd[Moses::LRModel::NONE+1]; // distribution of bwd phrase orientations - indoc_map_t indoc; - trg_map_t trg; - pstats(); - ~pstats(); - void release(); - void register_worker(); - size_t count_workers() { return in_progress; } + indoc_map_t indoc; + trg_map_t trg; + pstats(); + ~pstats(); + void release(); + void register_worker(); + size_t count_workers() { return in_progress; } - bool - add(uint64_t const pid, // target phrase id - float const w, // sample weight (1./(# of phrases extractable)) - float const b, // sample bias score - alnvec const& a, // local alignment - uint32_t const cnt2, // raw target phrase count - uint32_t fwd_o, // fwd. phrase orientation - uint32_t bwd_o, // bwd. phrase orientation - int const docid); // document where sample was found + bool + add(uint64_t const pid, // target phrase id + float const w, // sample weight (1./(# of phrases extractable)) + float const b, // sample bias score + alnvec const& a, // local alignment + uint32_t const cnt2, // raw target phrase count + uint32_t fwd_o, // fwd. phrase orientation + uint32_t bwd_o, // bwd. 
phrase orientation + int const docid); // document where sample was found - void - count_sample(int const docid, // document where sample was found - size_t const num_pairs, // # of phrases extractable here - int const po_fwd, // fwd phrase orientation - int const po_bwd); // bwd phrase orientation - void wait() const; - }; + void + count_sample(int const docid, // document where sample was found + size_t const num_pairs, // # of phrases extractable here + int const po_fwd, // fwd phrase orientation + int const po_bwd); // bwd phrase orientation + void wait() const; + }; - } } + diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h index 6c08f9d48..d778c95eb 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h @@ -17,14 +17,12 @@ #include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h" #include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h" #include "moses/TranslationModel/UG/generic/sorting/NBestList.h" -namespace Moses -{ -namespace bitext +namespace sapt { enum sampling_method { full_coverage, random_sampling, ranked_sampling }; -typedef ugdiss::ttrack::Position TokenPosition; +typedef ttrack::Position TokenPosition; class CandidateSorter { SamplingBias const& score; @@ -36,7 +34,7 @@ public: template class -BitextSampler : public reference_counter +BitextSampler : public Moses::reference_counter { typedef Bitext bitext; typedef TSA tsa; @@ -71,7 +69,7 @@ BitextSampler : public reference_counter size_t perform_random_sampling(); int check_sample_distribution(uint64_t const& sid, uint64_t const& offset); - bool flip_coin(ugdiss::id_type & sid, ushort & offset); + bool flip_coin(id_type & sid, ushort & offset); public: BitextSampler(BitextSampler const& other); @@ -147,14 +145,14 @@ check_sample_distribution(uint64_t const& sid, uint64_t const& offset) template bool BitextSampler:: -flip_coin(ugdiss::id_type & sid, ushort & offset) +flip_coin(id_type & sid, ushort & offset) { int no_maybe_yes = m_bias ? check_sample_distribution(sid, offset) : 1; if (no_maybe_yes == 0) return false; // no if (no_maybe_yes > 1) return true; // yes // ... 
maybe: flip a coin size_t options_chosen = m_stats->good; - size_t options_total = max(m_stats->raw_cnt, m_ctr); + size_t options_total = std::max(m_stats->raw_cnt, m_ctr); size_t options_left = (options_total - m_ctr); size_t random_number = options_left * (m_rnd()/(m_rnd.max()+1.)); size_t threshold; @@ -231,12 +229,12 @@ perform_ranked_sampling() if (m_next == m_stop) return m_ctr; CandidateSorter sorter(*m_bias); // below: nbest size = 4 * m_samples to allow for failed phrase extraction - NBestList nbest(4*m_samples,sorter); - ugdiss::tsa::ArrayEntry I(m_next); + Moses::NBestList nbest(4*m_samples, sorter); + sapt::tsa::ArrayEntry I(m_next); while (I.next < m_stop) { ++m_ctr; - nbest.add(m_root->readEntry(I.next,I)); + nbest.add(m_root->readEntry(I.next, I)); } for (size_t i = 0; m_stats->good < m_samples && i < nbest.size(); ++i) consider_sample(nbest[i]); @@ -251,18 +249,19 @@ perform_random_sampling() { if (m_next == m_stop) return m_ctr; m_bias_total = 0; + sapt::tsa::ArrayEntry I(m_next); if (m_bias) { m_stats->raw_cnt = 0; - for (ugdiss::tsa::ArrayEntry I(m_next); I.next < m_stop;) + while (I.next < m_stop) { m_root->readEntry(I.next,I); ++m_stats->raw_cnt; m_bias_total += (*m_bias)[I.sid]; } + I.next = m_next; } - ugdiss::tsa::ArrayEntry I(m_next); while (m_stats->good < m_samples && I.next < m_stop) { ++m_ctr; @@ -300,7 +299,7 @@ consider_sample(TokenPosition const& p) for (size_t k = 1; k < aln.size(); k += 2) aln[k] += rec.s2 - rec.s1; - vector seen; seen.reserve(10); + std::vector seen; seen.reserve(10); // It is possible that the phrase extraction extracts the same // phrase twice, e.g., when word a co-occurs with sequence b b b but // is aligned only to the middle word. We can only count each phrase @@ -377,5 +376,5 @@ BitextSampler:: m_stats->release(); } -} // end of namespace bitext -} // end of namespace Moses +} // end of namespace sapt + diff --git a/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h b/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h index 29816a55d..e63b3b345 100644 --- a/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h +++ b/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h @@ -1,11 +1,11 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // (c) 2007-2012 Ulrich Germann // Token class for dependency trees, where the linear order // of tokens is defined as going up a dependency chain #ifndef __ug_conll_bottom_up_token_h #define __ug_conll_bottok_up_token_h #include "ug_typedefs.h" -namespace ugdiss +namespace sapt { // using namespace std; diff --git a/moses/TranslationModel/UG/mm/ug_conll_record.cc b/moses/TranslationModel/UG/mm/ug_conll_record.cc index c44a20b92..5d7084c17 100644 --- a/moses/TranslationModel/UG/mm/ug_conll_record.cc +++ b/moses/TranslationModel/UG/mm/ug_conll_record.cc @@ -1,5 +1,6 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #include "ug_conll_record.h" -namespace ugdiss +namespace sapt { Conll_Record Conll_Record:: diff --git a/moses/TranslationModel/UG/mm/ug_conll_record.h b/moses/TranslationModel/UG/mm/ug_conll_record.h index c8663a166..b77ccb981 100644 --- a/moses/TranslationModel/UG/mm/ug_conll_record.h +++ b/moses/TranslationModel/UG/mm/ug_conll_record.h @@ -1,12 +1,16 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #ifndef __ug_conll_record_h #define __ug_conll_record_h #include "ug_typedefs.h" +#include // Base class for dependency tree corpora with POS and Lemma annotations -namespace ugdiss +namespace sapt { // using namespace std; - + + 
using tpt::id_type; + using tpt::uchar; class Conll_Record { diff --git a/moses/TranslationModel/UG/mm/ug_corpus_token.cc b/moses/TranslationModel/UG/mm/ug_corpus_token.cc index aa08ccc4e..b32d939e7 100644 --- a/moses/TranslationModel/UG/mm/ug_corpus_token.cc +++ b/moses/TranslationModel/UG/mm/ug_corpus_token.cc @@ -1,8 +1,10 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #include "ug_corpus_token.h" -// Simple wrapper around integer IDs for use with the Ctrack and TSA template classes. +// Simple wrapper around integer IDs for use with the Ctrack and TSA +// template classes. // (c) 2007-2009 Ulrich Germann -namespace ugdiss +namespace sapt { id_type const& SimpleWordId:: diff --git a/moses/TranslationModel/UG/mm/ug_corpus_token.h b/moses/TranslationModel/UG/mm/ug_corpus_token.h index 52ec41a40..88a96f1d4 100644 --- a/moses/TranslationModel/UG/mm/ug_corpus_token.h +++ b/moses/TranslationModel/UG/mm/ug_corpus_token.h @@ -1,6 +1,9 @@ -// -*- c++ -*- -// This code is part of the re-factorization of the earlier non-template implementation of "corpus tracks" -// and suffix and prefix arrays over them as template classes. +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- + +// This code is part of the re-factorization of the earlier +// non-template implementation of "corpus tracks" and suffix and +// prefix arrays over them as template classes. + // (c) 2007-2009 Ulrich Germann #ifndef __ug_corpus_token_h @@ -13,9 +16,10 @@ #include "tpt_typedefs.h" +#include "ug_typedefs.h" #include "ug_ttrack_base.h" -namespace ugdiss +namespace sapt { /** Simple wrapper around id_type for use with the Ttrack/TSA template classes */ diff --git a/moses/TranslationModel/UG/mm/ug_deptree.cc b/moses/TranslationModel/UG/mm/ug_deptree.cc index 003d9b35e..54b090f72 100644 --- a/moses/TranslationModel/UG/mm/ug_deptree.cc +++ b/moses/TranslationModel/UG/mm/ug_deptree.cc @@ -1,10 +1,11 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #include #include "ug_deptree.h" #include "tpt_tokenindex.h" using namespace std; -namespace ugdiss +namespace sapt { bool diff --git a/moses/TranslationModel/UG/mm/ug_deptree.h b/moses/TranslationModel/UG/mm/ug_deptree.h index 1cd3f2a0c..df096b52b 100644 --- a/moses/TranslationModel/UG/mm/ug_deptree.h +++ b/moses/TranslationModel/UG/mm/ug_deptree.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // (c) 2007-2012 Ulrich Germann // Stuff related to dependency trees @@ -16,7 +16,7 @@ #include "ug_typedefs.h" // using namespace std; -namespace ugdiss +namespace sapt { // Fills the std::vector v with pointers to the internal root r_x for the diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.cc b/moses/TranslationModel/UG/mm/ug_im_bitext.cc index 55603e1e2..ac6dddd6c 100644 --- a/moses/TranslationModel/UG/mm/ug_im_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_im_bitext.cc @@ -1,87 +1,86 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #include "ug_im_bitext.h" -namespace Moses +namespace sapt { - namespace bitext + using namespace tpt; + template<> + SPTR > > + imBitext >:: + add(std::vector const& s1, + std::vector const& s2, + std::vector const& aln) const { + typedef L2R_Token TKN; + assert(s1.size() == s2.size() && s1.size() == aln.size()); - template<> - SPTR > > - imBitext >:: - add(vector const& s1, - vector const& s2, - vector const& aln) const +#ifndef NDEBUG + size_t first_new_snt = this->T1 ? 
this->T1->size() : 0; +#endif + + SPTR > ret; { - typedef L2R_Token TKN; - assert(s1.size() == s2.size() && s1.size() == aln.size()); - -#ifndef NDEBUG - size_t first_new_snt = this->T1 ? this->T1->size() : 0; -#endif - - SPTR > ret; - { - boost::unique_lock guard(m_lock); - ret.reset(new imBitext(*this)); - } - - // we add the sentences in separate threads (so it's faster) - boost::thread thread1(snt_adder(s1,*ret->V1,ret->myT1,ret->myI1)); - // thread1.join(); // for debugging - boost::thread thread2(snt_adder(s2,*ret->V2,ret->myT2,ret->myI2)); - BOOST_FOREACH(string const& a, aln) - { - istringstream ibuf(a); - ostringstream obuf; - uint32_t row,col; char c; - while (ibuf >> row >> c >> col) - { - UTIL_THROW_IF2(c != '-', "[" << HERE << "] " - << "Error in alignment information:\n" << a); - ugdiss::binwrite(obuf,row); - ugdiss::binwrite(obuf,col); - } - // important: DO NOT replace the two lines below this comment by - // char const* x = obuf.str().c_str(), as the memory x is pointing - // to is freed immediately upon deconstruction of the string object. - string foo = obuf.str(); - char const* x = foo.c_str(); - vector v(x,x+foo.size()); - ret->myTx = append(ret->myTx, v); - } - - thread1.join(); - thread2.join(); - - ret->Tx = ret->myTx; - ret->T1 = ret->myT1; - ret->T2 = ret->myT2; - ret->I1 = ret->myI1; - ret->I2 = ret->myI2; - -#ifndef NDEBUG - // sanity check - for (size_t i = first_new_snt; i < ret->T1->size(); ++i) - { - size_t slen1 = ret->T1->sntLen(i); - size_t slen2 = ret->T2->sntLen(i); - char const* p = ret->Tx->sntStart(i); - char const* q = ret->Tx->sntEnd(i); - size_t k; - while (p < q) - { - p = binread(p,k); - assert(p); - assert(p < q); - assert(k < slen1); - p = binread(p,k); - assert(p); - assert(k < slen2); - } - } -#endif - return ret; + boost::unique_lock guard(m_lock); + ret.reset(new imBitext(*this)); } + // we add the sentences in separate threads (so it's faster) + boost::thread thread1(snt_adder(s1,*ret->V1,ret->myT1,ret->myI1)); + // thread1.join(); // for debugging + boost::thread thread2(snt_adder(s2,*ret->V2,ret->myT2,ret->myI2)); + BOOST_FOREACH(std::string const& a, aln) + { + std::istringstream ibuf(a); + std::ostringstream obuf; + uint32_t row,col; char c; + while (ibuf >> row >> c >> col) + { + UTIL_THROW_IF2(c != '-', "[" << HERE << "] " + << "Error in alignment information:\n" << a); + binwrite(obuf,row); + binwrite(obuf,col); + } + // important: DO NOT replace the two lines below this comment by + // char const* x = obuf.str().c_str(), as the memory x is pointing + // to is freed immediately upon deconstruction of the string object. 
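        // A minimal sketch of the lifetime hazard the comment above warns
        // about (illustration only, not part of this patch):
        //
        //   char const* x = obuf.str().c_str(); // WRONG: obuf.str() returns a
        //                                       // temporary std::string whose
        //                                       // buffer is freed at the end of
        //                                       // this full expression, so x
        //                                       // dangles immediately.
        //   std::string foo = obuf.str();       // RIGHT: keep a named copy ...
        //   char const* x2 = foo.c_str();       // ... valid while foo is alive.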
+ std::string foo = obuf.str(); + char const* x = foo.c_str(); + std::vector v(x,x+foo.size()); + ret->myTx = append(ret->myTx, v); + } + + thread1.join(); + thread2.join(); + + ret->Tx = ret->myTx; + ret->T1 = ret->myT1; + ret->T2 = ret->myT2; + ret->I1 = ret->myI1; + ret->I2 = ret->myI2; + +#ifndef NDEBUG + // sanity check + for (size_t i = first_new_snt; i < ret->T1->size(); ++i) + { + size_t slen1 = ret->T1->sntLen(i); + size_t slen2 = ret->T2->sntLen(i); + char const* p = ret->Tx->sntStart(i); + char const* q = ret->Tx->sntEnd(i); + size_t k; + while (p < q) + { + p = binread(p,k); + assert(p); + assert(p < q); + assert(k < slen1); + p = binread(p,k); + assert(p); + assert(k < slen2); + } + } +#endif + return ret; } + } + diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.h b/moses/TranslationModel/UG/mm/ug_im_bitext.h index ca7c75c77..b17ccf415 100644 --- a/moses/TranslationModel/UG/mm/ug_im_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_im_bitext.h @@ -2,129 +2,127 @@ #pragma once #include "ug_bitext.h" -namespace Moses +namespace sapt { - namespace bitext + template + class imBitext : public Bitext { - template - class imBitext : public Bitext - { - SPTR > myTx; - SPTR > myT1; - SPTR > myT2; - SPTR > myI1; - SPTR > myI2; - static ThreadSafeCounter my_revision; - public: - size_t revision() const { return my_revision; } - void open(string const base, string const L1, string L2); - imBitext(SPTR const& V1, - SPTR const& V2, - size_t max_sample = 5000, size_t num_workers=4); - imBitext(size_t max_sample = 5000, size_t num_workers=4); - imBitext(imBitext const& other); + SPTR > myTx; + SPTR > myT1; + SPTR > myT2; + SPTR > myI1; + SPTR > myI2; + static Moses::ThreadSafeCounter my_revision; + public: + size_t revision() const { return my_revision; } + void open(std::string const base, std::string const L1, std::string L2); + imBitext(SPTR const& V1, + SPTR const& V2, + size_t max_sample = 5000, size_t num_workers=4); + imBitext(size_t max_sample = 5000, size_t num_workers=4); + imBitext(imBitext const& other); - // SPTR > - // add(vector const& s1, std::vector const& s2, vector & a); + // SPTR > + // add(std::vector const& s1, std::vector const& s2, std::vector & a); - SPTR > - add(vector const& s1, - std::vector const& s2, - std::vector const& a) const; - - }; - - template - ThreadSafeCounter - imBitext::my_revision; - - template - imBitext:: - imBitext(size_t max_sample, size_t num_workers) - : Bitext(max_sample, num_workers) - { - this->m_default_sample_size = max_sample; - this->V1.reset(new TokenIndex()); - this->V2.reset(new TokenIndex()); - this->V1->setDynamic(true); - this->V2->setDynamic(true); - ++my_revision; - } - - template - imBitext:: - imBitext(SPTR const& v1, - SPTR const& v2, - size_t max_sample, size_t num_workers) - : Bitext(max_sample, num_workers) - { - // this->default_sample_size = max_sample; - this->V1 = v1; - this->V2 = v2; - this->V1->setDynamic(true); - this->V2->setDynamic(true); - ++my_revision; - } - - - template - imBitext:: - imBitext(imBitext const& other) - { - this->myTx = other.myTx; - this->myT1 = other.myT1; - this->myT2 = other.myT2; - this->myI1 = other.myI1; - this->myI2 = other.myI2; - this->Tx = this->myTx; - this->T1 = this->myT1; - this->T2 = this->myT2; - this->I1 = this->myI1; - this->I2 = this->myI2; - this->V1 = other.V1; - this->V2 = other.V2; - this->m_default_sample_size = other.m_default_sample_size; - this->m_num_workers = other.m_num_workers; - ++my_revision; - } - - template<> - SPTR > > - imBitext >:: - add(vector const& s1, 
- vector const& s2, - vector const& aln) const; - - template SPTR > - imBitext:: - add(vector const& s1, - vector const& s2, - vector const& aln) const - { - throw "Not yet implemented"; - } + add(std::vector const& s1, + std::vector const& s2, + std::vector const& a) const; - // What's up with this function???? UG - template - void - imBitext:: - open(string const base, string const L1, string L2) - { - mmTtrack& t1 = *reinterpret_cast*>(this->T1.get()); - mmTtrack& t2 = *reinterpret_cast*>(this->T2.get()); - mmTtrack& tx = *reinterpret_cast*>(this->Tx.get()); - t1.open(base+L1+".mct"); - t2.open(base+L2+".mct"); - tx.open(base+L1+"-"+L2+".mam"); - this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex(); - this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex(); - mmTSA& i1 = *reinterpret_cast*>(this->I1.get()); - mmTSA& i2 = *reinterpret_cast*>(this->I2.get()); - i1.open(base+L1+".sfa", this->T1); - i2.open(base+L2+".sfa", this->T2); - assert(this->T1->size() == this->T2->size()); - } + }; + template + Moses::ThreadSafeCounter + imBitext::my_revision; + + template + imBitext:: + imBitext(size_t max_sample, size_t num_workers) + : Bitext(max_sample, num_workers) + { + this->m_default_sample_size = max_sample; + this->V1.reset(new TokenIndex()); + this->V2.reset(new TokenIndex()); + this->V1->setDynamic(true); + this->V2->setDynamic(true); + ++my_revision; } + + template + imBitext:: + imBitext(SPTR const& v1, + SPTR const& v2, + size_t max_sample, size_t num_workers) + : Bitext(max_sample, num_workers) + { + // this->default_sample_size = max_sample; + this->V1 = v1; + this->V2 = v2; + this->V1->setDynamic(true); + this->V2->setDynamic(true); + ++my_revision; + } + + + template + imBitext:: + imBitext(imBitext const& other) + { + this->myTx = other.myTx; + this->myT1 = other.myT1; + this->myT2 = other.myT2; + this->myI1 = other.myI1; + this->myI2 = other.myI2; + this->Tx = this->myTx; + this->T1 = this->myT1; + this->T2 = this->myT2; + this->I1 = this->myI1; + this->I2 = this->myI2; + this->V1 = other.V1; + this->V2 = other.V2; + this->m_default_sample_size = other.m_default_sample_size; + this->m_num_workers = other.m_num_workers; + ++my_revision; + } + + template<> + SPTR > > + imBitext >:: + add(std::vector const& s1, + std::vector const& s2, + std::vector const& aln) const; + + template + SPTR > + imBitext:: + add(std::vector const& s1, + std::vector const& s2, + std::vector const& aln) const + { + throw "Not yet implemented"; + } + + // What's up with this function???? 
UG + template + void + imBitext:: + open(std::string const base, std::string const L1, std::string L2) + { + mmTtrack& t1 = *reinterpret_cast*>(this->T1.get()); + mmTtrack& t2 = *reinterpret_cast*>(this->T2.get()); + mmTtrack& tx = *reinterpret_cast*>(this->Tx.get()); + t1.open(base+L1+".mct"); + t2.open(base+L2+".mct"); + tx.open(base+L1+"-"+L2+".mam"); + this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex(); + this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex(); + mmTSA& i1 = *reinterpret_cast*>(this->I1.get()); + mmTSA& i2 = *reinterpret_cast*>(this->I2.get()); + i1.open(base+L1+".sfa", this->T1); + i2.open(base+L2+".sfa", this->T2); + assert(this->T1->size() == this->T2->size()); + } + } + diff --git a/moses/TranslationModel/UG/mm/ug_im_tsa.h b/moses/TranslationModel/UG/mm/ug_im_tsa.h index 60cc8dc47..87cd63db6 100644 --- a/moses/TranslationModel/UG/mm/ug_im_tsa.h +++ b/moses/TranslationModel/UG/mm/ug_im_tsa.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // (c) 2007-2009 Ulrich Germann. All rights reserved. #ifndef _ug_im_tsa_h #define _ug_im_tsa_h @@ -22,7 +22,7 @@ #include "moses/TranslationModel/UG/generic/threading/ug_thread_pool.h" #include "util/usage.hh" -namespace ugdiss +namespace sapt { // using namespace std; // using namespace boost; @@ -383,32 +383,32 @@ namespace ugdiss std::ofstream out(fname.c_str()); filepos_type idxStart(0); id_type idxSize(index.size()); - numwrite(out,idxStart); - numwrite(out,idxSize); + tpt::numwrite(out,idxStart); + tpt::numwrite(out,idxSize); std::vector mmIndex; for (size_t i = 1; i < this->index.size(); i++) { - mmIndex.push_back(out.tellp()); - for (size_t k = this->index[i-1]; k < this->index[i]; ++k) - { - tightwrite(out,sufa[k].sid,0); - tightwrite(out,sufa[k].offset,1); - } + mmIndex.push_back(out.tellp()); + for (size_t k = this->index[i-1]; k < this->index[i]; ++k) + { + tpt::tightwrite(out,sufa[k].sid,0); + tpt::tightwrite(out,sufa[k].offset,1); + } } mmIndex.push_back(out.tellp()); idxStart = out.tellp(); for (size_t i = 0; i < mmIndex.size(); i++) - numwrite(out,mmIndex[i]-mmIndex[0]); + tpt::numwrite(out,mmIndex[i]-mmIndex[0]); out.seekp(0); - numwrite(out,idxStart); + tpt::numwrite(out,idxStart); out.close(); } template imTSA:: imTSA(imTSA const& prior, - boost::shared_ptr const> const& crp, - std::vector const& newsids, size_t const vsize) + boost::shared_ptr const> const& crp, + std::vector const& newsids, size_t const vsize) { typename ttrack::Position::LESS > sorter(crp.get()); diff --git a/moses/TranslationModel/UG/mm/ug_im_ttrack.h b/moses/TranslationModel/UG/mm/ug_im_ttrack.h index fd14c161f..539831ceb 100644 --- a/moses/TranslationModel/UG/mm/ug_im_ttrack.h +++ b/moses/TranslationModel/UG/mm/ug_im_ttrack.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // In-memory corpus track // (c) 2006-2012 Ulrich Germann. 
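The imTSA save routine above (like mm2dTable::write further down) writes a placeholder for the index offset at the top of the file, streams out the data, and only then seeks back to patch in the real offset. A self-contained sketch of that pattern follows, using plain ofstream writes instead of tpt::numwrite; the file name and payload are made up for illustration and are not part of this patch.

#include <fstream>
#include <stdint.h>
#include <vector>

int main()
{
  std::ofstream out("example.bin", std::ios::binary);

  uint64_t idxStart = 0;                     // placeholder, patched below
  out.write(reinterpret_cast<char const*>(&idxStart), sizeof(idxStart));

  std::vector<uint32_t> payload(1000, 42);   // the actual data
  out.write(reinterpret_cast<char const*>(&payload[0]),
            payload.size() * sizeof(uint32_t));

  idxStart = out.tellp();                    // the index starts here ...
  // ... write the index itself ...

  out.seekp(0);                              // go back to the top and fill in
  out.write(reinterpret_cast<char const*>(&idxStart), sizeof(idxStart));
  return 0;
}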
@@ -26,10 +26,8 @@ #define IMTTRACK_INCREMENT_SIZE 100000 #define IMTSA_INCREMENT_SIZE 1000000 -namespace ugdiss +namespace sapt { - // using namespace std; - // using namespace boost; namespace bio=boost::iostreams; template class imTSA; diff --git a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h index 6a87b4f69..3ec0af454 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h +++ b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h @@ -15,23 +15,23 @@ #include "ug_mm_2d_table.h" #include "util/exception.hh" // using namespace std; -namespace ugdiss +namespace sapt { template class LexicalPhraseScorer2 { - std::vector ftag; + std::vector ftag; public: typedef mm2dTable table_t; table_t COOC; - void open(string const& fname); - template + void open(std::string const& fname); + template void score(TKN const* snt1, size_t const s1, size_t const e1, TKN const* snt2, size_t const s2, size_t const e2, - std::vector const & aln, float const alpha, + std::vector const & aln, float const alpha, float & fwd_score, float& bwd_score) const; void @@ -53,18 +53,18 @@ namespace ugdiss template void LexicalPhraseScorer2:: - open(string const& fname) + open(std::string const& fname) { COOC.open(fname); } template - template + template void LexicalPhraseScorer2:: score(TKN const* snt1, size_t const s1, size_t const e1, TKN const* snt2, size_t const s2, size_t const e2, - vector const & aln, float const alpha, + std::vector const & aln, float const alpha, float & fwd_score, float& bwd_score) const { std::vector p1(e1,0), p2(e2,0); @@ -146,7 +146,7 @@ namespace ugdiss size_t i1=0,i2=0; for (char const* x = aln_start; x < aln_end;) { - x = binread(binread(x,i1),i2); + x = tpt::binread(tpt::binread(x,i1),i2); if (i1 < s1 || i1 >= e1 || i2 < s2 || i2 >= e2) continue; p1[i1] += plup_fwd(snt1[i1].id(), snt2[i2].id(),alpha); ++c1[i1]; diff --git a/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc b/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc index 704aced63..3273905bd 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc +++ b/moses/TranslationModel/UG/mm/ug_lexical_reordering.cc @@ -1,151 +1,149 @@ #include "ug_lexical_reordering.h" #include "moses/Util.h" -namespace Moses +namespace sapt { - namespace bitext + using namespace std; + + Moses::LRModel::ReorderingType po_other = Moses::LRModel::NONE; + // check if min and max in the aligmnet vector v are within the + // bounds LFT and RGT and update the actual bounds L and R; update + // the total count of alignment links in the underlying phrase + // pair + bool + check(vector const& v, // alignment row/column + size_t const LFT, size_t const RGT, // hard limits + ushort& L, ushort& R, size_t& count) // current bounds, count { - using namespace std; + if (v.size() == 0) return 0; + if (L > v.front() && (L=v.front()) < LFT) return false; + if (R < v.back() && (R=v.back()) > RGT) return false; + count += v.size(); + return true; + } - Moses::LRModel::ReorderingType po_other = Moses::LRModel::NONE; - // check if min and max in the aligmnet vector v are within the - // bounds LFT and RGT and update the actual bounds L and R; update - // the total count of alignment links in the underlying phrase - // pair - bool - check(vector const& v, // alignment row/column - size_t const LFT, size_t const RGT, // hard limits - ushort& L, ushort& R, size_t& count) // current bounds, count - { - if (v.size() == 0) return 0; - if (L > v.front() && (L=v.front()) < LFT) 
return false; - if (R < v.back() && (R=v.back()) > RGT) return false; - count += v.size(); - return true; - } + /// return number of alignment points in box, -1 on failure + int + expand_block(vector > const& row2col, + vector > const& col2row, + size_t row, size_t col, // seed coordinates + size_t const TOP, size_t const LFT, // hard limits + size_t const BOT, size_t const RGT, // hard limits + ushort* top = NULL, ushort* lft = NULL, + ushort* bot = NULL, ushort* rgt = NULL) // store results + { + if (row < TOP || row > BOT || col < LFT || col > RGT) return -1; + UTIL_THROW_IF2(row >= row2col.size(), "out of bounds"); + UTIL_THROW_IF2(col >= col2row.size(), "out of bounds"); - /// return number of alignment points in box, -1 on failure - int - expand_block(vector > const& row2col, - vector > const& col2row, - size_t row, size_t col, // seed coordinates - size_t const TOP, size_t const LFT, // hard limits - size_t const BOT, size_t const RGT, // hard limits - ushort* top = NULL, ushort* lft = NULL, - ushort* bot = NULL, ushort* rgt = NULL) // store results - { - if (row < TOP || row > BOT || col < LFT || col > RGT) return -1; - UTIL_THROW_IF2(row >= row2col.size(), "out of bounds"); - UTIL_THROW_IF2(col >= col2row.size(), "out of bounds"); + // ==================================================== + // tables grow downwards, so TOP is smaller than BOT! + // ==================================================== - // ==================================================== - // tables grow downwards, so TOP is smaller than BOT! - // ==================================================== + ushort T, L, B, R; // box dimensions - ushort T, L, B, R; // box dimensions + // if we start on an empty cell, search for the first alignment point + if (row2col[row].size() == 0 && col2row[col].size() == 0) + { + if (row == TOP) while (row < BOT && !row2col[++row].size()); + else if (row == BOT) while (row > TOP && !row2col[--row].size()); - // if we start on an empty cell, search for the first alignment point - if (row2col[row].size() == 0 && col2row[col].size() == 0) - { - if (row == TOP) while (row < BOT && !row2col[++row].size()); - else if (row == BOT) while (row > TOP && !row2col[--row].size()); + if (col == LFT) while (col < RGT && !col2row[++col].size()); + else if (col == RGT) while (col > RGT && !col2row[--col].size()); - if (col == LFT) while (col < RGT && !col2row[++col].size()); - else if (col == RGT) while (col > RGT && !col2row[--col].size()); + if (row2col[row].size() == 0 && col2row[col].size() == 0) + return 0; + } + if (row2col[row].size() == 0) + row = col2row[col].front(); + if (col2row[col].size() == 0) + col = row2col[row].front(); - if (row2col[row].size() == 0 && col2row[col].size() == 0) - return 0; - } - if (row2col[row].size() == 0) - row = col2row[col].front(); - if (col2row[col].size() == 0) - col = row2col[row].front(); + if ((T = col2row[col].front()) < TOP) return -1; + if ((B = col2row[col].back()) > BOT) return -1; + if ((L = row2col[row].front()) < LFT) return -1; + if ((R = row2col[row].back()) > RGT) return -1; - if ((T = col2row[col].front()) < TOP) return -1; - if ((B = col2row[col].back()) > BOT) return -1; - if ((L = row2col[row].front()) < LFT) return -1; - if ((R = row2col[row].back()) > RGT) return -1; + if (B == T && R == L) return 1; - if (B == T && R == L) return 1; + // start/end of row / column coverage: + ushort rs = row, re = row, cs = col, ce = col; + int ret = row2col[row].size(); + for (size_t tmp = 1; tmp; ret += tmp) + { + tmp = 0;; + while (rs>T) if 
(!check(row2col[--rs],LFT,RGT,L,R,tmp)) return -1; + while (reL) if (!check(col2row[--cs],TOP,BOT,T,B,tmp)) return -1; + while (ceT) if (!check(row2col[--rs],LFT,RGT,L,R,tmp)) return -1; - while (reL) if (!check(col2row[--cs],TOP,BOT,T,B,tmp)) return -1; - while (ce >& a1, - vector >& a2, - size_t s1, size_t e1, - size_t s2, size_t e2) - { - if (e2 == a2.size()) // end of target sentence - return Moses::LRModel::M; - size_t y = e2, L = e2, R = a2.size()-1; // won't change - size_t x = e1, T = e1, B = a1.size()-1; - if (e1 < a1.size() && expand_block(a1,a2,x,y,T,L,B,R) >= 0) - return Moses::LRModel::M; - B = x = s1-1; T = 0; - if (s1 && expand_block(a1,a2,x,y,T,L,B,R) >= 0) - return Moses::LRModel::S; - while (e2 < a2.size() && a2[e2].size() == 0) ++e2; - if (e2 == a2.size()) // should never happen, actually - return Moses::LRModel::NONE; - if (a2[e2].back() < s1) - return Moses::LRModel::DL; - if (a2[e2].front() >= e1) - return Moses::LRModel::DR; + Moses::LRModel::ReorderingType + find_po_fwd(vector >& a1, + vector >& a2, + size_t s1, size_t e1, + size_t s2, size_t e2) + { + if (e2 == a2.size()) // end of target sentence + return Moses::LRModel::M; + size_t y = e2, L = e2, R = a2.size()-1; // won't change + size_t x = e1, T = e1, B = a1.size()-1; + if (e1 < a1.size() && expand_block(a1,a2,x,y,T,L,B,R) >= 0) + return Moses::LRModel::M; + B = x = s1-1; T = 0; + if (s1 && expand_block(a1,a2,x,y,T,L,B,R) >= 0) + return Moses::LRModel::S; + while (e2 < a2.size() && a2[e2].size() == 0) ++e2; + if (e2 == a2.size()) // should never happen, actually return Moses::LRModel::NONE; - } + if (a2[e2].back() < s1) + return Moses::LRModel::DL; + if (a2[e2].front() >= e1) + return Moses::LRModel::DR; + return Moses::LRModel::NONE; + } - Moses::LRModel::ReorderingType - find_po_bwd(vector >& a1, - vector >& a2, - size_t s1, size_t e1, - size_t s2, size_t e2) - { - if (s1 == 0 && s2 == 0) return Moses::LRModel::M; - if (s2 == 0) return Moses::LRModel::DR; - if (s1 == 0) return Moses::LRModel::DL; - size_t y = s2-1, L = 0, R = s2-1; // won't change - size_t x = s1-1, T = 0, B = s1-1; - if (expand_block(a1,a2,x,y,T,L,B,R) >= 0) - return Moses::LRModel::M; - T = x = e1; B = a1.size()-1; - if (expand_block(a1,a2,x,y,T,L,B,R) >= 0) - return Moses::LRModel::S; - while (s2-- && a2[s2].size() == 0); + Moses::LRModel::ReorderingType + find_po_bwd(vector >& a1, + vector >& a2, + size_t s1, size_t e1, + size_t s2, size_t e2) + { + if (s1 == 0 && s2 == 0) return Moses::LRModel::M; + if (s2 == 0) return Moses::LRModel::DR; + if (s1 == 0) return Moses::LRModel::DL; + size_t y = s2-1, L = 0, R = s2-1; // won't change + size_t x = s1-1, T = 0, B = s1-1; + if (expand_block(a1,a2,x,y,T,L,B,R) >= 0) + return Moses::LRModel::M; + T = x = e1; B = a1.size()-1; + if (expand_block(a1,a2,x,y,T,L,B,R) >= 0) + return Moses::LRModel::S; + while (s2-- && a2[s2].size() == 0); - Moses::LRModel::ReorderingType ret; - ret = (a2[s2].size() == 0 ? po_other : - a2[s2].back() < s1 ? Moses::LRModel::DR : - a2[s2].front() >= e1 ? Moses::LRModel::DL : - po_other); + Moses::LRModel::ReorderingType ret; + ret = (a2[s2].size() == 0 ? po_other : + a2[s2].back() < s1 ? Moses::LRModel::DR : + a2[s2].front() >= e1 ? 
Moses::LRModel::DL : + po_other); #if 0 - cout << "s1=" << s1 << endl; - cout << "s2=" << s2x << "=>" << s2 << endl; - cout << "e1=" << e1 << endl; - cout << "e2=" << e2 << endl; - cout << "a2[s2].size()=" << a2[s2].size() << endl; - cout << "a2[s2].back()=" << a2[s2].back() << endl; - cout << "a2[s2].front()=" << a2[s2].front() << endl; - cout << "RETURNING " << ret << endl; + cout << "s1=" << s1 << endl; + cout << "s2=" << s2x << "=>" << s2 << endl; + cout << "e1=" << e1 << endl; + cout << "e2=" << e2 << endl; + cout << "a2[s2].size()=" << a2[s2].size() << endl; + cout << "a2[s2].back()=" << a2[s2].back() << endl; + cout << "a2[s2].front()=" << a2[s2].front() << endl; + cout << "RETURNING " << ret << endl; #endif - return ret; - } + return ret; + } + +} // namespace sapt - } // namespace bitext -} // namespace Moses diff --git a/moses/TranslationModel/UG/mm/ug_lexical_reordering.h b/moses/TranslationModel/UG/mm/ug_lexical_reordering.h index 965dfcc04..846f9436d 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_reordering.h +++ b/moses/TranslationModel/UG/mm/ug_lexical_reordering.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #pragma once #include "ug_typedefs.h" #include @@ -8,9 +8,10 @@ #include "moses/FF/LexicalReordering/LexicalReorderingState.h" #endif -namespace Moses { +namespace sapt { + #ifdef NO_MOSES -namespace LRModel{ +class LRModel{ enum ModelType { Monotonic, MSD, MSLR, LeftRight, None }; enum Direction { Forward, Backward, Bidirectional }; @@ -28,13 +29,13 @@ namespace LRModel{ NONE = 4 // largest possible }; -} +}; +typedef int PhraseOrientation; +#else + typedef Moses::LRModel LRModel; + typedef Moses::LRModel::ReorderingType PhraseOrientation; #endif -namespace bitext { - -typedef Moses::LRModel::ReorderingType PhraseOrientation; - PhraseOrientation find_po_fwd(std::vector >& a1, std::vector >& a2, @@ -47,7 +48,4 @@ find_po_bwd(std::vector >& a1, size_t b1, size_t e1, size_t b2, size_t e2); - - - -}} // close namespaces +} // close namespaces diff --git a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h index 0ae16895b..f8d9b9b20 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h +++ b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h @@ -11,7 +11,7 @@ #include "ug_typedefs.h" #include "util/exception.hh" namespace bio=boost::iostreams; -namespace ugdiss +namespace sapt { // using namespace std; template @@ -186,9 +186,9 @@ namespace ugdiss } filepos_type idxOffset=0; - numwrite(out,idxOffset); // place holder, we'll return here at the end - numwrite(out,id_type(m1->size())); // number of rows - numwrite(out,id_type(m2->size())); // number of columns + tpt::numwrite(out,idxOffset); // place holder, we'll return here at the end + tpt::numwrite(out,id_type(m1->size())); // number of rows + tpt::numwrite(out,id_type(m2->size())); // number of columns // write actual table std::vector index; @@ -229,7 +229,7 @@ namespace ugdiss out.write(reinterpret_cast(&(*m2)[0]),m2->size()*sizeof(VAL)); out.seekp(0); - numwrite(out,idxOffset); + tpt::numwrite(out,idxOffset); } } #endif diff --git a/moses/TranslationModel/UG/mm/ug_mm_bitext.h b/moses/TranslationModel/UG/mm/ug_mm_bitext.h index 63feb9427..b15fb15b2 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_mm_bitext.h @@ -1,83 +1,81 @@ // -*- c++ -*- // don't include this file directly! 
it is included by ug_bitext.h -namespace Moses +namespace sapt { - namespace bitext + template + class mmBitext : public Bitext { - template - class mmBitext : public Bitext - { - void load_document_map(string const& fname); - public: - void open(string const base, string const L1, string L2); - mmBitext(); - }; + void load_document_map(std::string const& fname); + public: + void open(std::string const base, std::string const L1, std::string L2); + mmBitext(); + }; - template - mmBitext:: - mmBitext() - : Bitext(new mmTtrack(), new mmTtrack(), new mmTtrack(), - new TokenIndex(), new TokenIndex(), - new mmTSA(), new mmTSA()) - {}; + template + mmBitext:: + mmBitext() + : Bitext(new mmTtrack(), new mmTtrack(), new mmTtrack(), + new TokenIndex(), new TokenIndex(), + new mmTSA(), new mmTSA()) + {}; - template - void - mmBitext:: - load_document_map(string const& fname) - { - ifstream docmap(fname.c_str()); - // the docmap file should list the documents in the corpus - // in the order in which they appear with one line per document: - // - // - // in the future, we might also allow listing documents with - // sentence ranges. - string buffer,docname; size_t a=0,b; - this->m_sid2docid.reset(new std::vector(this->T1->size())); - while(getline(docmap,buffer)) - { - istringstream line(buffer); - if (!(line>>docname)) continue; // empty line - if (docname.size() && docname[0] == '#') continue; // comment - size_t docid = this->m_docname2docid.size(); - this->m_docname2docid[docname] = docid; - this->m_docname.push_back(docname); - line >> b; + template + void + mmBitext:: + load_document_map(std::string const& fname) + { + std::ifstream docmap(fname.c_str()); + // the docmap file should list the documents in the corpus + // in the order in which they appear with one line per document: + // + // + // in the future, we might also allow listing documents with + // sentence ranges. 
+ std::string buffer,docname; size_t a=0,b; + this->m_sid2docid.reset(new std::vector(this->T1->size())); + while(getline(docmap,buffer)) + { + std::istringstream line(buffer); + if (!(line>>docname)) continue; // empty line + if (docname.size() && docname[0] == '#') continue; // comment + size_t docid = this->m_docname2docid.size(); + this->m_docname2docid[docname] = docid; + this->m_docname.push_back(docname); + line >> b; #ifndef NO_MOSES - VERBOSE(2, "DOCUMENT MAP " << docname << " " << a << "-" << b+a << std::endl); + VERBOSE(3, "DOCUMENT MAP " << docname << " " << a << "-" << b+a << std::endl); #endif - for (b += a; a < b; ++a) - (*this->m_sid2docid)[a] = docid; - } - UTIL_THROW_IF2(b != this->T1->size(), - "Document map doesn't match corpus!"); - } - - template - void - mmBitext:: - open(string const base, string const L1, string L2) - { - mmTtrack& t1 = *reinterpret_cast*>(this->T1.get()); - mmTtrack& t2 = *reinterpret_cast*>(this->T2.get()); - mmTtrack& tx = *reinterpret_cast*>(this->Tx.get()); - t1.open(base+L1+".mct"); - t2.open(base+L2+".mct"); - tx.open(base+L1+"-"+L2+".mam"); - this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex(); - this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex(); - mmTSA& i1 = *reinterpret_cast*>(this->I1.get()); - mmTSA& i2 = *reinterpret_cast*>(this->I2.get()); - i1.open(base+L1+".sfa", this->T1); - i2.open(base+L2+".sfa", this->T2); - assert(this->T1->size() == this->T2->size()); - - string docmapfile = base+"dmp"; - if (!access(docmapfile.c_str(),F_OK)) - load_document_map(docmapfile); - } - + for (b += a; a < b; ++a) + (*this->m_sid2docid)[a] = docid; + } + UTIL_THROW_IF2(b != this->T1->size(), + "Document map doesn't match corpus!"); } + + template + void + mmBitext:: + open(std::string const base, std::string const L1, std::string L2) + { + mmTtrack& t1 = *reinterpret_cast*>(this->T1.get()); + mmTtrack& t2 = *reinterpret_cast*>(this->T2.get()); + mmTtrack& tx = *reinterpret_cast*>(this->Tx.get()); + t1.open(base+L1+".mct"); + t2.open(base+L2+".mct"); + tx.open(base+L1+"-"+L2+".mam"); + this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex(); + this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex(); + mmTSA& i1 = *reinterpret_cast*>(this->I1.get()); + mmTSA& i2 = *reinterpret_cast*>(this->I2.get()); + i1.open(base+L1+".sfa", this->T1); + i2.open(base+L2+".sfa", this->T2); + assert(this->T1->size() == this->T2->size()); + + std::string docmapfile = base+"dmp"; + if (!access(docmapfile.c_str(),F_OK)) + load_document_map(docmapfile); + } + } + diff --git a/moses/TranslationModel/UG/mm/ug_mm_tsa.h b/moses/TranslationModel/UG/mm/ug_mm_tsa.h index e73ff5a71..591e7c59c 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_tsa.h +++ b/moses/TranslationModel/UG/mm/ug_mm_tsa.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #ifndef _ug_mm_tsa_h #define _ug_mm_tsa_h @@ -17,7 +17,7 @@ #include "tpt_pickler.h" #include "ug_tsa_base.h" -namespace ugdiss +namespace sapt { // using namespace std; namespace bio=boost::iostreams; @@ -134,8 +134,8 @@ namespace ugdiss Moses::prime(file); char const* p = file.data(); filepos_type idxOffset; - p = numread(p,idxOffset); - p = numread(p,this->indexSize); + p = tpt::numread(p,idxOffset); + p = tpt::numread(p,this->indexSize); // cerr << fname << ": " << idxOffset << " " << this->indexSize << std::endl; @@ -180,7 +180,7 @@ namespace ugdiss mmTSA:: readSid(char const* p, char const* q, id_type& sid) const { - return tightread(p,q,sid); + return 
tpt::tightread(p,q,sid); } // ====================================================================== @@ -190,7 +190,7 @@ namespace ugdiss mmTSA:: readSid(char const* p, char const* q, ::uint64_t& sid) const { - return tightread(p,q,sid); + return tpt::tightread(p,q,sid); } // ====================================================================== @@ -201,7 +201,7 @@ namespace ugdiss mmTSA:: readOffset(char const* p, char const* q, uint16_t& offset) const { - return tightread(p,q,offset); + return tpt::tightread(p,q,offset); } // ====================================================================== @@ -212,7 +212,7 @@ namespace ugdiss mmTSA:: readOffset(char const* p, char const* q, ::uint64_t& offset) const { - return tightread(p,q,offset); + return tpt::tightread(p,q,offset); } // ====================================================================== @@ -226,8 +226,8 @@ namespace ugdiss size_t ret=0; while (p < q) { - p = tightread(p,q,sid); - p = tightread(p,q,off); + p = tpt::tightread(p,q,sid); + p = tpt::tightread(p,q,off); ret++; } return ret; @@ -246,8 +246,8 @@ namespace ugdiss boost::dynamic_bitset check(this->corpus->size()); while (p < q) { - p = tightread(p,q,sid); - p = tightread(p,q,off); + p = tpt::tightread(p,q,sid); + p = tpt::tightread(p,q,off); check.set(sid); raw++; } diff --git a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h index cb214dd6c..4e0848d6a 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h +++ b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // Memory-mapped corpus track. The corpus (each Token occupying a fixed number // of bytes (must be compatible with the memory alignment in the OS) is stored // as one huge array. 
The "index" maps from sentence IDs to positions within @@ -17,13 +17,14 @@ #include #include "tpt_typedefs.h" +#include "ug_typedefs.h" #include "tpt_tokenindex.h" #include "ug_ttrack_base.h" #include "num_read_write.h" #include "ug_load_primer.h" #include "ug_tsa_base.h" -namespace ugdiss +namespace sapt { // using namespace std; namespace bio=boost::iostreams; @@ -93,12 +94,12 @@ namespace ugdiss bio::mapped_file myfile(fname); assert(myfile.is_open()); Moses::prime(myfile); - filepos_type idxOffset; + tpt::filepos_type idxOffset; const char* p = myfile.data(); id_type numSent,numWords; - p = numread(p,idxOffset); - p = numread(p,numSent); - p = numread(p,numWords); + p = tpt::numread(p,idxOffset); + p = tpt::numread(p,numSent); + p = tpt::numread(p,numWords); data = reinterpret_cast(p); for (size_t i = 0; i < numWords; ++i) data[i] = data[i].remap(f); @@ -179,11 +180,11 @@ namespace ugdiss std::cerr << "Error opening file " << fname << std::endl; assert(0); } - filepos_type idxOffset; + tpt::filepos_type idxOffset; char const* p = file.data(); - p = numread(p,idxOffset); - p = numread(p,this->numSent); - p = numread(p,this->numWords); + p = tpt::numread(p, idxOffset); + p = tpt::numread(p,this->numSent); + p = tpt::numread(p,this->numWords); data = reinterpret_cast(p); index = reinterpret_cast(file.data()+idxOffset); } @@ -214,9 +215,9 @@ namespace ugdiss mmTtrack:: write_blank_file_header(std::ostream& out) const { - numwrite(out,filepos_type(0)); // place holder for index start - numwrite(out,id_type(0)); // place holder for index size - numwrite(out,id_type(0)); // place holder for token count + tpt::numwrite(out,filepos_type(0)); // place holder for index start + tpt::numwrite(out,id_type(0)); // place holder for index size + tpt::numwrite(out,id_type(0)); // place holder for token count } template @@ -227,13 +228,13 @@ namespace ugdiss id_type tokenCount) const { id_type idxSize = idx.size(); - filepos_type idxStart = out.tellp(); + tpt::filepos_type idxStart = out.tellp(); for (size_t i = 0; i < idx.size(); ++i) - numwrite(out,idx[i]); + tpt::numwrite(out,idx[i]); out.seekp(0); - numwrite(out,idxStart); - numwrite(out,idxSize-1); - numwrite(out,tokenCount); + tpt::numwrite(out,idxStart); + tpt::numwrite(out,idxSize-1); + tpt::numwrite(out,tokenCount); } template diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.cc b/moses/TranslationModel/UG/mm/ug_phrasepair.cc index d533dafa3..7ac3a34d8 100644 --- a/moses/TranslationModel/UG/mm/ug_phrasepair.cc +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.cc @@ -1,7 +1,6 @@ #include "ug_phrasepair.h" -namespace Moses { -namespace bitext { +namespace sapt { void fill_lr_vec2 @@ -40,5 +39,5 @@ fill_lr_vec2 } -} // namespace bitext -} // namespace Moses +} // namespace sapt + diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h index 11a7bcb6b..9f6f3ebb7 100644 --- a/moses/TranslationModel/UG/mm/ug_phrasepair.h +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #pragma once #include #include "ug_typedefs.h" @@ -8,344 +8,340 @@ #endif #include "boost/format.hpp" #include "tpt_tokenindex.h" -namespace Moses + +namespace sapt { - namespace bitext + + template + class + PhrasePair { + public: + class Scorer { public: virtual float operator()(PhrasePair& pp) const = 0; }; + Token const* start1; + Token const* start2; + uint32_t len1; + uint32_t len2; + uint64_t p1, p2; + uint32_t raw1, raw2, sample1, 
sample2, good1, good2, joint; + float cum_bias; + std::vector fvals; + float dfwd[Moses::LRModel::NONE+1]; // distortion counts // counts or probs? + float dbwd[Moses::LRModel::NONE+1]; // distortion counts + std::vector aln; + float score; + bool inverse; + // std::vector indoc; + std::map indoc; + PhrasePair() { }; + PhrasePair(PhrasePair const& o); - using ugdiss::TokenIndex; + PhrasePair const& operator+=(PhrasePair const& other); - template - class - PhrasePair - { - public: - class Scorer { public: virtual float operator()(PhrasePair& pp) const = 0; }; - Token const* start1; - Token const* start2; - uint32_t len1; - uint32_t len2; - uint64_t p1, p2; - uint32_t raw1, raw2, sample1, sample2, good1, good2, joint; - float cum_bias; - std::vector fvals; - float dfwd[Moses::LRModel::NONE+1]; // distortion counts // counts or probs? - float dbwd[Moses::LRModel::NONE+1]; // distortion counts - std::vector aln; - float score; - bool inverse; - // std::vector indoc; - std::map indoc; - PhrasePair() { }; - PhrasePair(PhrasePair const& o); + bool operator<(PhrasePair const& other) const; + bool operator>(PhrasePair const& other) const; + bool operator<=(PhrasePair const& other) const; + bool operator>=(PhrasePair const& other) const; - PhrasePair const& operator+=(PhrasePair const& other); + void init(); + void init(uint64_t const pid1, bool is_inverse, + Token const* x, uint32_t const len, + pstats const* ps = NULL, size_t const numfeats=0); - bool operator<(PhrasePair const& other) const; - bool operator>(PhrasePair const& other) const; - bool operator<=(PhrasePair const& other) const; - bool operator>=(PhrasePair const& other) const; + PhrasePair const& + update(uint64_t const pid2, Token const* x, + uint32_t const len, jstats const& js); - void init(); - void init(uint64_t const pid1, bool is_inverse, - Token const* x, uint32_t const len, - pstats const* ps = NULL, size_t const numfeats=0); - - PhrasePair const& - update(uint64_t const pid2, Token const* x, - uint32_t const len, jstats const& js); - - void - fill_lr_vec(LRModel::Direction const& dir, - LRModel::ModelType const& mdl, - std::vector& v) const; + void + fill_lr_vec(LRModel::Direction const& dir, + LRModel::ModelType const& mdl, + std::vector& v) const; #ifndef NO_MOSES - void - print(std::ostream& out, TokenIndex const& V1, TokenIndex const& V2, - LRModel const& LR) const; + void + print(std::ostream& out, TokenIndex const& V1, TokenIndex const& V2, + LRModel const& LR) const; #endif - class SortByTargetIdSeq - { - public: - int cmp(PhrasePair const& a, PhrasePair const& b) const; - bool operator()(PhrasePair const& a, PhrasePair const& b) const; - }; - - class SortDescendingByJointCount - { - public: - int cmp(PhrasePair const& a, PhrasePair const& b) const; - bool operator()(PhrasePair const& a, PhrasePair const& b) const; - }; + class SortByTargetIdSeq + { + public: + int cmp(PhrasePair const& a, PhrasePair const& b) const; + bool operator()(PhrasePair const& a, PhrasePair const& b) const; }; - template - void PhrasePair - ::init(uint64_t const pid1, bool is_inverse, - Token const* x, uint32_t const len, - pstats const* ps, size_t const numfeats) + class SortDescendingByJointCount { - inverse = is_inverse; - start1 = x; len1 = len; - p1 = pid1; - p2 = 0; - if (ps) - { - raw1 = ps->raw_cnt; - sample1 = ps->sample_cnt; - good1 = ps->good; - } - else raw1 = sample1 = good1 = 0; - joint = 0; - good2 = 0; - sample2 = 0; - raw2 = 0; - cum_bias = 0; - fvals.resize(numfeats); - } + public: + int cmp(PhrasePair const& a, PhrasePair 
const& b) const; + bool operator()(PhrasePair const& a, PhrasePair const& b) const; + }; + }; - template - PhrasePair const& - PhrasePair - ::update(uint64_t const pid2, - Token const* x, uint32_t const len, jstats const& js) - { - p2 = pid2; - start2 = x; len2 = len; - raw2 = js.cnt2(); - joint = js.rcnt(); - cum_bias = js.bcnt(); - assert(js.aln().size()); - if (js.aln().size()) - aln = js.aln()[0].second; - // float total_fwd = 0, total_bwd = 0; - // for (int i = 0; i <= Moses::LRModel::NONE; i++) - // { - // PhraseOrientation po = static_cast(i); - // total_fwd += js.dcnt_fwd(po)+1; - // total_bwd += js.dcnt_bwd(po)+1; - // } + template + void PhrasePair + ::init(uint64_t const pid1, bool is_inverse, + Token const* x, uint32_t const len, + pstats const* ps, size_t const numfeats) + { + inverse = is_inverse; + start1 = x; len1 = len; + p1 = pid1; + p2 = 0; + if (ps) + { + raw1 = ps->raw_cnt; + sample1 = ps->sample_cnt; + good1 = ps->good; + } + else raw1 = sample1 = good1 = 0; + joint = 0; + good2 = 0; + sample2 = 0; + raw2 = 0; + cum_bias = 0; + fvals.resize(numfeats); + } - // should we do that here or leave the raw counts? - for (int i = 0; i <= Moses::LRModel::NONE; i++) - { - PhraseOrientation po = static_cast(i); - dfwd[i] = js.dcnt_fwd(po); - dbwd[i] = js.dcnt_bwd(po); - } + template + PhrasePair const& + PhrasePair + ::update(uint64_t const pid2, + Token const* x, uint32_t const len, jstats const& js) + { + p2 = pid2; + start2 = x; len2 = len; + raw2 = js.cnt2(); + joint = js.rcnt(); + cum_bias = js.bcnt(); + assert(js.aln().size()); + if (js.aln().size()) + aln = js.aln()[0].second; + // float total_fwd = 0, total_bwd = 0; + // for (int i = 0; i <= Moses::LRModel::NONE; i++) + // { + // PhraseOrientation po = static_cast(i); + // total_fwd += js.dcnt_fwd(po)+1; + // total_bwd += js.dcnt_bwd(po)+1; + // } - indoc = js.indoc; - return *this; - } + // should we do that here or leave the raw counts? 
+ for (int i = 0; i <= Moses::LRModel::NONE; i++) + { + PhraseOrientation po = static_cast(i); + dfwd[i] = js.dcnt_fwd(po); + dbwd[i] = js.dcnt_bwd(po); + } - template - bool - PhrasePair - ::operator<(PhrasePair const& other) const - { - return this->score < other.score; - } + indoc = js.indoc; + return *this; + } - template - bool - PhrasePair - ::operator>(PhrasePair const& other) const - { - return this->score > other.score; - } + template + bool + PhrasePair + ::operator<(PhrasePair const& other) const + { + return this->score < other.score; + } - template - bool - PhrasePair - ::operator<=(PhrasePair const& other) const - { - return this->score <= other.score; - } + template + bool + PhrasePair + ::operator>(PhrasePair const& other) const + { + return this->score > other.score; + } - template - bool - PhrasePair - ::operator>=(PhrasePair const& other) const - { - return this->score >= other.score; - } + template + bool + PhrasePair + ::operator<=(PhrasePair const& other) const + { + return this->score <= other.score; + } - template - PhrasePair const& - PhrasePair - ::operator+=(PhrasePair const& o) - { - raw1 += o.raw1; - raw2 += o.raw2; - good1 += o.good1; - good2 += o.good2; - joint += o.joint; - sample1 += o.sample1; - sample2 += o.sample2; - cum_bias += o.cum_bias; - // todo: add distortion counts - return *this; - } + template + bool + PhrasePair + ::operator>=(PhrasePair const& other) const + { + return this->score >= other.score; + } - template - PhrasePair - ::PhrasePair(PhrasePair const& o) - : start1(o.start1) , start2(o.start2) - , len1(o.len1) , len2(o.len2) - , p1(o.p1) , p2(o.p2) - , raw1(o.raw1) , raw2(o.raw2) - , sample1(o.sample1) , sample2(o.sample2) - , good1(o.good1) , good2(o.good2) - , joint(o.joint) , cum_bias(o.cum_bias) - , fvals(o.fvals) - , aln(o.aln) - , score(o.score) - , inverse(o.inverse) - , indoc(o.indoc) - { - for (int i = 0; i <= Moses::LRModel::NONE; ++i) - { - dfwd[i] = o.dfwd[i]; - dbwd[i] = o.dbwd[i]; - } - } + template + PhrasePair const& + PhrasePair + ::operator+=(PhrasePair const& o) + { + raw1 += o.raw1; + raw2 += o.raw2; + good1 += o.good1; + good2 += o.good2; + joint += o.joint; + sample1 += o.sample1; + sample2 += o.sample2; + cum_bias += o.cum_bias; + // todo: add distortion counts + return *this; + } - template - int PhrasePair - ::SortByTargetIdSeq - ::cmp(PhrasePair const& a, PhrasePair const& b) const - { - size_t i = 0; - Token const* x = a.start2; - Token const* y = b.start2; - while (i < a.len2 && i < b.len2 && x->id() == y->id()) - { - x = x->next(); - y = y->next(); - ++i; - } - if (i == a.len2 && i == b.len2) return 0; - if (i == a.len2) return -1; - if (i == b.len2) return 1; - return x->id() < y->id() ? 
-1 : 1; - } + template + PhrasePair + ::PhrasePair(PhrasePair const& o) + : start1(o.start1) , start2(o.start2) + , len1(o.len1) , len2(o.len2) + , p1(o.p1) , p2(o.p2) + , raw1(o.raw1) , raw2(o.raw2) + , sample1(o.sample1) , sample2(o.sample2) + , good1(o.good1) , good2(o.good2) + , joint(o.joint) , cum_bias(o.cum_bias) + , fvals(o.fvals) + , aln(o.aln) + , score(o.score) + , inverse(o.inverse) + , indoc(o.indoc) + { + for (int i = 0; i <= Moses::LRModel::NONE; ++i) + { + dfwd[i] = o.dfwd[i]; + dbwd[i] = o.dbwd[i]; + } + } - template - bool PhrasePair - ::SortByTargetIdSeq - ::operator()(PhrasePair const& a, PhrasePair const& b) const - { - return this->cmp(a,b) < 0; - } + template + int PhrasePair + ::SortByTargetIdSeq + ::cmp(PhrasePair const& a, PhrasePair const& b) const + { + size_t i = 0; + Token const* x = a.start2; + Token const* y = b.start2; + while (i < a.len2 && i < b.len2 && x->id() == y->id()) + { + x = x->next(); + y = y->next(); + ++i; + } + if (i == a.len2 && i == b.len2) return 0; + if (i == a.len2) return -1; + if (i == b.len2) return 1; + return x->id() < y->id() ? -1 : 1; + } - template - int PhrasePair - ::SortDescendingByJointCount - ::cmp(PhrasePair const& a, PhrasePair const& b) const - { - if (a.joint == b.joint) return 0; - return a.joint > b.joint ? -1 : 1; - } + template + bool PhrasePair + ::SortByTargetIdSeq + ::operator()(PhrasePair const& a, PhrasePair const& b) const + { + return this->cmp(a,b) < 0; + } - template - bool - PhrasePair - ::SortDescendingByJointCount - ::operator()(PhrasePair const& a, PhrasePair const& b) const - { - return this->cmp(a,b) < 0; - } + template + int PhrasePair + ::SortDescendingByJointCount + ::cmp(PhrasePair const& a, PhrasePair const& b) const + { + if (a.joint == b.joint) return 0; + return a.joint > b.joint ? -1 : 1; + } - template - void - PhrasePair - ::init() - { - inverse = false; - len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0; - start1 = start2 = NULL; - p1 = p2 = 0; - } + template + bool + PhrasePair + ::SortDescendingByJointCount + ::operator()(PhrasePair const& a, PhrasePair const& b) const + { + return this->cmp(a,b) < 0; + } + + template + void + PhrasePair + ::init() + { + inverse = false; + len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0; + start1 = start2 = NULL; + p1 = p2 = 0; + } - void - fill_lr_vec2(LRModel::ModelType mdl, float const* const cnt, - float const total, float* v); + void + fill_lr_vec2(LRModel::ModelType mdl, float const* const cnt, + float const total, float* v); - template - void - PhrasePair - ::fill_lr_vec(LRModel::Direction const& dir, - LRModel::ModelType const& mdl, - std::vector& v) const - { - // how many distinct scores do we have? - size_t num_scores = (mdl == LRModel::MSLR ? 4 : mdl == LRModel::MSD ? 3 : 2); - size_t offset; - if (dir == LRModel::Bidirectional) - { - offset = num_scores; - num_scores *= 2; - } - else offset = 0; + template + void + PhrasePair + ::fill_lr_vec(LRModel::Direction const& dir, + LRModel::ModelType const& mdl, + std::vector& v) const + { + // how many distinct scores do we have? + size_t num_scores = (mdl == LRModel::MSLR ? 4 : mdl == LRModel::MSD ? 
3 : 2); + size_t offset; + if (dir == LRModel::Bidirectional) + { + offset = num_scores; + num_scores *= 2; + } + else offset = 0; - v.resize(num_scores); + v.resize(num_scores); - // determine the denominator - float total = 0; - for (size_t i = 0; i <= LRModel::NONE; ++i) - total += dfwd[i]; + // determine the denominator + float total = 0; + for (size_t i = 0; i <= LRModel::NONE; ++i) + total += dfwd[i]; - if (dir != LRModel::Forward) // i.e., Backward or Bidirectional - fill_lr_vec2(mdl, dbwd, total, &v[0]); - if (dir != LRModel::Backward) // i.e., Forward or Bidirectional - fill_lr_vec2(mdl, dfwd, total, &v[offset]); - } + if (dir != LRModel::Forward) // i.e., Backward or Bidirectional + fill_lr_vec2(mdl, dbwd, total, &v[0]); + if (dir != LRModel::Backward) // i.e., Forward or Bidirectional + fill_lr_vec2(mdl, dfwd, total, &v[offset]); + } #ifndef NO_MOSES - template - void - PhrasePair - ::print(std::ostream& out, TokenIndex const& V1, TokenIndex const& V2, + template + void + PhrasePair + ::print(std::ostream& out, TokenIndex const& V1, TokenIndex const& V2, LRModel const& LR) const - { - out << toString (V1, this->start1, this->len1) << " ::: " - << toString (V2, this->start2, this->len2) << " " - << this->joint << " ["; - // for (size_t i = 0; i < this->indoc.size(); ++i) - for (std::map::const_iterator m = indoc.begin(); - m != indoc.end(); ++m) - { - if (m != indoc.begin()) out << " "; - out << m->first << ":" << m->second; - } - out << "] ["; - std::vector lrscores; - this->fill_lr_vec(LR.GetDirection(), LR.GetModelType(), lrscores); - for (size_t i = 0; i < lrscores.size(); ++i) - { - if (i) out << " "; - out << boost::format("%.2f") % exp(lrscores[i]); - } - out << "]" << std::endl; + { + out << toString (V1, this->start1, this->len1) << " ::: " + << toString (V2, this->start2, this->len2) << " " + << this->joint << " ["; + // for (size_t i = 0; i < this->indoc.size(); ++i) + for (std::map::const_iterator m = indoc.begin(); + m != indoc.end(); ++m) + { + if (m != indoc.begin()) out << " "; + out << m->first << ":" << m->second; + } + out << "] ["; + std::vector lrscores; + this->fill_lr_vec(LR.GetDirection(), LR.GetModelType(), lrscores); + for (size_t i = 0; i < lrscores.size(); ++i) + { + if (i) out << " "; + out << boost::format("%.2f") % exp(lrscores[i]); + } + out << "]" << std::endl; #if 0 - for (int i = 0; i <= Moses::LRModel::NONE; i++) - { - // PhraseOrientation po = static_cast(i); - if (i) *log << " "; - *log << p.dfwd[i]; - } - *log << "] ["; - for (int i = 0; i <= Moses::LRModel::NONE; i++) - { - // PhraseOrientation po = static_cast(i); - if (i) *log << " "; - *log << p.dbwd[i]; - } + for (int i = 0; i <= Moses::LRModel::NONE; i++) + { + // PhraseOrientation po = static_cast(i); + if (i) *log << " "; + *log << p.dfwd[i]; + } + *log << "] ["; + for (int i = 0; i <= Moses::LRModel::NONE; i++) + { + // PhraseOrientation po = static_cast(i); + if (i) *log << " "; + *log << p.dbwd[i]; + } #endif - } + } #endif - } // namespace bitext -} // namespace Moses +} // namespace sapt diff --git a/moses/TranslationModel/UG/mm/ug_prep_phrases.h b/moses/TranslationModel/UG/mm/ug_prep_phrases.h index 25ba4b8f7..1c62db2e3 100644 --- a/moses/TranslationModel/UG/mm/ug_prep_phrases.h +++ b/moses/TranslationModel/UG/mm/ug_prep_phrases.h @@ -8,7 +8,7 @@ #include "ug_lru_cache.h" namespace Moses { -namespace bitext { +namespace sapt { template // , typename BITEXT> struct StatsCollector @@ -75,7 +75,7 @@ struct StatsCollector } } }; -} // end of namespace bitext +} // end of namespace 
sapt } // end of namespace Moses #if 0 diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index c67b2cd09..649e463f5 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -1,3 +1,4 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #include "ug_sampling_bias.h" #include #include @@ -5,237 +6,230 @@ #ifndef NO_MOSES #include "moses/Timer.h" #endif -// #ifdef HAVE_CURLPP -// #include -// #include -// #include -// #endif // #ifdef WITH_MMT_BIAS_CLIENT #include "ug_http_client.h" // #endif -namespace Moses +namespace sapt { - namespace bitext + using tpt::id_type; + + std::string + query_bias_server(std::string const& server, + std::string const& context, + std::ostream* log) { - using ugdiss::id_type; - - std::string - query_bias_server(std::string const& server, - std::string const& context, - std::ostream* log) - { - std::string query = server+uri_encode(context); - boost::asio::io_service io_service; - Moses::http_client c(io_service, query, log); - io_service.run(); + std::string query = Moses::uri_encode(context); + boost::asio::io_service io_service; + Moses::http_client c(io_service, query, log); + io_service.run(); - if (log) - { - std::string response = c.content(); - *log << "SERVER RESPONSE: " << response << std::endl; - } - if (c.content().size() == 0) - { - if (log) *log << "BIAS SERVER ERROR: " << c.error_msg() << std::endl; - // UTIL_THROW_IF2(c.content().size() == 0, "No response from bias server!"); - } - return c.content(); - } - // #endif - - SamplingBias:: - SamplingBias(std::vector const* sid2doc) - : m_sid2docid(sid2doc) - { } - - int - SamplingBias:: - GetClass(id_type const idx) const - { - return m_sid2docid ? m_sid2docid->at(idx) : -1; - } - - DocumentBias:: - DocumentBias(std::vector const& sid2doc, - std::map const& docname2docid, - std::string const& server_url, std::string const& text, - std::ostream* _log) - : SamplingBias(&sid2doc) - // , m_bias(docname2docid.size(), 0) - { - this->log = _log; -#ifndef NO_MOSES - Timer timer; - if (_log) timer.start(NULL); -#endif - std::string json = query_bias_server(server_url, text, _log); - - // std::cerr << "SERVER RESPONSE " << json << std::endl; - init_from_json(json, docname2docid, log); -#ifndef NO_MOSES - if (_log) *_log << "Bias query took " << timer << " seconds." 
<< std::endl; -#endif - } - - DocumentBias:: - DocumentBias(std::vector const& sid2doc, - std::map const& docname2docid, - std::map const& context_weights, - std::ostream* _log) - : SamplingBias(&sid2doc) - // , m_bias(docname2docid.size(), 0) - { - this->log = _log; - init(context_weights, docname2docid); - } - - std::map& SamplingBias::getBiasMap() { - return m_bias_map; - } - - void - DocumentBias:: - init_from_json - ( std::string const& json, std::map const& docname2docid, - std::ostream* log) - { // poor man's special purpose json parser for responses from the - // MMT bias server - - std::string d; float total = 0; std::map bias; - size_t i = 0; while (i < json.size() && json[i] != '"') ++i; - while (++i < json.size()) - { - size_t k = i; while (i < json.size() && json[i] != '"') ++i; - if (i >= json.size()) break; - float& f = bias[json.substr(k,i-k)]; - while (++i < json.size() && json[i] != ':'); - k = ++i; - while (++i < json.size() && json[i] != ',' && json[i] != '}'); - total += (f = atof(json.substr(k, i-k).c_str())); - k = ++i; while (i < json.size() && json[i] != '"') ++i; - } - - typedef std::pair item; - if (total) { BOOST_FOREACH(item& x, bias) { x.second /= total; } } - if (log) - { - BOOST_FOREACH(item& x, bias) - { - std::map::const_iterator m; - m = docname2docid.find(x.first); - int docid = m != docname2docid.end() ? m->second : -1; - *log << "CONTEXT SERVER RESPONSE " - << "[" << docid << "] " - << x.first << " " << x.second << std::endl; - } - } - init(bias, docname2docid); - - // using xmlrpc_parse_json didn't always work (parser errors) - // xmlrpc_value* b = xmlrpc_parse_json(env ,buf.str().c_str()); - // std::cerr << "|" << buf.str() << "|" << std::endl; - // // if (b == NULL) std::cerr << "OOpS" << std::endl; - // xmlrpc_c::value_struct v(b); // = *b; - // std::map const - // bmap = static_cast >(v); - // std::map bias; - // typedef std::map::value_type item; - // float total = 0; - // BOOST_FOREACH(item const& x, bmap) - // { - // total += bias[x.first] = xmlrpc_c::value_double(x.second); - // } - // typedef std::map::value_type fitem; - // BOOST_FOREACH(fitem const& x, bias) - // std::cerr << x.first << " " << x.second/total << std::endl; - // // delete b; - } - - void - DocumentBias:: - init(std::map const& biasmap, - std::map const& docname2docid) - { - typedef std::map::value_type bias_record; - float total = 0; - BOOST_FOREACH(bias_record const& b, biasmap) - { - std::map::const_iterator m = docname2docid.find(b.first); - if (m != docname2docid.end()) - total += (m_bias[m->second] = b.second); - } - if (total) - { - typedef std::map::value_type item; - BOOST_FOREACH(item& i, m_bias) i.second /= total; - } - - if (log) - { - BOOST_FOREACH(bias_record const& b, biasmap) - { - std::map::const_iterator m = docname2docid.find(b.first); - if (m != docname2docid.end()) - *log << "BIAS " << b.first << " " << m_bias[m->second] << std::endl; - else - *log << "WARNING: bias reported for unknown document " << b.first << std::endl; - } - } - } - - float - DocumentBias:: - operator[](id_type const idx) const - { - // UTIL_THROW_IF2(idx >= m_sid2docid->size(), "Out of bounds: " - // << idx << "/" << m_sid2docid->size()); - std::map::const_iterator m = m_bias.find((*m_sid2docid)[idx]); - return m != m_bias.end() ? 
m->second : 0; - } - - size_t - DocumentBias:: - size() const - { return m_sid2docid->size(); } - - - - SentenceBias:: - SentenceBias(std::vector const& bias, - std::vector const* sid2doc) - : SamplingBias(sid2doc) - , m_bias(bias) - { } - - SentenceBias:: - SentenceBias(size_t const s, float const f, - std::vector const* sid2doc) - - : SamplingBias(sid2doc) - , m_bias(s,f) - { } - - float& - SentenceBias:: - operator[](id_type const idx) - { - UTIL_THROW_IF2(idx >= m_bias.size(), "Out of bounds"); - return m_bias[idx]; - } - - float - SentenceBias:: - operator[](id_type const idx) const - { - UTIL_THROW_IF2(idx >= m_bias.size(), "Out of bounds"); - return m_bias[idx]; - } - - size_t - SentenceBias:: - size() const { return m_bias.size(); } - + if (log) + { + std::string response = c.content(); + *log << "SERVER RESPONSE: " << response << std::endl; + } + if (c.content().size() == 0) + { + if (log) *log << "BIAS SERVER ERROR: " << c.error_msg() << std::endl; + // UTIL_THROW_IF2(c.content().size() == 0, "No response from bias server!"); + } + return c.content(); } + // #endif + + SamplingBias:: + SamplingBias(std::vector const* sid2doc) + : m_sid2docid(sid2doc) + { } + + int + SamplingBias:: + GetClass(id_type const idx) const + { + return m_sid2docid ? m_sid2docid->at(idx) : -1; + } + + DocumentBias:: + DocumentBias(std::vector const& sid2doc, + std::map const& docname2docid, + std::string const& server_url, std::string const& text, + std::ostream* _log) + : SamplingBias(&sid2doc) + // , m_bias(docname2docid.size(), 0) + { + this->log = _log; +#ifndef NO_MOSES + Moses::Timer timer; + if (_log) timer.start(NULL); +#endif + std::string json = query_bias_server(server_url, text, _log); + + // std::cerr << "SERVER RESPONSE " << json << std::endl; + init_from_json(json, docname2docid, log); +#ifndef NO_MOSES + if (_log) *_log << "Bias query took " << timer << " seconds." << std::endl; +#endif + } + + DocumentBias:: + DocumentBias(std::vector const& sid2doc, + std::map const& docname2docid, + std::map const& context_weights, + std::ostream* _log) + : SamplingBias(&sid2doc) + // , m_bias(docname2docid.size(), 0) + { + this->log = _log; + init(context_weights, docname2docid); + } + + std::map& SamplingBias::getBiasMap() { + return m_bias_map; + } + + void + DocumentBias:: + init_from_json + ( std::string const& json, std::map const& docname2docid, + std::ostream* log) + { // poor man's special purpose json parser for responses from the + // MMT bias server + + std::string d; float total = 0; std::map bias; + size_t i = 0; while (i < json.size() && json[i] != '"') ++i; + while (++i < json.size()) + { + size_t k = i; while (i < json.size() && json[i] != '"') ++i; + if (i >= json.size()) break; + float& f = bias[json.substr(k,i-k)]; + while (++i < json.size() && json[i] != ':'); + k = ++i; + while (++i < json.size() && json[i] != ',' && json[i] != '}'); + total += (f = atof(json.substr(k, i-k).c_str())); + k = ++i; while (i < json.size() && json[i] != '"') ++i; + } + + typedef std::pair item; + if (total) { BOOST_FOREACH(item& x, bias) { x.second /= total; } } + // if (log) + // { + // BOOST_FOREACH(item& x, bias) + // { + // std::map::const_iterator m; + // m = docname2docid.find(x.first); + // int docid = m != docname2docid.end() ? 
m->second : -1; + // *log << "CONTEXT SERVER RESPONSE " + // << "[" << docid << "] " + // << x.first << " " << x.second << std::endl; + // } + // } + init(bias, docname2docid); + + // using xmlrpc_parse_json didn't always work (parser errors) + // xmlrpc_value* b = xmlrpc_parse_json(env ,buf.str().c_str()); + // std::cerr << "|" << buf.str() << "|" << std::endl; + // // if (b == NULL) std::cerr << "OOpS" << std::endl; + // xmlrpc_c::value_struct v(b); // = *b; + // std::map const + // bmap = static_cast >(v); + // std::map bias; + // typedef std::map::value_type item; + // float total = 0; + // BOOST_FOREACH(item const& x, bmap) + // { + // total += bias[x.first] = xmlrpc_c::value_double(x.second); + // } + // typedef std::map::value_type fitem; + // BOOST_FOREACH(fitem const& x, bias) + // std::cerr << x.first << " " << x.second/total << std::endl; + // // delete b; + } + + void + DocumentBias:: + init(std::map const& biasmap, + std::map const& docname2docid) + { + typedef std::map::value_type bias_record; + float total = 0; + BOOST_FOREACH(bias_record const& b, biasmap) + { + std::map::const_iterator m = docname2docid.find(b.first); + if (m != docname2docid.end()) + total += (m_bias[m->second] = b.second); + } + if (total) + { + typedef std::map::value_type item; + BOOST_FOREACH(item& i, m_bias) i.second /= total; + } + + if (log) + { + BOOST_FOREACH(bias_record const& b, biasmap) + { + std::map::const_iterator m = docname2docid.find(b.first); + if (m != docname2docid.end()) + *log << "BIAS " << b.first << " " << m_bias[m->second] << std::endl; + else + *log << "WARNING: bias reported for unknown document " << b.first << std::endl; + } + } + } + + float + DocumentBias:: + operator[](id_type const idx) const + { + // UTIL_THROW_IF2(idx >= m_sid2docid->size(), "Out of bounds: " + // << idx << "/" << m_sid2docid->size()); + std::map::const_iterator m = m_bias.find((*m_sid2docid)[idx]); + return m != m_bias.end() ? 
m->second : 0; + } + + size_t + DocumentBias:: + size() const + { return m_sid2docid->size(); } + + + + SentenceBias:: + SentenceBias(std::vector const& bias, + std::vector const* sid2doc) + : SamplingBias(sid2doc) + , m_bias(bias) + { } + + SentenceBias:: + SentenceBias(size_t const s, float const f, + std::vector const* sid2doc) + + : SamplingBias(sid2doc) + , m_bias(s,f) + { } + + float& + SentenceBias:: + operator[](id_type const idx) + { + UTIL_THROW_IF2(idx >= m_bias.size(), "Out of bounds"); + return m_bias[idx]; + } + + float + SentenceBias:: + operator[](id_type const idx) const + { + UTIL_THROW_IF2(idx >= m_bias.size(), "Out of bounds"); + return m_bias[idx]; + } + + size_t + SentenceBias:: + size() const { return m_bias.size(); } + } + diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h index bbdadc62f..3f8fd1fed 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #pragma once #include @@ -7,104 +7,102 @@ #include // #include "moses/Util.h" #include "ug_typedefs.h" -namespace Moses +namespace sapt { - namespace bitext + using tpt::id_type; + + std::string + query_bias_server(std::string const& url, + std::string const& text, + std::ostream* log); + + class SamplingBias { - using ugdiss::id_type; + protected: + std::vector const* m_sid2docid; + public: + SamplingBias(std::vector const* sid2docid); + int loglevel; + std::ostream* log; + // Map to store the biasmap as you get it from the server: + std::map m_bias_map; + std::map& getBiasMap(); + virtual float + operator[](id_type const ID) const = 0; + // returns (unnormalized bias) for the class of item ID - std::string - query_bias_server(std::string const& url, - std::string const& text, - std::ostream* log); + virtual size_t size() const = 0; + // number of classes - class SamplingBias - { - protected: - std::vector const* m_sid2docid; - public: - SamplingBias(std::vector const* sid2docid); - int loglevel; - std::ostream* log; - // Map to store the biasmap as you get it from the server: - std::map m_bias_map; - std::map& getBiasMap(); - virtual float - operator[](id_type const ID) const = 0; - // returns (unnormalized bias) for the class of item ID + virtual int + GetClass(id_type const ID) const; + // returns class/document/domain id of item ID + }; - virtual size_t size() const = 0; - // number of classes + class + DocumentBias : public SamplingBias + { + // std::vector m_bias; + std::map m_bias; + public: - virtual int - GetClass(id_type const ID) const; - // returns class/document/domain id of item ID - }; + DocumentBias(std::vector const& sid2doc, + std::map const& docname2docid, + std::string const& server_url, + std::string const& text, + std::ostream* log); - class - DocumentBias : public SamplingBias - { - // std::vector m_bias; - std::map m_bias; - public: + DocumentBias(std::vector const& sid2doc, + std::map const& docname2docid, + std::map const& context_weights, + std::ostream* log); - DocumentBias(std::vector const& sid2doc, - std::map const& docname2docid, - std::string const& server_url, - std::string const& text, - std::ostream* log); + void + init_from_json + ( std::string const& json, + std::map const& docname2docid, + std::ostream* log ); - DocumentBias(std::vector const& sid2doc, - std::map const& docname2docid, - std::map const& context_weights, - std::ostream* log); + void + init + ( std::map 
const& biasmap, + std::map const& docname2docid); - void - init_from_json - ( std::string const& json, - std::map const& docname2docid, - std::ostream* log ); + float + operator[](id_type const idx) const; - void - init - ( std::map const& biasmap, - std::map const& docname2docid); + size_t + size() const; + }; - float - operator[](id_type const idx) const; + class + SentenceBias : public SamplingBias + { + std::vector m_bias; + public: + SentenceBias(std::vector const& bias, + std::vector const* sid2docid = NULL); - size_t - size() const; - }; + SentenceBias(size_t const s, float const f = 0, + std::vector const* sid2docid = NULL); - class - SentenceBias : public SamplingBias - { - std::vector m_bias; - public: - SentenceBias(std::vector const& bias, - std::vector const* sid2docid = NULL); + float& operator[](id_type const idx); + float operator[](id_type const idx) const; + size_t size() const; - SentenceBias(size_t const s, float const f = 0, - std::vector const* sid2docid = NULL); + }; - float& operator[](id_type const idx); - float operator[](id_type const idx) const; - size_t size() const; - - }; - - class - SamplingBiasAlways : public SamplingBias - { - public: - SamplingBiasAlways(std::vector const* sid2docid) - : SamplingBias(sid2docid) {} + class + SamplingBiasAlways : public SamplingBias + { + public: + SamplingBiasAlways(std::vector const* sid2docid) + : SamplingBias(sid2docid) {} - float operator[](id_type const idx) { return 1; } - float operator[](id_type const idx) const { return 1; } - size_t size() const { return 0; } - }; + float operator[](id_type const idx) { return 1; } + float operator[](id_type const idx) const { return 1; } + size_t size() const { return 0; } + }; - } } + diff --git a/moses/TranslationModel/UG/mm/ug_tsa_array_entry.cc b/moses/TranslationModel/UG/mm/ug_tsa_array_entry.cc index 2eb3f7a1f..39bb22f1c 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_array_entry.cc +++ b/moses/TranslationModel/UG/mm/ug_tsa_array_entry.cc @@ -1,10 +1,11 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #include "ug_tsa_array_entry.h" #include "ug_ttrack_position.h" #include "moses/TranslationModel/UG/generic/sampling/Sampling.h" // (c) 2007-2010 Ulrich Germann -namespace ugdiss +namespace sapt { namespace tsa { diff --git a/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h b/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h index 3af929644..34cf09143 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_array_entry.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // (c) 2007-2010 Ulrich Germann // implementation of stuff related to ArrayEntries // this file should only be included via ug_tsa_base.h, @@ -7,7 +7,7 @@ #define __ug_tsa_array_entry_h #include "ug_ttrack_position.h" -namespace ugdiss +namespace sapt { namespace tsa { @@ -33,51 +33,6 @@ namespace ugdiss S->readEntry(p,*this); } - // template - // class SamplingArrayEntryIterator - // : public tsa::ArrayEntry - // { - // size_t const N; // (approximate) total number of occurrences - // size_t const samplesize; // how many samples to chose from the range - // size_t const sampled; // how many occurrences we've looked at so far - // size_t const chosen; // how many we have chosen - // TSA_TYPE const* root; // the underlying TSA - // char const* stop; // end of the range - // public: - // SamplingArrayEntryIterator(TSA_TYPE::tree_iterator const& m, size_t const s); - // bool step(); // returns false when at end 
of range - // bool done(); // - // }; - - // template - // SamplingArrayEntryIterator:: - // SamplingArrayEntryIterator(typename TSA_TYPE::tree_iterator const& m, size_t const s) - // : ArrayEntry(m.lower_bound(-1)) - // , N(m.approxOccurrenceCount()) - // , samplesize(min(s,N)) - // , sampled(0) - // , chosen(0) - // , root(m.root) - // , stop(m.upper_bound(-1)) - // { } - - // template - // bool - // SamplingArrayEntryIterator:: - // step() - // { - // while (chosen < samplesize && next < stop) - // { - // root->readEntry(next,*this); - // if (util::rand_excl(N - sampled++) < samplesize - chosen) - // { - // ++chosen; - // return true; - // } - // } - // return false; - // } - } // end of namespace tsa -} // end of namespace ugdiss +} // end of namespace sapt #endif diff --git a/moses/TranslationModel/UG/mm/ug_tsa_base.h b/moses/TranslationModel/UG/mm/ug_tsa_base.h index 37597348e..4f72c75ba 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_base.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_base.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // Base class for Token Sequence Arrays // (c) 2007-2010 Ulrich Germann. All rights reserved. #ifndef _ug_tsa_base_h @@ -18,7 +18,7 @@ #include "ug_tsa_bitset_cache.h" #include "ug_typedefs.h" -namespace ugdiss +namespace sapt { // using namespace std; diff --git a/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h b/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h index ec8a499b2..486302c19 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // (c) 2010 Ulrich Germann. All rights reserved. #ifndef __ug_tsa_bitset_cache_h @@ -16,7 +16,7 @@ // size of the range of entries in the TSA's index in bytes to determine // whether or not to store the respective bit std::vector in the cache. -namespace ugdiss +namespace sapt { // using namespace std; template diff --git a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h index 634342a23..e05454096 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // (c) 2007 - 2010 Ulrich Germann. All rights reserved. #ifndef __ug_tsa_tree_iterator_h #define __ug_tsa_tree_iterator_h @@ -14,7 +14,7 @@ // #include "ug_bv_iter.h" -namespace ugdiss +namespace sapt { #ifndef _DISPLAY_CHAIN diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_base.cc b/moses/TranslationModel/UG/mm/ug_ttrack_base.cc index e754539f6..c9e383370 100644 --- a/moses/TranslationModel/UG/mm/ug_ttrack_base.cc +++ b/moses/TranslationModel/UG/mm/ug_ttrack_base.cc @@ -1,3 +1,4 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // Memory-mapped corpus track // (c) Ulrich Germann. All rights reserved @@ -6,7 +7,7 @@ #include "ug_mm_ttrack.h" #include "tpt_pickler.h" -namespace ugdiss +namespace sapt { using namespace std; diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_base.h b/moses/TranslationModel/UG/mm/ug_ttrack_base.h index fbbc131ad..b3275b1ea 100644 --- a/moses/TranslationModel/UG/mm/ug_ttrack_base.h +++ b/moses/TranslationModel/UG/mm/ug_ttrack_base.h @@ -1,9 +1,11 @@ -// -*- c++ -*- -// Base class for corpus tracks. mmTtrack (memory-mapped Ttrack) and imTtrack (in-memory Ttrack) -// are derived from this class. 
+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- + +// Base class for corpus tracks. mmTtrack (memory-mapped Ttrack) and +// imTtrack (in-memory Ttrack) are derived from this class. + +// This code is part of a refactorization of the earlier Ttrack class +// as a template class for tokens of arbitrary fixed-length size. -// This code is part of a refactorization of the earlier Ttrack class as a template class for -// tokens of arbitrary fixed-length size. // (c) 2007-2009 Ulrich Germann. All rights reserved. #ifndef __ug_ttrack_base @@ -20,11 +22,12 @@ #include "moses/Util.h" // #include "ug_vocab.h" -namespace ugdiss +namespace sapt { // using namespace std; typedef boost::dynamic_bitset bdBitset; + using tpt::count_type; size_t len_from_pid(uint64_t pid); diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_position.cc b/moses/TranslationModel/UG/mm/ug_ttrack_position.cc index d542eabb3..81b94a4ae 100644 --- a/moses/TranslationModel/UG/mm/ug_ttrack_position.cc +++ b/moses/TranslationModel/UG/mm/ug_ttrack_position.cc @@ -1,9 +1,12 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #include "ug_ttrack_position.h" -namespace ugdiss +namespace sapt { namespace ttrack { + using tpt::id_type; Position::Position() : sid(0), offset(0) {}; Position::Position(id_type _sid, ushort _off) : sid(_sid), offset(_off) {}; + } } diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_position.h b/moses/TranslationModel/UG/mm/ug_ttrack_position.h index 09eb1508f..7353fc11f 100644 --- a/moses/TranslationModel/UG/mm/ug_ttrack_position.h +++ b/moses/TranslationModel/UG/mm/ug_ttrack_position.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #ifndef __ug_ttrack_position_h #define __ug_ttrack_position_h @@ -10,8 +10,9 @@ // // (c) 2007-2010 Ulrich Germann. All rights reserved. 
-namespace ugdiss +namespace sapt { + using tpt::id_type; namespace ttrack { /** Represents a position in a corpus (sentence Id + offset from beginning diff --git a/moses/TranslationModel/UG/mm/ug_typedefs.h b/moses/TranslationModel/UG/mm/ug_typedefs.h index 1c35825a0..c726d94ec 100644 --- a/moses/TranslationModel/UG/mm/ug_typedefs.h +++ b/moses/TranslationModel/UG/mm/ug_typedefs.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // typedefs for Uli Germann's stuff #ifndef __ug_typedefs_h #define __ug_typedefs_h @@ -8,7 +8,7 @@ #include #include #include "tpt_typedefs.h" -namespace ugdiss +namespace sapt { // using namespace std; typedef boost::dynamic_bitset bitvector; @@ -28,6 +28,10 @@ namespace ugdiss typedef std::vector > int_2d_table; typedef std::vector int_3d_table; typedef std::vector int_4d_table; + + typedef tpt::id_type id_type; + typedef tpt::uchar uchar; + typedef tpt::filepos_type filepos_type; } #define SPTR boost::shared_ptr diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 29808d674..4dd0a6561 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -1,9 +1,4 @@ // -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- -// #ifdef HAVE_CURLPP -// #include -// #include -// #include -// #endif #include "mmsapt.h" #include @@ -18,7 +13,7 @@ namespace Moses { - using namespace bitext; + using namespace sapt; using namespace std; using namespace boost; @@ -121,6 +116,19 @@ namespace Moses bool Mmsapt::isLogVal(int i) const { return m_is_logval.at(i); } bool Mmsapt::isInteger(int i) const { return m_is_integer.at(i); } + void + Mmsapt:: + parse_factor_spec(std::vector& flist, std::string const key) + { + pair dflt(key, "0"); + string factors = this->param.insert(dflt).first->second; + size_t p = 0, q = factors.find(','); + for (; q < factors.size(); q = factors.find(',', p=q+1)) + flist.push_back(atoi(factors.substr(p, q-p).c_str())); + flist.push_back(atoi(factors.substr(p).c_str())); + } + + void Mmsapt::init(string const& line) { map::const_iterator m; @@ -151,20 +159,10 @@ namespace Moses UTIL_THROW_IF2(L2.size() == 0, "Missing L2 tag at " << HERE); // set defaults for all parameters if not specified so far - pair dflt("input-factor","0"); - string ifactors = param.insert(dflt).first->second; - size_t p = 0; - for (size_t q = ifactors.find(','); q < ifactors.size(); q = ifactors.find(',', p=q+1)) - m_ifactor.push_back(atoi(ifactors.substr(p, q-p).c_str())); - m_ifactor.push_back(atoi(ifactors.substr(p).c_str())); - - dflt = pair ("output-factor","0"); - string ofactors = param.insert(dflt).first->second; - for (size_t q = ofactors.find(',', p=0); q < ifactors.size(); q = ifactors.find(',', p=q+1)) - m_ofactor.push_back(atoi(ifactors.substr(p, q-p).c_str())); - m_ofactor.push_back(atoi(ofactors.substr(p).c_str())); + parse_factor_spec(m_ifactor,"input-factor"); + parse_factor_spec(m_ofactor,"output-factor"); - dflt = pair ("smooth",".01"); + pair dflt = pair ("smooth",".01"); m_lbop_conf = atof(param.insert(dflt).first->second.c_str()); dflt = pair ("lexalpha","0"); @@ -173,10 +171,11 @@ namespace Moses dflt = pair ("sample","1000"); m_default_sample_size = atoi(param.insert(dflt).first->second.c_str()); - dflt = pair("workers","8"); + dflt = pair("workers","0"); m_workers = atoi(param.insert(dflt).first->second.c_str()); - m_workers = min(m_workers,24UL); - + if (m_workers == 0) m_workers = boost::thread::hardware_concurrency(); + else m_workers = 
min(m_workers,size_t(boost::thread::hardware_concurrency())); + dflt = pair("bias-loglevel","0"); m_bias_loglevel = atoi(param.insert(dflt).first->second.c_str()); diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index afda5618c..7139e4fd2 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -47,24 +47,22 @@ namespace Moses { - using namespace bitext; class Mmsapt #ifndef NO_MOSES : public PhraseDictionary #endif { - // using namespace std; class TPCOllCache; friend class Alignment; std::map param; std::string m_name; public: - typedef L2R_Token Token; - typedef mmBitext mmbitext; - typedef imBitext imbitext; - typedef Bitext bitext; - typedef TSA tsa; - typedef PhraseScorer pscorer; + typedef sapt::L2R_Token Token; + typedef sapt::mmBitext mmbitext; + typedef sapt::imBitext imbitext; + typedef sapt::Bitext bitext; + typedef sapt::TSA tsa; + typedef sapt::PhraseScorer pscorer; private: // vector > shards; iptr btfix; @@ -85,21 +83,21 @@ namespace Moses int m_bias_loglevel; LexicalReordering* m_lr_func; // associated lexical reordering function string m_lr_func_name; // name of associated lexical reordering function - sampling_method m_sampling_method; // sampling method, see ug_bitext_sampler + sapt::sampling_method m_sampling_method; // sampling method, see ug_bitext_sampler boost::scoped_ptr m_thread_pool; public: void* const bias_key; // for getting bias from ttask void* const cache_key; // for getting cache from ttask void* const context_key; // for context scope from ttask private: - boost::shared_ptr m_bias; // for global default bias + boost::shared_ptr m_bias; // for global default bias boost::shared_ptr m_cache; // for global default bias size_t m_cache_size; // // size_t input_factor; // // size_t output_factor; // we can actually return entire Tokens! 
- std::vector m_input_factor; - std::vector m_output_factor; + // std::vector m_input_factor; + // std::vector m_output_factor; // for display for human inspection (ttable dumps): @@ -112,6 +110,9 @@ namespace Moses std::vector > m_active_ff_common; // activated feature functions (dyn) + void + parse_factor_spec(std::vector& flist, std::string const key); + void register_ff(SPTR const& ff, std::vector > & registry); @@ -150,7 +151,7 @@ namespace Moses #if PROVIDES_RANKED_SAMPLING void - set_bias_for_ranking(ttasksptr const& ttask, iptr const> bt); + set_bias_for_ranking(ttasksptr const& ttask, iptr const> bt); #endif private: @@ -159,39 +160,39 @@ namespace Moses // phrase table feature weights for alignment: std::vector feature_weights; - std::vector > wlex21; + std::vector > wlex21; // word translation lexicon (without counts, get these from calc_lex.COOC) - typedef ugdiss::mm2dTable mm2dtable_t; + typedef sapt::mm2dTable mm2dtable_t; mm2dtable_t COOCraw; TargetPhrase* mkTPhrase(ttasksptr const& ttask, Phrase const& src, - Moses::bitext::PhrasePair* fix, - Moses::bitext::PhrasePair* dyn, - SPTR > const& dynbt) const; + sapt::PhrasePair* fix, + sapt::PhrasePair* dyn, + SPTR > const& dynbt) const; void process_pstats (Phrase const& src, uint64_t const pid1, - pstats const& stats, - Bitext const & bt, + sapt::pstats const& stats, + sapt::Bitext const & bt, TargetPhraseCollection* tpcoll ) const; bool pool_pstats (Phrase const& src, - uint64_t const pid1a, pstats * statsa, Bitext const & bta, - uint64_t const pid1b, pstats const* statsb, Bitext const & btb, + uint64_t const pid1a, sapt::pstats * statsa, sapt::Bitext const & bta, + uint64_t const pid1b, sapt::pstats const* statsb, sapt::Bitext const & btb, TargetPhraseCollection* tpcoll) const; bool combine_pstats (Phrase const& src, - uint64_t const pid1a, pstats* statsa, Bitext const & bta, - uint64_t const pid1b, pstats const* statsb, Bitext const & btb, + uint64_t const pid1a, sapt::pstats* statsa, sapt::Bitext const & bta, + uint64_t const pid1b, sapt::pstats const* statsb, sapt::Bitext const & btb, TargetPhraseCollection* tpcoll) const; void load_extra_data(std::string bname, bool locking); @@ -255,10 +256,10 @@ namespace Moses std::vector const& GetFeatureNames() const; - SPTR + SPTR setupDocumentBias(std::map const& bias) const; - vector DefaultWeights() const; + std::vector DefaultWeights() const; }; } // end namespace diff --git a/moses/TranslationModel/UG/mmsapt_align.cc b/moses/TranslationModel/UG/mmsapt_align.cc index ea00a6ef9..dc7a6af12 100644 --- a/moses/TranslationModel/UG/mmsapt_align.cc +++ b/moses/TranslationModel/UG/mmsapt_align.cc @@ -3,7 +3,7 @@ // namespace Moses // { -// using namespace bitext; +// using namespace sapt; // using namespace std; // using namespace boost; diff --git a/moses/TranslationModel/UG/ptable-describe-features.cc b/moses/TranslationModel/UG/ptable-describe-features.cc index c9dd3abd1..77f36f30d 100644 --- a/moses/TranslationModel/UG/ptable-describe-features.cc +++ b/moses/TranslationModel/UG/ptable-describe-features.cc @@ -1,3 +1,4 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #include "mmsapt.h" #include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h" #include @@ -8,7 +9,7 @@ #include using namespace Moses; -using namespace bitext; +using namespace sapt; using namespace std; using namespace boost; diff --git a/moses/TranslationModel/UG/ptable-lookup.cc b/moses/TranslationModel/UG/ptable-lookup.cc index 81f5b5191..cccc5b857 100644 --- 
a/moses/TranslationModel/UG/ptable-lookup.cc +++ b/moses/TranslationModel/UG/ptable-lookup.cc @@ -9,7 +9,7 @@ #include using namespace Moses; -using namespace bitext; +using namespace sapt; using namespace std; using namespace boost; diff --git a/moses/TranslationModel/UG/sapt_phrase_scorers.h b/moses/TranslationModel/UG/sapt_phrase_scorers.h index fa2425c14..7fee0568d 100644 --- a/moses/TranslationModel/UG/sapt_phrase_scorers.h +++ b/moses/TranslationModel/UG/sapt_phrase_scorers.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // Phrase scoring functions for suffix array-based phrase tables // written by Ulrich Germann #pragma once diff --git a/moses/TranslationModel/UG/sapt_pscore_base.h b/moses/TranslationModel/UG/sapt_pscore_base.h index 8e8e3852d..1d509dc40 100644 --- a/moses/TranslationModel/UG/sapt_pscore_base.h +++ b/moses/TranslationModel/UG/sapt_pscore_base.h @@ -1,4 +1,4 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // Base classes for suffix array-based phrase scorers // written by Ulrich Germann #pragma once @@ -6,8 +6,8 @@ #include "util/exception.hh" #include "boost/format.hpp" -namespace Moses { - namespace bitext +// namespace Moses { + namespace sapt { // abstract base class that defines the common API for phrase scorers @@ -18,18 +18,16 @@ namespace Moses { protected: int m_index; int m_num_feats; - string m_tag; - vector m_feature_names; + std::string m_tag; + std::vector m_feature_names; public: virtual ~PhraseScorer() {} - virtual - void - operator()(Bitext const& pt, - PhrasePair& pp, - vector * dest=NULL) - const = 0; + + virtual void + operator()(Bitext const& pt, PhrasePair& pp, + std::vector * dest=NULL) const = 0; void setIndex(int const i) { m_index = i; } @@ -40,10 +38,10 @@ namespace Moses { int fcnt() const { return m_num_feats; } - vector const & + std::vector const & fnames() const { return m_feature_names; } - string const & + std::string const & fname(int i) const { if (i < 0) i += m_num_feats; @@ -81,13 +79,12 @@ namespace Moses { : public PhraseScorer { protected: - vector m_x; + std::vector m_x; virtual void - init(string const specs) + init(std::string const specs) { - using namespace boost; UTIL_THROW_IF2(this->m_tag.size() == 0, "m_tag must be initialized in constructor"); UTIL_THROW_IF2(specs.size() == 0,"empty specification string!"); @@ -95,14 +92,14 @@ namespace Moses { "PhraseScorer can only be initialized once!"); this->m_index = -1; float x; char c; - for (istringstream buf(specs); buf>>x; buf>>c) + for (std::istringstream buf(specs); buf>>x; buf>>c) { this->m_x.push_back(x); - string fname = (format("%s-%.2f") % this->m_tag % x).str(); + std::string fname = (boost::format("%s-%.2f") % this->m_tag % x).str(); this->m_feature_names.push_back(fname); } this->m_num_feats = this->m_x.size(); } }; - } // namespace bitext -} // namespace moses + } // namespace sapt +// } // namespace moses diff --git a/moses/TranslationModel/UG/sapt_pscore_coherence.h b/moses/TranslationModel/UG/sapt_pscore_coherence.h index c201c9651..a3c13fb5b 100644 --- a/moses/TranslationModel/UG/sapt_pscore_coherence.h +++ b/moses/TranslationModel/UG/sapt_pscore_coherence.h @@ -5,29 +5,28 @@ #include "util/exception.hh" #include "boost/format.hpp" -namespace Moses { - namespace bitext +namespace sapt +{ + template + class + PScoreCoherence : public PhraseScorer { - template - class - PScoreCoherence : public PhraseScorer + public: + PScoreCoherence(std::string const dummy) { - public: - 
PScoreCoherence(string const dummy) - { - this->m_index = -1; - this->m_num_feats = 1; - this->m_feature_names.push_back(string("coherence")); - } + this->m_index = -1; + this->m_num_feats = 1; + this->m_feature_names.push_back(std::string("coherence")); + } - void - operator()(Bitext const& bt, - PhrasePair& pp, - vector * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - (*dest)[this->m_index] = log(pp.good1) - log(pp.sample1); - } - }; - } + void + operator()(Bitext const& bt, + PhrasePair& pp, + std::vector * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + (*dest)[this->m_index] = log(pp.good1) - log(pp.sample1); + } + }; } + diff --git a/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h b/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h index 0dff728d7..25b81e56a 100644 --- a/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h +++ b/moses/TranslationModel/UG/sapt_pscore_cumulative_bias.h @@ -5,33 +5,31 @@ #include "sapt_pscore_base.h" #include -using namespace std; -namespace Moses { - namespace bitext { - - template - class - PScoreCumBias : public PhraseScorer +namespace sapt { + + template + class + PScoreCumBias : public PhraseScorer + { + public: + PScoreCumBias(std::string const spec) { - public: - PScoreCumBias(string const spec) - { - this->m_index = -1; - this->m_feature_names.push_back("cumb"); - this->m_num_feats = this->m_feature_names.size(); - } + this->m_index = -1; + this->m_feature_names.push_back("cumb"); + this->m_num_feats = this->m_feature_names.size(); + } - bool - isIntegerValued(int i) const { return false; } + bool + isIntegerValued(int i) const { return false; } + + void + operator()(Bitext const& bt, + PhrasePair& pp, + std::vector * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + (*dest)[this->m_index] = log(pp.cum_bias); + } + }; +} // namespace sapt - void - operator()(Bitext const& bt, - PhrasePair& pp, - vector * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - (*dest)[this->m_index] = log(pp.cum_bias); - } - }; - } // namespace bitext -} // namespace Moses diff --git a/moses/TranslationModel/UG/sapt_pscore_length_ratio.h b/moses/TranslationModel/UG/sapt_pscore_length_ratio.h index 8bf7a07bf..356217caa 100644 --- a/moses/TranslationModel/UG/sapt_pscore_length_ratio.h +++ b/moses/TranslationModel/UG/sapt_pscore_length_ratio.h @@ -12,59 +12,58 @@ #include #include #include "mm/ug_ttrack_base.h" -using namespace std; -namespace Moses { - namespace bitext { - - - // // return the probability that a phrase length ratio is as extrem as - // // or more extreme as alen:blen. Based on a binomial experiment with - // // (alen + blen) trials and the probability of producing ratio L1 tokens per - // // L2 token - // float - // length_ratio_prob(float const alen, float const blen, float const ratio) - // { - // if (alen + blen == 0) return 1; - // float p = 1./(1 + ratio); - // boost::math::binomial bino(alen+blen,p); - // if (blen/(alen+blen) < p) - // return cdf(bino,blen); - // else - // return cdf(complement(bino,blen - 1)); - // } - template - class - PScoreLengthRatio : public PhraseScorer +namespace sapt { + + + // // return the probability that a phrase length ratio is as extrem as + // // or more extreme as alen:blen. 
Based on a binomial experiment with + // // (alen + blen) trials and the probability of producing ratio L1 tokens per + // // L2 token + // float + // length_ratio_prob(float const alen, float const blen, float const ratio) + // { + // if (alen + blen == 0) return 1; + // float p = 1./(1 + ratio); + // boost::math::binomial bino(alen+blen,p); + // if (blen/(alen+blen) < p) + // return cdf(bino,blen); + // else + // return cdf(complement(bino,blen - 1)); + // } + + template + class + PScoreLengthRatio : public PhraseScorer + { + public: + PScoreLengthRatio(std::string const& spec) { - public: - PScoreLengthRatio(std::string const& spec) - { - this->m_feature_names.push_back("lenrat"); - this->m_num_feats = this->m_feature_names.size(); - } + this->m_feature_names.push_back("lenrat"); + this->m_num_feats = this->m_feature_names.size(); + } - bool - isIntegerValued(int i) const { return false; } + bool + isIntegerValued(int i) const { return false; } - void - operator()(Bitext const& bt, - PhrasePair& pp, - vector * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - float p = float(bt.T1->numTokens()); - p /= bt.T1->numTokens() + bt.T2->numTokens(); - float len1 = ugdiss::len_from_pid(pp.p1); - float len2 = ugdiss::len_from_pid(pp.p2); + void + operator()(Bitext const& bt, + PhrasePair& pp, + std::vector * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + float p = float(bt.T1->numTokens()); + p /= bt.T1->numTokens() + bt.T2->numTokens(); + float len1 = sapt::len_from_pid(pp.p1); + float len2 = sapt::len_from_pid(pp.p2); - boost::math::binomial binomi(len1 + len2, p); - float& x = (*dest)[this->m_index]; - if (len2/(len1 + len2) < p) - x = log(boost::math::cdf(binomi,len2)); - else - x = log(boost::math::cdf(boost::math::complement(binomi,len2 - 1))); - } - }; - } // namespace bitext -} // namespace Moses + boost::math::binomial binomi(len1 + len2, p); + float& x = (*dest)[this->m_index]; + if (len2/(len1 + len2) < p) + x = log(boost::math::cdf(binomi,len2)); + else + x = log(boost::math::cdf(boost::math::complement(binomi,len2 - 1))); + } + }; +} // namespace sapt + diff --git a/moses/TranslationModel/UG/sapt_pscore_lex1.h b/moses/TranslationModel/UG/sapt_pscore_lex1.h index eb3b20c71..4ae94502b 100644 --- a/moses/TranslationModel/UG/sapt_pscore_lex1.h +++ b/moses/TranslationModel/UG/sapt_pscore_lex1.h @@ -6,72 +6,70 @@ #include "sapt_pscore_base.h" #include -namespace Moses { - namespace bitext +namespace sapt +{ + template + class + PScoreLex1 : public PhraseScorer { - template - class - PScoreLex1 : public PhraseScorer + float m_alpha; + std::string m_lexfile; + public: + sapt::LexicalPhraseScorer2 scorer; + + PScoreLex1(std::string const& alphaspec, std::string const& lexfile) { - float m_alpha; - string m_lexfile; - public: - ugdiss::LexicalPhraseScorer2 scorer; + this->m_index = -1; + this->m_num_feats = 2; + this->m_feature_names.reserve(2); + this->m_feature_names.push_back("lexfwd"); + this->m_feature_names.push_back("lexbwd"); + m_alpha = atof(alphaspec.c_str()); + m_lexfile = lexfile; + } - PScoreLex1(string const& alphaspec, string const& lexfile) - { - this->m_index = -1; - this->m_num_feats = 2; - this->m_feature_names.reserve(2); - this->m_feature_names.push_back("lexfwd"); - this->m_feature_names.push_back("lexbwd"); - m_alpha = atof(alphaspec.c_str()); - m_lexfile = lexfile; - } + void + load() + { + scorer.open(m_lexfile); + } - void - load() - { - scorer.open(m_lexfile); - } - - void - operator()(Bitext const& bt, - PhrasePair& pp, - vector * dest = NULL) const 
- { - if (!dest) dest = &pp.fvals; - // uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0; - // parse_pid(pp.p1, sid1, off1, len1); - // parse_pid(pp.p2, sid2, off2, len2); + void + operator()(Bitext const& bt, + PhrasePair& pp, + std::vector * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + // uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0; + // parse_pid(pp.p1, sid1, off1, len1); + // parse_pid(pp.p2, sid2, off2, len2); #if 0 - cout << len1 << " " << len2 << endl; - Token const* t1 = bt.T1->sntStart(sid1); - for (size_t i = off1; i < off1 + len1; ++i) - cout << (*bt.V1)[t1[i].id()] << " "; - cout << __FILE__ << ":" << __LINE__ << endl; + cout << len1 << " " << len2 << endl; + Token const* t1 = bt.T1->sntStart(sid1); + for (size_t i = off1; i < off1 + len1; ++i) + cout << (*bt.V1)[t1[i].id()] << " "; + cout << __FILE__ << ":" << __LINE__ << endl; - Token const* t2 = bt.T2->sntStart(sid2); - for (size_t i = off2; i < off2 + len2; ++i) - cout << (*bt.V2)[t2[i].id()] << " "; - cout << __FILE__ << ":" << __LINE__ << endl; + Token const* t2 = bt.T2->sntStart(sid2); + for (size_t i = off2; i < off2 + len2; ++i) + cout << (*bt.V2)[t2[i].id()] << " "; + cout << __FILE__ << ":" << __LINE__ << endl; - BOOST_FOREACH (int a, pp.aln) - cout << a << " " ; - cout << __FILE__ << ":" << __LINE__ << "\n" << endl; + BOOST_FOREACH (int a, pp.aln) + cout << a << " " ; + cout << __FILE__ << ":" << __LINE__ << "\n" << endl; - scorer.score(bt.T1->sntStart(sid1)+off1,0,len1, - bt.T2->sntStart(sid2)+off2,0,len2, - pp.aln, m_alpha, - (*dest)[this->m_index], - (*dest)[this->m_index+1]); + scorer.score(bt.T1->sntStart(sid1)+off1,0,len1, + bt.T2->sntStart(sid2)+off2,0,len2, + pp.aln, m_alpha, + (*dest)[this->m_index], + (*dest)[this->m_index+1]); #endif - scorer.score(pp.start1,0, pp.len1, - pp.start2,0, pp.len2, pp.aln, m_alpha, - (*dest)[this->m_index], - (*dest)[this->m_index+1]); - } - }; - } //namespace bitext -} // namespace Moses + scorer.score(pp.start1,0, pp.len1, + pp.start2,0, pp.len2, pp.aln, m_alpha, + (*dest)[this->m_index], + (*dest)[this->m_index+1]); + } + }; +} //namespace sapt diff --git a/moses/TranslationModel/UG/sapt_pscore_logcnt.h b/moses/TranslationModel/UG/sapt_pscore_logcnt.h index 9dc5ac7ba..d079a0af8 100644 --- a/moses/TranslationModel/UG/sapt_pscore_logcnt.h +++ b/moses/TranslationModel/UG/sapt_pscore_logcnt.h @@ -7,59 +7,56 @@ #include "sapt_pscore_base.h" #include -using namespace std; -namespace Moses { - namespace bitext { +namespace sapt { - template - class - PScoreLogCnt : public PhraseScorer + template + class + PScoreLogCnt : public PhraseScorer + { + std::string m_specs; + public: + PScoreLogCnt(std::string const specs) { - string m_specs; - public: - PScoreLogCnt(string const specs) - { - this->m_index = -1; - this->m_specs = specs; - if (specs.find("r1") != string::npos) // raw source phrase counts - this->m_feature_names.push_back("log-r1"); - if (specs.find("s1") != string::npos) - this->m_feature_names.push_back("log-s1"); // L1 sample size - if (specs.find("g1") != string::npos) // coherent phrases - this->m_feature_names.push_back("log-g1"); - if (specs.find("j") != string::npos) // joint counts - this->m_feature_names.push_back("log-j"); - if (specs.find("r2") != string::npos) // raw target phrase counts - this->m_feature_names.push_back("log-r2"); - this->m_num_feats = this->m_feature_names.size(); - } + this->m_index = -1; + this->m_specs = specs; + if (specs.find("r1") != std::string::npos) // raw source phrase counts + 
this->m_feature_names.push_back("log-r1"); + if (specs.find("s1") != std::string::npos) + this->m_feature_names.push_back("log-s1"); // L1 sample size + if (specs.find("g1") != std::string::npos) // coherent phrases + this->m_feature_names.push_back("log-g1"); + if (specs.find("j") != std::string::npos) // joint counts + this->m_feature_names.push_back("log-j"); + if (specs.find("r2") != std::string::npos) // raw target phrase counts + this->m_feature_names.push_back("log-r2"); + this->m_num_feats = this->m_feature_names.size(); + } - bool - isIntegerValued(int i) const { return true; } + bool + isIntegerValued(int i) const { return true; } - void - operator()(Bitext const& bt, - PhrasePair& pp, - vector * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - assert(pp.raw1); - assert(pp.sample1); - assert(pp.good1); - assert(pp.joint); - assert(pp.raw2); - size_t i = this->m_index; - if (m_specs.find("r1") != string::npos) - (*dest)[i++] = log(pp.raw1); - if (m_specs.find("s1") != string::npos) - (*dest)[i++] = log(pp.sample1); - if (m_specs.find("g1") != string::npos) - (*dest)[i++] = log(pp.good1); - if (m_specs.find("j") != string::npos) - (*dest)[i++] = log(pp.joint); - if (m_specs.find("r2") != string::npos) - (*dest)[++i] = log(pp.raw2); - } - }; - } // namespace bitext -} // namespace Moses + void + operator()(Bitext const& bt, + PhrasePair& pp, + std::vector * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + assert(pp.raw1); + assert(pp.sample1); + assert(pp.good1); + assert(pp.joint); + assert(pp.raw2); + size_t i = this->m_index; + if (m_specs.find("r1") != std::string::npos) + (*dest)[i++] = log(pp.raw1); + if (m_specs.find("s1") != std::string::npos) + (*dest)[i++] = log(pp.sample1); + if (m_specs.find("g1") != std::string::npos) + (*dest)[i++] = log(pp.good1); + if (m_specs.find("j") != std::string::npos) + (*dest)[i++] = log(pp.joint); + if (m_specs.find("r2") != std::string::npos) + (*dest)[++i] = log(pp.raw2); + } + }; +} // namespace sapt diff --git a/moses/TranslationModel/UG/sapt_pscore_pbwd.h b/moses/TranslationModel/UG/sapt_pscore_pbwd.h index 81a97f59f..91f0aaad6 100644 --- a/moses/TranslationModel/UG/sapt_pscore_pbwd.h +++ b/moses/TranslationModel/UG/sapt_pscore_pbwd.h @@ -6,54 +6,53 @@ #include "boost/format.hpp" #include "boost/foreach.hpp" -namespace Moses { - namespace bitext +namespace sapt +{ + template + class + PScorePbwd : public PhraseScorer { - template - class - PScorePbwd : public PhraseScorer + float conf; + std::string denom; + + public: + virtual ~PScorePbwd(){}; + PScorePbwd(float const c, std::string d) { - float conf; - string denom; + this->m_index = -1; + conf = c; + denom = d; + size_t checksum = d.size(); + BOOST_FOREACH(char const& x, denom) + { + if (x == '+') { --checksum; continue; } + if (x != 'g' && x != 's' && x != 'r') continue; + std::string s = (boost::format("pbwd-%c%.3f") % x % c).str(); + this->m_feature_names.push_back(s); + } + this->m_num_feats = this->m_feature_names.size(); + UTIL_THROW_IF2(this->m_feature_names.size() != checksum, + "Unknown parameter in specification '" + << d << "' for Pbwd phrase scorer at " << HERE); + } - public: - virtual ~PScorePbwd(){}; - PScorePbwd(float const c, string d) - { - this->m_index = -1; - conf = c; - denom = d; - size_t checksum = d.size(); - BOOST_FOREACH(char const& x, denom) - { - if (x == '+') { --checksum; continue; } - if (x != 'g' && x != 's' && x != 'r') continue; - string s = (boost::format("pbwd-%c%.3f") % x % c).str(); - this->m_feature_names.push_back(s); - } - 
this->m_num_feats = this->m_feature_names.size(); - UTIL_THROW_IF2(this->m_feature_names.size() != checksum, - "Unknown parameter in specification '" - << d << "' for Pbwd phrase scorer at " << HERE); - } + void + operator()(Bitext const& bt, + PhrasePair& pp, + std::vector * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + // we use the denominator specification to scale the raw counts on the + // target side; the clean way would be to counter-sample + size_t i = this->m_index; + BOOST_FOREACH(char const& x, denom) + { + uint32_t m2 = pp.raw2; + if (x == 'g') m2 = round(m2 * float(pp.good1) / pp.raw1); + else if (x == 's') m2 = round(m2 * float(pp.sample1) / pp.raw1); + (*dest)[i++] = log(lbop(std::max(m2, pp.joint), pp.joint,conf)); + } + } + }; +} // namespace sapt - void - operator()(Bitext const& bt, - PhrasePair& pp, - vector * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - // we use the denominator specification to scale the raw counts on the - // target side; the clean way would be to counter-sample - size_t i = this->m_index; - BOOST_FOREACH(char const& x, denom) - { - uint32_t m2 = pp.raw2; - if (x == 'g') m2 = round(m2 * float(pp.good1) / pp.raw1); - else if (x == 's') m2 = round(m2 * float(pp.sample1) / pp.raw1); - (*dest)[i++] = log(lbop(max(m2, pp.joint),pp.joint,conf)); - } - } - }; - } // namespace bitext -} // namespace Moses diff --git a/moses/TranslationModel/UG/sapt_pscore_pfwd.h b/moses/TranslationModel/UG/sapt_pscore_pfwd.h index 23f7a2abd..b42bb464e 100644 --- a/moses/TranslationModel/UG/sapt_pscore_pfwd.h +++ b/moses/TranslationModel/UG/sapt_pscore_pfwd.h @@ -6,66 +6,65 @@ #include "boost/format.hpp" #include "boost/foreach.hpp" -namespace Moses { - namespace bitext +namespace sapt +{ + template + class + PScorePfwd : public PhraseScorer { - template - class - PScorePfwd : public PhraseScorer + float conf; + std::string denom; + + public: + + virtual ~PScorePfwd(){}; + PScorePfwd(float const c, std::string d) { - float conf; - string denom; + this->m_index = -1; + conf = c; + denom = d; + size_t checksum = d.size(); + BOOST_FOREACH(char const& x, denom) + { + if (x == '+') { --checksum; continue; } + if (x != 'g' && x != 's' && x != 'r') continue; + std::string s = (boost::format("pfwd-%c%.3f") % x % c).str(); + this->m_feature_names.push_back(s); + } + this->m_num_feats = this->m_feature_names.size(); + UTIL_THROW_IF2(this->m_feature_names.size() != checksum, + "Unknown parameter in specification '" + << d << "' for Pfwd phrase scorer at " << HERE); + } - public: - - virtual ~PScorePfwd(){}; - PScorePfwd(float const c, string d) - { - this->m_index = -1; - conf = c; - denom = d; - size_t checksum = d.size(); - BOOST_FOREACH(char const& x, denom) - { - if (x == '+') { --checksum; continue; } - if (x != 'g' && x != 's' && x != 'r') continue; - string s = (boost::format("pfwd-%c%.3f") % x % c).str(); - this->m_feature_names.push_back(s); - } - this->m_num_feats = this->m_feature_names.size(); - UTIL_THROW_IF2(this->m_feature_names.size() != checksum, - "Unknown parameter in specification '" - << d << "' for Pfwd phrase scorer at " << HERE); - } - - void - operator()(Bitext const& bt, PhrasePair & pp, - vector * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - if (pp.joint > pp.good1) - { - pp.joint = pp.good1; - // cerr<m_index; - BOOST_FOREACH(char const& c, this->denom) - { - switch (c) - { - case 'g': - (*dest)[i++] = log(lbop(pp.good1, pp.joint, conf)); - break; - case 's': - (*dest)[i++] = log(lbop(pp.sample1, pp.joint, conf)); - break; - 
case 'r': - (*dest)[i++] = log(lbop(pp.raw1, pp.joint, conf)); - } - } - } - }; - } + void + operator()(Bitext const& bt, PhrasePair & pp, + std::vector * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + if (pp.joint > pp.good1) + { + pp.joint = pp.good1; + // cerr<m_index; + BOOST_FOREACH(char const& c, this->denom) + { + switch (c) + { + case 'g': + (*dest)[i++] = log(lbop(pp.good1, pp.joint, conf)); + break; + case 's': + (*dest)[i++] = log(lbop(pp.sample1, pp.joint, conf)); + break; + case 'r': + (*dest)[i++] = log(lbop(pp.raw1, pp.joint, conf)); + } + } + } + }; } + diff --git a/moses/TranslationModel/UG/sapt_pscore_phrasecount.h b/moses/TranslationModel/UG/sapt_pscore_phrasecount.h index e0ce40117..a1426426a 100644 --- a/moses/TranslationModel/UG/sapt_pscore_phrasecount.h +++ b/moses/TranslationModel/UG/sapt_pscore_phrasecount.h @@ -6,29 +6,28 @@ #include "boost/format.hpp" #include "sapt_pscore_base.h" -namespace Moses { - namespace bitext +namespace sapt +{ + template + class + PScorePC : public PhraseScorer { - template - class - PScorePC : public PhraseScorer + public: + PScorePC(std::string const dummy) { - public: - PScorePC(string const dummy) - { - this->m_index = -1; - this->m_num_feats = 1; - this->m_feature_names.push_back(string("phrasecount")); - } + this->m_index = -1; + this->m_num_feats = 1; + this->m_feature_names.push_back(std::string("phrasecount")); + } - void - operator()(Bitext const& bt, - PhrasePair& pp, - vector * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - (*dest)[this->m_index] = 1; - } - }; - } + void + operator()(Bitext const& bt, + PhrasePair& pp, + std::vector * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + (*dest)[this->m_index] = 1; + } + }; } + diff --git a/moses/TranslationModel/UG/sapt_pscore_provenance.h b/moses/TranslationModel/UG/sapt_pscore_provenance.h index 388ee75ec..67ee74850 100644 --- a/moses/TranslationModel/UG/sapt_pscore_provenance.h +++ b/moses/TranslationModel/UG/sapt_pscore_provenance.h @@ -7,42 +7,40 @@ #include "sapt_pscore_base.h" #include -using namespace std; -namespace Moses { - namespace bitext { +namespace sapt { - // asymptotic provenance feature n/(n+x) - template - class - PScoreProvenance : public SingleRealValuedParameterPhraseScorerFamily + // asymptotic provenance feature n/(n+x) + template + class + PScoreProvenance : public SingleRealValuedParameterPhraseScorerFamily + { + public: + + virtual ~PScoreProvenance() {} + PScoreProvenance(std::string const& spec) { - public: + this->m_tag = "prov"; + this->init(spec); + } - virtual ~PScoreProvenance() {} - PScoreProvenance(string const& spec) - { - this->m_tag = "prov"; - this->init(spec); - } + bool + isLogVal(int i) const { return false; } - bool - isLogVal(int i) const { return false; } + void + operator()(Bitext const& bt, + PhrasePair& pp, + std::vector * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + size_t i = this->m_index; + BOOST_FOREACH(float const x, this->m_x) + (*dest).at(i++) = pp.joint/(x + pp.joint); + } - void - operator()(Bitext const& bt, - PhrasePair& pp, - vector * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - size_t i = this->m_index; - BOOST_FOREACH(float const x, this->m_x) - (*dest).at(i++) = pp.joint/(x + pp.joint); - } + bool + allowPooling() const + { return false; } - bool - allowPooling() const - { return false; } + }; +} // namespace sapt - }; - } // namespace bitext -} // namespace Moses diff --git a/moses/TranslationModel/UG/sapt_pscore_rareness.h 
b/moses/TranslationModel/UG/sapt_pscore_rareness.h index 34979243c..c36da1913 100644 --- a/moses/TranslationModel/UG/sapt_pscore_rareness.h +++ b/moses/TranslationModel/UG/sapt_pscore_rareness.h @@ -7,35 +7,32 @@ #include "sapt_pscore_base.h" #include -using namespace std; -namespace Moses { - namespace bitext { +namespace sapt { - // rareness penalty: x/(n+x) - template - class - PScoreRareness : public SingleRealValuedParameterPhraseScorerFamily + // rareness penalty: x/(n+x) + template + class + PScoreRareness : public SingleRealValuedParameterPhraseScorerFamily + { + public: + PScoreRareness(std::string const spec) { - public: - PScoreRareness(string const spec) - { - this->m_tag = "rare"; - this->init(spec); - } + this->m_tag = "rare"; + this->init(spec); + } - bool - isLogVal(int i) const { return false; } + bool + isLogVal(int i) const { return false; } - void - operator()(Bitext const& bt, - PhrasePair& pp, - vector * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - size_t i = this->m_index; - BOOST_FOREACH(float const x, this->m_x) - (*dest).at(i++) = x/(x + pp.joint); - } - }; - } // namespace bitext -} // namespace Moses + void + operator()(Bitext const& bt, + PhrasePair& pp, + std::vector * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + size_t i = this->m_index; + BOOST_FOREACH(float const x, this->m_x) + (*dest).at(i++) = x/(x + pp.joint); + } + }; +} // namespace sapt diff --git a/moses/TranslationModel/UG/sapt_pscore_unaligned.h b/moses/TranslationModel/UG/sapt_pscore_unaligned.h index 3388c6b7d..4201b839c 100644 --- a/moses/TranslationModel/UG/sapt_pscore_unaligned.h +++ b/moses/TranslationModel/UG/sapt_pscore_unaligned.h @@ -1,67 +1,66 @@ -// -*- c++ -*- +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- // Phrase scorer that counts the number of unaligend words in the phrase // written by Ulrich Germann #include "sapt_pscore_base.h" #include +#include -namespace Moses { - namespace bitext +namespace sapt +{ + template + class + PScoreUnaligned : public PhraseScorer { - template - class - PScoreUnaligned : public PhraseScorer + typedef boost::dynamic_bitset bitvector; + public: + PScoreUnaligned(std::string const spec) { - typedef boost::dynamic_bitset bitvector; - public: - PScoreUnaligned(string const spec) - { - this->m_index = -1; - int f = this->m_num_feats = atoi(spec.c_str()); - UTIL_THROW_IF2(f != 1 && f != 2,"unal parameter must be 1 or 2 at "<m_feature_names.resize(f); - if (f == 1) - this->m_feature_names[0] = "unal"; - else - { - this->m_feature_names[0] = "unal-s"; - this->m_feature_names[1] = "unal-t"; - } - } + this->m_index = -1; + int f = this->m_num_feats = atoi(spec.c_str()); + UTIL_THROW_IF2(f != 1 && f != 2,"unal parameter must be 1 or 2 at "<m_feature_names.resize(f); + if (f == 1) + this->m_feature_names[0] = "unal"; + else + { + this->m_feature_names[0] = "unal-s"; + this->m_feature_names[1] = "unal-t"; + } + } - bool - isLogVal(int i) const { return false; } + bool + isLogVal(int i) const { return false; } - bool - isIntegerValued(int i) const { return true; } + bool + isIntegerValued(int i) const { return true; } - void - operator()(Bitext const& bt, - PhrasePair& pp, - vector * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - // uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0; - // parse_pid(pp.p1, sid1, off1, len1); - // parse_pid(pp.p2, sid2, off2, len2); - bitvector check1(pp.len1),check2(pp.len2); - for (size_t i = 0; i < pp.aln.size(); ) - { - check1.set(pp.aln[i++]); - check2.set(pp.aln.at(i++)); - } + void 
+ operator()(Bitext const& bt, + PhrasePair& pp, + std::vector * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + // uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0; + // parse_pid(pp.p1, sid1, off1, len1); + // parse_pid(pp.p2, sid2, off2, len2); + bitvector check1(pp.len1),check2(pp.len2); + for (size_t i = 0; i < pp.aln.size(); ) + { + check1.set(pp.aln[i++]); + check2.set(pp.aln.at(i++)); + } - if (this->m_num_feats == 1) - { - (*dest)[this->m_index] = pp.len1 - check1.count(); - (*dest)[this->m_index] += pp.len2 - check2.count(); - } - else - { - (*dest)[this->m_index] = pp.len1 - check1.count(); - (*dest)[this->m_index+1] = pp.len2 - check2.count(); - } - } - }; - } // namespace bitext -} // namespace Moses + if (this->m_num_feats == 1) + { + (*dest)[this->m_index] = pp.len1 - check1.count(); + (*dest)[this->m_index] += pp.len2 - check2.count(); + } + else + { + (*dest)[this->m_index] = pp.len1 - check1.count(); + (*dest)[this->m_index+1] = pp.len2 - check2.count(); + } + } + }; +} // namespace sapt diff --git a/moses/TranslationModel/UG/sapt_pscore_wordcount.h b/moses/TranslationModel/UG/sapt_pscore_wordcount.h index a5000be37..6cd9e7c0c 100644 --- a/moses/TranslationModel/UG/sapt_pscore_wordcount.h +++ b/moses/TranslationModel/UG/sapt_pscore_wordcount.h @@ -6,29 +6,28 @@ #include "boost/format.hpp" #include "sapt_pscore_base.h" -namespace Moses { - namespace bitext +namespace sapt +{ + template + class + PScoreWC : public PhraseScorer { - template - class - PScoreWC : public PhraseScorer + public: + PScoreWC(std::string const dummy) { - public: - PScoreWC(string const dummy) - { - this->m_index = -1; - this->m_num_feats = 1; - this->m_feature_names.push_back(string("wordcount")); - } + this->m_index = -1; + this->m_num_feats = 1; + this->m_feature_names.push_back(std::string("wordcount")); + } - void - operator()(Bitext const& bt, - PhrasePair& pp, - vector * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - (*dest)[this->m_index] = pp.len2; - } - }; - } + void + operator()(Bitext const& bt, + PhrasePair& pp, + std::vector * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + (*dest)[this->m_index] = pp.len2; + } + }; } + diff --git a/moses/TranslationModel/UG/sim-pe.cc b/moses/TranslationModel/UG/sim-pe.cc index 00a705936..a302be3da 100644 --- a/moses/TranslationModel/UG/sim-pe.cc +++ b/moses/TranslationModel/UG/sim-pe.cc @@ -9,7 +9,7 @@ #include using namespace Moses; -using namespace bitext; +using namespace sapt; using namespace std; using namespace boost; diff --git a/moses/TranslationModel/UG/spe-check-coverage.cc b/moses/TranslationModel/UG/spe-check-coverage.cc index 378dd800f..351832fa5 100644 --- a/moses/TranslationModel/UG/spe-check-coverage.cc +++ b/moses/TranslationModel/UG/spe-check-coverage.cc @@ -12,7 +12,7 @@ #include using namespace Moses; -using namespace bitext; +using namespace sapt; using namespace std; using namespace boost; using namespace boost::algorithm; diff --git a/moses/TranslationModel/UG/spe-check-coverage2.cc b/moses/TranslationModel/UG/spe-check-coverage2.cc index 3b4f559d2..699039f72 100644 --- a/moses/TranslationModel/UG/spe-check-coverage2.cc +++ b/moses/TranslationModel/UG/spe-check-coverage2.cc @@ -9,7 +9,7 @@ #include using namespace Moses; -using namespace bitext; +using namespace sapt; using namespace std; using namespace boost; diff --git a/moses/TranslationModel/UG/spe-check-coverage3.cc b/moses/TranslationModel/UG/spe-check-coverage3.cc index 62f078044..63add8b15 100644 --- 
a/moses/TranslationModel/UG/spe-check-coverage3.cc +++ b/moses/TranslationModel/UG/spe-check-coverage3.cc @@ -10,7 +10,7 @@ #include using namespace Moses; -using namespace bitext; +using namespace sapt; using namespace std; using namespace boost; diff --git a/moses/TranslationModel/UG/try-align.cc b/moses/TranslationModel/UG/try-align.cc index c40fd2388..91c5406f8 100644 --- a/moses/TranslationModel/UG/try-align.cc +++ b/moses/TranslationModel/UG/try-align.cc @@ -3,6 +3,7 @@ using namespace std; using namespace Moses; using namespace ugdiss; +using namespace sapt; typedef L2R_Token Token; typedef mmTtrack ttrack_t; @@ -16,7 +17,8 @@ float lbop_level = .05; #define smooth 1 namespace stats { - using namespace Moses::bitext; + using namespace Moses; + using namespace sapt; float pmi(size_t j,size_t m1, size_t m2, size_t N) { @@ -63,7 +65,7 @@ struct SinglePhrase }; -struct PhrasePair +struct PPair { struct score_t; uint64_t p1,p2; @@ -115,12 +117,12 @@ struct PhrasePair } } stats; - PhrasePair(ushort s1_=0, ushort e1_=0, ushort s2_=0, ushort e2_=0) + PPair(ushort s1_=0, ushort e1_=0, ushort s2_=0, ushort e2_=0) : s1(s1_), e1(e1_), s2(s2_), e2(e2_), parent(-1) { } bool - operator<(PhrasePair const& other) const + operator<(PPair const& other) const { return (this->stats.score == other.stats.score ? (e1-s1 + e2-s2 > other.e1-other.s1 + other.e2-other.s2) @@ -129,7 +131,7 @@ struct PhrasePair size_t len1() const { return e1 - s1; } size_t len2() const { return e2 - s2; } - bool includes(PhrasePair const& o) const + bool includes(PPair const& o) const { return s1 <= o.s1 && e1 >= o.e1 && s2 <= o.s2 && e2 >= o.e2; } @@ -137,7 +139,7 @@ struct PhrasePair }; SinglePhrase::cache_t cache1,cache2; -PhrasePair::stats_t::cache_t ppcache; +PPair::stats_t::cache_t ppcache; struct SortByPositionInCorpus @@ -256,18 +258,18 @@ int main(int argc, char* argv[]) lookup_phrases(snt1,V1,*T1,I1,cache1,M1); lookup_phrases(snt2,V2,*T2,I2,cache2,M2); - vector pp_all,pp_good; + vector pp_all, pp_good; vector a1(snt1.size(),-1); vector a2(snt2.size(),-1); vector > z1(snt1.size(),vector(snt1.size(),-1)); vector > z2(snt2.size(),vector(snt2.size(),-1)); - vector > >ppm1(M1.size()),ppm2(M2.size()); + vector > >ppm1(M1.size()),ppm2(M2.size()); vector > M(snt1.size(), vector(snt2.size(),0)); vector > best1(snt1.size()), best2(snt2.size()); for (size_t i1 = 0; i1 < M1.size(); ++i1) { - PhrasePair pp; + PPair pp; pp.s1 = i1; ppm1[i1].resize(M1[i1].size()); for (size_t i2 = 0; i2 < M2.size(); ++i2) @@ -282,11 +284,11 @@ int main(int argc, char* argv[]) for (size_t k2 = 0; k2 < M2[i2].size(); ++k2) { pp.e2 = i2 + k2 + 1; - SPTR & s + SPTR & s = ppcache[make_pair(M1[i1][k1]->pid,M2[i2][k2]->pid)]; if (!s) { - s.reset(new PhrasePair::stats_t()); + s.reset(new PPair::stats_t()); s->set(M1[i1][k1]->occs,M2[i2][k2]->occs,T1->size()); } pp.stats = *s; @@ -304,7 +306,7 @@ int main(int argc, char* argv[]) } sort(pp_all.begin(),pp_all.end()); #if 0 - BOOST_FOREACH(PhrasePair const& pp,pp_all) + BOOST_FOREACH(PPair const& pp,pp_all) { if (pp.stats.npmi < 0) continue; for (size_t r = pp.s1; r < pp.e1; ++r) @@ -358,7 +360,7 @@ int main(int argc, char* argv[]) vector assoc1(snt1.size(),-1), assoc2(snt2.size(),-1); for (size_t p = 0; p < pp_all.size(); ++p) { - PhrasePair const& x = pp_all[p]; + PPair const& x = pp_all[p]; // if (x.stats.npmi < .7) break; // if (z1[x.s1][x.e1-1] >= 0 || z2[x.s2][x.e2-1] >=0) // continue; @@ -368,7 +370,7 @@ int main(int argc, char* argv[]) assoc1[i] = p; else { - // PhrasePair& y = pp_all[assoc1[i]]; + // 
PPair& y = pp_all[assoc1[i]]; // if (y.includes(x)) // assoc1[i] = p; } @@ -379,7 +381,7 @@ int main(int argc, char* argv[]) assoc2[i] = p; else { - // PhrasePair& y = pp_all[assoc2[i]]; + // PPair& y = pp_all[assoc2[i]]; // if (y.includes(x)) // assoc2[i] = p; } @@ -409,7 +411,7 @@ int main(int argc, char* argv[]) // if (assoc1[i] == assoc2[k]) { done[assoc1[i]] = true; - PhrasePair& p = pp_all[assoc1[i]]; + PPair& p = pp_all[assoc1[i]]; for (size_t j = p.s1; j < p.e1; ++j) cout << j << ":" << V1[snt1[j]] << " "; cout << " ::: "; @@ -427,7 +429,7 @@ int main(int argc, char* argv[]) if (assoc2[i] < 0 || done[assoc2[i]]) continue; done[assoc2[i]] = true; - PhrasePair& p = pp_all[assoc2[i]]; + PPair& p = pp_all[assoc2[i]]; for (size_t j = p.s1; j < p.e1; ++j) cout << j << ":" << V1[snt1[j]] << " "; cout << " ::: "; @@ -439,16 +441,16 @@ int main(int argc, char* argv[]) } #endif // sort(pp_all.begin(),pp_all.end()); - // BOOST_FOREACH(PhrasePair const& pp, pp_all) + // BOOST_FOREACH(PPair const& pp, pp_all) // { // while (ppm1[pp.s1].size() < pp.e1 - pp.s1) - // ppm1[pp.s1].push_back(vector()); - // vector& v1 = ppm1[pp.s1][pp.e1-pp.s1-1]; + // ppm1[pp.s1].push_back(vector()); + // vector& v1 = ppm1[pp.s1][pp.e1-pp.s1-1]; // if (v1.size() && v1[0].stats.score > pp.stats.score) // continue; // while (ppm2[pp.s2].size() < pp.e2 - pp.s2) - // ppm2[pp.s2].push_back(vector()); - // vector& v2 = ppm2[pp.s2][pp.e2-pp.s2-1]; + // ppm2[pp.s2].push_back(vector()); + // vector& v2 = ppm2[pp.s2][pp.e2-pp.s2-1]; // if (v2.size() && v2[0].stats.score > pp.stats.score) // continue; // v1.push_back(pp); @@ -456,9 +458,9 @@ int main(int argc, char* argv[]) // } - // BOOST_FOREACH(vector >& vv, ppm1) + // BOOST_FOREACH(vector >& vv, ppm1) // { - // BOOST_FOREACH(vector& v, vv) + // BOOST_FOREACH(vector& v, vv) // { // sort(v.begin(),v.end()); // if (v.size() > 1 && v[0].stats.score == v[1].stats.score) @@ -469,7 +471,7 @@ int main(int argc, char* argv[]) // { // for (size_t k2 = 0; k2 < ppm2[i2].size(); ++k2) // { - // vector& v2 = ppm2[i2][k2]; + // vector& v2 = ppm2[i2][k2]; // sort(v2.begin(),v2.end()); // if (v2.size() > 1 && v2[0].stats.score == v2[1].stats.score) // { @@ -486,7 +488,7 @@ int main(int argc, char* argv[]) // else pp_good.push_back(ppm2[i2][k2][0]); // } // } - // BOOST_FOREACH(PhrasePair const& pp, pp_good) + // BOOST_FOREACH(PPair const& pp, pp_good) // { // cout << pp.stats.mi << " "; // for (size_t z = pp.s1; z < pp.e1; ++z) @@ -498,7 +500,7 @@ int main(int argc, char* argv[]) // } // // cout << string(80,'=') << endl; // // sort(pp_all.begin(),pp_all.end()); - // // BOOST_FOREACH(PhrasePair const& pp, pp_all) + // // BOOST_FOREACH(PPair const& pp, pp_all) // // { // // cout << pp.mi << " "; // // for (size_t z = pp.s1; z < pp.e1; ++z) From 1e82118ae9bf98e64bacb14ee83a22210462b869 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 9 Aug 2015 20:52:56 +0100 Subject: [PATCH 241/286] Bug fix: prefixed std:: in several places. 
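The diagnostic output in Server.cpp already qualifies the streams (std::cerr),
but the stream manipulator was still written as bare "endl" in a few places.
Without a using-declaration in scope that normally fails to compile, so the
manipulator is now written as std::endl throughout. A minimal stand-alone
illustration of the failure mode (hypothetical example, not part of this patch):

    #include <iostream>

    int main()
    {
      // std::cerr << "listening" << endl;    // error: 'endl' was not declared in this scope
      std::cerr << "listening" << std::endl;  // OK: the qualified name is always found
      return 0;
    }
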
--- moses/server/Server.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/moses/server/Server.cpp b/moses/server/Server.cpp index fde116224..2eee00c52 100644 --- a/moses/server/Server.cpp +++ b/moses/server/Server.cpp @@ -30,19 +30,19 @@ namespace MosesServer .allowOrigin("*") .maxConn(m_server_options.num_threads)); - XVERBOSE(1,"Listening on port " << m_server_options.port << endl); + XVERBOSE(1,"Listening on port " << m_server_options.port << std::endl); if (m_server_options.is_serial) { - VERBOSE(1,"Running server in serial mode." << endl); - while(true) myAbyssServer.runOnce(); + VERBOSE(1,"Running server in serial mode." << std::endl); + while(true) myAbyssServer.runOnce(); } else myAbyssServer.run(); - + std::cerr << "xmlrpc_c::serverAbyss.run() returned but should not." << std::endl; // #pragma message("BUILDING MOSES WITH SERVER SUPPORT") #else // #pragma message("BUILDING MOSES WITHOUT SERVER SUPPORT") - std::cerr << "Moses was compiled without server support." << endl; + std::cerr << "Moses was compiled without server support." << std::endl; #endif return 1; } From ab820e7b25fd4c83d630fd418e8bd6599804fa39 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 9 Aug 2015 20:54:58 +0100 Subject: [PATCH 242/286] Copy options from StaticData in constructor. --- moses/server/TranslationRequest.cpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/moses/server/TranslationRequest.cpp b/moses/server/TranslationRequest.cpp index b05330d2b..bdccd8b87 100644 --- a/moses/server/TranslationRequest.cpp +++ b/moses/server/TranslationRequest.cpp @@ -1,7 +1,7 @@ #include "TranslationRequest.h" #include "moses/ContextScope.h" #include - +#include "moses/Util.h" namespace MosesServer { using namespace std; @@ -29,7 +29,7 @@ create(Translator* translator, xmlrpc_c::paramList const& paramList, boost::mutex& mut) { boost::shared_ptr ret; - ret.reset(new TranslationRequest(paramList,cond, mut)); + ret.reset(new TranslationRequest(paramList, cond, mut)); ret->m_self = ret; ret->m_translator = translator; return ret; @@ -49,8 +49,7 @@ Run() m_session_id = S.id; // cerr << "SESSION ID" << m_session_id << endl; } - else - m_scope.reset(new Moses::ContextScope); + else m_scope.reset(new Moses::ContextScope); Moses::StaticData const& SD = Moses::StaticData::Instance(); @@ -59,14 +58,14 @@ Run() // why on earth is this a global variable? Is this even thread-safe???? 
UG (const_cast(SD)).SetOutputSearchGraph(true); - std::stringstream out, graphInfo, transCollOpts; - + // std::stringstream out, graphInfo, transCollOpts; + if (SD.IsSyntax()) run_chart_decoder(); else run_phrase_decoder(); - XVERBOSE(1,"Output: " << out.str() << endl); + // XVERBOSE(1,"Output: " << out.str() << endl); { boost::lock_guard lock(m_mutex); m_done = true; @@ -236,7 +235,9 @@ TranslationRequest(xmlrpc_c::paramList const& paramList, : m_cond(cond), m_mutex(mut), m_done(false), m_paramList(paramList) , m_nbestSize(0) , m_session_id(0) -{ } +{ + m_options = StaticData::Instance().options(); +} void TranslationRequest:: @@ -345,7 +346,8 @@ pack_hypothesis(vector const& edges, string const& key, // target string ostringstream target; BOOST_REVERSE_FOREACH(Hypothesis const* e, edges) - output_phrase(target, e->GetCurrTargetPhrase()); + output_phrase(target, e->GetCurrTargetPhrase()); + std::cerr << "SERVER TRANSLATION: " << target.str() << std::endl; dest[key] = xmlrpc_c::value_string(target.str()); if (m_withAlignInfo) { From 4253a0e642dfd2f84efba2e6f699515fdfc63fac Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 9 Aug 2015 20:56:40 +0100 Subject: [PATCH 243/286] Get options from Manager instead of StaticData. --- moses/SearchCubePruning.cpp | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/moses/SearchCubePruning.cpp b/moses/SearchCubePruning.cpp index 0c8b418f0..097cf497e 100644 --- a/moses/SearchCubePruning.cpp +++ b/moses/SearchCubePruning.cpp @@ -45,8 +45,6 @@ SearchCubePruning(Manager& manager, const InputType &source, , m_hypoStackColl(source.GetSize() + 1) , m_transOptColl(transOptColl) { - const StaticData &staticData = StaticData::Instance(); - std::vector < HypothesisStackCubePruning >::iterator iterStack; for (size_t ind = 0 ; ind < m_hypoStackColl.size() ; ++ind) { HypothesisStackCubePruning *sourceHypoColl = new HypothesisStackCubePruning(m_manager); @@ -68,9 +66,6 @@ SearchCubePruning::~SearchCubePruning() */ void SearchCubePruning::Decode() { - const StaticData &SD = StaticData::Instance(); - AllOptions const& opts = SD.options(); - // initial seed hypothesis: nothing translated, no words produced Hypothesis *hypo = Hypothesis::Create(m_manager,m_source, m_initialTransOpt); @@ -81,11 +76,13 @@ void SearchCubePruning::Decode() firstStack.CleanupArcList(); CreateForwardTodos(firstStack); - const size_t PopLimit = StaticData::Instance().options().cube.pop_limit; - VERBOSE(3,"Cube Pruning pop limit is " << PopLimit << std::endl); + const size_t PopLimit = m_manager.options().cube.pop_limit; + VERBOSE(2,"Cube Pruning pop limit is " << PopLimit << std::endl); - const size_t Diversity = StaticData::Instance().options().cube.diversity; - VERBOSE(3,"Cube Pruning diversity is " << Diversity << std::endl) + const size_t Diversity = m_manager.options().cube.diversity; + VERBOSE(2,"Cube Pruning diversity is " << Diversity << std::endl); + VERBOSE(2,"Max Phrase length is " + << m_manager.options().search.max_phrase_length << std::endl); // go through each stack size_t stackNo = 1; @@ -202,9 +199,9 @@ void SearchCubePruning::CreateForwardTodos(HypothesisStackCubePruning &stack) } size_t maxSize = size - startPos; - size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength(); + size_t maxSizePhrase = m_manager.options().search.max_phrase_length; maxSize = std::min(maxSize, maxSizePhrase); - + for (endPos = startPos+1; endPos < startPos + maxSize; endPos++) { if (bitmap.GetValue(endPos)) break; @@ -245,7 +242,7 @@ 
SearchCubePruning:: CheckDistortion(const WordsBitmap &hypoBitmap, const WordsRange &range) const { // since we check for reordering limits, its good to have that limit handy - int maxDistortion = StaticData::Instance().GetMaxDistortion(); + int maxDistortion = m_manager.options().reordering.max_distortion; if (maxDistortion < 0) return true; // if there are reordering limits, make sure it is not violated From e24cd5186f8c075f06517e8f944c037b7a4772e5 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 9 Aug 2015 20:57:37 +0100 Subject: [PATCH 244/286] Bug fix. No sanity check when updating parameters via RPC calls in the server. --- moses/parameters/AllOptions.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/moses/parameters/AllOptions.cpp b/moses/parameters/AllOptions.cpp index 11a9e3cb8..6a6cba18f 100644 --- a/moses/parameters/AllOptions.cpp +++ b/moses/parameters/AllOptions.cpp @@ -67,7 +67,6 @@ namespace Moses if (!input.update(param)) return false; if (!mbr.update(param)) return false; if (!lmbr.update(param)) return false; - return sanity_check(); } #endif From a7ee9d2e3d523e48695170ee70ae80d35e73de2f Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 9 Aug 2015 20:58:13 +0100 Subject: [PATCH 245/286] Added Emacs code formatting instructions. --- moses/server/Translator.cpp | 2 ++ moses/server/Updater.cpp | 1 + 2 files changed, 3 insertions(+) diff --git a/moses/server/Translator.cpp b/moses/server/Translator.cpp index 42275e0fd..edf11a948 100644 --- a/moses/server/Translator.cpp +++ b/moses/server/Translator.cpp @@ -1,3 +1,5 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- + #include "Translator.h" #include "TranslationRequest.h" #include "Server.h" diff --git a/moses/server/Updater.cpp b/moses/server/Updater.cpp index bf129bf49..095af3838 100644 --- a/moses/server/Updater.cpp +++ b/moses/server/Updater.cpp @@ -1,3 +1,4 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- #include "Updater.h" namespace MosesServer From 55b45859428a2279c4b7bbb0331a4188d65f7271 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 9 Aug 2015 22:48:15 +0100 Subject: [PATCH 246/286] Filter more warnings with --filter-warnings. --- Jamroot | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Jamroot b/Jamroot index 130ad5d88..3c855beb9 100644 --- a/Jamroot +++ b/Jamroot @@ -133,8 +133,9 @@ if [ option.get "filter-warnings" : : "yes" ] { requirements += -Wno-unused-but-set-variable ; requirements += -Wno-unused-result ; requirements += -Wno-unused-variable ; - requirements += -Wcomment ; - requirements += -Wstrict-aliasing ; + requirements += -Wno-comment ; + requirements += -Wno-strict-aliasing ; + requirements += -Wno-overloaded-virtual ; } if [ option.get "debug-build" : : "yes" ] { From 97020846413135bf03ab2ae96b65ce74b50f7619 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 9 Aug 2015 22:48:26 +0100 Subject: [PATCH 247/286] Deleted unused code. 
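ug_mmbitext.h and ug_mmbitext.cc contained nothing but commented-out code: an
early, pre-refactoring sketch of the memory-mapped bitext with its sampling
agenda, worker threads and phrase-pair statistics. Removing the two files
therefore does not change the build. For the record, the accept/reject test
visible in the dead job::step() is plain selection sampling; a stand-alone
sketch of that idea follows (illustration only, not Moses code):

    #include <cstdlib>
    #include <iostream>
    #include <vector>

    // Selection sampling (Knuth's Algorithm S): scan n candidates once and keep
    // each one with probability (samples still needed) / (candidates remaining).
    std::vector<size_t> sample_k_of_n(size_t k, size_t n)
    {
      std::vector<size_t> picked;
      size_t needed = (k < n ? k : n);
      for (size_t i = 0; i < n && needed > 0; ++i)
      {
        size_t remaining = n - i;
        if (static_cast<size_t>(std::rand()) % remaining < needed)
        {
          picked.push_back(i); // candidate i becomes part of the sample
          --needed;
        }
      }
      return picked;
    }

    int main()
    {
      std::vector<size_t> s = sample_k_of_n(5, 1000);
      for (size_t i = 0; i < s.size(); ++i) std::cout << s[i] << " ";
      std::cout << std::endl;
      return 0;
    }
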
--- moses/TranslationModel/UG/mm/ug_mmbitext.cc | 450 -------------------- moses/TranslationModel/UG/mm/ug_mmbitext.h | 191 --------- 2 files changed, 641 deletions(-) delete mode 100644 moses/TranslationModel/UG/mm/ug_mmbitext.cc delete mode 100644 moses/TranslationModel/UG/mm/ug_mmbitext.h diff --git a/moses/TranslationModel/UG/mm/ug_mmbitext.cc b/moses/TranslationModel/UG/mm/ug_mmbitext.cc deleted file mode 100644 index ef797b5c1..000000000 --- a/moses/TranslationModel/UG/mm/ug_mmbitext.cc +++ /dev/null @@ -1,450 +0,0 @@ -// #include "ug_mmbitext.h" -// #include - -// namespace Moses -// { -// using namespace ugdiss; -// using namespace std; - -// mmbitext:: -// pstats:: -// pstats() -// : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0) -// {} - -// void -// mmbitext:: -// pstats:: -// register_worker() -// { -// this->lock.lock(); -// ++this->in_progress; -// this->lock.unlock(); -// } - -// void -// pstats:: -// release() -// { -// this->lock.lock(); -// if (this->in_progress-- == 1) // last one - >we're done -// this->ready.notify_all(); -// this->lock.unlock(); -// } - -// void -// mmbitext:: -// open(string const base, string const L1, string L2) -// { -// T1.open(base+L1+".mct"); -// T2.open(base+L2+".mct"); -// Tx.open(base+L1+"-"+L2+".mam"); -// V1.open(base+L1+".tdx"); V1.iniReverseIndex(); -// V2.open(base+L2+".tdx"); V2.iniReverseIndex(); -// I1.open(base+L1+".sfa",&T1); -// I2.open(base+L2+".sfa",&T2); -// // lexscorer.open(base+L1+"-"+L2+".lex"); -// assert(T1.size() == T2.size()); -// } - - -// mmbitext:: -// mmbitext() -// : ag(NULL) -// { - -// } - -// bool -// mmbitext:: -// find_trg_phr_bounds(size_t const sid, size_t const start, size_t const stop, -// size_t & s1, size_t & s2, size_t & e1, size_t & e2, -// vector* core_alignment, bool const flip) const -// { -// // if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl; -// // a word on the core_alignment: -// // since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1 < e2, respectively) -// // are be definition unaligned, we store only the core alignment in *core_alignment -// // it is up to the calling function to shift alignment points over for start positions -// // of extracted phrases that start with a fringe word -// char const* p = Tx.sntStart(sid); -// char const* x = Tx.sntEnd(sid); -// bitvector forbidden((flip ? T1 : T2).sntLen(sid)); -// size_t src,trg; -// size_t lft = forbidden.size(); -// size_t rgt = 0; -// vector > aln(T1.sntLen(sid)); -// while (p < x) -// { -// if (flip) { p = binread(p,trg); assert(p= stop) -// forbidden.set(trg); -// else -// { -// lft = min(lft,trg); -// rgt = max(rgt,trg); -// if (core_alignment) -// { -// if (flip) aln[trg].push_back(src); -// else aln[src].push_back(trg); -// } -// } -// } -// #if 0 -// cout << setw(5) << mctr << " " << setw(3) << xctr << " "; -// for (size_t i = 0; i < forbidden.size(); ++i) -// { -// if (i == lft) cout << '('; -// cout << (forbidden[i] ? 
'x' : '-'); -// if (i == rgt) cout << ')'; -// } -// cout << endl; -// #endif - -// for (size_t i = lft; i <= rgt; ++i) -// if (forbidden[i]) -// return false; - -// s2 = lft; for (s1 = s2; s1 && !forbidden[s1-1]; --s1); -// e1 = rgt+1; for (e2 = e1; e2 < forbidden.size() && !forbidden[e2]; ++e2); - -// if (lft > rgt) return false; -// if (core_alignment) -// { -// core_alignment->clear(); -// if (flip) -// { -// for (size_t i = lft; i <= rgt; ++i) -// { -// sort(aln[i].begin(),aln[i].end()); -// BOOST_FOREACH(ushort x, aln[i]) -// { -// core_alignment->push_back(i-lft); -// core_alignment->push_back(x-start); -// } -// } -// } -// else -// { -// for (size_t i = start; i < stop; ++i) -// { -// BOOST_FOREACH(ushort x, aln[i]) -// { -// core_alignment->push_back(i-start); -// core_alignment->push_back(x-lft); -// } -// } -// } -// } -// return lft <= rgt; -// } - -// void -// mmbitext:: -// prep(iter const& phrase) -// { -// prep2(phrase); -// } - -// SPTR -// mmbitext:: -// prep2(iter const& phrase) -// { -// if (!ag) -// { -// ag = new agenda(*this); -// ag->add_workers(20); -// } -// typedef boost::unordered_map > pcache_t; -// uint64_t pid = phrase.getPid(); -// pcache_t & cache(phrase.root == &this->I1 ? cache1 : cache2); -// pcache_t::value_type entry(pid,SPTR()); -// pair foo = cache.emplace(entry); -// if (foo.second) foo.first->second = ag->add_job(phrase, 1000); -// return foo.first->second; -// } - -// SPTR -// mmbitext:: -// lookup(iter const& phrase) -// { -// SPTR ret = prep2(phrase); -// boost::unique_lock lock(ret->lock); -// while (ret->in_progress) -// ret->ready.wait(lock); -// return ret; -// } - -// void -// mmbitext:: -// agenda:: -// worker:: -// operator()() -// { -// uint64_t sid=0, offset=0, len=0; // of the source phrase -// bool fwd=false; // source phrase is L1 -// SPTR stats; -// size_t s1=0, s2=0, e1=0, e2=0; -// for (; ag.get_task(sid,offset,len,fwd,stats); ) -// { -// if (!stats) break; -// vector aln; -// if (!ag.bitext.find_trg_phr_bounds -// (sid, offset, offset+len, s1, s2, e1, e2, fwd ? &aln : NULL, !fwd)) -// { -// stats->release(); -// continue; -// } - -// stats->lock.lock(); -// stats->good += 1; -// stats->lock.unlock(); - -// for (size_t k = 0; k < aln.size(); k += 2) -// aln[k] += s2 - s1; -// Token const* o = (fwd ? ag.bitext.T2 : ag.bitext.T1).sntStart(sid); -// float sample_weight = 1./((s2-s1+1)*(e2-e1+1)); -// for (size_t s = s1; s <= s2; ++s) -// { -// iter b(&(fwd ? 
ag.bitext.I2 : ag.bitext.I1)); -// for (size_t i = s; i < e1; ++i) -// assert(b.extend(o[i].id())); -// for (size_t i = e1; i <= e2; ++i) -// { -// stats->add(b,sample_weight,aln); -// if (i < e2) assert(b.extend(o[i].id())); -// } -// if (fwd && s < s2) -// for (size_t k = 0; k < aln.size(); k += 2) -// --aln[k]; -// } -// stats->release(); -// } -// } - -// void -// mmbitext:: -// pstats:: -// add(mmbitext::iter const& trg_phrase, float const w, vector const& a) -// { -// this->lock.lock(); -// jstats& entry = this->trg[trg_phrase.getPid()]; -// this->lock.unlock(); -// entry.add(w,a); -// } - -// mmbitext:: -// agenda:: -// agenda(mmbitext const& thebitext) -// : shutdown(false), doomed(0), bitext(thebitext) -// { - -// } - -// mmbitext:: -// agenda:: -// ~agenda() -// { -// this->lock.lock(); -// this->shutdown = true; -// this->ready.notify_all(); -// this->lock.unlock(); -// for (size_t i = 0; i < workers.size(); ++i) -// workers[i]->join(); -// } - -// mmbitext:: -// ~mmbitext() -// { -// if (ag) delete ag; -// } - -// SPTR -// mmbitext:: -// agenda:: -// add_job(mmbitext::iter const& phrase, size_t const max_samples) -// { -// static boost::posix_time::time_duration nodelay(0,0,0,0); - -// job j; -// j.stats.reset(new mmbitext::pstats()); -// j.stats->register_worker(); -// j.stats->raw_cnt = phrase.approxOccurrenceCount(); -// j.max_samples = max_samples; -// j.next = phrase.lower_bound(-1); -// j.stop = phrase.upper_bound(-1); -// j.len = phrase.size(); -// j.ctr = 0; -// j.fwd = phrase.root == &bitext.I1; - -// boost::unique_lock lk(this->lock); -// joblist.push_back(j); -// if (joblist.size() == 1) -// { -// for (size_t i = 0; i < workers.size(); ++i) -// { -// if (workers[i]->timed_join(nodelay)) -// { -// workers[i] = SPTR(new boost::thread(worker(*this))); -// } -// } -// } -// return j.stats; -// } - -// bool -// mmbitext:: -// agenda:: -// get_task(uint64_t & sid, uint64_t & offset, uint64_t & len, -// bool & fwd, SPTR & stats) -// { -// boost::unique_lock lock(this->lock); -// if (this->doomed || this->shutdown) -// { -// if (this->doomed) --this->doomed; -// return false; -// } -// // while (joblist.empty()) -// // { -// // cerr << "no jobs" << endl; -// // this->ready.wait(lock); -// // if (this->doomed || this->shutdown) -// // { -// // if (this->doomed) --this->doomed; -// // return false; -// // } -// // } -// while (joblist.size()) -// { -// if (joblist.front().step(sid,offset)) -// { -// job const& j = joblist.front(); -// len = j.len; -// fwd = j.fwd; -// stats = j.stats; -// stats->register_worker(); -// return true; -// } -// joblist.front().stats->release(); -// joblist.pop_front(); -// } -// stats.reset(); -// return true; -// } - -// bool -// mmbitext:: -// agenda:: -// job:: -// step(uint64_t & sid, uint64_t & offset) -// { -// while (next < stop && stats->good < max_samples) -// { -// next = tightread(tightread(next,stop,sid),stop,offset); -// { -// boost::lock_guard lock(stats->lock); -// if (stats->raw_cnt == ctr) ++stats->raw_cnt; -// size_t rnum = util::rand_excl(stats->raw_cnt - ctr++); -// // cout << stats->raw_cnt << " " << ctr-1 << " " -// // << rnum << " " << max_samples - stats->good << endl; -// if (rnum < max_samples - stats->good) -// { -// stats->sample_cnt++; -// return true; -// } -// } -// } -// return false; -// } - - -// void -// mmbitext:: -// agenda:: -// add_workers(int n) -// { -// static boost::posix_time::time_duration nodelay(0,0,0,0); -// boost::lock_guard lock(this->lock); -// // house keeping: remove all workers that have 
finished -// for (size_t i = 0; i < workers.size(); ) -// { -// if (workers[i]->timed_join(nodelay)) -// { -// if (i + 1 < workers.size()) -// workers[i].swap(workers.back()); -// workers.pop_back(); -// } -// else ++i; -// } -// if (n < 0) -// { -// this->doomed -= n; -// } -// else -// { -// for (int i = 0; i < n; ++i) -// { -// SPTR w(new boost::thread(worker(*this))); -// workers.push_back(w); -// } -// } -// } - -// mmbitext:: -// jstats:: -// jstats() -// { -// my_aln.reserve(1); -// } - -// mmbitext:: -// jstats:: -// jstats(jstats const& other) -// { -// my_rcnt = other.rcnt(); -// my_wcnt = other.wcnt(); -// my_aln = other.aln(); -// } - -// void -// mmbitext:: -// jstats:: -// add(float w, vector const& a) -// { -// boost::lock_guard lk(this->lock); -// my_rcnt += 1; -// my_wcnt += w; -// if (a.size()) -// { -// size_t i = 0; -// while (i < my_aln.size() && my_aln[i].second != a) ++i; -// if (i == my_aln.size()) -// my_aln.push_back(pair >(1,a)); -// else -// my_aln[i].first++; -// if (my_aln[i].first > my_aln[i/2].first) -// push_heap(my_aln.begin(),my_aln.begin()+i+1); -// } -// } - -// uint32_t -// mmbitext:: -// jstats:: -// rcnt() const -// { return my_rcnt; } - -// float -// mmbitext:: -// jstats:: -// wcnt() const -// { return my_wcnt; } - -// vector > > const& -// mmbitext:: -// jstats:: -// aln() const -// { return my_aln; } - -// } - diff --git a/moses/TranslationModel/UG/mm/ug_mmbitext.h b/moses/TranslationModel/UG/mm/ug_mmbitext.h deleted file mode 100644 index e07d92830..000000000 --- a/moses/TranslationModel/UG/mm/ug_mmbitext.h +++ /dev/null @@ -1,191 +0,0 @@ -#ifndef __ug_mm_bitext_h -#define __ug_mm_bitext_h -// Memory-mapped, word-aligned bitext -// Written by Ulrich Germann - -// things we can do to speed up things: -// - set up threads at startup time that force the -// data in to memory sequentially -// -// - use multiple agendas for better load balancing and to avoid -// competition for locks - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h" -#include "moses/TranslationModel/UG/generic/sampling/Sampling.h" -#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h" - -#include "ug_typedefs.h" -#include "ug_mm_ttrack.h" -#include "ug_mm_tsa.h" -#include "tpt_tokenindex.h" -#include "ug_corpus_token.h" -#include "tpt_pickler.h" - -// using namespace ugdiss; -// using namespace std; -namespace Moses { - - typedef L2R_Token Token; - typedef mmTSA::tree_iterator iter; - - class mmbitext - { - public: - typedef mmTSA::tree_iterator iter; - class pstats; // one-sided phrase statistics - class jstats; // phrase std::pair ("joint") statistics - class agenda - { - boost::mutex lock; - boost::condition_variable ready; - class job; - class worker; - list joblist; - std::vector > workers; - bool shutdown; - size_t doomed; - public: - mmbitext const& bitext; - agenda(mmbitext const& bitext); - ~agenda(); - void add_workers(int n); - SPTR add_job(mmbitext::iter const& phrase, - size_t const max_samples); - bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len, - bool & fwd, SPTR & stats); - }; - - // stores the list of unfinished jobs; - // maintains a pool of workers and assigns the jobs to them - - agenda* ag; - mmTtrack Tx; // word alignments - mmTtrack T1,T2; // token tracks - TokenIndex V1,V2; // vocabs - mmTSA I1,I2; // suffix arrays - - /// given the source phrase sid[start:stop] - // find the possible start (s1 .. s2) and end (e1 .. 
e2) - // points of the target phrase; if non-NULL, store word - // alignments in *core_alignment. If /flip/, source phrase is - // L2. - bool - find_trg_phr_bounds - (size_t const sid, size_t const start, size_t const stop, - size_t & s1, size_t & s2, size_t & e1, size_t & e2, - std::vector * core_alignment, bool const flip) const; - - boost::unordered_map > cache1,cache2; - private: - SPTR - prep2(iter const& phrase); - public: - mmbitext(); - ~mmbitext(); - - void open(string const base, string const L1, string const L2); - - SPTR lookup(iter const& phrase); - void prep(iter const& phrase); - }; - - // "joint" (i.e., phrase std::pair) statistics - class - mmbitext:: - jstats - { - uint32_t my_rcnt; // unweighted count - float my_wcnt; // weighted count - std::vector > > my_aln; - boost::mutex lock; - public: - jstats(); - jstats(jstats const& other); - uint32_t rcnt() const; - float wcnt() const; - std::vector > > const & aln() const; - void add(float w, std::vector const& a); - }; - - // struct - // mmbitext: - // phrasepair - // { - // Token const* t; - // size_t len; - // size_t cnt; - // float fwd, bwd; - - // map aln; - // string toString(TokenIndex const& V) const; - // bool operator<(phrase const& other) const; - // bool operator>(phrase const& other) const; - // phrase(pair,jstats> const & foo); - - // }; - - - struct - mmbitext:: - pstats - { - boost::mutex lock; // for parallel gathering of stats - boost::condition_variable ready; // consumers can wait for this data structure to be ready. - - size_t raw_cnt; // (approximate) raw occurrence count - size_t sample_cnt; // number of instances selected during sampling - size_t good; // number of selected instances with valid word alignments - size_t sum_pairs; - // size_t snt_cnt; - // size_t sample_snt; - size_t in_progress; // keeps track of how many threads are currently working on this - boost::unordered_map trg; - pstats(); - // std::vector nbest; - // void select_nbest(size_t const N=10); - void release(); - void register_worker(); - void add(mmbitext::iter const& trg_phrase, float const w, std::vector const& a); - }; - - class - mmbitext:: - agenda:: - worker - { - agenda& ag; - public: - worker(agenda& a); - void operator()(); - - }; - - class - mmbitext:: - agenda:: - job - { - public: - char const* next; - char const* stop; - size_t max_samples; - size_t ctr; - size_t len; - bool fwd; - SPTR stats; - bool step(uint64_t & sid, uint64_t & offset); - }; - -} -#endif - From 6b084a0587ae6b52e74745a6ecfaab577937d6d9 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 9 Aug 2015 22:49:35 +0100 Subject: [PATCH 248/286] clang can't handle boost intrusive pointers, it seems. 
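Whatever the exact clang issue is, the hunk below swaps the iptr shorthand (presumably boost::intrusive_ptr) for SPTR (presumably boost::shared_ptr). For context, a minimal self-contained sketch of the difference between the two pointer styles; the RefCounted type and its hooks are illustrative, not Moses code:

    #include <boost/intrusive_ptr.hpp>
    #include <boost/shared_ptr.hpp>

    struct RefCounted {                 // illustrative pointee, not part of Moses
      int refs;
      RefCounted() : refs(0) {}
    };

    // intrusive_ptr keeps the count inside the pointee and needs these two
    // free functions to be visible at every point of use; shared_ptr keeps
    // the count in a separate control block and needs no such hooks.
    inline void intrusive_ptr_add_ref(RefCounted* p) { ++p->refs; }
    inline void intrusive_ptr_release(RefCounted* p) { if (--p->refs == 0) delete p; }

    int main() {
      boost::intrusive_ptr<RefCounted> a(new RefCounted);  // count lives in *a
      boost::shared_ptr<RefCounted>    b(new RefCounted);  // count lives in a control block
      return 0;
    }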
--- moses/TranslationModel/UG/mmsapt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index 7139e4fd2..643141411 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -65,7 +65,7 @@ namespace Moses typedef sapt::PhraseScorer pscorer; private: // vector > shards; - iptr btfix; + SPTR btfix; SPTR btdyn; std::string m_bname, m_extra_data, m_bias_file,m_bias_server; std::string L1; @@ -151,7 +151,7 @@ namespace Moses #if PROVIDES_RANKED_SAMPLING void - set_bias_for_ranking(ttasksptr const& ttask, iptr const> bt); + set_bias_for_ranking(ttasksptr const& ttask, SPTR const> bt); #endif private: From 8e601d92a17a921c2c2da76a7a13bae210688612 Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Mon, 10 Aug 2015 00:01:01 +0100 Subject: [PATCH 249/286] daily automatic beautifier --- moses/SearchCubePruning.cpp | 20 ++++++++++---------- moses/SearchNormal.cpp | 8 ++++---- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/moses/SearchCubePruning.cpp b/moses/SearchCubePruning.cpp index a247fe2b9..7219a40f7 100644 --- a/moses/SearchCubePruning.cpp +++ b/moses/SearchCubePruning.cpp @@ -38,8 +38,8 @@ public: }; SearchCubePruning:: -SearchCubePruning(Manager& manager, const InputType &source, - const TranslationOptionCollection &transOptColl) +SearchCubePruning(Manager& manager, const InputType &source, + const TranslationOptionCollection &transOptColl) : Search(manager) , m_source(source) , m_hypoStackColl(source.GetSize() + 1) @@ -69,8 +69,8 @@ void SearchCubePruning::Decode() // initial seed hypothesis: nothing translated, no words produced Hypothesis *hypo = Hypothesis::Create(m_manager,m_source, m_initialTransOpt); - HypothesisStackCubePruning &firstStack - = *static_cast(m_hypoStackColl.front()); + HypothesisStackCubePruning &firstStack + = *static_cast(m_hypoStackColl.front()); firstStack.AddInitial(hypo); // Call this here because the loop below starts at the second stack. 
firstStack.CleanupArcList(); @@ -81,8 +81,8 @@ void SearchCubePruning::Decode() const size_t Diversity = m_manager.options().cube.diversity; VERBOSE(2,"Cube Pruning diversity is " << Diversity << std::endl); - VERBOSE(2,"Max Phrase length is " - << m_manager.options().search.max_phrase_length << std::endl); + VERBOSE(2,"Max Phrase length is " + << m_manager.options().search.max_phrase_length << std::endl); // go through each stack size_t stackNo = 1; @@ -92,13 +92,13 @@ void SearchCubePruning::Decode() // BOOST_FOREACH(HypothesisStack* hstack, m_hypoStackColl) { if (this->out_of_time()) return; - HypothesisStackCubePruning &sourceHypoColl - = *static_cast(*iterStack); + HypothesisStackCubePruning &sourceHypoColl + = *static_cast(*iterStack); // priority queue which has a single entry for each bitmap // container, sorted by score of top hyp - std::priority_queue < BitmapContainer*, std::vector< BitmapContainer* >, - BitmapContainerOrderer > BCQueue; + std::priority_queue < BitmapContainer*, std::vector< BitmapContainer* >, + BitmapContainerOrderer > BCQueue; _BMType::const_iterator bmIter; const _BMType &accessor = sourceHypoColl.GetBitmapAccessor(); diff --git a/moses/SearchNormal.cpp b/moses/SearchNormal.cpp index 170714a02..7056aeb51 100644 --- a/moses/SearchNormal.cpp +++ b/moses/SearchNormal.cpp @@ -57,8 +57,8 @@ ProcessOneStack(HypothesisStack* hstack) { if (this->out_of_time()) return false; SentenceStats &stats = m_manager.GetSentenceStats(); - HypothesisStackNormal &sourceHypoColl - = *static_cast(hstack); + HypothesisStackNormal &sourceHypoColl + = *static_cast(hstack); // the stack is pruned before processing (lazy pruning): VERBOSE(3,"processing hypothesis from next stack"); @@ -69,10 +69,10 @@ ProcessOneStack(HypothesisStack* hstack) IFVERBOSE(2) stats.StopTimeStack(); // go through each hypothesis on the stack and try to expand it - // BOOST_FOREACH(Hypothesis* h, sourceHypoColl) + // BOOST_FOREACH(Hypothesis* h, sourceHypoColl) HypothesisStackNormal::const_iterator h; for (h = sourceHypoColl.begin(); h != sourceHypoColl.end(); ++h) - ProcessOneHypothesis(**h); + ProcessOneHypothesis(**h); return true; } From a68b77c300fee51f5bbfd5910d2b71527e6faa31 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 10 Aug 2015 00:08:44 +0100 Subject: [PATCH 250/286] Minor fix in logging of interaction with bias server. --- moses/TranslationModel/UG/mm/ug_http_client.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mm/ug_http_client.cc b/moses/TranslationModel/UG/mm/ug_http_client.cc index ff2a0a1c6..89eb308e2 100644 --- a/moses/TranslationModel/UG/mm/ug_http_client.cc +++ b/moses/TranslationModel/UG/mm/ug_http_client.cc @@ -42,9 +42,10 @@ http_client(boost::asio::io_service& io_service, std::string url, std::ostream* if (log) { *log << HERE << std::endl; + // *log << "URL " << url << std::endl; *log << "SERVER " << server << std::endl; *log << "PORT " << port << "" << std::endl; - *log << "PATH " << path << std::endl; + *log << "PATH " << path << std::endl; } #endif init(server, port, path); From c40082f94c1dc0b04ca55a5bb72268ced0f0b11b Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 10 Aug 2015 00:09:56 +0100 Subject: [PATCH 251/286] Bug fix: restored passing of server information to bias client. 
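The one-line change below puts the server address back in front of the URI-encoded context, so the HTTP client is handed a complete URL rather than just the encoded payload. A self-contained sketch of the intended composition; the encoder stub and both string values are stand-ins, not the real Moses::uri_encode or a real bias-server address:

    #include <string>

    // stand-in for Moses::uri_encode, only so the sketch compiles on its own
    static std::string uri_encode_stub(const std::string& s) { return s; }

    int main() {
      std::string server  = "http://localhost:1234/bias?context=";   // hypothetical
      std::string context = "source text to compute a bias for";     // hypothetical
      // after the fix: server prefix + encoded context = full query URL;
      // before it, only the encoded context reached the client.
      std::string query = server + uri_encode_stub(context);
      (void) query;
      return 0;
    }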
--- moses/TranslationModel/UG/mm/ug_sampling_bias.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc index 649e463f5..85e143278 100644 --- a/moses/TranslationModel/UG/mm/ug_sampling_bias.cc +++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.cc @@ -20,7 +20,7 @@ namespace sapt std::string const& context, std::ostream* log) { - std::string query = Moses::uri_encode(context); + std::string query = server + Moses::uri_encode(context); boost::asio::io_service io_service; Moses::http_client c(io_service, query, log); io_service.run(); From 7d1987121fbf8d2f7148a69d944aac1621abd510 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 10 Aug 2015 09:30:53 +0100 Subject: [PATCH 252/286] Minor update of declaration of binread(). --- moses/TranslationModel/UG/mm/tpt_pickler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mm/tpt_pickler.h b/moses/TranslationModel/UG/mm/tpt_pickler.h index 5a42ced01..1e4cd39c1 100644 --- a/moses/TranslationModel/UG/mm/tpt_pickler.h +++ b/moses/TranslationModel/UG/mm/tpt_pickler.h @@ -44,7 +44,7 @@ namespace tpt char const *binread(char const* p, uint16_t& buf); char const *binread(char const* p, uint32_t& buf); - char const *binread(char const* p, filepos_type& buf); + char const *binread(char const* p, uint64_t& buf); char const *binread(char const* p, float& buf); #ifdef __clang__ char const *binread(char const* p, size_t& buf); From 16c637b8a58122da417eac065aecb7e5e29a24b3 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 10 Aug 2015 09:32:26 +0100 Subject: [PATCH 253/286] Back to intrusive pointer for btfix in Mmsapt. Shared pointer causes segfaults. Needs further investigation. --- moses/TranslationModel/UG/mmsapt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index 643141411..7139e4fd2 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -65,7 +65,7 @@ namespace Moses typedef sapt::PhraseScorer pscorer; private: // vector > shards; - SPTR btfix; + iptr btfix; SPTR btdyn; std::string m_bname, m_extra_data, m_bias_file,m_bias_server; std::string L1; @@ -151,7 +151,7 @@ namespace Moses #if PROVIDES_RANKED_SAMPLING void - set_bias_for_ranking(ttasksptr const& ttask, SPTR const> bt); + set_bias_for_ranking(ttasksptr const& ttask, iptr const> bt); #endif private: From 19313feea038c555a61d3ea1440c6fa077e5a695 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 10 Aug 2015 10:14:13 +0100 Subject: [PATCH 254/286] parameter/AllOptions: Added missing return value. --- moses/parameters/AllOptions.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/moses/parameters/AllOptions.cpp b/moses/parameters/AllOptions.cpp index 6a6cba18f..32a2bcd36 100644 --- a/moses/parameters/AllOptions.cpp +++ b/moses/parameters/AllOptions.cpp @@ -67,6 +67,7 @@ namespace Moses if (!input.update(param)) return false; if (!mbr.update(param)) return false; if (!lmbr.update(param)) return false; + return true; } #endif From 03463facd70630ab7ca8fb922015536b8b551159 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 10 Aug 2015 10:14:28 +0100 Subject: [PATCH 255/286] Cleanup. 
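Most of this commit removes using-directives from headers and writes std:: (and boost::) out in full. A toy illustration of why that matters; the typedef name is borrowed from DynSuffixArray.h (element type assumed), the rest is not Moses code. A using-directive at namespace scope in a header is injected into every translation unit that includes the header, which can change name lookup and overload resolution far away from where the directive was written:

    // old header style -- dumps all of std into every includer:
    //   using namespace std;
    //   typedef vector<unsigned> vuint_t;

    // cleaned-up style -- qualify explicitly, nothing leaks:
    #include <vector>
    typedef std::vector<unsigned> vuint_t;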
--- .../BilingualDynSuffixArray.h | 1 - moses/TranslationModel/DynSuffixArray.h | 6 ++-- .../UG/generic/file_io/ug_stream.h | 24 +++++++------- .../program_options/ug_splice_arglist.h | 3 +- .../UG/generic/sampling/Sampling.h | 19 ++++++------ .../UG/generic/sorting/VectorIndexSorter.h | 11 +++---- .../UG/generic/stringdist/ug_stringdist.h | 18 +++++------ .../UG/mm/obsolete/ug_bitext_base.h | 31 +++++++++---------- moses/TranslationModel/UG/mm/tpt_tightindex.h | 2 -- moses/TranslationModel/UG/mm/tpt_tokenindex.h | 1 - moses/TranslationModel/UG/mm/ug_bitext.h | 2 +- .../UG/mm/ug_bitext_agenda_job.h | 2 -- .../UG/mm/ug_conll_bottom_up_token.h | 1 - .../TranslationModel/UG/mm/ug_conll_record.h | 1 - moses/TranslationModel/UG/mm/ug_deptree.h | 1 - moses/TranslationModel/UG/mm/ug_im_tsa.h | 11 ++----- moses/TranslationModel/UG/mm/ug_im_ttrack.h | 2 -- .../UG/mm/ug_lexical_phrase_scorer1.h | 1 - .../UG/mm/ug_lexical_phrase_scorer2.h | 2 +- moses/TranslationModel/UG/mm/ug_lru_cache.h | 2 -- moses/TranslationModel/UG/mm/ug_mm_2d_table.h | 1 - moses/TranslationModel/UG/mm/ug_mm_tsa.h | 1 - moses/TranslationModel/UG/mm/ug_mm_ttrack.h | 1 - moses/TranslationModel/UG/mm/ug_tsa_base.h | 2 -- .../UG/mm/ug_tsa_bitset_cache.h | 2 +- moses/TranslationModel/UG/mm/ug_ttrack_base.h | 2 -- moses/TranslationModel/UG/mm/ug_typedefs.h | 1 - moses/TranslationModel/UG/sapt_phrase_key.h | 13 -------- moses/TranslationModel/WordCoocTable.h | 8 ++--- 29 files changed, 61 insertions(+), 111 deletions(-) delete mode 100644 moses/TranslationModel/UG/sapt_phrase_key.h diff --git a/moses/TranslationModel/BilingualDynSuffixArray.h b/moses/TranslationModel/BilingualDynSuffixArray.h index 1c4ceae34..b9d52d443 100644 --- a/moses/TranslationModel/BilingualDynSuffixArray.h +++ b/moses/TranslationModel/BilingualDynSuffixArray.h @@ -13,7 +13,6 @@ #include "moses/TargetPhraseCollection.h" #include -using namespace std; namespace Moses { class PhraseDictionaryDynSuffixArray; diff --git a/moses/TranslationModel/DynSuffixArray.h b/moses/TranslationModel/DynSuffixArray.h index 62f719d57..cbef2ba3a 100644 --- a/moses/TranslationModel/DynSuffixArray.h +++ b/moses/TranslationModel/DynSuffixArray.h @@ -11,7 +11,7 @@ namespace Moses { -using namespace std; + typedef std::vector vuint_t; @@ -25,8 +25,8 @@ class ComparePosition public: ComparePosition(vuint_t const& crp, vuint_t const& sfa); - bool operator()(unsigned const& i, vector const& phrase) const; - bool operator()(vector const& phrase, unsigned const& i) const; + bool operator()(unsigned const& i, std::vector const& phrase) const; + bool operator()(std::vector const& phrase, unsigned const& i) const; }; diff --git a/moses/TranslationModel/UG/generic/file_io/ug_stream.h b/moses/TranslationModel/UG/generic/file_io/ug_stream.h index 5555e36f8..a925f3a64 100644 --- a/moses/TranslationModel/UG/generic/file_io/ug_stream.h +++ b/moses/TranslationModel/UG/generic/file_io/ug_stream.h @@ -18,20 +18,22 @@ namespace ugdiss { -using namespace std; -using namespace boost::iostreams; -/** open input file that is possibly compressed - * decompression filters are automatically added based on the file name - * gzip for .gz; bzip2 for bz2. - */ -filtering_istream* open_input_stream(string fname); -void open_input_stream(string fname, filtering_istream& in); -// filtering_streambuf* open_input_stream(string fname); + /** open input file that is possibly compressed + * decompression filters are automatically added based on the file name + * gzip for .gz; bzip2 for bz2. 
+ */ + boost::iostreams::filtering_istream* + open_input_stream(std::string fname); -filtering_ostream* open_output_stream(string fname); -void open_output_stream(string fname, filtering_ostream& in); + void open_input_stream(std::string fname, + boost::iostreams::filtering_istream& in); + boost::iostreams::filtering_ostream* + open_output_stream(std::string fname); + + void open_output_stream(std::string fname, + boost::iostreams::filtering_ostream& in); } #endif diff --git a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h index 605acee6c..58b099f1c 100644 --- a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h +++ b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h @@ -3,7 +3,6 @@ #include #include namespace Moses { - using namespace std; // Function to splice the argument list (e.g. before handing it over to // Moses LoadParam() function. /filter/ is a vector of argument names @@ -12,7 +11,7 @@ namespace Moses { filter_arguments(int const argc_in, char const* const* const argv_in, int & argc_moses, char*** argv_moses, int & argc_other, char*** argv_other, - vector > const& filter); + std::vector > const& filter); } // namespace Moses diff --git a/moses/TranslationModel/UG/generic/sampling/Sampling.h b/moses/TranslationModel/UG/generic/sampling/Sampling.h index 652e532bc..d8750a983 100644 --- a/moses/TranslationModel/UG/generic/sampling/Sampling.h +++ b/moses/TranslationModel/UG/generic/sampling/Sampling.h @@ -11,24 +11,23 @@ namespace Moses { -using namespace std; -// select a random sample of size /s/ without restitution from the range of -// integers [0,N); +// select a random sample of size /s/ without restitution from the +// range of integers [0,N); template void -randomSample(vector& v, size_t s, size_t N) +randomSample(std::vector& v, size_t s, size_t N) { // see also Knuth: Art of Computer Programming Vol. 2, p. 142 - s = min(s,N); + s = std::min(s,N); v.resize(s); - // the first option tries to be a bit more efficient than O(N) in picking - // the samples. The threshold is an ad-hoc, off-the-cuff guess. I still - // need to figure out the optimal break-even point between a linear sweep - // and repeatedly picking random numbers with the risk of hitting the same - // number many times. + // the first option tries to be a bit more efficient than O(N) in + // picking the samples. The threshold is an ad-hoc, off-the-cuff + // guess. I still need to figure out the optimal break-even point + // between a linear sweep and repeatedly picking random numbers with + // the risk of hitting the same number many times. if (s*10 check(N,0); for (size_t i = 0; i < v.size(); i++) { diff --git a/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h b/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h index c68b0d2e4..c0887f1a0 100644 --- a/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h +++ b/moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h @@ -1,3 +1,4 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #ifndef __vector_index_sorter_h #define __vector_index_sorter_h #include @@ -5,8 +6,8 @@ #include // VectorIndexSorter; (c) 2007-2012 Ulrich Germann -// A VectorIndexSorter is a function object for sorting indices into a vector -// of objects (instead of sorting the vector itself). 
+// A VectorIndexSorter is a function object for sorting indices into a +// vector of objects (instead of sorting the vector itself). // // typcial use: // vector my_vector; @@ -16,13 +17,9 @@ namespace Moses { - // using namespace std; - - - using std::greater; template, + typename COMP = std::greater, typename IDX_T=size_t> class VectorIndexSorter diff --git a/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h index 8dfcfb58a..895664fc6 100644 --- a/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h +++ b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h @@ -17,10 +17,6 @@ #include "moses/TranslationModel/UG/mm/tpt_typedefs.h" -using namespace std; -//using namespace boost; -using namespace ugdiss; - namespace stringdist { float @@ -32,7 +28,7 @@ namespace stringdist float fillAlignmentMatrix(UChar const* a, size_t const lenA, UChar const* b, size_t const lenB, - vector > & M); + std::vector > & M); class StringDiff { @@ -67,21 +63,21 @@ namespace stringdist }; private: UnicodeString a,b; - vector difflist; - vector diffcnt; + std::vector difflist; + std::vector diffcnt; public: - UnicodeString const& set_a(string const& a); - UnicodeString const& set_b(string const& b); + UnicodeString const& set_a(std::string const& a); + UnicodeString const& set_b(std::string const& b); UnicodeString const& get_a() const; UnicodeString const& get_b() const; - StringDiff(string const& a, string const& b); + StringDiff(std::string const& a, std::string const& b); StringDiff(); size_t size(); size_t align(bool force=false); // returns the levenshtein distance void showDiff(std::ostream& out); float levenshtein(); Segment const& operator[](uint32_t i) const; - void fillAlignmentMatrix(vector > & M) const; + void fillAlignmentMatrix(std::vector > & M) const; vector const& getFeatures() const; }; } diff --git a/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h b/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h index e5e9ca88c..c6abd6779 100644 --- a/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h +++ b/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h @@ -24,8 +24,6 @@ #include "ug_corpus_token.h" #include "tpt_pickler.h" -using namespace ugdiss; -using namespace std; namespace Moses { typedef L2R_Token Token; @@ -44,7 +42,7 @@ namespace Moses { class job; class worker; list joblist; - vector > workers; + std::vector > workers; bool shutdown; size_t doomed; public: @@ -52,10 +50,10 @@ namespace Moses { agenda(bitext_base const& bitext); ~agenda(); void add_workers(int n); - sptr add_job(mmbitext::iter const& phrase, + SPTR add_job(mmbitext::iter const& phrase, size_t const max_samples); bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len, - bool & fwd, sptr & stats); + bool & fwd, SPTR & stats); }; // stores the list of unfinished jobs; @@ -76,19 +74,19 @@ namespace Moses { find_trg_phr_bounds (size_t const sid, size_t const start, size_t const stop, size_t & s1, size_t & s2, size_t & e1, size_t & e2, - vector * core_alignment, bool const flip) const; + std::vector * core_alignment, bool const flip) const; - boost::unordered_map > cache1,cache2; + boost::unordered_map > cache1,cache2; private: - sptr + SPTR prep2(iter const& phrase); public: mmbitext(); ~mmbitext(); - void open(string const base, string const L1, string const L2); + void open(std::string const base, std::string const L1, std::string const L2); - sptr lookup(iter const& phrase); + SPTR lookup(iter const& phrase); 
void prep(iter const& phrase); }; @@ -99,15 +97,15 @@ namespace Moses { { uint32_t my_rcnt; // unweighted count float my_wcnt; // weighted count - vector > > my_aln; + std::vector > > my_aln; boost::mutex lock; public: jstats(); jstats(jstats const& other); uint32_t rcnt() const; float wcnt() const; - vector > > const & aln() const; - void add(float w, vector const& a); + std::vector > > const & aln() const; + void add(float w, std::vector const& a); }; struct @@ -126,11 +124,12 @@ namespace Moses { size_t in_progress; // keeps track of how many threads are currently working on this boost::unordered_map trg; pstats(); - // vector nbest; + // std::vector nbest; // void select_nbest(size_t const N=10); void release(); void register_worker(); - void add(mmbitext::iter const& trg_phrase, float const w, vector const& a); + void add(mmbitext::iter const& trg_phrase, float const w, + std::vector const& a); }; class @@ -157,7 +156,7 @@ namespace Moses { size_t ctr; size_t len; bool fwd; - sptr stats; + SPTR stats; bool step(uint64_t & sid, uint64_t & offset); }; diff --git a/moses/TranslationModel/UG/mm/tpt_tightindex.h b/moses/TranslationModel/UG/mm/tpt_tightindex.h index 510706879..3511562f6 100644 --- a/moses/TranslationModel/UG/mm/tpt_tightindex.h +++ b/moses/TranslationModel/UG/mm/tpt_tightindex.h @@ -10,9 +10,7 @@ #include #include #include "tpt_typedefs.h" -// #include #include -// // using namespace std; #ifndef uchar #endif diff --git a/moses/TranslationModel/UG/mm/tpt_tokenindex.h b/moses/TranslationModel/UG/mm/tpt_tokenindex.h index dac196e04..c07eae44e 100644 --- a/moses/TranslationModel/UG/mm/tpt_tokenindex.h +++ b/moses/TranslationModel/UG/mm/tpt_tokenindex.h @@ -21,7 +21,6 @@ #include #include -// // using namespace std; namespace bio=boost::iostreams; namespace sapt diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 990b7cd8a..8d29d7dbd 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -77,7 +77,7 @@ namespace sapt using Moses::ttaskwptr; using tpt::binread; using tpt::binwrite; - // using namespace ugdiss; + float lbop(size_t const tries, size_t const succ, float const confidence); void write_bitvector(bitvector const& v, std::ostream& out); diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h index 52d52fc7f..7312ecef4 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h @@ -130,7 +130,6 @@ int Bitext::agenda::job if (!m_bias) return 1; - // // using namespace boost::math; typedef boost::math::binomial_distribution<> binomial; std::ostream* log = m_bias->loglevel > 1 ? m_bias->log : NULL; @@ -138,7 +137,6 @@ int Bitext::agenda::job float p = (*m_bias)[sid]; id_type docid = m_bias->GetClass(sid); - // uint32_t k = docid < stats->indoc.size() ? stats->indoc[docid] : 0; typedef pstats::indoc_map_t::const_iterator id_iter; id_iter m = stats->indoc.find(docid); uint32_t k = m != stats->indoc.end() ? 
m->second : 0 ; diff --git a/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h b/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h index e63b3b345..3b98c39ca 100644 --- a/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h +++ b/moses/TranslationModel/UG/mm/ug_conll_bottom_up_token.h @@ -7,7 +7,6 @@ #include "ug_typedefs.h" namespace sapt { - // using namespace std; template class ConllBottomUpToken : public T diff --git a/moses/TranslationModel/UG/mm/ug_conll_record.h b/moses/TranslationModel/UG/mm/ug_conll_record.h index b77ccb981..4d263842b 100644 --- a/moses/TranslationModel/UG/mm/ug_conll_record.h +++ b/moses/TranslationModel/UG/mm/ug_conll_record.h @@ -7,7 +7,6 @@ namespace sapt { - // using namespace std; using tpt::id_type; using tpt::uchar; diff --git a/moses/TranslationModel/UG/mm/ug_deptree.h b/moses/TranslationModel/UG/mm/ug_deptree.h index df096b52b..fb17aeafb 100644 --- a/moses/TranslationModel/UG/mm/ug_deptree.h +++ b/moses/TranslationModel/UG/mm/ug_deptree.h @@ -15,7 +15,6 @@ #include "ug_conll_bottom_up_token.h" #include "ug_typedefs.h" -// using namespace std; namespace sapt { diff --git a/moses/TranslationModel/UG/mm/ug_im_tsa.h b/moses/TranslationModel/UG/mm/ug_im_tsa.h index 87cd63db6..33c61afc0 100644 --- a/moses/TranslationModel/UG/mm/ug_im_tsa.h +++ b/moses/TranslationModel/UG/mm/ug_im_tsa.h @@ -24,13 +24,8 @@ namespace sapt { - // using namespace std; - // using namespace boost; namespace bio=boost::iostreams; - // template class imBitext; - - template class TsaSorter { @@ -44,8 +39,8 @@ namespace sapt public: TsaSorter(SORTER sorter, iter& begin, iter& end) : m_sorter(sorter), - m_begin(begin), - m_end(end) { } + m_begin(begin), + m_end(end) { } bool operator()() @@ -62,7 +57,7 @@ namespace sapt class imTSA : public TSA { typedef typename Ttrack::Position cpos; - // friend class imBitext; + public: class tree_iterator; friend class tree_iterator; diff --git a/moses/TranslationModel/UG/mm/ug_im_ttrack.h b/moses/TranslationModel/UG/mm/ug_im_ttrack.h index 539831ceb..515e8a3df 100644 --- a/moses/TranslationModel/UG/mm/ug_im_ttrack.h +++ b/moses/TranslationModel/UG/mm/ug_im_ttrack.h @@ -19,8 +19,6 @@ #include "util/exception.hh" #include "moses/Util.h" -// #include "ug_vocab.h" - // define the corpus buffer size (in sentences) and the // for adding additional sentences: #define IMTTRACK_INCREMENT_SIZE 100000 diff --git a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h index 2b83c9f4e..ba389b072 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h +++ b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer1.h @@ -11,7 +11,6 @@ #include #include "tpt_pickler.h" -// using namespace std; namespace ugdiss { diff --git a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h index 3ec0af454..5d7d5794f 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h +++ b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h @@ -14,7 +14,7 @@ #include "tpt_pickler.h" #include "ug_mm_2d_table.h" #include "util/exception.hh" -// using namespace std; + namespace sapt { diff --git a/moses/TranslationModel/UG/mm/ug_lru_cache.h b/moses/TranslationModel/UG/mm/ug_lru_cache.h index 6a75ee77d..3282259aa 100644 --- a/moses/TranslationModel/UG/mm/ug_lru_cache.h +++ b/moses/TranslationModel/UG/mm/ug_lru_cache.h @@ -14,8 +14,6 @@ namespace lru_cache { - // using namespace std; - // using namespace boost; template class 
LRU_Cache diff --git a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h index f8d9b9b20..26e4a8e66 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_2d_table.h +++ b/moses/TranslationModel/UG/mm/ug_mm_2d_table.h @@ -13,7 +13,6 @@ namespace bio=boost::iostreams; namespace sapt { - // using namespace std; template class mm2dTable diff --git a/moses/TranslationModel/UG/mm/ug_mm_tsa.h b/moses/TranslationModel/UG/mm/ug_mm_tsa.h index 591e7c59c..75e4d7917 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_tsa.h +++ b/moses/TranslationModel/UG/mm/ug_mm_tsa.h @@ -19,7 +19,6 @@ namespace sapt { - // using namespace std; namespace bio=boost::iostreams; template diff --git a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h index 4e0848d6a..7c4d74c16 100644 --- a/moses/TranslationModel/UG/mm/ug_mm_ttrack.h +++ b/moses/TranslationModel/UG/mm/ug_mm_ttrack.h @@ -26,7 +26,6 @@ namespace sapt { - // using namespace std; namespace bio=boost::iostreams; template diff --git a/moses/TranslationModel/UG/mm/ug_tsa_base.h b/moses/TranslationModel/UG/mm/ug_tsa_base.h index 4f72c75ba..188b4bafa 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_base.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_base.h @@ -21,8 +21,6 @@ namespace sapt { - // using namespace std; - // using namespace boost; namespace bio=boost::iostreams; template diff --git a/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h b/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h index 486302c19..0ea2af85d 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_bitset_cache.h @@ -18,7 +18,7 @@ namespace sapt { - // using namespace std; + template class BitSetCache diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_base.h b/moses/TranslationModel/UG/mm/ug_ttrack_base.h index b3275b1ea..b448f348d 100644 --- a/moses/TranslationModel/UG/mm/ug_ttrack_base.h +++ b/moses/TranslationModel/UG/mm/ug_ttrack_base.h @@ -20,11 +20,9 @@ #include "tpt_typedefs.h" #include "tpt_tokenindex.h" #include "moses/Util.h" -// #include "ug_vocab.h" namespace sapt { - // using namespace std; typedef boost::dynamic_bitset bdBitset; using tpt::count_type; diff --git a/moses/TranslationModel/UG/mm/ug_typedefs.h b/moses/TranslationModel/UG/mm/ug_typedefs.h index c726d94ec..bba414273 100644 --- a/moses/TranslationModel/UG/mm/ug_typedefs.h +++ b/moses/TranslationModel/UG/mm/ug_typedefs.h @@ -10,7 +10,6 @@ #include "tpt_typedefs.h" namespace sapt { - // using namespace std; typedef boost::dynamic_bitset bitvector; typedef std::vector > flt_2d_table; diff --git a/moses/TranslationModel/UG/sapt_phrase_key.h b/moses/TranslationModel/UG/sapt_phrase_key.h deleted file mode 100644 index 0caf11e43..000000000 --- a/moses/TranslationModel/UG/sapt_phrase_key.h +++ /dev/null @@ -1,13 +0,0 @@ -//-*- c++ -*- -#pragma once -#include - -using namespace std; -namespace sapt -{ - using namespace Moses; - using namespace std; - - - -} diff --git a/moses/TranslationModel/WordCoocTable.h b/moses/TranslationModel/WordCoocTable.h index 60d788bd3..c193737cb 100644 --- a/moses/TranslationModel/WordCoocTable.h +++ b/moses/TranslationModel/WordCoocTable.h @@ -13,8 +13,6 @@ namespace Moses { -using namespace std; - #ifndef bitvector typedef boost::dynamic_bitset bitvector; #endif @@ -27,9 +25,9 @@ typedef boost::dynamic_bitset bitvector; class WordCoocTable { typedef map my_map_t; - vector m_cooc; - vector m_marg1; - vector m_marg2; + std::vector m_cooc; + std::vector m_marg1; + 
std::vector m_marg2; public: WordCoocTable(); WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2); From 0e2dc5636077a00621be7ed3833bea19a5cdf6bf Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 10 Aug 2015 11:03:31 +0100 Subject: [PATCH 256/286] Namespace cleanup --- moses/FF/Factory.cpp | 6 +- .../FF/SkeletonTranslationOptionListFeature.h | 2 +- .../BilingualDynSuffixArray.h | 70 +++++++++---------- .../program_options/ug_splice_arglist.cc | 4 +- .../UG/generic/sorting/NBestList.h | 6 +- moses/TranslationModel/WordCoocTable.h | 2 +- 6 files changed, 45 insertions(+), 45 deletions(-) diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index 0042651bf..28af7bd11 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -353,7 +353,7 @@ void FeatureRegistry::Construct(const std::string &name, const std::string &line void FeatureRegistry::PrintFF() const { - vector ffs; + std::vector ffs; std::cerr << "Available feature functions:" << std::endl; Map::const_iterator iter; for (iter = registry_.begin(); iter != registry_.end(); ++iter) { @@ -361,10 +361,10 @@ void FeatureRegistry::PrintFF() const ffs.push_back(ffName); } - vector::const_iterator iterVec; + std::vector::const_iterator iterVec; std::sort(ffs.begin(), ffs.end()); for (iterVec = ffs.begin(); iterVec != ffs.end(); ++iterVec) { - const string &ffName = *iterVec; + const std::string &ffName = *iterVec; std::cerr << ffName << " "; } diff --git a/moses/FF/SkeletonTranslationOptionListFeature.h b/moses/FF/SkeletonTranslationOptionListFeature.h index 1d88717e1..e47e691aa 100644 --- a/moses/FF/SkeletonTranslationOptionListFeature.h +++ b/moses/FF/SkeletonTranslationOptionListFeature.h @@ -34,7 +34,7 @@ public: void EvaluateTranslationOptionListWithSourceContext(const InputType &input , const TranslationOptionList &translationOptionList) const { - vector newScores(m_numScoreComponents); + std::vector newScores(m_numScoreComponents); newScores[0] = translationOptionList.size(); TranslationOptionList::const_iterator iterTransOpt; diff --git a/moses/TranslationModel/BilingualDynSuffixArray.h b/moses/TranslationModel/BilingualDynSuffixArray.h index b9d52d443..bf00b6c71 100644 --- a/moses/TranslationModel/BilingualDynSuffixArray.h +++ b/moses/TranslationModel/BilingualDynSuffixArray.h @@ -22,7 +22,7 @@ class PhraseDictionaryDynSuffixArray; class SAPhrase { public: - vector words; + std::vector words; SAPhrase(size_t phraseSize) :words(phraseSize) { @@ -66,18 +66,18 @@ class SentenceAlignment public: SentenceAlignment(int sntIndex, int sourceSize, int targetSize); int m_sntIndex; - vector* trgSnt; - vector* srcSnt; - vector numberAligned; - vector< vector > alignedList; - bool Extract(int maxPhraseLength, vector &ret, + std::vector* trgSnt; + std::vector* srcSnt; + std::vector numberAligned; + std::vector< std::vector > alignedList; + bool Extract(int maxPhraseLength, std::vector &ret, int startSource, int endSource) const; }; class ScoresComp { public: - ScoresComp(const vector& weights) { + ScoresComp(const std::vector& weights) { } bool operator()(const Scores& s1, const Scores& s2) const { return s1[0] < s2[0]; // just p(e|f) as approximation @@ -98,10 +98,10 @@ public: struct BetterPhrase { ScoresComp const& cmp; BetterPhrase(ScoresComp const& sc); - // bool operator()(pair const& a, - // pair const& b) const; - bool operator()(pair const& a, - pair const& b) const; + // bool operator()(std::pair const& a, + // std::pair const& b) const; + bool operator()(std::pair const& a, + std::pair const& b) const; }; 
/** @todo ask Abbey Levenberg @@ -111,20 +111,20 @@ class BilingualDynSuffixArray public: BilingualDynSuffixArray(); ~BilingualDynSuffixArray(); - bool Load( const vector& inputFactors, - const vector& outputTactors, + bool Load( const std::vector& inputFactors, + const std::vector& outputTactors, string source, string target, string alignments, - const vector &weight); - // bool LoadTM( const vector& inputFactors, - // const vector& outputTactors, + const std::vector &weight); + // bool LoadTM( const std::vector& inputFactors, + // const std::vector& outputTactors, // string source, string target, string alignments, - // const vector &weight); - void GetTargetPhrasesByLexicalWeight(const Phrase& src, vector< pair >& target) const; + // const std::vector &weight); + void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair >& target) const; void CleanUp(const InputType& source); void addSntPair(string& source, string& target, string& alignment); - pair - GatherCands(Phrase const& src, map >& pstats) const; + std::pair + GatherCands(Phrase const& src, std::map >& pstats) const; TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase, const PhraseDictionary *pt) const; @@ -135,42 +135,42 @@ private: mutable WordCoocTable m_wrd_cooc; DynSuffixArray * m_srcSA; DynSuffixArray * m_trgSA; - vector* m_srcCorpus; - vector* m_trgCorpus; - vector m_inputFactors; - vector m_outputFactors; + std::vector* m_srcCorpus; + std::vector* m_trgCorpus; + std::vector m_inputFactors; + std::vector m_outputFactors; - vector m_srcSntBreaks, m_trgSntBreaks; + std::vector m_srcSntBreaks, m_trgSntBreaks; Vocab* m_srcVocab, *m_trgVocab; ScoresComp* m_scoreCmp; - vector m_alignments; - vector > m_rawAlignments; + std::vector m_alignments; + std::vector > m_rawAlignments; - mutable map, pair > m_wordPairCache; - mutable set m_freqWordsCached; + mutable std::map, std::pair > m_wordPairCache; + mutable std::set m_freqWordsCached; const size_t m_maxPhraseLength, m_maxSampleSize; const size_t m_maxPTEntries; int LoadCorpus(FactorDirection direction, - InputFileStream&, const vector& factors, - vector&, vector&, + InputFileStream&, const std::vector& factors, + std::vector&, std::vector&, Vocab*); int LoadAlignments(InputFileStream& aligs); int LoadRawAlignments(InputFileStream& aligs); int LoadRawAlignments(string& aligs); - bool ExtractPhrases(const int&, const int&, const int&, vector&, bool=false) const; + bool ExtractPhrases(const int&, const int&, const int&, std::vector&, bool=false) const; SentenceAlignment GetSentenceAlignment(const int, bool=false) const; - int SampleSelection(vector&, int = 300) const; + int SampleSelection(std::vector&, int = 300) const; - vector GetSntIndexes(vector&, int, const vector&) const; + std::vector GetSntIndexes(std::vector&, int, const std::vector&) const; SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const; bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const; void CacheWordProbs(wordID_t) const; void CacheFreqWords() const; void ClearWordInCache(wordID_t); - pair GetLexicalWeight(const PhrasePair&) const; + std::pair GetLexicalWeight(const PhrasePair&) const; int GetSourceSentenceSize(size_t sentenceId) const; int GetTargetSentenceSize(size_t sentenceId) const; diff --git a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc index f30d91acc..54743b7a8 100644 --- a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc +++ 
b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc @@ -10,7 +10,7 @@ namespace Moses { filter_arguments(int const argc_in, char const* const* const argv_in, int & argc_moses, char*** argv_moses, int & argc_other, char*** argv_other, - vector > const& filter) + std::vector > const& filter) { *argv_moses = new char*[argc_in]; *argv_other = new char*[argc_in]; @@ -18,7 +18,7 @@ namespace Moses { strcpy((*argv_moses)[0], argv_in[0]); argc_moses = 1; argc_other = 0; - typedef pair option; + typedef std::pair option; int i = 1; while (i < argc_in) { diff --git a/moses/TranslationModel/UG/generic/sorting/NBestList.h b/moses/TranslationModel/UG/generic/sorting/NBestList.h index 12fd57900..d7fa4b881 100644 --- a/moses/TranslationModel/UG/generic/sorting/NBestList.h +++ b/moses/TranslationModel/UG/generic/sorting/NBestList.h @@ -17,10 +17,10 @@ namespace Moses class NBestList { - vector m_heap; - vector m_list; + std::vector m_heap; + std::vector m_list; VectorIndexSorter m_better; - mutable vector m_order; + mutable std::vector m_order; mutable bool m_changed; public: NBestList(size_t const max_size, CMP const& cmp); diff --git a/moses/TranslationModel/WordCoocTable.h b/moses/TranslationModel/WordCoocTable.h index c193737cb..c76691a16 100644 --- a/moses/TranslationModel/WordCoocTable.h +++ b/moses/TranslationModel/WordCoocTable.h @@ -24,7 +24,7 @@ typedef boost::dynamic_bitset bitvector; */ class WordCoocTable { - typedef map my_map_t; + typedef std::map my_map_t; std::vector m_cooc; std::vector m_marg1; std::vector m_marg2; From 67ba05521370625240fc1ee2a18c40c048cf4c1f Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 10 Aug 2015 11:04:45 +0100 Subject: [PATCH 257/286] Exclude code from compilation when compiling without xmlrpc-c. --- moses/server/Server.cpp | 4 ++++ moses/server/Server.h | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/moses/server/Server.cpp b/moses/server/Server.cpp index 2eee00c52..8f8c7d6a9 100644 --- a/moses/server/Server.cpp +++ b/moses/server/Server.cpp @@ -5,6 +5,7 @@ namespace MosesServer { Server:: Server(Moses::Parameter& params) +#ifdef HAVE_XMLRPC_C : m_server_options(params), m_updater(new Updater), m_optimizer(new Optimizer), @@ -16,6 +17,9 @@ namespace MosesServer m_registry.addMethod("optimize", m_optimizer); m_registry.addMethod("close_session", m_close_session); } +#else + { } +#endif int Server:: diff --git a/moses/server/Server.h b/moses/server/Server.h index 72cfa5640..6614d0b2b 100644 --- a/moses/server/Server.h +++ b/moses/server/Server.h @@ -9,8 +9,8 @@ #include "Updater.h" #include "CloseSession.h" #include "Session.h" -#include "moses/parameters/ServerOptions.h" #endif +#include "moses/parameters/ServerOptions.h" namespace MosesServer { @@ -18,12 +18,13 @@ namespace MosesServer { Moses::ServerOptions m_server_options; SessionCache m_session_cache; +#ifdef HAVE_XMLRPC_C xmlrpc_c::registry m_registry; xmlrpc_c::methodPtr const m_updater; xmlrpc_c::methodPtr const m_optimizer; xmlrpc_c::methodPtr const m_translator; xmlrpc_c::methodPtr const m_close_session; - +#endif public: Server(Moses::Parameter& params); From 1dcd077806aa95ec1e4cbad980af7b04fca40644 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 10 Aug 2015 15:14:44 +0100 Subject: [PATCH 258/286] More namespace fixes. 
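Same theme as the preceding cleanup commits: each use of a Boost.Iostreams type is now written out as boost::iostreams::..., instead of relying on a using-directive or the bio alias picked up from a header. A minimal compilable sketch of the pattern; the commented-out call is the Moses helper touched in the diff, the file name is made up:

    #include <boost/iostreams/filtering_stream.hpp>

    int main() {
      // fully qualified, so this .cc no longer depends on a
      // "namespace bio = boost::iostreams;" alias defined elsewhere
      boost::iostreams::filtering_istream in;
      // open_input_stream("corpus.en.txt.gz", in);   // Moses helper, shown for context
      return 0;
    }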
--- moses/TranslationModel/UG/mm/calc-coverage.cc | 3 ++- moses/TranslationModel/UG/mm/symal2mam.cc | 7 ++++--- moses/TranslationModel/UG/mmsapt.cpp | 4 ++-- moses/TranslationModel/UG/mmsapt.h | 6 +++--- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/moses/TranslationModel/UG/mm/calc-coverage.cc b/moses/TranslationModel/UG/mm/calc-coverage.cc index 43ce60fde..098659337 100644 --- a/moses/TranslationModel/UG/mm/calc-coverage.cc +++ b/moses/TranslationModel/UG/mm/calc-coverage.cc @@ -12,6 +12,7 @@ // using namespace Moses; using namespace ugdiss; using namespace sapt; +using namespace std; typedef L2R_Token Token; TokenIndex V; @@ -19,7 +20,7 @@ SPTR > > C(new vector >()); void add_file(string fname) { - filtering_istream in; + boost::iostreams::filtering_istream in; open_input_stream(fname,in); string line; while (getline(in,line)) diff --git a/moses/TranslationModel/UG/mm/symal2mam.cc b/moses/TranslationModel/UG/mm/symal2mam.cc index a3ae87fb7..0ed10e7ac 100644 --- a/moses/TranslationModel/UG/mm/symal2mam.cc +++ b/moses/TranslationModel/UG/mm/symal2mam.cc @@ -198,13 +198,14 @@ go(string t1name, string t2name, string A3filename) { typedef mmTtrack track_t; track_t T1(t1name),T2(t2name); - filtering_istream A3file; open_input_stream(A3filename,A3file); + boost::iostreams::filtering_istream A3file; + open_input_stream(A3filename, A3file); string line; int check1=-1,check2=-1; - vector idx1(1,0),idx2(1,0),idxm(1,mam.tellp()); + vector idx1(1,0),idx2(1,0),idxm(1, mam.tellp()); size_t tokenCount1=0,tokenCount2=0; size_t skipCtr=0,lineCtr=0; - if (!getCheckValues(A3file,check1,check2)) + if (!getCheckValues(A3file, check1, check2)) UTIL_THROW(util::Exception, "Mismatch in input files!"); for (sid = 0; sid < T1.size(); ++sid) diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 4dd0a6561..c0cc2ab7f 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -222,7 +222,7 @@ namespace Moses m_bias_log = &std::cout; else { - m_bias_logger.reset(new ofstream(m_bias_logfile.c_str())); + m_bias_logger.reset(new std::ofstream(m_bias_logfile.c_str())); m_bias_log = m_bias_logger.get(); } } @@ -319,7 +319,7 @@ namespace Moses // - sane word alignment? 
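The recurring edit is to include moses/ThreadPool.h (and with it Boost's thread headers) only when threading is enabled, instead of unconditionally, so that builds without WITH_THREADS no longer pull in the parts of newer Boost that were causing trouble. The guard as it appears in the touched headers:

    #ifndef WITH_THREADS
    #pragma message("COMPILING WITHOUT THREADS!")
    #else
    #include "moses/ThreadPool.h"
    #endif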
vector text1,text2,symal; string line; - filtering_istream in1,in2,ina; + boost::iostreams::filtering_istream in1,in2,ina; open_input_stream(bname+L1+".txt.gz",in1); open_input_stream(bname+L2+".txt.gz",in2); diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index 7139e4fd2..d8ebd06ea 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -78,11 +78,11 @@ namespace Moses size_t m_workers; // number of worker threads for sampling the bitexts std::vector m_feature_set_names; // one or more of: standard, datasource std::string m_bias_logfile; - boost::scoped_ptr m_bias_logger; // for logging to a file - ostream* m_bias_log; + boost::scoped_ptr m_bias_logger; // for logging to a file + std::ostream* m_bias_log; int m_bias_loglevel; LexicalReordering* m_lr_func; // associated lexical reordering function - string m_lr_func_name; // name of associated lexical reordering function + std::string m_lr_func_name; // name of associated lexical reordering function sapt::sampling_method m_sampling_method; // sampling method, see ug_bitext_sampler boost::scoped_ptr m_thread_pool; public: From 64af8123c1f810719543e30f756cc2c4a7f2e915 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 10 Aug 2015 15:40:04 +0100 Subject: [PATCH 259/286] Trying to get things to compile with newer versions of boost. --- moses/server/CloseSession.cpp | 2 +- moses/server/CloseSession.h | 4 ++++ moses/server/Server.h | 5 +++++ moses/server/Translator.h | 3 ++- 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/moses/server/CloseSession.cpp b/moses/server/CloseSession.cpp index 5cc6175d3..d87cf0d5c 100644 --- a/moses/server/CloseSession.cpp +++ b/moses/server/CloseSession.cpp @@ -1,7 +1,7 @@ // -*- mode: c++; indent-tabs-mode: nil; tab-width: -*- #include "CloseSession.h" +#include "TranslationRequest.h" #include "Server.h" -#include "moses/StaticData.h" namespace MosesServer { diff --git a/moses/server/CloseSession.h b/moses/server/CloseSession.h index 833781a68..8ce189df4 100644 --- a/moses/server/CloseSession.h +++ b/moses/server/CloseSession.h @@ -1,10 +1,14 @@ // -*- mode: c++; indent-tabs-mode: nil; tab-width: -*- #pragma once +#include "moses/parameters/ServerOptions.h" +#include "Session.h" #include #include #include #ifndef WITH_THREADS #pragma message("COMPILING WITHOUT THREADS!") +#else +#include "moses/ThreadPool.h" #endif namespace MosesServer { diff --git a/moses/server/Server.h b/moses/server/Server.h index 6614d0b2b..81fbf2953 100644 --- a/moses/server/Server.h +++ b/moses/server/Server.h @@ -1,6 +1,11 @@ // -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*- #pragma once #ifdef HAVE_XMLRPC_C +#include "moses/TypeDef.h" +#ifdef WITH_THREADS +#include +#include "moses/ThreadPool.h" +#endif #include #include #include diff --git a/moses/server/Translator.h b/moses/server/Translator.h index 8137f59bd..e26968ddd 100644 --- a/moses/server/Translator.h +++ b/moses/server/Translator.h @@ -1,7 +1,6 @@ // -*- c++ -*- #pragma once -#include "moses/ThreadPool.h" #include "moses/parameters/ServerOptions.h" #include "Session.h" #include @@ -9,6 +8,8 @@ #include #ifndef WITH_THREADS #pragma message("COMPILING WITHOUT THREADS!") +#else +#include "moses/ThreadPool.h" #endif namespace MosesServer { From 0426e95e554f06a5cde8538d5eae7a1908a57cd4 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 10 Aug 2015 16:07:32 +0100 Subject: [PATCH 260/286] Fixed typo. 
--- moses/parameters/SearchOptions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/parameters/SearchOptions.cpp b/moses/parameters/SearchOptions.cpp index b75b0f15b..6636ba744 100644 --- a/moses/parameters/SearchOptions.cpp +++ b/moses/parameters/SearchOptions.cpp @@ -54,7 +54,7 @@ namespace Moses { typedef std::map params_t; - params_t::const_iterator si = params.find("search-algoritm"); + params_t::const_iterator si = params.find("search-algorithm"); if (si != params.end()) { // use named parameters From 4a3363479e256f2373990a10cbac65937c46dc14 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 11 Aug 2015 12:44:42 +0400 Subject: [PATCH 261/286] remove namespace pollution from old dynamic suffix array and randlm --- contrib/other-builds/OnDiskPt/.cproject | 8 +- contrib/other-builds/moses/.cproject | 4 +- contrib/server/mosesserver.cpp | 2 +- moses/FF/Factory.cpp | 4 +- moses/LM/ORLM.cpp | 3 +- moses/LM/ORLM.h | 2 +- .../BilingualDynSuffixArray.h | 6 +- .../DynSAInclude/RandLMFilter.h | 30 +++--- moses/TranslationModel/DynSAInclude/hash.h | 27 +++-- .../TranslationModel/DynSAInclude/onlineRLM.h | 90 ++++++++-------- .../TranslationModel/DynSAInclude/params.cpp | 4 +- .../DynSAInclude/perfectHash.h | 100 +++++++++--------- .../TranslationModel/DynSAInclude/quantizer.h | 6 +- moses/TranslationModel/DynSAInclude/types.h | 5 - moses/TranslationModel/DynSAInclude/vocab.cpp | 2 +- moses/TranslationModel/DynSAInclude/vocab.h | 4 +- .../PhraseDictionaryDynSuffixArray.h | 2 +- 17 files changed, 147 insertions(+), 152 deletions(-) diff --git a/contrib/other-builds/OnDiskPt/.cproject b/contrib/other-builds/OnDiskPt/.cproject index e32a5baea..f551380fd 100644 --- a/contrib/other-builds/OnDiskPt/.cproject +++ b/contrib/other-builds/OnDiskPt/.cproject @@ -11,12 +11,12 @@ + + - - @@ -72,13 +72,13 @@ + + - - diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject index 2fd2601c6..960a13947 100644 --- a/contrib/other-builds/moses/.cproject +++ b/contrib/other-builds/moses/.cproject @@ -11,11 +11,11 @@ + - @@ -79,12 +79,12 @@ + - diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp index f6b2deaa1..2ccfa7157 100644 --- a/contrib/server/mosesserver.cpp +++ b/contrib/server/mosesserver.cpp @@ -58,8 +58,8 @@ int main(int argc, char** argv) #include // using namespace Moses; -using Moses::TreeInput; using namespace std; +using namespace Moses; typedef std::map params_t; diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index 28af7bd11..297c17798 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -152,7 +152,7 @@ FeatureFactory ::DefaultSetup(F *feature) { StaticData &static_data = StaticData::InstanceNonConst(); - const string &featureName = feature->GetScoreProducerDescription(); + const std::string &featureName = feature->GetScoreProducerDescription(); std::vector weights = static_data.GetParameter()->GetWeights(featureName); @@ -357,7 +357,7 @@ void FeatureRegistry::PrintFF() const std::cerr << "Available feature functions:" << std::endl; Map::const_iterator iter; for (iter = registry_.begin(); iter != registry_.end(); ++iter) { - const string &ffName = iter->first; + const std::string &ffName = iter->first; ffs.push_back(ffName); } diff --git a/moses/LM/ORLM.cpp b/moses/LM/ORLM.cpp index 44fd64efb..9632fd6ab 100644 --- a/moses/LM/ORLM.cpp +++ b/moses/LM/ORLM.cpp @@ -8,7 +8,8 @@ #include "moses/StaticData.h" #include "ORLM.h" -using std::map; +using namespace std; + namespace Moses { bool 
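The headers under DynSAInclude exported using-directives (e.g. a file-scope using namespace Moses in hash.h), so every includer had the whole namespace dumped into its scope; this commit qualifies each use in the headers and leaves using-directives to the .cpp files, where they stay local to one translation unit. A toy contrast; FileHandler is the real Moses class, the other names are illustrative:

    // widely included header -- qualify, never "use":
    namespace Moses { class FileHandler; }              // forward declaration
    bool save_filter(Moses::FileHandler* out);          // illustrative signature

    // some_file.cpp -- a directive here cannot leak beyond this .cpp:
    //   using namespace Moses;
    //   using namespace std;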
LanguageModelORLM::Load(const std::string &filePath, FactorType factorType, diff --git a/moses/LM/ORLM.h b/moses/LM/ORLM.h index dd2a675d8..1ce282ded 100644 --- a/moses/LM/ORLM.h +++ b/moses/LM/ORLM.h @@ -39,7 +39,7 @@ public: m_lm->clearCache(); // clear caches } - bool UpdateORLM(const std::vector& ngram, const int value); + bool UpdateORLM(const std::vector& ngram, const int value); protected: OnlineRLM* m_lm; //MultiOnlineRLM* m_lm; diff --git a/moses/TranslationModel/BilingualDynSuffixArray.h b/moses/TranslationModel/BilingualDynSuffixArray.h index bf00b6c71..ff0c0594d 100644 --- a/moses/TranslationModel/BilingualDynSuffixArray.h +++ b/moses/TranslationModel/BilingualDynSuffixArray.h @@ -113,7 +113,7 @@ public: ~BilingualDynSuffixArray(); bool Load( const std::vector& inputFactors, const std::vector& outputTactors, - string source, string target, string alignments, + std::string source, std::string target, std::string alignments, const std::vector &weight); // bool LoadTM( const std::vector& inputFactors, // const std::vector& outputTactors, @@ -122,7 +122,7 @@ public: void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair >& target) const; void CleanUp(const InputType& source); - void addSntPair(string& source, string& target, string& alignment); + void addSntPair(std::string& source, std::string& target, std::string& alignment); std::pair GatherCands(Phrase const& src, std::map >& pstats) const; @@ -158,7 +158,7 @@ private: Vocab*); int LoadAlignments(InputFileStream& aligs); int LoadRawAlignments(InputFileStream& aligs); - int LoadRawAlignments(string& aligs); + int LoadRawAlignments(std::string& aligs); bool ExtractPhrases(const int&, const int&, const int&, std::vector&, bool=false) const; SentenceAlignment GetSentenceAlignment(const int, bool=false) const; diff --git a/moses/TranslationModel/DynSAInclude/RandLMFilter.h b/moses/TranslationModel/DynSAInclude/RandLMFilter.h index 19566ff40..edb94e183 100644 --- a/moses/TranslationModel/DynSAInclude/RandLMFilter.h +++ b/moses/TranslationModel/DynSAInclude/RandLMFilter.h @@ -61,7 +61,7 @@ public: // mask for bits that make up the address address_mask_ = full_mask_ >> first_bit_; } - Filter(FileHandler* fin, bool loaddata = true) : data_(NULL) { + Filter(Moses::FileHandler* fin, bool loaddata = true) : data_(NULL) { assert(loadHeader(fin)); if (loaddata) assert(loadData(fin)); @@ -222,7 +222,7 @@ public: uint32_t getCells() { return cells_; } - virtual bool save(FileHandler* out) { + virtual bool save(Moses::FileHandler* out) { assert(out != NULL); assert(out->write((char*)&cells_, sizeof(cells_))); assert(out->write((char*)&cell_width_, sizeof(cell_width_))); @@ -247,7 +247,7 @@ public: return true; } protected: - bool loadHeader(FileHandler* fin) { + bool loadHeader(Moses::FileHandler* fin) { assert(fin != NULL); assert(fin->read((char*)&cells_, sizeof(cells_))); assert(fin->read((char*)&cell_width_, sizeof(cell_width_))); @@ -260,7 +260,7 @@ protected: assert(fin->read((char*)&address_mask_, sizeof(address_mask_))); return true; } - bool loadData(FileHandler* fin) { + bool loadData(Moses::FileHandler* fin) { // instantiate underlying array data_ = new T[cells_]; assert(data_ != NULL); @@ -285,7 +285,7 @@ class BitFilter : public Filter { public: BitFilter(uint64_t bits) : Filter(bits, 1) {} - BitFilter(FileHandler* fin, bool loaddata = true) + BitFilter(Moses::FileHandler* fin, bool loaddata = true) : Filter(fin, loaddata) { if (loaddata) assert(load(fin)); @@ -305,7 +305,7 @@ public: data_[(location % 
addresses_) >> 3] &= 0 << ((location % addresses_) % 8); return true; } - bool save(FileHandler* fout) { + bool save(Moses::FileHandler* fout) { assert(Filter::save(fout)); std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;; return true; @@ -320,7 +320,7 @@ public: return static_cast((range << 3) - ones)/static_cast(range << 3); } protected: - bool load(FileHandler* fin) { + bool load(Moses::FileHandler* fin) { std::cerr << "Loaded BitFilter. Rho = " << rho() << "." << std::endl;; return true; } @@ -332,13 +332,13 @@ protected: // to fit a smaller range. class ResizedBitFilter : public BitFilter { public: - ResizedBitFilter(FileHandler* fin) : BitFilter(fin) { + ResizedBitFilter(Moses::FileHandler* fin) : BitFilter(fin) { assert(load(fin)); } - ResizedBitFilter(FileHandler* fin, uint64_t newsize) : BitFilter(newsize) { + ResizedBitFilter(Moses::FileHandler* fin, uint64_t newsize) : BitFilter(newsize) { assert(resizeFromFile(fin, newsize)); } - bool resizeFromFile(FileHandler* oldin, uint64_t newsize); + bool resizeFromFile(Moses::FileHandler* oldin, uint64_t newsize); virtual bool testBit(uint64_t location) { // test bit referenced by location return BitFilter::testBit((location % old_addresses_) * a_ + b_); @@ -347,7 +347,7 @@ protected: // set bit referenced by location return BitFilter::setBit((location % old_addresses_) * a_ + b_); } - bool save(FileHandler* fout) { + bool save(Moses::FileHandler* fout) { // re-hashing parameters assert(BitFilter::save(fout)); std::cerr << "Saved ResizedBitFilter. Rho = " << rho() << "." << std::endl; @@ -356,7 +356,7 @@ protected: return fout->write((char*)&b_, sizeof(b_)); } protected: - bool load(FileHandler* fin) { + bool load(Moses::FileHandler* fin) { // re-hashing parameters std::cerr << "Loaded ResizedBitFilter. Rho = " << rho() << "." << std::endl; CHECK(fin->read((char*)&old_addresses_, sizeof(old_addresses_))); @@ -376,7 +376,7 @@ protected: public: CountingFilter(uint64_t addresses, int width, bool wrap_around = true) : Filter(addresses, width), wrap_around_(wrap_around) {} - CountingFilter(FileHandler* fin) : Filter(fin, true) { + CountingFilter(Moses::FileHandler* fin) : Filter(fin, true) { CHECK(load(fin)); } ~CountingFilter() {} @@ -404,12 +404,12 @@ protected: CHECK(this->write(address, this->address_mask_)); return false; // false to indicate that overflowed } - bool save(FileHandler* fout) { + bool save(Moses::FileHandler* fout) { CHECK(Filter::save(fout)); return fout->write((char*)&wrap_around_, sizeof(wrap_around_)); } private: - bool load(FileHandler* fin) { + bool load(Moses::FileHandler* fin) { return fin->read((char*)&wrap_around_, sizeof(wrap_around_)); } inline bool incrementSubCell(int bit, int len, T* cell) { diff --git a/moses/TranslationModel/DynSAInclude/hash.h b/moses/TranslationModel/DynSAInclude/hash.h index 4cf69bf2f..f349536b5 100644 --- a/moses/TranslationModel/DynSAInclude/hash.h +++ b/moses/TranslationModel/DynSAInclude/hash.h @@ -8,7 +8,6 @@ #include "util/exception.hh" #include "util/random.hh" -using namespace Moses; typedef uint64_t P; // largest input range is 2^64 //! @todo ask abby2 @@ -24,7 +23,7 @@ public: HashBase(float m, count_t H=1):m_((T)m), H_(H) { //cerr << "range = (0..." 
<< m_ << "]" << endl; } - HashBase(FileHandler* fin) { + HashBase(Moses::FileHandler* fin) { load(fin); } virtual ~HashBase() {} @@ -33,12 +32,12 @@ public: count_t size() { return H_; } - virtual void save(FileHandler* fout) { + virtual void save(Moses::FileHandler* fout) { UTIL_THROW_IF2(fout == 0, "Null file handle"); fout->write((char*)&m_, sizeof(m_)); fout->write((char*)&H_, sizeof(H_)); } - virtual void load(FileHandler* fin) { + virtual void load(Moses::FileHandler* fin) { UTIL_THROW_IF2(fin == 0, "Null file handle"); fin->read((char*)&m_, sizeof(m_)); fin->read((char*)&H_, sizeof(H_)); @@ -54,7 +53,7 @@ public: HashBase(m, H), pr_(pr) { initSeeds(); } - UnivHash_linear(FileHandler* fin): + UnivHash_linear(Moses::FileHandler* fin): HashBase(fin) { load(fin); } @@ -67,8 +66,8 @@ public: T hash(const wordID_t* id, const int len, count_t h); T hash(const wordID_t id, const count_t pos, const T prevValue, count_t h); - void save(FileHandler* fout); - void load(FileHandler* fin); + void save(Moses::FileHandler* fout); + void load(Moses::FileHandler* fin); private: T** a_, **b_; P pr_; @@ -92,7 +91,7 @@ public: else p_ = (P) pow(2,l); initSeeds(); } - UnivHash_noPrimes(FileHandler* fin): + UnivHash_noPrimes(Moses::FileHandler* fin): HashBase(fin) { load(fin); } @@ -102,8 +101,8 @@ public: T hash(const char* s, count_t h); T hash(const wordID_t* id, const int len, count_t h); T hash(const P x, count_t h); - void save(FileHandler* fout); - void load(FileHandler* fin); + void save(Moses::FileHandler* fout); + void load(Moses::FileHandler* fin); private: count_t d_; // l-k P p_, *a_; // real-valued input range, storage @@ -253,7 +252,7 @@ T UnivHash_noPrimes::hash(const char* s, count_t h) return value % this->m_; } template -void UnivHash_noPrimes::save(FileHandler* fout) +void UnivHash_noPrimes::save(Moses::FileHandler* fout) { HashBase::save(fout); fout->write((char*)&p_, sizeof(p_)); @@ -263,7 +262,7 @@ void UnivHash_noPrimes::save(FileHandler* fout) } } template -void UnivHash_noPrimes::load(FileHandler* fin) +void UnivHash_noPrimes::load(Moses::FileHandler* fin) { a_ = new P[this->H_]; // HashBase::load(fin) already done in constructor @@ -323,7 +322,7 @@ inline T UnivHash_linear::hash(const wordID_t id, const count_t pos, return value % this->m_; } template -void UnivHash_linear::save(FileHandler* fout) +void UnivHash_linear::save(Moses::FileHandler* fout) { // int bytes = sizeof(a_[0][0]); HashBase::save(fout); @@ -338,7 +337,7 @@ void UnivHash_linear::save(FileHandler* fout) } } template -void UnivHash_linear::load(FileHandler* fin) +void UnivHash_linear::load(Moses::FileHandler* fin) { // HashBase::load(fin) already done in constructor fin->read((char*)&pr_, sizeof(pr_)); diff --git a/moses/TranslationModel/DynSAInclude/onlineRLM.h b/moses/TranslationModel/DynSAInclude/onlineRLM.h index 050e016c9..a4dbe98f3 100644 --- a/moses/TranslationModel/DynSAInclude/onlineRLM.h +++ b/moses/TranslationModel/DynSAInclude/onlineRLM.h @@ -26,18 +26,18 @@ public: vocab_(v), bAdapting_(false), order_(order), corpusSize_(0), alpha_(0) { UTIL_THROW_IF2(vocab_ == 0, "Vocab object not set"); //instantiate quantizer class here - cache_ = new Cache(8888.8888, 9999.9999); // unknown_value, null_value + cache_ = new randlm::Cache(8888.8888, 9999.9999); // unknown_value, null_value alpha_ = new float[order_ + 1]; for(count_t i = 0; i <= order_; ++i) alpha_[i] = i * log10(0.4); - cerr << "Initialzing auxillary bit filters...\n"; - bPrefix_ = new BitFilter(this->cells_); - bHit_ = new BitFilter(this->cells_); 
+ std::cerr << "Initialzing auxillary bit filters...\n"; + bPrefix_ = new randlm::BitFilter(this->cells_); + bHit_ = new randlm::BitFilter(this->cells_); } - OnlineRLM(FileHandler* fin, count_t order): + OnlineRLM(Moses::FileHandler* fin, count_t order): PerfectHash(fin), bAdapting_(true), order_(order), corpusSize_(0) { load(fin); - cache_ = new Cache(8888.8888, 9999.9999); // unknown_value, null_value + cache_ = new randlm::Cache(8888.8888, 9999.9999); // unknown_value, null_value alpha_ = new float[order_ + 1]; for(count_t i = 0; i <= order_; ++i) alpha_[i] = i * log10(0.4); @@ -52,14 +52,14 @@ public: } float getProb(const wordID_t* ngram, int len, const void** state); //float getProb2(const wordID_t* ngram, int len, const void** state); - bool insert(const std::vector& ngram, const int value); - bool update(const std::vector& ngram, const int value); + bool insert(const std::vector& ngram, const int value); + bool update(const std::vector& ngram, const int value); int query(const wordID_t* IDs, const int len); - int sbsqQuery(const std::vector& ngram, int* len, + int sbsqQuery(const std::vector& ngram, int* len, bool bStrict = false); int sbsqQuery(const wordID_t* IDs, const int len, int* codes, bool bStrict = false); - void remove(const std::vector& ngram); + void remove(const std::vector& ngram); count_t heurDelete(count_t num2del, count_t order = 5); uint64_t corpusSize() { return corpusSize_; @@ -70,8 +70,8 @@ public: void clearCache() { if(cache_) cache_->clear(); } - void save(FileHandler* fout); - void load(FileHandler* fin); + void save(Moses::FileHandler* fout); + void load(Moses::FileHandler* fin); void randDelete(int num2del); int countHits(); int countPrefixes(); @@ -89,13 +89,13 @@ private: const count_t order_; // LM order uint64_t corpusSize_; // total training corpus size float* alpha_; // backoff constant - Cache* cache_; - BitFilter* bPrefix_; - BitFilter* bHit_; + randlm::Cache* cache_; + randlm::BitFilter* bPrefix_; + randlm::BitFilter* bHit_; }; template -bool OnlineRLM::insert(const std::vector& ngram, const int value) +bool OnlineRLM::insert(const std::vector& ngram, const int value) { int len = ngram.size(); wordID_t wrdIDs[len]; @@ -114,7 +114,7 @@ bool OnlineRLM::insert(const std::vector& ngram, const int value) } template -bool OnlineRLM::update(const std::vector& ngram, const int value) +bool OnlineRLM::update(const std::vector& ngram, const int value) { int len = ngram.size(); std::vector wrdIDs(len); @@ -160,14 +160,14 @@ template bool OnlineRLM::markPrefix(const wordID_t* IDs, const int len, bool bSet) { if(len <= 1) return true; // only do this for for ngrams with context - static Cache pfCache(-1, -1); // local prefix cache + static randlm::Cache pfCache(-1, -1); // local prefix cache int code(0); if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) { hpdEntry_t hpdItr; uint64_t filterIndex(0); code = PerfectHash::query(IDs, len - 1, hpdItr, filterIndex); // hash IDs[0..len-1] if(code == -1) { // encountered false positive in pipeline - cerr << "WARNING: markPrefix(). The O-RLM is *not* well-formed.\n"; + std::cerr << "WARNING: markPrefix(). 
The O-RLM is *not* well-formed.\n"; // add all prefixes or return false; return false; } @@ -189,7 +189,7 @@ template void OnlineRLM::markQueried(const uint64_t& index) { bHit_->setBit(index); - //cerr << "filter[" << index << "] = " << this->filter_->read(index) << endl; + //std::cerr << "filter[" << index << "] = " << this->filter_->read(index) << std::endl; } template @@ -200,7 +200,7 @@ void OnlineRLM::markQueried(hpdEntry_t& value) } template -void OnlineRLM::remove(const std::vector& ngram) +void OnlineRLM::remove(const std::vector& ngram) { wordID_t IDs[ngram.size()]; for(count_t i = 0; i < ngram.size(); ++i) @@ -212,7 +212,7 @@ template count_t OnlineRLM::heurDelete(count_t num2del, count_t order) { count_t deleted = 0; - cout << "Deleting " << num2del << " of order "<< order << endl; + std::cout << "Deleting " << num2del << " of order "<< order << std::endl; // delete from filter first int full = *std::max_element(this->idxTracker_, this->idxTracker_ + this->totBuckets_); @@ -234,14 +234,14 @@ count_t OnlineRLM::heurDelete(count_t num2del, count_t order) } if(deleted < num2del) { // remove from hpd - cerr << "TODO! HPD deletions\n"; + std::cerr << "TODO! HPD deletions\n"; } - cerr << "Total deleted = " << deleted << endl; + std::cerr << "Total deleted = " << deleted << std::endl; return deleted; } template -int OnlineRLM::sbsqQuery(const std::vector& ngram, int* codes, +int OnlineRLM::sbsqQuery(const std::vector& ngram, int* codes, bool bStrict) { wordID_t IDs[ngram.size()]; @@ -372,7 +372,7 @@ int OnlineRLM::countHits() iterate(this->dict_, itr) if((itr->second & this->hitMask_) != 0) ++hit; - cerr << "Hit count = " << hit << endl; + std::cerr << "Hit count = " << hit << std::endl; return hit; } @@ -383,15 +383,15 @@ int OnlineRLM::countPrefixes() for(uint64_t i = 0; i < this->cells_; ++i) if(bPrefix_->testBit(i)) ++pfx; //TODO::Handle hpdict prefix counts - cerr << "Prefix count (in filter) = " << pfx << endl; + std::cerr << "Prefix count (in filter) = " << pfx << std::endl; return pfx; } template int OnlineRLM::cleanUpHPD() { - cerr << "HPD size before = " << this->dict_.size() << endl; - std::vector vDel, vtmp; + std::cerr << "HPD size before = " << this->dict_.size() << std::endl; + std::vector vDel, vtmp; iterate(this->dict_, itr) { if(((itr->second & this->hitMask_) == 0) && // if not hit during testing (Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram @@ -400,14 +400,14 @@ int OnlineRLM::cleanUpHPD() } iterate(vDel, vitr) this->dict_.erase(*vitr); - cerr << "HPD size after = " << this->dict_.size() << endl; + std::cerr << "HPD size after = " << this->dict_.size() << std::endl; return vDel.size(); } template void OnlineRLM::clearMarkings() { - cerr << "clearing all event hits\n"; + std::cerr << "clearing all event hits\n"; bHit_->reset(); count_t* value(0); iterate(this->dict_, itr) { @@ -417,9 +417,9 @@ void OnlineRLM::clearMarkings() } template -void OnlineRLM::save(FileHandler* fout) +void OnlineRLM::save(Moses::FileHandler* fout) { - cerr << "Saving ORLM...\n"; + std::cerr << "Saving ORLM...\n"; // save vocab vocab_->Save(fout); fout->write((char*)&corpusSize_, sizeof(corpusSize_)); @@ -428,22 +428,22 @@ void OnlineRLM::save(FileHandler* fout) bHit_->save(fout); // save everything else PerfectHash::save(fout); - cerr << "Finished saving ORLM." << endl; + std::cerr << "Finished saving ORLM." 
<< std::endl; } template -void OnlineRLM::load(FileHandler* fin) +void OnlineRLM::load(Moses::FileHandler* fin) { - cerr << "Loading ORLM...\n"; + std::cerr << "Loading ORLM...\n"; // load vocab first vocab_ = new Moses::Vocab(fin); UTIL_THROW_IF2(vocab_ == 0, "Vocab object not set"); fin->read((char*)&corpusSize_, sizeof(corpusSize_)); - cerr << "\tCorpus size = " << corpusSize_ << endl; + std::cerr << "\tCorpus size = " << corpusSize_ << std::endl; fin->read((char*)&order_, sizeof(order_)); - cerr << "\tModel order = " << order_ << endl; - bPrefix_ = new BitFilter(fin); - bHit_ = new BitFilter(fin); + std::cerr << "\tModel order = " << order_ << std::endl; + bPrefix_ = new randlm::BitFilter(fin); + bHit_ = new randlm::BitFilter(fin); // load everything else PerfectHash::load(fin); } @@ -451,7 +451,7 @@ void OnlineRLM::load(FileHandler* fin) template void OnlineRLM::removeNonMarked() { - cerr << "deleting all unused events\n"; + std::cerr << "deleting all unused events\n"; int deleted(0); for(uint64_t i = 0; i < this->cells_; ++i) { if(!(bHit_->testBit(i) || bPrefix_->testBit(i)) @@ -461,7 +461,7 @@ void OnlineRLM::removeNonMarked() } } deleted += cleanUpHPD(); - cerr << "total removed from ORLM = " << deleted << endl; + std::cerr << "total removed from ORLM = " << deleted << std::endl; } /* @@ -474,13 +474,13 @@ float OnlineRLM::getProb2(const wordID_t* ngram, int len, const void** state) int* denom_codes[order_]; int* num_codes[order_ + 1]; int denom_found(0); - cerr << "length=" << len << endl; + std::cerr << "length=" << len << std::endl; // constrain cache queries using model assumptions int denom_len = cache_->getCache(ngram, len - 1, &denom_codes[0], &denom_found); - cerr << "denom_len = " << denom_len << endl; + std::cerr << "denom_len = " << denom_len << std::endl; int num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1, &num_codes[0], &found); - cerr << "num_len= " << num_len << endl; + std::cerr << "num_len= " << num_len << std::endl; // keed reducing ngram size until both denominator and numerator are found // allowed to leave kUnknownCode in cache because we check for this. 
found = num_len; // guaranteed to be <= denom_len + 1 diff --git a/moses/TranslationModel/DynSAInclude/params.cpp b/moses/TranslationModel/DynSAInclude/params.cpp index 03ad48446..2c5b416e5 100644 --- a/moses/TranslationModel/DynSAInclude/params.cpp +++ b/moses/TranslationModel/DynSAInclude/params.cpp @@ -31,9 +31,9 @@ void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum) for( count_t i = 0; i < paramNum; i++ ) { params_[paramdefs[i].name] = paramdefs[i]; // assign name } - cerr << "Default parameter values:\n"; + std::cerr << "Default parameter values:\n"; iterate(params_, itr) - cerr << "\t" << itr->first << " --> " << itr->second.value << endl; + std::cerr << "\t" << itr->first << " --> " << itr->second.value << std::endl; } bool Parameters::loadParams(int argc, char ** argv) diff --git a/moses/TranslationModel/DynSAInclude/perfectHash.h b/moses/TranslationModel/DynSAInclude/perfectHash.h index 93e98ef4c..5042c9cc4 100644 --- a/moses/TranslationModel/DynSAInclude/perfectHash.h +++ b/moses/TranslationModel/DynSAInclude/perfectHash.h @@ -14,7 +14,7 @@ */ using randlm::Filter; using randlm::BitFilter; -typedef std::map hpDict_t; +typedef std::map hpDict_t; typedef hpDict_t::iterator hpdEntry_t; static count_t collisions_ = 0; @@ -24,7 +24,7 @@ class PerfectHash { public: PerfectHash(uint16_t MBs, int width, int bucketRange, float qBase); - PerfectHash(FileHandler* fin) { + PerfectHash(Moses::FileHandler* fin) { UTIL_THROW_IF2(fin == 0, "Invalid file handle"); } virtual ~PerfectHash(); @@ -49,14 +49,14 @@ protected: hpdEntry_t& hpdAddr, uint64_t& filterIdx); virtual void remove(const wordID_t* IDs, const int len); void remove(uint64_t index); - void save(FileHandler* fout); - void load(FileHandler* fin); + void save(Moses::FileHandler* fout); + void load(Moses::FileHandler* fin); virtual void markQueried(const uint64_t&)=0; //pointer to a specific entry in a hpDict_t virtual void markQueried(hpdEntry_t&)=0; private: T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket); - string hpDictKeyValue(const wordID_t* IDs, const int len); + std::string hpDictKeyValue(const wordID_t* IDs, const int len); uint64_t memBound_; // total memory bound in bytes uint16_t cellWidth_; // in bits UnivHash_linear* bucketHash_; @@ -71,12 +71,12 @@ PerfectHash::PerfectHash(uint16_t MBs, int width, int bucketRange, { bucketRange_ = static_cast(bucketRange); if(bucketRange > 255) { - cerr << "ERROR: Max bucket range is > 2^8\n"; + std::cerr << "ERROR: Max bucket range is > 2^8\n"; exit(1); } qtizer_ = new LogQtizer(qBase); int valBits = (int)ceil(log2((float)qtizer_->maxcode())); - cerr << "BITS FOR VALUES ARRAY = " << valBits << endl; + std::cerr << "BITS FOR VALUES ARRAY = " << valBits << std::endl; uint64_t totalBits = memBound_ << 3; cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells cells_ += (cells_ % bucketRange_); // make cells multiple of bucket range @@ -142,7 +142,7 @@ bool PerfectHash::update(const wordID_t* IDs, const int len, { // check if key is in high perf. dictionary filterIdx = cells_ + 1; - string skey = hpDictKeyValue(IDs, len); + std::string skey = hpDictKeyValue(IDs, len); if((hpdAddr = dict_.find(skey)) != dict_.end()) { hpdAddr->second = value; return true; @@ -172,7 +172,7 @@ int PerfectHash::query(const wordID_t* IDs, const int len, hpdEntry_t& hpdAddr, uint64_t& filterIdx) { // check if key is in high perf. 
dictionary - string skey = hpDictKeyValue(IDs, len); + std::string skey = hpDictKeyValue(IDs, len); if((hpdAddr = dict_.find(skey)) != dict_.end()) { filterIdx = cells_ + 1; return(hpdAddr->second); // returns copy of value @@ -188,7 +188,7 @@ int PerfectHash::query(const wordID_t* IDs, const int len, for(; index < lastrow; ++index) { if(filter_->read(index) == fp) { //cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" << - //filter_->read(index) << "\tcode = " << code << endl; + //filter_->read(index) << "\tcode = " << code << std::endl; filterIdx = index; hpdAddr = dict_.end(); return (int)qtizer_->value(values_->read(index)); @@ -202,7 +202,7 @@ template void PerfectHash::remove(const wordID_t* IDs, const int len) { // delete key if in high perf. dictionary - string skey = hpDictKeyValue(IDs, len); + std::string skey = hpDictKeyValue(IDs, len); if(dict_.find(skey) != dict_.end()) dict_.erase(skey); else { // check if key is in filter @@ -248,14 +248,14 @@ T PerfectHash::nonZeroSignature(const wordID_t* IDs, const int len, h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around } while((fingerprint == 0) && (h != bucket)); if(fingerprint == 0) - cerr << "WARNING: Unable to find non-zero signature for ngram\n" << endl; + std::cerr << "WARNING: Unable to find non-zero signature for ngram\n" << std::endl; return fingerprint; } template -string PerfectHash::hpDictKeyValue(const wordID_t* IDs, const int len) +std::string PerfectHash::hpDictKeyValue(const wordID_t* IDs, const int len) { - string skey(" "); + std::string skey(" "); for(int i = 0; i < len; ++i) skey += Utils::IntToStr(IDs[i]) + "¬"; Utils::trim(skey); @@ -277,10 +277,10 @@ count_t PerfectHash::bucketsMemUse() } template -void PerfectHash::save(FileHandler* fout) +void PerfectHash::save(Moses::FileHandler* fout) { UTIL_THROW_IF2(fout == 0, "Invalid file handle"); - cerr << "\tSaving perfect hash parameters...\n"; + std::cerr << "\tSaving perfect hash parameters...\n"; fout->write((char*)&hitMask_, sizeof(hitMask_)); fout->write((char*)&memBound_, sizeof(memBound_)); fout->write((char*)&cellWidth_, sizeof(cellWidth_)); @@ -289,25 +289,25 @@ void PerfectHash::save(FileHandler* fout) fout->write((char*)&bucketRange_, sizeof(bucketRange_)); fout->write((char*)idxTracker_, totBuckets_ * sizeof(idxTracker_[0])); qtizer_->save(fout); - cerr << "\tSaving hash functions...\n"; + std::cerr << "\tSaving hash functions...\n"; fingerHash_->save(fout); bucketHash_->save(fout); - cerr << "\tSaving bit filter...\n"; + std::cerr << "\tSaving bit filter...\n"; filter_->save(fout); values_->save(fout); - cerr << "\tSaving high performance dictionary...\n"; + std::cerr << "\tSaving high performance dictionary...\n"; count_t size = dict_.size(); fout->write((char*)&size, sizeof(count_t)); - *fout << endl; + *fout << std::endl; iterate(dict_, t) *fout << t->first << "\t" << t->second << "\n"; } template -void PerfectHash::load(FileHandler* fin) +void PerfectHash::load(Moses::FileHandler* fin) { UTIL_THROW_IF2(fin == 0, "Invalid file handle"); - cerr << "\tLoading perfect hash parameters...\n"; + std::cerr << "\tLoading perfect hash parameters...\n"; fin->read((char*)&hitMask_, sizeof(hitMask_)); fin->read((char*)&memBound_, sizeof(memBound_)); fin->read((char*)&cellWidth_, sizeof(cellWidth_)); @@ -317,17 +317,17 @@ void PerfectHash::load(FileHandler* fin) idxTracker_ = new uint8_t[totBuckets_]; fin->read((char*)idxTracker_, totBuckets_ * sizeof(idxTracker_[0])); qtizer_ = new LogQtizer(fin); - cerr << "\tLoading hash functions...\n"; 
+ std::cerr << "\tLoading hash functions...\n"; fingerHash_ = new UnivHash_linear(fin); bucketHash_ = new UnivHash_linear(fin); - cerr << "\tLoading bit filter...\n"; + std::cerr << "\tLoading bit filter...\n"; filter_ = new Filter(fin); values_ = new Filter(fin); - cerr << "\tLoading HPD...\n"; + std::cerr << "\tLoading HPD...\n"; count_t size = 0; fin->read((char*)&size, sizeof(count_t)); fin->ignore(256, '\n'); - string line; + std::string line; hpDict_t::key_type key; hpDict_t::mapped_type val; for(count_t i=0; i < size; ++i) { @@ -337,14 +337,14 @@ void PerfectHash::load(FileHandler* fin) ss >> key, ss >> val; dict_[key] = val; } - cerr << "\tHPD size=" << dict_.size() << endl; - cerr << "Finished loading ORLM." << endl; + std::cerr << "\tHPD size=" << dict_.size() << std::endl; + std::cerr << "Finished loading ORLM." << std::endl; } template void PerfectHash::analyze() { - cerr << "Analyzing Dynamic Bloomier Filter...\n"; + std::cerr << "Analyzing Dynamic Bloomier Filter...\n"; // see how many items in each bucket uint8_t* bucketCnt = new uint8_t[totBuckets_]; unsigned largestBucket = 0, totalCellsSet = 0, @@ -376,27 +376,27 @@ void PerfectHash::analyze() } for(int i = 0; i < totBuckets_; ++i) { if(bucketCnt[i] != idxTracker_[i]) - cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] << - "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl; + std::cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] << + "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << std::endl; } - cerr << "total cells= " << cells_ << endl; - cerr << "total buckets= " << totBuckets_ << endl; - cerr << "bucket range= " << (int)bucketRange_ << endl; - cerr << "fingerprint bits= " << cellWidth_ << endl; - cerr << "total cells set= " << totalCellsSet; - cerr << " (idxTracker set = " << trackerCells << ")" << endl; - cerr << "total zeroes=" << totalZeroes; - cerr << " (idxTracker zeros = " << cells_ - trackerCells << ")" << endl; - cerr << "largest bucket (" << bi << ") size= " << largestBucket << endl; - cerr << "smallest bucket (" << si << ") size= " << smallestBucket << endl; - cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] << - " (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << endl; - cerr << "total buckets full = " << fullBuckets << endl; - cerr << "total collision errors= " << collisions_ << endl; - cerr << "high performance dictionary size= " << dict_.size() << endl; - cerr << "high performance dictionary MBs= " << hpDictMemUse() << endl; - cerr << "filter MBs= " << filter_->size() << endl; - cerr << "values MBs= " << values_->size() << endl; + std::cerr << "total cells= " << cells_ << std::endl; + std::cerr << "total buckets= " << totBuckets_ << std::endl; + std::cerr << "bucket range= " << (int)bucketRange_ << std::endl; + std::cerr << "fingerprint bits= " << cellWidth_ << std::endl; + std::cerr << "total cells set= " << totalCellsSet; + std::cerr << " (idxTracker set = " << trackerCells << ")" << std::endl; + std::cerr << "total zeroes=" << totalZeroes; + std::cerr << " (idxTracker zeros = " << cells_ - trackerCells << ")" << std::endl; + std::cerr << "largest bucket (" << bi << ") size= " << largestBucket << std::endl; + std::cerr << "smallest bucket (" << si << ") size= " << smallestBucket << std::endl; + std::cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] << + " (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << std::endl; + std::cerr << "total buckets full = " << fullBuckets << 
std::endl; + std::cerr << "total collision errors= " << collisions_ << std::endl; + std::cerr << "high performance dictionary size= " << dict_.size() << std::endl; + std::cerr << "high performance dictionary MBs= " << hpDictMemUse() << std::endl; + std::cerr << "filter MBs= " << filter_->size() << std::endl; + std::cerr << "values MBs= " << values_->size() << std::endl; delete[] bucketCnt; } @@ -406,7 +406,7 @@ bool PerfectHash::update2(const wordID_t* IDs, const int len, { // check if key is in high perf. dictionary filterIdx = cells_ + 1; - string skey = hpDictKeyValue(IDs, len); + std::string skey = hpDictKeyValue(IDs, len); if((hpdAddr = dict_.find(skey)) != dict_.end()) { hpdAddr->second += value; return true; diff --git a/moses/TranslationModel/DynSAInclude/quantizer.h b/moses/TranslationModel/DynSAInclude/quantizer.h index 6dbcc3cc4..2e83a33b7 100644 --- a/moses/TranslationModel/DynSAInclude/quantizer.h +++ b/moses/TranslationModel/DynSAInclude/quantizer.h @@ -39,7 +39,7 @@ public: } std::cerr << "Initialized quantization (size = " << max_code_ + 1 << ")" << std::endl; } - LogQtizer(FileHandler* fin) { + LogQtizer(Moses::FileHandler* fin) { UTIL_THROW_IF2(fin == NULL, "Null file handle"); load(fin); } @@ -70,7 +70,7 @@ public: delete[] code_to_value_; delete[] code_to_log_value_; } - void save(FileHandler* fout) { + void save(Moses::FileHandler* fout) { fout->write((char*)&base_, sizeof(base_)); fout->write((char*)&max_code_, sizeof(max_code_)); fout->write((char*)&max_value_, sizeof(max_value_)); @@ -88,7 +88,7 @@ private: int max_code_; float max_value_; float min_value_; - void load(FileHandler* fin) { + void load(Moses::FileHandler* fin) { fin->read((char*)&base_, sizeof(base_)); fin->read((char*)&max_code_, sizeof(max_code_)); fin->read((char*)&max_value_, sizeof(max_value_)); diff --git a/moses/TranslationModel/DynSAInclude/types.h b/moses/TranslationModel/DynSAInclude/types.h index b3122a4ea..2d7c38bdb 100644 --- a/moses/TranslationModel/DynSAInclude/types.h +++ b/moses/TranslationModel/DynSAInclude/types.h @@ -26,11 +26,6 @@ #define MAX_HASH_FUNCS 1000 //#define PRIME 409 -using std::string; -using std::cout; -using std::cerr; -using std::endl; - //typedefs for projects typedef std::string word_t; // word as string typedef unsigned int wordID_t; // word mapped to integer diff --git a/moses/TranslationModel/DynSAInclude/vocab.cpp b/moses/TranslationModel/DynSAInclude/vocab.cpp index b717f533c..af57232f2 100644 --- a/moses/TranslationModel/DynSAInclude/vocab.cpp +++ b/moses/TranslationModel/DynSAInclude/vocab.cpp @@ -12,7 +12,7 @@ void Vocab::InitSpecialWords() m_kOOVWord = InitSpecialWord(UNKNOWN_FACTOR); // UNKNOWN_FACTOR also defined in ../typedef.h } -const Word Vocab::InitSpecialWord( const string& word_str) +const Word Vocab::InitSpecialWord( const std::string& word_str) { FactorList factors; factors.push_back(0); // store the special word string as the first factor diff --git a/moses/TranslationModel/DynSAInclude/vocab.h b/moses/TranslationModel/DynSAInclude/vocab.h index 0edf70f93..cf81bf3a9 100644 --- a/moses/TranslationModel/DynSAInclude/vocab.h +++ b/moses/TranslationModel/DynSAInclude/vocab.h @@ -59,7 +59,7 @@ public: wordID_t GetWordID(const std::string& word, const FactorDirection& direction, const FactorList& factors, bool isNonTerminal); wordID_t GetWordID(const Word& word); - wordID_t GetWordID(const string& word); + wordID_t GetWordID(const std::string& word); Word& GetWord(wordID_t id); inline wordID_t GetkOOVWordID() { return m_kOOVWordID; @@ -115,7 +115,7 
@@ protected: Word m_kEOSWord; // end of sentence marker Word m_kOOVWord; // - const Word InitSpecialWord( const string& type); // initialize special word like kBOS, kEOS + const Word InitSpecialWord( const std::string& type); // initialize special word like kBOS, kEOS void InitSpecialWords(); Word2Id m_words2ids; // map from words to word ids diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h index 6a79b1edc..6e4d67b6d 100644 --- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h +++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h @@ -22,7 +22,7 @@ public: void Load(); // functions below required by base class const TargetPhraseCollection* GetTargetPhraseCollectionLEGACY(const Phrase& src) const; - void insertSnt(string&, string&, string&); + void insertSnt(std::string&, std::string&, std::string&); void deleteSnt(unsigned, unsigned); ChartRuleLookupManager *CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase&, std::size_t); void SetParameter(const std::string& key, const std::string& value); From e0d2af268cd7588a943863c0d371fc3867f07301 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 11 Aug 2015 13:10:38 +0400 Subject: [PATCH 262/286] eclipse --- contrib/other-builds/extract-ghkm/.project | 9 +- contrib/other-builds/extract-rules/.project | 9 +- contrib/other-builds/extractor/.project | 10 + contrib/other-builds/mert_lib/.cproject | 14 +- contrib/other-builds/moses/.project | 283 +++++++++++++++++++- contrib/other-builds/server/.cproject | 1 + phrase-extract/extract-ghkm/Alignment.cpp | 2 +- 7 files changed, 302 insertions(+), 26 deletions(-) diff --git a/contrib/other-builds/extract-ghkm/.project b/contrib/other-builds/extract-ghkm/.project index 2cc5bb826..0b74caf70 100644 --- a/contrib/other-builds/extract-ghkm/.project +++ b/contrib/other-builds/extract-ghkm/.project @@ -102,9 +102,14 @@ PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignmentWithSyntax.h - SyntaxTree.cpp + SyntaxNodeCollection.cpp 1 - PARENT-3-PROJECT_LOC/phrase-extract/SyntaxTree.cpp + PARENT-3-PROJECT_LOC/phrase-extract/SyntaxNodeCollection.cpp + + + SyntaxNodeCollection.h + 1 + PARENT-3-PROJECT_LOC/phrase-extract/SyntaxNodeCollection.h SyntaxTree.h diff --git a/contrib/other-builds/extract-rules/.project b/contrib/other-builds/extract-rules/.project index 79b72a58a..90c62e2d4 100644 --- a/contrib/other-builds/extract-rules/.project +++ b/contrib/other-builds/extract-rules/.project @@ -81,9 +81,14 @@ PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignmentWithSyntax.h - SyntaxTree.cpp + SyntaxNodeCollection.cpp 1 - PARENT-3-PROJECT_LOC/phrase-extract/SyntaxTree.cpp + PARENT-3-PROJECT_LOC/phrase-extract/SyntaxNodeCollection.cpp + + + SyntaxNodeCollection.h + 1 + PARENT-3-PROJECT_LOC/phrase-extract/SyntaxNodeCollection.h SyntaxTree.h diff --git a/contrib/other-builds/extractor/.project b/contrib/other-builds/extractor/.project index 56d560019..a720f94b4 100644 --- a/contrib/other-builds/extractor/.project +++ b/contrib/other-builds/extractor/.project @@ -83,6 +83,16 @@ org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + InternalTree.cpp + 1 + PARENT-3-PROJECT_LOC/mert/InternalTree.cpp + + + InternalTree.h + 1 + PARENT-3-PROJECT_LOC/mert/InternalTree.h + bin 2 diff --git a/contrib/other-builds/mert_lib/.cproject b/contrib/other-builds/mert_lib/.cproject index c53700bac..908ecf784 100644 --- a/contrib/other-builds/mert_lib/.cproject +++ b/contrib/other-builds/mert_lib/.cproject @@ -11,15 +11,15 @@ 
+ - - + @@ -32,6 +32,9 @@ + @@ -46,9 +49,6 @@ - - - @@ -66,15 +66,15 @@ + - - + diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index fcc6b8948..65fa053fa 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -60,6 +60,16 @@ 1 PARENT-3-PROJECT_LOC/moses/AlignmentInfoTest.cpp + + AllOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/AllOptions.cpp + + + AllOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/AllOptions.h + BaseManager.cpp 1 @@ -70,6 +80,11 @@ 1 PARENT-3-PROJECT_LOC/moses/BaseManager.h + + BeamSearchOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/BeamSearchOptions.h + BitmapContainer.cpp 1 @@ -80,6 +95,16 @@ 1 PARENT-3-PROJECT_LOC/moses/BitmapContainer.h + + BookkeepingOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/BookkeepingOptions.cpp + + + BookkeepingOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/BookkeepingOptions.h + CMakeLists.txt 1 @@ -230,6 +255,16 @@ 1 PARENT-3-PROJECT_LOC/moses/parameters/ContextParameters.h + + CubePruningOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/CubePruningOptions.cpp + + + CubePruningOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/CubePruningOptions.h + DecodeGraph.cpp 1 @@ -460,6 +495,16 @@ 1 PARENT-3-PROJECT_LOC/moses/InputFileStream.h + + InputOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/InputOptions.cpp + + + InputOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/InputOptions.h + InputPath.cpp 1 @@ -490,6 +535,16 @@ 2 virtual:/virtual + + LMBR_Options.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/LMBR_Options.cpp + + + LMBR_Options.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/LMBR_Options.h + LVoc.cpp 1 @@ -510,6 +565,21 @@ 1 PARENT-3-PROJECT_LOC/moses/LatticeMBR.h + + LookupOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/LookupOptions.h + + + MBR_Options.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/MBR_Options.cpp + + + MBR_Options.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/MBR_Options.h + Manager.cpp 1 @@ -535,6 +605,16 @@ 1 PARENT-3-PROJECT_LOC/moses/MosesTest.cpp + + NBestOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/NBestOptions.cpp + + + NBestOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/NBestOptions.h + NonTerminal.cpp 1 @@ -550,6 +630,16 @@ 1 PARENT-3-PROJECT_LOC/moses/ObjectPool.h + + OptionsBaseClass.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/OptionsBaseClass.cpp + + + OptionsBaseClass.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/OptionsBaseClass.h + OutputCollector.h 1 @@ -635,6 +725,26 @@ 1 PARENT-3-PROJECT_LOC/moses/ReorderingConstraint.h + + ReorderingOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ReorderingOptions.cpp + + + ReorderingOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ReorderingOptions.h + + + ReportingOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ReportingOptions.cpp + + + ReportingOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ReportingOptions.h + RuleCube.cpp 1 @@ -711,14 +821,14 @@ PARENT-3-PROJECT_LOC/moses/SearchNormal.h - SearchNormalBatch.cpp + SearchOptions.cpp 1 - PARENT-3-PROJECT_LOC/moses/SearchNormalBatch.cpp + PARENT-3-PROJECT_LOC/moses/parameters/SearchOptions.cpp - SearchNormalBatch.h + SearchOptions.h 1 - PARENT-3-PROJECT_LOC/moses/SearchNormalBatch.h + PARENT-3-PROJECT_LOC/moses/parameters/SearchOptions.h Sentence.cpp @@ -740,6 +850,16 @@ 1 PARENT-3-PROJECT_LOC/moses/SentenceStats.h + + ServerOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ServerOptions.cpp + + + ServerOptions.h 
+ 1 + PARENT-3-PROJECT_LOC/moses/parameters/ServerOptions.h + SquareMatrix.cpp 1 @@ -1065,6 +1185,11 @@ 1 PARENT-3-PROJECT_LOC/moses/mbr.h + + parameters + 2 + virtual:/virtual + rule.proto 1 @@ -1360,16 +1485,6 @@ 1 PARENT-3-PROJECT_LOC/moses/FF/SetSourcePhrase.h - - FF/SkeletonChangeInput.cpp - 1 - PARENT-3-PROJECT_LOC/moses/FF/SkeletonChangeInput.cpp - - - FF/SkeletonChangeInput.h - 1 - PARENT-3-PROJECT_LOC/moses/FF/SkeletonChangeInput.h - FF/SkeletonStatefulFF.cpp 1 @@ -2240,6 +2355,146 @@ 1 PARENT-3-PROJECT_LOC/phrase-extract/extract-ghkm/PhraseOrientation.h + + parameters/AllOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/AllOptions.cpp + + + parameters/AllOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/AllOptions.h + + + parameters/BeamSearchOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/BeamSearchOptions.h + + + parameters/BookkeepingOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/BookkeepingOptions.cpp + + + parameters/BookkeepingOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/BookkeepingOptions.h + + + parameters/ContextParameters.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ContextParameters.cpp + + + parameters/ContextParameters.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ContextParameters.h + + + parameters/CubePruningOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/CubePruningOptions.cpp + + + parameters/CubePruningOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/CubePruningOptions.h + + + parameters/InputOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/InputOptions.cpp + + + parameters/InputOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/InputOptions.h + + + parameters/LMBR_Options.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/LMBR_Options.cpp + + + parameters/LMBR_Options.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/LMBR_Options.h + + + parameters/LookupOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/LookupOptions.h + + + parameters/MBR_Options.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/MBR_Options.cpp + + + parameters/MBR_Options.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/MBR_Options.h + + + parameters/NBestOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/NBestOptions.cpp + + + parameters/NBestOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/NBestOptions.h + + + parameters/OptionsBaseClass.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/OptionsBaseClass.cpp + + + parameters/OptionsBaseClass.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/OptionsBaseClass.h + + + parameters/ReorderingOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ReorderingOptions.cpp + + + parameters/ReorderingOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ReorderingOptions.h + + + parameters/ReportingOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ReportingOptions.cpp + + + parameters/ReportingOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ReportingOptions.h + + + parameters/SearchOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/SearchOptions.cpp + + + parameters/SearchOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/SearchOptions.h + + + parameters/ServerOptions.cpp + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ServerOptions.cpp + + + parameters/ServerOptions.h + 1 + PARENT-3-PROJECT_LOC/moses/parameters/ServerOptions.h + FF/LexicalReordering/LexicalReordering.cpp 1 diff --git a/contrib/other-builds/server/.cproject b/contrib/other-builds/server/.cproject index 78c5185f9..11e521736 100644 --- a/contrib/other-builds/server/.cproject +++ 
b/contrib/other-builds/server/.cproject @@ -75,6 +75,7 @@ + diff --git a/phrase-extract/extract-ghkm/Alignment.cpp b/phrase-extract/extract-ghkm/Alignment.cpp index 9293a07cf..d12f9398b 100644 --- a/phrase-extract/extract-ghkm/Alignment.cpp +++ b/phrase-extract/extract-ghkm/Alignment.cpp @@ -19,7 +19,7 @@ #include "Alignment.h" -#include "syntax-common/exception.h" +#include "phrase-extract/syntax-common/exception.h" #include #include From 8af06a6f0dae14c36e30b052aae24279dac92c6a Mon Sep 17 00:00:00 2001 From: MosesAdmin Date: Wed, 12 Aug 2015 00:01:03 +0100 Subject: [PATCH 263/286] daily automatic beautifier --- moses/TranslationModel/DynSAInclude/perfectHash.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/moses/TranslationModel/DynSAInclude/perfectHash.h b/moses/TranslationModel/DynSAInclude/perfectHash.h index 5042c9cc4..9b597f06c 100644 --- a/moses/TranslationModel/DynSAInclude/perfectHash.h +++ b/moses/TranslationModel/DynSAInclude/perfectHash.h @@ -377,7 +377,7 @@ void PerfectHash::analyze() for(int i = 0; i < totBuckets_; ++i) { if(bucketCnt[i] != idxTracker_[i]) std::cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] << - "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << std::endl; + "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << std::endl; } std::cerr << "total cells= " << cells_ << std::endl; std::cerr << "total buckets= " << totBuckets_ << std::endl; @@ -390,7 +390,7 @@ void PerfectHash::analyze() std::cerr << "largest bucket (" << bi << ") size= " << largestBucket << std::endl; std::cerr << "smallest bucket (" << si << ") size= " << smallestBucket << std::endl; std::cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] << - " (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << std::endl; + " (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << std::endl; std::cerr << "total buckets full = " << fullBuckets << std::endl; std::cerr << "total collision errors= " << collisions_ << std::endl; std::cerr << "high performance dictionary size= " << dict_.size() << std::endl; From 01a9dd2305fd16372a4c222b39e22b86c73fb00b Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 14 Aug 2015 16:53:24 +0100 Subject: [PATCH 264/286] extract-target-trees.py: support for new-style trace files --- scripts/analysis/extract-target-trees.py | 51 +++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/scripts/analysis/extract-target-trees.py b/scripts/analysis/extract-target-trees.py index 7166211d9..5dd097ff0 100755 --- a/scripts/analysis/extract-target-trees.py +++ b/scripts/analysis/extract-target-trees.py @@ -110,8 +110,15 @@ def read_derivations(input): yield derivation, start_line_num -# Extract the hypothesis components and return a Hypothesis object. def parse_line(s): + if s.startswith("Trans Opt"): + return parse_line_old_format(s) + else: + return parse_line_new_format(s) + + +# Extract the hypothesis components and return a Hypothesis object. +def parse_line_old_format(s): pattern = r"Trans Opt (\d+) " + \ r"\[(\d+)\.\.(\d+)\]:" + \ r"((?: \[\d+\.\.\d+\]=\S+ )+):" + \ @@ -147,6 +154,48 @@ def parse_line(s): return hypothesis +# Extract the hypothesis components and return a Hypothesis object. 
+def parse_line_new_format(s): + pattern = r"(\d+) \|\|\|" + \ + r" (\[\S+\]) -> ((?:\S+ )+)\|\|\|" + \ + r" (\[\S+\]) -> ((?:\S+ )+)\|\|\|" + \ + r" ((?:\d+-\d+ )*)\|\|\|" + \ + r"((?: \d+\.\.\d+)*)" + regexp = re.compile(pattern) + match = regexp.match(s) + if not match: + sys.stderr.write("%s\n" % s) + assert match + group = match.groups() + hypothesis = Hypothesis() + hypothesis.sentence_num = int(group[0]) + 1 + spans = [] + for pair in group[6].split(): + match = re.match(r'(\d+)\.\.(\d+)', pair) + assert match + span = (int(match.group(1)), int(match.group(2))) + spans.append(span) + hypothesis.span = (spans[0][0], spans[-1][1]) + hypothesis.source_symbol_info = [] + for i, symbol in enumerate(group[2].split()): + hypothesis.source_symbol_info.append((spans[i], strip_brackets(symbol))) + hypothesis.target_lhs = strip_brackets(group[3]) + hypothesis.target_rhs = group[4].split() + hypothesis.nt_alignments = [] + for pair in group[5].split(): + match = re.match(r'(\d+)-(\d+)', pair) + assert match + ai = (int(match.group(1)), int(match.group(2))) + hypothesis.nt_alignments.append(ai) + return hypothesis + + +def strip_brackets(symbol): + if symbol[0] == '[' and symbol[-1] == ']': + return symbol[1:-1] + return symbol + + def tree_to_xml(tree): if tree.is_leaf(): return tree.label From 3a261c9fc95667eb43311c61ea9b7de3b293af6f Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Sun, 16 Aug 2015 20:32:07 +0400 Subject: [PATCH 265/286] don't hardcode amount of mem to be used by lmplz --- moses/FactorCollection.h | 2 +- scripts/OSM/OSM-Train.perl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/moses/FactorCollection.h b/moses/FactorCollection.h index b5f49f3cf..5d94e1197 100644 --- a/moses/FactorCollection.h +++ b/moses/FactorCollection.h @@ -110,7 +110,7 @@ public: */ const Factor *AddFactor(const StringPiece &factorString, bool isNonTerminal = false); - const size_t GetNumNonTerminals() { + size_t GetNumNonTerminals() { return m_factorIdNonTerminal; } diff --git a/scripts/OSM/OSM-Train.perl b/scripts/OSM/OSM-Train.perl index 07ad71f68..cf3bccff2 100755 --- a/scripts/OSM/OSM-Train.perl +++ b/scripts/OSM/OSM-Train.perl @@ -109,7 +109,7 @@ if (defined($SRILM_DIR)) { `$cmd`; } else { - $cmd = "$LMPLZ -S 20% -T $OUT_DIR --order $ORDER --text $OUT_DIR/$factor_val/opCorpus --arpa $OUT_DIR/$factor_val/operationLM --prune 0 0 1 2>> /dev/stderr"; + $cmd = "$LMPLZ -T $OUT_DIR --order $ORDER --text $OUT_DIR/$factor_val/opCorpus --arpa $OUT_DIR/$factor_val/operationLM --prune 0 0 1 2>> /dev/stderr"; print STDERR "Executing: $cmd\n"; `$cmd`; } @@ -117,7 +117,7 @@ else { print STDERR "Binarizing\n"; $cmd = "$MOSES_SRC_DIR/bin/build_binary $OUT_DIR/$factor_val/operationLM $OUT_DIR/$factor_val/operationLM.bin"; print STDERR "Executing: $cmd\n"; -`$cmd`; +system($cmd) == 0 or die("system $cmd failed: $?"); } From 8b3f2d4338cbaa4f78e68669f7c03d248ea59f34 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 17 Aug 2015 15:09:40 +0100 Subject: [PATCH 266/286] Bye-bye, PhraseDictionaryDynSuffixArray. 
--- contrib/server/mosesserver.cpp | 10 +- moses/FF/Factory.cpp | 3 +- .../BilingualDynSuffixArray.cpp | 681 ------------------ .../BilingualDynSuffixArray.h | 180 ----- moses/TranslationModel/DynSuffixArray.cpp | 353 --------- moses/TranslationModel/DynSuffixArray.h | 73 -- .../PhraseDictionaryDynSuffixArray.README | 4 - .../PhraseDictionaryDynSuffixArray.cpp | 110 --- .../PhraseDictionaryDynSuffixArray.h | 37 - moses/TranslationModel/WordCoocTable.cpp | 72 -- moses/TranslationModel/WordCoocTable.h | 70 -- 11 files changed, 5 insertions(+), 1588 deletions(-) delete mode 100644 moses/TranslationModel/BilingualDynSuffixArray.cpp delete mode 100644 moses/TranslationModel/BilingualDynSuffixArray.h delete mode 100644 moses/TranslationModel/DynSuffixArray.cpp delete mode 100644 moses/TranslationModel/DynSuffixArray.h delete mode 100644 moses/TranslationModel/PhraseDictionaryDynSuffixArray.README delete mode 100644 moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp delete mode 100644 moses/TranslationModel/PhraseDictionaryDynSuffixArray.h delete mode 100644 moses/TranslationModel/WordCoocTable.cpp delete mode 100644 moses/TranslationModel/WordCoocTable.h diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp index 2ccfa7157..48c14c988 100644 --- a/contrib/server/mosesserver.cpp +++ b/contrib/server/mosesserver.cpp @@ -38,7 +38,6 @@ int main(int argc, char** argv) #include "moses/StaticData.h" #include "moses/ThreadPool.h" #include "moses/TranslationTask.h" -#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h" #include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h" #if PT_UG #include "moses/TranslationModel/UG/mmsapt.h" @@ -82,11 +81,10 @@ public: Mmsapt* pdsa = reinterpret_cast(PhraseDictionary::GetColl()[0]); pdsa->add(source_,target_,alignment_); #else - const PhraseDictionary* pdf = PhraseDictionary::GetColl()[0]; - PhraseDictionaryDynSuffixArray* - pdsa = (PhraseDictionaryDynSuffixArray*) pdf; - cerr << "Inserting into address " << pdsa << endl; - pdsa->insertSnt(source_, target_, alignment_); + std::string msg; + msg = "Server was compiled without a phrase table implementation that "; + msg += "supports updates."; + throw xmlrpc_c::fault(msg.c_str(), xmlrpc_c::fault::CODE_PARSE); #endif if(add2ORLM_) { //updateORLM(); diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index 297c17798..9fbf7c83f 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -6,7 +6,6 @@ #include "moses/TranslationModel/PhraseDictionaryMemory.h" #include "moses/TranslationModel/PhraseDictionaryMultiModel.h" #include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h" -#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h" #include "moses/TranslationModel/PhraseDictionaryScope3.h" #include "moses/TranslationModel/PhraseDictionaryTransliteration.h" #include "moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h" @@ -215,7 +214,7 @@ FeatureRegistry::FeatureRegistry() MOSES_FNAME(PhraseDictionaryMultiModel); MOSES_FNAME(PhraseDictionaryMultiModelCounts); MOSES_FNAME(PhraseDictionaryALSuffixArray); - MOSES_FNAME(PhraseDictionaryDynSuffixArray); + // MOSES_FNAME(PhraseDictionaryDynSuffixArray); MOSES_FNAME(PhraseDictionaryTransliteration); MOSES_FNAME(PhraseDictionaryDynamicCacheBased); MOSES_FNAME(PhraseDictionaryFuzzyMatch); diff --git a/moses/TranslationModel/BilingualDynSuffixArray.cpp b/moses/TranslationModel/BilingualDynSuffixArray.cpp deleted file mode 100644 index b0607b770..000000000 --- 
a/moses/TranslationModel/BilingualDynSuffixArray.cpp +++ /dev/null @@ -1,681 +0,0 @@ -#include "BilingualDynSuffixArray.h" -#include "moses/TranslationModel/DynSAInclude/utils.h" -#include "moses/FactorCollection.h" -#include "moses/StaticData.h" -#include "moses/TargetPhrase.h" - -#include "moses/TranslationModel/UG/generic/sorting/NBestList.h" -#include "moses/TranslationModel/UG/generic/sampling/Sampling.h" - -#include -#include - -using namespace std; - -namespace Moses -{ - -BilingualDynSuffixArray:: -BilingualDynSuffixArray(): - m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()), - m_maxSampleSize(20), m_maxPTEntries(20) -{ - m_srcSA = 0; - m_trgSA = 0; - m_srcCorpus = new vector(); - m_trgCorpus = new vector(); - m_srcVocab = new Vocab(false); - m_trgVocab = new Vocab(false); - m_scoreCmp = 0; -} - -BilingualDynSuffixArray:: -~BilingualDynSuffixArray() -{ - if(m_srcSA) delete m_srcSA; - if(m_trgSA) delete m_trgSA; - if(m_srcVocab) delete m_srcVocab; - if(m_trgVocab) delete m_trgVocab; - if(m_srcCorpus) delete m_srcCorpus; - if(m_trgCorpus) delete m_trgCorpus; - if(m_scoreCmp) delete m_scoreCmp; -} - -bool -BilingualDynSuffixArray:: -Load( - const vector& inputFactors, - const vector& outputFactors, - string source, string target, string alignments, - const vector &weight) -{ - m_inputFactors = inputFactors; - m_outputFactors = outputFactors; - - // m_scoreCmp = new ScoresComp(weight); - InputFileStream sourceStrme(source); - InputFileStream targetStrme(target); - cerr << "Loading source corpus...\n"; - // Input and Output are 'Factor directions' (whatever that is) defined in Typedef.h - LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab); - cerr << "Loading target corpus...\n"; - LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab); - - UTIL_THROW_IF2(m_srcSntBreaks.size() != m_trgSntBreaks.size(), - "Source and target arrays aren't the same size"); - - // build suffix arrays and auxilliary arrays - cerr << "Building Source Suffix Array...\n"; - m_srcSA = new DynSuffixArray(m_srcCorpus); - if(!m_srcSA) return false; - cerr << "Building Target Suffix Array...\n"; - m_trgSA = new DynSuffixArray(m_trgCorpus); - if(!m_trgSA) return false; - - InputFileStream alignStrme(alignments); - cerr << "Loading Alignment File...\n"; - LoadRawAlignments(alignStrme); - cerr << m_srcSntBreaks.size() << " " - << m_trgSntBreaks.size() << " " - << m_rawAlignments.size() << endl; - //LoadAlignments(alignStrme); - cerr << "Building frequent word cache...\n"; - CacheFreqWords(); - - wordID_t const* s = &(*m_srcCorpus)[0]; - wordID_t const* t = &(*m_trgCorpus)[0]; - for (size_t sid = 0; sid < m_srcSntBreaks.size(); ++sid) { - wordID_t const* se = s + GetSourceSentenceSize(sid); - wordID_t const* te = t + GetTargetSentenceSize(sid); - vector const& a = m_rawAlignments[sid]; - m_wrd_cooc.Count(vector(s,se), - vector(t,te), a, - m_srcVocab->GetkOOVWordID(), - m_trgVocab->GetkOOVWordID()); - s = se; - t = te; - } - if (m_srcSntBreaks.size() != m_trgSntBreaks.size() || - m_rawAlignments.size() != m_trgSntBreaks.size()) { - cerr << "FATAL ERROR: Line counts don't match!\n" - << "Source side text corpus: " << m_srcSntBreaks.size() << "\n" - << "Target side text corpus: " << m_trgSntBreaks.size() << "\n" - << "Word alignments: " << m_rawAlignments.size() << endl; - exit(1); - } - return true; -} - -int -BilingualDynSuffixArray:: -LoadRawAlignments(InputFileStream& align) -{ - // stores the alignments in the raw file format - 
string line; - // vector vtmp; - // int lineNum = 0; - while(getline(align, line)) { - // if (++lineNum % 10000 == 0) cerr << lineNum << endl; - LoadRawAlignments(line); - } - return m_rawAlignments.size(); -} - - -int -BilingualDynSuffixArray:: -LoadRawAlignments(string& align) -{ - // stores the alignments in the raw file format - vector vtmp; - Utils::splitToInt(align, vtmp, "- "); - UTIL_THROW_IF2(vtmp.size() % 2 != 0, - "Alignment format is incorrect: " << align); - vector vAlgn; // store as short ints for memory - for (vector::const_iterator itr = vtmp.begin(); - itr != vtmp.end(); ++itr) { - vAlgn.push_back(short(*itr)); - } - m_rawAlignments.push_back(vAlgn); - return m_rawAlignments.size(); -} - -SentenceAlignment -BilingualDynSuffixArray:: -GetSentenceAlignment(const int sntIndex, bool trg2Src) const -{ - // retrieves the alignments in the format used by SentenceAlignment.Extract() - int t = GetTargetSentenceSize(sntIndex); - int s = GetSourceSentenceSize(sntIndex); - int sntGiven = trg2Src ? t : s; - int sntExtract = trg2Src ? s : t; - SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence - vector const& a = m_rawAlignments.at(sntIndex); - for(size_t i=0; i < a.size(); i+=2) { - int sourcePos = a[i]; - int targetPos = a[i+1]; - if(trg2Src) { - curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word - curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word - } else { - curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word - curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word - } - } - curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence - curSnt.trgSnt = m_trgCorpus + sntIndex; - - return curSnt; -} - -bool -BilingualDynSuffixArray:: -ExtractPhrases(const int& sntIndex, - const int& wordIndex, - const int& sourceSize, - vector& phrasePairs, - bool trg2Src) const -{ - /* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src - * parameter */ - SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src); - // get span of phrase in source sentence - int beginSentence = m_srcSntBreaks[sntIndex]; - int rightIdx = wordIndex - beginSentence; - int leftIdx = rightIdx - sourceSize + 1; - return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence -} - -void -BilingualDynSuffixArray:: -CleanUp(const InputType& source) -{ - //m_wordPairCache.clear(); -} - -int -BilingualDynSuffixArray:: -LoadCorpus(FactorDirection direction, - InputFileStream & corpus, - const FactorList & factors, - vector & cArray, - vector & sntArray, - Vocab* vocab) -{ - string line, word; - int sntIdx(0); - // corpus.seekg(0); Seems needless -> commented out to allow - // loading of gzipped corpora (gzfilebuf doesn't support seeking). 
- while(getline(corpus, line)) { - sntArray.push_back(sntIdx); - Phrase phrase(ARRAY_SIZE_INCR); - phrase.CreateFromString( direction, factors, line, NULL); - // store words in vocabulary and corpus - for( size_t i = 0; i < phrase.GetSize(); ++i) { - cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) ); - } - sntIdx += phrase.GetSize(); - } - //cArray.push_back(vocab->GetkOOVWordID); // signify end of corpus - vocab->MakeClosed(); // avoid adding words - return cArray.size(); -} - -bool -BilingualDynSuffixArray:: -GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const -{ - // looks up the SA vocab ids for the current src phrase - size_t phraseSize = src.GetSize(); - for (size_t pos = 0; pos < phraseSize; ++pos) { - const Word &word = src.GetWord(pos); - wordID_t arrayId = m_srcVocab->GetWordID(word); - if (arrayId == m_srcVocab->GetkOOVWordID()) { - // oov - return false; - } else { - output.SetId(pos, arrayId); - } - } - return true; -} - -pair -BilingualDynSuffixArray:: -GetLexicalWeight(const PhrasePair& pp) const -{ - // sp,tp: sum of link probabilities - // sc,tc: count of links - int src_size = pp.GetSourceSize(); - int trg_size = pp.GetTargetSize(); - vector sp(src_size, 0), tp(trg_size, 0); - vector sc(src_size,0), tc(trg_size,0); - wordID_t const* sw = &(m_srcCorpus->at(m_srcSntBreaks.at(pp.m_sntIndex))); - wordID_t const* tw = &(m_trgCorpus->at(m_trgSntBreaks.at(pp.m_sntIndex))); - vector const & a = m_rawAlignments.at(pp.m_sntIndex); - for (size_t i = 0; i < a.size(); i += 2) { - int s = a[i], t = a.at(i+1), sx, tx; - // sx, tx: local positions within phrase pair - - if (s < pp.m_startSource || t < pp.m_startTarget) continue; - if ((sx = s - pp.m_startSource) >= src_size) continue; - if ((tx = t - pp.m_startTarget) >= trg_size) continue; - - sp[sx] += m_wrd_cooc.pfwd(sw[s],tw[t]); - tp[tx] += m_wrd_cooc.pbwd(sw[s],tw[t]); - ++sc[sx]; - ++tc[tx]; -#if 0 - cout << m_srcVocab->GetWord(sw[s]) << " -> " - << m_trgVocab->GetWord(tw[t]) << " " - << m_wrd_cooc.pfwd(sw[s],tw[t]) << " " - << m_wrd_cooc.pbwd(sw[s],tw[t]) << " " - << sp[sx] << " (" << sc[sx] << ") " - << tp[tx] << " (" << tc[tx] << ") " - << endl; -#endif - } - pair ret(1,1); - wordID_t null_trg = m_trgVocab->GetkOOVWordID(); - wordID_t null_src = m_srcVocab->GetkOOVWordID(); - size_t soff = pp.m_startSource; - for (size_t i = 0; i < sp.size(); ++i) { - if (sc[i]) ret.first *= sp[i]/sc[i]; - else ret.first *= m_wrd_cooc.pfwd(sw[soff+i], null_trg); - } - size_t toff = pp.m_startTarget; - for (size_t i = 0; i < tp.size(); ++i) { - if (tc[i]) ret.second *= tp[i]/tc[i]; - else ret.second *= m_wrd_cooc.pbwd(null_src,tw[toff+i]); - } - return ret; -} - -void -BilingualDynSuffixArray:: -CacheFreqWords() const -{ - multimap wordCnts; - // for each source word in vocab - Vocab::Word2Id::const_iterator it; - for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) { - // get its frequency - wordID_t srcWord = it->second; - vector sword(1, srcWord), wrdIndices; - m_srcSA->GetCorpusIndex(&sword, &wrdIndices); - if(wrdIndices.size() >= 1000) { // min count - wordCnts.insert(make_pair(wrdIndices.size(), srcWord)); - } - } - int numSoFar(0); - multimap::reverse_iterator ritr; - for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) { - m_freqWordsCached.insert(ritr->second); - CacheWordProbs(ritr->second); - if(++numSoFar == 50) break; // get top counts - } - cerr << "\tCached " << m_freqWordsCached.size() << " source words\n"; -} - -void -BilingualDynSuffixArray:: -CacheWordProbs(wordID_t srcWord) const -{ - 
map counts; - vector sword(1, srcWord), wrdIndices; - bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices); - UTIL_THROW_IF2(!ret, "Error"); - - vector sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks); - float denom(0); - // for each occurrence of this word - for(size_t snt = 0; snt < sntIndexes.size(); ++snt) { - int sntIdx = sntIndexes.at(snt); // get corpus index for sentence - UTIL_THROW_IF2(sntIdx == -1, "Error"); - - int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence - const vector srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word - if(srcAlg.size() == 0) { - ++counts[m_srcVocab->GetkOOVWordID()]; // if not alligned then align to NULL word - ++denom; - } else { //get target words aligned to srcword in this sentence - for(size_t i=0; i < srcAlg.size(); ++i) { - wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]); - ++counts[trgWord]; - ++denom; - } - } - } - // now we've gotten counts of all target words aligned to this source word - // get probs and cache all pairs - for(map::const_iterator itrCnt = counts.begin(); - itrCnt != counts.end(); ++itrCnt) { - pair wordPair = make_pair(srcWord, itrCnt->first); - float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg) - float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // gives p(trg->src) - m_wordPairCache[wordPair] = pair(srcTrgPrb, trgSrcPrb); - } -} - -SAPhrase -BilingualDynSuffixArray:: -TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const -{ - // takes sentence indexes and looks up vocab IDs - SAPhrase phraseIds(phrasepair.GetTargetSize()); - int sntIndex = phrasepair.m_sntIndex; - int id(-1), pos(0); - for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) { // look up trg words - id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i); - phraseIds.SetId(pos++, id); - } - return phraseIds; -} - -TargetPhrase* -BilingualDynSuffixArray:: -GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase, const PhraseDictionary *pt) const -{ - TargetPhrase* targetPhrase = new TargetPhrase(pt); - for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words - Word& word = m_trgVocab->GetWord( phrase.words[i]); - UTIL_THROW_IF2(word == m_trgVocab->GetkOOVWord(), - "Unknown word at position " << i); - targetPhrase->AddWord(word); - } - // scoring - return targetPhrase; -} - -/// Gather translation candidates for source phrase /src/ and store raw -// phrase pair statistics in /pstats/. Return the sample rate -// (number of samples considered / total number of hits) and total number of -// phrase pairs -pair -BilingualDynSuffixArray:: -GatherCands(Phrase const& src, map >& pstats) const -{ - typedef map >::iterator pstat_iter; - typedef map >::value_type pstat_entry; - pair ret(0,0); - float& sampleRate = ret.first; - float& totalPhrases = ret.second; - size_t srcSize = src.GetSize(); - SAPhrase localIDs(srcSize); - vector wrdIndices; - if(!GetLocalVocabIDs(src, localIDs) || - !m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) - return ret; // source phrase contains OOVs - - // select a sample of the occurrences for phrase extraction - size_t m1 = wrdIndices.size(); - SampleSelection(wrdIndices); // careful! SampleSelection alters wrdIndices! 
- sampleRate = float(wrdIndices.size())/m1; - - // determine the sentences in which these phrases occur - vector sntIndices = GetSntIndexes(wrdIndices, srcSize, m_srcSntBreaks); - for(size_t s = 0; s < sntIndices.size(); ++s) { - int sntStart = sntIndices.at(s); - if(sntStart == -1) continue; // marked as bad by GetSntIndexes() - vector phrasePairs; - ExtractPhrases(sntStart, wrdIndices[s], srcSize, phrasePairs); - totalPhrases += phrasePairs.size(); - vector::iterator p; - for (p = phrasePairs.begin(); p != phrasePairs.end(); ++p) { - assert(*p); - pair lex = GetLexicalWeight(**p); - pstat_entry entry(TrgPhraseFromSntIdx(**p), Scores(5)); - pair foo = pstats.insert(entry); - Scores& feats = foo.first->second; - if (foo.second) { - feats[0] = 1; // count - feats[1] = lex.first; - feats[3] = lex.second; - } else { - feats[0] += 1; - feats[1] = max(feats[1],lex.first); - feats[3] = max(feats[3],lex.second); - } - delete *p; - } - } // done with all sentences - BOOST_FOREACH(pstat_entry & e, pstats) { - Scores& feats = e.second; - // 0: bwd phrase prob - // 1: lex 1 - // 2: fwd phrase prob - // 3: lex 2 - // 4: phrase penalty - float x = m_trgSA->GetCount(e.first.words)-feats[0] * sampleRate; - feats[4] = 1; - feats[3] = log(feats[3]); - feats[2] = log(feats[0]) - log(totalPhrases); - feats[1] = log(feats[1]); - feats[0] = log(feats[0]) - log(feats[0] + x); - } - return ret; -} - -vector -BilingualDynSuffixArray:: -GetSntIndexes(vector& wrdIndices, - const int sourceSize, - const vector& sntBreaks) const -{ - vector::const_iterator vit; - vector sntIndices; - for(size_t i=0; i < wrdIndices.size(); ++i) { - vit = upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]); - int index = int(vit - sntBreaks.begin()) - 1; - // check for phrases that cross sentence boundaries - if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index)) - sntIndices.push_back(-1); // set bad flag - else - sntIndices.push_back(index); // store the index of the sentence in the corpus - } - return sntIndices; -} - -int -BilingualDynSuffixArray:: -SampleSelection(vector& sample, int sampleSize) const -{ - // only use top 'sampleSize' number of samples - vector s; - randomSample(s,sampleSize,sample.size()); - for (size_t i = 0; i < s.size(); ++i) - s[i] = sample[s[i]]; - sample.swap(s); - return sample.size(); -} - -void -BilingualDynSuffixArray:: -addSntPair(string& source, string& target, string& alignment) -{ - vuint_t srcFactor, trgFactor; - cerr << "source, target, alignment = " << source << ", " - << target << ", " << alignment << endl; - const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size(); - cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl; - Phrase sphrase(ARRAY_SIZE_INCR); - sphrase.CreateFromString(Input, m_inputFactors, source, NULL); - m_srcVocab->MakeOpen(); - vector sIDs(sphrase.GetSize()); - // store words in vocabulary and corpus - for(int i = sphrase.GetSize()-1; i >= 0; --i) { - sIDs[i] = m_srcVocab->GetWordID(sphrase.GetWord(i)); // get vocab id backwards - } - for(size_t i = 0; i < sphrase.GetSize(); ++i) { - srcFactor.push_back(sIDs[i]); - cerr << "srcFactor[" << (srcFactor.size() - 1) << "] = " << srcFactor.back() << endl; - m_srcCorpus->push_back(srcFactor.back()); // add word to corpus - } - m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence - m_srcVocab->MakeClosed(); - Phrase tphrase(ARRAY_SIZE_INCR); - tphrase.CreateFromString(Output, m_outputFactors, target, 
NULL); - m_trgVocab->MakeOpen(); - vector tIDs(tphrase.GetSize()); - for(int i = tphrase.GetSize()-1; i >= 0; --i) { - tIDs[i] = m_trgVocab->GetWordID(tphrase.GetWord(i)); // get vocab id - } - for(size_t i = 0; i < tphrase.GetSize(); ++i) { - trgFactor.push_back(tIDs[i]); - cerr << "trgFactor[" << (trgFactor.size() - 1) << "] = " << trgFactor.back() << endl; - m_trgCorpus->push_back(trgFactor.back()); - } - cerr << "gets to 1\n"; - m_trgSntBreaks.push_back(oldTrgCrpSize); - cerr << "gets to 2\n"; - m_srcSA->Insert(&srcFactor, oldSrcCrpSize); - cerr << "gets to 3\n"; - m_trgSA->Insert(&trgFactor, oldTrgCrpSize); - LoadRawAlignments(alignment); - m_trgVocab->MakeClosed(); - - m_wrd_cooc.Count(sIDs,tIDs, m_rawAlignments.back(), - m_srcVocab->GetkOOVWordID(), - m_trgVocab->GetkOOVWordID()); - - //for(size_t i=0; i < sphrase.GetSize(); ++i) - //ClearWordInCache(sIDs[i]); - -} - -void -BilingualDynSuffixArray:: -ClearWordInCache(wordID_t srcWord) -{ - if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end()) - return; - map, pair >::iterator it, - first, last; - for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) { - if(it->first.first == srcWord) { // all source words grouped - first = it; // copy first entry of srcWord - last = it++; - while(it != m_wordPairCache.end() && (it->first.first == srcWord)) { - last = it++; - } - } - m_wordPairCache.erase(first, last); - } -} - -SentenceAlignment:: -SentenceAlignment(int sntIndex, int sourceSize, int targetSize) - : m_sntIndex(sntIndex) - , numberAligned(targetSize, 0) - , alignedList(sourceSize) -{ - // What is the code below supposed to accomplish??? UG. - // for(int i=0; i < sourceSize; ++i) { - // vector trgWrd; - // alignedList[i] = trgWrd; - // } -} - -bool -SentenceAlignment:: -Extract(int maxPhraseLength, vector &ret, int startSource, int endSource) const -{ - // foreign = target, F=T - // english = source, E=S - int countTarget = numberAligned.size(); - - int minTarget = 9999; - int maxTarget = -1; - vector< int > usedTarget = numberAligned; - for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++) { - for(int ind=0; ind < (int)alignedList[sourcePos].size(); ind++) { - int targetPos = alignedList[sourcePos][ind]; - // cout << "point (" << targetPos << ", " << sourcePos << ")\n"; - if (targetPosmaxTarget) { - maxTarget = targetPos; - } - usedTarget[ targetPos ]--; - } // for(int ind=0;ind= 0 && // aligned to any foreign words at all - maxTarget-minTarget < maxPhraseLength) { - // foreign phrase within limits - - // check if foreign words are aligned to out of bound english words - bool out_of_bounds = false; - for(int targetPos=minTarget; targetPos <= maxTarget && !out_of_bounds; targetPos++) { - if (usedTarget[targetPos]>0) { - // cout << "ouf of bounds: " << targetPos << "\n"; - out_of_bounds = true; - } - } - - // cout << "doing if for ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n"; - if (!out_of_bounds) { - // start point of foreign phrase may retreat over unaligned - for(int startTarget = minTarget; - (startTarget >= 0 && - startTarget > maxTarget-maxPhraseLength && // within length limit - (startTarget==minTarget || numberAligned[startTarget]==0)); // unaligned - startTarget--) { - // end point of foreign phrase may advance over unaligned - for (int endTarget=maxTarget; - (endTarget= 0 && - return (ret.size() > 0); - -} - -int -BilingualDynSuffixArray:: -GetSourceSentenceSize(size_t sentenceId) const -{ - return (sentenceId==m_srcSntBreaks.size()-1) ? 
- m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) : - m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId); -} - -int -BilingualDynSuffixArray:: -GetTargetSentenceSize(size_t sentenceId) const -{ - return (sentenceId==m_trgSntBreaks.size()-1) ? - m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) : - m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId); -} - -BetterPhrase:: -BetterPhrase(ScoresComp const& sc) - : cmp(sc) {} - -// bool -// BetterPhrase:: -// operator()(pair const& a, -// pair const& b) const -// { -// return cmp(b.first,a.first); -// } - -bool -BetterPhrase:: -operator()(pair const& a, - pair const& b) const -{ - return cmp(b.first,a.first); -} - -}// end namepsace diff --git a/moses/TranslationModel/BilingualDynSuffixArray.h b/moses/TranslationModel/BilingualDynSuffixArray.h deleted file mode 100644 index ff0c0594d..000000000 --- a/moses/TranslationModel/BilingualDynSuffixArray.h +++ /dev/null @@ -1,180 +0,0 @@ -#ifndef moses_BilingualDynSuffixArray_h -#define moses_BilingualDynSuffixArray_h - -#include "DynSuffixArray.h" -#include "moses/TranslationModel/DynSAInclude/vocab.h" -#include "moses/TranslationModel/DynSAInclude/types.h" -#include "moses/TranslationModel/DynSAInclude/utils.h" -#include "moses/TranslationModel/WordCoocTable.h" -#include "moses/InputFileStream.h" -#include "moses/FactorTypeSet.h" -#include "moses/TargetPhrase.h" -#include -#include "moses/TargetPhraseCollection.h" -#include - -namespace Moses -{ -class PhraseDictionaryDynSuffixArray; - -/** @todo ask Abbey Levenberg - */ -class SAPhrase -{ -public: - std::vector words; - - SAPhrase(size_t phraseSize) - :words(phraseSize) { - } - - void SetId(size_t pos, wordID_t id) { - words.at(pos) = id; - } - bool operator<(const SAPhrase& phr2) const { - return words < phr2.words; - } -}; - -/** @todo ask Abbey Levenberg - */ -class PhrasePair -{ -public: - int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex; - PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex) - : m_startTarget(startTarget) - , m_endTarget(endTarget) - , m_startSource(startSource) - , m_endSource(endSource) - , m_sntIndex(sntIndex) { - } - - size_t GetTargetSize() const { - return m_endTarget - m_startTarget + 1; - } - - size_t GetSourceSize() const { - return m_endSource - m_startSource + 1; - } -}; - -/** @todo ask Abbey Levenberg - */ -class SentenceAlignment -{ -public: - SentenceAlignment(int sntIndex, int sourceSize, int targetSize); - int m_sntIndex; - std::vector* trgSnt; - std::vector* srcSnt; - std::vector numberAligned; - std::vector< std::vector > alignedList; - bool Extract(int maxPhraseLength, std::vector &ret, - int startSource, int endSource) const; -}; - -class ScoresComp -{ -public: - ScoresComp(const std::vector& weights) { - } - bool operator()(const Scores& s1, const Scores& s2) const { - return s1[0] < s2[0]; // just p(e|f) as approximation - // float score1(0), score2(0); - // int idx1(0), idx2(0); - // for (Scores::const_iterator itr = s1.begin(); - // itr != s1.end(); ++itr) { - // score1 += log(*itr * m_weights.at(idx1++)); - // } - // for (Scores::const_iterator itr = s2.begin(); - // itr != s2.end(); ++itr) { - // score2 += log(*itr * m_weights.at(idx2++)); - // } - // return score1 < score2; - } -}; - -struct BetterPhrase { - ScoresComp const& cmp; - BetterPhrase(ScoresComp const& sc); - // bool operator()(std::pair const& a, - // std::pair const& b) const; - bool operator()(std::pair const& a, - std::pair const& b) const; -}; - 
-/** @todo ask Abbey Levenberg - */ -class BilingualDynSuffixArray -{ -public: - BilingualDynSuffixArray(); - ~BilingualDynSuffixArray(); - bool Load( const std::vector& inputFactors, - const std::vector& outputTactors, - std::string source, std::string target, std::string alignments, - const std::vector &weight); - // bool LoadTM( const std::vector& inputFactors, - // const std::vector& outputTactors, - // string source, string target, string alignments, - // const std::vector &weight); - void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair >& target) const; - - void CleanUp(const InputType& source); - void addSntPair(std::string& source, std::string& target, std::string& alignment); - std::pair - GatherCands(Phrase const& src, std::map >& pstats) const; - - TargetPhrase* - GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase, const PhraseDictionary *pt) const; - -private: - - - mutable WordCoocTable m_wrd_cooc; - DynSuffixArray * m_srcSA; - DynSuffixArray * m_trgSA; - std::vector* m_srcCorpus; - std::vector* m_trgCorpus; - std::vector m_inputFactors; - std::vector m_outputFactors; - - std::vector m_srcSntBreaks, m_trgSntBreaks; - - Vocab* m_srcVocab, *m_trgVocab; - ScoresComp* m_scoreCmp; - - std::vector m_alignments; - std::vector > m_rawAlignments; - - mutable std::map, std::pair > m_wordPairCache; - mutable std::set m_freqWordsCached; - const size_t m_maxPhraseLength, m_maxSampleSize; - const size_t m_maxPTEntries; - int LoadCorpus(FactorDirection direction, - InputFileStream&, const std::vector& factors, - std::vector&, std::vector&, - Vocab*); - int LoadAlignments(InputFileStream& aligs); - int LoadRawAlignments(InputFileStream& aligs); - int LoadRawAlignments(std::string& aligs); - - bool ExtractPhrases(const int&, const int&, const int&, std::vector&, bool=false) const; - SentenceAlignment GetSentenceAlignment(const int, bool=false) const; - int SampleSelection(std::vector&, int = 300) const; - - std::vector GetSntIndexes(std::vector&, int, const std::vector&) const; - SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const; - bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const; - void CacheWordProbs(wordID_t) const; - void CacheFreqWords() const; - void ClearWordInCache(wordID_t); - std::pair GetLexicalWeight(const PhrasePair&) const; - - int GetSourceSentenceSize(size_t sentenceId) const; - int GetTargetSentenceSize(size_t sentenceId) const; - -}; -} // end namespace -#endif diff --git a/moses/TranslationModel/DynSuffixArray.cpp b/moses/TranslationModel/DynSuffixArray.cpp deleted file mode 100644 index c1dc62f12..000000000 --- a/moses/TranslationModel/DynSuffixArray.cpp +++ /dev/null @@ -1,353 +0,0 @@ -#include "DynSuffixArray.h" -#include "util/random.hh" - -#include -#include - -using namespace std; - -namespace Moses -{ - -DynSuffixArray::DynSuffixArray() -{ - m_SA = new vuint_t(); - m_ISA = new vuint_t(); - m_F = new vuint_t(); - m_L = new vuint_t(); - std::cerr << "DYNAMIC SUFFIX ARRAY CLASS INSTANTIATED" << std::endl; -} - -DynSuffixArray::~DynSuffixArray() -{ - delete m_SA; - delete m_ISA; - delete m_F; - delete m_L; -} - -DynSuffixArray::DynSuffixArray(vuint_t* crp) -{ - // make native int array and pass to SA builder - m_corpus = crp; - int size = m_corpus->size(); - int* tmpArr = new int[size]; - for(int i=0 ; i < size; ++i) tmpArr[i] = i; - - Qsort(tmpArr, 0, size-1); - - m_SA = new vuint_t(tmpArr, tmpArr + size); - //std::cerr << "printing SA " << std::endl; - //for(int i=0; i < size; ++i) std::cerr << m_SA->at(i) << std::endl; - 
delete[] tmpArr; - std::cerr << "DYNAMIC SUFFIX ARRAY CLASS INSTANTIATED WITH SIZE " << size << std::endl; - BuildAuxArrays(); - //printAuxArrays(); -} - -void DynSuffixArray::BuildAuxArrays() -{ - int size = m_SA->size(); - m_ISA = new vuint_t(size); - m_F = new vuint_t(size); - m_L = new vuint_t(size); - - for(int i=0; i < size; ++i) { - m_ISA->at(m_SA->at(i)) = i; - //(*m_ISA)[(*m_SA)[i]] = i; - (*m_F)[i] = (*m_corpus)[m_SA->at(i)]; - (*m_L)[i] = (*m_corpus)[(m_SA->at(i) == 0 ? size-1 : m_SA->at(i)-1)]; - } -} - -int DynSuffixArray::Rank(unsigned word, unsigned idx) -{ - /* use Gerlach's code to make rank faster */ - // the number of words in L[0..i] (minus 1 which is why 'i < idx', not '<=') - int r(0); - for(unsigned i=0; i < idx; ++i) - if(m_L->at(i) == word) ++r; - return r; -} - -/* count function should be implemented - * with binary search over suffix array!! */ -int DynSuffixArray::F_firstIdx(unsigned word) -{ - // return index of first row where word is found in m_F - /*for(int i=0; i < m_F->size(); ++i) { - if(m_F->at(i) == word) { - return i; - } - } - return -1;*/ - //NOTE: lower_bound is faster than linear search above but may cause issues - // if ordering of vocab is not consecutive (ie..after deletions) - int low = std::lower_bound(m_F->begin(), m_F->end(), word) - m_F->begin(); - //cerr << "in F_firstIdx with word = " << word << " and low = " << low << " and F->size() =" << m_F->size() << endl; - if((size_t)low >= m_F->size()) - return -1; - else - return low; -} - -/* uses rank() and c() to obtain the LastFirstFunc function */ -int DynSuffixArray::LastFirstFunc(unsigned L_idx) -{ - int fIdx(-1); - //cerr << "in LastFirstFcn() with L_idx = " << L_idx << endl; - unsigned word = m_L->at(L_idx); - if((fIdx = F_firstIdx(word)) != -1) { - //cerr << "fidx + Rank(" << word << "," << L_idx << ") = " << fIdx << "+" << Rank(word, L_idx) << endl; - fIdx += Rank(word, L_idx); - } - return fIdx; -} - -void DynSuffixArray::Insert(vuint_t* newSent, unsigned newIndex) -{ - // for sentences - //stages 1, 2, 4 stay same from 1char case - //(use last word of new text in step 2 and save Ltmp until last insert?) - //stage 3...all words of new sentence are inserted backwards - // stage 2: k=ISA[newIndex], tmp= L[k], L[k] = newChar - //PrintAuxArrays(); - UTIL_THROW_IF2(newIndex > m_SA->size(), "Error"); - int k(-1), kprime(-1); - k = (newIndex < m_SA->size() ? m_ISA->at(newIndex) : m_ISA->at(0)); // k is now index of the cycle that starts at newindex - int true_pos = LastFirstFunc(k); // track cycle shift (newIndex - 1) - int Ltmp = m_L->at(k); - m_L->at(k) = newSent->at(newSent->size()-1); // cycle k now ends with correct word - for(int j = newSent->size()-1; j > -1; --j) { - kprime = LastFirstFunc(k); // find cycle that starts with (newindex - 1) - //kprime += ((m_L[k] == Ltmp) && (k > isa[k]) ? 1 : 0); // yada yada - // only terminal char can be 0 so add new vocab at end - kprime = (kprime > 0 ? kprime : m_SA->size()); - true_pos += (kprime <= true_pos ? 1 : 0); // track changes - // insert everything - m_F->insert(m_F->begin() + kprime, newSent->at(j)); - int theLWord = (j == 0 ? 
Ltmp : newSent->at(j-1)); - - m_L->insert(m_L->begin() + kprime, theLWord); - for (vuint_t::iterator itr = m_SA->begin(); itr != m_SA->end(); ++itr) { - if(*itr >= newIndex) ++(*itr); - } - m_SA->insert(m_SA->begin() + kprime, newIndex); - for (vuint_t::iterator itr = m_ISA->begin(); itr != m_ISA->end(); ++itr) { - if((int)*itr >= kprime) ++(*itr); - } - - m_ISA->insert(m_ISA->begin() + newIndex, kprime); - k = kprime; - //PrintAuxArrays(); - } - // Begin stage 4 - Reorder(true_pos, LastFirstFunc(kprime)); // actual position vs computed position of cycle (newIndex-1) -} - -void DynSuffixArray::Reorder(unsigned j, unsigned jprime) -{ - set > seen; - while(j != jprime) { - // this 'seenit' check added for data with many loops. will remove after double - // checking. - bool seenit = seen.insert(std::make_pair(j, jprime)).second; - if(seenit) { - for(size_t i=1; i < m_SA->size(); ++i) { - if(m_corpus->at(m_SA->at(i)) < m_corpus->at(m_SA->at(i-1))) { - cerr << "PROBLEM WITH SUFFIX ARRAY REORDERING. EXITING...\n"; - exit(1); - } - } - return; - } - //cerr << "j=" << j << "\tj'=" << jprime << endl; - int isaIdx(-1); - int new_j = LastFirstFunc(j); - UTIL_THROW_IF2(j > jprime, "Error"); - // for SA and L, the element at pos j is moved to pos j' - m_L->insert(m_L->begin() + jprime + 1, m_L->at(j)); - m_L->erase(m_L->begin() + j); - m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j)); - m_SA->erase(m_SA->begin() + j); - // all ISA values between (j...j'] decremented - for(size_t i = 0; i < m_ISA->size(); ++i) { - if((m_ISA->at(i) == j) && (isaIdx == -1)) - isaIdx = i; // store index of ISA[i] = j - if((m_ISA->at(i) > j) && (m_ISA->at(i) <= jprime)) --(*m_ISA)[i]; - } - // replace j with j' in ISA - //isa[isaIdx] = jprime; - m_ISA->at(isaIdx) = jprime; - j = new_j; - jprime = LastFirstFunc(jprime); - } - //cerr << "j=" << j << "\tj'=" << jprime << endl; -} - -void DynSuffixArray::Delete(unsigned index, unsigned num2del) -{ - int ltmp = m_L->at(m_ISA->at(index)); - int true_pos = LastFirstFunc(m_ISA->at(index)); // track cycle shift (newIndex - 1) - for(size_t q = 0; q < num2del; ++q) { - int row = m_ISA->at(index); // gives the position of index in SA and m_F - //std::cerr << "row = " << row << std::endl; - //std::cerr << "SA[r]/index = " << m_SA->at(row) << "/" << index << std::endl; - true_pos -= (row <= true_pos ? 
1 : 0); // track changes - m_L->erase(m_L->begin() + row); - m_F->erase(m_F->begin() + row); - - m_ISA->erase(m_ISA->begin() + index); // order is important - for (vuint_t::iterator itr = m_ISA->begin(); itr != m_ISA->end(); ++itr) { - if((int)*itr > row) --(*itr); - } - - m_SA->erase(m_SA->begin() + row); - for (vuint_t::iterator itr = m_SA->begin(); itr != m_SA->end(); ++itr) { - if(*itr > index) --(*itr); - } - } - m_L->at(m_ISA->at(index))= ltmp; - Reorder(LastFirstFunc(m_ISA->at(index)), true_pos); - //PrintAuxArrays(); -} - -void DynSuffixArray::Substitute(vuint_t* /* newSents */, unsigned /* newIndex */) -{ - std::cerr << "NEEDS TO IMPLEMENT SUBSITITUTE FACTOR\n"; - return; -} - -ComparePosition:: -ComparePosition(vuint_t const& crp, vuint_t const& sfa) - : m_crp(crp), m_sfa(sfa) { } - -bool -ComparePosition:: -operator()(unsigned const& i, vector const& phrase) const -{ - unsigned const* x = &m_crp.at(i); - unsigned const* e = &m_crp.back(); - size_t k = 0; - for (; k < phrase.size() && x < e; ++k, ++x) - if (*x != phrase[k]) return *x < phrase[k]; - return (x == e && k < phrase.size()); -} - -bool -ComparePosition:: -operator()(vector const& phrase, unsigned const& i) const -{ - unsigned const* x = &m_crp.at(i); - unsigned const* e = &m_crp.back(); - size_t k = 0; - for (; k < phrase.size() && x < e; ++k, ++x) - if (*x != phrase[k]) return phrase[k] < *x; - return false; // (k == phrase.size() && x < e); -} - -bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices) -{ - // DOES THIS EVEN WORK WHEN A DynSuffixArray has been saved and reloaded???? - pair bounds; - indices->clear(); - size_t phrasesize = phrase->size(); - // find lower and upper bounds on phrase[0] - bounds = std::equal_range(m_F->begin(), m_F->end(), phrase->at(0)); - // bounds holds first and (last + 1) index of phrase[0] in m_SA - size_t lwrBnd = size_t(bounds.first - m_F->begin()); - size_t uprBnd = size_t(bounds.second - m_F->begin()); - //cerr << "phrasesize = " << phrasesize << "\tuprBnd = " << uprBnd << "\tlwrBnd = " << lwrBnd; - //cerr << "\tcorpus size = " << m_corpus->size() << endl; - if(uprBnd - lwrBnd == 0) return false; // not found - if(phrasesize == 1) { - for(size_t i=lwrBnd; i < uprBnd; ++i) { - indices->push_back(m_SA->at(i)); - } - return (indices->size() > 0); - } - //find longer phrases if they exist - for(size_t i = lwrBnd; i < uprBnd; ++i) { - size_t crpIdx = m_SA->at(i); - if((crpIdx + phrasesize) > m_corpus->size()) continue; // past end of corpus - for(size_t pos = 1; pos < phrasesize; ++pos) { // for all following words - if(m_corpus->at(crpIdx + pos) != phrase->at(pos)) { // if word doesn't match - if(indices->size() > 0) i = uprBnd; // past the phrases since SA is ordered - break; - } else if(pos == phrasesize-1) { // found phrase - indices->push_back(crpIdx + pos); // store rigthmost index of phrase - } - } - } - //cerr << "Total count of phrase = " << indices->size() << endl; - return (indices->size() > 0); -} - -size_t -DynSuffixArray:: -GetCount(vuint_t const& phrase) const -{ - ComparePosition cmp(*m_corpus, *m_SA); - vuint_t::const_iterator lb = lower_bound(m_SA->begin(), m_SA->end(), phrase, cmp); - vuint_t::const_iterator ub = upper_bound(m_SA->begin(), m_SA->end(), phrase, cmp); - return ub-lb; -} - -void DynSuffixArray::Save(FILE* fout) -{ - fWriteVector(fout, *m_SA); -} - -void DynSuffixArray::Load(FILE* fin) -{ - fReadVector(fin, *m_SA); -} - -int DynSuffixArray::Compare(int pos1, int pos2, int max) -{ - for (size_t i = 0; i < (unsigned)max; ++i) { - 
if((pos1 + i < m_corpus->size()) && (pos2 + i >= m_corpus->size())) - return 1; - if((pos2 + i < m_corpus->size()) && (pos1 + i >= m_corpus->size())) - return -1; - - int diff = m_corpus->at(pos1+i) - m_corpus->at(pos2+i); - if(diff != 0) return diff; - } - return 0; -} - -namespace -{ -/// Helper: swap two entries in an int array. -inline void swap_ints(int array[], int one, int other) -{ - const int tmp = array[one]; - array[one] = array[other]; - array[other] = tmp; -} -} - -void DynSuffixArray::Qsort(int* array, int begin, int end) -{ - if(end > begin) { - int index = util::rand_incl(begin, end); - { - const int pivot = array[index]; - swap_ints(array, index, end); - for(int i=index=begin; i < end; ++i) { - if (Compare(array[i], pivot, 20) <= 0) { - swap_ints(array, index, i); - index++; - } - } - swap_ints(array, index, end); - } - Qsort(array, begin, index - 1); - Qsort(array, index + 1, end); - } -} - - - -} // end namespace diff --git a/moses/TranslationModel/DynSuffixArray.h b/moses/TranslationModel/DynSuffixArray.h deleted file mode 100644 index cbef2ba3a..000000000 --- a/moses/TranslationModel/DynSuffixArray.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef moses_DynSuffixArray_h -#define moses_DynSuffixArray_h - -#include -#include -#include -#include -#include "moses/Util.h" -#include "moses/File.h" -#include "moses/TranslationModel/DynSAInclude/types.h" - -namespace Moses -{ - -typedef std::vector vuint_t; - - -/// compare position /i/ in the suffix array /m_sfa/ into corpus /m_crp/ -/// against reference phrase /phrase/ -// added by Ulrich Germann -class ComparePosition -{ - vuint_t const& m_crp; - vuint_t const& m_sfa; - -public: - ComparePosition(vuint_t const& crp, vuint_t const& sfa); - bool operator()(unsigned const& i, std::vector const& phrase) const; - bool operator()(std::vector const& phrase, unsigned const& i) const; -}; - - -/** @todo ask Abbey Levenberg - */ -class DynSuffixArray -{ - -public: - DynSuffixArray(); - DynSuffixArray(vuint_t*); - ~DynSuffixArray(); - bool GetCorpusIndex(const vuint_t*, vuint_t*); - void Load(FILE*); - void Save(FILE*); - void Insert(vuint_t*, unsigned); - void Delete(unsigned, unsigned); - void Substitute(vuint_t*, unsigned); - - size_t GetCount(vuint_t const& phrase) const; - -private: - vuint_t* m_SA; - vuint_t* m_ISA; - vuint_t* m_F; - vuint_t* m_L; - vuint_t* m_corpus; - void BuildAuxArrays(); - void Qsort(int* array, int begin, int end); - int Compare(int, int, int); - void Reorder(unsigned, unsigned); - int LastFirstFunc(unsigned); - int Rank(unsigned, unsigned); - int F_firstIdx(unsigned); - void PrintAuxArrays() { - std::cerr << "SA\tISA\tF\tL\n"; - for(size_t i=0; i < m_SA->size(); ++i) - std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t" - << m_F->at(i) << "\t" << m_L->at(i) << std::endl; - } -}; -} //end namespace - -#endif diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.README b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.README deleted file mode 100644 index 19a7f8779..000000000 --- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.README +++ /dev/null @@ -1,4 +0,0 @@ -Specifying Dynamic Suffix Array-based Phrase Tables in moses.ini - -[ttable-file] -14 0 0 5 diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp deleted file mode 100644 index 0ae4d4fc8..000000000 --- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp +++ /dev/null @@ -1,110 +0,0 @@ -#include 
"moses/TranslationModel/PhraseDictionaryDynSuffixArray.h" -#include "moses/FactorCollection.h" -#include "moses/StaticData.h" -#include "moses/TargetPhrase.h" -#include -#include -using namespace std; - -namespace Moses -{ -PhraseDictionaryDynSuffixArray:: -PhraseDictionaryDynSuffixArray(const std::string &line) - : PhraseDictionary(line, true) - ,m_biSA(new BilingualDynSuffixArray()) -{ - ReadParameters(); -} - - -void -PhraseDictionaryDynSuffixArray:: -Load() -{ - SetFeaturesToApply(); - - vector weight = StaticData::Instance().GetWeights(this); - m_biSA->Load(m_input, m_output, m_source, m_target, m_alignments, weight); -} - -PhraseDictionaryDynSuffixArray:: -~PhraseDictionaryDynSuffixArray() -{ - delete m_biSA; -} - -void -PhraseDictionaryDynSuffixArray:: -SetParameter(const std::string& key, const std::string& value) -{ - if (key == "source") { - m_source = value; - } else if (key == "target") { - m_target = value; - } else if (key == "alignment") { - m_alignments = value; - } else { - PhraseDictionary::SetParameter(key, value); - } -} - -const TargetPhraseCollection* -PhraseDictionaryDynSuffixArray:: -GetTargetPhraseCollectionLEGACY(const Phrase& src) const -{ - typedef map >::value_type pstat_entry; - map > pstats; // phrase (pair) statistics - m_biSA->GatherCands(src,pstats); - - TargetPhraseCollection *ret = new TargetPhraseCollection(); - BOOST_FOREACH(pstat_entry & e, pstats) { - TargetPhrase* tp = m_biSA->GetMosesFactorIDs(e.first, src, this); - tp->GetScoreBreakdown().Assign(this,e.second); - tp->EvaluateInIsolation(src); - ret->Add(tp); - } - // return ret; - // TargetPhraseCollection *ret = new TargetPhraseCollection(); - // std::vector< std::pair< Scores, TargetPhrase*> > trg; - // - // // extract target phrases and their scores from suffix array - // m_biSA->GetTargetPhrasesByLexicalWeight(src, trg); - // - // std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr; - // for(itr = trg.begin(); itr != trg.end(); ++itr) { - // Scores scoreVector = itr->first; - // TargetPhrase *targetPhrase = itr->second; - // std::transform(scoreVector.begin(),scoreVector.end(), - // scoreVector.begin(),FloorScore); - // targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); - // targetPhrase->Evaluate(); - // ret->Add(targetPhrase); - // } - ret->NthElement(m_tableLimit); // sort the phrases for the decoder - return ret; -} - -void -PhraseDictionaryDynSuffixArray:: -insertSnt(string& source, string& target, string& alignment) -{ - m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays - //StaticData::Instance().ClearTransOptionCache(); // clear translation option cache -} - -void -PhraseDictionaryDynSuffixArray:: -deleteSnt(unsigned /* idx */, unsigned /* num2Del */) -{ - // need to implement -- -} - -ChartRuleLookupManager* -PhraseDictionaryDynSuffixArray:: -CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase&, std::size_t) -{ - UTIL_THROW(util::Exception, "SCFG decoding not supported with dynamic suffix array"); - return 0; -} - -}// end namepsace diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h deleted file mode 100644 index 6e4d67b6d..000000000 --- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef moses_PhraseDictionaryDynSuffixArray_h -#define moses_PhraseDictionaryDynSuffixArray_h - -#include - -#include "moses/TypeDef.h" -#include "moses/TranslationModel/PhraseDictionary.h" -#include 
"moses/TranslationModel/BilingualDynSuffixArray.h" - -namespace Moses -{ - -/** Implementation of a phrase table using the biconcor suffix array. - * Wrapper around a BilingualDynSuffixArray object - */ -class PhraseDictionaryDynSuffixArray: public PhraseDictionary -{ -public: - PhraseDictionaryDynSuffixArray(const std::string &line); - ~PhraseDictionaryDynSuffixArray(); - bool InitDictionary(); - void Load(); - // functions below required by base class - const TargetPhraseCollection* GetTargetPhraseCollectionLEGACY(const Phrase& src) const; - void insertSnt(std::string&, std::string&, std::string&); - void deleteSnt(unsigned, unsigned); - ChartRuleLookupManager *CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase&, std::size_t); - void SetParameter(const std::string& key, const std::string& value); -private: - BilingualDynSuffixArray *m_biSA; - std::string m_source, m_target, m_alignments; - - std::vector m_weight; -}; - -} // end namespace -#endif diff --git a/moses/TranslationModel/WordCoocTable.cpp b/moses/TranslationModel/WordCoocTable.cpp deleted file mode 100644 index c3223942e..000000000 --- a/moses/TranslationModel/WordCoocTable.cpp +++ /dev/null @@ -1,72 +0,0 @@ -#include "moses/TranslationModel/WordCoocTable.h" -using namespace std; -namespace Moses -{ - -WordCoocTable:: -WordCoocTable() -{ - m_cooc.reserve(1000000); - m_marg1.reserve(1000000); - m_marg2.reserve(1000000); -} - -WordCoocTable:: -WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2) - : m_cooc(VocabSize1), m_marg1(VocabSize1,0), m_marg2(VocabSize2, 0) -{} - -void -WordCoocTable:: -Count(size_t const a, size_t const b) -{ - while (a >= m_marg1.size()) { - m_cooc.push_back(my_map_t()); - m_marg1.push_back(0); - } - while (b >= m_marg2.size()) - m_marg2.push_back(0); - ++m_marg1[a]; - ++m_marg2[b]; - ++m_cooc[a][b]; -} - -uint32_t -WordCoocTable:: -GetJoint(size_t const a, size_t const b) const -{ - if (a >= m_marg1.size() || b >= m_marg2.size()) return 0; - my_map_t::const_iterator m = m_cooc.at(a).find(b); - if (m == m_cooc[a].end()) return 0; - return m->second; -} - -uint32_t -WordCoocTable:: -GetMarg1(size_t const x) const -{ - return x >= m_marg1.size() ? 0 : m_marg1[x]; -} - -uint32_t -WordCoocTable:: -GetMarg2(size_t const x) const -{ - return x >= m_marg2.size() ? 
0 : m_marg2[x]; -} - -float -WordCoocTable:: -pfwd(size_t const a, size_t const b) const -{ - return float(GetJoint(a,b))/GetMarg1(a); -} - -float -WordCoocTable:: -pbwd(size_t const a, size_t const b) const -{ - // cerr << "at " << __FILE__ << ":" << __LINE__ << endl; - return float(GetJoint(a,b))/GetMarg2(b); -} -} diff --git a/moses/TranslationModel/WordCoocTable.h b/moses/TranslationModel/WordCoocTable.h deleted file mode 100644 index c76691a16..000000000 --- a/moses/TranslationModel/WordCoocTable.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef moses_WordCoocTable_h -#define moses_WordCoocTable_h - -#include "moses/TranslationModel/DynSAInclude/vocab.h" -#include "moses/TranslationModel/DynSAInclude/types.h" -#include "moses/TranslationModel/DynSAInclude/utils.h" -#include "moses/InputFileStream.h" -#include "moses/FactorTypeSet.h" -#include "moses/TargetPhrase.h" -#include -#include - -namespace Moses -{ - -#ifndef bitvector -typedef boost::dynamic_bitset bitvector; -#endif - - -/** - * Stores word cooccurrence counts - * @todo ask Uli Germann - */ -class WordCoocTable -{ - typedef std::map my_map_t; - std::vector m_cooc; - std::vector m_marg1; - std::vector m_marg2; -public: - WordCoocTable(); - WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2); - uint32_t GetJoint(size_t const a, size_t const b) const; - uint32_t GetMarg1(size_t const x) const; - uint32_t GetMarg2(size_t const x) const; - float pfwd(size_t const a, size_t const b) const; - float pbwd(size_t const a, size_t const b) const; - void - Count(size_t const a, size_t const b); - - template - void - Count(idvec const& s1, idvec const& s2, alnvec const& aln, - wordID_t const NULL1, wordID_t const NULL2); - -}; - -template -void -WordCoocTable:: -Count(idvec const& s1, idvec const& s2, alnvec const& aln, - wordID_t const NULL1, wordID_t const NULL2) -{ - boost::dynamic_bitset check1(s1.size()), check2(s2.size()); - check1.set(); - check2.set(); - for (size_t i = 0; i < aln.size(); i += 2) { - Count(s1[aln[i]], s2[aln[i+1]]); - check1.reset(aln[i]); - check2.reset(aln[i+1]); - } - for (size_t i = check1.find_first(); i < check1.size(); i = check1.find_next(i)) - Count(s1[i], NULL2); - for (size_t i = check2.find_first(); i < check2.size(); i = check2.find_next(i)) - Count(NULL1, s2[i]); -} - -} -#endif From e8f010b9aff3feb8eeac13a5b06930c18519ea8a Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 17 Aug 2015 18:11:04 +0100 Subject: [PATCH 267/286] Removed ORLM. 
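
For context on what this patch removes: the long-commented-out updateORLM() helper deleted below enumerated every n-gram (up to the LM order, after adding sentence-boundary markers) of a newly inserted target sentence and fed the counts to the online LM. The stand-alone sketch below illustrates only that n-gram counting step; it is not part of the patch, and the function name CountNgrams and the parameter name order are illustrative, not Moses API.

    // Minimal sketch (not part of this patch): count all n-grams up to `order`
    // ending at each token position, the way the removed updateORLM() helper
    // gathered counts before handing them to the online LM.
    #include <map>
    #include <string>
    #include <vector>

    std::map<std::vector<std::string>, int>
    CountNgrams(const std::vector<std::string>& tokens, int order)
    {
      std::map<std::vector<std::string>, int> counts;
      for (int j = 0; j < static_cast<int>(tokens.size()); ++j) {
        // left-most start position of an n-gram ending at j, bounded by the order
        int i = (j < order) ? 0 : j - order + 1;
        for (int t = j; t >= i; --t) {
          std::vector<std::string> ngram(tokens.begin() + t, tokens.begin() + j + 1);
          ++counts[ngram];  // increment the count for this n-gram
        }
      }
      return counts;
    }
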
---
 contrib/server/mosesserver.cpp                |  59 +-
 moses/LM/Jamfile                              |   4 +-
 moses/LM/ORLM.cpp                             | 108 ----
 moses/LM/ORLM.h                               |  53 --
 .../DynSAInclude/FileHandler.cpp              | 186 ------
 .../DynSAInclude/FileHandler.h                |  72 ---
 moses/TranslationModel/DynSAInclude/Jamfile   |   1 -
 .../DynSAInclude/RandLMCache.h                | 201 -------
 .../DynSAInclude/RandLMFilter.h               | 427 --------------
 .../TranslationModel/DynSAInclude/fdstream.h  | 146 -----
 moses/TranslationModel/DynSAInclude/hash.h    | 357 ------------
 .../TranslationModel/DynSAInclude/onlineRLM.h | 542 ------------------
 .../TranslationModel/DynSAInclude/params.cpp  | 241 --------
 moses/TranslationModel/DynSAInclude/params.h  |  64 ---
 .../DynSAInclude/perfectHash.h                | 437 --------------
 .../TranslationModel/DynSAInclude/quantizer.h | 106 ----
 moses/TranslationModel/DynSAInclude/types.h   |  35 --
 moses/TranslationModel/DynSAInclude/utils.h   |  67 ---
 moses/TranslationModel/DynSAInclude/vocab.cpp | 158 -----
 moses/TranslationModel/DynSAInclude/vocab.h   | 127 ----
 20 files changed, 4 insertions(+), 3387 deletions(-)
 delete mode 100644 moses/LM/ORLM.cpp
 delete mode 100644 moses/LM/ORLM.h
 delete mode 100644 moses/TranslationModel/DynSAInclude/FileHandler.cpp
 delete mode 100644 moses/TranslationModel/DynSAInclude/FileHandler.h
 delete mode 100644 moses/TranslationModel/DynSAInclude/Jamfile
 delete mode 100644 moses/TranslationModel/DynSAInclude/RandLMCache.h
 delete mode 100644 moses/TranslationModel/DynSAInclude/RandLMFilter.h
 delete mode 100644 moses/TranslationModel/DynSAInclude/fdstream.h
 delete mode 100644 moses/TranslationModel/DynSAInclude/hash.h
 delete mode 100644 moses/TranslationModel/DynSAInclude/onlineRLM.h
 delete mode 100644 moses/TranslationModel/DynSAInclude/params.cpp
 delete mode 100644 moses/TranslationModel/DynSAInclude/params.h
 delete mode 100644 moses/TranslationModel/DynSAInclude/perfectHash.h
 delete mode 100644 moses/TranslationModel/DynSAInclude/quantizer.h
 delete mode 100644 moses/TranslationModel/DynSAInclude/types.h
 delete mode 100644 moses/TranslationModel/DynSAInclude/utils.h
 delete mode 100644 moses/TranslationModel/DynSAInclude/vocab.cpp
 delete mode 100644 moses/TranslationModel/DynSAInclude/vocab.h

diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp
index 48c14c988..9b34adb6c 100644
--- a/contrib/server/mosesserver.cpp
+++ b/contrib/server/mosesserver.cpp
@@ -39,11 +39,11 @@ int main(int argc, char** argv)
 #include "moses/ThreadPool.h"
 #include "moses/TranslationTask.h"
 #include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
+#include "moses/FF/StatefulFeatureFunction.h"
 #if PT_UG
 #include "moses/TranslationModel/UG/mmsapt.h"
 #endif
 #include "moses/TreeInput.h"
-#include "moses/LM/ORLM.h"
 #include "moses/IOWrapper.h"
 #include
@@ -86,64 +86,11 @@ public:
     msg += "supports updates.";
     throw xmlrpc_c::fault(msg.c_str(), xmlrpc_c::fault::CODE_PARSE);
 #endif
-    if(add2ORLM_) {
-      //updateORLM();
-    }
     XVERBOSE(1,"Done inserting\n");
-    //PhraseDictionary* pdsa = (PhraseDictionary*) pdf->GetDictionary(*dummy);
-    map retData;
-    //*retvalP = xmlrpc_c::value_struct(retData);
-#ifndef PT_UG
-    pdf = 0;
-#endif
-    pdsa = 0;
     *retvalP = xmlrpc_c::value_string("Phrase table updated");
   }
   string source_, target_, alignment_;
-  bool bounded_, add2ORLM_;
-  /*
-  void updateORLM() {
-    // TODO(level101): this belongs in the language model, not in moseserver.cpp
-    vector vl;
-    map, int> ngSet;
-    LMList lms = StaticData::Instance().GetLMList(); // get LM
-    LMList::const_iterator lmIter = lms.begin();
-    LanguageModel *lm = *lmIter;
-    LanguageModelORLM*
orlm = static_cast(lm); - if(orlm == 0) { - cerr << "WARNING: Unable to add target sentence to ORLM\n"; - return; - } - // break out new ngrams from sentence - const int ngOrder(orlm->GetNGramOrder()); - const std::string sBOS = orlm->GetSentenceStart()->GetString().as_string(); - const std::string sEOS = orlm->GetSentenceEnd()->GetString().as_string(); - Utils::splitToStr(target_, vl, " "); - // insert BOS and EOS - vl.insert(vl.begin(), sBOS); - vl.insert(vl.end(), sEOS); - for(int j=0; j < vl.size(); ++j) { - int i = (j= i; --t) { - vector ngVec; - for(int s=t; s<=j; ++s) { - ngVec.push_back(vl[s]); - //cerr << vl[s] << " "; - } - ngSet[ngVec]++; - //cerr << endl; - } - } - // insert into LM in order from 1grams up (for LM well-formedness) - cerr << "Inserting " << ngSet.size() << " ngrams into ORLM...\n"; - for(int i=1; i <= ngOrder; ++i) { - iterate(ngSet, it) { - if(it->first.size() == i) - orlm->UpdateORLM(it->first, it->second); - } - } - } - */ + bool bounded_; void breakOutParams(const params_t& params) { params_t::const_iterator si = params.find("source"); @@ -163,8 +110,6 @@ public: XVERBOSE(1,"alignment = " << alignment_ << endl); si = params.find("bounded"); bounded_ = (si != params.end()); - si = params.find("updateORLM"); - add2ORLM_ = (si != params.end()); } }; diff --git a/moses/LM/Jamfile b/moses/LM/Jamfile index 6ce50f5f0..b8ae957fd 100644 --- a/moses/LM/Jamfile +++ b/moses/LM/Jamfile @@ -134,11 +134,11 @@ if $(with-dalm) { } #ORLM is always compiled but needs special headers -obj ORLM.o : ORLM.cpp ..//headers ../TranslationModel/DynSAInclude//dynsa : : : ../TranslationModel/DynSAInclude ; +#obj ORLM.o : ORLM.cpp ..//headers ../TranslationModel/DynSAInclude//dynsa : : : ../TranslationModel/DynSAInclude ; #Top-level LM library. If you've added a file that doesn't depend on external #libraries, put it here. -alias LM : Backward.cpp BackwardLMState.cpp Base.cpp BilingualLM.cpp Implementation.cpp Ken.cpp MultiFactor.cpp Remote.cpp SingleFactor.cpp SkeletonLM.cpp ORLM.o +alias LM : Backward.cpp BackwardLMState.cpp Base.cpp BilingualLM.cpp Implementation.cpp Ken.cpp MultiFactor.cpp Remote.cpp SingleFactor.cpp SkeletonLM.cpp ../../lm//kenlm ..//headers $(dependencies) ; alias macros : : : : $(lmmacros) ; diff --git a/moses/LM/ORLM.cpp b/moses/LM/ORLM.cpp deleted file mode 100644 index 9632fd6ab..000000000 --- a/moses/LM/ORLM.cpp +++ /dev/null @@ -1,108 +0,0 @@ -#include -#include -#include - -#include "moses/FactorCollection.h" -#include "moses/Phrase.h" -#include "moses/InputFileStream.h" -#include "moses/StaticData.h" -#include "ORLM.h" - -using namespace std; - -namespace Moses -{ -bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType, - size_t nGramOrder) -{ - cerr << "Loading LanguageModelORLM..." 
<< endl; - m_filePath = filePath; - m_factorType = factorType; - m_nGramOrder = nGramOrder; - FileHandler fLmIn(m_filePath, std::ios::in|std::ios::binary, true); - m_lm = new OnlineRLM(&fLmIn, m_nGramOrder); - fLmIn.close(); - //m_lm = new MultiOnlineRLM(m_filePath, m_nGramOrder); - // get special word ids - m_oov_id = m_lm->vocab_->GetWordID(""); - CreateFactors(); - return true; -} -void LanguageModelORLM::CreateFactors() -{ - FactorCollection &factorCollection = FactorCollection::Instance(); - size_t maxFactorId = 0; // to create lookup vector later on - std::map m_lmids_map; // map from factor id -> word id - - for(std::map::const_iterator vIter = m_lm->vocab_->VocabStart(); - vIter != m_lm->vocab_->VocabEnd(); vIter++) { - // get word from ORLM vocab and associate with (new) factor id - size_t factorId = factorCollection.AddFactor(Output,m_factorType,vIter->first.ToString())->GetId(); - m_lmids_map[factorId] = vIter->second; - maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; - } - // add factors for BOS and EOS and store bf word ids - size_t factorId; - m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, ""); - factorId = m_sentenceStart->GetId(); - maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; - m_sentenceStartWord[m_factorType] = m_sentenceStart; - - m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, ""); - factorId = m_sentenceEnd->GetId(); - maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; - m_sentenceEndWord[m_factorType] = m_sentenceEnd; - // add to lookup vector in object - lm_ids_vec_.resize(maxFactorId+1); - // fill with OOV code - fill(lm_ids_vec_.begin(), lm_ids_vec_.end(), m_oov_id); - - for (map::const_iterator iter = m_lmids_map.begin(); - iter != m_lmids_map.end() ; ++iter) - lm_ids_vec_[iter->first] = iter->second; -} -wordID_t LanguageModelORLM::GetLmID(const std::string& str) const -{ - return m_lm->vocab_->GetWordID(str); -} -wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const -{ - size_t factorId = factor->GetId(); - return (factorId >= lm_ids_vec_.size()) ? 
m_oov_id : lm_ids_vec_[factorId]; -} -LMResult LanguageModelORLM::GetValue(const std::vector &contextFactor, - State* finalState) const -{ - FactorType factorType = GetFactorType(); - // set up context - //std::vector factor(1,0); - //std::vector sngram; - wordID_t ngram[MAX_NGRAM_SIZE]; - int count = contextFactor.size(); - for (int i = 0; i < count; i++) { - ngram[i] = GetLmID((*contextFactor[i])[factorType]); - //sngram.push_back(contextFactor[i]->GetString(factor, false)); - } - //float logprob = FloorScore(TransformLMScore(lm_->getProb(sngram, count, finalState))); - LMResult ret; - ret.score = FloorScore(TransformLMScore(m_lm->getProb(&ngram[0], count, finalState))); - ret.unknown = count && (ngram[count - 1] == m_oov_id); - /*if (finalState) - std::cout << " = " << logprob << "(" << *finalState << ", " << *len <<")"<< std::endl; - else - std::cout << " = " << logprob << std::endl; - */ - return ret; -} -bool LanguageModelORLM::UpdateORLM(const std::vector& ngram, const int value) -{ - /*cerr << "Inserting into ORLM: \""; - iterate(ngram, nit) - cerr << *nit << " "; - cerr << "\"\t" << value << endl; */ - m_lm->vocab_->MakeOpen(); - bool res = m_lm->update(ngram, value); - m_lm->vocab_->MakeClosed(); - return res; -} -} diff --git a/moses/LM/ORLM.h b/moses/LM/ORLM.h deleted file mode 100644 index 1ce282ded..000000000 --- a/moses/LM/ORLM.h +++ /dev/null @@ -1,53 +0,0 @@ -#pragma once - -#include -#include -#include "moses/Factor.h" -#include "moses/Util.h" -#include "SingleFactor.h" -#include "moses/TranslationModel/DynSAInclude/onlineRLM.h" -//#include "multiOnlineRLM.h" -#include "moses/TranslationModel/DynSAInclude/FileHandler.h" -#include "moses/TranslationModel/DynSAInclude/vocab.h" - -namespace Moses -{ -class Factor; -class Phrase; - -/** @todo ask ollie - */ -class LanguageModelORLM : public LanguageModelSingleFactor -{ -public: - typedef count_t T; // type for ORLM filter - LanguageModelORLM(const std::string &line) - :LanguageModelSingleFactor(line) - ,m_lm(0) { - } - bool Load(const std::string &filePath, FactorType factorType, size_t nGramOrder); - virtual LMResult GetValue(const std::vector &contextFactor, State* finalState = NULL) const; - ~LanguageModelORLM() { - //save LM with markings - Utils::rtrim(m_filePath, ".gz"); - FileHandler fout(m_filePath + ".marked.gz", std::ios::out|std::ios::binary, false); - m_lm->save(&fout); - fout.close(); - delete m_lm; - } - void CleanUpAfterSentenceProcessing() { - m_lm->clearCache(); // clear caches - } - - bool UpdateORLM(const std::vector& ngram, const int value); -protected: - OnlineRLM* m_lm; - //MultiOnlineRLM* m_lm; - wordID_t m_oov_id; - std::vector lm_ids_vec_; - void CreateFactors(); - wordID_t GetLmID(const std::string &str) const; - wordID_t GetLmID(const Factor *factor) const; -}; -} // end namespace - diff --git a/moses/TranslationModel/DynSAInclude/FileHandler.cpp b/moses/TranslationModel/DynSAInclude/FileHandler.cpp deleted file mode 100644 index ecde3c644..000000000 --- a/moses/TranslationModel/DynSAInclude/FileHandler.cpp +++ /dev/null @@ -1,186 +0,0 @@ -#include "FileHandler.h" -#include - -// Workaround: plain Windows does not have popen()/pclose(). -// (MinGW already #define's them, so skip the workaround there.) 
-#if defined(WIN32) && !defined(__MINGW32__) -#define popen(A, B) _popen(A, B) -#define pclose(A) _pclose(A) -#endif - -namespace Moses -{ - -// FileHandler class -const std::string FileHandler::kStdInDescriptor = "___stdin___"; -const std::string FileHandler::kStdOutDescriptor = "___stdout___"; -// compression commands -const FileExtension FileHandler::kGzipped = ".gz"; -const FileExtension FileHandler::kBzipped2 = ".bz2"; - -const std::string FileHandler::kCatCommand = "cat"; -const std::string FileHandler::kGzipCommand = "gzip -f"; -const std::string FileHandler::kGunzipCommand = "gunzip -f"; -const std::string FileHandler::kBzip2Command = "bzip2 -f"; -const std::string FileHandler::kBunzip2Command = "bunzip2 -f"; - -FileHandler::FileHandler(const std::string & path, std::ios_base::openmode flags, bool /* checkExists */) - : std::fstream((const char*) NULL), path_(path), flags_(flags), buffer_(NULL), fp_(NULL) -{ - if( !(flags^(std::ios::in|std::ios::out)) ) { - fprintf(stderr, "ERROR: FileHandler does not support bidirectional files (%s).\n", path_.c_str()); - exit(EXIT_FAILURE); - } else { - bool ret = setStreamBuffer(flags & std::ios::in); - UTIL_THROW_IF2(!ret, "Unable to set stream buffer"); - } - this->precision(32); -} - -FileHandler::~FileHandler() -{ -#ifndef NO_PIPES - if( fp_ != 0 ) - pclose(fp_); -#endif - if( path_ != FileHandler::kStdInDescriptor && - path_ != FileHandler::kStdOutDescriptor ) - delete buffer_; - if( this->is_open() ) - this->close(); -} - -fdstreambuf * FileHandler::openCompressedFile(const char * cmd) -{ - //bool isInput = (flags_ & std::ios::in); - //open pipe to file with compression/decompression command - const char * p_type = (flags_ & std::ios::in ? "r" : "w"); -#ifndef NO_PIPES - fp_ = popen(cmd, p_type); -#else - fp_ = NULL; -#endif - if( fp_ == NULL ) { - //fprintf(stderr, "ERROR:Failed to open compressed file at %s\n", path_.c_str()); - perror("openCompressedFile: "); - exit(EXIT_FAILURE); - } - //open streambuf with file descriptor - return new fdstreambuf(fileno(fp_)); -} - -bool FileHandler::setStreamBuffer(bool checkExists) -{ - // redirect stdin or stdout if necesary - if (path_ == FileHandler::kStdInDescriptor) { - UTIL_THROW_IF2((flags_ & std::ios::in) == 0, - "Incorrect flags: " << flags_); - std::streambuf* sb = std::cin.rdbuf(); - buffer_ = sb; - } else if (path_ == FileHandler::kStdOutDescriptor) { - UTIL_THROW_IF2((flags_ & std::ios::out) == 0, - "Incorrect flags: " << flags_); - std::streambuf* sb = std::cout.rdbuf(); - buffer_ = sb; - } else { - // real file - if( checkExists && ! fileExists() ) { - fprintf(stderr, "ERROR: Failed to find file at %s\n", path_.c_str()); - exit(EXIT_FAILURE); - } - std::string cmd = ""; - if( isCompressedFile(cmd) && (! cmd.empty()) ) { - buffer_ = openCompressedFile(cmd.c_str()); - } else { - // open underlying filebuf - std::filebuf* fb = new std::filebuf(); - fb->open(path_.c_str(), flags_); - buffer_ = fb; - } - } - if (!buffer_) { - fprintf(stderr, "ERROR:Failed to open file at %s\n", path_.c_str()); - exit(EXIT_FAILURE); - } - this->init(buffer_); - return true; -} - -/* - * Checks for compression via file extension. Currently checks for - * ".gz" and ".bz2". 
- */ -bool FileHandler::isCompressedFile(std::string & cmd) -{ - bool compressed = false, isInput = (flags_ & std::ios::in); - cmd = ""; - unsigned int len = path_.size(); - if( len > kGzipped.size() - && path_.find(kGzipped) == len - kGzipped.size()) { - //gzip file command to compress or decompress - compressed = true; - // cmd = (isInput ? "exec gunzip -cf " : "exec gzip -c > ") + path_; - cmd = (isInput ? "exec " + kGunzipCommand + "c " - : "exec " + kGzipCommand + "c > ") + path_; - } else if( len > kBzipped2.size() && - path_.find(kBzipped2) == len - kBzipped2.size()) { - //do bzipped2 file command - compressed = true; - cmd = (isInput ? "exec " + kBunzip2Command + "c " - : "exec " + kBzip2Command + "c > ") + path_; - } - return compressed; -} - -bool FileHandler::fileExists() -{ - bool exists = false; - struct stat f_info; - if( stat(path_.c_str(), &f_info) == 0 ) //if stat() returns no errors - exists = true; - return( exists ); -} - -// static method used during preprocessing compressed files without -// opening fstream objects. -bool FileHandler::getCompressionCmds(const std::string & filepath, std::string & compressionCmd, - std::string & decompressionCmd, - std::string & compressionSuffix) -{ - // determine what compression and decompression cmds are suitable from filepath - compressionCmd = kCatCommand; - decompressionCmd = kCatCommand; - if (filepath.length() > kGzipped.size() && - filepath.find(kGzipped) == filepath.length() - - kGzipped.length()) { - compressionCmd = kGzipCommand; - decompressionCmd = kGunzipCommand; - compressionSuffix = kGzipped; - } else if (filepath.length() > kBzipped2.size() && - filepath.find(kBzipped2) == filepath.length() - - kBzipped2.length() ) { - compressionCmd = kBzip2Command; - decompressionCmd = kBunzip2Command; - compressionSuffix = kBzipped2;; - } - return (compressionCmd != kCatCommand && decompressionCmd != kCatCommand); -} - -bool FileHandler::reset() -{ -#ifndef NO_PIPES - // move to beginning of file - if (fp_ != 0) { - //can't seek on a pipe so reopen - pclose(fp_); - std::string cmd = ""; - if (isCompressedFile(cmd) && ! cmd.empty()) - buffer_ = openCompressedFile(cmd.c_str()); - //reinitialize - this->init(buffer_); - } else -#endif - buffer_->pubseekoff(0, std::ios_base::beg); //sets both get and put pointers to beginning of stream - return true; -} -} //end namespace diff --git a/moses/TranslationModel/DynSAInclude/FileHandler.h b/moses/TranslationModel/DynSAInclude/FileHandler.h deleted file mode 100644 index 20b3ba6a1..000000000 --- a/moses/TranslationModel/DynSAInclude/FileHandler.h +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef moses_DynSAInclude_file_h -#define moses_DynSAInclude_file_h - -#include -#include -#include -#include -#include -#include -#include "util/exception.hh" -#include "fdstream.h" -#include "utils.h" - -namespace Moses -{ -typedef std::string FileExtension; - -//! @todo ask abby2 -class FileHandler: public std::fstream -{ -public: - // descriptors for stdin and stdout - static const std::string kStdInDescriptor; // file name for std::cin - static const std::string kStdOutDescriptor; // file name for std::cout - // compression commands - static const std::string kCatCommand; // i.e. 
no compression - static const std::string kGzipCommand; // gzip -f - static const std::string kGunzipCommand; // gunzip -f - static const std::string kBzip2Command; // bzip2 -f - static const std::string kBunzip2Command; // bunzip2 -f - - // open file or wrap stdin or stdout - FileHandler(const std::string & path, - std::ios_base::openmode flags = std::ios::in, - bool checkExists = true); - ~FileHandler(); - // file utilities - static bool getCompressionCmds(const std::string & filepath, - std::string & compressionCmd, - std::string & decompressionCmd, - std::string & compressionSuffix); - - // data accessors - std::string getPath() { - return path_; - } - std::ios_base::openmode getFlags() { - return flags_; - } - bool isStdIn() { - return path_ == FileHandler::kStdInDescriptor; - } - bool isStdOut() { - return path_ == FileHandler::kStdOutDescriptor; - } - bool reset(); -protected: - static const FileExtension kGzipped; - static const FileExtension kBzipped2; - bool fileExists(); - bool setStreamBuffer(bool checkExists); - bool isCompressedFile(std::string & cmd); - fdstreambuf* openCompressedFile(const char* cmd); - std::string path_; // file path - std::ios_base::openmode flags_; // open flags - std::streambuf* buffer_; // buffer to either gzipped or standard data - std::FILE* fp_; //file pointer to handle pipe data -}; - -} // end namespace - -#endif diff --git a/moses/TranslationModel/DynSAInclude/Jamfile b/moses/TranslationModel/DynSAInclude/Jamfile deleted file mode 100644 index d2c00f587..000000000 --- a/moses/TranslationModel/DynSAInclude/Jamfile +++ /dev/null @@ -1 +0,0 @@ -alias dynsa : ../../../util//kenutil ../..//headers : : : . ; diff --git a/moses/TranslationModel/DynSAInclude/RandLMCache.h b/moses/TranslationModel/DynSAInclude/RandLMCache.h deleted file mode 100644 index 06ce240a1..000000000 --- a/moses/TranslationModel/DynSAInclude/RandLMCache.h +++ /dev/null @@ -1,201 +0,0 @@ -// Copyright 2008 Abby Levenberg, David Talbot -// -// This file is part of RandLM -// -// RandLM is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RandLM is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RandLM. If not, see . -#ifndef INC_RANDLM_CACHE_H -#define INC_RANDLM_CACHE_H - -#include -#include -#include -#include - -namespace randlm -{ - -//! @todo ask abby2 -template -class CacheNode -{ -public: - typedef std::map* > childMap; - // initialise value to 'unknown' (i.e. not yet queried or cached). - CacheNode(T unknown_value) : value_(unknown_value) {} - childMap childs_; // child pointers - T value_; // value stored - const void* state_; // state pointer -}; - -template -class Cache -{ -public: - typedef typename std::map* >::iterator childPtr; - // unknown_value is used to indicate the ngram was not queried (yet) - // null_value_ indicates it was queried but not found in model - // space usage is handled by client. 
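
FileHandler, declared above, is a std::fstream whose buffer is chosen at construction time: std::cin/std::cout's rdbuf() for the magic stdin/stdout descriptors, a std::filebuf for ordinary files, or a pipe-backed fdstreambuf for compressed ones. The same buffer-swapping idea can be shown with standard components only; this is a sketch of the pattern, not the FileHandler API:

#include <fstream>
#include <iostream>
#include <string>

int main(int argc, char** argv) {
  std::string path = argc > 1 ? argv[1] : "-";
  std::filebuf fb;
  std::streambuf* sb;
  if (path == "-") {
    sb = std::cin.rdbuf();                      // share stdin's buffer
  } else {
    sb = fb.open(path.c_str(), std::ios::in);   // ordinary file
    if (sb == NULL) {
      std::cerr << "cannot open " << path << std::endl;
      return 1;
    }
  }
  std::istream in(sb);   // one reading interface over either source
  std::string line;
  while (std::getline(in, line))
    std::cout << line << '\n';
  return 0;
}
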
- Cache(T unknown_value, T null_value) : - cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) { - root_ = newNode(); - } - ~Cache() { - if(clear()) { - delete root_; - root_ = NULL; - } else { - std::cerr << "Error freeing cache memory.\n"; - } - } - bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) { - // inserts full ngram into cache - CacheNode* node = root_; - for (int i = len - 1; i > -1; --i) { - childPtr child = node->childs_.find(ngram[i]); - if( child != node->childs_.end() ) { - // current node is already prefix. Go to child node - node = node->childs_[ngram[i]]; - } else { - // no child for prefix. set new child link in current node - CacheNode * newChild = newNode(node); - node->childs_[ngram[i]] = newChild; - // go to new node - node = newChild; - } - } - node->value_ = value; - node->state_ = state; - return true; - } - bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) { - // finds value for this full ngram only (returns false if full ngram not in cache) - CacheNode * node = root_; - for(int i = len - 1; i > -1; --i) { - // go to deepest level node of ngram in cache - childPtr child = node->childs_.find(ngram[i]); - if( child != node->childs_.end() ) { - // switch to child node - node = node->childs_[ngram[i]]; - } else { - // not cached - return false; - } - } - *value = node->value_; - if(state) *state = node->state_; - return *value != null_value_ && *value != unknown_value_; - } - int getCache2(const wordID_t* ngram, int len, T** values, int* found) { - // set values array to point to cache value nodes - CacheNode * node = root_; - *found = 0; - //values[0] = &node->value_; // pointer to root node's value - bool all_found = true; - for(int i = len - 1; i > -1; --i) { - // go to deepest level node of ngram in cache - childPtr child = node->childs_.find(ngram[i]); - if( child != node->childs_.end() ) { - // switch to child node - node = node->childs_[ngram[i]]; - // get pointer to value (index by length - 1) - values[i] = &node->value_; - // if null_value then assume all extensions impossible - if (node->value_ == null_value_) { - return len - 1 - i; // max length posible - } - all_found = all_found && (node->value_ != unknown_value_); - if (all_found) - ++(*found); - } else { - // initialise uncached values - CacheNode * newChild = newNode(node); - node->childs_[ngram[i]] = newChild; - // go to new node - node = newChild; - values[i] = &node->value_; - } - } - return len; // all possible - } - int getCache(const wordID_t* ngram, int len, T** values, int* found) { - // get pointers to values for ngram and constituents. - // returns upper bound on longest subngram in model. - // 'found' stores longest non-null and known value found. 
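
The Cache/CacheNode pair above is a trie keyed on word ids, walked from the rightmost context word to the left, with sentinel values distinguishing "never queried" from "queried but absent from the model". A trimmed-down sketch of that structure with std::map children and float payloads (node cleanup is omitted for brevity):

#include <cstdio>
#include <map>

typedef unsigned int wordID_t;

struct Node {
  float value;
  std::map<wordID_t, Node*> childs;
  explicit Node(float unknown) : value(unknown) {}
};

class NgramCache {
public:
  NgramCache(float unknown, float null_v)
    : unknown_(unknown), null_(null_v), root_(new Node(unknown)) {}
  void set(const wordID_t* ngram, int len, float value) {
    Node* n = root_;
    for (int i = len - 1; i >= 0; --i) {        // walk right to left, adding nodes
      Node*& child = n->childs[ngram[i]];
      if (!child) child = new Node(unknown_);
      n = child;
    }
    n->value = value;
  }
  bool get(const wordID_t* ngram, int len, float* value) const {
    const Node* n = root_;
    for (int i = len - 1; i >= 0; --i) {
      std::map<wordID_t, Node*>::const_iterator it = n->childs.find(ngram[i]);
      if (it == n->childs.end()) return false;  // not cached
      n = it->second;
    }
    *value = n->value;
    return n->value != unknown_ && n->value != null_;
  }
private:
  float unknown_, null_;
  Node* root_;                                  // freeing the trie is omitted here
};

int main() {
  NgramCache cache(8888.8888f, 9999.9999f);     // sentinel values as in the ORLM code
  wordID_t ngram[3] = {7, 3, 12};
  cache.set(ngram, 3, -1.25f);
  float p = 0;
  if (cache.get(ngram, 3, &p)) std::printf("cached logprob = %f\n", p);
  return 0;
}
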
- CacheNode * node = root_; - *found = 0; - values[0] = &node->value_; // pointer to root node's value - bool all_found = true; - for(int i = len - 1; i > -1; --i) { - // go to deepest level node of ngram in cache - childPtr child = node->childs_.find(ngram[i]); - if( child != node->childs_.end() ) { - // switch to child node - node = node->childs_[ngram[i]]; - // get pointer to value (index by length - 1) - values[len - i] = &node->value_; - // if null_value then assume all extensions impossible - if (node->value_ == null_value_) - return len - 1 - i; // max length posible - all_found = all_found && (node->value_ != unknown_value_); - if (all_found) - ++(*found); - } else { - // initialise uncached values - CacheNode * newChild = newNode(node); - node->childs_[ngram[i]] = newChild; - // go to new node - node = newChild; - values[len - i] = &node->value_; - } - } - return len; // all possible - } - bool clear() { - std::cerr << "Clearing cache with " << static_cast(cur_nodes_ * nodeSize()) - / static_cast(1ull << 20) << "MB" << std::endl; - return clearNodes(root_); - } - int nodes() { - // returns number of nodes - return cur_nodes_; - } - int nodeSize() { - return sizeof(CacheNode) + sizeof(root_->childs_); - } -private: - CacheNode * root_; - count_t cur_nodes_; - T unknown_value_; // Used to initialise data at each node - T null_value_; // Indicates cached something not in model - CacheNode* newNode(CacheNode * node = 0) { - ++cur_nodes_; - return new CacheNode(unknown_value_); - } - bool clearNodes(CacheNode * node) { - //delete children from this node - if(!node->childs_.empty()) { - iterate(node->childs_, itr) { - if(!clearNodes(itr->second)) - std::cerr << "Error emptying cache\n"; - delete itr->second; - --cur_nodes_; - } - node->childs_.clear(); - } - return true; - } - -}; -} //end namespace -#endif //INC_RANDLM_CACHE_H diff --git a/moses/TranslationModel/DynSAInclude/RandLMFilter.h b/moses/TranslationModel/DynSAInclude/RandLMFilter.h deleted file mode 100644 index edb94e183..000000000 --- a/moses/TranslationModel/DynSAInclude/RandLMFilter.h +++ /dev/null @@ -1,427 +0,0 @@ -// Copyright 2008 David Talbot -// -// This file is part of RandLM -// -// RandLM is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// RandLM is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with RandLM. If not, see . -#ifndef INC_RANDLM_FILTER_H -#define INC_RANDLM_FILTER_H - -#include -#include -#include "FileHandler.h" - -#ifdef WIN32 -#define log2(X) (log((double)X)/log((double)2)) -#endif - -namespace randlm -{ - -/* Class Filter wraps a contiguous array of data. Filter and its subclasses - * implement read/write/increment functionality on arrays with arbitrary sized addresses - * (i.e. an address may not use a full number of bytes). When converting to byte-based - * representation we assume "unused" bits are to left. - * E.g. if the underlying data is stored in units T = uint16 and the 'width' = 11 - * to read 'address' = 3 we extract bits at indices [33,42] (i.e. 
[11*3, 11*4 - 1]) - * and store in a uint16 in positions 0000 0001 111111 where the first 7 bits have - * been masked out. - */ -template -class Filter -{ -public: - Filter(uint64_t addresses, int width) : addresses_(addresses), width_(width), data_(NULL) { - // number of bits in T - cell_width_ = sizeof(T) << 3; - // current implementation has following constraints - assert(cell_width_ > 0 && cell_width_ <= 64 && cell_width_ >= width); - // used for >> division - log_cell_width_ = static_cast(floor(log((double)cell_width_)/log((double)2) + 0.000001)); - // size of underlying data in Ts - cells_ = ((addresses * width) + cell_width_ - 1) >> log_cell_width_; - // instantiate underlying data - data_ = new T[cells_]; - assert(data_ != NULL); - assert(reset()); - // 'first_bit' marks the first bit used by 'address' (left padded with zeros). - first_bit_ = (width % cell_width_ == 0) ? 0 : cell_width_ - (width % cell_width_); - // mask for full cell - full_mask_ = static_cast(0xffffffffffffffffull); - // mask for bits that make up the address - address_mask_ = full_mask_ >> first_bit_; - } - Filter(Moses::FileHandler* fin, bool loaddata = true) : data_(NULL) { - assert(loadHeader(fin)); - if (loaddata) - assert(loadData(fin)); - } - virtual ~Filter() { - delete[] data_; - } - bool reset() { - for (uint64_t i = 0; i < cells_; ++i) - data_[i] = 0; - return true; - } - count_t size() { - // return approx size of filter in MBs - return cells_ * sizeof(T) >> 20; - } - // read / write functions - inline bool read(uint64_t address, T* value) { - assert(address <= addresses_); - // copy address to 'value' - uint64_t data_bit = address * width_; - uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_; - // 'offset' shows how address in 'data' and 'value' align - int offset = (data_bit % cell_width_) - first_bit_; - // they align so just copy across masking unneeded leading bits - if (offset == 0) { - *value = data_[data_cell] & address_mask_; - return true; - } - // data address starts to left so shift it right - if (offset < 0) { - *value = (data_[data_cell] >> -offset) & address_mask_; - return true; - } - // data address is to right so shift it left and look at one more cell to right - *value = ((data_[data_cell] << offset) - | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ; - return true; - } - inline T read(uint64_t address) { - assert(address <= addresses_); - // return value at address - T value = 0; - uint64_t data_bit = address * width_; - uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_; - // 'offset' shows how address in 'data' and 'value' align - int offset = (data_bit % cell_width_) - first_bit_; - // they align so just copy across masking unneeded leading bits - if (offset == 0) { - value = data_[data_cell] & address_mask_; - } - // data address starts to left so shift it right - else if (offset < 0) { - value = (data_[data_cell] >> -offset) & address_mask_; - } - // data address is to right so shift it left and look at one more cell to right - else - value = ((data_[data_cell] << offset) - | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ; - return value; - } - inline bool write(uint64_t address, T value) { - assert(address <= addresses_); - assert(log2(value) <= width_); - // write 'value' to address - uint64_t data_bit = address * width_; - uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_; - // 'offset' shows how address in 'data' and 'value' align - int offset = (data_bit % cell_width_) - first_bit_; - // they 
align so just copy across masking unneeded leading zeros of value - if (offset == 0) { - data_[data_cell] = value | (data_[data_cell] & ~address_mask_); - return true; - } - // address in data is to left so shift value left by -offset - if (offset < 0) { - data_[data_cell] = (value << -offset) - | (data_[data_cell] & ~(address_mask_ << -offset)); - return true; - } - // address in data is to right so shift value right by offset - data_[data_cell] = (value >> offset) | - (data_[data_cell] & ~(address_mask_ >> offset)); - data_[data_cell + 1] = (value << (cell_width_ - offset)) | - (data_[data_cell + 1] & (full_mask_ >> offset)); - return true; - } - inline bool readWithFingerprint(uint64_t address, T finger, T* value) { - // copy 'address' ^ 'finger' to 'value' - uint64_t data_bit = address * width_; - uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_; - // 'offset' shows how address in 'data' and 'value' align - int offset = (data_bit % cell_width_) - first_bit_; - // they align so just copy across masking unneeded leading bits - if (offset == 0) { - *value = (finger ^ data_[data_cell]) & address_mask_; - return true; - } - // data address starts to left so shift it right - if (offset < 0) { - *value = ((data_[data_cell] >> -offset) ^ finger) & address_mask_; - return true; - } - // data address is to right so shift it left and look at one more cell to right - *value = (((data_[data_cell] << offset) - | (data_[data_cell + 1] >> (cell_width_ - offset))) ^ finger) - & address_mask_ ; - return true; - } - inline bool writeWithFingerprint(uint64_t address, T finger, T value) { - // write 'value' ^ 'finger' to address - finger &= address_mask_; // make sure fingerprint is correct size - uint64_t data_bit = address * width_; - uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_; - // 'offset' shows how address in 'data' and 'value' align - int offset = (data_bit % cell_width_) - first_bit_; - // they align so just copy across masking unneeded leading zeros of value - if (offset == 0) { - data_[data_cell] = (finger ^ value) | (data_[data_cell] & ~address_mask_); - return true; - } - // address in data is to left so shift value left by -offset - if (offset < 0) { - data_[data_cell] = ((finger ^ value) << -offset) - | (data_[data_cell] & ~(address_mask_ << -offset)); - return true; - } - // address in data is to right so shift value right by offset - data_[data_cell] = ((finger ^ value) >> offset) | - (data_[data_cell] & ~(address_mask_ >> offset)); - data_[data_cell + 1] = ((finger ^ value) << (cell_width_ - offset)) | - (data_[data_cell + 1] & (full_mask_ >> offset)); - return true; - } - // debugging - void printFilter(const std::string & prefix = "", uint32_t truncate = 64) { - std::cout << prefix; - for (uint32_t i = 0; i < cells_ && i < truncate; ++i) { - for (int j = cell_width_ - 1; j >= 0; --j) - if (data_[i] & (1ull << j)) - std::cout << 1; - else - std::cout << 0; - std::cout << "\n"; - } - std::cout << std::endl; - } - // i/o - uint64_t getAddresses() { - return addresses_; - } - int getWidth() { - return width_; - } - int getCellWidth() { - return cell_width_; - } - uint32_t getCells() { - return cells_; - } - virtual bool save(Moses::FileHandler* out) { - assert(out != NULL); - assert(out->write((char*)&cells_, sizeof(cells_))); - assert(out->write((char*)&cell_width_, sizeof(cell_width_))); - assert(out->write((char*)&log_cell_width_, sizeof(log_cell_width_))); - assert(out->write((char*)&addresses_, sizeof(addresses_))); - assert(out->write((char*)&width_, 
sizeof(width_))); - assert(out->write((char*)&first_bit_, sizeof(first_bit_))); - assert(out->write((char*)&full_mask_, sizeof(full_mask_))); - assert(out->write((char*)&address_mask_, sizeof(address_mask_))); - //assert(out->write((char*)data_, cells_ * sizeof(T))); - const uint64_t jump = 524288032ul; //(uint64_t)pow(2, 29); - if((width_ == 1) || cells_ < jump) - assert(out->write((char*)data_, cells_ * sizeof(T))); - else { - uint64_t idx(0); - while(idx + jump < cells_) { - assert(out->write((char*)&data_[idx], jump * sizeof(T))); - idx += jump; - } - assert(out->write((char*)&data_[idx], (cells_ - idx) * sizeof(T))); - } - return true; - } -protected: - bool loadHeader(Moses::FileHandler* fin) { - assert(fin != NULL); - assert(fin->read((char*)&cells_, sizeof(cells_))); - assert(fin->read((char*)&cell_width_, sizeof(cell_width_))); - assert(cell_width_ == sizeof(T) << 3); // make sure correct underlying data type - assert(fin->read((char*)&log_cell_width_, sizeof(log_cell_width_))); - assert(fin->read((char*)&addresses_, sizeof(addresses_))); - assert(fin->read((char*)&width_, sizeof(width_))); - assert(fin->read((char*)&first_bit_, sizeof(first_bit_))); - assert(fin->read((char*)&full_mask_, sizeof(full_mask_))); - assert(fin->read((char*)&address_mask_, sizeof(address_mask_))); - return true; - } - bool loadData(Moses::FileHandler* fin) { - // instantiate underlying array - data_ = new T[cells_]; - assert(data_ != NULL); - assert(fin->read((char*)data_, cells_ * sizeof(T))); - //assert(fin->read((char*)&data_[0], ceil(float(cells_) / 2.0) * sizeof(T))); - //assert(fin->read((char*)&data_[cells_ / 2], (cells_ / 2) * sizeof(T))); - return true; - } - uint64_t cells_; // number T making up 'data_' - int cell_width_; // bits per cell (i.e. sizeof(T) << 3) - int log_cell_width_; // log of bits used for >> division - uint64_t addresses_; // number of addresses in the filter - int width_; // width in bits of each address - int first_bit_; // position of first bit in initial byte - T full_mask_; // all 1s - T address_mask_; // 1s in those positions that are part of address - T* data_; // the raw data as bytes -}; - -// Extension with bit test/setter methods added -class BitFilter : public Filter -{ -public: - BitFilter(uint64_t bits) : Filter(bits, 1) {} - BitFilter(Moses::FileHandler* fin, bool loaddata = true) - : Filter(fin, loaddata) { - if (loaddata) - assert(load(fin)); - } - // TODO: overload operator[] - virtual bool testBit(uint64_t location) { - // test bit referenced by location - return data_[(location % addresses_) >> 3] & 1 << ((location % addresses_) % 8); - } - virtual bool setBit(uint64_t location) { - // set bit referenced by location - data_[(location % addresses_) >> 3] |= 1 << ((location % addresses_) % 8); - return true; - } - virtual bool clearBit(uint64_t location) { - // set bit referenced by location - data_[(location % addresses_) >> 3] &= 0 << ((location % addresses_) % 8); - return true; - } - bool save(Moses::FileHandler* fout) { - assert(Filter::save(fout)); - std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;; - return true; - } - float rho(uint64_t limit = 0) { - uint64_t ones = 0; - uint64_t range = limit > 0 ? std::min(limit,cells_) : cells_; - for (uint64_t i = 0; i < range; ++i) - for (int j = 0; j < 8; ++j) - if (data_[i] & (1 << j)) - ++ones; - return static_cast((range << 3) - ones)/static_cast(range << 3); - } -protected: - bool load(Moses::FileHandler* fin) { - std::cerr << "Loaded BitFilter. Rho = " << rho() << "." 
<< std::endl;; - return true; - } -}; -/* - // ResizedBitFilter deals with resizing to save memory - // whereas other filters should expect locations to be within range - // this filter will need to resize (and possibly rehash) locations - // to fit a smaller range. - class ResizedBitFilter : public BitFilter { - public: - ResizedBitFilter(Moses::FileHandler* fin) : BitFilter(fin) { - assert(load(fin)); - } - ResizedBitFilter(Moses::FileHandler* fin, uint64_t newsize) : BitFilter(newsize) { - assert(resizeFromFile(fin, newsize)); - } - bool resizeFromFile(Moses::FileHandler* oldin, uint64_t newsize); - virtual bool testBit(uint64_t location) { - // test bit referenced by location - return BitFilter::testBit((location % old_addresses_) * a_ + b_); - } - virtual bool setBit(uint64_t location) { - // set bit referenced by location - return BitFilter::setBit((location % old_addresses_) * a_ + b_); - } - bool save(Moses::FileHandler* fout) { - // re-hashing parameters - assert(BitFilter::save(fout)); - std::cerr << "Saved ResizedBitFilter. Rho = " << rho() << "." << std::endl; - assert(fout->write((char*)&old_addresses_, sizeof(old_addresses_))); - assert(fout->write((char*)&a_, sizeof(a_))); - return fout->write((char*)&b_, sizeof(b_)); - } - protected: - bool load(Moses::FileHandler* fin) { - // re-hashing parameters - std::cerr << "Loaded ResizedBitFilter. Rho = " << rho() << "." << std::endl; - CHECK(fin->read((char*)&old_addresses_, sizeof(old_addresses_))); - CHECK(fin->read((char*)&a_, sizeof(a_))); - return fin->read((char*)&b_, sizeof(b_)); - } - // member data - uint64_t old_addresses_; // size of pre-resized filter - uint64_t a_, b_; // re-hashing parameters (needed?) - }; - - // CountingFilter supports increment operator. Addresses - // of the filter are treated as counters that store their counts - // in big-endian format (i.e. leftmost bit is most significant). 
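
Filter<T> above stores fixed-width values in a flat array of cells, doing reads and writes with word-sized shifts and masks and splitting values that straddle a cell boundary. The sketch below keeps the same external behaviour (an array of `addresses` slots, each `width` bits wide) but uses a deliberately simple bit-at-a-time loop so the packing idea is easy to follow; the bit layout differs from the original's big-endian-within-cell convention:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

class PackedArray {
public:
  PackedArray(uint64_t addresses, int width)
    : width_(width), bits_((addresses * width + 7) / 8, 0) {}
  void write(uint64_t address, uint64_t value) {
    assert(width_ == 64 || value < (uint64_t(1) << width_));
    for (int j = 0; j < width_; ++j)
      setBit(address * width_ + j, (value >> j) & 1);
  }
  uint64_t read(uint64_t address) const {
    uint64_t value = 0;
    for (int j = 0; j < width_; ++j)
      value |= uint64_t(getBit(address * width_ + j)) << j;
    return value;
  }
private:
  void setBit(uint64_t i, uint64_t b) {
    if (b) bits_[i >> 3] |= uint8_t(1u << (i & 7));
    else   bits_[i >> 3] &= uint8_t(~(1u << (i & 7)));
  }
  int getBit(uint64_t i) const { return (bits_[i >> 3] >> (i & 7)) & 1; }
  int width_;
  std::vector<uint8_t> bits_;
};

int main() {
  PackedArray counts(1000, 11);          // 1000 addresses, 11 bits each
  counts.write(3, 1234);                 // 1234 < 2^11
  std::printf("slot 3 = %llu\n", (unsigned long long)counts.read(3));
  return 0;
}
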
- template - class CountingFilter : public Filter { - public: - CountingFilter(uint64_t addresses, int width, bool wrap_around = true) : - Filter(addresses, width), wrap_around_(wrap_around) {} - CountingFilter(Moses::FileHandler* fin) : Filter(fin, true) { - CHECK(load(fin)); - } - ~CountingFilter() {} - // increment this address by one - inline bool increment(uint32_t address) { - uint64_t data_bit = address * this->width_; // index of first bit - uint32_t data_cell = (data_bit >> this->log_cell_width_); // % this->cells_; // index in data_ - // 'offset' shows how address in 'data' and 'value' align - data_bit %= this->cell_width_; - int offset = data_bit - this->first_bit_; - // start from right incrementing and carrying if necessary - bool carry = true; - if (offset > 0) { // counter spans two cells - carry = incrementSubCell(0, offset, &this->data_[data_cell + 1]); - if (carry) - carry = incrementSubCell(data_bit, this->width_ - offset, &this->data_[data_cell]); - } else { // counter is within a single cell - carry = incrementSubCell(data_bit, this->width_, &this->data_[data_cell]); - } - // last update must not have carried - if (!carry) - return true; - // wrapped round so check whether need to reset to max count - if (!wrap_around_) - CHECK(this->write(address, this->address_mask_)); - return false; // false to indicate that overflowed - } - bool save(Moses::FileHandler* fout) { - CHECK(Filter::save(fout)); - return fout->write((char*)&wrap_around_, sizeof(wrap_around_)); - } - private: - bool load(Moses::FileHandler* fin) { - return fin->read((char*)&wrap_around_, sizeof(wrap_around_)); - } - inline bool incrementSubCell(int bit, int len, T* cell) { - // increment counter consisting of bits [startbit, startbit + len - 1] rest stays unchanged - *cell = ((((*cell >> (this->cell_width_ - bit - len)) + 1) - & (this->full_mask_ >> (this->cell_width_ - len))) << (this->cell_width_ - bit - len)) - | (*cell & ~(((this->full_mask_ >> (this->cell_width_ - len)) << (this->cell_width_ - bit - len)))); - // indicate overflow as true - return ((*cell & (this->full_mask_ >> bit)) >> (this->cell_width_ - bit - len)) == 0; - } - bool wrap_around_; // whether to start from 0 on overflow (if not just stay at maximum count) - }; -*/ -} -#endif // INC_RANDLM_FILTER_H diff --git a/moses/TranslationModel/DynSAInclude/fdstream.h b/moses/TranslationModel/DynSAInclude/fdstream.h deleted file mode 100644 index 4c95d032c..000000000 --- a/moses/TranslationModel/DynSAInclude/fdstream.h +++ /dev/null @@ -1,146 +0,0 @@ -/* Class modified by ADL for randlm namespace on Feb 15th, 2008. - * - * The following code declares classes to read from and write to - * file descriptore or file handles. - * - * See - * http://www.josuttis.com/cppcode - * for details and the latest version. - * - * - open: - * - integrating BUFSIZ on some systems? - * - optimized reading of multiple characters - * - stream for reading AND writing - * - i18n - * - * (C) Copyright Nicolai M. Josuttis 2001. - * Permission to copy, use, modify, sell and distribute this software - * is granted provided this copyright notice appears in all copies. - * This software is provided "as is" without express or implied - * warranty, and with no claim as to its suitability for any purpose. 
- * - * Version: Jul 28, 2002 - * History: - * Jul 28, 2002: bugfix memcpy() => memmove() - * fdinbuf::underflow(): cast for return statements - * Aug 05, 2001: first public version - */ -#ifndef moses_DynSAInclude_fdstream_h -#define moses_DynSAInclude_fdstream_h - -#include -// for EOF: -#include -// for memmove(): -#include - - -// low-level read and write functions -#ifdef _MSC_VER -# include -#else -# include -//extern "C" { -// int write (int fd, const char* buf, int num); -// int read (int fd, char* buf, int num); -//} -#endif - - -// BEGIN namespace -//namespace randlm { - -/************************************************************ - * fdstreambuf - * - a stream that reads on a file descriptor - ************************************************************/ - -class fdstreambuf : public std::streambuf -{ -protected: - int fd; // file descriptor -protected: - /* data buffer: - * - at most, pbSize characters in putback area plus - * - at most, bufSize characters in ordinary read buffer - */ - static const int pbSize = 4; // size of putback area - static const int bufSize = 1024; // size of the data buffer - char buffer[bufSize+pbSize]; // data buffer - -public: - /* constructor - * - initialize file descriptor - * - initialize empty data buffer - * - no putback area - * => force underflow() - */ - fdstreambuf (int _fd) : fd(_fd) { - setg (buffer+pbSize, // beginning of putback area - buffer+pbSize, // read position - buffer+pbSize); // end position - } -protected: - // insert new characters into the buffer - virtual int_type underflow () { -#ifndef _MSC_VER - using std::memmove; -#endif - - // is read position before end of buffer? - if (gptr() < egptr()) { - return traits_type::to_int_type(*gptr()); - } - /* process size of putback area - * - use number of characters read - * - but at most size of putback area - */ - int numPutback; - numPutback = gptr() - eback(); - if (numPutback > pbSize) { - numPutback = pbSize; - } - - /* copy up to pbSize characters previously read into - * the putback area - */ - memmove (buffer+(pbSize-numPutback), gptr()-numPutback, - numPutback); - - // read at most bufSize new characters - int num; - num = read (fd, buffer+pbSize, bufSize); - if (num <= 0) { - // ERROR or EOF - return EOF; - } - - // reset buffer pointers - setg (buffer+(pbSize-numPutback), // beginning of putback area - buffer+pbSize, // read position - buffer+pbSize+num); // end of buffer - - // return next character - return traits_type::to_int_type(*gptr()); - } - - // write one character - virtual int_type overflow (int_type c) { - if (c != EOF) { - char z = c; - if (write (fd, &z, 1) != 1) { - return EOF; - } - } - return c; - } - // write multiple characters - virtual - std::streamsize xsputn (const char* s, - std::streamsize num) { - return write(fd,s,num); - } -}; -//} // END namespace - -#endif diff --git a/moses/TranslationModel/DynSAInclude/hash.h b/moses/TranslationModel/DynSAInclude/hash.h deleted file mode 100644 index f349536b5..000000000 --- a/moses/TranslationModel/DynSAInclude/hash.h +++ /dev/null @@ -1,357 +0,0 @@ -#ifndef INC_ALLHASHFUNCS_H -#define INC_ALLHASHFUNCS_H - -#include -#include "types.h" -#include "utils.h" -#include "FileHandler.h" -#include "util/exception.hh" -#include "util/random.hh" - -typedef uint64_t P; // largest input range is 2^64 - -//! 
@todo ask abby2 -template -class HashBase -{ -protected: - T m_; // range of hash output - count_t H_; // number of hash functions to instantiate - virtual void initSeeds()=0; - virtual void freeSeeds()=0; -public: - HashBase(float m, count_t H=1):m_((T)m), H_(H) { - //cerr << "range = (0..." << m_ << "]" << endl; - } - HashBase(Moses::FileHandler* fin) { - load(fin); - } - virtual ~HashBase() {} - virtual T hash(const char*s, count_t h)=0; // string hashing - virtual T hash(const wordID_t* id, const int len, count_t h)=0; // vocab mapped hashing - count_t size() { - return H_; - } - virtual void save(Moses::FileHandler* fout) { - UTIL_THROW_IF2(fout == 0, "Null file handle"); - fout->write((char*)&m_, sizeof(m_)); - fout->write((char*)&H_, sizeof(H_)); - } - virtual void load(Moses::FileHandler* fin) { - UTIL_THROW_IF2(fin == 0, "Null file handle"); - fin->read((char*)&m_, sizeof(m_)); - fin->read((char*)&H_, sizeof(H_)); - } -}; - -//! @todo ask abby2 -template -class UnivHash_linear: public HashBase -{ -public: - UnivHash_linear(float m, count_t H, P pr): - HashBase(m, H), pr_(pr) { - initSeeds(); - } - UnivHash_linear(Moses::FileHandler* fin): - HashBase(fin) { - load(fin); - } - ~UnivHash_linear() { - freeSeeds(); - } - T hash(const char* s, count_t h) { - return 0; //not implemented - } - T hash(const wordID_t* id, const int len, count_t h); - T hash(const wordID_t id, const count_t pos, - const T prevValue, count_t h); - void save(Moses::FileHandler* fout); - void load(Moses::FileHandler* fin); -private: - T** a_, **b_; - P pr_; - void initSeeds(); - void freeSeeds(); -}; - -/** UnivHash_noPrimes: - * From Dietzfelbinger 2008 - * p = input domain range = 2^l - * m = output range = 2^k - * # of hash function = 2^(l-1) - */ -template -class UnivHash_noPrimes: public HashBase -{ -public: - UnivHash_noPrimes(float k, float l): - HashBase(k, 100), d_(count_t((l-k))) { - if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1; - else p_ = (P) pow(2,l); - initSeeds(); - } - UnivHash_noPrimes(Moses::FileHandler* fin): - HashBase(fin) { - load(fin); - } - ~UnivHash_noPrimes() { - freeSeeds(); - } - T hash(const char* s, count_t h); - T hash(const wordID_t* id, const int len, count_t h); - T hash(const P x, count_t h); - void save(Moses::FileHandler* fout); - void load(Moses::FileHandler* fin); -private: - count_t d_; // l-k - P p_, *a_; // real-valued input range, storage - void initSeeds(); - void freeSeeds() { - delete[] a_; - } -}; - -//! @todo ask abby2 -template -class Hash_shiftAddXOR: public HashBase -{ -public: - Hash_shiftAddXOR(float m, count_t H=5): HashBase(m,H), - l_(5), r_(2) { - initSeeds(); - } - ~Hash_shiftAddXOR() { - freeSeeds(); - } - T hash(const char* s, count_t h); - T hash(const wordID_t* id, const int len, count_t h) {} // empty -private: - T* v_; // random seed storage - const unsigned short l_, r_; // left-shift bits, right-shift bits - void initSeeds(); - void freeSeeds() { - delete[] v_; - } -}; - -//! 
@todo ask abby2 -template -class UnivHash_tableXOR: public HashBase -{ -public: - UnivHash_tableXOR(float m, count_t H=5): HashBase(m, H), - table_(NULL), tblLen_(255*MAX_STR_LEN) { - initSeeds(); - } - ~UnivHash_tableXOR() { - freeSeeds(); - } - T hash(const char* s, count_t h); - T hash(const wordID_t* id, const int len, count_t h) {} -private: - T** table_; // storage for random numbers - count_t tblLen_; // length of table - void initSeeds(); - void freeSeeds(); -}; - -// ShiftAddXor -template -void Hash_shiftAddXOR::initSeeds() -{ - v_ = new T[this->H_]; - for(count_t i=0; i < this->H_; i++) - v_[i] = util::wide_rand() + 1; -} -template -T Hash_shiftAddXOR::hash(const char* s, count_t h) -{ - T value = v_[h]; - int pos(0); - unsigned char c; - while((c = *s++) && (++pos < MAX_STR_LEN)) { - value ^= ((value << l_) + (value >> r_) + c); - } - return (value % this->m_); -} - -// UnivHash_tableXOR -template -void UnivHash_tableXOR::initSeeds() -{ - // delete any values in table - if(table_) freeSeeds(); - // instance of new table - table_ = new T* [this->H_]; - // fill with random values - for(count_t j=0; j < this->H_; j++) { - table_[j] = new T[tblLen_]; - for(count_t i=0; i < tblLen_; i++) - table_[j][i] = util::wide_rand_excl(this->m_-1); - } -} -template -void UnivHash_tableXOR::freeSeeds() -{ - for(count_t j = 0; j < this->H_; j++) - delete[] table_[j]; - delete[] table_; - table_ = NULL; -} -template -T UnivHash_tableXOR::hash(const char* s, count_t h) -{ - T value = 0; - count_t pos = 0, idx = 0; - unsigned char c; - while((c = *s++) && (++pos < MAX_STR_LEN)) - value ^= table_[h][idx += c]; - UTIL_THROW_IF2(value >= this->m_, "Error"); - return value; -} - -// UnivHash_noPrimes -template -void UnivHash_noPrimes::initSeeds() -{ - a_ = new P[this->H_]; - for(T i=0; i < this->H_; i++) { - a_[i] = util::wide_rand

(); - if(a_[i] % 2 == 0) a_[i]++; // a must be odd - } -} -template -T UnivHash_noPrimes::hash(const P x, count_t h) -{ - // h_a(x) = (ax mod 2^l) div 2^(l-k) - T value = ((a_[h] * x) % p_) >> d_; - return value % this->m_; -} -template -T UnivHash_noPrimes::hash(const wordID_t* id, const int len, - count_t h) -{ - T value = 0; - int pos(0); - while(pos < len) { - value ^= hash((P)id[pos], h++); - pos++; - } - return value % this->m_; -} -template -T UnivHash_noPrimes::hash(const char* s, count_t h) -{ - T value = 0; - int pos(0); - unsigned char c; - while((c = *s++) && (++pos < MAX_STR_LEN)) { - value ^= hash((P)c, h); - } - return value % this->m_; -} -template -void UnivHash_noPrimes::save(Moses::FileHandler* fout) -{ - HashBase::save(fout); - fout->write((char*)&p_, sizeof(p_)); - fout->write((char*)&d_, sizeof(d_)); - for(T i=0; i < this->H_; i++) { - fout->write((char*)&a_[i], sizeof(a_[i])); - } -} -template -void UnivHash_noPrimes::load(Moses::FileHandler* fin) -{ - a_ = new P[this->H_]; - // HashBase::load(fin) already done in constructor - fin->read((char*)&p_, sizeof(p_)); - fin->read((char*)&d_, sizeof(d_)); - for(T i=0; i < this->H_; i++) { - fin->read((char*)&a_[i], sizeof(a_[i])); - } -} - -//UnivHash_linear -template -void UnivHash_linear::initSeeds() -{ - a_ = new T*[this->H_]; - b_ = new T*[this->H_]; - for(count_t i=0; i < this->H_; i++) { - a_[i] = new T[MAX_NGRAM_ORDER]; - b_[i] = new T[MAX_NGRAM_ORDER]; - for(count_t j=0; j < MAX_NGRAM_ORDER; j++) { - a_[i][j] = 1 + util::wide_rand(); - b_[i][j] = util::wide_rand(); - } - } -} -template -void UnivHash_linear::freeSeeds() -{ - for(count_t i=0; i < this->H_; i++) { - delete[] a_[i]; - delete[] b_[i]; - } - delete[] a_; - delete[] b_; - a_ = b_ = NULL; -} -template -inline T UnivHash_linear::hash(const wordID_t* id, const int len, - count_t h) -{ - UTIL_THROW_IF2(h >= this->H_, "Error"); - - T value = 0; - int pos(0); - while(pos < len) { - value += ((a_[h][pos] * id[pos]) + b_[h][pos]);// % pr_; - ++pos; - } - return value % this->m_; -} -template -inline T UnivHash_linear::hash(const wordID_t id, const count_t pos, - const T prevValue, count_t h) -{ - UTIL_THROW_IF2(h >= this->H_, "Error"); - T value = prevValue + ((a_[h][pos] * id) + b_[h][pos]); // % pr_; - return value % this->m_; -} -template -void UnivHash_linear::save(Moses::FileHandler* fout) -{ - // int bytes = sizeof(a_[0][0]); - HashBase::save(fout); - fout->write((char*)&pr_, sizeof(pr_)); - for(count_t i=0; i < this->H_; i++) { - for(count_t j=0; j < MAX_NGRAM_ORDER; j++) { - fout->write((char*)&a_[i][j], sizeof(a_[i][j])); - fout->write((char*)&b_[i][j], sizeof(b_[i][j])); - //cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl; - //cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl; - } - } -} -template -void UnivHash_linear::load(Moses::FileHandler* fin) -{ - // HashBase::load(fin) already done in constructor - fin->read((char*)&pr_, sizeof(pr_)); - a_ = new T*[this->H_]; - b_ = new T*[this->H_]; - for(count_t i=0; i < this->H_; i++) { - a_[i] = new T[MAX_NGRAM_ORDER]; - b_[i] = new T[MAX_NGRAM_ORDER]; - for(count_t j=0; j < MAX_NGRAM_ORDER; j++) { - fin->read((char*)&a_[i][j], sizeof(a_[i][j])); - fin->read((char*)&b_[i][j], sizeof(b_[i][j])); - //cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl; - //cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl; - } - } -} -#endif diff --git a/moses/TranslationModel/DynSAInclude/onlineRLM.h b/moses/TranslationModel/DynSAInclude/onlineRLM.h deleted file mode 100644 index 
a4dbe98f3..000000000 --- a/moses/TranslationModel/DynSAInclude/onlineRLM.h +++ /dev/null @@ -1,542 +0,0 @@ -#ifndef INC_DYNAMICLM_H -#define INC_DYNAMICLM_H - -#include -#include -#include "perfectHash.h" -#include "RandLMCache.h" -#include "types.h" -#include "vocab.h" - -/* - * DynamicLM manipulates LM - */ -using randlm::BitFilter; -using randlm::Cache; - -const bool strict_checks_ = false; - -//! @todo ask abby2 -template -class OnlineRLM: public PerfectHash -{ -public: - OnlineRLM(uint16_t MBs, int width, int bucketRange, count_t order, - Moses::Vocab* v, float qBase = 8): PerfectHash(MBs, width, bucketRange, qBase), - vocab_(v), bAdapting_(false), order_(order), corpusSize_(0), alpha_(0) { - UTIL_THROW_IF2(vocab_ == 0, "Vocab object not set"); - //instantiate quantizer class here - cache_ = new randlm::Cache(8888.8888, 9999.9999); // unknown_value, null_value - alpha_ = new float[order_ + 1]; - for(count_t i = 0; i <= order_; ++i) - alpha_[i] = i * log10(0.4); - std::cerr << "Initialzing auxillary bit filters...\n"; - bPrefix_ = new randlm::BitFilter(this->cells_); - bHit_ = new randlm::BitFilter(this->cells_); - } - OnlineRLM(Moses::FileHandler* fin, count_t order): - PerfectHash(fin), bAdapting_(true), order_(order), corpusSize_(0) { - load(fin); - cache_ = new randlm::Cache(8888.8888, 9999.9999); // unknown_value, null_value - alpha_ = new float[order_ + 1]; - for(count_t i = 0; i <= order_; ++i) - alpha_[i] = i * log10(0.4); - } - ~OnlineRLM() { - delete[] alpha_; - if(bAdapting_) delete vocab_; - else vocab_ = NULL; - delete cache_; - delete bPrefix_; - delete bHit_; - } - float getProb(const wordID_t* ngram, int len, const void** state); - //float getProb2(const wordID_t* ngram, int len, const void** state); - bool insert(const std::vector& ngram, const int value); - bool update(const std::vector& ngram, const int value); - int query(const wordID_t* IDs, const int len); - int sbsqQuery(const std::vector& ngram, int* len, - bool bStrict = false); - int sbsqQuery(const wordID_t* IDs, const int len, int* codes, - bool bStrict = false); - void remove(const std::vector& ngram); - count_t heurDelete(count_t num2del, count_t order = 5); - uint64_t corpusSize() { - return corpusSize_; - } - void corpusSize(uint64_t c) { - corpusSize_ = c; - } - void clearCache() { - if(cache_) cache_->clear(); - } - void save(Moses::FileHandler* fout); - void load(Moses::FileHandler* fin); - void randDelete(int num2del); - int countHits(); - int countPrefixes(); - int cleanUpHPD(); - void clearMarkings(); - void removeNonMarked(); - Moses::Vocab* vocab_; -protected: - void markQueried(const uint64_t& index); - void markQueried(hpdEntry_t& value); - bool markPrefix(const wordID_t* IDs, const int len, bool bSet); -private: - const void* getContext(const wordID_t* ngram, int len); - const bool bAdapting_; // used to signal adaptation of model - const count_t order_; // LM order - uint64_t corpusSize_; // total training corpus size - float* alpha_; // backoff constant - randlm::Cache* cache_; - randlm::BitFilter* bPrefix_; - randlm::BitFilter* bHit_; -}; - -template -bool OnlineRLM::insert(const std::vector& ngram, const int value) -{ - int len = ngram.size(); - wordID_t wrdIDs[len]; - uint64_t index(this->cells_ + 1); - for(int i = 0; i < len; ++i) - wrdIDs[i] = vocab_->GetWordID(ngram[i]); - index = PerfectHash::insert(wrdIDs, len, value); - if(value > 1 && len < order_) - markPrefix(wrdIDs, ngram.size(), true); // mark context - // keep track of total items from training data minus "" - if(ngram.size() 
== 1 && (!bAdapting_)) // hack to not change corpusSize when adapting - corpusSize_ += (wrdIDs[0] != vocab_->GetBOSWordID()) ? value : 0; - if(bAdapting_ && (index < this->cells_)) // mark to keep while adapting - markQueried(index); - return true; -} - -template -bool OnlineRLM::update(const std::vector& ngram, const int value) -{ - int len = ngram.size(); - std::vector wrdIDs(len); - uint64_t index(this->cells_ + 1); - hpdEntry_t hpdItr; - vocab_->MakeOpen(); - for(int i = 0; i < len; ++i) - wrdIDs[i] = vocab_->GetWordID(ngram[i]); - // if updating, minimize false positives by pre-checking if context already in model - bool bIncluded(true); - if(value > 1 && len < (int)order_) - bIncluded = markPrefix(&wrdIDs[0], ngram.size(), true); // mark context - if(bIncluded) { // if context found - bIncluded = PerfectHash::update2(&wrdIDs[0], len, value, hpdItr, index); - if(index < this->cells_) { - markQueried(index); - } else if(hpdItr != this->dict_.end()) markQueried(hpdItr); - } - - return bIncluded; -} -template -int OnlineRLM::query(const wordID_t* IDs, int len) -{ - uint64_t filterIdx = 0; - hpdEntry_t hpdItr; - int value(0); - value = PerfectHash::query(IDs, len, hpdItr, filterIdx); - if(value != -1) { - if(hpdItr != this->dict_.end()) { - //markQueried(hpdItr); // mark this event as "hit" - value -= ((value & this->hitMask_) != 0) ? this->hitMask_ : 0; // check for previous hit marks - } else { - UTIL_THROW_IF2(filterIdx >= this->cells_, - "Out of bound: " << filterIdx); - //markQueried(filterIdx); - } - } - return value > 0 ? value : 0; -} - -template -bool OnlineRLM::markPrefix(const wordID_t* IDs, const int len, bool bSet) -{ - if(len <= 1) return true; // only do this for for ngrams with context - static randlm::Cache pfCache(-1, -1); // local prefix cache - int code(0); - if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) { - hpdEntry_t hpdItr; - uint64_t filterIndex(0); - code = PerfectHash::query(IDs, len - 1, hpdItr, filterIndex); // hash IDs[0..len-1] - if(code == -1) { // encountered false positive in pipeline - std::cerr << "WARNING: markPrefix(). The O-RLM is *not* well-formed.\n"; - // add all prefixes or return false; - return false; - } - if(filterIndex != this->cells_ + 1) { - UTIL_THROW_IF2(hpdItr != this->dict_.end(), "Error"); - if(bSet) bPrefix_->setBit(filterIndex); // mark index - else bPrefix_->clearBit(filterIndex); // unset index - } else { - UTIL_THROW_IF2(filterIndex != this->cells_ + 1, "Error"); - //how to handle hpd prefixes? 
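
For orientation before getProb(), which follows below: the model scores an n-gram with stupid backoff. It uses the count of the longest suffix found divided by the count of that suffix's context, pays a fixed log10(0.4) penalty (the alpha_ table) for every context word it had to drop, falls back to count/corpusSize for unigrams, and to a uniform score for full OOVs. A toy standalone version of that rule over plain count maps; the count tables and corpus size here are made up, whereas the real model reads counts from the randomised filter:

#include <cmath>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

typedef std::vector<std::string> Ngram;

static std::map<std::string, int> g_counts;   // toy count table
static long g_corpusSize = 0;

static std::string key(const Ngram& ngram, size_t begin, size_t end) {
  std::string k;
  for (size_t i = begin; i < end; ++i) k += ngram[i] + " ";
  return k;
}

static int count(const Ngram& ngram, size_t begin, size_t end) {
  std::map<std::string, int>::const_iterator it = g_counts.find(key(ngram, begin, end));
  return it == g_counts.end() ? 0 : it->second;
}

// Stupid backoff: back off one context word at a time, paying log10(0.4)
// per dropped word; fall back to a unigram estimate, then to a uniform score.
static float stupidBackoff(const Ngram& ngram, long vocabSize) {
  const float backoff = std::log10(0.4f);
  const float oov = std::log10(1.0 / double(vocabSize - 1));
  size_t len = ngram.size();
  for (size_t start = 0; start + 1 < len; ++start) {
    int num = count(ngram, start, len);          // count(context, w)
    int den = count(ngram, start, len - 1);      // count(context)
    if (num > 0 && den >= num)
      return float(start) * backoff + std::log10(double(num) / double(den));
  }
  int uni = count(ngram, len - 1, len);
  if (uni > 0 && g_corpusSize > 0)
    return float(len - 1) * backoff + std::log10(double(uni) / double(g_corpusSize));
  return float(len) * backoff + oov;             // unseen word
}

int main() {
  g_counts["the "] = 50;
  g_counts["the cat "] = 5;
  g_corpusSize = 1000;
  Ngram ng;
  ng.push_back("the");
  ng.push_back("cat");
  std::printf("log10 P(cat|the) = %f\n", stupidBackoff(ng, 100));
  return 0;
}
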
- } - if(pfCache.nodes() > 10000) pfCache.clear(); - pfCache.setCacheNgram(IDs, len - 1, code, NULL); - } - return true; -} - -template -void OnlineRLM::markQueried(const uint64_t& index) -{ - bHit_->setBit(index); - //std::cerr << "filter[" << index << "] = " << this->filter_->read(index) << std::endl; -} - -template -void OnlineRLM::markQueried(hpdEntry_t& value) -{ - // set high bit of counter to indicate "hit" status - value->second |= this->hitMask_; -} - -template -void OnlineRLM::remove(const std::vector& ngram) -{ - wordID_t IDs[ngram.size()]; - for(count_t i = 0; i < ngram.size(); ++i) - IDs[i] = vocab_->GetWordID(ngram[i]); - PerfectHash::remove(IDs, ngram.size()); -} - -template -count_t OnlineRLM::heurDelete(count_t num2del, count_t order) -{ - count_t deleted = 0; - std::cout << "Deleting " << num2del << " of order "<< order << std::endl; - // delete from filter first - int full = *std::max_element(this->idxTracker_, this->idxTracker_ - + this->totBuckets_); - for(; full > 0; --full) // delete from fullest buckets first - for(int bk = 0; bk < this->totBuckets_; ++bk) { - if(deleted >= num2del) break; - if(this->idxTracker_[bk] == full) { // if full - uint64_t first = bk * this->bucketRange_, - last = first + this->bucketRange_; - for(uint64_t row = first; row < last; ++row) { // check each row - if(!(bHit_->testBit(row) || bPrefix_->testBit(row) )) { - if(this->filter_->read(row) != 0) { - PerfectHash::remove(row); // remove from filter - ++deleted; - } - } - } - } - } - if(deleted < num2del) { - // remove from hpd - std::cerr << "TODO! HPD deletions\n"; - } - std::cerr << "Total deleted = " << deleted << std::endl; - return deleted; -} - -template -int OnlineRLM::sbsqQuery(const std::vector& ngram, int* codes, - bool bStrict) -{ - wordID_t IDs[ngram.size()]; - for(count_t i = 0; i < ngram.size(); ++i) - IDs[i] = vocab_->GetWordID(ngram[i]); - return sbsqQuery(IDs, ngram.size(), codes, bStrict); -} - -template -int OnlineRLM::sbsqQuery(const wordID_t* IDs, const int len, int* codes, - bool bStrict) -{ - uint64_t filterIdx = 0; - int val(0), fnd(0); - hpdEntry_t hpdItr; - for(int i = len - 1; i >= 0; --i) { // do subsequence filtering - //if(IDs[i] == Vocab::kOOVWordID) break; - val = PerfectHash::query(&IDs[i], len - i, hpdItr, filterIdx); - if(val != -1) { // if event found - fnd = len - i; // increment found sequence - if(hpdItr != this->dict_.end()) { - val -= ((val & this->hitMask_) != 0) ? this->hitMask_ : 0; // account for previous hit marks - } - } else if(bStrict) { - break; - } - // add to value array - codes[i] = val > 0 ? val : 0; - } - while(bStrict && (fnd > 1)) { // do checks the other way - val = PerfectHash::query(&IDs[len - fnd], fnd - 1, hpdItr, filterIdx); - if(val != -1) break; // if anything found - else --fnd; // else decrement found - } - - return fnd; -} - -template -float OnlineRLM::getProb(const wordID_t* ngram, int len, - const void** state) -{ - static const float oovprob = log10(1.0 / (static_cast(vocab_->Size()) - 1)); - float logprob(0); - const void* context = (state) ? 
*state : 0; - // if full ngram and prob not in cache - if(!cache_->checkCacheNgram(ngram, len, &logprob, &context)) { - // get full prob and put in cache - int num_fnd(0), den_val(0); - int *in = new int[len]; // in[] keeps counts of increasing order numerator - for(int i = 0; i < len; ++i) in[i] = 0; - for(int i = len - 1; i >= 0; --i) { - if(ngram[i] == vocab_->GetkOOVWordID()) break; // no need to query if OOV - in[i] = query(&ngram[i], len - i); - if(in[i] > 0) { - num_fnd = len - i; - } else if(strict_checks_) break; - } - while(num_fnd > 1) { // get lower order count - //get sub-context of size one less than length found (exluding target) - den_val = query(&ngram[len - num_fnd], num_fnd - 1); - if((den_val > 0) && - (den_val >= in[len - num_fnd]) && (in[len - num_fnd] > 0)) { - break; - } else --num_fnd; // else backoff to lower ngram order - } - if(num_fnd == 1 && (in[len - 1] < 1)) // sanity check for unigrams - num_fnd = 0; - switch(num_fnd) { // find prob (need to refactor into precomputation) - case 0: // OOV - logprob = alpha_[len] + oovprob; - break; - case 1: // unigram found only - UTIL_THROW_IF2(in[len - 1] <= 0, "Error"); - logprob = alpha_[len - 1] + (corpusSize_ > 0 ? - log10(static_cast(in[len - 1]) / static_cast(corpusSize_)) : 0); - //logprob = alpha_[len - 1] + - //log10(static_cast(in[len - 1]) / static_cast(corpusSize_)); - break; - default: - UTIL_THROW_IF2(den_val <= 0, "Error"); - //if(subgram == in[len - found]) ++subgram; // avoid returning zero probs???? - logprob = alpha_[len - num_fnd] + - log10(static_cast(in[len - num_fnd]) / static_cast(den_val)); - break; - } - // need unique context - context = getContext(&ngram[len - num_fnd], num_fnd); - // put whatever was found in cache - cache_->setCacheNgram(ngram, len, logprob, context); - } // end checkCache - return logprob; -} - -template -const void* OnlineRLM::getContext(const wordID_t* ngram, int len) -{ - int dummy(0); - float**addresses = new float*[len]; // only interested in addresses of cache - UTIL_THROW_IF2(cache_->getCache2(ngram, len, &addresses[0], &dummy) != len, - "Error"); - // return address of cache node - - float *addr0 = addresses[0]; - free( addresses ); - return (const void*)addr0; -} - -template -void OnlineRLM::randDelete(int num2del) -{ - int deleted = 0; - for(uint64_t i = 0; i < this->cells_; i++) { - if(this->filter_->read(i) != 0) { - PerfectHash::remove(i); - ++deleted; - } - if(deleted >= num2del) break; - } -} - -template -int OnlineRLM::countHits() -{ - int hit(0); - for(uint64_t i = 0; i < this->cells_; ++i) - if(bHit_->testBit(i)) ++hit; - iterate(this->dict_, itr) - if((itr->second & this->hitMask_) != 0) - ++hit; - std::cerr << "Hit count = " << hit << std::endl; - return hit; -} - -template -int OnlineRLM::countPrefixes() -{ - int pfx(0); - for(uint64_t i = 0; i < this->cells_; ++i) - if(bPrefix_->testBit(i)) ++pfx; - //TODO::Handle hpdict prefix counts - std::cerr << "Prefix count (in filter) = " << pfx << std::endl; - return pfx; -} - -template -int OnlineRLM::cleanUpHPD() -{ - std::cerr << "HPD size before = " << this->dict_.size() << std::endl; - std::vector vDel, vtmp; - iterate(this->dict_, itr) { - if(((itr->second & this->hitMask_) == 0) && // if not hit during testing - (Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram - vDel.push_back(itr->first); - } - } - iterate(vDel, vitr) - this->dict_.erase(*vitr); - std::cerr << "HPD size after = " << this->dict_.size() << std::endl; - return vDel.size(); -} - -template -void 
OnlineRLM::clearMarkings() -{ - std::cerr << "clearing all event hits\n"; - bHit_->reset(); - count_t* value(0); - iterate(this->dict_, itr) { - value = &itr->second; - *value -= ((*value & this->hitMask_) != 0) ? this->hitMask_ : 0; - } -} - -template -void OnlineRLM::save(Moses::FileHandler* fout) -{ - std::cerr << "Saving ORLM...\n"; - // save vocab - vocab_->Save(fout); - fout->write((char*)&corpusSize_, sizeof(corpusSize_)); - fout->write((char*)&order_, sizeof(order_)); - bPrefix_->save(fout); - bHit_->save(fout); - // save everything else - PerfectHash::save(fout); - std::cerr << "Finished saving ORLM." << std::endl; -} - -template -void OnlineRLM::load(Moses::FileHandler* fin) -{ - std::cerr << "Loading ORLM...\n"; - // load vocab first - vocab_ = new Moses::Vocab(fin); - UTIL_THROW_IF2(vocab_ == 0, "Vocab object not set"); - fin->read((char*)&corpusSize_, sizeof(corpusSize_)); - std::cerr << "\tCorpus size = " << corpusSize_ << std::endl; - fin->read((char*)&order_, sizeof(order_)); - std::cerr << "\tModel order = " << order_ << std::endl; - bPrefix_ = new randlm::BitFilter(fin); - bHit_ = new randlm::BitFilter(fin); - // load everything else - PerfectHash::load(fin); -} - -template -void OnlineRLM::removeNonMarked() -{ - std::cerr << "deleting all unused events\n"; - int deleted(0); - for(uint64_t i = 0; i < this->cells_; ++i) { - if(!(bHit_->testBit(i) || bPrefix_->testBit(i)) - && (this->filter_->read(i) != 0)) { - PerfectHash::remove(i); - ++deleted; - } - } - deleted += cleanUpHPD(); - std::cerr << "total removed from ORLM = " << deleted << std::endl; -} - -/* -template -float OnlineRLM::getProb2(const wordID_t* ngram, int len, const void** state) { - static const float oovprob = log10(1.0 / (static_cast(vocab_->size()) - 1)); - float log_prob(0); - const void* context_state(NULL); - int found; - int* denom_codes[order_]; - int* num_codes[order_ + 1]; - int denom_found(0); - std::cerr << "length=" << len << std::endl; - // constrain cache queries using model assumptions - int denom_len = cache_->getCache(ngram, len - 1, &denom_codes[0], &denom_found); - std::cerr << "denom_len = " << denom_len << std::endl; - int num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1, - &num_codes[0], &found); - std::cerr << "num_len= " << num_len << std::endl; - // keed reducing ngram size until both denominator and numerator are found - // allowed to leave kUnknownCode in cache because we check for this. - found = num_len; // guaranteed to be <= denom_len + 1 - // still check for OOV - for (int i = len - found; i < len; ++i) - if (ngram[i] == Vocab::kOOVWordID) { - found = len - i - 1; - } - // check for relative estimator - while(found > 1) { - if(*denom_codes[found-1] == cache_unk_ && - ((*denom_codes[found-1] = query(&ngram[len-found], found-1)) == 0)) { - //!struct_->query(&ngram[len-*found], *found-1, kMainEventIdx, denom_codes[*found-1])) { - *num_codes[found] = cache_unk_; - } else { - if(*num_codes[found] != cache_unk_ || - ((*num_codes[found] = query(&ngram[len-found], found)) <= *denom_codes[found-1])) - // struct_->query(&ngram[len-*found], *found, kMainEventIdx, - // num_codes[*found], *denom_codes[*found-1])) - break; - } - --found; - } - // didn't find bigram numerator or unigram denominator - if (found == 1) - found = *num_codes[1] != cache_unk_ - || ((*num_codes[1] = query(&ngram[len - 1], 1)) != 0); - //struct_->query(&ngram[len - 1], 1, kMainEventIdx, num_codes[1]); - // .... 
- // return estimate applying correct backoff score (precomputed) - // store full log prob with complete ngram (even if backed off) - switch (found) { - case 0: // no observation: assign prob of 'new word' in training data - log_prob = alpha_[len] + oovprob; - //log_prob = stupid_backoff_log10_[len] + uniform_log10prob_; - break; - case 1: // unigram over whole corpus - log_prob = alpha_[len - 1] + - log10(static_cast(*num_codes[1]) / static_cast(corpusSize_)); - //log_prob = log_quantiser_->getLog10Value(*num_codes[1]) - corpus_size_log10_ - // + stupid_backoff_log10_[len - 1]; // precomputed - break; - default: // otherwise use both statistics and (possibly zero) backoff weight - log_prob = alpha_[len - found] + - log10(static_cast(*num_codes[found]) / static_cast(*denom_codes[found-1])); - //log_prob = log_quantiser_->getLog10Value(*num_codes[*found ]) - // - log_quantiser_->getLog10Value(*denom_codes[*found - 1]) - // + stupid_backoff_log10_[len - *found]; - } - context_state = (const void*)num_codes[found == len ? found - 1 : found];; - //probCache_->store(len, log_prob, context_state); - if (state) - *state = context_state; - return log_prob; -} -*/ - -#endif - diff --git a/moses/TranslationModel/DynSAInclude/params.cpp b/moses/TranslationModel/DynSAInclude/params.cpp deleted file mode 100644 index 2c5b416e5..000000000 --- a/moses/TranslationModel/DynSAInclude/params.cpp +++ /dev/null @@ -1,241 +0,0 @@ -#include "params.h" - -namespace Moses -{ -// parameter constants -const std::string Parameters::kNotSetValue = "__NOT_SET__"; - -const int Parameters::kBoolValue = 0; -const int Parameters::kIntValue = 1; -const int Parameters::kFloatValue = 2; -const int Parameters::kStringValue = 3; -const int Parameters::kUndefinedValue = -1; - -const std::string Parameters::kTrueValue = "1"; -const std::string Parameters::kFalseValue = "0"; - -Parameters::Parameters(const ParamDefs * paramdefs, const count_t paramNum) -{ - initialize(paramdefs, paramNum); -} - -Parameters::Parameters(int argc, char ** argv, const ParamDefs * paramdefs, - const count_t paramNum) -{ - initialize(paramdefs, paramNum); - loadParams(argc, argv); -} - -void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum) -{ - for( count_t i = 0; i < paramNum; i++ ) { - params_[paramdefs[i].name] = paramdefs[i]; // assign name - } - std::cerr << "Default parameter values:\n"; - iterate(params_, itr) - std::cerr << "\t" << itr->first << " --> " << itr->second.value << std::endl; -} - -bool Parameters::loadParams(int argc, char ** argv) -{ - // load params from commandline args - //if( argc < 3 ) { - // fprintf(stderr, "ERROR: No parameters. 
Use \"-config\" or \"-f\" to specify configuration file.\n"); - // return false; - //} - bool load_from_file = false; - std::set setParams; - int jumpBy = 0; - for( int i = 1; i < argc; i += jumpBy ) { - std::string param = argv[i]; - if(param[0] != '-') { - std::cerr << "Unknown parameter: " << param << std::endl; - return false; - } - Utils::ltrim(param, "- "); - // normalise parameter to long name - param = normaliseParamName(param); - // check if valid param name - if(!isValidParamName(param)) { - std::cerr << "Unknown param option \"" << param << "\"\n"; - exit(EXIT_FAILURE); - } - setParams.insert(param); // needed to not overwrite param value if file is specified - //if the parameter is of type booL no corresponding value - if( getValueType(param) == kBoolValue ) { - jumpBy = 1; - UTIL_THROW_IF2(!setParamValue(param, kTrueValue), - "Couldn't set parameter " << param); - } else { //not of type bool so must have corresponding value - UTIL_THROW_IF2(i+1 >= argc, - "Out of bound error: " << i+1); - - jumpBy = 2; - std::string val = argv[i+1]; - Utils::trim(val); - if( param == "config" ) - load_from_file = true; - if(!setParamValue(param, val)) { - std::cerr << "Invalid Param name->value " << param << "->" << val << std::endl; - return false; - } - } - } - bool success = true; - // load from file if specified - if (load_from_file) - success = loadParams(getParamValue("config"), setParams); - return success; -} - -std::string Parameters::normaliseParamName(const std::string & name) -{ - // Map valid abbreviations to long names. Retain other names. - if( params_.find(name) == params_.end() ) - iterate(params_, i) - if( i->second.abbrev == name ) - return i->first; - return name; -} - -int Parameters::getValueType(const std::string& name) -{ - if(params_.find(name) != params_.end()) - return params_[name].type; - return Parameters::kUndefinedValue; -} - -bool Parameters::isValidParamName(const std::string & name) -{ - return params_.find(name) != params_.end(); -} - -bool Parameters::setParamValue(const std::string& name, const std::string& val) -{ - // TODO: Add basic type checking w verifyValueType() - bool set = isValidParamName(name); - if(set) { - params_[name].value = val; - std::cerr << "PARAM SET: "<< name << "=" << val << std::endl; - } - return( set ); -} -std::string Parameters::getParamValue(const std::string& name) -{ - std::string value = Parameters::kNotSetValue; - if(isValidParamName(name)) { - if(params_.find(name) != params_.end()) - value = params_[name].value; - else if(getValueType(name) == kBoolValue) - value = kFalseValue; - } - return value; -} -std::string Parameters::getParam(const std::string& name) -{ - return getParamValue(name); - /*void* Parameters::getParam(const std::string& name) { - void* paramVal = 0; - int type = getValueType(name); - const char* sval = getParamValue(name).c_str(); - switch(type) { - case kIntValue: { - int ival = atoi(sval); - paramVal = (void*)&ival; - break; - } - case kFloatValue: { - float fval = atof(sval); - paramVal = (void*)&fval; - break; - } - case kStringValue: { - paramVal = (void*)sval; - break; - } - case kBoolValue: { - bool bval = sval == Parameters::kTrueValue ? 
true : false; - paramVal = (void*)&bval; - break; - } - default: // --> Parameters::kUndefinedValue - paramVal = (void*)sval; // will set to Parameters::kNotSetValue - } - return paramVal;*/ -} -bool Parameters::verifyValueType(const std::string& name, const std::string& val) -{ - // Implement basic type checking - return true; -} - -int Parameters::getParamCount() const -{ - return params_.size(); -} - -/* - * HAVE TO CHANGE loadParams() from file to not overwrite command lines but - * override default if different*/ -bool Parameters::loadParams(const std::string & file_path, - std::set& setParams) -{ - // parameters loaded from file don't override cmd line paramters - /*std::set::iterator end = setParams.end(); - FileHandler file(file_path.c_str(), std::ios::in); - std::string line, param; - while ( getline(file, line) ) { - Utils::trim(line); - //ignore comments (lines beginning with '#') and empty lines - if( line[0] == '#' || line.empty() ) continue; - if( line[0] == '[' ) { - Utils::trim(line, "-[]"); //remove brackets - // normalise parameter names - param = normaliseParamName(line); - //handle boolean type parameters - if(getValueType(param) == kBoolValue && setParams.find(param) == end) - setParamValue(param, kTrueValue); - } else { - // TODO: verify that this works as intended - if(setParams.find(param) == end) { // if param hasn't already been set in cmd line - if(!setParamValue(param, line)) { - std::cerr << "Invalid Param name->value " << param << "->" << line << std::endl; - return false; - } - } - } - }*/ - return true; -} -/* -int Parameters::getCSVParams(const std::string & name, std::vector & values) { - // get param values(s) may be more than one separated by commas - values.clear(); - if( isValidParamName(name) ) - if( params_.find(name) != params_.end() ) - return Utils::tokenizeToStr(params_.find(name)->second.c_str(), values, ","); - return 0; -} - -bool Parameters::checkParamIsSet(const std::string & name) { - // Returns true for non-bool parameter that is set to anything. - // Returns true for bool parameter only if set to true. - if (getValueType(name) == kBoolValue) // boolean value so check whether true - return getParamValue(name) == kTrueValue; - return (getParamValue(name) != kNotSetValue); -} - -bool Parameters::printHelp(const std::string & name) { - return true; -} - -bool Parameters::printParams() { - // print out parameters and values - std::map::iterator it; - std::cerr << "User defined parameter settings:\n"; - for (it = params_.begin(); it != params_.end(); ++it) - std::cerr << "\t" << it->first << "\t" << it->second << "\n"; - return true; -} -*/ -} diff --git a/moses/TranslationModel/DynSAInclude/params.h b/moses/TranslationModel/DynSAInclude/params.h deleted file mode 100644 index ecaccada4..000000000 --- a/moses/TranslationModel/DynSAInclude/params.h +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef moses_DynSAInclude_params_h -#define moses_DynSAInclude_params_h - -#include -#include -#include -#include -#include "FileHandler.h" -#include "utils.h" -#include "types.h" - -#define NumOfParams(paramArray) (sizeof(paramArray)/sizeof((paramArray)[0])) - -namespace Moses -{ -typedef struct ParamDefs { - std::string name; - std::string value; - std::string abbrev; - int type; - std::string description; -} ParamDefs; - -//! 
@todo ask abby2 -class Parameters -{ -public: - static const std::string kNotSetValue; - static const int kBoolValue; - static const int kIntValue; - static const int kFloatValue; - static const int kStringValue; - static const int kUndefinedValue; - static const std::string kFalseValue; - static const std::string kTrueValue; - - Parameters(const ParamDefs * paramdefs, const count_t paramNum); - Parameters(int argc, char** argv, const ParamDefs * paramdefs, const count_t paramNum); - ~Parameters() {} - bool loadParams(int argc, char ** argv); - bool loadParams(const std::string& param_file, std::set&); - int getValueType(const std::string & name); - bool setParamValue(const std::string& name, const std::string& value); - bool verifyValueType(const std::string& name, const std::string& value); - bool isValidParamName(const std::string & name); - std::string getParamValue(const std::string& name); - //void* getParam(const std::string& name); - std::string getParam(const std::string& name); - int getParamCount() const; - /* - int getCSVParams(const std::string & name, std::vector & values); - bool checkParamIsSet(const std::string& name); - bool printParams(); - bool printHelp(const std::string & name); - */ -private: - std::string normaliseParamName(const std::string &name); - void initialize(const ParamDefs * paramdefs, const count_t paramNum); - std::map params_; // name->value,type,abbrev,desc -}; - -} -#endif //INC_PARAMS.H - diff --git a/moses/TranslationModel/DynSAInclude/perfectHash.h b/moses/TranslationModel/DynSAInclude/perfectHash.h deleted file mode 100644 index 9b597f06c..000000000 --- a/moses/TranslationModel/DynSAInclude/perfectHash.h +++ /dev/null @@ -1,437 +0,0 @@ -/* NO OVERLAY VALUES STORED IN SEPERATE FILTER */ -#ifndef INC_PERFECTHASH_H -#define INC_PERFECTHASH_H - -#include -#include -#include "hash.h" -#include "RandLMFilter.h" -#include "quantizer.h" - -/** - * PerfectHash handles setting up hash functions and storage - * for LM data. - */ -using randlm::Filter; -using randlm::BitFilter; -typedef std::map hpDict_t; -typedef hpDict_t::iterator hpdEntry_t; -static count_t collisions_ = 0; - -/* Based on Mortenson et. al. 
2006 */ -template -class PerfectHash -{ -public: - PerfectHash(uint16_t MBs, int width, int bucketRange, float qBase); - PerfectHash(Moses::FileHandler* fin) { - UTIL_THROW_IF2(fin == 0, "Invalid file handle"); - } - virtual ~PerfectHash(); - void analyze(); - count_t hpDictMemUse(); - count_t bucketsMemUse(); -protected: - Filter* filter_; - Filter* values_; - hpDict_t dict_; - uint64_t cells_; - count_t hitMask_; - int totBuckets_; - uint8_t bucketRange_; - uint8_t* idxTracker_; - uint64_t insert(const wordID_t* IDs, const int len, const count_t value); - bool update(const wordID_t* IDs, const int len, const count_t value, - hpdEntry_t& hpdAddr, uint64_t& filterIdx); - bool update2(const wordID_t* IDs, const int len, const count_t value, - hpdEntry_t& hpdAddr, uint64_t& filterIdx); - int query(const wordID_t* IDs, const int len, - hpdEntry_t& hpdAddr, uint64_t& filterIdx); - virtual void remove(const wordID_t* IDs, const int len); - void remove(uint64_t index); - void save(Moses::FileHandler* fout); - void load(Moses::FileHandler* fin); - virtual void markQueried(const uint64_t&)=0; - //pointer to a specific entry in a hpDict_t - virtual void markQueried(hpdEntry_t&)=0; -private: - T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket); - std::string hpDictKeyValue(const wordID_t* IDs, const int len); - uint64_t memBound_; // total memory bound in bytes - uint16_t cellWidth_; // in bits - UnivHash_linear* bucketHash_; - UnivHash_linear* fingerHash_; - LogQtizer* qtizer_; -}; - -template -PerfectHash::PerfectHash(uint16_t MBs, int width, int bucketRange, - float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)), - cellWidth_(width) -{ - bucketRange_ = static_cast(bucketRange); - if(bucketRange > 255) { - std::cerr << "ERROR: Max bucket range is > 2^8\n"; - exit(1); - } - qtizer_ = new LogQtizer(qBase); - int valBits = (int)ceil(log2((float)qtizer_->maxcode())); - std::cerr << "BITS FOR VALUES ARRAY = " << valBits << std::endl; - uint64_t totalBits = memBound_ << 3; - cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells - cells_ += (cells_ % bucketRange_); // make cells multiple of bucket range - totBuckets_ = (cells_ / bucketRange_) - 1; // minus 1 so totBuckets * bucksize + bucksize = cells - filter_ = new Filter(cells_, cellWidth_); - values_ = new Filter(cells_, valBits); - idxTracker_ = new uint8_t[totBuckets_]; - for(int i=0; i < totBuckets_; ++i) idxTracker_[i] = 0; - // initialize ranges for each hash function - bucketHash_ = new UnivHash_linear(totBuckets_, 1, PRIME); - fingerHash_ = new UnivHash_linear(pow(2.0f, cellWidth_), MAX_HASH_FUNCS, PRIME); -} - -template -PerfectHash::~PerfectHash() -{ - delete[] idxTracker_; - delete filter_; - filter_ = NULL; - delete fingerHash_; - delete bucketHash_; - delete qtizer_; - delete values_; -} - -template -uint64_t PerfectHash::insert(const wordID_t* IDs, const int len, - const count_t value) -{ - count_t bucket = (bucketHash_->size() > 1 ? 
bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0)); - if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows - // restriction on fprint value is non-zero - T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); - uint64_t emptyidx = cells_ + 1; - uint64_t index = bucket * bucketRange_, // starting bucket row - lastrow = index + bucketRange_; // ending row - while(index < lastrow) { // unique so check each row for "matching" signature - T filterVal = filter_->read(index); - if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row - emptyidx = index; - } else if(filterVal == fp) { - ++collisions_; - dict_[hpDictKeyValue(IDs, len)] = value; // store exact in hpd - return cells_ + 1; // finished - } - ++index; - } - UTIL_THROW_IF2((emptyidx >= index) || (filter_->read(emptyidx) != 0), "Error"); // should have found empty index if it gets here - T code = (T)qtizer_->code(value); - filter_->write(emptyidx, fp); // insert the fprint - values_->write(emptyidx, code); - ++idxTracker_[bucket]; // keep track of bucket size - return emptyidx; - } else { // bucket is full - dict_[hpDictKeyValue(IDs, len)] = value; // add to hpd - return cells_ + 1; - } -} - -template -bool PerfectHash::update(const wordID_t* IDs, const int len, - const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) -{ - // check if key is in high perf. dictionary - filterIdx = cells_ + 1; - std::string skey = hpDictKeyValue(IDs, len); - if((hpdAddr = dict_.find(skey)) != dict_.end()) { - hpdAddr->second = value; - return true; - } - // else hash ngram - //count_t bucket = bucketHash_->hash(IDs, len); - count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0)); - // restriction on fprint value is non-zero - T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); - uint64_t index = bucket * bucketRange_, // starting bucket row - lastrow = index + bucketRange_; - while(index < lastrow) { // must check each row for matching fp event - T filterVal = filter_->read(index); - if(filterVal == fp) { // found event w.h.p. - values_->write(index, (T)qtizer_->code(value)); - filterIdx = index; - return true; - } - ++index; - } - // could add if it gets here. - return false; -} - -template -int PerfectHash::query(const wordID_t* IDs, const int len, - hpdEntry_t& hpdAddr, uint64_t& filterIdx) -{ - // check if key is in high perf. dictionary - std::string skey = hpDictKeyValue(IDs, len); - if((hpdAddr = dict_.find(skey)) != dict_.end()) { - filterIdx = cells_ + 1; - return(hpdAddr->second); // returns copy of value - } else { // check if key is in filter - // get bucket - //count_t bucket = bucketHash_->hash(IDs, len); - count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0)); - // restriction on fprint value is non-zero - T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); - // return value if ngram is in filter - uint64_t index = bucket * bucketRange_, - lastrow = index + bucketRange_; - for(; index < lastrow; ++index) { - if(filter_->read(index) == fp) { - //cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" << - //filter_->read(index) << "\tcode = " << code << std::endl; - filterIdx = index; - hpdAddr = dict_.end(); - return (int)qtizer_->value(values_->read(index)); - } - } - } - return -1; -} - -template -void PerfectHash::remove(const wordID_t* IDs, const int len) -{ - // delete key if in high perf. 
dictionary - std::string skey = hpDictKeyValue(IDs, len); - if(dict_.find(skey) != dict_.end()) - dict_.erase(skey); - else { // check if key is in filter - // get small representation for ngrams - //count_t bucket = bucketHash_->hash(IDs, len); - count_t bucket = (bucketHash_->size() > 1? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0)); - // retrieve non zero fingerprint for ngram - T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); - // return value if ngram is in filter - uint64_t index = bucket * bucketRange_, - lastrow = index + bucketRange_; - for(; index < lastrow; ++index) { - if(filter_->read(index) == fp) { - filter_->write(index, 0); - values_->write(index, 0); - --idxTracker_[bucket]; // track bucket size reduction - break; - } - } - } -} - -template // clear filter index -void PerfectHash::remove(uint64_t index) -{ - UTIL_THROW_IF2(index >= cells_, "Out of bound: " << index); - UTIL_THROW_IF2(filter_->read(index) == 0, "Error"); // slow - filter_->write(index, 0); - values_->write(index, 0); - //reduce bucket size - count_t bucket = index / bucketRange_; - --idxTracker_[bucket]; -} - -template -T PerfectHash::nonZeroSignature(const wordID_t* IDs, const int len, - count_t bucket) -{ - count_t h = bucket; - T fingerprint(0); - do { - fingerprint = fingerHash_->hash(IDs, len, h); - h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around - } while((fingerprint == 0) && (h != bucket)); - if(fingerprint == 0) - std::cerr << "WARNING: Unable to find non-zero signature for ngram\n" << std::endl; - return fingerprint; -} - -template -std::string PerfectHash::hpDictKeyValue(const wordID_t* IDs, const int len) -{ - std::string skey(" "); - for(int i = 0; i < len; ++i) - skey += Utils::IntToStr(IDs[i]) + "¬"; - Utils::trim(skey); - return skey; -} - -template -count_t PerfectHash::hpDictMemUse() -{ - // return hpDict memory usage in MBs - return (count_t) sizeof(hpDict_t::value_type)* dict_.size() >> 20; -} - -template -count_t PerfectHash::bucketsMemUse() -{ - // return bucket memory usage in MBs - return (count_t) (filter_->size() + values_->size()); -} - -template -void PerfectHash::save(Moses::FileHandler* fout) -{ - UTIL_THROW_IF2(fout == 0, "Invalid file handle"); - std::cerr << "\tSaving perfect hash parameters...\n"; - fout->write((char*)&hitMask_, sizeof(hitMask_)); - fout->write((char*)&memBound_, sizeof(memBound_)); - fout->write((char*)&cellWidth_, sizeof(cellWidth_)); - fout->write((char*)&cells_, sizeof(cells_)); - fout->write((char*)&totBuckets_, sizeof(totBuckets_)); - fout->write((char*)&bucketRange_, sizeof(bucketRange_)); - fout->write((char*)idxTracker_, totBuckets_ * sizeof(idxTracker_[0])); - qtizer_->save(fout); - std::cerr << "\tSaving hash functions...\n"; - fingerHash_->save(fout); - bucketHash_->save(fout); - std::cerr << "\tSaving bit filter...\n"; - filter_->save(fout); - values_->save(fout); - std::cerr << "\tSaving high performance dictionary...\n"; - count_t size = dict_.size(); - fout->write((char*)&size, sizeof(count_t)); - *fout << std::endl; - iterate(dict_, t) - *fout << t->first << "\t" << t->second << "\n"; -} - -template -void PerfectHash::load(Moses::FileHandler* fin) -{ - UTIL_THROW_IF2(fin == 0, "Invalid file handle"); - std::cerr << "\tLoading perfect hash parameters...\n"; - fin->read((char*)&hitMask_, sizeof(hitMask_)); - fin->read((char*)&memBound_, sizeof(memBound_)); - fin->read((char*)&cellWidth_, sizeof(cellWidth_)); - fin->read((char*)&cells_, sizeof(cells_)); - fin->read((char*)&totBuckets_, 
sizeof(totBuckets_)); - fin->read((char*)&bucketRange_, sizeof(bucketRange_)); - idxTracker_ = new uint8_t[totBuckets_]; - fin->read((char*)idxTracker_, totBuckets_ * sizeof(idxTracker_[0])); - qtizer_ = new LogQtizer(fin); - std::cerr << "\tLoading hash functions...\n"; - fingerHash_ = new UnivHash_linear(fin); - bucketHash_ = new UnivHash_linear(fin); - std::cerr << "\tLoading bit filter...\n"; - filter_ = new Filter(fin); - values_ = new Filter(fin); - std::cerr << "\tLoading HPD...\n"; - count_t size = 0; - fin->read((char*)&size, sizeof(count_t)); - fin->ignore(256, '\n'); - std::string line; - hpDict_t::key_type key; - hpDict_t::mapped_type val; - for(count_t i=0; i < size; ++i) { - getline(*fin, line); - Utils::trim(line); - std::istringstream ss(line.c_str()); - ss >> key, ss >> val; - dict_[key] = val; - } - std::cerr << "\tHPD size=" << dict_.size() << std::endl; - std::cerr << "Finished loading ORLM." << std::endl; -} - -template -void PerfectHash::analyze() -{ - std::cerr << "Analyzing Dynamic Bloomier Filter...\n"; - // see how many items in each bucket - uint8_t* bucketCnt = new uint8_t[totBuckets_]; - unsigned largestBucket = 0, totalCellsSet = 0, - smallestBucket = bucketRange_, totalZeroes = 0; - int curBucket = -1, fullBuckets(0); - for(int i = 0; i < totBuckets_; ++i) bucketCnt[i] = 0; - for(uint64_t i =0; i < cells_; ++i) { - if(i % bucketRange_ == 0) ++curBucket; - if(filter_->read(i) != 0) { - ++bucketCnt[curBucket]; - ++totalCellsSet; - } else ++totalZeroes; - } - count_t bi = 0, si = 0; - for(int i = 0; i < totBuckets_; ++i) { - if(bucketCnt[i] > largestBucket) { - largestBucket = bucketCnt[i]; - bi = i; - } else if(bucketCnt[i] < smallestBucket) { - smallestBucket = bucketCnt[i]; - si = i; - } - } - count_t trackerCells(0); - for(int i = 0; i < totBuckets_; i++) { - trackerCells += idxTracker_[i]; - if(idxTracker_[i] == bucketRange_) - ++fullBuckets; - } - for(int i = 0; i < totBuckets_; ++i) { - if(bucketCnt[i] != idxTracker_[i]) - std::cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] << - "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << std::endl; - } - std::cerr << "total cells= " << cells_ << std::endl; - std::cerr << "total buckets= " << totBuckets_ << std::endl; - std::cerr << "bucket range= " << (int)bucketRange_ << std::endl; - std::cerr << "fingerprint bits= " << cellWidth_ << std::endl; - std::cerr << "total cells set= " << totalCellsSet; - std::cerr << " (idxTracker set = " << trackerCells << ")" << std::endl; - std::cerr << "total zeroes=" << totalZeroes; - std::cerr << " (idxTracker zeros = " << cells_ - trackerCells << ")" << std::endl; - std::cerr << "largest bucket (" << bi << ") size= " << largestBucket << std::endl; - std::cerr << "smallest bucket (" << si << ") size= " << smallestBucket << std::endl; - std::cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] << - " (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << std::endl; - std::cerr << "total buckets full = " << fullBuckets << std::endl; - std::cerr << "total collision errors= " << collisions_ << std::endl; - std::cerr << "high performance dictionary size= " << dict_.size() << std::endl; - std::cerr << "high performance dictionary MBs= " << hpDictMemUse() << std::endl; - std::cerr << "filter MBs= " << filter_->size() << std::endl; - std::cerr << "values MBs= " << values_->size() << std::endl; - delete[] bucketCnt; -} - -template -bool PerfectHash::update2(const wordID_t* IDs, const int len, - const count_t value, hpdEntry_t& hpdAddr, 
uint64_t& filterIdx) -{ - // check if key is in high perf. dictionary - filterIdx = cells_ + 1; - std::string skey = hpDictKeyValue(IDs, len); - if((hpdAddr = dict_.find(skey)) != dict_.end()) { - hpdAddr->second += value; - return true; - } - // else hash ngram - //count_t bucket = bucketHash_->hash(IDs, len); - count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0)); - // restriction on fprint value is non-zero - T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS)); - uint64_t index = bucket * bucketRange_, // starting bucket row - lastrow = index + bucketRange_; - while(index < lastrow) { // must check each row for matching fp event - T filterVal = filter_->read(index); - if(filterVal == fp) { // found event w.h.p. - int oldval = (int)qtizer_->value(values_->read(index)); - values_->write(index, (T)qtizer_->code(oldval + value)); - filterIdx = index; - return true; - } - ++index; - } - // add if it gets here. - insert(IDs, len, value); - return false; -} - -#endif - diff --git a/moses/TranslationModel/DynSAInclude/quantizer.h b/moses/TranslationModel/DynSAInclude/quantizer.h deleted file mode 100644 index 2e83a33b7..000000000 --- a/moses/TranslationModel/DynSAInclude/quantizer.h +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef ORLM_QUANTIZER_H -#define ORLM_QUANTIZER_H - -#include -#include -#include -#include "types.h" - -static const float kFloatErr = 0.00001f; - -#ifdef WIN32 -#define log2(X) (log((double)X)/log((double)2)) -#endif - -//! @todo ask abby2 -class LogQtizer -{ -public: - LogQtizer(float i): base_(pow(2, 1 / i)) { - UTIL_THROW_IF2(base_ <= 1, "Can't calculate log base less than 1"); - max_code_ = 0; - float value = 1; // code = 1 -> value = 1 for any base - std::vector code_to_value_vec; - while (log2(value) < 30) { // assume 2^30 is largest count - code_to_value_vec.push_back(value); - value = pow(base_, ++max_code_); - } - code_to_value_vec.push_back(value); // store max_code_ so in total [0, max_code_] - // get valid range - max_value_ = code_to_value_vec[max_code_]; - min_value_ = 1; - // store codes in array for lookup - code_to_value_ = new float[max_code_ + 1]; - code_to_log_value_ = new float[max_code_ + 1]; - for (int j = 0; j <= max_code_; ++j) { - // map to integers - code_to_value_[j] = floor(kFloatErr + code_to_value_vec[j]); // - code_to_log_value_[j] = log10(code_to_value_[j]); // log_base 10 to match srilm - } - std::cerr << "Initialized quantization (size = " << max_code_ + 1 << ")" << std::endl; - } - LogQtizer(Moses::FileHandler* fin) { - UTIL_THROW_IF2(fin == NULL, "Null file handle"); - load(fin); - } - int code(float value) { - // should just be: return log_b(value) - UTIL_THROW_IF2(value < min_value_ || value > max_value_, - "Value " << value << " out of bound"); - - // but binary search removes errors due to floor operator above - int code = static_cast(std::lower_bound(code_to_value_, code_to_value_+ max_code_, - value) - code_to_value_); - // make sure not overestimating - code = code_to_value_[code] > value ? 
code - 1 : code; - return code; - } - inline float value(int code) { - // table look up for values - return code_to_value_[code]; - } - inline int maxcode() { - return max_code_; - } - inline float logValue(int code) { - // table look up for log of values - return code_to_log_value_[code]; - } - ~LogQtizer() { - delete[] code_to_value_; - delete[] code_to_log_value_; - } - void save(Moses::FileHandler* fout) { - fout->write((char*)&base_, sizeof(base_)); - fout->write((char*)&max_code_, sizeof(max_code_)); - fout->write((char*)&max_value_, sizeof(max_value_)); - fout->write((char*)&min_value_, sizeof(min_value_)); - for (int j = 0; j <= max_code_; ++j) - fout->write((char*)&code_to_value_[j], sizeof(code_to_value_[j])); - for (int j = 0; j <= max_code_; ++j) - fout->write((char*)&code_to_log_value_[j], sizeof(code_to_log_value_[j])); - std::cerr << "Saved log codebook with " << max_code_ + 1 << " codes." <read((char*)&base_, sizeof(base_)); - fin->read((char*)&max_code_, sizeof(max_code_)); - fin->read((char*)&max_value_, sizeof(max_value_)); - fin->read((char*)&min_value_, sizeof(min_value_)); - code_to_value_ = new float[max_code_ + 1]; - for(int j = 0; j <= max_code_; ++j) - fin->read((char*)&code_to_value_[j], sizeof(code_to_value_[j])); - code_to_log_value_ = new float[max_code_ + 1]; - for(int j = 0; j <= max_code_; ++j) - fin->read((char*)&code_to_log_value_[j], sizeof(code_to_log_value_[j])); - std::cerr << "Loaded log codebook with " << max_code_ + 1 << " codes." << std::endl; - } -}; - -#endif diff --git a/moses/TranslationModel/DynSAInclude/types.h b/moses/TranslationModel/DynSAInclude/types.h deleted file mode 100644 index 2d7c38bdb..000000000 --- a/moses/TranslationModel/DynSAInclude/types.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef moses_DynSAInclude_types_h -#define moses_DynSAInclude_types_h - -#include -#include -#include -#include -#include -#include - -#if defined WIN32 && !defined __MINGW32__ -#define iterate(c, i) for(decltype(c.begin()) i = c.begin(); i != c.end(); ++i) -#define piterate(c, i) for(decltype(c->begin()) i = c->begin(); i != c->end(); ++i) -#define riterate(c, i) for(decltype(c.rbegin()) i = c.rbegin(); i != c.rend(); ++i) -#else -#define iterate(c, i) for(__typeof__(c.begin()) i = c.begin(); i != c.end(); ++i) -#define piterate(c, i) for(__typeof__(c->begin()) i = c->begin(); i != c->end(); ++i) -#define riterate(c, i) for(__typeof__(c.rbegin()) i = c.rbegin(); i != c.rend(); ++i) -#endif - -#define THREADED false -#define THREAD_MAX 2 -#define MAX_NGRAM_ORDER 8 -#define MAX_STR_LEN 300 -#define PRIME 8589935681ULL -#define MAX_HASH_FUNCS 1000 -//#define PRIME 409 - -//typedefs for projects -typedef std::string word_t; // word as string -typedef unsigned int wordID_t; // word mapped to integer -typedef std::string date_t; // a date marker -typedef unsigned int count_t; // for 64-bit to 32-bit compatibility - -#endif diff --git a/moses/TranslationModel/DynSAInclude/utils.h b/moses/TranslationModel/DynSAInclude/utils.h deleted file mode 100644 index 485e4a065..000000000 --- a/moses/TranslationModel/DynSAInclude/utils.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef moses_DynSAInclude_utils_h -#define moses_DynSAInclude_utils_h - -#include -#include -#include -#include -#include -#include -#include - -//! 
@todo ask abby2 -class Utils -{ -public: - static void trim(std::string& str, const std::string dropChars = " \t\n\r") { - str.erase(str.find_last_not_of(dropChars)+1); - str.erase(0, str.find_first_not_of(dropChars)); - } - static void rtrim(std::string& str, const std::string dropChars = " \t\n\r") { - str.erase(str.find_last_not_of(dropChars)+1); - } - static void ltrim(std::string& str, const std::string dropChars = " \t\n\r") { - str.erase(0, str.find_first_not_of(dropChars)); - } - static std::string IntToStr(int integer) { - std::ostringstream stream; - stream << integer; - return stream.str(); - } - static int splitToStr(const char * str, - std::vector & items, - const char * delm = "\t") { - char * buff = const_cast(str); - items.clear(); - char * pch = strtok(buff, delm); - while( pch != NULL ) { - items.push_back(pch); - pch = strtok(NULL, delm); - } - return items.size(); - } - static int splitToStr(std::string buff, - std::vector & items, - std::string delm = "\t") { - std::string cp = buff.substr(); - return splitToStr(cp.c_str(), items, delm.c_str()); - } - static int splitToInt(std::string buff, std::vector& items, - std::string delm = ",") { - items.clear(); - std::vector tmpVector(0); - int i = 0; - i = splitToStr(buff.c_str(), tmpVector, delm.c_str()); - if( i > 0 ) - for( int j = 0; j < i; j++ ) - items.push_back(atoi(tmpVector[j].c_str())); - return i; - } - static void strToLowercase(std::string& str) { - for(unsigned i=0; i < str.length(); i++) { - str[i] = tolower(str[i]); - } - } -}; - -#endif diff --git a/moses/TranslationModel/DynSAInclude/vocab.cpp b/moses/TranslationModel/DynSAInclude/vocab.cpp deleted file mode 100644 index af57232f2..000000000 --- a/moses/TranslationModel/DynSAInclude/vocab.cpp +++ /dev/null @@ -1,158 +0,0 @@ -#include -#include "vocab.h" - -namespace Moses -{ - -// Vocab class -void Vocab::InitSpecialWords() -{ - m_kBOSWord = InitSpecialWord(BOS_); // BOS_ is a string (defined in ../typedef.h) - m_kEOSWord = InitSpecialWord(EOS_); // EOS_ is a string (defined in ../typedef.h) - m_kOOVWord = InitSpecialWord(UNKNOWN_FACTOR); // UNKNOWN_FACTOR also defined in ../typedef.h -} - -const Word Vocab::InitSpecialWord( const std::string& word_str) -{ - FactorList factors; - factors.push_back(0); // store the special word string as the first factor - Word word; - // define special word as Input word with one factor and isNonTerminal=false - word.CreateFromString( Input, factors, word_str, false ); // Input is enum defined in ../typedef.h - // TODO not sure if this will work properly: - // - word comparison can fail because the last parameter (isNonTerminal) - // in function CreateFromString may not match properly created words - // - special word is Input word but what about Output words? - // - currently Input/Output variable is not stored in class Word, but in the future??? 
- return word; -} -wordID_t Vocab::GetWordID(const std::string& word_str) -{ - FactorList factors; - factors.push_back(0); - Word word; - word.CreateFromString(Input, factors, word_str, false); - return GetWordID(word); -} - -// get wordID_t index for word represented as string -wordID_t Vocab::GetWordID(const std::string& word_str, - const FactorDirection& direction, const FactorList& factors, bool isNonTerminal) -{ - // get id for factored string - Word word; - word.CreateFromString( direction, factors, word_str, isNonTerminal); - return GetWordID( word); -} - -wordID_t Vocab::GetWordID(const Word& word) -{ - // get id and possibly add to vocab - if(m_words2ids.find(word) == m_words2ids.end()) { - if (!m_closed) { - wordID_t id = m_words2ids.size() + 1; - m_ids2words[id] = word; - // update lookup tables - m_words2ids[word] = id; - } else { - return m_kOOVWordID; - } - } - wordID_t id = m_words2ids[word]; - return id; -} - -Word& Vocab::GetWord(wordID_t id) -{ - // get word string given id - return (m_ids2words.find(id) == m_ids2words.end()) ? m_kOOVWord : m_ids2words[id]; -} - -bool Vocab::InVocab(wordID_t id) -{ - return m_ids2words.find(id) != m_ids2words.end(); -} - -bool Vocab::InVocab(const Word& word) -{ - return m_words2ids.find(word) != m_words2ids.end(); -} - -bool Vocab::Save(const std::string & vocab_path) -{ - // save vocab as id -> word - FileHandler vcbout(vocab_path, std::ios::out); - return Save(&vcbout); -} - -bool Vocab::Save(FileHandler* vcbout) -{ - // then each vcb entry - *vcbout << m_ids2words.size() << "\n"; - for (Id2Word::const_iterator iter = m_ids2words.begin(); - iter != m_ids2words.end(); ++iter) { - *vcbout << iter->second << "\t" << iter->first << "\n"; - } - return true; -} - -bool Vocab::Load(const std::string & vocab_path, const FactorDirection& direction, - const FactorList& factors, bool closed) -{ - FileHandler vcbin(vocab_path, std::ios::in); - std::cerr << "Loading vocab from " << vocab_path << std::endl; - return Load(&vcbin, direction, factors, closed); -} -bool Vocab::Load(FileHandler* vcbin) -{ - FactorList factors; - factors.push_back(0); - return Load(vcbin, Input, factors); -} -bool Vocab::Load(FileHandler* vcbin, const FactorDirection& direction, - const FactorList& factors, bool closed) -{ - // load vocab id -> word mapping - m_words2ids.clear(); // reset mapping - m_ids2words.clear(); - std::string line, word_str; - wordID_t id; - - std::istream &ret = getline(*vcbin, line); - UTIL_THROW_IF2(!ret, "Couldn't read file"); - std::istringstream first(line.c_str()); - uint32_t vcbsize(0); - first >> vcbsize; - uint32_t loadedsize = 0; - while (loadedsize++ < vcbsize && getline(*vcbin, line)) { - std::istringstream entry(line.c_str()); - entry >> word_str; - Word word; - word.CreateFromString( direction, factors, word_str, false); // TODO set correctly isNonTerminal - entry >> id; - // may be no id (i.e. file may just be a word list) - if (id == 0 && word != GetkOOVWord()) - id = m_ids2words.size() + 1; // assign ids sequentially starting from 1 - UTIL_THROW_IF2(m_ids2words.count(id) != 0 || m_words2ids.count(word) != 0, - "Error"); - - m_ids2words[id] = word; - m_words2ids[word] = id; - } - m_closed = closed; // once loaded fix vocab ? - std::cerr << "Loaded vocab with " << m_ids2words.size() << " words." 
<< std::endl; - return true; -} -void Vocab::PrintVocab() -{ - for (Id2Word::const_iterator iter = m_ids2words.begin(); - iter != m_ids2words.end(); ++iter ) { - std::cerr << iter->second << "\t" << iter->first << "\n"; - } - for (Word2Id::const_iterator iter = m_words2ids.begin(); - iter != m_words2ids.end(); ++iter ) { - std::cerr << iter->second << "\t" << iter->first << "\n"; - } -} - -} //end namespace diff --git a/moses/TranslationModel/DynSAInclude/vocab.h b/moses/TranslationModel/DynSAInclude/vocab.h deleted file mode 100644 index cf81bf3a9..000000000 --- a/moses/TranslationModel/DynSAInclude/vocab.h +++ /dev/null @@ -1,127 +0,0 @@ -#ifndef moses_DynSAInclude_vocab_h -#define moses_DynSAInclude_vocab_h - -#include -#include -#include "types.h" -#include "FileHandler.h" -#include "utils.h" -#include "moses/TypeDef.h" -#include "moses/Word.h" - -namespace Moses -{ - -//! Vocab maps between strings and uint32 ids. -class Vocab -{ -public: - - typedef std::map Word2Id; - typedef std::map Id2Word; - - Vocab(bool sntMarkers = true): - m_closed(false), - m_kOOVWordID(0), - m_kBOSWordID(1) { - InitSpecialWords(); - if(sntMarkers) { - GetWordID(m_kBOSWord); // added in case not observed in corpus - GetWordID(m_kEOSWord); - } - } - // if no file then must allow new words - // specify whether more words can be added via 'closed' - // assume that if a vocab is loaded from file then it should be closed. - Vocab(const std::string & vocab_path, const FactorDirection& direction, - const FactorList& factors, bool closed = true): - m_kOOVWordID(0), - m_kBOSWordID(1) { - InitSpecialWords(); - bool ret = Load(vocab_path, direction, factors, closed); - UTIL_THROW_IF2(!ret, "Unable to load vocab file: " << vocab_path); - } - Vocab(FileHandler * fin, const FactorDirection& direction, - const FactorList& factors, bool closed = true): - m_kOOVWordID(0), - m_kBOSWordID(1) { - InitSpecialWords(); - bool ret = Load(fin, direction, factors, closed); - UTIL_THROW_IF2(!ret, "Unable to load vocab file"); - } - Vocab(FileHandler *fin): - m_kOOVWordID(0), - m_kBOSWordID(1) { - Load(fin); - } - ~Vocab() {} - // parse 'word' into factored Word and get id - wordID_t GetWordID(const std::string& word, const FactorDirection& direction, - const FactorList& factors, bool isNonTerminal); - wordID_t GetWordID(const Word& word); - wordID_t GetWordID(const std::string& word); - Word& GetWord(wordID_t id); - inline wordID_t GetkOOVWordID() { - return m_kOOVWordID; - } - inline wordID_t GetBOSWordID() { - return m_kBOSWordID; - } - inline const Word& GetkOOVWord() { - return m_kOOVWord; - } - inline const Word& GetkBOSWord() { - return m_kBOSWord; - } - inline const Word& GetkEOSWord() { - return m_kEOSWord; - } - - bool InVocab(wordID_t id); - bool InVocab(const Word& word); - uint32_t Size() { - return m_words2ids.size(); - } - void MakeClosed() { - m_closed = true; - } - void MakeOpen() { - m_closed = false; - } - bool IsClosed() { - return m_closed; - } - bool Save(const std::string & vocab_path); - bool Save(FileHandler* fout); - bool Load(const std::string & vocab_path, const FactorDirection& direction, - const FactorList& factors, bool closed = true); - bool Load(FileHandler* fin, const FactorDirection& direction, - const FactorList& factors, bool closed = true); - bool Load(FileHandler* fin); - void PrintVocab(); - Word2Id::const_iterator VocabStart() { - return m_words2ids.begin(); - } - Word2Id::const_iterator VocabEnd() { - return m_words2ids.end(); - } - -protected: - bool m_closed; // can more words be added - - 
const wordID_t m_kOOVWordID; // out of vocabulary word id - const wordID_t m_kBOSWordID; - Word m_kBOSWord; // beginning of sentence marker - Word m_kEOSWord; // end of sentence marker - Word m_kOOVWord; // - - const Word InitSpecialWord( const std::string& type); // initialize special word like kBOS, kEOS - void InitSpecialWords(); - - Word2Id m_words2ids; // map from words to word ids - Id2Word m_ids2words; // map from ids to words -}; - -} - -#endif From d583e0888f385358c346afa39faf85f4d3ad1ba7 Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Tue, 18 Aug 2015 18:31:41 +0100 Subject: [PATCH 268/286] make-factor-de-pos.perl --- scripts/training/wrappers/make-factor-de-pos.perl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/wrappers/make-factor-de-pos.perl b/scripts/training/wrappers/make-factor-de-pos.perl index 585323bd4..0c9f58a59 100755 --- a/scripts/training/wrappers/make-factor-de-pos.perl +++ b/scripts/training/wrappers/make-factor-de-pos.perl @@ -1,4 +1,4 @@ -#!/usr/bin/env perl +#!/usr/bin/perl -w # # This file is part of moses. Its use is licensed under the GNU Lesser General # Public License version 2.1 or, at your option, any later version. From 97eed023013d009ff01e54bc2ff6fbce177a32ed Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 19 Aug 2015 13:25:29 +0100 Subject: [PATCH 269/286] dos2unix. Revert Matthias' change, use env --- .../training/wrappers/make-factor-de-pos.perl | 72 +++++++++---------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/scripts/training/wrappers/make-factor-de-pos.perl b/scripts/training/wrappers/make-factor-de-pos.perl index 0c9f58a59..bb9f19518 100755 --- a/scripts/training/wrappers/make-factor-de-pos.perl +++ b/scripts/training/wrappers/make-factor-de-pos.perl @@ -1,36 +1,36 @@ -#!/usr/bin/perl -w -# -# This file is part of moses. Its use is licensed under the GNU Lesser General -# Public License version 2.1 or, at your option, any later version. - -use warnings; -use strict; - -my ($in,$out,$tmpdir) = @ARGV; -use Encode; -use FindBin qw($RealBin); - -`mkdir -p $tmpdir`; -`$RealBin/../../tokenizer/deescape-special-chars.perl < $in | /home/pkoehn/statmt/bin/unicode2latin1.perl > $tmpdir/tok.$$`; -`/home/pkoehn/statmt/bin/run-lopar-tagger.perl $tmpdir/tok.$$ $tmpdir/lopar.$$`; - -open(LOPAR,"$tmpdir/lopar.$$"); -open(OUT,"|$RealBin/../../tokenizer/escape-special-chars.perl > $out"); -while() { - chomp; - s/ +/ /g; - s/^ //; - s/ $//; - my $first = 1; - foreach (split) { - die("ERROR: choked on token '$_'") unless /^(.+)_([^_]+)_(.+)$/; - my ($word,$pos,$lemma) = ($1,$2,$3); - $pos =~ s/\..+//; - print OUT " " unless $first; - $first = 0; - print OUT encode('utf8', decode('iso-8859-1', $pos)); - } - print OUT "\n"; -} -close(LOPAR); -close(OUT); +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
+ +use warnings; +use strict; + +my ($in,$out,$tmpdir) = @ARGV; +use Encode; +use FindBin qw($RealBin); + +`mkdir -p $tmpdir`; +`$RealBin/../../tokenizer/deescape-special-chars.perl < $in | /home/pkoehn/statmt/bin/unicode2latin1.perl > $tmpdir/tok.$$`; +`/home/pkoehn/statmt/bin/run-lopar-tagger.perl $tmpdir/tok.$$ $tmpdir/lopar.$$`; + +open(LOPAR,"$tmpdir/lopar.$$"); +open(OUT,"|$RealBin/../../tokenizer/escape-special-chars.perl > $out"); +while() { + chomp; + s/ +/ /g; + s/^ //; + s/ $//; + my $first = 1; + foreach (split) { + die("ERROR: choked on token '$_'") unless /^(.+)_([^_]+)_(.+)$/; + my ($word,$pos,$lemma) = ($1,$2,$3); + $pos =~ s/\..+//; + print OUT " " unless $first; + $first = 0; + print OUT encode('utf8', decode('iso-8859-1', $pos)); + } + print OUT "\n"; +} +close(LOPAR); +close(OUT); From 4ff776f564296508f3fe402c0089fd81216f8ea9 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 19 Aug 2015 16:30:31 +0400 Subject: [PATCH 270/286] dos2unix the whole lot --- .../wrappers/make-factor-de-lemma.perl | 72 +++++++++---------- .../wrappers/make-factor-en-porter.perl | 26 +++---- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/scripts/training/wrappers/make-factor-de-lemma.perl b/scripts/training/wrappers/make-factor-de-lemma.perl index 0b93002a9..f45733992 100755 --- a/scripts/training/wrappers/make-factor-de-lemma.perl +++ b/scripts/training/wrappers/make-factor-de-lemma.perl @@ -1,36 +1,36 @@ -#!/usr/bin/perl -w -# -# This file is part of moses. Its use is licensed under the GNU Lesser General -# Public License version 2.1 or, at your option, any later version. - -use strict; -use Encode; -use FindBin qw($RealBin); - -my ($in,$out,$tmpdir) = @ARGV; - -`mkdir -p $tmpdir`; -`$RealBin/../../tokenizer/deescape-special-chars.perl < $in | /home/pkoehn/statmt/bin/unicode2latin1.perl > $tmpdir/tok.$$`; -`/home/pkoehn/statmt/bin/run-lopar-tagger.perl $tmpdir/tok.$$ $tmpdir/lopar.$$`; - -open(LOPAR,"$tmpdir/lopar.$$"); -open(OUT,"|$RealBin/../../tokenizer/escape-special-chars.perl > $out"); -while() { - chomp; - s/ +/ /g; - s/^ //; - s/ $//; - my $first = 1; - foreach (split) { - die("ERROR: choked on token '$_'") unless /^(.+)_([^_]+)_(.+)$/; - my ($word,$pos,$lemma) = ($1,$2,$3); - print OUT " " unless $first; - $first = 0; - $lemma =~ s/\|.+$//; - $lemma = $word if $lemma =~ /^\<.+\>$/; - print OUT encode('utf8', decode('iso-8859-1', $lemma)); - } - print OUT "\n"; -} -close(LOPAR); -close(OUT); +#!/usr/bin/perl -w +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
+ +use strict; +use Encode; +use FindBin qw($RealBin); + +my ($in,$out,$tmpdir) = @ARGV; + +`mkdir -p $tmpdir`; +`$RealBin/../../tokenizer/deescape-special-chars.perl < $in | /home/pkoehn/statmt/bin/unicode2latin1.perl > $tmpdir/tok.$$`; +`/home/pkoehn/statmt/bin/run-lopar-tagger.perl $tmpdir/tok.$$ $tmpdir/lopar.$$`; + +open(LOPAR,"$tmpdir/lopar.$$"); +open(OUT,"|$RealBin/../../tokenizer/escape-special-chars.perl > $out"); +while() { + chomp; + s/ +/ /g; + s/^ //; + s/ $//; + my $first = 1; + foreach (split) { + die("ERROR: choked on token '$_'") unless /^(.+)_([^_]+)_(.+)$/; + my ($word,$pos,$lemma) = ($1,$2,$3); + print OUT " " unless $first; + $first = 0; + $lemma =~ s/\|.+$//; + $lemma = $word if $lemma =~ /^\<.+\>$/; + print OUT encode('utf8', decode('iso-8859-1', $lemma)); + } + print OUT "\n"; +} +close(LOPAR); +close(OUT); diff --git a/scripts/training/wrappers/make-factor-en-porter.perl b/scripts/training/wrappers/make-factor-en-porter.perl index 7ae5fd0b3..78db1685c 100755 --- a/scripts/training/wrappers/make-factor-en-porter.perl +++ b/scripts/training/wrappers/make-factor-en-porter.perl @@ -1,13 +1,13 @@ -#!/usr/bin/perl -w -# -# This file is part of moses. Its use is licensed under the GNU Lesser General -# Public License version 2.1 or, at your option, any later version. - -use strict; -use FindBin qw($RealBin); - -my ($in,$out,$tmpdir) = @ARGV; - -my $porter_in = "$tmpdir/porter-in.$$"; -`$RealBin/../../tokenizer/deescape-special-chars.perl < $in > $porter_in`; -`/home/pkoehn/statmt/bin/porter-stemmer $porter_in | $RealBin/../../tokenizer/escape-special-chars.perl > $out`; +#!/usr/bin/perl -w +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +use strict; +use FindBin qw($RealBin); + +my ($in,$out,$tmpdir) = @ARGV; + +my $porter_in = "$tmpdir/porter-in.$$"; +`$RealBin/../../tokenizer/deescape-special-chars.perl < $in > $porter_in`; +`/home/pkoehn/statmt/bin/porter-stemmer $porter_in | $RealBin/../../tokenizer/escape-special-chars.perl > $out`; From 261cfdb0244fc0375500d1b7bc7927bc1d99d173 Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Wed, 19 Aug 2015 16:59:10 +0100 Subject: [PATCH 271/286] perl shebang --- scripts/training/wrappers/make-factor-de-lemma.perl | 3 ++- scripts/training/wrappers/make-factor-en-porter.perl | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/training/wrappers/make-factor-de-lemma.perl b/scripts/training/wrappers/make-factor-de-lemma.perl index f45733992..71ca18462 100755 --- a/scripts/training/wrappers/make-factor-de-lemma.perl +++ b/scripts/training/wrappers/make-factor-de-lemma.perl @@ -1,8 +1,9 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # # This file is part of moses. Its use is licensed under the GNU Lesser General # Public License version 2.1 or, at your option, any later version. +use warnings; use strict; use Encode; use FindBin qw($RealBin); diff --git a/scripts/training/wrappers/make-factor-en-porter.perl b/scripts/training/wrappers/make-factor-en-porter.perl index 78db1685c..95e56e26e 100755 --- a/scripts/training/wrappers/make-factor-en-porter.perl +++ b/scripts/training/wrappers/make-factor-en-porter.perl @@ -1,8 +1,9 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # # This file is part of moses. Its use is licensed under the GNU Lesser General # Public License version 2.1 or, at your option, any later version. 
+use warnings; use strict; use FindBin qw($RealBin); From 5d0c6925144d9bbd5c3506ff641bc984e8090a59 Mon Sep 17 00:00:00 2001 From: Rico Sennrich Date: Fri, 21 Aug 2015 16:08:49 +0100 Subject: [PATCH 272/286] speed-up RDLM --- moses/LM/RDLM.cpp | 103 +++++++++++++++++++++++++++++++++++++++------- moses/LM/RDLM.h | 20 +++++++++ 2 files changed, 109 insertions(+), 14 deletions(-) diff --git a/moses/LM/RDLM.cpp b/moses/LM/RDLM.cpp index 33bdc9c55..b4dc5c80f 100644 --- a/moses/LM/RDLM.cpp +++ b/moses/LM/RDLM.cpp @@ -241,9 +241,9 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost // root of tree: score without context if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) { std::vector ngram_head_null (static_head_null); - ngram_head_null.back() = lm_head->lookup_output_word(root->GetChildren()[0]->GetLabel().GetString(m_factorType).as_string()); + ngram_head_null.back() = Factor2ID(root->GetChildren()[0]->GetLabel()[m_factorType], HEAD_OUTPUT); if (m_isPretermBackoff && ngram_head_null.back() == 0) { - ngram_head_null.back() = lm_head->lookup_output_word(root->GetLabel().GetString(m_factorType).as_string()); + ngram_head_null.back() = Factor2ID(root->GetLabel()[m_factorType], HEAD_OUTPUT); } if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head) { std::vector::iterator it = ngram_head_null.begin(); @@ -296,7 +296,7 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost } size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size()); - const std::string & head_label = root->GetLabel().GetString(0).as_string(); + const StringPiece & head_label = root->GetLabel().GetString(0); bool virtual_head = false; int reached_end = 0; int label_idx, label_idx_out; @@ -308,13 +308,13 @@ void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost reached_end = 2; // indicate that we've seen the last symbol of the RHS } // with 'full' binarization, direction is encoded in 2nd char - std::string clipped_label = (m_binarized == 3) ? head_label.substr(2,head_label.size()-2) : head_label.substr(1,head_label.size()-1); - label_idx = lm_label->lookup_input_word(clipped_label); - label_idx_out = lm_label->lookup_output_word(clipped_label); + StringPiece clipped_label = (m_binarized == 3) ? head_label.substr(2,head_label.size()-2) : head_label.substr(1,head_label.size()-1); + label_idx = lm_label->lookup_input_word(clipped_label.as_string()); + label_idx_out = lm_label->lookup_output_word(clipped_label.as_string()); } else { reached_end = 3; // indicate that we've seen first and last symbol of the RHS - label_idx = lm_label->lookup_input_word(head_label); - label_idx_out = lm_label->lookup_output_word(head_label); + label_idx = Factor2ID(root->GetLabel()[0], LABEL_INPUT); + label_idx_out = Factor2ID(root->GetLabel()[0], LABEL_OUTPUT); } int head_idx = (virtual_head && head_ids.first == static_dummy_head) ? 
static_label_null[offset_up_head+m_context_up-1] : head_ids.first; @@ -597,8 +597,8 @@ void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & bac child_ids = std::make_pair(static_dummy_head, static_dummy_head); } - labels[j] = lm_head->lookup_input_word(child->GetLabel().GetString(0).as_string()); - labels_output[j] = lm_label->lookup_output_word(child->GetLabel().GetString(0).as_string()); + labels[j] = Factor2ID(child->GetLabel()[0], LABEL_INPUT); + labels_output[j] = Factor2ID(child->GetLabel()[0], LABEL_OUTPUT); heads[j] = child_ids.first; heads_output[j] = child_ids.second; j++; @@ -615,20 +615,77 @@ void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & bac void RDLM::GetIDs(const Word & head, const Word & preterminal, std::pair & IDs) const { - IDs.first = lm_head_base_instance_->lookup_input_word(head.GetString(m_factorType).as_string()); + IDs.first = Factor2ID(head[m_factorType], HEAD_INPUT); if (m_isPretermBackoff && IDs.first == 0) { - IDs.first = lm_head_base_instance_->lookup_input_word(preterminal.GetString(0).as_string()); + IDs.first = Factor2ID(preterminal[0], HEAD_INPUT); } if (m_sharedVocab) { IDs.second = IDs.first; } else { - IDs.second = lm_head_base_instance_->lookup_output_word(head.GetString(m_factorType).as_string()); + IDs.second = Factor2ID(head[m_factorType], HEAD_OUTPUT); if (m_isPretermBackoff && IDs.second == 0) { - IDs.second = lm_head_base_instance_->lookup_output_word(preterminal.GetString(0).as_string()); + IDs.second = Factor2ID(preterminal[0], HEAD_OUTPUT); } } } +// map from moses factor to NPLM ID; use vectors as cache to avoid hash table lookups +int RDLM::Factor2ID(const Factor * const factor, int model_type) const +{ + size_t ID = factor->GetId(); + int ret; + + std::vector* cache = NULL; + switch(model_type) { + case LABEL_INPUT: + cache = &factor2id_label_input; + break; + case LABEL_OUTPUT: + cache = &factor2id_label_output; + break; + case HEAD_INPUT: + cache = &factor2id_head_input; + break; + case HEAD_OUTPUT: + cache = &factor2id_head_output; + break; + } + + try { + ret = cache->at(ID); + } catch (const std::out_of_range& oor) { +#ifdef WITH_THREADS //need to resize cache; write lock + m_accessLock.unlock_shared(); + m_accessLock.lock(); +#endif + size_t old_size = cache->size(); + cache->resize(ID*2, -1); +#ifdef WITH_THREADS //go back to read lock + m_accessLock.unlock(); + m_accessLock.lock_shared(); +#endif + ret = -1; + } + if (ret == -1) { + switch(model_type) { + case LABEL_INPUT: + ret = lm_label_base_instance_->lookup_input_word(factor->GetString().as_string()); + break; + case LABEL_OUTPUT: + ret = lm_label_base_instance_->lookup_output_word(factor->GetString().as_string()); + break; + case HEAD_INPUT: + ret = lm_head_base_instance_->lookup_input_word(factor->GetString().as_string()); + break; + case HEAD_OUTPUT: + ret = lm_head_base_instance_->lookup_output_word(factor->GetString().as_string()); + break; + } + (*cache)[ID] = ret; + } + + return ret; +} void RDLM::PrintInfo(std::vector &ngram, nplm::neuralTM* lm) const { @@ -767,13 +824,31 @@ FFState* RDLM::EvaluateWhenApplied(const ChartHypothesis& cur_hypo //hash of all boundary symbols (symbols with incomplete context); trees with same hash share state for cube pruning. 
size_t boundary_hash = 0; if (!m_rerank) { + { //lock scope +#ifdef WITH_THREADS + //read-lock for cache; cache resizes are so rare that we want to minimize number of calls, not scope + m_accessLock.lock_shared(); +#endif Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +#ifdef WITH_THREADS + m_accessLock.unlock_shared(); +#endif + } accumulator->PlusEquals(ff_idx, score[0] + score[1]); accumulator->PlusEquals(ff_idx+1, score[2] + score[3]); } mytree->Combine(previous_trees); if (m_rerank && full_sentence) { + { //lock scope +#ifdef WITH_THREADS + //read-lock for cache; cache resizes are so rare that we want to minimize number of calls, not scope + m_accessLock.lock_shared(); +#endif Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash); +#ifdef WITH_THREADS + m_accessLock.unlock_shared(); +#endif + } accumulator->PlusEquals(ff_idx, score[0] + score[1]); accumulator->PlusEquals(ff_idx+1, score[2] + score[3]); } diff --git a/moses/LM/RDLM.h b/moses/LM/RDLM.h index 3d8c62f7e..e5d4b9490 100644 --- a/moses/LM/RDLM.h +++ b/moses/LM/RDLM.h @@ -8,6 +8,11 @@ #include #include +#ifdef WITH_THREADS +#include +#endif + + // relational dependency language model, described in: // Sennrich, Rico (2015). Modelling and Optimizing on Syntactic N-Grams for Statistical Machine Translation. Transactions of the Association for Computational Linguistics. // see 'scripts/training/rdlm' for training scripts @@ -106,6 +111,20 @@ class RDLM : public StatefulFeatureFunction FactorType m_factorType; + static const int LABEL_INPUT = 0; + static const int LABEL_OUTPUT = 1; + static const int HEAD_INPUT = 2; + static const int HEAD_OUTPUT = 3; + mutable std::vector factor2id_label_input; + mutable std::vector factor2id_label_output; + mutable std::vector factor2id_head_input; + mutable std::vector factor2id_head_output; + +#ifdef WITH_THREADS + //reader-writer lock + mutable boost::shared_mutex m_accessLock; +#endif + public: RDLM(const std::string &line) : StatefulFeatureFunction(2, line) @@ -142,6 +161,7 @@ public: bool GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair & IDs) const; void GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, const nplm::neuralTM *lm_head, const nplm::neuralTM *lm_labels, std::vector & heads, std::vector & labels, std::vector & heads_output, std::vector & labels_output) const; void GetIDs(const Word & head, const Word & preterminal, std::pair & IDs) const; + int Factor2ID(const Factor * const factor, int model_type) const; void ScoreFile(std::string &path); //for debugging void PrintInfo(std::vector &ngram, nplm::neuralTM* lm) const; //for debugging From d5c41634e888861334b58414dc7ebae6eadbcdc7 Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Fri, 21 Aug 2015 18:58:36 +0100 Subject: [PATCH 273/286] EMS: fix filtering issue when output-splitter is defined --- scripts/ems/experiment.meta | 45 +++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index b0c8c9a73..1e1e5af5d 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -940,21 +940,6 @@ parse-input-devtest pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval ignore-unless: use-mira template: $input-parser < IN > OUT -parse-relax-input - in: split-input - out: input - default-name: tuning/input.parse-relaxed - pass-unless: input-parse-relaxer - 
From d5c41634e888861334b58414dc7ebae6eadbcdc7 Mon Sep 17 00:00:00 2001
From: Matthias Huck
Date: Fri, 21 Aug 2015 18:58:36 +0100
Subject: [PATCH 273/286] EMS: fix filtering issue when output-splitter is defined

---
 scripts/ems/experiment.meta | 45 +++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index b0c8c9a73..1e1e5af5d 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -940,21 +940,6 @@ parse-input-devtest
   pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
   ignore-unless: use-mira
   template: $input-parser < IN > OUT
-parse-relax-input
-  in: split-input
-  out: input
-  default-name: tuning/input.parse-relaxed
-  pass-unless: input-parse-relaxer
-  pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
-  template: $input-parse-relaxer < IN > OUT
-parse-relax-input-devtest
-  in: split-input-devtest
-  out: input-devtest
-  default-name: tuning/input.devtest.parse-relaxed
-  pass-unless: input-parse-relaxer
-  pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
-  ignore-unless: use-mira
-  template: $input-parse-relaxer < IN > OUT
 factorize-input
   in: parsed-input
   out: factorized-input
@@ -1016,20 +1001,35 @@ truecase-input-devtest
   ignore-unless: AND input-truecaser use-mira
   template: $input-truecaser -model IN1.$input-extension < IN > OUT
 split-input
-  in: truecased-input SPLITTER:splitter-model
+  in: truecased-input
   out: split-input
-  rerun-on-change: input-splitter
+  rerun-on-change: input-splitter SPLITTER:splitter-model
   default-name: tuning/input.split
   pass-unless: input-splitter
-  template: $input-splitter -model IN1.$input-extension < IN > OUT
+  template: $input-splitter -model SPLITTER:splitter-model.$input-extension < IN > OUT
 split-input-devtest
-  in: truecased-input-devtest SPLITTER:splitter-model
+  in: truecased-input-devtest
   out: split-input-devtest
   rerun-on-change: input-splitter
   default-name: tuning/input.devtest.split
   pass-unless: input-splitter
   ignore-unless: use-mira
-  template: $input-splitter -model IN1.$input-extension < IN > OUT
+  template: $input-splitter -model SPLITTER:splitter-model.$input-extension < IN > OUT
+parse-relax-input
+  in: split-input
+  out: input
+  default-name: tuning/input.parse-relaxed
+  pass-unless: input-parse-relaxer
+  pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
+  template: $input-parse-relaxer < IN > OUT
+parse-relax-input-devtest
+  in: split-input-devtest
+  out: input-devtest
+  default-name: tuning/input.devtest.parse-relaxed
+  pass-unless: input-parse-relaxer
+  pass-if: skip-parse-input-devtesteval mock-input-parser-devtesteval
+  ignore-unless: use-mira
+  template: $input-parse-relaxer < IN > OUT
 reference-from-sgm
   in: reference-sgm input-sgm
   out: raw-reference
@@ -1269,11 +1269,12 @@ truecase-input
   ignore-unless: input-truecaser
   template: $input-truecaser -model IN1.$input-extension < IN > OUT
 split-input
-  in: truecased-input SPLITTER:splitter-model
+  in: truecased-input
   out: split-input
+  rerun-on-change: input-splitter SPLITTER:splitter-model
   default-name: evaluation/input.split
   pass-unless: input-splitter
-  template: $input-splitter -model IN1.$input-extension < IN > OUT
+  template: $input-splitter -model SPLITTER:splitter-model.$input-extension < IN > OUT
 filter
   in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table
   out: filtered-dir

From 37d9c18d3bb36ec4f70a612161db87749e92de07 Mon Sep 17 00:00:00 2001
From: MosesAdmin
Date: Sat, 22 Aug 2015 00:00:46 +0100
Subject: [PATCH 274/286] daily automatic beautifier

---
 moses/LM/RDLM.cpp | 70 ++++++++++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/moses/LM/RDLM.cpp b/moses/LM/RDLM.cpp
index b4dc5c80f..6df666926 100644
--- a/moses/LM/RDLM.cpp
+++ b/moses/LM/RDLM.cpp
@@ -637,18 +637,18 @@ int RDLM::Factor2ID(const Factor * const factor, int model_type) const
   std::vector* cache = NULL;
   switch(model_type) {
-    case LABEL_INPUT:
-      cache = &factor2id_label_input;
-      break;
-    case LABEL_OUTPUT:
-      cache = &factor2id_label_output;
-      break;
-    case HEAD_INPUT:
-      cache = &factor2id_head_input;
-      break;
-    case HEAD_OUTPUT:
-      cache = &factor2id_head_output;
-      break;
+  case LABEL_INPUT:
+    cache = &factor2id_label_input;
+    break;
+  case LABEL_OUTPUT:
+    cache = &factor2id_label_output;
+    break;
+  case HEAD_INPUT:
+    cache = &factor2id_head_input;
+    break;
+  case HEAD_OUTPUT:
+    cache = &factor2id_head_output;
+    break;
   }

   try {
@@ -668,18 +668,18 @@ int RDLM::Factor2ID(const Factor * const factor, int model_type) const
   }
   if (ret == -1) {
     switch(model_type) {
-      case LABEL_INPUT:
-        ret = lm_label_base_instance_->lookup_input_word(factor->GetString().as_string());
-        break;
-      case LABEL_OUTPUT:
-        ret = lm_label_base_instance_->lookup_output_word(factor->GetString().as_string());
-        break;
-      case HEAD_INPUT:
-        ret = lm_head_base_instance_->lookup_input_word(factor->GetString().as_string());
-        break;
-      case HEAD_OUTPUT:
-        ret = lm_head_base_instance_->lookup_output_word(factor->GetString().as_string());
-        break;
+    case LABEL_INPUT:
+      ret = lm_label_base_instance_->lookup_input_word(factor->GetString().as_string());
+      break;
+    case LABEL_OUTPUT:
+      ret = lm_label_base_instance_->lookup_output_word(factor->GetString().as_string());
+      break;
+    case HEAD_INPUT:
+      ret = lm_head_base_instance_->lookup_input_word(factor->GetString().as_string());
+      break;
+    case HEAD_OUTPUT:
+      ret = lm_head_base_instance_->lookup_output_word(factor->GetString().as_string());
+      break;
     }
     (*cache)[ID] = ret;
   }
@@ -824,14 +824,15 @@ FFState* RDLM::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
   //hash of all boundary symbols (symbols with incomplete context); trees with same hash share state for cube pruning.
   size_t boundary_hash = 0;
   if (!m_rerank) {
-    { //lock scope
+    {
+      //lock scope
 #ifdef WITH_THREADS
-    //read-lock for cache; cache resizes are so rare that we want to minimize number of calls, not scope
-    m_accessLock.lock_shared();
+      //read-lock for cache; cache resizes are so rare that we want to minimize number of calls, not scope
+      m_accessLock.lock_shared();
 #endif
-    Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
+      Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
 #ifdef WITH_THREADS
-    m_accessLock.unlock_shared();
+      m_accessLock.unlock_shared();
 #endif
     }
     accumulator->PlusEquals(ff_idx, score[0] + score[1]);
@@ -839,14 +840,15 @@ FFState* RDLM::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
   }
   mytree->Combine(previous_trees);
   if (m_rerank && full_sentence) {
-    { //lock scope
+    {
+      //lock scope
 #ifdef WITH_THREADS
-    //read-lock for cache; cache resizes are so rare that we want to minimize number of calls, not scope
-    m_accessLock.lock_shared();
+      //read-lock for cache; cache resizes are so rare that we want to minimize number of calls, not scope
+      m_accessLock.lock_shared();
 #endif
-    Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
+      Score(mytree.get(), back_pointers, score, ancestor_heads, ancestor_labels, boundary_hash);
 #ifdef WITH_THREADS
-    m_accessLock.unlock_shared();
+      m_accessLock.unlock_shared();
 #endif
     }
     accumulator->PlusEquals(ff_idx, score[0] + score[1]);
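
The Factor2ID code re-indented in the hunks above is a straightforward memoisation: one vector per model, indexed by a factor ID, with -1 marking entries that have not been looked up yet and a miss falling through to the neural LM's vocabulary lookup. The sketch below is a simplified, free-standing version of that idea only; it leaves out the locking and the try/catch handling visible in the diff, and Vocabulary and IdMemo are hypothetical names, not Moses or nplm API.

    #include <map>
    #include <string>
    #include <vector>

    // Hypothetical stand-in for the external vocabulary (nplm plays this role
    // in Moses): maps a surface string to a model-internal id, -1 if unknown.
    struct Vocabulary {
      std::map<std::string, int> words;
      int lookup_word(const std::string &w) const {
        std::map<std::string, int>::const_iterator it = words.find(w);
        return it == words.end() ? -1 : it->second;
      }
    };

    // Memoises lookups in a vector indexed by a small dense key
    // (Moses uses the factor's ID); -1 is the "not cached yet" sentinel.
    class IdMemo {
    public:
      explicit IdMemo(const Vocabulary &vocab) : m_vocab(vocab) {}

      int Lookup(std::size_t key, const std::string &word) {
        if (key >= m_cache.size()) m_cache.resize(key + 1, -1);
        if (m_cache[key] == -1) m_cache[key] = m_vocab.lookup_word(word); // miss
        return m_cache[key];
      }

    private:
      const Vocabulary &m_vocab;
      std::vector<int> m_cache;
    };
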
From d349bf8a94cb9b20255065488c599d3fed76c9d1 Mon Sep 17 00:00:00 2001
From: Hieu Hoang
Date: Sun, 23 Aug 2015 19:00:19 +0400
Subject: [PATCH 275/286] dos2unix everything

---
 .../effects_highlight_bg_image.html        |   2 +-
 scripts/generic/ph_numbers.perl            | 212 +++++++++---------
 .../nonbreaking_prefix.ca                  | 150 ++++++-------
 .../nonbreaking_prefix.hu                  |   2 +-
 .../nonbreaking_prefix.sl                  | 156 ++++++-------
 scripts/tests/full-train-mert-decode.test  |   2 +-
 6 files changed, 262 insertions(+), 262 deletions(-)

diff --git a/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/test/functional/effects_highlight_bg_image.html b/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/test/functional/effects_highlight_bg_image.html
index cb0adf284..edd77abc0 100644
--- a/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/test/functional/effects_highlight_bg_image.html
+++ b/scripts/ems/web/javascripts/scriptaculous-js-1.8.3/test/functional/effects_highlight_bg_image.html
@@ -26,7 +26,7 @@ window.onload = function() {
     li[i].onmousedown = Li.onMouseDown_without.bindAsEventListener(li[i]);
   }
 }
-
+
 //]]>