probing pt lib compiles

This commit is contained in:
Hieu Hoang 2017-02-15 21:26:28 +00:00
parent 8f456299c4
commit 74b123649e
27 changed files with 438 additions and 82 deletions

View File

@ -30,11 +30,6 @@ else {
alias programsMin ;
}
exe CreateProbingPT : CreateProbingPT.cpp ..//boost_filesystem ../moses//moses ;
#exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ;
alias programsProbing : CreateProbingPT ; #QueryProbingPT
exe merge-sorted :
merge-sorted.cc
../moses//moses
@ -43,6 +38,6 @@ $(TOP)//boost_iostreams
$(TOP)//boost_program_options
;
alias programs : 1-1-Extraction TMining generateSequences processLexicalTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable pruneGeneration ;
alias programs : 1-1-Extraction TMining generateSequences processLexicalTable queryLexicalTable programsMin merge-sorted prunePhraseTable pruneGeneration ;
#processPhraseTable queryPhraseTable

View File

@ -65,15 +65,7 @@ alias deps : ..//z ..//boost_iostreams ..//boost_filesystem ../moses/Translatio
TranslationModel/CompactPT/TargetPhraseCollectionCache.cpp
TranslationModel/CompactPT/ThrowingFwrite.cpp
TranslationModel/ProbingPT/ProbingPT.cpp
TranslationModel/ProbingPT/hash.cpp
TranslationModel/ProbingPT/line_splitter.cpp
TranslationModel/ProbingPT/probing_hash_utils.cpp
TranslationModel/ProbingPT/querying.cpp
TranslationModel/ProbingPT/storing.cpp
TranslationModel/ProbingPT/StoreVocab.cpp
TranslationModel/ProbingPT/StoreTarget.cpp
TranslationModel/ProbingPT/vocabid.cpp
TranslationModel/ProbingPT/ProbingPT.cpp
parameters/AllOptions.cpp
parameters/BookkeepingOptions.cpp

View File

@ -2,8 +2,8 @@
#include <boost/program_options.hpp>
#include "util/usage.hh"
#include "moses/TranslationModel/ProbingPT/storing.hh"
#include "moses/InputFileStream.h"
#include "moses/OutputFileStream.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
#include "moses/Util.h"
using namespace std;
@ -74,9 +74,9 @@ int main(int argc, char* argv[])
std::string ReformatSCFGFile(const std::string &path)
{
Moses::InputFileStream inFile(path);
probingpt::InputFileStream inFile(path);
string reformattedPath = path + ".reformat.gz";
Moses::OutputFileStream outFile(reformattedPath);
probingpt::OutputFileStream outFile(reformattedPath);
string line;
while (getline(inFile, line)) {

View File

@ -0,0 +1,59 @@
// $Id$
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "InputFileStream.h"
#include "gzfilebuf.h"
#include <iostream>
using namespace std;
namespace probingpt
{
/** Open filePath for reading.
 *
 * A path longer than three characters that ends in ".gz" is read through a
 * gzfilebuf; every other path is opened as a plain std::filebuf. If the plain
 * file cannot be opened, a message is written to stderr and the process exits
 * with status 1.
 */
InputFileStream::InputFileStream(const std::string &filePath) :
  std::istream(NULL), m_streambuf(NULL)
{
  const std::string::size_type suffixLen = 3;
  const bool looksGzipped = filePath.size() > suffixLen
                            && filePath.compare(filePath.size() - suffixLen, suffixLen, ".gz") == 0;

  if (looksGzipped) {
    m_streambuf = new gzfilebuf(filePath.c_str());
  } else {
    std::filebuf *plainBuf = new std::filebuf();
    // filebuf::open() returns a null pointer when the file cannot be opened.
    if (plainBuf->open(filePath.c_str(), std::ios::in) == NULL) {
      std::cerr << "Can't read " << filePath.c_str() << std::endl;
      std::exit(1);
    }
    m_streambuf = plainBuf;
  }

  // Attach the chosen buffer to the std::istream base.
  this->init(m_streambuf);
}
/** Free the stream buffer that the constructor allocated. */
InputFileStream::~InputFileStream()
{
  std::streambuf *owned = m_streambuf;
  m_streambuf = NULL;
  delete owned;
}
/** Currently a no-op: the body is empty and no state is touched.
 *
 * The underlying stream buffer is deleted by the destructor, not here.
 */
void InputFileStream::Close()
{
}
}

View File

@ -0,0 +1,46 @@
// $Id$
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include <cstdlib>
#include <fstream>
#include <string>
namespace probingpt
{
/** Used in place of std::istream; can read gzip-compressed files when the
* file name ends in ".gz".
*
* The constructor picks the stream buffer: a gzfilebuf for ".gz" paths and a
* std::filebuf for everything else.
*/
class InputFileStream: public std::istream
{
protected:
/// Buffer this stream reads from; allocated by the constructor and deleted
/// by the destructor.
std::streambuf *m_streambuf;
public:
/** Open filePath for reading.
*
* If a non-".gz" file cannot be opened, the implementation prints
* "Can't read <path>" to stderr and calls exit(1).
*/
explicit InputFileStream(const std::string &filePath);
/// Deletes m_streambuf.
~InputFileStream();
/// Currently does nothing (the implementation body is empty).
void Close();
};
}

View File

@ -1,7 +1,19 @@
alias deps : ..//z ..//boost_iostreams ..//boost_filesystem ../moses/TranslationModel/CompactPT//cmph ;
lib probingpt :
temp.cpp
StoreTarget.cpp
StoreVocab.cpp
hash.cpp
line_splitter.cpp
probing_hash_utils.cpp
querying.cpp
storing.cpp
vocabid.cpp
OutputFileStream.cpp
InputFileStream.cpp
deps
;
exe ppt : Main.cpp probingpt ;
exe CreateProbingPT : CreateProbingPT.cpp probingpt ;
alias programs : ppt ;
alias programs : CreateProbingPT ;

View File

@ -1,5 +0,0 @@
// Placeholder entry point: the body is empty, so the program does nothing.
int main()
{
}

View File

@ -0,0 +1,87 @@
// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <iostream>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include "OutputFileStream.h"
#include "gzfilebuf.h"
using namespace std;
using namespace boost::algorithm;
namespace probingpt
{
/** Build a stream that is not attached to anything yet; call Open() before use. */
OutputFileStream::OutputFileStream()
  : boost::iostreams::filtering_ostream()
  , m_outFile(NULL)
  , m_open(false)
{
  // Nothing to do: the filter chain stays empty until Open() is called.
}
/** Build a stream and immediately try to open filePath via Open().
 *
 * The boolean result of Open() is discarded here, so a failed open is not
 * reported by this constructor.
 */
OutputFileStream::OutputFileStream(const std::string &filePath)
  : m_outFile(NULL)
  , m_open(false)
{
  this->Open(filePath);
}
/** Make sure any open output is flushed and released before destruction. */
OutputFileStream::~OutputFileStream()
{
  // Close() returns immediately when the stream was never opened.
  this->Close();
}
bool OutputFileStream::Open(const std::string &filePath)
{
assert(!m_open);
if (filePath == std::string("-")) {
// Write to standard output. Leave m_outFile null.
this->push(std::cout);
} else {
m_outFile = new ofstream(filePath.c_str(),
ios_base::out | ios_base::binary);
if (m_outFile->fail()) {
return false;
}
if (ends_with(filePath, ".gz")) {
this->push(boost::iostreams::gzip_compressor());
}
this->push(*m_outFile);
}
m_open = true;
return true;
}
/** Flush and close the stream; does nothing if the stream is not open.
 *
 * The whole filter chain is emptied (not just the file device), so neither a
 * gzip compressor nor the standard-output device is left behind. This is what
 * allows the stream to be opened again afterwards.
 */
void OutputFileStream::Close()
{
  if (!m_open) return;
  this->flush();
  // reset() closes every filter and device in a complete chain (which lets
  // the gzip compressor write its trailer) and then removes them all.
  this->reset();
  if (m_outFile) {
    m_outFile->close();
    delete m_outFile;
    m_outFile = NULL;
  }
  m_open = false;
}
}

View File

@ -0,0 +1,81 @@
// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include <cstdlib>
#include <fstream>
#include <string>
#include <iostream>
#include <boost/iostreams/filtering_stream.hpp>
namespace probingpt
{
/** Version of std::ostream with transparent compression.
*
* Transparently compresses output when writing to a file whose name ends in
* ".gz". Or, writes to stdout instead of a file when given a filename
* consisting of just a dash ("-").
*
* Built on boost::iostreams::filtering_ostream: Open() pushes the optional
* gzip compressor and the output device onto the filter chain.
*/
class OutputFileStream: public boost::iostreams::filtering_ostream
{
private:
/** File that needs flushing & closing when we close this stream.
*
* Is NULL when no file is opened, e.g. when writing to standard output.
* Allocated by Open() and released by Close().
*/
std::ofstream *m_outFile;
/// Is this stream open? Set by a successful Open(), cleared by Close().
bool m_open;
public:
/** Create an unopened OutputFileStream.
*
* Until it's been opened, nothing can be done with this stream.
*/
OutputFileStream();
/** Create an OutputFileStream, and open it by calling Open().
*
* The return value of Open() is not checked by this constructor, so a
* failed open is not reported to the caller.
*/
OutputFileStream(const std::string &filePath);
/// Calls Close().
virtual ~OutputFileStream();
// TODO: Can we please just always throw an exception when this fails?
/** Open stream.
*
* If filePath is "-" (just a dash), this opens the stream for writing to
* standard output. Otherwise, it opens the given file. If the filename
* has the ".gz" suffix, output will be transparently compressed.
*
* Must not be called on a stream that is already open (the implementation
* asserts this).
*
* Call Close() to close the file.
*
* Returns whether opening the file was successful. It may also throw an
* exception on failure.
*/
bool Open(const std::string &filePath);
/// Flush and close stream. After this, the stream can be opened again.
/// Does nothing if the stream is not open.
void Close();
};
}

View File

@ -8,12 +8,12 @@
#include "StoreTarget.h"
#include "line_splitter.hh"
#include "probing_hash_utils.hh"
#include "../../legacy/OutputFileStream.h"
#include "../../legacy/Util2.h"
#include "moses2/legacy/OutputFileStream.h"
#include "moses2/legacy/Util2.h"
using namespace std;
namespace Moses2
namespace probingpt
{
StoreTarget::StoreTarget(const std::string &basepath)
@ -51,7 +51,7 @@ uint64_t StoreTarget::Save()
}
// clear coll
RemoveAllInColl(m_coll);
Moses2::RemoveAllInColl(m_coll);
m_coll.clear();
// starting position of coll
@ -149,10 +149,10 @@ void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg)
util::SingleCharacter(' '));
while (it) {
string tok = it->as_string();
float prob = Scan<float>(tok);
float prob = Moses2::Scan<float>(tok);
if (log_prob) {
prob = FloorScore(log(prob));
prob = Moses2::FloorScore(log(prob));
if (prob == 0.0f) prob = 0.0000000001;
}
@ -172,12 +172,12 @@ void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg)
it = util::TokenIter<util::SingleCharacter>(line.word_align,
util::SingleCharacter(' '));
while (it) {
string tokPair = Trim(it->as_string());
string tokPair = Moses2::Trim(it->as_string());
if (tokPair.empty()) {
break;
}
vector<size_t> alignPair = Tokenize<size_t>(tokPair, "-");
vector<size_t> alignPair = Moses2::Tokenize<size_t>(tokPair, "-");
assert(alignPair.size() == 2);
bool nonTerm = false;
@ -241,11 +241,11 @@ void StoreTarget::AppendLexRO(std::string &prop, std::vector<float> &retvector,
//cerr << "lexProb=" << lexProb << endl;
// append lex probs to pt probs
vector<float> scores = Tokenize<float>(lexProb);
vector<float> scores = Moses2::Tokenize<float>(lexProb);
if (log_prob) {
for (size_t i = 0; i < scores.size(); ++i) {
scores[i] = FloorScore(log(scores[i]));
scores[i] = Moses2::FloorScore(log(scores[i]));
if (scores[i] == 0.0f) scores[i] = 0.0000000001;
}
}

View File

@ -13,7 +13,7 @@
#include <boost/unordered_set.hpp>
#include "StoreVocab.h"
namespace Moses2
namespace probingpt
{
class line_text;

View File

@ -7,7 +7,7 @@
#include <fstream>
#include "StoreVocab.h"
namespace Moses2
namespace probingpt
{
} /* namespace probingpt */

View File

@ -7,10 +7,10 @@
#pragma once
#include <string>
#include <boost/unordered_map.hpp>
#include "../../legacy/OutputFileStream.h"
#include "../../legacy/Util2.h"
#include "OutputFileStream.h"
#include "moses2/legacy/Util2.h"
namespace Moses2
namespace probingpt
{
template<typename VOCABID>

94
probingpt/gzfilebuf.h Normal file
View File

@ -0,0 +1,94 @@
#ifndef moses_gzfile_buf_h
#define moses_gzfile_buf_h
#include <stdexcept>
#include <streambuf>
#include <zlib.h>
#include <cstring>
namespace probingpt
{
/** Read-only std::streambuf over a gzip-compressed file, backed by zlib.
 *
 * Characters are decompressed with gzread() into a small internal buffer
 * whose first sizeof(int) bytes are reserved as a putback area.
 * Writing and absolute seeking are not supported; those operations report
 * failure through their normal return values.
 *
 * Unknown parentage.
 * @todo replace with boost version - output stream already uses it
 */
class gzfilebuf: public std::streambuf
{
public:
  /** Open filename for reading.
   *
   * @throws std::runtime_error if gzopen() cannot open the file.
   */
  gzfilebuf(const char *filename) {
    _gzf = gzopen(filename, "rb");
    if (!_gzf) throw std::runtime_error(
        "Could not open " + std::string(filename) + ".");
    // Start with an empty get area positioned just after the putback area,
    // so the first read triggers underflow().
    setg(_buff + sizeof(int), // beginning of putback area
         _buff + sizeof(int), // read position
         _buff + sizeof(int)); // end position
  }

  ~gzfilebuf() {
    gzclose(_gzf);
  }

protected:
  /// Writing is not supported: always reports failure.
  virtual int_type overflow(int_type /* c */) {
    // A bare `throw;` here would call std::terminate() when no exception is
    // active, so signal failure the way the streambuf contract expects.
    return traits_type::eof();
  }

  /// Writing is not supported: no characters are ever consumed.
  virtual std::streamsize xsputn(const char* /* s */, std::streamsize /* num */) {
    return 0;
  }

  /// Absolute seeking is not supported: always returns an invalid position.
  virtual std::streampos seekpos(std::streampos /* sp */,
                                 std::ios_base::openmode /* which = std::ios_base::in | std::ios_base::out */) {
    return std::streampos(std::streamoff(-1));
  }

  /** Refill the get area from the compressed file.
   *
   * @return the next available character without consuming it, or
   *         traits_type::eof() on end of file or read error.
   */
  virtual int_type underflow() {
    // is read position before end of _buff?
    if (gptr() < egptr()) {
      return traits_type::to_int_type(*gptr());
    }

    /* process size of putback area
     * - use number of characters read
     * - but at most sizeof(int)
     */
    unsigned int numPutback = gptr() - eback();
    if (numPutback > sizeof(int)) {
      numPutback = sizeof(int);
    }

    /* copy the most recently read characters into
     * the putback area at the front of _buff
     */
    std::memmove(_buff + (sizeof(int) - numPutback), gptr() - numPutback,
                 numPutback);

    // read new characters
    int num = gzread(_gzf, _buff + sizeof(int), _buffsize - sizeof(int));
    if (num <= 0) {
      // ERROR or EOF
      return traits_type::eof();
    }

    // reset _buff pointers
    setg(_buff + (sizeof(int) - numPutback), // beginning of putback area
         _buff + sizeof(int), // read position
         _buff + sizeof(int) + num); // end of buffer

    // return next character
    return traits_type::to_int_type(*gptr());
  }

  /** Read up to num characters into s.
   *
   * Characters already buffered by underflow() are delivered first, and the
   * buffer is refilled through underflow() as needed. Reading straight from
   * the file would skip the buffered characters and hand data back out of
   * order when single-character and bulk reads are mixed.
   *
   * @return the number of characters actually stored in s (never negative).
   */
  virtual std::streamsize xsgetn(char* s, std::streamsize num) {
    std::streamsize copied = 0;
    while (copied < num) {
      if (gptr() == egptr()
          && traits_type::eq_int_type(underflow(), traits_type::eof())) {
        break; // end of file or read error
      }
      const std::streamsize available = egptr() - gptr();
      const std::streamsize wanted = num - copied;
      const std::streamsize chunk = available < wanted ? available : wanted;
      std::memcpy(s + copied, gptr(), static_cast<std::size_t>(chunk));
      gbump(static_cast<int>(chunk)); // chunk never exceeds _buffsize
      copied += chunk;
    }
    return copied;
  }

private:
  // Not copyable: a copy would share _gzf and close it twice.
  // Declared but intentionally not defined.
  gzfilebuf(const gzfilebuf&);
  gzfilebuf& operator=(const gzfilebuf&);

  gzFile _gzf;
  static const unsigned int _buffsize = 1024;
  char _buff[_buffsize];
};
}
#endif

View File

@ -3,7 +3,7 @@
using namespace std;
namespace Moses2
namespace probingpt
{
uint64_t getHash(StringPiece text)

View File

@ -6,7 +6,7 @@
#include "util/tokenize_piece.hh"
#include <vector>
namespace Moses2
namespace probingpt
{
//Gets the MurmurHash for the given string

View File

@ -1,6 +1,6 @@
#include "line_splitter.hh"
namespace Moses2
namespace probingpt
{
line_text splitLine(const StringPiece &textin, bool scfg)

View File

@ -9,7 +9,7 @@
#include "util/tokenize_piece.hh"
#include <vector>
namespace Moses2
namespace probingpt
{
//Struct for holding processed line

View File

@ -2,7 +2,7 @@
#include "probing_hash_utils.hh"
#include "util/file.hh"
namespace Moses2
namespace probingpt
{
//Read table from disk, return memory map location

View File

@ -11,7 +11,7 @@
#include <fcntl.h>
#include <fstream>
namespace Moses2
namespace probingpt
{
#define API_VERSION 15

View File

@ -1,10 +1,10 @@
#include "querying.hh"
#include "util/exception.hh"
#include "../../legacy/Util2.h"
#include "moses2/legacy/Util2.h"
using namespace std;
namespace Moses2
namespace probingpt
{
QueryEngine::QueryEngine(const char * filepath, util::LoadMethod load_method)
@ -103,7 +103,7 @@ uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const
{
//TOO SLOW
//uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
return Moses2::getKey(source_phrase, size);
return probingpt::getKey(source_phrase, size);
}
std::pair<bool, uint64_t> QueryEngine::query(uint64_t key)
@ -127,14 +127,14 @@ void QueryEngine::read_alignments(const std::string &alignPath)
vector<string> toks = Moses2::Tokenize(line, "\t ");
UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file");
uint32_t alignInd = Scan<uint32_t>(toks[0]);
uint32_t alignInd = Moses2::Scan<uint32_t>(toks[0]);
if (alignInd >= alignColl.size()) {
alignColl.resize(alignInd + 1);
}
Alignments &aligns = alignColl[alignInd];
for (size_t i = 1; i < toks.size(); ++i) {
size_t pos = Scan<size_t>(toks[i]);
size_t pos = Moses2::Scan<size_t>(toks[i]);
aligns.push_back(pos);
}
}
@ -142,25 +142,25 @@ void QueryEngine::read_alignments(const std::string &alignPath)
void QueryEngine::file_exits(const std::string &basePath)
{
if (!FileExists(basePath + "/Alignments.dat")) {
if (!Moses2::FileExists(basePath + "/Alignments.dat")) {
UTIL_THROW2("Require file does not exist in: " << basePath << "/Alignments.dat");
}
if (!FileExists(basePath + "/TargetColl.dat")) {
if (!Moses2::FileExists(basePath + "/TargetColl.dat")) {
UTIL_THROW2("Require file does not exist in: " << basePath << "/TargetColl.dat");
}
if (!FileExists(basePath + "/TargetVocab.dat")) {
if (!Moses2::FileExists(basePath + "/TargetVocab.dat")) {
UTIL_THROW2("Require file does not exist in: " << basePath << "/TargetVocab.dat");
}
if (!FileExists(basePath + "/cache")) {
if (!Moses2::FileExists(basePath + "/cache")) {
UTIL_THROW2("Require file does not exist in: " << basePath << "/cache");
}
if (!FileExists(basePath + "/config")) {
if (!Moses2::FileExists(basePath + "/config")) {
UTIL_THROW2("Require file does not exist in: " << basePath << "/config");
}
if (!FileExists(basePath + "/probing_hash.dat")) {
if (!Moses2::FileExists(basePath + "/probing_hash.dat")) {
UTIL_THROW2("Require file does not exist in: " << basePath << "/probing_hash.dat");
}
if (!FileExists(basePath + "/source_vocabids")) {
if (!Moses2::FileExists(basePath + "/source_vocabids")) {
UTIL_THROW2("Require file does not exist in: " << basePath << "/source_vocabids");
}

View File

@ -9,9 +9,9 @@
#include "probing_hash_utils.hh"
#include "hash.hh" //Includes line splitter
#include "line_splitter.hh"
#include "../../legacy/Util2.h"
#include "moses2/legacy/Util2.h"
namespace Moses2
namespace probingpt
{
class QueryEngine
@ -68,7 +68,7 @@ public:
}
const std::string &foundStr = iter->second;
found = Scan<T>(foundStr);
found = Moses2::Scan<T>(foundStr);
return true;
}

View File

@ -4,12 +4,12 @@
#include "storing.hh"
#include "StoreTarget.h"
#include "StoreVocab.h"
#include "../../legacy/Util2.h"
#include "../../legacy/InputFileStream.h"
#include "moses2/legacy/Util2.h"
#include "InputFileStream.h"
using namespace std;
namespace Moses2
namespace probingpt
{
///////////////////////////////////////////////////////////////////////
@ -161,9 +161,9 @@ void createProbingPT(const std::string &phrasetable_path,
// update cache - CURRENT source phrase, not prev
if (max_cache_size) {
std::string countStr = line.counts.as_string();
countStr = Trim(countStr);
countStr = Moses2::Trim(countStr);
if (!countStr.empty()) {
std::vector<float> toks = Tokenize<float>(countStr);
std::vector<float> toks = Moses2::Tokenize<float>(countStr);
//cerr << "CACHE:" << line.source_phrase << " " << countStr << " " << toks[1] << endl;
if (toks.size() >= 2) {
@ -174,7 +174,7 @@ void createProbingPT(const std::string &phrasetable_path,
uint64_t currKey = getKey(currVocabidSource);
CacheItem *item = new CacheItem(
Trim(line.source_phrase.as_string()),
Moses2::Trim(line.source_phrase.as_string()),
currKey,
toks[1]);
cache.push(item);
@ -244,7 +244,7 @@ size_t countUniqueSource(const std::string &path)
std::string line, prevSource;
while (std::getline(strme, line)) {
std::vector<std::string> toks = TokenizeMultiCharSeparator(line, "|||");
std::vector<std::string> toks = Moses2::TokenizeMultiCharSeparator(line, "|||");
assert(toks.size() != 0);
if (prevSource != toks[0]) {
@ -284,7 +284,7 @@ void serialize_cache(
uint64_t getKey(const std::vector<uint64_t> &vocabid_source)
{
return Moses2::getKey(vocabid_source.data(), vocabid_source.size());
return probingpt::getKey(vocabid_source.data(), vocabid_source.size());
}
std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos)

View File

@ -17,7 +17,7 @@
#include "util/file.hh"
#include "vocabid.hh"
namespace Moses2
namespace probingpt
{
typedef std::vector<uint64_t> SourcePhrase;

View File

@ -1,5 +0,0 @@
// Placeholder function: always yields the constant 5.
int foo()
{
  const int fixedValue = 5;
  return fixedValue;
}

View File

@ -1,9 +1,9 @@
#include <boost/foreach.hpp>
#include "vocabid.hh"
#include "StoreVocab.h"
#include "../../legacy/Util2.h"
#include "moses2/legacy/Util2.h"
namespace Moses2
namespace probingpt
{
void add_to_map(StoreVocab<uint64_t> &sourceVocab,
@ -45,9 +45,9 @@ void read_map(std::map<uint64_t, std::string> &karta, const char* filename)
std::string line;
while (getline(is, line)) {
std::vector<std::string> toks = Tokenize(line, "\t");
std::vector<std::string> toks = Moses2::Tokenize(line, "\t");
assert(toks.size() == 2);
uint64_t ind = Scan<uint64_t>(toks[1]);
uint64_t ind = Moses2::Scan<uint64_t>(toks[1]);
karta[ind] = toks[0];
}

View File

@ -13,7 +13,7 @@
#include "util/string_piece.hh" //Tokenization and work with StringPiece
#include "util/tokenize_piece.hh"
namespace Moses2
namespace probingpt
{
template<typename VOCABID>
class StoreVocab;