Removed ORLM.

This commit is contained in:
Ulrich Germann 2015-08-17 18:11:04 +01:00
parent 8b3f2d4338
commit e8f010b9af
20 changed files with 4 additions and 3387 deletions

View File

@ -39,11 +39,11 @@ int main(int argc, char** argv)
#include "moses/ThreadPool.h"
#include "moses/TranslationTask.h"
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
#include "moses/FF/StatefulFeatureFunction.h"
#if PT_UG
#include "moses/TranslationModel/UG/mmsapt.h"
#endif
#include "moses/TreeInput.h"
#include "moses/LM/ORLM.h"
#include "moses/IOWrapper.h"
#include <boost/foreach.hpp>
@ -86,64 +86,11 @@ public:
msg += "supports updates.";
throw xmlrpc_c::fault(msg.c_str(), xmlrpc_c::fault::CODE_PARSE);
#endif
if(add2ORLM_) {
//updateORLM();
}
XVERBOSE(1,"Done inserting\n");
//PhraseDictionary* pdsa = (PhraseDictionary*) pdf->GetDictionary(*dummy);
map<string, xmlrpc_c::value> retData;
//*retvalP = xmlrpc_c::value_struct(retData);
#ifndef PT_UG
pdf = 0;
#endif
pdsa = 0;
*retvalP = xmlrpc_c::value_string("Phrase table updated");
}
string source_, target_, alignment_;
bool bounded_, add2ORLM_;
/*
void updateORLM() {
// TODO(level101): this belongs in the language model, not in moseserver.cpp
vector<string> vl;
map<vector<string>, int> ngSet;
LMList lms = StaticData::Instance().GetLMList(); // get LM
LMList::const_iterator lmIter = lms.begin();
LanguageModel *lm = *lmIter;
LanguageModelORLM* orlm = static_cast<LanguageModelORLM*>(lm);
if(orlm == 0) {
cerr << "WARNING: Unable to add target sentence to ORLM\n";
return;
}
// break out new ngrams from sentence
const int ngOrder(orlm->GetNGramOrder());
const std::string sBOS = orlm->GetSentenceStart()->GetString().as_string();
const std::string sEOS = orlm->GetSentenceEnd()->GetString().as_string();
Utils::splitToStr(target_, vl, " ");
// insert BOS and EOS
vl.insert(vl.begin(), sBOS);
vl.insert(vl.end(), sEOS);
for(int j=0; j < vl.size(); ++j) {
int i = (j<ngOrder) ? 0 : j-ngOrder+1;
for(int t=j; t >= i; --t) {
vector<string> ngVec;
for(int s=t; s<=j; ++s) {
ngVec.push_back(vl[s]);
//cerr << vl[s] << " ";
}
ngSet[ngVec]++;
//cerr << endl;
}
}
// insert into LM in order from 1grams up (for LM well-formedness)
cerr << "Inserting " << ngSet.size() << " ngrams into ORLM...\n";
for(int i=1; i <= ngOrder; ++i) {
iterate(ngSet, it) {
if(it->first.size() == i)
orlm->UpdateORLM(it->first, it->second);
}
}
}
*/
bool bounded_;
void breakOutParams(const params_t& params) {
params_t::const_iterator si = params.find("source");
@ -163,8 +110,6 @@ public:
XVERBOSE(1,"alignment = " << alignment_ << endl);
si = params.find("bounded");
bounded_ = (si != params.end());
si = params.find("updateORLM");
add2ORLM_ = (si != params.end());
}
};

View File

@ -134,11 +134,11 @@ if $(with-dalm) {
}
#ORLM is always compiled but needs special headers
obj ORLM.o : ORLM.cpp ..//headers ../TranslationModel/DynSAInclude//dynsa : : : <include>../TranslationModel/DynSAInclude ;
#obj ORLM.o : ORLM.cpp ..//headers ../TranslationModel/DynSAInclude//dynsa : : : <include>../TranslationModel/DynSAInclude ;
#Top-level LM library. If you've added a file that doesn't depend on external
#libraries, put it here.
alias LM : Backward.cpp BackwardLMState.cpp Base.cpp BilingualLM.cpp Implementation.cpp Ken.cpp MultiFactor.cpp Remote.cpp SingleFactor.cpp SkeletonLM.cpp ORLM.o
alias LM : Backward.cpp BackwardLMState.cpp Base.cpp BilingualLM.cpp Implementation.cpp Ken.cpp MultiFactor.cpp Remote.cpp SingleFactor.cpp SkeletonLM.cpp
../../lm//kenlm ..//headers $(dependencies) ;
alias macros : : : : <define>$(lmmacros) ;

View File

@ -1,108 +0,0 @@
#include <limits>
#include <iostream>
#include <fstream>
#include "moses/FactorCollection.h"
#include "moses/Phrase.h"
#include "moses/InputFileStream.h"
#include "moses/StaticData.h"
#include "ORLM.h"
using namespace std;
namespace Moses
{
// Load an OnlineRLM language model from disk and wire its vocabulary into
// Moses' factor collection (via CreateFactors()).
// @param filePath   path to the binary ORLM file (FileHandler transparently
//                   decompresses .gz/.bz2)
// @param factorType which factor of each Word this LM scores
// @param nGramOrder maximum n-gram order of the model
// @return always true; failures terminate inside FileHandler / OnlineRLM
bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
                             size_t nGramOrder)
{
  cerr << "Loading LanguageModelORLM..." << endl;
  m_filePath = filePath;
  m_factorType = factorType;
  m_nGramOrder = nGramOrder;
  // open read-only binary; third argument requests an existence check
  FileHandler fLmIn(m_filePath, std::ios::in|std::ios::binary, true);
  m_lm = new OnlineRLM<T>(&fLmIn, m_nGramOrder);
  fLmIn.close();
  //m_lm = new MultiOnlineRLM<T>(m_filePath, m_nGramOrder);
  // get special word ids: cache the OOV id so scoring can flag unknowns
  m_oov_id = m_lm->vocab_->GetWordID("<unk>");
  CreateFactors();
  return true;
}
// Build the factor-id -> ORLM word-id lookup table (lm_ids_vec_) so that
// GetLmID(const Factor*) becomes a constant-time vector access.  Also
// registers <s> and </s> with the factor collection and records them as
// the sentence start/end markers.
void LanguageModelORLM::CreateFactors()
{
  FactorCollection &factorCollection = FactorCollection::Instance();
  size_t maxFactorId = 0; // to create lookup vector later on
  // NOTE(review): m_-prefixed but actually a local temporary
  std::map<size_t, wordID_t> m_lmids_map; // map from factor id -> word id
  for(std::map<Word, wordID_t>::const_iterator vIter = m_lm->vocab_->VocabStart();
      vIter != m_lm->vocab_->VocabEnd(); vIter++) {
    // get word from ORLM vocab and associate with (new) factor id
    size_t factorId = factorCollection.AddFactor(Output,m_factorType,vIter->first.ToString())->GetId();
    m_lmids_map[factorId] = vIter->second;
    maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
  }
  // add factors for BOS and EOS and store bf word ids
  size_t factorId;
  m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, "<s>");
  factorId = m_sentenceStart->GetId();
  maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
  m_sentenceStartWord[m_factorType] = m_sentenceStart;
  m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, "</s>");
  factorId = m_sentenceEnd->GetId();
  maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
  m_sentenceEndWord[m_factorType] = m_sentenceEnd;
  // add to lookup vector in object
  lm_ids_vec_.resize(maxFactorId+1);
  // fill with OOV code so any factor unknown to the ORLM maps to <unk>
  fill(lm_ids_vec_.begin(), lm_ids_vec_.end(), m_oov_id);
  for (map<size_t, wordID_t>::const_iterator iter = m_lmids_map.begin();
       iter != m_lmids_map.end() ; ++iter)
    lm_ids_vec_[iter->first] = iter->second;
}
// Look up the ORLM word id for a surface string through the model's
// vocabulary (unknown strings yield the vocabulary's OOV id).
wordID_t LanguageModelORLM::GetLmID(const std::string& str) const
{
  const wordID_t id = m_lm->vocab_->GetWordID(str);
  return id;
}
// Map a Moses factor to its ORLM word id; factors outside the lookup
// table are treated as OOV.
wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const
{
  const size_t idx = factor->GetId();
  if (idx < lm_ids_vec_.size()) {
    return lm_ids_vec_[idx];
  }
  return m_oov_id;
}
// Score the last word of `contextFactor` given the preceding words.
// @param contextFactor n-gram context, oldest word first, predicted word last
// @param finalState    optional out-parameter receiving the LM state
// @return floored, transformed log-probability plus an OOV flag for the
//         predicted (last) word
// NOTE(review): assumes contextFactor.size() <= MAX_NGRAM_SIZE -- the
// stack buffer below is not bounds-checked; confirm callers guarantee this.
LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor,
                                     State* finalState) const
{
  FactorType factorType = GetFactorType();
  // set up context: translate each word's factor into an ORLM word id
  //std::vector<long unsigned int> factor(1,0);
  //std::vector<string> sngram;
  wordID_t ngram[MAX_NGRAM_SIZE];
  int count = contextFactor.size();
  for (int i = 0; i < count; i++) {
    ngram[i] = GetLmID((*contextFactor[i])[factorType]);
    //sngram.push_back(contextFactor[i]->GetString(factor, false));
  }
  //float logprob = FloorScore(TransformLMScore(lm_->getProb(sngram, count, finalState)));
  LMResult ret;
  ret.score = FloorScore(TransformLMScore(m_lm->getProb(&ngram[0], count, finalState)));
  // unknown iff the word being predicted mapped to the OOV id
  ret.unknown = count && (ngram[count - 1] == m_oov_id);
  /*if (finalState)
    std::cout << " = " << logprob << "(" << *finalState << ", " << *len <<")"<< std::endl;
  else
    std::cout << " = " << logprob << std::endl;
  */
  return ret;
}
// Insert (or bump the count of) an n-gram in the online model.  The
// vocabulary is opened for the duration of the update so previously
// unseen words can be added, then closed again.
bool LanguageModelORLM::UpdateORLM(const std::vector<string>& ngram, const int value)
{
  m_lm->vocab_->MakeOpen();
  const bool inserted = m_lm->update(ngram, value);
  m_lm->vocab_->MakeClosed();
  return inserted;
}
}

View File

@ -1,53 +0,0 @@
#pragma once
#include <string>
#include <vector>
#include "moses/Factor.h"
#include "moses/Util.h"
#include "SingleFactor.h"
#include "moses/TranslationModel/DynSAInclude/onlineRLM.h"
//#include "multiOnlineRLM.h"
#include "moses/TranslationModel/DynSAInclude/FileHandler.h"
#include "moses/TranslationModel/DynSAInclude/vocab.h"
namespace Moses
{
class Factor;
class Phrase;
/** Wrapper exposing an online randomized LM (ORLM) as a single-factor
 *  Moses language model.  Owns the OnlineRLM instance created in Load();
 *  on destruction the (possibly updated) model is written back to
 *  "<path>.marked.gz".
 *  @todo ask ollie
 */
class LanguageModelORLM : public LanguageModelSingleFactor
{
public:
  typedef count_t T; // type for ORLM filter
  LanguageModelORLM(const std::string &line)
    :LanguageModelSingleFactor(line)
    ,m_lm(0) {
  }
  bool Load(const std::string &filePath, FactorType factorType, size_t nGramOrder);
  virtual LMResult GetValue(const std::vector<const Word*> &contextFactor, State* finalState = NULL) const;
  ~LanguageModelORLM() {
    // m_lm is only allocated in Load(); guard so destroying a
    // never-loaded model does not dereference a null pointer.
    if (m_lm) {
      //save LM with markings
      Utils::rtrim(m_filePath, ".gz");
      FileHandler fout(m_filePath + ".marked.gz", std::ios::out|std::ios::binary, false);
      m_lm->save(&fout);
      fout.close();
      delete m_lm;
    }
  }
  void CleanUpAfterSentenceProcessing() {
    // same guard: clearing caches only makes sense after Load()
    if (m_lm)
      m_lm->clearCache(); // clear caches
  }
  bool UpdateORLM(const std::vector<std::string>& ngram, const int value);
protected:
  OnlineRLM<T>* m_lm;                // owned; created in Load(), saved+deleted in dtor
  //MultiOnlineRLM<T>* m_lm;
  wordID_t m_oov_id;                 // ORLM id for "<unk>"
  std::vector<wordID_t> lm_ids_vec_; // factor id -> ORLM word id (OOV-filled)
  void CreateFactors();
  wordID_t GetLmID(const std::string &str) const;
  wordID_t GetLmID(const Factor *factor) const;
};
} // end namespace

View File

@ -1,186 +0,0 @@
#include "FileHandler.h"
#include <cstdio>
// Workaround: plain Windows does not have popen()/pclose().
// (MinGW already #define's them, so skip the workaround there.)
#if defined(WIN32) && !defined(__MINGW32__)
#define popen(A, B) _popen(A, B)
#define pclose(A) _pclose(A)
#endif
namespace Moses
{
// FileHandler class
const std::string FileHandler::kStdInDescriptor = "___stdin___";
const std::string FileHandler::kStdOutDescriptor = "___stdout___";
// compression commands
const FileExtension FileHandler::kGzipped = ".gz";
const FileExtension FileHandler::kBzipped2 = ".bz2";
const std::string FileHandler::kCatCommand = "cat";
const std::string FileHandler::kGzipCommand = "gzip -f";
const std::string FileHandler::kGunzipCommand = "gunzip -f";
const std::string FileHandler::kBzip2Command = "bzip2 -f";
const std::string FileHandler::kBunzip2Command = "bunzip2 -f";
// Construct a handler over a real file, stdin/stdout (magic descriptor
// paths), or a compression pipe, depending on `path` and `flags`.
// Bidirectional streams are not supported and abort the process.
// @param path        file path or kStdInDescriptor / kStdOutDescriptor
// @param flags       open mode; must not combine std::ios::in and std::ios::out
// @param checkExists (unused here; existence is checked in setStreamBuffer)
FileHandler::FileHandler(const std::string & path, std::ios_base::openmode flags, bool /* checkExists */)
  : std::fstream((const char*) NULL), path_(path), flags_(flags), buffer_(NULL), fp_(NULL)
{
  // Reject ANY read+write combination.  The previous test
  // !(flags ^ (std::ios::in|std::ios::out)) only caught flags that were
  // exactly in|out, so e.g. in|out|binary slipped through the guard.
  if( (flags & std::ios::in) && (flags & std::ios::out) ) {
    fprintf(stderr, "ERROR: FileHandler does not support bidirectional files (%s).\n", path_.c_str());
    exit(EXIT_FAILURE);
  } else {
    bool ret = setStreamBuffer(flags & std::ios::in);
    UTIL_THROW_IF2(!ret, "Unable to set stream buffer");
  }
  this->precision(32);
}
// Close any compression pipe, free the stream buffer we allocated
// (the std::cin / std::cout buffers are borrowed, never deleted),
// and close the underlying fstream.
FileHandler::~FileHandler()
{
#ifndef NO_PIPES
  if( fp_ != 0 )
    pclose(fp_);
#endif
  if( path_ != FileHandler::kStdInDescriptor &&
      path_ != FileHandler::kStdOutDescriptor )
    delete buffer_;
  if( this->is_open() )
    this->close();
}
// Spawn `cmd` via popen() and wrap the pipe's file descriptor in a
// stream buffer.  Direction ("r"/"w") is derived from flags_.  Exits
// the process on failure.  The returned fdstreambuf is owned by this
// object and freed in ~FileHandler.
fdstreambuf * FileHandler::openCompressedFile(const char * cmd)
{
  //bool isInput = (flags_ & std::ios::in);
  //open pipe to file with compression/decompression command
  const char * p_type = (flags_ & std::ios::in ? "r" : "w");
#ifndef NO_PIPES
  fp_ = popen(cmd, p_type);
#else
  // platforms without pipes: treated as an open failure below
  fp_ = NULL;
#endif
  if( fp_ == NULL ) {
    //fprintf(stderr, "ERROR:Failed to open compressed file at %s\n", path_.c_str());
    perror("openCompressedFile: ");
    exit(EXIT_FAILURE);
  }
  //open streambuf with file descriptor
  return new fdstreambuf(fileno(fp_));
}
// Select and install the stream buffer for this handler:
//   - the magic paths kStdInDescriptor / kStdOutDescriptor borrow the
//     buffer of std::cin / std::cout,
//   - paths with a known compression extension go through a pipe,
//   - anything else opens a plain std::filebuf.
// Exits the process if the file is missing (when checkExists) or no
// buffer could be created; otherwise installs the buffer and returns true.
bool FileHandler::setStreamBuffer(bool checkExists)
{
  // redirect stdin or stdout if necessary
  if (path_ == FileHandler::kStdInDescriptor) {
    UTIL_THROW_IF2((flags_ & std::ios::in) == 0,
                   "Incorrect flags: " << flags_);
    std::streambuf* sb = std::cin.rdbuf();
    buffer_ = sb;
  } else if (path_ == FileHandler::kStdOutDescriptor) {
    UTIL_THROW_IF2((flags_ & std::ios::out) == 0,
                   "Incorrect flags: " << flags_);
    std::streambuf* sb = std::cout.rdbuf();
    buffer_ = sb;
  } else {
    // real file
    if( checkExists && ! fileExists() ) {
      fprintf(stderr, "ERROR: Failed to find file at %s\n", path_.c_str());
      exit(EXIT_FAILURE);
    }
    std::string cmd = "";
    if( isCompressedFile(cmd) && (! cmd.empty()) ) {
      buffer_ = openCompressedFile(cmd.c_str());
    } else {
      // open underlying filebuf
      std::filebuf* fb = new std::filebuf();
      fb->open(path_.c_str(), flags_);
      buffer_ = fb;
    }
  }
  if (!buffer_) {
    fprintf(stderr, "ERROR:Failed to open file at %s\n", path_.c_str());
    exit(EXIT_FAILURE);
  }
  this->init(buffer_);
  return true;
}
/*
 * Checks for compression via file extension. Currently checks for
 * ".gz" and ".bz2".  On a match, fills `cmd` with the shell command
 * that (de)compresses path_ (direction from flags_) and returns true.
 */
bool FileHandler::isCompressedFile(std::string & cmd)
{
  bool compressed = false, isInput = (flags_ & std::ios::in);
  cmd = "";
  unsigned int len = path_.size();
  // Use rfind so the TRUE suffix is tested.  find() located the first
  // occurrence of the extension, so a path like "a.gz.backup.gz" failed
  // the position check even though it really ends in ".gz".
  if( len > kGzipped.size()
      && path_.rfind(kGzipped) == len - kGzipped.size()) {
    //gzip file command to compress or decompress
    compressed = true;
    // cmd = (isInput ? "exec gunzip -cf " : "exec gzip -c > ") + path_;
    cmd = (isInput ? "exec " + kGunzipCommand + "c "
           : "exec " + kGzipCommand + "c > ") + path_;
  } else if( len > kBzipped2.size() &&
             path_.rfind(kBzipped2) == len - kBzipped2.size()) {
    //do bzipped2 file command
    compressed = true;
    cmd = (isInput ? "exec " + kBunzip2Command + "c "
           : "exec " + kBzip2Command + "c > ") + path_;
  }
  return compressed;
}
// True iff stat() succeeds on path_ (i.e. the path exists and is
// reachable by this process).
bool FileHandler::fileExists()
{
  struct stat f_info;
  return stat(path_.c_str(), &f_info) == 0;
}
// static method used during preprocessing compressed files without
// opening fstream objects.
// Chooses (de)compression commands from the extension of `filepath`,
// falling back to `cat` (no compression).  compressionSuffix is only
// assigned on a match.  Returns true iff a real compressor was chosen.
bool FileHandler::getCompressionCmds(const std::string & filepath, std::string & compressionCmd,
                                     std::string & decompressionCmd,
                                     std::string & compressionSuffix)
{
  // determine what compression and decompression cmds are suitable from filepath
  compressionCmd = kCatCommand;
  decompressionCmd = kCatCommand;
  // rfind: test the actual suffix even when the extension string also
  // occurs earlier in the path (find() broke on e.g. "a.gz.old.gz")
  if (filepath.length() > kGzipped.size() &&
      filepath.rfind(kGzipped) == filepath.length()
      - kGzipped.length()) {
    compressionCmd = kGzipCommand;
    decompressionCmd = kGunzipCommand;
    compressionSuffix = kGzipped;
  } else if (filepath.length() > kBzipped2.size() &&
             filepath.rfind(kBzipped2) == filepath.length()
             - kBzipped2.length() ) {
    compressionCmd = kBzip2Command;
    decompressionCmd = kBunzip2Command;
    compressionSuffix = kBzipped2;
  }
  return (compressionCmd != kCatCommand && decompressionCmd != kCatCommand);
}
// Rewind the stream to the beginning.  Pipes cannot seek, so a
// compression pipe is closed and reopened instead of seeking.
bool FileHandler::reset()
{
#ifndef NO_PIPES
  // move to beginning of file
  if (fp_ != 0) {
    //can't seek on a pipe so reopen
    pclose(fp_);
    std::string cmd = "";
    if (isCompressedFile(cmd) && ! cmd.empty())
      buffer_ = openCompressedFile(cmd.c_str());
    // NOTE(review): if isCompressedFile() returned false here, buffer_
    // would still refer to the closed pipe -- presumably fp_ != 0
    // implies a compressed path; confirm.
    //reinitialize
    this->init(buffer_);
  } else
#endif
    buffer_->pubseekoff(0, std::ios_base::beg); //sets both get and put pointers to beginning of stream
  return true;
}
} //end namespace

View File

@ -1,72 +0,0 @@
#ifndef moses_DynSAInclude_file_h
#define moses_DynSAInclude_file_h
#include <iostream>
#include <fstream>
#include <cstdio>
#include <cstdlib>
#include <sys/stat.h>
#include <string>
#include "util/exception.hh"
#include "fdstream.h"
#include "utils.h"
namespace Moses
{
typedef std::string FileExtension;
//! fstream subclass that transparently reads/writes gzip- or
//! bzip2-compressed files (via external commands and pipes) and can
//! stand in for stdin/stdout when given the magic descriptor paths.
//! Streams are unidirectional: open with std::ios::in OR std::ios::out.
//! @todo ask abby2
class FileHandler: public std::fstream
{
public:
  // descriptors for stdin and stdout
  static const std::string kStdInDescriptor; // file name for std::cin
  static const std::string kStdOutDescriptor; // file name for std::cout
  // compression commands
  static const std::string kCatCommand; // i.e. no compression
  static const std::string kGzipCommand; // gzip -f
  static const std::string kGunzipCommand; // gunzip -f
  static const std::string kBzip2Command; // bzip2 -f
  static const std::string kBunzip2Command; // bunzip2 -f
  // open file or wrap stdin or stdout
  FileHandler(const std::string & path,
              std::ios_base::openmode flags = std::ios::in,
              bool checkExists = true);
  ~FileHandler();
  // file utilities: pick shell (de)compression commands by extension
  static bool getCompressionCmds(const std::string & filepath,
                                 std::string & compressionCmd,
                                 std::string & decompressionCmd,
                                 std::string & compressionSuffix);
  // data accessors
  std::string getPath() {
    return path_;
  }
  std::ios_base::openmode getFlags() {
    return flags_;
  }
  bool isStdIn() {
    return path_ == FileHandler::kStdInDescriptor;
  }
  bool isStdOut() {
    return path_ == FileHandler::kStdOutDescriptor;
  }
  // rewind to the beginning (reopens the pipe for compressed streams)
  bool reset();
protected:
  static const FileExtension kGzipped;  // ".gz"
  static const FileExtension kBzipped2; // ".bz2"
  bool fileExists();
  bool setStreamBuffer(bool checkExists);
  bool isCompressedFile(std::string & cmd);
  fdstreambuf* openCompressedFile(const char* cmd);
  std::string path_; // file path
  std::ios_base::openmode flags_; // open flags
  std::streambuf* buffer_; // buffer to either gzipped or standard data
  std::FILE* fp_; //file pointer to handle pipe data
};
} // end namespace
#endif

View File

@ -1 +0,0 @@
alias dynsa : ../../../util//kenutil ../..//headers : : : <include>. ;

View File

@ -1,201 +0,0 @@
// Copyright 2008 Abby Levenberg, David Talbot
//
// This file is part of RandLM
//
// RandLM is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// RandLM is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with RandLM. If not, see <http://www.gnu.org/licenses/>.
#ifndef INC_RANDLM_CACHE_H
#define INC_RANDLM_CACHE_H
#include <iterator>
#include <map>
#include <ctime>
#include <iostream>
namespace randlm
{
//! Trie node for Cache: maps the next word id to a child node and
//! stores a cached value plus an opaque LM state pointer.
//! @todo ask abby2
template<typename T>
class CacheNode
{
public:
  typedef std::map<wordID_t, CacheNode<T>* > childMap;
  // initialise value to 'unknown' (i.e. not yet queried or cached).
  // state_ was previously left uninitialized: a node whose state was
  // never set could hand out a garbage pointer -- start it at NULL.
  CacheNode(T unknown_value) : value_(unknown_value), state_(NULL) {}
  childMap childs_; // child pointers
  T value_; // value stored
  const void* state_; // state pointer (NULL until setCacheNgram assigns it)
};
//! N-gram cache organised as a trie keyed on word ids and walked
//! back-to-front (the LAST word of the n-gram is the first trie level,
//! so all histories of a word share a subtree).  T is the cached value
//! type; two reserved T values mark "never queried" (unknown_value_)
//! and "queried but absent from the model" (null_value_).
template<typename T>
class Cache
{
public:
  typedef typename std::map<wordID_t, CacheNode<T>* >::iterator childPtr;
  // unknown_value is used to indicate the ngram was not queried (yet)
  // null_value_ indicates it was queried but not found in model
  // space usage is handled by client.
  Cache(T unknown_value, T null_value) :
    cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) {
    root_ = newNode();
  }
  ~Cache() {
    if(clear()) {
      delete root_;
      root_ = NULL;
    } else {
      std::cerr << "Error freeing cache memory.\n";
    }
  }
  // Store `value` and `state` for the full ngram, creating trie nodes
  // along the path as needed.  Always returns true.
  bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) {
    // inserts full ngram into cache
    CacheNode<T>* node = root_;
    for (int i = len - 1; i > -1; --i) {
      childPtr child = node->childs_.find(ngram[i]);
      if( child != node->childs_.end() ) {
        // current node is already prefix. Go to child node
        node = node->childs_[ngram[i]];
      } else {
        // no child for prefix. set new child link in current node
        CacheNode<T> * newChild = newNode(node);
        node->childs_[ngram[i]] = newChild;
        // go to new node
        node = newChild;
      }
    }
    node->value_ = value;
    node->state_ = state;
    return true;
  }
  // Retrieve the cached value (and optionally state) for exactly this
  // full ngram.  Returns false when the path is absent or the stored
  // value is the unknown/null sentinel.
  bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) {
    // finds value for this full ngram only (returns false if full ngram not in cache)
    CacheNode<T> * node = root_;
    for(int i = len - 1; i > -1; --i) {
      // go to deepest level node of ngram in cache
      childPtr child = node->childs_.find(ngram[i]);
      if( child != node->childs_.end() ) {
        // switch to child node
        node = node->childs_[ngram[i]];
      } else {
        // not cached
        return false;
      }
    }
    *value = node->value_;
    if(state) *state = node->state_;
    return *value != null_value_ && *value != unknown_value_;
  }
  // Variant of getCache that indexes values[] by start position i
  // rather than by sub-ngram length.  Missing nodes are created so the
  // caller can fill them in afterwards.
  int getCache2(const wordID_t* ngram, int len, T** values, int* found) {
    // set values array to point to cache value nodes
    CacheNode<T> * node = root_;
    *found = 0;
    //values[0] = &node->value_; // pointer to root node's value
    bool all_found = true;
    for(int i = len - 1; i > -1; --i) {
      // go to deepest level node of ngram in cache
      childPtr child = node->childs_.find(ngram[i]);
      if( child != node->childs_.end() ) {
        // switch to child node
        node = node->childs_[ngram[i]];
        // get pointer to value (index by length - 1)
        values[i] = &node->value_;
        // if null_value then assume all extensions impossible
        if (node->value_ == null_value_) {
          return len - 1 - i; // max length posible
        }
        all_found = all_found && (node->value_ != unknown_value_);
        if (all_found)
          ++(*found);
      } else {
        // initialise uncached values
        CacheNode<T> * newChild = newNode(node);
        node->childs_[ngram[i]] = newChild;
        // go to new node
        node = newChild;
        values[i] = &node->value_;
      }
    }
    return len; // all possible
  }
  // Fetch (or create) pointers to the cached values of the ngram and
  // all its suffixes; values[] is indexed by sub-ngram length.
  int getCache(const wordID_t* ngram, int len, T** values, int* found) {
    // get pointers to values for ngram and constituents.
    // returns upper bound on longest subngram in model.
    // 'found' stores longest non-null and known value found.
    CacheNode<T> * node = root_;
    *found = 0;
    values[0] = &node->value_; // pointer to root node's value
    bool all_found = true;
    for(int i = len - 1; i > -1; --i) {
      // go to deepest level node of ngram in cache
      childPtr child = node->childs_.find(ngram[i]);
      if( child != node->childs_.end() ) {
        // switch to child node
        node = node->childs_[ngram[i]];
        // get pointer to value (index by length - 1)
        values[len - i] = &node->value_;
        // if null_value then assume all extensions impossible
        if (node->value_ == null_value_)
          return len - 1 - i; // max length posible
        all_found = all_found && (node->value_ != unknown_value_);
        if (all_found)
          ++(*found);
      } else {
        // initialise uncached values
        CacheNode<T> * newChild = newNode(node);
        node->childs_[ngram[i]] = newChild;
        // go to new node
        node = newChild;
        values[len - i] = &node->value_;
      }
    }
    return len; // all possible
  }
  // Free every node below the root (the root itself survives so the
  // cache remains usable).
  bool clear() {
    std::cerr << "Clearing cache with " << static_cast<float>(cur_nodes_ * nodeSize())
              / static_cast<float>(1ull << 20) << "MB" << std::endl;
    return clearNodes(root_);
  }
  int nodes() {
    // returns number of nodes
    return cur_nodes_;
  }
  int nodeSize() {
    return sizeof(CacheNode<T>) + sizeof(root_->childs_);
  }
private:
  CacheNode<T> * root_;
  count_t cur_nodes_;
  T unknown_value_; // Used to initialise data at each node
  T null_value_; // Indicates cached something not in model
  // allocate a fresh node (the parent argument is currently unused)
  CacheNode<T>* newNode(CacheNode<T> * node = 0) {
    ++cur_nodes_;
    return new CacheNode<T>(unknown_value_);
  }
  // recursively delete all descendants of `node` (not node itself)
  bool clearNodes(CacheNode<T> * node) {
    //delete children from this node
    if(!node->childs_.empty()) {
      iterate(node->childs_, itr) {
        if(!clearNodes(itr->second))
          std::cerr << "Error emptying cache\n";
        delete itr->second;
        --cur_nodes_;
      }
      node->childs_.clear();
    }
    return true;
  }
};
} //end namespace
#endif //INC_RANDLM_CACHE_H

View File

@ -1,427 +0,0 @@
// Copyright 2008 David Talbot
//
// This file is part of RandLM
//
// RandLM is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// RandLM is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with RandLM. If not, see <http://www.gnu.org/licenses/>.
#ifndef INC_RANDLM_FILTER_H
#define INC_RANDLM_FILTER_H
#include <cassert>
#include <cmath>
#include "FileHandler.h"
#ifdef WIN32
#define log2(X) (log((double)X)/log((double)2))
#endif
namespace randlm
{
/* Class Filter wraps a contiguous array of data. Filter and its subclasses
 * implement read/write/increment functionality on arrays with arbitrary sized addresses
 * (i.e. an address may not use a full number of bytes). When converting to byte-based
 * representation we assume "unused" bits are to left.
 * E.g. if the underlying data is stored in units T = uint16 and the 'width' = 11
 * to read 'address' = 3 we extract bits at indices [33,42] (i.e. [11*3, 11*4 - 1])
 * and store in a uint16 in positions 0000 0001 111111 where the first 7 bits have
 * been masked out.
 */
template<typename T>
class Filter
{
public:
  // Allocate a zeroed filter of `addresses` slots of `width` bits each.
  Filter(uint64_t addresses, int width) : addresses_(addresses), width_(width), data_(NULL) {
    // number of bits in T
    cell_width_ = sizeof(T) << 3;
    // current implementation has following constraints
    assert(cell_width_ > 0 && cell_width_ <= 64 && cell_width_ >= width);
    // used for >> division
    log_cell_width_ = static_cast<int>(floor(log((double)cell_width_)/log((double)2) + 0.000001));
    // size of underlying data in Ts
    cells_ = ((addresses * width) + cell_width_ - 1) >> log_cell_width_;
    // instantiate underlying data
    data_ = new T[cells_];
    assert(data_ != NULL);
    assert(reset());
    // 'first_bit' marks the first bit used by 'address' (left padded with zeros).
    first_bit_ = (width % cell_width_ == 0) ? 0 : cell_width_ - (width % cell_width_);
    // mask for full cell
    full_mask_ = static_cast<T>(0xffffffffffffffffull);
    // mask for bits that make up the address
    address_mask_ = full_mask_ >> first_bit_;
  }
  // Reconstruct a filter from a stream previously written by save().
  Filter(Moses::FileHandler* fin, bool loaddata = true) : data_(NULL) {
    assert(loadHeader(fin));
    if (loaddata)
      assert(loadData(fin));
  }
  virtual ~Filter() {
    delete[] data_;
  }
  // zero the whole array
  bool reset() {
    for (uint64_t i = 0; i < cells_; ++i)
      data_[i] = 0;
    return true;
  }
  count_t size() {
    // return approx size of filter in MBs
    return cells_ * sizeof(T) >> 20;
  }
  // read / write functions
  inline bool read(uint64_t address, T* value) {
    assert(address <= addresses_);
    // copy address to 'value'
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading bits
    if (offset == 0) {
      *value = data_[data_cell] & address_mask_;
      return true;
    }
    // data address starts to left so shift it right
    if (offset < 0) {
      *value = (data_[data_cell] >> -offset) & address_mask_;
      return true;
    }
    // data address is to right so shift it left and look at one more cell to right
    *value = ((data_[data_cell] << offset)
              | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
    return true;
  }
  inline T read(uint64_t address) {
    assert(address <= addresses_);
    // return value at address
    T value = 0;
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading bits
    if (offset == 0) {
      value = data_[data_cell] & address_mask_;
    }
    // data address starts to left so shift it right
    else if (offset < 0) {
      value = (data_[data_cell] >> -offset) & address_mask_;
    }
    // data address is to right so shift it left and look at one more cell to right
    else
      value = ((data_[data_cell] << offset)
               | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
    return value;
  }
  inline bool write(uint64_t address, T value) {
    assert(address <= addresses_);
    assert(log2(value) <= width_);
    // write 'value' to address
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading zeros of value
    if (offset == 0) {
      data_[data_cell] = value | (data_[data_cell] & ~address_mask_);
      return true;
    }
    // address in data is to left so shift value left by -offset
    if (offset < 0) {
      data_[data_cell] = (value << -offset)
                         | (data_[data_cell] & ~(address_mask_ << -offset));
      return true;
    }
    // address in data is to right so shift value right by offset
    data_[data_cell] = (value >> offset) |
                       (data_[data_cell] & ~(address_mask_ >> offset));
    data_[data_cell + 1] = (value << (cell_width_ - offset)) |
                           (data_[data_cell + 1] & (full_mask_ >> offset));
    return true;
  }
  // read() variant that XORs the stored slot with `finger` -- used by
  // fingerprint-based membership structures.
  inline bool readWithFingerprint(uint64_t address, T finger, T* value) {
    // copy 'address' ^ 'finger' to 'value'
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading bits
    if (offset == 0) {
      *value = (finger ^ data_[data_cell]) & address_mask_;
      return true;
    }
    // data address starts to left so shift it right
    if (offset < 0) {
      *value = ((data_[data_cell] >> -offset) ^ finger) & address_mask_;
      return true;
    }
    // data address is to right so shift it left and look at one more cell to right
    *value = (((data_[data_cell] << offset)
               | (data_[data_cell + 1] >> (cell_width_ - offset))) ^ finger)
             & address_mask_ ;
    return true;
  }
  // write() variant storing 'value' XOR 'finger'.
  inline bool writeWithFingerprint(uint64_t address, T finger, T value) {
    // write 'value' ^ 'finger' to address
    finger &= address_mask_; // make sure fingerprint is correct size
    uint64_t data_bit = address * width_;
    uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
    // 'offset' shows how address in 'data' and 'value' align
    int offset = (data_bit % cell_width_) - first_bit_;
    // they align so just copy across masking unneeded leading zeros of value
    if (offset == 0) {
      data_[data_cell] = (finger ^ value) | (data_[data_cell] & ~address_mask_);
      return true;
    }
    // address in data is to left so shift value left by -offset
    if (offset < 0) {
      data_[data_cell] = ((finger ^ value) << -offset)
                         | (data_[data_cell] & ~(address_mask_ << -offset));
      return true;
    }
    // address in data is to right so shift value right by offset
    data_[data_cell] = ((finger ^ value) >> offset) |
                       (data_[data_cell] & ~(address_mask_ >> offset));
    data_[data_cell + 1] = ((finger ^ value) << (cell_width_ - offset)) |
                           (data_[data_cell + 1] & (full_mask_ >> offset));
    return true;
  }
  // debugging
  void printFilter(const std::string & prefix = "", uint32_t truncate = 64) {
    std::cout << prefix;
    for (uint32_t i = 0; i < cells_ && i < truncate; ++i) {
      for (int j = cell_width_ - 1; j >= 0; --j)
        if (data_[i] & (1ull << j))
          std::cout << 1;
        else
          std::cout << 0;
      std::cout << "\n";
    }
    std::cout << std::endl;
  }
  // i/o
  uint64_t getAddresses() {
    return addresses_;
  }
  int getWidth() {
    return width_;
  }
  int getCellWidth() {
    return cell_width_;
  }
  uint32_t getCells() {
    return cells_;
  }
  // Serialise header fields then the raw data array.  Large arrays are
  // written in chunks (see `jump`) to avoid huge single writes.
  virtual bool save(Moses::FileHandler* out) {
    assert(out != NULL);
    assert(out->write((char*)&cells_, sizeof(cells_)));
    assert(out->write((char*)&cell_width_, sizeof(cell_width_)));
    assert(out->write((char*)&log_cell_width_, sizeof(log_cell_width_)));
    assert(out->write((char*)&addresses_, sizeof(addresses_)));
    assert(out->write((char*)&width_, sizeof(width_)));
    assert(out->write((char*)&first_bit_, sizeof(first_bit_)));
    assert(out->write((char*)&full_mask_, sizeof(full_mask_)));
    assert(out->write((char*)&address_mask_, sizeof(address_mask_)));
    //assert(out->write((char*)data_, cells_ * sizeof(T)));
    const uint64_t jump = 524288032ul; //(uint64_t)pow(2, 29);
    if((width_ == 1) || cells_ < jump)
      assert(out->write((char*)data_, cells_ * sizeof(T)));
    else {
      uint64_t idx(0);
      while(idx + jump < cells_) {
        assert(out->write((char*)&data_[idx], jump * sizeof(T)));
        idx += jump;
      }
      assert(out->write((char*)&data_[idx], (cells_ - idx) * sizeof(T)));
    }
    return true;
  }
protected:
  bool loadHeader(Moses::FileHandler* fin) {
    assert(fin != NULL);
    assert(fin->read((char*)&cells_, sizeof(cells_)));
    assert(fin->read((char*)&cell_width_, sizeof(cell_width_)));
    assert(cell_width_ == sizeof(T) << 3); // make sure correct underlying data type
    assert(fin->read((char*)&log_cell_width_, sizeof(log_cell_width_)));
    assert(fin->read((char*)&addresses_, sizeof(addresses_)));
    assert(fin->read((char*)&width_, sizeof(width_)));
    assert(fin->read((char*)&first_bit_, sizeof(first_bit_)));
    assert(fin->read((char*)&full_mask_, sizeof(full_mask_)));
    assert(fin->read((char*)&address_mask_, sizeof(address_mask_)));
    return true;
  }
  bool loadData(Moses::FileHandler* fin) {
    // instantiate underlying array
    data_ = new T[cells_];
    assert(data_ != NULL);
    assert(fin->read((char*)data_, cells_ * sizeof(T)));
    //assert(fin->read((char*)&data_[0], ceil(float(cells_) / 2.0) * sizeof(T)));
    //assert(fin->read((char*)&data_[cells_ / 2], (cells_ / 2) * sizeof(T)));
    return true;
  }
  uint64_t cells_; // number T making up 'data_'
  int cell_width_; // bits per cell (i.e. sizeof(T) << 3)
  int log_cell_width_; // log of bits used for >> division
  uint64_t addresses_; // number of addresses in the filter
  int width_; // width in bits of each address
  int first_bit_; // position of first bit in initial byte
  T full_mask_; // all 1s
  T address_mask_; // 1s in those positions that are part of address
  T* data_; // the raw data as bytes
};
// Extension of Filter<uint8_t> with single-bit test/set/clear methods.
// Bit 'location' lives in byte (location % addresses_) >> 3, at position
// (location % addresses_) % 8 within that byte.
class BitFilter : public Filter<uint8_t>
{
public:
  BitFilter(uint64_t bits) : Filter<uint8_t>(bits, 1) {}
  BitFilter(Moses::FileHandler* fin, bool loaddata = true)
    : Filter<uint8_t>(fin, loaddata) {
    if (loaddata)
      assert(load(fin));
  }
  // TODO: overload operator[]
  // Returns true iff the bit referenced by location is set.
  virtual bool testBit(uint64_t location) {
    return data_[(location % addresses_) >> 3] & 1 << ((location % addresses_) % 8);
  }
  // Sets the bit referenced by location; always returns true.
  virtual bool setBit(uint64_t location) {
    data_[(location % addresses_) >> 3] |= 1 << ((location % addresses_) % 8);
    return true;
  }
  // Clears the bit referenced by location; always returns true.
  virtual bool clearBit(uint64_t location) {
    // BUG FIX: previous code AND-ed with (0 << n), which is always 0 and so
    // zeroed the entire byte, wiping up to seven unrelated bits.  Mask off
    // only the addressed bit instead.
    data_[(location % addresses_) >> 3] &= ~(1 << ((location % addresses_) % 8));
    return true;
  }
  // Serializes via the base class, then reports the fraction of zero bits.
  bool save(Moses::FileHandler* fout) {
    assert(Filter<uint8_t>::save(fout));
    std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;
    return true;
  }
  // Rho = fraction of ZERO bits over the first 'limit' bytes of the table
  // (the whole table when limit == 0).  1.0 means completely empty.
  float rho(uint64_t limit = 0) {
    uint64_t ones = 0;
    uint64_t range = limit > 0 ? std::min(limit,cells_) : cells_;
    for (uint64_t i = 0; i < range; ++i)
      for (int j = 0; j < 8; ++j)
        if (data_[i] & (1 << j))
          ++ones;
    return static_cast<float>((range << 3) - ones)/static_cast<float>(range << 3);
  }
protected:
  // Base class has already read the payload; just log the occupancy.
  bool load(Moses::FileHandler* fin) {
    std::cerr << "Loaded BitFilter. Rho = " << rho() << "." << std::endl;
    return true;
  }
};
/*
// ResizedBitFilter deals with resizing to save memory
// whereas other filters should expect locations to be within range
// this filter will need to resize (and possibly rehash) locations
// to fit a smaller range.
class ResizedBitFilter : public BitFilter {
public:
ResizedBitFilter(Moses::FileHandler* fin) : BitFilter(fin) {
assert(load(fin));
}
ResizedBitFilter(Moses::FileHandler* fin, uint64_t newsize) : BitFilter(newsize) {
assert(resizeFromFile(fin, newsize));
}
bool resizeFromFile(Moses::FileHandler* oldin, uint64_t newsize);
virtual bool testBit(uint64_t location) {
// test bit referenced by location
return BitFilter::testBit((location % old_addresses_) * a_ + b_);
}
virtual bool setBit(uint64_t location) {
// set bit referenced by location
return BitFilter::setBit((location % old_addresses_) * a_ + b_);
}
bool save(Moses::FileHandler* fout) {
// re-hashing parameters
assert(BitFilter::save(fout));
std::cerr << "Saved ResizedBitFilter. Rho = " << rho() << "." << std::endl;
assert(fout->write((char*)&old_addresses_, sizeof(old_addresses_)));
assert(fout->write((char*)&a_, sizeof(a_)));
return fout->write((char*)&b_, sizeof(b_));
}
protected:
bool load(Moses::FileHandler* fin) {
// re-hashing parameters
std::cerr << "Loaded ResizedBitFilter. Rho = " << rho() << "." << std::endl;
CHECK(fin->read((char*)&old_addresses_, sizeof(old_addresses_)));
CHECK(fin->read((char*)&a_, sizeof(a_)));
return fin->read((char*)&b_, sizeof(b_));
}
// member data
uint64_t old_addresses_; // size of pre-resized filter
uint64_t a_, b_; // re-hashing parameters (needed?)
};
// CountingFilter supports increment operator. Addresses
// of the filter are treated as counters that store their counts
// in big-endian format (i.e. leftmost bit is most significant).
template<typename T>
class CountingFilter : public Filter<T> {
public:
CountingFilter(uint64_t addresses, int width, bool wrap_around = true) :
Filter<T>(addresses, width), wrap_around_(wrap_around) {}
CountingFilter(Moses::FileHandler* fin) : Filter<T>(fin, true) {
CHECK(load(fin));
}
~CountingFilter() {}
// increment this address by one
inline bool increment(uint32_t address) {
uint64_t data_bit = address * this->width_; // index of first bit
uint32_t data_cell = (data_bit >> this->log_cell_width_); // % this->cells_; // index in data_
// 'offset' shows how address in 'data' and 'value' align
data_bit %= this->cell_width_;
int offset = data_bit - this->first_bit_;
// start from right incrementing and carrying if necessary
bool carry = true;
if (offset > 0) { // counter spans two cells
carry = incrementSubCell(0, offset, &this->data_[data_cell + 1]);
if (carry)
carry = incrementSubCell(data_bit, this->width_ - offset, &this->data_[data_cell]);
} else { // counter is within a single cell
carry = incrementSubCell(data_bit, this->width_, &this->data_[data_cell]);
}
// last update must not have carried
if (!carry)
return true;
// wrapped round so check whether need to reset to max count
if (!wrap_around_)
CHECK(this->write(address, this->address_mask_));
return false; // false to indicate that overflowed
}
bool save(Moses::FileHandler* fout) {
CHECK(Filter<T>::save(fout));
return fout->write((char*)&wrap_around_, sizeof(wrap_around_));
}
private:
bool load(Moses::FileHandler* fin) {
return fin->read((char*)&wrap_around_, sizeof(wrap_around_));
}
inline bool incrementSubCell(int bit, int len, T* cell) {
// increment counter consisting of bits [startbit, startbit + len - 1] rest stays unchanged
*cell = ((((*cell >> (this->cell_width_ - bit - len)) + 1)
& (this->full_mask_ >> (this->cell_width_ - len))) << (this->cell_width_ - bit - len))
| (*cell & ~(((this->full_mask_ >> (this->cell_width_ - len)) << (this->cell_width_ - bit - len))));
// indicate overflow as true
return ((*cell & (this->full_mask_ >> bit)) >> (this->cell_width_ - bit - len)) == 0;
}
bool wrap_around_; // whether to start from 0 on overflow (if not just stay at maximum count)
};
*/
}
#endif // INC_RANDLM_FILTER_H

View File

@ -1,146 +0,0 @@
/* Class modified by ADL for randlm namespace on Feb 15th, 2008.
*
* The following code declares classes to read from and write to
* file descriptore or file handles.
*
* See
* http://www.josuttis.com/cppcode
* for details and the latest version.
*
* - open:
* - integrating BUFSIZ on some systems?
* - optimized reading of multiple characters
* - stream for reading AND writing
* - i18n
*
* (C) Copyright Nicolai M. Josuttis 2001.
* Permission to copy, use, modify, sell and distribute this software
* is granted provided this copyright notice appears in all copies.
* This software is provided "as is" without express or implied
* warranty, and with no claim as to its suitability for any purpose.
*
* Version: Jul 28, 2002
* History:
* Jul 28, 2002: bugfix memcpy() => memmove()
* fdinbuf::underflow(): cast for return statements
* Aug 05, 2001: first public version
*/
#ifndef moses_DynSAInclude_fdstream_h
#define moses_DynSAInclude_fdstream_h
#include <streambuf>
// for EOF:
#include <cstdio>
// for memmove():
#include <cstring>
// low-level read and write functions
#ifdef _MSC_VER
# include <io.h>
#else
# include <unistd.h>
//extern "C" {
// int write (int fd, const char* buf, int num);
// int read (int fd, char* buf, int num);
//}
#endif
// BEGIN namespace
//namespace randlm {
/************************************************************
* fdstreambuf
* - a stream that reads on a file descriptor
************************************************************/
// Minimal std::streambuf over a raw POSIX/Win32 file descriptor, providing
// buffered reads (with a small putback area) and unbuffered writes.
class fdstreambuf : public std::streambuf
{
protected:
  int fd; // file descriptor
protected:
  /* data buffer:
   * - at most, pbSize characters in putback area plus
   * - at most, bufSize characters in ordinary read buffer
   */
  static const int pbSize = 4; // size of putback area
  static const int bufSize = 1024; // size of the data buffer
  char buffer[bufSize+pbSize]; // data buffer
public:
  /* constructor
   * - initialize file descriptor
   * - initialize empty data buffer
   * - no putback area
   * => force underflow()
   */
  fdstreambuf (int _fd) : fd(_fd) {
    setg (buffer+pbSize, // beginning of putback area
          buffer+pbSize, // read position
          buffer+pbSize); // end position
  }
protected:
  // insert new characters into the buffer
  // Refills the get area from fd when it is exhausted, preserving up to
  // pbSize previously-read characters for putback.
  virtual int_type underflow () {
#ifndef _MSC_VER
    using std::memmove;
#endif
    // is read position before end of buffer?
    if (gptr() < egptr()) {
      return traits_type::to_int_type(*gptr());
    }
    /* process size of putback area
     * - use number of characters read
     * - but at most size of putback area
     */
    int numPutback;
    numPutback = gptr() - eback();
    if (numPutback > pbSize) {
      numPutback = pbSize;
    }
    /* copy up to pbSize characters previously read into
     * the putback area
     */
    memmove (buffer+(pbSize-numPutback), gptr()-numPutback,
             numPutback);
    // read at most bufSize new characters
    int num;
    num = read (fd, buffer+pbSize, bufSize);
    if (num <= 0) {
      // ERROR or EOF
      // NOTE(review): returns plain EOF rather than traits_type::eof();
      // equivalent for char streams but not fully generic.
      return EOF;
    }
    // reset buffer pointers
    setg (buffer+(pbSize-numPutback), // beginning of putback area
          buffer+pbSize, // read position
          buffer+pbSize+num); // end of buffer
    // return next character
    return traits_type::to_int_type(*gptr());
  }
  // write one character
  // Writes are unbuffered: each character goes straight to the descriptor.
  virtual int_type overflow (int_type c) {
    if (c != EOF) {
      char z = c;
      if (write (fd, &z, 1) != 1) {
        return EOF;
      }
    }
    return c;
  }
  // write multiple characters
  virtual
  std::streamsize xsputn (const char* s,
                          std::streamsize num) {
    // Delegates directly to write(2); may report a short write to the caller.
    return write(fd,s,num);
  }
};
//} // END namespace
#endif

View File

@ -1,357 +0,0 @@
#ifndef INC_ALLHASHFUNCS_H
#define INC_ALLHASHFUNCS_H
#include <cmath>
#include "types.h"
#include "utils.h"
#include "FileHandler.h"
#include "util/exception.hh"
#include "util/random.hh"
typedef uint64_t P; // largest input range is 2^64
//! @todo ask abby2
// Abstract base of the randomized hash families below.  m_ is the output
// range (values are reduced mod m_), H_ the number of independent hash
// functions each family instantiates.
template <typename T>
class HashBase
{
protected:
  T m_; // range of hash output
  count_t H_; // number of hash functions to instantiate
  virtual void initSeeds()=0;
  virtual void freeSeeds()=0;
public:
  HashBase(float m, count_t H=1):m_((T)m), H_(H) {
    //cerr << "range = (0..." << m_ << "]" << endl;
  }
  // NOTE: virtual dispatch is suppressed during construction, so this runs
  // HashBase::load only.  Derived classes deliberately call their own load()
  // again from their file constructors (see the "already done" comments).
  HashBase(Moses::FileHandler* fin) {
    load(fin);
  }
  virtual ~HashBase() {}
  virtual T hash(const char*s, count_t h)=0; // string hashing
  virtual T hash(const wordID_t* id, const int len, count_t h)=0; // vocab mapped hashing
  count_t size() {
    return H_;
  }
  // Serializes range and function count; derived classes append their seeds.
  virtual void save(Moses::FileHandler* fout) {
    UTIL_THROW_IF2(fout == 0, "Null file handle");
    fout->write((char*)&m_, sizeof(m_));
    fout->write((char*)&H_, sizeof(H_));
  }
  virtual void load(Moses::FileHandler* fin) {
    UTIL_THROW_IF2(fin == 0, "Null file handle");
    fin->read((char*)&m_, sizeof(m_));
    fin->read((char*)&H_, sizeof(H_));
  }
};
//! @todo ask abby2
// Linear congruential family: per-position coefficients a_[h][pos], b_[h][pos]
// combine vocab IDs; supports incremental hashing via the (id, pos, prevValue)
// overload.  String hashing is intentionally unimplemented (returns 0).
template <typename T>
class UnivHash_linear: public HashBase<T>
{
public:
  UnivHash_linear(float m, count_t H, P pr):
    HashBase<T>(m, H), pr_(pr) {
    initSeeds();
  }
  UnivHash_linear(Moses::FileHandler* fin):
    HashBase<T>(fin) {
    load(fin);
  }
  ~UnivHash_linear() {
    freeSeeds();
  }
  T hash(const char* s, count_t h) {
    return 0; //not implemented
  }
  T hash(const wordID_t* id, const int len, count_t h);
  // Incremental form: extend a previously computed value by one word.
  T hash(const wordID_t id, const count_t pos,
         const T prevValue, count_t h);
  void save(Moses::FileHandler* fout);
  void load(Moses::FileHandler* fin);
private:
  T** a_, **b_; // [H_][MAX_NGRAM_ORDER] coefficient tables
  P pr_; // prime modulus (currently unused in hash(); kept for serialization)
  void initSeeds();
  void freeSeeds();
};
/** UnivHash_noPrimes:
* From Dietzfelbinger 2008
* p = input domain range = 2^l
* m = output range = 2^k
* # of hash function = 2^(l-1)
*/
// Multiply-shift family (see the comment block above): h_a(x) = (a*x mod 2^l)
// div 2^(l-k), requiring only an odd random multiplier per function.
template <typename T>
class UnivHash_noPrimes: public HashBase<T>
{
public:
  UnivHash_noPrimes(float k, float l):
    HashBase<T>(k, 100), d_(count_t((l-k))) {
    // p_ approximates 2^l; clamped to 2^l - 1 when l fills the whole of P.
    if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1;
    else p_ = (P) pow(2,l);
    initSeeds();
  }
  UnivHash_noPrimes(Moses::FileHandler* fin):
    HashBase<T>(fin) {
    load(fin);
  }
  ~UnivHash_noPrimes() {
    freeSeeds();
  }
  T hash(const char* s, count_t h);
  T hash(const wordID_t* id, const int len, count_t h);
  T hash(const P x, count_t h);
  void save(Moses::FileHandler* fout);
  void load(Moses::FileHandler* fin);
private:
  count_t d_; // l-k
  P p_, *a_; // real-valued input range, storage
  void initSeeds();
  void freeSeeds() {
    delete[] a_;
  }
};
//! @todo ask abby2
// Shift-add-XOR string hash: mixes each character into an accumulator with
// left/right shifts (l_, r_) and XOR.  Only string hashing is supported.
template <typename T>
class Hash_shiftAddXOR: public HashBase<T>
{
public:
  Hash_shiftAddXOR(float m, count_t H=5): HashBase<T>(m,H),
    l_(5), r_(2) {
    initSeeds();
  }
  ~Hash_shiftAddXOR() {
    freeSeeds();
  }
  T hash(const char* s, count_t h);
  // Vocab-ID hashing is not implemented for this scheme.
  // BUG FIX: the body used to be empty, so calling it flowed off the end of a
  // non-void function (undefined behaviour).  Return a defined value instead.
  T hash(const wordID_t* id, const int len, count_t h) {
    return T();
  }
private:
  T* v_; // random seed storage
  const unsigned short l_, r_; // left-shift bits, right-shift bits
  void initSeeds();
  void freeSeeds() {
    delete[] v_;
  }
};
//! @todo ask abby2
// Tabulation hashing: XORs precomputed random table entries selected by the
// running character sum.  Only string hashing is supported.
template <typename T>
class UnivHash_tableXOR: public HashBase<T>
{
public:
  UnivHash_tableXOR(float m, count_t H=5): HashBase<T>(m, H),
    table_(NULL), tblLen_(255*MAX_STR_LEN) {
    initSeeds();
  }
  ~UnivHash_tableXOR() {
    freeSeeds();
  }
  T hash(const char* s, count_t h);
  // Vocab-ID hashing is not implemented for this scheme.
  // BUG FIX: the body used to be empty although the function returns T —
  // undefined behaviour if ever called.  Return a defined value instead.
  T hash(const wordID_t* id, const int len, count_t h) {
    return T();
  }
private:
  T** table_; // storage for random numbers
  count_t tblLen_; // length of table
  void initSeeds();
  void freeSeeds();
};
// ShiftAddXor
// Draw one non-zero random starting value per hash function.
template <typename T>
void Hash_shiftAddXOR<T>::initSeeds()
{
  const count_t nfuncs = this->H_;
  v_ = new T[nfuncs];
  for(count_t idx = 0; idx < nfuncs; ++idx) {
    v_[idx] = util::wide_rand<T>() + 1;
  }
}
// Mixes each character of s into the per-function seed v_[h] with
// shift-add-XOR, stopping at NUL or MAX_STR_LEN characters; result mod m_.
template <typename T>
T Hash_shiftAddXOR<T>::hash(const char* s, count_t h)
{
  T value = v_[h];
  int pos(0);
  unsigned char c;
  // Note: pos is only incremented while c is non-zero (short-circuit).
  while((c = *s++) && (++pos < MAX_STR_LEN)) {
    value ^= ((value << l_) + (value >> r_) + c);
  }
  return (value % this->m_);
}
// UnivHash_tableXOR
// Builds H_ random tables of tblLen_ entries, each entry uniform in [0, m_).
template <typename T>
void UnivHash_tableXOR<T>::initSeeds()
{
  // delete any values in table
  if(table_) freeSeeds();
  // instance of new table
  table_ = new T* [this->H_];
  // fill with random values
  for(count_t j=0; j < this->H_; j++) {
    table_[j] = new T[tblLen_];
    for(count_t i=0; i < tblLen_; i++)
      table_[j][i] = util::wide_rand_excl(this->m_-1);
  }
}
// Release each per-function random table, then the table-of-tables, and
// null the pointer so initSeeds() can detect an empty state.
template <typename T>
void UnivHash_tableXOR<T>::freeSeeds()
{
  count_t row = this->H_;
  while(row-- > 0)
    delete[] table_[row];
  delete[] table_;
  table_ = NULL;
}
// Tabulation hash of s under function h: XOR of table entries indexed by the
// running character sum.  idx stays < tblLen_ because pos < MAX_STR_LEN and
// each character adds at most 255.
template <typename T>
T UnivHash_tableXOR<T>::hash(const char* s, count_t h)
{
  T value = 0;
  count_t pos = 0, idx = 0;
  unsigned char c;
  while((c = *s++) && (++pos < MAX_STR_LEN))
    value ^= table_[h][idx += c];
  UTIL_THROW_IF2(value >= this->m_, "Error");
  return value;
}
// UnivHash_noPrimes
// Draw one random odd multiplier per hash function (the multiply-shift
// scheme requires 'a' to be odd).
template <typename T>
void UnivHash_noPrimes<T>::initSeeds()
{
  a_ = new P[this->H_];
  for(T idx = 0; idx < this->H_; ++idx) {
    P seed = util::wide_rand<P>();
    if((seed & 1) == 0)
      ++seed; // a must be odd
    a_[idx] = seed;
  }
}
// Core multiply-shift step for a single integer x under function h.
template <typename T>
T UnivHash_noPrimes<T>::hash(const P x, count_t h)
{
  // h_a(x) = (ax mod 2^l) div 2^(l-k)
  T value = ((a_[h] * x) % p_) >> d_;
  return value % this->m_;
}
// Hash a vocab-ID sequence: XOR the per-position hashes, advancing the
// function index by one per word (h, h+1, ...), exactly as before.
template <typename T>
T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
                             count_t h)
{
  T acc = 0;
  for(int pos = 0; pos < len; ++pos)
    acc ^= hash((P)id[pos], h + pos);
  return acc % this->m_;
}
// String hash: XOR of per-character multiply-shift hashes, all under the
// same function index h; stops at NUL or MAX_STR_LEN characters.
template <typename T>
T UnivHash_noPrimes<T>::hash(const char* s, count_t h)
{
  T value = 0;
  int pos(0);
  unsigned char c;
  // Note: pos is only incremented while c is non-zero (short-circuit).
  while((c = *s++) && (++pos < MAX_STR_LEN)) {
    value ^= hash((P)c, h);
  }
  return value % this->m_;
}
// Serialize: base header first, then range/shift parameters, then the
// per-function multipliers — load() mirrors this order.
template <typename T>
void UnivHash_noPrimes<T>::save(Moses::FileHandler* fout)
{
  HashBase<T>::save(fout);
  fout->write((char*)&p_, sizeof(p_));
  fout->write((char*)&d_, sizeof(d_));
  for(T idx = 0; idx < this->H_; ++idx)
    fout->write((char*)&a_[idx], sizeof(a_[idx]));
}
// Deserialize parameters and multipliers written by save().  The base-class
// fields were read by HashBase's constructor; see the comment below.
template <typename T>
void UnivHash_noPrimes<T>::load(Moses::FileHandler* fin)
{
  a_ = new P[this->H_];
  // HashBase<T>::load(fin) already done in constructor
  fin->read((char*)&p_, sizeof(p_));
  fin->read((char*)&d_, sizeof(d_));
  for(T i=0; i < this->H_; i++) {
    fin->read((char*)&a_[i], sizeof(a_[i]));
  }
}
//UnivHash_linear
// Allocate and randomize the [H_][MAX_NGRAM_ORDER] coefficient tables;
// a-coefficients are kept non-zero by the +1.
template <typename T>
void UnivHash_linear<T>::initSeeds()
{
  a_ = new T*[this->H_];
  b_ = new T*[this->H_];
  for(count_t i=0; i < this->H_; i++) {
    a_[i] = new T[MAX_NGRAM_ORDER];
    b_[i] = new T[MAX_NGRAM_ORDER];
    for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
      a_[i][j] = 1 + util::wide_rand<T>();
      b_[i][j] = util::wide_rand<T>();
    }
  }
}
// Dispose of the per-function coefficient rows, then the row arrays, and
// leave both pointers null so a double free is impossible.
template <typename T>
void UnivHash_linear<T>::freeSeeds()
{
  count_t row = this->H_;
  while(row-- > 0) {
    delete[] a_[row];
    delete[] b_[row];
  }
  delete[] a_;
  delete[] b_;
  a_ = b_ = NULL;
}
// Hash a vocab-ID sequence under function h: sum of per-position linear
// terms a*id + b, reduced mod m_ at the end.
template <typename T>
inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
                                  count_t h)
{
  UTIL_THROW_IF2(h >= this->H_, "Error");
  T value = 0;
  int pos(0);
  while(pos < len) {
    value += ((a_[h][pos] * id[pos]) + b_[h][pos]);// % pr_;
    ++pos;
  }
  return value % this->m_;
}
// Incremental variant: extend prevValue (the running, un-reduced sum) by the
// linear term for word 'id' at position 'pos'.
template <typename T>
inline T UnivHash_linear<T>::hash(const wordID_t id, const count_t pos,
                                  const T prevValue, count_t h)
{
  UTIL_THROW_IF2(h >= this->H_, "Error");
  T value = prevValue + ((a_[h][pos] * id) + b_[h][pos]); // % pr_;
  return value % this->m_;
}
// Serialize: base header, prime, then a/b coefficient pairs interleaved
// per (function, position) — load() mirrors this order.
template <typename T>
void UnivHash_linear<T>::save(Moses::FileHandler* fout)
{
  // int bytes = sizeof(a_[0][0]);
  HashBase<T>::save(fout);
  fout->write((char*)&pr_, sizeof(pr_));
  for(count_t i=0; i < this->H_; i++) {
    for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
      fout->write((char*)&a_[i][j], sizeof(a_[i][j]));
      fout->write((char*)&b_[i][j], sizeof(b_[i][j]));
      //cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
      //cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
    }
  }
}
// Deserialize coefficients written by save(); allocates the tables it fills.
template <typename T>
void UnivHash_linear<T>::load(Moses::FileHandler* fin)
{
  // HashBase<T>::load(fin) already done in constructor
  fin->read((char*)&pr_, sizeof(pr_));
  a_ = new T*[this->H_];
  b_ = new T*[this->H_];
  for(count_t i=0; i < this->H_; i++) {
    a_[i] = new T[MAX_NGRAM_ORDER];
    b_[i] = new T[MAX_NGRAM_ORDER];
    for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
      fin->read((char*)&a_[i][j], sizeof(a_[i][j]));
      fin->read((char*)&b_[i][j], sizeof(b_[i][j]));
      //cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
      //cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
    }
  }
}
#endif

View File

@ -1,542 +0,0 @@
#ifndef INC_DYNAMICLM_H
#define INC_DYNAMICLM_H
#include <algorithm>
#include <vector>
#include "perfectHash.h"
#include "RandLMCache.h"
#include "types.h"
#include "vocab.h"
/*
* DynamicLM manipulates LM
*/
using randlm::BitFilter;
using randlm::Cache;
const bool strict_checks_ = false;
//! @todo ask abby2
// Online randomized language model (stupid-backoff style) layered on a
// perfect-hash count store, with auxiliary bit filters marking prefixes of
// stored events (bPrefix_) and queried/"hit" events (bHit_).
template<typename T>
class OnlineRLM: public PerfectHash<T>
{
public:
  // Fresh model: MBs/width/bucketRange/qBase configure the PerfectHash store;
  // alpha_[i] precomputes the stupid-backoff penalty i * log10(0.4).
  OnlineRLM(uint16_t MBs, int width, int bucketRange, count_t order,
            Moses::Vocab* v, float qBase = 8): PerfectHash<T>(MBs, width, bucketRange, qBase),
    vocab_(v), bAdapting_(false), order_(order), corpusSize_(0), alpha_(0) {
    UTIL_THROW_IF2(vocab_ == 0, "Vocab object not set");
    //instantiate quantizer class here
    cache_ = new randlm::Cache<float>(8888.8888, 9999.9999); // unknown_value, null_value
    alpha_ = new float[order_ + 1];
    for(count_t i = 0; i <= order_; ++i)
      alpha_[i] = i * log10(0.4);
    std::cerr << "Initialzing auxillary bit filters...\n";
    bPrefix_ = new randlm::BitFilter(this->cells_);
    bHit_ = new randlm::BitFilter(this->cells_);
  }
  // From-file constructor: load() populates vocab_, bPrefix_ and bHit_.
  // bAdapting_ is set, so the destructor owns (and deletes) vocab_.
  OnlineRLM(Moses::FileHandler* fin, count_t order):
    PerfectHash<T>(fin), bAdapting_(true), order_(order), corpusSize_(0) {
    load(fin);
    cache_ = new randlm::Cache<float>(8888.8888, 9999.9999); // unknown_value, null_value
    alpha_ = new float[order_ + 1];
    for(count_t i = 0; i <= order_; ++i)
      alpha_[i] = i * log10(0.4);
  }
  ~OnlineRLM() {
    delete[] alpha_;
    if(bAdapting_) delete vocab_; // only owned when loaded from file
    else vocab_ = NULL;
    delete cache_;
    delete bPrefix_;
    delete bHit_;
  }
  // Log10 probability of the final word of 'ngram' given its context.
  float getProb(const wordID_t* ngram, int len, const void** state);
  //float getProb2(const wordID_t* ngram, int len, const void** state);
  bool insert(const std::vector<std::string>& ngram, const int value);
  bool update(const std::vector<std::string>& ngram, const int value);
  int query(const wordID_t* IDs, const int len);
  int sbsqQuery(const std::vector<std::string>& ngram, int* len,
                bool bStrict = false);
  int sbsqQuery(const wordID_t* IDs, const int len, int* codes,
                bool bStrict = false);
  void remove(const std::vector<std::string>& ngram);
  count_t heurDelete(count_t num2del, count_t order = 5);
  uint64_t corpusSize() {
    return corpusSize_;
  }
  void corpusSize(uint64_t c) {
    corpusSize_ = c;
  }
  void clearCache() {
    if(cache_) cache_->clear();
  }
  void save(Moses::FileHandler* fout);
  void load(Moses::FileHandler* fin);
  void randDelete(int num2del);
  int countHits();
  int countPrefixes();
  int cleanUpHPD();
  void clearMarkings();
  void removeNonMarked();
  Moses::Vocab* vocab_;
protected:
  void markQueried(const uint64_t& index);
  void markQueried(hpdEntry_t& value);
  bool markPrefix(const wordID_t* IDs, const int len, bool bSet);
private:
  const void* getContext(const wordID_t* ngram, int len);
  const bool bAdapting_; // used to signal adaptation of model
  const count_t order_; // LM order
  uint64_t corpusSize_; // total training corpus size
  float* alpha_; // backoff constant
  randlm::Cache<float>* cache_;
  randlm::BitFilter* bPrefix_;
  randlm::BitFilter* bHit_;
};
// Insert an ngram (as strings) with the given count.  Marks the context as a
// prefix for higher-order events and tracks corpus size for unigrams.
template<typename T>
bool OnlineRLM<T>::insert(const std::vector<std::string>& ngram, const int value)
{
  int len = ngram.size();
  // FIX: use std::vector instead of a variable-length array (a compiler
  // extension, not standard C++) — consistent with update() below.
  std::vector<wordID_t> wrdIDs(len);
  uint64_t index(this->cells_ + 1);
  for(int i = 0; i < len; ++i)
    wrdIDs[i] = vocab_->GetWordID(ngram[i]);
  index = PerfectHash<T>::insert(&wrdIDs[0], len, value);
  if(value > 1 && len < order_)
    markPrefix(&wrdIDs[0], ngram.size(), true); // mark context
  // keep track of total items from training data minus "<s>"
  if(ngram.size() == 1 && (!bAdapting_)) // hack to not change corpusSize when adapting
    corpusSize_ += (wrdIDs[0] != vocab_->GetBOSWordID()) ? value : 0;
  if(bAdapting_ && (index < this->cells_)) // mark to keep while adapting
    markQueried(index);
  return true;
}
// Update (rather than blindly insert) an ngram's count.  To limit false
// positives, the context must already exist in the model before the event
// itself is updated; touched entries are marked as "hit".
template<typename T>
bool OnlineRLM<T>::update(const std::vector<std::string>& ngram, const int value)
{
  int len = ngram.size();
  std::vector<wordID_t> wrdIDs(len);
  uint64_t index(this->cells_ + 1);
  hpdEntry_t hpdItr;
  vocab_->MakeOpen(); // allow new words to be added to the vocab
  for(int i = 0; i < len; ++i)
    wrdIDs[i] = vocab_->GetWordID(ngram[i]);
  // if updating, minimize false positives by pre-checking if context already in model
  bool bIncluded(true);
  if(value > 1 && len < (int)order_)
    bIncluded = markPrefix(&wrdIDs[0], ngram.size(), true); // mark context
  if(bIncluded) { // if context found
    bIncluded = PerfectHash<T>::update2(&wrdIDs[0], len, value, hpdItr, index);
    if(index < this->cells_) {
      markQueried(index);
    } else if(hpdItr != this->dict_.end()) markQueried(hpdItr);
  }
  return bIncluded;
}
// Return the stored count for the exact ngram IDs[0..len-1], or 0 if absent.
// Counts stored in the high-precision dictionary may carry the hitMask_ flag
// in their top bit, which is stripped before returning.
template<typename T>
int OnlineRLM<T>::query(const wordID_t* IDs, int len)
{
  uint64_t filterIdx = 0;
  hpdEntry_t hpdItr;
  int value(0);
  value = PerfectHash<T>::query(IDs, len, hpdItr, filterIdx);
  if(value != -1) { // -1 signals "not found"
    if(hpdItr != this->dict_.end()) {
      //markQueried(hpdItr); // mark this event as "hit"
      value -= ((value & this->hitMask_) != 0) ? this->hitMask_ : 0; // check for previous hit marks
    } else {
      UTIL_THROW_IF2(filterIdx >= this->cells_,
                     "Out of bound: " << filterIdx);
      //markQueried(filterIdx);
    }
  }
  return value > 0 ? value : 0;
}
// Set (bSet) or clear the prefix bit for the context IDs[0..len-2], using a
// small local cache to avoid repeated hash queries.  Returns false if the
// context is not present (model not well-formed for this event).
template<typename T>
bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet)
{
  if(len <= 1) return true; // only do this for for ngrams with context
  static randlm::Cache<int> pfCache(-1, -1); // local prefix cache
  int code(0);
  if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) {
    hpdEntry_t hpdItr;
    uint64_t filterIndex(0);
    code = PerfectHash<T>::query(IDs, len - 1, hpdItr, filterIndex); // hash IDs[0..len-1]
    if(code == -1) { // encountered false positive in pipeline
      std::cerr << "WARNING: markPrefix(). The O-RLM is *not* well-formed.\n";
      // add all prefixes or return false;
      return false;
    }
    if(filterIndex != this->cells_ + 1) { // context lives in the filter
      UTIL_THROW_IF2(hpdItr != this->dict_.end(), "Error");
      if(bSet) bPrefix_->setBit(filterIndex); // mark index
      else bPrefix_->clearBit(filterIndex); // unset index
    } else {
      UTIL_THROW_IF2(filterIndex != this->cells_ + 1, "Error");
      //how to handle hpd prefixes?
    }
    // bound the local cache's size before recording this context
    if(pfCache.nodes() > 10000) pfCache.clear();
    pfCache.setCacheNgram(IDs, len - 1, code, NULL);
  }
  return true;
}
// Flag the filter row at 'index' as having been queried ("hit").
template<typename T>
void OnlineRLM<T>::markQueried(const uint64_t& index)
{
  bHit_->setBit(index);
  //std::cerr << "filter[" << index << "] = " << this->filter_->read(index) << std::endl;
}
// Flag a high-precision-dictionary entry as "hit" by setting the high bit of
// its stored counter (stripped again on read in query()).
template<typename T>
void OnlineRLM<T>::markQueried(hpdEntry_t& value)
{
  // set high bit of counter to indicate "hit" status
  value->second |= this->hitMask_;
}
// Remove an ngram (given as strings) from the underlying perfect-hash store.
template<typename T>
void OnlineRLM<T>::remove(const std::vector<std::string>& ngram)
{
  // FIX: use std::vector instead of a variable-length array (a compiler
  // extension, not standard C++) — consistent with update() above.
  std::vector<wordID_t> IDs(ngram.size());
  for(count_t i = 0; i < ngram.size(); ++i)
    IDs[i] = vocab_->GetWordID(ngram[i]);
  PerfectHash<T>::remove(&IDs[0], ngram.size());
}
// Heuristically delete up to num2del events: sweep the fullest buckets first
// and remove filter rows that are neither "hit" nor marked as prefixes.
// Returns the number actually deleted.  HPD deletion is still a TODO.
template<typename T>
count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order)
{
  count_t deleted = 0;
  std::cout << "Deleting " << num2del << " of order "<< order << std::endl;
  // delete from filter first
  int full = *std::max_element(this->idxTracker_, this->idxTracker_
                               + this->totBuckets_);
  for(; full > 0; --full) // delete from fullest buckets first
    for(int bk = 0; bk < this->totBuckets_; ++bk) {
      // NOTE(review): this break only leaves the bucket loop; the outer
      // 'full' loop keeps iterating (and re-breaking) once the quota is met.
      if(deleted >= num2del) break;
      if(this->idxTracker_[bk] == full) { // if full
        uint64_t first = bk * this->bucketRange_,
                 last = first + this->bucketRange_;
        for(uint64_t row = first; row < last; ++row) { // check each row
          if(!(bHit_->testBit(row) || bPrefix_->testBit(row) )) {
            if(this->filter_->read(row) != 0) {
              PerfectHash<T>::remove(row); // remove from filter
              ++deleted;
            }
          }
        }
      }
    }
  if(deleted < num2del) {
    // remove from hpd
    std::cerr << "TODO! HPD deletions\n";
  }
  std::cerr << "Total deleted = " << deleted << std::endl;
  return deleted;
}
// String-based wrapper: maps each token to its vocab ID and delegates to the
// ID-based sbsqQuery overload; 'codes' receives one count per position.
template<typename T>
int OnlineRLM<T>::sbsqQuery(const std::vector<std::string>& ngram, int* codes,
                            bool bStrict)
{
  // FIX: use std::vector instead of a variable-length array (a compiler
  // extension, not standard C++).
  std::vector<wordID_t> IDs(ngram.size());
  for(count_t i = 0; i < ngram.size(); ++i)
    IDs[i] = vocab_->GetWordID(ngram[i]);
  return sbsqQuery(&IDs[0], ngram.size(), codes, bStrict);
}
// Subsequence query: fills codes[i] with the count of the suffix
// IDs[i..len-1] (0 when absent) and returns the length of the longest
// matched suffix.  In strict mode, also verifies lower-order contexts.
template<typename T>
int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
                            bool bStrict)
{
  uint64_t filterIdx = 0;
  int val(0), fnd(0);
  hpdEntry_t hpdItr;
  for(int i = len - 1; i >= 0; --i) { // do subsequence filtering
    //if(IDs[i] == Vocab::kOOVWordID) break;
    val = PerfectHash<T>::query(&IDs[i], len - i, hpdItr, filterIdx);
    if(val != -1) { // if event found
      fnd = len - i; // increment found sequence
      if(hpdItr != this->dict_.end()) {
        val -= ((val & this->hitMask_) != 0) ? this->hitMask_ : 0; // account for previous hit marks
      }
    } else if(bStrict) {
      break;
    }
    // add to value array
    codes[i] = val > 0 ? val : 0;
  }
  // strict mode: shrink fnd until its (fnd-1)-length context is also present
  while(bStrict && (fnd > 1)) { // do checks the other way
    val = PerfectHash<T>::query(&IDs[len - fnd], fnd - 1, hpdItr, filterIdx);
    if(val != -1) break; // if anything found
    else --fnd; // else decrement found
  }
  return fnd;
}
// Stupid-backoff log10 probability of the last word of 'ngram' given its
// context.  'state' (if non-null) supplies the incoming cache context and
// receives the context of the longest matched suffix.  Results are cached.
template<typename T>
float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
                            const void** state)
{
  static const float oovprob = log10(1.0 / (static_cast<float>(vocab_->Size()) - 1));
  float logprob(0);
  const void* context = (state) ? *state : 0;
  // if full ngram and prob not in cache
  if(!cache_->checkCacheNgram(ngram, len, &logprob, &context)) {
    // get full prob and put in cache
    int num_fnd(0), den_val(0);
    // in[] keeps counts of increasing order numerator.
    // BUG FIX: previously allocated with new[] and never freed, leaking
    // len ints on every cache miss; a vector releases storage automatically.
    std::vector<int> in(len, 0);
    for(int i = len - 1; i >= 0; --i) {
      if(ngram[i] == vocab_->GetkOOVWordID()) break; // no need to query if OOV
      in[i] = query(&ngram[i], len - i);
      if(in[i] > 0) {
        num_fnd = len - i;
      } else if(strict_checks_) break;
    }
    while(num_fnd > 1) { // get lower order count
      //get sub-context of size one less than length found (exluding target)
      den_val = query(&ngram[len - num_fnd], num_fnd - 1);
      if((den_val > 0) &&
          (den_val >= in[len - num_fnd]) && (in[len - num_fnd] > 0)) {
        break;
      } else --num_fnd; // else backoff to lower ngram order
    }
    if(num_fnd == 1 && (in[len - 1] < 1)) // sanity check for unigrams
      num_fnd = 0;
    switch(num_fnd) { // find prob (need to refactor into precomputation)
    case 0: // OOV
      logprob = alpha_[len] + oovprob;
      break;
    case 1: // unigram found only
      UTIL_THROW_IF2(in[len - 1] <= 0, "Error");
      logprob = alpha_[len - 1] + (corpusSize_ > 0 ?
                                   log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_)) : 0);
      break;
    default: // relative frequency of longest found suffix, plus backoff penalty
      UTIL_THROW_IF2(den_val <= 0, "Error");
      logprob = alpha_[len - num_fnd] +
                log10(static_cast<float>(in[len - num_fnd]) / static_cast<float>(den_val));
      break;
    }
    // need unique context
    context = getContext(&ngram[len - num_fnd], num_fnd);
    // put whatever was found in cache
    cache_->setCacheNgram(ngram, len, logprob, context);
  } // end checkCache
  return logprob;
}
// Return a pointer uniquely identifying the cache node for this context —
// used as the opaque LM state handed back through getProb()'s 'state'.
template<typename T>
const void* OnlineRLM<T>::getContext(const wordID_t* ngram, int len)
{
  int dummy(0);
  // only interested in the addresses of the cache nodes.
  // BUG FIX: previous code allocated with new[] but released with free(),
  // which is undefined behaviour; a vector manages the storage correctly.
  std::vector<float*> addresses(len);
  UTIL_THROW_IF2(cache_->getCache2(ngram, len, &addresses[0], &dummy) != len,
                 "Error");
  // return address of cache node
  return (const void*)addresses[0];
}
// Remove up to num2del events by sweeping filter cells from the front and
// dropping every populated cell until the quota is reached.
template<typename T>
void OnlineRLM<T>::randDelete(int num2del)
{
  int removed = 0;
  for(uint64_t cell = 0; cell < this->cells_; cell++) {
    if(this->filter_->read(cell) != 0) {
      PerfectHash<T>::remove(cell);
      ++removed;
    }
    if(removed >= num2del) break;
  }
}
// Diagnostic: count events flagged as "hit" — set bits in bHit_ for filter
// rows, plus hpd entries whose counter carries the hitMask_ flag.
template<typename T>
int OnlineRLM<T>::countHits()
{
  int hit(0);
  for(uint64_t i = 0; i < this->cells_; ++i)
    if(bHit_->testBit(i)) ++hit;
  iterate(this->dict_, itr) // 'iterate' is a project macro over the hpd map
  if((itr->second & this->hitMask_) != 0)
    ++hit;
  std::cerr << "Hit count = " << hit << std::endl;
  return hit;
}
// Diagnostic: number of filter rows currently flagged as prefixes of
// longer stored events (hpd prefixes are not yet counted).
template<typename T>
int OnlineRLM<T>::countPrefixes()
{
  int pfxTotal = 0;
  for(uint64_t row = 0; row < this->cells_; ++row) {
    if(bPrefix_->testBit(row))
      ++pfxTotal;
  }
  //TODO::Handle hpdict prefix counts
  std::cerr << "Prefix count (in filter) = " << pfxTotal << std::endl;
  return pfxTotal;
}
// Erase hpd entries that were never hit during testing AND are higher-order
// ngrams (3+ tokens, detected by splitting the key on the "¬" separator).
// Returns the number of entries removed.
template<typename T>
int OnlineRLM<T>::cleanUpHPD()
{
  std::cerr << "HPD size before = " << this->dict_.size() << std::endl;
  std::vector<std::string> vDel, vtmp;
  iterate(this->dict_, itr) {
    if(((itr->second & this->hitMask_) == 0) && // if not hit during testing
        (Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram
      vDel.push_back(itr->first);
    }
  }
  // erase in a second pass so iteration above is not invalidated
  iterate(vDel, vitr)
  this->dict_.erase(*vitr);
  std::cerr << "HPD size after = " << this->dict_.size() << std::endl;
  return vDel.size();
}
// Reset all "hit" markings: clear the bHit_ filter and strip the hitMask_
// flag bit from every hpd counter.
template<typename T>
void OnlineRLM<T>::clearMarkings()
{
  std::cerr << "clearing all event hits\n";
  bHit_->reset();
  count_t* value(0);
  iterate(this->dict_, itr) {
    value = &itr->second;
    *value -= ((*value & this->hitMask_) != 0) ? this->hitMask_ : 0;
  }
}
// Serialize the model.  Order matters and must be mirrored by load():
// vocab, corpus size, order, prefix bits, hit bits, PerfectHash payload.
template<typename T>
void OnlineRLM<T>::save(Moses::FileHandler* fout)
{
  std::cerr << "Saving ORLM...\n";
  // save vocab
  vocab_->Save(fout);
  fout->write((char*)&corpusSize_, sizeof(corpusSize_));
  fout->write((char*)&order_, sizeof(order_));
  bPrefix_->save(fout);
  bHit_->save(fout);
  // save everything else
  PerfectHash<T>::save(fout);
  std::cerr << "Finished saving ORLM." << std::endl;
}
// Deserialize in the exact order save() wrote: vocab, corpus size, order,
// prefix bits, hit bits, PerfectHash payload.
template<typename T>
void OnlineRLM<T>::load(Moses::FileHandler* fin)
{
  std::cerr << "Loading ORLM...\n";
  // load vocab first
  vocab_ = new Moses::Vocab(fin);
  UTIL_THROW_IF2(vocab_ == 0, "Vocab object not set");
  fin->read((char*)&corpusSize_, sizeof(corpusSize_));
  std::cerr << "\tCorpus size = " << corpusSize_ << std::endl;
  // NOTE(review): order_ is declared const; reading into it through a char*
  // cast works here but relies on the ctor having set it identically.
  fin->read((char*)&order_, sizeof(order_));
  std::cerr << "\tModel order = " << order_ << std::endl;
  bPrefix_ = new randlm::BitFilter(fin);
  bHit_ = new randlm::BitFilter(fin);
  // load everything else
  PerfectHash<T>::load(fin);
}
// Prune every stored event that was neither queried ("hit") nor marked as a
// prefix of a kept event, in both the filter and the hpd.
template<typename T>
void OnlineRLM<T>::removeNonMarked()
{
  std::cerr << "deleting all unused events\n";
  int deleted(0);
  for(uint64_t i = 0; i < this->cells_; ++i) {
    if(!(bHit_->testBit(i) || bPrefix_->testBit(i))
        && (this->filter_->read(i) != 0)) {
      PerfectHash<T>::remove(i);
      ++deleted;
    }
  }
  deleted += cleanUpHPD();
  std::cerr << "total removed from ORLM = " << deleted << std::endl;
}
/*
template<typename T>
float OnlineRLM<T>::getProb2(const wordID_t* ngram, int len, const void** state) {
static const float oovprob = log10(1.0 / (static_cast<float>(vocab_->size()) - 1));
float log_prob(0);
const void* context_state(NULL);
int found;
int* denom_codes[order_];
int* num_codes[order_ + 1];
int denom_found(0);
std::cerr << "length=" << len << std::endl;
// constrain cache queries using model assumptions
int denom_len = cache_->getCache(ngram, len - 1, &denom_codes[0], &denom_found);
std::cerr << "denom_len = " << denom_len << std::endl;
int num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1,
&num_codes[0], &found);
std::cerr << "num_len= " << num_len << std::endl;
// keed reducing ngram size until both denominator and numerator are found
// allowed to leave kUnknownCode in cache because we check for this.
found = num_len; // guaranteed to be <= denom_len + 1
// still check for OOV
for (int i = len - found; i < len; ++i)
if (ngram[i] == Vocab::kOOVWordID) {
found = len - i - 1;
}
// check for relative estimator
while(found > 1) {
if(*denom_codes[found-1] == cache_unk_ &&
((*denom_codes[found-1] = query(&ngram[len-found], found-1)) == 0)) {
//!struct_->query(&ngram[len-*found], *found-1, kMainEventIdx, denom_codes[*found-1])) {
*num_codes[found] = cache_unk_;
} else {
if(*num_codes[found] != cache_unk_ ||
((*num_codes[found] = query(&ngram[len-found], found)) <= *denom_codes[found-1]))
// struct_->query(&ngram[len-*found], *found, kMainEventIdx,
// num_codes[*found], *denom_codes[*found-1]))
break;
}
--found;
}
// didn't find bigram numerator or unigram denominator
if (found == 1)
found = *num_codes[1] != cache_unk_
|| ((*num_codes[1] = query(&ngram[len - 1], 1)) != 0);
//struct_->query(&ngram[len - 1], 1, kMainEventIdx, num_codes[1]);
// ....
// return estimate applying correct backoff score (precomputed)
// store full log prob with complete ngram (even if backed off)
switch (found) {
case 0: // no observation: assign prob of 'new word' in training data
log_prob = alpha_[len] + oovprob;
//log_prob = stupid_backoff_log10_[len] + uniform_log10prob_;
break;
case 1: // unigram over whole corpus
log_prob = alpha_[len - 1] +
log10(static_cast<float>(*num_codes[1]) / static_cast<float>(corpusSize_));
//log_prob = log_quantiser_->getLog10Value(*num_codes[1]) - corpus_size_log10_
// + stupid_backoff_log10_[len - 1]; // precomputed
break;
default: // otherwise use both statistics and (possibly zero) backoff weight
log_prob = alpha_[len - found] +
log10(static_cast<float>(*num_codes[found]) / static_cast<float>(*denom_codes[found-1]));
//log_prob = log_quantiser_->getLog10Value(*num_codes[*found ])
// - log_quantiser_->getLog10Value(*denom_codes[*found - 1])
// + stupid_backoff_log10_[len - *found];
}
context_state = (const void*)num_codes[found == len ? found - 1 : found];;
//probCache_->store(len, log_prob, context_state);
if (state)
*state = context_state;
return log_prob;
}
*/
#endif

View File

@ -1,241 +0,0 @@
#include "params.h"
namespace Moses
{
// parameter constants
// Sentinel returned when a parameter name is unknown or its value unset.
const std::string Parameters::kNotSetValue = "__NOT_SET__";
// Type tags stored in ParamDefs::type.
const int Parameters::kBoolValue = 0;
const int Parameters::kIntValue = 1;
const int Parameters::kFloatValue = 2;
const int Parameters::kStringValue = 3;
const int Parameters::kUndefinedValue = -1;
// Canonical string encodings for boolean parameter values.
const std::string Parameters::kTrueValue = "1";
const std::string Parameters::kFalseValue = "0";
// Builds the parameter table from an array of definitions (defaults only,
// no command line processed).
Parameters::Parameters(const ParamDefs * paramdefs, const count_t paramNum)
{
  initialize(paramdefs, paramNum);
}
// Builds the parameter table from definitions, then overrides defaults with
// values parsed from the command line (see loadParams).
Parameters::Parameters(int argc, char ** argv, const ParamDefs * paramdefs,
                       const count_t paramNum)
{
  initialize(paramdefs, paramNum);
  loadParams(argc, argv);
}
// Copies each definition into the name-indexed map and echoes the default
// values to stderr.
void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum)
{
  for( count_t i = 0; i < paramNum; i++ ) {
    params_[paramdefs[i].name] = paramdefs[i]; // assign name
  }
  std::cerr << "Default parameter values:\n";
  iterate(params_, itr)
  std::cerr << "\t" << itr->first << " --> " << itr->second.value << std::endl;
}
// Parses command-line arguments into the parameter table.  Every option must
// begin with '-'; boolean options take no value, all others consume the next
// argv entry.  If "-config FILE" appears, FILE is loaded afterwards without
// overriding names already set on the command line (tracked in setParams).
// Returns false on a malformed option or a rejected name/value pair; exits
// the process outright on an unknown option name.
bool Parameters::loadParams(int argc, char ** argv)
{
  // load params from commandline args
  //if( argc < 3 ) {
  //  fprintf(stderr, "ERROR: No parameters. Use \"-config\" or \"-f\" to specify configuration file.\n");
  //  return false;
  //}
  bool load_from_file = false;
  std::set<std::string> setParams;
  int jumpBy = 0; // 1 for bool flags, 2 for name+value pairs
  for( int i = 1; i < argc; i += jumpBy ) {
    std::string param = argv[i];
    if(param[0] != '-') {
      std::cerr << "Unknown parameter: " << param << std::endl;
      return false;
    }
    Utils::ltrim(param, "- ");
    // normalise parameter to long name
    param = normaliseParamName(param);
    // check if valid param name
    if(!isValidParamName(param)) {
      std::cerr << "Unknown param option \"" << param << "\"\n";
      exit(EXIT_FAILURE);
    }
    setParams.insert(param); // needed to not overwrite param value if file is specified
    //if the parameter is of type booL no corresponding value
    if( getValueType(param) == kBoolValue ) {
      jumpBy = 1;
      UTIL_THROW_IF2(!setParamValue(param, kTrueValue),
                     "Couldn't set parameter " << param);
    } else { //not of type bool so must have corresponding value
      UTIL_THROW_IF2(i+1 >= argc,
                     "Out of bound error: " << i+1);
      jumpBy = 2;
      std::string val = argv[i+1];
      Utils::trim(val);
      if( param == "config" )
        load_from_file = true;
      if(!setParamValue(param, val)) {
        std::cerr << "Invalid Param name->value " << param << "->" << val << std::endl;
        return false;
      }
    }
  }
  bool success = true;
  // load from file if specified
  if (load_from_file)
    success = loadParams(getParamValue("config"), setParams);
  return success;
}
// Maps a recognised abbreviation to its long parameter name; anything else
// (including names that are already long) is returned unchanged.
std::string Parameters::normaliseParamName(const std::string & name)
{
  // Map valid abbreviations to long names. Retain other names.
  if( params_.find(name) == params_.end() )
    iterate(params_, i)
    if( i->second.abbrev == name )
      return i->first;
  return name;
}
int Parameters::getValueType(const std::string& name)
{
if(params_.find(name) != params_.end())
return params_[name].type;
return Parameters::kUndefinedValue;
}
// A name is valid iff it appears in the definitions map.
bool Parameters::isValidParamName(const std::string & name)
{
  return params_.count(name) != 0;
}
// Assigns a value to a known parameter, logging the assignment to stderr.
// Returns false (without side effects) for unknown names.
bool Parameters::setParamValue(const std::string& name, const std::string& val)
{
  // TODO: Add basic type checking w verifyValueType()
  if (!isValidParamName(name))
    return false;
  params_[name].value = val;
  std::cerr << "PARAM SET: "<< name << "=" << val << std::endl;
  return true;
}
std::string Parameters::getParamValue(const std::string& name)
{
std::string value = Parameters::kNotSetValue;
if(isValidParamName(name)) {
if(params_.find(name) != params_.end())
value = params_[name].value;
else if(getValueType(name) == kBoolValue)
value = kFalseValue;
}
return value;
}
// Currently a plain alias for getParamValue().  The commented-out variant
// below returned a void* cast to the declared type and is kept only for
// reference (it also returned pointers to locals, which is why it is dead).
std::string Parameters::getParam(const std::string& name)
{
  return getParamValue(name);
  /*void* Parameters::getParam(const std::string& name) {
  void* paramVal = 0;
  int type = getValueType(name);
  const char* sval = getParamValue(name).c_str();
  switch(type) {
  case kIntValue: {
    int ival = atoi(sval);
    paramVal = (void*)&ival;
    break;
  }
  case kFloatValue: {
    float fval = atof(sval);
    paramVal = (void*)&fval;
    break;
  }
  case kStringValue: {
    paramVal = (void*)sval;
    break;
  }
  case kBoolValue: {
    bool bval = sval == Parameters::kTrueValue ? true : false;
    paramVal = (void*)&bval;
    break;
  }
  default: // --> Parameters::kUndefinedValue
    paramVal = (void*)sval; // will set to Parameters::kNotSetValue
  }
  return paramVal;*/
}
// Stub: always accepts.  Intended to validate 'val' against the declared
// type of 'name' (see the TODO in setParamValue).
bool Parameters::verifyValueType(const std::string& name, const std::string& val)
{
  // Implement basic type checking
  return true;
}
// Number of known parameter definitions.
int Parameters::getParamCount() const
{
  return static_cast<int>(params_.size());
}
/*
 * HAVE TO CHANGE loadParams() from file to not overwrite command lines but
 * override default if different*/
// Stub: the config-file loader is disabled (its body is commented out below)
// and always reports success.  setParams lists names already fixed on the
// command line, which a file must not override once re-enabled.
bool Parameters::loadParams(const std::string & file_path,
                            std::set<std::string>& setParams)
{
  // parameters loaded from file don't override cmd line paramters
  /*std::set<std::string>::iterator end = setParams.end();
  FileHandler file(file_path.c_str(), std::ios::in);
  std::string line, param;
  while ( getline(file, line) ) {
    Utils::trim(line);
    //ignore comments (lines beginning with '#') and empty lines
    if( line[0] == '#' || line.empty() ) continue;
    if( line[0] == '[' ) {
      Utils::trim(line, "-[]"); //remove brackets
      // normalise parameter names
      param = normaliseParamName(line);
      //handle boolean type parameters
      if(getValueType(param) == kBoolValue && setParams.find(param) == end)
        setParamValue(param, kTrueValue);
    } else {
      // TODO: verify that this works as intended
      if(setParams.find(param) == end) { // if param hasn't already been set in cmd line
        if(!setParamValue(param, line)) {
          std::cerr << "Invalid Param name->value " << param << "->" << line << std::endl;
          return false;
        }
      }
    }
  }*/
  return true;
}
/*
int Parameters::getCSVParams(const std::string & name, std::vector<std::string> & values) {
// get param values(s) may be more than one separated by commas
values.clear();
if( isValidParamName(name) )
if( params_.find(name) != params_.end() )
return Utils::tokenizeToStr(params_.find(name)->second.c_str(), values, ",");
return 0;
}
bool Parameters::checkParamIsSet(const std::string & name) {
// Returns true for non-bool parameter that is set to anything.
// Returns true for bool parameter only if set to true.
if (getValueType(name) == kBoolValue) // boolean value so check whether true
return getParamValue(name) == kTrueValue;
return (getParamValue(name) != kNotSetValue);
}
bool Parameters::printHelp(const std::string & name) {
return true;
}
bool Parameters::printParams() {
// print out parameters and values
std::map<std::string, std::string>::iterator it;
std::cerr << "User defined parameter settings:\n";
for (it = params_.begin(); it != params_.end(); ++it)
std::cerr << "\t" << it->first << "\t" << it->second << "\n";
return true;
}
*/
}

View File

@ -1,64 +0,0 @@
#ifndef moses_DynSAInclude_params_h
#define moses_DynSAInclude_params_h
#include <iostream>
#include <map>
#include <set>
#include <vector>
#include "FileHandler.h"
#include "utils.h"
#include "types.h"
#define NumOfParams(paramArray) (sizeof(paramArray)/sizeof((paramArray)[0]))
namespace Moses
{
// One parameter definition: canonical long name, current/default value,
// short command-line abbreviation, type tag (one of Parameters::k*Value)
// and human-readable help text.
typedef struct ParamDefs {
  std::string name;
  std::string value;
  std::string abbrev;
  int type;
  std::string description;
} ParamDefs;
//! @todo ask abby2
// Generic command-line / config-file parameter handler.  Definitions are
// supplied as an array of ParamDefs; values may then be set from argv or a
// config file and queried by long name.
class Parameters
{
public:
  static const std::string kNotSetValue; // sentinel for unknown/unset names
  static const int kBoolValue;           // type tags for ParamDefs::type ...
  static const int kIntValue;
  static const int kFloatValue;
  static const int kStringValue;
  static const int kUndefinedValue;      // ... returned for unknown names
  static const std::string kFalseValue;  // canonical bool encodings
  static const std::string kTrueValue;
  Parameters(const ParamDefs * paramdefs, const count_t paramNum);
  Parameters(int argc, char** argv, const ParamDefs * paramdefs, const count_t paramNum);
  ~Parameters() {}
  // Parse argv; bool options take no value, others consume the next token.
  bool loadParams(int argc, char ** argv);
  // Load from a config file; names in the set (already set on the command
  // line) must not be overridden.  Currently a stub.
  bool loadParams(const std::string& param_file, std::set<std::string>&);
  int getValueType(const std::string & name);
  bool setParamValue(const std::string& name, const std::string& value);
  bool verifyValueType(const std::string& name, const std::string& value);
  bool isValidParamName(const std::string & name);
  std::string getParamValue(const std::string& name);
  //void* getParam(const std::string& name);
  std::string getParam(const std::string& name);
  int getParamCount() const;
  /*
  int getCSVParams(const std::string & name, std::vector<std::string> & values);
  bool checkParamIsSet(const std::string& name);
  bool printParams();
  bool printHelp(const std::string & name);
  */
private:
  // Map a registered abbreviation to its long name (identity otherwise).
  std::string normaliseParamName(const std::string &name);
  void initialize(const ParamDefs * paramdefs, const count_t paramNum);
  std::map<std::string, ParamDefs > params_; // name->value,type,abbrev,desc
};
}
#endif //INC_PARAMS.H

View File

@ -1,437 +0,0 @@
/* NO OVERLAY VALUES STORED IN SEPERATE FILTER */
#ifndef INC_PERFECTHASH_H
#define INC_PERFECTHASH_H
#include <map>
#include <stdint.h>
#include "hash.h"
#include "RandLMFilter.h"
#include "quantizer.h"
/**
* PerfectHash handles setting up hash functions and storage
* for LM data.
*/
using randlm::Filter;
using randlm::BitFilter;
// Exact-count overflow dictionary: stringified n-gram key -> count.
typedef std::map<std::string, count_t> hpDict_t;
typedef hpDict_t::iterator hpdEntry_t;
// File-wide tally of fingerprint collisions detected during insert().
static count_t collisions_ = 0;
/* Based on Mortenson et. al. 2006 */
/**
 * Bucketed fingerprint hash for n-gram counts.  Each n-gram is mapped to a
 * bucket of rows; a row stores a non-zero fingerprint plus a log-quantized
 * count.  N-grams whose bucket is full, or whose fingerprint collides, are
 * stored exactly in the high-performance dictionary (dict_).
 */
template<typename T>
class PerfectHash
{
public:
  PerfectHash(uint16_t MBs, int width, int bucketRange, float qBase);
  // Stream constructor: callers are expected to invoke load() afterwards.
  PerfectHash(Moses::FileHandler* fin) {
    UTIL_THROW_IF2(fin == 0, "Invalid file handle");
  }
  virtual ~PerfectHash();
  void analyze();
  count_t hpDictMemUse();
  count_t bucketsMemUse();
protected:
  Filter<T>* filter_;      // fingerprint cells
  Filter<T>* values_;      // quantized counts, parallel to filter_
  hpDict_t dict_;          // exact-storage overflow dictionary
  uint64_t cells_;         // total rows across all buckets
  count_t hitMask_;        // bit OR-ed into values to mark queried events
  int totBuckets_;
  uint8_t bucketRange_;    // rows per bucket
  uint8_t* idxTracker_;    // occupancy count per bucket
  uint64_t insert(const wordID_t* IDs, const int len, const count_t value);
  bool update(const wordID_t* IDs, const int len, const count_t value,
              hpdEntry_t& hpdAddr, uint64_t& filterIdx);
  bool update2(const wordID_t* IDs, const int len, const count_t value,
               hpdEntry_t& hpdAddr, uint64_t& filterIdx);
  int query(const wordID_t* IDs, const int len,
            hpdEntry_t& hpdAddr, uint64_t& filterIdx);
  virtual void remove(const wordID_t* IDs, const int len);
  void remove(uint64_t index);
  void save(Moses::FileHandler* fout);
  void load(Moses::FileHandler* fin);
  virtual void markQueried(const uint64_t&)=0;
  //pointer to a specific entry in a hpDict_t
  virtual void markQueried(hpdEntry_t&)=0;
private:
  T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket);
  std::string hpDictKeyValue(const wordID_t* IDs, const int len);
  uint64_t memBound_; // total memory bound in bytes
  uint16_t cellWidth_; // in bits
  UnivHash_linear<count_t>* bucketHash_;  // n-gram -> bucket index
  UnivHash_linear<T>* fingerHash_;        // n-gram -> row fingerprint
  LogQtizer* qtizer_;                     // count <-> small code mapping
};
template<typename T>
// Sizes the structure to fit within 'MBs' megabytes: splits the bit budget
// between fingerprint cells ('width' bits each) and quantized value cells,
// rounds the cell count toward a multiple of the bucket size, then allocates
// the two filters, the per-bucket occupancy tracker and the hash functions.
PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
                            float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)),
  cellWidth_(width)
{
  bucketRange_ = static_cast<uint8_t>(bucketRange);
  if(bucketRange > 255) {
    std::cerr << "ERROR: Max bucket range is > 2^8\n";
    exit(1);
  }
  qtizer_ = new LogQtizer(qBase);
  // bits needed to store the largest quantizer code
  int valBits = (int)ceil(log2((float)qtizer_->maxcode()));
  std::cerr << "BITS FOR VALUES ARRAY = " << valBits << std::endl;
  uint64_t totalBits = memBound_ << 3; // bytes -> bits
  cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells
  // NOTE(review): adding the remainder does not generally yield an exact
  // multiple of bucketRange_ (that would need bucketRange_ - remainder);
  // verify against the original randlm layout before relying on it.
  cells_ += (cells_ % bucketRange_); // make cells multiple of bucket range
  totBuckets_ = (cells_ / bucketRange_) - 1; // minus 1 so totBuckets * bucksize + bucksize = cells
  filter_ = new Filter<T>(cells_, cellWidth_);
  values_ = new Filter<T>(cells_, valBits);
  idxTracker_ = new uint8_t[totBuckets_];
  for(int i=0; i < totBuckets_; ++i) idxTracker_[i] = 0;
  // initialize ranges for each hash function
  bucketHash_ = new UnivHash_linear<count_t>(totBuckets_, 1, PRIME);
  fingerHash_ = new UnivHash_linear<T>(pow(2.0f, cellWidth_), MAX_HASH_FUNCS, PRIME);
}
template<typename T>
// Releases every heap-allocated component of the structure.
PerfectHash<T>::~PerfectHash()
{
  delete qtizer_;
  delete bucketHash_;
  delete fingerHash_;
  delete values_;
  delete filter_;
  filter_ = NULL;
  delete[] idxTracker_;
}
template<typename T>
// Inserts an n-gram with the given count.  Hashes to a bucket, then scans the
// bucket's rows: a matching fingerprint counts as a collision and diverts the
// n-gram to the exact dictionary; otherwise the first empty row receives the
// fingerprint and quantized count.  Returns the filter index used, or
// cells_ + 1 when the n-gram went to the dictionary instead.
uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
                                const count_t value)
{
  count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
  if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows
    // restriction on fprint value is non-zero
    T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
    uint64_t emptyidx = cells_ + 1; // sentinel: "no empty row seen yet"
    uint64_t index = bucket * bucketRange_,  // starting bucket row
             lastrow = index + bucketRange_; // ending row
    while(index < lastrow) { // unique so check each row for "matching" signature
      T filterVal = filter_->read(index);
      if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row
        emptyidx = index;
      } else if(filterVal == fp) {
        ++collisions_;
        dict_[hpDictKeyValue(IDs, len)] = value; // store exact in hpd
        return cells_ + 1; // finished
      }
      ++index;
    }
    UTIL_THROW_IF2((emptyidx >= index) || (filter_->read(emptyidx) != 0), "Error"); // should have found empty index if it gets here
    T code = (T)qtizer_->code(value);
    filter_->write(emptyidx, fp); // insert the fprint
    values_->write(emptyidx, code);
    ++idxTracker_[bucket]; // keep track of bucket size
    return emptyidx;
  } else { // bucket is full
    dict_[hpDictKeyValue(IDs, len)] = value; // add to hpd
    return cells_ + 1;
  }
}
template<typename T>
// Overwrites the stored count of an existing n-gram (dictionary first, then
// the filter row with a matching fingerprint).  On success returns true and
// reports where the entry lives via hpdAddr (dictionary iterator) or
// filterIdx (filter row; cells_ + 1 means "in the dictionary").  Returns
// false if the n-gram is not present anywhere; it is NOT inserted.
bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
                            const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx)
{
  // check if key is in high perf. dictionary
  filterIdx = cells_ + 1;
  std::string skey = hpDictKeyValue(IDs, len);
  if((hpdAddr = dict_.find(skey)) != dict_.end()) {
    hpdAddr->second = value;
    return true;
  }
  // else hash ngram
  //count_t bucket = bucketHash_->hash(IDs, len);
  count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
  // restriction on fprint value is non-zero
  T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
  uint64_t index = bucket * bucketRange_,  // starting bucket row
           lastrow = index + bucketRange_;
  while(index < lastrow) { // must check each row for matching fp event
    T filterVal = filter_->read(index);
    if(filterVal == fp) { // found event w.h.p.
      values_->write(index, (T)qtizer_->code(value));
      filterIdx = index;
      return true;
    }
    ++index;
  }
  // could add if it gets here.
  return false;
}
template<typename T>
// Looks up an n-gram's count: exact dictionary first, then the filter bucket.
// On a hit, hpdAddr/filterIdx describe where the entry lives (dictionary hit
// sets filterIdx = cells_ + 1; filter hit sets hpdAddr = dict_.end()).
// Returns the (de-quantized) count, or -1 when the n-gram is absent.
int PerfectHash<T>::query(const wordID_t* IDs, const int len,
                          hpdEntry_t& hpdAddr, uint64_t& filterIdx)
{
  // check if key is in high perf. dictionary
  std::string skey = hpDictKeyValue(IDs, len);
  if((hpdAddr = dict_.find(skey)) != dict_.end()) {
    filterIdx = cells_ + 1;
    return(hpdAddr->second); // returns copy of value
  } else { // check if key is in filter
    // get bucket
    //count_t bucket = bucketHash_->hash(IDs, len);
    count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
    // restriction on fprint value is non-zero
    T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
    // return value if ngram is in filter
    uint64_t index = bucket * bucketRange_,
             lastrow = index + bucketRange_;
    for(; index < lastrow; ++index) {
      if(filter_->read(index) == fp) {
        //cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" <<
        //filter_->read(index) << "\tcode = " << code << std::endl;
        filterIdx = index;
        hpdAddr = dict_.end();
        return (int)qtizer_->value(values_->read(index));
      }
    }
  }
  return -1;
}
template<typename T>
// Deletes an n-gram: erased from the exact dictionary if present there,
// otherwise the first filter row with a matching fingerprint is zeroed and
// the bucket occupancy counter decremented.  Silently a no-op if absent.
void PerfectHash<T>::remove(const wordID_t* IDs, const int len)
{
  // delete key if in high perf. dictionary
  std::string skey = hpDictKeyValue(IDs, len);
  if(dict_.find(skey) != dict_.end())
    dict_.erase(skey);
  else { // check if key is in filter
    // get small representation for ngrams
    //count_t bucket = bucketHash_->hash(IDs, len);
    count_t bucket = (bucketHash_->size() > 1? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
    // retrieve non zero fingerprint for ngram
    T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
    // return value if ngram is in filter
    uint64_t index = bucket * bucketRange_,
             lastrow = index + bucketRange_;
    for(; index < lastrow; ++index) {
      if(filter_->read(index) == fp) {
        filter_->write(index, 0);
        values_->write(index, 0);
        --idxTracker_[bucket]; // track bucket size reduction
        break;
      }
    }
  }
}
template<typename T> // clear filter index
// Zeroes an occupied filter row directly by index and decrements the
// occupancy counter of the bucket that owns the row.
void PerfectHash<T>::remove(uint64_t index)
{
  UTIL_THROW_IF2(index >= cells_, "Out of bound: " << index);
  UTIL_THROW_IF2(filter_->read(index) == 0, "Error"); // slow
  filter_->write(index, 0);
  values_->write(index, 0);
  //reduce bucket size
  count_t bucket = index / bucketRange_;
  --idxTracker_[bucket];
}
template<typename T>
// Produces a non-zero fingerprint for an n-gram (zero means "empty row").
// Tries successive hash functions starting at 'bucket', wrapping around,
// until one yields a non-zero value or all functions have been tried.
T PerfectHash<T>::nonZeroSignature(const wordID_t* IDs, const int len,
                                   count_t bucket)
{
  count_t h = bucket;
  T fingerprint(0);
  do {
    fingerprint = fingerHash_->hash(IDs, len, h);
    h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around
  } while((fingerprint == 0) && (h != bucket));
  if(fingerprint == 0)
    std::cerr << "WARNING: Unable to find non-zero signature for ngram\n" << std::endl;
  return fingerprint;
}
template<typename T>
// Builds the exact-dictionary key for an n-gram: its word IDs rendered as
// decimal strings joined by the '¬' separator (trailing separator kept).
std::string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len)
{
  std::string skey(" ");
  for(int i = 0; i < len; ++i)
    skey += Utils::IntToStr(IDs[i]) + "¬";
  Utils::trim(skey); // drop the leading space
  return skey;
}
template<typename T>
// Approximate memory used by the exact dictionary, in MB.
// NOTE(review): counts only sizeof(value_type) per entry; ignores map node
// overhead and the heap storage behind the std::string keys.
count_t PerfectHash<T>::hpDictMemUse()
{
  // return hpDict memory usage in MBs
  return (count_t) sizeof(hpDict_t::value_type)* dict_.size() >> 20;
}
template<typename T>
// Memory used by the fingerprint and value filters combined.
// NOTE(review): presumably Filter::size() already reports MB (to match the
// "MBs" comment) — confirm against the randlm Filter implementation.
count_t PerfectHash<T>::bucketsMemUse()
{
  // return bucket memory usage in MBs
  return (count_t) (filter_->size() + values_->size());
}
template<typename T>
// Serializes all state: scalar parameters, occupancy tracker, quantizer,
// hash functions, both filters, and finally the exact dictionary as
// "key\tvalue" text lines preceded by its entry count.  load() mirrors
// this order exactly.
void PerfectHash<T>::save(Moses::FileHandler* fout)
{
  UTIL_THROW_IF2(fout == 0, "Invalid file handle");
  std::cerr << "\tSaving perfect hash parameters...\n";
  fout->write((char*)&hitMask_, sizeof(hitMask_));
  fout->write((char*)&memBound_, sizeof(memBound_));
  fout->write((char*)&cellWidth_, sizeof(cellWidth_));
  fout->write((char*)&cells_, sizeof(cells_));
  fout->write((char*)&totBuckets_, sizeof(totBuckets_));
  fout->write((char*)&bucketRange_, sizeof(bucketRange_));
  fout->write((char*)idxTracker_, totBuckets_ * sizeof(idxTracker_[0]));
  qtizer_->save(fout);
  std::cerr << "\tSaving hash functions...\n";
  fingerHash_->save(fout);
  bucketHash_->save(fout);
  std::cerr << "\tSaving bit filter...\n";
  filter_->save(fout);
  values_->save(fout);
  std::cerr << "\tSaving high performance dictionary...\n";
  count_t size = dict_.size();
  fout->write((char*)&size, sizeof(count_t));
  *fout << std::endl; // newline so the text section starts on its own line
  iterate(dict_, t)
  *fout << t->first << "\t" << t->second << "\n";
}
template<typename T>
// Deserializes state written by save(), allocating the tracker, quantizer,
// hash functions and filters, then re-reading the exact dictionary from its
// text section.
void PerfectHash<T>::load(Moses::FileHandler* fin)
{
  UTIL_THROW_IF2(fin == 0, "Invalid file handle");
  std::cerr << "\tLoading perfect hash parameters...\n";
  fin->read((char*)&hitMask_, sizeof(hitMask_));
  fin->read((char*)&memBound_, sizeof(memBound_));
  fin->read((char*)&cellWidth_, sizeof(cellWidth_));
  fin->read((char*)&cells_, sizeof(cells_));
  fin->read((char*)&totBuckets_, sizeof(totBuckets_));
  fin->read((char*)&bucketRange_, sizeof(bucketRange_));
  idxTracker_ = new uint8_t[totBuckets_];
  fin->read((char*)idxTracker_, totBuckets_ * sizeof(idxTracker_[0]));
  qtizer_ = new LogQtizer(fin);
  std::cerr << "\tLoading hash functions...\n";
  fingerHash_ = new UnivHash_linear<T>(fin);
  bucketHash_ = new UnivHash_linear<count_t>(fin);
  std::cerr << "\tLoading bit filter...\n";
  filter_ = new Filter<T>(fin);
  values_ = new Filter<T>(fin);
  std::cerr << "\tLoading HPD...\n";
  count_t size = 0;
  fin->read((char*)&size, sizeof(count_t));
  fin->ignore(256, '\n'); // skip the newline save() emitted after the count
  std::string line;
  hpDict_t::key_type key;
  hpDict_t::mapped_type val;
  for(count_t i=0; i < size; ++i) {
    getline(*fin, line);
    Utils::trim(line);
    std::istringstream ss(line.c_str());
    // comma operator: reads key then val from the "key\tvalue" line
    ss >> key, ss >> val;
    dict_[key] = val;
  }
  std::cerr << "\tHPD size=" << dict_.size() << std::endl;
  std::cerr << "Finished loading ORLM." << std::endl;
}
template<typename T>
// Diagnostic dump: recounts bucket occupancy straight from the filter,
// cross-checks it against idxTracker_, and prints size/occupancy/collision
// statistics to stderr.  Read-only apart from the temporary count array.
void PerfectHash<T>::analyze()
{
  std::cerr << "Analyzing Dynamic Bloomier Filter...\n";
  // see how many items in each bucket
  uint8_t* bucketCnt = new uint8_t[totBuckets_];
  unsigned largestBucket = 0, totalCellsSet = 0,
           smallestBucket = bucketRange_, totalZeroes = 0;
  int curBucket = -1, fullBuckets(0);
  for(int i = 0; i < totBuckets_; ++i) bucketCnt[i] = 0;
  for(uint64_t i =0; i < cells_; ++i) {
    if(i % bucketRange_ == 0) ++curBucket;
    if(filter_->read(i) != 0) {
      ++bucketCnt[curBucket];
      ++totalCellsSet;
    } else ++totalZeroes;
  }
  // find the fullest and emptiest buckets
  count_t bi = 0, si = 0;
  for(int i = 0; i < totBuckets_; ++i) {
    if(bucketCnt[i] > largestBucket) {
      largestBucket = bucketCnt[i];
      bi = i;
    } else if(bucketCnt[i] < smallestBucket) {
      smallestBucket = bucketCnt[i];
      si = i;
    }
  }
  count_t trackerCells(0);
  for(int i = 0; i < totBuckets_; i++) {
    trackerCells += idxTracker_[i];
    if(idxTracker_[i] == bucketRange_)
      ++fullBuckets;
  }
  // report any divergence between recount and the incremental tracker
  for(int i = 0; i < totBuckets_; ++i) {
    if(bucketCnt[i] != idxTracker_[i])
      std::cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] <<
                "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << std::endl;
  }
  std::cerr << "total cells= " << cells_ << std::endl;
  std::cerr << "total buckets= " << totBuckets_ << std::endl;
  std::cerr << "bucket range= " << (int)bucketRange_ << std::endl;
  std::cerr << "fingerprint bits= " << cellWidth_ << std::endl;
  std::cerr << "total cells set= " << totalCellsSet;
  std::cerr << " (idxTracker set = " << trackerCells << ")" << std::endl;
  std::cerr << "total zeroes=" << totalZeroes;
  std::cerr << " (idxTracker zeros = " << cells_ - trackerCells << ")" << std::endl;
  std::cerr << "largest bucket (" << bi << ") size= " << largestBucket << std::endl;
  std::cerr << "smallest bucket (" << si << ") size= " << smallestBucket << std::endl;
  std::cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] <<
            " (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << std::endl;
  std::cerr << "total buckets full = " << fullBuckets << std::endl;
  std::cerr << "total collision errors= " << collisions_ << std::endl;
  std::cerr << "high performance dictionary size= " << dict_.size() << std::endl;
  std::cerr << "high performance dictionary MBs= " << hpDictMemUse() << std::endl;
  std::cerr << "filter MBs= " << filter_->size() << std::endl;
  std::cerr << "values MBs= " << values_->size() << std::endl;
  delete[] bucketCnt;
}
template<typename T>
// Like update(), but ADDS 'value' to the stored count instead of replacing
// it, and inserts the n-gram (via insert()) when it is not found anywhere.
// Returns true when an existing entry was incremented, false when a fresh
// insert was performed.
bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
                             const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx)
{
  // check if key is in high perf. dictionary
  filterIdx = cells_ + 1;
  std::string skey = hpDictKeyValue(IDs, len);
  if((hpdAddr = dict_.find(skey)) != dict_.end()) {
    hpdAddr->second += value;
    return true;
  }
  // else hash ngram
  //count_t bucket = bucketHash_->hash(IDs, len);
  count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
  // restriction on fprint value is non-zero
  T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
  uint64_t index = bucket * bucketRange_,  // starting bucket row
           lastrow = index + bucketRange_;
  while(index < lastrow) { // must check each row for matching fp event
    T filterVal = filter_->read(index);
    if(filterVal == fp) { // found event w.h.p.
      // decode, accumulate, re-encode (quantization may lose precision)
      int oldval = (int)qtizer_->value(values_->read(index));
      values_->write(index, (T)qtizer_->code(oldval + value));
      filterIdx = index;
      return true;
    }
    ++index;
  }
  // add if it gets here.
  insert(IDs, len, value);
  return false;
}
#endif

View File

@ -1,106 +0,0 @@
#ifndef ORLM_QUANTIZER_H
#define ORLM_QUANTIZER_H
#include <vector>
#include <cmath>
#include <algorithm>
#include "types.h"
// Tolerance for float comparisons when mapping values to codes.
static const float kFloatErr = 0.00001f;
#ifdef WIN32
#define log2(X) (log((double)X)/log((double)2))
#endif
//! @todo ask abby2
// Logarithmic quantizer: maps integer counts in [1, 2^30] to small integer
// codes (code j represents roughly base_^j, floored to an integer) so counts
// can be stored in a handful of bits.  Codebook is precomputed at
// construction and can be (de)serialized via save()/load().
class LogQtizer
{
public:
  // 'i' controls resolution: base_ = 2^(1/i), so larger i => finer steps.
  LogQtizer(float i): base_(pow(2, 1 / i)) {
    UTIL_THROW_IF2(base_ <= 1, "Can't calculate log base less than 1");
    max_code_ = 0;
    float value = 1;  // code = 1 -> value = 1 for any base
    std::vector<float> code_to_value_vec;
    while (log2(value) < 30) {  // assume 2^30 is largest count
      code_to_value_vec.push_back(value);
      value = pow(base_, ++max_code_);
    }
    code_to_value_vec.push_back(value);  // store max_code_ so in total [0, max_code_]
    // get valid range
    max_value_ = code_to_value_vec[max_code_];
    min_value_ = 1;
    // store codes in array for lookup
    code_to_value_ = new float[max_code_ + 1];
    code_to_log_value_ = new float[max_code_ + 1];
    for (int j = 0; j <= max_code_; ++j) {
      // map to integers
      code_to_value_[j] = floor(kFloatErr + code_to_value_vec[j]); //
      code_to_log_value_[j] = log10(code_to_value_[j]);  // log_base 10 to match srilm
    }
    std::cerr << "Initialized quantization (size = " << max_code_ + 1 << ")" << std::endl;
  }
  // Stream constructor: restores a codebook written by save().
  LogQtizer(Moses::FileHandler* fin) {
    UTIL_THROW_IF2(fin == NULL, "Null file handle");
    load(fin);
  }
  // Largest code whose value does not exceed 'value'.  Throws if value is
  // outside [min_value_, max_value_].
  int code(float value) {
    // should just be: return log_b(value)
    UTIL_THROW_IF2(value < min_value_ || value > max_value_,
                   "Value " << value << " out of bound");
    // but binary search removes errors due to floor operator above
    int code = static_cast<int>(std::lower_bound(code_to_value_, code_to_value_+ max_code_,
                                value) - code_to_value_);
    // make sure not overestimating
    code = code_to_value_[code] > value ? code - 1 : code;
    return code;
  }
  inline float value(int code) {
    // table look up for values
    return code_to_value_[code];
  }
  inline int maxcode() {
    return max_code_;
  }
  inline float logValue(int code) {
    // table look up for log of values
    return code_to_log_value_[code];
  }
  ~LogQtizer() {
    delete[] code_to_value_;
    delete[] code_to_log_value_;
  }
  // Serializes parameters and both lookup tables; load() mirrors this order.
  void save(Moses::FileHandler* fout) {
    fout->write((char*)&base_, sizeof(base_));
    fout->write((char*)&max_code_, sizeof(max_code_));
    fout->write((char*)&max_value_, sizeof(max_value_));
    fout->write((char*)&min_value_, sizeof(min_value_));
    for (int j = 0; j <= max_code_; ++j)
      fout->write((char*)&code_to_value_[j], sizeof(code_to_value_[j]));
    for (int j = 0; j <= max_code_; ++j)
      fout->write((char*)&code_to_log_value_[j], sizeof(code_to_log_value_[j]));
    std::cerr << "Saved log codebook with " << max_code_ + 1 << " codes." <<std::endl;
  }
private:
  float base_;               // quantization base (2^(1/i))
  float* code_to_value_;     // code -> integer count
  float* code_to_log_value_; // code -> log10(count)
  int max_code_;
  float max_value_;
  float min_value_;
  void load(Moses::FileHandler* fin) {
    fin->read((char*)&base_, sizeof(base_));
    fin->read((char*)&max_code_, sizeof(max_code_));
    fin->read((char*)&max_value_, sizeof(max_value_));
    fin->read((char*)&min_value_, sizeof(min_value_));
    code_to_value_ = new float[max_code_ + 1];
    for(int j = 0; j <= max_code_; ++j)
      fin->read((char*)&code_to_value_[j], sizeof(code_to_value_[j]));
    code_to_log_value_ = new float[max_code_ + 1];
    for(int j = 0; j <= max_code_; ++j)
      fin->read((char*)&code_to_log_value_[j], sizeof(code_to_log_value_[j]));
    std::cerr << "Loaded log codebook with " << max_code_ + 1 << " codes." << std::endl;
  }
};
#endif

View File

@ -1,35 +0,0 @@
#ifndef moses_DynSAInclude_types_h
#define moses_DynSAInclude_types_h
#include <iostream>
#include <map>
#include <set>
#include <vector>
#include <typeinfo>
#include <stdint.h>
// Portable container-iteration macros: MSVC (non-MinGW) lacks __typeof__,
// so use decltype there.
#if defined WIN32 && !defined __MINGW32__
#define iterate(c, i) for(decltype(c.begin()) i = c.begin(); i != c.end(); ++i)
#define piterate(c, i) for(decltype(c->begin()) i = c->begin(); i != c->end(); ++i)
#define riterate(c, i) for(decltype(c.rbegin()) i = c.rbegin(); i != c.rend(); ++i)
#else
#define iterate(c, i) for(__typeof__(c.begin()) i = c.begin(); i != c.end(); ++i)
#define piterate(c, i) for(__typeof__(c->begin()) i = c->begin(); i != c->end(); ++i)
#define riterate(c, i) for(__typeof__(c.rbegin()) i = c.rbegin(); i != c.rend(); ++i)
#endif
// Global limits / constants shared by the dynamic suffix-array LM code.
#define THREADED false
#define THREAD_MAX 2
#define MAX_NGRAM_ORDER 8
#define MAX_STR_LEN 300
#define PRIME 8589935681ULL
#define MAX_HASH_FUNCS 1000
//#define PRIME 409
//typedefs for projects
typedef std::string word_t;    // word as string
typedef unsigned int wordID_t; // word mapped to integer
typedef std::string date_t;    // a date marker
typedef unsigned int count_t;  // for 64-bit to 32-bit compatibility
#endif

View File

@ -1,67 +0,0 @@
#ifndef moses_DynSAInclude_utils_h
#define moses_DynSAInclude_utils_h
#include <cstdlib>
#include <vector>
#include <string>
#include <sstream>
#include <cctype>
#include <cmath>
#include <cstring>
//! @todo ask abby2
// Small collection of stateless static string helpers: trimming, int->string
// conversion, tokenizing and lower-casing.
class Utils
{
public:
  // Strip any of dropChars from both ends of str, in place.
  static void trim(std::string& str, const std::string dropChars = " \t\n\r") {
    str.erase(str.find_last_not_of(dropChars)+1);
    str.erase(0, str.find_first_not_of(dropChars));
  }
  // Strip any of dropChars from the right end only, in place.
  static void rtrim(std::string& str, const std::string dropChars = " \t\n\r") {
    str.erase(str.find_last_not_of(dropChars)+1);
  }
  // Strip any of dropChars from the left end only, in place.
  static void ltrim(std::string& str, const std::string dropChars = " \t\n\r") {
    str.erase(0, str.find_first_not_of(dropChars));
  }
  // Decimal string representation of an integer.
  static std::string IntToStr(int integer) {
    std::ostringstream stream;
    stream << integer;
    return stream.str();
  }
  // Split str on any character in delm; fills items and returns the count.
  // FIX: the previous version ran strtok() directly through a const_cast of
  // the caller's buffer, mutating it in place — undefined behaviour for
  // string literals and for buffers obtained from std::string::c_str().
  // We now tokenize a private writable copy.  (strtok itself remains
  // non-thread-safe; callers already accept that.)
  static int splitToStr(const char * str,
                        std::vector<std::string> & items,
                        const char * delm = "\t") {
    items.clear();
    if (str == NULL) return 0; // defensive: strtok(NULL, ...) would misbehave
    std::vector<char> buff(str, str + strlen(str) + 1); // copy incl. '\0'
    char * pch = strtok(&buff[0], delm);
    while( pch != NULL ) {
      items.push_back(pch);
      pch = strtok(NULL, delm);
    }
    return items.size();
  }
  // std::string convenience overload.
  static int splitToStr(std::string buff,
                        std::vector<std::string> & items,
                        std::string delm = "\t") {
    return splitToStr(buff.c_str(), items, delm.c_str());
  }
  // Split buff on delm and convert each token with atoi; returns the count.
  static int splitToInt(std::string buff, std::vector<int>& items,
                        std::string delm = ",") {
    items.clear();
    std::vector<std::string> tmpVector(0);
    int i = splitToStr(buff.c_str(), tmpVector, delm.c_str());
    for( int j = 0; j < i; j++ )
      items.push_back(atoi(tmpVector[j].c_str()));
    return i;
  }
  // Lower-case str in place (per-character tolower; ASCII semantics).
  static void strToLowercase(std::string& str) {
    for(unsigned i=0; i < str.length(); i++) {
      str[i] = tolower(str[i]);
    }
  }
};
#endif

View File

@ -1,158 +0,0 @@
#include <sstream>
#include "vocab.h"
namespace Moses
{
// Vocab class
// Build and cache the three sentinel words used throughout the vocab:
// beginning-of-sentence, end-of-sentence, and out-of-vocabulary.
void Vocab::InitSpecialWords()
{
  m_kBOSWord = InitSpecialWord(BOS_); // BOS_ is a string <s> (defined in ../typedef.h)
  m_kEOSWord = InitSpecialWord(EOS_); // EOS_ is a string </s> (defined in ../typedef.h)
  m_kOOVWord = InitSpecialWord(UNKNOWN_FACTOR); // UNKNOWN_FACTOR also defined in ../typedef.h
}
// Build a single-factor, Input-side Word for a special marker string
// (<s>, </s>, <unk>).
const Word Vocab::InitSpecialWord( const std::string& word_str)
{
  // the marker string becomes factor 0 of the word
  FactorList fl;
  fl.push_back(0);
  Word special;
  // Input is an enum defined in ../typedef.h; isNonTerminal is fixed to false
  special.CreateFromString(Input, fl, word_str, false);
  // TODO open questions from the original author:
  // - word comparison can fail because the isNonTerminal flag here may not
  //   match words created elsewhere with a different flag
  // - the special word is built as an Input word; Output words are not handled
  // - Input/Output is currently not stored in class Word, but in the future???
  return special;
}
// Convenience overload: wrap a plain string into a one-factor Input word
// and look up (or assign) its id.
wordID_t Vocab::GetWordID(const std::string& word_str)
{
  FactorList fl;
  fl.push_back(0);
  Word w;
  w.CreateFromString(Input, fl, word_str, false);
  return GetWordID(w);
}
// Look up (or assign) the id for a factored string, building the Word with
// the caller-supplied direction, factor list, and non-terminal flag.
wordID_t Vocab::GetWordID(const std::string& word_str,
                          const FactorDirection& direction, const FactorList& factors, bool isNonTerminal)
{
  Word w;
  w.CreateFromString(direction, factors, word_str, isNonTerminal);
  return GetWordID(w);
}
// Return the id for 'word'. Unseen words get a fresh sequential id while the
// vocabulary is open; once it is closed they map to the OOV id instead.
wordID_t Vocab::GetWordID(const Word& word)
{
  Word2Id::const_iterator found = m_words2ids.find(word);
  if (found != m_words2ids.end())
    return found->second;
  if (m_closed)
    return m_kOOVWordID;
  // new word: ids are assigned sequentially starting at 1
  const wordID_t id = m_words2ids.size() + 1;
  m_ids2words[id] = word;
  m_words2ids[word] = id;
  return id;
}
// Map an id back to its Word; unknown ids yield the OOV word.
Word& Vocab::GetWord(wordID_t id)
{
  Id2Word::iterator it = m_ids2words.find(id);
  if (it == m_ids2words.end())
    return m_kOOVWord;
  return it->second;
}
// True iff 'id' has been assigned to some word in this vocabulary.
bool Vocab::InVocab(wordID_t id)
{
  return m_ids2words.find(id) != m_ids2words.end();
}
// True iff 'word' is already registered in this vocabulary.
bool Vocab::InVocab(const Word& word)
{
  return m_words2ids.find(word) != m_words2ids.end();
}
// Serialize the vocabulary to the file at 'vocab_path'.
// Returns the result of the stream-based Save() overload.
bool Vocab::Save(const std::string & vocab_path)
{
  // save vocab as id -> word
  FileHandler vcbout(vocab_path, std::ios::out);
  return Save(&vcbout);
}
// Write the vocabulary to an already-open handle: first the entry count on
// its own line, then one "word<TAB>id" line per entry. Always returns true.
bool Vocab::Save(FileHandler* vcbout)
{
  *vcbout << m_ids2words.size() << "\n";
  Id2Word::const_iterator it = m_ids2words.begin();
  for (; it != m_ids2words.end(); ++it)
    *vcbout << it->second << "\t" << it->first << "\n";
  return true;
}
// Open 'vocab_path' and load the vocabulary from it. direction/factors
// control how each word string is parsed into a Word; 'closed' freezes the
// vocabulary after loading.
bool Vocab::Load(const std::string & vocab_path, const FactorDirection& direction,
                 const FactorList& factors, bool closed)
{
  FileHandler vcbin(vocab_path, std::ios::in);
  std::cerr << "Loading vocab from " << vocab_path << std::endl;
  return Load(&vcbin, direction, factors, closed);
}
// Load from an open handle using the default direction (Input) and a
// single factor (0); the delegate's default closes the vocab afterwards.
bool Vocab::Load(FileHandler* vcbin)
{
  FactorList factors;
  factors.push_back(0);
  return Load(vcbin, Input, factors);
}
// Load an id -> word mapping from an open handle. The first line holds the
// entry count; each subsequent line is "word[<TAB>id]" (a bare word list is
// accepted, in which case ids are assigned sequentially). Throws via
// UTIL_THROW_IF2 on an unreadable file or a duplicate word/id. If 'closed'
// is true the vocabulary is frozen after loading.
bool Vocab::Load(FileHandler* vcbin, const FactorDirection& direction,
                 const FactorList& factors, bool closed)
{
  // load vocab id -> word mapping
  m_words2ids.clear(); // reset mapping
  m_ids2words.clear();
  std::string line, word_str;
  std::istream &ret = getline(*vcbin, line);
  UTIL_THROW_IF2(!ret, "Couldn't read file");
  std::istringstream first(line.c_str());
  uint32_t vcbsize(0);
  first >> vcbsize;
  uint32_t loadedsize = 0;
  while (loadedsize++ < vcbsize && getline(*vcbin, line)) {
    std::istringstream entry(line.c_str());
    entry >> word_str;
    Word word;
    word.CreateFromString( direction, factors, word_str, false); // TODO set correctly isNonTerminal
    // Reset the id on every line: the old code declared it once outside the
    // loop, so when a line carried no id the failed extraction left a stale
    // (or, on the first line, uninitialized) value, making the id==0 check
    // below unreliable. A missing id now deterministically reads as 0.
    wordID_t id = 0;
    entry >> id;
    // may be no id (i.e. file may just be a word list)
    if (id == 0 && word != GetkOOVWord())
      id = m_ids2words.size() + 1; // assign ids sequentially starting from 1
    UTIL_THROW_IF2(m_ids2words.count(id) != 0 || m_words2ids.count(word) != 0,
                   "Error");
    m_ids2words[id] = word;
    m_words2ids[word] = id;
  }
  m_closed = closed; // once loaded fix vocab ?
  std::cerr << "Loaded vocab with " << m_ids2words.size() << " words." << std::endl;
  return true;
}
void Vocab::PrintVocab()
{
for (Id2Word::const_iterator iter = m_ids2words.begin();
iter != m_ids2words.end(); ++iter ) {
std::cerr << iter->second << "\t" << iter->first << "\n";
}
for (Word2Id::const_iterator iter = m_words2ids.begin();
iter != m_words2ids.end(); ++iter ) {
std::cerr << iter->second << "\t" << iter->first << "\n";
}
}
} //end namespace

View File

@ -1,127 +0,0 @@
#ifndef moses_DynSAInclude_vocab_h
#define moses_DynSAInclude_vocab_h
#include <map>
#include <string>
#include "types.h"
#include "FileHandler.h"
#include "utils.h"
#include "moses/TypeDef.h"
#include "moses/Word.h"
namespace Moses
{
//! Vocab maps between strings and uint32 ids.
class Vocab
{
public:
  typedef std::map<Word, wordID_t> Word2Id;
  typedef std::map<wordID_t, Word> Id2Word;
  //! Empty, open vocabulary. When sntMarkers is true the BOS/EOS markers
  //! are registered immediately so they receive stable ids even if they
  //! never occur in the corpus.
  Vocab(bool sntMarkers = true):
    m_closed(false),
    m_kOOVWordID(0),
    m_kBOSWordID(1) {
    InitSpecialWords();
    if(sntMarkers) {
      GetWordID(m_kBOSWord); // added in case not observed in corpus
      GetWordID(m_kEOSWord);
    }
  }
  // if no file then must allow new words
  // specify whether more words can be added via 'closed'
  // assume that if a vocab is loaded from file then it should be closed.
  //! Load a vocabulary from 'vocab_path'; throws on failure.
  Vocab(const std::string & vocab_path, const FactorDirection& direction,
        const FactorList& factors, bool closed = true):
    m_closed(false), // overwritten by Load(); initialized so it is never read indeterminate
    m_kOOVWordID(0),
    m_kBOSWordID(1) {
    InitSpecialWords();
    bool ret = Load(vocab_path, direction, factors, closed);
    UTIL_THROW_IF2(!ret, "Unable to load vocab file: " << vocab_path);
  }
  //! Load a vocabulary from an already-open handle; throws on failure.
  Vocab(FileHandler * fin, const FactorDirection& direction,
        const FactorList& factors, bool closed = true):
    m_closed(false), // overwritten by Load(); initialized so it is never read indeterminate
    m_kOOVWordID(0),
    m_kBOSWordID(1) {
    InitSpecialWords();
    bool ret = Load(fin, direction, factors, closed);
    UTIL_THROW_IF2(!ret, "Unable to load vocab file");
  }
  //! Load with default direction/factors. Now initializes the special words
  //! and checks Load()'s result, matching the other file-loading
  //! constructors: previously both were skipped, so OOV detection during
  //! Load() compared against a default-constructed Word and a failed load
  //! went unnoticed.
  Vocab(FileHandler *fin):
    m_closed(false),
    m_kOOVWordID(0),
    m_kBOSWordID(1) {
    InitSpecialWords();
    bool ret = Load(fin);
    UTIL_THROW_IF2(!ret, "Unable to load vocab file");
  }
  ~Vocab() {}
  // parse 'word' into factored Word and get id
  wordID_t GetWordID(const std::string& word, const FactorDirection& direction,
                     const FactorList& factors, bool isNonTerminal);
  wordID_t GetWordID(const Word& word);
  wordID_t GetWordID(const std::string& word);
  //! id -> Word; unknown ids yield the OOV word.
  Word& GetWord(wordID_t id);
  inline wordID_t GetkOOVWordID() {
    return m_kOOVWordID;
  }
  inline wordID_t GetBOSWordID() {
    return m_kBOSWordID;
  }
  inline const Word& GetkOOVWord() {
    return m_kOOVWord;
  }
  inline const Word& GetkBOSWord() {
    return m_kBOSWord;
  }
  inline const Word& GetkEOSWord() {
    return m_kEOSWord;
  }
  bool InVocab(wordID_t id);
  bool InVocab(const Word& word);
  //! Number of distinct words currently stored.
  uint32_t Size() {
    return m_words2ids.size();
  }
  void MakeClosed() {
    m_closed = true;
  }
  void MakeOpen() {
    m_closed = false;
  }
  bool IsClosed() {
    return m_closed;
  }
  bool Save(const std::string & vocab_path);
  bool Save(FileHandler* fout);
  bool Load(const std::string & vocab_path, const FactorDirection& direction,
            const FactorList& factors, bool closed = true);
  bool Load(FileHandler* fin, const FactorDirection& direction,
            const FactorList& factors, bool closed = true);
  bool Load(FileHandler* fin);
  void PrintVocab();
  Word2Id::const_iterator VocabStart() {
    return m_words2ids.begin();
  }
  Word2Id::const_iterator VocabEnd() {
    return m_words2ids.end();
  }
protected:
  bool m_closed; // can more words be added
  const wordID_t m_kOOVWordID; // out of vocabulary word id
  const wordID_t m_kBOSWordID;
  Word m_kBOSWord; // beginning of sentence marker
  Word m_kEOSWord; // end of sentence marker
  Word m_kOOVWord; // <unk>
  const Word InitSpecialWord( const std::string& type); // initialize special word like kBOS, kEOS
  void InitSpecialWords();
  Word2Id m_words2ids; // map from words to word ids
  Id2Word m_ids2words; // map from ids to words
};
}
#endif