mosesdecoder/moses/TranslationModel/DynSAInclude/hash.h
2013-11-22 20:27:46 +00:00

359 lines
8.7 KiB
C++

#ifndef INC_ALLHASHFUNCS_H
#define INC_ALLHASHFUNCS_H
#include <cmath>
#include "types.h"
#include "utils.h"
#include "FileHandler.h"
#include "util/exception.hh"
// NOTE(review): this using-directive sits at file scope in a header, so every
// file that includes this header also sees the Moses names unqualified.
using namespace Moses;
// P is the 64-bit unsigned type used below for UnivHash_noPrimes inputs and
// seeds and for the prime stored by UnivHash_linear.
typedef uint64_t P; // largest input range is 2^64
//! @todo ask abby2
/**
 * Common base for the hash families declared in this header.
 *
 * Holds the output range (m_) and the number of hash functions (H_) and
 * writes/reads exactly those two fields, as raw bytes, through a FileHandler.
 * Concrete families supply the hash() overloads and their own seed storage.
 */
template <typename T>
class HashBase
{
protected:
  T m_;        // range of hash output
  count_t H_;  // number of hash functions to instantiate

  // Seed allocation and release are left to each concrete family.
  virtual void initSeeds() = 0;
  virtual void freeSeeds() = 0;

public:
  // m is given as a float and converted to T; H defaults to one function.
  HashBase(float m, count_t H = 1)
    : m_(static_cast<T>(m)),
      H_(H)
  {
  }

  // Restores m_ and H_ from fin. Only the base-class fields are read here:
  // the call is spelled with its qualified name because a constructor does
  // not dispatch to a derived override in any case.
  HashBase(FileHandler* fin)
  {
    HashBase<T>::load(fin);
  }

  virtual ~HashBase()
  {
  }

  // String hashing with the h-th function.
  virtual T hash(const char* s, count_t h) = 0;
  // Hashing of len vocabulary-mapped ids with the h-th function.
  virtual T hash(const wordID_t* id, const int len, count_t h) = 0;

  // Number of hash functions (H_).
  count_t size()
  {
    return H_;
  }

  // Writes m_ followed by H_ as raw bytes. A null handle is reported
  // through UTIL_THROW_IF2.
  virtual void save(FileHandler* fout)
  {
    UTIL_THROW_IF2(fout == 0, "Null file handle");
    fout->write(reinterpret_cast<char*>(&m_), sizeof(m_));
    fout->write(reinterpret_cast<char*>(&H_), sizeof(H_));
  }

  // Reads m_ followed by H_ in the layout produced by save(). A null
  // handle is reported through UTIL_THROW_IF2.
  virtual void load(FileHandler* fin)
  {
    UTIL_THROW_IF2(fin == 0, "Null file handle");
    fin->read(reinterpret_cast<char*>(&m_), sizeof(m_));
    fin->read(reinterpret_cast<char*>(&H_), sizeof(H_));
  }
};
//! @todo ask abby2
/**
 * Linear hash family over vocabulary ids.
 *
 * Each of the H_ functions owns one multiplier and one offset per n-gram
 * position (a_[h][pos], b_[h][pos]); hashing sums a*id + b over the
 * positions and reduces the sum modulo m_. The value pr_ is stored and
 * serialized, but the "% pr_" step in the hashing code is commented out.
 */
template <typename T>
class UnivHash_linear: public HashBase<T>
{
public:
  // Fresh instance: range m, H functions, pr kept in pr_; seeds are drawn
  // immediately.
  UnivHash_linear(float m, count_t H, P pr)
    : HashBase<T>(m, H),
      pr_(pr)
  {
    initSeeds();
  }

  // Restores an instance from fin: the base constructor reads m_ and H_,
  // then load() reads pr_ and the seed tables.
  UnivHash_linear(FileHandler* fin)
    : HashBase<T>(fin)
  {
    load(fin);
  }

  ~UnivHash_linear()
  {
    freeSeeds();
  }

  // String hashing is not implemented for this family; always yields 0.
  T hash(const char* s, count_t h)
  {
    return 0; //not implemented
  }

  // Hash of a whole id sequence with function h.
  T hash(const wordID_t* id, const int len, count_t h);
  // Incremental form: extends prevValue by the id at position pos.
  T hash(const wordID_t id, const count_t pos,
         const T prevValue, count_t h);

  void save(FileHandler* fout);
  void load(FileHandler* fin);

private:
  T** a_;   // per-function, per-position multipliers
  T** b_;   // per-function, per-position offsets
  P pr_;    // value supplied as pr at construction; written/read by save()/load()

  void initSeeds();
  void freeSeeds();
};
/** UnivHash_noPrimes:
 * From Dietzfelbinger 2008
 * p = input domain range = 2^l
 * m = output range = 2^k
 * # of hash function = 2^(l-1)
 *
 * Multiply-shift family: hash(x, h) evaluates ((a_[h] * x) % p_) >> d_ and
 * reduces the result modulo m_. The family always instantiates 100 odd
 * multipliers (a_).
 *
 * NOTE(review): the constructor forwards k itself to HashBase as the output
 * range m_, while the header comment above describes m as 2^k -- confirm
 * which one callers rely on.
 */
template <typename T>
class UnivHash_noPrimes: public HashBase<T>
{
public:
  // k: passed to HashBase as the output range; l: exponent of the input
  // range. d_ keeps l-k as the right-shift amount.
  UnivHash_noPrimes(float k, float l):
    HashBase<T>(k, 100), d_(count_t((l-k))) {
    // 2^l no longer fits in P once l reaches the bit width of P, and
    // converting such an out-of-range double to an unsigned integer is
    // undefined behaviour. Saturate to the largest P value (all bits set)
    // directly instead of computing pow(2,l) and subtracting one.
    if(static_cast<int>(l) >= static_cast<int>(sizeof(P) * 8))
      p_ = ~static_cast<P>(0);
    else
      p_ = static_cast<P>(pow(2,l));
    initSeeds();
  }
  // Restores an instance from fin: the base constructor reads m_ and H_,
  // then load() reads p_, d_ and the multipliers.
  UnivHash_noPrimes(FileHandler* fin):
    HashBase<T>(fin) {
    load(fin);
  }
  ~UnivHash_noPrimes() {
    freeSeeds();
  }
  // String hashing: XOR of the per-character hashes.
  T hash(const char* s, count_t h);
  // Id-sequence hashing: XOR of the per-id hashes.
  T hash(const wordID_t* id, const int len, count_t h);
  // Hash of a single value x with multiplier a_[h].
  T hash(const P x, count_t h);
  void save(FileHandler* fout);
  void load(FileHandler* fin);
private:
  count_t d_; // l-k
  P p_, *a_; // real-valued input range, storage
  void initSeeds();
  // Releases the multiplier array.
  void freeSeeds() {
    delete[] a_;
  }
};
//! @todo ask abby2
/**
 * Shift-add-XOR string hash family.
 *
 * Each of the H_ functions starts from its own random seed (v_[h]) and folds
 * the characters of the string in with a left shift by l_ bits, a right
 * shift by r_ bits and an XOR; the result is reduced modulo m_.
 */
template <typename T>
class Hash_shiftAddXOR: public HashBase<T>
{
public:
  // m: output range, H: number of functions (default 5). The shift widths
  // are fixed at 5 (left) and 2 (right).
  Hash_shiftAddXOR(float m, count_t H=5): HashBase<T>(m,H),
    l_(5), r_(2) {
    initSeeds();
  }
  ~Hash_shiftAddXOR() {
    freeSeeds();
  }
  // String hashing with the h-th seed.
  T hash(const char* s, count_t h);
  // Vocab-id hashing is not implemented for this family. The body used to
  // be empty, so control flowed off the end of a value-returning function
  // (undefined behaviour if ever called); it now yields 0 like the other
  // unimplemented overload in this header.
  T hash(const wordID_t* id, const int len, count_t h) {
    return 0; // not implemented
  }
private:
  T* v_; // random seed storage
  const unsigned short l_, r_; // left-shift bits, right-shift bits
  void initSeeds();
  // Releases the seed array.
  void freeSeeds() {
    delete[] v_;
  }
};
//! @todo ask abby2
/**
 * Table-lookup XOR string hash family.
 *
 * Each of the H_ functions owns a table of tblLen_ random values; hashing a
 * string XORs together the table entries selected by the running sum of its
 * character codes.
 */
template <typename T>
class UnivHash_tableXOR: public HashBase<T>
{
public:
  // m: output range, H: number of functions (default 5). The table length
  // is 255*MAX_STR_LEN entries per function.
  UnivHash_tableXOR(float m, count_t H=5): HashBase<T>(m, H),
    table_(NULL), tblLen_(255*MAX_STR_LEN) {
    initSeeds();
  }
  ~UnivHash_tableXOR() {
    freeSeeds();
  }
  // String hashing with the h-th table.
  T hash(const char* s, count_t h);
  // Vocab-id hashing is not implemented for this family. The body used to
  // be empty, so control flowed off the end of a value-returning function
  // (undefined behaviour if ever called); it now yields 0 like the other
  // unimplemented overload in this header.
  T hash(const wordID_t* id, const int len, count_t h) {
    return 0; // not implemented
  }
private:
  T** table_; // storage for random numbers
  count_t tblLen_; // length of table
  void initSeeds();
  void freeSeeds();
};
// ShiftAddXor
// Allocates one seed per hash function and fills each with a random draw
// plus one (presumably to keep a zero draw from yielding a zero seed).
template <typename T>
void Hash_shiftAddXOR<T>::initSeeds()
{
  v_ = new T[this->H_];
  count_t slot = 0;
  while (slot < this->H_) {
    v_[slot] = Utils::rand<T>() + 1;
    ++slot;
  }
}
// Hashes the NUL-terminated string s with the h-th seed. Characters are
// folded in one at a time; scanning stops at the terminator or once the
// character counter reaches MAX_STR_LEN. The result is reduced modulo m_.
template <typename T>
T Hash_shiftAddXOR<T>::hash(const char* s, count_t h)
{
  T value = v_[h];
  int seen = 0;
  for (;;) {
    const unsigned char ch = *s++;
    if (ch == 0) {
      break;
    }
    if (!(++seen < MAX_STR_LEN)) {
      break;
    }
    value ^= ((value << l_) + (value >> r_) + ch);
  }
  return (value % this->m_);
}
// UnivHash_tableXOR
// (Re)builds the random tables: any existing tables are released first,
// then one row of tblLen_ entries is allocated per hash function and filled
// with Utils::rand<T>(m_-1) draws.
template <typename T>
void UnivHash_tableXOR<T>::initSeeds()
{
  if (table_) {
    freeSeeds();
  }

  table_ = new T*[this->H_];
  for (count_t fn = 0; fn < this->H_; ++fn) {
    table_[fn] = new T[tblLen_];
    T* const row = table_[fn];
    for (count_t slot = 0; slot < tblLen_; ++slot) {
      row[slot] = Utils::rand<T>(this->m_ - 1);
    }
  }
}
// Releases every per-function row, then the row array itself, and resets
// table_ to NULL so that initSeeds() can tell the tables are gone.
template <typename T>
void UnivHash_tableXOR<T>::freeSeeds()
{
  count_t fn = 0;
  while (fn < this->H_) {
    delete[] table_[fn];
    ++fn;
  }
  delete[] table_;
  table_ = NULL;
}
// Hashes the NUL-terminated string s with the h-th table. The table index
// is the running sum of the character codes seen so far; the selected
// entries are XORed together. Scanning stops at the terminator or once the
// character counter reaches MAX_STR_LEN. A result outside [0, m_) is
// reported through UTIL_THROW_IF2.
template <typename T>
T UnivHash_tableXOR<T>::hash(const char* s, count_t h)
{
  T value = 0;
  count_t seen = 0;
  count_t offset = 0;
  for (;;) {
    const unsigned char ch = *s++;
    if (ch == 0) {
      break;
    }
    if (!(++seen < MAX_STR_LEN)) {
      break;
    }
    offset += ch;
    value ^= table_[h][offset];
  }
  UTIL_THROW_IF2(value >= this->m_, "Error");
  return value;
}
// UnivHash_noPrimes
// Allocates one multiplier per hash function and fills each with a random
// odd value.
template <typename T>
void UnivHash_noPrimes<T>::initSeeds()
{
  a_ = new P[this->H_];
  // The index is a count_t, like H_ itself. Using the hash output type T as
  // the counter would wrap around (and never terminate) whenever T cannot
  // represent H_.
  for(count_t i=0; i < this->H_; i++) {
    a_[i] = Utils::rand<P>();
    a_[i] |= 1; // a must be odd
  }
}
// Hash of a single value x with the h-th multiplier:
//   h_a(x) = (ax mod 2^l) div 2^(l-k)
// followed by a final reduction modulo m_.
template <typename T>
T UnivHash_noPrimes<T>::hash(const P x, count_t h)
{
  const P product = a_[h] * x;      // unsigned multiply, wraps in P
  const P reduced = product % p_;   // the "mod 2^l" step
  T value = reduced >> d_;          // the "div 2^(l-k)" step
  return value % this->m_;
}
// Hash of a sequence of len ids: XOR of the single-value hashes, reduced
// modulo m_. The function index advances by one per position, so position
// i is hashed with multiplier h+i.
// NOTE(review): nothing here checks that h+len stays within H_ -- confirm
// callers guarantee it.
template <typename T>
T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
                             count_t h)
{
  T value = 0;
  for (int i = 0; i < len; ++i, ++h) {
    value ^= hash(static_cast<P>(id[i]), h);
  }
  return value % this->m_;
}
// Hash of the NUL-terminated string s: XOR of the single-value hashes of
// its characters, all with the same function index h, reduced modulo m_.
// Scanning stops at the terminator or once the character counter reaches
// MAX_STR_LEN.
template <typename T>
T UnivHash_noPrimes<T>::hash(const char* s, count_t h)
{
  T value = 0;
  int seen = 0;
  for (;;) {
    const unsigned char ch = *s++;
    if (ch == 0) {
      break;
    }
    if (!(++seen < MAX_STR_LEN)) {
      break;
    }
    value ^= hash(static_cast<P>(ch), h);
  }
  return value % this->m_;
}
// Serializes the family: the HashBase fields first (m_, H_), then p_, d_
// and the H_ multipliers, all as raw bytes.
template <typename T>
void UnivHash_noPrimes<T>::save(FileHandler* fout)
{
  HashBase<T>::save(fout);
  fout->write((char*)&p_, sizeof(p_));
  fout->write((char*)&d_, sizeof(d_));
  // The index is a count_t, like H_ itself. Using the hash output type T as
  // the counter would wrap around (and never terminate) whenever T cannot
  // represent H_.
  for(count_t i=0; i < this->H_; i++) {
    fout->write((char*)&a_[i], sizeof(a_[i]));
  }
}
// Restores p_, d_ and the H_ multipliers in the layout written by save().
// H_ must already hold its loaded value, because it sizes the array.
template <typename T>
void UnivHash_noPrimes<T>::load(FileHandler* fin)
{
  a_ = new P[this->H_];
  // HashBase<T>::load(fin) already done in constructor
  fin->read((char*)&p_, sizeof(p_));
  fin->read((char*)&d_, sizeof(d_));
  // The index is a count_t, like H_ itself. Using the hash output type T as
  // the counter would wrap around (and never terminate) whenever T cannot
  // represent H_.
  for(count_t i=0; i < this->H_; i++) {
    fin->read((char*)&a_[i], sizeof(a_[i]));
  }
}
//UnivHash_linear
// Allocates the multiplier and offset tables (H_ rows of MAX_NGRAM_ORDER
// entries each) and fills them with random draws. Multipliers get one
// added to the draw; offsets take the draw as is. The draws alternate
// multiplier, offset for each position.
template <typename T>
void UnivHash_linear<T>::initSeeds()
{
  a_ = new T*[this->H_];
  b_ = new T*[this->H_];
  for (count_t fn = 0; fn < this->H_; ++fn) {
    a_[fn] = new T[MAX_NGRAM_ORDER];
    b_[fn] = new T[MAX_NGRAM_ORDER];
    T* const mult = a_[fn];
    T* const offs = b_[fn];
    for (count_t slot = 0; slot < MAX_NGRAM_ORDER; ++slot) {
      mult[slot] = 1 + Utils::rand<T>();
      offs[slot] = Utils::rand<T>();
    }
  }
}
// Releases every row of both seed tables, then the row arrays, and leaves
// both pointers set to NULL.
template <typename T>
void UnivHash_linear<T>::freeSeeds()
{
  count_t fn = 0;
  while (fn < this->H_) {
    delete[] a_[fn];
    delete[] b_[fn];
    ++fn;
  }
  delete[] a_;
  delete[] b_;
  b_ = NULL;
  a_ = b_;
}
// Hash of a sequence of len ids with function h: the sum over all positions
// of a_[h][pos]*id[pos] + b_[h][pos], reduced modulo m_. An h outside the
// H_ functions is reported through UTIL_THROW_IF2.
template <typename T>
inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
                                  count_t h)
{
  UTIL_THROW_IF2(h >= this->H_, "Error");
  T value = 0;
  for (int i = 0; i < len; ++i) {
    value += ((a_[h][i] * id[i]) + b_[h][i]);// % pr_;
  }
  return value % this->m_;
}
// Incremental form of the sequence hash: adds the contribution of one id at
// position pos (a_[h][pos]*id + b_[h][pos]) to prevValue and reduces the
// sum modulo m_. An h outside the H_ functions is reported through
// UTIL_THROW_IF2.
template <typename T>
inline T UnivHash_linear<T>::hash(const wordID_t id, const count_t pos,
                                  const T prevValue, count_t h)
{
  UTIL_THROW_IF2(h >= this->H_, "Error");
  T value = prevValue;
  value += ((a_[h][pos] * id) + b_[h][pos]); // % pr_;
  return value % this->m_;
}
// Serializes the family: the HashBase fields first (m_, H_), then pr_, then
// for every function and position the multiplier followed by the offset,
// all as raw bytes.
template <typename T>
void UnivHash_linear<T>::save(FileHandler* fout)
{
  HashBase<T>::save(fout);
  fout->write(reinterpret_cast<char*>(&pr_), sizeof(pr_));
  for (count_t fn = 0; fn < this->H_; ++fn) {
    T* const mult = a_[fn];
    T* const offs = b_[fn];
    for (count_t slot = 0; slot < MAX_NGRAM_ORDER; ++slot) {
      fout->write(reinterpret_cast<char*>(&mult[slot]), sizeof(mult[slot]));
      fout->write(reinterpret_cast<char*>(&offs[slot]), sizeof(offs[slot]));
    }
  }
}
// Restores pr_ and both seed tables in the layout written by save(). The
// HashBase fields are expected to have been read already (the file-based
// constructor does that), because H_ sizes the tables.
template <typename T>
void UnivHash_linear<T>::load(FileHandler* fin)
{
  fin->read(reinterpret_cast<char*>(&pr_), sizeof(pr_));
  a_ = new T*[this->H_];
  b_ = new T*[this->H_];
  for (count_t fn = 0; fn < this->H_; ++fn) {
    a_[fn] = new T[MAX_NGRAM_ORDER];
    b_[fn] = new T[MAX_NGRAM_ORDER];
    T* const mult = a_[fn];
    T* const offs = b_[fn];
    for (count_t slot = 0; slot < MAX_NGRAM_ORDER; ++slot) {
      fin->read(reinterpret_cast<char*>(&mult[slot]), sizeof(mult[slot]));
      fin->read(reinterpret_cast<char*>(&offs[slot]), sizeof(offs[slot]));
    }
  }
}
#endif