2011-06-08 19:45:08 +04:00
|
|
|
#ifndef INC_ALLHASHFUNCS_H
|
|
|
|
#define INC_ALLHASHFUNCS_H
|
|
|
|
|
|
|
|
#include <cmath>
|
|
|
|
#include "types.h"
|
|
|
|
#include "utils.h"
|
2012-07-10 13:05:14 +04:00
|
|
|
#include "FileHandler.h"
|
2013-11-20 22:18:02 +04:00
|
|
|
#include "util/exception.hh"
|
|
|
|
|
2011-06-08 19:45:08 +04:00
|
|
|
using namespace Moses;
|
|
|
|
typedef uint64_t P; // largest input range is 2^64
|
|
|
|
|
2012-06-29 02:29:46 +04:00
|
|
|
//! @todo ask abby2
|
2011-06-08 19:45:08 +04:00
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
class HashBase
|
|
|
|
{
|
|
|
|
protected:
|
|
|
|
T m_; // range of hash output
|
|
|
|
count_t H_; // number of hash functions to instantiate
|
|
|
|
virtual void initSeeds()=0;
|
|
|
|
virtual void freeSeeds()=0;
|
|
|
|
public:
|
|
|
|
HashBase(float m, count_t H=1):m_((T)m), H_(H) {
|
|
|
|
//cerr << "range = (0..." << m_ << "]" << endl;
|
|
|
|
}
|
|
|
|
HashBase(FileHandler* fin) {
|
|
|
|
load(fin);
|
|
|
|
}
|
|
|
|
virtual ~HashBase() {}
|
|
|
|
virtual T hash(const char*s, count_t h)=0; // string hashing
|
|
|
|
virtual T hash(const wordID_t* id, const int len, count_t h)=0; // vocab mapped hashing
|
|
|
|
count_t size() {
|
|
|
|
return H_;
|
|
|
|
}
|
|
|
|
virtual void save(FileHandler* fout) {
|
2013-11-23 00:27:46 +04:00
|
|
|
UTIL_THROW_IF2(fout == 0, "Null file handle");
|
2013-05-29 21:16:15 +04:00
|
|
|
fout->write((char*)&m_, sizeof(m_));
|
|
|
|
fout->write((char*)&H_, sizeof(H_));
|
|
|
|
}
|
|
|
|
virtual void load(FileHandler* fin) {
|
2013-11-23 00:27:46 +04:00
|
|
|
UTIL_THROW_IF2(fin == 0, "Null file handle");
|
2013-05-29 21:16:15 +04:00
|
|
|
fin->read((char*)&m_, sizeof(m_));
|
|
|
|
fin->read((char*)&H_, sizeof(H_));
|
|
|
|
}
|
2011-06-08 19:45:08 +04:00
|
|
|
};
|
2012-06-29 02:29:46 +04:00
|
|
|
|
|
|
|
//! @todo ask abby2
|
2011-06-08 19:45:08 +04:00
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
class UnivHash_linear: public HashBase<T>
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
UnivHash_linear(float m, count_t H, P pr):
|
|
|
|
HashBase<T>(m, H), pr_(pr) {
|
|
|
|
initSeeds();
|
|
|
|
}
|
|
|
|
UnivHash_linear(FileHandler* fin):
|
|
|
|
HashBase<T>(fin) {
|
|
|
|
load(fin);
|
|
|
|
}
|
|
|
|
~UnivHash_linear() {
|
|
|
|
freeSeeds();
|
|
|
|
}
|
|
|
|
T hash(const char* s, count_t h) {
|
|
|
|
return 0; //not implemented
|
|
|
|
}
|
|
|
|
T hash(const wordID_t* id, const int len, count_t h);
|
|
|
|
T hash(const wordID_t id, const count_t pos,
|
|
|
|
const T prevValue, count_t h);
|
|
|
|
void save(FileHandler* fout);
|
|
|
|
void load(FileHandler* fin);
|
|
|
|
private:
|
|
|
|
T** a_, **b_;
|
|
|
|
P pr_;
|
|
|
|
void initSeeds();
|
|
|
|
void freeSeeds();
|
2011-06-08 19:45:08 +04:00
|
|
|
};
|
|
|
|
|
2012-06-29 02:29:46 +04:00
|
|
|
/** UnivHash_noPrimes:
|
2011-06-08 19:45:08 +04:00
|
|
|
* From Dietzfelbinger 2008
|
|
|
|
* p = input domain range = 2^l
|
|
|
|
* m = output range = 2^k
|
|
|
|
* # of hash function = 2^(l-1)
|
2012-06-29 02:29:46 +04:00
|
|
|
*/
|
2011-06-08 19:45:08 +04:00
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
class UnivHash_noPrimes: public HashBase<T>
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
UnivHash_noPrimes(float k, float l):
|
|
|
|
HashBase<T>(k, 100), d_(count_t((l-k))) {
|
|
|
|
if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1;
|
|
|
|
else p_ = (P) pow(2,l);
|
|
|
|
initSeeds();
|
|
|
|
}
|
|
|
|
UnivHash_noPrimes(FileHandler* fin):
|
|
|
|
HashBase<T>(fin) {
|
|
|
|
load(fin);
|
|
|
|
}
|
|
|
|
~UnivHash_noPrimes() {
|
|
|
|
freeSeeds();
|
|
|
|
}
|
|
|
|
T hash(const char* s, count_t h);
|
|
|
|
T hash(const wordID_t* id, const int len, count_t h);
|
|
|
|
T hash(const P x, count_t h);
|
|
|
|
void save(FileHandler* fout);
|
|
|
|
void load(FileHandler* fin);
|
|
|
|
private:
|
|
|
|
count_t d_; // l-k
|
|
|
|
P p_, *a_; // real-valued input range, storage
|
|
|
|
void initSeeds();
|
|
|
|
void freeSeeds() {
|
|
|
|
delete[] a_;
|
|
|
|
}
|
2011-06-08 19:45:08 +04:00
|
|
|
};
|
|
|
|
|
2012-06-29 02:29:46 +04:00
|
|
|
//! @todo ask abby2
|
2011-06-08 19:45:08 +04:00
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
class Hash_shiftAddXOR: public HashBase<T>
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
Hash_shiftAddXOR(float m, count_t H=5): HashBase<T>(m,H),
|
|
|
|
l_(5), r_(2) {
|
|
|
|
initSeeds();
|
|
|
|
}
|
|
|
|
~Hash_shiftAddXOR() {
|
|
|
|
freeSeeds();
|
|
|
|
}
|
|
|
|
T hash(const char* s, count_t h);
|
|
|
|
T hash(const wordID_t* id, const int len, count_t h) {} // empty
|
|
|
|
private:
|
|
|
|
T* v_; // random seed storage
|
|
|
|
const unsigned short l_, r_; // left-shift bits, right-shift bits
|
|
|
|
void initSeeds();
|
|
|
|
void freeSeeds() {
|
|
|
|
delete[] v_;
|
|
|
|
}
|
2011-06-08 19:45:08 +04:00
|
|
|
};
|
|
|
|
|
2012-06-29 02:29:46 +04:00
|
|
|
//! @todo ask abby2
|
2011-06-08 19:45:08 +04:00
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
class UnivHash_tableXOR: public HashBase<T>
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
UnivHash_tableXOR(float m, count_t H=5): HashBase<T>(m, H),
|
|
|
|
table_(NULL), tblLen_(255*MAX_STR_LEN) {
|
|
|
|
initSeeds();
|
|
|
|
}
|
|
|
|
~UnivHash_tableXOR() {
|
|
|
|
freeSeeds();
|
|
|
|
}
|
|
|
|
T hash(const char* s, count_t h);
|
|
|
|
T hash(const wordID_t* id, const int len, count_t h) {}
|
|
|
|
private:
|
|
|
|
T** table_; // storage for random numbers
|
|
|
|
count_t tblLen_; // length of table
|
|
|
|
void initSeeds();
|
|
|
|
void freeSeeds();
|
2011-06-08 19:45:08 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
// ShiftAddXor
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
void Hash_shiftAddXOR<T>::initSeeds()
|
|
|
|
{
|
2011-06-08 19:45:08 +04:00
|
|
|
v_ = new T[this->H_];
|
|
|
|
for(count_t i=0; i < this->H_; i++)
|
2013-05-29 21:16:15 +04:00
|
|
|
v_[i] = Utils::rand<T>() + 1;
|
2011-06-08 19:45:08 +04:00
|
|
|
}
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
T Hash_shiftAddXOR<T>::hash(const char* s, count_t h)
|
|
|
|
{
|
2011-06-08 19:45:08 +04:00
|
|
|
T value = v_[h];
|
|
|
|
int pos(0);
|
|
|
|
unsigned char c;
|
|
|
|
while((c = *s++) && (++pos < MAX_STR_LEN)) {
|
|
|
|
value ^= ((value << l_) + (value >> r_) + c);
|
|
|
|
}
|
|
|
|
return (value % this->m_);
|
|
|
|
}
|
|
|
|
|
|
|
|
// UnivHash_tableXOR
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
void UnivHash_tableXOR<T>::initSeeds()
|
|
|
|
{
|
2011-06-08 19:45:08 +04:00
|
|
|
// delete any values in table
|
2013-05-29 21:16:15 +04:00
|
|
|
if(table_) freeSeeds();
|
2011-06-08 19:45:08 +04:00
|
|
|
// instance of new table
|
|
|
|
table_ = new T* [this->H_];
|
|
|
|
// fill with random values
|
|
|
|
for(count_t j=0; j < this->H_; j++) {
|
|
|
|
table_[j] = new T[tblLen_];
|
2013-05-29 21:16:15 +04:00
|
|
|
for(count_t i=0; i < tblLen_; i++) {
|
|
|
|
table_[j][i] = Utils::rand<T>(this->m_-1);
|
2011-06-08 19:45:08 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
void UnivHash_tableXOR<T>::freeSeeds()
|
|
|
|
{
|
2011-06-08 19:45:08 +04:00
|
|
|
for(count_t j = 0; j < this->H_; j++)
|
|
|
|
delete[] table_[j];
|
|
|
|
delete[] table_;
|
|
|
|
table_ = NULL;
|
|
|
|
}
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
T UnivHash_tableXOR<T>::hash(const char* s, count_t h)
|
|
|
|
{
|
2011-06-08 19:45:08 +04:00
|
|
|
T value = 0;
|
|
|
|
count_t pos = 0, idx = 0;
|
|
|
|
unsigned char c;
|
|
|
|
while((c = *s++) && (++pos < MAX_STR_LEN))
|
|
|
|
value ^= table_[h][idx += c];
|
2013-11-23 00:27:46 +04:00
|
|
|
UTIL_THROW_IF2(value >= this->m_, "Error");
|
2011-06-08 19:45:08 +04:00
|
|
|
return value;
|
|
|
|
}
|
|
|
|
|
|
|
|
// UnivHash_noPrimes
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
void UnivHash_noPrimes<T>::initSeeds()
|
|
|
|
{
|
2011-06-08 19:45:08 +04:00
|
|
|
a_ = new P[this->H_];
|
|
|
|
for(T i=0; i < this->H_; i++) {
|
|
|
|
a_[i] = Utils::rand<P>();
|
|
|
|
if(a_[i] % 2 == 0) a_[i]++; // a must be odd
|
|
|
|
}
|
|
|
|
}
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
T UnivHash_noPrimes<T>::hash(const P x, count_t h)
|
|
|
|
{
|
2011-06-08 19:45:08 +04:00
|
|
|
// h_a(x) = (ax mod 2^l) div 2^(l-k)
|
|
|
|
T value = ((a_[h] * x) % p_) >> d_;
|
|
|
|
return value % this->m_;
|
|
|
|
}
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
|
|
|
|
count_t h)
|
|
|
|
{
|
2011-06-08 19:45:08 +04:00
|
|
|
T value = 0;
|
|
|
|
int pos(0);
|
|
|
|
while(pos < len) {
|
|
|
|
value ^= hash((P)id[pos], h++);
|
|
|
|
pos++;
|
|
|
|
}
|
|
|
|
return value % this->m_;
|
|
|
|
}
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
T UnivHash_noPrimes<T>::hash(const char* s, count_t h)
|
|
|
|
{
|
2011-06-08 19:45:08 +04:00
|
|
|
T value = 0;
|
|
|
|
int pos(0);
|
|
|
|
unsigned char c;
|
|
|
|
while((c = *s++) && (++pos < MAX_STR_LEN)) {
|
2013-05-29 21:16:15 +04:00
|
|
|
value ^= hash((P)c, h);
|
2011-06-08 19:45:08 +04:00
|
|
|
}
|
|
|
|
return value % this->m_;
|
|
|
|
}
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
void UnivHash_noPrimes<T>::save(FileHandler* fout)
|
|
|
|
{
|
2011-06-08 19:45:08 +04:00
|
|
|
HashBase<T>::save(fout);
|
|
|
|
fout->write((char*)&p_, sizeof(p_));
|
|
|
|
fout->write((char*)&d_, sizeof(d_));
|
2013-05-29 21:16:15 +04:00
|
|
|
for(T i=0; i < this->H_; i++) {
|
2011-06-08 19:45:08 +04:00
|
|
|
fout->write((char*)&a_[i], sizeof(a_[i]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
void UnivHash_noPrimes<T>::load(FileHandler* fin)
|
|
|
|
{
|
2011-06-08 19:45:08 +04:00
|
|
|
a_ = new P[this->H_];
|
|
|
|
// HashBase<T>::load(fin) already done in constructor
|
|
|
|
fin->read((char*)&p_, sizeof(p_));
|
|
|
|
fin->read((char*)&d_, sizeof(d_));
|
2013-05-29 21:16:15 +04:00
|
|
|
for(T i=0; i < this->H_; i++) {
|
2011-06-08 19:45:08 +04:00
|
|
|
fin->read((char*)&a_[i], sizeof(a_[i]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
//UnivHash_linear
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
void UnivHash_linear<T>::initSeeds()
|
|
|
|
{
|
2011-06-08 19:45:08 +04:00
|
|
|
a_ = new T*[this->H_];
|
|
|
|
b_ = new T*[this->H_];
|
|
|
|
for(count_t i=0; i < this->H_; i++) {
|
|
|
|
a_[i] = new T[MAX_NGRAM_ORDER];
|
|
|
|
b_[i] = new T[MAX_NGRAM_ORDER];
|
|
|
|
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
|
|
|
|
a_[i][j] = 1 + Utils::rand<T>();
|
|
|
|
b_[i][j] = Utils::rand<T>();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
void UnivHash_linear<T>::freeSeeds()
|
|
|
|
{
|
2011-06-08 19:45:08 +04:00
|
|
|
for(count_t i=0; i < this->H_; i++) {
|
|
|
|
delete[] a_[i];
|
|
|
|
delete[] b_[i];
|
|
|
|
}
|
|
|
|
delete[] a_;
|
|
|
|
delete[] b_;
|
|
|
|
a_ = b_ = NULL;
|
|
|
|
}
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
|
|
|
|
count_t h)
|
|
|
|
{
|
2013-11-23 00:27:46 +04:00
|
|
|
UTIL_THROW_IF2(h >= this->H_, "Error");
|
2013-11-20 22:18:02 +04:00
|
|
|
|
2011-06-08 19:45:08 +04:00
|
|
|
T value = 0;
|
|
|
|
int pos(0);
|
|
|
|
while(pos < len) {
|
|
|
|
value += ((a_[h][pos] * id[pos]) + b_[h][pos]);// % pr_;
|
|
|
|
++pos;
|
|
|
|
}
|
|
|
|
return value % this->m_;
|
|
|
|
}
|
|
|
|
template <typename T>
|
|
|
|
inline T UnivHash_linear<T>::hash(const wordID_t id, const count_t pos,
|
2013-05-29 21:16:15 +04:00
|
|
|
const T prevValue, count_t h)
|
|
|
|
{
|
2013-11-23 00:27:46 +04:00
|
|
|
UTIL_THROW_IF2(h >= this->H_, "Error");
|
2011-06-08 19:45:08 +04:00
|
|
|
T value = prevValue + ((a_[h][pos] * id) + b_[h][pos]); // % pr_;
|
|
|
|
return value % this->m_;
|
|
|
|
}
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
void UnivHash_linear<T>::save(FileHandler* fout)
|
|
|
|
{
|
2011-06-08 19:45:08 +04:00
|
|
|
// int bytes = sizeof(a_[0][0]);
|
|
|
|
HashBase<T>::save(fout);
|
|
|
|
fout->write((char*)&pr_, sizeof(pr_));
|
|
|
|
for(count_t i=0; i < this->H_; i++) {
|
|
|
|
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
|
2013-05-29 21:16:15 +04:00
|
|
|
fout->write((char*)&a_[i][j], sizeof(a_[i][j]));
|
2011-06-08 19:45:08 +04:00
|
|
|
fout->write((char*)&b_[i][j], sizeof(b_[i][j]));
|
|
|
|
//cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
|
|
|
|
//cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
template <typename T>
|
2013-05-29 21:16:15 +04:00
|
|
|
void UnivHash_linear<T>::load(FileHandler* fin)
|
|
|
|
{
|
2011-06-08 19:45:08 +04:00
|
|
|
// HashBase<T>::load(fin) already done in constructor
|
|
|
|
fin->read((char*)&pr_, sizeof(pr_));
|
|
|
|
a_ = new T*[this->H_];
|
|
|
|
b_ = new T*[this->H_];
|
|
|
|
for(count_t i=0; i < this->H_; i++) {
|
|
|
|
a_[i] = new T[MAX_NGRAM_ORDER];
|
|
|
|
b_[i] = new T[MAX_NGRAM_ORDER];
|
|
|
|
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
|
2013-05-29 21:16:15 +04:00
|
|
|
fin->read((char*)&a_[i][j], sizeof(a_[i][j]));
|
|
|
|
fin->read((char*)&b_[i][j], sizeof(b_[i][j]));
|
2011-06-08 19:45:08 +04:00
|
|
|
//cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
|
|
|
|
//cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|