open-source-search-engine/HashTableX.h

533 lines
15 KiB
C
Raw Normal View History

2013-08-03 00:12:24 +04:00
// Matt Wells, Copyright, Dec. 2002
// . generic hash table class
#ifndef _HASHTABLEX_H_
#define _HASHTABLEX_H_
#include "SafeBuf.h"
2013-08-03 00:12:24 +04:00
class HashTableX {
public:
2014-11-11 01:45:11 +03:00
bool set ( int32_t keySize ,
int32_t dataSize ,
int32_t initialNumSlots , // = 0 ,
2013-08-03 00:12:24 +04:00
char *buf , // = NULL ,
2014-11-11 01:45:11 +03:00
int32_t bufSize , // = 0 ,
2013-08-03 00:12:24 +04:00
bool allowDups , // = false ,
2014-11-11 01:45:11 +03:00
int32_t niceness , // = MAX_NICENESS ,
2013-09-25 21:58:03 +04:00
char *allocName ,
bool useKeyMagic = false );
2013-08-03 00:12:24 +04:00
// key size is 0 if UNinitialized
// true once the table has a non-zero key size (m_ks)
bool isInitialized ( ) {
	if ( m_ks == 0 ) return false;
	return true;
}
2013-08-03 00:12:24 +04:00
HashTableX ( );
~HashTableX ( );
void constructor ();
void destructor ();
// . add key/value entry to hash table
// . will grow hash table if it needs to
// . returns false and sets g_errno on error, returns true otherwise
2014-11-11 01:45:11 +03:00
bool addKey ( void *key , void *value , int32_t *slot = NULL );
2013-08-03 00:12:24 +04:00
// for value-less hashtables
bool addKey ( void *key );
// . remove key/value entry from hash table.
// . returns false and sets g_errno on error.
bool removeKey ( void *key );
// same as remove
2014-11-11 01:45:11 +03:00
bool deleteSlot ( int32_t n ) { return removeSlot(n); };
2013-08-03 00:12:24 +04:00
// like removeKey. returns false and sets g_errno on error.
2014-11-11 01:45:11 +03:00
bool removeSlot ( int32_t n );
2013-08-03 00:12:24 +04:00
// see how optimal the hashtable is
2014-11-11 01:45:11 +03:00
int32_t getLongestString ();
2013-08-03 00:12:24 +04:00
// how many keys are dups
2014-11-11 01:45:11 +03:00
int32_t getNumDups();
2013-08-03 00:12:24 +04:00
2014-09-21 20:26:13 +04:00
// if we are in a thread, do not allow the table to grow
// clears the m_allowGrowth flag
void setNonGrow ( ) {
	m_allowGrowth = false;
}
bool m_allowGrowth;
2014-11-11 01:45:11 +03:00
bool addFloat ( int32_t *wid , float score ) {
int32_t slot = getSlot ( wid );
2013-08-03 00:12:24 +04:00
if ( slot<0 ) return addKey( wid ,&score,&slot);
float *val = (float *)getValueFromSlot ( slot );
*val = *val + score;
return true;
};
// a replacement for TermTable.cpp
2014-11-11 01:45:11 +03:00
bool addTerm ( int64_t *wid , int32_t score = 1 ) {
int32_t slot = getSlot ( wid );
2013-08-03 00:12:24 +04:00
if ( slot<0 ) return addKey( wid ,&score,&slot);
uint32_t *val = (uint32_t *)getValueFromSlot ( slot );
// overflow check
if ( *val + (uint32_t)score < *val ) *val = 0xffffffff;
else *val = *val + score;
return true;
};
bool addTerm64 ( char *str ) {
uint64_t wid64 = hash64n ( str );
2014-10-30 22:36:39 +03:00
return addTerm64 ( (int64_t *)&wid64 );
2013-08-03 00:12:24 +04:00
};
2014-11-11 01:45:11 +03:00
bool addTerm64 ( int64_t *wid , int32_t score = 1 ) {
2013-08-03 00:12:24 +04:00
return addTerm(wid,score); }
// a replacement for TermTable.cpp
2014-10-30 22:36:39 +03:00
uint32_t getScore ( int64_t *wid ) {
2014-11-11 01:45:11 +03:00
int32_t slot = getSlot ( wid );
2013-08-03 00:12:24 +04:00
if ( slot < 0 ) return 0;
return *(uint32_t *)getValueFromSlot ( slot );
};
// a replacement for TermTable.cpp
2014-11-11 01:45:11 +03:00
uint32_t getScoreFromSlot ( int32_t slot ) {
2013-08-03 00:12:24 +04:00
return *(uint32_t *)getValueFromSlot ( slot ); };
2014-11-11 01:45:11 +03:00
uint64_t getScore64FromSlot ( int32_t slot ) {
2013-08-03 00:12:24 +04:00
return *(uint64_t *)getValueFromSlot ( slot ); };
// hashes "str" with hash32n() and adds the resulting 32-bit key through
// the uint32_t pointer overload of addTerm32() using its default score
bool addTerm32 ( char *str ) {
	uint32_t termHash = hash32n ( str );
	uint32_t *keyPtr  = &termHash;
	return addTerm32 ( keyPtr );
}
2014-11-11 01:45:11 +03:00
bool addTerm32 ( int32_t *wid , int32_t score = 1 ) {
int32_t slot = getSlot ( wid );
2013-08-03 00:12:24 +04:00
if ( slot<0 ) return addKey( wid ,&score,&slot);
uint32_t *val = (uint32_t *)getValueFromSlot ( slot );
// overflow check
if ( *val + (uint32_t)score < *val ) *val = 0xffffffff;
else *val = *val + score;
return true;
};
2014-11-11 01:45:11 +03:00
//bool addTerm32 ( uint32_t *wid , int32_t score = 1 ) {
// int32_t slot = getSlot ( wid );
// if ( slot<0 ) return addKey( wid ,&score,&slot);
// uint32_t *val = (uint32_t *)getValueFromSlot ( slot );
// // overflow check
// if ( *val + (uint32_t)score < *val ) *val = 0xffffffff;
// else *val = *val + score;
// return true;
//};
bool addTerm32 ( uint32_t *wid , int32_t score = 1 ) {
int32_t slot = getSlot ( wid );
2013-08-03 00:12:24 +04:00
if ( slot<0 ) return addKey( wid ,&score,&slot);
uint32_t *val = (uint32_t *)getValueFromSlot ( slot );
// overflow check
if ( *val + (uint32_t)score < *val ) *val = 0xffffffff;
else *val = *val + score;
return true;
};
2014-11-11 01:45:11 +03:00
// alias for the int32_t pointer overload of addTerm32()
bool addScore ( int32_t *key , int32_t score = 1 ) {
	const bool ok = addTerm32 ( key , score );
	return ok;
}
2014-11-11 01:45:11 +03:00
uint32_t getScore32 ( int32_t *wid ) {
int32_t slot = getSlot ( wid );
2013-08-03 00:12:24 +04:00
if ( slot < 0 ) return 0;
return *(uint32_t *)getValueFromSlot ( slot );
};
2014-11-11 01:45:11 +03:00
// returns the unsigned 32-bit value stored under the key at "wid",
// or 0 when getSlot() cannot find the key
uint32_t getScore32 ( uint32_t *wid ) {
	int32_t slot = getSlot ( wid );
	if ( slot >= 0 ) {
		uint32_t *tally = (uint32_t *)getValueFromSlot ( slot );
		return *tally;
	}
	return 0;
}
2014-11-11 01:45:11 +03:00
// . adds "score" to the 32-bit tally stored under the 18-byte key "*kp"
// . a new key gets "score" as its initial tally; an existing tally is
//   pinned at 0xffffffff if the addition would wrap
// . may grow the table first. returns false if setTableSize() fails,
//   true otherwise
// . if the probe visits every slot without finding a free or matching
//   one, it logs and forces a crash through a NULL write
bool addTerm144 ( key144_t *kp , int32_t score = 1 ) {
	// grow when the table has fewer than 20 slots or is at least a
	// quarter full, as long as it is still below m_maxSlots. the
	// multiply is done in 64 bits so it cannot overflow an int32_t.
	if ( (m_numSlots < 20 || 4LL * m_numSlotsUsed >= m_numSlots) &&
	     m_numSlots < m_maxSlots ) {
		int64_t growTo ;
		// 1.5x plus 20, clamped to the configured maximum
		growTo = ((int64_t)m_numSlots * 150LL )/100LL+20LL;
		if ( growTo > m_maxSlots ) growTo = m_maxSlots;
		if ( ! setTableSize ( (int32_t)growTo , NULL , 0 ) )
			return false;
	}
	// hash it up
	int32_t n = hash32 ( (char *)kp, 18 );
	// then mask it
	n &= m_mask;
	int32_t count = 0;
	while ( count++ < m_numSlots ) {
		// a zero flag marks an empty slot: claim it for this key
		if ( m_flags [ n ] == 0 ) {
			gbmemcpy( &((key144_t *)m_keys)[n] ,kp,18);
			// store the whole 32-bit score. m_vals is a char
			// array, so assigning to m_vals[n*m_ds] directly
			// would write only the low byte while the update
			// path below reads all four.
			*(int32_t *)&m_vals[n*m_ds] = score;
			m_flags[n] = 1;
			m_numSlotsUsed++;
			return true;
		}
		// same key already here: accumulate into its tally
		if (((key144_t *)m_keys)[n] == *kp) {
			uint32_t *val = (uint32_t *)&m_vals[n*m_ds];
			// overflow check
			if ( *val + (uint32_t)score < *val )
				*val = 0xffffffff;
			else
				*val = *val + score;
			return true;
		}
		// advance otherwise, wrapping at the end of the table
		if ( ++n == m_numSlots ) n = 0;
	}
	// crazy!
	log("hash: table is full!");
	char *xx=NULL;*xx=0;
	return true;
}
// return 32-bit checksum of keys in table
2014-11-11 01:45:11 +03:00
int32_t getKeyChecksum32 ();
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
// . looks up the 18-byte key "*kp" by linear probing from its hash32() slot
// . returns the slot number holding the key, or -1 if the table has no
//   slots, an empty slot is hit first, or every slot was probed
// . forces a crash through a NULL write if the table's key size is not 18
int32_t getSlot144 ( key144_t *kp ) {
	// a table with no slots holds nothing
	if ( m_numSlots <= 0 ) return -1;
	// sanity check
	if ( m_ks != 18 ) { char *xx=NULL;*xx=0; }
	key144_t *keys = (key144_t *)m_keys;
	// hash all 18 key bytes, then mask down to a slot number
	int32_t slot = hash32 ( (char *)kp, 18 );
	slot &= m_mask;
	for ( int32_t probes = 0 ; probes < m_numSlots ; probes++ ) {
		// a zero flag marks an empty slot, which ends the chain
		if ( m_flags [ slot ] == 0 ) return -1;
		if ( keys[slot] == *kp ) return slot;
		// advance, wrapping at the end of the table
		if ( ++slot == m_numSlots ) slot = 0;
	}
	return -1;
}
// . used by ../english/Bits.h to store stop words, abbr's, ...
// . returns the score for this termId (0 means empty usually)
// . return 0 if key not in hash table
void *getValue ( void *key ) {
// make it fast
2014-11-11 01:45:11 +03:00
if ( m_ks == 4 ) return getValue32 ( *(int32_t *)key );
2014-10-30 22:36:39 +03:00
if ( m_ks == 8 ) return getValue64 ( *(int64_t *)key );
2013-08-03 00:12:24 +04:00
// returns -1 if key not in hash table
2014-11-11 01:45:11 +03:00
int32_t n = getOccupiedSlotNum ( key );
2013-08-03 00:12:24 +04:00
if ( n < 0 ) return NULL;
return &m_vals[n*m_ds];
};
2014-11-11 01:45:11 +03:00
int32_t getSlot32 ( int32_t key ) {
2013-08-03 00:12:24 +04:00
// return NULL if completely empty
if ( m_numSlots <= 0 ) return -1;
// sanity check
if ( m_ks != 4 ) { char *xx=NULL;*xx=0; }
2014-11-11 01:45:11 +03:00
int32_t n;
2013-09-25 21:58:03 +04:00
if ( ! m_useKeyMagic ) {
// mask on the lower 32 bits i guess
n = key & m_mask;
}
else {
// get lower 32 bits of key
2014-11-11 01:45:11 +03:00
n =*(uint32_t *)(((char *)&key) +m_maskKeyOffset);
2013-09-25 21:58:03 +04:00
// use magic to "randomize" key a little
n^=g_hashtab[(unsigned char)((char *)&key)[m_maskKeyOffset]][0];
2013-09-25 21:58:03 +04:00
// mask on the lower 32 bits i guess
n &= m_mask;
}
2014-11-11 01:45:11 +03:00
int32_t count = 0;
2013-08-03 00:12:24 +04:00
while ( count++ < m_numSlots ) {
// this is set to 0x01 if non-empty
if ( m_flags [ n ] == 0 ) return -1;
// get the key there
2014-11-11 01:45:11 +03:00
if (((int32_t *)m_keys)[n] == key)
2013-08-03 00:12:24 +04:00
return n;
// advance otherwise
if ( ++n == m_numSlots ) n = 0;
}
return -1;
};
// . specialized for 32-bit keys for speed
// . returns NULL if not in table
2014-11-11 01:45:11 +03:00
void *getValue32 ( int32_t key ) {
2013-08-03 00:12:24 +04:00
// return NULL if completely empty
if ( m_numSlots <= 0 ) return NULL;
// sanity check
if ( m_ks != 4 ) { char *xx=NULL;*xx=0; }
2014-11-11 01:45:11 +03:00
int32_t n;
2013-09-25 21:58:03 +04:00
if ( ! m_useKeyMagic ) {
// mask on the lower 32 bits i guess
n = key & m_mask;
}
else {
// get lower 32 bits of key
2014-11-11 01:45:11 +03:00
//n = (uint32_t)key;
n =*(uint32_t *)(((char *)&key) +m_maskKeyOffset);
2013-09-25 21:58:03 +04:00
// use magic to "randomize" key a little
n^=g_hashtab[(unsigned char)((char *)&key)[m_maskKeyOffset]][0];
2013-09-25 21:58:03 +04:00
// mask on the lower 32 bits i guess
n &= m_mask;
}
2014-11-11 01:45:11 +03:00
int32_t count = 0;
2013-08-03 00:12:24 +04:00
while ( count++ < m_numSlots ) {
// this is set to 0x01 if non-empty
if ( m_flags [ n ] == 0 ) return NULL;
// get the key there
2014-11-11 01:45:11 +03:00
if (((int32_t *)m_keys)[n] == key)
2013-08-03 00:12:24 +04:00
return &m_vals[n*m_ds];
// advance otherwise
if ( ++n == m_numSlots ) n = 0;
}
return NULL;
};
// . specialized for 64-bit keys for speed
// . returns NULL if not in table
2014-10-30 22:36:39 +03:00
void *getValue64 ( int64_t key ) {
2013-08-03 00:12:24 +04:00
// return NULL if completely empty
if ( m_numSlots <= 0 ) return NULL;
// sanity check
if ( m_ks != 8 ) { char *xx=NULL;*xx=0; }
2014-11-11 01:45:11 +03:00
int32_t n;
2013-09-25 21:58:03 +04:00
if ( ! m_useKeyMagic ) {
// mask on the lower 32 bits i guess
// get lower 32 bits of key
n = key & m_mask;
}
else {
// use magic to "randomize" key a little
2014-11-11 01:45:11 +03:00
n =*(uint32_t *)(((char *)&key) +m_maskKeyOffset);
n ^= g_hashtab[(unsigned char)((char *)&key)[m_maskKeyOffset]][0];
2013-09-25 21:58:03 +04:00
// mask on the lower 32 bits i guess
n &= m_mask;
}
2014-11-11 01:45:11 +03:00
int32_t count = 0;
2013-08-03 00:12:24 +04:00
while ( count++ < m_numSlots ) {
// this is set to 0x01 if non-empty
if ( m_flags [ n ] == 0 ) return NULL;
// get the key there
2014-10-30 22:36:39 +03:00
if (((int64_t *)m_keys)[n] == key)
2013-08-03 00:12:24 +04:00
return &m_vals[n*m_ds];
// advance otherwise
if ( ++n == m_numSlots ) n = 0;
}
return NULL;
};
// value of 0 means empty
bool isEmpty ( void *key ) { return (getSlot(key) < 0); };
bool isInTable ( void *key ) { return (getSlot(key) >= 0); };
2014-11-11 01:45:11 +03:00
bool isEmpty ( int32_t n ) { return (m_flags[n] == 0); };
2013-08-03 00:12:24 +04:00
bool isTableEmpty ( ) { return (m_numSlotsUsed == 0); };
2014-11-11 01:45:11 +03:00
void *getKey ( int32_t n ) { return m_keys + n * m_ks; };
void *getKeyFromSlot ( int32_t n ) { return m_keys + n * m_ks; };
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
int64_t getKey64FromSlot ( int32_t n ) {
2014-10-30 22:36:39 +03:00
return *(int64_t *)(m_keys+n*m_ks); }
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
// reads the first 4 bytes of the key in slot "n" as an int32_t
// (no occupancy or key-size check is made here)
int32_t getKey32FromSlot ( int32_t n ) {
	int32_t *keyPtr = (int32_t *)(m_keys + n * m_ks);
	return *keyPtr;
}
2014-06-20 23:28:50 +04:00
2014-11-11 01:45:11 +03:00
// public wrapper around the private getOccupiedSlotNum(); returns
// whatever slot number it reports for "key"
int32_t getSlot ( void *key ) {
	int32_t slot = getOccupiedSlotNum ( key );
	return slot;
}
2013-08-03 00:12:24 +04:00
// . specialized for 64-bit keys for speed
// . returns -1 if not in table
2014-11-11 01:45:11 +03:00
int32_t getSlot64 ( int64_t *key ) {
2013-08-03 00:12:24 +04:00
// return NULL if completely empty
if ( m_numSlots <= 0 ) return -1;
// sanity check
if ( m_ks != 8 ) { char *xx=NULL;*xx=0; }
2014-11-11 01:45:11 +03:00
int32_t n;
2013-09-25 21:58:03 +04:00
if ( ! m_useKeyMagic ) {
// mask on the lower 32 bits i guess
n = *key & m_mask;
}
else {
// use magic to "randomize" key a little
2014-11-11 01:45:11 +03:00
n =*(uint32_t *)(((char *)&key) +m_maskKeyOffset);
n ^= g_hashtab[(unsigned char)((char *)key)[m_maskKeyOffset]][0];
2013-09-25 21:58:03 +04:00
// mask on the lower 32 bits i guess
n &= m_mask;
}
2014-11-11 01:45:11 +03:00
int32_t count = 0;
2013-08-03 00:12:24 +04:00
while ( count++ < m_numSlots ) {
// this is set to 0x01 if non-empty
if ( m_flags [ n ] == 0 ) return -1;
// get the key there
2014-10-30 22:36:39 +03:00
if (((int64_t *)m_keys)[n] == *key)
2013-08-03 00:12:24 +04:00
return n;
// advance otherwise
if ( ++n == m_numSlots ) n = 0;
}
return -1;
};
2014-11-11 01:45:11 +03:00
int32_t getNextSlot ( int32_t slot , void *key );
2013-08-03 00:12:24 +04:00
// count how many slots have this key
2014-11-11 01:45:11 +03:00
int32_t getCount ( void *key );
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
// . overwrites the value in slot "n" with the m_ds bytes at "val"
// . 4- and 8-byte value sizes use a single typed store; any other size
//   is copied with gbmemcpy()
void setValue ( int32_t n , void *val ) {
	switch ( m_ds ) {
	case 4:
		((int32_t *)m_vals)[n] = *(int32_t *)val;
		break;
	case 8:
		((int64_t *)m_vals)[n] = *(int64_t *)val;
		break;
	default:
		gbmemcpy(m_vals+n*m_ds,val,m_ds);
		break;
	}
}
2014-11-11 01:45:11 +03:00
// address of the value stored in slot "n" (no occupancy check).
// getValFromSlot() and getDataFromSlot() are aliases.
void *getValueFromSlot ( int32_t n ) {
	char *valPtr = m_vals + n * m_ds;
	return valPtr;
}
void *getValFromSlot  ( int32_t n ) { return getValueFromSlot ( n ); }
void *getDataFromSlot ( int32_t n ) { return getValueFromSlot ( n ); }

// the value in slot "n" read as an int32_t.
// getValue32FromSlot() is an alias.
int32_t getVal32FromSlot ( int32_t n ) {
	int32_t *valPtr = (int32_t *)(m_vals + n * m_ds);
	return *valPtr;
}
int32_t getValue32FromSlot ( int32_t n ) { return getVal32FromSlot ( n ); }
2014-06-20 23:28:50 +04:00
2013-08-03 00:12:24 +04:00
// frees the used memory, etc.
void reset ( );
// removes all key/value pairs from hash table, vacates all slots
void clear ( );
// how many are occupied?
2014-11-11 01:45:11 +03:00
int32_t getNumSlotsUsed ( ) { return m_numSlotsUsed; };
int32_t getNumUsedSlots ( ) { return m_numSlotsUsed; };
2013-08-03 00:12:24 +04:00
bool isEmpty() {
if ( m_numSlotsUsed == 0 ) return true;
return false; };
2013-08-03 00:12:24 +04:00
// how many are there total? used and unused.
2014-11-11 01:45:11 +03:00
int32_t getNumSlots ( ) { return m_numSlots; };
2013-08-03 00:12:24 +04:00
// how many bytes are required to serialize this hash table?
2014-11-11 01:45:11 +03:00
int32_t getStoredSize();
2013-08-03 00:12:24 +04:00
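// returns the buffer we allocated and serialized into. on error g_errno
// is set. NOTE(review): this comment used to say "return -1 on error",
// but the return type is a pointer -- presumably NULL; confirm against
// the implementation.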
2014-11-11 01:45:11 +03:00
char *serialize ( int32_t *bufSize ) ;
// shortcut
int32_t serialize ( class SafeBuf *sb );
2013-08-03 00:12:24 +04:00
// returns # bytes written into "buf"
2014-11-11 01:45:11 +03:00
int32_t serialize ( char *buf , int32_t bufSize );
2013-08-03 00:12:24 +04:00
// inflate it. returns false with g_errno set on error
2014-11-11 01:45:11 +03:00
bool deserialize ( char *buf , int32_t bufSize , int32_t niceness );
2013-08-03 00:12:24 +04:00
// both return false and set g_errno on error, true otherwise
bool load ( char *dir , char *filename ,
2014-11-11 01:45:11 +03:00
char **tbuf = NULL , int32_t *tsize = NULL );
2013-08-03 00:12:24 +04:00
bool save ( char *dir , char *filename ,
2014-11-11 01:45:11 +03:00
char *tbuf = NULL , int32_t tsize = 0);
2013-08-03 00:12:24 +04:00
bool save ( char *dir , char *filename , SafeBuf *tbuf ) {
return save ( dir,
filename,
tbuf->getBufStart(),
tbuf->length());
};
bool load ( char *dir , char *filename , SafeBuf *fillBuf );
// thread based save
bool fastSave ( bool useThread ,
char *dir ,
char *filename ,
char *tbuf ,
2014-11-11 01:45:11 +03:00
int32_t tsize ,
2013-08-03 00:12:24 +04:00
void *state ,
void (* callback)(void *state) );
2014-11-11 01:45:11 +03:00
bool setTableSize ( int32_t numSlots , char *buf , int32_t bufSize );
2013-08-03 00:12:24 +04:00
2015-07-03 02:42:05 +03:00
// print as text into sb for debugging
void print ( class SafeBuf *sb );
2013-08-03 00:12:24 +04:00
// toggle the m_isWritable flag
void disableWrites ( ) {
	m_isWritable = false;
}
void enableWrites ( ) {
	m_isWritable = true;
}
bool m_isWritable;
private:
2014-11-11 01:45:11 +03:00
int32_t getOccupiedSlotNum ( void *key ) ;
2013-08-03 00:12:24 +04:00
public:
// . the array of buckets in which we store the terms
// . scores are allowed to exceed 8 bits for weighting purposes
char *m_keys;
char *m_vals;
char *m_flags;
2014-11-11 01:45:11 +03:00
int32_t m_numSlots;
int32_t m_numSlotsUsed;
2013-08-03 00:12:24 +04:00
uint32_t m_mask;
char m_doFree;
char *m_buf;
2014-11-11 01:45:11 +03:00
int32_t m_bufSize;
2013-08-03 00:12:24 +04:00
char m_useKeyMagic;
2014-11-11 01:45:11 +03:00
int32_t m_ks;
int32_t m_ds;
2013-08-03 00:12:24 +04:00
char m_allowDups;
2014-11-11 01:45:11 +03:00
int32_t m_niceness;
2013-08-03 00:12:24 +04:00
// a flag used by XmlDoc.cpp
bool m_addIffNotUnique;
bool m_isSaving;
bool m_needsSave;
char m_dir[100];
char m_filename[64];
void *m_state ;
void (* m_callback) ( void *state);
char *m_tbuf ;
2014-11-11 01:45:11 +03:00
int32_t m_tsize ;
2013-08-03 00:12:24 +04:00
// limits growing to this # of slots total
2014-10-30 22:36:39 +03:00
int64_t m_maxSlots;
2013-08-03 00:12:24 +04:00
char *m_allocName;
2014-11-11 01:45:11 +03:00
int32_t m_maskKeyOffset;
2013-08-03 00:12:24 +04:00
// the addon buf used by SOME hashtables. data that the ptrs
// in the table itself reference.
char *m_txtBuf;
2014-11-11 01:45:11 +03:00
int32_t m_txtBufSize;
2013-08-03 00:12:24 +04:00
};
#endif