open-source-search-engine/HashTable.cpp
2014-11-10 14:45:11 -08:00

266 lines
7.7 KiB
C++

#include "gb-include.h"
#include "HashTable.h"
HashTable::HashTable () {
m_keys = NULL;
m_vals = NULL;
m_numSlots = 0;
m_numSlotsUsed = 0;
m_doFree = true;
m_label = NULL;
}
// returns false and sets errno on error
bool HashTable::set ( int32_t initialNumTerms , char *buf , int32_t bufSize ,
char *label ) {
reset();
m_label = label;
if ( ! m_label ) m_label = "hashtablekv";
return setTableSize ( initialNumTerms , buf , bufSize );
}
HashTable::~HashTable ( ) { reset ( ); }
// . call clean() to do a more careful reset
// . clean will rehash
void HashTable::reset ( ) {
if ( m_doFree ) {
if (m_keys) mfree(m_keys,m_numSlots*sizeof(int32_t),"hashtablev");
if (m_vals) mfree(m_vals,m_numSlots*sizeof(int32_t),"hashtablev");
}
m_keys = NULL;
m_vals = NULL;
m_numSlots = 0;
m_numSlotsUsed = 0;
// do not do this because then ::load() fails b/c you can't
// pass a label into that yet
//m_label = NULL;
}
void HashTable::clear ( ) {
// vacate all slots
if ( m_keys ) memset ( m_keys , 0 , sizeof(int32_t) * m_numSlots );
m_numSlotsUsed = 0;
}
// . returns the slot number for "key"
// . returns -1 if key not in hash table
int32_t HashTable::getOccupiedSlotNum ( int32_t key ) {
if ( m_numSlots <= 0 ) return -1;
//int32_t n = ((uint32_t)key) % ((uint32_t)m_numSlots);
int32_t n = ((uint32_t)key) & m_mask;
int32_t count = 0;
while ( count++ < m_numSlots ) {
if ( m_keys [ n ] == 0 ) return -1;
if ( m_keys [ n ] == key ) return n;
if ( ++n == m_numSlots ) n = 0;
}
log("hashtable: Could not get key. Table is full.");
return -1;
}
// return 0 if key not in hash table
int32_t HashTable::getValue ( int32_t key ) {
// returns -1 if key not in hash table
int32_t n = getOccupiedSlotNum ( key );
if ( n < 0 ) return 0;
return m_vals[n];
}
// . returns false and sets errno on error, returns true otherwise
// . adds scores if termId already exists in table
bool HashTable::addKey ( int32_t key , int32_t value , int32_t *slot ) {
// keys of 0 mean empty! they are reserved... fix that!
if ( key == 0 ) { char *xx=NULL; *xx=0; }
// check to see if we should grow the table
if ( 100 * m_numSlotsUsed >= m_numSlots * 90 ) {
int32_t growTo = (m_numSlots * 120 ) / 100 + 20;
if ( ! setTableSize ( growTo , NULL , 0 ) ) return false;
}
//int32_t n = ((uint32_t)key) % ((uint32_t)m_numSlots);
int32_t n = ((uint32_t)key) & m_mask;
int32_t count = 0;
while ( count++ < m_numSlots ) {
if ( m_keys [ n ] == 0 ) break;
if ( m_keys [ n ] == key ) break;
if ( ++n == m_numSlots ) n = 0;
}
// bail if not found
if ( count >= m_numSlots ) {
g_errno = ENOMEM;
return log("hashtable: Could not add key. Table is full.");
}
if ( m_keys [ n ] == 0 ) {
// inc count if we're the first
m_numSlotsUsed++;
// and store the ky
m_keys [ n ] = key;
}
// insert the value for this key
m_vals [ n ] = value;
if ( slot ) *slot = n;
return true;
}
// patch the hole so chaining still works
bool HashTable::removeKey ( int32_t key ) {
// returns -1 if key not in hash table
int32_t n = getOccupiedSlotNum(key);
if ( n < 0 ) return true;
m_keys[n] = 0;
m_numSlotsUsed--;
if ( ++n >= m_numSlots ) n = 0;
// keep looping until we hit an empty slot
int32_t val;
while ( m_keys[n] ) {
key = m_keys[n];
val = m_vals[n];
m_keys[n] = 0;
m_numSlotsUsed--;
addKey ( key , val );
if ( ++n >= m_numSlots ) n = 0;
}
return true;
}
// patch the hole so chaining still works
void HashTable::removeSlot ( int32_t n ) {
// returns -1 if key not in hash table
//int32_t n = getOccupiedSlotNum(key);
//if ( n < 0 ) return true;
int32_t key = m_keys[n];
// sanity check, must not be empty
if ( key == 0 ) { char *xx = NULL; *xx = 0; }
// delete it
m_keys[n] = 0;
m_numSlotsUsed--;
if ( ++n >= m_numSlots ) n = 0;
// keep looping until we hit an empty slot
int32_t val;
while ( m_keys[n] ) {
key = m_keys[n];
val = m_vals[n];
m_keys[n] = 0;
m_numSlotsUsed--;
addKey ( key , val );
if ( ++n >= m_numSlots ) n = 0;
}
}
// . set table size to "n" slots
// . rehashes the termId/score pairs into new table
// . returns false and sets errno on error
bool HashTable::setTableSize ( int32_t oldn , char *buf , int32_t bufSize ) {
// don't change size if we do not need to
if ( oldn == m_numSlots ) return true;
// make it a power of 2
int32_t n = getHighestLitBitValue ( (uint32_t)oldn * 2 - 1 );
// do not go negative on me
if ( oldn == 0 ) n = 0;
// sanity check
if ( n < oldn ) { char *xx = NULL; *xx = 0; }
// do we have a buf?
int32_t need = 2 * n * sizeof(int32_t);
// sanity check, buf should also meet what we need
if ( buf && bufSize < need ) { char *xx = NULL; *xx = 0; }
// set the buf
int32_t *newKeys ;
int32_t *newVals ;
// if we should not free note that
bool savedDoFree = m_doFree ;
// use our buf if we can
if ( buf ) {
m_doFree = false;
bzero ( buf , need );
newKeys = (int32_t *)buf;
buf += n * sizeof(int32_t);
newVals = (int32_t *)buf;
buf += n * sizeof(int32_t);
}
else {
m_doFree = true;
char *label = m_label;
if ( ! label ) label = "hashtablev";
newKeys = (int32_t *)mcalloc ( n * sizeof(int32_t) , label);
if ( ! newKeys ) return false;
newVals = (int32_t *)mmalloc ( n * sizeof(int32_t) , label);
if ( ! newVals ) {
mfree ( newKeys , n * sizeof(int32_t) , label );
return false;
}
}
// rehash the slots if we had some
if ( m_keys ) {
for ( int32_t i = 0 ; i < m_numSlots ; i++ ) {
// skip the empty slots
if ( m_keys [ i ] == 0 ) continue;
// get the new slot # for this slot (might be the same)
//int32_tnum=((uint32_t)m_keys[i])%((uint32_t)n)
int32_t num=((uint32_t)m_keys[i])&
((uint32_t)(n-1));
// if that is occupied, go down
while ( newKeys[num] ) if ( ++num >= n ) num = 0;
// move the slotPtr/key/size to this new slot
newKeys [ num ] = m_keys [ i ];
newVals [ num ] = m_vals [ i ];
}
}
// free the old guys
if ( m_keys && savedDoFree ) {
mfree ( m_keys , m_numSlots * sizeof(int32_t) , "hashtablev" );
mfree ( m_vals , m_numSlots * sizeof(int32_t) , "hashtablev" );
}
// assign the new slots, m_numSlotsUsed should be the same
m_keys = newKeys;
m_vals = newVals;
m_numSlots = n;
m_mask = n - 1;
return true;
}
// both return false and set g_errno on error, true otherwise
bool HashTable::load ( char *dir , char *filename ) {
reset();
File f;
f.set ( dir , filename );
if ( ! f.doesExist() ) return true;
log(LOG_INFO,"admin: Loading hashtable from %s%s",dir,filename);
if ( ! f.open ( O_RDONLY) ) return false;
int32_t numSlots;
int32_t numSlotsUsed;
int32_t off = 0;
if ( ! f.read ( &numSlots , 4 , off ) ) return false;
off += 4;
if ( ! f.read ( &numSlotsUsed , 4 , off ) ) return false;
off += 4;
if ( ! setTableSize ( numSlots , NULL , 0 ) ) return false;
if ( ! f.read ( m_keys , numSlots * 4 , off ) ) return false;
off += numSlots * 4;
if ( ! f.read ( m_vals , numSlots * 4 , off ) ) return false;
off += numSlots * 4;
m_numSlotsUsed = numSlotsUsed;
f.close();
return true;
}
bool HashTable::save ( char *dir , char *filename ) {
File f;
f.set ( dir , filename );
log(LOG_INFO,"admin: Saving hashtable from %s%s",dir,filename);
if ( ! f.open ( O_RDWR | O_CREAT ) ) return false;
int32_t numSlots = m_numSlots;
int32_t numSlotsUsed = m_numSlotsUsed;
int32_t off = 0;
if ( ! f.write ( &numSlots , 4 , off ) ) return false;
off += 4;
if ( ! f.write ( &numSlotsUsed , 4 , off ) ) return false;
off += 4;
if ( ! f.write ( m_keys , numSlots * 4 , off ) ) return false;
off += numSlots * 4;
if ( ! f.write ( m_vals , numSlots * 4 , off ) ) return false;
off += numSlots * 4;
f.close();
return true;
}