open-source-search-engine/Clusterdb.cpp
mwells 87285ba3cd use gbmemcpy not memcpy so we can get profiler working again
since memcpy can't be interrupted and backtrace() called.
2015-01-13 12:25:42 -07:00

868 lines
27 KiB
C++

#include "gb-include.h"
#include "Clusterdb.h"
#include "Threads.h"
#include "Rebalance.h"
// a global class extern'd in .h file
Clusterdb g_clusterdb;
// secondary instance; init2() below sets it up with the "clusterdbRebuild"
// rdb — presumably used by the repair/rebuild path (PageRepair.cpp), per the
// comment on init2(); confirm against callers
Clusterdb g_clusterdb2;
/*
// for making the cluster cache key
static key_t makeClusterCacheKey ( uint32_t vfd,
uint32_t pageNum ) {
key_t key;
key.n1 = vfd + 1;
key.n0 = (uint64_t)pageNum + 1;
return key;
}
// DiskPageCache override functions
static void clusterGetPages ( DiskPageCache *pc,
int32_t vfd,
char *buf,
int32_t numBytes,
int64_t offset,
int32_t *newNumBytes,
int64_t *newOffset ) {
bool cacheMiss = false;
// return new disk offset, assume unchanged
*newOffset = offset;
*newNumBytes = numBytes;
// what is the page range?
int64_t sp = offset / GB_PAGE_SIZE ;
int64_t ep = (offset + (numBytes-1)) / GB_PAGE_SIZE ;
// setup the cache list
RdbList cacheList;
key_t startKey;
startKey.n1 = 0;
startKey.n0 = 0;
// point to the buffer to fill
char *bufPtr = buf;
char *bufEnd = buf + numBytes;
// read in the pages
while ( sp <= ep && bufPtr < bufEnd ) {
cacheList.reset();
// get the cache key for the page
key_t cacheKey = makeClusterCacheKey ( vfd, sp );
// read in the list from cache
collnum_t collnum = 0;
g_clusterdb.getRdb()->m_cache.getList ( collnum,
(char*)&cacheKey,
(char*)&startKey,
&cacheList,
false,
3600*24*265,
true );
//cacheList.checkList_r ( false, true );
//log ( LOG_INFO, "cache: got list [%"INT32", %"INT64"] [%"INT32"]",
// vfd, sp, cacheList.m_listSize );
int32_t size = cacheList.m_listSize;
if ( size == 0 ) {
cacheMiss = true;
goto getPagesEnd;
}
//log ( LOG_INFO, "cache: got list [%"INT32", %"INT32"] [%"INT32"]",
// vfd, sp, size );
if ( bufPtr + size >= bufEnd )
size = bufEnd - bufPtr;
// copy the list into the buffer
gbmemcpy ( bufPtr, cacheList.m_list, size );
// advance to the next page
bufPtr += size;
*newOffset += size;
*newNumBytes -= size;
sp++;
}
getPagesEnd:
if ( !cacheMiss ) {
pc->m_hits++;
// *newNumBytes = -(*newNumBytes);
}
else
pc->m_misses++;
}
static void clusterAddPages ( DiskPageCache *pc,
int32_t vfd,
char *buf,
int32_t numBytes,
int64_t offset ) {
// make sure we have a clean vfd
if ( vfd < 0 || vfd >= MAX_NUM_VFDS2 )
return;
// make sure the file didn't get unlinked
if ( ! pc->m_memOff[vfd] )
return;
// get the number of twins, used for filtering
int32_t numTwins = g_hostdb.getNumHostsPerShard();
int32_t thisTwin = g_hostdb.m_hostId/g_hostdb.m_numShards;
// get the bias range for this twin
int64_t biasStart = ((0x0000003fffffffffLL)/(int64_t)numTwins) *
(int64_t)thisTwin;
int64_t biasEnd;
if ( thisTwin == numTwins - 1 )
biasEnd = 0x0000003fffffffffLL + 1LL;
else
biasEnd = ((0x0000003fffffffffLL)/(int64_t)numTwins) *
(int64_t)(thisTwin+1);
// get the page range
int64_t sp = offset / GB_PAGE_SIZE;
// point to it
char *bufPtr = buf;
char *bufEnd = buf + numBytes;
// how much did we exceed the boundary by?
int32_t skip = (int32_t)(offset - sp * GB_PAGE_SIZE);
int32_t size = GB_PAGE_SIZE - skip;
// setup the cache lists, may need to merge with an old list
RdbList cacheList1;
cacheList1.set ( NULL,
0,
NULL,
0,
0,
true,
true,
g_clusterdb.getRdb()->m_ks );
cacheList1.growList(GB_PAGE_SIZE);
// set the buffer data to a list so we can read it nicely
key_t startKey;
key_t endKey;
startKey.n1 = 0;
startKey.n0 = 0;
endKey.n1 = 0xffffffff;
endKey.n0 = 0xffffffffffffffffULL;
// setup our source list
RdbList dataList;
dataList.set ( bufPtr,
numBytes,
bufPtr,
numBytes,
(char*)&startKey,
(char*)&endKey,
0,
false,
true,
g_clusterdb.getRdb()->m_ks );
dataList.resetListPtr();
// add pages to the cache
while ( bufPtr < bufEnd ) {
int32_t filled = 0;
// ensure "size" is not too big
if ( bufPtr + size > bufEnd )
size = bufEnd - bufPtr;
// . add the page to the cache
cacheList1.reset();
// check the first key, if it's too large, we're all done here
key_t key = dataList.getCurrentKey();
int64_t docId = g_clusterdb.getDocId ( key );
//if ( docId >= biasEnd ) {
// log ( "clusterdb: DocId after bias end, key.n1=%"XINT32" key.n0=%"XINT64"", key.n1, key.n0 );
// log ( "clusterdb: DocId after bias end, %"XINT64" >= %"XINT64"", docId, biasEnd );
// return;
//}
// make the cache key using vfd and page number
key_t cacheKey = makeClusterCacheKey ( vfd, sp );
// filter the data into a list to be cached
while ( filled < size && !dataList.isExhausted() ) {
key = dataList.getCurrentKey();
// check the key for filtering
//int64_t docId = g_clusterdb.getDocId ( key );
//int32_t twin = hashLong((int32_t)docId) % numTwins;
//if ( twin == thisTwin ) {
// add the key to the rdb list
cacheList1.addRecord(key, 0, NULL);
//}
// next key
filled += dataList.getCurrentRecSize();
dataList.skipCurrentRecord();
}
collnum_t collnum = 0;
// if the last key is too small, don't add the page
docId = g_clusterdb.getDocId ( key );
if ( docId >= biasStart )
g_clusterdb.getRdb()->m_cache.addList ( collnum,
cacheKey,
&cacheList1 );
//else
// log ( "clusterdb: DocId before bias start, %"INT64" >= %"INT64"", docId, biasStart );
//cacheList1.checkList_r ( false, true );
//log ( LOG_INFO, "cache: add list [%"INT32", %"INT64"] [%"INT32"]",
// vfd, sp, cacheList1.m_listSize );
// advance
bufPtr += filled;
sp++;
size = GB_PAGE_SIZE;
skip = 0;
}
}
static int32_t clusterGetVfd ( DiskPageCache *pc,
int64_t maxFileSize ) {
// pick a vfd for this file, will be used in the cache key
int32_t i;
int32_t count = MAX_NUM_VFDS2;
for ( i = pc->m_nexti; count-- > 0; i++ ) {
if ( i >= MAX_NUM_VFDS2 ) i = 0;
if ( ! pc->m_memOff[i] ) break;
}
// bail if none left
if ( count == 0 ) {
g_errno = EBADENGINEER;
log ( LOG_LOGIC, "db: pagecache: clusterGetVfd: "
"no vds remaining." );
return -1;
}
// start looking here next time
pc->m_nexti = i + 1;
// set m_memOff[i] to something to hold the vfd
pc->m_memOff[i] = (int32_t*)0x7fffffff;
// return the vfd
return i;
}
static void clusterRmVfd ( DiskPageCache *pc,
int32_t vfd ) {
// make sure it's a clean vfd
if ( vfd < 0 || vfd >= MAX_NUM_VFDS2 )
return;
// clear the vfd for use
pc->m_memOff[vfd] = NULL;
// need to clear out the cache records using this vfd
collnum_t collnum = 0;
key_t startKey, endKey;
startKey.n1 = vfd + 1;
startKey.n0 = 0;
endKey.n1 = vfd + 1;
endKey.n0 = 0xffffffffffffffffULL;
g_clusterdb.getRdb()->m_cache.removeKeyRange ( collnum,
(char*)&startKey,
(char*)&endKey );
//log ( LOG_INFO, "cache: BIASED CACHE REMOVED VFD!!" );
}
*/
// reset rdb
void Clusterdb::reset() { m_rdb.reset(); }
// . this no longer maintains an rdb of cluster recs
// . Msg22 now just uses the cache to hold cluster recs that it computes
// from titleRecs
// . clusterRecs are now just TitleRec keys...
// . we can load one the same from titledb as we could from clusterdb
// and we still don't need to uncompress the titleRec to get the info
// . initialize the primary clusterdb: sizes the tree, sets up the disk page
//   cache (m_pc) and then the underlying rdb (m_rdb)
// . returns false and logs on failure
bool Clusterdb::init ( ) {
	// this should be about 200/4 = 50 megs per host on my current setup
	int32_t maxTreeMem = g_conf.m_clusterdbMaxTreeMem;
	// . what's max # of tree nodes?
	// . key+4+left+right+parents+dataPtr = 12+4 +4+4+4+4 = 32
	// . 28 bytes per record when in the tree
	int32_t maxTreeNodes = maxTreeMem / ( 16 + CLUSTER_REC_SIZE );
	// . each cached list is just one key in the tree...
	// . 28(tree space) + 24(cacheoverhead) = 52
	//int32_t maxCacheMem = g_conf.m_clusterdbMaxCacheMem ;
	// do not use any page cache if doing tmp cluster in order to
	// prevent swapping
	//int32_t pcmem = g_conf.m_clusterdbMaxDiskPageCacheMem;
	int32_t pcmem = 0;
	if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
	// we need that 100MB for termlists! they are >90MB now!!
	// NOTE(review): this unconditional assignment overrides both lines
	// above, so the tmp-cluster guard is dead and the page cache is
	// always 10MB — confirm this is intended
	pcmem = 10000000; // 10MB
	// temp hack for rebuild
	//pcmem = 0;
	// RdbCache has a 4 byte ptr to each rec in the cache
	//int32_t maxCacheNodes = maxCacheMem / ( 4 + CLUSTER_REC_SIZE );
	//int32_t nodeSize = sizeof(key_t) + sizeof(collnum_t);
	int32_t pageSize = GB_TFNDB_PAGE_SIZE;
	//int32_t nodeSize = (pageSize + 12) + sizeof(collnum_t) + 20;
	//int32_t maxCacheNodes = maxCacheMem / nodeSize ;
	// init the page cache
	if ( ! m_pc.init ( "clusterdb",
			   RDB_CLUSTERDB,
			   pcmem ,
			   pageSize ) )
		//g_conf.m_clusterdbMaxDiskPageCacheMem,
		//clusterGetPages,
		//clusterAddPages,
		//clusterGetVfd,
		//clusterRmVfd ))
		return log("db: Clusterdb init failed.");
	//bool bias = true;
	//if ( g_conf.m_fullSplit ) bias = false;
	bool bias = false;
	// initialize our own internal rdb
	return m_rdb.init ( g_hostdb.m_dir ,
			    "clusterdb" ,
			    true , // dedup
			    //CLUSTER_REC_SIZE - sizeof(key_t),//fixedDataSize
			    0 , // no data now! just docid/s/c
			    2, // g_conf.m_clusterdbMinFilesToMerge,
			    g_conf.m_clusterdbMaxTreeMem,
			    maxTreeNodes , // maxTreeNodes ,
			    true , //false , // balance tree?
			    0,//maxCacheMem ,
			    0,//maxCacheNodes ,
			    true , // half keys?
			    g_conf.m_clusterdbSaveCache,
			    &m_pc ,
			    false, // is titledb
			    true , // preload disk page cache
			    12, // key size
			    bias ); // bias disk page cache?
}
// init the rebuild/secondary rdb, used by PageRepair.cpp
// . init the rebuild/secondary rdb, used by PageRepair.cpp
// . same layout as init() but with the "clusterdbRebuild" name, the
//   caller-supplied tree memory, no page cache and no cache save/preload
// . returns false and logs on failure
bool Clusterdb::init2 ( int32_t treeMem ) {
	// . what's max # of tree nodes?
	// . key+4+left+right+parents+dataPtr = 12+4 +4+4+4+4 = 32
	// . 28 bytes per record when in the tree
	int32_t maxTreeNodes = treeMem / ( 16 + CLUSTER_REC_SIZE );
	// initialize our own internal rdb
	return m_rdb.init ( g_hostdb.m_dir ,
			    "clusterdbRebuild" ,
			    true , // dedup
			    0 , // no data now! just docid/s/c
			    50 , // m_clusterdbMinFilesToMerge,
			    treeMem , // g_conf.m_clusterdbMaxTreeMem,
			    maxTreeNodes ,
			    true , // balance tree?
			    0 , // maxCacheMem ,
			    0 , // maxCacheNodes ,
			    true , // half keys?
			    false , // g_conf.m_clusterdbSaveCache,
			    NULL , // &m_pc ,
			    false , // is titledb
			    false , // preload disk page cache
			    12 , // key size
			    true ); // bias disk page cache
}
/*
bool Clusterdb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
// verify
if ( verify(coll) ) return true;
// if not allowing scale, return false
if ( ! g_conf.m_allowScale ) return false;
// otherwise let it go
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
// . sanity-check clusterdb for collection "coll": read the first batch of
//   records and confirm they all belong to this host's shard
// . fix: the CollectionRec returned by getRec() was dereferenced without a
//   NULL check, crashing on an unknown collection name
// . returns true on success; on a shard mismatch returns
//   g_conf.m_bypassValidation so validation can be bypassed by config
bool Clusterdb::verify ( char *coll ) {
	log ( LOG_DEBUG, "db: Verifying Clusterdb for coll %s...", coll );
	// do the read synchronously
	g_threads.disableThreads();
	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	key_t startKey;
	key_t endKey;
	startKey.setMin();
	endKey.setMax();
	//int32_t minRecSizes = 64000;
	CollectionRec *cr = g_collectiondb.getRec(coll);
	// avoid a NULL dereference below if the collection is unknown
	if ( ! cr ) {
		g_threads.enableThreads();
		return log("db: Clusterdb verify: no collection record "
			   "for %s.", coll );
	}
	if ( ! msg5.getList ( RDB_CLUSTERDB ,
			      cr->m_collnum ,
			      &list ,
			      startKey ,
			      endKey ,
			      64000 , // minRecSizes ,
			      true , // includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      0 , // startFileNum ,
			      -1 , // numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false , // err correction?
			      NULL ,
			      0 ,
			      -1 ,
			      true ,
			      -1LL ,
			      &msg5b ,
			      true )) {
		g_threads.enableThreads();
		// with no callback getList() must complete synchronously
		return log("db: HEY! it did not block");
	}
	int32_t count = 0;
	int32_t got = 0;
	// tally how many positive keys hash to our shard
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		// skip negative keys (low bit clear = delete key)
		if ( (k.n0 & 0x01) == 0x00 ) continue;
		count++;
		//uint32_t groupId = getGroupId ( RDB_CLUSTERDB , &k );
		//if ( groupId == g_hostdb.m_groupId ) got++;
		uint32_t shardNum = getShardNum( RDB_CLUSTERDB , &k );
		if ( shardNum == getMyShardNum() ) got++;
	}
	if ( got != count ) {
		// tally it up
		g_rebalance.m_numForeignRecs += count - got;
		log ("db: Out of first %"INT32" records in clusterdb, "
		     "only %"INT32" belong to our group.",count,got);
		// exit if NONE, we probably got the wrong data
		if ( got == 0 ) log("db: Are you sure you have the "
				    "right "
				    "data in the right directory? "
				    "Exiting.");
		log ( "db: Exiting due to Clusterdb inconsistency." );
		g_threads.enableThreads();
		return g_conf.m_bypassValidation;
	}
	log ( LOG_DEBUG, "db: Clusterdb passed verification successfully for "
	      "%"INT32" recs.", count );
	// DONE
	g_threads.enableThreads();
	return true;
}
#include "IndexList.h"
// . this routine is very slow...
// . it is used to get a titleRec's (document's) sample vector at query time
// but we should really compute this vector at build time and store it in
// the titleRec itself, to avoid having to compute it at query time.
// . vector must have at least VECTOR_SIZE bytes available
/*
void Clusterdb::getSampleVector ( char *vec ,
class Doc *doc,
char *coll ,
int32_t collLen ,
int32_t niceness) {
int64_t startTime = gettimeofdayInMilliseconds();
TitleRec *tr = doc->getTitleRec();
SiteRec *sr = doc->getSiteRec();
//sr->set ( tr->getSite() , tr->getColl() , tr->getCollLen() ,
sr->set ( tr->getSite() , coll , collLen ,
tr->getSiteFilenum() , SITEREC_CURRENT_VERSION );
// hashes the whole doc, but more importantly for us, computes
// XmlDoc::m_vector
//doc->set ( niceness );
XmlDoc *xd = doc->getXmlDoc();
xd->set ( tr , sr , NULL, niceness);
// this just sets the vector
doc->getIndexList(NULL,true,true,false,NULL,NULL,NULL, niceness);
// log the time
int64_t took =gettimeofdayInMilliseconds()-startTime;
if ( took > 3 )
log(LOG_INFO,"query: Took %"INT64" ms to make indexlist.",took);
// so get it
char *p = doc->getSampleVector ( );
// and store it. int16_t vectors are padded with 0's.
gbmemcpy ( vec , p , SAMPLE_VECTOR_SIZE );
}
*/
// if VECTOR_SIZE is 128 bytes then that is 32 termIds (4 bytes each) that we
// use to make this vector. these 32 termids are the lowest 32 termids out of
// all the termids for the document. we can further hash pairs to reduce the
// vector size from 128 to 64 bytes. but we must hash the pair strategically.
// What are the odds of two things being 90% similar when they are not?
#define SAMPLE_VECTOR_LEN (SAMPLE_VECTOR_SIZE / 4)
// . it would be nice to use the new addition to the Words class that allows
// a word to be a tag. this kinda replaces the xml class.
// . returns false and sets g_errno on error
/*
bool Clusterdb::getGigabitVector ( char *vec , Xml *xml ) {
// . get filtered text, no link text since that is usually for menus
// . get first 64k
char buf[64*1024];
xml->getText ( buf , 64*1024 );
// hash into this table
TermTable table;
Query q;
TopicGroup t;
t.m_numTopics = 32;
t.m_maxTopics = 32;
t.m_docsToScanForTopics = 1;
t.m_minTopicScore = 0;
t.m_maxWordsPerTopic = 4;
t.m_meta[0] = '\0';
t.m_delimeter = 0;
t.m_useIdfForTopics = true;
t.m_dedup = false;
t.m_minDocCount = 1;
t.m_ipRestrict = false;
t.m_dedupSamplePercent = 0;
t.m_topicRemoveOverlaps = true;
t.m_topicSampleSize = 64*1024;
t.m_topicMaxPunctLen = 3;
State23 st;
st.m_numRequests = 1;
st->m_msg20[0].m_bufSampleBuf = buf;
st->m_msg20[0].m_bufSampleBufLen = bufLen;
st->m_returnDocIdCount = false;
st->m_returnDocIds = false;
st->m_returnPops = false;
Msg24 msg24;
if ( ! msg24.getTopics ( &st , // State24
&t ,
&table ,
&q ,
0 , // gid
&buf ,
&bufLen ) )
return false;
// now hash the winning topics into our vector
}
*/
/*
void Clusterdb::getSampleVector ( char *vec , TermTable *table ) {
// no compression is used in this list so each docId/termId is 12 bytes
int32_t numTerms = table->getNumTermsUsed();
// . how many can we hold? we'll just use 4 bytes per vector component
// . let's get 2x as many termids as required, then we will combine
// every 2 termids into one via hashing... this makes falsely high
// similarities less likely, but makes truly high similarities less
// likely to be detected as well.
int32_t maxTerms = (1 * SAMPLE_VECTOR_LEN) - 1;
// what portion of them do we want to mask out from the rest?
int32_t ratio = numTerms / maxTerms ;
unsigned char mask = 0x00;
while ( ratio >= 2 ) {
// shift the mask down, ensure hi bit is set
mask >>= 1;
mask |= 0x80;
ratio >>= 1; // /2
}
uint32_t d [ 3000 ];
// if we don't have enough, make them 0's
memset ( d , 0 , SAMPLE_VECTOR_SIZE );
memset ( vec , 0 , SAMPLE_VECTOR_SIZE );
again:
// a buffer to hold the top termIds
int32_t nd = 0;
// . buffer should have at least "maxTerms" in it
// . these should all be 12 byte keys
int32_t i = 0 ;
int32_t n = table->getNumTerms();
int64_t *termIds = table->getTermIds();
uint32_t *scores = table->getScores ();
for ( ; i < n ; i++ ) {
// skip if empty bucket
if ( ! scores[i] ) continue;
// skip if negative key, since we can be deleting old keys
// from call from Msg14.cpp
// NO! this should be the indexlist directly from Msg16, not
// after subtracting the one from Msg15
//if ( (*p & 0x01) == 0x00 ) continue;
// skip if it's not to be considered
//fprintf(stderr,"%hhu\n",p[11]);
//if ( (p[11] & mask) != 0 ) continue;
if ( ((termIds[i]>>(NUMTERMIDBITS-8)) & mask) != 0 ) continue;
// add it
//d[nd++] = *(int32_t *)(p+12-5); // last byte has del bit, etc.
d[nd] = (uint32_t)(termIds[i] >> (NUMTERMIDBITS-32));
// 0 has special meaning, it terminates the vector
if ( d[nd] == 0 ) d[nd] = 1;
if ( ++nd < 3000 ) continue;
// bitch and break out on error
log(LOG_INFO,"build: Sample vector overflow. Slight "
"performance hit.");
break;
}
// if nd was too small, don't use a mask to save time
if ( nd < maxTerms && nd < numTerms && mask ) {
// sanity check
if ( mask == 0 ) {
log (LOG_LOGIC,"build: Clusterdb sample vector mask "
"is already at 0.");
char *xx = NULL; *xx = 0;
}
// debug msg
//log("AGAIN");
//val >>= 1;
// shift the mask UP, allow more termIds to pass through
mask <<= 1;
goto again;
}
// bubble sort them
bool flag = true;
while ( flag ) {
flag = false;
for ( int32_t i = 1 ; i < nd ; i++ ) {
if ( d[i-1] <= d[i] ) continue;
uint32_t tmp = d[i-1];
d[i-1] = d[i];
d[i] = tmp;
flag = true;
}
}
if ( nd > SAMPLE_VECTOR_LEN - 1 ) nd = SAMPLE_VECTOR_LEN - 1;
// make sure last component is a 0
d [ nd ] = 0;
gbmemcpy ( vec , (char *)d , (nd+1) * 4 );
}
*/
// return the percent similar
// . return the percent similar (0-100) of two sample vectors
// . each vector is an ascending-sorted, 0-terminated array of uint32_t
//   components packed into a buffer of "size" bytes
// . returns 0 if either vector is empty, or is malformed (no 0 terminator
//   within "size" bytes)
// . fix: the malformed-vector guards used "> size" and several spots
//   dereferenced a just-advanced pointer before any bounds check, so a
//   vector missing its terminator could be read one element past the end
//   of its buffer; every dereference is now preceded by a ">= size" check
char Clusterdb::getSampleSimilarity ( char *vec0 , char *vec1, int32_t size ) {
	// . the termIds are sorted
	// . we sorted them as uint32_t's when building the vector, so we
	//   must compare them as uint32_t's here, too
	uint32_t *t0 = (uint32_t *)vec0;
	uint32_t *t1 = (uint32_t *)vec1;
	// if either is empty, return 0 to be on the safe side
	if ( *t0 == 0 ) return 0;
	if ( *t1 == 0 ) return 0;
	// count matches between the sample vectors
	int32_t count = 0;
 loop:
	// . bounds-check BEFORE dereferencing
	// . ">=" (not ">") so a pointer sitting exactly one past the end of
	//   the buffer is caught before it is read
	if ( ((char*)t0 - vec0) >= size ) {
		log( LOG_INFO, "query: sample vector 0 is malformed. "
		     "Returning 0%% similarity." );
		return 0;
	}
	if ( ((char*)t1 - vec1) >= size ) {
		log( LOG_INFO, "query: sample vector 1 is malformed. "
		     "Returning 0%% similarity." );
		return 0;
	}
	// terminate on a 0
	if ( *t0 == 0 ) goto done;
	if ( *t1 == 0 ) goto done;
	// advance past the smaller component; on a match count it and
	// advance both. the next dereference happens after the bounds
	// checks at the top of the loop.
	if      ( *t0 < *t1 ) t0++;
	else if ( *t1 < *t0 ) t1++;
	else {
		count++;
		t0++;
		t1++;
	}
	goto loop;
 done:
	// count total components in each sample vector
	while ( *t0 ) {
		t0++;
		if ( ((char*)t0 - vec0) >= size ) {
			log( LOG_INFO, "query: sample vector 0 is malformed. "
			     "Returning 0%% similarity." );
			return 0;
		}
	}
	while ( *t1 ) {
		t1++;
		if ( ((char*)t1 - vec1) >= size ) {
			log( LOG_INFO, "query: sample vector 1 is malformed. "
			     "Returning 0%% similarity." );
			return 0;
		}
	}
	int32_t total = 0;
	total += t0 - ((uint32_t *)vec0);
	total += t1 - ((uint32_t *)vec1);
	// how similar are they?
	// if both are empty, assume not similar at all. this happens if we
	// do not have a content vector for either, or if both are small docs
	// with no words or links in them (framesets?)
	if ( total == 0 ) return 0;
	// 2*count matched components out of "total" components overall
	int32_t sim = (count * 2 * 100) / total;
	if ( sim > 100 ) sim = 100;
	return (char)sim;
}
/*
// return the percent similar
char Clusterdb::getGigabitSimilarity ( char *vec0 , char *vec1 ,
int32_t *qtable , int32_t numSlots ) {
// . the termIds are sorted
// . point each recs sample vector of termIds
//int32_t *t0 = (int32_t *)(vec0 + sizeof(key_t) + 3*4);
//int32_t *t1 = (int32_t *)(vec1 + sizeof(key_t) + 3*4);
uint32_t *t0 = (uint32_t *)vec0;
uint32_t *t1 = (uint32_t *)vec1;
int16_t *s0 = (int16_t *)(vec0 + 4*GIGABITS_IN_VECTOR);
int16_t *s1 = (int16_t *)(vec1 + 4*GIGABITS_IN_VECTOR);
int32_t i0 = 0;
int32_t i1 = 0;
// if both empty, cluster together... assume same topic
//if ( *t0 == 0 && *t1 == 0 ) return 100;
if ( *t0 == 0 && *t1 == 0 ) return 0;
// if either is empty, return 0 to be on the safe side
if ( *t0 == 0 ) return 0;
if ( *t1 == 0 ) return 0;
if ( numSlots == 0 ) return 0;
//int32_t size0 = *(int32_t *)(rec + sizeof(key_t));
//int32_t *end0 = (int32_t *)(vec0 + *(int32_t *)(vec0+12));
//int32_t *end1 = (int32_t *)(vec1 + *(int32_t *)(vec1+12));
// how many total termIds?
//int32_t total = (end0 - t0 + end1 - t1) / 2;
//if ( total <= 0 ) return 0;
// count matches between the sample vectors
int32_t count = 0;
int32_t n;
uint32_t mask = numSlots - 1;
loop:
// skip if t0[i0] matches a query term
n = t0[i0] & mask;
while ( qtable[n] && qtable[n] != (int32_t)t0[i0] )
if ( ++n >= numSlots ) n = 0;
if ( qtable[n] ) {
s0[i0] = 0; // remove score for tallying up total
i0++; if (t0[i0] == 0 || i0>=GIGABITS_IN_VECTOR) goto done; }
// skip if t1[i1] matches a query term
n = t1[i1] & mask;
while ( qtable[n] && qtable[n] != (int32_t)t1[i1] )
if ( ++n >= numSlots ) n = 0;
if ( qtable[n] ) {
s1[i1] = 0; // remove score for tallying up total
i1++; if (t1[i1] == 0 || i1>=GIGABITS_IN_VECTOR) goto done; }
// terminate on a 0
if ( t0[i0] < t1[i1] ) {
i0++; if (t0[i0] == 0 || i0>=GIGABITS_IN_VECTOR) goto done; }
else if ( t1[i1] < t0[i0] ) {
i1++; if (t1[i1] == 0 || i1>=GIGABITS_IN_VECTOR) goto done; }
else {
// if both are zero... do not inc count
if ( t0[i0] == 0 ) goto done;
//count++;
// now we do a weighted count
count += s0[i0] + s1[i1];
i0++;
i1++;
if ( t0[i0] == 0 || i0>=GIGABITS_IN_VECTOR) goto done;
if ( t1[i1] == 0 || i1>=GIGABITS_IN_VECTOR) goto done;
}
goto loop;
done:
// count total components in each sample vector
while ( t0[i0] && i0 < GIGABITS_IN_VECTOR ) i0++;
while ( t1[i1] && i1 < GIGABITS_IN_VECTOR ) i1++;
int32_t total = 0;
//total += t0 - ((int32_t *)vec0);
//total += t1 - ((int32_t *)vec1);
// get total score
for ( int32_t i = 0 ; i < i0 ; i++ ) total += s0[i] ;
for ( int32_t i = 0 ; i < i1 ; i++ ) total += s1[i] ;
// how similar are they?
// if both are empty, assume not similar at all. this happens if we
// do not have a content vector for either, or if both are small docs
// with no words or links in them (framesets?)
if ( total == 0 ) return 0;
//int32_t sim = (count * 2 * 100) / total;
int32_t sim = (count * 100) / total;
if ( sim > 100 ) sim = 100;
return (char)sim;
}
*/
// . build a 12-byte clusterdb key from its component fields
// . layout: n1 holds the top 9 docid bits; n0 holds the low 29 docid bits
//   (bits 35..63), the family-filter flag (bit 34), 6 language bits
//   (bits 28..33), 26 site-hash bits (bits 2..27), the half-key flag
//   (bit 1) and the positive/delete flag (bit 0, clear = delete)
key_t Clusterdb::makeClusterRecKey ( int64_t docId,
				     bool familyFilter,
				     uint8_t languageBits,
				     int32_t siteHash,
				     bool isDelKey,
				     bool isHalfKey ) {
	key_t key;
	// upper 9 bits of the docid go into n1
	key.n1 = ((uint32_t)(docId >> 29)) & 0x000001ff;
	// build n0 in a local, starting with the low docid bits on top
	uint64_t lo = ((uint64_t)docId) << 35;
	// family filter flag at bit 34
	if ( familyFilter ) lo |=  0x0000000400000000ULL;
	else                lo &= ~0x0000000400000000ULL;
	// 6 language bits at bits 28..33
	lo |= ((uint64_t)(languageBits & 0x3f)) << 28;
	// 26 site hash bits at bits 2..27
	lo |= ((uint64_t)(siteHash & 0x03ffffff)) << 2;
	// bit 0: set for a positive key, clear for a delete key
	if ( isDelKey ) lo &= ~0x0000000000000001ULL;
	else            lo |=  0x0000000000000001ULL;
	// bit 1: the half-key flag
	if ( isHalfKey ) lo |=  0x0000000000000002ULL;
	else             lo &= ~0x0000000000000002ULL;
	key.n0 = lo;
	// return the key
	return key;
}
/*
key_t Clusterdb::convertTitleRecKey ( key_t titleKey ) {
// extract the docid
int64_t docId;
docId = titleKey.n1;
docId <<= 6;
docId |= titleKey.n0 >> 58;
// extract the family filter
bool familyFilter;
if ( ( titleKey.n1 & 0x0100000000000000ULL ) ||
( titleKey.n1 & 0x0200000000000000ULL ) )
familyFilter = true;
else
familyFilter = false;
// extract the site hash
uint32_t siteHash;
siteHash = (uint32_t)((titleKey.n0 >> 30) & 0x0000000003ffffffULL);
// make and return the key
return makeClusterRecKey ( docId, familyFilter, 0, siteHash, false );
}
void Clusterdb::makeRecFromTitleRec ( char *rec,
TitleRec *titleRec,
bool isDelKey ) {
// get the docId
int64_t docId = titleRec->getDocId();
// get the family filter
bool familyFilter = titleRec->hasAdultContent();
// get the language byte
unsigned char lang = titleRec->getLanguage();
// . get the site hash
// . this is really the host hash because tfndb key most use
// the host hash in case site changes in tagdb
uint32_t siteHash = titleRec->getHostHash();
// make the key and copy it to rec
key_t key = makeClusterRecKey ( docId,
familyFilter,
lang,
siteHash,
false );
gbmemcpy(rec, &key, sizeof(key_t));
}
void Clusterdb::makeRecFromTitleRecKey ( char *rec,
char *key,
bool isDelKey ) {
// get the docId
int64_t docId = g_titledb.getDocIdFromKey((key_t*)key);
// get the family filter
bool familyFilter = g_titledb.hasAdultContent(*(key_t*)key);
// . get the site hash
// . this is really the host hash because tfndb key most use
// the host hash in case site changes in tagdb
uint32_t siteHash = g_titledb.getHostHash((key_t*)key);
// make the key and copy it to rec
key_t ckey = makeClusterRecKey ( docId,
familyFilter,
0,
siteHash,
false );
gbmemcpy(rec, &ckey, sizeof(key_t));
}
*/