#include "gb-include.h" #include "Clusterdb.h" #include "Threads.h" #include "Rebalance.h" // a global class extern'd in .h file Clusterdb g_clusterdb; Clusterdb g_clusterdb2; /* // for making the cluster cache key static key_t makeClusterCacheKey ( uint32_t vfd, uint32_t pageNum ) { key_t key; key.n1 = vfd + 1; key.n0 = (uint64_t)pageNum + 1; return key; } // DiskPageCache override functions static void clusterGetPages ( DiskPageCache *pc, int32_t vfd, char *buf, int32_t numBytes, int64_t offset, int32_t *newNumBytes, int64_t *newOffset ) { bool cacheMiss = false; // return new disk offset, assume unchanged *newOffset = offset; *newNumBytes = numBytes; // what is the page range? int64_t sp = offset / GB_PAGE_SIZE ; int64_t ep = (offset + (numBytes-1)) / GB_PAGE_SIZE ; // setup the cache list RdbList cacheList; key_t startKey; startKey.n1 = 0; startKey.n0 = 0; // point to the buffer to fill char *bufPtr = buf; char *bufEnd = buf + numBytes; // read in the pages while ( sp <= ep && bufPtr < bufEnd ) { cacheList.reset(); // get the cache key for the page key_t cacheKey = makeClusterCacheKey ( vfd, sp ); // read in the list from cache collnum_t collnum = 0; g_clusterdb.getRdb()->m_cache.getList ( collnum, (char*)&cacheKey, (char*)&startKey, &cacheList, false, 3600*24*265, true ); //cacheList.checkList_r ( false, true ); //log ( LOG_INFO, "cache: got list [%"INT32", %"INT64"] [%"INT32"]", // vfd, sp, cacheList.m_listSize ); int32_t size = cacheList.m_listSize; if ( size == 0 ) { cacheMiss = true; goto getPagesEnd; } //log ( LOG_INFO, "cache: got list [%"INT32", %"INT32"] [%"INT32"]", // vfd, sp, size ); if ( bufPtr + size >= bufEnd ) size = bufEnd - bufPtr; // copy the list into the buffer gbmemcpy ( bufPtr, cacheList.m_list, size ); // advance to the next page bufPtr += size; *newOffset += size; *newNumBytes -= size; sp++; } getPagesEnd: if ( !cacheMiss ) { pc->m_hits++; // *newNumBytes = -(*newNumBytes); } else pc->m_misses++; } static void clusterAddPages ( DiskPageCache *pc, int32_t vfd, char *buf, int32_t numBytes, int64_t offset ) { // make sure we have a clean vfd if ( vfd < 0 || vfd >= MAX_NUM_VFDS2 ) return; // make sure the file didn't get unlinked if ( ! pc->m_memOff[vfd] ) return; // get the number of twins, used for filtering int32_t numTwins = g_hostdb.getNumHostsPerShard(); int32_t thisTwin = g_hostdb.m_hostId/g_hostdb.m_numShards; // get the bias range for this twin int64_t biasStart = ((0x0000003fffffffffLL)/(int64_t)numTwins) * (int64_t)thisTwin; int64_t biasEnd; if ( thisTwin == numTwins - 1 ) biasEnd = 0x0000003fffffffffLL + 1LL; else biasEnd = ((0x0000003fffffffffLL)/(int64_t)numTwins) * (int64_t)(thisTwin+1); // get the page range int64_t sp = offset / GB_PAGE_SIZE; // point to it char *bufPtr = buf; char *bufEnd = buf + numBytes; // how much did we exceed the boundary by? 
int32_t skip = (int32_t)(offset - sp * GB_PAGE_SIZE); int32_t size = GB_PAGE_SIZE - skip; // setup the cache lists, may need to merge with an old list RdbList cacheList1; cacheList1.set ( NULL, 0, NULL, 0, 0, true, true, g_clusterdb.getRdb()->m_ks ); cacheList1.growList(GB_PAGE_SIZE); // set the buffer data to a list so we can read it nicely key_t startKey; key_t endKey; startKey.n1 = 0; startKey.n0 = 0; endKey.n1 = 0xffffffff; endKey.n0 = 0xffffffffffffffffULL; // setup our source list RdbList dataList; dataList.set ( bufPtr, numBytes, bufPtr, numBytes, (char*)&startKey, (char*)&endKey, 0, false, true, g_clusterdb.getRdb()->m_ks ); dataList.resetListPtr(); // add pages to the cache while ( bufPtr < bufEnd ) { int32_t filled = 0; // ensure "size" is not too big if ( bufPtr + size > bufEnd ) size = bufEnd - bufPtr; // . add the page to the cache cacheList1.reset(); // check the first key, if it's too large, we're all done here key_t key = dataList.getCurrentKey(); int64_t docId = g_clusterdb.getDocId ( key ); //if ( docId >= biasEnd ) { // log ( "clusterdb: DocId after bias end, key.n1=%"XINT32" key.n0=%"XINT64"", key.n1, key.n0 ); // log ( "clusterdb: DocId after bias end, %"XINT64" >= %"XINT64"", docId, biasEnd ); // return; //} // make the cache key using vfd and page number key_t cacheKey = makeClusterCacheKey ( vfd, sp ); // filter the data into a list to be cached while ( filled < size && !dataList.isExhausted() ) { key = dataList.getCurrentKey(); // check the key for filtering //int64_t docId = g_clusterdb.getDocId ( key ); //int32_t twin = hashLong((int32_t)docId) % numTwins; //if ( twin == thisTwin ) { // add the key to the rdb list cacheList1.addRecord(key, 0, NULL); //} // next key filled += dataList.getCurrentRecSize(); dataList.skipCurrentRecord(); } collnum_t collnum = 0; // if the last key is too small, don't add the page docId = g_clusterdb.getDocId ( key ); if ( docId >= biasStart ) g_clusterdb.getRdb()->m_cache.addList ( collnum, cacheKey, &cacheList1 ); //else // log ( "clusterdb: DocId before bias start, %"INT64" >= %"INT64"", docId, biasStart ); //cacheList1.checkList_r ( false, true ); //log ( LOG_INFO, "cache: add list [%"INT32", %"INT64"] [%"INT32"]", // vfd, sp, cacheList1.m_listSize ); // advance bufPtr += filled; sp++; size = GB_PAGE_SIZE; skip = 0; } } static int32_t clusterGetVfd ( DiskPageCache *pc, int64_t maxFileSize ) { // pick a vfd for this file, will be used in the cache key int32_t i; int32_t count = MAX_NUM_VFDS2; for ( i = pc->m_nexti; count-- > 0; i++ ) { if ( i >= MAX_NUM_VFDS2 ) i = 0; if ( ! pc->m_memOff[i] ) break; } // bail if none left if ( count == 0 ) { g_errno = EBADENGINEER; log ( LOG_LOGIC, "db: pagecache: clusterGetVfd: " "no vds remaining." ); return -1; } // start looking here next time pc->m_nexti = i + 1; // set m_memOff[i] to something to hold the vfd pc->m_memOff[i] = (int32_t*)0x7fffffff; // return the vfd return i; } static void clusterRmVfd ( DiskPageCache *pc, int32_t vfd ) { // make sure it's a clean vfd if ( vfd < 0 || vfd >= MAX_NUM_VFDS2 ) return; // clear the vfd for use pc->m_memOff[vfd] = NULL; // need to clear out the cache records using this vfd collnum_t collnum = 0; key_t startKey, endKey; startKey.n1 = vfd + 1; startKey.n0 = 0; endKey.n1 = vfd + 1; endKey.n0 = 0xffffffffffffffffULL; g_clusterdb.getRdb()->m_cache.removeKeyRange ( collnum, (char*)&startKey, (char*)&endKey ); //log ( LOG_INFO, "cache: BIASED CACHE REMOVED VFD!!" ); } */ // reset rdb void Clusterdb::reset() { m_rdb.reset(); } // . 
// . this no longer maintains an rdb of cluster recs
// . Msg22 now just uses the cache to hold cluster recs that it computes
//   from titleRecs
// . clusterRecs are now just TitleRec keys...
// . we can load one just the same from titledb as we could from clusterdb,
//   and we still don't need to uncompress the titleRec to get the info
bool Clusterdb::init ( ) {
	// this should be about 200/4 = 50 megs per host on my current setup
	int32_t maxTreeMem = g_conf.m_clusterdbMaxTreeMem;
	// . what's max # of tree nodes?
	// . key + left + right + parent + dataPtr = 12 + 4+4+4+4 = 28
	// . 28 bytes per record when in the tree
	int32_t maxTreeNodes = maxTreeMem / ( 16 + CLUSTER_REC_SIZE );
	// . each cached list is just one key in the tree...
	// . 28(tree space) + 24(cacheoverhead) = 52
	//int32_t maxCacheMem = g_conf.m_clusterdbMaxCacheMem ;
	// do not use any page cache if doing tmp cluster in order to
	// prevent swapping
	//int32_t pcmem = g_conf.m_clusterdbMaxDiskPageCacheMem;
	int32_t pcmem = 0;
	if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
	// we need that 100MB for termlists! they are >90MB now!!
	pcmem = 10000000; // 10MB
	// temp hack for rebuild
	//pcmem = 0;
	// RdbCache has a 4 byte ptr to each rec in the cache
	//int32_t maxCacheNodes = maxCacheMem / ( 4 + CLUSTER_REC_SIZE );
	//int32_t nodeSize = sizeof(key_t) + sizeof(collnum_t);
	//int32_t pageSize = GB_TFNDB_PAGE_SIZE;
	//int32_t nodeSize = (pageSize + 12) + sizeof(collnum_t) + 20;
	//int32_t maxCacheNodes = maxCacheMem / nodeSize ;
	// init the page cache
	// if ( ! m_pc.init ( "clusterdb",
	//		      RDB_CLUSTERDB,
	//		      pcmem ,
	//		      pageSize ) )
	//	//g_conf.m_clusterdbMaxDiskPageCacheMem,
	//	//clusterGetPages,
	//	//clusterAddPages,
	//	//clusterGetVfd,
	//	//clusterRmVfd ))
	//	return log("db: Clusterdb init failed.");
	//bool bias = true;
	//if ( g_conf.m_fullSplit ) bias = false;
	bool bias = false;
	// initialize our own internal rdb
	return m_rdb.init ( g_hostdb.m_dir ,
			    "clusterdb" ,
			    true , // dedup
			    //CLUSTER_REC_SIZE - sizeof(key_t),//fixedDataSize
			    0 , // no data now! just docid/s/c
			    2 , // g_conf.m_clusterdbMinFilesToMerge,
			    g_conf.m_clusterdbMaxTreeMem,
			    maxTreeNodes , // maxTreeNodes ,
			    true , //false , // balance tree?
			    0 , // maxCacheMem ,
			    0 , // maxCacheNodes ,
			    true , // half keys?
			    g_conf.m_clusterdbSaveCache,
			    NULL , // &m_pc ,
			    false , // is titledb
			    true , // preload disk page cache
			    12 , // key size
			    bias ); // bias disk page cache?
}

// init the rebuild/secondary rdb, used by PageRepair.cpp
bool Clusterdb::init2 ( int32_t treeMem ) {
	// . what's max # of tree nodes?
	// . key + left + right + parent + dataPtr = 12 + 4+4+4+4 = 28
	// . 28 bytes per record when in the tree
	int32_t maxTreeNodes = treeMem / ( 16 + CLUSTER_REC_SIZE );
	// initialize our own internal rdb
	return m_rdb.init ( g_hostdb.m_dir ,
			    "clusterdbRebuild" ,
			    true , // dedup
			    0 , // no data now! just docid/s/c
			    50 , // m_clusterdbMinFilesToMerge,
			    treeMem , // g_conf.m_clusterdbMaxTreeMem,
			    maxTreeNodes ,
			    true , // balance tree?
			    0 , // maxCacheMem ,
			    0 , // maxCacheNodes ,
			    true , // half keys?
			    false , // g_conf.m_clusterdbSaveCache,
			    NULL , // &m_pc ,
			    false , // is titledb
			    false , // preload disk page cache
			    12 , // key size
			    true ); // bias disk page cache
}
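// A minimal worked example of the tree sizing arithmetic used in init() and
// init2() above, kept commented out since it is only illustrative. It
// assumes CLUSTER_REC_SIZE is the 12-byte key size and that each tree node
// carries 16 bytes of overhead (left + right + parent + dataPtr). The
// helper name is an illustrative assumption, not engine code.
/*
static int32_t computeMaxTreeNodes ( int32_t maxTreeMem ) {
	// per-node cost: 12-byte key + 16 bytes of node overhead = 28 bytes
	int32_t perNode = 16 + CLUSTER_REC_SIZE;
	// e.g. 50MB of tree memory / 28 bytes ~= 1.8 million cluster recs
	return maxTreeMem / perNode;
}
*/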
/*
bool Clusterdb::addColl ( char *coll, bool doVerify ) {
	if ( ! m_rdb.addColl ( coll ) ) return false;
	if ( ! doVerify ) return true;
	// verify
	if ( verify(coll) ) return true;
	// if not allowing scale, return false
	if ( ! g_conf.m_allowScale ) return false;
	// otherwise let it go
	log ( "db: Verify failed, but scaling is allowed, passing." );
	return true;
}
*/

bool Clusterdb::verify ( char *coll ) {
	log ( LOG_DEBUG, "db: Verifying Clusterdb for coll %s...", coll );
	g_threads.disableThreads();

	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	key_t startKey;
	key_t endKey;
	startKey.setMin();
	endKey.setMax();
	//int32_t minRecSizes = 64000;
	CollectionRec *cr = g_collectiondb.getRec(coll);

	if ( ! msg5.getList ( RDB_CLUSTERDB ,
			      cr->m_collnum ,
			      &list ,
			      startKey ,
			      endKey ,
			      64000 , // minRecSizes ,
			      true , // includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      0 , // startFileNum ,
			      -1 , // numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false , // err correction?
			      NULL ,
			      0 ,
			      -1 ,
			      true ,
			      -1LL ,
			      &msg5b ,
			      true )) {
		g_threads.enableThreads();
		return log("db: HEY! it did not block");
	}

	int32_t count = 0;
	int32_t got   = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		// skip negative keys
		if ( (k.n0 & 0x01) == 0x00 ) continue;
		count++;
		//uint32_t groupId = getGroupId ( RDB_CLUSTERDB , &k );
		//if ( groupId == g_hostdb.m_groupId ) got++;
		uint32_t shardNum = getShardNum( RDB_CLUSTERDB , &k );
		if ( shardNum == getMyShardNum() ) got++;
	}
	if ( got != count ) {
		// tally it up
		g_rebalance.m_numForeignRecs += count - got;
		log ("db: Out of first %"INT32" records in clusterdb, "
		     "only %"INT32" belong to our group.",count,got);
		// exit if NONE, we probably got the wrong data
		if ( got == 0 ) log("db: Are you sure you have the right "
				    "data in the right directory? "
				    "Exiting.");
		log ( "db: Exiting due to Clusterdb inconsistency." );
		g_threads.enableThreads();
		return g_conf.m_bypassValidation;
	}
	log ( LOG_DEBUG, "db: Clusterdb passed verification successfully "
	      "for %"INT32" recs.", count );
	// DONE
	g_threads.enableThreads();
	return true;
}

#include "IndexList.h"

// . this routine is very slow...
// . it is used to get a titleRec's (document's) sample vector at query time,
//   but we should really compute this vector at build time and store it in
//   the titleRec itself, to avoid having to compute it at query time.
// . vector must have at least VECTOR_SIZE bytes available
/*
void Clusterdb::getSampleVector ( char *vec ,
				  class Doc *doc,
				  char *coll ,
				  int32_t collLen ,
				  int32_t niceness) {
	int64_t startTime = gettimeofdayInMilliseconds();
	TitleRec *tr = doc->getTitleRec();
	SiteRec  *sr = doc->getSiteRec();
	//sr->set ( tr->getSite() , tr->getColl() , tr->getCollLen() ,
	sr->set ( tr->getSite() , coll , collLen ,
		  tr->getSiteFilenum() , SITEREC_CURRENT_VERSION );
	// hashes the whole doc, but more importantly for us, computes
	// XmlDoc::m_vector
	//doc->set ( niceness );
	XmlDoc *xd = doc->getXmlDoc();
	xd->set ( tr , sr , NULL, niceness);
	// this just sets the vector
	doc->getIndexList(NULL,true,true,false,NULL,NULL,NULL, niceness);
	// log the time
	int64_t took = gettimeofdayInMilliseconds() - startTime;
	if ( took > 3 )
		log(LOG_INFO,"query: Took %"INT64" ms to make indexlist.",
		    took);
	// so get it
	char *p = doc->getSampleVector ( );
	// and store it. int16_t vectors are padded with 0's.
	gbmemcpy ( vec , p , SAMPLE_VECTOR_SIZE );
}
*/

// if VECTOR_SIZE is 128 bytes then that is 32 termIds (4 bytes each) that we
// use to make this vector. these 32 termids are the lowest 32 termids out of
// all the termids for the document. we can further hash pairs to reduce the
// vector size from 128 to 64 bytes, but we must hash the pairs strategically.
// What are the odds of two things being 90% similar when they are not?
#define SAMPLE_VECTOR_LEN (SAMPLE_VECTOR_SIZE / 4)
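// The pair-hashing mentioned above is not implemented in this file. Below is
// a minimal commented-out sketch of one way it could work: fold each pair of
// adjacent 4-byte termId components into a single component, halving the
// vector from 128 to 64 bytes. The helper name and the mixing constant are
// illustrative assumptions, not engine code, and the folded vector would
// need to be re-sorted before being compared with getSampleSimilarity().
/*
static void foldSampleVectorPairs ( uint32_t *dst , uint32_t *src ) {
	// src holds up to SAMPLE_VECTOR_LEN components, 0-terminated
	int32_t j = 0;
	for ( int32_t i = 0 ; i + 1 < SAMPLE_VECTOR_LEN ; i += 2 ) {
		// stop at the 0 terminator
		if ( src[i] == 0 ) break;
		// combine the pair with a simple multiplicative mix; an odd
		// trailing component just gets folded with the 0 terminator
		uint32_t h = ( src[i] * 2654435761U ) ^ src[i+1];
		// reserve 0 as the terminator value
		if ( h == 0 ) h = 1;
		dst[j++] = h;
	}
	// terminate the folded vector
	dst[j] = 0;
}
*/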
// . it would be nice to use the new addition to the Words class that allows
//   a word to be a tag. this kinda replaces the xml class.
// . returns false and sets g_errno on error
/*
bool Clusterdb::getGigabitVector ( char *vec , Xml *xml ) {
	// . get filtered text, no link text since that is usually for menus
	// . get first 64k
	char buf[64*1024];
	xml->getText ( buf , 64*1024 );
	// hash into this table
	TermTable table;
	Query q;
	TopicGroup t;
	t.m_numTopics           = 32;
	t.m_maxTopics           = 32;
	t.m_docsToScanForTopics = 1;
	t.m_minTopicScore       = 0;
	t.m_maxWordsPerTopic    = 4;
	t.m_meta[0]             = '\0';
	t.m_delimeter           = 0;
	t.m_useIdfForTopics     = true;
	t.m_dedup               = false;
	t.m_minDocCount         = 1;
	t.m_ipRestrict          = false;
	t.m_dedupSamplePercent  = 0;
	t.m_topicRemoveOverlaps = true;
	t.m_topicSampleSize     = 64*1024;
	t.m_topicMaxPunctLen    = 3;
	State23 st;
	st.m_numRequests = 1;
	st->m_msg20[0].m_bufSampleBuf    = buf;
	st->m_msg20[0].m_bufSampleBufLen = bufLen;
	st->m_returnDocIdCount = false;
	st->m_returnDocIds     = false;
	st->m_returnPops       = false;
	Msg24 msg24;
	if ( ! msg24.getTopics ( &st , // State24
				 &t ,
				 &table ,
				 &q ,
				 0 , // gid
				 &buf ,
				 &bufLen ) )
		return false;
	// now hash the winning topics into our vector
}
*/

/*
void Clusterdb::getSampleVector ( char *vec , TermTable *table ) {
	// no compression is used in this list so each docId/termId is 12 bytes
	int32_t numTerms = table->getNumTermsUsed();
	// . how many can we hold? we'll just use 4 bytes per vector component
	// . let's get 2x as many termids as required, then we will combine
	//   every 2 termids into one via hashing... this makes falsely high
	//   similarities less likely, but makes truly high similarities less
	//   likely to be detected as well.
	int32_t maxTerms = (1 * SAMPLE_VECTOR_LEN) - 1;
	// what portion of them do we want to mask out from the rest?
	int32_t ratio = numTerms / maxTerms ;
	unsigned char mask = 0x00;
	while ( ratio >= 2 ) {
		// shift the mask down, ensure hi bit is set
		mask >>= 1;
		mask  |= 0x80;
		ratio >>= 1; // /2
	}
	uint32_t d [ 3000 ];
	// if we don't have enough, make them 0's
	memset ( d   , 0 , SAMPLE_VECTOR_SIZE );
	memset ( vec , 0 , SAMPLE_VECTOR_SIZE );
 again:
	// a buffer to hold the top termIds
	int32_t nd = 0;
	// . buffer should have at least "maxTerms" in it
	// . these should all be 12 byte keys
	int32_t i = 0 ;
	int32_t n = table->getNumTerms();
	int64_t  *termIds = table->getTermIds();
	uint32_t *scores  = table->getScores ();
	for ( ; i < n ; i++ ) {
		// skip if empty bucket
		if ( ! scores[i] ) continue;
		// skip if negative key, since we can be deleting old keys
		// from call from Msg14.cpp
		// NO! this should be the indexlist directly from Msg16, not
		// after subtracting the one from Msg15
		//if ( (*p & 0x01) == 0x00 ) continue;
		// skip if it's not to be considered
		//fprintf(stderr,"%hhu\n",p[11]);
		//if ( (p[11] & mask) != 0 ) continue;
		if ( ((termIds[i]>>(NUMTERMIDBITS-8)) & mask) != 0 ) continue;
		// add it
		//d[nd++] = *(int32_t *)(p+12-5); // last byte has del bit, etc.
		d[nd] = (uint32_t)(termIds[i] >> (NUMTERMIDBITS-32));
		// 0 has special meaning, it terminates the vector
		if ( d[nd] == 0 ) d[nd] = 1;
		if ( ++nd < 3000 ) continue;
		// bitch and break out on error
		log(LOG_INFO,"build: Sample vector overflow. Slight "
		    "performance hit.");
		break;
	}
	// if nd was too small, don't use a mask to save time
	if ( nd < maxTerms && nd < numTerms && mask ) {
		// sanity check
		if ( mask == 0 ) {
			log (LOG_LOGIC,"build: Clusterdb sample vector mask "
			     "is already at 0.");
			char *xx = NULL; *xx = 0;
		}
		// debug msg
		//log("AGAIN");
		//val >>= 1;
		// shift the mask UP, allow more termIds to pass through
		mask <<= 1;
		goto again;
	}
	// bubble sort them
	bool flag = true;
	while ( flag ) {
		flag = false;
		for ( int32_t i = 1 ; i < nd ; i++ ) {
			if ( d[i-1] <= d[i] ) continue;
			uint32_t tmp = d[i-1];
			d[i-1] = d[i];
			d[i]   = tmp;
			flag   = true;
		}
	}
	if ( nd > SAMPLE_VECTOR_LEN - 1 ) nd = SAMPLE_VECTOR_LEN - 1;
	// make sure last component is a 0
	d [ nd ] = 0;
	gbmemcpy ( vec , (char *)d , (nd+1) * 4 );
}
*/

// return the percent similar
char Clusterdb::getSampleSimilarity ( char *vec0 , char *vec1, int32_t size ) {
	// . the termIds are sorted
	// . point to each rec's sample vector of termIds
	//int32_t *t0 = (int32_t *)(vec0 + sizeof(key_t) + 3*4);
	//int32_t *t1 = (int32_t *)(vec1 + sizeof(key_t) + 3*4);
	// . we sorted them above as uint32_ts, so we must make sure
	//   we use uint32_ts here, too
	uint32_t *t0 = (uint32_t *)vec0;
	uint32_t *t1 = (uint32_t *)vec1;
	// if either is empty, return 0 to be on the safe side
	if ( *t0 == 0 ) return 0;
	if ( *t1 == 0 ) return 0;
	//int32_t size0 = *(int32_t *)(rec + sizeof(key_t));
	//int32_t *end0 = (int32_t *)(vec0 + *(int32_t *)(vec0+12));
	//int32_t *end1 = (int32_t *)(vec1 + *(int32_t *)(vec1+12));
	// how many total termIds?
	//int32_t total = (end0 - t0 + end1 - t1) / 2;
	//if ( total <= 0 ) return 0;
	// count matches between the sample vectors
	int32_t count = 0;
 loop:
	if ( ((char*)t0 - vec0) > size ) {
		log( LOG_INFO, "query: sample vector 0 is malformed. "
		     "Returning 0%% similarity." );
		return 0;
	}
	if ( ((char*)t1 - vec1) > size ) {
		log( LOG_INFO, "query: sample vector 1 is malformed. "
		     "Returning 0%% similarity." );
		return 0;
	}
	// terminate on a 0
	if ( *t0 < *t1 ) {
		if ( *++t0 == 0 ) goto done;
	}
	else if ( *t1 < *t0 ) {
		if ( *++t1 == 0 ) goto done;
	}
	else {
		// if both are zero... do not inc count
		if ( *t0 == 0 ) goto done;
		count++;
		t0++;
		t1++;
		if ( *t0 == 0 ) goto done;
		if ( *t1 == 0 ) goto done;
	}
	goto loop;
 done:
	// count total components in each sample vector
	while ( *t0 ) {
		t0++;
		if ( ((char*)t0 - vec0) > size ) {
			log( LOG_INFO, "query: sample vector 0 is malformed. "
			     "Returning 0%% similarity." );
			return 0;
		}
	}
	while ( *t1 ) {
		t1++;
		if ( ((char*)t1 - vec1) > size ) {
			log( LOG_INFO, "query: sample vector 1 is malformed. "
			     "Returning 0%% similarity." );
			return 0;
		}
	}
	int32_t total = 0;
	total += t0 - ((uint32_t *)vec0);
	total += t1 - ((uint32_t *)vec1);
	// how similar are they?
	// if both are empty, assume not similar at all. this happens if we
	// do not have a content vector for either, or if both are small docs
	// with no words or links in them (framesets?)
	if ( total == 0 ) return 0;
	int32_t sim = (count * 2 * 100) / total;
	if ( sim > 100 ) sim = 100;
	return (char)sim;
}
/*
// return the percent similar
char Clusterdb::getGigabitSimilarity ( char *vec0 , char *vec1 ,
				       int32_t *qtable , int32_t numSlots ) {
	// . the termIds are sorted
	// . point to each rec's sample vector of termIds
	//int32_t *t0 = (int32_t *)(vec0 + sizeof(key_t) + 3*4);
	//int32_t *t1 = (int32_t *)(vec1 + sizeof(key_t) + 3*4);
	uint32_t *t0 = (uint32_t *)vec0;
	uint32_t *t1 = (uint32_t *)vec1;
	int16_t  *s0 = (int16_t *)(vec0 + 4*GIGABITS_IN_VECTOR);
	int16_t  *s1 = (int16_t *)(vec1 + 4*GIGABITS_IN_VECTOR);
	int32_t   i0 = 0;
	int32_t   i1 = 0;
	// if both empty, cluster together... assume same topic
	//if ( *t0 == 0 && *t1 == 0 ) return 100;
	if ( *t0 == 0 && *t1 == 0 ) return 0;
	// if either is empty, return 0 to be on the safe side
	if ( *t0 == 0 ) return 0;
	if ( *t1 == 0 ) return 0;
	if ( numSlots == 0 ) return 0;
	//int32_t size0 = *(int32_t *)(rec + sizeof(key_t));
	//int32_t *end0 = (int32_t *)(vec0 + *(int32_t *)(vec0+12));
	//int32_t *end1 = (int32_t *)(vec1 + *(int32_t *)(vec1+12));
	// how many total termIds?
	//int32_t total = (end0 - t0 + end1 - t1) / 2;
	//if ( total <= 0 ) return 0;
	// count matches between the sample vectors
	int32_t count = 0;
	int32_t n;
	uint32_t mask = numSlots - 1;
 loop:
	// skip if t0[i0] matches a query term
	n = t0[i0] & mask;
	while ( qtable[n] && qtable[n] != (int32_t)t0[i0] )
		if ( ++n >= numSlots ) n = 0;
	if ( qtable[n] ) {
		s0[i0] = 0; // remove score for tallying up total
		i0++;
		if ( t0[i0] == 0 || i0 >= GIGABITS_IN_VECTOR ) goto done;
	}
	// skip if t1[i1] matches a query term
	n = t1[i1] & mask;
	while ( qtable[n] && qtable[n] != (int32_t)t1[i1] )
		if ( ++n >= numSlots ) n = 0;
	if ( qtable[n] ) {
		s1[i1] = 0; // remove score for tallying up total
		i1++;
		if ( t1[i1] == 0 || i1 >= GIGABITS_IN_VECTOR ) goto done;
	}
	// terminate on a 0
	if ( t0[i0] < t1[i1] ) {
		i0++;
		if ( t0[i0] == 0 || i0 >= GIGABITS_IN_VECTOR ) goto done;
	}
	else if ( t1[i1] < t0[i0] ) {
		i1++;
		if ( t1[i1] == 0 || i1 >= GIGABITS_IN_VECTOR ) goto done;
	}
	else {
		// if both are zero... do not inc count
		if ( t0[i0] == 0 ) goto done;
		//count++;
		// now we do a weighted count
		count += s0[i0] + s1[i1];
		i0++;
		i1++;
		if ( t0[i0] == 0 || i0 >= GIGABITS_IN_VECTOR ) goto done;
		if ( t1[i1] == 0 || i1 >= GIGABITS_IN_VECTOR ) goto done;
	}
	goto loop;
 done:
	// count total components in each sample vector
	while ( t0[i0] && i0 < GIGABITS_IN_VECTOR ) i0++;
	while ( t1[i1] && i1 < GIGABITS_IN_VECTOR ) i1++;
	int32_t total = 0;
	//total += t0 - ((int32_t *)vec0);
	//total += t1 - ((int32_t *)vec1);
	// get total score
	for ( int32_t i = 0 ; i < i0 ; i++ ) total += s0[i] ;
	for ( int32_t i = 0 ; i < i1 ; i++ ) total += s1[i] ;
	// how similar are they?
	// if both are empty, assume not similar at all. this happens if we
	// do not have a content vector for either, or if both are small docs
	// with no words or links in them (framesets?)
	if ( total == 0 ) return 0;
	//int32_t sim = (count * 2 * 100) / total;
	int32_t sim = (count * 100) / total;
	if ( sim > 100 ) sim = 100;
	return (char)sim;
}
*/

key_t Clusterdb::makeClusterRecKey ( int64_t docId,
				     bool familyFilter,
				     uint8_t languageBits,
				     int32_t siteHash,
				     bool isDelKey,
				     bool isHalfKey ) {
	key_t key;
	// set the docId upper bits
	key.n1  = (uint32_t)(docId >> 29);
	key.n1 &= 0x000001ff;
	// set the docId lower bits
	key.n0   = docId;
	key.n0 <<= 35;
	// set the family filter bit
	if ( familyFilter ) key.n0 |= 0x0000000400000000ULL;
	else                key.n0 &= 0xfffffffbffffffffULL;
	// set the language bits
	key.n0 |= ((uint64_t)(languageBits & 0x3f)) << 28;
	// set the site hash
	key.n0 |= (uint64_t)(siteHash & 0x03ffffff) << 2;
	// set the del bit
	if ( isDelKey ) key.n0 &= 0xfffffffffffffffeULL;
	else            key.n0 |= 0x0000000000000001ULL;
	// set half bit
	if ( !isHalfKey ) key.n0 &= 0xfffffffffffffffdULL;
	else              key.n0 |= 0x0000000000000002ULL;
	// return the key
	return key;
}

/*
key_t Clusterdb::convertTitleRecKey ( key_t titleKey ) {
	// extract the docid
	int64_t docId;
	docId   = titleKey.n1;
	docId <<= 6;
	docId  |= titleKey.n0 >> 58;
	// extract the family filter
	bool familyFilter;
	if ( ( titleKey.n1 & 0x0100000000000000ULL ) ||
	     ( titleKey.n1 & 0x0200000000000000ULL ) )
		familyFilter = true;
	else
		familyFilter = false;
	// extract the site hash
	uint32_t siteHash;
	siteHash = (uint32_t)((titleKey.n0 >> 30) & 0x0000000003ffffffULL);
	// make and return the key
	return makeClusterRecKey ( docId, familyFilter, 0, siteHash, false );
}

void Clusterdb::makeRecFromTitleRec ( char *rec,
				      TitleRec *titleRec,
				      bool isDelKey ) {
	// get the docId
	int64_t docId = titleRec->getDocId();
	// get the family filter
	bool familyFilter = titleRec->hasAdultContent();
	// get the language byte
	unsigned char lang = titleRec->getLanguage();
	// . get the site hash
	// . this is really the host hash because the tfndb key must use
	//   the host hash in case the site changes in tagdb
	uint32_t siteHash = titleRec->getHostHash();
	// make the key and copy it to rec
	key_t key = makeClusterRecKey ( docId, familyFilter, lang,
					siteHash, false );
	gbmemcpy(rec, &key, sizeof(key_t));
}

void Clusterdb::makeRecFromTitleRecKey ( char *rec,
					 char *key,
					 bool isDelKey ) {
	// get the docId
	int64_t docId = g_titledb.getDocIdFromKey((key_t*)key);
	// get the family filter
	bool familyFilter = g_titledb.hasAdultContent(*(key_t*)key);
	// . get the site hash
	// . this is really the host hash because the tfndb key must use
	//   the host hash in case the site changes in tagdb
	uint32_t siteHash = g_titledb.getHostHash((key_t*)key);
	// make the key and copy it to rec
	key_t ckey = makeClusterRecKey ( docId, familyFilter, 0,
					 siteHash, false );
	gbmemcpy(rec, &ckey, sizeof(key_t));
}
*/
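// A minimal commented-out sketch of how the fields packed by
// makeClusterRecKey() above can be read back out of a cluster rec key. The
// engine's own accessors (e.g. getDocId(), used in the commented-out page
// cache code near the top of this file) are authoritative; the helper below
// only mirrors the shifts and masks used in the packing code, and its name
// is an illustrative assumption.
/*
static void decodeClusterRecKey ( key_t key ,
				  int64_t *docId ,
				  bool *familyFilter ,
				  uint8_t *languageBits ,
				  int32_t *siteHash ) {
	// docId: 9 bits from n1 on top of the 29 bits stored in n0 >> 35
	*docId        = ((int64_t)(key.n1 & 0x000001ff) << 29) |
			(int64_t)(key.n0 >> 35);
	// family filter bit (bit 34 of n0)
	*familyFilter = ( key.n0 & 0x0000000400000000ULL ) != 0;
	// 6 language bits (bits 28-33 of n0)
	*languageBits = (uint8_t)((key.n0 >> 28) & 0x3f);
	// 26-bit site (host) hash (bits 2-27 of n0)
	*siteHash     = (int32_t)((key.n0 >> 2) & 0x03ffffff);
}
*/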