// Matt Wells, copyright May 2003 // convert the old 1.0 index format to the new 2.0 compressed index format #include "BigFile.h" #include "RdbMap.h" #include "Mem.h" bool mainShutdown ( bool urgent ) { return true; } int main ( int argc , char *argv[] ) { // must have big filename if ( argc != 3 ) { printf("Usage: convert [dir] [N]\n"); printf(" Converts [dir]/indexdb000N.dat from 1.0 " "to 2.0 format.\n"); printf(" Stores new index in [dir]/newindex\n"); exit(-1); } // point to old dir char *olddir = argv[1]; // store the index index in this directory char newdir[256]; sprintf ( newdir , "%s/newindex/", olddir ); // index filename char iname [ 256 ]; sprintf ( iname , "indexdb%04i.dat", atoi(argv[2]) ); // big filename is provided BigFile f1; f1.set ( olddir , iname ); // get the file size long long fsize = f1.getFileSize(); // open it if ( ! f1.open ( O_RDONLY ) ) { printf("convert: error opening %s/%s\n",olddir,iname); exit(-1); } // ...insert old commented out code at bottom here... BigFile f2; f2.set ( newdir , iname ); // open it if ( ! f2.open ( O_RDWR | O_CREAT ) ) { printf("convert: error opening %s/%s\n",newdir,iname); exit(-1); } // RdbMap needs like 14 bytes per page long maxmem = (long) ((fsize / PAGE_SIZE) * 14LL + 5000000LL); g_mem.init ( maxmem ); fprintf(stderr,"convert: converting %s/%s to %s/%s (maxmem=%li)\n", olddir,iname,newdir,iname,maxmem); // always make the hash table if ( ! hashinit() ) { log(" convert: hashinit failed" ); return 1; } // read in 12 byte key, 4 byte size then data of that size long long offset = 0; long long woffset = 0; unsigned char ophi[6]; memset ( ophi , 0 , 6 ); // read into this buf unsigned char buf [ 32000*sizeof(key_t) ]; // output into this one unsigned char obuf [ 32000*sizeof(key_t) ]; // trash, at least as big as a truncated termlist, plus a lil more // for error over truncation unsigned char *trash = (unsigned char *)malloc (220000*sizeof(key_t)); if ( ! trash ) { fprintf(stderr,"malloc failed\n"); exit(-1); } unsigned char *trashEnd = trash + 220000*sizeof(key_t) ; unsigned char *trashTop = trash; unsigned char *trashBot = trash; long trashed = 0; long recycled = 0; long removed = 0; RdbMap map; char mname[256]; sprintf (mname ,"%s/indexdb%04i.map",newdir,atoi(argv[2])); map.set (mname, 0); loop: long toRead = sizeof(key_t) * 32000; // truncate by file size if ( offset + toRead > fsize ) toRead = fsize - offset; // . begin the loop // . read up 6 bytes in case we start on half key if ( ! f1.read ( buf , toRead , offset ) ) { printf("convert: error reading file\n"); exit(-1); } // input buffer ptrs unsigned char *p ; unsigned char *pend = buf + toRead; // quickly check for keys out of order... that is disk corruption again: for ( p = buf + 12 ; p < pend ; p += 12 ) { // skip if order is ok if ( *(key_t *)(p-12) <= *(key_t *)(p) ) continue; // remove with radius of 2 keys unsigned char *a = p - 2*12; unsigned char *b = p + 2*12; if ( a < buf ) a = buf; if ( b > pend ) b = pend; memmove ( a , b , pend - b ); pend -= (b - a); // count it removed += (b-a)/12; //fprintf(stderr,"removed %li bad keys\n",(long)(b-a)/12); goto again; } // reset p p = buf; // output buffer ptr unsigned char *op = obuf; // save hi key if any char savedhi[6]; memcpy ( savedhi , ophi , 6 ); for ( ; p < pend ; p += 12 ) { // shift the lower 5 bytes up 1 bit to cover the old // punish bit (the bit after the 8 score bits) // if his punish bit is clear, fix it and store in trash if ( (p[4] & 0x80) == 0 ) { // if a corrupt key gets thrown in the trash // he might act as a damn and never let anyone out!! // turn on his punish bit --> fixing him p[4] |= 0x80; // store in trash memcpy ( trashTop , p , 12 ); trashTop += 12; // count it trashed++; // wrap around if we need too if ( trashTop >= trashEnd ) trashTop = trash; // might be full if ( trashTop == trashBot ) { //fprintf(stderr,"TRASH FULL!\n"); //fprintf(stderr,"skip n1=%08lx n0=%016llx\n", // *(long *)(p+8),*(long long *)p); trashTop -= 12; if ( trashTop < trash ) trashTop = trashEnd - 12; } continue; } // save p unsigned char *psaved = p; // is key @ p BIGGER than key in trash? if ( trashBot != trashTop && *(key_t *)p >= *(key_t *)trashBot ) { // skip dups -- dedup // i think the ad collection will use this one if ( *(key_t *)p == *(key_t *)trashBot ) continue; // temporarily assign to us p = trashBot; // advance trash trashBot += 12; // count it recycled++; // wrap it if ( trashBot >= trashEnd ) trashBot = trash; } // shift & carry p[4] <<= 1; if ( p[3] & 0x80 ) p[4] |= 0x01; // shift & carry p[3] <<= 1; if ( p[2] & 0x80 ) p[3] |= 0x01; // shift & carry p[2] <<= 1; if ( p[1] & 0x80 ) p[2] |= 0x01; // shift & carry p[1] <<= 1; if ( p[0] & 0x80 ) p[1] |= 0x01; // just shift p[0] <<= 1; // what was his delbit? if ( p[0] & 0x02 ) p[0] |= 0x01; else p[0] &= 0xfe; // . add to new file // . should we only add low 6 bytes? if ( *(long *) ophi == *(long *)(p+6 ) && *(short *)(ophi+4) == *(short *)(p+10) ) { // set half bit p[0] |= 0x02; // store low 6 bytes *(long *) op = *(long *) p; *(short *)(op+4) = *(short *)(p+4); op += 6; } else { // clear half bit p[0] &= 0xfd; // store low 6 bytes *(long *) op = *(long *)p; *(long *)(op+4) = *(long *)(p+4); *(long *)(op+8) = *(long *)(p+8); // if we're the first 12, save hi 6 bytes *(long *) ophi = *(long *)(p+6); *(short *)(ophi+4) = *(short *)(p+10); //ophi = op + 6; op += 12; } // if p referenced trash, go back! p = psaved; } // dump it out if ( ! f2.write ( obuf , op - obuf , woffset ) ) { log("convert: write failed"); return -1; } // add to map key_t startKey ; key_t endKey ; startKey.n0 = 0LL; startKey.n1 = 0; endKey.n0 = 0xffffffffffffffffLL; endKey.n1 = 0xffffffff; RdbList list; list.set ( (char *)obuf , op - obuf , (char *)obuf , op - obuf , startKey , endKey , 0 , false , true ); // HACK: if first key is only 6 bytes, set hi ptr to top 6 if ( (*obuf & 0x02) == 0x02 ) list.setListPtrs ( (char *)obuf , (char *)&savedhi ); map.addList ( &list ); // advance write offset woffset += op - obuf ; // advance read offset offset += toRead; // bail if we need to if ( offset < fsize ) goto loop; map.close(); f2.close(); f1.close(); // print final message fprintf(stderr,"convert: done converting %s/%s\n",olddir,iname); fprintf(stderr,"convert: recycled %li of the %li trashed\n", recycled,trashed); fprintf(stderr,"convert: removed %li bad keys\n",removed); } /* // init our new indexdb Rdb newdb; if ( ! newdb.init ( newdir , "indexdb" , true , // dedup? 0 , // fixed rec size minToMerge , // min files to merge maxTreeMem , maxTreeNodes , true , // balance tree? 0 , // max cache mem 0 , // max cache nodes true )){// use half keys? log("convert: rdb init failed"); return -1; } // read in 12 byte key, 4 byte size then data of that size long long offset = 0; // start and end keys key_t startKey ; key_t endKey ; startKey.n0 = 0LL; startKey.n1 = 0; endKey.n0 = 0xffffffffffffffffLL; endKey.n1 = 0xffffffff; loop: long toRead = sizeof(key_t) * 32000; // truncate by file size if ( offset + toRead > fsize ) toRead = fsize - offset; // read into this buf char buf [ 32000*sizeof(key_t) ]; // . begin the loop // . read up 6 bytes in case we start on half key if ( ! f.read ( buf , toRead , offset ) ) { printf("dump: error reading file\n"); exit(-1); } // use this just for parsing IndexList ilist; char *p = buf; char *pend = buf + toRead; for ( ; p < pend ; p += 12 ) { // it is same format as new format, but no punish bit // after the score... that is now half bit and moved down, too // extract the score from key, it is complemented in old ones char score = 255 - ((unsigned char)p[5]); // extract termid from the key // get the upper 32 bits of the termId long long termId = ilist.getTermId12 ( p ); // remove date codes for now //if ( termId == 0xdadadadaLL || // termId == 0xdadadad2LL ) continue; // extract docid from the old format key unsigned long long docId = *(long long *)p; docId >>= 1; docId &= DOCID_MASK; // was it a delete key? bool isDelKey ; if ( (*p & 0x01) == 0x00 ) isDelKey = true; else isDelKey = false; // make it all into a new indexdb key key_t k = g_indexdb.makeKey ( termId , score , docId , isDelKey ); // add new key to new indexdb // from Msg1.cpp:55 unsigned long groupId = k.n1 & g_conf.m_groupMask; // bitch if it's not us! if ( groupId != g_conf.m_groupId ) { log("convert: termId would escape!"); sleep(50000); } // do a non-blocking dump of tree if it's 90% full now if ( newdb.m_mem.is90PercentFull() || newdb.m_tree.is90PercentFull() ) { if ( ! newdb.dumpTree ( 0 ) ) { // niceness log("convert: dumpTree failed" ); sleep(50000); } } // now add it if ( newdb.addRecord ( k , NULL , 0 ) < 0 ) { log("convert: addRecord failed" ); sleep(50000); } } // advance offset offset += toRead; // bail if we need to if ( offset < fsize ) goto loop; // done // force dump of tree if ( ! newdb.dumpTree ( 0 ) ) { // niceness log("convert: final tree dump failed" ); sleep(50000); } // print final message log("convert: done converting %s/%s",olddir,iname); } */