Fix a couple of catdb generation bugs.

A MAX_CATIDS violation was causing corruption.
The catdb tree was not being saved to catdb-saved.dat,
causing missing catdb recs.
This commit is contained in:
mwells 2013-10-12 20:33:04 -07:00
parent 1d133e87c9
commit 3374ce450a
10 changed files with 43 additions and 15 deletions

View File

@ -198,6 +198,7 @@ bool CatRec::set ( Url *url , char *data , long dataSize , bool gotByIp ) {
log ( "tagdb: Deserialized datasize %i != %li for url %s so "
"ignoring tagdb record.",
p - m_data, m_dataSize , url->getUrl() );
return false;
char *xx = NULL; *xx = 0;
}

View File

@ -62,7 +62,7 @@ bool Catdb::init ( ) {
-1 , // fixed record size
//g_hostdb.m_groupMask ,
//g_hostdb.m_groupId ,
g_conf.m_catdbMinFilesToMerge ,
2,//g_conf.m_catdbMinFilesToMerge ,
treeMem ,//g_conf.m_catdbMaxTreeMem ,
maxTreeNodes ,
// now we balance so Sync.cpp can ordered huge list
@ -74,7 +74,7 @@ bool Catdb::init ( ) {
&m_pc ,
false,
false,
12,
12, // keysize
false,
true )) // is collectionless?
return false;

View File

@ -26,7 +26,7 @@
#define MAX_TAG_LEN 127
#define MAX_URL_CATIDS 64
#define MAX_URLTXT_SIZE 500000
#define MAX_CATIDS 64
#define MAX_CATIDS 96
#define MAX_CATNAME_LEN 1024
#define HASHTABLE_SIZE (1024*1024)

2
Conf.h
View File

@ -164,7 +164,7 @@ class Conf {
long m_catdbMaxTreeMem;
long m_catdbMaxDiskPageCacheMem;
long m_catdbMaxCacheMem;
long m_catdbMinFilesToMerge;
//long m_catdbMinFilesToMerge;
long m_revdbMaxTreeMem;
long m_timedbMaxTreeMem;

View File

@ -100,6 +100,10 @@ bool Msg9b::addCatRecs ( char *urls ,
site.set ( p , e - p , false ); // addwww?
// normalize the url
g_catdb.normalizeUrl(&site, &site);
// sanity
if ( numCatids[k] > MAX_CATIDS ) { char *xx=NULL;*xx=0; }
// make a siteRec from this url
CatRec sr;
// returns false and sets g_errno on error
@ -110,6 +114,10 @@ bool Msg9b::addCatRecs ( char *urls ,
char *data = sr.getData ();
long dataSize = sr.getDataSize ();
key_t key;
// sanity test
CatRec cr2;
if ( ! cr2.set ( NULL , sr.getData(), sr.getDataSize(),false)){
char *xx=NULL;*xx=0; }
// debug when generating catdb
//char *x = p;
//for ( ; x<e ; x++ ) {
@ -133,6 +141,19 @@ bool Msg9b::addCatRecs ( char *urls ,
else if ( ! m_list.addRecord ( key, dataSize, data ) )
return true;
/*
// debug point
SafeBuf sb;
//sb.safeMemcpy(p , e-p );
sb.safeStrcpy(sr.m_url);
sb.safePrintf(" ");
for ( long i = 0 ; i < numCatids[k] ; i++ )
sb.safePrintf ( "%li " , catids[c+i] );
log("catdb: adding key=%s url=%s",
KEYSTR(&key,12),
sb.getBufStart());
*/
// debug
//log("gencat: adding url=%s",sr.m_url);

View File

@ -1086,7 +1086,6 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
//g_tfndb.getRdb(),
g_tagdb.getRdb(),
g_clusterdb.getRdb(),
//g_catdb.getRdb(),
g_linkdb.getRdb(),
g_cachedb.getRdb(),
g_serpdb.getRdb(),

View File

@ -4105,6 +4105,7 @@ void Parms::init ( ) {
m->m_type = TYPE_LONG;
m++;
/*
m->m_title = "catdb min files to merge";
m->m_desc = "";
m->m_off = (char *)&g_conf.m_catdbMinFilesToMerge - g;
@ -4113,7 +4114,6 @@ void Parms::init ( ) {
m->m_save = 0;
m++;
/*
m->m_title = "revdb max tree mem";
m->m_desc = "Revdb holds the meta list we added for this doc.";
m->m_off = (char *)&g_conf.m_revdbMaxTreeMem - g;

View File

@ -6,7 +6,7 @@
#include "Clusterdb.h"
#include "Hostdb.h"
#include "Tagdb.h"
//#include "Catdb.h"
#include "Catdb.h"
#include "Posdb.h"
#include "Cachedb.h"
#include "Monitordb.h"
@ -411,7 +411,7 @@ bool Process::init ( ) {
m_rdbs[m_numRdbs++] = g_spiderdb.getRdb ();
m_rdbs[m_numRdbs++] = g_clusterdb.getRdb ();
m_rdbs[m_numRdbs++] = g_tagdb.getRdb ();
//m_rdbs[m_numRdbs++] = g_catdb.getRdb ();
m_rdbs[m_numRdbs++] = g_catdb.getRdb ();
m_rdbs[m_numRdbs++] = g_statsdb.getRdb ();
m_rdbs[m_numRdbs++] = g_linkdb.getRdb ();
m_rdbs[m_numRdbs++] = g_cachedb.getRdb ();
@ -1657,7 +1657,7 @@ void Process::resetAll ( ) {
rdb->reset();
}
//g_catdb .reset();
g_catdb .reset();
g_collectiondb .reset();
g_categories1 .reset();
g_categories2 .reset();
@ -1758,7 +1758,7 @@ void Process::resetPageCaches ( ) {
//g_tfndb .getDiskPageCache()->reset();
//g_checksumdb .getDiskPageCache()->reset();
g_clusterdb .getDiskPageCache()->reset();
//g_catdb .getDiskPageCache()->reset();
g_catdb .getDiskPageCache()->reset();
//g_placedb .getDiskPageCache()->reset();
g_doledb .getDiskPageCache()->reset();
//g_statsdb .getDiskPageCache()->reset();

13
Rdb.cpp
View File

@ -261,8 +261,8 @@ bool Rdb::init ( char *dir ,
if ( ! loadTree ( ) ) return false;
// add the single dummy collection for catdb
//if ( g_catdb.getRdb() == this ) //||
// return g_catdb.addColl ( NULL );
if ( g_catdb.getRdb() == this )
return g_catdb.addColl ( NULL );
if ( g_statsdb.getRdb() == this )
return g_statsdb.addColl ( NULL );
if ( g_cachedb.getRdb() == this )
@ -275,8 +275,6 @@ bool Rdb::init ( char *dir ,
// return g_facebookdb.addColl ( NULL );
if ( g_syncdb.getRdb() == this )
return g_syncdb.addColl ( NULL );
if ( g_catdb.getRdb() == this )
return g_catdb.addColl ( NULL );
// set this for use below
//*(long long *)m_gbcounteventsTermId =
@ -2001,6 +1999,13 @@ bool Rdb::addRecord ( collnum_t collnum,
}
*/
// debug testing
//if ( m_rdbId == RDB_CATDB ) {
// // show key
// log("rdb: adding key=%s to tree n=%li",KEYSTR(key,12) ,n);
//}
//jumpdown:
// if it exists then annihilate it

View File

@ -11060,7 +11060,9 @@ void dumpTagdb (char *coll,long startFileNum,long numFiles,bool includeTree,
data ,
size ,
false);
printf("caturl=%s #catids=%li version=%li\n"
fprintf(stdout,
"key=%s caturl=%s #catids=%li version=%li\n"
,KEYSTR(&k,12)
,crec.m_url
,(long)crec.m_numCatids
,(long)crec.m_version