//
// Matt Wells, copyright Sep 2001
//

#include "gb-include.h"
#include <sched.h>        // clone()

// declare this stuff up here so we can call the pread() in our seek test
// below
//
// maybe we should put this in a common header file so we don't have
// certain files compiled with the platform default, and some not -partap
//#include "GBVersion.h"
#include "Mem.h"
#include "Conf.h"
#include "Threads.h"
#include "Hostdb.h"
#include "Indexdb.h"
#include "Posdb.h"
#include "Cachedb.h"
#include "Monitordb.h"
#include "Datedb.h"
#include "Titledb.h"
#include "Revdb.h"
#include "Tagdb.h"
#include "Catdb.h"
#include "Users.h"
#include "Tfndb.h"
#include "Spider.h"
//#include "Doledb.h"
//#include "Checksumdb.h"
#include "Clusterdb.h"
#include "Sections.h"
#include "Statsdb.h"
#include "UdpServer.h"
#include "PingServer.h"
#include "Repair.h"
#include "DailyMerge.h"
#include "MsgC.h"
#include "HttpServer.h"
#include "Loop.h"
#include "Spider.h"
#include <sys/resource.h> // setrlimit
#include "Stats.h"
#include "Spider.h"
//#include "GBVersion.h"
#include "Speller.h"      // g_speller
//#include "Thesaurus.h"  // g_thesaurus
//#include "Synonyms.h"   // g_synonyms
#include "Wiki.h"         // g_wiki
#include "Wiktionary.h"   // g_wiktionary
#include "Scraper.h"      // g_scraper
//#include "QueryRouter.h"
#include "Categories.h"
#include "CountryCode.h"
#include "Pos.h"
#include "Title.h"
#include "Speller.h"
//#include "Syncdb.h"

// include all msgs that have request handlers, cuz we register them
// with g_udp
#include "Msg0.h"
#include "Msg1.h"
#include "Msg4.h"
//#include "Msg6.h"
//#include "Msg7.h"
//#include "Msg11.h"
//#include "Msg12.h"
#include "Msg13.h"
#include "Msg20.h"
#include "Msg22.h"
//#include "Msg23.h"
#include "Msg2a.h"
#include "Msg36.h"
#include "Msg39.h"
#include "Msg40.h" // g_resultsCache
#include "Msg9b.h"
#include "Msg17.h"
//#include "Msg34.h"
#include "Msg35.h"
//#include "Msg24.h"
//#include "Msg28.h"
//#include "Msg30.h"
//#include "MsgB.h"
//#include "Msg3e.h"
#include "Parms.h"
//#include "Msg50.h"
//#include "MsgF.h"
//#include "Msg33.h"
//#include "mmseg.h" // open_lexicon(), etc. for Chinese parsing
//#include "PageTopDocs.h"
#include "PageNetTest.h"
//#include "Sync.h"
#include "Pages.h"
//#include "Msg1c.h"
//#include "Msg2e.h"
//#include "Msg6a.h"
#include "Unicode.h"
//#include
#include "AutoBan.h"
//#include "SiteBonus.h"
#include "Msg1f.h"
#include "Profiler.h"
//#include "HashTableT.h"
//#include "Classifier.h"
#include "Blaster.h"
#include "Proxy.h"
//#include "HtmlCarver.h"
//#include "Matchers.h"
#include "linkspam.h"
#include "Process.h"
#include "sort.h"
//#include "SiteBonus.h"
#include "Ads.h"
#include "LanguagePages.h"
//#include "Msg3b.h"
#include "ValidPointer.h"
#include "RdbBuckets.h"
//#include "PageTurk.h"
//#include "QAClient.h"
//#include "Diff.h"
#include "Placedb.h"
#include "Test.h"
#include "seo.h"
#include "Json.h"
//#include "Facebook.h"
//#include "Accessdb.h"

// call this to shut everything down
bool mainShutdown ( bool urgent ) ;
//bool mainShutdown2 ( bool urgent ) ;

bool registerMsgHandlers  ( ) ;
bool registerMsgHandlers1 ( ) ;
bool registerMsgHandlers2 ( ) ;
bool registerMsgHandlers3 ( ) ;

// makes a default conf file and saves into confFilename
//void makeNewConf ( long hostId , char *confFilename );

void getPageWrapper ( int fd , void *state ) ;
void allExitWrapper ( int fd , void *state ) ;

//bool QuerySerializeTest( char *ff ); // Query.cpp

//#ifndef _LARS_
static void dumpTitledb  ( char *coll,long sfn,long numFiles,bool includeTree,
                           long long docId , char justPrintDups ,
                           bool dumpSentences , bool dumpWords );
static void dumpTfndb    ( char *coll,long sfn,long numFiles,bool includeTree,
                           bool verify);
static long dumpSpiderdb ( char *coll,long sfn,long numFiles,bool includeTree,
                           char printStats , long firstIp );
static void dumpSectiondb( char *coll,long sfn,long numFiles,bool includeTree);
static void dumpRevdb    ( char *coll,long sfn,long numFiles,bool includeTree);
static void dumpTagdb    ( char *coll,long sfn,long numFiles,bool includeTree,
                           long c, char rec=0, long rdbId = RDB_TAGDB );
static void dumpIndexdb  ( char *coll,long sfn,long numFiles,bool includeTree,
                           long long termId ) ;
void dumpPosdb     ( char *coll,long sfn,long numFiles,bool includeTree,
                     long long termId , bool justVerify ) ;
static void dumpWaitingTree( char *coll );
static void dumpDoledb ( char *coll,long sfn,long numFiles,bool includeTree);
void dumpDatedb    ( char *coll,long sfn,long numFiles,bool includeTree,
                     long long termId , bool justVerify ) ;
void dumpClusterdb ( char *coll,long sfn,long numFiles,bool includeTree);
//void dumpChecksumdb ( char *coll,long sfn,long numFiles,bool includeTree);
//void dumpStatsdb ( long startFileNum, long numFiles, bool includeTree,
//                   int test );
void dumpLinkdb    ( char *coll,long sfn,long numFiles,bool includeTree,
                     char *url );

void exitWrapper ( void *state ) { exit(0); };

//////
//
// if seo.o is being linked to it needs to override these weak stubs:
//
//////
bool loadQueryLog()                              __attribute__((weak));
void runSEOQueryLoop ( int fd, void *state )     __attribute__((weak));
bool sendPageSEO   (TcpSocket *, HttpRequest *)  __attribute__((weak));
void handleRequest8e(UdpSlot *, long netnice )   __attribute__((weak));
void handleRequest4f(UdpSlot *, long netnice )   __attribute__((weak));
void handleRequest95(UdpSlot *, long netnice )   __attribute__((weak));
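
// NOTE: __attribute__((weak)) marks the declarations above as weak symbols.
// If seo.o is linked in and defines strong versions of these functions, the
// linker silently prefers those; otherwise the no-op stubs defined below are
// used. This is standard GCC/ELF weak-linkage behavior.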
// make the stubs here. seo.o will override them.
bool loadQueryLog() { return true; }
void runSEOQueryLoop ( int fd, void *state ) { return; }
bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {
    return g_httpServer.sendErrorReply(s,500,"Seo support not present");
}
void handleRequest8e(UdpSlot *, long netnice ) { return; }
void handleRequest4f(UdpSlot *, long netnice ) { return; }
void handleRequest95(UdpSlot *, long netnice ) { return; }

// for cleaning up indexdb
void dumpMissing  ( char *coll );
void dumpDups     ( char *coll );
void removeDocIds ( char *coll , char *filename );

static void dumpIndexdbFile ( long fn , long long off , char *f , long ks ,
                              char *NAME = NULL );

//static void dumpCachedRecs ( char *coll,long sfn,long numFiles,
//                             bool includeTree, long long docId );
//static bool testBoolean() ;
//static void qaTest(char *s1, char *s2, char *u, char *q);
//static void xmlDiffTest(char *f1, char *f2, DiffOpt *opt);
//void testSpamRules(char *coll,long startFileNum,long numFiles,
//                   bool includeTree, long long docid);
//void takeSnapshotWrapper( int status, void *state); // JAB: warning abatement
//static bool checkDataParity ( ) ;
//#endif

static long checkDirPerms ( char *dir ) ;

//static bool fixTitleRecs( char *coll ) ;
//static long getRecSize ( BigFile *f , long long off ) ;
//static bool addToChecksumdb ( char *coll , TitleRec *tr ) ;
//static bool addToSpiderdb   ( char *coll , TitleRec *tr ) ;
// Need these two if tr's in addtospiderdb are getting their quality from
// their root urls.
/*static HashTableT s_rootUrls;
static bool loadRootUrls ( char *filename);*/
//static bool addToTfndb  ( char *coll , TitleRec  *tr , long id2 ) ;
//static bool addToTfndb2 ( char *coll , SpiderRec *sr , long id2 ) ;
//static bool mergeChecksumFiles ( ) ;
//static bool genDbs   ( char *coll ) ;
//static bool genTfndb ( char *coll ) ;
//static bool fixTfndb ( char *coll ) ;
//static bool makeClusterdb ( char *coll ) ;
//static bool genDateRange  ( char *coll ) ;

// diff with indexdb in sync/ dir
//bool syncIndexdb ( );
//bool gbgzip   (char *filename);
//bool gbgunzip (char *filename);
//bool trietest ( ) ;
//bool matchertest ( int argc, char* argv[] );

// benchmark RdbTree::addRecord() for indexdb
bool treetest    ( ) ;
bool bucketstest ( char *dbname ) ;
bool hashtest    ( ) ;

// how fast to parse the content of this docId?
bool parseTest ( char *coll , long long docId , char *query );

//bool carveTest ( uint32_t radius, char *fname, char* query );

bool summaryTest1 ( char *rec, long listSize, char *coll , long long docId ,
                    char *query );
//bool summaryTest2 ( char *rec, long listSize, char *coll , long long docId ,
//                    char *query );
//bool summaryTest3 ( char *rec, long listSize, char *coll , long long docId ,
//                    char *query );

// time a big write, read and then seeks
bool thrutest ( char *testdir , long long fileSize ) ;
void seektest ( char *testdir , long numThreads , long maxReadSize ,
                char *filename );

bool pingTest ( long hid , unsigned short clientPort );
bool memTest();
bool cacheTest();
bool ramdiskTest();

void countdomains( char* coll, long numRecs, long verb, long output );

UdpProtocol g_dp; // Default Proto

//void zlibtest ( );

// installFlag konstants
typedef enum {
    ifk_install        =  1,
    ifk_start          =  2,
    ifk_installgb      =  3,
    ifk_installconf    =  4,
    ifk_gendbs         = 10,
    ifk_fixtfndb       = 11,
    ifk_gentfndb       = 12,
    ifk_installcat     = 13,
    ifk_installnewcat  = 14,
    ifk_genclusterdb   = 15,
    ifk_distributeC    = 16,
    ifk_installgb2     = 17,
    ifk_dsh            = 18,
    ifk_dsh2           = 19,
    ifk_backupcopy     = 20,
    ifk_backupmove     = 21,
    ifk_backuprestore  = 22,
    ifk_proxy_start    = 23,
    ifk_installconf2   = 24,
    ifk_installcat2    = 25,
    ifk_kstart         = 26,
    ifk_installnewcat2 = 27,
    ifk_dumpmissing    = 30,
    ifk_removedocids   = 31,
    ifk_dumpdups       = 32,
    ifk_install2       = 33,
    ifk_tmpstart       = 41,
    ifk_installtmpgb   = 42,
    ifk_proxy_kstart   = 43,
    ifk_start2         = 222
} install_flag_konst_t;

int install ( install_flag_konst_t installFlag , long hostId ,
              char *dir = NULL , char *coll = NULL , long hostId2 = -1 ,
              char *cmd = NULL );

int scale      ( char *newhostsconf , bool useShotgunIp );
int collinject ( char *newhostsconf );
int collcopy   ( char *newHostsConf , char *coll , long collnum ) ;

bool doCmd ( const char *cmd , long hostId , char *filename ,
             bool sendToHosts, bool sendToProxies, long hostId2=-1 );

int injectFile ( char *filename , char *ips , long long startDocId ,
                 long long endDocId , bool isDelete ) ;
int injectFileTest ( long reqLen , long hid ); // generates the file

void membustest ( long nb , long loops , bool readf ) ;

bool dosOpen(long targetIp, unsigned short port, int numSocks);

//void tryMergingWrapper ( int fd , void *state ) ;
void saveRdbs ( int fd , void *state ) ;

bool shutdownOldGB ( short port ) ;

//void resetAll ( );
//void spamTest ( ) ;

extern void resetPageAddUrl    ( );
extern void resetHttpMime      ( );
extern void reset_iana_charset ( );
extern void resetAdultBit      ( );
extern void resetDomains       ( );
extern void resetEntities      ( );
extern void resetQuery         ( );
extern void resetStopWords     ( );
extern void resetUnicode       ( );

#if 0
void stack_test();
void stack_test(){
    char *dummy[7000000];
    dummy[0] = '\0';
    dummy[6999999] = '\0';
    printf("dummy: 0x%x = 0x%x", (unsigned int)&(dummy[0]),
           (unsigned int)&(dummy[6999999]));
}
#endif

int main ( int argc , char *argv[] ) {
    // appears that linux 2.4.17 kernel would crash with this?
    // let's try again on gk127 to make sure
    // YES! gk0 cluster has run for months with this just fine!!
    mlockall(MCL_CURRENT|MCL_FUTURE);

    //g_timedb.makeStartKey ( 0 );

    // Anchor the stack start point at the first stack variable in main.
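    // (Taking the address of a local in main() gives g_mem a marker near
    // the base of the stack; presumably other stack addresses are later
    // compared against it to gauge stack usage. This is an inference from
    // the setter's name, not documented here.)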
    char stackPointTestAnchor;
    g_mem.setStackPointer( &stackPointTestAnchor );

    // record time for uptime
    g_stats.m_uptimeStart = time(NULL);

    // malloc test for efence
    //char *ff = (char *)mmalloc(100,"efence");
    //ff[100] = 1;

    // Begin Pointer Check setup
    //uint32_t firstArg = 0;
    //ValidPointer vpointerObject((void*)&firstArg);
    //vpointerObject.isValidPointer(&vpointerObject); // whiny compiler
    // End Pointer Check setup

    if (argc < 1) {
    printHelp:
        fprintf(stdout, "Usage: gb [-c hostsConf] <cmd>\n\n");
        fprintf(stdout,
                "Items in []'s are optional, and items in <>'s are "
                "required.\n\n");
        fprintf(stdout,
                "[hostsConf] is the hosts.conf config file as "
                "described in overview.html. If not\nprovided then "
                "it is assumed to be ./hosts.conf.\n\n" );
        fprintf(stdout,
                "<cmd> can have the following values:\n\n"

                "-h\tprint this help.\n\n"
                "-v\tprint version and exit.\n\n"
                "-o\tprint the overview documentation in HTML. "
                "Contains the format of\n\thosts.conf.\n\n"
                "-r\tindicates recovery mode, "
                "sends email to addresses "
                "specified in Conf.h\n\tupon startup.\n\n"

                "<hostId>\n"
                "\tstart the gb process for this <hostId> locally.\n\n"
                "start [hostId]\n"
                "\tstart the gb process on all hosts or just on "
                "[hostId] if specified.\n\n"
                "stop [hostId]\n"
                "\tsaves and exits for all gb hosts or "
                "just on [hostId] if specified.\n\n"
                "save [hostId]\n"
                "\tjust saves for all gb hosts or "
                "just on [hostId] if specified.\n\n"
                "start [hostId1-hostId2]\n"
                "\ttwo hostids with a hyphen in between indicates a "
                "range.\n\n"
                "stop [hostId1-hostId2]\n"
                "\ttwo hostids with a hyphen in between indicates a "
                "range.\n\n"
                "tmpstart [hostId]\n"
                "\tstart the gb process on all hosts or just on "
                "[hostId] if specified, but\n\t"
                "use the ports specified in hosts.conf PLUS one. "
                "Then you can switch the\n\t"
                "proxy over to point to those and upgrade the "
                "original cluster's gb.\n\t"
                "That can be done in the Master Controls of the "
                "proxy using the 'use\n\t"
                "temporary cluster'. Also, this assumes the binary "
                "name is tmpgb not gb.\n\n"
                "tmpstop [hostId]\n"
                "\tsaves and exits for all gb hosts or "
                "just on [hostId] if specified, for\n\tthe "
                "tmpstart command.\n\n"
                "spidersoff [hostId]\n"
                "\tdisables spidering for all gb hosts or "
                "just on [hostId] if specified.\n\n"
                "spiderson [hostId]\n"
                "\tenables spidering for all gb hosts or "
                "just on [hostId] if specified.\n\n"
                "cacheoff [hostId]\n"
                "\tdisables all disk PAGE caches on all hosts or "
                "just on [hostId] if\n\tspecified.\n\n"
                "freecache [maxShmid]\n"
                "\tfinds and frees all shared memory up to shmid "
                "maxShmid, default is 7000000.\n\n"
                "ddump [hostId]\n"
                "\tdisk dump in memory trees to binary files "
                "just on [hostId] if specified.\n\n"
                "pmerge [hostId|hostId1-hostId2]\n"
                "\tforce merge of posdb files "
                "just on [hostId] if specified.\n\n"
                "smerge [hostId|hostId1-hostId2]\n"
                "\tforce merge of sectiondb files "
                "just on [hostId] if specified.\n\n"
                "tmerge [hostId|hostId1-hostId2]\n"
                "\tforce merge of titledb files "
                "just on [hostId] if specified.\n\n"
                "merge [hostId|hostId1-hostId2]\n"
                "\tforce merge of all rdb files "
                "just on [hostId] if specified.\n\n"
                "dsh <cmd>\n"
                "\trun this command on the primary IPs of "
                "all active hosts in\n\thosts.conf. Example: "
                "gb dsh 'ps auxw; uptime'\n\n"
                "dsh2 <cmd>\n"
                "\trun this command on the secondary IPs of "
                "all active hosts in\n\thosts.conf. Example: "
                "gb dsh2 'ps auxw; uptime'\n\n"
                "install [hostId]\n"
                "\tinstall all required files for gb from "
                "current working directory\n"
                "\tto [hostId]. "
                "If no [hostId] is specified install "
                "to ALL hosts.\n\n"
                "install2 [hostId]\n"
                "\tlike above, but use the secondary IPs in the "
                "hosts.conf.\n\n"
                "installgb [hostId]\n"
                "\tlike above, but install just the gb executable.\n\n"
                "installgb2 [hostId]\n"
                "\tlike above, but use the secondary IPs in the "
                "hosts.conf.\n\n"
                "installtmpgb [hostId]\n"
                "\tlike above, but install just the gb executable "
                "as tmpgb (for tmpstart).\n\n"
                "installconf [hostId]\n"
                "\tlike above, but install hosts.conf and gbN.conf\n\n"
                "installconf2 [hostId]\n"
                "\tlike above, but install hosts.conf and gbN.conf "
                "to the secondary IPs.\n\n"
                "installcat [hostId]\n"
                "\tlike above, but install just the catdb files.\n\n"
                "installcat2 [hostId]\n"
                "\tlike above, but install just the catdb files to "
                "the secondary IPs.\n\n"
                "installnewcat [hostId]\n"
                "\tlike above, but install just the new catdb files."
                "\n\n"
                "installnewcat2 [hostId]\n"
                "\tlike above, but install just the new catdb files "
                "to the secondary IPs.\n\n"
                "backupcopy <subdir>\n"
                "\tsave a copy of all xml, config, data and map files "
                "into <subdir>,\n\twhich is relative "
                "to the working dir. Done for all hosts.\n\n"
                "backupmove <subdir>\n"
                "\tmove all xml, config, data and map files "
                "into <subdir>, which\n\tis relative "
                "to the working dir. Done for all hosts.\n\n"
                "backuprestore <subdir>\n"
                "\tmove all xml, config, data and map files "
                "in <subdir>, which\n\tis relative "
                "to the working dir, into the working dir. "
                "Will NOT\n\toverwrite anything. Done for all "
                "hosts.\n\n"
                "proxy start [proxyId]\n"
                "\tStart a proxy that acts as a frontend to gb "
                "and passes on\n\trequests to random machines on "
                "the cluster given in hosts.conf.\n\tHelps to "
                "distribute the load evenly across all machines.\n\n"
                "proxy load <proxyId>\n"
                "\tStart a proxy process directly without calling "
                "ssh. Called\n\tby 'gb proxy start'.\n\n"
                "proxy stop [proxyId]\n"
                "\tStop a proxy that acts as a frontend to gb.\n\n"
                "blasterdiff [-v] [-j] [-p] <file1> <file2> "
                "[maxNumThreads] [wait]\n"
                "\tcompare search results between urls in file1 and "
                "file2 and output the\n\tsearch results in the url"
                " from file1 not found in the url from file2.\n\t"
                "maxNumThreads is the number of concurrent "
                "comparisons "
                "that should be\n\tdone at one time and wait is the "
                "time to wait between comparisons.\n\t-v is for "
                "verbose "
                "and -j is to just display links not found and "
                "not\n\t"
                "search for them on server2. If you do not want to"
                " use the proxy server\n\t"
                "on gk10, use -p\n\n"
                "blaster [-l|-u|-i] <file> [maxNumThreads] [wait]\n"
                "\tget documents from the urls given in file. The "
                "-l argument is to\n\t"
                "automatically get documents "
                "from the gigablast log file.\n"
                "\t-u means to inject/index the url into gb.\n"
                "\t-i means to inject/index the url into gb AND "
                "add all of its outlinks to\n"
                "\tspiderdb for spidering, "
                "which also entails a DNS lookup on each outlink.\n"
                "\tmaxNumThreads is the"
                " number of concurrent threads at one time and wait "
                "\n\tis the time to wait between threads.\n\n"
                "scale <newhosts.conf>\n"
                "\tGenerate a script to be called to migrate the "
                "data to the new places.\n\tRemaining hosts will "
                "keep the data they have, but it will be\n\t"
                "filtered during the next merge operations.\n\n"
                "collcopy <newHosts.conf> <coll> <collnum>\n"
                "\tGenerate a script to copy the collection data on "
                "the cluster defined by\n\tnewHosts.conf to the "
                "current cluster. "
                "Remote network must have "
                "called\n\t\"gb ddump\" twice in a row just "
                "beforehand to "
                "ensure all of its data is\n\ton disk.\n\n"

                // gb inject <file> <ips> [startdocid]
                // gb inject titledb <file> <ips> [startdocid]
                "inject <file> <ips> [startdocid]\n"
                "inject titledb <file> <ips> [startdocid]\n"
                "\tInject all documents in <file> into [hostId]. If "
                "[hostId] not given,\n\t0 is assumed. Each document "
                "must be preceded by a valid HTTP mime with\n\t"
                "a Content-Length: field.\n\n"
                "injecttest <reqLen> [hostId]\n"
                "\tinject random documents into [hostId]. If [hostId] "
                "not given\n\t0 is assumed.\n\n"
                "ping <hostId> [clientport]\n"
                "\tperforms pings to <hostId>. [clientport] defaults "
                "to 2050.\n\n"
                "spellcheck <file>\n"
                "\tspellchecks the queries in <file>.\n\n"
                "dictlookuptest <file>\n"
                "\tgets the popularities of the entries in the "
                "<file>.\n Used to only check performance of "
                "getPhrasePopularity.\n\n"
                //"stemmertest <file>\n"
                //"\truns the stemmer on words in <file>.\n\n"
                //"queryserializetest <file>\n"
                //"\tserializes every query in <file> and tracks "
                //"statistics, as well as \t\nverifying consistency; "
                //"takes raw strings or URLs as input\n\n"

                // less common things
                "gendict <coll> [numWordsToDump]\n\tgenerate "
                "dictionary used for spellchecker "
                "from titledb\n\tfiles in collection <coll>. Use "
                "first [numWordsToDump] words.\n\n"
                //#ifndef _LARS_
                //"gendbs [hostId]\n\tgenerate missing spiderdb, "
                //"tfndb and checksumdb files from titledb\n\tfiles.\n\n"
                //"gentfndb [hostId]\n\tgenerate missing tfndb. "
                //"titledb disk dumps and tight merges are no\n\t"
                //"longer necessary. Also "
                //"generates tfndb from spiderdb. tfndb-saved.dat\n\t"
                //"and all tfndb* files in the collection subdir "
                //"must not exist, so move\n\tthem to a temp dir.\n\n"
                //"fixtfndb [hostId]\n\tremove tfndb recs "
                //"referring to non-existent titledb recs.\n\n"
                //"genclusterdb [hostId]\n\tgenerate missing "
                //"clusterdb.\n\n"
                //"gendaterange [hostId]\n\tgenerate missing "
                //"date range terms in all title recs.\n\n"
                //"update\tupdate titledb0001.dat\n\n"
                //"mergechecksumdb\tmerge checksumdb flat files\n\n"
                "treetest\n\ttree insertion speed test\n\n"
                "bucketstest [dbname]\n\tcompare speed and accuracy of "
                "buckets vs tree in add, getList and deleteList. "
                "With an argument, test validity of db's saved "
                "buckets\n\n"
                "hashtest\n\tadd and delete into hashtable test\n\n"
                "parsetest <docId> [coll] [query]\n\t"
                "parser speed tests\n\n"
                "thrutest [dir] [fileSize]\n\tdisk write/read speed "
                "test\n\n"
                "seektest [dir] [numThreads] [maxReadSize] "
                "[filename]\n"
                "\tdisk seek speed test\n\n"
                "memtest\n"
                "\t Test how much memory we can use\n\n"

                // Quality Tests
                "countdomains <coll> <X>\n"
                "\tCounts the domains and IPs in collection coll and "
                "in the first X\n\ttitledb records. Results are sorted "
                "by popularity and stored in\n\tthe log file. \n\n"
                "cachetest\n\t"
                "cache stability and speed tests\n\n"
                "ramdisktest\n\t"
                "test ramdisk functionality\n\n"
                "dosopen <ip> <port> <numThreads>\n"
                "\tOpen numThreads tcp sockets to ip:port and just "
                "sit there. "
                "For\n\ttesting the robustness of gb.\n\n"
                "xmldiff [-td] <file1> <file2>\n"
                "\tTest xml diff routine on file1 and file2.\n"
                "\t-t: only show diffs in tag structure.\n"
                "\t-d: print debug output.\n"
                "\n"
                "dump e <UTCtimestamp>\n\tdump all events "
                "as if the time is UTCtimestamp.\n\n"
                "dump es <UTCtimestamp>\n\tdump stats for "
                "all events as if the time is UTCtimestamp.\n\n"
#ifdef _CLIENT_
                // there was <T> in this command but it wasn't used in
                // the program, so deleting it from here
                "dump <V> [C [X [Y [Z]]]]\n\tdump a db in "
#else
                "dump <V> [C [X [Y [Z [T]]]]]\n\tdump a db in "
#endif
                "working directory.\n"
#ifndef _CLIENT_
#ifndef _METALINCS_
                //"\tV is u to dump tfndb.\n"
                "\tV is d to dump datedb.\n"
#endif
#endif
                "\tV is s to dump spiderdb. set [T] to 1 to print "
                "new stats. 2 to print old stats. T is ip of firstip."
                "\n"
                "\tV is t to dump titledb.\n"
                "\tV is ts to dump sentences from events.\n"
                "\tV is tw to dump words from events.\n"
                "\tV is D to dump duplicate docids in titledb.\n"
                "\tV is c to dump checksumdb.\n"
                "\tV is S to dump tagdb.\n"
                "\tV is W to dump tagdb for wget.\n"
                "\tV is V to dump revdb.\n"
                "\tV is x to dump doledb.\n"
                "\tV is w to dump waiting tree.\n"
                "\tV is B to dump sectiondb.\n"
                "\tV is C to dump catdb.\n"
                "\tV is l to dump clusterdb.\n"
                "\tV is z to dump statsdb all keys.\n"
                "\tV is Z to dump statsdb all keys and data samples.\n"
                "\tV is L to dump linkdb.\n"
                "\tV is u to dump tfndb.\n"
                "\tV is vu to verify tfndb.\n"
                "\tC is the name of the collection.\n"
                "\tX is start file num. (default 0)\n"
                "\tY is num files. (default -1)\n"
                "\tZ is 1 to include tree. (default 1)\n"
#ifndef _CLIENT_
#ifndef _METALINCS_
#ifndef _GLOBALSPEC_
                "\tT is the termid to dump. Applies only to indexdb.\n"
#endif
#endif
#endif
                "\tT is the first docId to dump. Applies only to "
                "titledb. "
                "(default none)\n\n"
                "\tV is c to dump cached recs.\n"
                "dump s [X [Y [Z [C]]]]\n"
                "\tdump spider in working directory.\n"
                "\tC is the collection name. (default none)\n"
                "\tX is start file num. (default 0)\n"
                "\tY is num files. (default -1)\n"
                "\tZ is 1 to include tree. (default 1)\n"
                //"\tA is 1 for new urls, 0 for old. (default 1)\n"
                //"\tA is -1 to dump all urls in all queues.\n"
                //"\tB is priority of urls. (default -1)\n"
                //"\tB is -1 to dump all priorities\n"
                "\tC is 1 to just show the stats. (default 0)\n"
                "\n"
                //"dump i X Y Z t\n\tdump indexdb termId t in working "
                //"directory.\n"
                //"\tX is start file num. (default 0)\n"
                //"\tY is num files. (default -1)\n"
                //"\tZ is 1 to include tree. (default 1)\n"
                //"\tt is the termid to dump. (default none)\n\n"
#ifndef _CLIENT_
#ifndef _METALINCS_
                "dump I [X [V]]\n\tdump indexdb in working "
                "directory at "
                "an offset.\n"
#endif
#endif
                "\tX is the file NAME. (default NULL)\n"
                "\tV is the start offset. (default 0)\n"
                "\n"
                "dumpmissing <coll> [hostId]\n\t"
                "dump the docIds in indexdb but not "
                "in tfndb/titledb to stderr. "
                "\n\tUsed for passing in to removedocids.\n"
                "\n"
                "dumpdups <coll> [hostId]\n\t"
                "dump the docIds duplicated in indexdb when "
                "they should not be, to\n\tstderr. Usually a sign "
                "of mis-indexing. Used for passing in to\n\t"
                "removedocids.\n"
                "\n"
                "removedocids <coll> <fileOfDocIds> "
                "[hostId|hostId1-hostId2]"
                "\n\tremoves the docids in fileOfDocIds from indexdb, "
                "clusterdb, checksumdb\n\tand tfndb. Effectively "
                "completely deleting that docid. "
" "fileOfDocIds\n\tcontains one " "docId per line, and nothing more.\n" "\n" "setnote " "\n\tsets the note for host with hostid to " "the given note .\n" "\n" "setsparenote " "\n\tsets the note for spare with spareid to " "the given note .\n" "\n" "replacehost " "\n\treplaces host with hostid with the " "spare that has the spareid\n\t. the host " "being replaced should already be shut down or dead.\n" "\n" "synchost " "\n\trecopies this host from its twin. host directory " "must be empty and\n\tthe host must be marked as dead " "in the current gb. Use synchost2 to use secondary " "IPs.\n" "\n" //#endif ); // disable printing of used memory g_mem.m_used = 0; return 0; } // get hosts.conf file char *hostsConf = "./hosts.conf"; long hostId = 0; long cmdarg = 1; if ( argc >= 3 && argv[1][0]=='-'&&argv[1][1]=='c'&&argv[1][2]=='\0') { hostsConf = argv[2]; cmdarg = 3; } // get command if ( argc <= cmdarg ) goto printHelp; char *cmd = argv[cmdarg]; // help if ( strcmp ( cmd , "-h" ) == 0 ) goto printHelp; // version if ( strcmp ( cmd , "-v" ) == 0 ) { // fprintf(stderr,"Gigablast %s\nMD5KEY: %s\n" // "TAG: %s\nPATH: %s\n", // GBVersion, GBCommitID, GBTag, GBBuildPath); return 0; } // print overview if ( strcmp ( cmd , "-o" ) == 0 ) { //printOverview ( ); return 0; } // assume our hostId is the command! // now we advance 'cmd' past the hostId if we detect // the presence of more args if ( is_digit(argv[cmdarg][0]) ) { hostId = atoi(argv[cmdarg]); if(argc > cmdarg+1) { cmd = argv[++cmdarg]; } } if ( strcmp ( cmd , "dosopen" ) == 0 ) { long ip; short port = 8000; long numSockets = 100; if ( cmdarg + 1 < argc ) ip = atoip(argv[cmdarg+1],gbstrlen(argv[cmdarg+1])); else goto printHelp; if ( cmdarg + 2 < argc ) port = (short)atol ( argv[cmdarg+2] ); if ( cmdarg + 3 < argc ) numSockets = atol ( argv[cmdarg+3] ); return dosOpen(ip, port, numSockets); } //SafeBuf sb; //char *str = "fun glassblowing now"; //sb.truncateLongWords ( str , strlen(str),10); //send an email on startup for -r, like if we are recovering from an //unclean shutdown. 
    bool recoveryMode = false;
    if ( strcmp ( cmd , "-r" ) == 0 ) recoveryMode = true;

    bool testMandrill = false;
    if ( strcmp ( cmd , "emailmandrill" ) == 0 ) testMandrill = true;

    // gb gendbs, preset the hostid at least
    if ( //strcmp ( cmd , "gendbs"   ) == 0 ||
         //strcmp ( cmd , "gentfndb" ) == 0 ||
         //strcmp ( cmd , "fixtfndb" ) == 0 ||
         strcmp ( cmd , "dumpmissing" ) == 0 ||
         strcmp ( cmd , "dumpdups"    ) == 0 ||
         //strcmp ( cmd , "gencatdb"     ) == 0 ||
         //strcmp ( cmd , "genclusterdb" ) == 0 ||
         //strcmp ( cmd , "gendaterange" ) == 0 ||
         strcmp ( cmd , "distributeC" ) == 0 ) {
        // ensure we got a collection name after the cmd
        if ( cmdarg + 2 > argc ) goto printHelp;
        // may also have an optional hostid
        if ( cmdarg + 3 == argc ) hostId = atoi ( argv[cmdarg+2] );
    }

    if ( (strcmp( cmd, "countdomains" ) == 0) && (argc >= (cmdarg + 2)) ) {
        unsigned long tmp = atoi( argv[cmdarg+2] );
        if ( (tmp * 10) > g_mem.m_memtablesize )
            g_mem.m_memtablesize = tmp * 10;
    }

    // set it for g_hostdb and for logging
    g_hostdb.m_hostId = hostId;

    //if ( strcmp ( cmd , "gzip" ) == 0 ) {
    //    if ( argc > cmdarg+1 ) gbgzip(argv[cmdarg+1]);
    //    else goto printHelp;
    //    return 0;
    //}
    //if ( strcmp ( cmd , "gunzip" ) == 0 ) {
    //    if ( argc > cmdarg+1 ) gbgunzip(argv[cmdarg+1]);
    //    else goto printHelp;
    //    return 0;
    //}

    // these tests do not need a hosts.conf
    /*
    if ( strcmp ( cmd , "trietest" ) == 0 ) {
        trietest();
        return 0;
    }
    if ( strcmp ( cmd, "matchertest" ) == 0 ) {
        matchertest(argc - 2, argv + 2);
        return 0;
    }
    */

    if ( strcmp ( cmd , "bucketstest" ) == 0 ) {
        if      ( argc >  cmdarg+1 ) bucketstest(argv[cmdarg+1]);
        else if ( argc == cmdarg+1 ) bucketstest(NULL);
        else goto printHelp;
        return 0;
    }

    // these tests do not need a hosts.conf
    if ( strcmp ( cmd , "treetest" ) == 0 ) {
        if ( argc > cmdarg+1 ) goto printHelp;
        treetest();
        return 0;
    }

    // these tests do not need a hosts.conf
    if ( strcmp ( cmd , "hashtest" ) == 0 ) {
        if ( argc > cmdarg+1 ) goto printHelp;
        hashtest();
        return 0;
    }

    // these tests do not need a hosts.conf
    if ( strcmp ( cmd , "memtest" ) == 0 ) {
        if ( argc > cmdarg+1 ) goto printHelp;
        memTest();
        return 0;
    }

    if ( strcmp ( cmd , "cachetest" ) == 0 ) {
        if ( argc > cmdarg+1 ) goto printHelp;
        cacheTest();
        return 0;
    }

    if ( strcmp ( cmd , "ramdisktest" ) == 0 ) {
        if ( argc > cmdarg+1 ) goto printHelp;
        ramdiskTest();
        return 0;
    }

    if ( strcmp ( cmd , "parsetest" ) == 0 ) {
        if ( cmdarg+1 >= argc ) goto printHelp;
        // load up hosts.conf
        if ( ! g_hostdb.init(hostsConf, hostId) ) {
            log("db: hostdb init failed." ); return 1; }
        // init our table for doing zobrist hashing
        if ( ! hashinit() ) {
            log("db: Failed to init hashtable." ); return 1; }
        long long docid = atoll1(argv[cmdarg+1]);
        char *coll  = "";
        char *query = "";
        if ( cmdarg+3 <= argc ) coll  = argv[cmdarg+2];
        if ( cmdarg+4 == argc ) query = argv[cmdarg+3];
        parseTest( coll, docid, query );
        return 0;
    }
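
    // Example (hypothetical values): "gb parsetest 12345 main 'some query'"
    // times the parser on docId 12345 in collection "main", per the
    // handler above.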
    /*
    if ( strcmp ( cmd , "carvetest" ) == 0 ) {
        if ( ! g_hostdb.init(hostsConf, hostId) ) {
            log("db: hostdb init failed." ); return 1; }
        if ( ! hashinit() ) {
            log("db: Failed to init hashtable." ); return 1; }
        if (!ucInit(g_hostdb.m_dir)) {
            log("Unicode initialization failed!"); return 1; }
        if (cmdarg+2 >= argc) {
            log("usage: gb carvetest qt1 ..." ); return 2; }
        uint32_t radius = atoi(argv[cmdarg+1]);
        char* fname = argv[cmdarg+2];
        char buf[65535];
        *buf = '\0';
        int virgin = 1;
        for (int i = cmdarg+3; i < argc; i++) {
            if (!virgin) strcat(buf, " ");
            else virgin = 0;
            strcat(buf, argv[i]);
        }
        printf("file: '%s' query: '%s'\n", fname, buf);
        carveTest(radius, fname, buf);
        return 0;
    }
    */

    if ( strcmp ( cmd , "booltest" ) == 0 ) {
        if ( ! g_hostdb.init(hostsConf, hostId) ) {
            log("db: hostdb init failed." ); return 1; }
        // init our table for doing zobrist hashing
        if ( ! hashinit() ) {
            log("db: Failed to init hashtable." ); return 1; }
        if (!ucInit(g_hostdb.m_dir)) {
            log("Unicode initialization failed!"); return 1; }
        //testBoolean();
        return 0;
    }

    /*
    if ( strcmp ( cmd , "querytest" ) == 0 ) {
        if ( ! g_hostdb.init(hostsConf, hostId) ) {
            log("db: hostdb init failed." ); return 1; }
        // init our table for doing zobrist hashing
        if ( ! hashinit() ) {
            log("db: Failed to init hashtable." ); return 1; }
        if (!ucInit(g_hostdb.m_dir)) {
            log("Unicode initialization failed!"); return 1; }
        queryTest();
        return 0;
    }
    */

    // need threads here for tests?

    // gb thrutest <testdir> <fileSize>
    if ( strcmp ( cmd , "thrutest" ) == 0 ) {
        if ( cmdarg+2 >= argc ) goto printHelp;
        char *testdir = argv[cmdarg+1];
        long long fileSize = atoll1 ( argv[cmdarg+2] );
        thrutest ( testdir , fileSize );
        return 0;
    }

    // gb seektest [testdir] [numThreads] [maxReadSize] [filename]
    if ( strcmp ( cmd , "seektest" ) == 0 ) {
        char *testdir         = "/tmp/";
        long  numThreads      = 20; //30;
        long long maxReadSize = 20000;
        char *filename        = NULL;
        if ( cmdarg+1 < argc ) testdir     = argv[cmdarg+1];
        if ( cmdarg+2 < argc ) numThreads  = atol(argv[cmdarg+2]);
        if ( cmdarg+3 < argc ) maxReadSize = atoll1(argv[cmdarg+3]);
        if ( cmdarg+4 < argc ) filename    = argv[cmdarg+4];
        seektest ( testdir , numThreads , maxReadSize , filename );
        return 0;
    }

    /*
    if ( strcmp ( cmd, "qa" ) == 0 ) {
        if ( ! g_hostdb.init(hostsConf, hostId) ) {
            log("db: hostdb init failed." ); return 1; }
        // init our table for doing zobrist hashing
        if ( ! hashinit() ) {
            log("db: Failed to init hashtable." ); return 1; }
        if (!ucInit(g_hostdb.m_dir)) {
            log("Unicode initialization failed!"); return 1; }
        char *s1 = NULL;
        char *s2 = NULL;
        char *u  = NULL;
        char *q  = NULL;
        if ( cmdarg+1 < argc ) s1 = argv[cmdarg+1];
        if ( cmdarg+2 < argc ) s2 = argv[cmdarg+2];
        if ( cmdarg+3 < argc ) u  = argv[cmdarg+3];
        if ( cmdarg+4 < argc ) q  = argv[cmdarg+4];
        qaTest(s1, s2, u, q);
        return 0;
    }

    // gb xmldiff file1 file2
    if ( strcmp ( cmd, "xmldiff" ) == 0 ) {
        if ( cmdarg+2 >= argc ) goto printHelp;
        // init our table for doing zobrist hashing
        if ( ! g_hostdb.init(hostsConf, hostId) ) {
            log("db: hostdb init failed." ); return 1; }
        if ( ! hashinit() ) {
            log("db: Failed to init hashtable." ); return 1; }
        if (!ucInit(g_hostdb.m_dir)) {
            log("Unicode initialization failed!"); return 1; }
        DiffOpt opt;
        int nextArg = cmdarg+1;
        while ( argc > nextArg && argv[nextArg][0] == '-' ) {
            char *c = argv[nextArg] + 1;
            while (*c) {
                switch (*c++) {
                case 't': opt.m_tagOnly = true; break;
                case 'd': opt.m_debug++   ; break;
                case 'c': opt.m_context++ ; break;
                default : goto printHelp;
                }
            }
            nextArg++;
        }
        if ( nextArg+1 >= argc ) goto printHelp;
        char *file1 = argv[nextArg  ];
        char *file2 = argv[nextArg+1];
        xmlDiffTest(file1, file2, &opt);
        return 0;
    }
    */

    // note the stack size for debug purposes
    struct rlimit rl;
    getrlimit(RLIMIT_STACK, &rl);
    log(LOG_INFO,"db: Stack size is %ld.", rl.rlim_cur);

    // set the s_pages array for print admin pages
    g_pages.init ( );

    bool isProxy = false;
    if ( strcmp( cmd , "proxy" ) == 0 &&
         strcmp( argv[cmdarg+1] , "load" ) == 0 ) {
        isProxy = true;
        // we need to parse out the hostid too!
        if ( cmdarg + 2 < argc ) hostId = atoi ( argv[cmdarg+2] );
    }

    // this is just like starting up a gb process, but we add one to
    // each port, we are a dummy machine in the dummy cluster.
    // gb -c hosts.conf tmpstart [hostId]
    char useTmpCluster = 0;
    if ( strcmp ( cmd , "tmpstart" ) == 0 ) useTmpCluster = 1;
    // gb -c hosts.conf tmpstop [hostId]
    if ( strcmp ( cmd , "tmpstop"  ) == 0 ) useTmpCluster = 1;
    // gb -c hosts.conf tmpstarthost <hostId>
    if ( strcmp ( cmd , "tmpstarthost" ) == 0 ) {
        useTmpCluster = 1;
        // we need to parse out the hostid too!
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        else goto printHelp;
    }

    // load up hosts.conf
    if ( ! g_hostdb.init(hostsConf, hostId, NULL, isProxy, useTmpCluster) ) {
        log("db: hostdb init failed." ); return 1; }

    // set clock file name so gettimeofdayInMillisecondsGlobal()
    // sees g_clockInSync as true... unless clockadjust.dat is more
    // than 2 days old, in which case not!
    if ( g_hostdb.m_myHost->m_hostId != 0 ) {
        // host #0 does not need this, everyone syncs with him
        setTimeAdjustmentFilename(g_hostdb.m_dir , "clockadjust.dat");
        // might as well load it i guess
        loadTimeAdjustment();
    }

    // the supporting network, used by gov.gigablast.com to get link text
    // from the larger main index. g_hostdb2. we don't care if this load
    // fails or not.
    //char h2[128];
    //sprintf ( h2 , "%shosts2.conf" , g_hostdb.m_dir );
    //if ( ! g_hostdb2.init(h2, 0 ,"external") ) {
    //    log("db: hosts2.conf hostdb init failed." ); return 1; }

    // init our table for doing zobrist hashing
    if ( ! hashinit() ) {
        log("db: Failed to init hashtable." ); return 1; }

    // . hashinit() calls srand() w/ a fixed number
    // . let's mix it up again
    srand ( time(NULL) );

    // do not save conf if any core dump occurs starting here
    // down to where we set this back to true
    g_conf.m_save = false;
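
    // Up to this point errors could only go to stderr; once g_log.init()
    // succeeds below, log() output also goes to the log file named by
    // g_hostdb.m_logFilename (taken from hosts.conf, it appears).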
    // start up log file
    if ( ! g_log.init( g_hostdb.m_logFilename ) ) {
        fprintf (stderr,"db: Log file init failed.\n" ); return 1; }

    // log the version
    //log(LOG_INIT,"conf: Gigablast Server %s",GBVersion);

    // Put this here so that now we can log messages
    if ( strcmp ( cmd , "proxy" ) == 0 ) {
        if ( argc < 3 ) { goto printHelp; exit (1); }
        long proxyId = -1;
        if ( cmdarg+2 < argc ) proxyId = atoi ( argv[cmdarg+2] );
        if ( strcmp ( argv[cmdarg+1] , "start" ) == 0 ) {
            return install ( ifk_proxy_start , proxyId );
        }
        if ( strcmp ( argv[cmdarg+1] , "kstart" ) == 0 ) {
            return install ( ifk_proxy_kstart , proxyId );
        }
        else if ( strcmp ( argv[cmdarg+1] , "stop" ) == 0 ) {
            g_proxy.m_proxyRunning = true;
            return doCmd ( "save=1" , proxyId , "master" ,
                           false , //sendtohosts
                           true  );//sendtoproxies
        }
        else if ( strcmp ( argv[cmdarg+1] , "replacehost" ) == 0 ) {
            g_proxy.m_proxyRunning = true;
            long hostId  = -1;
            long spareId = -1;
            if ( cmdarg + 2 < argc ) hostId  = atoi ( argv[cmdarg+2] );
            if ( cmdarg + 2 < argc ) spareId = atoi ( argv[cmdarg+3] );
            char replaceCmd[256];
            sprintf(replaceCmd, "replacehost=1&rhost=%li&rspare=%li",
                    hostId, spareId);
            return doCmd ( replaceCmd, -1, "master/hosts" ,
                           false , //sendtohosts
                           true  );//sendtoproxies
        }
        else if ( proxyId == -1 ||
                  strcmp ( argv[cmdarg+1] , "load" ) != 0 ) {
            goto printHelp;
            exit(1);
        }

        long yippyPort;
        if ( g_isYippy ) { yippyPort = proxyId; proxyId = 0; }

        Host *h = g_hostdb.getProxy( proxyId );
        unsigned short httpPort  = h->m_httpPort;
        if ( g_isYippy ) httpPort = yippyPort;
        unsigned short httpsPort = h->m_httpsPort;
        // we need udpserver for addurl and udpserver2 for pingserver
        unsigned short udpPort = h->m_port;
        //unsigned short udpPort2 = h->m_port2;

        // g_conf.m_maxMem = 2000000000;
        if ( ! g_conf.init ( h->m_dir ) ) { // , h->m_hostId ) ) {
            log("db: Conf init failed." ); return 1; }

        // init the loop before g_process since g_process
        // registers a sleep callback!
        if ( ! g_loop.init() ) {
            log("db: Loop init failed." ); return 1; }

        if ( ! g_threads.init() ) {
            log("db: Threads init failed." ); return 1; }

        g_process.init();

        if ( ! g_process.checkNTPD() )
            return log("db: ntpd not running on proxy");

        if ( ! g_isYippy && !ucInit(g_hostdb.m_dir) )
            return log("db: Unicode initialization failed!");

        // load speller unifiedDict for spider compression proxy
        //if ( g_hostdb.m_myHost->m_type & HT_SCPROXY )
        //    g_speller.init();

        if ( ! g_udpServer.init( g_hostdb.getMyPort() , &g_dp,
                                 0        ,   // niceness
                                 20000000 ,   // readBufSize
                                 20000000 ,   // writeBufSize
                                 20       ,   // pollTime in ms
                                 3500     ,   // max udp slots
                                 false    )){ // is dns?
            log("db: UdpServer init failed." ); return 1; }

        if ( ! g_proxy.initProxy (proxyId, udpPort, 0, &g_dp) )
            return log("proxy: init failed");

        // initialize Users
        if ( ! g_users.init() ) {
            log("db: Users init failed. "); return 1; }

        // then statsdb
        if ( ! g_isYippy && ! g_statsdb.init() ) {
            log("db: Statsdb init failed." ); return 1; }

        // init our table for doing zobrist hashing
        if ( ! hashinit() ) {
            log("db: Failed to init hashtable." ); return 1; }

        // Msg13.cpp now uses the address class so it needs this
        //if ( ! initPlaceDescTable ( ) ) {
        //    log("events: places table init failed"); return 1; }

    tryagain:
        if ( ! g_proxy.initHttpServer( httpPort, httpsPort ) ) {
            log("db: HttpServer init failed. Another gb "
                "already running?" );
            // this is dangerous!!! do not do the shutdown thing
            return 1;
            // just open a socket to port X and send
            // GET /master?save=1
            if ( shutdownOldGB(httpPort) ) goto tryagain;
            log("db: Shutdown failed.");
            return 1;
        }

        // we should save gb.conf, right?
        g_conf.m_save = true;

        // initialize Users
        //if ( ! g_users.init() ) {
        //    log("db: Users init failed. "); return 1; }
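
        // runLoop() enters the proxy's main event loop; it should not
        // return until shutdown, so everything below it is teardown.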
        if ( ! g_loop.runLoop() ) {
            log("db: runLoop failed." ); return 1; }
        // disable any further logging so final log msg is clear
        g_log.m_disabled = true;
        return 0;
    }

    if ( strcmp(cmd, "catlang") == 0 ) {
        log(LOG_INFO,
            "cat: Building the DMOZ category language tables...\n");
        g_categories->initLangTables();
        log(LOG_INFO, "cat: Done.\n");
        return(0);
    }

    if ( strcmp(cmd, "catcountry") == 0 ) {
        // Load categories and generate country table
        char structureFile[256];
        g_conf.m_maxMem = 1000000000LL; // 1G
        g_mem.m_maxMem  = 1000000000LL; // 1G
        sprintf(structureFile, "%scatdb/gbdmoz.structure.dat",
                g_hostdb.m_dir);
        g_categories = &g_categories1;
        if ( g_categories->loadCategories(structureFile) != 0 ) {
            log("cat: Loading Categories From %s Failed.",
                structureFile);
            return(0);
        }
        log(LOG_INFO,
            "cat: Building the DMOZ category country table...\n");
        g_countryCode.createHashTable();
        log(LOG_INFO, "cat: Done.\n");
        return(0);
    }

    if ( strcmp ( cmd , "blaster" ) == 0 ) {
        long i = cmdarg+1;
        bool isLogFile          = false;
        bool injectUrlWithLinks = false;
        bool injectUrl          = false;
        long wait = 0;
        if ( strcmp (argv[i],"-l") == 0 ) { isLogFile=true;          i++; }
        if ( strcmp (argv[i],"-i") == 0 ) { injectUrlWithLinks=true; i++; }
        if ( strcmp (argv[i],"-u") == 0 ) { injectUrl=true;          i++; }
        char *filename = argv[i];
        long maxNumThreads = 1;
        if (argv[i+1]) maxNumThreads = atoi(argv[i+1]);
        if (argv[i+2]) wait          = atoi(argv[i+2]);
        g_conf.m_maxMem = 2000000000;
        // wait at least 10 msec before you start again
        if (wait<1000) wait=10;
        g_blaster.runBlaster (filename,NULL,
                              maxNumThreads,wait,
                              isLogFile,false,false,false,
                              injectUrlWithLinks, injectUrl);
        // disable any further logging so final log msg is clear
        g_log.m_disabled = true;
        return 0;
    }

    if ( strcmp ( cmd , "blasterdiff" ) == 0 ) {
        long i = cmdarg+1;
        bool verbose     = false;
        bool justDisplay = false;
        bool useProxy    = true;
        // cycle through the arguments to check for -v,-j,-p
        while ( argv[i] && argv[i][0]=='-' ) {
            if      ( strcmp (argv[i],"-v") == 0 ) verbose     = true;
            else if ( strcmp (argv[i],"-j") == 0 ) justDisplay = true;
            else if ( strcmp (argv[i],"-p") == 0 ) useProxy    = false;
            i++;
        }
        char *file1 = argv[i];
        char *file2 = argv[i+1];
        long maxNumThreads = 1;
        if (argv[i+2]) maxNumThreads = atoi(argv[i+2]);
        long wait = 0; // was uninitialized when no arg was given
        if (argv[i+3]) wait = atoi(argv[i+3]);
        // wait at least 1 sec before you start again
        if (wait<1000) wait=1000;
        g_blaster.runBlaster(file1,file2,
                             maxNumThreads,wait,false,
                             verbose,justDisplay,useProxy);
        // disable any further logging so final log msg is clear
        g_log.m_disabled = true;
        return 0;
    }

    // g_conf.init was here

    // now that we have init'd g_hostdb and g_log, call this for an ssh
    //if ( strcmp ( cmd , "gendbs" ) == 0 && cmdarg + 2 == argc )
    //    return install ( ifk_gendbs , -1 , NULL ,
    //                     argv[cmdarg+1] ); // coll

    if ( strcmp(cmd, "distributeC") == 0 && cmdarg + 2 == argc )
        return install ( ifk_distributeC, -1, NULL, argv[cmdarg+1] );

    //if ( strcmp ( cmd , "gentfndb" ) == 0 && cmdarg + 2 == argc )
    //    return install ( ifk_gentfndb , -1 , NULL ,
    //                     argv[cmdarg+1] ); // coll
    //if ( strcmp ( cmd , "fixtfndb" ) == 0 && cmdarg + 2 == argc )
    //    return install ( ifk_fixtfndb , -1 , NULL ,
    //                     argv[cmdarg+1] ); // coll
    //if ( strcmp ( cmd, "genclusterdb" ) == 0 && cmdarg + 2 == argc )
    //    return install ( ifk_genclusterdb , -1 , NULL ,
    //                     argv[cmdarg+1] ); // coll
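
    // Commands like dumpmissing/dumpdups/distributeC come in two forms:
    // with only a collection argument, install() ssh's the command out to
    // every host in hosts.conf; with a hostid appended, the dispatch is
    // skipped and the local handlers further down in main() do the work.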
    // . dumpmissing <coll> [hostid]
    // . if hostid not there, ssh to all using install()
    if ( strcmp ( cmd, "dumpmissing" ) == 0 && cmdarg + 2 == argc )
        return install ( ifk_dumpmissing , -1 , NULL ,
                         argv[cmdarg+1] ); // coll

    if ( strcmp ( cmd, "dumpdups" ) == 0 && cmdarg + 2 == argc )
        return install ( ifk_dumpdups , -1 , NULL ,
                         argv[cmdarg+1] ); // coll

    // . gb removedocids <coll> <fileOfDocIds> [hostid1-hostid2]
    // . if hostid not there, ssh to all using install()
    // . use removedocids below if only running locally
    // . cmdarg+3 can be 4 or 5, depending if [hostid1-hostid2] is present
    // . argc is 5 if [hostid1-hostid2] is present, 4 if not
    if ( strcmp ( cmd, "removedocids" ) == 0 && cmdarg + 3 >= 4 ) {
        // get hostId to install TO (-1 means all)
        long hostId = -1;
        if ( cmdarg + 3 < argc ) hostId = atoi ( argv[cmdarg+3] );
        // might have a range
        if ( cmdarg + 3 < argc ) {
            long h1 = -1;
            long h2 = -1;
            sscanf ( argv[cmdarg+3],"%li-%li",&h1,&h2);
            if ( h1 != -1 && h2 != -1 && h1 <= h2 )
                return install ( ifk_removedocids , h1,
                                 argv[cmdarg+2], // filename
                                 argv[cmdarg+1], // coll
                                 h2 );
        }
        // if we had no hostid given, cast to all
        if ( hostId == -1 )
            return install ( ifk_removedocids ,
                             -1 ,            // hostid1
                             argv[cmdarg+2], // filename
                             argv[cmdarg+1], // coll
                             -1 );           // hostid2
        // otherwise, a hostid was given and we will call
        // removedocids() directly below
    }

    // gb ping [hostId] [clientPort]
    if ( strcmp ( cmd , "ping" ) == 0 ) {
        long hostId = 0;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        unsigned short port = 2050;
        if ( cmdarg + 2 < argc )
            port = (unsigned short)atoi ( argv[cmdarg+2] );
        pingTest ( hostId , port );
        return 0;
    }

    // gb injecttest <reqLen> [hostId]
    if ( strcmp ( cmd , "injecttest" ) == 0 ) {
        if ( cmdarg+1 >= argc ) goto printHelp;
        long hostId = 0;
        if ( cmdarg + 2 < argc ) hostId = atoi ( argv[cmdarg+2] );
        long reqLen = atoi ( argv[cmdarg+1] );
        if ( reqLen == 0 ) goto printHelp;
        injectFileTest ( reqLen , hostId );
        return 0;
    }

    // gb updatetitledb
    /*
    if ( strcmp ( cmd , "updatetitledb" ) == 0 ) {
        if ( cmdarg+1 != argc ) goto printHelp;
        log(LOG_INIT,"db: *-*-*-* Updating Titledb et al.");
        g_conf.m_spiderdbMinFilesToMerge     = 5;
        g_conf.m_tfndbMaxDiskPageCacheMem    = 0;
        //g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
        g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
        //g_conf.m_tfndbMaxTreeMem = 100*1024*1024;
        // . re-write all the keys so that they contain the site and
        //   content hashes in the low bits
        // . there should only be one file for this since we don't
        //   support negatives
        fixTitleRecs ( "" ); // coll
        return 0;
    }
    */

    // this is a hack too!
    /*
    if ( strcmp ( cmd , "mergechecksumdb" ) == 0 ) {
        if ( cmdarg+1 != argc ) goto printHelp;
        log(LOG_INIT,"db: *-*-*-* Merging checksumdb flat files.");
        long old = g_conf.m_checksumdbMinFilesToMerge ;
        g_conf.m_checksumdbMinFilesToMerge = 50;
        // set up checksumdb
        g_conf.m_checksumdbMaxTreeMem = 50000000; // 50M
        g_conf.m_maxMem = 1000000000LL; // 1G
        g_mem.m_maxMem  = 1000000000LL; // 1G
        // init it
        if ( ! g_checksumdb.init ( ) ) {
            log("db: Checksumdb init failed for merge." ); return 1; }
        g_collectiondb.init(true);
        g_checksumdb.getRdb()->addColl ( "finalmerge" );
        // no, otherwise won't be able to load into tree!
        //g_conf.m_checksumdbMaxTreeMem = 50*1024*1024;
        mergeChecksumFiles();
        // reset so when we save value goes back to original
        g_conf.m_checksumdbMinFilesToMerge = old;
        // save tree to disk
        Rdb *r = g_checksumdb.getRdb();
        r->m_tree.fastSave ( r->getDir() ,
                             r->m_dbname , // &m_saveFile ,
                             false ,       // useThread ,
                             NULL ,        // this ,
                             NULL );       // doneSaving ) )
        return 0;
    }
    */

    // gb inject <file> <ips> [startdocid]
    // gb inject titledb <file> <ips> [startdocid]
    if ( strcmp ( cmd , "inject" ) == 0 ) {
        if ( argc != cmdarg+3 && argc != cmdarg+4 && argc != cmdarg+5 )
            goto printHelp;
        char *file = argv[cmdarg+1];
        char *ips  = argv[cmdarg+2];
        long long startDocId = 0LL;
        long long endDocId   = DOCID_MASK;
        if ( cmdarg+3 < argc ) startDocId = atoll(argv[cmdarg+3]);
        if ( cmdarg+4 < argc ) endDocId   = atoll(argv[cmdarg+4]);
        injectFile ( file , ips , startDocId , endDocId , false );
        return 0;
    }

    if ( strcmp ( cmd , "reject" ) == 0 ) {
        if ( argc != cmdarg+3 && argc != cmdarg+4 && argc != cmdarg+5 )
            goto printHelp;
        char *file = argv[cmdarg+1];
        char *ips  = argv[cmdarg+2];
        long long startDocId = 0LL;
        long long endDocId   = DOCID_MASK;
        //if ( cmdarg+3 < argc ) startDocId = atoll(argv[cmdarg+3]);
        //if ( cmdarg+4 < argc ) endDocId   = atoll(argv[cmdarg+4]);
        injectFile ( file , ips , startDocId , endDocId , true );
        return 0;
    }

    // gb dsh <cmd>
    if ( strcmp ( cmd , "dsh" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        //long hostId = -1;
        if ( cmdarg+1 >= argc ) goto printHelp;
        char *cmd = argv[cmdarg+1];
        return install ( ifk_dsh , -1,NULL,NULL,-1, cmd );
    }

    // gb dsh2 <cmd>
    if ( strcmp ( cmd , "dsh2" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        //long hostId = -1;
        if ( cmdarg+1 >= argc ) goto printHelp;
        char *cmd = argv[cmdarg+1];
        return install ( ifk_dsh2 , -1,NULL,NULL,-1, cmd );
    }

    // gb install [hostId]
    if ( strcmp ( cmd , "install" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        return install ( ifk_install , hostId );
    }

    // gb install2 [hostId]
    if ( strcmp ( cmd , "install2" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        return install ( ifk_install2 , hostId );
    }

    // gb installgb [hostId]
    if ( strcmp ( cmd , "installgb" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        return install ( ifk_installgb , hostId );
    }

    // gb installgb2 [hostId]
    if ( strcmp ( cmd , "installgb2" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        return install ( ifk_installgb2 , hostId );
    }

    // gb installtmpgb [hostId]
    if ( strcmp ( cmd , "installtmpgb" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        return install ( ifk_installtmpgb , hostId );
    }

    // gb installconf [hostId]
    if ( strcmp ( cmd , "installconf" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        return install ( ifk_installconf , hostId );
    }

    // gb installconf2 [hostId]
    if ( strcmp ( cmd , "installconf2" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        return install ( ifk_installconf2 , hostId );
    }

    // gb installcat [hostId]
    if ( strcmp ( cmd , "installcat" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        return install ( ifk_installcat , hostId );
    }
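
    // The start/stop-style handlers below accept either a single hostId or
    // an inclusive range like "3-7": sscanf("%li-%li") leaves h1/h2 at -1
    // unless both numbers parse, and a hostId of -1 broadcasts to all hosts.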
    // gb installcat2 [hostId]
    if ( strcmp ( cmd , "installcat2" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        return install ( ifk_installcat2 , hostId );
    }

    // gb installnewcat [hostId]
    if ( strcmp ( cmd , "installnewcat" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        return install ( ifk_installnewcat , hostId );
    }

    // gb installnewcat2 [hostId]
    if ( strcmp ( cmd , "installnewcat2" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        return install ( ifk_installnewcat2 , hostId );
    }

    // gb start [hostId]
    if ( strcmp ( cmd , "start" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        // might have a range
        if ( cmdarg + 1 < argc ) {
            long h1 = -1;
            long h2 = -1;
            sscanf ( argv[cmdarg+1],"%li-%li",&h1,&h2);
            if ( h1 != -1 && h2 != -1 && h1 <= h2 )
                return install ( ifk_start , h1, NULL,NULL,h2 );
        }
        // if it is us, do it
        //if ( hostId != -1 ) goto mainStart;
        return install ( ifk_start , hostId );
    }

    // gb tmpstart [hostId]
    if ( strcmp ( cmd , "tmpstart" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        // might have a range
        if ( cmdarg + 1 < argc ) {
            long h1 = -1;
            long h2 = -1;
            sscanf ( argv[cmdarg+1],"%li-%li",&h1,&h2);
            if ( h1 != -1 && h2 != -1 && h1 <= h2 )
                return install ( ifk_tmpstart , h1, NULL,NULL,h2 );
        }
        // if it is us, do it
        //if ( hostId != -1 ) goto mainStart;
        return install ( ifk_tmpstart, hostId );
    }

    // gb tmpstop [hostId]
    if ( strcmp ( cmd , "tmpstop" ) == 0 ) {
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        // might have a range
        if ( cmdarg + 1 < argc ) {
            long h1 = -1;
            long h2 = -1;
            sscanf ( argv[cmdarg+1],"%li-%li",&h1,&h2);
            if ( h1 != -1 && h2 != -1 && h1 <= h2 )
                return doCmd ( "save=1" , h1 , "master" ,
                               true , //sendtohosts
                               false, //sendtoproxies
                               h2 );
        }
        return doCmd ( "save=1" , hostId , "master" ,
                       true  ,  //sendtohosts
                       false ); //sendtoproxies
    }

    // gb start2 [hostId]
    if ( strcmp ( cmd , "start2" ) == 0 ) {
        // get hostId to install TO (-1 means all)
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        // might have a range
        if ( cmdarg + 1 < argc ) {
            long h1 = -1;
            long h2 = -1;
            sscanf ( argv[cmdarg+1],"%li-%li",&h1,&h2);
            if ( h1 != -1 && h2 != -1 && h1 <= h2 )
                return install ( ifk_start2 , h1, NULL,NULL,h2 );
        }
        // if it is us, do it
        //if ( hostId != -1 ) goto mainStart;
        return install ( ifk_start2 , hostId );
    }

    // keep alive start
    if ( strcmp ( cmd , "kstart" ) == 0 ) {
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        // might have a range
        if ( cmdarg + 1 < argc ) {
            long h1 = -1;
            long h2 = -1;
            sscanf ( argv[cmdarg+1],"%li-%li",&h1,&h2);
            if ( h1 != -1 && h2 != -1 && h1 <= h2 )
                return install ( ifk_kstart , h1, NULL,NULL,h2 );
        }
        return install ( ifk_kstart , hostId );
    }

    // same as stop, here for consistency
    if ( strcmp ( cmd , "kstop" ) == 0 ) {
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        // might have a range
        if ( cmdarg + 1 < argc ) {
            long h1 = -1;
            long h2 = -1;
            sscanf ( argv[cmdarg+1],"%li-%li",&h1,&h2);
            if ( h1 != -1 && h2 != -1 && h1 <= h2 )
                return doCmd ( "save=1" , h1 , "master" ,
                               true , //sendtohosts
                               false, //sendtoproxies
                               h2 );
        }
        return doCmd ( "save=1" , hostId , "master" ,
                       true  ,  //sendtohosts
                       false ); //sendtoproxies
    }
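
    // stop, kstop and tmpstop all issue the same "save=1" command to each
    // host's /master page; the variants exist to mirror the corresponding
    // start flavors (keep-alive, temp cluster).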
    // gb backupcopy <subdir>
    if ( strcmp ( cmd , "backupcopy" ) == 0 ) {
        if ( cmdarg + 1 >= argc ) goto printHelp;
        return install ( ifk_backupcopy , -1 , argv[cmdarg+1] );
    }

    // gb backupmove <subdir>
    if ( strcmp ( cmd , "backupmove" ) == 0 ) {
        if ( cmdarg + 1 >= argc ) goto printHelp;
        return install ( ifk_backupmove , -1 , argv[cmdarg+1] );
    }

    // gb backuprestore <subdir>
    if ( strcmp ( cmd , "backuprestore" ) == 0 ) {
        if ( cmdarg + 1 >= argc ) goto printHelp;
        return install ( ifk_backuprestore, -1 , argv[cmdarg+1] );
    }

    // gb scale <newhosts.conf>
    if ( strcmp ( cmd , "scale" ) == 0 ) {
        if ( cmdarg + 1 >= argc ) goto printHelp;
        return scale ( argv[cmdarg+1] , true );
    }

    if ( strcmp ( cmd , "collinject" ) == 0 ) {
        if ( cmdarg + 1 >= argc ) goto printHelp;
        return collinject ( argv[cmdarg+1] );
    }

    // gb collcopy <newhosts.conf> <coll> <collnum>
    if ( strcmp ( cmd , "collcopy" ) == 0 ) {
        if ( cmdarg + 4 != argc ) goto printHelp;
        char *hostsconf = argv[cmdarg+1];
        char *coll      = argv[cmdarg+2];
        long  collnum   = atoi(argv[cmdarg+3]);
        return collcopy ( hostsconf , coll , collnum );
    }

    // gb stop [hostId]
    if ( strcmp ( cmd , "stop" ) == 0 ) {
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        // might have a range
        if ( cmdarg + 1 < argc ) {
            long h1 = -1;
            long h2 = -1;
            sscanf ( argv[cmdarg+1],"%li-%li",&h1,&h2);
            if ( h1 != -1 && h2 != -1 && h1 <= h2 )
                return doCmd ( "save=1" , h1 , "master" ,
                               true , //sendtohosts
                               false, //sendtoproxies
                               h2 );
        }
        return doCmd ( "save=1" , hostId , "master" ,
                       true  ,  //sendtohosts
                       false ); //sendtoproxies
    }

    // gb save [hostId]
    if ( strcmp ( cmd , "save" ) == 0 ) {
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        // might have a range
        if ( cmdarg + 1 < argc ) {
            long h1 = -1;
            long h2 = -1;
            sscanf ( argv[cmdarg+1],"%li-%li",&h1,&h2);
            if ( h1 != -1 && h2 != -1 && h1 <= h2 )
                return doCmd ( "js=1" , h1 , "master" ,
                               true , //sendtohosts
                               false, //sendtoproxies
                               h2 );
        }
        return doCmd ( "js=1" , hostId , "master" ,
                       true  ,  //sendtohosts
                       false ); //sendtoproxies
    }

    // gb spidersoff [hostId]
    if ( strcmp ( cmd , "spidersoff" ) == 0 ) {
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        return doCmd ( "se=0" , hostId , "master" ,
                       true  ,  //sendtohosts
                       false ); //sendtoproxies
    }

    // gb spiderson [hostId]
    if ( strcmp ( cmd , "spiderson" ) == 0 ) {
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        return doCmd ( "se=1" , hostId , "master" ,
                       true  ,  //sendtohosts
                       false ); //sendtoproxies
    }

    // gb cacheoff [hostId]
    if ( strcmp ( cmd , "cacheoff" ) == 0 ) {
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        return doCmd ( "dpco=1" , hostId , "master" ,
                       true  ,  //sendtohosts
                       false ); //sendtoproxies
    }

    // gb freecache [maxShmid]
    if ( strcmp ( cmd , "freecache" ) == 0 ) {
        long max = 7000000;
        if ( cmdarg + 1 < argc ) max = atoi ( argv[cmdarg+1] );
        freeAllSharedMem( max );
        return true;
    }

    // gb ddump [hostId]
    if ( strcmp ( cmd , "ddump" ) == 0 ) {
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        return doCmd ( "dump=1" , hostId , "master" ,
                       true  ,  //sendtohosts
                       false ); //sendtoproxies
    }
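
    // pmerge/smerge/tmerge/merge below force file merges of posdb,
    // sectiondb, titledb or all rdbs respectively, again by sending a CGI
    // command (e.g. "pmerge=1") to each target host's /master page.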
    // gb pmerge [hostId]
    if ( strcmp ( cmd , "pmerge" ) == 0 ) {
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        // might have a range
        if ( cmdarg + 1 < argc ) {
            long h1 = -1;
            long h2 = -1;
            sscanf ( argv[cmdarg+1],"%li-%li",&h1,&h2);
            if ( h1 != -1 && h2 != -1 && h1 <= h2 )
                return doCmd ( "pmerge=1",h1,"master",
                               true , //sendtohosts
                               false, //sendtoproxies
                               h2 );
        }
        return doCmd ( "pmerge=1" , hostId , "master" ,
                       true  ,  //sendtohosts
                       false ); //sendtoproxies
    }

    // gb smerge [hostId]
    if ( strcmp ( cmd , "smerge" ) == 0 ) {
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        // might have a range
        if ( cmdarg + 1 < argc ) {
            long h1 = -1;
            long h2 = -1;
            sscanf ( argv[cmdarg+1],"%li-%li",&h1,&h2);
            if ( h1 != -1 && h2 != -1 && h1 <= h2 )
                return doCmd ( "smerge=1",h1,"master",
                               true , //sendtohosts
                               false, //sendtoproxies
                               h2 );
        }
        return doCmd ( "smerge=1" , hostId , "master" ,
                       true  ,  //sendtohosts
                       false ); //sendtoproxies
    }

    // gb tmerge [hostId]
    if ( strcmp ( cmd , "tmerge" ) == 0 ) {
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        // might have a range
        if ( cmdarg + 1 < argc ) {
            long h1 = -1;
            long h2 = -1;
            sscanf ( argv[cmdarg+1],"%li-%li",&h1,&h2);
            if ( h1 != -1 && h2 != -1 && h1 <= h2 )
                return doCmd ( "tmerge=1",h1,"master",
                               true , //sendtohosts
                               false, //sendtoproxies
                               h2 );
        }
        return doCmd ( "tmerge=1" , hostId , "master" ,
                       true  ,  //sendtohosts
                       false ); //sendtoproxies
    }

    // gb merge [hostId]
    if ( strcmp ( cmd , "merge" ) == 0 ) {
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        // might have a range
        if ( cmdarg + 1 < argc ) {
            long h1 = -1;
            long h2 = -1;
            sscanf ( argv[cmdarg+1],"%li-%li",&h1,&h2);
            if ( h1 != -1 && h2 != -1 && h1 <= h2 )
                return doCmd ( "merge=1",h1,"master",
                               true , //sendtohosts
                               false, //sendtoproxies
                               h2 );
        }
        return doCmd ( "merge=1" , hostId , "master" ,
                       true  ,  //sendtohosts
                       false ); //sendtoproxies
    }

    // gb setnote <hostId> <note>
    if ( strcmp ( cmd, "setnote" ) == 0 ) {
        long hostId;
        char *note;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        else return false;
        if ( cmdarg + 2 < argc ) note = argv[cmdarg+2];
        else return false;
        char urlnote[1024];
        urlEncode(urlnote, 1024, note, gbstrlen(note));
        log ( LOG_INIT, "conf: setnote %li: %s", hostId, urlnote );
        char setnoteCmd[256];
        sprintf(setnoteCmd, "setnote=1&host=%li&note=%s",
                hostId, urlnote);
        return doCmd ( setnoteCmd, -1, "master/hosts" ,
                       true  ,  //sendtohosts
                       false ); //sendtoproxies
    }

    // gb setsparenote <spareId> <note>
    if ( strcmp ( cmd, "setsparenote" ) == 0 ) {
        long spareId;
        char *note;
        if ( cmdarg + 1 < argc ) spareId = atoi ( argv[cmdarg+1] );
        else return false;
        if ( cmdarg + 2 < argc ) note = argv[cmdarg+2];
        else return false;
        char urlnote[1024];
        urlEncode(urlnote, 1024, note, gbstrlen(note));
        log(LOG_INIT, "conf: setsparenote %li: %s", spareId, urlnote);
        char setnoteCmd[256];
        sprintf(setnoteCmd, "setsparenote=1&spare=%li&note=%s",
                spareId, urlnote);
        return doCmd ( setnoteCmd, -1, "master/hosts" ,
                       true  ,  //sendtohosts
                       false ); //sendtoproxies
    }

    // gb replacehost <hostId> <spareId>
    if ( strcmp ( cmd, "replacehost" ) == 0 ) {
        long hostId  = -1;
        long spareId = -1;
        if ( cmdarg + 1 < argc ) hostId  = atoi ( argv[cmdarg+1] );
        if ( cmdarg + 2 < argc ) spareId = atoi ( argv[cmdarg+2] );
        char replaceCmd[256];
        sprintf(replaceCmd, "replacehost=1&rhost=%li&rspare=%li",
                hostId, spareId);
        return doCmd ( replaceCmd, -1, "master/hosts" ,
                       true ,  //sendtohosts
                       true ); //sendtoproxies
    }

    // gb synchost <hostId>
    if ( strcmp ( cmd, "synchost" ) == 0 ) {
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        else return false;
        char syncCmd[256];
        sprintf(syncCmd, "synchost=1&shost=%li", hostId);
        return doCmd ( syncCmd, g_hostdb.m_hostId, "master/hosts" ,
                       true  ,  //sendtohosts
                       false ); //sendtoproxies
    }

    // gb synchost2 <hostId>
    if ( strcmp ( cmd, "synchost2" ) == 0 ) {
        long hostId = -1;
        if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
        else return false;
        char syncCmd[256];
        sprintf(syncCmd, "synchost=2&shost=%li", hostId);
        return doCmd ( syncCmd, g_hostdb.m_hostId, "master/hosts" ,
                       true  ,  //sendToHosts
                       false ); //sendtoproxies
    }
syncCmd, g_hostdb.m_hostId, "master/hosts" , true, //sendToHosts false );// sendtoproxies } // gb startclassifier coll ruleset [hostId] /* if ( strcmp ( cmd , "startclassifier" ) == 0 ) { long hostId = 0; char *coll; char *ruleset; char *siteListFile = NULL; if ( cmdarg + 1 < argc ) coll = argv[cmdarg+1]; else return false; if ( cmdarg + 2 < argc ) ruleset = argv[cmdarg+2]; else return false; if ( cmdarg + 3 < argc ) hostId = atoi ( argv[cmdarg+3] ); if ( cmdarg + 4 < argc ) siteListFile = argv[cmdarg+4]; char classifierCmd[512]; if ( ! siteListFile ) sprintf(classifierCmd, "startclassifier=1&c=%s" "&ruleset=%s", coll, ruleset); else sprintf(classifierCmd, "startclassifier=1&c=%s" "&ruleset=%s&sitelistfile=%s", coll, ruleset, siteListFile ); return doCmd(classifierCmd , hostId , "master/tagdb" , true , //sendtohosts false );//sendtoproxies } // gb stopclassifier [hostId] if ( strcmp ( cmd , "stopclassifier" ) == 0 ) { char *coll; if ( cmdarg + 1 < argc ) coll = argv[cmdarg+1]; else return false; long hostId = 0; if ( cmdarg + 2 < argc ) hostId = atoi ( argv[cmdarg+2] ); char classifierCmd[512]; sprintf(classifierCmd, "stopclassifier=1&c=%s", coll ); return doCmd(classifierCmd , hostId , "master/tagdb" , true , //sendtohosts false );//sendtoproxies } */ // gb [-h hostsConf] // mainStart: // get host info for this host Host *h = g_hostdb.getHost ( hostId ); if ( ! h ) { log("db: No host has id %li.",hostId); return 1;} // HACK: enable logging for Conf.cpp, etc. g_process.m_powerIsOn = true; // . read in the conf file // . this now initializes from a dir and hostId, they should all be // name gbHID.conf // . now that hosts.conf has more of the burden, all gbHID.conf files // can be identical if ( ! g_conf.init ( h->m_dir ) ) { // , h->m_hostId ) ) { log("db: Conf init failed." ); return 1; } //if ( ! g_hostdb.validateIps ( &g_conf ) ) { // log("db: Failed to validate ips." ); return 1;} //if ( ! g_hostdb2.validateIps ( &g_conf ) ) { // log("db: Failed to validate ips." ); return 1;} // put in read only mode if ( useTmpCluster ) g_conf.m_readOnlyMode = true; if ( useTmpCluster ) g_conf.m_sendEmailAlerts = false; // log how much mem we can use log(LOG_INIT,"conf: Max mem allowed to use is %lli\n",g_conf.m_maxMem); // load the language specific pages g_languagePages.reloadPages(); // init the loop, needs g_conf if ( ! g_loop.init() ) { log("db: Loop init failed." ); return 1; } // the new way to save all rdbs and conf // if g_process.m_powerIsOn is false, logging will not work, so init // this up here. must call after Loop::init() so it can register // its sleep callback g_process.init(); // set up the threads, might need g_conf if ( ! g_threads.init() ) { log("db: Threads init failed." ); return 1; } // gb gendict if ( strcmp ( cmd , "gendict" ) == 0 ) { // get hostId to install TO (-1 means all) if ( argc != cmdarg + 2 && argc != cmdarg + 3 ) goto printHelp; // take no other args char *coll = argv[cmdarg+1]; // get numWordsToDump long nn = 10000000; if ( argc == cmdarg + 3 ) nn = atoi ( argv[cmdarg+2] ); // . generate the dict files // . 
use the first 100,000,000 words/phrases to make them g_speller.generateDicts ( nn , coll ); return 0; } if ( strcmp ( cmd , "dumpmissing" ) == 0 ) { // got collection and hostid in here if ( argc != cmdarg + 3 ) goto printHelp; char *coll = argv[cmdarg+1]; dumpMissing ( coll ); // disable any further logging so final log msg is clear g_log.m_disabled = true; return 0; } if ( strcmp ( cmd , "dumpdups" ) == 0 ) { // got collection and hostid in here if ( argc != cmdarg + 3 ) goto printHelp; char *coll = argv[cmdarg+1]; dumpDups ( coll ); // disable any further logging so final log msg is clear g_log.m_disabled = true; return 0; } // removedocids if ( strcmp ( cmd , "removedocids" ) == 0 ) { if ( argc != cmdarg + 4 ) goto printHelp; char *coll = argv[cmdarg+1]; char *file = argv[cmdarg+2]; removeDocIds ( coll , file ); // disable any further logging so final log msg is clear g_log.m_disabled = true; return 0; } #ifndef _CLIENT_ #ifndef _METALINCS_ // gb dump i [fileNum] [off] if ( strcmp ( cmd , "dump" ) == 0 && argc > cmdarg + 1 && argv[cmdarg+1][0]=='I') { long fileNum = 0; long long off = 0LL; char *NAME = NULL; //if ( cmdarg + 2 < argc ) fileNum = atoi (argv[cmdarg+2]); if ( cmdarg + 2 < argc ) NAME = argv[cmdarg+2]; if ( cmdarg + 3 < argc ) off = atoll1(argv[cmdarg+3]); dumpIndexdbFile ( fileNum , off , "indexdb" , 12 , NAME ); // disable any further logging so final log msg is clear g_log.m_disabled = true; return 0; } if ( strcmp ( cmd , "dump" ) == 0 && argc > cmdarg + 1 && argv[cmdarg+1][0]=='T') { long fileNum = 0; long long off = 0LL; if ( cmdarg + 2 < argc ) fileNum = atoi (argv[cmdarg+2]); if ( cmdarg + 3 < argc ) off = atoll1(argv[cmdarg+3]); dumpIndexdbFile ( fileNum , off , "datedb" , 16 ); // disable any further logging so final log msg is clear g_log.m_disabled = true; return 0; } #endif #endif // . gb dump [dbLetter][coll][fileNum] [numFiles] [includeTree][termId] // . spiderdb is special: // gb dump s [coll][fileNum] [numFiles] [includeTree] [0=old|1=new] // [priority] [printStats?] if ( strcmp ( cmd , "dump" ) == 0 ) { if ( cmdarg+1 >= argc ) goto printHelp; long startFileNum = 0; long numFiles = -1; long includeTree = 1; long long termId = -1; char *coll = ""; // we have to init collection db because we need to know if // the collnum is legit or not in the tree if ( ! g_collectiondb.init(true) ) { log("db: Collectiondb init failed." 
); return 1; } if ( cmdarg+2 < argc ) coll = argv[cmdarg+2]; if ( cmdarg+3 < argc ) startFileNum = atoi(argv[cmdarg+3]); if ( cmdarg+4 < argc ) numFiles = atoi(argv[cmdarg+4]); if ( cmdarg+5 < argc ) includeTree = atoi(argv[cmdarg+5]); if ( cmdarg+6 < argc ) { char *targ = argv[cmdarg+6]; if ( is_alpha_a(targ[0]) ) { termId = hash64n(targ); termId &= TERMID_MASK; } else { termId = atoll1(targ); } } if ( argv[cmdarg+1][0] == 't' ) { long long docId = 0LL; if ( cmdarg+6 < argc ) docId = atoll1(argv[cmdarg+6]); bool justPrintSentences = false; bool justPrintWords = false; // support "ts" if ( argv[cmdarg+1][1] == 's' ) justPrintSentences = true; // support "tw" if ( argv[cmdarg+1][1] == 'w' ) justPrintWords = true; dumpTitledb (coll,startFileNum,numFiles,includeTree, docId,0, justPrintSentences, justPrintWords); } else if ( argv[cmdarg+1][0] == 'D' ) { long long docId = 0LL; if ( cmdarg+6 < argc ) docId = atoll1(argv[cmdarg+6]); dumpTitledb(coll,startFileNum,numFiles,includeTree, docId,1,false,false); } else if ( argv[cmdarg+1][0] == 'v' && argv[cmdarg+1][1] =='u' ) dumpTfndb (coll,startFileNum,numFiles,includeTree,1); else if ( argv[cmdarg+1][0] == 'u' ) dumpTfndb (coll,startFileNum,numFiles,includeTree,0); else if ( argv[cmdarg+1][0] == 'w' ) dumpWaitingTree(coll); else if ( argv[cmdarg+1][0] == 'x' ) dumpDoledb (coll,startFileNum,numFiles,includeTree); else if ( argv[cmdarg+1][0] == 's' ) { //long isNew = 1; //long priority = -1; char printStats = 0; long firstIp = 0; //char *coll = NULL; //if(cmdarg+6 < argc ) isNew = atol(argv[cmdarg+6]); //if(cmdarg+7 < argc ) priority = atol(argv[cmdarg+7]); if ( cmdarg+6 < argc ){ printStats= atol(argv[cmdarg+6]); // it could be an ip instead of printstats if ( strstr(argv[cmdarg+6],".") ) { printStats = 0; firstIp = atoip(argv[cmdarg+6]); } } //if ( cmdarg+7 < argc ) coll = argv[cmdarg+7]; long ret = dumpSpiderdb ( coll,startFileNum,numFiles, includeTree , printStats , firstIp ); if ( ret == -1 ) fprintf(stdout,"error dumping spiderdb\n"); } else if ( argv[cmdarg+1][0] == 'B' ) dumpSectiondb(coll,startFileNum,numFiles,includeTree); else if ( argv[cmdarg+1][0] == 'V' ) dumpRevdb(coll,startFileNum,numFiles,includeTree); else if ( argv[cmdarg+1][0] == 'S' ) dumpTagdb (coll,startFileNum,numFiles,includeTree,0); else if ( argv[cmdarg+1][0] == 'A' ) dumpTagdb (coll,startFileNum,numFiles,includeTree,0, 'A'); else if ( argv[cmdarg+1][0] == 'a' ) dumpTagdb (coll,startFileNum,numFiles,includeTree,0, 'D'); else if ( argv[cmdarg+1][0] == 'G' ) dumpTagdb (coll,startFileNum,numFiles,includeTree,0, 'G'); else if ( argv[cmdarg+1][0] == 'W' ) dumpTagdb (coll,startFileNum,numFiles,includeTree,1); else if ( argv[cmdarg+1][0] == 'C' ) dumpTagdb (coll,startFileNum,numFiles,includeTree,0, 0,RDB_CATDB); else if ( argv[cmdarg+1][0] == 'l' ) dumpClusterdb (coll,startFileNum,numFiles,includeTree); //else if ( argv[cmdarg+1][0] == 'c' ) // dumpChecksumdb(coll,startFileNum,numFiles,includeTree); //else if ( argv[cmdarg+1][0] == 'z' ) // dumpStatsdb(startFileNum,numFiles,includeTree,2); //else if ( argv[cmdarg+1][0] == 'Z' ) // dumpStatsdb(startFileNum,numFiles,includeTree,4); else if ( argv[cmdarg+1][0] == 'L' ) { char *url = NULL; if ( cmdarg+6 < argc ) url = argv[cmdarg+6]; dumpLinkdb(coll,startFileNum,numFiles,includeTree,url); } #ifndef _CLIENT_ #ifndef _METALINCS_ #ifndef _GLOBALSPEC_ else if ( argv[cmdarg+1][0] == 'i' ) dumpIndexdb (coll,startFileNum,numFiles,includeTree, termId); else if ( argv[cmdarg+1][0] == 'p' ) dumpPosdb (coll,startFileNum,numFiles,includeTree, 
			     termId,false);
	else if ( argv[cmdarg+1][0] == 'd' )
		dumpDatedb (coll,startFileNum,numFiles,includeTree,
			    termId,false);
#endif
#endif
#endif
	/*
	else if ( argv[cmdarg+1][0] == 'c' ) {
		long long docId = 0LL;
		if ( cmdarg+6 < argc ) docId = atoll1(argv[cmdarg+6]);
		dumpCachedRecs (coll,startFileNum,numFiles,includeTree,
				docId);
	}
	*/
	/*
	else if ( argv[cmdarg+1][0] == 'R' ) {
		long long docId = 0LL;
		if ( cmdarg+6 < argc ) docId = atoll1(argv[cmdarg+6]);
		testSpamRules (coll,startFileNum,numFiles,includeTree,
			       docId);
	}
	*/
	else
		goto printHelp;
	// disable any further logging so final log msg is clear
	g_log.m_disabled = true;
	return 0;
	}

	// gb countdomains <coll> <numRecs> [verbosity] [output]
	if( strcmp( cmd, "countdomains" ) == 0 && argc >= (cmdarg + 2) ) {
		char *coll = "";
		long  verb;
		long  outpt;
		coll = argv[cmdarg+1];
		// need a numeric record count as the second arg
		if( argc <= cmdarg + 2 ||
		    argv[cmdarg+2][0] < 0x30 || argv[cmdarg+2][0] > 0x39 )
			goto printHelp;
		long numRecs = atoi( argv[cmdarg+2] );
		// optional verbosity and output mode follow the record count
		if( argc > (cmdarg + 3) ) verb  = atoi( argv[cmdarg+3] );
		else                      verb  = 0;
		if( argc > (cmdarg + 4) ) outpt = atoi( argv[cmdarg+4] );
		else                      outpt = 0;
		log( LOG_INFO, "cntDm: Allocated Larger Mem Table for: %li",
		     g_mem.m_memtablesize );
		if (!ucInit(g_hostdb.m_dir)) {
			log("Unicode initialization failed!");
			return 1;
		}
		countdomains( coll, numRecs, verb, outpt );
		g_log.m_disabled = true;
		return 0;
	}
	// let's ensure our core file can dump
	struct rlimit lim;
	lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
	if ( setrlimit(RLIMIT_CORE,&lim) )
		log("db: setrlimit: %s.", mstrerror(errno) );
	// limit fds
	// try to prevent core from systems where it is above 1024
	// because our FD_ISSET() libc function will core! (it's older)
	long NOFILE = 1024;
	lim.rlim_cur = lim.rlim_max = NOFILE;
	if ( setrlimit(RLIMIT_NOFILE,&lim))
		log("db: setrlimit RLIMIT_NOFILE %li: %s.",
		    NOFILE,mstrerror(errno) );
	struct rlimit rlim;
	getrlimit ( RLIMIT_NOFILE,&rlim);
	if ( (long)rlim.rlim_max > NOFILE ||
	     (long)rlim.rlim_cur > NOFILE ) {
		log("db: setrlimit RLIMIT_NOFILE failed!");
		char *xx=NULL;*xx=0; }
	log("db: RLIMIT_NOFILE = %li",(long)rlim.rlim_max);
	//exit(0);
	// . disable o/s's and hard drive's read ahead
	// . set multcount to 16 --> 1 interrupt for every 16 sectors read
	// . multcount of 16 reduces OS overhead by 30%-50% (more throughput)
	// . use hdparm -i to find max mult count
	// . -S 100 means turn off spinning if idle for 500 seconds
	// . this should be done in /etc/rc.sysinit or /etc/sysconfig/harddisks
	//system("hdparm -a 0 -A 0 -m 16 -S 100 /dev/hda");
	//system("hdparm -a 0 -A 0 -m 16 -S 100 /dev/hdb");
	//system("hdparm -a 0 -A 0 -m 16 -S 100 /dev/hdc");
	//system("hdparm -a 0 -A 0 -m 16 -S 100 /dev/hdd");
	//system ("rm /gigablast/*.dat");
	//system ("rm /gigablast/*.map");
	//if ( g_hostdb.m_hostId == 0 ) g_conf.m_logDebugUdp = 1;
	//g_conf.m_spideringEnabled = 1;
	//g_conf.m_logDebugBuild = 1;
	// temp merge test
	//RdbList list;
	//list.testIndexMerge();
	// file creation test, make sure we have dir control
	if ( checkDirPerms ( g_hostdb.m_dir ) < 0 ) return 1;
	// . make sure we have critical files
	// . make sure elvtune is in the /etc/rcS.d/S99local if need be
	//if ( ! checkFiles ( g_hostdb.m_dir ) ) return 1;
	if ( !
g_process.checkFiles ( g_hostdb.m_dir ) ) return 1; // load the appropriate dictionaries //g_speller.init(); //if ( !g_speller.init ( ) ) return 1; g_errno = 0; //g_speller.test ( ); //exit(-1); /* char dst[1024]; char test[1024]; spellLoop: test[0] = '\0'; gets ( test ); if ( test[gbstrlen(test)-1] == '\n' ) test[gbstrlen(test)-1] = '\0'; Query qq; qq.set ( test , gbstrlen(test) , NULL , 0 , false ); if ( g_speller.getRecommendation ( &qq , dst , 1000 ) ) log("spelling suggestion: %s", dst ); goto spellLoop; */ //if ( strcmp ( cmd , "fixtfndb" ) == 0 ) { // char *coll = argv[cmdarg+1]; // // clean out tfndb*.dat // fixTfndb ( coll ); // coll //} //if ( strcmp ( cmd , "gendbs" ) == 0 ) goto jump; //if ( strcmp ( cmd , "gentfndb" ) == 0 ) goto jump; if ( strcmp ( cmd , "gencatdb" ) == 0 ) goto jump; //if ( strcmp ( cmd , "genclusterdb" ) == 0 ) goto jump; // if ( cmd && ! is_digit(cmd[0]) ) goto printHelp; /* // tmp stuff to generate new query log if ( ! ucInit(g_hostdb.m_dir, true)) return 1; if ( ! g_wiktionary.load() ) return 1; if ( ! g_wiktionary.test() ) return 1; if ( ! g_wiki.load() ) return 1; if ( ! g_speller.init() && g_conf.m_isLive ) return 1; if ( ! g_langList.loadLists ( ) ) log("init: loadLists Failed"); if ( ! loadQueryLog() ) return 1; return 0; */ // make sure port is available, no use loading everything up then // failing because another process is already running using this port //if ( ! g_udpServer.testBind ( g_hostdb.getMyPort() ) ) if ( ! g_httpServer.m_tcp.testBind(g_hostdb.getMyHost()->m_httpPort)) return 1; g_errno = 0; if (!ucInit(g_hostdb.m_dir, true)) { log("Unicode initialization failed!"); return 1; } // the wiktionary for lang identification and alternate word forms/ // synonyms if ( ! g_wiktionary.load() ) return 1; if ( ! g_wiktionary.test() ) return 1; // . load synonyms, synonym affinity, and stems // . now we are using g_synonyms //g_thesaurus.init(); //g_synonyms.init(); // the wiki titles if ( ! g_wiki.load() ) return 1; // the query log split //if ( ! loadQueryLog() ) return 1; jump: // force give up on dead hosts to false g_conf.m_giveupOnDeadHosts = 0; // shout out if we're in read only mode if ( g_conf.m_readOnlyMode ) log("db: -- Read Only Mode Set. Can Not Add New Data. --"); //#ifdef SPLIT_INDEXDB if ( g_hostdb.m_indexSplits > 1 ) log("db: -- Split Index ENABLED. Split count set to: %li --", g_hostdb.m_indexSplits); //#endif // . set up shared mem now, only on udpServer2 // . will only set it up if we're the lowest hostId on this ip //if ( ! g_udpServer2.setupSharedMem() ) { // log("db: SharedMem init failed" ); return 1; } // the robots.txt db //if ( ! g_robotdb.init() ) { // log("db: Robotdb init failed." ); return 1; } // . collectiondb, does not use rdb, loads directly from disk // . do this up here so RdbTree::fixTree() can fix RdbTree::m_collnums // . this is a fake init, cuz we pass in "true" if ( ! g_isYippy && ! g_collectiondb.init(true) ) { log("db: Collectiondb init failed." ); return 1; } // a hack to rename files that were not renamed because of a bug // in the repair/build process /* if ( ! g_titledb2.init2 ( 100000000 ) ) { log("db: Titledb init2 failed." ); return 1; } if ( ! g_titledb2.addColl ( "mainRebuild" ) ) { log("db: Titledb addcoll failed." ); return 1; } g_titledb2 // get the base RdbBase *base = g_titledb2.m_rdb.m_bases[1]; // panic? if ( ! base ) { log("db: titledb2: no base."); return 1; } // now clean them up base->removeRebuildFromFilenames ( ) ; // stop return 1; */ // then statsdb if ( ! 
g_statsdb.init() ) { log("db: Statsdb init failed." ); return 1; } // allow adds to statsdb rdb tree g_process.m_powerIsOn = true; // then indexdb //if ( ! g_indexdb.init() ) { // log("db: Indexdb init failed." ); return 1; } if ( ! g_posdb.init() ) { log("db: Posdb init failed." ); return 1; } // for sorting results by date //if ( ! g_datedb.init() ) { // log("db: Datedb init failed." ); return 1; } // for sorting events by time //if ( ! g_timedb.init() ) { // log("db: Datedb init failed." ); return 1; } // then titledb if ( ! g_titledb.init() ) { log("db: Titledb init failed." ); return 1; } // then revdb //if ( ! g_revdb.init() ) { // log("db: Revdb init failed." ); return 1; } // then tagdb if ( ! g_tagdb.init() ) { log("db: Tagdb init failed." ); return 1; } // the catdb, it's an instance of tagdb, pass RDB_CATDB if ( ! g_catdb.init() ) { log("db: Catdb1 init failed." ); return 1; } // initialize Users if ( ! g_users.init() ){ log("db: Users init failed. "); return 1;} //if ( ! g_syncdb.init() ) { // log("db: Syncdb init failed." ); return 1; } // if generating spiderdb/tfndb/checksumdb, boost minfiles //if ( strcmp ( cmd, "gendbs" ) == 0 ) { // // don't let spider merge all the time! // g_conf.m_spiderdbMinFilesToMerge = 20; // g_conf.m_tfndbMinFilesToMerge = 5; // // set up spiderdb // g_conf.m_spiderdbMaxTreeMem = 200000000; // 200M // g_conf.m_maxMem = 2950000000LL; // 2G // g_mem.m_maxMem = 2950000000LL; // 2G //} //if ( strcmp ( cmd, "gentfndb" ) == 0 ) { // g_conf.m_tfndbMinFilesToMerge = 20; // // set up tfndb // g_conf.m_tfndbMaxTreeMem = 200000000; // 200M // g_conf.m_maxMem = 2000000000LL; // 2G // g_mem.m_maxMem = 2000000000LL; // 2G //} // then tfndb //if ( ! g_tfndb.init() ) { // log("db: Tfndb init failed." ); return 1; } // then spiderdb if ( ! g_spiderdb.init() ) { log("db: Spiderdb init failed." ); return 1; } // then doledb if ( ! g_doledb.init() ) { log("db: Doledb init failed." ); return 1; } // the spider cache used by SpiderLoop if ( ! g_spiderCache.init() ) { log("db: SpiderCache init failed." ); return 1; } if ( ! g_test.init() ) { log("db: test init failed" ); return 1; } // then checksumdb //if ( ! g_checksumdb.init() ) { // log("db: Checksumdb init failed." ); return 1; } // ensure clusterdb tree is big enough for quicker generation //if ( strcmp ( cmd, "genclusterdb" ) == 0 ) { // g_conf.m_clusterdbMinFilesToMerge = 20; // // set up clusterdb // g_conf.m_clusterdbMaxTreeMem = 50000000; // 50M // g_conf.m_maxMem = 2000000000LL; // 2G // g_mem.m_maxMem = 2000000000LL; // 2G //} // site clusterdb if ( ! g_clusterdb.init() ) { log("db: Clusterdb init failed." ); return 1; } // linkdb if ( ! g_linkdb.init() ) { log("db: Linkdb init failed." ); return 1; } if ( ! g_cachedb.init() ) { log("db: Cachedb init failed." ); return 1; } if ( ! g_serpdb.init() ) { log("db: Serpdb init failed." ); return 1; } if ( ! g_monitordb.init() ) { log("db: Monitordb init failed." ); return 1; } // use sectiondb again for its immense voting power for detecting and // removing web page chrome, categories, etc. only use if // CollectionRec::m_isCustomCrawl perhaps to save space. if ( ! g_sectiondb.init() ) { log("db: Sectiondb init failed." ); return 1; } //if ( ! g_placedb.init() ) { // log("db: Placedb init failed." ); return 1; } // collectiondb, does not use rdb, loads directly from disk if ( ! g_collectiondb.init() ) { log("db: Collectiondb init failed." ); return 1; } // . now read in a little bit of each db and make sure the contained // records belong in our group // . 
only do this if we have more than one group // . we may have records from other groups if we are scaling, but // if we cannot find *any* records in our group we probably have // the wrong data files. //if ( ! checkDataParity() ) return 1; // init pageturk //if ( ! g_pageTurk.init() ){ // log("db: PageTurk init failed. "); return 1;} // init the vector cache /* if ( ! g_vectorCache.init ( g_conf.m_maxVectorCacheMem, VECTOR_REC_SIZE-sizeof(key_t), true, g_conf.m_maxVectorCacheMem / ( sizeof(collnum_t) + 20 + VECTOR_REC_SIZE ) , true, "vector", false, 12, 12 ) ) { log("db: Vector Cache init failed." ); return 1; } */ // . gb gendbs // . hostId should have already been picked up above, so it could be // used to initialize all the rdbs //if ( strcmp ( cmd , "gendbs" ) == 0 ) { // char *coll = argv[cmdarg+1]; // // generate the dbs // genDbs ( coll ); // coll // g_log.m_disabled = true; // return 0; //} //if ( strcmp ( cmd , "gentfndb" ) == 0 ) { // char *coll = argv[cmdarg+1]; // genTfndb ( coll ); // g_log.m_disabled = true; // return 0; //} //if ( strcmp ( cmd, "genclusterdb" ) == 0 ) { // char *coll = argv[cmdarg+1]; // makeClusterdb ( coll ); // g_log.m_disabled = true; // return 0; //} // test all collection dirs for write permission -- metalincs' request for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { CollectionRec *cr = g_collectiondb.m_recs[i]; if ( ! cr ) continue; char tt[1024 + MAX_COLL_LEN ]; sprintf ( tt , "%scoll.%s.%li", g_hostdb.m_dir, cr->m_coll , (long)cr->m_collnum ); checkDirPerms ( tt ) ; } // and now that all rdbs have loaded lets count the gbeventcount // keys we have in datedb. those represent the # of events we // have indexed. //g_collectiondb.countEvents(); //if (!ucInit(g_hostdb.m_dir, true)) { // log("Unicode initialization failed!"); // return 1; //} // // NOTE: ANYTHING THAT USES THE PARSER SHOULD GO BELOW HERE, UCINIT! // // load the appropriate dictionaries if ( ! g_speller.init() && g_conf.m_isLive ) { return 1; } // have to test after unified dict is loaded because if word is // of unknown langid we try to get syns for it anyway if it has // only one possible lang according to unified dict //if ( ! g_wiktionary.test2() ) return 1; /* if ( strcmp ( cmd, "gendaterange" ) == 0 ) { char *coll = argv[cmdarg+1]; genDateRange ( coll ); g_log.m_disabled = true; return 0; } */ // load language lists if ( !g_langList.loadLists ( ) ) { log("init: LangList loadLists Failed" ); //not really fatal, so carry on. //return 1; } // the query log split. only for seo tools, so only do if // we are running in Matt Wells's datacenter. if ( g_conf.m_isMattWells && ! loadQueryLog() ) { log("init: failed to load query log. continuing with seo " "support."); //return 1; } //if( !g_pageTopDocs.init() ) { // log( "init: PageTopDocs init failed." ); // return 1; //} if( !g_pageNetTest.init() ) { log( "init: PageNetTest init failed." ); return 1; } //if(!Msg6a::init()) { // log( "init: Quality Agent init failed." ); //} if ( ! g_scraper.init() ) return 1; //if ( ! DateParse::init() ) { // log("db: DateParse init failed." ); return 1; //} //countdomains was HERE, moved up to access more mem. 
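	// Each collection lives in its own subdirectory of the working dir,
	// named coll.<collname>.<collnum>, so the permission loop above
	// tests, e.g., "./coll.main.0" for a collection "main" with collnum
	// 0 when the working dir is "./". checkDirPerms() (defined further
	// below) verifies write access by creating and then unlinking a
	// "tmpfile" in the given directory.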
// load up the dmoz categories here char structureFile[256]; sprintf(structureFile, "%scatdb/gbdmoz.structure.dat", g_hostdb.m_dir); g_categories = &g_categories1; if (g_categories->loadCategories(structureFile) != 0) { log("cat: Loading Categories From %s Failed.", structureFile); //return 1; } log(LOG_INFO, "cat: Loaded Categories From %s.", structureFile); // Load the category language table g_countryCode.loadHashTable(); log(LOG_INFO, "cat: Loaded %ld entries from Category country table.", g_countryCode.getNumEntries()); //g_siteBonus.init(); if(!g_autoBan.init()) { log("autoban: init failed."); return 1; } //if(!g_classifier.restore()) { // log("classifier: init failed."); // //return 1; //} // deprecated in favor of Msg13-based throttling //if ( !g_msg6.init() ) { // log ( "init: msg6 init failed." ); // return 1; //} if(!g_profiler.init()) { log("profiler: init failed."); } g_profiler.readSymbolTable(); //exit(0); // diff with indexdb in sync/ dir //syncIndexdb ( ); //exit(-1); // init the cache in Msg40 for caching search results // if cache not initialized now then do it now long maxMem = g_conf.m_searchResultsMaxCacheMem; if ( ! g_genericCache[SEARCHRESULTS_CACHEID].init ( maxMem , // max cache mem -1 , // fixedDataSize false , // support lists of recs? maxMem/2048 , // max cache nodes false , // use half keys? "results" , // filename //g_conf.m_searchResultsSaveCache ) ) { true)){ log("db: ResultsCache: %s",mstrerror(g_errno)); return 1; } /* maxMem = 40000000; long maxNodes2 = maxMem/(8+8+50*(8+4+4)); if ( ! g_genericCache[SEORESULTS_CACHEID].init ( maxMem , // max cache mem -1 , // fixedDataSize false , // support lists of recs? maxNodes2 , // max cache nodes false , // use half keys? "seoresults" , // filename true)){ // save to disk? log("db: ResultsCache: %s",mstrerror(g_errno)); return 1; } */ /* long maxMem1 = g_conf.m_siteLinkInfoMaxCacheMem; if ( ! g_genericCache[SITELINKINFO_CACHEID].init ( maxMem1 , // max cache mem 4 , // fixedDataSize false , // support lists of recs? maxMem1/36 , // max cache nodes false , // use half keys? "sitelinkinfo" , // filename //g_conf.m_siteLinkInfoSaveCache ) ) { true)){ log("db: SiteLinkInfoCache: %s",mstrerror(g_errno)); return 1; } long maxMem2a = g_conf.m_siteQualityMaxCacheMem; if ( ! g_genericCache[SITEQUALITY_CACHEID].init ( maxMem2a , // max cache mem 1 , // fixedDataSize false , // support lists of recs? maxMem2a/36 , // max cache nodes false , // use half keys? "sitequality" , // filename //g_conf.m_siteQualitySaveCache ) ) { true)) { log("db: SiteQualityCache: %s",mstrerror(g_errno)); return 1; } */ /* long maxMem2b = g_conf.m_siteQualityMaxCacheMem * .10 ; if ( ! g_genericCacheSmallLocal[SITEQUALITY_CACHEID].init ( maxMem2b , // max cache mem 1 , // fixedDataSize false , // support lists of recs? maxMem2b/36 , // max cache nodes false , // use half keys? "sitequality" , // filename //g_conf.m_siteQualitySaveCache ) ) { false)) { log("db: SiteQualityCacheSmallLocal: %s",mstrerror(g_errno)); return 1; } */ // . then our main udp server // . must pass defaults since g_dns uses it's own port/instance of it // . server should listen to a socket and register with g_loop // . sock read/write buf sizes are both 64000 // . poll time is 60ms // . if the read/write bufs are too small it severely degrades // transmission times for big messages. just use ACK_WINDOW * // MAX_DGRAM_SIZE as the size so when sending you don't drop dgrams // . the 400k size allows us to cover Sync.cpp's activity well if ( ! 
g_udpServer.init( g_hostdb.getMyPort() ,&g_dp,2/*niceness*/, 20000000 , // readBufSIze 20000000 , // writeBufSize 20 , // pollTime in ms 3500 , // max udp slots false )){ // is dns? log("db: UdpServer init failed." ); return 1; } // . this is the high priority udpServer, it's stuff is handled 1st // sock read/write buf sizes are both almost 2 megs // . a niceness of -1 means its signal won't be blocked, real time // . poll time is 20ms //if ( ! g_udpServer2.init( g_hostdb.getMyPort2(),&g_dp,-1/*niceness*/, // 10000000 , // readBufSIze // 10000000 , // writeBufSize // 20 , // pollTime in ms // 1000 )){ // max udp slots // log("db: UdpServer2 init failed." ); return 1; } // start pinging right away if ( ! g_pingServer.init() ) { log("db: PingServer init failed." ); return 1; } // start up repair loop if ( ! g_repair.init() ) { log("db: Repair init failed." ); return 1; } // start up repair loop if ( ! g_dailyMerge.init() ) { log("db: Daily merge init failed." ); return 1; } // . then dns Distributed client // . server should listen to a socket and register with g_loop // . Only the distributed cache shall call the dns server. if ( ! g_dns.init( h->m_dnsClientPort ) ) { log("db: Dns distributed client init failed." ); return 1; } // . then dns Local client //if ( ! g_dnsLocal.init( 0 , false ) ) { // log("db: Dns local client init failed." ); return 1; } // . then webserver // . server should listen to a socket and register with g_loop // again: if ( ! g_httpServer.init( h->m_httpPort, h->m_httpsPort ) ) { log("db: HttpServer init failed. Another gb already " "running?" ); // this is dangerous!!! do not do the shutdown thing return 1; /* // just open a socket to port X and send GET /master?save=1 if ( shutdownOldGB(h->m_httpPort) ) goto again; log("db: Shutdown failed."); resetAll(); return 1; */ } if(!Msg1f::init()) { log("logviewer: init failed."); return 1; } // . now register all msg handlers with g_udp server if ( ! registerMsgHandlers() ) { log("db: registerMsgHandlers failed" ); return 1; } // for Events.cpp event extraction we need to parse out "places" from // each doc //if ( ! initPlaceDescTable ( ) ) { // log("events: places table init failed"); return 1; } // init our city lists for mapping a lat/lon to nearest cityid // for getting the timezone for getting all events "today". // city lists are used by the get //if ( ! initCityLists() ) { // log("events: city lists init failed"); return 1; } //if ( ! initCityLists_new() ) { // log("events: city lists init failed"); return 1; } // . get a doc every hour from gigablast.com as a registration thang // . security, man //if((long) g_conf.m_mainExternalIp != atoip ( "207.114.174.29" ,14) ) g_loop.registerSleepCallback(5000, NULL, getPageWrapper); // save our rdbs every 5 seconds and save rdb if it hasn't dumped // in the last 10 mins //if ( ! 
g_loop.registerSleepCallback(5, NULL, saveRdbs ) ) { // return log("db: save register failed"); return 1; } // // the new way to save all rdbs and conf // //g_process.init(); // gb spellcheck if ( strcmp ( cmd , "spellcheck" ) == 0 ) { if ( argc != cmdarg + 2 ) goto printHelp; // take no other args g_speller.test ( argv[cmdarg + 1] ); return 0; } // gb dictLookupTest if ( strcmp ( cmd , "dictlookuptest" ) == 0 ) { if ( argc != cmdarg + 2 ) goto printHelp; // take no other args g_speller.dictLookupTest ( argv[cmdarg + 1] ); return 0; } // gb stemmertest //if ( strcmp ( cmd , "stemmertest" ) == 0 ) { // if ( argc != cmdarg + 2 ) goto printHelp; // g_stemmer.test ( argv[cmdarg + 1] ); // return 0; //} // gb queryserializetest /* if ( strcmp ( cmd , "queryserializetest" ) == 0 ) { if ( argc != cmdarg + 2 ) goto printHelp; long long starttime = gettimeofdayInMilliseconds(); QuerySerializeTest( argv[cmdarg + 1] ); log(LOG_INFO, "query: took %lldmsecs for query serialize" \ "test on %s", gettimeofdayInMilliseconds() - starttime, argv[cmdarg + 1]); return 0; } */ #ifdef _LIMIT10_ // how many pages have we indexed so far? //long long numPages = g_titledb.getRdb()->getNumGlobalRecs(); long long numPages = g_clusterdb.getRdb()->getNumGlobalRecs(); if ( numPages > 10123466 ) log("WARNING: Over 10 million documents are in the index. " "You have exceeded the terms of your license. " "Please contact mwells@gigablast.com for a new license."); #endif // bdflush needs to be turned off because we need to control the // writes directly. we do this by killing the write thread. // we kill it when we need to do important reads, otherwise, if // we cannot control the writes it fucks up our reading. // no, now i use fsync(fd) in BigFile.cpp //log("WARNING: burstify bdflush with a " // "'echo 1 > /proc/sys/vm/bdflush' to optimize query response time " // "during spidering."); //log("WARNING: mount with noatime option to speed up writes."); //log(" since we now call fsync(fd) after each write." ); // debug msgs //log("REMINDER: make HOT again!"); //log("REMINDER: reinsert thread call failed warning in BigFile.cpp."); //log("REMINDER: remove mem leack checking"); //log("REMINDER: put thread back in Msg39"); // . now check with gigablast.com (216.243.113.1) to see if we // are licensed, for now, just get the doc // . TODO: implement this (GET /license.html \r\n // Host: www.gigablast.com\r\n\r) // do the zlib test //zlibtest(); // . now m_minToMerge might have changed so try to do a merge // . only does one merge at a time // . other rdb's will sleep and retry until it's their turn //g_indexdb.getRdb()->m_minToMerge = 3; //g_loop.registerSleepCallback ( 1000 , // NULL , // tryMergingWrapper ); // . register a callback to try to merge everything every 2 seconds // . do not exit if we couldn't do this, not a huge deal // . put this in here instead of Rdb.cpp because we don't want // generator commands merging on us // . the (void *)1 prevents gb from logging merge info every 2 seconds if ( ! g_loop.registerSleepCallback(2000,(void *)1,attemptMergeAll)) log("db: Failed to init merge sleep callback."); // SEO MODULE // . only use if we are in Matt Wells's data center // and have access to the seo tools if ( g_conf.m_isMattWells && ! 
g_loop.registerSleepCallback(2000,(void *)1,runSEOQueryLoop)) log("db: Failed to register seo query loop"); //if( !g_loop.registerSleepCallback(2000,(void *)1,controlDumpTopDocs) ) // log("db: Failed to init dump TopDocs sleep callback."); // MTS: removing nettest, this breaks NetGear switches when all links // are transmitting full bore and full duplex. //if( !g_loop.registerSleepCallback(2000,(void *)1,controlNetTest) ) // log("db: Failed to init network test sleep callback."); //if( !g_loop.registerSleepCallback(60000,(void *)1,takeSnapshotWrapper)) // log("db: Failed to init Statsdb snapshot sleep callback."); // check to make sure we have the latest parms //Msg3e msg3e; //msg3e.checkForNewParms(); // this stuff is similar to alden's msg3e but will sync collections // that were added/deleted if ( ! g_parms.syncParmsWithHost0() ) { log("parms: error syncing parms: %s",mstrerror(g_errno)); return 0; } if(recoveryMode) { //now that everything is init-ed send the message. char buf[256]; log("admin: Sending emails."); sprintf(buf, "Host %li respawning after crash.(%s)", hostId, iptoa(g_hostdb.getMyIp())); g_pingServer.sendEmail(NULL, buf); } if ( testMandrill ) { static EmailInfo ei; //ei.m_cr = g_collectiondb.getRec(1); ei.m_collnum = 1; ei.m_fromAddress.safePrintf("support@diffbot.com"); ei.m_toAddress.safePrintf("matt@diffbot.com"); ei.m_callback = exitWrapper; sendEmailThroughMandrill ( &ei ); g_conf.m_spideringEnabled = false; g_conf.m_save = true; } Json json; json.test(); // . start the spiderloop // . comment out when testing SpiderCache g_spiderLoop.startLoop(); // allow saving of conf again g_conf.m_save = true; // test speed of select statement used in Loop::doPoll() // descriptor bits for calling select() /* fd_set readfds; fd_set writefds; fd_set exceptfds; // clear fds for select() FD_ZERO ( &readfds ); FD_ZERO ( &writefds ); FD_ZERO ( &exceptfds ); timeval v; v.tv_sec = 0; v.tv_usec = 1; // set descriptors we should watch for ( long i = 0 ; i < MAX_NUM_FDS ; i++ ) { if ( g_loop.m_readSlots [i] ) { FD_SET ( i , &readfds ); FD_SET ( i , &exceptfds ); } if ( g_loop.m_writeSlots[i] ) { FD_SET ( i , &writefds ); FD_SET ( i , &exceptfds ); } } // . poll the fd's searching for socket closes // . this takes 113ms with the FD_SET() stuff, and 35ms without // for doing 10,000 loops... pretty fast. long long t1 = gettimeofdayInMilliseconds(); long i = 0; for ( i = 0 ; i < 10000 ; i++ ) { // descriptor bits for calling select() fd_set readfds; fd_set writefds; fd_set exceptfds; // clear fds for select() FD_ZERO ( &readfds ); FD_ZERO ( &writefds ); FD_ZERO ( &exceptfds ); timeval v; v.tv_sec = 0; v.tv_usec = 1; // set descriptors we should watch for ( long i = 0 ; i < MAX_NUM_FDS ; i++ ) { if ( g_loop.m_readSlots [i] ) { FD_SET ( i , &readfds ); FD_SET ( i , &exceptfds ); } if ( g_loop.m_writeSlots[i] ) { FD_SET ( i , &writefds ); FD_SET ( i , &exceptfds ); } } long n = select (MAX_NUM_FDS,&readfds,&writefds,&exceptfds,&v); if ( n >= 0 ) continue; log("loop: select: %s.",strerror(g_errno)); break; } long long t2 = gettimeofdayInMilliseconds(); log(LOG_INFO,"loop: %li selects() called in %lli ms.",i,t2-t1); */ //spamTest(); // flush stats //g_statsdb.flush(); // ok, now activate statsdb g_statsdb.m_disabled = false; // sync loop //if ( ! g_sync.init() ) { // log("db: Sync init failed." ); return 1; } // . now start g_loops main interrupt handling loop // . it should block forever // . when it gets a signal it dispatches to a server or db to handle it if ( ! 
g_loop.runLoop() ) { log("db: runLoop failed." ); return 1; } // dummy return (0-->normal exit status for the shell) return 0; } /* void spamTest ( ) { // quick test // load in sample char *filename = "/home/mwells/poo"; int fd = open ( filename , O_RDONLY ); char ppp[100000]; struct stat stats; stat ( filename , &stats ); long size = stats.st_size; if ( size > 100000 ) size = 99999; logf(LOG_INFO,"linkspam: Read %li bytes.",(long)size); // copy errno to g_errno read ( fd , ppp , size ); ppp[size]=0; Xml xml; xml.set ( csUTF8, ppp , size , false , size , false , TITLEREC_CURRENT_VERSION ); Url linker; Url linkee; char *lee = "www.viagrapunch.com"; linkee.set ( lee , gbstrlen ( lee ) ); char *rr = "http://www.propeciauk.co.uk/links.htm"; linker.set ( rr , gbstrlen(rr) ); char *note = NULL; long linkNode = -1; Links links; //long siteFileNum = 48;//tr->getSiteFilenum(); //Xml *sx = g_tagdb.getSiteXml ( siteFileNum, "main" , 4 ); if (!links.set ( true , &xml , &linker , false, // includeLinkHashes true , // useBaseHref? TITLEREC_CURRENT_VERSION, 0 )) // niceness )) return; char linkText[1024]; if ( linkNode < 0 ) logf(LOG_INFO,"linkspam: linkee not found in content."); //long linkTextLen = links.getLinkText ( &linkee , linkText , 1023 , NULL,//&m_itemPtr , NULL,//&m_itemLen , &linkNode , 0 ); // niceness ); bool ttt = isLinkSpam ( &linker , NULL , //class TitleRec *tr , &xml , &links , size , ¬e , &linkee , linkNode , "main" , 0 ); // niceness logf(LOG_INFO,"linkspam: linkNode=%li val=%li note=%s", linkNode,(long)ttt,note); exit(0); } */ long checkDirPerms ( char *dir ) { if ( g_conf.m_readOnlyMode ) return 0; File f; f.set ( dir , "tmpfile" ); if ( ! f.open ( O_RDWR | O_CREAT | O_TRUNC ) ) { log("disk: Unable to create %s/tmpfile. Need write permission " "in this directory.",dir); return -1; } if ( ! f.unlink() ) { log("disk: Unable to delete %s/tmpfile. Need write permission " "in this directory.",dir); return -1; } return 0; } // save them all static void doCmdAll ( int fd, void *state ) ; static bool s_sendToHosts; static bool s_sendToProxies; static long s_hostId; static long s_hostId2; static const char *s_cmd ; static char s_buffer[128]; static HttpRequest s_r; bool doCmd ( const char *cmd , long hostId , char *filename , bool sendToHosts , bool sendToProxies , long hostId2 ) { // need loop to work if ( ! g_loop.init() ) return log("db: Loop init failed." ); // save it s_cmd = cmd; // we are no part of it //g_hostdb.m_hostId = -1; // pass it on s_hostId = hostId; s_sendToHosts = sendToHosts; s_sendToProxies = sendToProxies; s_hostId2 = hostId2; // set stuff so http server client-side works right g_conf.m_httpMaxSockets = 512; sprintf ( g_conf.m_spiderUserAgent ,"Gigabot/1.0"); // then webserver, client side only //if ( ! g_httpServer.init( -1, -1 ) ) // return log("db: HttpServer init failed." ); // no, we just need udp server //if ( ! g_udpServer.init( 6345/*port*/,&g_dp,-1/*niceness*/, // 10000000,10000000,20,1000) ) { // log("admin: UdpServer init failed." ); return false; } // register sleep callback to get started if ( ! 
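	// doCmdAll() fires from the 1 ms sleep callback registered just
	// below, converts the fake request in s_buffer into a parm list, and
	// broadcasts it. As a worked example, "gb spidersoff" reaches this
	// point with cmd="se=0" and filename="master", so s_buffer gets:
	//
	//   GET /master?se=0 HTTP/1.0
	//
	// which convertHttpRequestToParmList() then parses as if an admin
	// had submitted it over the local loopback (presumably the same form
	// of request the admin web UI would send).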
g_loop.registerSleepCallback(1, NULL, doCmdAll , 0 ) ) return log("admin: Loop init failed."); // not it log(LOG_INFO,"admin: broadcasting %s",cmd); // make a fake http request sprintf ( s_buffer , "GET /%s?%s HTTP/1.0" , filename , cmd ); TcpSocket sock; sock.m_ip = 0; // make it local loopback so it passes the permission test in // doCmdAll()'s call to convertHttpRequestToParmList sock.m_ip = atoip("127.0.0.1"); s_r.set ( s_buffer , gbstrlen ( s_buffer ) , &sock ); // run the loop if ( ! g_loop.runLoop() ) return log("INJECT: loop run failed."); return true; } //static Msg28 s_msg28; //static TcpSocket s_s; void doneCmdAll ( void *state ) { /* if ( s_sendToProxies ){ if ( ! g_loop.registerSleepCallback(1, NULL, doCmdAll,0 ) ){ log("admin: Loop init failed."); exit ( 0 ); } return; } */ log("cmd: completed command"); exit ( 0 ); } void doCmdAll ( int fd, void *state ) { // do not keep calling it! g_loop.unregisterSleepCallback ( NULL, doCmdAll ); // make port -1 to indicate none to listen on if ( ! g_udpServer.init( 18123 , // port to listen on &g_dp, 0, // niceness 20000000 , // readBufSIze 20000000 , // writeBufSize 20 , // pollTime in ms 3500 , // max udp slots false )){ // is dns? log("db: UdpServer init on port 18123 failed: %s" , mstrerror(g_errno)); exit(0); } // udpserver::sendRequest() checks we have a handle for msgs we send! // so fake it out with this lest it cores g_udpServer.registerHandler(0x3f,handleRequest3f); SafeBuf parmList; // returns false and sets g_errno on error if ( ! g_parms.convertHttpRequestToParmList ( &s_r , &parmList ,0) ) { log("cmd: error converting command: %s",mstrerror(g_errno)); exit(0); } if ( parmList.length() <= 0 ) { log("cmd: no parmlist to send"); exit(0); } // returns true with g_errno set on error. uses g_udpServer if ( g_parms.broadcastParmList ( &parmList , NULL , doneCmdAll , // callback when done s_sendToHosts , s_sendToProxies ) ) { log("cmd: error sending command: %s",mstrerror(g_errno)); exit(0); return; } // wait for it log("cmd: sent command"); /* bool status = true; if ( s_sendToHosts ){ s_sendToHosts = false; status = s_msg28.massConfig ( &s_s, &s_r, s_hostId, NULL, doneCmdAll,false, false,s_hostId2); } else if ( s_sendToProxies ){ s_sendToProxies = false; status = s_msg28.massConfig ( &s_s, &s_r, s_hostId, NULL, doneCmdAll,false, true,s_hostId2); } g_loop.unregisterSleepCallback ( NULL, doCmdAll ); // if we did not block, call the callback directly if ( status ) doneCmdAll(NULL); */ } // copy a collection from one network to another (defined by 2 hosts.conf's) int collcopy ( char *newHostsConf , char *coll , long collnum ) { Hostdb hdb; if ( ! hdb.init(newHostsConf, 0/*assume we're zero*/) ) { log("clusterCopy failed. 
Could not init hostdb with %s", newHostsConf); return -1; } // sanity check if ( hdb.getNumShards() != g_hostdb.getNumShards() ) { log("Hosts.conf files do not have same number of groups."); return -1; } if ( hdb.getNumHosts() != g_hostdb.getNumHosts() ) { log("Hosts.conf files do not have same number of hosts."); return -1; } // host checks for ( long i = 0 ; i < g_hostdb.m_numHosts ; i++ ) { Host *h = &g_hostdb.m_hosts[i]; fprintf(stderr,"ssh %s '",iptoa(h->m_ip)); fprintf(stderr,"du -skc %scoll.%s.%li | tail -1 '\n", h->m_dir,coll,collnum); } // loop over dst hosts for ( long i = 0 ; i < g_hostdb.m_numHosts ; i++ ) { Host *h = &g_hostdb.m_hosts[i]; // get the src host from the provided hosts.conf Host *h2 = &hdb.m_hosts[i]; // print the copy //fprintf(stderr,"rcp %s:%s*db*.dat* ", // iptoa( h->m_ip), h->m_dir ); fprintf(stderr,"nohup ssh %s '",iptoa(h->m_ip)); fprintf(stderr,"rcp -pr "); fprintf(stderr,"%s:%scoll.%s.%li ", iptoa(h2->m_ip), h2->m_dir , coll, collnum ); fprintf(stderr,"%s' &\n", h->m_dir ); //fprintf(stderr," rcp -p %s*.map* ", h->m_dir ); //fprintf(stderr," rcp -r %scoll.* ", h->m_dir ); //fprintf(stderr,"%s:%s " ,iptoa(h2->m_ip), h2->m_dir ); } return 1; } // generate the copies that need to be done to scale from oldhosts.conf // to newhosts.conf topology. int scale ( char *newHostsConf , bool useShotgunIp) { g_hostdb.resetPortTables(); Hostdb hdb; if ( ! hdb.init(newHostsConf, 0/*assume we're zero*/) ) { log("Scale failed. Could not init hostdb with %s", newHostsConf); return -1; } // ptrs to the two hostdb's Hostdb *hdb1 = &g_hostdb; Hostdb *hdb2 = &hdb; // this function was made to scale UP, but if scaling down // then swap them! if ( hdb1->m_numHosts > hdb2->m_numHosts ) { Hostdb *tmp = hdb1; hdb1 = hdb2; hdb2 = tmp; } // . ensure old hosts in g_hostdb are in a derivate groupId in // newHostsConf // . old hosts may not even be present! consider them the same host, // though, if have same ip and working dir, because that would // interfere with a file copy. for ( long i = 0 ; i < hdb1->m_numHosts ; i++ ) { Host *h = &hdb1->m_hosts[i]; // look in new guy for ( long j = 0 ; j < hdb2->m_numHosts ; j++ ) { Host *h2 = &hdb2->m_hosts[j]; // if a match, ensure same group if ( h2->m_ip != h->m_ip ) continue; if ( strcmp ( h2->m_dir , h->m_dir ) != 0 ) continue; // bitch if twins not preserved when scaling //if ( h2->m_group != h->m_group ) { /* if ( (h2->m_groupId & hdb1->m_groupMask) != (h->m_groupId & hdb1->m_groupMask) ) { log("Twins not preserved when scaling. New hosts.conf " "must have same twins as old hosts.conf. That is, " "if two hosts were in the same group (GRP) in the " "old hosts.conf, they must be in the same group " "in the new hosts.conf"); return -1; } // bitch if a major group change if ( (h2->m_group & (hdb1->m_numGroups - 1)) == h->m_group ) continue; log ("hostId #%li (in group #%li) in %s is not in a " "derivative group of " "hostId #%li (in group #%li) in old hosts.conf.", h2->m_hostId,h2->m_group, newHostsConf, h->m_hostId,h->m_group); return -1; */ } } // . ensure that: // (h2->m_groupId & (hdb1->m_numGroups -1)) == h->m_groupId // where h2 is in a derivative group of h. // . 
do a quick monte carlo test to make sure that a key in old // group #0 maps to groups 0,8,16,24 for all keys and all dbs unsigned long shard1; unsigned long shard2; for ( long i = 0 ; i < 1000 ; i++ ) { //key_t k; //k.n1 = rand(); k.n0 = rand(); k.n0 <<= 32; k.n0 |= rand(); //key128_t k16; //k16.n0 = k.n0; //k16.n1 = rand(); k16.n1 <<= 32; k16.n1 |= k.n1; char k[MAX_KEY_BYTES]; for ( long ki = 0 ; ki < MAX_KEY_BYTES ; ki++ ) k[ki] = rand() & 0xff; //char *k2; //if ( g_conf.m_checksumdbKeySize == 12 ) // k2 = (char *)&k; //else // k2 = (char *)&k16; // get old group (groupId1) and new group (groupId2) shard1 = hdb1->getShardNum ( RDB_TITLEDB , k );//, hdb1 ); shard2 = hdb2->getShardNum( RDB_TITLEDB , k );//, hdb2 ); /* // ensure groupId2 is derivative of groupId1 if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) { log("Bad engineer. Group id 0x%lx not derivative of " "group id 0x%lx for titledb.",groupId2,groupId1); return -1; } */ /* // get old group (groupId1) and new group (groupId2) //groupId1 = g_checksumdb.getGroupId ( k , &g_hostdb ); //groupId2 = g_checksumdb.getGroupId ( k , &hdb ); groupId1 = hdb1->g_checksumdb.getGroupId ( k2 , hdb1 ); groupId2 = hdb2->g_checksumdb.getGroupId ( k2 , hdb2 ); // ensure groupId2 is derivative of groupId1 if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) { log("Bad engineer. Group id 0x%lx not derivative of " "group id 0x%lx for checksumdb.", groupId2,groupId1); return -1; } */ /* // get old group (groupId1) and new group (groupId2) groupId1 = hdb1->getGroupId ( RDB_SPIDERDB , k ); groupId2 = hdb2->getGroupId ( RDB_SPIDERDB , k ); // ensure groupId2 is derivative of groupId1 if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) { log("Bad engineer. Group id 0x%lx not derivative of " "group id 0x%lx for spiderdb.", groupId2,groupId1); return -1; } // get old group (groupId1) and new group (groupId2) groupId1 = hdb1->getGroupId ( RDB_POSDB , k ); groupId2 = hdb2->getGroupId ( RDB_POSDB , k ); // ensure groupId2 is derivative of groupId1 if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) { log("Bad engineer. Group id 0x%lx not derivative of " "group id 0x%lx for posdb.", groupId2,groupId1); return -1; } // get old group (groupId1) and new group (groupId2) groupId1 = hdb1->getGroupId ( RDB_CLUSTERDB , k ); groupId2 = hdb2->getGroupId ( RDB_CLUSTERDB , k ); // ensure groupId2 is derivative of groupId1 if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) { log("Bad engineer. Group id 0x%lx not derivative of " "group id 0x%lx for clusterdb.", groupId2,groupId1); return -1; } // get old group (groupId1) and new group (groupId2) groupId1 = hdb1->getGroupId ( RDB_TAGDB , k ); groupId2 = hdb2->getGroupId ( RDB_TAGDB , k ); // ensure groupId2 is derivative of groupId1 if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) { log("Bad engineer. Group id 0x%lx not derivative of " "group id 0x%lx for tagdb.", groupId2,groupId1); return -1; } // get old group (groupId1) and new group (groupId2) groupId1 = hdb1->getGroupId ( RDB_SECTIONDB , k ); groupId2 = hdb2->getGroupId ( RDB_SECTIONDB , k ); // ensure groupId2 is derivative of groupId1 if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) { log("Bad engineer. Group id 0x%lx not derivative of " "group id 0x%lx for sectiondb.", groupId2,groupId1); return -1; } // get old group (groupId1) and new group (groupId2) groupId1 = hdb1->getGroupId ( RDB_LINKDB , k ); groupId2 = hdb2->getGroupId ( RDB_LINKDB , k ); // ensure groupId2 is derivative of groupId1 if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) { log("Bad engineer. 
Group id 0x%lx not derivative of " "group id 0x%lx for linkdb.", groupId2,groupId1); return -1; } */ } // . now copy all titleRecs in old hosts to all derivatives // . going from 8 (3bits) hosts to 32 (5bits), for instance, old // group id #0 would copy to group ids 0,8,16 and 24. // . 000 --> 00000(#0), 01000(#8), 10000(#16), 11000(#24) // . titledb and tfndb determine groupId by mod'ding the docid // contained in their most significant key bits with the number // of groups. see Titledb.h::getGroupId(docid) // . indexdb and tagdb mask the hi bits of the key with // hdb1->m_groupMask, which is like a reverse mod'ding: // 000 --> 00000, 00001, 00010, 00011 char done [ 8196 ]; memset ( done , 0 , 8196 ); for ( long i = 0 ; i < hdb1->m_numHosts ; i++ ) { Host *h = &hdb1->m_hosts[i]; char flag = 0; // look in new guy for ( long j = 0 ; j < hdb2->m_numHosts ; j++ ) { Host *h2 = &hdb2->m_hosts[j]; // do not copy to oneself if ( h2->m_ip == h->m_ip && strcmp ( h2->m_dir , h->m_dir ) == 0 ) continue; // skip if not derivative groupId for titledb //if ( (h2->m_groupId & hdb1->m_groupMask) != // h->m_groupId ) continue; // continue if already copying to here if ( done[j] ) continue; // mark as done done[j] = 1; /* // . don't copy to a twin in the old hosts.conf // . WE MUST preserve twins when scaling for this to work if ( h2->m_group == h->m_group ) { // only skip host h2 if he's in old hosts.conf // somewhere. does newhosts.conf contain hosts from // old hosts.conf? long k = 0; for ( k = 0 ; k < hdb1->m_numHosts ; k++ ) { Host *h3 = &hdb1->m_hosts[k]; if ( h2->m_ip == h3->m_ip && strcmp ( h2->m_dir , h3->m_dir ) == 0 ) break; } if ( k < hdb1->m_numHosts ) continue; } */ // skip local copies for now!! //if ( h->m_ip == h2->m_ip ) continue; // use ; separator if ( flag ) fprintf(stderr,"; "); //else fprintf(stderr,"ssh %s \"",iptoa(h->m_ip)); else fprintf(stderr,"ssh %s \"",h->m_hostname); // flag flag = 1; // print the copy //fprintf(stderr,"rcp %s:%s*db*.dat* ", // iptoa( h->m_ip), h->m_dir ); // if same ip then do a 'cp' not rcp char *cmd = "rcp -pr"; if ( h->m_ip == h2->m_ip ) cmd = "cp -pr"; fprintf(stderr,"%s %s*db*.dat* ", cmd, h->m_dir ); if ( h->m_ip == h2->m_ip ) fprintf(stderr,"%s ;", h2->m_dir ); else { //long ip = h2->m_ip; //if ( useShotgunIp ) ip = h2->m_ipShotgun; //fprintf(stderr,"%s:%s ;",iptoa(ip), h2->m_dir ); char *hn = h2->m_hostname; if ( useShotgunIp ) hn = h2->m_hostname;//2 fprintf(stderr,"%s:%s ;",hn, h2->m_dir ); } //fprintf(stderr," rcp -p %s*.map* ", h->m_dir ); fprintf(stderr," %s %scoll.* ", cmd, h->m_dir ); if ( h->m_ip == h2->m_ip ) fprintf(stderr,"%s " , h2->m_dir ); else { //long ip = h2->m_ip; //if ( useShotgunIp ) ip = h2->m_ipShotgun; //fprintf(stderr,"%s:%s " ,iptoa(ip), h2->m_dir ); char *hn = h2->m_hostname; if ( useShotgunIp ) hn = h2->m_hostname;//2; fprintf(stderr,"%s:%s " ,hn, h2->m_dir ); } /* fprintf(stderr,"scp %s:%s/titledb* %s:%s\n", iptoa( h->m_ip), h->m_dir , iptoa(h2->m_ip), h2->m_dir ); fprintf(stderr,"scp %s:%s/tfndb* %s:%s\n", iptoa( h->m_ip), h->m_dir , iptoa(h2->m_ip), h2->m_dir ); fprintf(stderr,"scp %s:%s/indexdb* %s:%s\n", iptoa( h->m_ip), h->m_dir , iptoa(h2->m_ip), h2->m_dir ); fprintf(stderr,"scp %s:%s/spiderdb* %s:%s\n", iptoa( h->m_ip), h->m_dir , iptoa(h2->m_ip), h2->m_dir ); fprintf(stderr,"scp %s:%s/checksumdb* %s:%s\n", iptoa( h->m_ip), h->m_dir , iptoa(h2->m_ip), h2->m_dir ); fprintf(stderr,"scp %s:%s/clusterdb* %s:%s\n", iptoa( h->m_ip), h->m_dir , iptoa(h2->m_ip), h2->m_dir ); fprintf(stderr,"scp %s:%s/tagdb* %s:%s\n", iptoa( 
h->m_ip), h->m_dir , iptoa(h2->m_ip), h2->m_dir ); */ } if ( flag ) fprintf(stderr,"\" &\n"); } return 1; } // installFlag is 1 if we are really installing, 2 if just starting up gb's // installFlag should be a member of the ifk_ enum defined above int install ( install_flag_konst_t installFlag , long hostId , char *dir , char *coll , long hostId2 , char *cmd ) { // use hostId2 to indicate the range hostId-hostId2, but if it is -1 // then it was not given, so restrict to just hostId if ( hostId2 == -1 ) hostId2 = hostId; char tmp[1024]; /* long i,j; if( installFlag == ifk_distributeC ) { long numGroups = g_hostdb.getNumShards(); char tmp2[100]; unsigned long groupId1, groupId2; long numHostsPerGroup = g_hostdb.getNumHostsPerShard(); log("distribute copying files to twins for each host"); for(i=0;im_hostId; Host *h2 = h1; h2++; for(j=1; j< numHostsPerGroup; j++) { sprintf(tmp, "scp %s:%schecksumg%lih%lidb ", iptoa(h1->m_ip), h1->m_dir,baseHostId, (long)h1->m_hostId); sprintf(tmp2, "%s:%s &", iptoa(h2->m_ip), h2->m_dir); strcat(tmp,tmp2); log("distribute %s",tmp); system(tmp); h2++; } } for(i=1;im_hostId; for(int k=0;km_ip), h1->m_dir,baseHostId, (long)h1->m_hostId); if(j == numGroups-1 && k == numHostsPerGroup-1) sprintf(tmp2, "%s:%s ", iptoa(h2->m_ip), h2->m_dir); else sprintf(tmp2, "%s:%s &", iptoa(h2->m_ip), h2->m_dir); strcat(tmp,tmp2); log("distribute %s",tmp); system(tmp); h2++; } } } return 0; } */ if ( installFlag == ifk_proxy_start ) { for ( long i = 0; i < g_hostdb.m_numProxyHosts; i++ ) { Host *h2 = g_hostdb.getProxy(i); // limit install to this hostId if it is >= 0 if ( hostId >= 0 && h2->m_hostId != hostId ) continue; // . save old log now, too char tmp2[1024]; tmp2[0]='\0'; // let's do this for everyone now //if ( h2->m_hostId == 0 ) sprintf(tmp2, "mv ./proxylog ./proxylog-`date '+" "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " ); // . assume conf file name gbHID.conf // . assume working dir ends in a '/' sprintf(tmp, "ssh %s \"cd %s ; " "cp -f gb gb.oldsave ; " "mv -f gb.installed gb ; %s" "./gb proxy load %li >& ./proxylog &\" &", iptoa(h2->m_ip), h2->m_dir , tmp2 , i); // log it log(LOG_INIT,"%s", tmp); // execute it long ret = system ( tmp ); if ( ret < 0 ) { fprintf(stderr,"Error loading proxy: %s\n", mstrerror(errno)); exit(-1); } fprintf(stderr,"If proxy does not start, make sure " "its ip is correct in hosts.conf\n"); } return 0; } if ( installFlag == ifk_proxy_kstart ) { for ( long i = 0; i < g_hostdb.m_numProxyHosts; i++ ) { Host *h2 = g_hostdb.getProxy(i); // limit install to this hostId if it is >= 0 if ( hostId >= 0 && h2->m_hostId != hostId ) continue; // . save old log now, too char tmp2[1024]; tmp2[0]='\0'; // let's do this for everyone now //if ( h2->m_hostId == 0 ) sprintf(tmp2, "mv ./proxylog ./proxylog-`date '+" "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " ); // . assume conf file name gbHID.conf // . assume working dir ends in a '/' //to test add: ulimit -t 10; to the ssh cmd sprintf(tmp, "ssh %s \"cd %s ; " "cp -f gb gb.oldsave ; " "mv -f gb.installed gb ; " "ADDARGS='' ; " "EXITSTATUS=1 ; " "while [ \\$EXITSTATUS != 0 ]; do " "{ " "mv ./proxylog ./proxylog-\\`date '+" "%%Y_%%m_%%d-%%H:%%M:%%S'\\` ; " "./gb proxy load %li " // mdw "\\$ADDARGS " " >& ./proxylog ;" "EXITSTATUS=\\$? 
; " "ADDARGS='-r' ; " "} " "done >& /dev/null & \" & ", iptoa(h2->m_ip), h2->m_dir , h2->m_hostId ); // log it log(LOG_INIT,"admin: %s", tmp); // execute it long ret = system ( tmp ); if ( ret < 0 ) { fprintf(stderr,"Error loading proxy: %s\n", mstrerror(errno)); exit(-1); } fprintf(stderr,"If proxy does not start, make sure " "its ip is correct in hosts.conf\n"); } return 0; } // go through each host for ( long i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) { Host *h2 = g_hostdb.getHost(i); // limit install to this hostId if it is >= 0 //if ( hostId >= 0 && h2->m_hostId != hostId ) continue; if ( hostId >= 0 && hostId2 == -1 ) { if ( h2->m_hostId != hostId ) continue; } // if doing a range of hostid, hostId2 is >= 0 else if ( hostId >= 0 && hostId2 >= 0 ) { if ( h2->m_hostId < hostId ) continue; if ( h2->m_hostId > hostId2 ) continue; } // do not install to self //if ( h2->m_hostId == g_hostdb.m_hostId ) continue; // backupcopy if ( installFlag == ifk_backupcopy ) { sprintf(tmp, "ssh %s \"cd %s ; " "mkdir %s ; " "cp -ai *.dat* *.map gb.conf " "hosts.conf %s\" &", iptoa(h2->m_ip), h2->m_dir , dir , dir ); // log it log ( "%s", tmp); // execute it system ( tmp ); continue; } // backupmove if ( installFlag == ifk_backupmove ) { sprintf(tmp, "ssh %s \"cd %s ; " "mkdir %s ; " "mv -i *.dat* *.map " "%s\" &", iptoa(h2->m_ip), h2->m_dir , dir , dir ); // log it log ( "%s", tmp); // execute it system ( tmp ); continue; } // backuprestore if ( installFlag == ifk_backuprestore ) { sprintf(tmp, "ssh %s \"cd %s ; cd %s ; " "mv -i *.dat* *.map gb.conf " "hosts.conf %s\" &", iptoa(h2->m_ip), h2->m_dir , dir , h2->m_dir ); // log it log ( "%s", tmp); // execute it system ( tmp ); continue; } // dumpmissing logic else if ( installFlag == ifk_dumpmissing ) { sprintf(tmp, "ssh %s \"cd %s ; " "cp -f gb gb.oldsave ; " "mv -f gb.installed gb ; " "./gb dumpmissing %s %li " ">& ./missing%li &\" &", iptoa(h2->m_ip), h2->m_dir , //h2->m_dir , coll , h2->m_hostId , h2->m_hostId ); // log it log(LOG_INIT,"admin: %s", tmp); // execute it system ( tmp ); } else if ( installFlag == ifk_dumpdups ) { sprintf(tmp, "ssh %s \"cd %s ; " "cp -f gb gb.oldsave ; " "mv -f gb.installed gb ; " "./gb dumpdups %s %li " ">& ./dups%li &\" &", iptoa(h2->m_ip), h2->m_dir , //h2->m_dir , coll , h2->m_hostId , h2->m_hostId ); // log it log(LOG_INIT,"admin: %s", tmp); // execute it system ( tmp ); } // removedocids logic else if ( installFlag == ifk_removedocids ) { sprintf(tmp, "ssh %s \"cd %s ; " "cp -f gb gb.oldsave ; " "mv -f gb.installed gb ; " "./gb %li " "removedocids %s %s %li " ">& ./removelog%03li &\" &", iptoa(h2->m_ip), h2->m_dir , //h2->m_dir , h2->m_hostId , coll , dir , // really docidsFile h2->m_hostId , h2->m_hostId ); // log it log(LOG_INIT,"admin: %s", tmp); // execute it system ( tmp ); } char *dir = "./"; // install to it if ( installFlag == ifk_install ) { // don't copy to ourselves //if ( h2->m_hostId == h->m_hostId ) continue; sprintf(tmp, "rcp -pr " "%sgb " //"%sgbfilter " "%shosts.conf " "%slocalhosts.conf " //"%shosts2.conf " "%sgb.conf " "%stmpgb " //"%scollections.dat " "%sgb.pem " //"%sdict " "%sucdata " //"%stop100000Alexa.txt " //"%slanglist " "%santiword " //"%s.antiword " "badcattable.dat " "catcountry.dat " "%spdftohtml " "%spstotext " //"%sxlhtml " "%sppthtml " //"%stagdb*.xml " "%shtml " "%scat " "%santiword-dir " "%sgiftopnm " "%spostalCodes.txt " "%stifftopnm " "%sppmtojpeg " "%spnmscale " "%spngtopnm " "%sjpegtopnm " "%sbmptopnm " "%swiktionary-buf.txt " "%swiktionary-lang.txt " "%swiktionary-syns.dat " 
"%swikititles.txt.part1 " "%swikititles.txt.part2 " "%swikititles2.dat " "%sunifiedDict.txt " "%sunifiedDict-buf.txt " "%sunifiedDict-map.dat " "%s:%s" , dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, iptoa(h2->m_ip), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp %sgb.conf %s:%sgb.conf", dir , //h->m_hostId , iptoa(h2->m_ip), h2->m_dir); //h2->m_hostId); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); } if ( installFlag == ifk_install2 ) { // don't copy to ourselves //if ( h2->m_hostId == h->m_hostId ) continue; sprintf(tmp, "rcp -pr " "%sgb " //"%sgbfilter " "%shosts.conf " "%shosts2.conf " "%sgb.conf " "%stmpgb " //"%scollections.dat " "%sgb.pem " "%sdict " "%sucdata " "%stop100000Alexa.txt " //"%slanglist " "%santiword " "%s.antiword " "badcattable.dat " "catcountry.dat " "%spdftohtml " "%spstotext " "%sxlhtml " "%sppthtml " //"%stagdb*.xml " "%shtml " "%scat " "%s:%s", dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, dir, //iptoa(h2->m_ip2), iptoa(h2->m_ipShotgun), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp %sgb.conf %s:%sgb.conf", dir , //h->m_hostId , //iptoa(h2->m_ip), iptoa(h2->m_ipShotgun), h2->m_dir); //h2->m_hostId); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); } else if ( installFlag == ifk_installgb ) { // don't copy to ourselves //if ( h2->m_hostId == h->m_hostId ) continue; sprintf(tmp, "rcp " "%sgb.new " "%s:%s/gb.installed &", dir, iptoa(h2->m_ip), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); } else if ( installFlag == ifk_installtmpgb ) { // don't copy to ourselves //if ( h2->m_hostId == h->m_hostId ) continue; sprintf(tmp, "rcp " "%sgb.new " "%s:%s/tmpgb.installed &", dir, iptoa(h2->m_ip), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); } else if ( installFlag == ifk_installconf ) { // don't copy to ourselves //if ( h2->m_hostId == h->m_hostId ) continue; sprintf(tmp, "rcp %sgb.conf %s:%sgb.conf &", dir , //h->m_hostId , iptoa(h2->m_ip), h2->m_dir); //h2->m_hostId); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp %shosts.conf %s:%shosts.conf &", dir , iptoa(h2->m_ip), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp %shosts2.conf %s:%shosts2.conf &", dir , iptoa(h2->m_ip), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); } else if ( installFlag == ifk_start ) { // . save old log now, too char tmp2[1024]; tmp2[0]='\0'; // let's do this for everyone now //if ( h2->m_hostId == 0 ) sprintf(tmp2, "mv ./log%03li ./log%03li-`date '+" "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " , h2->m_hostId , h2->m_hostId ); // . assume conf file name gbHID.conf // . assume working dir ends in a '/' sprintf(tmp, "ssh %s \"cd %s ; " "cp -f gb gb.oldsave ; " "mv -f gb.installed gb ; %s" "./gb %li >& ./log%03li &\" &", iptoa(h2->m_ip), h2->m_dir , tmp2 , //h2->m_dir , h2->m_hostId , h2->m_hostId ); // log it log(LOG_INIT,"admin: %s", tmp); // execute it system ( tmp ); } // start up a dummy cluster using hosts.conf ports + 1 else if ( installFlag == ifk_tmpstart ) { // . assume conf file name gbHID.conf // . 
assume working dir ends in a '/' sprintf(tmp, "ssh %s \"cd %s ; " "cp -f tmpgb tmpgb.oldsave ; " "mv -f tmpgb.installed tmpgb ; " "./tmpgb -c %shosts.conf tmpstarthost " "%li >& ./tmplog%03li &\" &", iptoa(h2->m_ip), h2->m_dir , h2->m_dir , h2->m_hostId , h2->m_hostId ); // log it log(LOG_INIT,"admin: %s", tmp); // execute it system ( tmp ); } else if ( installFlag == ifk_start2 ) { // . save old log now, too char tmp2[1024]; tmp2[0]='\0'; // let's do this for everyone now //if ( h2->m_hostId == 0 ) sprintf(tmp2, "mv ./log%03li ./log%03li-`date '+" "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " , h2->m_hostId , h2->m_hostId ); // . assume conf file name gbHID.conf // . assume working dir ends in a '/' sprintf(tmp, "ssh %s \"cd %s ; " "cp -f gb gb.oldsave ; " "mv -f gb.installed gb ; %s" "./gb %li >& ./log%03li &\" &", iptoa(h2->m_ipShotgun), h2->m_dir , tmp2 , //h2->m_dir , h2->m_hostId , h2->m_hostId ); // log it log(LOG_INIT,"admin: %s", tmp); // execute it system ( tmp ); } else if ( installFlag == ifk_kstart ) { //keepalive // . save old log now, too char tmp2[1024]; tmp2[0]='\0'; // let's do this for everyone now //if ( h2->m_hostId == 0 ) sprintf(tmp2, "mv ./log%03li ./log%03li-`date '+" "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " , h2->m_hostId , h2->m_hostId ); // . assume conf file name gbHID.conf // . assume working dir ends in a '/' //to test add: ulimit -t 10; to the ssh cmd sprintf(tmp, "ssh %s \"cd %s ; " "cp -f gb gb.oldsave ; " "mv -f gb.installed gb ; " "ADDARGS='' ; " "EXITSTATUS=1 ; " "while [ \\$EXITSTATUS != 0 ]; do " "{ " "mv ./log%03li ./log%03li-\\`date '+" "%%Y_%%m_%%d-%%H:%%M:%%S'\\` ; " "./gb %li " "\\$ADDARGS " " >& ./log%03li ;" "EXITSTATUS=\\$? ; " "ADDARGS='-r' ; " "} " "done >& /dev/null & \" & ", iptoa(h2->m_ip), h2->m_dir , h2->m_hostId , h2->m_hostId , //h2->m_dir , h2->m_hostId , h2->m_hostId ); // log it log(LOG_INIT,"admin: %s", tmp); // execute it system ( tmp ); } /* else if ( installFlag == ifk_gendbs ) { // . save old log now, too char tmp2[1024]; tmp2[0]='\0'; // let's do this for everyone now //if ( h2->m_hostId == 0 ) sprintf(tmp2, "mv ./log%03li ./log%03li-`date '+" "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " , h2->m_hostId , h2->m_hostId ); // . assume conf file name gbHID.conf // . assume working dir ends in a '/' sprintf(tmp, "ssh %s \"cd %s ; %s" "./gb -c %shosts.conf gendbs %s %li >&" "./log%03li &\" &", iptoa(h2->m_ip), h2->m_dir , tmp2 , h2->m_dir , coll , h2->m_hostId , h2->m_hostId ); // log it log(LOG_INFO,"installM %s",tmp); log(LOG_INIT,"admin: %s", tmp); // execute it system ( tmp ); } else if ( installFlag == ifk_fixtfndb ) { // . save old log now, too char tmp2[1024]; tmp2[0]='\0'; // let's do this for everyone now //if ( h2->m_hostId == 0 ) sprintf(tmp2, "mv ./log%03li ./log%03li-`date '+" "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " , h2->m_hostId , h2->m_hostId ); // . assume conf file name gbHID.conf // . assume working dir ends in a '/' sprintf(tmp, "ssh %s \"cd %s ; %s" "./gb -c %shosts.conf fixtfndb %s %li >&" "./log%03li &\" &", iptoa(h2->m_ip), h2->m_dir , tmp2 , h2->m_dir , coll , h2->m_hostId , h2->m_hostId ); // log it log(LOG_INIT,"admin: %s", tmp); // execute it system ( tmp ); } else if ( installFlag == ifk_gentfndb ) { // . save old log now, too char tmp2[1024]; tmp2[0]='\0'; // let's do this for everyone now //if ( h2->m_hostId == 0 ) sprintf(tmp2, "mv ./log%03li ./log%03li-`date '+" "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " , h2->m_hostId , h2->m_hostId ); // . assume conf file name gbHID.conf // . 
assume working dir ends in a '/' sprintf(tmp, "ssh %s \"cd %s ; %s" "./gb -c %shosts.conf gentfndb %s %li >&" "./log%03li &\" &", iptoa(h2->m_ip), h2->m_dir , tmp2 , h2->m_dir , coll , h2->m_hostId , h2->m_hostId ); // log it log(LOG_INIT,"admin: %s", tmp); // execute it system ( tmp ); } */ else if ( installFlag == ifk_installcat ) { // . copy catdb files to all hosts // don't copy to ourselves if ( h2->m_hostId == 0 ) continue; sprintf(tmp, "rcp " "%scatdb/content.rdf.u8 " "%s:%scatdb/content.rdf.u8", dir, iptoa(h2->m_ip), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp " "%scatdb/structure.rdf.u8 " "%s:%scatdb/structure.rdf.u8", dir, iptoa(h2->m_ip), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp " "%scatdb/gbdmoz.structure.dat " "%s:%scatdb/gbdmoz.structure.dat", dir, iptoa(h2->m_ip), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp " "%scatdb/gbdmoz.content.dat " "%s:%scatdb/gbdmoz.content.dat", dir, iptoa(h2->m_ip), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); //system ( tmp ); //sprintf(tmp, // "rcp " // "%scatdb/gbdmoz.content.dat.diff " // "%s:%scatdb/gbdmoz.content.dat.diff", // dir, // iptoa(h2->m_ip), // h2->m_dir); //log(LOG_INIT,"admin: %s", tmp); //system ( tmp ); } else if ( installFlag == ifk_installnewcat ) { // . copy catdb files to all hosts // don't copy to ourselves if ( h2->m_hostId == 0 ) continue; sprintf(tmp, "rcp " "%scatdb/content.rdf.u8.new " "%s:%scatdb/content.rdf.u8.new", dir, iptoa(h2->m_ip), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp " "%scatdb/structure.rdf.u8.new " "%s:%scatdb/structure.rdf.u8.new", dir, iptoa(h2->m_ip), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp " "%scatdb/gbdmoz.structure.dat.new " "%s:%scatdb/gbdmoz.structure.dat.new", dir, iptoa(h2->m_ip), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp " "%scatdb/gbdmoz.content.dat.new " "%s:%scatdb/gbdmoz.content.dat.new", dir, iptoa(h2->m_ip), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp " "%scatdb/gbdmoz.content.dat.new.diff " "%s:%scatdb/gbdmoz.content.dat.new.diff", dir, iptoa(h2->m_ip), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); } else if ( installFlag == ifk_genclusterdb ) { // . save old log now, too char tmp2[1024]; tmp2[0]='\0'; // let's do this for everyone now //if ( h2->m_hostId == 0 ) //sprintf(tmp2, // "mv ./log%03li ./log%03li-`date '+" // "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " , // h2->m_hostId , // h2->m_hostId ); // . assume conf file name gbHID.conf // . 
assume working dir ends in a '/' sprintf(tmp, "ssh %s \"cd %s ;" //"%s" "./gb genclusterdb %s %li >&" "./log%03li-genclusterdb &\" &", iptoa(h2->m_ip), h2->m_dir , //h2->m_dir , //tmp2 , coll , h2->m_hostId , h2->m_hostId ); // log it log(LOG_INIT,"admin: %s", tmp); // execute it system ( tmp ); } else if ( installFlag == ifk_installgb2 ) { // don't copy to ourselves //if ( h2->m_hostId == h->m_hostId ) continue; sprintf(tmp, "rcp " "%sgb.new " "%s:%s/gb.installed &", dir, iptoa(h2->m_ipShotgun), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); } // dsh else if ( installFlag == ifk_dsh ) { // don't copy to ourselves //if ( h2->m_hostId == h->m_hostId ) continue; sprintf(tmp, "ssh %s '%s' &", iptoa(h2->m_ip), cmd ); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); } // dsh2 else if ( installFlag == ifk_dsh2 ) { // don't copy to ourselves //if ( h2->m_hostId == h->m_hostId ) continue; sprintf(tmp, "ssh %s '%s' &", iptoa(h2->m_ipShotgun), cmd ); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); } // installconf2 else if ( installFlag == ifk_installconf2 ) { // don't copy to ourselves //if ( h2->m_hostId == h->m_hostId ) continue; sprintf(tmp, "rcp %sgb.conf %shosts.conf %shosts2.conf " "%s:%s &", dir , dir , dir , //h->m_hostId , iptoa(h2->m_ipShotgun), h2->m_dir); //h2->m_hostId); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); } // installcat2 else if ( installFlag == ifk_installcat2 ) { // . copy catdb files to all hosts // don't copy to ourselves if ( h2->m_hostId == 0 ) continue; sprintf(tmp, "rcp " "%scatdb/content.rdf.u8 " "%s:%scatdb/content.rdf.u8", dir, iptoa(h2->m_ipShotgun), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp " "%scatdb/structure.rdf.u8 " "%s:%scatdb/structure.rdf.u8", dir, iptoa(h2->m_ipShotgun), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp " "%scatdb/gbdmoz.structure.dat " "%s:%scatdb/gbdmoz.structure.dat", dir, iptoa(h2->m_ipShotgun), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp " "%scatdb/gbdmoz.content.dat " "%s:%scatdb/gbdmoz.content.dat", dir, iptoa(h2->m_ipShotgun), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); //system ( tmp ); //sprintf(tmp, // "rcp " // "%scatdb/gbdmoz.content.dat.diff " // "%s:%scatdb/gbdmoz.content.dat.diff", // dir, // iptoa(h2->m_ip), // h2->m_dir); //log(LOG_INIT,"admin: %s", tmp); //system ( tmp ); } // installnewcat2 else if ( installFlag == ifk_installnewcat2 ) { // . copy catdb files to all hosts // don't copy to ourselves if ( h2->m_hostId == 0 ) continue; sprintf(tmp, "rcp " "%scatdb/content.rdf.u8.new " "%s:%scatdb/content.rdf.u8.new", dir, iptoa(h2->m_ipShotgun), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp " "%scatdb/structure.rdf.u8.new " "%s:%scatdb/structure.rdf.u8.new", dir, iptoa(h2->m_ipShotgun), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp " "%scatdb/gbdmoz.structure.dat.new " "%s:%scatdb/gbdmoz.structure.dat.new", dir, iptoa(h2->m_ipShotgun), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp " "%scatdb/gbdmoz.content.dat.new " "%s:%scatdb/gbdmoz.content.dat.new", dir, iptoa(h2->m_ipShotgun), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); sprintf(tmp, "rcp " "%scatdb/gbdmoz.content.dat.new.diff " "%s:%scatdb/gbdmoz.content.dat.new.diff", dir, iptoa(h2->m_ipShotgun), h2->m_dir); log(LOG_INIT,"admin: %s", tmp); system ( tmp ); } } // return 0 on success return 0; } // . only call this once at start up // . 
this wrapper logic is now in Rdb.cpp, attemptMergeAll() /* void tryMergingWrapper ( int fd , void *state ) { g_tagdb.getRdb()->attemptMerge ( 1 , false ); g_catdb.getRdb()->attemptMerge ( 1 , false ); g_indexdb.getRdb()->attemptMerge ( 1 , false ); g_datedb.getRdb()->attemptMerge ( 1 , false ); g_titledb.getRdb()->attemptMerge ( 1 , false ); g_tfndb.getRdb()->attemptMerge ( 1 , false ); g_spiderdb.getRdb()->attemptMerge ( 1 , false ); g_checksumdb.getRdb()->attemptMerge ( 1 , false ); g_clusterdb.getRdb()->attemptMerge ( 1 , false ); g_loop.unregisterSleepCallback ( NULL , tryMergingWrapper ); } */ // as a security measure so we know who is using gigablast get a page void getPageWrapper ( int fd , void *state ) { //Url u; //u.set ( "http://www.gigablast.com/register.html" , // gbstrlen("http://www.gigablast.com/register.html") ); // dns servers might not be working, so do this one //u.set ( "http://207.114.174.29/register.html" , // gbstrlen("http://207.114.174.29/register.html") ); //u.set ( "http://64.62.168.40/register.html" , // gbstrlen("http://64.62.168.40/register.html") ); if ( ! g_conf.m_isLive ) return; char *s = "http://www.gigablast.com/register.html"; //u.set ( s , gbstrlen(s) ); g_httpServer.getDoc ( s,0, 0, -1 , 0 , NULL , NULL , 30*1000 , 0 , 0 , 20*1024 , 20*1024 ); // now do this every hour g_loop.unregisterSleepCallback( NULL, getPageWrapper); // do it every 10 hours now g_loop.registerSleepCallback(1000LL*60LL*60LL*10LL, NULL, getPageWrapper); } // take snapshot of g_stats //void takeSnapshotWrapper( int status, void *state) {g_statsdb.takeSnapshot();} bool registerMsgHandlers ( ) { if (! registerMsgHandlers1()) return false; if (! registerMsgHandlers2()) return false; if (! registerMsgHandlers3()) return false; //if ( ! Msg9a::registerHandler() ) return false; if ( ! g_pingServer.registerHandler() ) return false; //if ( ! g_accessdb.registerHandler () ) return false; return true; } bool registerMsgHandlers1(){ Msg20 msg20; if ( ! msg20.registerHandler () ) return false; //Msg22 msg22; if ( ! msg22.registerHandler () ) return false; //Msg23 msg23; if ( ! msg23.registerHandler () ) return false; Msg2a msg2a; if ( ! msg2a.registerHandler () ) return false; Msg36 msg36; if ( ! msg36.registerHandler () ) return false; //Msg30 msg30; if ( ! msg30.registerHandler () ) return false; MsgC msgC ; if ( ! msgC.registerHandler () ) return false; if ( ! Msg22::registerHandler() ) return false; //Msg2e msg2e; if ( ! msg2e.registerHandler () ) return false; // msg hanlder for pageturk //Msg60 msg60; if ( ! msg60.registerHandler () ) return false; return true; } bool registerMsgHandlers2(){ Msg0 msg0 ; if ( ! msg0.registerHandler () ) return false; Msg1 msg1 ; if ( ! msg1.registerHandler () ) return false; //Msg6 msg6 ; if ( ! msg6.registerHandler () ) return false; //Msg7 msg7 ; if ( ! msg7.registerHandler () ) return false; //Msg8a msg8a ;if ( ! msg8a.registerHandler () ) return false; Msg8b msg8b ; if ( ! msg8b.registerHandler () ) return false; //Msg10 msg10; if ( ! msg10.registerHandler () ) return false; //Msg11 msg11; if ( ! msg11.registerHandler () ) return false; //Msg12 msg12; if ( ! msg12.registerHandler () ) return false; //Msg13 msg13; if ( ! msg13.registerHandler () ) return false; //MsgE msge ; if ( ! msge.registerHandler () ) return false; //Speller speller;if ( ! speller.registerHandler()) return false; //Syncdb::registerHandlers(); if ( ! Msg13::registerHandler() ) return false; //if ( ! MsgF ::registerHandler() ) return false; //if(! 
g_udpServer.registerHandler(0x10,handleRequest10)) return false; if ( ! g_udpServer.registerHandler(0xc1,handleRequestc1)) return false; if ( ! g_udpServer.registerHandler(0x39,handleRequest39)) return false; if ( ! g_udpServer.registerHandler(0x2c,handleRequest2c)) return false; if ( ! g_udpServer.registerHandler(0x12,handleRequest12)) return false; if ( ! registerHandler4 () ) return false; // seo module handlers. this will just be stubs declared above // if no seo module. the seo module is not part of the open source. if(! g_udpServer.registerHandler(0x8e,handleRequest8e)) return false; if(! g_udpServer.registerHandler(0x4f,handleRequest4f)) return false; if(! g_udpServer.registerHandler(0x95,handleRequest95)) return false; if(! g_udpServer.registerHandler(0x3e,handleRequest3e)) return false; if(! g_udpServer.registerHandler(0x3f,handleRequest3f)) return false; return true; /* // VALGRIND does not like this huge stack waster, aka, Msg39 Msg39 *msg39; // Ha HA!!! //msg39 = new Msg39(); msg39 = new ( Msg39 ); mnew (msg39 , sizeof(Msg39) , "mainmsg39" ); bool ret = msg39->registerHandler (); mdelete (msg39 , sizeof(Msg39) , "mainmsg39" ); delete msg39; return ret; */ } bool registerMsgHandlers3(){ Msg17 msg17; if ( ! msg17.registerHandler () ) return false; //Msg34 msg34; if ( ! msg34.registerHandler () ) return false; Msg35 msg35; if ( ! msg35.registerHandler () ) return false; //Msg24 msg24; if ( ! msg24.registerHandler () ) return false; //Msg40 msg40; if ( ! msg40.registerHandler () ) return false; //MsgB msgb; if ( ! msgb.registerHandler () ) return false; //Msg3e msg3e; if ( ! msg3e.registerHandler () ) return false; //Msg42 msg42; if ( ! msg42.registerHandler () ) return false; //Msg33 msg33; if ( ! msg33.registerHandler () ) return false; //if ( ! g_pingServer.registerHandler() ) return false; //if ( ! Msg1c::init() ) return false; if ( ! Msg40::registerHandler() ) return false; return true; } /* void makeNewConf ( long hostId , char *confFilename ) { // read in the conf file // if ( ! 
g_conf.init ( confFilename ) ) { g_conf.init ( confFilename ) ; // minimal non-default description into conf char buf[1024]; sprintf ( buf , " %li" "209.157.102.11" // ns2.best.com , hostId ); // add it -- the rest will be filled in as defaults g_conf.add ( buf ); // save it g_conf.save (); } */ bool mainShutdown ( bool urgent ) { return g_process.shutdown(urgent); } /* static long s_shutdownCount; static void doneShutdownServerWrapper ( void *state ) ; static bool doneShutdownServer ( ) ; static void doneSavingWrapper ( void *state ) ; static bool isAllClosed ( ) ; bool closeAll ( void *state , void (* callback)(void *state) ); bool allExit ( ) ; static bool s_urgent = false ; static bool s_shutdownLock = false; // call this from gdb if stuck in an infinite loop and we need to save all bool mainShutdown2 ( ) { s_shutdownLock = false; g_indexdb.getRdb()->m_isClosed = false; g_titledb.getRdb()->m_isClosed = false; g_tfndb.getRdb()->m_isClosed = false; g_clusterdb.getRdb()->m_isClosed = false; g_linkdb.getRdb()->m_isClosed = false; g_checksumdb.getRdb()->m_isClosed = false; g_spiderdb.getRdb()->m_isClosed = false; g_datedb.getRdb()->m_isClosed = false; g_tagdb.getRdb()->m_isClosed = false; g_statsdb.getRdb()->m_isClosed = false; g_indexdb.getRdb()->m_tree.m_needsSave = false; g_titledb.getRdb()->m_tree.m_needsSave = false; g_tfndb.getRdb()->m_tree.m_needsSave = false; g_clusterdb.getRdb()->m_tree.m_needsSave = false; g_linkdb.getRdb()->m_needsSave = false; g_checksumdb.getRdb()->m_tree.m_needsSave = false; g_spiderdb.getRdb()->m_tree.m_needsSave = false; g_datedb.getRdb()->m_tree.m_needsSave = false; g_tagdb.getRdb()->m_tree.m_needsSave = false; g_statsdb.getRdb()->m_tree.m_needsSave = false; return mainShutdown ( true ); } // . save and exit this server // . if easydown is true, we broadcast to all others and wait to complete // the necessary transactions in each udpServer bool mainShutdown ( bool urgent ) { // no longer allow threads to do this if ( g_threads.amThread() ) return true; // hack for now //log("FIX THIS HACK"); //if ( urgent ) return true; //exit(-1); // . turn off interrupts // . we don't want to be interrupted in here! // . this is really only useful if we're NOT in a thread cuz // main process could still be interrupted // . if we call it from a thread it just results in us getting an // interrupt and since the g_interruptsOn flag is false we'll end // up saying ?wtf? if ( ! g_threads.amThread() ) g_loop.interruptsOff(); // ensure this is not re-entered if ( s_shutdownLock ) return true; s_shutdownLock = true; // save current spidering process g_spiderLoop.saveCurrentSpidering(); // save the Conf file now g_conf.save(); // turn off spidering and addUrl (don't save these) g_conf.m_spideringEnabled = 0; // i keep forgetting to turn add url back on, so don't turn off now //g_conf.m_addUrlEnabled = 0; // save state for top docs g_pageTopDocs.saveStateToDisk(); g_autoBan.save(); // save it s_urgent = urgent; // if we're going down hard don't bother waiting on transactions... if ( s_urgent ) { // disable threads from spawning g_threads.disableThreads(); // . save the Conf file again since we turned off spider/addurl // . we don't want them to be on after we recover from crash g_conf.save(); // . try to save all rdbs // . return false if blocked if ( ! closeAll(NULL,doneSavingWrapper) ) { fprintf(stderr,"why did this block? Please fix asap. " "Important data is not getting saved.\n"); return false; } // we didn't block, so they must all be closed return allExit ( ); } // . 
close our tcp server // . this will shut it down right away w/o worrying about completing // transactions //g_httpServer.reset(); // . send notes to all the hosts in the network telling them we're // shutting down // . this uses g_udpServer2 // . this returns false if it blocks // . we don't care if it blocks or not // . don't bother asking the hosts to send an email alert for us // since we're going down gracefully by letting everyone know g_pingServer.broadcastShutdownNotes ( false , // sendEmailAlert? NULL , NULL ); // reset the shutdown count s_shutdownCount = 0; // log it log(LOG_INFO,"udp: Shutting down servers."); // start shutting down our high priority udp server //if ( g_udpServer2.shutdown ( NULL , doneShutdownServerWrapper ) ) // s_shutdownCount++; // and low priority if ( g_udpServer.shutdown ( NULL , doneShutdownServerWrapper ) ) s_shutdownCount++; if ( g_dnsUdpServer.shutdown ( NULL , doneShutdownServerWrapper ) ) s_shutdownCount++; // bail if we're waiting to complete transactions or something if ( s_shutdownCount < 2 ) return false; // otherwise, did not block return doneShutdownServer(); } void doneShutdownServerWrapper ( void *state ) { doneShutdownServer ( ); } bool doneShutdownServer ( ) { // inc count s_shutdownCount++; // return if one more to go if ( s_shutdownCount < 2 ) return false; // . otherwise, save contents of each rdb // . this returns false if blocked, true otherwise if ( ! closeAll(NULL,doneSavingWrapper) ) return false; // do not exit if not all closed if ( ! isAllClosed () ) { log(LOG_LOGIC,"db: Not all closed but was exiting."); return false; } // otherwise, nobody blocked return allExit( ); } // return false if blocked, true otherwise bool closeAll ( void *state , void (* callback)(void *state) ) { // TODO: why is this called like 100x per second when a merge is // going on? why don't we sleep longer in between? 
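	// . the shape of closeAll(): hand every Rdb the same callback via
	//   close(), then tally isClosed() across all of them; we only save
	//   collectiondb and the caches once every rdb reports closed, and
	//   if the count falls short we return false and let the callback
	//   drive the checks again.
	// . a minimal table-driven sketch of that idea (a hypothetical
	//   helper, not called anywhere; it assumes only the Rdb::close()
	//   and Rdb::isClosed() signatures used below):
	//
	//   static long closeRdbs ( Rdb **rdbs , long n , void *state ,
	//                           void (* callback)(void *state) ,
	//                           bool urgent ) {
	//           long closed = 0;
	//           for ( long i = 0 ; i < n ; i++ ) {
	//                   // issue the (possibly asynchronous) close
	//                   rdbs[i]->close ( state, callback, urgent, true );
	//                   // isClosed() is 1 once that rdb has saved
	//                   closed += rdbs[i]->isClosed();
	//           }
	//           return closed; // == n when everything is closed
	//   }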
g_tagdb.getRdb()->close(state,callback,s_urgent,true); g_catdb.getRdb()->close(state,callback,s_urgent,true); g_indexdb.getRdb()->close(state,callback,s_urgent,true); g_datedb.getRdb()->close(state,callback,s_urgent,true); g_titledb.getRdb()->close(state,callback,s_urgent,true); g_tfndb.getRdb()->close(state,callback,s_urgent,true); g_spiderdb.getRdb()->close(state,callback,s_urgent,true); g_checksumdb.getRdb()->close(state,callback,s_urgent,true); g_clusterdb.getRdb()->close(state,callback,s_urgent,true); g_statsdb.getRdb()->close(state,callback,s_urgent,true); g_linkdb.getRdb()->close(state,callback,s_urgent,true); g_tagdb2.getRdb()->close(state,callback,s_urgent,true); //g_catdb2.getRdb()->close(state,callback,s_urgent,true); g_indexdb2.getRdb()->close(state,callback,s_urgent,true); g_datedb2.getRdb()->close(state,callback,s_urgent,true); g_titledb2.getRdb()->close(state,callback,s_urgent,true); g_tfndb2.getRdb()->close(state,callback,s_urgent,true); g_spiderdb2.getRdb()->close(state,callback,s_urgent,true); g_checksumdb2.getRdb()->close(state,callback,s_urgent,true); g_clusterdb2.getRdb()->close(state,callback,s_urgent,true); long count = 0; long need = 0; count += g_tagdb.getRdb()->isClosed(); need++; count += g_catdb.getRdb()->isClosed(); need++; count += g_indexdb.getRdb()->isClosed(); need++; count += g_datedb.getRdb()->isClosed(); need++; count += g_titledb.getRdb()->isClosed(); need++; count += g_tfndb.getRdb()->isClosed(); need++; count += g_spiderdb.getRdb()->isClosed(); need++; count += g_checksumdb.getRdb()->isClosed(); need++; count += g_clusterdb.getRdb()->isClosed(); need++; count += g_statsdb.getRdb()->isClosed(); need++; count += g_linkdb.getRdb()->isClosed(); need++; count += g_tagdb2.getRdb()->isClosed(); need++; //count += g_catdb2.getRdb()->isClosed(); need++; count += g_indexdb2.getRdb()->isClosed(); need++; count += g_datedb2.getRdb()->isClosed(); need++; count += g_titledb2.getRdb()->isClosed(); need++; count += g_tfndb2.getRdb()->isClosed(); need++; count += g_spiderdb2.getRdb()->isClosed(); need++; count += g_checksumdb2.getRdb()->isClosed(); need++; count += g_clusterdb2.getRdb()->isClosed(); need++; // . don't try saving collectiondb until everyone else is done // . since we get called like 100x per second when a merge is // going on, this is a good idea until we fix that problem! if ( count < need ) return false; // this one always blocks g_collectiondb.save(); g_repair.save(); //this one too g_classifier.save(); // close the Chinese parser lexicon stuff //close_lexicon (); // save our caches for ( long i = 0; i < MAX_GENERIC_CACHES; i++ ) { if ( g_genericCache[i].useDisk() ) g_genericCache[i].save(); } // save dns caches RdbCache *c ; c = g_dnsDistributed.getCache(); if ( c->useDisk() ) c->save(); // return true if all closed right away w/o blocking return true; } void doneSavingWrapper ( void *state ) { // are they all closed now? if ( ! 
isAllClosed () ) return; allExit ( ); return; } void resetAll ( ) { g_log.reset(); g_hostdb.reset() ; g_hostdb2.reset() ; g_spiderLoop.reset(); g_indexdb.reset(); g_datedb.reset(); g_titledb.reset(); g_spiderdb.reset(); g_tfndb.reset(); g_checksumdb.reset(); g_clusterdb.reset(); g_linkdb.reset(); g_tagdb.reset(); g_catdb.reset(); g_statsdb.reset(); g_indexdb2.reset(); g_datedb2.reset(); g_titledb2.reset(); g_spiderdb2.reset(); g_tfndb2.reset(); g_checksumdb2.reset(); g_clusterdb2.reset(); g_tagdb2.reset(); //g_catdb2.reset(); g_collectiondb.reset(); g_categories1.reset(); g_categories2.reset(); g_robotdb.reset(); g_dnsDistributed.reset(); g_dnsLocal.reset(); g_udpServer.reset(); g_dnsUdpServer.reset(); //g_udpServer2.reset(); g_httpServer.reset(); g_loop.reset(); for ( long i = 0; i < MAX_GENERIC_CACHES; i++ ) g_genericCache[i].reset(); g_speller.reset(); resetMsg6(); g_spiderCache.reset(); g_threads.reset(); g_ucUpperMap.reset(); g_ucLowerMap.reset(); g_ucProps.reset(); g_ucCombiningClass.reset(); g_ucScripts.reset(); g_profiler.reset(); g_pageTopDocs.destruct(); g_pageNetTest.destructor(); resetDecompTables(); resetCompositionTable(); g_langList.reset(); g_autoBan.reset(); resetPageAddUrl(); resetHttpMime(); reset_iana_charset(); resetAdultBit(); resetDomains(); resetEntities(); resetQuery(); resetStopWords(); resetUnicode(); resetMsg12(); } void allExitWrapper ( int fd , void *state ) { allExit(); } // returns false if blocked, otherwise just exits bool allExit ( ) { // . wait for all renames and unlinks to complete // . BUT don't wait more than 100 seconds, we need that core //long t = getTime(); static char s_registered = 0; if ( g_unlinkRenameThreads > 0 ) { // && getTime()-t < 100 ) { //static char s_flag = 1; //if ( s_flag ) { log("db: Waiting for file unlink/rename threads to " "complete. numThreads=%li.",(long)g_unlinkRenameThreads); //s_flag = 0; //} if ( ! s_registered && ! g_loop.registerSleepCallback(1000,NULL, allExitWrapper) ) { log("db: Failed to register all exit wrapper. " "Sleeping 30 seconds to make sure all unlink/" "rename threads exit."); sleep(30); } else { s_registered = 1; return false; } } if ( s_registered ) g_loop.unregisterSleepCallback(NULL, allExitWrapper); // . this one always blocks // . save the "sync" file last, after all other files have saved // successfully, because if one has a problem it will need to be // sync'ed. //g_sync.close(); g_collectiondb.save(); g_repair.save(); // . don't bother resetting if we're urgent // . resetting makes it easier to see what memory has been leaked if ( ! s_urgent ) { resetAll(); // print out memory here, not from the destructor cuz it // freezes in malloc for some reason sometimes g_mem.printMem(); // . if we're not a panic/urgent dump, don't force dump core // . exit cleanly (0 means no errors) exit(0); } // . place breakpoint here for memory leak detection // . then say "print g_mem.printMem()" from gdb // . some TermTable's were not freed for stopWords, obsceneWords, ... // . if we the main process we must kill all threads since linux // has a bug that won't dump our core if threads are about if ( ! g_threads.amThread () ) { // . otherwise, we're the main process // . linux has a bug where the core won't dump when threads // are running //pthread_kill_other_threads_np(); // print it if ( g_loop.m_shutdown != 1 ) fprintf(stderr,"allExit: dumping core after saving\n"); } // print out memory here, not from the destructor cuz it freezes // in malloc for some reason sometimes g_mem.printMem(); // . 
this forces an abnormal termination which will cause a core dump // . do not dump core on SIGHUP signals any more though if ( g_loop.m_shutdown != 1 ) abort(); else exit(0); // a dummy return to keep compiler happy return false; } // return false if one or more is still not closed yet bool isAllClosed ( ) { long count = 0; long need = 0; // this one always blocks count += g_collectiondb.save(); need++; count += g_tagdb.getRdb()->isClosed(); need++; count += g_catdb.getRdb()->isClosed(); need++; count += g_indexdb.getRdb()->isClosed(); need++; count += g_datedb.getRdb()->isClosed(); need++; count += g_titledb.getRdb()->isClosed(); need++; count += g_tfndb.getRdb()->isClosed(); need++; count += g_spiderdb.getRdb()->isClosed(); need++; count += g_checksumdb.getRdb()->isClosed(); need++; count += g_clusterdb.getRdb()->isClosed(); need++; count += g_statsdb.getRdb()->isClosed(); need++; count += g_linkdb.getRdb()->isClosed(); need++; count += g_tagdb2.getRdb()->isClosed(); need++; //count += g_catdb2.getRdb()->isClosed(); need++; count += g_indexdb2.getRdb()->isClosed(); need++; count += g_datedb2.getRdb()->isClosed(); need++; count += g_titledb2.getRdb()->isClosed(); need++; count += g_tfndb2.getRdb()->isClosed(); need++; count += g_spiderdb2.getRdb()->isClosed(); need++; count += g_checksumdb2.getRdb()->isClosed(); need++; count += g_clusterdb2.getRdb()->isClosed(); need++; // . the sync file is now saved in g_collectiondb.save() // . this one always blocks //g_sync.close(); // return and wait if not return ( count >= need ); } */ //#include "./libmpm/mp_malloc.h" /* void zlibtest() { char *ptrs[1000]; long lens[1000]; for ( long j = 0 ; j < 220000 ; j++ ) { log("pass=%li",j); Msg0 *m = new (Msg0); delete (m); } return; for ( long j = 0 ; j < 120000 ; j++ ) { log("pass=%li",j); // malloc 1,000 bufs of size about 100-64k each for ( long i = 0 ; i < 100 ; i++ ) { long bufSize = 1000 + (rand() % 65000); ptrs[i] = (char *)mmalloc ( bufSize , "test" ); if ( ! ptrs[i] ) { log("no mem!"); exit(-1); } lens[i] = bufSize; // simple write for ( long k = 0 ; k < bufSize ; k+=900 ) ptrs[i][k] = 'a' + (rand() % 64); } // now free them for ( long i = 0 ; i < 100 ; i++ ) mfree (ptrs[i] , lens[i] , "test" ); } } */ #include "Rdb.h" #include "Xml.h" #include "Tfndb.h" //#include "Checksumdb.h" #include "Threads.h" // // dump routines here now // void dumpTitledb (char *coll,long startFileNum,long numFiles,bool includeTree, long long docid , char justPrintDups , bool justPrintSentences, bool justPrintWords ) { if (!ucInit(g_hostdb.m_dir, true)) { log("Unicode initialization failed!"); return; } // init our table for doing zobrist hashing if ( ! hashinit() ) { log("db: Failed to init hashtable." ); return ; } //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; g_dumpMode = true; //g_conf.m_checksumdbMaxDiskPageCacheMem = 0; //g_conf.m_spiderdbMaxDiskPageCacheMem = 0; g_conf.m_tfndbMaxDiskPageCacheMem = 0; g_titledb.init (); g_collectiondb.init(true); g_titledb.addColl ( coll, false ); key_t startKey ; key_t endKey ; key_t lastKey ; startKey.setMin(); endKey.setMax(); lastKey.setMin(); startKey = g_titledb.makeFirstKey ( docid ); // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; Msg5 msg5; Msg5 msg5b; RdbList list; long long prevId = 0LL; long count = 0; char ttt[2048+MAX_URL_LEN]; HashTableX dedupTable; dedupTable.set(4,0,10000,NULL,0,false,0,"maintitledb"); //g_synonyms.init(); // load the appropriate dictionaries -- why??? 
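	// . every dump routine in this file uses the same scan pattern:
	//   msg5.getList() fills an RdbList for [startKey,endKey] (threads
	//   are disabled above, so it always blocks), we walk the records,
	//   then advance startKey one past the list's last key and loop
	//   until the increment wraps the key space.
	// . a minimal sketch of the advance step, mirroring the code below:
	//
	//   key_t next = *(key_t *)list.getLastKey();
	//   next += (unsigned long) 1;   // one past the last record seen
	//   // if the +1 wrapped around, the whole keyspace has been read
	//   if ( next < *(key_t *)list.getLastKey() ) return;
	//   startKey = next;             // otherwise fetch the next list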
//g_speller.init(); // make this XmlDoc *xd; try { xd = new (XmlDoc); } catch ( ... ) { fprintf(stdout,"could not alloc for xmldoc\n"); exit(-1); } loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_TITLEDB , coll , &list , startKey , endKey , minRecSizes , includeTree , false , // add to cache? 0 , // max cache age startFileNum , numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , // cache key ptr 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &msg5b )){ log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) return; // loop over entries in list for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { key_t k = list.getCurrentKey(); char *rec = list.getCurrentRec(); long recSize = list.getCurrentRecSize(); long long docId = g_titledb.getDocIdFromKey ( k ); //long hostHash = g_titledb.getHostHash ( k ); //long contentHash = g_titledb.getContentHash ( k ); if ( k <= lastKey ) log("key out of order. " "lastKey.n1=%lx n0=%llx " "currKey.n1=%lx n0=%llx ", lastKey.n1,lastKey.n0, k.n1,k.n0); lastKey = k; // print deletes if ( (k.n0 & 0x01) == 0) { fprintf(stdout,"n1=%08lx n0=%016llx docId=%012lli " "(del)\n", k.n1 , k.n0 , docId ); continue; } // free the mem xd->reset(); // uncompress the title rec //TitleRec tr; if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) ) continue; // get this //uint32_t siteHash32 = xd->m_siteHash32; // extract the url Url *u = xd->getFirstUrl(); // MOD to only print root urls //if (!u->isRoot()) continue; // get ip char ipbuf [ 32 ]; strcpy ( ipbuf , iptoa(u->getIp() ) ); // pad with spaces long blen = gbstrlen(ipbuf); while ( blen < 15 ) ipbuf[blen++]=' '; ipbuf[blen]='\0'; //long ext = g_tfndb.makeExt ( u ); //long nc = xd->size_catIds / 4;//tr.getNumCatids(); if ( justPrintDups ) { // print into buf if ( docId != prevId ) { time_t ts = xd->m_spideredTime;//tr.getSpiderDa struct tm *timeStruct = localtime ( &ts ); //struct tm *timeStruct = gmtime ( &ts ); char ppp[100]; strftime(ppp,100,"%b-%d-%Y-%H:%M:%S", timeStruct); LinkInfo *info = xd->ptr_linkInfo1;//tr.ge char foo[1024]; foo[0] = '\0'; //if ( tr.getVersion() >= 86 ) sprintf(foo, //"tw=%li hw=%li upw=%li " "sni=%li ", //(long)xd->m_titleWeight, //(long)xd->m_headerWeight, //(long)xd->m_urlPathWeight, (long)xd->m_siteNumInlinks); char *ru = xd->ptr_redirUrl; if ( ! ru ) ru = ""; sprintf(ttt, "n1=%08lx n0=%016llx docId=%012lli " //hh=%07lx ch=%08lx " //"e=%02lx " "size=%07li " "ch32=%010lu " "clen=%07li " "cs=%04d " "lang=%02d " "sni=%03li " //"cats=%li " "lastspidered=%s " "ip=%s " "numLinkTexts=%04li " "%s" "version=%02li " //"maxLinkTextWeight=%06lu%% " "hc=%li " "redir=%s " "url=%s " "firstdup=1 " "\n", k.n1 , k.n0 , //rec[0] , docId , //hostHash , //contentHash , //(long)ext , recSize - 16 , xd->m_contentHash32, xd->size_utf8Content,//tr.getContentLen xd->m_charset,//tr.getCharset(), xd->m_langId,//tr.getLanguage(), (long)xd->m_siteNumInlinks,//tr.getDo //nc, ppp, iptoa(xd->m_ip),//ipbuf , info->getNumGoodInlinks(), foo, (long)xd->m_version, //ms, (long)xd->m_hopCount, ru, u->getUrl() ); prevId = docId; count = 0; continue; } // print previous docid that is same as our if ( count++ == 0 ) printf ( "\n%s" , ttt ); } // nice, this is never 0 for a titlerec, so we can use 0 to signal // that the following bytes are not compressed, and we can store // out special checksum vector there for fuzzy deduping. 
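	// . a related convention lives in the key itself: like the other
	//   Rdbs here, titledb uses the low bit of k.n0 as the positive/
	//   negative flag, so a cleared bit is a delete record with no
	//   payload to uncompress, which is why the loop above prints
	//   "(del)" and moves on.
	// . a minimal sketch of that test (hypothetical helpers, using only
	//   what the surrounding code already relies on):
	//
	//   bool isNegativeKey ( key_t k ) { return (k.n0 & 0x01) == 0; }
	//   long long docIdOf  ( key_t k ) {
	//           return g_titledb.getDocIdFromKey ( k );
	//   }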
//if ( rec[0] != 0 ) continue; // print it out //printf("n1=%08lx n0=%016llx b=0x%02hhx docId=%012lli sh=%07lx ch=%08lx " // date indexed as local time, not GMT/UTC time_t ts = xd->m_spideredTime;//tr.getSpiderDate(); struct tm *timeStruct = localtime ( &ts ); //struct tm *timeStruct = gmtime ( &ts ); char ppp[100]; strftime(ppp,100,"%b-%d-%Y-%H:%M:%S",timeStruct); //ppp[strlen(ppp)-2]='\0'; /* BEGIN MOD FOR DUMPING STUFF TO RE-LINK ANALYZE LinkInfo *info = tr.getLinkInfo(); long nLinkTexts = info->getNumLinkTexts(); if ( nLinkTexts > 10 ) continue; // continue if spidered after june 14 if ( timeStruct->tm_mon == 5 && // june timeStruct->tm_mday >= 14 ) continue; // get sum of them link texts long sum = 0; char *pp = NULL; long nexti = 0; unsigned long baseScore = 0; for ( long i = 0 ; i < nLinkTexts ; i++ ) { info->getLinkText ( 0 , NULL , // len NULL , // itemPtr NULL , // itemLen &baseScore , NULL , // quality NULL , // numLinks NULL , // docId NULL , // ip &nexti , // nexti &pp );// nextp sum += baseScore; } // skip if not very high scoring // *100/256 to get the percentages seen in PageTitledb.cpp if ( sum < 10000 ) continue; // print it log ( LOG_INFO, "%s %li links sum %li", tr.getUrl()->getUrl(), nLinkTexts , sum ); // continue continue; */ //unsigned long ms = 0; LinkInfo *info = xd->ptr_linkInfo1;//tr.getLinkInfo(); //for ( Inlink*k=NULL;info&&(k=info->getNextInlink(k)); ){ // // returns NULL if none // if ( k->m_baseScore > (long)ms ) ms = k->m_baseScore; //} // normalize //ms = ((long long)ms * 100LL) / 256LL; char foo[1024]; foo[0] = '\0'; //if ( tr.getVersion() >= 86 ) sprintf(foo, //"tw=%li hw=%li upw=%li " "sni=%li ", //(long)xd->m_titleWeight, //(long)xd->m_headerWeight, //(long)xd->m_urlPathWeight, (long)xd->m_siteNumInlinks); char *ru = xd->ptr_redirUrl; if ( ! ru ) ru = ""; fprintf(stdout, "n1=%08lx n0=%016llx docId=%012lli " //hh=%07lx ch=%08lx " //"e=%02lx " "size=%07li " "ch32=%010lu " "clen=%07li " "cs=%04d " "lang=%02d " "sni=%03li " //"cats=%li " "lastspidered=%s " "ip=%s " "numLinkTexts=%04li " "%s" "version=%02li " //"maxLinkTextWeight=%06lu%% " "hc=%li " "redir=%s " "url=%s\n", k.n1 , k.n0 , //rec[0] , docId , //hostHash , //contentHash , //(long)ext , recSize - 16 , xd->m_contentHash32, xd->size_utf8Content,//tr.getContentLen() , xd->m_charset,//tr.getCharset(), xd->m_langId,//tr.getLanguage(), (long)xd->m_siteNumInlinks,//tr.getDocQuality(), //nc, ppp, iptoa(xd->m_ip),//ipbuf , info->getNumGoodInlinks(), foo, (long)xd->m_version, //ms, (long)xd->m_hopCount, ru, u->getUrl() ); // free the mem xd->reset(); //g_mem.printMem(); } startKey = *(key_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey < *(key_t *)list.getLastKey() ) return; goto loop; } void dumpTfndb (char *coll,long startFileNum,long numFiles,bool includeTree , bool verify) { g_dumpMode = true; //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; //g_conf.m_checksumdbMaxDiskPageCacheMem = 0; //g_conf.m_spiderdbMaxDiskPageCacheMem = 0; g_conf.m_tfndbMaxDiskPageCacheMem = 0; g_tfndb.init (); g_collectiondb.init(true); g_tfndb.addColl ( coll, false ); key_t startKey ; key_t endKey ; startKey.setMin(); endKey.setMax(); // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; Msg5 msg5; RdbList list; key_t oldk; oldk.setMin(); loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_TFNDB , coll , &list , startKey , endKey , minRecSizes , includeTree , false , // add to cache? 
			      0            , // max cache age
			      startFileNum ,
			      numFiles     ,
			      NULL         , // state
			      NULL         , // callback
			      0            , // niceness
			      false        )){// err correction?
		log(LOG_LOGIC,"db: getList did not block.");
		return;
	}
	// all done if empty
	if ( list.isEmpty() ) return;
	// loop over entries in list
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		if ( verify ) {
			if ( oldk > k )
				fprintf(stdout,"got bad key order. "
					"%lx/%llx > %lx/%llx\n",
					oldk.n1,oldk.n0,k.n1,k.n0);
			oldk = k;
			continue;
		}
		long long docId = g_tfndb.getDocId ( &k );
		//long e = g_tfndb.getExt ( k );
		long tfn = g_tfndb.getTfn ( &k );
		//long clean = 0 ; if ( g_tfndb.isClean ( k ) ) clean= 1;
		long half = 0 ; if ( k.n0 & 0x02 ) half = 1;
		char *dd = "" ; if ( (k.n0 & 0x01) == 0 ) dd =" (del)";
		fprintf(stdout, "%08lx %016llx docId=%012lli "
			"tfn=%03li half=%li %s\n",
			k.n1,k.n0,docId,tfn,half,dd);
	}
	startKey = *(key_t *)list.getLastKey();
	startKey += (unsigned long) 1;
	// watch out for wrap around
	if ( startKey < *(key_t *)list.getLastKey() ) return;
	goto loop;
}

void dumpWaitingTree (char *coll ) {
	RdbTree wt;
	if (!wt.set(0,-1,true,20000000,true,"waittree2",
		    false,"waitingtree",sizeof(key_t)))return;
	collnum_t collnum = g_collectiondb.getCollnum ( coll );
	// make dir
	char dir[500];
	sprintf(dir,"%scoll.%s.%li",g_hostdb.m_dir,coll,(long)collnum);
	// load in the waiting tree, IPs waiting to get into doledb
	BigFile file;
	file.set ( dir , "waitingtree-saved.dat" , NULL );
	bool treeExists = file.doesExist() > 0;
	// load the table with file named "THISDIR/saved"
	RdbMem wm;
	if ( treeExists && ! wt.fastLoad(&file,&wm) ) return;
	// scan the waiting tree
	long node = wt.getFirstNode();
	for ( ; node >= 0 ; node = wt.getNextNode(node) ) {
		// breathe
		QUICKPOLL(MAX_NICENESS);
		// get key
		key_t *key = (key_t *)wt.getKey(node);
		// get ip from that
		long firstIp = (key->n0) & 0xffffffff;
		// get the time
		unsigned long long spiderTimeMS = key->n1;
		// shift up
		spiderTimeMS <<= 32;
		// or in
		spiderTimeMS |= (key->n0 >> 32);
		// get the rest of the data
		fprintf(stdout,"time=%llu firstip=%s\n",
			spiderTimeMS,iptoa(firstIp));
	}
}

void dumpDoledb (char *coll,long startFileNum,long numFiles,bool includeTree){
	g_dumpMode = true;
	//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
	//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
	//g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
	//g_conf.m_doledbMaxDiskPageCacheMem = 0;
	g_doledb.init ();
	g_collectiondb.init(true);
	g_doledb.addColl ( coll, false );
	key_t startKey ;
	key_t endKey   ;
	startKey.setMin();
	endKey.setMax();
	// turn off threads
	g_threads.disableThreads();
	// get a meg at a time
	long minRecSizes = 1024*1024;
	Msg5 msg5;
	RdbList list;
	key_t oldk; oldk.setMin();
 loop:
	// use msg5 to get the list, should ALWAYS block since no threads
	if ( ! msg5.getList ( RDB_DOLEDB   ,
			      coll         ,
			      &list        ,
			      startKey     ,
			      endKey       ,
			      minRecSizes  ,
			      includeTree  ,
			      false        , // add to cache?
			      0            , // max cache age
			      startFileNum ,
			      numFiles     ,
			      NULL         , // state
			      NULL         , // callback
			      0            , // niceness
			      false        )){// err correction?
		log(LOG_LOGIC,"db: getList did not block.");
		return;
	}
	// all done if empty
	if ( list.isEmpty() ) return;
	// loop over entries in list
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		if ( oldk > k )
			fprintf(stdout,"got bad key order. 
" "%lx/%llx > %lx/%llx\n", oldk.n1,oldk.n0,k.n1,k.n0); oldk = k; // get it char *drec = list.getCurrentRec(); // sanity check if ( (drec[0] & 0x01) == 0x00 ) {char *xx=NULL;*xx=0; } // get spider rec in it char *srec = drec + 12 + 4; // print doledb info first then spider request fprintf(stdout,"dolekey=%s (n1=%lu n0=%llu) " "pri=%li " "spidertime=%lu " "uh48=0x%llx\n", KEYSTR(&k,12), k.n1, k.n0, (long)g_doledb.getPriority(&k), g_doledb.getSpiderTime(&k), g_doledb.getUrlHash48(&k)); fprintf(stdout,"spiderkey="); // print it g_spiderdb.print ( srec ); // the \n printf("\n"); // must be a request -- for now, for stats if ( ! g_spiderdb.isSpiderRequest((key128_t *)srec) ) { char *xx=NULL;*xx=0; } // cast it SpiderRequest *sreq = (SpiderRequest *)srec; // skip negatives if ( (sreq->m_key.n0 & 0x01) == 0x00 ) { char *xx=NULL;*xx=0; } } startKey = *(key_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey < *(key_t *)list.getLastKey() ) return; goto loop; } // . dataSlot fo the hashtable for spider stats in dumpSpiderdb // . key is firstip class UStat { public: // for spider requests: long m_numRequests; long m_numRequestsWithReplies; long m_numWWWRoots; long m_numNonWWWRoots; long m_numHops1; long m_numHops2; long m_numHops3orMore; long m_ageOfYoungestSpideredRequest; long m_ageOfOldestUnspideredRequest; long m_ageOfOldestUnspideredWWWRootRequest; // for spider replies: long m_numGoodReplies; long m_numErrorReplies; }; static HashTableX g_ut; void addUStat1 ( SpiderRequest *sreq, bool hadReply , long now ) { long firstIp = sreq->m_firstIp; // lookup long n = g_ut.getSlot ( &firstIp ); UStat *us = NULL; UStat tmp; if ( n < 0 ) { us = &tmp; memset(us,0,sizeof(UStat)); g_ut.addKey(&firstIp,us); us = (UStat *)g_ut.getValue ( &firstIp ); } else { us = (UStat *)g_ut.getValueFromSlot ( n ); } long age = now - sreq->m_addedTime; // inc the counts us->m_numRequests++; if ( hadReply) us->m_numRequestsWithReplies++; if ( sreq->m_hopCount == 0 ) { if ( sreq->m_isWWWSubdomain ) us->m_numWWWRoots++; else us->m_numNonWWWRoots++; } else if ( sreq->m_hopCount == 1 ) us->m_numHops1++; else if ( sreq->m_hopCount == 2 ) us->m_numHops2++; else if ( sreq->m_hopCount >= 3 ) us->m_numHops3orMore++; if ( hadReply ) { if (age < us->m_ageOfYoungestSpideredRequest || us->m_ageOfYoungestSpideredRequest == 0 ) us->m_ageOfYoungestSpideredRequest = age; } if ( ! hadReply ) { if (age > us->m_ageOfOldestUnspideredRequest || us->m_ageOfOldestUnspideredRequest == 0 ) us->m_ageOfOldestUnspideredRequest = age; } if ( ! hadReply && sreq->m_hopCount == 0 && sreq->m_isWWWSubdomain ) { if (age > us->m_ageOfOldestUnspideredWWWRootRequest || us->m_ageOfOldestUnspideredWWWRootRequest == 0 ) us->m_ageOfOldestUnspideredWWWRootRequest = age; } } void addUStat2 ( SpiderReply *srep , long now ) { long firstIp = srep->m_firstIp; // lookup long n = g_ut.getSlot ( &firstIp ); UStat *us = NULL; UStat tmp; if ( n < 0 ) { us = &tmp; memset(us,0,sizeof(UStat)); g_ut.addKey(&firstIp,us); us = (UStat *)g_ut.getValue ( &firstIp ); } else { us = (UStat *)g_ut.getValueFromSlot ( n ); } //long age = now - srep->m_spideredTime; // inc the counts if ( srep->m_errCode ) us->m_numErrorReplies++; else us->m_numGoodReplies++; } long dumpSpiderdb ( char *coll, long startFileNum , long numFiles , bool includeTree , char printStats , long firstIp ) { if ( startFileNum < 0 ) { log(LOG_LOGIC,"db: Start file number is < 0. 
Must be >= 0."); return -1; } if ( printStats == 1 ) { //g_conf.m_maxMem = 2000000000LL; // 2G //g_mem.m_maxMem = 2000000000LL; // 2G if ( ! g_ut.set ( 4, sizeof(UStat), 10000000, NULL, 0,0,false,"utttt") ) return -1; } //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; //g_conf.m_checksumdbMaxDiskPageCacheMem = 0; //g_conf.m_spiderdbMaxDiskPageCacheMem = 0; g_conf.m_tfndbMaxDiskPageCacheMem = 0; g_spiderdb.init (); g_collectiondb.init(true); g_spiderdb.addColl ( coll, false ); key128_t startKey ; key128_t endKey ; startKey.setMin(); endKey.setMax(); // start based on firstip if non-zero if ( firstIp ) { startKey = g_spiderdb.makeFirstKey ( firstIp ); endKey = g_spiderdb.makeLastKey ( firstIp ); } //long t1 = 0; //long t2 = 0x7fffffff; // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; Msg5 msg5; RdbList list; // clear before calling Msg5 g_errno = 0; // init stats vars long negRecs = 0; long emptyRecs = 0; long uniqDoms = 0; // count urls per domain in "domTable" HashTable domTable; domTable.set ( 1024*1024 ); // count every uniq domain per ip in ipDomTable (uses dup keys) HashTableX ipDomTable; // allow dups? true! ipDomTable.set ( 4,4,5000000 , NULL, 0, true ,0, "ipdomtbl"); // count how many unique domains per ip HashTable ipDomCntTable; ipDomCntTable.set ( 1024*1024 ); // buffer for holding the domains long bufSize = 1024*1024; char *buf = (char *)mmalloc(bufSize,"spiderstats"); long bufOff = 0; long count = 0; long countReplies = 0; long countRequests = 0; long long offset = 0LL; long now; static long long s_lastRepUh48 = 0LL; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_SPIDERDB , coll , &list , (char *)&startKey , (char *)&endKey , minRecSizes , includeTree , false , // add to cache? 0 , // max cache age startFileNum , numFiles , NULL , // state NULL , // callback 0 , // niceness false )){// err correction? log(LOG_LOGIC,"db: getList did not block."); return -1; } // all done if empty if ( list.isEmpty() ) goto done; // this may not be in sync with host #0!!! now = getTimeLocal(); // loop over entries in list for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { // get it char *srec = list.getCurrentRec(); // save it long long curOff = offset; // and advance offset += list.getCurrentRecSize(); // must be a request -- for now, for stats if ( ! g_spiderdb.isSpiderRequest((key128_t *)srec) ) { // print it if ( ! printStats ) { printf( "offset=%lli ",curOff); g_spiderdb.print ( srec ); printf("\n"); } // its a spider reply SpiderReply *srep = (SpiderReply *)srec; // store it s_lastRepUh48 = srep->getUrlHash48(); countReplies++; // get firstip if ( printStats == 1 ) addUStat2 ( srep , now ); continue; } // cast it SpiderRequest *sreq = (SpiderRequest *)srec; countRequests++; long long uh48 = sreq->getUrlHash48(); // count how many requests had replies and how many did not bool hadReply = ( uh48 == s_lastRepUh48 ); // get firstip if ( printStats == 1 ) addUStat1 ( sreq , hadReply , now ); // print it if ( ! 
printStats ) {
			printf( "offset=%lli ",curOff);
			g_spiderdb.print ( srec );
			printf(" age=%lis",now-sreq->m_addedTime);
			printf(" hadReply=%li\n",(long)hadReply);
		}
		// print a counter
		if ( ((count++) % 100000) == 0 )
			fprintf(stderr,"Processed %li records.\n",count-1);
		if ( printStats != 2 ) continue;
		// skip negatives
		if ( (sreq->m_key.n0 & 0x01) == 0x00 ) continue;
		// skip bogus shit
		if ( sreq->m_firstIp == 0 || sreq->m_firstIp==-1 ) continue;
		// shortcut
		long domHash = sreq->m_domHash32;
		// . is it in the domain table?
		// . keeps count of how many urls per domain
		long slot = domTable.getSlot ( domHash );
		if ( slot >= 0 ) {
			long off = domTable.getValueFromSlot ( slot );
			// just inc the count for this domain
			*(long *)(buf + off) = *(long *)(buf + off) + 1;
			continue;
		}
		// get the domain
		long domLen = 0;
		char *dom = getDomFast ( sreq->m_url , &domLen );
		// always need enough room...
		if ( bufOff + 4 + domLen + 1 >= bufSize ) {
			// double the buffer size
			long growth = bufSize;
			// limit growth to 10MB each time
			if ( growth > 10*1024*1024 ) growth = 10*1024*1024;
			long newBufSize = bufSize + growth;
			char *newBuf = (char *)mrealloc( buf , bufSize ,
							 newBufSize,
							 "spiderstats");
			if ( ! newBuf ) return -1;
			// re-assign
			buf = newBuf;
			bufSize = newBufSize;
		}
		// otherwise add it, it is a new never-before-seen domain
		//char poo[999];
		//memcpy ( poo , dom , domLen );
		//poo[domLen]=0;
		//fprintf(stderr,"new dom %s hash=%li\n",dom,domHash);
		// store the count of urls followed by the domain
		char *ptr = buf + bufOff;
		*(long *)ptr = 1;
		ptr += 4;
		memcpy ( ptr , dom , domLen );
		ptr += domLen;
		*ptr = '\0';
		// use an ip of 1 if it is 0 so it hashes right
		long useip = sreq->m_firstIp; // ip;
		// can't use 1 because it all clumps up!!
		//if ( ip == 0 ) useip = domHash ;
		// this table counts how many urls per domain, as
		// well as stores the domain
		if ( ! domTable.addKey (domHash , bufOff) ) return -1;
		// . if this is the first time we've seen this domain,
		//   add it to the ipDomTable
		// . this hash table must support dups.
		// . we need to print out all the domains for each ip
		if ( ! ipDomTable.addKey ( &useip , &bufOff ) ) return -1;
		// . this table counts how many unique domains per ip
		// . it is kind of redundant since we have ipDomTable
		long ipCnt = ipDomCntTable.getValue ( useip );
		if ( ipCnt < 0 ) ipCnt = 0;
		if ( ! ipDomCntTable.addKey ( useip, ipCnt+1) ) return -1;
		// advance to next empty spot
		bufOff += 4 + domLen + 1;
		// count unique domains
		uniqDoms++;
	}
	startKey = *(key128_t *)list.getLastKey();
	startKey += (unsigned long) 1;
	// watch out for wrap around
	if ( startKey >= *(key128_t *)list.getLastKey() ) goto loop;
 done:
	// print out the stats
	if ( ! 
printStats ) return 0; // print UStats now if ( printStats == 1 ) { for ( long i = 0 ; i < g_ut.getNumSlots();i++ ) { if ( g_ut.m_flags[i] == 0 ) continue; UStat *us = (UStat *)g_ut.getValueFromSlot(i); long firstIp = *(long *)g_ut.getKeyFromSlot(i); fprintf(stdout,"%s ", iptoa(firstIp)); fprintf(stdout,"requests=%li ", us->m_numRequests); fprintf(stdout,"wwwroots=%li ", us->m_numWWWRoots); fprintf(stdout,"nonwwwroots=%li ", us->m_numNonWWWRoots); fprintf(stdout,"1hop=%li ", us->m_numHops1); fprintf(stdout,"2hop=%li ", us->m_numHops2); fprintf(stdout,"3hop+=%li ", us->m_numHops3orMore); fprintf(stdout,"mostrecentspider=%lis ", us->m_ageOfYoungestSpideredRequest); fprintf(stdout,"oldestunspidered=%lis ", us->m_ageOfOldestUnspideredRequest); fprintf(stdout,"oldestunspideredwwwroot=%li ", us->m_ageOfOldestUnspideredWWWRootRequest); fprintf(stdout,"spidered=%li ", us->m_numRequestsWithReplies); fprintf(stdout,"goodspiders=%li ", us->m_numGoodReplies); fprintf(stdout,"errorspiders=%li", us->m_numErrorReplies); fprintf(stdout,"\n"); } return 0; } long uniqIps = ipDomCntTable.getNumSlotsUsed(); // print out all ips, and # of domains they have and list of their // domains long nn = ipDomTable.getNumSlots(); // i is the bucket to start at, must be EMPTY! long i = 0; // count how many buckets we visit long visited = 0; // find the empty bucket for ( i = 0 ; i < nn ; i++ ) if ( ipDomTable.m_flags[i] == 0 ) break; //if ( ipDomTable.getKey(i) == 0 ) break; // now we can do our scan of the ips. there can be dup ips in the // table so we must chain for each one we find for ( ; visited++ < nn ; i++ ) { // wrap it if ( i == nn ) i = 0; // skip empty buckets if ( ipDomTable.m_flags[i] == 0 ) continue; // get ip of the ith slot long ip = *(long *)ipDomTable.getKeyFromSlot(i); // get it in the ip table, if not there, skip it long domCount = ipDomCntTable.getValue ( ip ) ; if ( domCount == 0 ) continue; // log the count long useip = ip; if ( ip == 1 ) useip = 0; fprintf(stderr,"%s has %li domains.\n",iptoa(useip),domCount); // . how many domains on that ip, print em out // . 
use j for the inner loop
		long j = i;
		// buf for printing ip
		char ipbuf[64];
		sprintf (ipbuf,"%s",iptoa(useip) );
	jloop:
		long ip2 = *(long *)ipDomTable.getKeyFromSlot ( j ) ;
		if ( ip2 == ip ) {
			// get count
			long off = *(long *)ipDomTable.getValueFromSlot ( j );
			char *ptr = buf + off;
			long cnt = *(long *)ptr;
			char *dom = buf + off + 4;
			// print: "IP Domain urlCountInDomain"
			fprintf(stderr,"%s %s %li\n",ipbuf,dom,cnt);
			// advance && wrap
			if ( ++j >= nn ) j = 0;
			// keep going
			goto jloop;
		}
		// not an empty bucket, so keep chaining
		if ( ip2 != 0 ) {
			// advance & wrap
			if ( ++j >= nn ) j = 0;
			// keep going
			goto jloop;
		}
		// ok, we are done, do not do this ip any more
		ipDomCntTable.removeKey(ip);
	}
	if ( negRecs )
		fprintf(stderr,"There are %li total negative records.\n",
			negRecs);
	if ( emptyRecs )
		fprintf(stderr,"There are %li total empty records.\n",
			emptyRecs);
	//fprintf(stderr,"There are %li total urls.\n",count);
	fprintf(stderr,"There are %li total records.\n",count);
	fprintf(stderr,"There are %li total request records.\n",countRequests);
	fprintf(stderr,"There are %li total reply records.\n",countReplies);
	// end with total uniq domains
	fprintf(stderr,"There are %li unique domains.\n",uniqDoms);
	// and with total uniq ips in this priority
	fprintf(stderr,"There are %li unique IPs.\n",uniqIps);
	return 0;
}

/*
static bool makeNewTitleRecKey ( char *rec , long recSize , key_t *newk ,
				 TitleRec *tr , long long *h ) ;

// how big can a compressed title record be?
#define MAX_TR_SIZE (2*1024*1024)

// returns false and sets g_errno on error
bool makeNewTitleRecKey ( char *rec , long recSize , key_t *newk ,
			  TitleRec *tr , long long *h ) {
	// if uncompress failed, just keep looping if ( ! xd->set
	if ( ! tr->set ( rec , MAX_TR_SIZE , false ) )
		return log("db: TitleRec uncompress failed. continuing.");
	// get hashes
	Xml xml;
	//CrashMe();
	xml.set ( tr->getCharset(),tr->getContent() ,
		  tr->getContentLen() ,
		  false, 0, false, tr->getVersion() );
	*h = g_checksumdb.getContentHash ( &xml,tr->getUrl(),
					   tr->getLinkInfo(),
					   tr->getVersion(),
					   0);// niceness
	long contentHash = (long)*h;
	long hostHash = hash32 (tr->getUrl()->getHost() ,
				tr->getUrl()->getHostLen() );
	// remake the key with these hashes in the low bits
	*newk = g_titledb.makeTitleRecKey ( tr->getDocId() ,
					    false , // del key?
					    hostHash ,
					    contentHash ,
					    false ,  // adult bit is false
					    false ); // adult category is false
	return true;
}
*/

/*
bool addToTfndb ( char *coll , TitleRec *tr , long id2 ) {
	// add to tfndb if we should
	long e = g_tfndb.makeExt ( tr->getUrl() );
	key_t k = g_tfndb.makeKey ( tr->getDocId(), e,
				    id2 , // tfn
				    false , false ); // clean? del?
	// get the rdb of the tfndb
	Rdb *r = g_tfndb.getRdb();
	// do a non-blocking dump of tree if it's 90% full now
	if (r->m_mem.is90PercentFull() || r->m_tree.is90PercentFull()){
		if ( ! r->dumpTree ( 0 ) ) // niceness
			return log("db: addToTfndb: dumpTree failed" );
	}
	// returns false and sets g_errno on error
	if ( ! r->addRecord ( coll, k , NULL , 0 , 0) )
		return log("db: addToTfndb: addRecord: %s",
			   mstrerror(g_errno));
	return true;
}

bool addToTfndb2 ( char *coll , SpiderRec *sr , long id2 ) {
	// add to tfndb if we should
	long e = g_tfndb.makeExtQuick ( sr->getUrl() );
	long long d = g_titledb.getProbableDocId ( sr->getUrl() );
	key_t k = g_tfndb.makeKey ( d, e,
				    255 , // tfn
				    0 , false ); // is clean?, del?
	// get the rdb of the tfndb
	Rdb *r = g_tfndb.getRdb();
	// do a non-blocking dump of tree if it's 90% full now
	if (r->m_mem.is90PercentFull() || r->m_tree.is90PercentFull()){
		if ( ! 
r->dumpTree ( 0 ) ) // niceness return log("db: addToTfndb2: dumpTree failed" ); } // returns false and sets g_errno on error if ( ! r->addRecord ( coll, k , NULL , 0 , 0) ) return log("db: addToTfndb2: addRecord: %s",mstrerror(g_errno)); return true; } */ //Need these two if tr's in addtospiderdb are getting their quality from // their root urls. /*bool loadRootUrls ( char *filename){ File f; f.set ( filename ); // open files if ( ! f.open ( O_RDONLY ) ) { log("init: Rooturls open: %s %s",filename,mstrerror(g_errno)); return 0; } // get file size long fileSize = f.getFileSize() ; //init hashtable to lets say 1 mil // store a \0 at the end long bufSize = fileSize + 1; // make buffers to hold all char *buf = (char *) mmalloc ( bufSize , "RootUrls" ); if ( ! buf) { log("init: Rooturls mmalloc: %s",mstrerror(errno)); return 0; } //char *bufEnd = buf + bufSize; // set m_p1 char *p = buf; char *pend = buf + bufSize - 1; // read em all in if ( ! f.read ( buf , fileSize , 0 ) ) { log("init: Rooturls read: %s %s",filename,mstrerror(g_errno)); return 0; } //Close the file, no need to waste mem f.close(); // making all the \n's \0's for (long i=0; igetSpiderDate(); // get length //long collLen = gbstrlen(coll); // base priority on # of path components //unsigned char priority = tr->getSpiderPriority(); //long npc = tr->getUrl()->getPathDepth(); // count ending file name as a path component //if ( tr->getUrl()->getFilenameLen() > 0 ) npc++; // count cgi crap as one path component //if ( tr->getUrl()->isCgi() ) npc++; //if ( npc <= 5 ) priority = 19 - npc; //else priority = 0; // spammers love to create millions of hostnames on the same domain //if ( ! tr->getUrl()->isSuperRoot() ) npc++; // if more than 10 linkers, make it 5 // hey, doesn't his count internal linkers, too? skip it then // higher quality pages get higher priority, too // MOD for GK cluster // But don't get the quality from the titleRec. Since for gk // the titlerecs do not have the right quality, get the // root urls quality. For that loadRootUrls() must have already // been called by gendbs. //long q = tr->getDocQuality(); //if ( q > 50 && priority < 13 ) priority = 13; //if ( q > 70 && priority < 14 ) priority = 14; //if ( q > 85 && priority < 15 ) priority = 15; // // BEGIN SPECIAL CODE FOR FIXING SCORING BUG // // only get older versions before the fix //if ( tr->getVersion() >= 49 ) return true; // quick estimate of words, this works fast and well!! // see *exactly* how many words we have here // temp filter, only add big ones because they are the ones // that are messing us up the most //if ( tr->getContentLen() < 40000 ) return true; // temp hack //priority = 6; // // END SPECIAL CODE FOR FIXING SCORING BUG // key_t k = g_spiderdb.makeSpiderRecKey ( tr->getDocId() , tr->getNextSpiderDate(), // date , tr->getNextSpiderPriority() , // priority, 0, false, false , // this is now obsolete true ); //!tr->isSpiderLinksFalse()); // sanity check if ( getGroupId(RDB_SPIDERDB,&k) != g_hostdb.m_groupId ) { log("spider key is wrong groupId"); char *xx = NULL; *xx = 0; } // add to spiderdb now SpiderRec sr; sr.set ( tr->getUrl () , coll , // tr->getColl () , gbstrlen(coll) , // tr->getCollLen () , tr->getNextSpiderDate() , tr->getNextSpiderPriority() , // priority , 0 , // retryNum false , // forced? false , // is new? -1 , // url, not docId based false , // forceDelete? 
-1 , // ruleset tr->getIp() , // ip tr->getIp() , // sanityIp tr->getDocQuality() , // docQuality tr->getHopCount() );// hopCount if ( sr.getStoredSize () > 2048 ) return log("db: makespiderdb: could not store %s", tr->getUrl()->getUrl()); // serialize into buf char buf [ 4096 ]; long recSize = sr.store ( buf , 2048 ); // get the rdb of the spiderdb Rdb *r = g_spiderdb.getRdb(); // do a non-blocking dump of tree if it's 90% full now if (r->m_mem.is90PercentFull() || r->m_tree.is90PercentFull()){ if ( ! r->dumpTree ( 0 ) ) // niceness return log("db: makespiderdb: dumpTree failed" ); } // returns false and sets g_errno on error if ( ! r->addRecord ( coll , k , buf , recSize , 0) ) return log("db: addToSpiderdb: addRecord: %s", mstrerror(g_errno)); return true; } BigFile s_cf [ MAX_HOSTS ]; long long s_cfoff [ MAX_HOSTS ] ; // file offsets static bool s_cdbInit = false; bool addToChecksumdb ( char *coll , TitleRec *tr ) { // we have to make multiple checksumdbs since we won't store // all of them locally ourselves if ( ! s_cdbInit ) { // open up one checksumdb FLAT file for each group long ng = g_hostdb.getNumShards(); for ( long i = 0 ; i < ng ; i++ ){ char name[64]; // . initialize our own internal rdb // . the %lx in "g%li" is the group NUM to which the // keys in this file belong, the "h%li" is the host // number that generated these keys sprintf(name,"checksumg%lih%lidb",i,g_hostdb.m_hostId); // unlink this file just in case s_cf[i].set ( g_hostdb.m_dir , name ); s_cf[i].unlink(); if ( ! s_cf[i].open ( O_RDWR | O_CREAT ) ) return log("db: addToChecksumdb: cannot open %s", name); s_cfoff[i] = 0LL; } s_cdbInit = true; } //key_t k ; long cKeySize = g_conf.m_checksumdbKeySize; char k[16]; // this fails on out of memory to set the Xml class. //if ( ! tr->getChecksumKey(&k) ) //if ( ! tr->getChecksumKey(k) ) // return log("db: addToChecksumdb: getChecksumKey failed: %s.", // mstrerror(g_errno)); TitleRec *otr = tr; // // get the checksumdb key just like we get it in Msg16.cpp!! // TODO: store in title rec // Xml xml; if ( ! xml.set ( tr->getCharset() , tr->getContent() , tr->getContentLen() , false , 0, false , tr->getVersion() , true , // setParentArgs MAX_NICENESS) ) return log("db: addToChecksumdb: getChecksumKey failed: %s.", mstrerror(g_errno)); // MDW: we should have the xml already parsed here! //Xml *xml = m_oldDoc.getXmlDoc()->getXml(); long long h; // get link infos LinkInfo *linkInfo = otr->getLinkInfo (); //LinkInfo *linkInfo2 = otr->getLinkInfo2(); h = g_checksumdb.getContentHash ( &xml , otr->getUrl() , linkInfo , otr->getVersion() , MAX_NICENESS ); // get our doc's link-adjusted quality char newQuality = otr->getDocQuality(); // make the OLD dup key char oldk[16]; g_checksumdb.makeDedupKey ( otr->getUrl() , h , otr->getDocId() , otr->getVersion () , false , //del newQuality , oldk ); // from Msg1.cpp:55 unsigned long groupId = getGroupId ( RDB_CHECKSUMDB , &k ); long dbnum = g_hostdb.makeHostId ( groupId ); log(LOG_INFO,"mila groupId= %lu hostId=%ld", groupId,dbnum); // debug msg //log("db: %08lx %016llx %s",k.n1,k.n0,url->getUrl()); // add to the appropriate checksumdb slice //if ( ! s_cf[dbnum].write ( &k , sizeof(key_t), s_cfoff[dbnum] ) ) if ( ! 
s_cf[dbnum].write ( k , cKeySize, s_cfoff[dbnum] ) ) return log("db: addToChecksumdb: write checksumdb failed"); //s_cfoff[dbnum] += sizeof(key_t); s_cfoff[dbnum] += cKeySize; return true; } */ /* bool mergeChecksumFiles ( ) { // if main checksumdb file already exists, do not do merge BigFile f; f.set (g_hostdb.m_dir,"checksumdb-saved.dat"); if ( f.doesExist() ) return true; f.set (g_hostdb.m_dir,"checksumdb0001.dat"); if ( f.doesExist() ) return true; // disable threads so everything is blocking g_threads.disableThreads(); // open up one checksumdb FLAT file for each group bool flag = false; long long count = 0; long ng = g_hostdb.getNumShards(); for ( long i = 0 ; i < ng ; i++ ) { // . initialize our own internal rdb // . the %lx in "g%lx" is the group id to which the keys // in this file belong, the "h%li" is the host number that // generated these keys // . g_hostdb.m_hostId is also our group NUM char name[64]; sprintf(name,"checksumg%lih%lidb",g_hostdb.m_hostId,i); f.set (g_hostdb.m_dir,name); // if file does not exist then do not do any merging if ( ! f.doesExist() ) continue; // otherwise, we're doing a merge, so announce it if ( ! flag ) { flag = true; log("db: *-*-*-* mergeChecksumdbs: merging " "%s/checksumg*h*db* files",g_hostdb.m_dir ); } // open just for reading if ( ! f.open ( O_RDONLY ) ) { g_threads.enableThreads(); return log("db: mergeChecksumFiles: cannot open %s", name); } // mention it log("db: mergeChecksumdbs: merging %s",name); long long off = 0LL; // now add them one at a time to our g_checksumdb //key_t k; long cKeySize = g_conf.m_checksumdbKeySize; char k[16]; // how big is the file? long long fileSize = f.getFileSize(); loop: //if ( ! f.read ( &k, sizeof(key_t) , off ) ) { if ( ! f.read ( k , cKeySize , off ) ) { g_threads.enableThreads(); return log("db: mergeChecksumFiles: %s off=%lli " "read failed", name, off ); } //off += sizeof(key_t); off += cKeySize; Rdb *r = g_checksumdb.getRdb(); count++; // do a non-blocking dump of tree if it's 90% full now if (r->m_mem.is90PercentFull() || r->m_tree.is90PercentFull()){ if ( ! r->dumpTree ( 0 ) ) {// niceness g_threads.enableThreads(); return log("db: mergeChecksums: dumpTree failed" ); } } // returns false and sets g_errno on error. finalmerge=coll if ( ! r->addRecord ( "finalmerge", k , NULL , 0 , 0) ) { g_threads.enableThreads(); return log("db: mergeChecksums: addRecord: %s", mstrerror(g_errno)); } // loop if more to go if ( off < fileSize ) goto loop; // otherwise, we're done with this file, do next one f.close(); } // save g_checksumdb g_checksumdb.getRdb()->close ( NULL, NULL, true, false ); // announce it log("db: *-*-*-* mergeChecksumdbs: merge complete. added %lli keys to " "checksumdb.",count); g_threads.enableThreads(); return true; } */ /* // . returns false and sets g_errno on error, true on success // . some temp code to convert our key format to the new key format // . 
can also be used to regenerate tfndb and checksumdb bool fixTitleRecs( char *coll ) { RdbBase *tbase = getRdbBase ( RDB_TITLEDB , coll ); bool flag = true; bool doChecksumdb = true ; bool doTfndb = true ; bool doSpiderdb = true ; // disable threads so everything is blocking g_threads.disableThreads(); // but if titledb has more than 1 file on disk, they need to be merged // so we can re-write the keys without fear of encountering deletes // for which we cannot compute the site or content hashes to make // the new titleRec key if ( tbase->getNumFiles() > 1 ) return log("fixTitleRecs: more than one titledb file " "found"); collnum_t collnum = g_collectiondb.getCollnum ( coll ); key_t k; char *rec; long recSize; TitleRec tr; key_t newk; long long h; bool isNegative = false; long count = 0; // change the keys of TitleRecs in the RdbTree RdbTree *tt = &g_titledb.getRdb()->m_tree; // how many nodes in title rec tree? long nn = tt->getNumNodes(); // debug msg log("db: *-*-*-* Converting %li title rec keys in tree.",nn); if ( doChecksumdb ) log("db: *-*-*-* Generating checksumdb"); if ( doTfndb ) log("db: *-*-*-* Generating tfndb"); if ( doSpiderdb ) log("db: *-*-*-* Generating spiderdb"); // make sure tree is good //if ( ! tt->checkTree ( true ) ) return false; // get id2 of titledb long id2 = tbase->m_fileIds2[0]; // loop through all the nodes, go by k for ( long i = 0 ; i < nn ; i++ ) { // skip if empty if ( tt->m_parents[i] == -2 ) continue; // get his key k = *(key_t *)tt->getKey(i); // declare these up here since we have a "goto skip" RdbList tlist; Msg5 msg5; Msg5 msg5b; key_t startKey ; key_t endKey ; // positives are easy if ( (k.n0 & 0x01) == 0x01 ) { if(!tt->getList(collnum,k,k,10,&tlist,NULL,NULL,false)) return log("getlist failed"); if ( tlist.isExhausted() ) { log("db: getlist failed 2 " "i=%li n1=%lx n0=%llx. continuing.", i,k.n1,k.n0); continue; } tlist.resetListPtr(); goto skip; } // get this rec and its positive, if any startKey = k; endKey = k; endKey.n0 |= 0x01; // look it up, block if ( ! msg5.getList ( RDB_TITLEDB , coll , &tlist , startKey , endKey , 8000 , // minRecSizes false , // includeTree? false , // addToCache? 0 , // max cache age 0 , // startFileNum -1 , // numFiles (-1 =all) NULL , // state NULL , // callback 2 , // niceness false ,// error correction? NULL , // cache key ptr 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &msg5b )) return log(LOG_LOGIC,"db: getList did not block."); // . if the negative has no positive, list will NOT be empty // . this also happens if negative key has been converted in // the tree, but positive key on disk have not been... if ( ! tlist.isExhausted() ) { long long d = g_titledb.getDocIdFromKey ( k ); log("db: docId %lli has negative but no positive",d); continue; } isNegative = true; skip: // make tr rec = tlist.getCurrentRec(); // get new key, skip if set failed if ( ! makeNewTitleRecKey ( rec,MAX_TR_SIZE, &newk,&tr,&h ) ) { log("db: tree node titleRec set failed. continuing."); continue; } // if positive, save checksumdb, tfndb and spiderdb if ( ! isNegative ) { if ( doTfndb && ! addToTfndb (coll,&tr,id2)) return false; if ( doSpiderdb && ! addToSpiderdb (coll,&tr)) return false; if ( doChecksumdb && ! addToChecksumdb(coll,&tr)) return false; // log every 100 or so if ( count % 100 == 0 ) log("db: #%li %s",count,xd.ptr_firstUrl); count++; } // if already processed, skip it! 
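// (reminder: the low bit of n0 flags positives; 1 = positive record,
//  0 = delete. makeNewTitleRecKey() always builds the positive form,
//  so for a negative key we clear that bit again below before writing
//  the new key back into the tree.)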
if ( newk == k ) continue; // make negative again if ( isNegative ) { newk.n0 &= 0xfffffffffffffffeLL; isNegative = false; ((key_t *)(tt->m_keys))[i] = newk; continue; } // change the key, should not affect the ordering ((key_t *)(tt->m_keys))[i] = newk; } // save the converted tree log("db: *-*-*-* Saving titledb-saved.dat"); tt->fastSave ( g_hostdb.m_dir , "titledb" , false , NULL , NULL ); // open the file of TitleRecs, should only be one of them BigFile f; f.set ( g_hostdb.m_dir , "titledb0001.dat" ); if ( ! f.open ( O_RDWR | O_TRUNC ) ) return log("fixTitleRecs: open: %s", mstrerror(g_errno)); f.setBlocking ( ); long long off = 0; // get one rec at a time and store in this buffer char *buf = (char *)mmalloc ( MAX_TR_SIZE , "main"); if ( ! buf ) return log("fixTitleRecs: malloc failed"); long long fsize = f.getFileSize(); if ( fsize <= 0 ) { mfree ( buf , MAX_TR_SIZE , "main" ); return log("filesize of %s is %lli", f.getFilename(),fsize); } long ng = g_hostdb.getNumShards(); // save the old map, do not overwrite any old one log("db: *-*-*-* Moving old titledb0001.map to titledb0001.map.old"); sprintf ( buf , "mv -i %s/titledb0001.map %s/titledb0001.map.old", g_hostdb.m_dir,g_hostdb.m_dir); system ( buf ); // get the old map in memory RdbMap *m = tbase->getMaps()[0]; // make a new map for the converted titledb //sprintf ( buf , "%s/titledb0001.map",g_hostdb.m_dir); // this will reset it m->set ( g_hostdb.m_dir , "titledb0001.map" , -1, false,sizeof(key_t), GB_INDEXDB_PAGE_SIZE); loop: // are we done? if ( off >= fsize ) { log("db: *-*-*-* Reached end of title file and tree. " "Saving data to disk"); // save titledb tree if we modified it //g_titledb.getRdb()->close ( NULL, NULL, true, false ); // dump trees we did for ( long i = 0 ; doChecksumdb && i < ng ; i++ ) s_cf[i].close ( ); if ( doTfndb ) g_tfndb.getRdb()->close ( NULL, NULL, true, false ); if ( doSpiderdb ) g_spiderdb.getRdb()->close ( NULL, NULL, true, false ); // re-enable threads g_threads.enableThreads(); mfree ( buf , MAX_TR_SIZE , "main" ); // return now if we did not update titledb0001.dat at all if ( flag ) return true; //f.set ( g_hostdb.m_dir , "titledb0001.map"); //f.unlink(); log("db: *-*-*-*- Saving new titledb0001.map"); if ( ! m->writeMap() ) return log("fixTitleRecs: could not write " "map file."); return true; } // read in info about next titleRec if ( ! f.read ( buf , 16 , off ) ) { mfree ( buf , MAX_TR_SIZE , "main" ); return log("reading blocked"); } if ( g_errno ) { mfree ( buf , MAX_TR_SIZE , "main" ); return log("reading size error, needed 16"); } // get the key and recSize k = *(key_t *) buf; recSize = *(long *)(buf+12) + 16 ; // bitch and fix if recSize is corrupt if ( recSize > 4*1024*1024 || recSize < 16 ) { log("db: fixTitleRecs: bad TitleRec size of %li.",recSize); log("db: fixTitleRecs: attempting to determine correct size."); recSize = getRecSize ( &f , off ); if ( recSize < 0 ) { mfree ( buf , MAX_TR_SIZE , "main" ); return log("fixTitleRecs: attempt failed."); } log("db: fixTitleRecs: found size to be %li",recSize); } if ( recSize > MAX_TR_SIZE ) { log("db: fixTitleRecs: tr size is %li. skipping.",recSize); off += recSize ; goto loop; } // read in the key_recSiez+titleRec if ( ! 
f.read ( buf , recSize, off )) { mfree ( buf , MAX_TR_SIZE , "main" ); return log("reading blocked"); } if ( g_errno ) { mfree ( buf , MAX_TR_SIZE , "main" ); return log("reading size error, needed 16"); } // set our rec ptr to what we just read rec = buf; // get new key, skip if set failed bool status = makeNewTitleRecKey ( rec, MAX_TR_SIZE, &newk,&tr,&h ) ; // add to the map if ( ! m->addRecord ( newk , buf , recSize ) ) { mfree ( buf , MAX_TR_SIZE , "main" ); return log("add to new map error"); } // deal with title rec decompress failure if ( ! status ) { log("db: fixTitleRecs: makeNewTitleRecKey failed. " "off=%lli recSize=%li.",off,recSize); off += recSize ; goto loop; } // only write back the new key if different from the old key if ( newk != k ) { // if we haven't already logged this do it now if ( flag ) { log("db: *-*-*-* Converting keys in titledb0001.dat."); flag = false; } // ovewrite the old if ( ! f.write ( &newk , 12 , off ) ) { mfree ( buf , MAX_TR_SIZE , "main" ); return log("overwrite failed. that sucks."); } } // if key has negative equivalent in tree, do not add it to the 3 dbs key_t negk = newk ; negk.n0 &= 0xfffffffffffffffeLL; if ( tt->getNode ( collnum , negk ) < 0 ) { // add recs to the three dbs if ( doTfndb && ! addToTfndb ( coll , &tr , id2 ) ) { mfree ( buf , MAX_TR_SIZE , "main" ); return false; } if ( doSpiderdb && ! addToSpiderdb ( coll , &tr ) ) { mfree ( buf , MAX_TR_SIZE , "main" ); return false; } if ( doChecksumdb && ! addToChecksumdb ( coll , &tr ) ) { mfree ( buf , MAX_TR_SIZE , "main" ); return false; } } else log("db: fixTitleRecs: key is negative in tree"); // advance to point to next titleRec now off += recSize ; // log every 100 or so if ( count % 100 == 0 ) log("db: #%li %s",count,xd.ptr_firstUrl); count++; // loop for more goto loop; } // . when a titleRec has an impossible size, there was disk corruption // . try all possible size combinations, up to 1 million long getRecSize ( BigFile *f , long long off ) { char *buf = (char *) mmalloc ( 50*1024*1024 , "main" ); if ( ! buf ) return -1; f->read ( buf , 50*1024*1024 , off ); TitleRec tr; // loop over possible sizes for ( long i = 0 ; i < 48*1024*1024 - 32 ; i++ ) { char *next = buf + 12 + 4 + i; // log every 1000 or so if ( i % 1000 == 0 ) log("db: i=%li",i); // ensure sane size, if not try next i long size = *(long *)(next + 12); if ( size < 0 || size > 1024*1024 ) continue; // if uncompress failed, just keep looping if ( ! tr.set ( next , MAX_TR_SIZE , false ) ) continue; // if it uncompressed successfully, make sure url is valid char *u = tr.getUrl()->getUrl(); // log it log("db: getRecSize: recSize of %li has next url of %s",i,u); // is valid? if ( u[0] != 'h' ) { log("db: getRecSize: skipping since url does not start " "with 'h'"); continue; } // otherwise, return it mfree ( buf , 50*1024*1024 , "main" ); return i + 16; } log("getRecSize: no good recSize found"); mfree ( buf , 50*1024*1024 , "main" ); return -1; } */ // . also makes checksumdb // . g_hostdb.m_hostId should be set correctly /* bool genDbs ( char *coll ) { if (!ucInit(g_hostdb.m_dir, true)) return log("build: Unicode initialization failed!"); RdbBase *base = getRdbBase ( RDB_TITLEDB , coll ); BigFile f; // if no titledb, there is no generating //bool hasTitledb = false; //f.set ( g_hostdb.m_dir , "titledb-saved.dat"); //if ( f.doesExist() ) hasTitledb = true ; //f.set ( g_hostdb.m_dir , "titledb0001.dat"); //if ( f.doesExist() ) hasTitledb = true ; //if ( ! 
hasTitledb ) return true; bool doChecksumdb = true ; bool doTfndb = true ; bool doSpiderdb = true ; // build checksumdb if there not one char tmp[256]; long ng = g_hostdb.getNumShards(); long gnum = g_hostdb.m_hostId % ng; sprintf ( tmp , "checksumg%lih%lidb",gnum,g_hostdb.m_hostId); f.set ( g_hostdb.m_dir , tmp ); if ( f.doesExist() ) doChecksumdb = false; f.set ( g_hostdb.m_dir , "checksumdb-saved.dat"); if ( f.doesExist() ) doChecksumdb = false; f.set ( g_hostdb.m_dir , "checksumdb0001.dat"); if ( f.doesExist() ) doChecksumdb = false; // same for tfndb f.set ( g_hostdb.m_dir , "tfndb-saved.dat"); if ( f.doesExist() ) doTfndb = false; f.set ( g_hostdb.m_dir , "tfndb0001.dat"); if ( f.doesExist() ) doTfndb = false; // and spiderdb f.set ( g_hostdb.m_dir , "spiderdb-saved.dat"); if ( f.doesExist() ) doSpiderdb = false; f.set ( g_hostdb.m_dir , "spiderdb0001.dat"); if ( f.doesExist() ) doSpiderdb = false; // bail if all are three already if ( ! doChecksumdb && ! doTfndb && ! doSpiderdb ) return true; // disable threads so everything is blocking g_threads.disableThreads(); if ( doChecksumdb ) log("db: *-*-*-* Generating checksumdb"); if ( doTfndb ) log("db: *-*-*-* Generating tfndb"); if ( doSpiderdb ){ log("db: *-*-*-* Generating spiderdb"); //Need this if tr's in addtospiderdb are getting their //quality from their root urls. // if dospiderdb, also load rooturls for MOD } // we only add tfn's of 0, so everybody should be in the root file, // should be ok if in tree though! if ( doTfndb && base->getNumFiles() > 1 ) { log("genDbs: More than one titledb file found. " "Can not create tfndb. Do a tight merge on " "titledb and then try again."); return true; } // get id2 of titledb long id2 = base->m_fileIds2[0]; // we have to make multiple checksumdbs since we won't store all of // them locally ourselves BigFile cf [ MAX_HOSTS ]; long long cfoff [ MAX_HOSTS ] ; // file offsets // open up one checksumdb FLAT file for each group for ( long i = 0 ; doChecksumdb && i < ng ; i++ ){ char name[64]; // . initialize our own internal rdb // . the %lx in "g%li" is the group NUM to which the keys // in this file belong, the "h%li" is the host number that // generated these keys sprintf(name,"checksumg%lih%lidb",i,g_hostdb.m_hostId); // unlink this file just in case cf[i].set ( g_hostdb.m_dir , name ); cf[i].unlink(); if ( ! cf[i].open ( O_RDWR | O_CREAT ) ) return log("genDbs: cannot open %s",name); cfoff[i] = 0LL; } // reset some stuff key_t nextKey; nextKey.setMin(); RdbList tlist; tlist.reset(); long minRecSizes=3*1024*1024; // 3 megs // keep these declared before the loop so compiler stops complaining key_t endKey; Msg5 msg5; Msg5 msg5b; char *rec ; long listSize ; TitleRec tr ; static unsigned long count = 0; endKey.setMax(); // now pick titleRec from old titledb loop: tlist.reset(); // always clear last bit of g_nextKey nextKey.n0 &= 0xfffffffffffffffeLL; // a niceness of 0 tells it to block until it gets results!! if ( ! msg5.getList ( RDB_TITLEDB , coll , &tlist , nextKey , endKey , // should be maxed! minRecSizes , // min rec sizes true , // include tree? false , // includeCache false , // addToCache 0 , // startFileNum -1 , // m_numFiles NULL , // state NULL , // callback 0 , // niceness true , // do error correction? NULL , // cache key ptr 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &msg5b )) return log(LOG_LOGIC,"db: getList did not block."); // close up if no titleRec if ( tlist.isEmpty() ) { log("db: *-*-*-* All done generating. 
saving files."); // dump trees we did for ( long i = 0 ; doChecksumdb && i < ng ; i++ ) cf[i].close ( ); if ( doTfndb ) { // force tree dump to disk g_tfndb.getRdb()->dumpTree(0); g_tfndb.getRdb()->close ( NULL, NULL, true, false ); } if ( doSpiderdb ) { // force tree dump to disk g_spiderdb.getRdb()->dumpTree(0); g_spiderdb.getRdb()->close ( NULL, NULL, true, false ); } g_threads.enableThreads(); return true; } tlist.resetListPtr(); listLoop: if (tlist.isExhausted() ) { goto loop; } // advance g_nextKey to get next titleRec nextKey = tlist.getCurrentKey(); nextKey += (unsigned long)1; // get raw rec from list rec = tlist.getCurrentRec(); listSize = tlist.getListSize (); // set the titleRec we got if ( ! tr.set ( rec , listSize , false ) ) { // own data? log("db: gotList: error setting titleRec! skipping." ); tlist.skipCurrentRecord(); goto listLoop; } if ( doTfndb && ! addToTfndb (coll, &tr, id2)) return false; if ( doSpiderdb && ! addToSpiderdb (coll, &tr )) return false; if ( doChecksumdb && ! addToChecksumdb (coll, &tr )) return false; // log the url if ( (count % 300) == 0 ) logf(LOG_INFO,"db: %li) %s %li", count,tr.getUrl()->getUrl(),tr.getContentLen()); count++; tlist.skipCurrentRecord(); // get another record from the list we've got goto listLoop; // make the compiler happy // return true; } // . also makes checksumdb // . g_hostdb.m_hostId should be set correctly // 1. if a url is in spiderdb as old but is not really old (i.e. it does // not exist in titledb/tfndb) then it will not make it into tfndb // and we will get EDOCNOTOLD errors when we try to spider it, and // and it will be deleted from spiderdb. // 2. if a url is in spiderdb as new but also in titledb, then we add it // to tfndb with the probable docid, but when adding to tfndb from titledb // it gets added with the actual docid. so tfndb kinda has a double // record. but when the spiderdb record is done as new it should remove // the old tfndb record if the probable docid did not match the actual // in Msg14.cpp.... // . Try seeing if there are recs with the same probable docid (convert actual // to probable) and the same extension hash. they should not both be in tfndb bool genTfndb ( char *coll ) { RdbBase *base = getRdbBase ( RDB_TITLEDB , coll ); BigFile f; // same for tfndb f.set ( g_hostdb.m_dir , "tfndb-saved.dat"); if ( f.doesExist() ) { log("db: %stfndb-saved.dat exists. " "Not generating tfndb. Please move it to a tmp dir.", g_hostdb.m_dir); return false; } f.set ( g_hostdb.m_dir , "tfndb0001.dat"); if ( f.doesExist() ) { log("db: %stfndb0001.dat exists. Not generating tfndb. " "Please move all %stfndb* files to a tmp dir.", g_hostdb.m_dir,g_hostdb.m_dir); return false; } g_conf.m_maxMem = 2000000000LL; g_mem.m_maxMem = 2000000000LL; // we only add tfn's of 0, so everybody should be in the root file, // should be ok if in tree though! //if ( base->getNumFiles() > 1 ) { // log("db: More than one titledb file found. " // "Can not create tfndb. 
Do a tight merge on " // "titledb and then try again."); // exit(-1); // return true; //} // disable threads so everything is blocking g_threads.disableThreads(); log("db: Generating tfndb."); // reset some stuff key_t nextKey; nextKey.setMin(); RdbList tlist; tlist.reset(); key_t endKey; endKey.setMax(); long fn = 0; long id2; long local = 0; long long dd; SpiderRec sr; static unsigned long count = 0; Msg5 msg5; Msg5 msg5b; // debug stuff //nextKey = g_titledb.makeFirstTitleRecKey ( 4949692421LL ); //goto loop2; // add url recs for spiderdb loop: // always clear last bit of g_nextKey nextKey.n0 &= 0xfffffffffffffffeLL; // a niceness of 0 tells it to block until it gets results!! if ( ! msg5.getList ( RDB_SPIDERDB , coll , &tlist , nextKey , endKey , // should be maxed! 200048 , // min rec sizes true , // include tree? false , // includeCache false , // addToCache 0 , // startFileNum -1 , // m_numFiles NULL , // state NULL , // callback 0 , // niceness true ))// do error correction? return log(LOG_LOGIC,"db: getList did not block."); // close up if no titleRec if ( tlist.isEmpty() ) { log(LOG_INFO,"db: Read %li spiderdb recs.",local); local = 0; log(LOG_INFO,"db: All done reading spiderdb."); //g_tfndb.getRdb()->close ( NULL, NULL, true, false ); //g_threads.enableThreads(); // is the list from the tree in memory? if ( fn == base->getNumFiles() ) id2 = 255; else id2 = base->m_fileIds2[fn]; if ( id2 == 255 ) log(LOG_INFO,"db: Reading titledb tree."); else log(LOG_INFO,"db: Reading " "file #%li titledb*-%03li.dat*.",fn,id2); // reset key nextKey.setMin(); local = 0; goto loop2; //return true; } nextRec: // advance g_nextKey to get next titleRec nextKey = tlist.getCurrentKey(); nextKey += (unsigned long)1; // set the titleRec we got if ( ! sr.set ( &tlist ) ) { log("db: gotList: error setting spiderRec! skipping." ); goto skip; } // . skip docid based spider recs // . if its old, we'll take care of it below // . no, add here even if old, it will be overridden because if it is // old then it is using its actual docid, not just probable docid // . if we find a spider rec is really not old and throw it into // the new category, that is bad because it may be using its actual // docid and not probable... // . this logic here assumes spiderdb is 100% correct, if it isn't // we should have a fixspiderdb command // . if spiderdb rec in there is labelled as new but it is really old // we will add it to tfndb here with its probable docid, but when // finding it in the titledb we will add it again to tfndb with its // actual docid. the two may not match and we end up with double // tfndb entries. // . if spider rec is labelled as old, and we say 'doc not old' and // move it to new, then there was not a titlerec for it!! ok we need // to regen tfndb and stop moving spider recs like that. if ( sr.m_url.getUrlLen() > 0 && g_spiderdb.isSpiderRecNew ( tlist.getCurrentKey() ) ) // add url based spider recs if ( ! addToTfndb2 (coll, &sr, 255)) return false; // id2=255 // log the url if ( (count % 10000) == 0 ) { if ( sr.m_url.getUrlLen() > 0 ) logf(LOG_INFO,"db: *%li) %s",count,sr.getUrl()->getUrl()); else logf(LOG_INFO,"db: *%li) %lli",count,sr.m_docId); } skip: count++; local++; // try going down list if ( tlist.skipCurrentRecord() ) goto nextRec; // start it all over for another TitleRec goto loop; loop2: // just the tree? 
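// fn walks the titledb disk files in order; once fn == getNumFiles()
// there are no files left and we scan only the in-memory tree, whose
// records get a tfn of 255 below (like docid-only spiderdb entries)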
long nf = 1; bool includeTree = false; if ( fn == base->getNumFiles() ) { nf = 0; includeTree = true; } // always clear last bit of g_nextKey nextKey.n0 &= 0xfffffffffffffffeLL; // a niceness of 0 tells it to block until it gets results!! if ( ! msg5.getList ( RDB_TITLEDB , coll , &tlist , nextKey , endKey , // should be maxed! 1024 , // min rec sizes includeTree , // include tree? false , // includeCache false , // addToCache fn , // startFileNum nf , // m_numFiles NULL , // state NULL , // callback 0 , // niceness true , // do error correction? NULL , // cache key ptr 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &msg5b )) return log(LOG_LOGIC,"db: getList did not block."); // close up if no titleRec if ( tlist.isEmpty() ) { fn++; if ( fn <= base->getNumFiles() ) { log(LOG_INFO,"db: Scanning titledb file #%li.", fn); nextKey.setMin(); goto loop2; } done: // otherwise, wrap it up log(LOG_INFO, "db: Scanned %li spiderdb and titledb recs.",count); log(LOG_INFO, "db: All done generating tfndb. Saving files."); // force tree dump to disk, we use more mem for tfndb than // most gb process, so they won't be able to load the tree g_tfndb.getRdb()->dumpTree(0); // save our tree to disk, should be empty. g_tfndb.getRdb()->close ( NULL, NULL, true, false ); g_threads.enableThreads(); return true; } nextRec2: key_t tkey; tkey = tlist.getCurrentKey(); dd = g_titledb.getDocIdFromKey ( tkey ); // skip if bad... CORRUPTION if ( tkey < nextKey ) { long p1 = msg5.m_msg3.m_startpg[0] + 1; log("db: Encountered corruption in titledb while making " "tfndb. Page = %li. " "NextKey.n1=%lu %llu. " "Key.n1=%lu %llu " "FirstDocId=%llu.", p1-1,nextKey.n1,nextKey.n0,tkey.n1,tkey.n0, g_titledb.getDocIdFromKey(nextKey)); RdbMap **maps = base->getMaps(); here: // bail if done if ( p1 >= maps[0]->getNumPages() ) goto done; key_t kk = *(key_t *)maps[0]->getKeyPtr ( p1 ); if ( kk <= nextKey ) { p1++; goto here; } // otherwise, use that next key nextKey = kk; goto loop2; } // advance g_nextKey to get next titleRec nextKey = tlist.getCurrentKey(); nextKey += (unsigned long)1; // advance one if positive, must always start on a negative key if ( (nextKey.n0 & 0x01) == 0x01 ) nextKey += (unsigned long)1; // get raw rec from list char *rec = tlist.getCurrentRec(); long listSize = tlist.getListSize (); // is the list from the tree in memory? if ( fn == base->getNumFiles() ) id2 = 255; else id2 = base->m_fileIds2[fn]; TitleRec tr ; // skip if its a delete // let's print these out if ( (tkey.n0 & 0x01) == 0x00 ) { static bool ff = true; if ( ff ) { log("GOT NEGATIVE KEY. tfndb generation will " "contain positive tfndb keys for title recs " "that were deleted!! bad... need to tight " "merge titledb to fix this. better yet, " "you should be using the Repair tool to repair " "tfndb, that one actually works!"); ff = false; } goto skip2; } // set the titleRec we got if ( ! tr.set ( rec , listSize , false ) ) { // own data? long long d = g_titledb.getDocIdFromKey ( tkey ); log("db: gotList: Error setting titleRec. docId=%lli. " "Skipping." , d ); goto loop2; // skip2; } if ( ! addToTfndb (coll, &tr, id2)) return false; // log the url if ( (count % 1000) == 0 ) logf(LOG_INFO,"db: %li) %s %li %lli", count,tr.getUrl()->getUrl(),tr.getContentLen(),dd); count++; local++; skip2: // try going down list if ( tlist.skipCurrentRecord() ) goto nextRec2; // start it all over for another TitleRec goto loop2; // make the compiler happy return true; } */ // . for cleaning up indexdb // . 
// print out docids in indexdb but not in our titledb, if they should be
void dumpMissing ( char *coll ) {
        g_dumpMode = true;
        // load tfndb, assume it is a perfect reflection of titledb
        //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
        g_conf.m_tfndbMaxDiskPageCacheMem = 0;
        g_conf.m_indexdbMaxCacheMem = 0;
        //g_conf.m_clusterdbMaxDiskPageCacheMem = 0;
        g_tfndb.init ();
        g_collectiondb.init(true); // isDump?
        g_tfndb.getRdb()->addColl ( coll );
        g_titledb.init();
        g_titledb.getRdb()->addColl ( coll );
        // if titledb has stuff in memory, do not do this, it needs to
        // be dumped out. this way we can assume a tfn of 255 means the docid
        // is probable and just in spiderdb. (see loop below)
        if ( g_titledb.getRdb()->m_tree.getNumUsedNodes() ) {
                logf(LOG_INFO,"db: Titledb needs to be dumped to disk before "
                     "we can scan tfndb. Please do ./gb ddump to do this or "
                     "click on \"dump to disk\" in the Master Controls.");
                return;
        }
        // . just get the docids from tfndb...
        // . this tfndb rec count is for ALL colls!! DOH!
        long long numRecs = g_tfndb.getRdb()->getNumTotalRecs();
        // oversize by 25% so the table stays at most 80% full
        long long oldNumSlots = (numRecs * 100) / 80;
        // power-of-2 sizing is disabled; we mod by numSlots below instead
        //oldNumSlots *= 2;
        //oldNumSlots -= 1;
        //long numSlots = getHighestLitBitValue((unsigned long)oldNumSlots);
        long numSlots = oldNumSlots;
        //unsigned long mask = numSlots - 1;
        // make a hash table for docids
        logf(LOG_INFO,"db: Allocating %li bytes for docids.",numSlots*8);
        unsigned long long *slots = (unsigned long long *)
                mcalloc ( numSlots * 8 , "dumpMissing" );
        if ( ! slots ) {
                log("db: Could not alloc %li bytes to load in %lli docids.",
                    numSlots*8,numRecs);
                return;
        }
        // load in all tfndb recs
        key_t startKey ;
        key_t endKey ;
        startKey.setMin();
        endKey.setMax();
        // turn off threads
        g_threads.disableThreads();
        // get five megs at a time
        long minRecSizes = 5*1024*1024;
        RdbList list;
        Msg5 msg5;
        logf(LOG_INFO,"db: Loading tfndb for hostId %li, has %lli recs.",
             g_hostdb.m_hostId,numRecs);
        long long count = 0;
        long next = 0;
        long used = 0;
 loop:
        // use msg5 to get the list, should ALWAYS block since no threads
        if ( ! msg5.getList ( RDB_TFNDB   ,
                              coll        ,
                              &list       ,
                              startKey    ,
                              endKey      ,
                              minRecSizes ,
                              true        , // includeTree ,
                              false       , // add to cache?
                              0           , // max cache age
                              0           , // startFileNum ,
                              -1          , // numFiles ,
                              NULL        , // state
                              NULL        , // callback
                              0           , // niceness
                              false       )){// err correction?
                log(LOG_LOGIC,"db: getList did not block.");
                exit(-1);
        }
        // all done if empty
        if ( list.isEmpty() ) goto done;
        // loop over entries in list
        for ( list.resetListPtr() ; ! list.isExhausted() ;
              list.skipCurrentRecord() ) {
                // get the tfn
                key_t k = list.getCurrentKey();
                count++;
                // skip if negative
                if ( (k.n0 & 0x01LL) == 0x00 ) continue;
                // titledb tree is empty, so this must indicate it is in
                // spiderdb only
                long tfn = g_tfndb.getTfn(&k);
                if ( tfn == 255 ) continue;
                // get docid
                unsigned long long d = g_tfndb.getDocId ( &k );
                // add to hash table
                //long n = (unsigned long)d & mask;
                long n = (unsigned long)d % numSlots;
                // chain if not in there
                while ( slots[n] )
                        if ( ++n >= numSlots ) n = 0;
                // add it here
                slots[n] = d;
                // count it
                if ( used >= next ) {
                        logf(LOG_INFO,"db: Loaded %li docids.",used);
                        next = used + 1000000;
                }
                used++;
        }
        startKey = *(key_t *)list.getLastKey();
        startKey += (unsigned long) 1;
        // watch out for wrap around
        if ( startKey >= *(key_t *)list.getLastKey() ) goto loop;
        // ok now, scan indexdb and report docids in indexdb that are not
        // in our tfndb when they should be.
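        // the membership probe used on the indexdb side below mirrors the
        // insert loop above: start at (docid % numSlots), walk forward to
        // the first empty slot or the docid itself. nothing is ever deleted
        // from slots[], so probe chains stay unbroken. a minimal sketch of
        // that lookup as a standalone helper -- the name inDocIdTable is
        // hypothetical, the scans below just inline this logic:
        //
        //   static bool inDocIdTable ( unsigned long long *slots ,
        //                              long numSlots ,
        //                              unsigned long long d ) {
        //           // same hash as the loader: truncate to 32 bits, mod
        //           long n = (unsigned long)d % numSlots;
        //           // linear probe; an empty slot terminates the chain
        //           while ( slots[n] && slots[n] != d )
        //                   if ( ++n >= numSlots ) n = 0;
        //           // assumes the table never fills completely, as the
        //           // real loops below do
        //           return slots[n] == d;
        //   }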
done: logf(LOG_INFO,"db: Scanned %lli tfndb recs.",count); logf(LOG_INFO,"db: Scanning indexdb."); logf(LOG_INFO,"db: Tight merge indexdb to make this faster."); //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; g_indexdb.init (); g_collectiondb.init(true); g_indexdb.getRdb()->addColl ( coll ); startKey.setMin(); endKey.setMax(); // get a meg at a time minRecSizes = 5*1024*1024; Msg5 msg5b; //unsigned long groupId = g_hostdb.m_groupId; unsigned long shardNum = g_hostdb.getMyShardNum(); count = 0; long scanned = 0; //HashTableT repeat; HashTableX repeat; if ( ! repeat.set ( 8,1,1000000,NULL,0,false,0,"rpttbl" ) ) { log("db: Failed to init repeat hash table."); return; } loop2: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_INDEXDB , coll , &list , startKey , endKey , minRecSizes , true , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false )){// err correction? log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) return; // something to log scanned += list.getListSize(); if ( scanned >= 100000000 ) { count += scanned; scanned = 0; logf(LOG_INFO,"db: Scanned %lli bytes.",count); } // loop over entries in list for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { key_t k = list.getCurrentKey(); // skip deletes if ( (k.n0 & 0x01) == 0x00 ) continue; // do we hold his titleRec? continue if not if ( getShardNum ( RDB_TITLEDB , &k ) != shardNum ) continue; // get his docid unsigned long long d = g_indexdb.getDocId(k); // otherwise, report him if not in tfndb //long n = (unsigned long)d & mask; long n = (unsigned long)d % numSlots; while ( slots[n] && slots[n] != d ) if ( ++n >= numSlots ) n = 0; // if he was not in tfndb when he should have been, // print him on stdout if ( slots[n] == d ) continue; // is he in the repeat table? long slot=repeat.getSlot(&d); if (slot!=-1) if ( *(char *)repeat.getValueFromSlot ( slot ) == 1 ) continue; // print if this is the first time fprintf(stderr,"missingdocid %012llu\n",d); // put him in a table so we don't repeat him char one = 1; repeat.addKey ( &d , &one ); } startKey = *(key_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey >= *(key_t *)list.getLastKey() ) goto loop2; logf(LOG_INFO,"db: Done generating missing docids."); return; } // . for cleaning up indexdb // . 
print out docids in the same termlist multiple times void dumpDups ( char *coll ) { g_dumpMode = true; // load tfndb, assume it is a perfect reflection of titledb //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; g_conf.m_indexdbMaxCacheMem = 0; //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; g_indexdb.init (); g_collectiondb.init(true); g_indexdb.getRdb()->addColl ( coll ); key_t startKey ; key_t endKey ; startKey.setMin(); endKey.setMax(); // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 6*1024*1024; long numSlots = 2 * 1024 * 1024; long long * slots; char * scores; slots = (long long *) mmalloc ( numSlots * 8, "main-dumpDups"); scores = (char *) mmalloc ( numSlots, "main-dumpDups"); if(!slots || !scores) { if(!slots) log(LOG_INFO,"admin: Could not allocate %lld " "bytes for dumpDups" , (long long) numSlots * 8 ); else mfree(slots, numSlots * 8, "main-dumpDups" ); if(!scores) log(LOG_INFO,"admin: Could not allocate %ld " "bytes for dumpDups" , numSlots ); else mfree(scores, numSlots , "main-dumpDups" ); return; } long offset1 = 0; long offset2 = 0; long long tempTid = -1; long long lastTid = -1; long long tid = -1; long long indexdbCount = 0; char * tempListPtr; char * tempListPtrHi; key_t k; long long d; long hashMod; unsigned long long n2; long long endTid; char filename[30]; char buff[100]; long numParsed = 0; long collNum = g_collectiondb.getCollnum ( coll ); File f; File f2; Rdb *r = g_indexdb.getRdb(); RdbTree *tree = &r->m_tree; sprintf(filename,"removedDupKeys.%li", collNum ); f.set(g_hostdb.m_dir, filename); if(f.doesExist() ) { log(LOG_INFO,"admin: File %s%s already exists. " "Aborting process" , g_hostdb.m_dir, filename ); return; } if( !f.open(O_RDWR | O_CREAT) ) { log( LOG_INFO, "admin: Could not create %s/%s.", g_hostdb.m_dir, filename); return ; } sprintf(filename,"removedDupDocIds.%li", collNum); // g_collectiondb.getCollnum ( coll )); f2.set(g_hostdb.m_dir, filename); if(f2.doesExist() ) { f2.unlink(); } if( !f2.open(O_RDWR | O_CREAT) ) { log( LOG_INFO, "admin: Could not create %s/%s.", g_hostdb.m_dir, filename); return ; } RdbList list; Msg5 msg5; Msg5 msg5b; unsigned long count = 0; unsigned long count2 = 0; long long byteCount = 0; unsigned long highLitBit; char *p; //unsigned long groupId = g_hostdb.m_groupId; count = 0; long long scanned = 0; long long dups = 0LL; char lookup[256] = { 8, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; // HashTableT repeat; // HashTableT local; logf(LOG_INFO,"db: Scanning indexdb for repeated docids."); logf(LOG_INFO,"db: Tight merge indexdb to make this faster."); logf(LOG_INFO,"db: Dumping docid termId pairs to file."); /* if ( ! repeat.set ( 1000000 ) ) { log("db: Failed to init repeat hash table."); return; } if ( ! 
local.set ( 1000000 ) ) {
        log("db: Failed to init repeat hash table2.");
        return;
}
*/
 loop:
        //long long startTime = gettimeofdayInMilliseconds();
        // use msg5 to get the list, should ALWAYS block since no threads
        if ( ! msg5.getList ( RDB_INDEXDB ,
                              coll        ,
                              &list       ,
                              startKey    ,
                              endKey      ,
                              minRecSizes ,
                              true        , // includeTree ,
                              false       , // add to cache?
                              0           , // max cache age
                              0           , // startFileNum ,
                              -1          , // numFiles ,
                              NULL        , // state
                              NULL        , // callback
                              0           , // niceness
                              false       )){// err correction?
                log(LOG_LOGIC,"db: getList did not block.");
                return;
        }
        //long long endTime = gettimeofdayInMilliseconds();
        //log(LOG_INFO,"dumpdups Msg5 time = %li",(long)endTime - startTime);
        // all done if empty
        if ( list.isEmpty() ) {
                mfree ( slots, numSlots * 8, "main-dumpDups");
                mfree ( scores, numSlots, "main-dumpDups");
                f.close();
                f2.close();
                return;
        }
        // something to log
        scanned += list.getListSize();
        if ( scanned >= 10000000 ) {
                byteCount += scanned;
                scanned = 0;
                logf(LOG_INFO,"db: Scanned %lli bytes. Parsed %lli records. "
                     "dups=%lli.",byteCount,indexdbCount, dups);
        }
        tid = -1;
        k = *(key_t *) list.getStartKey();
        tempTid = g_indexdb.getTermId( k );
        k = *(key_t *) list.getEndKey();
        endTid = g_indexdb.getTermId( k );
        // reset the docid table for this list
        memset(slots , 0, numSlots * 8);
        memset(scores, 0, numSlots);
        offset1 = 0;
        offset2 = 0;
        //startTime = gettimeofdayInMilliseconds();
        //long totalNumParsed = 0;
        bool sameTidList = false;
        long thisDup = 0;
        if(tempTid == endTid) sameTidList = true;
        //log(LOG_INFO,"sameTidList = %d",sameTidList);
        // loop over entries in list
        for ( list.resetListPtr() ; ! list.isExhausted() ;
              list.skipCurrentRecord() ) {
                k = list.getCurrentKey();
                if ( (k.n0 & 0x01LL) == 0x00 ) continue;
                tempTid = g_indexdb.getTermId(k);
                d = g_indexdb.getDocId(k);
                //totalNumParsed++;
                numParsed++;
                // change in tid, get the count
                if(tempTid != tid ) {
                        thisDup = 0;
                        // is this tid the same one we processed in the
                        // last run?
                        if(tid == -1 && tempTid == lastTid) {
                                log(LOG_INFO,"admin: We broke termlist of "
                                    "termid=%lld. 
Some " "docids may be repeated in this termlist and " "we will not know.", tempTid); } //check if we hit the endTid - then reload the // list from that point if(tempTid == endTid && !sameTidList) { break; } if(sameTidList) { count = numSlots - 1; offset1 = 0; offset2 = numSlots; if(tid != -1) { memset(slots , 0, numSlots * 8); memset(scores, 0, numSlots); //log(LOG_INFO,"dumpDups Wish more numslots"); } tid = tempTid; hashMod = numSlots; lastTid = tid; numParsed = 1; } else { tid = tempTid; tempListPtr = list.m_listPtr; tempListPtrHi = list.m_listPtrHi; count = 1; for( list.skipCurrentRecord(); !list.isExhausted(); list.skipCurrentRecord() ) { if( *(list.m_listPtr) & 0x02 ) { count++; } else { key_t kt = list.getCurrentKey(); tempTid=g_indexdb.getTermId(kt); if(tempTid != tid) break; count++; } } list.m_listPtr = tempListPtr; list.m_listPtrHi = tempListPtrHi; if(count == 1) { continue; } p = (char *) &count; if( count*2 > (unsigned) numSlots ) count = numSlots; else { if(count <= 255) { highLitBit= lookup[(unsigned char) *p]; highLitBit++; count2 = 1; count2 <<= highLitBit; if( count2/2 < count ) { count2 <<= 1; } count = count2; } else if (count <= 65535) { p++; highLitBit= lookup[(unsigned char) *p]; highLitBit += 9; count2 = 1; count2 <<= highLitBit; if( count2/2 < count ) { count2 <<= 1; } count = count2; } else if (count <= 16777216) { p += 2; highLitBit= lookup[(unsigned char) *p]; highLitBit += 17; count2 = 1; count2 <<= highLitBit; if( count2/2 < count ) { count2 <<= 1; if(count2 > (unsigned) numSlots) count2=numSlots; } count = count2; } else { p += 3; highLitBit= lookup[(unsigned char) *p]; highLitBit += 25; count2 = 1; count2 <<= highLitBit; if( count2/2 < count ) { count2 <<= 1; if(count2 > (unsigned) numSlots) count2=numSlots; } count = count2; } } if(offset2 + count + 1 < (unsigned) numSlots ) { offset1 = offset2 + 1; offset2 += count; } else { memset(slots , 0, numSlots * 8); memset(scores, 0, numSlots); offset1 = 0; offset2 = count; } hashMod = count; lastTid = tid; numParsed = 1; } } indexdbCount ++; n2 = (unsigned long long) d & (hashMod-1); n2 += offset1; while ( slots[n2] && slots[n2] != d ) { if ( ++n2 >= (unsigned long long) offset2 ) n2 = offset1; } if( slots[n2] != d ) { slots[n2] = d; scores[n2] = (unsigned char) g_indexdb.getScore(k); } else { dups++; //add negative keys sprintf(buff,"%08lx %016llx\n", k.n1, k.n0); f.write(buff,gbstrlen(buff), -1); sprintf(buff,"%lld\n",d); f2.write(buff, gbstrlen(buff), -1); k.n0 &= 0xfffffffffffffffeLL; if ( ! r->addRecord ( coll, k , NULL , 0 , 0) ) { log("admin: could not add negative key: %s", mstrerror(g_errno)); return; } if ( tree->getNumAvailNodes() <= 0 ) { // this should block r->dumpTree(0); } key_t kt; kt = g_indexdb.makeKey(tid,scores[n2],slots[n2],false); sprintf(buff,"%08lx %016llx\n", kt.n1, kt.n0); kt.n0 &= 0xfffffffffffffffeLL; f.write(buff,gbstrlen(buff), -1); if ( ! r->addRecord ( coll, kt , NULL , 0 , 0) ) { log("admin: could not add negative key: %s", mstrerror(g_errno)); return; } if ( tree->getNumAvailNodes() <= 0 ) { // this should block r->dumpTree(0); } } if(numParsed*2 + 1 > numSlots ) { //log(LOG_INFO,"dumpDups wished more numSlots numParsed=%ld", // numParsed); tid = 0; } } // no, use the last termid!! 
// well this is not perfect, oh well
        //endTime = gettimeofdayInMilliseconds();
        //log(LOG_INFO,"dumpdups Loop time = %lli notParsed=%ld",
        //    endTime - startTime, (list.getNumRecs()-totalNumParsed));
        if( list.isExhausted() ) {
                startKey = *(key_t *)list.getLastKey();
                startKey += (unsigned long) 1;
                // watch out for wrap around
                if ( startKey >= *(key_t *)list.getLastKey() ) goto loop;
        }
        else {
                startKey = k;
                goto loop;
        }
        logf(LOG_INFO,"db: Done scanning indexdb for dup docids. Parsed %lld"
             " indexdb records", indexdbCount);
        mfree ( slots, numSlots * 8, "main-dumpDups");
        mfree ( scores, numSlots, "main-dumpDups");
        f.close();
        f2.close();
        list.freeList();
        r->close ( NULL , NULL , false , false );
        return;
}

// . remove the docids in "filename" from indexdb.
// . make a hashtable of these docids
// . once each host has a list of the docids in /a/missing*, do this:
//   dsh -ac 'cd /a/ ; echo -n "cat /a/missing* | grep missingdoc | awk '{print $2}' | sort > sorted." > /a/poo ; cd /a/ ; ls missing* >> /a/poo ; chmod +x /a/poo; /a/poo'
// . then each host will have a file called /a/sorted.missing* and you can
//   copy them to host #0 and merge sort them with 'sort -m -t /a/tmp sorted.*'
void removeDocIds ( char *coll , char *filename ) {
        int fd;
        fd = open ( filename , O_RDONLY );
        if ( fd <= 0 ) {
                log("db: Could not open %s for reading: %s",
                    filename,strerror(errno));
                return ;
        }
        long long dcount = 0;
        long long offset ;
        char buf [ 1024*1024*2+1 ];
        long readSize ;
        long n ;
        char *p;
        char *pend;
        // note it
        logf(LOG_INFO,"db: Counting docids in file %s.",filename);
 loop1:
        // read in docids and hash them
        offset = 0;
        readSize = 1024*1024*2;
        n = read ( fd , buf , readSize );
        if ( n < 0 ) {
                log("db: Had error reading %s: %s",
                    filename,strerror(errno));
                return ;
        }
        offset += n;
        // 0 is EOF
        p = buf;
        pend = buf + n;
        *pend = 0;
        while ( *p ) {
                // count it for now
                dcount++;
                // advance over \n
                while ( *p && *p !='\n' ) p++;
                // all done?
                while ( *p == '\n' ) p++;
        }
        if ( n > 0 ) goto loop1;
        // note it
        logf(LOG_INFO,"db: Counted %lli docids in file %s.",dcount,filename);
        long long oldNumSlots = (dcount * 100LL) / 80LL;
        oldNumSlots *= 2;
        oldNumSlots -= 1;
        long numSlots = getHighestLitBitValue ((unsigned long)oldNumSlots);
        if ( numSlots < 64 ) numSlots = 64;
        long need = numSlots * 8;
        logf(LOG_INFO,"db: Allocating %li bytes for hash table.",need);
        unsigned long mask = numSlots - 1;
        unsigned long long *slots =
                (unsigned long long *)mcalloc(need,"loaddocids");
        if ( ! slots ) {
                log("db: Could not allocate %li bytes to read in docids. "
                    "Please split this file and do multiple runs.", need);
                return;
        }
        // now hash those docids
        offset = 0;
        close ( fd );
        fd = open ( filename , O_RDONLY );
        if ( fd <= 0 ) {
                log("db: Could not open %s for reading: %s",
                    filename,strerror(errno));
                return ;
        }
        // note it
        logf(LOG_INFO,"db: Loading and hashing docids from file %s.",
             filename);
 loop2:
        // read in docids and hash them
        n = read ( fd , buf , readSize );
        if ( n < 0 ) {
                log("db: Had error reading %s: %s",
                    filename,strerror(errno));
                return ;
        }
        offset += n;
        // 0 is EOF
        p = buf;
        pend = buf + n;
        *pend = 0;
        while ( *p ) {
                // get docid
                unsigned long long d = atoll(p);
                // hash it
                long n = (unsigned long)d & mask;
                while ( slots[n] && slots[n] != d )
                        if ( ++n >= numSlots ) n = 0;
                // add him
                slots[n] = d;
                // advance over \n
                while ( *p && *p !='\n' ) p++;
                // all done?
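                // (skip the whole run of newlines; the 0 we stored at
                //  pend ends the outer while)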
                while ( *p == '\n' ) p++;
        }
        if ( n > 0 ) goto loop2;
        // do not merge so much
        //if ( g_conf.m_indexdbMinFilesToMerge < 100 )
        //      g_conf.m_indexdbMinFilesToMerge = 100;
        //if ( g_conf.m_checksumdbMinFilesToMerge < 100 )
        //      g_conf.m_checksumdbMinFilesToMerge = 100;
        if ( g_conf.m_clusterdbMinFilesToMerge < 100 )
                g_conf.m_clusterdbMinFilesToMerge = 100;
        g_conf.m_tfndbMaxDiskPageCacheMem = 0;
        //g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
        //g_conf.m_clusterdbMaxDiskPageCacheMem = 0;
        g_conf.m_indexdbMaxCacheMem = 0;
        //g_conf.m_checksumdbMaxCacheMem = 0;
        //g_conf.m_clusterdbMaxCacheMem = 0;
        g_tfndb.init();
        g_indexdb.init ();
        //g_checksumdb.init();
        g_clusterdb.init();
        g_collectiondb.init(true);
        g_tfndb.getRdb()->addColl ( coll );
        g_indexdb.getRdb()->addColl ( coll );
        //g_checksumdb.getRdb()->addColl ( coll );
        g_clusterdb.getRdb()->addColl ( coll );
        // this was set to 2 on me before, triggering a huge merge
        // every dump!!! very bad, i had to gdb to each process and set
        // this value to 50 myself.
        //CollectionRec *cr = g_collectiondb.getRec ( coll );
        //if ( cr->m_indexdbMinFilesToMerge < 50 )
        //      cr->m_indexdbMinFilesToMerge = 50;
        // note it
        logf(LOG_INFO,"db: Loaded %lli docids from file \"%s\".",
             dcount,filename);
        // now scan indexdb and remove recs with docids in this hash table
        logf(LOG_INFO,"db: Scanning indexdb and removing recs.");
        //logf(LOG_INFO,"db: Tight merge indexdb to make this faster.");
        //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
        key_t startKey;
        key_t endKey;
        startKey.setMin();
        endKey.setMax();
        // compatibility with checksumdb's variable size keys
        /*
        long cKeySize = g_conf.m_checksumdbKeySize;
        char startKey2[16];
        char endKey2[16];
        // initialize checksumdb specific keys
        if (cKeySize == 16) {
                ((key128_t *)startKey2)->setMin();
                ((key128_t *)endKey2)->setMax();
        }
        else {
                KEYSET( startKey2, (char *)&startKey, cKeySize );
                KEYSET( endKey2, (char *)&endKey, cKeySize );
        }
        */
        g_threads.disableThreads();
        Rdb *r = g_indexdb.getRdb();
        collnum_t collnum = g_collectiondb.getCollnum ( coll );
        // do not start if any indexdb recs in tree or more than 1 disk file
        RdbBase *base = r->getBase(collnum);
        if ( base->getNumFiles() > 1 ) {
                log("db: More than 1 indexdb file. Please tight merge.");
                return;
        }
        if ( g_indexdb.getRdb()->m_tree.getNumUsedNodes() ) {
                log("db: Indexdb tree not empty. Please dump.");
                return;
        }
        // set niceness really high
        if ( setpriority ( PRIO_PROCESS, getpid() , 20 ) < 0 )
                log("db: Call to setpriority failed: %s.",
                    mstrerror(errno));
        // get five megs at a time
        long minRecSizes = 5*1024*1024;
        Msg5 msg5;
        Msg5 msg5b;
        RdbList list;
        //
        //
        // SCAN INDEXDB and remove missing docids
        //
        //
        r = g_indexdb.getRdb();
        long long count   = 0;
        long scanned      = 0;
        long long recs    = 0;
        long long removed = 0;
        RdbTree *tree = &r->m_tree;
 loop3:
        // use msg5 to get the list, should ALWAYS block since no threads
        if ( ! msg5.getList ( RDB_INDEXDB ,
                              coll        ,
                              &list       ,
                              startKey    ,
                              endKey      ,
                              minRecSizes ,
                              // HACK: use false for now
                              //false , // includeTree ,
                              true        , // includeTree ,
                              false       , // add to cache?
                              0           , // max cache age
                              0           , // startFileNum ,
                              // HACK: use 1 for now
                              //1 , // numFiles ,
                              -1          , // numFiles ,
                              NULL        , // state
                              NULL        , // callback
                              0           , // niceness
                              false       )){// err correction?
                log(LOG_LOGIC,"db: getList did not block.");
                return;
        }
        // all done if empty
        if ( list.isEmpty() ) return;
        // something to log
        scanned += list.getListSize();
        if ( scanned >= 100000000 ) {
                count += scanned;
                scanned = 0;
                logf(LOG_INFO,"db: Scanned %lli bytes. Scanned %lli records. "
                     "Removed %lli records.",count,recs,removed);
        }
        // yield every 256k records
        long ymask = 0x3ffff; // 2^18 - 1, so (recs & ymask) == 0 once per 256k
        // loop over entries in list
        for ( list.resetListPtr() ; ! list.isExhausted() ;
              list.skipCurrentRecord() ) {
                recs++;
                if ( (recs & ymask) == 0x00 ) sched_yield();
                key_t k = list.getCurrentKey();
                // skip deletes
                if ( (k.n0 & 0x01) == 0x00 ) continue;
                unsigned long long d = g_indexdb.getDocId(k);
                // see if docid is in delete list
                long n = (unsigned long)d & mask;
                while ( slots[n] && slots[n] != d )
                        if ( ++n >= numSlots ) n = 0;
                // skip him if we should not delete him
                if ( slots[n] != d ) continue;
                // otherwise, remove him
                // make him a delete, turn off his last bit (the del bit)
                k.n0 &= 0xfffffffffffffffeLL;
                if ( ! r->addRecord ( collnum , (char *)&k , NULL , 0 , 0) ) {
                        log("db: Could not delete record.");
                        return;
                }
                removed++;
                // dump tree?
                if ( tree->getNumAvailNodes() <= 0 ) {
                        // this should block
                        r->dumpTree(0);
                }
        }
        startKey = *(key_t *)list.getLastKey();
        startKey += (unsigned long) 1;
        // watch out for wrap around
        if ( startKey >= *(key_t *)list.getLastKey() ) goto loop3;
        logf(LOG_INFO,"db: Scanned %lli bytes. Scanned %lli records. "
             "Removed %lli records.",count+scanned,recs,removed);
        // this should block
        //r->dumpTree(0);
        // save the tree man!
        logf(LOG_INFO,"db: Finished removing docids from indexdb. Saving.");
        r->close ( NULL , NULL , false , false );
        //
        //
        // SCAN CHECKSUMDB and remove missing docids
        //
        //
        /*
        logf(LOG_INFO,"db: Scanning checksumdb and removing recs.");
        r = g_checksumdb.getRdb();
        count   = 0;
        scanned = 0;
        recs    = 0;
        removed = 0;
        tree = &r->m_tree;
 loop4:
        // use msg5 to get the list, should ALWAYS block since no threads
        if ( ! msg5.getList ( RDB_CHECKSUMDB,
                              coll        ,
                              &list       ,
                              //startKey ,
                              //endKey ,
                              startKey2   ,
                              endKey2     ,
                              minRecSizes ,
                              true        , // includeTree ,
                              false       , // add to cache?
                              0           , // max cache age
                              0           , // startFileNum ,
                              -1          , // numFiles ,
                              NULL        , // state
                              NULL        , // callback
                              0           , // niceness
                              false       )){// err correction?
                log(LOG_LOGIC,"db: getList did not block.");
                return;
        }
        // all done if empty
        if ( list.isEmpty() ) return;
        // something to log
        scanned += list.getListSize();
        if ( scanned >= 100000000 ) {
                count += scanned;
                scanned = 0;
                logf(LOG_INFO,"db: Scanned %lli bytes. Scanned %lli records. "
                     "Removed %lli records.",count,recs,removed);
        }
        // loop over entries in list
        for ( list.resetListPtr() ; ! list.isExhausted() ;
              list.skipCurrentRecord() ) {
                recs++;
                if ( (recs & ymask) == 0x00 ) sched_yield();
                //key_t k = list.getCurrentKey();
                char k[16];
                list.getCurrentKey( k );
                // skip deletes
                //if ( (k.n0 & 0x01) == 0x00 ) continue;
                if ( (((key_t *)k)->n0 & 0x01) == 0x00 ) continue;
                unsigned long long d = g_checksumdb.getDocId( k );
                // see if docid is in delete list
                long n = (unsigned long)d & mask;
                while ( slots[n] && slots[n] != d )
                        if ( ++n >= numSlots ) n = 0;
                // skip him if we should not delete him
                if ( slots[n] != d ) continue;
                // otherwise, remove him
                // make him a delete, turn off his last bit (the del bit)
                //k.n0 &= 0xfffffffffffffffeLL;
                ((key_t *)k)->n0 &= 0xfffffffffffffffeLL;
                if ( ! r->addRecord ( collnum , k , NULL , 0 , 0) ) {
                        log("db: Could not delete record.");
                        return;
                }
                removed++;
                // dump tree?
if ( tree->getNumAvailNodes() <= 0 ) { // this should block r->dumpTree(0); } } //startKey = *(key_t *)list.getLastKey(); //startKey += (unsigned long) 1; list.getLastKey( startKey2 ); if ( cKeySize == 12 ) *((key_t *)startKey2) += (unsigned long) 1; else if ( cKeySize == 16 ) *((key128_t *)startKey2) += (unsigned long) 1; // watch out for wrap around //if ( startKey >= *(key_t *)list.getLastKey() ) goto loop4; if ( KEYCMP(startKey2, list.getLastKey(), cKeySize) >= 0 ) goto loop4; logf(LOG_INFO,"db: Scanned %lli bytes. Scanned %lli records. " "Removed %lli records.",count+scanned,recs,removed); // this should block //r->dumpTree(0); logf(LOG_INFO,"db: Finished removing docids from checksumdb. Saving."); r->close ( NULL , NULL , false , false ); */ // // // SCAN CLUSTERDB and remove missing docids // // logf(LOG_INFO,"db: Scanning clusterdb and removing recs."); r = g_clusterdb.getRdb(); count = 0; scanned = 0; recs = 0; removed = 0; tree = &r->m_tree; loop5: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_CLUSTERDB , coll , &list , startKey , endKey , minRecSizes , true , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false )){// err correction? log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) return; // something to log scanned += list.getListSize(); if ( scanned >= 100000000 ) { count += scanned; scanned = 0; logf(LOG_INFO,"db: Scanned %lli bytes. Scanned %lli records. " "Removed %lli records.",count,recs,removed); } // loop over entries in list for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { recs++; if ( (recs & ymask) == 0x00 ) sched_yield(); key_t k = list.getCurrentKey(); // skip deletes if ( (k.n0 & 0x01) == 0x00 ) continue; unsigned long long d = g_clusterdb.getDocId(&k); // see if docid is in delete list long n = (unsigned long)d & mask; while ( slots[n] && slots[n] != d ) if ( ++n >= numSlots ) n = 0; // skip him if we should not delete him if ( slots[n] != d ) continue; // otherwise, remove him // make him a delete, turn off his last bit (the del bit) k.n0 &= 0xfffffffffffffffeLL; if ( ! r->addRecord ( collnum , (char *)&k , NULL , 0 , 0) ) { log("db: Could not delete record."); return; } removed++; // dump tree? if ( tree->getNumAvailNodes() <= 0 ) { // this should block r->dumpTree(0); } } startKey = *(key_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey >= *(key_t *)list.getLastKey() ) goto loop5; logf(LOG_INFO,"db: Scanned %lli bytes. Scanned %lli records. " "Removed %lli records.",count+scanned,recs,removed); // this should block //r->dumpTree(0); logf(LOG_INFO,"db: Finished removing docids from clusterdb. Saving."); r->close ( NULL , NULL , false , false ); // // // SCAN TFNDB and remove missing docids // one twin might have the docid, while the other doesn't, // so make sure to remove it from both. // // logf(LOG_INFO,"db: Scanning tfndb and removing recs."); r = g_tfndb.getRdb(); count = 0; scanned = 0; recs = 0; removed = 0; tree = &r->m_tree; loop6: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_TFNDB , coll , &list , startKey , endKey , minRecSizes , true , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false )){// err correction? 
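// with threads disabled and a NULL callback, msg5 must complete
// synchronously and return true; a false return means it tried to go
// asynchronous, which this single-threaded scan cannot handle, so bail.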
log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) return; // something to log scanned += list.getListSize(); if ( scanned >= 100000000 ) { count += scanned; scanned = 0; logf(LOG_INFO,"db: Scanned %lli bytes. Scanned %lli records. " "Removed %lli records.",count,recs,removed); } // loop over entries in list for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { recs++; if ( (recs & ymask) == 0x00 ) sched_yield(); key_t k = list.getCurrentKey(); // skip deletes if ( (k.n0 & 0x01) == 0x00 ) continue; unsigned long long d = g_tfndb.getDocId(&k); // see if docid is in delete list long n = (unsigned long)d & mask; while ( slots[n] && slots[n] != d ) if ( ++n >= numSlots ) n = 0; // skip him if we should not delete him if ( slots[n] != d ) continue; // otherwise, remove him // make him a delete, turn off his last bit (the del bit) k.n0 &= 0xfffffffffffffffeLL; if ( ! r->addRecord ( collnum , (char *)&k , NULL , 0 , 0) ) { log("db: Could not delete record."); return; } removed++; // dump tree? if ( tree->getNumAvailNodes() <= 0 ) { // this should block r->dumpTree(0); } } startKey = *(key_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey >= *(key_t *)list.getLastKey() ) goto loop6; logf(LOG_INFO,"db: Scanned %lli bytes. Scanned %lli records. " "Removed %lli records.",count+scanned,recs,removed); logf(LOG_INFO,"db: Finished removing docids from tfndb. Saving."); r->close ( NULL , NULL , false , false ); return; } /* // . also makes checksumdb // . g_hostdb.m_hostId should be set correctly bool fixTfndb ( char *coll ) { // get the list of tfns g_titledb.init(); //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; g_tfndb.init (); g_collectiondb.init(true); // isDump? g_tfndb.addColl ( coll ); g_titledb.addColl ( coll ); RdbBase *base = getRdbBase ( RDB_TITLEDB , coll ); long nf = base->getNumFiles(); key_t startKey ; key_t endKey ; startKey.setMin(); endKey.setMax(); // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; Msg5 msg5; RdbList list; BigFile *f = NULL; RdbMap *m = NULL; long long offset = 0LL; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_TFNDB , coll , &list , startKey , endKey , minRecSizes , false , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false )){// err correction? log(LOG_LOGIC,"db: getList did not block."); exit(-1); } // all done if empty if ( list.isEmpty() ) goto done; // create new tfndb*.dat file to hold the negative keys if ( ! f ) { RdbBase *base = getRdbBase ( RDB_TFNDB , coll ); long fn = base->addNewFile ( -1 ); // id2 if ( fn < 0 ) { log("fixtfndb: Failed to create new file for " "tfndb."); exit(-1); } f = base->m_files [ fn ]; m = base->m_maps [ fn ]; f->open ( O_RDWR | O_CREAT | O_EXCL , NULL ); log(LOG_INFO,"fixtfndb: writing fixes to %s",f->getFilename()); } // loop over entries in list for ( list.resetListPtr() ; ! 
list.isExhausted() ; list.skipCurrentRecord() ) { // get the tfn key_t k = list.getCurrentKey(); long tfn = g_tfndb.getTitleFileNum ( k ); if ( tfn == 255 ) continue; // skip if negative if ( (k.n0 & 0x01LL) == 0x00 ) continue; long i = 0; for ( ; i < nf ; i++ ) if ( base->m_fileIds2[i] == tfn ) break; if ( i < nf ) continue; // does not correspond to a tfn, remove it long long docId = g_tfndb.getDocId ( k ); long e = g_tfndb.getExt ( k ); long clean = 0 ; if ( g_tfndb.isClean ( k ) ) clean= 1; long half = 0 ; if ( k.n0 & 0x02 ) half = 1; char *dd = "" ; if ( (k.n0 & 0x01) == 0 ) dd =" (del)"; fprintf(stdout, "%016llx docId=%012lli " "e=0x%08lx tfn=%03li clean=%li half=%li %s\n", k.n0,docId,e,tfn,clean,half,dd); // make negative k.n0 &= 0xfffffffffffffffeLL; f->write ( &k , sizeof(key_t) , offset ); offset += sizeof(key_t); //m->addRecord ( k , NULL , 0 ); } startKey = *(key_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey >= *(key_t *)list.getLastKey() ) goto loop; done: if ( ! f ) return true; // write map //m->writeMap(); f->close(); exit(1); return true; } */ // . diff with indexdb in sync/ dir // . returns false if they differ, true otherwise /* bool syncIndexdb ( ) { // open indexdb in sync/ dir Indexdb idb; // temporarily set the working dir char newdir [ 256 ]; sprintf ( newdir , "%s/sync", g_hostdb.m_dir ); char olddir [ 256 ]; strcpy ( olddir , g_hostdb.m_dir ); strcpy ( g_hostdb.m_dir , newdir ); // init the second indexdb with this new directory if ( ! idb.init() ) return false; //if ( ! idb.addColl ( "main" ) ) return false; // restore working dir strcpy ( g_hostdb.m_dir , olddir ); // count diffs long long count = 0; // always block g_threads.disableThreads(); // reset some stuff key_t nextKey; nextKey.setMin(); RdbList ilist1; RdbList ilist2; ilist1.reset(); ilist2.reset(); // now read list from sync dir, and make sure in old dir loop: key_t endKey; endKey.setMax(); // always clear last bit of g_nextKey nextKey.n0 &= 0xfffffffffffffffeLL; // announce startKey log("db: next k.n1=%08lx n0=%016llx",nextKey.n1,nextKey.n0); // a niceness of 0 tells it to block until it gets results!! Msg5 msg5; if ( ! msg5.getList ( RDB_INDEXDB , coll1 , &ilist1 , nextKey , endKey , // should be maxed! 1024*1024 , // min rec sizes true , // include tree? false , // includeCache false , // addToCache 0 , // startFileNum -1 , // m_numFiles NULL , // state NULL , // callback 0 , // niceness true ))// do error correction? return log(LOG_LOGIC,"db: getList did not block."); if ( ! msg5.getList ( RDB_INDEXDB , coll2 , &ilist2 , nextKey , endKey , // should be maxed! 1024*1024 , // min rec sizes true , // include tree? false , // includeCache false , // addToCache 0 , // startFileNum -1 , // m_numFiles NULL , // state NULL , // callback 0 , // niceness true ))// do error correction? return log(LOG_LOGIC,"db: getList did not block."); // get last keys of both key_t last1 ; key_t last2 ; if ( ! ilist1.isEmpty() ) last1 = ilist1.getLastKey(); else last1.setMax(); if ( ! 
ilist2.isEmpty() ) last2 = ilist2.getLastKey(); else last2.setMax(); // get the min key_t min = last1; if ( min > last2 ) min = last2; // now compare the two lists iloop: key_t k1; key_t k2; // skip if both empty if ( ilist1.isExhausted() && ilist2.isExhausted() ) goto done; // if one list is exhausted before the other, dump his keys if ( ilist1.isExhausted() ) k1.setMax(); else k1 = ilist1.getCurrentKey(); if ( ilist2.isExhausted() ) k2.setMax(); else k2 = ilist2.getCurrentKey(); // if different report it if ( k1 < k2 ) { log("db: sync dir has k.n1=%08lx n0=%016llx",k1.n1,k1.n0); ilist1.skipCurrentRecord(); count++; goto iloop; } else if ( k2 < k1 ) { log("db: orig dir has k.n1=%08lx n0=%016llx",k2.n1,k2.n0); ilist2.skipCurrentRecord(); count++; goto iloop; } if ( ! ilist1.isExhausted() ) ilist1.skipCurrentRecord(); if ( ! ilist2.isExhausted() ) ilist2.skipCurrentRecord(); goto iloop; done: // if both lists were completely empty, we're done if ( ilist1.isEmpty() && ilist2.isEmpty() ) { log("db: *-*-*-* found %li discrepancies",count); g_threads.enableThreads(); return (count==0); } // advance nextKey to get next pair of lists nextKey = min; nextKey += (unsigned long)1; // start it all over again goto loop; } */ /* // generates clusterdb from titledb bool makeClusterdb ( char *coll ) { key_t nextKey; key_t endKey; RdbList list; RdbList rlist; Msg5 msg5; Msg5 msg5b; long minRecSizes = 1024*1024; //long minRecSizes = 32*1024; unsigned long count = 0; // make sure the files are clean BigFile f; f.set ( g_hostdb.m_dir , "clusterdb-saved.dat"); if ( f.doesExist() ) { log("db: %sclusterdb-saved.dat exists. " "Not generating clusterdb.", g_hostdb.m_dir); return false; } f.set ( g_hostdb.m_dir , "clusterdb0001.dat"); if ( f.doesExist() ) { log("db: %sclusterdb0001.dat exists. Not generating clusterdb.", g_hostdb.m_dir); return false; } // turn off threads g_threads.disableThreads(); // log the start log("db: Generating clusterdb for Collection %s.", coll); // how many are we processing? log("db: makeclusterdb: processing %li urls", g_titledb.getLocalNumDocs()); // reset some stuff nextKey.n1 = 0; nextKey.n0 = 0; endKey.setMax(); rlist.set ( NULL, 0, NULL, 0, 0, false, true ); loop: list.reset(); // always clear last bit of g_nextKey nextKey.n0 &= 0xfffffffffffffffeLL; //long long startTime = gettimeofdayInMilliseconds(); // a niceness of 0 tells it to block until it gets results!! bool status = msg5.getList ( RDB_TITLEDB , coll , &list , nextKey , endKey , // should be maxed! minRecSizes , // get this many bytes of rec true , // include tree? false , // includeCache false , // addToCache 0 , // startFileNum -1 , // m_numFiles NULL , // state NULL , // callback 0 , // niceness true , // do error correction? NULL , // cache key 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &msg5b ); if ( ! status ) { log("db: critical error. msg5 did a non-blocking call"); exit(-1); } // close up if no titleRec if ( list.isEmpty() ) { log ( LOG_INFO, "db: Added %li files to clusterdb.", count); log ( LOG_INFO, "db: All done generating clusterdb. Saving files."); // force tree dump to disk g_clusterdb.getRdb()->dumpTree(0); // dump trees we did g_clusterdb.getRdb()->close ( NULL, NULL, true, false ); g_threads.enableThreads(); return true; } list.resetListPtr(); rlist.reset(); listLoop: if ( list.isExhausted() ) { // . add our list to rdb if ( ! 
g_clusterdb.getRdb()->addList ( coll, &rlist ) ) { log ( "db: clusterdb addList had error: %s", mstrerror(g_errno) ); return false; } goto loop; } // advance g_nextKey to get next titleRec nextKey = list.getCurrentKey(); nextKey += 1; // advance one if positive, must always start on a negative key if ( (nextKey.n0 & 0x01) == 0x01 ) nextKey += (unsigned long)1; // get raw rec from list char *rec = list.getCurrentRec(); long recSize = list.getCurrentRecSize(); // set the titleRec we got TitleRec oldtr ; if ( ! oldtr.set ( rec , recSize , false ) ) {// own data? log("db: error setting titleRec! skipping." ); list.skipCurrentRecord(); goto listLoop; } Url *url = oldtr.getUrl(); // log the url //if ( count % 1000 == 0 ) //log(LOG_INFO, "%li) %s %li", // count,url->getUrl(),oldtr.getContentLen()); count++; // make a cluster rec char crec [ CLUSTER_REC_SIZE ]; g_clusterdb.makeRecFromTitleRec ( crec , &oldtr, false ); //g_clusterdb.makeRecFromTitleRecKey ( crec , // rec, // false ); rlist.addRecord ( crec, 0, NULL ); long nLinkTexts = oldtr.getLinkInfo()->getNumInlinks(); if ( nLinkTexts > 10 ) log ( LOG_INFO, "db: %s (%li links)", url->getUrl(), nLinkTexts ); if ( count % 10000 == 0 ) log(LOG_INFO, "db: %li) %lx %llx", count, ((key_t*)crec)->n1, ((key_t*)crec)->n0); // set startKey, endKey //key_t key1 = *(key_t *)crec; //key_t key2 = key1; // add to our g_clusterdb //rlist.set ( crec , // CLUSTER_REC_SIZE , // crec , // CLUSTER_REC_SIZE , // key1 , // key2 , // CLUSTER_REC_SIZE - 12 , // false , // own data? // true );// use half keys? // . add our list to rdb //if ( ! g_clusterdb.getRdb()->addList ( coll, &rlist ) ) { // log ( "db: clusterdb addList had error: %s", // mstrerror(g_errno) ); // return false; //} list.skipCurrentRecord(); goto listLoop; //goto loop; } */ // forces add the hash of the date meta tag into a range for every rec /* bool genDateRange ( char *coll ) { key_t nextKey; key_t endKey; RdbList list; RdbList rlist; Msg5 msg5; Msg5 msg5b; Msg1 msg1; long minRecSizes = 1024*1024; //long minRecSizes = 32*1024; unsigned long count = 0; unsigned long long addSize = 0; // turn off threads g_threads.disableThreads(); // log the start log("db: Generating date range index for Collection %s.", coll); // how many are we processing? log("db: genDateRange: processing %li urls", g_titledb.getLocalNumDocs()); // get site rec 16 for hashing date range ?? SiteRec sr; sr.m_xml = g_tagdb.getSiteXml ( 16, coll, gbstrlen(coll) ); // reset some stuff nextKey.n1 = 0; nextKey.n0 = 0; endKey.setMax(); rlist.set ( NULL, 0, NULL, 0, 0, false, true ); loop: list.reset(); // always clear last bit of g_nextKey nextKey.n0 &= 0xfffffffffffffffeLL; //long long startTime = gettimeofdayInMilliseconds(); // a niceness of 0 tells it to block until it gets results!! bool status = msg5.getList ( RDB_TITLEDB , coll , &list , nextKey , endKey , // should be maxed! minRecSizes , // get this many bytes of rec true , // include tree? false , // includeCache false , // addToCache 0 , // startFileNum -1 , // m_numFiles NULL , // state NULL , // callback 0 , // niceness true , // do error correction? NULL , // cache key 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &msg5b ); if ( ! status ) { log("db: critical error. msg5 did a non-blocking call"); exit(-1); } // close up if no titleRec if ( list.isEmpty() ) { // FOR SMALL TEST ONLY!! //if ( list.isEmpty() || count > 500 ) { //log ( LOG_INFO, "db: THIS WAS ONLY A TEST OF 500 RECS!" 
); log ( LOG_INFO, "db: Generated date range for %li TitleRecs.", count); log ( LOG_INFO, "db: All done generating date range. Saving files. " "(%llu)", addSize ); // dump trees we did // force tree dump to disk g_indexdb.getRdb()->dumpTree(0); g_indexdb.getRdb()->close ( NULL, NULL, true, false ); g_threads.enableThreads(); return true; } list.resetListPtr(); rlist.reset(); listLoop: if ( list.isExhausted() ) { goto loop; } // advance g_nextKey to get next titleRec nextKey = list.getCurrentKey(); nextKey += 1; // advance one if positive, must always start on a negative key if ( (nextKey.n0 & 0x01) == 0x01 ) nextKey += (unsigned long)1; // get raw rec from list char *rec = list.getCurrentRec(); //long listSize = list.getListSize (); long recSize = list.getCurrentRecSize(); // set the titleRec we got TitleRec oldtr ; if ( ! oldtr.set ( rec , recSize , false ) ) { // own data? log("gotList: error setting titleRec! skipping." ); goto loop; } // log the url Url *url = oldtr.getUrl(); if ( count % 10000 == 0 ) log(LOG_INFO, "%li) %s %li", count,url->getUrl(),oldtr.getContentLen()); count++; // use XmlDoc and TermTable to hash the date range TermTable tt; XmlDoc xmlDoc; xmlDoc.set(&oldtr, &sr, NULL, 0); xmlDoc.hashDate ( &tt, &oldtr, &sr ); // dump the term table into an index list IndexList indexList; IndexList newDateList; unsigned long long chksum1; indexList.set ( &tt, oldtr.getDocId(), NULL, &newDateList, 0, NULL, &chksum1 , 0 ); // niceness addSize += indexList.getListSize(); // . add our list to rdb if ( ! g_indexdb.getRdb()->addList ( coll, &indexList ) ) { log ( "db: indexdb addList had error: %s", mstrerror(g_errno) ); return false; } // go to the next titlerec list.skipCurrentRecord(); goto listLoop; } */ static int keycmp(const void *, const void *); int keycmp ( const void *p1 , const void *p2 ) { // returns 0 if equal, -1 if p1 < p2, +1 if p1 > p2 if ( *(key_t *)p1 < *(key_t *)p2 ) return -1; if ( *(key_t *)p1 > *(key_t *)p2 ) return 1; return 0; } /* bool matchertest ( int argc, char* argv[] ) { const int iterCompile = 10000; int numTerms = -1; // find -- separator for (int i = 0; i < argc; i++) { if (strcmp(argv[i], "--") == 0) { numTerms = i; break; } } if (numTerms == -1) return false; MatchTerm terms[numTerms]; for (int i = 0; i < numTerms; i++) { terms[i].m_term = (uint8_t*) argv[i]; terms[i].m_termSize = gbstrlen(argv[i]); } // -------------------------------------------------------------------- // do times compiles of various types struct timeval tv; // -------------------------------------------------------------------- gettimeofday(&tv, NULL); uint64_t tBMMStart = tv.tv_sec * 1000000 + tv.tv_usec; for (int i = 0; i < iterCompile; i++) { BitMatrixMatcher matcher; matcher.Compile(terms, numTerms, false); if (!matcher.Ready()) { fprintf(stderr, "BitMatrixMatcher compile\n"); return false; } } gettimeofday(&tv, NULL); uint64_t tBMMElapsed = (tv.tv_sec * 1000000 + tv.tv_usec) - tBMMStart; fprintf(stderr, "STAT %24s %6llduS Compile/Free\n", "BitMatrixMatcher", tBMMElapsed / iterCompile); // -------------------------------------------------------------------- gettimeofday(&tv, NULL); uint64_t tSATMStart = tv.tv_sec * 1000000 + tv.tv_usec; for (int i = 0; i < iterCompile; i++) { SmallAsciiTrieMatcher matcher; matcher.Compile(terms, numTerms, false); if (!matcher.Ready()) { fprintf(stderr, "SmallAsciiTrieMatcher compile\n"); return false; } } gettimeofday(&tv, NULL); uint64_t tSATMElapsed = (tv.tv_sec * 1000000 + tv.tv_usec) - tSATMStart; fprintf(stderr, "STAT %24s %6llduS 
Compile/Free\n", "SmallAsciiTrieMatcher", tSATMElapsed / iterCompile); // -------------------------------------------------------------------- gettimeofday(&tv, NULL); uint64_t tMBTMStart = tv.tv_sec * 1000000 + tv.tv_usec; for (int i = 0; i < iterCompile; i++) { MediumBinaryTrieMatcher matcher; matcher.Compile(terms, numTerms, false); if (!matcher.Ready()) { fprintf(stderr, "MediumBinaryTrieMatcher compile\n"); return false; } } gettimeofday(&tv, NULL); uint64_t tMBTMElapsed = (tv.tv_sec * 1000000 + tv.tv_usec) - tMBTMStart; fprintf(stderr, "STAT %24s %6llduS Compile/Free\n", "MediumBinaryTrieMatcher", tMBTMElapsed / iterCompile); // -------------------------------------------------------------------- gettimeofday(&tv, NULL); uint64_t tMMStart = tv.tv_sec * 1000000 + tv.tv_usec; for (int i = 0; i < iterCompile; i++) { MatrixMatcher matcher; matcher.Compile(terms, numTerms, false); if (!matcher.Ready()) { fprintf(stderr, "MatrixMatcher compile\n"); return false; } } gettimeofday(&tv, NULL); uint64_t tMMElapsed = (tv.tv_sec * 1000000 + tv.tv_usec) - tMMStart; fprintf(stderr, "STAT %24s %6llduS Compile/Free\n", "MatrixMatcher", tMMElapsed / iterCompile); // -------------------------------------------------------------------- // get contents of each file into memory argv += (numTerms + 1); argc -= (numTerms + 1); int numFiles = argc; uint8_t* content[numFiles]; uint32_t len[numFiles]; for (int i = 0; i < numFiles; i++) { FILE *pf = fopen(argv[i], "rb"); if (pf == NULL) { fprintf(stderr, "unable to open '%s'\n", argv[i]); return false; } struct stat sb; if (fstat(fileno(pf), &sb) != 0) { fprintf(stderr, "unable to stat '%s'\n", argv[i]); return false; } len[i] = sb.st_size; content[i] = (uint8_t*) mmalloc(len[i], "file"); if (content == NULL) { fprintf(stderr, "unable to malloc '%s'\n", argv[i]); return false; } if (fread(content[i], len[i], 1, pf) != 1) { fprintf(stderr, "unable to fread '%s'\n", argv[i]); return false; } fclose(pf); } // -------------------------------------------------------------------- // compile a matcher of each type BitMatrixMatcher matcherBMM; matcherBMM.Compile(terms, numTerms, false); //matcherBMM.Dump(); SmallAsciiTrieMatcher matcherSATM; matcherSATM.Compile(terms, numTerms, false); //matcherSATM.Dump(); MediumBinaryTrieMatcher matcherMBTM; matcherMBTM.Compile(terms, numTerms, false); //matcherMBTM.Dump(); MatrixMatcher matcherMM; matcherMM.Compile(terms, numTerms, false); //matcherMM.Dump(); const int numMatchers = 4; Matcher* matchers[numMatchers] = { &matcherBMM, &matcherSATM, &matcherMBTM, &matcherMM, }; char* matcherNames[numMatchers] = { "BitMatrixMatcher", "SmallAsciiTrieMatcher", "MediumBinaryTrieMatcher", "MatrixMatcher" }; // -------------------------------------------------------------------- // perform matching on each file using each type of matcher const int iterExec = 1000; for (int fileix = 0; fileix < numFiles; fileix++) { for (int matcherix = 0; matcherix < numMatchers; matcherix++) { int hits = 0; gettimeofday(&tv, NULL); uint64_t tStart = tv.tv_sec * 1000000 + tv.tv_usec; for (int iter = 0; iter < iterExec; iter++) { hits = 0; const uint8_t* icursor = content[fileix]; const uint8_t* iend = icursor + len[fileix]; Matcher* matcher = matchers[matcherix]; uint16_t termNum; while (icursor < iend) { icursor = matcher->Exec(icursor, iend - icursor, &termNum); hits++; if (icursor == NULL) break; //fprintf(stderr, "hit: %s\n", // terms[termNum].m_term); icursor += terms[termNum].m_termSize; } } gettimeofday(&tv, NULL); uint64_t tElapsed = (tv.tv_sec * 
1000000 + tv.tv_usec) - tStart; fprintf(stderr,"STAT %24s %6llduS %4dKB %4d hits %s\n", matcherNames[matcherix], tElapsed / iterExec, len[fileix] / 1024, hits - 1, argv[fileix]); } } return true; } bool trietest ( ) { //TrieMatcher matcher; MatrixMatcher matcher; MatchTerm terms[3]; terms[0].m_term = (uint8_t*) "jackie"; terms[0].m_termSize = 6; terms[1].m_term = (uint8_t*) "jack"; terms[1].m_termSize = 4; terms[2].m_term = (uint8_t*) "sandi"; terms[2].m_termSize = 5; matcher.Compile(terms, 3, false); matcher.Dump(); uint16_t numTerm; const uint8_t* pos; #define STRING (uint8_t*) "this is jAck's test for Sandi's enjoyment" for (int i = 0; i < 10; i++) { pos = matcher.Exec(STRING, gbstrlen((char*) STRING), &numTerm); if (pos != NULL) { fprintf(stderr, "term[%d] '%s' -> %s\n", numTerm, terms[numTerm].m_term, pos); pos += gbstrlen((char*) terms[numTerm].m_term); pos = matcher.Exec(pos, gbstrlen((char*) pos), &numTerm); if (pos != NULL) { fprintf(stderr, "term[%d] '%s' -> %s\n", numTerm, terms[numTerm].m_term, pos); pos += gbstrlen((char*) terms[numTerm].m_term); pos = matcher.Exec(pos, gbstrlen((char*) pos), &numTerm); if (pos != NULL) exit(1); } } } return false; } */ /* bool gbgzip (char *filename) { File f; File w; char outfile[1024]; *(outfile + snprintf(outfile,1023,"%s.gz",filename)) = '\0'; f.set (".",filename); w.set (".",outfile); if ( f.doesExist() && f.open ( O_RDONLY ) && w.open ( O_RDWR | O_CREAT )) {} else return log("FATAL: could not open " "file for reading:%s", filename); g_conf.m_maxMem = 2000000000LL; g_mem.m_maxMem = 2000000000LL; long long fileSize = f.getFileSize(); if(g_conf.m_maxMem < fileSize) return log("FATAL: file too large:%s", filename); char* srcbuf = (char*)mmalloc(fileSize,"gzip src"); long long dstbufSize = (long long)(fileSize*1.001 + 32); char* dstbuf = (char*)mmalloc(dstbufSize,"gzip dst"); if(srcbuf == NULL || dstbuf == NULL) return log("FATAL: file too large:%s, out of memory.", filename); long unsigned int written = dstbufSize; f.read ( srcbuf , fileSize , 0); long err = gbcompress( (unsigned char*)dstbuf , &written, (unsigned char*)srcbuf , (uint32_t)fileSize , ET_GZIP); if(written == 0 || err != Z_OK) if ( err == Z_BUF_ERROR ) return log("FATAL: could not write file srclen=%lli, " "dstlen=0, %s%s", fileSize, mstrerror(g_errno), err == Z_BUF_ERROR?", buffer too small":""); w.write ( dstbuf , written , 0); sync(); // f.flush ( ); return true; } bool gbgunzip (char *filename) { //make the output filename: char outfile[1024]; long filenamelen = gbstrlen(filename); long outfilelen = filenamelen - 3; if(strcmp(filename + outfilelen, ".gz") != 0) return log("FATAL: could not open " "file, not a .gz:%s", filename); memcpy(outfile, filename, outfilelen); outfile[outfilelen] = '\0'; //open our input and output files right away File f; File w; f.set (filename); w.set (outfile); if ( f.doesExist() && f.open ( O_RDONLY ) && w.open ( O_RDWR | O_CREAT )) {} else return log("FATAL: could not open " "file for reading:%s", filename); g_conf.m_maxMem = 2000000000LL; g_mem.m_maxMem = 2000000000LL; long long fileSize = f.getFileSize(); if(g_conf.m_maxMem < fileSize) return log("FATAL: file too large:%s", filename); char* srcbuf = (char*)mmalloc(fileSize,"gzip src"); if(srcbuf == NULL) return log("FATAL: file too large:%s, out of memory.", filename); f.read ( srcbuf , fileSize , 0); long dstbufSize = getGunzippedSize(srcbuf,fileSize); char* dstbuf = (char*)mmalloc(dstbufSize,"gzip dst"); if(dstbuf == NULL) return log("FATAL: file too large:%s, out of memory.", filename); 
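// note: getGunzippedSize() presumably reads the gzip trailer's ISIZE field
// (uncompressed length mod 2^32, stored in the last 4 bytes of the stream),
// so dstbuf should be sized exactly; gbuncompress below fails with
// Z_BUF_ERROR if the trailer understates the true size.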
long unsigned int written = dstbufSize; long err = gbuncompress( (unsigned char*)dstbuf , &written , (unsigned char*)srcbuf , (uint32_t)fileSize); if(written == 0 || err != Z_OK) if ( err == Z_BUF_ERROR ) return log("FATAL: could not write file srclen=%lli, " "dstlen=0, %s%s", fileSize, mstrerror(g_errno), err == Z_BUF_ERROR?", buffer too small":""); w.write ( dstbuf , written , 0); sync(); // f.flush ( ); return true; } */ // time speed of inserts into RdbTree for indexdb bool bucketstest ( char* dbname ) { g_conf.m_maxMem = 2000000000LL; // 2G g_mem.m_maxMem = 2000000000LL; // 2G if ( dbname ) { char keySize = 16; if(strcmp(dbname, "indexdb") == 0) keySize = 12; RdbBuckets rdbb; rdbb.set (0, LONG_MAX , false ,//own data "buckets-test", RDB_INDEXDB, false , //data in ptrs "TestBuckets" , keySize , false ); rdbb.loadBuckets ( dbname ); if(!rdbb.selfTest(true/*testall*/, false/*core*/)) if(!rdbb.repair()) log("db: unrepairable buckets."); return 0; } char oppKey[MAX_KEY_BYTES]; RdbBuckets rdbb; char keySize = 12; rdbb.set (0, LONG_MAX , false ,//own data "buckets-test", RDB_INDEXDB, false , //data in ptrs "TestBuckets" , keySize , false ); long numKeys = 1000000; log("db: speedtest: generating %li random keys.",numKeys); // seed randomizer srand ( (long)gettimeofdayInMilliseconds() ); // make list of one million random keys char *k = (char*)mmalloc ( keySize * numKeys , "main" ); if ( ! k ) return log("speedtest: malloc failed"); long *r = (long *)k; long ksInLongs = keySize / 4; for ( long i = 0 ; i < numKeys * ksInLongs ; i++ ) { r[i] = rand();// % 2000; } for ( long i = 0 ; i < 1000 ; i++ ) { long j = (rand() % numKeys) * keySize; long m = (rand() % numKeys) * keySize; memcpy((char*)&k[j], (char*)&k[m], keySize); KEYXOR((char*)&k[j],0x01); } // init the tree RdbTree rt; if ( ! rt.set ( 0 , // fixedDataSize , numKeys + 1000 , // maxTreeNodes , false , // isTreeBalanced , numKeys * 32 , // maxTreeMem , false , "tree-test" , false , "TestTree" , keySize) ) // own data? 
return log("speedTest: tree init failed."); // add to regular tree long long t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < numKeys * keySize; i += keySize ) { char* key = k+i; KEYSET(oppKey,key,keySize); KEYXOR(oppKey,0x01); long n; n = rt.getNode ( 0, oppKey ); if ( n >= 0 ) { rt.deleteNode ( n , true ); } if ( rt.addNode ( 0, key , NULL , 0 ) < 0 ) return log("speedTest: rdb tree addNode " "failed"); } // print time it took long long e = gettimeofdayInMilliseconds(); log("db: added %li keys to rdb tree in %lli ms, " "now have %li keys",numKeys,e - t, rt.getNumUsedNodes()); for ( long i = 0 ; i < numKeys * keySize; i+=keySize ) { char* key = k+i; //if ( k[i].n1 == 1234567 ) // fprintf(stderr,"i=%li\n",i); if ( rdbb.addNode ( 0,key , NULL , 0 ) < 0 ) return log("speedTest: rdb buckets addNode " "failed"); } rdbb.testAndRepair(); t = gettimeofdayInMilliseconds(); log("db: added %li keys to rdb buckets in %lli ms, " "now have %li keys, mem used: %li", numKeys,t - e, rdbb.getNumKeys(),rdbb.getMemOccupied()); rdbb.selfTest(true, true); log("db: saving and loading buckets."); e = gettimeofdayInMilliseconds(); rdbb.fastSave ( ".", false, NULL, NULL); t = gettimeofdayInMilliseconds(); log("db: saved rdbbuckets in %lli ms",t - e); //rdbb.setNeedsSave(false); rdbb.clear(); e = gettimeofdayInMilliseconds(); rdbb.loadBuckets ( "TestBuckets" ); t = gettimeofdayInMilliseconds(); log("db: loaded rdbbuckets in %lli ms", t - e); rdbb.selfTest(true, true); //now test loading a tree, the keys will be sorted, so this // is the worst case performance. RdbBuckets rdbb2; rdbb2.set (0, 10000000 , false ,//own data "buckets-test", RDB_INDEXDB, false , //data in ptrs "TestBuckets" , keySize , false ); rdbb2.addTree (&rt); rdbb2.selfTest(true, true); rdbb2.setNeedsSave(false); //now test finding of individual keys long tests = numKeys * 2; log("db: Testing retrival of %li individual keys",tests ); long long ttook = 0; long long btook = 0; long tgot = 0; long bgot = 0; long found = 0; for ( long i = 0 ; i < 0; i++ ) { long j = (rand() % numKeys) * keySize; e = gettimeofdayInMilliseconds(); long nodeNum = rt.getNode ( 0 , (char*)&k[j]); t = gettimeofdayInMilliseconds(); ttook += t - e; e = gettimeofdayInMilliseconds(); char* foundKey = rdbb.getKeyVal ( 0 , (char*)&k[j], NULL, NULL); t = gettimeofdayInMilliseconds(); btook += t - e; if(nodeNum == -1) { if(foundKey == NULL) { continue; } log("speedTest: node not found in tree, but found in buckets! " "looked up %016llx%08lx, got %016llx%08lx", *(long long*)((char*)&k[j]+(sizeof(long))), *(long*)(char*)&k[j], *(long long*)(foundKey+(sizeof(long))), *(long*)foundKey); rdbb.printBuckets(); char* xx = NULL; *xx = 0; } if(foundKey == NULL) { if(nodeNum == -1) { continue; } log("speedTest: node not found in buckets, but found in tree! 
" "%016llx%08lx", *(long long*)((char*)&k[j]+(sizeof(long))), *(long*)(char*)&k[j]); rdbb.printBuckets(); char* xx = NULL; *xx = 0; } found++; } log("db: found %li keys from rdbtree in %lli ms",found, ttook); log("db: found %li keys from rdbbuckets in %lli ms",found, btook); // sort the list of keys t = gettimeofdayInMilliseconds(); gbsort ( k , numKeys , sizeof(key_t) , keycmp ); // print time it took e = gettimeofdayInMilliseconds(); log("db: sorted %li in %lli ms",numKeys,e - t); tests = 100; log("db: Testing retrival of a list of keys, %li random ranges", tests); RdbList treelist; RdbList bucketlist; RdbList list; long numPosRecs; long numNegRecs; char *tmpkey1; char *tmpkey2; char key1 [ MAX_KEY_BYTES ]; char key2 [ MAX_KEY_BYTES ]; long minRecSizes = 10000000; //long minRecSizes = -1; for ( long i = 0 ; i < tests; i++ ) { // long startKey = rand() % numKeys; // long endKey = (rand() % (numKeys - startKey)) + startKey; for ( long x = 0 ; x < MAX_KEY_BYTES; x++ ) { key1[x] = rand(); key2[x] = rand(); } char* skey; char* ekey; if ( KEYCMP(key1,key2,keySize) < 0 ) { skey = key1; ekey = key2; } else { skey = key2; ekey = key1; } e = gettimeofdayInMilliseconds(); rt.getList ( (collnum_t)0 , skey, ekey, minRecSizes, //min rec sizes &treelist, &numPosRecs, &numNegRecs, true ); //use half keys t = gettimeofdayInMilliseconds(); ttook += t - e; tgot += treelist.getNumRecs(); tmpkey1 = treelist.getStartKey(); tmpkey2 = treelist.getEndKey(); /* log(LOG_WARN, "db rdbtree found %li recs (%li pos, %li neg) " "between " "%016llx%08lx and " "%016llx%08lx. " "took %lli ms", treelist.getNumRecs(), numPosRecs, numNegRecs, *(long long*)(tmpkey1+(sizeof(long))), *(long*)tmpkey1, *(long long*)(tmpkey2+(sizeof(long))), *(long*)tmpkey2, t - e ); */ e = gettimeofdayInMilliseconds(); rdbb.getList ( (collnum_t)0 , skey, ekey, minRecSizes, //min rec sizes &bucketlist, &numPosRecs, &numNegRecs, true ); //use half keys t = gettimeofdayInMilliseconds(); btook += t - e; bgot += bucketlist.getNumRecs(); tmpkey1 = bucketlist.getStartKey(); tmpkey2 = bucketlist.getEndKey(); /* log(LOG_WARN, "db buckets found %li recs (%li pos, %li neg) " "between " "%016llx%08lx and " "%016llx%08lx. " "took %lli ms", bucketlist.getNumRecs(), numPosRecs, numNegRecs, *(long long*)(tmpkey1+(sizeof(long))), *(long*)tmpkey1, *(long long*)(tmpkey2+(sizeof(long))), *(long*)tmpkey2, t - e ); */ //check for consistency char tkey [ MAX_KEY_BYTES ]; char bkey [ MAX_KEY_BYTES ]; while(1) { if(treelist.isExhausted() ) { if(bucketlist.isExhausted() ) break; bucketlist.getCurrentKey(bkey); log(LOG_WARN, "db tree and buckets " "inconsistency" " remaining key in buckets is " "%016llx%08lx. ", *(long long*)(bkey+(sizeof(long))), *(long*)bkey); char* xx = NULL; *xx = 0; } else if (bucketlist.isExhausted() ) { treelist.getCurrentKey(tkey); log(LOG_WARN, "db tree and buckets " "inconsistency" " remaining key in tree is " "%016llx%08lx. ", *(long long*)(tkey+(sizeof(long))), *(long*)tkey); char* xx = NULL; *xx = 0; } treelist.getCurrentKey(tkey); bucketlist.getCurrentKey(bkey); if ( KEYCMP(tkey,bkey,keySize) != 0 ) { log(LOG_WARN, "db tree and buckets " "inconsistency " "%016llx%08lx and " "%016llx%08lx. ", *(long long*)(tkey+(sizeof(long))), *(long*)tkey, *(long long*)(bkey+(sizeof(long))), *(long*)bkey); char* xx = NULL; *xx = 0; } treelist.skipCurrentRecord(); bucketlist.skipCurrentRecord(); } } log("db: List retrieval successful. 
"); log("db: rdbtree took %lli ms for %li recs ", ttook, tgot); log("db: rdbbuckets took %lli ms for %li recs", btook, bgot); long long tAddTook = 0; long long bAddTook = 0; long long tgetListTook = 0; long long bgetListTook = 0; long long tdelListTook = 0; long long bdelListTook = 0; ttook = 0; btook = 0; tgot = 0; bgot = 0; minRecSizes = 200000; KEYSET(key1,KEYMIN(), keySize); KEYSET(key2,KEYMAX(), keySize); bool status = true; log("db: simulating dump, deleting entire list of keys"); while(rdbb.getNumKeys() > 0 && status) { status = rdbb.getList ( (collnum_t)0, key1 , KEYMAX() , minRecSizes, &list , &numPosRecs , &numNegRecs , false ); if(!status) {char* xx = NULL; *xx = 0;} if ( status && list.isEmpty() ) break; long numBefore = rdbb.getNumKeys(); rdbb.deleteList((collnum_t)0, &list); // if (KEYCMP(key2,key1,keySize) < 0) break; log("db: buckets now has %li keys. " "difference of %li, list size was %li. " "%016llx%08lx. ", rdbb.getNumKeys(), numBefore - rdbb.getNumKeys(), list.getNumRecs(), *(long long*)(key1+(sizeof(long))), *(long*)key1);; if(numBefore - rdbb.getNumKeys() != list.getNumRecs()) {char* xx = NULL; *xx = 0;} KEYSET(key2,key1,keySize); KEYSET(key1,list.getLastKey(),keySize); KEYADD(key1,1,keySize); } if(rdbb.getNumKeys() > 0) {char* xx = NULL; *xx = 0;} rdbb.setNeedsSave(false); rdbb.clear(); log("db: Testing retrival of a list of keys, %li random ranges " "interspersed with adds and deletes", numKeys); rt.clear(); rt.m_needsSave = false; for ( long i = 0 ; i < numKeys ; i++ ) { e = gettimeofdayInMilliseconds(); char* key = &k[i*keySize]; KEYSET(oppKey,key,keySize); KEYXOR(oppKey,0x01); long n; collnum_t collnum = rand() % 10; n = rt.getNode ( collnum , oppKey ); if ( n >= 0 ) rt.deleteNode ( n , true ); if ( rt.addNode (collnum, key, NULL , 0 ) < 0 ) return log("speedTest: rdb tree addNode " "failed"); t = gettimeofdayInMilliseconds(); tAddTook += t - e; e = gettimeofdayInMilliseconds(); if ( rdbb.addNode(collnum, key, NULL, 0 ) < 0 ) return log("speedTest: rdb buckets addNode " "failed"); t = gettimeofdayInMilliseconds(); bAddTook += t - e; if(i % 100 != 0) continue; char* skey; char* ekey; if(rand() % 2) { //check keys that exist long beg = (rand() % numKeys) * keySize; long end = (rand() % numKeys) * keySize; skey = (char*)&k[beg]; ekey = (char*)&k[end]; if(KEYCMP(skey,ekey,keySize) > 0) { skey = (char*)&k[end]; ekey = (char*)&k[beg]; } } else {//otherwise check keys that don't exist for ( long x = 0 ; x < MAX_KEY_BYTES; x++ ) { key1[x] = rand(); key2[x] = rand(); } if ( KEYCMP(key1,key2,keySize) < 0 ) { skey = key1; ekey = key2; } else { skey = key2; ekey = key1; } } e = gettimeofdayInMilliseconds(); rt.getList ( collnum, skey, ekey, minRecSizes, //min rec sizes &treelist, &numPosRecs, &numNegRecs, true ); //use half keys t = gettimeofdayInMilliseconds(); tgetListTook += t - e; tgot += treelist.getNumRecs(); if(!treelist.checkList_r(false, false, RDB_INDEXDB)) log("tree's list was bad"); tmpkey1 = treelist.getStartKey(); tmpkey2 = treelist.getEndKey(); if(treelist.getNumRecs() > 0) { log(LOG_WARN, "db inserted %li keys", i+1); log(LOG_WARN, "db rdbtree found %li recs (%li pos, " "%li neg) between " "%016llx%08lx and " "%016llx%08lx. 
" "took %lli ms, %lli ms so far", treelist.getNumRecs(), numPosRecs, numNegRecs, *(long long*)(tmpkey1+(sizeof(long))), *(long*)tmpkey1, *(long long*)(tmpkey2+(sizeof(long))), *(long*)tmpkey2, t - e ,tgetListTook ); } e = gettimeofdayInMilliseconds(); rdbb.getList ( collnum, skey, ekey, minRecSizes, //min rec sizes &bucketlist, &numPosRecs, &numNegRecs, true ); //use half keys t = gettimeofdayInMilliseconds(); bgetListTook += t - e; bgot += bucketlist.getNumRecs(); if(!bucketlist.checkList_r(false, false, RDB_INDEXDB)) log("bucket's list was bad"); tmpkey1 = bucketlist.getStartKey(); tmpkey2 = bucketlist.getEndKey(); if(treelist.getNumRecs() > 0) { log(LOG_WARN, "db buckets found %li recs (%li pos, " "%li neg) between " "%016llx%08lx and " "%016llx%08lx. " "took %lli ms, %lli ms so far.", bucketlist.getNumRecs(), numPosRecs, numNegRecs, *(long long*)(tmpkey1+(sizeof(long))), *(long*)tmpkey1, *(long long*)(tmpkey2+(sizeof(long))), *(long*)tmpkey2, t - e , bgetListTook); } //check for consistency treelist.resetListPtr(); bucketlist.resetListPtr(); char tkey [ MAX_KEY_BYTES ]; char bkey [ MAX_KEY_BYTES ]; while(1) { if(treelist.isExhausted() ) { if(bucketlist.isExhausted() ) break; bucketlist.getCurrentKey(bkey); log(LOG_WARN, "db tree and buckets " "inconsistency" " remaining key in buckets is " "%016llx%08lx. ", *(long long*)(bkey+(sizeof(long))), *(long*)bkey); char* xx = NULL; *xx = 0; } else if (bucketlist.isExhausted() ) { treelist.getCurrentKey(tkey); log(LOG_WARN, "db tree and buckets " "inconsistency" " remaining key in tree is " "%016llx%08lx. ", *(long long*)(tkey+(sizeof(long))), *(long*)tkey); char* xx = NULL; *xx = 0; } treelist.getCurrentKey(tkey); bucketlist.getCurrentKey(bkey); if ( KEYCMP(tkey,bkey,keySize) != 0 ) { log(LOG_WARN, "db tree and buckets " "inconsistency " "%016llx%08lx and " "%016llx%08lx. ", *(long long*)(tkey+(sizeof(long))), *(long*)tkey, *(long long*)(bkey+(sizeof(long))), *(long*)bkey); char* xx = NULL; *xx = 0; } treelist.skipCurrentRecord(); bucketlist.skipCurrentRecord(); } //continue; if(rand() % 100 != 0) continue; log("db: removing %li nodes from tree. " "tree currently has %li keys", treelist.getNumRecs(), rt.getNumUsedNodes ( )); e = gettimeofdayInMilliseconds(); rt.deleteList(collnum, &treelist, true); t = gettimeofdayInMilliseconds(); tdelListTook += t - e; log("db: Now tree has %li keys", rt.getNumUsedNodes()); log("db: removing %li nodes from buckets. " "buckets currently has %li keys", bucketlist.getNumRecs(), rdbb.getNumKeys(0)); e = gettimeofdayInMilliseconds(); rdbb.deleteList(collnum, &bucketlist); t = gettimeofdayInMilliseconds(); bdelListTook += t - e; log("db: Now buckets has %li keys", rdbb.getNumKeys(0)); } log("db: List retrieval successful. 
"); log("db: rdbtree Add %lli ms, GetList %lli ms, Delete %lli " "for %li recs ", tAddTook, tgetListTook, tdelListTook, tgot); log("db: rdbBuckets Add %lli ms, GetList %lli ms, Delete %lli " "for %li recs ", bAddTook, bgetListTook, bdelListTook, bgot); #if 0 // get the list key_t kk; kk.n0 = 0LL; kk.n1 = 0; //kk.n1 = 1234567; //long n = rt.getNextNode ( (collnum_t)0, (char *)&kk ); long n = rt.getFirstNode(); // loop it t = gettimeofdayInMilliseconds(); long count = 0; while ( n >= 0 ) { n = rt.getNextNode ( n ); count++; } e = gettimeofdayInMilliseconds(); log("db: getList for %li nodes in %lli ms",count,e - t); #endif rt.m_needsSave = false; rdbb.setNeedsSave(false); return true; } // time speed of inserts into RdbTree for indexdb bool treetest ( ) { long numKeys = 500000; log("db: speedtest: generating %li random keys.",numKeys); // seed randomizer srand ( (long)gettimeofdayInMilliseconds() ); // make list of one million random keys key_t *k = (key_t *)mmalloc ( sizeof(key_t) * numKeys , "main" ); if ( ! k ) return log("speedtest: malloc failed"); long *r = (long *)k; long size = 0; long first = 0; for ( long i = 0 ; i < numKeys * 3 ; i++ ) { if ( (i % 3) == 2 && first++ < 50000 ) { r[i] = 1234567; size++; } else r[i] = rand(); } // init the tree RdbTree rt; if ( ! rt.set ( 0 , // fixedDataSize , numKeys + 1000 , // maxTreeNodes , false , // isTreeBalanced , numKeys * 28 , // maxTreeMem , false , // own data? "tree-test" ) ) return log("speedTest: tree init failed."); // add to regular tree long long t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < numKeys ; i++ ) { //if ( k[i].n1 == 1234567 ) // fprintf(stderr,"i=%li\n",i); if ( rt.addNode ( (collnum_t)0 , k[i] , NULL , 0 ) < 0 ) return log("speedTest: rdb tree addNode " "failed"); } // print time it took long long e = gettimeofdayInMilliseconds(); log("db: added %li keys to rdb tree in %lli ms",numKeys,e - t); // sort the list of keys t = gettimeofdayInMilliseconds(); gbsort ( k , numKeys , sizeof(key_t) , keycmp ); // print time it took e = gettimeofdayInMilliseconds(); log("db: sorted %li in %lli ms",numKeys,e - t); // get the list key_t kk; kk.n0 = 0LL; kk.n1 = 0; kk.n1 = 1234567; long n = rt.getNextNode ( (collnum_t)0, (char *)&kk ); // loop it t = gettimeofdayInMilliseconds(); long count = 0; while ( n >= 0 && --first >= 0 ) { n = rt.getNextNode ( n ); count++; } e = gettimeofdayInMilliseconds(); log("db: getList for %li nodes in %lli ms",count,e - t); return true; } // time speed of inserts into RdbTree for indexdb bool hashtest ( ) { // load em up long numKeys = 1000000; log("db: speedtest: generating %li random keys.",numKeys); // seed randomizer srand ( (long)gettimeofdayInMilliseconds() ); // make list of one million random keys key_t *k = (key_t *)mmalloc ( sizeof(key_t) * numKeys , "main" ); if ( ! k ) return log("speedtest: malloc failed"); long *r = (long *)k; for ( long i = 0 ; i < numKeys * 3 ; i++ ) r[i] = rand(); // init the tree //HashTableT ht; HashTable ht; ht.set ( (long)(1.1 * numKeys) ); // add to regular tree long long t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < numKeys ; i++ ) if ( ! ht.addKey ( r[i] , 1 ) ) return log("hashtest: add key failed."); // print time it took long long e = gettimeofdayInMilliseconds(); // add times log("db: added %li keys in %lli ms",numKeys,e - t); // do the delete test t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < numKeys ; i++ ) if ( ! 
ht.removeKey ( r[i] ) ) return log("hashtest: remove key failed."); // print time it took e = gettimeofdayInMilliseconds(); // add times log("db: deleted %li keys in %lli ms",numKeys,e - t); return true; } // time speed of big write, read and the seeks bool thrutest ( char *testdir , long long fileSize ) { // always block g_threads.disableThreads(); // a read/write buffer of 30M long bufSize = 30000000; // 30M //long long fileSize = 4000000000LL; // 4G #undef malloc char *buf = (char *) malloc ( bufSize ); #define malloc coreme if ( ! buf ) return log("speedtestdisk: %s",strerror(errno)); // store stuff in there for ( long i = 0 ; i < bufSize ; i++ ) buf[i] = (char)i; BigFile f; // try a read test from speedtest*.dat* f.set (testdir,"speedtest"); if ( f.doesExist() ) { if ( ! f.open ( O_RDONLY ) ) return log("speedtestdisk: cannot open %s/%s", testdir,"speedtest"); // ensure big enough if ( f.getFileSize() < fileSize ) return log("speedtestdisk: File %s/%s is too small " "for requested read size.", testdir,"speedtest"); log("db: reading from speedtest0001.dat"); f.setBlocking(); goto doreadtest; } // try a read test from indexdb*.dat* f.set (testdir,"indexdb0001.dat"); if ( f.doesExist() ) { if ( ! f.open ( O_RDONLY ) ) return log("speedtestdisk: cannot open %s/%s", testdir,"indexdb0001.dat"); log("db: reading from indexdb0001.dat"); f.setBlocking(); goto doreadtest; } // try a write test to speedtest*.dat* f.set (testdir,"speedtest"); if ( ! f.doesExist() ) { if ( ! f.open ( O_RDWR | O_CREAT | O_SYNC ) ) return log("speedtestdisk: cannot open %s/%s", testdir,"speedtest"); log("db: writing to speedtest0001.dat"); f.setBlocking(); } // write fileSize bytes to the file, 30M at a time { long long t1 = gettimeofdayInMilliseconds(); long numLoops = fileSize / bufSize; long long off = 0LL; long next = 0; for ( long i = 0 ; i < numLoops ; i++ ) { f.write ( buf , bufSize , off ); sync(); // f.flush ( ); off += bufSize ; next += bufSize; //if ( i >= numLoops || next < 100000000 ) continue; if ( i + 1 < numLoops && next < 100000000 ) continue; next = 0; // print speed about every 100MB long long t2 = gettimeofdayInMilliseconds(); float mBps = (float)off / (float)(t2-t1) / 1000.0 ; fprintf(stderr,"wrote %lli bytes in %lli ms (%.1f MB/s)\n", off,t2-t1,mBps); } } doreadtest: { long long t1 = gettimeofdayInMilliseconds(); long numLoops = fileSize / bufSize; long long off = 0LL; long next = 0; for ( long i = 0 ; i < numLoops ; i++ ) { f.read ( buf , bufSize , off ); //sync(); // f.flush ( ); off += bufSize ; next += bufSize; //if ( i >= numLoops || next < 100000000 ) continue; if ( i + 1 < numLoops && next < 100000000 ) continue; next = 0; // print speed about every 100MB long long t2 = gettimeofdayInMilliseconds(); float mBps = (float)off / (float)(t2-t1) / 1000.0 ; fprintf(stderr,"read %lli bytes in %lli ms (%.1f MB/s)\n", off,t2-t1,mBps); } } return true; } // // SEEK TEST // #include // gettimeofday() #include #include //#include #include #include #include #include //static pthread_attr_t s_attr; //static int startUp ( void *state ) ; static void *startUp ( void *state , ThreadEntry *t ) ; static long s_count = 0; static long long s_filesize = 0; //static long s_lock = 1; static long s_launched = 0; //static int s_fd1 ; // , s_fd2; static BigFile s_f; static long s_numThreads = 0; static long long s_maxReadSize = 1; static long long s_startTime = 0; //#define MAX_READ_SIZE (2000000) #include #include void seektest ( char *testdir, long numThreads, long maxReadSize , char *filename ) { g_loop.init(); 
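// seektest: launch s_numThreads generic threads, each doing 100000 random
// preads of s_maxReadSize bytes from one big file (see startUp below), and
// report aggregate seek throughput as s_count * 1000.0 / elapsedMs. unlike
// thrutest, threads stay enabled here, since concurrent seeks are exactly
// what is being measured.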
g_threads.init(); s_numThreads = numThreads; s_maxReadSize = maxReadSize; if ( s_maxReadSize <= 0 ) s_maxReadSize = 1; //if ( s_maxReadSize > MAX_READ_SIZE ) s_maxReadSize = MAX_READ_SIZE; log(LOG_INIT,"admin: dir=%s threads=%li maxReadSize=%li file=%s\n", testdir,(long)s_numThreads, (long)s_maxReadSize , filename ); // maybe it's a filename in the cwd if ( filename ) { s_f.set(testdir,filename); if ( s_f.doesExist() ) { log(LOG_INIT,"admin: reading from %s.", s_f.getFilename()); goto skip; } log("admin: %s does not exist. Use ./gb thrutest ... " "to create speedtest* files.", s_f.getFilename()); return; } // check other defaults s_f.set ( testdir , "speedtest" ); if ( s_f.doesExist() ) { log(LOG_INIT,"admin: reading from speedtest*.dat."); goto skip; } // try a read test from indexdb*.dat* s_f.set (testdir,"indexdb0001.dat"); if ( s_f.doesExist() ) { log(LOG_INIT,"admin: reading from indexdb0001.dat."); goto skip; } log("admin: Neither speedtest* nor indexdb0001.dat* " "exists. Use ./gb thrutest ... to create speedtest* files."); return; skip: s_f.open ( O_RDONLY ); s_filesize = s_f.getFileSize(); log ( LOG_INIT, "admin: file size = %lli.",s_filesize); // always block //g_threads.disableThreads(); // seed rand srand(time(NULL)); //fprintf(stderr,"disabled until Threads class is used\n"); //return; //} // open 2 file descriptors //s_fd1 = open ( "/tmp/glibc-2.2.2.tar" , O_RDONLY ); //s_fd1 = open ( filename , O_RDONLY ); //s_fd2 = open ( "/tmp/glibc-2.2.5.tar" , O_RDONLY ); // . set up the thread attribute we use for all threads // . fill up with the default values first //if ( pthread_attr_init( &s_attr ) ) // fprintf (stderr,"Threads::init: pthread_attr_init: error\n"); // then customize //if ( pthread_attr_setdetachstate(&s_attr,PTHREAD_CREATE_DETACHED) ) // fprintf ( stderr,"Threads::init: pthread_attr_setdetachstate:\n"); //if ( setpriority ( PRIO_PROCESS, getpid() , 0 ) < 0 ) { // fprintf(stderr,"Threads:: setpriority: failed\n"); // exit(-1); //} //s_lock = 1; //pthread_t tid1 ; //, tid2; // set time s_startTime = gettimeofdayInMilliseconds(); long stksize = 1000000 ; long bufsize = stksize * s_numThreads ; #undef malloc char *buf = (char *)malloc ( bufsize ); #define malloc coreme if ( ! buf ) { log("test: malloc of %li failed.",bufsize); return; } g_conf.m_useThreads = true; //int pid; for ( long i = 0 ; i < s_numThreads ; i++ ) { //int err = pthread_create ( &tid1,&s_attr,startUp,(void *)i) ; if (!g_threads.call(GENERIC_THREAD,0,(void *)i,NULL,startUp)){ log("test: Thread launch failed."); return; } //pid = clone ( startUp , buf + stksize * i , // CLONE_FS | CLONE_FILES | CLONE_VM | //CLONE_SIGHAND | // SIGCHLD , // (void *)NULL ); //if ( pid == (pid_t)-1) {log("test: error cloning"); return;} //log(LOG_INIT,"test:launched i=%li pid=%i",i,(int)pid); //log(LOG_INIT,"test:launched i=%li",i,(int)pid); log(LOG_INIT,"test: Launched thread #%li.",i); //if ( err != 0 ) return ;// -1; } // unset lock //s_lock = 0; // sleep til done while ( 1 == 1 ) sleep(1000); //int status; //for ( long i = 0 ; i < s_numThreads ; i++ ) waitpid(pid,&status,0); } //int startUp ( void *state ) { void *startUp ( void *state , ThreadEntry *t ) { long id = (long) state; // . what should this lwp's priority be? // . can range from -20 to +20 // . the lower p, the more cpu time it gets // . this is really the niceness, not the priority //int p = 0; //if ( id == 1 ) p = 0; //else p = 30; // . set this process's priority // . 
setpriority() is only used for SCHED_OTHER threads //if ( setpriority ( PRIO_PROCESS, getpid() , p ) < 0 ) { // fprintf(stderr,"Threads::startUp: setpriority: failed\n"); // exit(-1); //} // read buf //char buf [ MAX_READ_SIZE ]; #undef malloc char *buf = (char *) malloc ( s_maxReadSize ); #define malloc coreme if ( ! buf ) { fprintf(stderr,"MALLOC FAILED in thread\n"); return 0; // NULL; } // we got ourselves s_launched++; // msg fprintf(stderr,"id=%li launched. Performing 100000 reads.\n",id); // wait for lock to be unleashed //while ( s_launched != s_numThreads ) usleep(10); // now do a stupid loop //long j, off , size; long long off , size; for ( long i = 0 ; i < 100000 ; i++ ) { unsigned long long r = rand(); r <<= 32 ; r |= rand(); off = r % (s_filesize - s_maxReadSize ); // rand size //size = rand() % s_maxReadSize; size = s_maxReadSize; //if ( size < 32*1024 ) size = 32*1024; // time it long long start = gettimeofdayInMilliseconds(); //fprintf(stderr,"%li) i=%li start\n",id,i ); //pread ( s_fd1 , buf , size , off ); s_f.read ( buf , size , off ); //fprintf(stderr,"%li) i=%li done\n",id,i ); long long now = gettimeofdayInMilliseconds(); usleep(0); s_count++; float sps = (float)((float)s_count * 1000.0) / (float)(now - s_startTime); fprintf(stderr,"count=%li off=%012lli size=%li time=%lims " "(%.2f seeks/sec)\n", (long)s_count, (long long)off, (long)size, (long)(now - start) , sps ); } // dummy return return 0; //NULL; } void dumpSectiondb(char *coll,long startFileNum,long numFiles, bool includeTree) { g_dumpMode = true; //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; g_sectiondb.init (); g_collectiondb.init(true); g_sectiondb.addColl ( coll, false ); key128_t startKey ; key128_t endKey ; startKey.setMin(); endKey.setMax(); // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; Msg5 msg5; RdbList list; char tmpBuf[1024]; SafeBuf sb(tmpBuf, 1024); bool firstKey = true; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_SECTIONDB , coll , &list , (char *)&startKey , (char *)&endKey , minRecSizes , includeTree , false , // add to cache? 0 , // max cache age startFileNum , numFiles , NULL , // state NULL , // callback 0 , // niceness false )){// err correction? log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) return; key128_t lastk; // loop over entries in list for(list.resetListPtr();!list.isExhausted(); list.skipCurrentRecord()){ char *rec = list.getCurrentRec(); key128_t *k = (key128_t *)rec; char *data = list.getCurrentData(); long size = list.getCurrentDataSize(); // is it a delete? if ( (k->n0 & 0x01) == 0 ) { printf("k.n1=%016llx k.n0=%016llx (delete)\n", k->n1 , k->n0 | 0x01 ); // fix it! continue; } if ( size != sizeof(SectionVote) ) { char *xx=NULL;*xx=0; } // sanity check if ( ! 
firstKey ) { if ( k->n1 < lastk.n1 ) { char *xx=NULL;*xx=0; } if ( k->n1 == lastk.n1 && k->n0 < lastk.n0 ) { char *xx=NULL;*xx=0; } } // no longer a first key firstKey = false; // copy it memcpy ( &lastk , k , sizeof(key128_t) ); unsigned long shardNum; shardNum = getShardNum (RDB_SECTIONDB,k,true); //long groupNum = g_hostdb.getGroupNum ( gid ); // point to the data char *p = data; char *pend = data + size; // breach check if ( p >= pend ) { printf("corrupt sectiondb rec k.n0=%llu",k->n0); continue; } // parse it up SectionVote *sv = (SectionVote *)data; long long termId = g_datedb.getTermId ( k ); // score is the section type unsigned char score2 = g_datedb.getScore(k); char *stype = "unknown"; if ( score2 == SV_CLOCK ) stype = "clock "; if ( score2 == SV_EURDATEFMT ) stype = "eurdatefmt "; if ( score2 == SV_EVENT ) stype = "event "; if ( score2 == SV_ADDRESS ) stype = "address "; if ( score2 == SV_TAGPAIRHASH ) stype = "tagpairhash "; if ( score2 == SV_TAGCONTENTHASH ) stype = "tagcontenthash"; if ( score2 == SV_FUTURE_DATE ) stype = "futuredate "; if ( score2 == SV_PAST_DATE ) stype = "pastdate "; if ( score2 == SV_CURRENT_DATE ) stype = "currentdate "; if ( score2 == SV_SITE_VOTER ) stype = "sitevoter "; if ( score2 == SV_TURKTAGHASH ) stype = "turktaghash "; long long d = g_datedb.getDocId(k); long date = g_datedb.getDate(k); // dump it printf("k=%s " "sh48=%llx " // sitehash is the termid "date=%010lu " "%s (%lu) " "d=%012llu " "score=%f samples=%f " "shardnum=%li" "\n", //k->n1, //k->n0, KEYSTR(k,sizeof(key128_t)), termId, date, stype,(unsigned long)score2, d, sv->m_score, sv->m_numSampled, shardNum); } startKey = *(key128_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey < *(key128_t *)list.getLastKey() ){ printf("\n"); return;} goto loop; } void dumpRevdb(char *coll,long startFileNum,long numFiles, bool includeTree) { g_dumpMode = true; //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; g_revdb.init (); g_collectiondb.init(true); g_revdb.addColl ( coll, false ); key_t startKey ; key_t endKey ; startKey.setMin(); endKey.setMax(); // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; Msg5 msg5; RdbList list; char tmpBuf[1024]; SafeBuf sb(tmpBuf, 1024); bool firstKey = true; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_REVDB , coll , &list , (char *)&startKey , (char *)&endKey , minRecSizes , includeTree , false , // add to cache? 0 , // max cache age startFileNum , numFiles , NULL , // state NULL , // callback 0 , // niceness false )){// err correction? log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) return; key_t lastk; // loop over entries in list for(list.resetListPtr();!list.isExhausted(); list.skipCurrentRecord()){ char *rec = list.getCurrentRec(); key_t *k = (key_t *)rec; char *data = list.getCurrentData(); long size = list.getCurrentDataSize(); // get docid from key long long d = g_revdb.getDocId(k); // is it a delete? if ( (k->n0 & 0x01) == 0 ) { printf("k.n1=%08lx k.n0=%016llx d=%llu (delete)\n", k->n1 , k->n0 | 0x01 , d ); // fix it! continue; } //if ( size != sizeof(SectionVote) ) { char *xx=NULL;*xx=0; } // sanity check if ( ! 
firstKey ) { if ( k->n1 < lastk.n1 ) { char *xx=NULL;*xx=0; } if ( k->n1 == lastk.n1 && k->n0 < lastk.n0 ) { char *xx=NULL;*xx=0; } } // no longer a first key firstKey = false; // copy it memcpy ( &lastk , k , sizeof(key_t) ); // point to the data char *p = data; char *pend = data + size; // breach check if ( p > pend ) { printf("corrupt revdb rec k.n1=0x%08lx d=%llu\n", k->n1,d); continue; } // parse it up //SectionVote *sv = (SectionVote *)data; // dump it printf("k.n1=%08lx k.n0=%016llx ds=%06li d=%llu\n", k->n1,k->n0,size,d); } startKey = *(key_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey < *(key_t *)list.getLastKey() ){ printf("\n"); return;} goto loop; } void dumpTagdb (char *coll,long startFileNum,long numFiles,bool includeTree, long c , char req, long rdbId ) { g_dumpMode = true; //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; g_tagdb.init (); g_collectiondb.init(true); if ( rdbId == RDB_TAGDB ) g_tagdb.addColl ( coll, false ); if ( rdbId == RDB_CATDB ) g_catdb.init(); key128_t startKey ; key128_t endKey ; startKey.setMin(); endKey.setMax(); // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; Msg5 msg5; RdbList list; //char tmpBuf[1024]; //SafeBuf sb(tmpBuf, 1024); // get my hostname and port char httpAddr[30]; long port = g_hostdb.getMyPort() - 1000; char action[50]=""; sprintf(httpAddr,"127.0.0.1:%li", port ); if ( req == 'D') strcpy(action,"&deleterec=1&useNew=1"); loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( rdbId, //RDB_TAGDB , coll , &list , (char *)&startKey , (char *)&endKey , minRecSizes , includeTree , false , // add to cache? 0 , // max cache age startFileNum , numFiles , NULL , // state NULL , // callback 0 , // niceness false )){// err correction? log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) return; // loop over entries in list for(list.resetListPtr();!list.isExhausted(); list.skipCurrentRecord()){ char *rec = list.getCurrentRec(); //key_t k = list.getCurrentKey(); key128_t k; list.getCurrentKey ( &k ); char *data = list.getCurrentData(); long size = list.getCurrentDataSize(); // is it a delete? if ( (k.n0 & 0x01) == 0 ) { printf("k.n1=%016llx k.n0=%016llx (delete)\n", k.n1 , k.n0 | 0x01 ); // fix it! continue; } // point to the data char *p = data; char *pend = data + size; // breach check if ( p >= pend ) { printf("corrupt tagdb rec k.n0=%llu",k.n0); continue; } // catdb? if ( rdbId == RDB_CATDB ) { // for debug! 
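// catdb records (from the DMOZ/Open Directory data) do not hold Tag
// lists the way tagdb does, so parse the raw CatRec and print the
// stored url, its catid count and record version directly.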
CatRec crec; crec.set ( NULL, data , size , false); fprintf(stdout, "key=%s caturl=%s #catids=%li version=%li\n" ,KEYSTR(&k,12) ,crec.m_url ,(long)crec.m_numCatids ,(long)crec.m_version ); continue; } // parse it up //TagRec *tagRec = (TagRec *)rec; Tag *tag = (Tag *)rec; // print the version and site char tmpBuf[1024]; SafeBuf sb(tmpBuf, 1024); // print as an add request or just normal if ( req == 'A' ) tag->printToBufAsAddRequest ( &sb ); else tag->printToBuf ( &sb ); // dump it printf("%s\n",sb.getBufStart()); } startKey = *(key128_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey < *(key128_t *)list.getLastKey() ){ printf("\n"); return;} goto loop; } bool parseTest ( char *coll , long long docId , char *query ) { g_conf.m_maxMem = 2000000000LL; // 2G g_mem.m_maxMem = 2000000000LL; // 2G //g_conf.m_checksumdbMaxDiskPageCacheMem = 0; //g_conf.m_spiderdbMaxDiskPageCacheMem = 0; g_conf.m_tfndbMaxDiskPageCacheMem = 0; //g_conf.m_titledbMaxTreeMem = 1024*1024*10; g_titledb.init (); g_collectiondb.init(true); g_titledb.addColl ( coll ); log(LOG_INIT, "build: Testing parse speed of html docId %lli.",docId); // get a title rec g_threads.disableThreads(); RdbList tlist; key_t startKey = g_titledb.makeFirstKey ( docId ); key_t endKey = g_titledb.makeLastKey ( docId ); // a niceness of 0 tells it to block until it gets results!! Msg5 msg5; Msg5 msg5b; if ( ! msg5.getList ( RDB_TITLEDB , coll , &tlist , startKey , endKey , // should be maxed! 9999999 , // min rec sizes true , // include tree? false , // includeCache false , // addToCache 0 , // startFileNum -1 , // m_numFiles NULL , // state NULL , // callback 0 , // niceness false , // do error correction? NULL , // cache key ptr 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &msg5b )) return log(LOG_LOGIC,"build: getList did not block."); // get the title rec if ( tlist.isEmpty() ) return log("build: speedtestxml: " "docId %lli not found.", docId ); if (!ucInit(g_hostdb.m_dir, true)) return log("Unicode initialization failed!"); // get raw rec from list char *rec = tlist.getCurrentRec(); long listSize = tlist.getListSize (); // set the titleRec we got //TitleRec tr ; //if ( ! tr.set ( rec , listSize , false /*own data?*/ ) ) // return log("build: speedtestxml: Error setting " // "titleRec." ); XmlDoc xd; if ( ! xd.set2 ( rec , listSize , coll , NULL , 0 ) ) return log("build: speedtestxml: Error setting " "xml doc." 
); log("build: Doc url is %s",xd.ptr_firstUrl);//tr.getUrl()->getUrl()); log("build: Doc is %li bytes long.",xd.size_utf8Content-1); log("build: Doc charset is %s",get_charset_str(xd.m_charset)); // time the summary/title generation code log("build: Using query %s",query); summaryTest1 ( rec , listSize , coll , docId , query ); //summaryTest2 ( rec , listSize , coll , docId , query ); //summaryTest3 ( rec , listSize , coll , docId , query ); // for a 128k latin1 doc: (access time is probably 15-20ms) // 1.18 ms to set title rec (6ms total) // 1.58 ms to set Xml // 1.71 ms to set Words (~50% from Words::countWords()) // 0.42 ms to set Pos // 0.66 ms to set Bits // 0.51 ms to set Scores // 0.35 ms to getText() // speed test long long t = gettimeofdayInMilliseconds(); for ( long k = 0 ; k < 100 ; k++ ) xd.set2 (rec, listSize, coll , NULL , 0 ); long long e = gettimeofdayInMilliseconds(); logf(LOG_DEBUG,"build: Took %.3f ms to set title rec.", (float)(e-t)/100.0); // speed test t = gettimeofdayInMilliseconds(); for ( long k = 0 ; k < 100 ; k++ ) { char *mm = (char *)mmalloc ( 300*1024 , "test"); mfree ( mm , 300*1024 ,"test"); } e = gettimeofdayInMilliseconds(); logf(LOG_DEBUG,"build: Took %.3f ms to do mallocs.", (float)(e-t)/100.0); // get content char *content = xd.ptr_utf8Content;//tr.getContent(); long contentLen = xd.size_utf8Content-1;//tr.getContentLen(); // loop parse Xml xml; t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) if ( ! xml.set ( content , contentLen , false, 0, false, xd.m_version ) ) return log("build: speedtestxml: xml set: %s", mstrerror(g_errno)); // print time it took e = gettimeofdayInMilliseconds(); log("build: Xml::set() took %.3f ms to parse docId %lli.", (double)(e - t)/100.0,docId); double bpms = contentLen/((double)(e-t)/100.0); log("build: %.3f bytes/msec", bpms); // get per char and per byte speeds xml.reset(); // loop parse t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) if ( ! xml.set ( content , contentLen , false, 0, false, xd.m_version , false ) ) return log("build: xml(setparents=false): %s", mstrerror(g_errno)); // print time it took e = gettimeofdayInMilliseconds(); log("build: Xml::set(setparents=false) took %.3f ms to " "parse docId %lli.", (double)(e - t)/100.0,docId); if (!ucInit(g_hostdb.m_dir, true)) { log("Unicode initialization failed!"); return 1; } Words words; t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) if ( ! words.set ( &xml , true , true ) ) return log("build: speedtestxml: words set: %s", mstrerror(g_errno)); // print time it took e = gettimeofdayInMilliseconds(); log("build: Words::set(xml,computeIds=true) took %.3f ms for %li words" " (precount=%li) for docId %lli.", (double)(e - t)/100.0,words.m_numWords,words.m_preCount,docId); t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) if ( ! words.set2 ( &xml , true , true ) ) return log("build: speedtestxml: words set: %s", mstrerror(g_errno)); // print time it took e = gettimeofdayInMilliseconds(); log("build: Words::set2(xml,computeIds=true) took %.3f ms for %li "\ "words (precount=%li) for docId %lli.", (double)(e - t)/100.0,words.m_numWords,words.m_preCount,docId); t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) if ( ! 
words.set ( &xml , true , false ) ) return log("build: speedtestxml: words set: %s", mstrerror(g_errno)); // print time it took e = gettimeofdayInMilliseconds(); log("build: Words::set(xml,computeIds=false) " "took %.3f ms for %li words" " (precount=%li) for docId %lli.", (double)(e - t)/100.0,words.m_numWords,words.m_preCount,docId); t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) //if ( ! words.set ( &xml , true , true ) ) if ( ! words.set ( content , TITLEREC_CURRENT_VERSION, true, 0 ) ) return log("build: speedtestxml: words set: %s", mstrerror(g_errno)); // print time it took e = gettimeofdayInMilliseconds(); log("build: Words::set(content,computeIds=true) " "took %.3f ms for %li words " "for docId %lli.", (double)(e - t)/100.0,words.m_numWords,docId); Pos pos; // computeWordIds from xml words.set ( &xml , true , true ) ; t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) //if ( ! words.set ( &xml , true , true ) ) if ( ! pos.set ( &words , NULL ) ) return log("build: speedtestxml: pos set: %s", mstrerror(g_errno)); // print time it took e = gettimeofdayInMilliseconds(); log("build: Pos::set() " "took %.3f ms for %li words " "for docId %lli.", (double)(e - t)/100.0,words.m_numWords,docId); Bits bits; // computeWordIds from xml words.set ( &xml , true , true ) ; t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) //if ( ! words.set ( &xml , true , true ) ) if ( ! bits.setForSummary ( &words ) ) return log("build: speedtestxml: Bits set: %s", mstrerror(g_errno)); // print time it took e = gettimeofdayInMilliseconds(); log("build: Bits::setForSummary() " "took %.3f ms for %li words " "for docId %lli.", (double)(e - t)/100.0,words.m_numWords,docId); Dates dates; if (!dates.parseDates(&words,DF_FROM_BODY,NULL,NULL,0,NULL,CT_HTML) ) return log("build: speedtestxml: parsedates: %s", mstrerror(g_errno)); Sections sections; // computeWordIds from xml words.set ( &xml , true , true ) ; bits.set ( &words ,TITLEREC_CURRENT_VERSION, 0); Phrases phrases; phrases.set ( &words,&bits,true,true,TITLEREC_CURRENT_VERSION,0); t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) //if ( ! words.set ( &xml , true , true ) ) // do not supply xd so it will be set from scratch if ( ! sections.set (&words,&phrases,&bits,NULL,0,0, NULL,0,NULL,NULL, 0, // contenttype &dates , NULL, // sectionsdata false, // sectionsdatavalid NULL, // sectionsdata2 //0, // tagpairhash NULL, // buf 0)) // bufSize return log("build: speedtestxml: sections set: %s", mstrerror(g_errno)); // print time it took e = gettimeofdayInMilliseconds(); log("build: Scores::set() " "took %.3f ms for %li words " "for docId %lli.", (double)(e - t)/100.0,words.m_numWords,docId); //Phrases phrases; t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) if ( ! phrases.set ( &words , &bits , true , // use stop words false , // use stems TITLEREC_CURRENT_VERSION , 0 ) ) // niceness return log("build: speedtestxml: Phrases set: %s", mstrerror(g_errno)); // print time it took e = gettimeofdayInMilliseconds(); log("build: Phrases::set() " "took %.3f ms for %li words " "for docId %lli.", (double)(e - t)/100.0,words.m_numWords,docId); bool isPreformattedText ; long contentType = xd.m_contentType;//tr.getContentType(); if ( contentType == CT_TEXT ) isPreformattedText = true; else isPreformattedText = false; /* Weights weights; //LinkInfo info1; //LinkInfo info2; // computeWordIds from xml t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) //if ( ! 
words.set ( &xml , true , true ) ) if ( ! weights.set (&words , &phrases , &bits , NULL , // sections NULL , // debug? true , // elim menus? isPreformattedText , TITLEREC_CURRENT_VERSION , 600 , // titleWeight 300 , // headerWeight NULL , false , // isLinkText? false , // isCntTable? 0 , // sitenuminlnkx 0 )) // niceness return log("build: speedtestxml: Weights set: %s", mstrerror(g_errno)); // print time it took e = gettimeofdayInMilliseconds(); log("build: Weights::set() " "took %.3f ms for %li words " "for docId %lli.", (double)(e - t)/100.0,words.m_numWords,docId); */ /* if (!ucInit(g_hostdb.m_dir)) { log("Unicode initialization failed!"); return 1; } t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) if ( ! words.set ( &xml , true , true ) ) return log("build: speedtestxml: words set: %s", mstrerror(g_errno)); // print time it took e = gettimeofdayInMilliseconds(); log("build: Words::set(computeIds=true) took %.3f ms for %li words " "for docId %lli.", (double)(e - t)/100.0,words.m_numWords,docId); t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) if ( ! words.set ( &xml , false , true ) ) return log("build: speedtestxml: words set: %s", mstrerror(g_errno)); // print time it took e = gettimeofdayInMilliseconds(); log("build: Words::set(computeIds=false) took %.3f ms for docId %lli.", (double)(e - t)/100.0,docId); */ char *buf = (char *)mmalloc(contentLen*2+1,"main"); t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) if ( ! xml.getText ( buf , contentLen*2+1 , 0 , // startNode 9999999 , // endNode (the last one!) false , // includeTags? true , // visible text only? true , // convert html entities? true , // filter spaces? false )) // use tag? return log("build: speedtestxml: getText: %s", mstrerror(g_errno)); // print time it took e = gettimeofdayInMilliseconds(); log("build: Xml::getText(computeIds=false) took %.3f ms for docId " "%lli.",(double)(e - t)/100.0,docId); t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) { long bufLen = xml.getText ( buf , contentLen*2+1 , 0 , // startNode 9999999 , // endNode (the last one!) false , // includeTags? true , // visible text only? true , // convert html entities? true , // filter spaces? false ); // use tag? if ( ! bufLen ) return log("build: speedtestxml: getText: %s", mstrerror(g_errno)); if ( ! words.set ( buf,TITLEREC_CURRENT_VERSION,true,0) ) return log("build: speedtestxml: words set: %s", mstrerror(g_errno)); } // print time it took e = gettimeofdayInMilliseconds(); log("build: Xml::getText(computeIds=false) w/ word::set() " "took %.3f ms for docId " "%lli.",(double)(e - t)/100.0,docId); Matches matches; Query q; //long collLen = gbstrlen(coll); q.set2 ( query , langUnknown , false ); matches.setQuery ( &q ); words.set ( &xml , true , 0 ) ; t = gettimeofdayInMilliseconds(); for ( long i = 0 ; i < 100 ; i++ ) { matches.reset(); if ( ! 
matches.addMatches ( &words ) ) return log("build: speedtestxml: matches set: %s", mstrerror(g_errno)); } // print time it took e = gettimeofdayInMilliseconds(); log("build: Matches::set() took %.3f ms for %li words" " (precount=%li) for docId %lli.", (double)(e - t)/100.0,words.m_numWords,words.m_preCount,docId); return true; } /* bool carveTest ( uint32_t radius, char *fname, char* query ) { Query q; q.set(query, 0); // boolflag FILE* f = fopen(fname, "rb"); if (f == NULL) { fprintf(stderr, "unable to open: '%s' %d\n", fname, errno); return false; } char buf[128*1024]; int bytes = fread(buf, 1, sizeof(buf), f); if (bytes < 1) { fprintf(stderr, "unable to read: '%s' %d\n", fname, errno); fclose(f); return false; } buf[bytes] = '\0'; log(LOG_INFO, "carve[%d]: %s", bytes, buf); HtmlCarver carver(csISOLatin1, radius); char out[128*1024]; int carvedbytes; carvedbytes = carver.AsciiAndCarveNoTags( (uint8_t*) buf, (uint32_t) bytes, (uint8_t*) out, sizeof(out) - 1, q); out[carvedbytes] = '\0'; fprintf(stderr, "carved[%d]: '%s'\n", carvedbytes, out); return true; } */ bool summaryTest1 ( char *rec , long listSize, char *coll , long long docId , char *query ) { //long collLen = gbstrlen(coll); // CollectionRec *cr = g_collectiondb.getRec ( coll ); // start the timer long long t = gettimeofdayInMilliseconds(); //long titleMaxLen = cr->m_titleMaxLen; //bool considerTitlesFromBody = false; // long summaryMaxLen = cr->m_summaryMaxLen; // long numSummaryLines = cr->m_summaryMaxNumLines; // long summaryMaxNumCharsPerLine = cr->m_summaryMaxNumCharsPerLine; // these are arbitrary (taken from Msg24.cpp) // long bigSampleRadius = 100; // long bigSampleMaxLen = 4000; // bool ratInSummary = false; Query q; q.set2 ( query , langUnknown , false ); char *content ; long contentLen ; // loop parse for ( long i = 0 ; i < 100 ; i++ ) { //TitleRec tr; XmlDoc xd; xd.set2 (rec, listSize, coll,NULL,0); // get content content = xd.ptr_utf8Content;//tr.getContent(); contentLen = xd.size_utf8Content-1;//tr.getContentLen(); // now parse into xhtml (takes 15ms on lenny) Xml xml; xml.set ( content, contentLen , false/*ownData?*/, 0, false, xd.m_version ); xd.getSummary(); //Summary s; // bool status; /* status = s.set ( &xml , &q , NULL , // termFreqs false , // doStemming? 
summaryMaxLen , numSummaryLines , summaryMaxNumCharsPerLine , bigSampleRadius , bigSampleMaxLen , ratInSummary , &tr ); */ } // print time it took long long e = gettimeofdayInMilliseconds(); log("build: V1 Summary/Title/Gigabits generation took %.3f ms for docId " "%lli.", (double)(e - t)/100.0,docId); double bpms = contentLen/((double)(e-t)/100.0); log("build: %.3f bytes/msec", bpms); return true; } // mostly taken from Msg20.cpp /* bool summaryTest2 ( char *rec , long listSize, char *coll , long long docId , char *query ) { //long collLen = gbstrlen(coll); CollectionRec *cr = g_collectiondb.getRec ( coll ); // start the timer long long t = gettimeofdayInMilliseconds(); long titleMaxLen = cr->m_titleMaxLen; long summaryMaxLen = cr->m_summaryMaxLen; long numSummaryLines = cr->m_summaryMaxNumLines; long summaryMaxNumCharsPerLine = cr->m_summaryMaxNumCharsPerLine; // these are arbitrary (taken from Msg24.cpp) long bigSampleRadius = 100; long bigSampleMaxLen = 4000; bool ratInSummary = false; Query q; q.set ( query , 0 ); // boolFlag char *content ; long contentLen ; // loop parse for ( long i = 0 ; i < 100 ; i++ ) { // 4ms TitleRec tr; tr.set (rec, listSize, false); // get content content = tr.getContent(); contentLen = tr.getContentLen(); // time it //logf(LOG_TIMING,"query: summary step 1"); // now parse into xhtml (takes 15ms on lenny) // 1ms Xml xml; xml.set ( tr.getCharset() , content, contentLen , false, 0, false, tr.getVersion() ); // time it //logf(LOG_TIMING,"query: summary step 2"); // 7ms Words ww; ww.set ( &xml , true , // compute word ids? true );// has html entities? // time it // 0ms //logf(LOG_TIMING,"query: summary step 3"); //long sfn = tr.getSiteFilenum(); //Xml *sx = g_tagdb.getSiteXml ( sfn , coll , collLen ); // time it //logf(LOG_TIMING,"query: summary step 4"); // 5ms Sections ss; ss.set ( &ww ,NULL,0,NULL,NULL,&tr); // time it //logf(LOG_TIMING,"query: summary step 5"); // 3.5ms Pos pos; pos.set ( &ww , &ss ); // time it //logf(LOG_TIMING,"query: summary step 6"); // .5ms Title tt; // use hard title? false! tt.setTitle(&tr,&xml,&ww,&ss,&pos,titleMaxLen,0xffff, NULL); char *tbuf = tt.getTitle(); long tbufLen = tt.m_titleBytes; // sanity check if ( ! tbuf && tbufLen ) { char *xx = NULL; *xx = 0; } // time it //logf(LOG_TIMING,"query: summary step 7"); // 1ms Bits bb; if ( ! bb.setForSummary ( &ww ) ) return false; // time it //logf(LOG_TIMING,"query: summary step 8"); // 8-9ms Summary s; bool status; status = s.set2 ( &xml , &ww , &bb , &ss , &pos , &q , NULL , // termFreqs NULL , // affWeights coll , collLen , false , // doStemming? 
summaryMaxLen , numSummaryLines , summaryMaxNumCharsPerLine , bigSampleRadius , bigSampleMaxLen , ratInSummary , &tr ); // time it //logf(LOG_TIMING,"query: summary step 9"); } // print time it took long long e = gettimeofdayInMilliseconds(); log("build: V2 Summary/Title/Gigabits generation took %.3f ms for " "docId %lli.", (double)(e - t)/100.0,docId); double bpms = contentLen/((double)(e-t)/100.0); log("build: %.3f bytes/msec", bpms); return true; } bool summaryTest3 ( char *rec , long listSize, char *coll , long long docId , char *query ) { //log(LOG_DEBUG, "HTML mem %d %d %d", // g_mem.m_used, g_mem.m_numAllocated, g_mem.m_numTotalAllocated); //long collLen = gbstrlen(coll); CollectionRec *cr = g_collectiondb.getRec ( coll ); // start the timer long long t = gettimeofdayInMilliseconds(); long titleMaxLen = cr->m_titleMaxLen; long summaryMaxLen = cr->m_summaryMaxLen; long numSummaryLines = cr->m_summaryMaxNumLines; long summaryMaxNumCharsPerLine = cr->m_summaryMaxNumCharsPerLine; // these are arbitrary (taken from Msg24.cpp) long bigSampleRadius = 100; long bigSampleMaxLen = 4000; bool ratInSummary = false; Query q; q.set ( query , 0 ); // boolFlag unsigned char *content ; long contentLen ; // loop parse for ( long i = 0 ; i < 100 ; i++ ) { // 4ms TitleRec tr; tr.set (rec, listSize, false); // get content char *html = tr.getContent(); long htmlLen = tr.getContentLen(); HtmlCarver parser(tr.getCharset(), 256); unsigned char carved[128 * 1024]; long carvedMax = sizeof(carved); // choose this one to convert to utf8 prior to carving //long carvedLen = parser.Utf8AndCarve((unsigned char*) content, // choose this one to emulate documents that are stored in utf8 // set this to whatever makes sense for your test... switch (2) { case 1: //log(LOG_DEBUG, "HTML utf8 summary"); contentLen = parser.Utf8AndCarve( (unsigned char*) html, htmlLen, carved, carvedMax, q); content = carved; break; case 2: //log(LOG_DEBUG, "HTML fast ascii summary"); contentLen = parser.AsciiAndCarveNoTags( (unsigned char*) html, htmlLen, carved, carvedMax, q); content = carved; break; case 0: default: //log(LOG_DEBUG, "HTML compatible summary"); content = (unsigned char*) html; contentLen = htmlLen; break; } // time it //logf(LOG_TIMING,"query: summary step 1"); // now parse into xhtml (takes 15ms on lenny) // 1ms Xml xml; xml.set ( tr.getCharset() , (char*) content, contentLen , false, 0, false, tr.getVersion() ); // time it //logf(LOG_TIMING,"query: summary step 2"); // 7ms Words ww; ww.set ( &xml , true , // compute word ids? true );// has html entities? // time it // 0ms //logf(LOG_TIMING,"query: summary step 3"); //long sfn = tr.getSiteFilenum(); //Xml *sx = g_tagdb.getSiteXml ( sfn , coll , collLen ); // time it //logf(LOG_TIMING,"query: summary step 4"); // 5ms Sections ss; ss.set ( &ww ,NULL,0,NULL,NULL,&tr); // time it //logf(LOG_TIMING,"query: summary step 5"); // 3.5ms Pos pos; pos.set ( &ww , &ss ); // time it //logf(LOG_TIMING,"query: summary step 6"); // .5ms Title tt; // use hard title? false! tt.setTitle(&tr,&xml,&ww,&ss,&pos,titleMaxLen,0xffff,NULL); char *tbuf = tt.getTitle(); long tbufLen = tt.m_titleBytes; // sanity check if ( ! tbuf && tbufLen ) { char *xx = NULL; *xx = 0; } // time it //logf(LOG_TIMING,"query: summary step 7"); // 1ms Bits bb; if ( ! 
bb.setForSummary ( &ww ) ) return false; // time it //logf(LOG_TIMING,"query: summary step 8"); // 8-9ms Summary s; bool status; status = s.set2 ( &xml , &ww , &bb , &ss , &pos , &q , NULL , // termFreqs NULL , // affWeights coll , collLen , false , // doStemming? summaryMaxLen , numSummaryLines , summaryMaxNumCharsPerLine , bigSampleRadius , bigSampleMaxLen , ratInSummary , &tr ); // time it //logf(LOG_TIMING,"query: summary step 9"); } // print time it took long long e = gettimeofdayInMilliseconds(); log("build: V3 Summary/Title/Gigabits generation took %.3f ms for " "docId %lli.", (double)(e - t)/100.0,docId); double bpms = contentLen/((double)(e-t)/100.0); log("build: %.3f bytes/msec", bpms); //log(LOG_DEBUG, "HTML mem %d %d %d", // g_mem.m_used, g_mem.m_numAllocated, g_mem.m_numTotalAllocated); return true; } */ void dumpIndexdbFile ( long fn , long long off , char *ff , long ks , char *NAME ) { // this is confidential data format #ifdef _CLIENT_ return; #endif #ifdef _METALINCS_ return; #endif g_dumpMode = true; char buf [ 1000000 ]; long bufSize = 1000000; char fname[64]; sprintf ( fname , "%s%04li.dat" , ff,fn ); if ( NAME ) sprintf ( fname , "%s", NAME ); BigFile f; fprintf(stderr,"opening ./%s\n",fname); f.set ( "./" , fname ); if ( ! f.open ( O_RDONLY ) ) return; // init our vars bool haveTop = false; char top[6]; memset ( top , 0 , 6 ); bool warned = false; // how big is this guy? long long filesize = f.getFileSize(); fprintf(stderr,"filesize=%lli\n",filesize); fprintf(stderr,"off=%lli\n",off); // reset error number g_errno = 0; // the big read loop loop: long long readSize = bufSize; if ( off + readSize > filesize ) readSize = filesize - off; // return if we're done reading the whole file if ( readSize <= 0 ) return; // read in as much as we can f.read ( buf , readSize , off ); // bail on read error if ( g_errno ) { fprintf(stderr,"read of %s failed",f.getFilename()); return; } char *p = buf; char *pend = buf + readSize; inner: // parse out the keys long size; if ( ((*p) & 0x02) == 0x00 ) size = ks; else size = ks-6; if ( p + size > pend ) { // skip what we read off += readSize ; // back up so we don't split a key we should not off -= ( pend - p ); // read more goto loop; } // new top? if ( size == ks ) { memcpy ( top , p + (ks-6) , 6 ); haveTop = true; } // warning msg if ( ! haveTop && ! 
warned ) { warned = true; log("db: Warning: first key is a half key."); } // make the key char tmp [ MAX_KEY_BYTES ]; memcpy ( tmp , p , ks-6 ); memcpy ( tmp + ks-6 , top , 6 ); // print the key if ( ks == 12 ) fprintf(stdout,"%08lli) %08lx %016llx\n", off + (p - buf) , *(long *)(tmp+8),*(long long *)tmp ); else fprintf(stdout,"%08lli) %016llx %016llx\n", off + (p - buf) , *(long long *)(tmp+8),*(long long *)tmp ); // go to next key p += size; // loop up goto inner; } void dumpIndexdb (char *coll,long startFileNum,long numFiles,bool includeTree, long long termId ) { // this is confidential data format #ifdef _CLIENT_ #ifndef _GLOBALSPEC_ return; #endif #endif #ifdef _METALINCS_ return; #endif g_dumpMode = true; //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; g_indexdb.init (); g_collectiondb.init(true); g_indexdb.addColl ( coll, false ); key_t startKey ; key_t endKey ; startKey.setMin(); endKey.setMax(); if ( termId >= 0 ) { startKey = g_indexdb.makeFirstKey ( termId ); endKey = g_indexdb.makeLastKey ( termId ); } // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; // bail if not if ( g_indexdb.m_rdb.getNumFiles() <= startFileNum && numFiles > 0 ) { printf("Request file #%li but there are only %li " "indexdb files\n",startFileNum, g_indexdb.m_rdb.getNumFiles()); return; } Msg5 msg5; Msg5 msg5b; RdbList list; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_INDEXDB , coll , &list , startKey , endKey , minRecSizes , includeTree , false , // add to cache? 0 , // max cache age startFileNum , numFiles , NULL , // state NULL , // callback 0 , // niceness false )){// err correction? log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) return; // loop over entries in list for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { key_t k = list.getCurrentKey(); // is it a delete? char *dd = ""; if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)"; long long d = g_indexdb.getDocId(k); uint8_t dh = g_titledb.getDomHash8FromDocId(d); if ( termId < 0 ) printf("k.n1=%08lx k.n0=%016llx " "tid=%015llu score=%03li docId=%012lli dh=0x%02lx%s\n" , k.n1, k.n0, (long long)g_indexdb.getTermId(k), (long)g_indexdb.getScore(k) , d , (long)dh, dd ); else printf("k.n1=%08lx k.n0=%016llx " "score=%03li docId=%012lli dh=0x%02lx%s\n" , k.n1, k.n0, (long)g_indexdb.getScore(k) , d , (long)dh, dd ); continue; } startKey = *(key_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey < *(key_t *)list.getLastKey() ) return; goto loop; } void dumpPosdb (char *coll,long startFileNum,long numFiles,bool includeTree, long long termId , bool justVerify ) { g_dumpMode = true; //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; if ( ! 
justVerify ) { g_posdb.init (); g_collectiondb.init(true); g_posdb.addColl ( coll, false ); } key144_t startKey ; key144_t endKey ; startKey.setMin(); endKey.setMax(); if ( termId >= 0 ) { g_posdb.makeStartKey ( &startKey, termId ); g_posdb.makeEndKey ( &endKey, termId ); printf("startkey=%s\n",KEYSTR(&startKey,sizeof(POSDBKEY))); printf("endkey=%s\n",KEYSTR(&endKey,sizeof(POSDBKEY))); } // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; // bail if not if ( g_posdb.m_rdb.getNumFiles() <= startFileNum && numFiles > 0 ) { printf("Request file #%li but there are only %li " "posdb files\n",startFileNum, g_posdb.m_rdb.getNumFiles()); return; } key144_t lastKey; lastKey.setMin(); Msg5 msg5; Msg5 msg5b; RdbList list; // set this flag so Msg5.cpp if it does error correction does not // try to get the list from a twin... g_isDumpingRdbFromMain = 1; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_POSDB , coll , &list , &startKey , &endKey , minRecSizes , includeTree , false , // add to cache? 0 , // max cache age startFileNum , numFiles , NULL , // state NULL , // callback 0 , // niceness true )) { // to debug RdbList::removeBadData_r() //false )){// err correction? log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) return; // get last key in list char *ek2 = list.m_endKey; // print it printf("ek=%s\n",KEYSTR(ek2,list.m_ks) ); // loop over entries in list for ( list.resetListPtr() ; ! list.isExhausted() && ! justVerify ; list.skipCurrentRecord() ) { key144_t k; list.getCurrentKey(&k); // compare to last char *err = ""; if ( KEYCMP((char *)&k,(char *)&lastKey,sizeof(key144_t))<0 ) err = " (out of order)"; lastKey = k; // is it a delete? char *dd = ""; if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)"; long long d = g_posdb.getDocId(&k); uint8_t dh = g_titledb.getDomHash8FromDocId(d); char *rec = list.m_listPtr; long recSize = 18; if ( rec[0] & 0x04 ) recSize = 6; else if ( rec[0] & 0x02 ) recSize = 12; // alignment bits check if ( recSize == 6 && !(rec[1] & 0x02) ) { long long nd1 = g_posdb.getDocId(rec+6); // seems like nd2 is it, so it really is 12 bytes but // does not have the alignment bit set... //long long nd2 = g_posdb.getDocId(rec+12); //long long nd3 = g_posdb.getDocId(rec+18); // what size is it really? // seems like 12 bytes //log("debug1: d=%lli nd1=%lli nd2=%lli nd3=%lli", //d,nd1,nd2,nd3); err = " (alignerror1)"; if ( nd1 < d ) err = " (alignordererror1)"; //char *xx=NULL;*xx=0; } if ( recSize == 12 && !(rec[1] & 0x02) ) { //long long nd1 = g_posdb.getDocId(rec+6); // seems like nd2 is it, so it really is 12 bytes but // does not have the alignment bit set... long long nd2 = g_posdb.getDocId(rec+12); //long long nd3 = g_posdb.getDocId(rec+18); // what size is it really? // seems like 12 bytes //log("debug1: d=%lli nd1=%lli nd2=%lli nd3=%lli", //d,nd1,nd2,nd3); //if ( nd2 < d ) { char *xx=NULL;*xx=0; } //char *xx=NULL;*xx=0; err = " (alignerror2)"; if ( nd2 < d ) err = " (alignorderrror2)"; } // if it if ( recSize == 12 && (rec[7] & 0x02)) { //long long nd1 = g_posdb.getDocId(rec+6); // seems like nd2 is it, so it really is 12 bytes but // does not have the alignment bit set... long long nd2 = g_posdb.getDocId(rec+12); //long long nd3 = g_posdb.getDocId(rec+18); // what size is it really? // seems like 12 bytes really as well! 
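// A sketch of the posdb key compression these alignment checks assume
// (sizes taken from the rec[0] bits tested above):
//   18-byte rec: a full key144_t      (neither 0x02 nor 0x04 set)
//   12-byte rec: low 12 bytes only    (rec[0] & 0x02); the high 6
//                bytes (the termid) carry over from the last full key
//    6-byte rec: low 6 bytes only     (rec[0] & 0x04); the high 12
//                bytes (termid + docid) carry over
// The bits probed at rec[1] and rec[7] mark where a compressed key may
// legally start; a docid that goes backwards means the size bits and
// those alignment bits disagree, hence the (alignordererror) tags.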
//log("debug2: d=%lli nd1=%lli nd2=%lli nd3=%lli", //d,nd1,nd2,nd3); //char *xx=NULL;*xx=0; err = " (alignerror3)"; if ( nd2 < d ) err = " (alignordererror3)"; } if ( KEYCMP((char *)&k,(char *)&startKey,list.m_ks)<0 || KEYCMP((char *)&k,ek2,list.m_ks)>0){ err = " (out of range)"; } //if ( err ) // printf("%s",err ); //continue; //if ( ! magicBit && recSize == 6 ) { char *xx=NULL;*xx=0; } if ( termId < 0 ) printf( "k=%s " "tid=%015llu " "docId=%012lli " "siterank=%02li " "langid=%02li " "pos=%06li " "hgrp=%02li " "spamrank=%02li " "divrank=%02li " "syn=%01li " "densrank=%02li " //"outlnktxt=%01li " "mult=%02li " "dh=0x%02lx " "rs=%li" //recSize "%s" // dd "%s" // err "\n" , KEYSTR(&k,sizeof(key144_t)), (long long)g_posdb.getTermId(&k), d , (long)g_posdb.getSiteRank(&k), (long)g_posdb.getLangId(&k), (long)g_posdb.getWordPos(&k), (long)g_posdb.getHashGroup(&k), (long)g_posdb.getWordSpamRank(&k), (long)g_posdb.getDiversityRank(&k), (long)g_posdb.getIsSynonym(&k), (long)g_posdb.getDensityRank(&k), //(long)g_posdb.getIsOutlinkText(&k), (long)g_posdb.getMultiplier(&k), (long)dh, recSize, dd , err ); else printf( "k=%s " "tid=%015llu " "docId=%012lli " "siterank=%02li " "langid=%02li " "pos=%06li " "hgrp=%02li " "spamrank=%02li " "divrank=%02li " "syn=%01li " "densrank=%02li " //"outlnktxt=%01li " "mult=%02li " "sh32=0x%08lx " "recSize=%li " "dh=0x%02lx%s%s\n" , KEYSTR(&k,sizeof(key144_t)), (long long)g_posdb.getTermId(&k), d , (long)g_posdb.getSiteRank(&k), (long)g_posdb.getLangId(&k), (long)g_posdb.getWordPos(&k), (long)g_posdb.getHashGroup(&k), (long)g_posdb.getWordSpamRank(&k), (long)g_posdb.getDiversityRank(&k), (long)g_posdb.getIsSynonym(&k), (long)g_posdb.getDensityRank(&k), //(long)g_posdb.getIsOutlinkText(&k), (long)g_posdb.getMultiplier(&k), (long)g_posdb.getSectionSiteHash32(&k), recSize, (long)dh, dd , err ); continue; } startKey = *(key144_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey < *(key144_t *)list.getLastKey() ) return; goto loop; } void dumpDatedb (char *coll,long startFileNum,long numFiles,bool includeTree, long long termId , bool justVerify ) { // this is confidential data format #ifdef _CLIENT_ return; #endif #ifdef _METALINCS_ return; #endif g_dumpMode = true; //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; if ( ! justVerify ) { g_datedb.init (); g_collectiondb.init(true); g_datedb.addColl ( coll, false ); } char startKey[16]; char endKey [16]; long long termId1 = 0x0000000000000000LL; long long termId2 = 0xffffffffffffffffLL; if ( termId >= 0 ) { termId1 = termId; termId2 = termId; } key128_t kk; kk = g_datedb.makeStartKey ( termId1 , 0xffffffff ); // tmp hack //kk.n1 = 0x51064d5bdd71bd51LL; //kk.n0 = 0x649ffe3f20f617c6LL; KEYSET(startKey,(char *)&kk,16); kk = g_datedb.makeEndKey ( termId2 , 0x00000000 ); KEYSET(endKey,(char *)&kk,16); // get a meg at a time long minRecSizes = 1024*1024; // bail if not if ( g_datedb.m_rdb.getNumFiles() <= startFileNum ) { printf("Request file #%li but there are only %li " "datedb files\n",startFileNum, g_datedb.m_rdb.getNumFiles()); //return; } // turn off threads g_threads.disableThreads(); Msg5 msg5; Msg5 msg5b; IndexList list; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_DATEDB , coll , &list , (char *)&startKey , (char *)&endKey , minRecSizes , includeTree , false , // add to cache? 0 , // max cache age startFileNum , numFiles , NULL , // state NULL , // callback 0 , // niceness false )){// err correction? 
g_threads.enableThreads(); log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) { g_threads.enableThreads(); return; } uint8_t a,b; long long lattid = hash64n("gbxlatitude") & TERMID_MASK; long long lontid = hash64n("gbxlongitude")& TERMID_MASK; //long long lattid2 = hash64n("gbxlatitudecity") & TERMID_MASK; long long lattid2 = hash64n("gbxlatitude2") & TERMID_MASK; //long long lontid2 = hash64n("gbxlongitudecity")& TERMID_MASK; long long lontid2 = hash64n("gbxlongitude2")& TERMID_MASK; long long starttid= hash64n("gbxstart")& TERMID_MASK; long long endtid = hash64n("gbxend")& TERMID_MASK; // sanity check if ( list.m_ks != 16 ) { char *xx = NULL; *xx = 0; } // loop over entries in list for ( list.resetListPtr() ; ! list.isExhausted() && ! justVerify ; list.skipCurrentRecord() ) { //key_t k = list.getCurrentKey(); uint8_t k[MAX_KEY_BYTES]; list.getCurrentKey(k); // is it a delete? char *dd = ""; //if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)"; if ( KEYNEG((char *)k) ) dd = " (delete)"; // get event id range a = 255 - k[7]; b = 255 - k[6]; // hack flag for indexing tag terms (complemented) bool isTagTerm = (k[9] == 0x7f); long long tid =(long long)list.getTermId16((char *)k); // print out for events if ( tid && tid != lattid && tid != lontid && tid != lattid2 && tid != lontid2 && tid != starttid && tid != endtid ) { char *ss = ""; if ( isTagTerm ) ss = " tagterm"; printf("k.n1=%016llx k.n0=%016llx " "tid=%015llu " //"date=%010lu " "eidrng=%li-%li " "score=%03li docId=%012lli%s%s\n" , KEY1((char *)k,16),KEY0((char *)k), tid, //list.getCurrentDate(), (long)a,(long)b, (long)list.getScore((char *)k), list.getCurrentDocId() , ss, dd ); } else if ( tid == starttid || tid == endtid ) { // this will uncomplement it long cd = list.getCurrentDate(); char *desc; if ( tid == starttid ) desc = "startTime"; else if ( tid == endtid ) desc = "endTime"; // convert to date str struct tm *timeStruct = localtime ( &cd ); char ppp[100]; strftime(ppp,100,"%b-%d-%Y-%H:%M:%S",timeStruct); // but use time if its not // otherwise a lat/lon/time key printf("k.n1=%016llx " "k.n0=%016llx " "tid=%015llu=%s " "time=%s(%lu) " "eventId=%03li docId=%012lli%s\n" , KEY1((char *)k,16), KEY0((char *)k), tid, desc, ppp,cd, (long)list.getScore((char *)k), list.getCurrentDocId() , dd ); } else if ( tid ) { // this will uncomplement it unsigned long cd = list.getCurrentDate(); // convert to float float latlon = (float)cd; // denormalize (we scaled by 10M) latlon /= 10000000.0; char *desc; if ( tid == lattid ) desc = "latitude"; else if ( tid == lontid ) desc = "longitude"; else if ( tid == lattid2 ) desc = "latitude2"; else if ( tid == lontid2 ) desc = "longitude2"; else desc = "unknownitude"; // but use time if its not // otherwise a lat/lon/time key printf("k.n1=%016llx " "k.n0=%016llx " "tid=%015llu " "%s=%.06f " "eventId=%03li docId=%012lli%s\n" , KEY1((char *)k,16), KEY0((char *)k), tid, desc, latlon, (long)list.getScore((char *)k), list.getCurrentDocId() , dd ); } /* if ( termId < 0 ) printf("k.n1=%016llx k.n0=%016llx " "tid=%015llu date=%010lu " "score=%03li docId=%012lli%s\n" , KEY1(k,16),KEY0(k), (long long)list.getTermId16(k), list.getCurrentDate(), (long)list.getScore(k), list.getCurrentDocId() , dd ); else printf("k.n1=%016llx k.n0=%016llx " "date=%010lu score=%03li docId=%012lli%s\n" , KEY1(k,16),KEY0(k), list.getCurrentDate(), (long)list.getScore(k), list.getCurrentDocId() , dd ); */ continue; } KEYSET(startKey,list.getLastKey(),16); KEYADD(startKey,1,16); // watch out for 
wrap around //if ( startKey < *(key_t *)list.getLastKey() ) return; if ( KEYCMP(startKey,list.getLastKey(),16)<0 ) { g_threads.enableThreads(); return; } goto loop; } void dumpClusterdb ( char *coll, long startFileNum, long numFiles, bool includeTree ) { // this is confidential data format #ifdef _CLIENT_ return; #endif #ifdef _METALINCS_ return; #endif g_dumpMode = true; g_clusterdb.init (); g_collectiondb.init(true); g_clusterdb.addColl ( coll, false ); key_t startKey ; key_t endKey ; startKey.setMin(); endKey.setMax(); // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; // bail if not if ( g_clusterdb.getRdb()->getNumFiles() <= startFileNum ) { printf("Request file #%li but there are only %li " "clusterdb files\n",startFileNum, g_clusterdb.getRdb()->getNumFiles()); return; } Msg5 msg5; Msg5 msg5b; RdbList list; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_CLUSTERDB , coll , &list , startKey , endKey , minRecSizes , includeTree , false , // add to cache? 0 , // max cache age startFileNum , numFiles , NULL , // state NULL , // callback 0 , // niceness false )){// err correction? log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) return; // loop over entries in list char strLanguage[256]; for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { key_t k = list.getCurrentKey(); // is it a delete? char *dd = ""; if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)"; // get the language string languageToString ( g_clusterdb.getLanguage((char*)&k), strLanguage ); //unsigned long gid = getGroupId ( RDB_CLUSTERDB , &k ); unsigned long shardNum = getShardNum( RDB_CLUSTERDB , &k ); Host *grp = g_hostdb.getShard ( shardNum ); Host *hh = &grp[0]; // print it printf("k.n1=%08lx k.n0=%016llx " "docId=%012lli family=%lu " "language=%li (%s) siteHash26=%lu%s " "groupNum=%lu " "shardNum=%lu\n", k.n1, k.n0, g_clusterdb.getDocId((char*)&k) , g_clusterdb.hasAdultContent((char*)&k) , (long)g_clusterdb.getLanguage((char*)&k), strLanguage, g_clusterdb.getSiteHash26((char*)&k) , dd , hh->m_hostId , shardNum); continue; } startKey = *(key_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey < *(key_t *)list.getLastKey() ) return; goto loop; } /* void dumpStatsdb( long startFileNum, long numFiles, bool includeTree, int test ) { // this is confidential data format #ifdef _CLIENT_ return; #endif #ifdef _METALINCS_ return; #endif static char *coll = "stats"; // We don't want to close the previous session so we // must not do a real init. g_statsdb.init( );//false - Is full init? g_collectiondb.init( true ); // Is dump? g_statsdb.getRdb()->addColl ( coll ); uint64_t ss_keys = 0; uint64_t dd_keys = 0; key96_t startKey; key96_t endKey; startKey.setMin(); endKey.setMax(); // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; // bail if not if ( g_statsdb.getRdb()->getNumFiles() <= startFileNum ) { printf("Request file #%li but there are only %li " "statsdb files\n",startFileNum, g_statsdb.getRdb()->getNumFiles()); return; } Msg5 msg5; Msg5 msg5b; RdbList list; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_STATSDB , coll , &list , (char *)&startKey , (char *)&endKey , minRecSizes , includeTree , false , // add to cache? 
0 , // max cache age startFileNum , numFiles , NULL , // state NULL , // callback 0 , // niceness false )){// err correction? log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) return; // loop over entries in list key96_t k; time_t dateStamp; char txtDate[32]; char *txt; uint64_t uCRC = 0LL; uint8_t version = 0; int32_t dataSize = 0; SafeBuf cBuf( 1024 ); bool dataSummaryGen = false; bool first = true; if ( g_mem.checkStackSize() > (int)(6*1024*1024) ) { fprintf( stderr, "Running low on stack space, %li bytes " "used. %s:%d\n", g_mem.checkStackSize(), __PRETTY_FUNCTION__, __LINE__ ); return; } StatsV1 stats; for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { list.getCurrentKey( (char *)&k ); version = g_statsdb.getVersionFromKey( k ); // is it a delete? char *dd = ""; if (!( k.n0 & 0x01LL)) dd = " (delete)"; dateStamp = g_statsdb.getTimestampFromKey( k ); snprintf( txtDate, 32, "%s", ctime( &dateStamp ) ); txt = txtDate; // get rid of the newline character while ( *txt ) { if ( *txt == '\n' ) { *txt = 0; break; } txt++; } // . We extract and verify the size of the data. // . If uCRC is zero, we failed to decompress the data. if ( k.n1 & SUMMARY_MASK ) { dataSummaryGen = true; cBuf.setBuf( list.getCurrentData(), list.getCurrentDataSize(), list.getCurrentDataSize(), false, //ownData csOther);//encoding if ( version == 1 ) { if ( ! stats.fromCompressed( cBuf ) ) { printf("Decompression Failed!!\n"); } dataSize = sizeof( StatsDataV1 ); uCRC = g_statsdb.quickCRC( (uint8_t *)stats.getData(), dataSize ); } } // print it if ( test != 3 && ! g_statsdb.getResolutionFromKey( k ) ) { printf("[Session Header Key] " "k.n1=%08lx k.n0=%016llx resolution=%03lu " "session=%05d timestamp=%010li [%s] " "hostId=%05lu version=%03lu %s\n", k.n1 , k.n0 , (unsigned long)g_statsdb.getResolutionFromKey(k), (short)g_statsdb.getSessionFromKey(k) , (long)dateStamp , txtDate , (unsigned long)g_statsdb.getHostIdFromKey(k) , (unsigned long)version , dd ); ss_keys++; } else if ( test == 2 ){ printf("k.n1=0x%08lx k.n0=0x%016llx resolution=%03lu " "session=%05d timestamp=%010li [%s] " "hostId=%05lu version=%03lu " "uLen=%010lu cLen=%010lu uCRC=%016llx %s \n", k.n1 , k.n0 , (unsigned long)g_statsdb.getResolutionFromKey(k), (short)g_statsdb.getSessionFromKey(k) , (long)dateStamp , txtDate , (unsigned long)g_statsdb.getHostIdFromKey( k ) , (unsigned long)version , (unsigned long)dataSize , list.getCurrentDataSize() , uCRC, dd ); dd_keys++; } else if ( test > 2 && first ) { StatsDataV1 &sData = *(StatsDataV1 *)stats.getData(); printf("k.n1=0x%08lx k.n0=0x%016llx resolution=%03lu " "session=%05d timestamp=%010li [%s] " "hostId=%05lu version=%03lu " "uLen=%010lu cLen=%010lu uCRC=%016llx %s \n" "allQueries %lli\n" "msg3aRecallCnt %i\n" "cpuUsage %f\n" "", k.n1 , k.n0 , (unsigned long)g_statsdb.getResolutionFromKey(k), (short)g_statsdb.getSessionFromKey(k) , (long)dateStamp , txtDate , (unsigned long)g_statsdb.getHostIdFromKey( k ) , (unsigned long)version , (unsigned long)dataSize , list.getCurrentDataSize() , uCRC, dd, sData.m_allQueries, sData.m_msg3aRecallCnt, sData.m_cpuUsage ); dd_keys++; if ( test == 3 ) { first = false; printf( "\nPlease wait...\n\n" ); } } } startKey = *(key96_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey < *(key96_t *)list.getLastKey() ) { printf( "Session Summary Keys: %llu\n" "Data Keys: %llu\n", ss_keys, dd_keys ); return; } goto loop; } */ /* void dumpChecksumdb( char *coll, 
long startFileNum, long numFiles, bool includeTree ) { // this is confidential data format #ifdef _CLIENT_ return; #endif #ifdef _METALINCS_ return; #endif g_dumpMode = true; g_checksumdb.init (); g_collectiondb.init(true); g_checksumdb.getRdb()->addColl ( coll ); //key_t startKey ; //key_t endKey ; //startKey.setMin(); //endKey.setMax(); long cKeySize = g_conf.m_checksumdbKeySize; char startKey[16]; char endKey[16]; if ( cKeySize == 12 ) { ((key_t *)startKey)->setMin(); ((key_t *)endKey)->setMax(); } else if ( cKeySize == 16 ) { ((key128_t *)startKey)->setMin(); ((key128_t *)endKey)->setMax(); } // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; //// bail if not //if ( g_checksumdb.getRdb()->getNumFiles() <= startFileNum ) { // printf("Request file #%li but there are only %li " // "checksumdb files\n",startFileNum, // g_checksumdb.getRdb()->getNumFiles()); // return; //} Msg5 msg5; Msg5 msg5b; RdbList list; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_CHECKSUMDB , coll , &list , startKey , endKey , minRecSizes , includeTree , false , // add to cache? 0 , // max cache age startFileNum , numFiles , NULL , // state NULL , // callback 0 , // niceness false )){// err correction? log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) return; // loop over entries in list for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { unsigned long hosthash; //key_t k = list.getCurrentKey(); char k[16]; list.getCurrentKey( k ); // is it a delete? char *dd = ""; //if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)"; if ( (((key_t *)k)->n0 & 0x01) == 0x00 ) dd = " (delete)"; char kBuf[20]; //unsigned long hosthash = (k.n1 >> 8) & 0xffff; // . check keys size before doing assignments if ( cKeySize == 12 ) { // get the language string hosthash = (((key_t *)k)->n1 >> 8) & 0xffff; sprintf( kBuf, "%08lx", ((key_t *)k)->n1); } else if ( cKeySize == 16 ) { // get the language string // . some extra manipulation needed to retrieve the // . host hash from the 16-byte key hosthash = ((((key128_t *)k)->n1 >> 38 ) & 0x3ff ) | ((((key128_t *)k)->n1 << 2) & 0x3fffc00 ); sprintf( kBuf, "%016llx", ((key128_t *)k)->n1); } // print it printf("k.n1=%s k.n0=%016llx " "docId=%012lli quality=%d hosthash=0x%04lx%s\n", kBuf, ((key_t *)k)->n0, g_checksumdb.getDocId( k ) , (int)g_checksumdb.getDocQuality( k ), hosthash , dd ); continue; } //startKey = *(key_t *)list.getLastKey(); KEYSET( startKey, list.getLastKey(), cKeySize ); //startKey += (unsigned long) 1; // must check key size before assignments if ( cKeySize == 12 ) *((key_t *)startKey) += (unsigned long) 1; else *((key128_t *)startKey) += (unsigned long) 1; // watch out for wrap around //if ( startKey < *(key_t *)list.getLastKey() ) return; if ( KEYCMP( startKey, list.getLastKey(), cKeySize ) < 0 ) return; goto loop; } */ void dumpLinkdb ( char *coll, long startFileNum, long numFiles, bool includeTree , char *url ) { // this is confidential data format #ifdef _CLIENT_ return; #endif #ifdef _METALINCS_ return; #endif g_dumpMode = true; g_linkdb.init (); g_collectiondb.init(true); g_linkdb.addColl ( coll, false ); key224_t startKey ; key224_t endKey ; startKey.setMin(); endKey.setMax(); // set to docid if ( url ) { Url u; u.set ( url , gbstrlen(url) , true ); // addWWW? 
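// linkdb keys (key224_t) lead with the linkee's host hash32 and url
// hash64, so building a start/end key from the same (h32,uh64) pair
// below brackets exactly the records for links pointing at this url;
// each linker shows up as its own key in that range.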
unsigned long h32 = u.getHostHash32();//g_linkdb.getUrlHash(&u) long long uh64 = hash64n(url,0); startKey = g_linkdb.makeStartKey_uk ( h32 , uh64 ); endKey = g_linkdb.makeEndKey_uk ( h32 , uh64 ); } // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; // bail if not if ( g_linkdb.getRdb()->getNumFiles() <= startFileNum && !includeTree) { printf("Request file #%li but there are only %li " "linkdb files\n",startFileNum, g_linkdb.getRdb()->getNumFiles()); return; } Msg5 msg5; Msg5 msg5b; RdbList list; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_LINKDB , coll , &list , (char *)&startKey , (char *)&endKey , minRecSizes , includeTree , false , // add to cache? 0 , // max cache age startFileNum , numFiles , NULL , // state NULL , // callback 0 , // niceness false )){// err correction? log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) return; // loop over entries in list for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { key224_t k; list.getCurrentKey((char *) &k); // is it a delete? char *dd = ""; if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)"; long long docId = (long long)g_linkdb.getLinkerDocId_uk(&k); //if ( docId != 74785425291LL && docId != 88145066810LL ) // log("hey"); //if ( list.m_listPtr-list.m_list >= 11784-24 ) // log("boo"); //unsigned char hc = g_linkdb.getLinkerHopCount_uk(&k); //unsigned long gid = g_hostdb.getGroupId (RDB_LINKDB,&k,true); //long groupNum = g_hostdb.getGroupNum ( gid ); unsigned long shardNum = getShardNum(RDB_LINKDB,&k,true); //if ( hc != 0 ) { char *xx=NULL;*xx=0; } // is it an ip or url record? //bool isHost = g_linkdb.isHostRecord ( &k ); // is it a url or site key? //bool isUrlKey = g_linkdb.isUrlKey ( &k ); // print this record type different //if ( isUrlKey ) { //long ip = g_linkdb.getIp2(&k); //char *ipString = iptoa(ip); printf("k=%s " "linkeesitehash32=0x%08lx " "linkeeurlhash=0x%012llx " "linkspam=%li " "siterank=%02li " //"hopcount=%03hhu " "ip32=%s " "docId=%012llu " "discovered=%lu " "lost=%lu " "sitehash32=0x%08lx " "shardNum=%lu " "%s\n", KEYSTR(&k,sizeof(key224_t)), (long)g_linkdb.getLinkeeSiteHash32_uk(&k), (long long)g_linkdb.getLinkeeUrlHash64_uk(&k), (long)g_linkdb.isLinkSpam_uk(&k), (long)g_linkdb.getLinkerSiteRank_uk(&k), //hc,//g_linkdb.getLinkerHopCount_uk(&k), iptoa((long)g_linkdb.getLinkerIp_uk(&k)), docId, (long)g_linkdb.getDiscoveryDate_uk(&k), (long)g_linkdb.getLostDate_uk(&k), (long)g_linkdb.getLinkerSiteHash32_uk(&k), shardNum, dd ); } startKey = *(key224_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey < *(key224_t *)list.getLastKey() ) return; goto loop; } bool pingTest ( long hid , unsigned short clientPort ) { Host *h = g_hostdb.getHost ( hid ); if ( ! 
h ) return log("net: pingtest: hostId %li is "
	       "invalid.",hid);
	// set up our socket
	int sock = socket ( AF_INET, SOCK_DGRAM , 0 );
	if ( sock < 0 ) return log("net: pingtest: socket: %s.",
				   strerror(errno));
	// sockaddr_in provides interface to sockaddr
	struct sockaddr_in name;
	// reset it all just to be safe
	memset((char *)&name, 0,sizeof(name));
	name.sin_family      = AF_INET;
	name.sin_addr.s_addr = 0; /*INADDR_ANY;*/
	name.sin_port        = htons(clientPort);
	// we want to re-use the port if we need to restart
	int options = 1;
	if ( setsockopt(sock, SOL_SOCKET, SO_REUSEADDR ,
			&options,sizeof(options)) < 0 )
		return log("net: pingtest: setsockopt: %s.",
			   strerror(errno));
	// bind this name to the socket
	if ( bind ( sock, (struct sockaddr *)&name, sizeof(name)) < 0) {
		close ( sock );
		return log("net: pingtest: Bind on port %hu: %s.",
			   clientPort,strerror(errno));
	}
	//g_loop.setNonBlocking ( sock , 0 );
	//g_loop.interruptsOff();
	int fd    = sock;
	int flags = fcntl ( fd , F_GETFL ) ;
	if ( flags < 0 ) return log("net: pingtest: fcntl(F_GETFL): %s.",
				    strerror(errno));
	//if ( fcntl ( fd, F_SETFL, flags|O_NONBLOCK|O_ASYNC) < 0 )
	//return log("db: Loop::addSlot:fcntl(NONBLOCK):%s",strerror(errno));
	char dgram[1450];
	int n;
	struct sockaddr_in to;
	sockaddr_in from;
	unsigned int fromLen;
	long long startTime;
	// make the dgram
	UdpProtocol *up = &g_dp; // udpServer2.getProtocol();
	long transId   = 500000000 - 1 ;
	long dnum      = 0; // dgramNum
	long sends     = 0;
	long lost      = 0;
	long recovered = 0;
	long acks      = 0;
	long replies   = 0;
	long ip = h->m_ip;
	ip = atoip("127.0.0.1",9);
	startTime = gettimeofdayInMilliseconds();
	to.sin_family      = AF_INET;
	to.sin_addr.s_addr = h->m_ip;
	// port must be in network byte order (was ntohs(), which only
	// worked because ntohs and htons perform the same byte swap)
	to.sin_port        = htons(h->m_port);
	memset ( &(to.sin_zero) , 0,8 );
	log("net: pingtest: Testing hostId #%li at %s:%hu from client "
	    "port %hu", hid,iptoa(h->m_ip),h->m_port,clientPort);
	// if this is higher than number of avail slots UdpServer.cpp
	// will not be able to free the slots and this will end up sticking,
	// because the slots can only be freed in destroySlot() which
	// is not async safe!
	//long count = 40000; // number of loops
	long count = 1000; // number of loops
	long avg = 0;
 sendLoop:
	if ( count-- <= 0 ) {
		log("net: pingtest: Got %li replies out of %li sent (%li lost)"
		    "(%li recovered)", replies,sends,lost,recovered);
		log("net: pingtest: Average reply time of %.03f ms.",
		    (double)avg/(double)replies);
		return true;
	}
	transId++;
	long msgSize = 3; // indicates a debug ping packet to PingServer.cpp
	up->setHeader ( dgram, msgSize, 0x11, dnum, transId, true, false , 0 );
	long size = up->getHeaderSize(0) + msgSize;
	long long start = gettimeofdayInMilliseconds();
	// debug
	//log("db: sending %li bytes",size);
	n = sendto(sock,dgram,size,0,(struct sockaddr *)&to,sizeof(to));
	if ( n != size ) return log("net: pingtest: sendto returned "
				    "%i "
				    "(should have returned %li)",n,size);
	sends++;
 readLoop2:
	// recvfrom() treats fromLen as a value-result parameter, so it
	// must be reset to the size of "from" before every call or the
	// returned address is undefined
	fromLen = sizeof(from);
	// loop until we read something
	n = recvfrom (sock,dgram,DGRAM_SIZE,0,(sockaddr *)&from, &fromLen);
	if (gettimeofdayInMilliseconds() - start>2000) {lost++; goto sendLoop;}
	if ( n <= 0 ) goto readLoop2; // { sched_yield(); goto readLoop2; }
	// for what transId?
	long tid = up->getTransId ( dgram , n );
	// -1 is error
	if ( tid < 0 ) return log("net: pingtest: Bad transId.");
	// if no match, it was recovered, keep reading
	if ( tid != transId ) {
		log("net: pingTest: Recovered tid=%li, current tid=%li. "
		    "Resend?",tid,transId);
		recovered++;
		goto readLoop2;
	}
	// an ack?
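	// (three kinds of datagrams can arrive while waiting: protocol
	// ACKs, which are counted and skipped; recovered replies for an
	// old transId, which are re-read; and the real reply, which is
	// timed and then ACKed back before the next send)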
if ( up->isAck ( dgram , n ) ) { acks++; // debug //log("db: read ack of %li bytes",n); goto readLoop2; } // debug //log("db: read %li bytes",n); // mark the time long long took = gettimeofdayInMilliseconds()-start; if ( took > 1 ) log("net: pingtest: got reply #%li (tid=%li) " "in %lli ms",replies,transId,took); // make average avg += took; // the reply? replies++; // send back an ack size = up->makeAck ( dgram, dnum, transId , true/*weinit?*/ , false ); n = sendto(sock,dgram,size,0,(struct sockaddr *)&to,sizeof(to)); // debug //log("db: send %li bytes",n); // mark our first read goto sendLoop; } int injectFileTest ( long reqLen , long hid ) { // make a mime char *req = (char *)mmalloc ( reqLen , "injecttest"); if ( ! req ) return log("build: injecttest: malloc(%li) " "failed", reqLen)-1; char *p = req; char *pend = req + reqLen; sprintf ( p , "POST /inject HTTP/1.0\r\n" "Content-Length: 000000000\r\n" // placeholder "Content-Type: text/html\r\n" "Connection: Close\r\n" "\r\n" ); p += gbstrlen(p); char *content = p; sprintf ( p , "u=%li.injecttest.com&c=&delete=0&ip=4.5.6.7&iplookups=0&" "dedup=1&rs=7&" "quick=1&hasmime=1&ucontent=" "HTTP 200\r\n" "Last-Modified: Sun, 06 Nov 1994 08:49:37 GMT\r\n" "Connection: Close\r\n" "Content-Type: text/html\r\n" "\r\n" , time(NULL) ); p += gbstrlen(p); // now store random words (just numbers of 8 digits each) while ( p + 12 < pend ) { long r ; r = rand(); sprintf ( p , "%010lu " , r ); p += gbstrlen ( p ); } // set content length long clen = p - content; char *ptr = req ; // find start of the 9 zeroes while ( *ptr != '0' || ptr[1] !='0' ) ptr++; // store length there sprintf ( ptr , "%09lu" , clen ); // remove the \0 ptr += gbstrlen(ptr); *ptr = '\r'; // what is total request length? long rlen = p - req; // generate the filename char *filename = "/tmp/inject-test"; File f; f.set ( filename ); f.unlink(); if ( ! f.open ( O_RDWR | O_CREAT ) ) return log("build: injecttest: Failed to create file " "%s for testing", filename) - 1; if ( rlen != f.write ( req , rlen , 0 ) ) return log("build: injecttest: Failed to write %li " "bytes to %s", rlen,filename) - 1; f.close(); mfree ( req , reqLen , "injecttest" ); Host *h = g_hostdb.getHost(hid); char *ips = iptoa(h->m_ip); // now inject the file return injectFile ( filename , ips , 0 , MAX_DOCID , false ); } //#define MAX_INJECT_SOCKETS 10 #define MAX_INJECT_SOCKETS 1 static void doInject ( int fd , void *state ) ; static void injectedWrapper ( void *state , TcpSocket *s ) ; static TcpServer s_tcp; static File s_file; static long long s_off = 0; // offset into file static long s_ip; static short s_port; static Hostdb s_hosts2; static long s_rrn = 0; static long s_registered = 1; static long s_maxSockets = MAX_INJECT_SOCKETS; static long s_outstanding = 0; static bool s_isDelete; static long s_injectTitledb; static key_t s_titledbKey; static char *s_req [MAX_INJECT_SOCKETS]; static long long s_docId[MAX_INJECT_SOCKETS]; static char s_init5 = false; static long long s_endDocId; int injectFile ( char *filename , char *ips , long long startDocId , long long endDocId , bool isDelete ) { g_mem.init ( 50000000 ); // set up the loop if ( ! g_loop.init() ) return log("build: inject: Loop init " "failed.")-1; // init the tcp server, client side only if ( ! s_tcp.init( NULL , // requestHandlerWrapper , getMsgSize, NULL , // getMsgPiece , 0 , // port, only needed for server , &s_maxSockets ) ) return false; s_tcp.m_doReadRateTimeouts = false; s_isDelete = isDelete; if ( ! 
s_init5 ) { s_init5 = true; for ( long i = 0; i < MAX_INJECT_SOCKETS ; i++ ) s_req[i] = NULL; } // get host //Host *h = g_hostdb.getHost ( hid ); //if ( ! h ) return log("build: inject: Hostid %li is invalid.", // hid)-1; char *colon = strstr(ips,":"); long port = 8000; if ( colon ) { *colon = '\0'; port = atoi(colon+1); } long ip = 0; // is ip field a hosts.conf instead? if ( strstr(ips,".conf") ) { if ( ! s_hosts2.init ( ips , 0 ) ) { fprintf(stderr,"failed to load %s",ips); exit(0); } s_ip = 0; s_port = 0; } else { ip = atoip(ips,strlen(ips)); if ( ip == 0 || ip == -1 ) { log("provided ip \"%s\" is a bad ip. " "exiting\n",ips); exit(0); } if ( port == 0 || port == -1 ) { log("bad port. exiting\n"); exit(0); } s_ip = ip;//h->m_ip; s_port = port;//h->m_httpPort; } s_injectTitledb = false; char *coll = "main"; if ( strcmp(filename,"titledb") == 0 ) { long hostId = 0; Host *h = g_hostdb.getHost ( hostId ); if ( ! h ) { log("db: No host has id %li.",hostId); exit(0);} if ( ! g_conf.init ( h->m_dir ) ) { // , h->m_hostId ) ) { log("db: Conf init failed." ); exit(0); } // init the loop, needs g_conf if ( ! g_loop.init() ) { log("db: Loop init failed." ); exit(0); } // set up the threads, might need g_conf if ( ! g_threads.init() ) { log("db: Threads init failed." ); exit(0); } s_injectTitledb = true; s_titledbKey.setMin(); // read where we left off from file if possible char fname[256]; sprintf(fname,"%s/lastinjectdocid.dat",g_hostdb.m_dir); SafeBuf ff; ff.fillFromFile(fname); if ( ff.length() > 1 ) { long long ffdocId = atoll(ff.getBufStart() ); // if process got killed in the middle of write // i guess the stored docid could be corrupted! // so make sure its in startDocId,endDocId range if ( ffdocId > 0 && ffdocId >= startDocId && ffdocId < endDocId ) startDocId = ffdocId; else log("build: saved docid %lli not " "in [%lli,%lli]", ffdocId, startDocId, endDocId ); } if ( startDocId != 0LL ) s_titledbKey = g_titledb.makeFirstKey(startDocId); s_endDocId = endDocId; // so we do not try to merge files, or write any data: g_dumpMode = true; //g_conf.m_checksumdbMaxDiskPageCacheMem = 0; //g_conf.m_spiderdbMaxDiskPageCacheMem = 0; //g_conf.m_urldbMaxDiskPageCacheMem = 0; g_titledb.init (); g_collectiondb.init(true); g_titledb.addColl ( coll, false ); } else { // open file s_file.set ( filename ); if ( ! s_file.open ( O_RDONLY ) ) return log("build: inject: Failed to open file %s " "for reading.", filename) - 1; s_off = 0; } // register sleep callback to get started if ( ! g_loop.registerSleepCallback(1, NULL, doInject) ) return log("build: inject: Loop init failed.")-1; // run the loop if ( ! g_loop.runLoop() ) return log("build: inject: Loop " "run failed.")-1; // dummy return return 0; } void doInject ( int fd , void *state ) { if ( s_registered ) { s_registered = 0; g_loop.unregisterSleepCallback ( NULL, doInject ); } long long fsize ; if ( ! s_injectTitledb ) fsize = s_file.getFileSize(); // turn off threads so this happens right away g_conf.m_useThreads = false; loop: long reqLen; long reqAlloc; char *req; // if reading from our titledb and injecting into another cluster if ( s_injectTitledb ) { // turn off threads so this happens right away g_conf.m_useThreads = false; key_t endKey; //endKey.setMax(); endKey = g_titledb.makeFirstKey(s_endDocId); RdbList list; Msg5 msg5; Msg5 msg5b; char *coll = "main"; msg5.getList ( RDB_TITLEDB , coll, &list , (char *)&s_titledbKey , (char *)&endKey , 100 , // minRecSizes , true , // includeTree , false , // add to cache? 
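	// The lastinjectdocid.dat logic above is checkpoint/resume with
	// validation: a kill mid-write can leave a torn value on disk, so
	// anything outside [startDocId,endDocId) is rejected. A standalone
	// sketch of the load side, using stdio instead of SafeBuf:
#if 0
#include <stdio.h>
	static long long loadCheckpointSketch ( const char *fname ,
						long long startDocId ,
						long long endDocId ) {
		FILE *f = fopen ( fname , "r" );
		if ( ! f ) return startDocId;
		long long saved = 0LL;
		int n = fscanf ( f , "%lld" , &saved );
		fclose ( f );
		// only trust a cleanly-parsed value inside the legal range
		if ( n == 1 && saved >= startDocId && saved < endDocId )
			return saved;
		return startDocId;
	}
#endif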
0 , // max cache age 0 , // startFileNum , -1, // numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , // cache key ptr 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &msg5b ); // all done if empty if ( list.isEmpty() ) { g_loop.reset(); exit(0); } // loop over entries in list list.getCurrentKey((char *) &s_titledbKey); // advance for next s_titledbKey += 1; // is it a delete? char *rec = list.getCurrentRec (); long recSize = list.getCurrentRecSize(); // skip negative keys! if ( (rec[0] & 0x01) == 0x00 ) goto loop; // re-enable threads i guess g_conf.m_useThreads = true; // set and uncompress //TitleRec tr; XmlDoc xd; if ( ! xd.set2 ( rec , recSize , coll , NULL , // safebuf 0 , // niceness NULL ) ) { // spiderrequest log("build: inject skipping corrupt title rec" ); goto loop; } // sanity! if ( xd.size_utf8Content > 5000000 ) { log("build: inject skipping huge title rec" ); goto loop; } // get the content length. uenc can be 2140 bytes! seen it! reqAlloc = xd.size_utf8Content + 6000; // make space for content req = (char *)mmalloc ( reqAlloc , "maininject" ); if ( ! req ) { log("build: inject: Could not allocate %li bytes for " "request at offset %lli",reqAlloc,s_off); exit(0); } char *ipStr = iptoa(xd.m_ip); // encode the url char *url = xd.getFirstUrl()->getUrl(); char uenc[5000]; urlEncode ( uenc , 4000 , url , strlen(url) , true ); char *content = xd.ptr_utf8Content; long contentLen = xd.size_utf8Content; if ( contentLen > 0 ) contentLen--; char c = content[contentLen]; content[contentLen] = '\0'; // form what we would read from disk reqLen = sprintf(req, // print as unencoded content for speed "POST /inject HTTP/1.0\r\n" "Content-Length: 000000000\r\n"//placeholder "Content-Type: text/html\r\n" "Connection: Close\r\n" "\r\n" // now the post cgi parms "c=%s&" // quick docid only reply "quick=1&" // url of injecting page "u=%s&" "ip=%s&" //"firstip=%s&" "firstindexed=%lu&" "lastspidered=%lu&" // prevent looking up firstips // on all outlinks for speed: "spiderlinks=0&" "hopcount=%li&" "newonly=2&" // only inject if new "dontlog=1&" "charset=%li&" "ucontent=" // first the mime //"HTTP 200\r\n" //"Connection: Close\r\n" //"Content-Type: text/html\r\n" //"Content-Length: %li\r\n" //"\r\n" // then the content of the injecting page "%s" , coll , uenc , ipStr //, ipStr , xd.m_firstIndexedDate , xd.m_spideredTime , (long)xd.getHopCount() , (long)xd.m_charset //, contentLen , content ); content[contentLen] = c; if ( reqLen >= reqAlloc ) { log("inject: bad engineer here"); char *xx=NULL;*xx=0; } // set content length char *start = strstr(req,"c="); long realContentLen = strlen(start); char *ptr = req ; // find start of the 9 zeroes while ( *ptr != '0' || ptr[1] !='0' ) ptr++; // store length there sprintf ( ptr , "%09lu" , realContentLen ); // remove the \0 ptr += strlen(ptr); *ptr = '\r'; // map it long i; for ( i = 0 ; i < MAX_INJECT_SOCKETS ; i++ ) { // skip if occupied if ( s_req[i] ) continue; s_req [i] = req; s_docId[i] = xd.m_docId; break; } if ( i >= MAX_INJECT_SOCKETS ) log("build: could not add req to map"); } else { // are we done? 
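	// The request builders in this file (injectFileTest() above and
	// the branches of doInject()) print a fixed-width
	// "Content-Length: 000000000" placeholder, then patch the real
	// length in once the body is assembled, so the header size never
	// changes. A minimal sketch of that patch; the strstr() locate is
	// mine (the code here scans for the first two '0's) and it assumes
	// the length fits in 9 digits:
#if 0
#include <stdio.h>
#include <string.h>
	static void patchContentLengthSketch ( char *req , char *bodyStart ) {
		long clen = strlen ( bodyStart );
		char *ptr = strstr ( req , "Content-Length: 000000000" );
		if ( ! ptr ) return;
		ptr += strlen ( "Content-Length: " );
		// sprintf writes exactly 9 digits plus a NUL that
		// clobbers the '\r'
		sprintf ( ptr , "%09li" , clen );
		// restore the byte the NUL overwrote
		ptr[9] = '\r';
	}
#endif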
if ( s_off >= fsize ) { log("inject: done parsing file"); g_loop.reset(); exit(0); } // read the mime char buf [ 1000*1024 ]; long maxToRead = 1000*1024; long toRead = maxToRead; if ( s_off + toRead > fsize ) toRead = fsize - s_off; long bytesRead = s_file.read ( buf , toRead , s_off ) ; if ( bytesRead != toRead ) { log("build: inject: Read of %s failed at offset " "%lli", s_file.getFilename(), s_off); exit(0); } char *fend = buf + toRead; char *pbuf = buf; // partap padding? if ( pbuf[0] == '\n' ) pbuf++; if ( pbuf[0] == '\n' ) pbuf++; // need "++URL: " for ( ; *pbuf && strncmp(pbuf,"+++URL: ",8) ; pbuf++ ); // none? if ( ! *pbuf ) { log("inject: done!"); exit(0); } // sometimes line starts with "URL: http://www.xxx.com/\n" char *url = pbuf + 8; // NULL; // skip over url pbuf = strchr(pbuf,'\n'); // null term url *pbuf = '\0'; // log it log("inject: injecting url %s",url); // debug //if ( strstr(url,"worldexecutive.com") ) // log("poo"); // skip to next line pbuf++; // get offset into "buf" long len = pbuf - buf; // subtract that from toRead so it is the available bytes left toRead -= len; // advance this for next read s_off += len; //if ( ! strncmp(pbuf,"URL: ", 5 ) ) { // if it's not a mime header assume just a url //if ( strncmp(pbuf,"GET /",5) && // strncmp(pbuf,"POST /",6) ) { // skip "URL: " /* if ( strncmp(pbuf,"+++URL: ",8) == 0 ) url = pbuf + 8; else url = pbuf; // find \n pbuf = strchr(pbuf,'\n'); *pbuf = '\0'; pbuf++; long len = pbuf - buf; toRead -= len; s_off += len; } */ // should be a mime that starts with GET or POST //char *mimePtr = pbuf; HttpMime m; if ( ! m.set ( pbuf , toRead , NULL ) ) { if ( toRead > 128 ) toRead = 128; pbuf [ toRead ] = '\0'; log("build: inject: Failed to set mime at offset " "%lli where request=%s",s_off,buf); exit(0); } // find the end of it, the next "URL: " line or // end of file char *p = pbuf; char *contentPtrEnd = fend; for ( ; p < fend ; p++ ) { if ( p[0] == '+' && p[1] == '+' && p[2] == '+' && p[3] == 'U' && p[4] == 'R' && p[5] == 'L' && p[6] == ':' && p[7] == ' ' ) { contentPtrEnd = p; break; } } // point to the content (NOW INCLUDE MIME!) char *contentPtr = pbuf;// + m.getMimeLen(); long contentPtrLen = contentPtrEnd - contentPtr; if ( contentPtrEnd == fend && bytesRead == maxToRead ) { log("inject: not reading enough content to inject " "url %s . increase maxToRead from %li",url, maxToRead); exit(0); } // get the length of content (includes the submime for // injection) long contentLen = m.getContentLen(); if ( ! url && contentLen == -1 ) { log("build: inject: Mime at offset %lli does not " "specify required Content-Length: XXX field.", s_off); exit(0); } // alloc space for mime and content //reqAlloc = 5000; //if ( ! url ) reqAlloc += m.getMimeLen() + contentLen ; reqAlloc = contentPtrLen + 2 + 6000; // make space for content req = (char *)mmalloc ( reqAlloc , "maininject" ); if ( ! req ) { log("build: inject: Could not allocate %li bytes for " "request at offset %lli",reqAlloc,s_off); exit(0); } char *rp = req; // a different format? //if ( url ) { char *ipStr = "1.2.3.4"; //long recycle = 0; //if ( s_isDelete ) recycle = 1; rp += sprintf(rp, "POST /inject HTTP/1.0\r\n" "Content-Length: 000000000\r\n"//bookmrk "Content-Type: text/html\r\n" "Connection: Close\r\n" "\r\n" "c=main&" // do parsing consistency testing (slower!) 
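	// The u= parm below is filled by Gigablast's urlEncode(); for
	// reference, this is the underlying transform: percent-encode
	// everything outside the unreserved set. A generic sketch only;
	// urlEncode() itself takes extra options not shown here:
#if 0
#include <ctype.h>
	static long urlEncodeSketch ( char *dst , long dstSize ,
				      const char *src , long srcLen ) {
		const char *hex = "0123456789ABCDEF";
		long j = 0;
		for ( long i = 0 ; i < srcLen && j + 4 < dstSize ; i++ ) {
			unsigned char c = (unsigned char)src[i];
			if ( isalnum(c) || c=='-' || c=='_' ||
			     c=='.' || c=='~' ) {
				dst[j++] = c;
				continue;
			}
			dst[j++] = '%';
			dst[j++] = hex[c >>  4];
			dst[j++] = hex[c & 0x0f];
		}
		dst[j] = '\0';
		return j; // bytes written, like the urlEncode() calls here
	}
#endif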
//"dct=1&" // mime is in the "&ucontent=" parm "hasmime=1&" // prevent looking up firstips // on all outlinks for speed: "spiderlinks=0&" "quick=1&" // quick reply "dontlog=1&" "ip=%s&" //"recycle=%li&" "delete=%li&" "u=", ipStr, //recycle, (long)s_isDelete); // url encode the url rp += urlEncode ( rp , 4000 , url , gbstrlen(url) ); // finish it up rp += sprintf(rp,"&ucontent="); //} if ( ! url ) { // what is this? char *xx=NULL;*xx=0; /* // stick mime in there memcpy ( rp , mimePtr , m.getMimeLen() ); // skip that rp += m.getMimeLen(); // turn \n\n into \r\n\r\n if ( rp[-2] == '\n' && rp[-1] == '\n' ) { rp[-2] = '\r'; rp[ 0] = '\r'; rp[ 1] = '\n'; rp += 2; } // advance s_off += m.getMimeLen(); // read from file into content long contRead = contentLen; if ( s_off + contRead > fsize ) { log("build: inject: Content-Length of %li " "specified " "for content at offset %lli would breech " "EOF", contentLen,s_off); exit(0); } if ( contRead != s_file.read ( rp ,contRead , s_off)) { log("build: inject: Read of %s failed at " "offset %lli", s_file.getFilename(), s_off); exit(0); } // skip that rp += contRead; // success s_off += contRead; */ } // store the content after the &ucontent memcpy ( rp , contentPtr , contentPtrLen ); rp += contentPtrLen; s_off += contentPtrLen; // just for ease of display *rp = '\0'; // set content length char *start = strstr(req,"c="); long realContentLen = gbstrlen(start); char *ptr = req ; // find start of the 9 zeroes while ( *ptr != '0' || ptr[1] !='0' ) ptr++; // store length there sprintf ( ptr , "%09lu" , realContentLen ); // remove the \0 ptr += strlen(ptr); *ptr = '\r'; // set this reqLen = rp - req; // sanity if ( reqLen > reqAlloc ) { char *xx=NULL;*xx=0; } } long ip = s_ip; long port = s_port; // try hosts.conf if ( ip == 0 ) { // round robin over hosts in s_hosts2 if ( s_rrn >= s_hosts2.getNumHosts() ) s_rrn = 0; Host *h = s_hosts2.getHost ( s_rrn ); ip = h->m_ip; port = h->m_httpPort; s_rrn++; } // now inject it bool status = s_tcp.sendMsg ( ip , port , req , reqAlloc ,//Len , reqLen , reqLen , NULL , injectedWrapper , 9999*60*1000 , // timeout, 60days -1 , // maxTextDocLen -1 );// maxOtherDocLen // launch another if blocked //if ( ! status ) return; if ( ! status ) { //long nh = g_hostdb.getNumHosts(); //nh = (nh * 15) / 10; //if ( nh > MAX_INJECT_SOCKETS - 10 ) // nh = MAX_INJECT_SOCKETS - 10; //if ( nh < 5 ) nh = 5; // limit to one socket right now //if ( ++s_outstanding < 1 ) goto loop; if ( ++s_outstanding < MAX_INJECT_SOCKETS ) goto loop; return; } if ( g_errno ) log("build: inject had error: %s.",mstrerror(g_errno)); // free if did not block, tcpserver frees on immediate error else mfree ( req , reqAlloc , "maininject" ); // loop if not goto loop; } void injectedWrapper ( void *state , TcpSocket *s ) { s_outstanding--; // errno? if ( g_errno ) { log("build: inject: Got server error: %s.", mstrerror(g_errno)); doInject(0,NULL); return; } // free send buf char *req = s->m_sendBuf; long reqAlloc = s->m_sendBufSize; mfree ( req , reqAlloc , "maininject"); s->m_sendBuf = NULL; long i; static long s_last = 0; long now = getTimeLocal(); // save docid every 10 seconds if ( now - s_last > 10 ) { long long minDocId = 0x0000ffffffffffffLL; // get min outstanding docid inject request for ( i = 0 ; i < MAX_INJECT_SOCKETS ; i++ ) { // skip if occupied if ( ! s_req[i] ) continue; if ( s_docId[i] < minDocId ) minDocId = s_docId[i]; } // map it bool saveIt = false; // are we the min? 
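	// Why the code below saves the MINIMUM outstanding docid rather
	// than the one that just finished: everything below the minimum is
	// known-complete, so resuming from it can repeat work but can
	// never skip a document (low-watermark checkpointing). A
	// self-contained sketch of the computation done just above:
#if 0
	static long long safeResumeDocIdSketch ( long long *inflight ,
						 long n ) {
		long long minDocId = 0x0000ffffffffffffLL; // same cap as above
		for ( long i = 0 ; i < n ; i++ ) {
			if ( ! inflight[i]          ) continue; // empty slot
			if ( inflight[i] < minDocId ) minDocId = inflight[i];
		}
		return minDocId;
	}
#endif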
long i; for ( i = 0 ; i < MAX_INJECT_SOCKETS ; i++ ) { // skip if occupied if ( s_req[i] != req ) continue; // we got our request if ( s_docId[i] == minDocId ) saveIt = true; break; } if ( saveIt ) { s_last = now; SafeBuf sb; sb.safePrintf("%lli\n",minDocId); char fname[256]; sprintf(fname,"%s/lastinjectdocid.dat",g_hostdb.m_dir); sb.dumpToFile(fname); } } // remove ourselves from map for ( i = 0 ; i < MAX_INJECT_SOCKETS ; i++ ) if ( s_req[i] == req ) s_req[i] = NULL; // get return code char *reply = s->m_readBuf; logf(LOG_INFO,"build: inject: return=\n%s",reply); doInject(0,NULL); } void saveRdbs ( int fd , void *state ) { long long now = gettimeofdayInMilliseconds(); long long last; Rdb *rdb ; // . try saving every 10 minutes from time of last write to disk // . if nothing more added to tree since then, Rdb::close() return true //long long delta = 10LL*60LL*1000LL; // . this is in MINUTES long long delta = (long long)g_conf.m_autoSaveFrequency *60000LL; if ( delta <= 0 ) return; // jitter it up a bit so not all hostIds save at same time, 15 secs delta += (long long)(g_hostdb.m_hostId % 10) * 15000LL + (rand()%7500); rdb = g_tagdb.getRdb(); last = rdb->getLastWriteTime(); if ( now - last > delta ) if ( ! rdb->close(NULL,NULL,false,false)) return; rdb = g_catdb.getRdb(); last = rdb->getLastWriteTime(); if ( now - last > delta ) if ( ! rdb->close(NULL,NULL,false,false)) return; //rdb = g_indexdb.getRdb(); //last = rdb->getLastWriteTime(); //if ( now - last > delta ) // if ( ! rdb->close(NULL,NULL,false,false)) return; rdb = g_posdb.getRdb(); last = rdb->getLastWriteTime(); if ( now - last > delta ) if ( ! rdb->close(NULL,NULL,false,false)) return; //rdb = g_datedb.getRdb(); //last = rdb->getLastWriteTime(); //if ( now - last > delta ) // if ( ! rdb->close(NULL,NULL,false,false)) return; rdb = g_titledb.getRdb(); last = rdb->getLastWriteTime(); if ( now - last > delta ) if ( ! rdb->close(NULL,NULL,false,false)) return; //rdb = g_tfndb.getRdb(); //last = rdb->getLastWriteTime(); //if ( now - last > delta ) // if ( ! rdb->close(NULL,NULL,false,false)) return; rdb = g_spiderdb.getRdb(); last = rdb->getLastWriteTime(); if ( now - last > delta ) if ( ! rdb->close(NULL,NULL,false,false)) return; //rdb = g_checksumdb.getRdb(); //last = rdb->getLastWriteTime(); //if ( now - last > delta ) // if ( ! rdb->close(NULL,NULL,false,false)) return; rdb = g_clusterdb.getRdb(); last = rdb->getLastWriteTime(); if ( now - last > delta ) if ( ! rdb->close(NULL,NULL,false,false)) return; rdb = g_statsdb.getRdb(); last = rdb->getLastWriteTime(); if ( now - last > delta ) if ( ! rdb->close(NULL,NULL,false,false)) return; } // JAB: warning abatement #if 0 bool checkDataParity ( ) { //return true; g_threads.disableThreads(); // test the first collection char *coll = g_collectiondb.getCollName ( 0 ); Msg5 msg5; Msg5 msg5b; RdbList list; key_t startKey; key_t endKey; startKey.setMin(); endKey.setMax(); //long minRecSizes = 64000; // CHECK INDEXDB log ( LOG_INFO, "db: Verifying Indexdb..." ); if ( ! msg5.getList ( RDB_INDEXDB , coll , &list , startKey , endKey , 64000 , // minRecSizes , true , // includeTree , false , // add to cache? 0 , // max cache age 0 , // startFileNum , -1 , // numFiles , NULL , // state NULL , // callback 0 , // niceness false ))// err correction? return log("db: HEY! it did not block"); long count = 0; long got = 0; for ( list.resetListPtr() ; ! 
list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		count++;
		//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
		uint32_t shardNum = getShardNum ( RDB_INDEXDB, &k );
		// compare against our own shard; this used to compare the
		// old groupId (assumes Host::m_shardNum from Hostdb.h)
		if ( shardNum == (uint32_t)g_hostdb.m_myHost->m_shardNum )
			got++;
	}
	if ( got != count ) {
		log ("db: Out of first %li records in indexdb, only %li "
		     "belong to our shard.",count,got);
		// exit if NONE, we probably got the wrong data
		if ( got == 0 ) return log("db: Are you sure you have the "
					   "right data in the right "
					   "directory? Exiting.");
		return log ( "db: Exiting due to Indexdb inconsistency." );
	}
	log ( LOG_INFO, "db: Indexdb passed verification successfully. "
	      "(%li)", count );
	// CHECK TITLEDB
	log ( LOG_INFO, "db: Verifying Titledb..." );
	if ( ! msg5.getList ( RDB_TITLEDB ,
			      coll ,
			      &list ,
			      startKey ,
			      endKey ,
			      1024*1024 , // minRecSizes ,
			      true , // includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      0 , // startFileNum ,
			      -1 , // numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false , // err correction?
			      NULL , // cache key ptr
			      0 , // retry num
			      -1 , // maxRetries
			      true , // compensate for merge
			      -1LL , // sync point
			      &msg5b ))
		return log("db: HEY! it did not block");
	count = 0;
	got   = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		count++;
		uint32_t shardNum = getShardNum ( RDB_TITLEDB , &k );
		//long groupId = k.n1 & g_hostdb.m_groupMask;
		if ( shardNum == (uint32_t)g_hostdb.m_myHost->m_shardNum )
			got++;
	}
	if ( got != count ) {
		log ("db: Out of first %li records in titledb, only %li "
		     "belong to our shard.",count,got);
		// exit if NONE, we probably got the wrong data
		if ( count > 10 && got == 0 )
			return log("db: Are you sure you have the right "
				   "data in the right directory? Exiting.");
		return log ( "db: Exiting due to Titledb inconsistency." );
	}
	log ( LOG_INFO, "db: Titledb passed verification successfully. "
	      "(%li)", count );
	// CHECK TFNDB
	log ( LOG_INFO, "db: Verifying Tfndb..." );
	if ( ! msg5.getList ( RDB_TFNDB ,
			      coll ,
			      &list ,
			      startKey ,
			      endKey ,
			      64000 , // minRecSizes ,
			      true , // includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      0 , // startFileNum ,
			      -1 , // numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false ))// err correction?
		return log("db: HEY! it did not block");
	count = 0;
	got   = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		count++;
		// verify the shard
		uint32_t shardNum = getShardNum ( RDB_TFNDB , &k );
		if ( shardNum == (uint32_t)g_hostdb.m_myHost->m_shardNum )
			got++;
	}
	if ( got != count ) {
		log ("db: Out of first %li records in tfndb, only %li "
		     "passed verification.",count,got);
		// exit if NONE, we probably got the wrong data
		if ( got == 0 ) return log("db: Are you sure you have the "
					   "right data in the right "
					   "directory? Exiting.");
		return log ( "db: Exiting due to Tfndb inconsistency." );
	}
	log ( LOG_INFO, "db: Tfndb passed verification successfully. "
	      "(%li)", count );
	// DONE
	g_threads.enableThreads();
	return true;
}
#endif
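// What the disabled checkDataParity() above verifies, reduced to its
// core: hash every key to its owning shard and demand they all map to
// us. Zero matches usually means the data dir belongs to another host;
// a partial match means mixed-up data. shardOfSketch() is a stand-in
// for getShardNum(), not a real API:
#if 0
static unsigned int shardOfSketch ( long long key , unsigned int numShards ) {
	return (unsigned int)((unsigned long long)key % numShards);
}
static bool verifyShardSketch ( long long *keys , long n ,
				unsigned int numShards ,
				unsigned int myShard ) {
	long mine = 0;
	for ( long i = 0 ; i < n ; i++ )
		if ( shardOfSketch ( keys[i] , numShards ) == myShard )
			mine++;
	// all-or-nothing: anything else means wrong or corrupt data dirs
	return ( mine == n );
}
#endif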
bool shutdownOldGB ( short port ) {
	log("db: Saving and shutting down the other gb process.");
	// now make a new socket descriptor
	int sd = socket ( AF_INET , SOCK_STREAM , 0 ) ;
	// return false and set g_errno on failure
	if ( sd < 0 ) {
		// copy errno to g_errno
		g_errno = errno;
		log("tcp: Failed to create new socket: %s.",
		    mstrerror(g_errno));
		return false;
	}
	struct sockaddr_in to;
	to.sin_family = AF_INET;
	// our ip's are always in network order, but ports are in host order
	to.sin_addr.s_addr = atoip("127.0.0.1",9);
	to.sin_port        = htons((unsigned short)port);
	bzero ( &(to.sin_zero) , 8 ); // TODO: bzero too slow?
	// note it
	log("db: Connecting to port %hu.",port);
	// connect to the socket. this should block until it does.
 again:
	if ( ::connect ( sd, (sockaddr *)&to, sizeof(to) ) != 0 ) {
		if ( errno == EINTR ) goto again;
		return log("admin: Got connect error: %s.",mstrerror(errno));
	}
	// note it
	log("db: Connected. Issuing shutdown command.");
	// send the message
	char *msg = "GET /master?usave=1 HTTP/1.0\r\n\r\n";
	write ( sd , msg , gbstrlen(msg) );
	// wait for him to shut down the socket
	char rbuf [5000];
	long n;
 readmore:
	errno = 0;
	n = read ( sd , rbuf, 5000 );
	if ( n == -1 && errno == EINTR ) goto readmore;
	if ( n == -1 ) return log("db: Got error reading reply: %s.",
				  mstrerror(errno));
	// success...
	close(sd);
	log("db: Received reply from old gb process.");
	return true;
}

bool memTest() {
	// let's ensure our core file can dump
	struct rlimit lim;
	lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
	if ( setrlimit(RLIMIT_CORE,&lim) )
		log("db: setrlimit: %s.", mstrerror(errno) );
	void *ptrs[4096];
	int numPtrs=0;
	//int totalMem=0;
	int i;
	if ( ! g_log.init( g_hostdb.m_logFilename ) ) {
		fprintf (stderr,"db: Log file init failed.\n" );
		return false;
	}
	//g_mem.init(0xffffffff);
	g_mem.m_maxMem = 0xffffffffLL;
	log(LOG_INIT, "memtest: Testing memory bus bandwidth.");
	// . read in 20MB 100 times (~2GB total)
	// . tests main memory throughput
	log(LOG_INIT, "memtest: Testing main memory.");
	membustest ( 20*1024*1024 , 100 , true );
	// . read in 1MB 2,000 times (~2GB)
	// . tests the L2 cache
	log(LOG_INIT, "memtest: Testing 1MB L2 cache.");
	membustest ( 1024*1024 , 2000 , true );
	// . read in 8KB 100,000 times (~0.8GB)
	// . tests the L1 cache
	log(LOG_INIT, "memtest: Testing 8KB L1 cache.");
	membustest ( 8000 , 100000 , true );
	log(LOG_INIT, "memtest: Allocating up to %lld bytes",g_mem.m_maxMem);
	for (i=0;i<4096;i++) {
		ptrs[numPtrs] = mmalloc(1024*1024, "memtest");
		if (!ptrs[numPtrs]) break;
		numPtrs++;
	}
	log(LOG_INIT, "memtest: Was able to allocate %lld bytes of a total "
	    "of %lld bytes of memory attempted.",
	    g_mem.m_used,g_mem.m_maxMem);
	log(LOG_INIT, "memtest: Dumping core to test max core file size.");
	char *xx = NULL; *xx = 0;
	// not reached once the core dumps, but free the test allocations
	for (i=0;i<numPtrs;i++) mfree(ptrs[i],1024*1024,"memtest");
	return true;
}

// . read "nb" bytes per pass, "loops" passes; read if "readf", else write
// . (signature inferred from the membustest() calls in memTest() above)
void membustest ( long nb , long loops , bool readf ) {
	// pass counter for the timing loop below
	long count = loops;
	// cap the buffer at 50MB
	if ( nb > 50*1024*1024 ) {
		log(LOG_INIT,"memtest: truncating to 50 Megabytes.");
		nb = 50*1024*1024;
	}
	long n = nb ; //* 1024 * 1024 ;
	// make n divisible by 64
	//long rem = n % 64;
	//if ( rem > 0 ) n += 64 - rem;
	// get some memory, 4 megs
	//#undef malloc
	//register char *buf = (char *)malloc(n + 64);
	//#define malloc coreme
	long bufSize = 50*1024*1024;
	register char *buf = (char *) mmalloc ( bufSize , "main" );
	if ( ! buf ) return;
	char *bufStart = buf;
	register char *bufEnd = buf + n;
	//fprintf(stderr,"pre-reading %li NB \n",nb);
	// pre-read it so sbrk() can do its thing
	for ( long i = 0 ; i < n ; i++ ) buf[i] = 1;
	// time stamp
	long long t = gettimeofdayInMilliseconds();
	// . time the read loop
	// . each read should only be 2 assembly movl instructions:
	//      movl -52(%ebp), %eax
	//      movl (%eax), %eax
	//      movl -52(%ebp), %eax
	//      movl 4(%eax), %eax
	// ...
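	// What the loop below measures, in one line: after "loops" passes
	// over an "n"-byte buffer in "ms" milliseconds, throughput is
	//     ( 1000.0 * loops * n ) / ms / ( 1024.0 * 1024.0 )  MB/sec
	// e.g. 100 passes over 20MB in 1000ms -> 2000MB / 1.0s = 2000 MB/s.
	// A sketch of that arithmetic, matching the stats code below:
#if 0
	static double mbPerSecSketch ( long loops , long n , long long ms ) {
		if ( ms <= 0 ) ms = 1; // the code below bumps a 0ms clock too
		return ( 1000.0 * (double)loops * (double)n )
		       / (double)ms / ( 1024.0 * 1024.0 );
	}
#endif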
loop: register long c; if ( readf ) { while ( buf < bufEnd ) { // repeat 16x for efficiency.limit comparison to bufEnd c = *(long *)(buf+ 0); c = *(long *)(buf+ 4); c = *(long *)(buf+ 8); c = *(long *)(buf+12); c = *(long *)(buf+16); c = *(long *)(buf+20); c = *(long *)(buf+24); c = *(long *)(buf+28); c = *(long *)(buf+32); c = *(long *)(buf+36); c = *(long *)(buf+40); c = *(long *)(buf+44); c = *(long *)(buf+48); c = *(long *)(buf+52); c = *(long *)(buf+56); c = *(long *)(buf+60); buf += 64; } } else { while ( buf < bufEnd ) { // repeat 8x for efficiency. limit comparison to bufEnd *(long *)(buf+ 0) = 0; *(long *)(buf+ 4) = 1; *(long *)(buf+ 8) = 2; *(long *)(buf+12) = 3; *(long *)(buf+16) = 4; *(long *)(buf+20) = 5; *(long *)(buf+24) = 6; *(long *)(buf+28) = 7; buf += 32; } } if ( --count > 0 ) { buf = bufStart; goto loop; } // completed long long now = gettimeofdayInMilliseconds(); // multiply by 4 since these are longs char *op = "read"; if ( ! readf ) op = "wrote"; log(LOG_INIT,"memtest: %s %li bytes (x%li) in %llu ms.", op , n , loops , now - t ); // stats if ( now - t == 0 ) now++; double d = (1000.0*(double)loops*(double)(n)) / ((double)(now - t)); log(LOG_INIT,"memtest: we did %.2f MB/sec." , d/(1024.0*1024.0)); mfree ( bufStart , bufSize , "main" ); return ; } bool cacheTest() { g_conf.m_maxMem = 2000000000LL; // 2G g_mem.m_maxMem = 2000000000LL; // 2G hashinit(); // use an rdb cache RdbCache c; // init, 50MB long maxMem = 50000000; // . how many nodes in cache tree can we fit? // . each rec is key (12) and ip(4) // . overhead in cache is 56 // . that makes 56 + 4 = 60 // . not correct? stats suggest it's less than 25 bytes each long maxCacheNodes = maxMem / 25; // set the cache if ( ! c.init ( maxMem , 4 , // fixed data size of rec false , // support lists of recs? maxCacheNodes , false , // use half keys? "test" , // dbname false )) // save cache to disk? return log("test: Cache init failed."); long numRecs = 0 * maxCacheNodes; logf(LOG_DEBUG,"test: Adding %li recs to cache.",numRecs); // timestamp long timestamp = 42; // keep ring buffer of last 10 keys key_t oldk[10]; long oldip[10]; long b = 0; // fill with random recs for ( long i = 0 ; i < numRecs ; i++ ) { if ( (i % 100000) == 0 ) logf(LOG_DEBUG,"test: Added %li recs to cache.",i); // random key key_t k ; k.n1 = rand(); k.n0 = rand(); k.n0 <<= 32; k.n0 |= rand(); long ip = rand(); // keep ring buffer oldk [b] = k; oldip[b] = ip; if ( ++b >= 10 ) b = 0; // make rec,size, like dns, will be 4 byte hash and 4 byte key? c.addRecord((collnum_t)0,k,(char *)&ip,4,timestamp); // reset g_errno in case it had an error (we don't care) g_errno = 0; // get a rec too! if ( i < 10 ) continue; long next = b + 1; if ( next >= 10 ) next = 0; key_t back = oldk[next]; char *rec; long recSize; if ( ! c.getRecord ( (collnum_t)0 , back , &rec , &recSize , false , // do copy? -1 , // maxAge , true , // inc count? NULL , // *cachedTime = NULL, true )){ // promoteRecord? char *xx= NULL; *xx = 0; } if ( ! rec || recSize != 4 || *(long *)rec != oldip[next] ) { char *xx= NULL; *xx = 0; } } // now try variable sized recs c.reset(); logf(LOG_DEBUG,"test: Testing variably-sized recs."); // init, 300MB maxMem = 300000000; // . how many nodes in cache tree can we fit? // . each rec is key (12) and ip(4) // . overhead in cache is 56 // . that makes 56 + 4 = 60 // . not correct? stats suggest it's less than 25 bytes each maxCacheNodes = maxMem / 5000; //maxCacheNodes = 1200; // set the cache if ( ! 
c.init ( maxMem , -1 , // fixed data size of rec false , // support lists of recs? maxCacheNodes , false , // use half keys? "test" , // dbname false )) // save cache to disk? return log("test: Cache init failed."); numRecs = 30 * maxCacheNodes; //numRecs = 2 * maxCacheNodes; logf(LOG_DEBUG,"test: Adding %li recs to cache.",numRecs); // timestamp timestamp = 42; // keep ring buffer of last 10 keys long oldrs[10]; b = 0; //char lastp; // rec to add char *rec; long recSize; long maxRecSize = 40000000; // 40MB for termlists long numMisses = 0; char *buf = (char *)mmalloc ( maxRecSize + 64 ,"cachetest" ); if ( ! buf ) return false; //sleep(2); // fill with random recs for ( long i = 0 ; i < numRecs ; i++ ) { if ( (i % 100) == 0 ) logf(LOG_DEBUG,"test: Added %li recs to cache. " "Misses=%li.",i,numMisses); // random key key_t k ; k.n1 = rand(); k.n0 = rand(); k.n0 <<= 32; k.n0 |= rand(); // random size recSize = rand()%maxRecSize;//100000; // keep ring buffer oldk [b] = k; oldrs[b] = recSize; //oldip[b] = ip; if ( ++b >= 10 ) b = 0; // make the rec rec = buf; memset ( rec , (char)k.n1, recSize ); //log("test: v0"); // make rec,size, like dns, will be 4 byte hash and 4 byte key? if ( ! c.addRecord((collnum_t)0,k,rec,recSize,timestamp) ) { char *xx=NULL; *xx=0; } // do a dup add 1% of the time if ( (i % 100) == 0 ) if(!c.addRecord((collnum_t)0,k,rec,recSize,timestamp)){ char *xx=NULL; *xx=0; } //log("test: v1"); //c.verify(); // reset g_errno in case it had an error (we don't care) g_errno = 0; // get a rec too! if ( i < 10 ) continue; long next = b + 1; if ( next >= 10 ) next = 0; key_t back = oldk[next]; //log("cache: get rec"); if ( ! c.getRecord ( (collnum_t)0 , back , &rec , &recSize , false , // do copy? -1 , // maxAge , true , // inc count? NULL , // *cachedTime = NULL, true) ) {//true )){ // promoteRecord? numMisses++; //logf(LOG_DEBUG,"test: missed"); continue; char *xx= NULL; *xx = 0; } //log("cache: got rec"); //char *p = c.m_bufs[0] + 9210679 + 51329; //if ( *p != lastp ) // logf(LOG_DEBUG,"test: p changed"); //lastp = *p; if ( recSize != oldrs[next] ) { logf(LOG_DEBUG,"test: bad rec size."); char *xx=NULL; *xx = 0; continue; } char r = (char)back.n1; for ( long j = 0 ; j < recSize ; j++ ) { if ( rec[j] == r ) continue; logf(LOG_DEBUG,"test: bad char in rec."); char *xx=NULL; *xx = 0; } //if ( ! rec || recSize != 4 || *(long *)rec != oldip[next] ) { // char *xx= NULL; *xx = 0; } } c.verify(); c.reset(); return true; } bool ramdiskTest() { //g_conf.m_maxMem = 2000000000LL; // 2G //g_mem.m_maxMem = 2000000000LL; // 2G //hashinit(); int fd = open ("/dev/ram2",O_RDWR); if ( fd < 0 ) { fprintf(stderr,"ramdisk: failed to open /dev/ram2\n"); return false; } char *buf[1000]; gbpwrite ( fd , buf , 1000, 0 ); close ( fd); return true; } void dosOpenCB( void *state, TcpSocket *s); bool dosOpen(long targetIp, unsigned short port, int numSocks) { TcpServer tcpClient; if ( ! g_loop.init() ) return log("loop: Loop init " "failed."); // init the tcp server, client side only if ( ! tcpClient.init( NULL , // requestHandlerWrapper , getMsgSize, NULL , // getMsgPiece , 0 // port, only needed for server ) ) { return log("tcp: Tcp init failed."); } long launched = 0; char* ebuf = ""; for( long i = 0; i < numSocks; i++) { if(!tcpClient.sendMsg( targetIp , port , ebuf, 0, 0, 0, NULL, dosOpenCB, 600 * 60 * 24, -1, -1)) { launched++; } } //printf("DOS version 5.2\n RAM: 000640K\n HIMEM: 1012\n\n"); log("init: dos launched %li simultaneous requests.", launched); if ( ! 
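	// The verification pattern cacheTest() above relies on: keep a
	// small ring of the most recently added keys and read one back on
	// every insert, so eviction or corruption of a recent entry is
	// caught immediately. A minimal sketch of the ring itself:
#if 0
#define RING_SKETCH 10
	static long long s_ringSketch [ RING_SKETCH ];
	static long      s_ringPos = 0;
	static void ringAddSketch ( long long key ) {
		s_ringSketch [ s_ringPos ] = key;
		if ( ++s_ringPos >= RING_SKETCH ) s_ringPos = 0;
	}
	static long long ringOldestSketch ( ) {
		// the slot we would overwrite next holds the oldest key
		return s_ringSketch [ s_ringPos ];
	}
#endif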
g_loop.runLoop() ) return log("tcp: inject: Loop " "run failed."); return true; } void dosOpenCB( void *state, TcpSocket *s) { log("init: dos timeout"); } // to get some of the hosts that were added to sitesearch.gigablast.com // but not added in May or Apr: (this adds www. to domains that need it) // ./gb dump t main 0 -1 0 >& foo // grep ch= foo | grep -v " May-" | grep -v " Apr-" | awk '{print $13}' | urlinfo | grep "host: " | awk '{print $2}' | sort | uniq > added // then the sites that have been searched: // grep "search site" log0* | awk '{print $7}' | sort | uniq | urlinfo | grep "host: " | awk '{print $2}' | sort | uniq > searched // then to print out the hosts that have not been searched in a while and // should be removed from the sitesearch index // diff added searched | grep "< " | awk '{print $2}' > toban /* void dumpCachedRecs (char *coll,long startFileNum,long numFiles,bool includeTree, long long docid) { //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30; g_dumpMode = true; //g_conf.m_checksumdbMaxDiskPageCacheMem = 0; g_conf.m_spiderdbMaxDiskPageCacheMem = 0; g_conf.m_tfndbMaxDiskPageCacheMem = 0; g_titledb.init (); g_collectiondb.init(true); g_titledb.getRdb()->addColl ( coll ); key_t startKey ; key_t endKey ; key_t lastKey ; startKey.setMin(); endKey.setMax(); lastKey.setMin(); startKey = g_titledb.makeFirstTitleRecKey ( docid ); // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; Msg5 msg5; Msg5 msg5b; Msg5 msg5c; RdbList list; RdbList ulist; g_tfndb.init (); g_collectiondb.init(true); g_tfndb.getRdb()->addColl ( coll ); long long lastDocId = 0; long compressBufSize = 0; char* compressBuf = NULL; fprintf(stderr, "Dumping Records:\n"); long filenum = 0; char filename[64]; sprintf(filename, "%s-%li.ddmp", coll, filenum); int FD = open(filename, O_CREAT|O_WRONLY, S_IROTH); long numDumped = 0; unsigned long bytesDumped = 0; loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_TITLEDB , coll , &list , startKey , endKey , minRecSizes , includeTree , false , // add to cache? 0 , // max cache age startFileNum , numFiles , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , // cache key ptr 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &msg5b )){ log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) return; // loop over entries in list for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { key_t k = list.getCurrentKey(); char *rec = list.getCurrentRec(); long recSize = list.getCurrentRecSize(); long long docId = g_titledb.getDocIdFromKey ( k ); if ( k <= lastKey ) log("key out of order. " "lastKey.n1=%lx n0=%llx " "currKey.n1=%lx n0=%llx ", lastKey.n1,lastKey.n0, k.n1,k.n0); lastKey = k; // print deletes // if ( (k.n0 & 0x01) == 0) { // fprintf(stderr,"n1=%08lx n0=%016llx docId=%012lli " // "hh=%07lx ch=%08lx (del)\n", // k.n1 , k.n0 , docId , hostHash , contentHash ); // continue; // } // uncompress the title rec TitleRec tr; if ( ! tr.set ( rec , recSize , false ) ) continue; lastDocId = tr.getDocId(); // extract the url Url *u = tr.getUrl(); long ext = g_tfndb.makeExt ( u ); key_t uk1 ; key_t uk2 ; uk1 = g_tfndb.makeMinKey ( docId ); uk2 = g_tfndb.makeMaxKey ( docId ); if(! msg5c.getList ( RDB_TFNDB , coll , &ulist , uk1 , // startKey uk2 , // endKey 0x7fffffff , // minRecSizes true , // includeTree? false , // addToCache? 
0 , // max cache age 0 , // startFileNum -1 , // numFiles (-1 =all) NULL , NULL , 0 , //nice false )) { //error correct log(LOG_LOGIC,"db: getList did not block."); return; } if(g_errno) { log(LOG_LOGIC,"db: tfndb getList had error: %s", mstrerror(g_errno)); } bool found = false; for ( ulist.resetListPtr(); ! ulist.isExhausted() ; ulist.skipCurrentRecord() ) { key_t k = ulist.getCurrentKey(); if ( g_tfndb.getExt ( k ) == ext ) { found = true; break; } } if(!found) { //fprintf(stderr, "skipping %s %lli\n", u->getUrl(), docId); continue; } long needSize = (long)(tr.getContentLen() * 1.01 + 12); if(needSize > compressBufSize) { char* newBuf = (char*)mrealloc(compressBuf, compressBufSize, needSize, "recDump"); if(!newBuf) { log(LOG_WARN,"dump:couldn't dump this record:%s, no memory", u->getUrl()); continue; } compressBufSize = needSize; compressBuf = newBuf; } unsigned long destLen = compressBufSize; int status = compress((unsigned char*)compressBuf, &destLen, (unsigned char*)tr.getContent(), (unsigned long)tr.getContentLen()); if(status != Z_OK) { log(LOG_WARN,"dump:couldn't dump this record:" "%s, compress failed", u->getUrl()); continue; } long totSize = 2*sizeof(long) + destLen + u->getUrlLen()+1; long conLen = tr.getContentLen(); //fprintf(stderr, "%li %s %li %li\ng", totSize, u->getUrl(), conLen, destLen); write(FD, (char*)&totSize, sizeof(long)); write(FD, u->getUrl(), u->getUrlLen() + 1); write(FD, (char*)&conLen, sizeof(long)); write(FD, (char*)&(destLen), sizeof(long)); write(FD, compressBuf, destLen); numDumped++; bytesDumped += totSize; // if(numDumped == 1000) { // //change this later!!!!!!!!!! // long zero = 0; // write(FD, &zero, sizeof(long)); // return; // } } fprintf(stderr, "dumped %li records (%li bytes).\n",numDumped, bytesDumped); startKey = *(key_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey < *(key_t *)list.getLastKey() ) { long zero = 0; write(FD, &zero, sizeof(long)); return; } //start a new file if this one gets too big if(bytesDumped > 1000000000) { filenum++; sprintf(filename, "%s-%li.ddmp", coll, filenum); close(FD); FD = open(filename, O_CREAT|O_WRONLY, S_IROTH); bytesDumped = 0; fprintf(stderr, "Started new file: %s. 
starts at docId: %lli.\n",filename, lastDocId); } goto loop; } */ // CountDomains Structures and function definitions struct lnk_info { char *dom; long domLen; long pages; }; struct dom_info { char *dom; long domLen; long dHash; long pages; //long long quality; long *ip_list; long numIp; //HashTable *dht; long *lnk_table; long tableSize; long lnkCnt; long lnkPages; }; struct ip_info { unsigned long ip; long pages; //long long quality; long *dom_list; long numDom; }; // JAB: warning abatement //static int ip_hcmp (const void *p1, const void *p2); static int ip_fcmp (const void *p1, const void *p2); static int ip_dcmp (const void *p1, const void *p2); // JAB: warning abatement //static int dom_hcmp (const void *p1, const void *p2); static int dom_fcmp (const void *p1, const void *p2); static int dom_lcmp (const void *p1, const void *p2); // JAB: warning abatement //static int lnk_hcmp (const void *p1, const void *p2); // JAB: warning abatement //static int lnk_fcmp (const void *p1, const void *p2); void countdomains( char* coll, long numRecs, long verbosity, long output ) { long *ip_table; long *dom_table; //HashTable ipHT; //HashTable domHT; //ipHT.set ( numRecs+1 ); //domHT.set( numRecs+1 ); key_t startKey; key_t endKey ; key_t lastKey ; startKey.setMin(); endKey.setMax(); lastKey.setMin(); g_titledb.init (); g_collectiondb.init(true); g_titledb.addColl ( coll, false ); log( LOG_INFO, "cntDm: parms: %s, %ld", coll, numRecs ); long long time_start = gettimeofdayInMilliseconds(); // turn off threads g_threads.disableThreads(); // get a meg at a time long minRecSizes = 1024*1024; Msg5 msg5; Msg5 msg5b; RdbList list; long countDocs = 0; long countIp = 0; long countDom = 0; long attempts = 0; ip_table = (long *)mmalloc(sizeof(long) * numRecs, "main-dcit" ); dom_table = (long *)mmalloc(sizeof(long) * numRecs, "main-dcdt" ); for( long i = 0; i < numRecs; i++ ) { ip_table[i] = 0; dom_table[i] = 0; } loop: // use msg5 to get the list, should ALWAYS block since no threads if ( ! msg5.getList ( RDB_TITLEDB , coll , &list , startKey , endKey , minRecSizes , true , // Do we need to include tree? false , // add to cache? 0 , // max cache age 0 , -1 , NULL , // state NULL , // callback 0 , // niceness false , // err correction? NULL , // cache key ptr 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &msg5b )){ log(LOG_LOGIC,"db: getList did not block."); return; } // all done if empty if ( list.isEmpty() ) goto freeInfo; // loop over entries in list for ( list.resetListPtr() ; ! list.isExhausted() ; list.skipCurrentRecord() ) { key_t k = list.getCurrentKey(); char *rec = list.getCurrentRec(); long recSize = list.getCurrentRecSize(); long long docId = g_titledb.getDocId ( &k ); //long hostHash = g_titledb.getHostHash ( k ); //long contentHash = g_titledb.getContentHash ( k ); attempts++; if ( k <= lastKey ) log("key out of order. " "lastKey.n1=%lx n0=%llx " "currKey.n1=%lx n0=%llx ", lastKey.n1,lastKey.n0, k.n1,k.n0); lastKey = k; // print deletes if ( (k.n0 & 0x01) == 0) { fprintf(stderr,"n1=%08lx n0=%016llx docId=%012lli " //"hh=%07lx ch=%08lx (del)\n", "(del)\n", k.n1 , k.n0 , docId ); continue; } if( (countIp >= numRecs) || (countDom >= numRecs) ) { log( LOG_INFO, "cntDm: countIp | countDom, greater than" "numRecs requested, should never happen!!!!" ); goto freeInfo; } // uncompress the title rec //TitleRec tr; //if ( ! tr.set ( rec , recSize , false ) ) // continue; XmlDoc xd; if ( ! 
xd.set2 (rec, recSize, coll,NULL,0) ) continue; // extract the url //Url *u = tr.getUrl(); struct ip_info *sipi ; struct dom_info *sdomi; //unsigned long hkey_ip = u->getIp(); //unsigned long hkey_dom = hash32( u->getHost(), u->getHostLen() ); //if( !(sipi = (struct ip_info *)ipHT.getValue( hkey_ip ))) { long i; for( i = 0; i < countIp; i++ ) { if( !ip_table[i] ) continue; sipi = (struct ip_info *)ip_table[i]; if( sipi->ip == (unsigned long)xd.m_ip ) break; } if( i == countIp ) { sipi = (struct ip_info *)mmalloc(sizeof(struct ip_info), "main-dcip" ); if( !sipi ) { char *XX=NULL; *XX=0; } //ipHT.addKey( hkey_ip, (long)sipi, 0 ); ip_table[countIp++] = (long)sipi; sipi->ip = xd.m_ip;//u->getIp(); sipi->pages = 1; sipi->numDom = 0; //sipi->quality = tr.getDocQuality(); } else { sipi->pages++; //sipi->quality += tr.getDocQuality(); } //if( !(sdomi = (struct dom_info *)domHT.getValue( hkey_dom ))) { char *fu = xd.ptr_firstUrl; long dlen; char *dom = getDomFast ( fu , &dlen ); long dkey = hash32( dom , dlen ); for( i = 0; i < countDom; i++ ) { if( !dom_table[i] ) continue; sdomi = (struct dom_info *)dom_table[i]; /* long len = u->getHostLen(); if( sdomi->domLen < u->getHostLen() ) len=sdomi->domLen; if(strncasecmp(sdomi->dom, u->getHost(), len)==0) break; */ if( sdomi->dHash == dkey ) break; } if( i == countDom ) { sdomi =(struct dom_info*)mmalloc(sizeof(struct dom_info), "main-dcdm" ); if( !sdomi ) { char *XX=NULL; *XX=0; } //domHT.addKey( hkey_dom, (long)sdomi, 0 ); dom_table[countDom++] = (long)sdomi; sdomi->dom = (char *)mmalloc( dlen,"main-dcsdm" ); strncpy( sdomi->dom, dom , dlen ); sdomi->domLen = dlen; sdomi->dHash = dkey; sdomi->pages = 1; //sdomi->quality = tr.getDocQuality(); sdomi->numIp = 0; //sdomi->dht = new( HashTable ); //mnew( sdomi->dht, sizeof(HashTable), "main-dcndht" ); //sdomi->dht->set( 1000 ); sdomi->tableSize = 0; sdomi->lnkCnt = 0; } else { sdomi->pages++; //sdomi->quality += tr.getDocQuality(); } Links *dlinks = xd.getLinks(); /* // Parse outgoing links and count frequency Links dLinks; //Xml *sx; //sx = g_tagdb.getSiteXml ( tr.getSiteFilenum(), coll , // gbstrlen( coll ) ); Xml xml; if (!xml.set( tr.getCharset(), tr.getContent(), tr.getContentLen(), false, 0, false, tr.getVersion() )){ log(LOG_WARN, "countdomains: error setting Xml: %s", mstrerror(g_errno)); return; } if (!dLinks.set( true, &xml, tr.getUrl(), false, false, xd.m_version,0 )){ log(LOG_WARN, "countdomains: error setting Links: %s", mstrerror(g_errno)); return; } */ long size = dlinks->getNumLinks(); if( !sdomi->tableSize ) { sdomi->lnk_table = (long *)mmalloc(size * sizeof(long), "main-dclt" ); sdomi->tableSize = size; } else { if( size > (sdomi->tableSize - sdomi->lnkCnt) ) { size += sdomi->lnkCnt; sdomi->lnk_table = (long *) mrealloc(sdomi->lnk_table, sdomi->tableSize*sizeof(long), size*sizeof(long), "main-dcrlt" ); sdomi->tableSize = size; } } for( long i = 0; i < dlinks->getNumLinks(); i++ ) { //struct lnk_info *slink; //Url url; //url.set(dLinks.getLink(i), dLinks.getLinkLen(i)); char *link = dlinks->getLink(i); long dlen; char *dom = getDomFast ( link , &dlen ); unsigned long lkey = hash32( dom , dlen ); //if( (slink = (struct lnk_info *) // sdomi->dht->getValue( lkey ))) { long j; for( j = 0; j < sdomi->lnkCnt; j++ ) { //slink=(struct lnk_info *)sdomi->lnk_table[j]; if( sdomi->lnk_table[j] == (long)lkey ) break; //if(slink->domLen != url.getHostLen()) continue; //if( !strcasecmp( slink->dom, url.getHost() ) ) //break; } sdomi->lnkPages++; if( j != sdomi->lnkCnt ) continue; 
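		// The dedup just above stores hash32(domain) per outlink
		// and linear-scans the table; fine while each site links
		// to few domains, O(n^2) otherwise. (Note also that
		// lnkPages was already bumped above and is bumped again
		// below when the domain is new, so first-time domains
		// count twice.) A minimal sketch of the membership test:
#if 0
		static bool sawDomainSketch ( long *tab, long cnt, long key ) {
			for ( long j = 0 ; j < cnt ; j++ )
				if ( tab[j] == key ) return true;
			return false;
		}
		// usage, mirroring the loop above:
		//   long key = hash32 ( dom , dlen );
		//   if ( ! sawDomainSketch ( sdomi->lnk_table ,
		//                            sdomi->lnkCnt , key ) )
		//           sdomi->lnk_table [ sdomi->lnkCnt++ ] = key;
#endif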
sdomi->lnk_table[sdomi->lnkCnt++] = lkey; sdomi->lnkPages++; //slink=(struct lnk_info *)mmalloc(sizeof(struct lnk_info), // "main-dcli" ); //Sanity check, mallocing link_info struct //if( !slink ) { char *XX=NULL; *XX=0; } //sdomi->dht->addKey( lkey, (long)slink, 0 ); //sdomi->lnk_table[sdomi->lnkCnt++] = (long)slink; //slink->dom = (char *)mmalloc( url.getHostLen(), // "main-dcsld" ); //strncpy( slink->dom, url.getHost(), // url.getHostLen() ); //slink->domLen = url.getHostLen(); //slink->pages = 1; } // Handle lists if( !sipi->numDom || !sdomi->numIp ){ sdomi->numIp++; sipi->numDom++; //Add to IP list for Domain sdomi->ip_list = (long *) mrealloc( sdomi->ip_list, (sdomi->numIp-1)*sizeof(long), sdomi->numIp*sizeof(long), "main-dcldm" ); sdomi->ip_list[sdomi->numIp-1] = (long)sipi; //Add to domain list for IP sipi->dom_list = (long *) mrealloc( sipi->dom_list, (sipi->numDom-1)*sizeof(long), sipi->numDom*sizeof(long), "main-dclip" ); sipi->dom_list[sipi->numDom-1] = (long)sdomi; } else { long i; for( i = 0; (i < sdomi->numIp) && (sdomi->ip_list[i] != (long)sipi); i++ ); if( sdomi->numIp != i ) goto updateIp; sdomi->numIp++; sdomi->ip_list = (long *) mrealloc( sdomi->ip_list, (sdomi->numIp-1)*sizeof(long), sdomi->numIp*sizeof(long), "main-dcldm" ); sdomi->ip_list[sdomi->numIp-1] = (long)sipi; updateIp: for( i = 0; (i < sipi->numDom) && (sipi->dom_list[i] != (long)sdomi); i++ ); if( sipi->numDom != i ) goto endListUpdate; sipi->numDom++; sipi->dom_list = (long *) mrealloc( sipi->dom_list, (sipi->numDom-1)*sizeof(long), sipi->numDom*sizeof(long), "main-dclip" ); sipi->dom_list[sipi->numDom-1] = (long)sdomi; endListUpdate: i=0; } if( !((++countDocs) % 1000) ) log(LOG_INFO, "cntDm: %li records searched.",countDocs); if( countDocs == numRecs ) goto freeInfo; //else countDocs++; } startKey = *(key_t *)list.getLastKey(); startKey += (unsigned long) 1; // watch out for wrap around if ( startKey < *(key_t *)list.getLastKey() ) { log( LOG_INFO, "cntDm: Keys wrapped around! Exiting." ); goto freeInfo; } if ( countDocs >= numRecs ) { freeInfo: char buf[128]; //long value ; long len ; char loop ; long recsDisp; struct ip_info *tmpipi ; struct dom_info *tmpdomi ; //struct lnk_info *tmplnk ; loop = 0; FILE *fhndl; char out[128]; if( output != 9 ) goto printHtml; // Dump raw data to a file to parse later sprintf( out, "%scntdom.xml", g_hostdb.m_dir ); if( (fhndl = fopen( out, "wb" )) < 0 ) { log( LOG_INFO, "cntDm: File Open Failed." 
); return; } gbsort( dom_table, countDom, sizeof(long), dom_fcmp ); for( long i = 0; i < countDom; i++ ) { if( !dom_table[i] ) continue; tmpdomi = (struct dom_info *)dom_table[i]; len = tmpdomi->domLen; if( tmpdomi->domLen > 127 ) len = 126; strncpy( buf, tmpdomi->dom, len ); buf[len] = '\0'; fprintf(fhndl, "\n\t%s\n" "\t%ld\n" //"\t%lld\n" "\t\n", buf, tmpdomi->pages //,(tmpdomi->quality/tmpdomi->pages) ); gbsort( tmpdomi->ip_list,tmpdomi->numIp, sizeof(long), ip_fcmp ); for( long j = 0; j < tmpdomi->numIp; j++ ) { if( !tmpdomi->ip_list[j] ) continue; tmpipi = (struct ip_info *)tmpdomi->ip_list[j]; strcpy ( buf , iptoa( tmpipi->ip ) ); fprintf(fhndl,"\t\t%s\n",buf); } fprintf(fhndl, "\t\n" "\t\n"); /* gbsort(tmpdomi->lnk_table,tmpdomi->lnkCnt,sizeof(long), lnk_fcmp ); for( long j = 0; j < tmpdomi->lnkCnt; j++ ) { tmplnk = (struct lnk_info *)tmpdomi->lnk_table[j]; len = tmplnk->domLen; if( len > 127 ) len = 126; strncpy( buf, tmplnk->dom, len ); buf[len] = '\0'; fprintf(fhndl, "\t\t\n" "\t\t\t%s\n" "\t\t\t%ld\n" "\t\t\n", buf, tmplnk->pages); } fprintf(fhndl, "\t\n" "\n"); */ } gbsort( ip_table, countIp, sizeof(long), ip_fcmp ); for( long i = 0; i < countIp; i++ ) { if( !ip_table[i] ) continue; tmpipi = (struct ip_info *)ip_table[i]; strcpy ( buf , iptoa( tmpipi->ip ) ); fprintf(fhndl, "\n\t%s\n" "\t%ld\n" //"\t%lld\n" "\t\n", buf, tmpipi->pages); //(tmpipi->quality/tmpipi->pages)); for( long j = 0; j < tmpipi->numDom; j++ ) { tmpdomi = (struct dom_info *)tmpipi->dom_list[j]; len = tmpdomi->domLen; if( tmpdomi->domLen > 127 ) len = 126; strncpy( buf, tmpdomi->dom, len ); buf[len] = '\0'; fprintf(fhndl, "\t\t%s\n", buf); } fprintf(fhndl, "\t\n" "\n"); } if( fclose( fhndl ) < 0 ) { log( LOG_INFO, "cntDm: File Close Failed." ); return; } fhndl = 0; /* // Terminal Output format for( long i = 0; i < countIp; i++ ) { if( !ip_table[i] ) continue; tmpipi = (struct ip_info *)ip_table[i]; strcpy ( buf , iptoa( tmpipi->ip ) ); fprintf( stderr, "\t\tIP: %s \t\t\t\t\t%ld\n", buf, tmpipi->pages ); for( long j = 0; j < tmpipi->numDom; j++ ) { long len; tmpdomi = (struct dom_info *)tmpipi->dom_list[j]; len = tmpdomi->domLen; if( tmpdomi->domLen > 127 ) len = 126; strncpy( buf, tmpdomi->dom, len ); buf[len] = '\0'; fprintf( stderr, "\t\t\tDM: %s", buf ); if( tmpdomi->domLen > 27 ) fprintf( stderr, "\t\t" ); else if( tmpdomi->domLen <= 11 ) fprintf( stderr, "\t\t\t\t\t" ); else if( tmpdomi->domLen >= 20 ) fprintf( stderr, "\t\t\t" ); else fprintf( stderr, "\t\t\t\t" ); fprintf( stderr, "%ld\n", tmpdomi->pages ); if( verbosity != 10 ) continue; gbsort( tmpdomi->lnk_table, tmpdomi->lnkCnt, sizeof(long), lnk_fcmp ); for( long k = 0; k < tmpdomi->lnkCnt; k++ ) { tmplnk = (struct lnk_info *) tmpdomi->lnk_table[k]; len = tmplnk->domLen; if( len > 127 ) len = 126; strncpy( buf, tmplnk->dom, len ); buf[len] = '\0'; fprintf( stderr, "\t\t\t\tLD: %s", buf ); if( len > 27 ) fprintf( stderr, "\t" ); else if( len <= 11 ) fprintf( stderr, "\t\t\t\t" ); else if( len >= 20 ) fprintf( stderr, "\t\t" ); else fprintf( stderr, "\t\t\t" ); fprintf(stderr, "%ld\n", tmplnk->pages); } } fprintf( stderr, "\n" ); } */ printHtml: // HTML file Output sprintf( out, "%scntdom.html", g_hostdb.m_dir ); if( (fhndl = fopen( out, "wb" )) < 0 ) { log( LOG_INFO, "cntDm: File Open Failed." ); return; } long long total = g_titledb.getGlobalNumDocs(); char link_ip[] = "http://www.gigablast.com/search?" "code=gbmonitor&q=ip%3A"; char link_dom[] = "http://www.gigablast.com/search?" "code=gbmonitor&q=site%3A"; char menu[] = "\n" "" "" "" "" "" "\n
" "Domains Sorted By Pages" "Domains Sorted By Links" "IPs Sorted By Pages" "IPs Sorted By Domains" "Stats
\n
\n"; char hdr[] = "" "" "" "" //"" "" "" "" "\n"; char hdr2[] = "
DomainDomains LinkedAvg Quality# PagesExtrap # PagesIP
" "" "" "" "" //"" "" "" "\n"; char clr1[] = "#FFFF00";//"yellow"; char clr2[] = "#FFFF66";//"orange"; //char clr3[] = "#0099FF";//"#66FF33"; //char clr4[] = "#33FFCC";//"#33CC33"; char *color; fprintf( fhndl, "Domain/IP Counter\n" "" "

Domain/IP Counter



" "" "

Stats

\n%s", menu ); // Stats fprintf( fhndl, "
\n\n
IPDomainDomains LinkedAvg Quality# PagesExtrap # Pages
\n" "" "\n" "" "\n" "\n" "" "\n" "\n" "" "" "
Total Number of Domains%ld
Total Number of Ips%ld
Number of Documents Searched" "%ld
Number of Failed Attempts%ld
Number of Documents in Index" "%lld
Estimated Domains in index%lld



\n" ,countDom,countIp, countDocs, attempts-countDocs,total, ((countDom*total)/countDocs) ); fprintf( fhndl, "
\n" "

Domains Sorted By Pages

\n" "%s", menu ); gbsort( dom_table, countDom, sizeof(long), dom_fcmp ); printDomLp: fprintf( fhndl,"%s", hdr ); recsDisp = countDom; if( countDom > 1000 ) recsDisp = 1000; for( long i = 0; i < recsDisp; i++ ) { char buf[128]; long len; if( !dom_table[i] ) continue; if( i%2 ) color = clr2; else color = clr1; tmpdomi = (struct dom_info *)dom_table[i]; len = tmpdomi->domLen; if( tmpdomi->domLen > 127 ) len = 126; strncpy( buf, tmpdomi->dom, len ); buf[len] = '\0'; fprintf( fhndl, "" "
%s" "%ld" //"%lld" "%ld" "%lld", color, link_dom, buf, buf, tmpdomi->lnkCnt, //(tmpdomi->quality/tmpdomi->pages), tmpdomi->pages, ((tmpdomi->pages*total)/countDocs) ); for( long j = 0; j < tmpdomi->numIp; j++ ) { tmpipi = (struct ip_info *)tmpdomi->ip_list[j]; strcpy ( buf , iptoa(tmpipi->ip) ); fprintf( fhndl, "%s\n", link_ip, buf, buf ); } fprintf( fhndl, "\n" ); /* if( verbosity != 10 ) goto printDone; gbsort(tmpdomi->lnk_table,tmpdomi->lnkCnt,sizeof(long), lnk_fcmp ); for( long k = 0; k < tmpdomi->lnkCnt; k++ ) { tmplnk = (struct lnk_info *)tmpdomi->lnk_table[k]; len = tmplnk->domLen; if( len > 127 ) len = 126; strncpy( buf, tmplnk->dom, len ); buf[len] = '\0'; fprintf( fhndl, "\t\t" "%s" "%ld%lld\n", buf, tmplnk->pages, ((tmplnk->pages*total)/countDocs) ); } printDone: */ fprintf( fhndl, "\n" ); } fprintf( fhndl, "\n


" ); if( loop == 0 ) { loop = 1; gbsort( dom_table, countDom, sizeof(long), dom_lcmp ); fprintf( fhndl, "" "

Domains Sorted By Links

\n%s", menu ); goto printDomLp; } loop = 0; fprintf( fhndl, "
" "

IPs Sorted By Pages

\n%s", menu ); gbsort( ip_table, countIp, sizeof(long), ip_fcmp ); printIpLp: fprintf( fhndl,"%s", hdr2 ); recsDisp = countIp; if( countIp > 1000 ) recsDisp = 1000; for( long i = 0; i < recsDisp; i++ ) { char buf[128]; if( !ip_table[i] ) continue; tmpipi = (struct ip_info *)ip_table[i]; strcpy ( buf , iptoa(tmpipi->ip) ); if( i%2 ) color = clr2; else color = clr1; long linked = 0; for( long j = 0; j < tmpipi->numDom; j++ ) { tmpdomi=(struct dom_info *)tmpipi->dom_list[j]; linked += tmpdomi->lnkCnt; } fprintf( fhndl, "\t" "
%s" "" "%ld" "%ld" //"%lld" "%ld" "%lld\n", color, link_ip, buf, buf, tmpipi->numDom, linked, //(tmpipi->quality/tmpipi->pages), tmpipi->pages, ((tmpipi->pages*total)/countDocs) ); /* for( long j = 0; j < tmpipi->numDom; j++ ) { long len; tmpdomi=(struct dom_info *)tmpipi->dom_list[j]; len = tmpdomi->domLen; if( tmpdomi->domLen > 127 ) len = 126; strncpy( buf, tmpdomi->dom, len ); buf[len] = '\0'; if( j%2 ) color = clr4; else color = clr3; fprintf( fhndl, "" "%s" "%ld%lld" "%ld %lld" "\n", color, link_dom, buf, buf, tmpdomi->lnkCnt, (tmpdomi->quality/tmpdomi->pages), tmpdomi->pages, ((tmpdomi->pages*total)/countDocs) ); } */ fprintf( fhndl, "\n" ); } fprintf( fhndl, "\n


" ); if( loop == 0 ) { loop = 1; gbsort( ip_table, countIp, sizeof(long), ip_dcmp ); fprintf( fhndl, "" "

IPs Sorted By Domains

\n%s", menu ); goto printIpLp; } if( fclose( fhndl ) < 0 ) { log( LOG_INFO, "cntDm: File Close Failed." ); return; } fhndl = 0; long ima = 0; long dma = 0; log( LOG_INFO, "cntDm: Freeing ip info struct..." ); for( long i = 0; i < countIp; i++ ) { if( !ip_table[i] ) continue; //value = ipHT.getValue( ip_table[i] ); //if(value == 0) continue; tmpipi = (struct ip_info *)ip_table[i]; mfree( tmpipi->dom_list, tmpipi->numDom*sizeof(long), "main-dcflip" ); ima += tmpipi->numDom * sizeof(long); mfree( tmpipi, sizeof(struct ip_info), "main-dcfip" ); ima += sizeof(struct ip_info); tmpipi = NULL; } mfree( ip_table, numRecs * sizeof(long), "main-dcfit" ); log( LOG_INFO, "cntDm: Freeing domain info struct..." ); for( long i = 0; i < countDom; i++ ) { if( !dom_table[i] ) continue; tmpdomi = (struct dom_info *)dom_table[i]; /* for( long j = 0; j < tmpdomi->lnkCnt; j++ ) { if( !tmpdomi->lnk_table[j] ) continue; tmplnk=(struct lnk_info *)tmpdomi->lnk_table[j]; mfree( tmplnk->dom, tmplnk->domLen, "main-dsfsld" ); mfree( tmplnk, sizeof(struct lnk_info), "main-dsfsli" ); } */ mfree( tmpdomi->lnk_table, tmpdomi->tableSize*sizeof(long), "main-dcfsdlt" ); dma += tmpdomi->tableSize * sizeof(long); mfree( tmpdomi->ip_list, tmpdomi->numIp*sizeof(long), "main-dcfldom" ); dma += tmpdomi->numIp * sizeof(long); mfree( tmpdomi->dom, tmpdomi->domLen, "main-dcfsdom" ); dma += tmpdomi->domLen; //tmpdomi->dht.reset(); //mdelete( tmpdomi->dht, sizeof(HashTable), "main-dcmdht" ); //delete tmpdomi->dht; mfree( tmpdomi, sizeof(struct dom_info), "main-dcfdom" ); dma+= sizeof(struct dom_info); tmpdomi = NULL; } mfree( dom_table, numRecs * sizeof(long), "main-dcfdt" ); long long time_end = gettimeofdayInMilliseconds(); log( LOG_INFO, "cntDm: Took %lldms to count domains in %ld recs.", time_end-time_start, countDocs ); log( LOG_INFO, "cntDm: %li bytes of Total Memory Used.", ima + dma + (8 * numRecs) ); log( LOG_INFO, "cntDm: %li bytes Total for IP.", ima ); log( LOG_INFO, "cntDm: %li bytes Total for Dom.", dma ); log( LOG_INFO, "cntDm: %li bytes Average for IP.", ima/countIp ); log( LOG_INFO, "cntDm: %li bytes Average for Dom.", dma/countDom ); return; } goto loop; } // JAB: warning abatement #if 0 // Sort by IP address 9->0 int ip_hcmp (const void *p1, const void *p2) { long n1, n2; struct ip_info *ii1; struct ip_info *ii2; long long n3 = 0; long long n4 = 0; *(((unsigned char *)(&n1))+0) = *(((char *)p1)+0); *(((unsigned char *)(&n1))+1) = *(((char *)p1)+1); *(((unsigned char *)(&n1))+2) = *(((char *)p1)+2); *(((unsigned char *)(&n1))+3) = *(((char *)p1)+3); *(((unsigned char *)(&n2))+0) = *(((char *)p2)+0); *(((unsigned char *)(&n2))+1) = *(((char *)p2)+1); *(((unsigned char *)(&n2))+2) = *(((char *)p2)+2); *(((unsigned char *)(&n2))+3) = *(((char *)p2)+3); ii1 = (struct ip_info *)n1; ii2 = (struct ip_info *)n2; *(((unsigned char *)(&n3))+3) = *(((char *)ii1->ip)+0); *(((unsigned char *)(&n3))+2) = *(((char *)ii1->ip)+1); *(((unsigned char *)(&n3))+1) = *(((char *)ii1->ip)+2); *(((unsigned char *)(&n3))+0) = *(((char *)ii1->ip)+3); *(((unsigned char *)(&n2))+3) = *(((char *)ii2->ip)+0); *(((unsigned char *)(&n2))+2) = *(((char *)ii2->ip)+1); *(((unsigned char *)(&n2))+1) = *(((char *)ii2->ip)+2); *(((unsigned char *)(&n2))+0) = *(((char *)ii2->ip)+3); return (n4 - n3)/100; } #endif // Sort by IP frequency in pages 9->0 int ip_fcmp (const void *p1, const void *p2) { long n1, n2; struct ip_info *ii1; struct ip_info *ii2; *(((unsigned char *)(&n1))+0) = *(((char *)p1)+0); *(((unsigned char *)(&n1))+1) = *(((char *)p1)+1); 
*(((unsigned char *)(&n1))+2) = *(((char *)p1)+2); *(((unsigned char *)(&n1))+3) = *(((char *)p1)+3); *(((unsigned char *)(&n2))+0) = *(((char *)p2)+0); *(((unsigned char *)(&n2))+1) = *(((char *)p2)+1); *(((unsigned char *)(&n2))+2) = *(((char *)p2)+2); *(((unsigned char *)(&n2))+3) = *(((char *)p2)+3); ii1 = (struct ip_info *)n1; ii2 = (struct ip_info *)n2; return ii2->pages-ii1->pages; } // Sort by number of domains linked to IP, descending int ip_dcmp (const void *p1, const void *p2) { long n1, n2; struct ip_info *ii1; struct ip_info *ii2; *(((unsigned char *)(&n1))+0) = *(((char *)p1)+0); *(((unsigned char *)(&n1))+1) = *(((char *)p1)+1); *(((unsigned char *)(&n1))+2) = *(((char *)p1)+2); *(((unsigned char *)(&n1))+3) = *(((char *)p1)+3); *(((unsigned char *)(&n2))+0) = *(((char *)p2)+0); *(((unsigned char *)(&n2))+1) = *(((char *)p2)+1); *(((unsigned char *)(&n2))+2) = *(((char *)p2)+2); *(((unsigned char *)(&n2))+3) = *(((char *)p2)+3); ii1 = (struct ip_info *)n1; ii2 = (struct ip_info *)n2; return ii2->numDom-ii1->numDom; } // JAB: warning abatement #if 0 // Sort by Host name, a->z int dom_hcmp (const void *p1, const void *p2) { long len, n1, n2; struct dom_info *di1; struct dom_info *di2; *(((unsigned char *)(&n1))+0) = *(((char *)p1)+0); *(((unsigned char *)(&n1))+1) = *(((char *)p1)+1); *(((unsigned char *)(&n1))+2) = *(((char *)p1)+2); *(((unsigned char *)(&n1))+3) = *(((char *)p1)+3); *(((unsigned char *)(&n2))+0) = *(((char *)p2)+0); *(((unsigned char *)(&n2))+1) = *(((char *)p2)+1); *(((unsigned char *)(&n2))+2) = *(((char *)p2)+2); *(((unsigned char *)(&n2))+3) = *(((char *)p2)+3); di1 = (struct dom_info *)n1; di2 = (struct dom_info *)n2; if( di1->domLen < di2->domLen ) len = di1->domLen; else len = di2->domLen; return strncasecmp( di1->dom, di2->dom, len ); } #endif // Sort by page frequency in titlerec 9->0 int dom_fcmp (const void *p1, const void *p2) { long n1, n2; struct dom_info *di1; struct dom_info *di2; *(((unsigned char *)(&n1))+0) = *(((char *)p1)+0); *(((unsigned char *)(&n1))+1) = *(((char *)p1)+1); *(((unsigned char *)(&n1))+2) = *(((char *)p1)+2); *(((unsigned char *)(&n1))+3) = *(((char *)p1)+3); *(((unsigned char *)(&n2))+0) = *(((char *)p2)+0); *(((unsigned char *)(&n2))+1) = *(((char *)p2)+1); *(((unsigned char *)(&n2))+2) = *(((char *)p2)+2); *(((unsigned char *)(&n2))+3) = *(((char *)p2)+3); di1 = (struct dom_info *)n1; di2 = (struct dom_info *)n2; return di2->pages-di1->pages; } // Sort by quantity of outgoing links 9-0 int dom_lcmp (const void *p1, const void *p2) { long n1, n2; struct dom_info *di1; struct dom_info *di2; *(((unsigned char *)(&n1))+0) = *(((char *)p1)+0); *(((unsigned char *)(&n1))+1) = *(((char *)p1)+1); *(((unsigned char *)(&n1))+2) = *(((char *)p1)+2); *(((unsigned char *)(&n1))+3) = *(((char *)p1)+3); *(((unsigned char *)(&n2))+0) = *(((char *)p2)+0); *(((unsigned char *)(&n2))+1) = *(((char *)p2)+1); *(((unsigned char *)(&n2))+2) = *(((char *)p2)+2); *(((unsigned char *)(&n2))+3) = *(((char *)p2)+3); di1 = (struct dom_info *)n1; di2 = (struct dom_info *)n2; return di2->lnkCnt-di1->lnkCnt; } // JAB: warning abatement #if 0 // Sort by domain name a-z int lnk_hcmp (const void *p1, const void *p2) { long len, n1, n2; struct lnk_info *li1; struct lnk_info *li2; *(((unsigned char *)(&n1))+0) = *(((char *)p1)+0); *(((unsigned char *)(&n1))+1) = *(((char *)p1)+1); *(((unsigned char *)(&n1))+2) = *(((char *)p1)+2); *(((unsigned char *)(&n1))+3) = *(((char *)p1)+3); *(((unsigned char *)(&n2))+0) = *(((char *)p2)+0); *(((unsigned char 
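// The byte-by-byte copies in these comparators rebuild a value of type
// long from the raw qsort element, dodging alignment/strict-aliasing
// trouble; note they copy exactly 4 bytes, i.e. they assume 32-bit longs
// and pointers. memcpy() expresses the same thing portably and compiles
// to a single load. A sketch of ip_fcmp() in that style:
#if 0
#include <string.h>
static int ip_fcmp_sketch (const void *p1, const void *p2) {
	long n1, n2;
	memcpy ( &n1 , p1 , sizeof(long) );
	memcpy ( &n2 , p2 , sizeof(long) );
	struct ip_info *a = (struct ip_info *)n1;
	struct ip_info *b = (struct ip_info *)n2;
	// descending by page count, like ip_fcmp() above
	return b->pages - a->pages;
}
#endif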
// JAB: warning abatement
#if 0
// Sort by domain name a->z
int lnk_hcmp (const void *p1, const void *p2) {
	struct lnk_info *li1 = *(struct lnk_info **)p1;
	struct lnk_info *li2 = *(struct lnk_info **)p2;
	// compare on the shorter of the two names
	long len;
	if( li1->domLen < li2->domLen ) len = li1->domLen;
	else                            len = li2->domLen;
	return strncasecmp( li1->dom, li2->dom, len );
}
#endif

// JAB: warning abatement
#if 0
// Sort by frequency of link use, 9->0
int lnk_fcmp (const void *p1, const void *p2) {
	struct lnk_info *li1 = *(struct lnk_info **)p1;
	struct lnk_info *li2 = *(struct lnk_info **)p2;
	return li2->pages - li1->pages;
}
#endif
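// Illustrative sketch of the truth-table convention used by the
// commented-out testBoolean() below: combo j assigns term k the bit
// (j >> k) & 1, and truthTables[i][j] is '1' iff the boolean query
// should match combo j. E.g. with a=bit0, b=bit1, c=bit2,
// "a AND b OR c" yields "00011111" and "a OR b AND c" (AND binds
// tighter than OR) yields "01010111". Standalone sketch, not part of
// the Gigablast Query API:
#if 0
static char evalAandBorC ( long j ) {
	bool a = j & 1;
	bool b = j & 2;
	bool c = j & 4;
	return ( ( a && b ) || c ) ? '1' : '0';
}
#endif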
/*
static void printBits(qvec_t bits, long numDigits, char *buf){
	long pos = 0;
	for (long i=0; i < numDigits ; i++){
		if (i && i%4 == 0) buf[pos++] = ' ';
		// shift a qvec_t, not a plain int, since qvec_t can be
		// wider than 32 bits
		if (bits & (((qvec_t)1) << (numDigits-i-1)))
			buf[pos++] = '1';
		else	buf[pos++] = '0';
	}
	buf[pos] = 0;
}

bool testBoolean() {
	if (!queryTest()) return false;
	char *testQueries [] = {
		"a AND b OR c",
		"a OR b AND c",
		"a AND NOT b OR b AND NOT a", //vivismo query bug
		"canada suntanning OR beaches",
		"canada AND suntanning OR beaches",
		"canada AND (suntanning OR beaches)",
		"(canada AND suntanning) OR beaches",
		"a AND b OR c AND d AND e OR f", // buzz problem query
		"(a AND NOT (b OR c)) d | f",
		"foo AND (bar OR boo) keywords | sortkey"
		// "a AND NOT b OR c",
		// "a AND NOT b OR b AND NOT a",
		// "a OR b | c",
		// "(a AND b OR c) | d",
	};
	char *truthTables [] = {
		"00011111",
		"01010111",
		"0110", // term 0 has implicit bits for 1 and 2
		"0101011111111111",
		"00011111",
		"00010101",
		"00011111",
		// big uns
		"00010001000100010001000100011111"
		"11111111111111111111111111111111",
		"00000000000000000000000001000000",
		"00000000000000000000000000010101",
	};
	int numTests = 10;
	// buffer for holding truth table
	long bufSize = 10000000;
	char *bitScoresBuf = (char*) mmalloc(bufSize, "bitScoreBuf");
	if (!bitScoresBuf){
		log("query: unable to alloc bitScores buffer: %s",
		    mstrerror(g_errno) );
		return false;
	}
	for (int i=0; i < numTests ; i++) {
		Query q;
		if ( ! q.set2 ( testQueries[i] , langUnknown ) ) {
			log("query: unable to set query: %s",
			    mstrerror(g_errno) );
			continue;
		}
		q.setBitMap();
		if ( ! q.setBitScoresBoolean(bitScoresBuf, bufSize) ) {
			log("query: unable to set bitScores: %s",
			    mstrerror(g_errno) );
			mfree(bitScoresBuf, bufSize,"bitScoresBuf");
			return false;
		}
		printf("\n");
		log(LOG_INIT, "query: Test #%d: %s", i, testQueries[i]);
		// print parsed expressions
		SafeBuf sbuf(1024);
		Expression *e = &q.m_expressions[0];
		while (e->m_parent) e = e->m_parent;
		e->print(&sbuf);
		log("query: %s", sbuf.getBufStart());
		long numCombos = 1 << q.m_numExplicitBits;
		//log("query: numcombos: %d", numCombos);
		// hack for duplicate terms bits so we don't need an
		// unreasonably large test table
		qvec_t bitMask = 0;
		for (int j=0;j<q.m_numTerms;j++){
			QueryTerm *qt = &q.m_qterms[j];
			bitMask |= qt->m_explicitBit;
			bitMask |= qt->m_implicitBits;
			sbuf.reset();
			//sbuf.utf16Encode(qt->m_term, qt->m_termLen);
			sbuf.safeMemcpy(qt->m_term, qt->m_termLen);
			// (format fixed from "0x08%llx")
			log("query: term #%d: ebit=0x%08llx ibit=0x%08llx %s",
			    j,
			    (long long) q.m_qterms[j].m_explicitBit,
			    (long long) q.m_qterms[j].m_implicitBits,
			    sbuf.getBufStart());
		}
		// some problem queries give no terms, and a zero bitmask
		// causes it to produce no errors
		if (!bitMask) bitMask = numCombos-1;
		long errorCount = 0;
		char bitBuf[64];
		bitBuf[63] = 0;
		printBits(bitMask, q.m_numExplicitBits, bitBuf);
		log("query: bit mask: 0x%08llx (%s)",
		    (long long) bitMask, bitBuf);
		for (int j=0;j<numCombos;j++){
			...
		}
		...
	}

	...

	g_titledb.getRdb()->addColl ( coll );
	key_t startKey ;
	key_t endKey ;
	key_t lastKey ;
	startKey.setMin();
	endKey.setMax();
	lastKey.setMin();
	startKey = g_titledb.makeFirstTitleRecKey ( docid );
	// turn off threads
	g_threads.disableThreads();
	// get a meg at a time
	long minRecSizes = 1024*1024;
	Msg5 msg5;
	Msg5 msg5b;
	Msg5 msg5c;
	RdbList list;
	RdbList ulist;
	if (!ucInit(g_hostdb.m_dir, true)) {
		log("Unicode initialization failed!");
	}
	g_tfndb.init ();
	g_collectiondb.init(true);
	g_tfndb.getRdb()->addColl ( coll );
 loop:
	// use msg5 to get the list, should ALWAYS block since no threads
	if ( ! msg5.getList ( RDB_TITLEDB  ,
			      coll         ,
			      &list        ,
			      startKey     ,
			      endKey       ,
			      minRecSizes  ,
			      includeTree  ,
			      false        , // add to cache?
			      0            , // max cache age
			      startFileNum ,
			      numFiles     ,
			      NULL         , // state
			      NULL         , // callback
			      0            , // niceness
			      false        , // err correction?
			      NULL         , // cache key ptr
			      0            , // retry num
			      -1           , // maxRetries
			      true         , // compensate for merge
			      -1LL         , // sync point
			      &msg5b       )){
		log(LOG_LOGIC,"db: getList did not block.");
		return;
	}
	// all done if empty
	if ( list.isEmpty() ) return;
	// loop over entries in list
	for ( list.resetListPtr() ;
	      ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k       = list.getCurrentKey();
		char *rec     = list.getCurrentRec();
		long  recSize = list.getCurrentRecSize();
		//long long docId = g_titledb.getDocIdFromKey ( k );
		if ( k <= lastKey )
			log("key out of order. "
			    "lastKey.n1=%lx n0=%llx "
			    "currKey.n1=%lx n0=%llx ",
			    lastKey.n1,lastKey.n0,
			    k.n1,k.n0);
		lastKey = k;
		// print deletes
		// if ( (k.n0 & 0x01) == 0) {
		//	fprintf(stderr,"n1=%08lx n0=%016llx docId=%012lli "
		//		"hh=%07lx ch=%08lx (del)\n",
		//		k.n1 , k.n0 , docId , hostHash , contentHash );
		//	continue;
		// }
		// uncompress the title rec
		TitleRec tr;
		if ( ! tr.set ( rec , recSize , false ) ) continue;
		Xml xml;
		char *s      = tr.getContent();
		long  slen   = tr.getContentLen();
		short csEnum = tr.getCharset();
		if ( ! xml.set ( csEnum ,
				 s      ,
				 slen   ,
				 false  , // ownData?
				 0      ,
				 false  ,
				 tr.getVersion() ) )
			continue;
		Links links;
		Url *linker = tr.getRedirUrl();
		//Xml *sx = g_tagdb.getSiteXml ( tr.getSiteFilenum(),
		//			       coll ,
		//			       //tr.getColl() ,
		//			       collLen);//tr.getCollLen());
		links.set ( true   ,
			    &xml   ,
			    linker ,
			    false  , // includeLinkHashes
			    true   ,
			    TITLEREC_CURRENT_VERSION , // true=useBaseHref?
			    0      );
		Words words;
		words.set(&xml, true, 0);
		log(LOG_WARN, "looking at %s", tr.getUrl()->getUrl());
		//g_siteBonus.isSerp ( tr.getUrl(), &xml, &links, &words);
		g_siteBonus.getNegativeQualityWeight (tr.getUrl(),
						      &xml,
						      &links,
						      &words,
						      coll,
						      //NULL,//siterec
						      NULL,//safebuf
						      0); //niceness
	}
	startKey = *(key_t *)list.getLastKey();
	startKey += (unsigned long) 1;
	// watch out for wrap around
	if ( startKey < *(key_t *)list.getLastKey() ) {
		return;
	}
	goto loop;
}

// Run automated qa test showing the differences between servers located at
// s1 and s2.
// u: optional filename of list of urls to check for parse diffs
// q: optional filename of list of queries to check for result diffs
void qaTest ( char *s1, char *s2, char *u, char *q) {
	QAClient qaClient;
	qaClient.init(s1, s2, u, q);
	//qaClient.parseUrls(urlList);
	//qaClient.diffQueries(queryList);
	// Crap, we need a loop
	qaClient.runTests();
}

// Need a test for the diff method used in qa test
void xmlDiffTest(char *file1, char *file2, DiffOpt *opt){
	diffXmlFiles(file1, file2, opt);
}
*/

// generate the copies that need to be done to scale from oldhosts.conf
// to newhosts.conf topology.
int collinject ( char *newHostsConf ) {
	g_hostdb.resetPortTables();
	Hostdb hdb;
	if ( ! hdb.init(newHostsConf, 0/*assume we're zero*/) ) {
		log("collinject failed. Could not init hostdb with %s",
		    newHostsConf);
		return -1;
	}
	// ptrs to the two hostdb's
	Hostdb *hdb1 = &g_hostdb;
	Hostdb *hdb2 = &hdb;
	if ( hdb1->m_numHosts != hdb2->m_numHosts ) {
		log("collinject: num hosts differ!");
		return -1;
	}
	// . ensure old hosts in g_hostdb are in a derived groupId in
	//   newHostsConf
	// . old hosts may not even be present! consider them the same host,
	//   though, if they have the same ip and working dir, because that
	//   would interfere with a file copy.
	for ( long i = 0 ; i < hdb1->m_numShards ; i++ ) {
		//Host *h1 = &hdb1->getHost(i);//m_hosts[i];
		//long gid = hdb1->getGroupId ( i ); // groupNum
		unsigned long shardNum = (unsigned long)i;
		Host *h1 = hdb1->getShard ( shardNum );
		Host *h2 = hdb2->getShard ( shardNum );
		printf("ssh %s 'nohup /w/gbi -c /w/hosts.conf inject titledb "
		       "%s:%li >& /w/ilog' &\n"
		       , h1->m_hostname
		       , iptoa(h2->m_ip)
		       //, h2->m_hostname
		       , (long)h2->m_httpPort );
	}
	return 1;
}
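// Illustrative usage (hypothetical invocation and paths): collinject()
// only prints one ssh/inject command per shard to stdout, so the output
// is typically captured and run as a shell script, e.g.
//
//   gb collinject ./newhosts.conf > inject.sh
//   sh ./inject.sh
//
// Each emitted line ssh'es into an old-topology host and injects its
// titledb into the corresponding shard of the new topology. A minimal
// sketch of driving it from code (the function name below is made up):
#if 0
int exampleScalePlan ( ) {
	// emit one copy command per shard of the new layout
	return collinject ( "./newhosts.conf" );
}
#endif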