#include "gb-include.h" #include "Collectiondb.h" //#include "CollectionRec.h" #include "Stats.h" #include "Statsdb.h" #include "Ads.h" #include "Query.h" #include "Speller.h" #include "Msg40.h" #include "Pages.h" #include "Highlight.h" #include "SearchInput.h" #include #include "SafeBuf.h" #include "iana_charset.h" #include "Pos.h" #include "Bits.h" #include "AutoBan.h" #include "sort.h" #include "LanguageIdentifier.h" #include "LanguagePages.h" #include "LangList.h" #include "CountryCode.h" #include "Unicode.h" #include "XmlDoc.h" // GigabitInfo class #include "Posdb.h" // MAX_TOP definition #include "PageResults.h" #include "Proxy.h" static bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) ; static bool printMenu ( SafeBuf *sb , long menuNum ) ; //static void gotSpellingWrapper ( void *state ) ; static void gotResultsWrapper ( void *state ) ; //static void gotAdsWrapper ( void *state ) ; static void gotState ( void *state ) ; static bool gotResults ( void *state ) ; bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) ; bool printJsonItemInCSV ( char *json , SafeBuf *sb , class State0 *st ) ; bool printPairScore ( SafeBuf *sb , SearchInput *si , PairScore *ps , Msg20Reply *mr , Msg40 *msg40 , bool first ) ; bool printScoresHeader ( SafeBuf *sb ) ; bool printSingleScore ( SafeBuf *sb , SearchInput *si , SingleScore *ss , Msg20Reply *mr , Msg40 *msg40 ) ; bool printDmozEntry ( SafeBuf *sb , long catId , bool direct , char *dmozTitle , char *dmozSummary , char *dmozAnchor , SearchInput *si ); bool sendReply ( State0 *st , char *reply ) { long savedErr = g_errno; TcpSocket *s = st->m_socket; if ( ! s ) { char *xx=NULL;*xx=0; } SearchInput *si = &st->m_si; char *ct = "text/html"; if ( si && si->m_format == FORMAT_XML ) ct = "text/xml"; if ( si && si->m_format == FORMAT_JSON ) ct = "application/json"; if ( si && si->m_format == FORMAT_CSV ) ct = "text/csv"; char *charset = "utf-8"; char format = si->m_format; // . 
filter anything < 0x20 to 0x20 to keep XML legal // . except \t, \n and \r, they're ok // . gotta set "f" down here in case it realloc'd the buf if ( format == FORMAT_XML && reply ) { unsigned char *f = (unsigned char *)reply; for ( ; *f ; f++ ) if ( *f < 0x20 && *f!='\t' && *f!='\n' && *f!='\r' ) *f = 0x20; } long rlen = 0; if ( reply ) rlen = gbstrlen(reply); logf(LOG_DEBUG,"gb: sending back %li bytes",rlen); // . use light brown if coming directly from an end user // . use darker brown if xml feed long color = 0x00b58869; if ( si->m_format != FORMAT_HTML )color = 0x00753d30 ; long long nowms = gettimeofdayInMilliseconds(); long long took = nowms - st->m_startTime ; g_stats.addStat_r ( took , st->m_startTime , nowms, color , STAT_QUERY ); // add to statsdb, use # of qterms as the value/qty g_statsdb.addStat ( 0, "query", st->m_startTime, nowms, si->m_q.m_numTerms); // . log the time // . do not do this if g_errno is set lest m_sbuf1 be bogus b/c // it failed to allocate its buf to hold terminating \0 in // SearchInput::setQueryBuffers() if ( ! g_errno && st->m_took >= g_conf.m_logQueryTimeThreshold ) { logf(LOG_TIMING,"query: Took %lli ms for %s. results=%li", st->m_took, si->m_sbuf1.getBufStart(), st->m_msg40.getNumResults()); } //bool xml = si->m_xml; g_stats.logAvgQueryTime(st->m_startTime); if ( ! savedErr ) { // g_errno ) { g_stats.m_numSuccess++; // . one hour cache time... no 1000 hours, basically infinite // . no because if we redo the query the results are cached long cacheTime = 3600;//*1000; // no... do not use cache cacheTime = -1; // the "Check it" link on add url uses &usecache=0 to tell // the browser not to use its cache... //if ( hr->getLong("usecache",-1) == 0 ) cacheTime = 0; // // send back the actual search results // g_httpServer.sendDynamicPage(s, reply, rlen,//gbstrlen(reply), // don't let the ajax re-gen // if they hit the back button! // so make this 1 hour, not 0 cacheTime, // cachetime in secs false, // POSTReply? 
ct, -1, // httpstatus -1 -> 200 NULL, // cookieptr charset ); // free st after sending reply since "st->m_sb" = "reply" mdelete(st, sizeof(State0), "PageResults2"); delete st; return true; } // error otherwise if ( savedErr != ENOPERM ) g_stats.m_numFails++; mdelete(st, sizeof(State0), "PageResults2"); delete st; /* if ( format == FORMAT_XML ) { SafeBuf sb; sb.safePrintf("\n" "\n" "\t%li\n" "\t%s\n" "\n" ,(long)savedErr ,mstrerror(savedErr) ); // clear it for sending back g_errno = 0; // send back as normal reply g_httpServer.sendDynamicPage(s, sb.getBufStart(), sb.length(), 0, // cachetime in secs false, // POSTReply? ct, -1, // httpstatus -1 -> 200 NULL, // cookieptr charset ); return true; } */ long status = 500; if (savedErr == ETOOMANYOPERANDS || savedErr == EBADREQUEST || savedErr == ENOPERM || savedErr == ENOCOLLREC) status = 400; g_httpServer.sendQueryErrorReply(s, status, mstrerror(savedErr), format,//xml, savedErr, "There was an error!"); return true; } bool printCSSHead ( SafeBuf *sb , char format ) { sb->safePrintf( "\n" //"\n" "\n" "\n" "Gigablast Search Results\n" "\n" "\n" ); return true; } // . returns false if blocked, true otherwise // . sets g_errno on error // . "msg" will be inserted into the access log for this request bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) { // . check for sdirt=4, this a site search on the given directory id // . need to pre-query the directory first to get the sites to search // this will likely have just been cached so it should be quick // . then need to construct a site search query //long rawFormat = hr->getLong("xml", 0); // was "raw" //long xml = hr->getLong("xml",0); // what format should search results be in? default is html char format = hr->getReplyFormat();//getFormatFromRequest ( hr ); // get the dmoz catid if given //long searchingDmoz = hr->getLong("dmoz",0); // // DO WE NEED TO ALTER cr->m_siteListBuf for a widget? 
// // when a wordpress user changes the "Websites to Include" for // her widget, it should send a /search?sites=xyz.com&wpid=xxx // request here... // so we need to remove her old sites and add in her new ones. // /* MDW TURN BACK ON IN A DAY. do indexing or err pages first. // get wordpressid supplied with all widget requests char *wpid = hr->getString("wpid"); // we have to add set &spidersites=1 which all widgets should do if ( wpid ) { // this returns NULL if cr->m_siteListBuf would be unchanged // because we already have the whiteListBuf sites in there // for this wordPressId (wpid) SafeBuf newSiteListBuf; makeNewSiteList( &si->m_whiteListBuf, cr->m_siteListBuf , wpid , &newSiteListBuf); // . update the list of sites to crawl/search & show in widget // . if they give an empty list then allow that, stops crawling SafeBuf parmList; g_parms.addNewParmToList1 ( &parmList, cr->m_collnum, newSiteListBuf, 0, "sitelist"); // send the parms to all hosts in the network g_parms.broadcastParmList ( &parmList , NULL,//s,// state is socket i guess NULL);//doneBroadcastingParms2 ); // nothing left to do now return g_httpServer.sendDynamicPage(s, "OK",//sb.getBufStart(), 2,//sb.length(), cacheTime,//0, false, // POST? "text/html", 200, // httpstatus NULL, // cookie "UTF-8"); // charset } */ // // . send back page frame with the ajax call to get the real // search results. do not do this if a "&dir=" (dmoz category) // is given. // . if not matt wells we do not do ajax // . the ajax is just there to prevent bots from slamming me // with queries. // if ( hr->getLong("id",0) == 0 && format == FORMAT_HTML && g_conf.m_isMattWells ) { SafeBuf sb; printCSSHead ( &sb ,format ); sb.safePrintf( "" , h32 , rand64 ); // // . login bar // . proxy will replace it byte by byte with a login/logout // link etc. 
// //g_proxy.insertLoginBarDirective(&sb); // // logo header // printLogoAndSearchBox ( &sb , hr , -1,NULL ); // catId = -1 // // script to populate search results // sb.safePrintf("\n" // put search results into this div "
" "" "
" "
" "" "Waiting for results... " "" "
" "
" "Please be a little " "patient I am trying to get more servers." "
\n" "
" "
" "" "Copyright © 2014. " "All Rights Reserved.
" "Powered by the " "" "GigaBlast open source search engine." "
" "
\n" "\n" "\n" ); // one hour cache time... no 1000 hours, basically infinite long cacheTime = 3600; // *1000; //if ( hr->getLong("usecache",-1) == 0 ) cacheTime = 0; // // send back the parent stub containing the ajax // return g_httpServer.sendDynamicPage(s, sb.getBufStart(), sb.length(), cacheTime,//0, false, // POST? "text/html", 200, // httpstatus NULL, // cookie "UTF-8"); // charset } // make a new state State0 *st; try { st = new (State0); } catch ( ... ) { g_errno = ENOMEM; log("query: Query failed. " "Could not allocate %li bytes for query. " "Returning HTTP status of 500.",(long)sizeof(State0)); g_stats.m_numFails++; return g_httpServer.sendQueryErrorReply (s,500,mstrerror(g_errno), format, g_errno, "Query failed. " "Could not allocate memory to execute a search. " "Please try later." ); } mnew ( st , sizeof(State0) , "PageResults2" ); // init some stuff st->m_didRedownload = false; st->m_xd = NULL; st->m_oldContentHash32 = 0; // copy yhits if ( ! st->m_hr.copy ( hr ) ) return sendReply ( st , NULL ); // set this in case SearchInput::set fails! st->m_socket = s; // save this count so we know if TcpServer.cpp calls destroySocket(s) st->m_numDestroys = s->m_numDestroys; // you have to say "&header=1" to get back the header for json now. // later on maybe it will default to on. st->m_header = hr->getLong("header",0); // . parse it up // . this returns false and sets g_errno and, maybe, g_msg on error SearchInput *si = &st->m_si; if ( ! si->set ( s , // si just copies the ptr into the httprequest // into stuff like SearchInput::m_defaultSortLanguage // so do not use the "hr" on the stack. SearchInput:: // m_hr points to the hr we pass into // SearchInput::set &st->m_hr ) ) { //&st->m_q ) ) { log("query: set search input: %s",mstrerror(g_errno)); if ( ! 
g_errno ) g_errno = EBADENGINEER; return sendReply ( st, NULL ); } long codeLen = 0; char *code = hr->getString("code", &codeLen, NULL); // allow up to 1000 results per query for paying clients CollectionRec *cr = si->m_cr; // save collnum now if ( cr ) st->m_collnum = cr->m_collnum; else st->m_collnum = -1; // turn this on for json output, unless diffbot collection if ( format == FORMAT_JSON && ! cr->m_isCustomCrawl ) st->m_header = 1; // take this out here as well! // limit here // long maxpp = cr->m_maxSearchResultsPerQuery ; // if ( si->m_docsWanted > maxpp && // // disable serp max per page for custom crawls // ! cr->m_isCustomCrawl ) // si->m_docsWanted = maxpp; st->m_numDocIds = si->m_docsWanted; // watch out for cowboys //if(si->m_firstResultNum>=si->m_maxResults) return sendReply(st,NULL); // save state in TcpSocket's m_tmp ptr for debugging. in case // we lose our string of control and Msg40::getResults() never // comes back. s->m_tmp = (char *)st; // add query stat st->m_startTime = gettimeofdayInMilliseconds(); // reset st->m_errno = 0; // debug msg log ( LOG_DEBUG , "query: Getting search results for q=%s", st->m_si.m_displayQuery); // assume we'll block st->m_gotResults = false; st->m_gotAds = false; st->m_gotSpell = false; // reset st->m_printedHeaderRow = false; long ip = s->m_ip; long uipLen; char *uip = hr->getString("uip", &uipLen, NULL); char testBufSpace[2048]; SafeBuf testBuf(testBufSpace, 1024); if( g_conf.m_doAutoBan && !g_autoBan.hasPerm(ip, code, codeLen, uip, uipLen, s, hr, &testBuf, false)) { // just check? no incrementing counts if ( uip ) log("results: returning EBUYFEED for uip=%s",uip); g_errno = EBUYFEED; return sendReply(st,NULL); } // LAUNCH ADS // . now get the ad space for this query // . don't get ads if we're not on the first page of results // . query must be NULL terminated st->m_gotAds = true; /* if (si->m_adFeedEnabled && ! 
si->m_xml && si->m_docsWanted > 0) { long pageNum = (si->m_firstResultNum/si->m_docsWanted) + 1; st->m_gotAds = st->m_ads. getAds(si->m_displayQuery , //query si->m_displayQueryLen , //q len pageNum , //page num si->m_queryIP , si->m_coll2 , //coll st , //state gotAdsWrapper );//clbk } */ // LAUNCH SPELLER // get our spelling correction if we should (spell checker) st->m_gotSpell = true; st->m_spell[0] = '\0'; /* if ( si->m_spellCheck && cr->m_spellCheck && g_conf.m_doSpellChecking ) { st->m_gotSpell = g_speller. getRecommendation( &st->m_q, // Query si->m_spellCheck, // spellcheck st->m_spell, // Spell buffer MAX_FRAG_SIZE, // spell buf size false, // narrow search? NULL,//st->m_narrow // narrow buf MAX_FRAG_SIZE, // narrow buf size NULL,// num of narrows ptr st, // state gotSpellingWrapper );// callback } */ // LAUNCH RESULTS // . get some results from it // . this returns false if blocked, true otherwise // . it also sets g_errno on error // . use a niceness of 0 for all queries so they take precedence // over the indexing process // . this will copy our passed "query" and "coll" to it's own buffer // . we print out matching docIds to long if m_isDebug is true // . no longer forward this, since proxy will take care of evenly // distributing its msg 0xfd "forward" requests now st->m_gotResults=st->m_msg40.getResults(si,false,st,gotResultsWrapper); // save error st->m_errno = g_errno; // wait for ads and spellcheck and results? 
if ( !st->m_gotAds || !st->m_gotSpell || !st->m_gotResults )
		return false;
	// otherwise call gotResults which returns false if blocked, true else
	// and sets g_errno on error
	bool status2 = gotResults ( st );
	return status2;
}

// if returned json result is > maxagebeforedownload then we redownload the
// page and if its checksum has changed we return empty results
void doneRedownloadingWrapper ( void *state ) {
	// cast our State0 class from this
	State0 *st = (State0 *) state;
	// resume where gotResults() left off before the redownload blocked
	gotResults ( st );
}

/*
void gotSpellingWrapper( void *state ){
	// cast our State0 class from this
	State0 *st = (State0 *) state;
	// log the error first
	if ( g_errno ) log("query: speller: %s.",mstrerror(g_errno));
	// clear any error cuz spellchecks aren't needed
	g_errno = 0;
	st->m_gotSpell = true;
	gotState(st);
}
*/

// Msg40 callback: record any error in st->m_errno, mark the results as
// received, then try to finish the request via gotState().
void gotResultsWrapper ( void *state ) {
	// cast our State0 class from this
	State0 *st = (State0 *) state;
	// save error
	st->m_errno = g_errno;
	// mark as gotten
	st->m_gotResults = true;
	gotState (st);
}

/*
void gotAdsWrapper ( void *state ) {
	// cast our State0 class from this
	State0 *st = (State0 *) state;
	// mark as gotten
	st->m_gotAds = true;
	// log the error first
	if ( g_errno ) log("query: adclient: %s.",mstrerror(g_errno));
	// clear any error cuz ads aren't needed
	g_errno = 0;
	gotState (st);;
}
*/

// Rendezvous point for the async launches: only proceed to gotResults()
// once ads, spelling AND search results have all reported in.
void gotState ( void *state ){
	// cast our State0 class from this
	State0 *st = (State0 *) state;
	if ( !st->m_gotAds || !st->m_gotSpell || !st->m_gotResults )
		return;
	// we're ready to go
	gotResults ( state );
}

// print all sentences containing this gigabit (fast facts) (nuggabits)
static bool printGigabitContainingSentences ( State0 *st,
					      SafeBuf *sb ,
					      Msg40 *msg40 ,
					      Gigabit *gi ,
					      SearchInput *si ,
					      Query *gigabitQuery ) {
	// monotonically increasing id used to build unique element ids
	static long s_gigabitCount = 0;
	sb->safePrintf("");
	//"");
	HttpRequest *hr = &st->m_hr;
	CollectionRec *cr = si->m_cr;//g_collectiondb.getRec ( collnum );
	// make a new query
	// NOTE(review): the format string below appears truncated in this
	// copy of the file (markup stripped during extraction); do not
	// trust it as-is -- verify against upstream before changing.
	sb->safePrintf("m_coll); sb->urlEncode(gi->m_term,gi->m_termLen);
sb->safeMemcpy("+|+",3); char *q = hr->getString("q",NULL,""); sb->urlEncode(q); sb->safePrintf("\">"); sb->safeMemcpy(gi->m_term,gi->m_termLen); sb->safePrintf(""); sb->safePrintf(" "); long numOff = sb->m_length; sb->safePrintf(" ");//,gi->m_numPages); sb->safePrintf(""); sb->safePrintf(""); if ( si->m_isAdmin && 1 == 2 ) sb->safePrintf("[%.0f]{%li}", gi->m_gbscore, gi->m_minPop); long revert = sb->length(); sb->safePrintf("" "" , s_gigabitCount ); long spaceOutOff = sb->length(); sb->safePrintf( "%c%c%c", 0xe2, 0x87, 0x93); sb->safePrintf(//"[more]" ""); sb->safePrintf(""); //
// get facts long numNuggets = 0; long numFacts = msg40->m_factBuf.length() / sizeof(Fact); Fact *facts = (Fact *)msg40->m_factBuf.getBufStart(); bool first = true; bool second = false; bool printedSecond = false; //long long lastDocId = -1LL; long saveOffset = 0; for ( long i = 0 ; i < numFacts ; i++ ) { Fact *fi = &facts[i]; // if printed for a higher scoring gigabit, skip if ( fi->m_printed ) continue; // check gigabit match long k; for ( k = 0 ; k < fi->m_numGigabits ; k++ ) if ( fi->m_gigabitPtrs[k] == gi ) break; // skip this fact/sentence if does not contain gigabit if ( k >= fi->m_numGigabits ) continue; // do not print if no period at end char *s = fi->m_fact; char *e = s + fi->m_factLen; if ( e[-1] != '*' ) continue; e--; again: // first time, print in the single fact div if ( first ) { sb->safePrintf("
",s_gigabitCount); } if ( second ) { sb->safePrintf("
",s_gigabitCount); printedSecond = true; } Msg20Reply *reply = fi->m_reply; // ok, print it out if ( ! first && ! second ) { //if ( reply->m_docId != lastDocId ) sb->safePrintf("

\n"); //else { // sb->setLength ( saveOffset ); // sb->safePrintf("

\n"); //} } else { //sb->safePrintf(""); } numNuggets++; // print the fast fact (sentence) //sb->safeMemcpy ( s , e-s ); // let's highlight with gigabits and query terms SafeBuf tmpBuf; Highlight h; h.set ( &tmpBuf , // print it out here s , // content e - s , // len si->m_queryLangId , // from m_defaultSortLang gigabitQuery , // the gigabit "query" in quotes true , // stemming? -- unused false , // use anchors? NULL , // baseurl "", // front tag "", // back tag 0 , // fieldCode 0 ); // niceness // now highlight the original query as well but in black bold h.set ( sb , // print it out here tmpBuf.getBufStart() , // content tmpBuf.length() , // len si->m_queryLangId , // from m_defaultSortLang &si->m_q , // the regular query true , // stemming? -- unused false , // use anchors? NULL , // baseurl "" , // front tag "", // back tag 0 , // fieldCode 0 ); // niceness fi->m_printed = 1; saveOffset = sb->length(); sb->safePrintf(" ",cr->m_coll,reply->m_docId); long dlen; char *dom = getDomFast(reply->ptr_ubuf,&dlen); sb->safeMemcpy(dom,dlen); sb->safePrintf("\n"); //lastDocId = reply->m_docId; if ( first ) { sb->safePrintf("
"); } if ( second ) { second = false; } if ( first ) { first = false; second = true; // print first gigabit all over again but in 2nd div goto again; } } // we counted the first one twice since we had to throw it into // the hidden div too! if ( numNuggets > 1 ) numNuggets--; // do not print the double down arrow if no nuggets printed if ( numNuggets <= 0 ) { sb->m_length = revert; sb->safePrintf(""); } // just remove down arrow if only 1... else if ( numNuggets == 1 ) { char *dst = sb->getBufStart()+spaceOutOff; dst[0] = ' '; dst[1] = ' '; dst[2] = ' '; } // store the # of nuggets in ()'s like (10 ) else { char tmp[10]; sprintf(tmp,"(%li)",numNuggets); char *src = tmp; // starting storing digits after "( " char *dst = sb->getBufStart()+numOff; long srcLen = gbstrlen(tmp); if ( srcLen > 5 ) srcLen = 5; for ( long k = 0 ; k < srcLen ; k++ ) dst[k] = src[k]; } s_gigabitCount++; if ( printedSecond ) { sb->safePrintf("
"); } return true; } /* // print all sentences containing this gigabit static bool printGigabit ( State0 *st, SafeBuf *sb , Msg40 *msg40 , Gigabit *gi , SearchInput *si ) { //static long s_gigabitCount = 0; sb->safePrintf(""); //""); HttpRequest *hr = &st->m_hr; // make a new query sb->safePrintf("urlEncode(gi->m_term,gi->m_termLen); sb->safeMemcpy("+|+",3); char *q = hr->getString("q",NULL,""); sb->urlEncode(q); sb->safePrintf("\">"); sb->safeMemcpy(gi->m_term,gi->m_termLen); sb->safePrintf(""); sb->safePrintf(" "); //long numOff = sb->m_length; // now the # of pages not nuggets sb->safePrintf("(%li)",gi->m_numPages); sb->safePrintf(""); sb->safePrintf(""); if ( si->m_isAdmin ) sb->safePrintf("[%.0f]{%li}", gi->m_gbscore, gi->m_minPop); // that's it for the gigabit sb->safePrintf("
"); return true; } */ class StateAU { public: SafeBuf m_metaListBuf; Msg4 m_msg4; }; void freeMsg4Wrapper( void *st ) { StateAU *stau = (StateAU *)st; mdelete(stau, sizeof(StateAU), "staud"); delete stau; } // . make a web page from results stored in msg40 // . send it on TcpSocket "s" when done // . returns false if blocked, true otherwise // . sets g_errno on error bool gotResults ( void *state ) { // cast our State0 class from this State0 *st = (State0 *) state; long long nowMS = gettimeofdayInMilliseconds(); // log the time long long took = nowMS - st->m_startTime; // record that st->m_took = took; // grab the query Msg40 *msg40 = &(st->m_msg40); //char *q = msg40->getQuery(); //long qlen = msg40->getQueryLen(); SearchInput *si = &st->m_si; // if in streaming mode and we never sent anything and we had // an error, then send that back. we never really entered streaming // mode in that case. this happens when someone deletes a coll // and queries it immediately, then each shard reports ENOCOLLREC. // it was causing a socket to be permanently stuck open. if ( g_errno && si->m_streamResults && st->m_socket->m_totalSent == 0 ) return sendReply(st,NULL); // if already printed from Msg40.cpp, bail out now if ( si->m_streamResults ) { // this will be our final send if ( st->m_socket->m_streamingMode ) { log("res: socket still in streaming mode. wtf?"); st->m_socket->m_streamingMode = false; } log("msg40: done streaming. nuking state."); mdelete(st, sizeof(State0), "PageResults2"); delete st; return true; } // shortcuts //char *coll = si->m_coll2; //long collLen = si->m_collLen2; //collnum_t collnum = si->m_firstCollnum; // collection rec must still be there since SearchInput references // into it, and it must be the SAME ptr too! CollectionRec *cr = si->m_cr;//g_collectiondb.getRec ( collnum ); if ( ! 
cr ) { // || cr != si->m_cr ) { g_errno = ENOCOLLREC; return sendReply(st,NULL); } //char *coll = cr->m_coll; /* // // BEGIN REDOWNLOAD LOGIC // //////////// // // if caller wants a certain freshness we might have to redownload the // parent url to get the new json // //////////// // get the first result Msg20 *m20first = msg40->m_msg20[0]; long mabr = st->m_hr.getLong("maxagebeforeredownload",-1); if ( mabr >= 0 && numResults > 0 && // only do this once ! st->m_didRedownload && // need at least one result m20first && // get the last spidered time from the msg20 reply of that result m20first->m_r->m_lastSpidered - now > mabr ) { // make a new xmldoc to do the redownload XmlDoc *xd; try { xd = new (XmlDoc); } catch ( ... ) { g_errno = ENOMEM; log("query: Failed to alloc xmldoc."); } if ( g_errno ) return sendReply (st,NULL); mnew ( xd , sizeof(XmlDoc) , "mabrxd"); // save it st->m_xd = xd; // get this st->m_oldContentHash32 = m20rep->m_contentHash32; // do not re-do redownload st->m_didRedownload = true; // set it xd->setUrl(parentUrl); xd->setCallback ( st , doneRedownloadingWrapper ); // get the checksum if ( xd->getContentChecksum32Fast() == (void *)-1 ) // return false if it blocked return false; // error? 
if ( g_errno ) return sendReply (st,NULL);
	// how did this not block
	log("page: redownload did not would block adding parent");
	}

	// if we did the redownload and checksum changed, return 0 results
	if ( st->m_didRedownload ) {
		// get the doc we downloaded
		XmlDoc *xd = st->m_xd;
		// get it
		long newHash32 = xd->getContentHash32();
		// log it
		if ( newHash32 != st->m_oldContentHash32 )
			// note it in logs for now
			log("results: content changed for %s",
			    xd->m_firstUrl.m_url);
		// free it
		mdelete(xd, sizeof(XmlDoc), "mabrxd" );
		delete xd;
		// null it out so we don't try to re-free
		st->m_xd = NULL;
		// if content is significantly different, return 0 results
		if ( newHash32 != st->m_oldContentHash32 ) {
			SafeBuf sb;
			// empty json i guess
			sb.safePrintf("[]\n");
			return sendReply(st,sb.getBufStart());
		}
		// otherwise, print the diffbot json results, they are
		// still valid
	}
	//
	// END REDOWNLOAD LOGIC
	//
	*/

	//
	// BEGIN ADDING URL
	//

	//////////
	//
	// if its a special request to get diffbot json objects for
	// a given parent url, it often contains the same url in "addurl"
	// to add as a spider request to spiderdb so that
	// it gets spidered and processed through diffbot.
	//
	//////////
	char *addUrl = st->m_hr.getString("addurl",NULL);
	if ( addUrl ) { // && cr->m_isCustomCrawl ) {
		Url norm;
		norm.set ( addUrl );
		SpiderRequest sreq;
		// returns false and sets g_errno on error
		if ( ! sreq.setFromAddUrl ( norm.getUrl() ) ) { //addUrl ) ) {
			log("addurl: url had problem: %s",
			    mstrerror(g_errno));
			// NOTE(review): returning here without sendReply()
			// leaks "st" and leaves the client socket
			// unanswered -- confirm and fix together with the
			// identical bail-outs below.
			return true;
		}
		// addurl state
		StateAU *stau;
		try { stau = new(StateAU); }
		catch ( ... ) {
			g_errno = ENOMEM;
			// NOTE(review): same st/socket leak as above on OOM.
			return true;
		}
		mnew ( stau , sizeof(StateAU) , "stau");
		// fill it up
		SafeBuf *mlist = &stau->m_metaListBuf;
		// NOTE(review): these bail-outs additionally leak "stau"
		// (freeMsg4Wrapper is never invoked on this path).
		if ( ! mlist->pushChar(RDB_SPIDERDB) ) return true;
		if ( !
mlist->safeMemcpy ( &sreq , sreq.getRecSize() ) ) return true; Msg4 *msg4 = &stau->m_msg4; // this should copy the recs from list into the buffers if ( msg4->addMetaList ( mlist->getBufStart() , mlist->getLength() , cr->m_collnum, stau , freeMsg4Wrapper , MAX_NICENESS ) ) { // if it copied everything ok, nuke our msg4 // otherwise it will call freeMsg4Wraper when it // completes! freeMsg4Wrapper( stau ); } } // // DONE ADDING URL // long numResults = msg40->getNumResults(); // if user is doing ajax widget we need to know the current docid // that is listed at the top of their widget display so we can // hide the new docids above that and scroll them down slowly. /* //long topDocIdPos = -1; bool hasInvisibleResults = false; //long numInvisible = 0; long numAbove = 0; HttpRequest *hr = &st->m_hr; long long oldTop = 0LL; long long lastDocId = 0LL; double lastSerpScore = 0.0; if ( si->m_format == FORMAT_WIDGET_AJAX ) { // sanity, no stream mode here, it won't work if ( si->m_streamResults ) log("results: do not use stream=1 for widget"); // get current top docid long long topDocId = hr->getLongLong("topdocid",0LL); // DEBUG: force it on for now //topDocId = 4961990748LL; // scan results. this does not support &stream=1 streaming // mode. it doesn't make sense that it needs to. for ( long i = 0 ; i < numResults ; i++ ) { // skip if already invisible if ( msg40->m_msg3a.m_clusterLevels[i] != CR_OK ) continue; // get it Msg20 *m20 = msg40->m_msg20[i]; if ( ! m20 ) continue; // checkdocid Msg20Reply *mr = m20->m_r; if ( ! mr ) continue; // save this lastDocId = mr->m_docId; lastSerpScore = msg40->m_msg3a.m_scores[i]; // set "oldTop" to first docid we encounter if ( ! oldTop ) oldTop = mr->m_docId; // stop if no topdocid otherwise. oldTop is now set if ( ! 
topDocId ) continue; // == 0 ) break; if ( mr->m_docId != topDocId ) { hasInvisibleResults = true; // count # of docids above top docid numAbove++; continue; } // we match it, so set this if not already set //if ( topDocIdPos != -1 ) topDocIdPos = i; //break; } } */ SafeBuf *sb = &st->m_sb; // print javascript for scrolling down invisible div for // ajax based widgets // MDW: this does not execute because it is loaded via ajax... // so i moved logic into diffbot.php for now. /* if ( si->m_format == FORMAT_WIDGET_AJAX && numInvisible ) { sb->safePrintf("" , numInvisible * (long)RESULT_HEIGHT ); } */ // print logo, search box, results x-y, ... into st->m_sb printSearchResultsHeader ( st ); // propagate "topdocid" so when he does another query every 30 secs // or so we know what docid was on top for scrolling purposes //if ( si->m_format == FORMAT_WIDGET_AJAX ) // sb->safePrintf("\n", // oldTop); // report how many results we added above the topdocid provided, if any // so widget can scroll down automatically //if ( si->m_format == FORMAT_WIDGET_AJAX && numAbove ) // sb->safePrintf("\n",numAbove); // we often can add 100s of things to the widget's result set per // second especially when sorting by last spidered time and spidering // a lot. setting the maxserpscore of the serp score of the last result // allows us to append new search results to what we have in a // consistent manner. 
// if ( si->m_format == FORMAT_WIDGET_AJAX ) { // // let's make this ascii encoded crap // sb->safePrintf("\n", // lastSerpScore); // // let's make this ascii encoded crap // sb->safePrintf("\n", // lastDocId); // } // then print each result // don't display more than docsWanted results long count = msg40->getDocsWanted(); bool hadPrintError = false; long numPrintedSoFar = 0; //long widgetHeight = hr->getLong("widgetheight",400); //long widgetwidth = hr->getLong("widgetwidth",250); for ( long i = 0 ; count > 0 && i < numResults ; i++ ) { /* if ( hasInvisibleResults ) { // // MAKE THESE RESULTS INVISIBLE! // // if doing a widget, we initially hide the new results // and scroll them down in time so it looks cool. if ( i == 0 ) sb->safePrintf("
" , (-1* (RESULT_HEIGHT+ SERP_SPACER+ PADDING*2)* numInvisible)); // // END INSIVISBILITY // // to test scrolling, hide the first result and // scroll it out if ( i == topDocIdPos ) sb->safePrintf("
" "
" ); } */ ////////// // // prints in xml or html // ////////// if ( ! printResult ( st , i , &numPrintedSoFar ) ) { hadPrintError = true; break; } // limit it count--; } if ( hadPrintError ) { if ( ! g_errno ) g_errno = EBADENGINEER; log("query: had error: %s",mstrerror(g_errno)); //return sendReply ( st , sb.getBufStart() ); } // wrap it up with Next 10 etc. printSearchResultsTail ( st ); // if we split the serps into 2 divs for scrolling purposes // then close up the 2nd one //if ( hasInvisibleResults ) sb->safePrintf("
"); // END SERP DIV if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) sb->safePrintf(""); // send it off sendReply ( st , st->m_sb.getBufStart() ); return true; } // defined in PageRoot.cpp bool expandHtml ( SafeBuf& sb, char *head , long hlen , char *q , long qlen , HttpRequest *r , SearchInput *si, char *method , CollectionRec *cr ) ; bool printLeftNavColumn ( SafeBuf &sb, State0 *st ) { char *title = "Search Results"; sb.safePrintf("Gigablast - %s\n",title); sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); // // DIVIDE INTO TWO PANES, LEFT COLUMN and MAIN COLUMN // sb.safePrintf("" "\n\n"); // // first the nav column // sb.safePrintf("" "" "
" "
" "
" "" "
" "
" "HOME" "
" "
" "
" "
" "
" ); /* // home link sb.safePrintf( "" "
" "              " "      " "HOME    " "
" "
" "
" ); */ SearchInput *si = &st->m_si; Msg40 *msg40 = &st->m_msg40; char format = si->m_format; // // BEGIN FACET PRINTING // // // . print out one table for each gbfacet: term in the query // . LATER: show the text string corresponding to the hash // by looking it up in the titleRec // if ( format == FORMAT_HTML ) msg40->printFacetTables ( &sb ); // // END FACET PRINTING // // // BEGIN PRINT GIGABITS // SafeBuf *gbuf = &msg40->m_gigabitBuf; long numGigabits = gbuf->length()/sizeof(Gigabit); if ( format != FORMAT_HTML ) numGigabits = 0; // print gigabits Gigabit *gigabits = (Gigabit *)gbuf->getBufStart(); //long numCols = 5; //long perRow = numGigabits / numCols; if ( numGigabits && format == FORMAT_HTML ) // gigabit unhide function sb.safePrintf ( "\n" ); if ( numGigabits && format == FORMAT_HTML ) sb.safePrintf("
" "" "
" "
" "
" "
" ); Query gigabitQuery; SafeBuf ttt; // limit it to 40 gigabits for now for ( long i = 0 ; i < numGigabits && i < 40 ; i++ ) { Gigabit *gi = &gigabits[i]; ttt.pushChar('\"'); ttt.safeMemcpy(gi->m_term,gi->m_termLen); ttt.pushChar('\"'); ttt.pushChar(' '); } if ( numGigabits > 0 ) gigabitQuery.set2 ( ttt.getBufStart() , si->m_queryLangId , true , // queryexpansion? true ); // usestopwords? for ( long i = 0 ; i < numGigabits ; i++ ) { //if ( i > 0 && format == FORMAT_HTML ) // sb.safePrintf("
"); //if ( perRow && (i % perRow == 0) ) // sb.safePrintf("
"); // print all sentences containing this gigabit Gigabit *gi = &gigabits[i]; // after the first 3 hide them with a more link if ( i == 1 && format == FORMAT_HTML ) { sb.safePrintf("" "Show more"); sb.safePrintf("" "

"); } //printGigabit ( st,sb , msg40 , gi , si ); //sb.safePrintf("
"); printGigabitContainingSentences(st,&sb,msg40,gi,si, &gigabitQuery); if ( format == FORMAT_HTML ) sb.safePrintf("

"); } //if ( numGigabits >= 1 && format == FORMAT_HTML ) if ( numGigabits && format == FORMAT_HTML ) sb.safePrintf("

"); // // now print various knobs // // // print sort by date options // /* if ( format == FORMAT_HTML ) sb.safePrintf( "
" "" "SEARCH TOOLS    " "" "
" "
" */ /* "
" "" "NEWSET FIRST    " "" "
" "
" "
" "" "OLDEST FIRST    " "" "
" "
" */ // // print date contraint functions now // if ( format == FORMAT_HTML && 1 == 2) sb.safePrintf( "
" "" "ANYTIME    " "" "
" "
" "
" "" "LAST 24 HOURS    " "" "
" "
" "
" "" "LAST 7 DAYS    " "" "
" "
" "
" "" "LAST 30 DAYS    " "" "
" "
" ); // // now the MAIN column // if ( format == FORMAT_HTML ) sb.safePrintf("\n
\n"); return true; } bool printSearchResultsHeader ( State0 *st ) { SearchInput *si = &st->m_si; // grab the query Msg40 *msg40 = &(st->m_msg40); char *q = msg40->getQuery(); long qlen = msg40->getQueryLen(); //char local[ 128000 ]; //SafeBuf sb(local, 128000); SafeBuf *sb = &st->m_sb; // reserve 1.5MB now! if ( ! sb->reserve(1500000 ,"pgresbuf" ) ) // 128000) ) return false; // just in case it is empty, make it null terminated sb->nullTerm(); // print first [ for json if ( si->m_format == FORMAT_JSON ) { if ( st->m_header ) sb->safePrintf("{\n"); else sb->safePrintf("[\n"); } CollectionRec *cr = si->m_cr; HttpRequest *hr = &st->m_hr; // if there's a ton of sites use the post method otherwise // they won't fit into the http request, the browser will reject // sending such a large request with "GET" char *method = "GET"; if ( si->m_sites && gbstrlen(si->m_sites)>800 ) method = "POST"; if ( si->m_format == FORMAT_HTML && cr->m_htmlHead.length() ) { return expandHtml ( *sb , cr->m_htmlHead.getBufStart(), cr->m_htmlHead.length(), q, qlen, hr, si, method, cr); } // . if not matt wells we do not do ajax // . the ajax is just there to prevent bots from slamming me // with queries. if ( ! g_conf.m_isMattWells && si->m_format == FORMAT_HTML ) { printCSSHead ( sb ,si->m_format ); sb->safePrintf(""); } if ( ! g_conf.m_isMattWells && si->m_format==FORMAT_WIDGET_IFRAME ) { printCSSHead ( sb ,si->m_format ); sb->safePrintf(""); } if ( si->m_format == FORMAT_WIDGET_IFRAME ) { long refresh = hr->getLong("refresh",0); if ( refresh ) sb->safePrintf("",refresh); } // lead with user's widget header which usually has custom style tags if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) { char *header = hr->getString("header",NULL); if ( header ) sb->safeStrcpy ( header ); } if ( ! g_conf.m_isMattWells && si->m_format == FORMAT_HTML ) { printLeftNavColumn ( *sb,st ); } if ( ! 
g_conf.m_isMattWells && si->m_format == FORMAT_HTML ) { printLogoAndSearchBox ( sb,&st->m_hr,-1,si); // catId = -1 } // the calling function checked this so it should be non-null char *coll = cr->m_coll; long collLen = gbstrlen(coll); if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) { char *pos = "relative"; if ( si->m_format == FORMAT_WIDGET_IFRAME ) pos = "absolute"; long widgetwidth = hr->getLong("widgetwidth",150); long widgetHeight = hr->getLong("widgetheight",400); //long iconWidth = 25; // put image in this div which will have top:0px JUST like // the div holding the search results we print out below // so that the image does not scroll when you use the // scrollbar. holds the magifying glass img and searchbox. sb->safePrintf("
"); //long refresh = hr->getLong("refresh",15); char *oq = hr->getString("q",NULL); if ( ! oq ) oq = ""; char *prepend = hr->getString("prepend"); if ( ! prepend ) prepend = ""; char *displayStr = "none"; if ( prepend && prepend[0] ) displayStr = ""; // to do a search we need to re-call the ajax, // just call reload like the one that is called every 15s or so sb->safePrintf("
"); sb->safePrintf("" ); //char *origq = hr->getString("q"); // we sort all results by spider date now so PREPEND // the actual user query char *origq = hr->getString("prepend"); if ( ! origq ) origq = ""; sb->safePrintf("
" // the box that holds the query "" , displayStr , widgetwidth / 23 , origq ); sb->safePrintf("
" "
\n" ); // . BEGIN SERP DIV // . div to hold the search results // . this will have the scrollbar to just scroll the serps // and not the magnifying glass sb->safePrintf("
" "
" , widgetwidth , widgetHeight); } // xml if ( si->m_format == FORMAT_XML ) sb->safePrintf("\n" "\n" ); long long nowMS = gettimeofdayInMillisecondsLocal(); // show current time if ( si->m_format == FORMAT_XML ) { long long globalNowMS = localToGlobalTimeMilliseconds(nowMS); sb->safePrintf("\t%lu\n", (long)(globalNowMS/1000)); } else if ( st->m_header && si->m_format == FORMAT_JSON ) { long long globalNowMS = localToGlobalTimeMilliseconds(nowMS); sb->safePrintf("\"currentTimeUTC\":%lu,\n", (long)(globalNowMS/1000)); } // show response time if not doing Quality Assurance if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t%lli\n", st->m_took); else if ( st->m_header && si->m_format == FORMAT_JSON ) sb->safePrintf("\"responseTimeMS\":%lli,\n", st->m_took); // out of memory allocating msg20s? if ( st->m_errno ) { log("query: Query failed. Had error processing query: %s", mstrerror(st->m_errno)); g_errno = st->m_errno; //return sendReply(st,sb->getBufStart()); return false; } //bool xml = si->m_xml; // if they are doing a search in dmoz, catId will be > 0. //if ( si->m_directCatId >= 0 ) { // printDMOZCrumb ( sb , si->m_directCatId , xml ); //} /////////// // // show DMOZ subcategories if doing either a // "gbpcatid: |" (Search restricted to category) // "gbcatid:" (DMOZ urls in that topic) // // The search gbcatid: results should be sorted by siterank i guess // since it is only search a single term: gbcatid: so we can // put our stars back onto that and should be sorted by them. // /////////// /* if ( si->m_catId >= 0 ) { // print the subtopcis in this topic. show as links above // the search results printDMOZSubTopics ( sb, si->m_catId , xml );//st, xml ); // ok, for now just print the dmoz topics since our search // results will be empty... until populated! 
//g_categories->printUrlsInTopic ( &sb , si->m_catId ); } */ // save how many docs are in this collection long long docsInColl = -1; //RdbBase *base = getRdbBase ( RDB_CHECKSUMDB , si->m_coll ); //RdbBase *base = getRdbBase ( (uint8_t)RDB_CLUSTERDB , si->m_coll2 ); //if ( base ) docsInColl = base->getNumGlobalRecs(); docsInColl = g_hostdb.getNumGlobalRecs ( ); // include number of docs in the collection corpus if ( docsInColl >= 0LL ) { if ( si->m_format == FORMAT_XML) sb->safePrintf ( "\t%lli" "\n", docsInColl ); else if ( st->m_header && si->m_format == FORMAT_JSON) sb->safePrintf("\"docsInCollection\":%lli,\n", docsInColl); } long numResults = msg40->getNumResults(); bool moreFollow = msg40->moreResultsFollow(); // an estimate of the # of total hits long long totalHits = msg40->getNumTotalHits(); // only adjust upwards for first page now so it doesn't keep chaning if ( totalHits < numResults ) totalHits = numResults; if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t%lli\n",(long long)totalHits); else if ( st->m_header && si->m_format == FORMAT_JSON ) sb->safePrintf("\"hits\":%lli,\n", (long long)totalHits); // if streaming results we just don't know if we will require // a "Next 10" link or not! we can print that after we print out // the results i guess... if ( ! si->m_streamResults ) { if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t%li" "\n" ,(long)moreFollow); else if ( st->m_header && si->m_format == FORMAT_JSON ) sb->safePrintf("\"moreResultsFollow\":%li,\n", (long)moreFollow); } // . did he get a spelling recommendation? // . do not use htmlEncode() on this anymore since receiver // of the XML feed usually does not want that. 
if ( si->m_format == FORMAT_XML && st->m_spell[0] ) { sb->safePrintf ("\tsafeStrcpy(st->m_spell); sb->safePrintf ("]]>\n"); } if ( si->m_format == FORMAT_JSON && st->m_spell[0] ) { sb->safePrintf ("\t\"spell\":\""); sb->jsonEncode(st->m_spell); sb->safePrintf ("\"\n,"); } // when streaming results we lookup the facets last if ( si->m_format != FORMAT_HTML && ! si->m_streamResults ) msg40->printFacetTables ( sb ); // for diffbot collections only... if ( st->m_header && si->m_format == FORMAT_JSON && cr->m_isCustomCrawl ) { sb->safePrintf("\"objects\":[\n"); return true; } if ( si->m_format == FORMAT_JSON && ! cr->m_isCustomCrawl ) { sb->safePrintf("\"results\":[\n"); return true; } // debug if ( si->m_debug ) logf(LOG_DEBUG,"query: Displaying up to %li results.", numResults); // tell browser again //if ( si->m_format == FORMAT_HTML ) // sb->safePrintf("\n"); // get some result info from msg40 long firstNum = msg40->getFirstResultNum() ; // numResults may be more than we requested now! long n = msg40->getDocsWanted(); if ( n > numResults ) n = numResults; // . make the query class here for highlighting // . keepAllSingles means to convert all individual words into // QueryTerms even if they're in quotes or in a connection (cd-rom). // we use this for highlighting purposes Query qq; qq.set2 ( si->m_displayQuery, langUnknown , si->m_queryExpansion ); // si->m_boolFlag, // true ); // keepAllSingles? if ( g_errno ) return false;//sendReply (st,NULL); DocIdScore *dpx = NULL; if ( numResults > 0 ) dpx = msg40->getScoreInfo(0); if ( si->m_format == FORMAT_XML && dpx ) { // # query terms used! //long nr = dpx->m_numRequiredTerms; float max = 0.0; // max pairwise float lw = getHashGroupWeight(HASHGROUP_INLINKTEXT); // square that location weight lw *= lw; // assume its an inlinker's text, who has rank 15!!! 
lw *= getLinkerWeight(MAXSITERANK); // double loops /* for ( long i = 0 ; i< nr ; i++ ) { SingleScore *ssi = &dpx->m_singleScores[i]; float tfwi = getTermFreqWeight(ssi->m_listSize); for ( long j = i+1; j< nr ; j++ ) { SingleScore *ssj = &dpx->m_singleScores[j]; float tfwj =getTermFreqWeight(ssj->m_listSize); max += (lw * tfwi * tfwj)/3.0; } } */ // single weights float maxtfw1 = 0.0; long maxi1; // now we can have multiple SingleScores for the same term! // because we take the top MAX_TOP now and add them to // get the term's final score. for ( long i = 0 ; i< dpx->m_numSingles ; i++ ) { SingleScore *ssi = &dpx->m_singleScores[i]; float tfwi = ssi->m_tfWeight; if ( tfwi <= maxtfw1 ) continue; maxtfw1 = tfwi; maxi1 = i; } float maxtfw2 = 0.0; long maxi2; for ( long i = 0 ; i< dpx->m_numSingles ; i++ ) { if ( i == maxi1 ) continue; SingleScore *ssi = &dpx->m_singleScores[i]; float tfwi = ssi->m_tfWeight; if ( tfwi <= maxtfw2 ) continue; maxtfw2 = tfwi; maxi2 = i; } // only 1 term? if ( maxtfw2 == 0.0 ) maxtfw2 = maxtfw1; // best term freqs max *= maxtfw1 * maxtfw2; // site rank effect max *= MAXSITERANK/SITERANKDIVISOR + 1; sb->safePrintf ("\t\t%f" "\n", max ); } // debug msg log ( LOG_TIMING , "query: Got %li search results in %lli ms for q=%s", numResults,gettimeofdayInMilliseconds()-st->m_startTime, qq.getQuery()); //Highlight h; st->m_qe[0] = '\0'; // encode query buf //char qe[MAX_QUERY_LEN+1]; char *dq = si->m_displayQuery; //long dqlen = si->m_displayQueryLen; if ( dq ) urlEncode(st->m_qe,MAX_QUERY_LEN*2,dq,gbstrlen(dq)); // how many results were requested? //long docsWanted = msg40->getDocsWanted(); // store html head into p, but stop at %q //char *head = cr->m_htmlHead; //long hlen = cr->m_htmlHeadLen; //if ( ! si->m_xml ) sb->safeMemcpy ( head , hlen ); // ignore imcomplete or invalid multibyte or wide characters errors //if ( g_errno == EILSEQ ) { // log("query: Query error: %s. 
Ignoring.", mstrerror(g_errno)); // g_errno = 0; //} // secret search backdoor if ( qlen == 7 && q[0]=='3' && q[1]=='b' && q[2]=='Y' && q[3]=='6' && q[4]=='u' && q[5]=='2' && q[6]=='Z' ) { sb->safePrintf ( "
You owe me!

" ); } // print it with commas into "thbuf" and null terminate it char thbuf[64]; ulltoa ( thbuf , totalHits ); char inbuf[128]; ulltoa ( inbuf , docsInColl ); Query qq3; Query *qq2; bool firstIgnored; bool isAdmin = si->m_isAdmin; if ( si->m_format != FORMAT_HTML ) isAdmin = false; // otherwise, we had no error if ( numResults == 0 && si->m_format == FORMAT_HTML ) { sb->safePrintf ( "No results found in %s collection.", cr->m_coll); } // the token is currently in the collection name so do not show that else if ( numResults == 0 && ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) ) { sb->safePrintf ( "No results found. Wait for spider to " "kick in."); } else if ( moreFollow && si->m_format == FORMAT_HTML ) { if ( isAdmin && si->m_docsToScanForReranking > 1 ) sb->safePrintf ( "PQR'd " ); sb->safePrintf ("Results %li to %li of " "exactly %s from an index " "of %s pages" , firstNum + 1 , firstNum + n , thbuf , inbuf ); } // otherwise, we didn't get enough results to show this page else if ( si->m_format == FORMAT_HTML ) { if ( isAdmin && si->m_docsToScanForReranking > 1 ) sb->safePrintf ( "PQR'd " ); sb->safePrintf ("Results %li to %li of " "exactly %s from an index " "of %s pages" , firstNum + 1 , firstNum + n , thbuf , inbuf ); } // // if query was a url print add url msg // char *url = NULL; if ( !strncmp(q,"url:" ,4) && qlen > 4 ) url = q+4; if ( !strncmp(q,"http://" ,7) && qlen > 7 ) url = q; if ( !strncmp(q,"https://",8) && qlen > 8 ) url = q; if ( !strncmp(q,"www." ,4) && qlen > 4 ) url = q; // find end of url char *ue = url; for ( ; ue && *ue && ! is_wspace_a(*ue) ; ue++ ) ; if ( numResults == 0 && si->m_format == FORMAT_HTML && url ) { sb->safePrintf("

" "Could not find that url in the " "index. Try urlEncode(url,ue-url,false,false); sb->safePrintf(">Adding it."); } // sometimes ppl search for "www.whatever.com" so ask them if they // want to search for url:www.whatever.com if ( numResults > 0 && si->m_format == FORMAT_HTML && url && url ==q){ sb->safePrintf("

" "Did you mean to " "search for the url " "urlEncode(url,ue-url,false,false); sb->safePrintf(">"); sb->safeMemcpy(url,ue-url); sb->safePrintf(" itself?"); } // is it the main collection? bool isMain = false; if ( collLen == 4 && strncmp ( coll, "main", 4) == 0 ) isMain = true; // print "in collection ***" if we had a collection if (collLen>0 && numResults>0 && !isMain && si->m_format==FORMAT_HTML ) sb->safePrintf (" in collection %s",coll); //char *pwd = si->m_pwd; //if ( ! pwd ) pwd = ""; /* if ( si->m_format == FORMAT_HTML ) sb->safePrintf("   " "[show scores]" " ", numResults ); */ /* // convenient admin link if ( isAdmin ) { sb->safePrintf("   " "" "" "admin" "",coll); // print reindex link // get the filename directly char *langStr = si->m_defaultSortLang; if ( numResults>0 ) sb->safePrintf ("   " "" "" "respider these results" "" " ",coll, langStr , st->m_qe ); sb->safePrintf ("   " "" "" "scrape google/bing" " ", coll , st->m_qe ); sb->safePrintf ("   " "" "" "show banned results" " ", coll , langStr , st->m_qe ); sb->safePrintf ("   " "" "api" , coll ); sb->safePrintf ("   " "" "" "xml" " ", coll , langStr , st->m_qe ); sb->safePrintf ("   " "" "" "json" " ", coll , langStr , st->m_qe ); sb->safePrintf ("   " "" "" "hide admin links" " ", coll , langStr , st->m_qe ); } // if its an ip: or site: query, print ban link if ( isAdmin && strncmp(si->m_displayQuery,"ip:",3)==0) { // get the ip char *ips = si->m_displayQuery + 3; // copy to buf, append a ".0" if we need to char buf [ 32 ]; long i ; long np = 0; for ( i = 0 ; i<29 && (is_digit(ips[i])||ips[i]=='.'); i++ ){ if ( ips[i] == '.' 
) np++; buf[i]=ips[i]; } // if not enough periods bail if ( np <= 1 ) goto skip2; if ( np == 2 ) { buf[i++]='.'; buf[i++]='0'; } buf[i] = '\0'; // search ip back or forward long ip = atoip(buf,i); sb->safePrintf ("  " "" "[prev %s]" , iptoa(ip-0x01000000),coll,docsWanted, iptoa(ip-0x01000000)); sb->safePrintf ("  " "" "[next %s]" , iptoa(ip+0x01000000),coll,docsWanted, iptoa(ip+0x01000000)); } // if its an ip: or site: query, print ban link if ( isAdmin && strncmp(si->m_displayQuery,"site:",5)==0) { // get the ip char *start = si->m_displayQuery + 5; char *sp = start; while ( *sp && ! is_wspace_a(*sp) ) sp++; char c = *sp; // get the filename directly sb->safePrintf ("   " "" "" "[ban %s]" " ",coll , start ); *sp = c; } if ( isAdmin && strncmp(si->m_displayQuery,"gbad:",5)==0) { // get the ip char *start = si->m_displayQuery + 5; char *sp = start; while ( *sp && ! is_wspace_a(*sp) ) sp++; char c = *sp; *sp = '\0'; sb->safePrintf ("   " "" "" "[ban %s]" " ", coll , start , start ); *sp = c; } skip2: // cache switch for admin if ( isAdmin && msg40->getCachedTime() > 0 ) { // get the filename directly sb->safePrintf("   " "" "safePrintf("&q=%s&rcache=0&seq=0&rtq=0\">" "[cache off]" " ", st->m_qe ); } */ // mention ignored query terms // we need to set another Query with "keepAllSingles" set to false qq2 = &si->m_q; //qq2.set ( q , qlen , NULL , 0 , si->m_boolFlag , false ); firstIgnored = true; for ( long i = 0 ; i < qq2->m_numWords ; i++ ) { //if ( si->m_xml ) break; QueryWord *qw = &qq2->m_qwords[i]; // only print out words ignored cuz they were stop words if ( qw->m_ignoreWord != IGNORE_QSTOP ) continue; // print header -- we got one if ( firstIgnored ) { if ( si->m_format == FORMAT_XML ) sb->safePrintf ("\tm_format == FORMAT_HTML ) sb->safePrintf ("   The " "following query words " "were ignored: " ""); firstIgnored = false; } // print the word char *t = qw->m_word; long tlen = qw->m_wordLen; sb->utf8Encode2 ( t , tlen ); sb->safePrintf (" "); } // print tail if 
we had ignored terms if ( ! firstIgnored ) { sb->incrementLength(-1); if ( si->m_format == FORMAT_XML ) sb->safePrintf("]]>\n"); else if ( si->m_format == FORMAT_HTML ) sb->safePrintf (". Preceed each with a '+' or " "wrap in " "quotes to not ignore."); } if ( si->m_format == FORMAT_HTML ) sb->safePrintf("

"); if ( si->m_format == FORMAT_HTML ) sb->safePrintf("" "" // ,ps->m_finalScore // ); // . print out the breakout tables then // . they should pop-up when the user // mouses over a cell in the distance matrix //sb->safePrintf("
"); // two pane table //if ( si->m_format == FORMAT_HTML ) // sb->safePrintf(""); // did we get a spelling recommendation? if ( si->m_format == FORMAT_HTML && st->m_spell[0] ) { // encode the spelling recommendation long len = gbstrlen ( st->m_spell ); char qe2[MAX_FRAG_SIZE]; urlEncode(qe2, MAX_FRAG_SIZE, st->m_spell, len); sb->safePrintf ("Did you mean:" " " "safePrintf ("\">"); sb->utf8Encode2(st->m_spell, len); // then finish it off sb->safePrintf ("\n

\n"); } // . Wrap results in a table if we are using ads. Easier to display. //Ads *ads = &st->m_ads; //if ( ads->hasAds() ) // sb->safePrintf("\n" // "
\n"); // debug if ( si->m_debug ) logf(LOG_DEBUG,"query: Printing up to %li results. " "bufStart=0x%lx", numResults,(long)sb->getBuf()); // // BEGIN PRINT THE RESULTS // //sb->safePrintf(""); //sb->safePrintf(""); /* sb->safePrintf( "\n" "
CLICK ME
\n" ); */ /* if ( si->m_format == FORMAT_HTML ) sb->safePrintf("incrementLength(-2); //} if ( si->m_format == FORMAT_JSON ) { // remove last },\n if there and replace with just \n char *e = sb->getBuf() - 2; if ( sb->length()>=2 && e[0]==',' && e[1]=='\n') { sb->m_length -= 2; sb->safePrintf("\n"); } // print ending ] for json sb->safePrintf("]\n"); // when streaming results we lookup the facets last if ( si->m_streamResults ) msg40->printFacetTables ( sb ); if ( st->m_header ) sb->safePrintf("}\n"); // all done for json return true; } // grab the query char *q = msg40->getQuery(); long qlen = msg40->getQueryLen(); HttpRequest *hr = &st->m_hr; // get some result info from msg40 long firstNum = msg40->getFirstResultNum() ; // end the two-pane table if ( si->m_format == FORMAT_HTML) sb->safePrintf("
"); // for storing a list of all of the sites we displayed, now we print a // link at the bottom of the page to ban all of the sites displayed // with one click SafeBuf banSites; //long tailLen = 0; //char *tail = NULL; // // PRINT PREV 10 NEXT 10 links! // // center everything below here if ( si->m_format == FORMAT_HTML ) sb->safePrintf ( "
" ); long remember = sb->length(); // now print "Prev X Results" if we need to if ( firstNum < 0 ) firstNum = 0; char abuf[300]; SafeBuf args(abuf,300); // show banned? if ( si->m_showBanned && ! si->m_isAdmin ) args.safePrintf("&sb=1"); if ( ! si->m_showBanned && si->m_isAdmin ) args.safePrintf("&sb=0"); // collection args.safePrintf("&c=%s",coll); // formatting info if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) { args.safePrintf("&format=widget"); HttpRequest *hr = &st->m_hr; long widgetwidth = hr->getLong("widgetwidth",250); args.safePrintf("&widgetwidth=%li",widgetwidth); } // carry over the sites we are restricting the search results to if ( si->m_sites ) //whiteListBuf.getBufStart()); args.safePrintf("&sites=%s",si->m_sites); if ( firstNum > 0 && (si->m_format == FORMAT_HTML || si->m_format == FORMAT_WIDGET_IFRAME //|| //si->m_format == FORMAT_WIDGET_AJAX ) ) { long ss = firstNum - msg40->getDocsWanted(); sb->safePrintf("safeStrcpy ( st->m_qe ); // print other args if not zero sb->safeMemcpy ( &args ); // close it up sb->safePrintf ("\">" "Prev %li Results" "", msg40->getDocsWanted() ); } // now print "Next X Results" if ( msg40->moreResultsFollow() && (si->m_format == FORMAT_HTML || si->m_format == FORMAT_WIDGET_IFRAME //si->m_format == FORMAT_WIDGET_AJAX )) { long ss = firstNum + msg40->getDocsWanted(); // print a separator first if we had a prev results before us if ( sb->length() > remember ) sb->safePrintf ( "   " ); // add the query sb->safePrintf ("safeStrcpy ( st->m_qe ); // print other args if not zero sb->safeMemcpy ( &args ); // close it up sb->safePrintf("\">" "Next %li Results" "", msg40->getDocsWanted() ); } // print try this search on... // an additional
if we had a Next or Prev results link if ( sb->length() > remember ) sb->safeMemcpy ("
" , 4 ); // // END PRINT PREV 10 NEXT 10 links! // // end results table cell... and print calendar at top //tail = cr->m_htmlTail; //tailLen = gbstrlen (tail ); //if ( si->m_format == FORMAT_HTML ) sb->safeMemcpy ( tail , tailLen ); if ( si->m_format == FORMAT_HTML ) { /* sb->safePrintf("" "" "" "" ); sb->safePrintf("
" "" "htmlEncode ( si->m_sbuf1.getBufStart() , si->m_sbuf1.length() , false ); sb->safePrintf("\">" "" "
" "
"); */ sb->safePrintf("",coll); } bool isAdmin = si->m_isAdmin; if ( si->m_format != FORMAT_HTML ) isAdmin = false; if ( isAdmin && banSites.length() > 0 ) sb->safePrintf ("

" "
\n ", coll, banSites.getBufStart()); // TODO: print cache line in light gray here // TODO: "these results were cached X minutes ago" if ( msg40->getCachedTime() > 0 && si->m_format == FORMAT_HTML ) { sb->safePrintf("

"); sb->safePrintf ( " These results were cached " ); // this cached time is this local cpu's time long diff = getTime() - msg40->getCachedTime(); if ( diff < 60 ) sb->safePrintf ( "%li seconds" , diff ); else if ( diff < 2*60 ) sb->safePrintf ( "1 minute"); else sb->safePrintf ( "%li minutes",diff/60); sb->safePrintf ( " ago. [" "Info]"); sb->safePrintf ( "
"); } if ( si->m_format == FORMAT_XML ) { // when streaming results we lookup the facets last if ( si->m_streamResults ) msg40->printFacetTables ( sb ); sb->safePrintf("\n"); } // if we did not use ajax, print this tail here now if ( si->m_format == FORMAT_HTML && ! g_conf.m_isMattWells ) { sb->safePrintf ( "
" "
" "" "Copyright © 2014. All Rights " "Reserved.
" "Powered by the GigaBlast open source " "search engine." "
" "
\n" "\n" "\n" ); } // ajax widgets will have this outside the downloaded content if ( si->m_format == FORMAT_WIDGET_IFRAME ) { sb->safePrintf ( "
" "
" "" // link to edit the list of widget sites // or various other widget content properties // because we can't edit the width/height // of the widget like this. "edit " "• " //"Copyright © 2014. All Rights " //"Reserved.
" "Powered by " "Diffbot." "
" "
\n" "\n" "\n" ); } if ( sb->length() == 0 && si && si->m_format == FORMAT_JSON ) sb->safePrintf("[]\n"); if ( sb->length() == 0 ) { sb->pushChar('\n'); sb->nullTerm(); } if ( si->m_format == FORMAT_HTML && cr->m_htmlTail.length() && ! expandHtml ( *sb , cr->m_htmlTail.getBufStart(), cr->m_htmlTail.length(), q, qlen, hr, si, NULL, // method, cr) ) return false; return true; } bool printTimeAgo ( SafeBuf *sb , long ts , char *prefix , SearchInput *si ) { // Jul 23, 1971 sb->reserve2x(200); long now = getTimeGlobal(); // for printing long mins = 1000; long hrs = 1000; long days ; if ( ts > 0 ) { mins = (long)((now - ts)/60); hrs = (long)((now - ts)/3600); days = (long)((now - ts)/(3600*24)); if ( mins < 0 ) mins = 0; if ( hrs < 0 ) hrs = 0; if ( days < 0 ) days = 0; } // print the time ago if ( mins ==1)sb->safePrintf(" - %s: %li minute ago",prefix,mins); else if (mins<60)sb->safePrintf ( " - %s: %li minutes ago",prefix,mins); else if ( hrs == 1 )sb->safePrintf ( " - %s: %li hour ago",prefix,hrs); else if ( hrs < 24 )sb->safePrintf ( " - %s: %li hours ago",prefix,hrs); else if ( days == 1 )sb->safePrintf ( " - %s: %li day ago",prefix,days); else if (days< 7 )sb->safePrintf ( " - %s: %li days ago",prefix,days); // do not show if more than 1 wk old! we want to seem as // fresh as possible else if ( ts > 0 ) { // && si->m_isAdmin ) { struct tm *timeStruct = localtime ( &ts ); sb->safePrintf(" - %s: ",prefix); char tmp[100]; strftime(tmp,100,"%b %d %Y",timeStruct); sb->safeStrcpy(tmp); } return true; } int linkSiteRankCmp (const void *v1, const void *v2) { Inlink *i1 = *(Inlink **)v1; Inlink *i2 = *(Inlink **)v2; return i2->m_siteRank - i1->m_siteRank; } bool printInlinkText ( SafeBuf *sb , Msg20Reply *mr , SearchInput *si , long *numPrinted ) { *numPrinted = 0; // . show the "LinkInfo" // . 
Msg20.cpp will have "computed" the LinkInfo if we set // Msg20Request::m_computeLinkInfo to true, but if we set // Msg20Request::m_getLinkInfo to true it will just get it // from the TitleRec, which is much faster but more stale. // . "&inlinks=1" is slow and fresh, "&inlinks=2" is fast // and stale. Both are really only for BuzzLogic. LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks; // sanity if ( info && mr->size_linkInfo != info->m_size ){char *xx=NULL;*xx=0; } // NULLify if empty if ( mr->size_linkInfo <= 0 ) info = NULL; // do not both if none if ( info && ! info->m_numStoredInlinks ) info = NULL; // bail? if ( ! info ) return true; // now sort them up Inlink *k = info->getNextInlink(NULL); // #define from Linkdb.h Inlink *ptrs[MAX_LINKERS]; long numLinks = 0; for ( ; k ; k = info->getNextInlink(k) ) { ptrs[numLinks++] = k; if ( numLinks >= MAX_LINKERS ) break; } // sort them gbsort ( ptrs , numLinks , 4 , linkSiteRankCmp ); // print xml starter if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t\n"); // loop through the inlinks bool printedInlinkText = false; bool firstTime = true; long inlinkId = 0; long long starttime = gettimeofdayInMillisecondsLocal(); //long icount = 0; //long ecount = 0; //long absSum = 0; for ( long i = 0 ; i < numLinks ; i++ ) { k = ptrs[i]; if ( ! k->ptr_linkText ) continue; if ( ! si->m_doQueryHighlighting && si->m_format == FORMAT_HTML ) continue; char *str = k-> ptr_linkText; long strLen = k->size_linkText; //char tt[1024*3]; //char *ttend = tt + 1024*3; char *frontTag = "" ; char *backTag = ""; if ( si->m_format == FORMAT_XML ) { frontTag = ""; backTag = ""; } if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) { frontTag = "" ; } Highlight hi; SafeBuf hb; long hlen = hi.set ( &hb,//tt , //ttend - tt , str, strLen , mr->m_language, // docLangId &si->m_hqq , // highlight query CLASS false , // doStemming? false , // use click&scroll? 
NULL , // base url frontTag, backTag, 0, 0 ); // niceness if ( hlen <= 0 ) continue; // skip it if nothing highlighted if ( hi.getNumMatches() == 0 ) continue; if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t\tm_docId ); // encode it for xml sb->htmlEncode ( k->ptr_urlBuf, k->size_urlBuf - 1 , false ); sb->safePrintf("\" " //"hostId=\"%lu\" " "firstindexed=\"%lu\" " // not accurate! //"lastspidered=\"%lu\" " "wordposstart=\"%li\" " "id=\"%li\" " "siterank=\"%li\" " "text=\"", //hh , //(long)k->m_datedbDate, (unsigned long)k->m_firstIndexedDate, //(unsigned long)k->m_lastSpidered, (long)k->m_wordPosStart, inlinkId, //linkScore); (long)k->m_siteRank ); // HACK!!! k->m_siteHash = inlinkId; // inc it inlinkId++; // encode it for xml if ( !sb->htmlEncode ( hb.getBufStart(), hb.length(), false)) return false; sb->safePrintf("\"/>\n"); continue; } if ( firstTime ) { sb->safePrintf(""); sb->safePrintf("" "" "" "" "" "" "" ); } firstTime = false; sb->safePrintf("",(long)k->m_siteRank); //sb->safePrintf("
"); printedInlinkText = true; *numPrinted = *numPrinted + 1; } long long took = gettimeofdayInMillisecondsLocal() - starttime; if ( took > 2 ) log("timing: took %lli ms to highlight %li links." ,took,numLinks); // closer for xml if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t\n"); //if ( printedInlinkText ) sb->safePrintf("
\n"); if ( printedInlinkText ) sb->safePrintf("" "
" "
" "Inlinks with Query Terms" "
" "
Inlink TextFromSite Rank
" "" //"" //k->ptr_urlBuf); ,si->m_cr->m_coll ,k->m_docId); if ( ! sb->safeMemcpy(&hb) ) return false; long hostLen = 0; char *host = getHostFast(k->ptr_urlBuf,&hostLen,NULL); sb->safePrintf(""); if ( host ) sb->safeMemcpy(host,hostLen); sb->safePrintf("%li
" "
"); return true; } // // . print a dmoz topic for the given numeric catid UNDER search result // . print "Search in Category" link as well // static bool printDMOZCategoryUnderResult ( SafeBuf *sb , SearchInput *si, long catid , State0 *st ) { char format = si->m_format; // these are handled in the logic below now if ( format == FORMAT_XML ) return true; if ( format == FORMAT_JSON ) return true; // if ( format == FORMAT_XML ) { // sb->safePrintf("\t\t\n" // "\t\t\t%li\n" // "\t\t\tprintPathFromId(&xb, catid, false,si->m_isRTL); // sb->cdataEncode(xb.getBufStart()); // sb->safePrintf("]]>\n"); // sb->safePrintf("\t\t\n"); // return true; // } // if ( format == FORMAT_JSON ) { // sb->safePrintf("\t\t\"dmozCat\":{\n" // "\t\t\t\"dmozCatId\":%li,\n" // "\t\t\t\"dmozCatStr\":\"" // ,catid); // // print the name of the dmoz category // char xbuf[256]; // SafeBuf xb(xbuf,256,0,false); // g_categories->printPathFromId(&xb, catid, false,si->m_isRTL); // sb->jsonEncode(xb.getBufStart()); // sb->safePrintf("\"\n" // "\t\t},\n"); // return true; // } //uint8_t queryLanguage = langUnknown; uint8_t queryLanguage = si->m_queryLangId; // Don't print category if not in native language category // Note that this only trims out "World" cats, not all // of them. Some of them may still sneak in. 
//if(si->m_langHint) // queryLanguage = si->m_langHint; if(queryLanguage != langUnknown) { char tmpbuf[1024]; SafeBuf langsb(tmpbuf, 1024); g_categories->printPathFromId(&langsb, catid, false); char *ptr = langsb.getBufStart(); uint8_t lang = g_langId.findLangFromDMOZTopic(ptr + 7); if(!strncmp("World: ", ptr, 6) && lang != langUnknown && lang != queryLanguage) // do not print it if not in our language return true; } ////// // // print a link to apply your query to this DMOZ category // ////// sb->safePrintf("urlEncode("|",1); sb->urlEncode(si->m_sbuf1.getBufStart(),si->m_sbuf1.length()); sb->safePrintf("\">Search in Category: "); // setup the host of the url //if ( dmozHost ) // sb->safePrintf("safePrintf("printPathFromId(sb, catid, true,si->m_isRTL); sb->safePrintf("/\">"); // print the name of the dmoz category sb->safePrintf(""); g_categories->printPathFromId(sb, catid, false,si->m_isRTL); sb->safePrintf("
"); //++tr.brCount; return true; } // use this for xml as well as html bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) { SafeBuf *sb = &st->m_sb; HttpRequest *hr = &st->m_hr; CollectionRec *cr = NULL; cr = g_collectiondb.getRec ( st->m_collnum ); if ( ! cr ) { log("query: printResult: collnum %li gone", (long)st->m_collnum); return true; } // shortcuts SearchInput *si = &st->m_si; Msg40 *msg40 = &st->m_msg40; // ensure not all cluster levels are invisible if ( si->m_debug ) logf(LOG_DEBUG,"query: result #%li clusterlevel=%li", ix, (long)msg40->getClusterLevel(ix)); long long d = msg40->getDocId(ix); if ( si->m_docIdsOnly ) { if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\n" "\t\t%lli\n" "\t\n", d ); else if ( si->m_format == FORMAT_JSON ) sb->safePrintf("\t\{\n" "\t\t\"docId\":%lli\n" "\t},\n", d ); else sb->safePrintf("%lli
\n", d ); // inc it *numPrintedSoFar = *numPrintedSoFar + 1; return true; } Msg20 *m20 ; if ( si->m_streamResults ) m20 = msg40->getCompletedSummary(ix); else m20 = msg40->m_msg20[ix]; // get the reply Msg20Reply *mr = m20->m_r; // . sometimes the msg20reply is NULL so prevent it coring // . i think this happens if all hosts in a shard are down or timeout // or something if ( ! mr ) return false; // . if section voting info was request, display now, it's in json // . so if in csv it will mess things up!!! if ( mr->ptr_sectionVotingInfo ) // it is possible this is just "\0" sb->safeStrcpy ( mr->ptr_sectionVotingInfo ); // each "result" is the actual cached page, in this case, a json // object, because we were called with &icc=1. in that situation // ptr_content is set in the msg20reply. if ( si->m_format == FORMAT_CSV && mr->ptr_content && mr->m_contentType == CT_JSON ) { // parse it up char *json = mr->ptr_content; // only print header row once, so pass in that flag if ( ! st->m_printedHeaderRow ) { sb->reset(); printCSVHeaderRow ( sb , st ); st->m_printedHeaderRow = true; } printJsonItemInCSV ( json , sb , st ); // inc it *numPrintedSoFar = *numPrintedSoFar + 1; return true; } // just print cached web page? if ( mr->ptr_content ) { // for json items separate with \n,\n if ( si->m_format != FORMAT_HTML && *numPrintedSoFar > 0 ) sb->safePrintf(",\n"); // a dud? just print empty {}'s if ( mr->size_content == 1 ) sb->safePrintf("{}"); else sb->safeStrcpy ( mr->ptr_content ); // . let's hack the spidertime onto the end // . 
so when we sort by that using gbsortby:spiderdate // we can ensure it is ordered correctly // As of the update on 5/13/2014, the end of sb may have whitespace, so first move away from that int distance; // distance from end to first non-whitespace char char *end; for (distance = 1; distance < sb->getLength(); distance++) { end = sb->getBuf() - distance; if (!is_wspace_a(*end)) break; } if ( si->m_format == FORMAT_JSON && end > sb->getBufStart() && *end == '}' ) { // replace trailing } with spidertime} sb->incrementLength(-distance); // comma? if ( mr->size_content>1 ) sb->pushChar(','); sb->safePrintf("\"docId\":%lli", mr->m_docId); sb->safePrintf(",\"gburl\":\""); sb->jsonEncode(mr->ptr_ubuf); sb->safePrintf("\""); // for deduping //sb->safePrintf(",\"crc\":%lu",mr->m_contentHash32); // crap, we lose resolution storing as a float // so fix that shit here... //float f = mr->m_lastSpidered; //sb->safePrintf(",\"lastCrawlTimeUTC\":%.0f}",f); // MDW: this is VERY convenient for debugging pls // leave in. we can easily see if a result // should be there for a query like // gbmin:gbspiderdate:12345678 sb->safePrintf(",\"lastCrawlTimeUTC\":%li", mr->m_lastSpidered); // also include a timestamp field with an RFC 1123 formatted date char timestamp[50]; struct tm *ptm = gmtime ( &mr->m_lastSpidered ); strftime(timestamp, 50, "%a, %d %b %Y %X %Z", ptm); sb->safePrintf(",\"timestamp\":\"%s\"}\n", timestamp); } //mr->size_content ); if ( si->m_format == FORMAT_HTML ) sb->safePrintf("\n\n

\n\n"); // inc it *numPrintedSoFar = *numPrintedSoFar + 1; // just in case sb->nullTerm(); return true; } if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\n" ); if ( si->m_format == FORMAT_JSON ) { if ( *numPrintedSoFar != 0 ) sb->safePrintf(",\n"); sb->safePrintf("\t{\n" ); } Highlight hi; // get the url char *url = mr->ptr_ubuf ; long urlLen = mr->size_ubuf - 1 ; long err = mr->m_errno ; // . remove any session ids from the url // . for speed reasons, only check if its a cgi url Url uu; uu.set ( url , urlLen, false, true ); url = uu.getUrl(); urlLen = uu.getUrlLen(); // get my site hash unsigned long long siteHash = 0; if ( uu.getHostLen() > 0 ) siteHash = hash64(uu.getHost(),uu.getHostLen()); // indent it if level is 2 bool indent = false; bool isAdmin = si->m_isAdmin; if ( si->m_format == FORMAT_XML ) isAdmin = false; //unsigned long long lastSiteHash = siteHash; if ( indent && si->m_format == FORMAT_HTML ) sb->safePrintf("
"); // print the rank. it starts at 0 so add 1 if ( si->m_format == FORMAT_HTML && si->m_streamResults ) //sb->safePrintf("
%li.", // ix+1 ); sb->safePrintf("
"); else if ( si->m_format == FORMAT_HTML ) //sb->safePrintf("
%li.", // ix+1 + si->m_firstResultNum ); sb->safePrintf("
"); if ( si->m_showBanned ) { if ( err == EDOCBANNED ) err = 0; if ( err == EDOCFILTERED ) err = 0; } // if this msg20 had an error print "had error" if ( err || urlLen <= 0 || ! url ) { // it's unprofessional to display this in browser // so just let admin see it if ( isAdmin ) { sb->safePrintf("docId %lli had error: " "%s

", mr->m_docId,//msg40->getDocId(i), mstrerror(err)); } // log it too! log("query: docId %lli had error: %s.", mr->m_docId,mstrerror(err)); // wrap it up if clustered if ( indent ) sb->safeMemcpy("",13); // inc it *numPrintedSoFar = *numPrintedSoFar + 1; return true; } // the score if admin /* if ( isAdmin ) { long level = (long)msg40->getClusterLevel(ix); // print out score sb->safePrintf ( "s=%.03f " "docid=%llu " "sitenuminlinks=%li%% " "hop=%li " "cluster=%li " "summaryLang=%s " "(%s)
", (float)msg40->getScore(ix) , mr->m_docId, (long )mr->m_siteNumInlinks, (long)mr->m_hopcount, level , getLanguageString(mr->m_summaryLanguage), g_crStrings[level]); } */ char *diffbotSuffix = strstr(url,"-diffbotxyz"); // print youtube and metacafe thumbnails here // http://www.youtube.com/watch?v=auQbi_fkdGE // http://img.youtube.com/vi/auQbi_fkdGE/2.jpg // get the thumbnail url if ( mr->ptr_imgUrl && si->m_format == FORMAT_HTML && // if we got thumbnail use that not this ! mr->ptr_imgData ) sb->safePrintf ("", url,mr->ptr_imgUrl); // if we have a thumbnail show it next to the search result, // base64 encoded if ( //(si->m_format == FORMAT_HTML || si->m_format == FORMAT_XML ) && //! mr->ptr_imgUrl && si->m_showImages && mr->ptr_imgData ) { ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData; ThumbnailInfo *ti = ta->getThumbnailInfo(0); if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t"); ti->printThumbnailInHtml ( sb , 100 , // max width 100 , // max height true , // add NULL , " style=\"margin:10px;\" ", si->m_format ); if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t%li\n", ti->m_dy); sb->safePrintf("\t\t%li\n", ti->m_dx); sb->safePrintf("\t\t%li" "\n", ti->m_origDY); sb->safePrintf("\t\t%li" "\n", ti->m_origDX); sb->safePrintf("\t\tcdataEncode(ti->getUrl()); sb->safePrintf("]]>\n"); } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"imageHeight\":%li,\n", ti->m_dy); sb->safePrintf("\t\t\"imageWidth\":%li,\n", ti->m_dx); sb->safePrintf("\t\t\"origImageHeight\":%li,\n", ti->m_origDY); sb->safePrintf("\t\t\"origImageWidth\":%li,\n", ti->m_origDX); sb->safePrintf("\t\t\"imageUrl\":\""); sb->jsonEncode(ti->getUrl()); sb->safePrintf("\",\n"); } } // print image for widget if ( //mr->ptr_imgUrl && ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX || si->m_format == FORMAT_WIDGET_APPEND ) ) { long widgetWidth = hr->getLong("widgetwidth",200); // prevent coring if ( widgetWidth < 1 ) widgetWidth = 1; // each search 
result in widget has a div around it sb->safePrintf("
" , mr->m_docId // this is a double now. this won't work // for streaming... , msg40->m_msg3a.m_scores[ix] , widgetWidth - 2*8 // padding is 8px , (long)RESULT_HEIGHT , (long)RESULT_HEIGHT , (long)PADDING ); // if ( mr->ptr_imgUrl ) // sb->safePrintf("background-repeat:no-repeat;" // "background-size:%lipx 140px;" // "background-image:url('%s');" // , widgetwidth - 2*8 // padding is 8px // , mr->ptr_imgUrl); long newdx = 0; if ( mr->ptr_imgData ) { ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData; ThumbnailInfo *ti = ta->getThumbnailInfo(0); // account for scrollbar on the right long maxWidth = widgetWidth - (long)SCROLLBAR_WIDTH; long maxHeight = (long)RESULT_HEIGHT; // false = do not print link on image ti->printThumbnailInHtml ( sb , maxWidth , maxHeight , false , // add &newdx ); } // end the div style attribute and div tag //sb->safePrintf("\">"); sb->safePrintf ( " .5 * widgetWidth ) sb->safePrintf("position:absolute;" "bottom:%li;" "left:%li;" , (long) PADDING , (long) PADDING ); // to align the text verticall we gotta make a textbox div // otherwise it wraps below image! mdw //else // sb->safePrintf("vertical-align:middle;"); else sb->safePrintf("position:absolute;" "bottom:%li;" "left:%li;" , (long) PADDING , (long) PADDING + newdx + 10 ); // close the style and begin the url sb->safePrintf( "\" " "href=\"" ); // truncate off -diffbotxyz%li long newLen = urlLen; if ( diffbotSuffix ) newLen = diffbotSuffix - url; // print the url in the href tag sb->safeMemcpy ( url , newLen ); // then finish the a href tag and start a bold for title sb->safePrintf ( "\">");//" ); sb->safePrintf(""); //sb->safePrintf ("", // mr->ptr_imgUrl); // then title over image } // only do link here if we have no thumbnail so no bg image if ( (si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) && ! 
mr->ptr_imgData ) { sb->safePrintf ( "");//" ); } // the a href tag if ( si->m_format == FORMAT_HTML ) sb->safePrintf ( "\n\n" ); // then if it is banned if ( mr->m_isBanned && si->m_format == FORMAT_HTML ) sb->safePrintf("BANNED "); /////// // // PRINT THE TITLE // /////// // the a href tag if ( si->m_format == FORMAT_HTML ) { sb->safePrintf ( "");//" ); } // . then the title (should be NULL terminated) // . the title can be NULL // . highlight it first // . the title itself should not have any tags in it! char *str = mr->ptr_tbuf;//msg40->getTitle(i); long strLen = mr->size_tbuf - 1;// msg40->getTitleLen(i); if ( ! str || strLen < 0 ) strLen = 0; ///// // // are we printing a dmoz category page? // get the appropriate dmoz title/summary to use since the same // url can exist in multiple topics (catIds) with different // titles summaries. // ///// char *dmozSummary2 = NULL; // TODO: just get the catid from httprequest directly? if ( si->m_catId > 0 ) { // si->m_cat_dirId > 0) { // . get the dmoz title and summary // . 
if empty then just a bunch of \0s, except for catIds Msg20Reply *mr = m20->getReply(); char *dmozTitle = mr->ptr_dmozTitles; dmozSummary2 = mr->ptr_dmozSumms; char *dmozAnchor = mr->ptr_dmozAnchors; long *catIds = mr->ptr_catIds; long numCats = mr->size_catIds / 4; // loop through looking for the right ID for (long i = 0; i < numCats ; i++ ) { // assign shit if we match the dmoz cat we are showing if ( catIds[i] == si->m_catId) break; dmozTitle +=gbstrlen(dmozTitle)+1; dmozSummary2 +=gbstrlen(dmozSummary2)+1; dmozAnchor += gbstrlen(dmozAnchor)+1; } // now make the title the dmoz title str = dmozTitle; strLen = gbstrlen(str); } long hlen; //copy all summary and title excerpts for this result into here //char tt[1024*32]; //char *ttend = tt + 1024*32; char *frontTag = "" ; char *backTag = ""; if ( si->m_format == FORMAT_XML ) { frontTag = ""; backTag = ""; } if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) { frontTag = "" ; } long cols = 80; cols = si->m_summaryMaxWidth; SafeBuf hb; if ( str && strLen && si->m_doQueryHighlighting ) { hlen = hi.set ( &hb, //tt , //ttend - tt , str, strLen , mr->m_language, // docLangId &si->m_hqq , // highlight query CLASS false , // doStemming? false , // use click&scroll? NULL , // base url frontTag, backTag, 0, 0 ); // niceness // reassign! str = hb.getBufStart(); strLen = hb.getLength(); //if (!sb->utf8Encode2(tt, hlen)) return false; // if ( si->m_format != FORMAT_JSON ) // if ( ! sb->brify ( hb.getBufStart(), // hb.getLength(), // 0, // cols) ) return false; } // . use "UNTITLED" if no title // . 
msg20 should supply the dmoz title if it can if ( strLen == 0 && si->m_format != FORMAT_XML && si->m_format != FORMAT_JSON ) { str = "UNTITLED"; strLen = gbstrlen(str); } if ( str && strLen && ( si->m_format == FORMAT_HTML || si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) ) { // determine if TiTle wraps, if it does add a
count for // each wrap //if (!sb->utf8Encode2(str , strLen )) return false; if ( ! sb->brify ( str,strLen,0,cols) ) return false; } // close up the title tag if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t<![CDATA["); if ( str ) sb->cdataEncode(str); sb->safePrintf("]]>\n"); } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"title\":\""); if ( str ) sb->jsonEncode(str); sb->safePrintf("\",\n"); } if ( si->m_format == FORMAT_HTML ) sb->safePrintf ("

\n" ) ; // close the title tag stuf if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) sb->safePrintf("
\n"); // // print

tag contents. hack for client. // char *hp = mr->ptr_htag; char *hpend = hp + mr->size_htag; for ( ; hp && hp < hpend ; ) { if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\tcdataEncode(hp); sb->safePrintf("]]>\n"); } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"h1Tag\":\""); sb->jsonEncode(hp); sb->safePrintf("\",\n"); } // it is a \0 separated list of headers generated from // XmlDoc::getHeaderTagBuf() hp += gbstrlen(hp) + 1; } // print all dmoz info for xml/json. // seems like both direct and indirect dmoz entries here. if ( mr->size_catIds > 0 && ( si->m_format == FORMAT_JSON || si->m_format == FORMAT_XML ) ) { char *dmozTitle = mr->ptr_dmozTitles; char *dmozSummary = mr->ptr_dmozSumms; char *dmozAnchor = mr->ptr_dmozAnchors; long *catIds = mr->ptr_catIds; long numCats = mr->size_catIds / 4; // loop through looking for the right ID for (long i = 0; i < numCats ; i++ ) { printDmozEntry ( sb, catIds[i], true, dmozTitle, dmozSummary, dmozAnchor , si ); dmozTitle += gbstrlen(dmozTitle ) + 1; dmozSummary += gbstrlen(dmozSummary) + 1; dmozAnchor += gbstrlen(dmozAnchor ) + 1; } } if ( mr->size_indCatIds > 0 && ( si->m_format == FORMAT_JSON || si->m_format == FORMAT_XML ) ) { // print INDIRECT dmoz entries as well long nIndCatids = mr->size_indCatIds / 4; for ( long i = 0; i < nIndCatids; i++ ) { long catId = ((long *)(mr->ptr_indCatIds))[i]; if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t" "%li\n", catId); if ( si->m_format == FORMAT_JSON ) sb->safePrintf("\t\t\"indirectDmozCatId\":" "%li,\n",catId); } // print INDIRECT dmoz entries as well // long nIndCatids = mr->size_indCatIds / 4; // dmozTitle = mr->ptr_indDmozTitles; // dmozSummary = mr->ptr_dmozSumms; // dmozAnchor = mr->ptr_dmozAnchors; // for ( long i = 0; i < nIndCatids; i++ ) { // long catId = ((long *)(mr->ptr_indCatIds))[i]; // printDmozEntry ( sb , // catId , // false, // dmozTitle, // dmozSummary, // dmozAnchor , // si ); // dmozTitle += gbstrlen(dmozTitle ) + 1; // dmozSummary 
+= gbstrlen(dmozSummary) + 1; // dmozAnchor += gbstrlen(dmozAnchor ) + 1; // } } ///// // // print content type after title // ///// unsigned char ctype = mr->m_contentType; char *cs = g_contentTypeStrings[ctype]; if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t" "" "\n", cs); if ( si->m_format == FORMAT_JSON ) sb->safePrintf("\t\t\"contentType\":\"%s\",\n",cs); if ( si->m_format == FORMAT_HTML && ctype != CT_HTML && ctype != CT_UNKNOWN ){ sb->safePrintf(" "); char *p = cs; for ( ; *p ; p++ ) { char c = to_upper_a(*p); sb->pushChar(c); } sb->safePrintf("  "); } //////////// // // print the summary // //////////// // . then the summary // . "s" is a string of null terminated strings //char *send; // do the normal summary str = mr->ptr_displaySum; // sometimes the summary is longer than requested because for // summary deduping purposes (see "pss" parm in Parms.cpp) we do not // get it as short as request. so use mr->m_sumPrintSize here // not mr->size_sum strLen = mr->size_displaySum - 1;//-1; // this includes the terminating \0 or \0\0 so back up if ( strLen < 0 ) strLen = 0; //send = str + strLen; // dmoz summary might override if we are showing a dmoz topic page if ( dmozSummary2 && (si->m_catId>0 || strLen<=0) ) { str = dmozSummary2; strLen = gbstrlen(dmozSummary2); } bool printSummary = true; // do not print summaries for widgets by default unless overridden // with &summary=1 if ( (si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) && hr->getLong("summaries",0) == 0 ) printSummary = false; if ( printSummary && si->m_format == FORMAT_HTML ) sb->brify ( str , strLen, 0 , cols ); // niceness = 0 if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\tcdataEncode(str); sb->safePrintf("]]>\n"); } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"sum\":\""); sb->jsonEncode(str); sb->safePrintf("\",\n"); } // new line if not xml if ( si->m_format == FORMAT_HTML && strLen ) 
sb->safePrintf("
\n"); //////////// // // . print DMOZ topics under the summary // . will print the "Search in Category" link too // //////////// //Msg20Reply *mr = m20->getMsg20Reply(); long nCatIds = mr->getNumCatIds(); for (long i = 0; i < nCatIds; i++) { long catid = ((long *)(mr->ptr_catIds))[i]; printDMOZCategoryUnderResult(sb,si,catid,st); } // skipCatsPrint: // print the indirect category Ids long nIndCatids = mr->size_indCatIds / 4; //if ( !cr->m_displayIndirectDmozCategories ) // goto skipCatsPrint2; for ( long i = 0; i < nIndCatids; i++ ) { long catid = ((long *)(mr->ptr_indCatIds))[i]; // skip it if it's a regular category //bool skip = false; long d; for ( d = 0; d < nCatIds; d++) { if ( catid == mr->ptr_catIds[i] ) break; } // skip if the indirect catid matched a directed catid if ( d < nCatIds ) continue; // otherwise print it printDMOZCategoryUnderResult(sb,si,catid,st); } /////////// // // print facet field/values // // if there was a gbfacet*: term (gbfacetstr, gbfacetfloat, gbfacetint) // this should be non-NULL and have the facet field/value pairs // and every string ends in a \0 // ////////// char *fp = mr->ptr_facetBuf; char *fpEnd = fp + mr->size_facetBuf; for ( ; fp && fp < fpEnd ; ) { if ( si->m_format == FORMAT_HTML ) { // print first one sb->safePrintf(""); sb->safeStrcpy(fp); sb->safePrintf(""); sb->safePrintf("   :   "); sb->safePrintf(""); fp += gbstrlen(fp) + 1; sb->htmlEncode(fp); // begin a new pair sb->safePrintf(""); sb->safeStrcpy("
\n"); fp += gbstrlen(fp) + 1; } else if ( si->m_format == FORMAT_XML ) { // print first one sb->safePrintf("\t\t\n" "\t\t\tcdataEncode(fp); sb->safePrintf("]]>\n"); fp += gbstrlen(fp) + 1; sb->safePrintf("\t\t\tcdataEncode(fp); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\n"); fp += gbstrlen(fp) + 1; } else if ( si->m_format == FORMAT_JSON ) { // print first one sb->safePrintf("\t\t\"facet\":{\n"); sb->safePrintf("\t\t\t\"field\":\""); sb->jsonEncode(fp); sb->safePrintf("\",\n"); fp += gbstrlen(fp) + 1; sb->safePrintf("\t\t\t\"value\":\""); sb->jsonEncode(fp); sb->safePrintf("\"\n"); fp += gbstrlen(fp) + 1; sb->safePrintf("\t\t},\n"); } } //////////// // // print the URL // //////////// // hack off the http:// if any for displaying it on screen if ( urlLen > 8 && strncmp ( url , "http://" , 7 )==0 ) { url += 7; urlLen -= 7; } // . remove trailing / // . only remove from root urls in case user cuts and // pastes it for link: search if ( url [ urlLen - 1 ] == '/' ) { // see if any other slash before us long j; for ( j = urlLen - 2 ; j >= 0 ; j-- ) if ( url[j] == '/' ) break; // if there wasn't, we must have been a root url // so hack off the last slash if ( j < 0 ) urlLen--; } if ( si->m_format == FORMAT_HTML ) { sb->safePrintf ("" ); //sb->htmlEncode ( url , gbstrlen(url) , false ); // 20 for the date after it sb->safeTruncateEllipsis ( url , 50 ); // cols - 30 ); // turn off the color sb->safePrintf ( "\n" ); } if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\tsafeMemcpy ( url , urlLen ); sb->safePrintf("]]>\n"); } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"url\":\""); sb->jsonEncode ( url , urlLen ); sb->safePrintf("\",\n"); } if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t%li\n", (long)mr->m_hopcount); if ( si->m_format == FORMAT_JSON ) sb->safePrintf("\t\t\"hopCount\":%li,\n",(long)mr->m_hopcount); // now the last spidered date of the document time_t ts = mr->m_lastSpidered; if ( si->m_format == FORMAT_HTML ) printTimeAgo ( sb , ts 
, "indexed" , si ); // the date it was last modified ts = mr->m_lastModified; if ( si->m_format == FORMAT_HTML ) printTimeAgo ( sb , ts , "modified" , si ); // // more xml stuff // if ( si->m_format == FORMAT_XML ) { // doc size in Kilobytes sb->safePrintf ( "\t\t\n", (float)mr->m_contentLen/1024.0); sb->safePrintf ( "\t\t%li\n", mr->m_contentLen); // . docId for possible cached link // . might have merged a bunch together sb->safePrintf("\t\t%lli\n",mr->m_docId ); // . show the site root // . for hompages.com/users/fred/mypage.html this will be // homepages.com/users/fred/ // . for www.xyz.edu/~foo/burp/ this will be // www.xyz.edu/~foo/ etc. long siteLen = 0; char *site = NULL; // seems like this isn't the way to do it, cuz Tagdb.cpp // adds the "site" tag itself and we do not always have it // in the XmlDoc::ptr_tagRec... so do it this way: site = mr->ptr_site; siteLen = mr->size_site-1; //char *site=uu.getSite( &siteLen , si->m_coll, false, tagRec); sb->safePrintf("\t\t 0 ) sb->safeMemcpy ( site , siteLen ); sb->safePrintf("]]>\n"); //long sh = hash32 ( site , siteLen ); //sb->safePrintf ("\t\t%lu\n",sh); //long dh = uu.getDomainHash32 (); //sb->safePrintf ("\t\t%lu\n",dh); // spider date sb->safePrintf ( "\t\t%lu\n", mr->m_lastSpidered); // backwards compatibility for buzz sb->safePrintf ( "\t\t%lu" "\n", mr->m_firstIndexedDate); sb->safePrintf( "\t\t%lu" "\n", mr->m_contentHash32); // pub date long datedbDate = mr->m_datedbDate; // show the datedb date as "" for now if ( datedbDate != -1 ) sb->safePrintf ( "\t\t%lu\n", datedbDate); } if ( si->m_format == FORMAT_JSON ) { // doc size in Kilobytes sb->safePrintf ( "\t\t\"size\":\"%4.0fk\",\n", (float)mr->m_contentLen/1024.0); sb->safePrintf ( "\t\t\"sizeInBytes\":%li,\n", mr->m_contentLen); // . docId for possible cached link // . might have merged a bunch together sb->safePrintf("\t\t\"docId\":%lli,\n",mr->m_docId ); // . show the site root // . 
for hompages.com/users/fred/mypage.html this will be // homepages.com/users/fred/ // . for www.xyz.edu/~foo/burp/ this will be // www.xyz.edu/~foo/ etc. long siteLen = 0; char *site = NULL; // seems like this isn't the way to do it, cuz Tagdb.cpp // adds the "site" tag itself and we do not always have it // in the XmlDoc::ptr_tagRec... so do it this way: site = mr->ptr_site; siteLen = mr->size_site-1; //char *site=uu.getSite( &siteLen , si->m_coll, false, tagRec); sb->safePrintf("\t\t\"site\":\""); if ( site && siteLen > 0 ) sb->safeMemcpy ( site , siteLen ); sb->safePrintf("\",\n"); //long sh = hash32 ( site , siteLen ); //sb->safePrintf ("\t\t%lu\n",sh); //long dh = uu.getDomainHash32 (); //sb->safePrintf ("\t\t%lu\n",dh); // spider date sb->safePrintf ( "\t\t\"spidered\":%lu,\n", mr->m_lastSpidered); // backwards compatibility for buzz sb->safePrintf ( "\t\t\"firstIndexedDateUTC\":%lu,\n" , mr->m_firstIndexedDate); sb->safePrintf( "\t\t\"contentHash32\":%lu,\n" , mr->m_contentHash32); // pub date long datedbDate = mr->m_datedbDate; // show the datedb date as "" for now if ( datedbDate != -1 ) sb->safePrintf ( "\t\t\"pubdate\":%lu,\n", datedbDate); } // . we also store the outlinks in a linkInfo structure // . we can call LinkInfo::set ( Links *outlinks ) to set it // in the msg20 LinkInfo *outlinks = (LinkInfo *)mr->ptr_outlinks; // NULLify if empty if ( mr->size_outlinks <= 0 ) outlinks = NULL; // only for xml for now if ( si->m_format == FORMAT_HTML ) outlinks = NULL; Inlink *k; // do we need absScore2 for outlinks? 
//k = NULL; while ( outlinks && (k =outlinks->getNextInlink(k))) // print it out sb->safePrintf("\t\tm_docId , k->m_ip, // hostHash, but use ip for now (long)k->m_firstIndexedDate , (long)k->m_datedbDate ); if ( si->m_format == FORMAT_XML ) { // result sb->safePrintf("\t\t" "\n", getLanguageString(mr->m_language)); sb->safePrintf("\t\t%s\n", getLangAbbr(mr->m_language)); char *charset = get_charset_str(mr->m_charset); if(charset) sb->safePrintf("\t\t" "\n", charset); } if ( si->m_format == FORMAT_JSON ) { // result sb->safePrintf("\t\t\"language\":\"%s\",\n", getLanguageString(mr->m_language)); sb->safePrintf("\t\t\"langAbbr\":\"%s\",\n", getLangAbbr(mr->m_language)); char *charset = get_charset_str(mr->m_charset); if(charset) sb->safePrintf("\t\t\"charset\":\"%s\",\n",charset); } // // end more xml stuff // if ( isAdmin && si->m_format == FORMAT_HTML ) { long lang = mr->m_language; if ( lang ) sb->safePrintf(" - %s",getLanguageString(lang)); uint16_t cc = mr->m_computedCountry; if( cc ) sb->safePrintf(" - %s", g_countryCode.getName(cc)); char *charset = get_charset_str(mr->m_charset); if ( charset ) sb->safePrintf(" - %s ", charset); } if ( si->m_format == FORMAT_HTML ) sb->safePrintf("
\n"); //char *coll = si->m_cr->m_coll; // print the [cached] link? bool printCached = true; if ( mr->m_noArchive ) printCached = false; if ( isAdmin ) printCached = true; if ( mr->m_contentLen <= 0 ) printCached = false; if ( si->m_format != FORMAT_HTML ) printCached = false; // get collnum result is from //collnum_t collnum = si->m_cr->m_collnum; // if searching multiple collections - federated search CollectionRec *scr = g_collectiondb.getRec ( mr->m_collnum ); char *coll = "UNKNOWN"; if ( scr ) coll = scr->m_coll; if ( printCached && cr->m_clickNScrollEnabled ) sb->safePrintf ( " - " "cached", st->m_qe , coll , mr->m_docId ); else if ( printCached ) sb->safePrintf ( "" "cached", st->m_qe , // "qlang" parm si->m_defaultSortLang, coll , mr->m_docId ); // the new links if ( si->m_format == FORMAT_HTML && g_conf.m_isMattWells && 1 == 0 ) { //sb->safePrintf(" - scoring", // coll ); //sb->safePrintf(" - safePrintf(" - safePrintf(" - safePrintf("d=%lli",mr->m_docId); sb->safePrintf("u="); sb->urlEncode ( url , gbstrlen(url) , false ); //sb->safePrintf("&page=1\">seo" ); sb->safePrintf("\">seo" ); } // only display re-spider link if addurl is enabled //if ( isAdmin && // g_conf.m_addUrlEnabled && // cr->m_addUrlEnabled ) { /* if ( si->m_format == FORMAT_HTML ) { // the [respider] link // save this for seo iframe! 
sb->safePrintf (" - urlEncode ( url , urlLen ); // then collection if ( coll ) { sb->safeMemcpy ( "&c=" , 3 ); sb->safeMemcpy ( coll , gbstrlen(coll) ); } //sb->safePrintf ( "&force=1\">reindex" ); sb->safePrintf ( "\">reindex" ); } */ // unhide the divs on click long placeHolder = -1; long placeHolderLen = 0; if ( si->m_format == FORMAT_HTML && si->m_getDocIdScoringInfo ) { // place holder for backlink table link placeHolder = sb->length(); sb->safePrintf (" - " "00000 backlinks" "" , ix ); placeHolderLen = sb->length() - placeHolder; } if ( si->m_format == FORMAT_HTML && si->m_getDocIdScoringInfo ) { // unhide the scoring table on click sb->safePrintf (" - " "scoring" "" ,ix ); } if ( si->m_format == FORMAT_HTML ) { // reindex sb->safePrintf(" - respider",rand64); } if ( si->m_format == FORMAT_HTML ) { sb->safePrintf (" - " "urlEncode ( url , gbstrlen(url) , false ); sb->safePrintf ( "\">" "spider info" ); } // // show rainbow sections link // if ( si->m_format == FORMAT_HTML ) { sb->safePrintf ( " - " "sections", st->m_qe , // "qlang" parm si->m_defaultSortLang, coll , mr->m_docId ); } if ( si->m_format == FORMAT_HTML ) { sb->safePrintf ( " - " "page info", //st->m_qe , // "qlang" parm //si->m_defaultSortLang, coll , mr->m_docId ); } // this stuff is secret just for local guys! not any more if ( si->m_format == FORMAT_HTML ) { // now the ip of url //long urlip = msg40->getIp(i); // don't combine this with the sprintf above cuz // iptoa uses a static local buffer like ctime() sb->safePrintf(//"
" " - %s", coll,iptoa(mr->m_ip), iptoa(mr->m_ip) ); // ip domain link unsigned char *us = (unsigned char *)&mr->m_ip;//urlip; sb->safePrintf (" - %li.%li.%li", coll, (long)us[0],(long)us[1],(long)us[2], (long)us[0],(long)us[1],(long)us[2]); /* // . now the info link // . if it's local, don't put the hostname/port in // there cuz it will mess up Global Spec's machine //if ( h->m_groupId == g_hostdb.m_groupId ) sb.safePrintf(" - m_docId); // then the [info] link to show the TitleRec sb->safePrintf ( "\">[info]" ); // now the analyze link sb.safePrintf (" - m_hopcount); // encode the url now sb->urlEncode ( url , urlLen ); // then the [analyze] link sb->safePrintf ("\">[analyze]" ); // and links: query link sb->safePrintf( " - urlEncode ( url , urlLen ); sb->safeMemcpy ("\">linkers" , 14 ); */ } char dbuf [ MAX_URL_LEN ]; long dlen = uu.getDomainLen(); if ( si->m_format == FORMAT_HTML ) { memcpy ( dbuf , uu.getDomain() , dlen ); dbuf [ dlen ] = '\0'; // newspaperarchive urls have no domain if ( dlen == 0 ) { dlen = uu.getHostLen(); memcpy ( dbuf , uu.getHost() , dlen ); dbuf [ dlen ] = '\0'; } } // admin always gets the site: option so he can ban if ( si->m_format == FORMAT_HTML ) { sb->safePrintf (" - " " " "domain " , dbuf , coll );//, dbuf ); } if ( si->m_format == FORMAT_HTML && ( isAdmin || cr->m_isCustomCrawl)){ char *un = ""; long banVal = 1; if ( mr->m_isBanned ) { un = "UN"; banVal = 0; } sb->safePrintf("
" " " "%sBAN %s" " " , banVal , dbuf , coll , un , dbuf ); //banSites->safePrintf("%s+", dbuf); dlen = uu.getHostLen(); memcpy ( dbuf , uu.getHost() , dlen ); dbuf [ dlen ] = '\0'; sb->safePrintf(" - " " " "%sBAN %s " , banVal , dbuf , coll , un , dbuf ); // take similarity out until working again /* sb->safePrintf (" - [similar -" " " "tag " , (long)mr->m_tagVectorHash, coll); sb->safePrintf ("" "topic " , (long)mr->m_gigabitVectorHash, coll); */ if ( mr->size_gbAdIds > 0 ) sb->safePrintf ("" "Ad Id " , mr->ptr_gbAdIds, coll); //sb->safePrintf ("] "); /* put this on 'page info' long urlFilterNum = (long)mr->m_urlFilterNum; if(urlFilterNum != -1) { sb->safePrintf (" - " "UrlFilter:%li", coll , urlFilterNum); } */ } /* // print the help SafeBuf help; help.safePrintf("The distance matrix uses the " "following formula to calculate " "a score in a table cell for a pair of query terms: " "
" "" "SCORE = (%li - |pos1-pos2|) * " "locationWeight * " "densityWeight * " "synWeight1 * " "synWeight2 * " "spamWeight1 * " "spamWeight2 * " "tfWeight1 * " "tfWeight2" "" "
" "
" , (long)MAXWORDPOS+1 ); help.safePrintf("" "" "" "
pos1The word position of " "query term 1
pos2The word position of " "query term 2
" ); help.safePrintf( //"where
" //"locationWeight is based on where " //"the two terms occur in the document " //"and uses the following table:
" "" "" "" ); for ( long i = 0 ; i < HASHGROUP_END ; i++ ) { char *hs = getHashGroupString(i); float hw = s_hashGroupWeights[i]; help.safePrintf("" ,hs,hw ); } help.safePrintf("
term locationlocationWeight
%s%.0f
"); help.safePrintf("

"); help.safePrintf( "" "" "" "" "" ); for ( long i = 0 ; i < MAXDENSITYRANK ; i++ ) { help.safePrintf("" "" "" "" "" ,maxw,i,dweight ); } help.safePrintf("
max # alphanumeric words in locationdensityRankdensityWeight
%li%li%.0f
"); help.safePrintf("

" */ // end serp div if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) sb->safePrintf("

"); if ( si->m_format == FORMAT_HTML ) sb->safePrintf ( "

\n"); // search result spacer if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) sb->safePrintf("

", (long)SERP_SPACER); // inc it *numPrintedSoFar = *numPrintedSoFar + 1; // done? DocIdScore *dp = msg40->getScoreInfo(ix); if ( ! dp ) { if ( si->m_format == FORMAT_XML ) sb->safePrintf ("\t\n\n"); if ( si->m_format == FORMAT_JSON ) { // remove last ,\n sb->m_length -= 2; sb->safePrintf ("\n\t}\n\n"); } // wtf? //char *xx=NULL;*xx=0; // at least close up the table if ( si->m_format != FORMAT_HTML ) return true; sb->safePrintf("
\n"); return true; } // // scoring info tables // long nr = dp->m_numRequiredTerms; if ( nr == 1 ) nr = 0; // print breakout tables here for distance matrix //SafeBuf bt; // final score calc SafeBuf ft; // shortcut //Query *q = si->m_q; // put in a hidden div so you can unhide it if ( si->m_format == FORMAT_HTML ) sb->safePrintf(""); sb->safePrintf("
"); // print the breakout tables //if ( nr ) { // //sb->safePrintf("
"); // sb->safeMemcpy ( &bt ); //} // the singles --- TODO: make it ALL query terms //nr = dp->m_numRequiredTerms; //for ( long i = 0 ; i < nr && nr == 1 ; i++ ) { long lastTermNum = -1; long numSingles = dp->m_numSingles; // do not print this if we got pairs if ( dp->m_numPairs ) numSingles = 0; for ( long i = 0 ; i < numSingles ; i++ ) { float totalSingleScore = 0.0; // print all the top winners for this single SingleScore *fss = &dp->m_singleScores[i]; // if same combo as last time skip if ( fss->m_qtermNum == lastTermNum ) continue; // do not reprint for this query term num lastTermNum = fss->m_qtermNum; bool firstTime = true; // print all singles for this combo for ( long j = i ; j < dp->m_numSingles ; j++ ) { // get it SingleScore *ss = &dp->m_singleScores[j]; // stop if different single now if ( ss->m_qtermNum != fss->m_qtermNum ) break; // skip if 0. skip neighborhoods i guess if ( ss->m_finalScore == 0.0 ) continue; // first time? if ( firstTime && si->m_format == FORMAT_HTML ) { Query *q = &si->m_q; printSingleTerm ( sb , q , ss ); printScoresHeader ( sb ); firstTime = false; } // print it printSingleScore ( sb , si , ss , mr , msg40 ); // add up totalSingleScore += ss->m_finalScore; } if ( ft.length() ) ft.safePrintf(" , "); ft.safePrintf("%f",totalSingleScore); // min? if ( minScore < 0.0 || totalSingleScore < minScore ) minScore = totalSingleScore; // we need to set "ft" for xml stuff below if ( si->m_format != FORMAT_HTML ) continue; //sb->safePrintf(""); sb->safePrintf("" "" "", totalSingleScore); // close table from printScoresHeader if ( ! firstTime ) sb->safePrintf("
"); // print pair text //long qtn = fss->m_qtermNum; //sb->safeMemcpy(q->m_qterms[qtn].m_term , // q->m_qterms[qtn].m_termLen ); //sb->safePrintf("
%.04ftotal of above scores

"); } char *ff = ""; if ( si->m_useMinAlgo ) ff = "MIN "; char *ff2 = "sum"; if ( si->m_useMinAlgo ) ff2 = "min"; //if ( nr ) sb->safePrintf("
"); //sb->safePrintf("
"); // final score!!! if ( si->m_format == FORMAT_XML ) { sb->safePrintf ("\t\t%li\n", (long)dp->m_siteRank ); sb->safePrintf ("\t\t%li" "\n", (long)mr->m_siteNumInlinks ); sb->safePrintf ("\t\t%li" "\n", (long)mr->m_siteNumInlinksTotal ); sb->safePrintf ("\t\t%li" "\n", (long)mr->m_siteNumUniqueIps ); sb->safePrintf ("\t\t%li" "\n", (long)mr->m_siteNumUniqueCBlocks ); struct tm *timeStruct3 = gmtime(&mr->m_pageInlinksLastUpdated); char tmp3[64]; strftime ( tmp3 , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct3 ); // -1 means unknown if ( mr->m_pageNumInlinks >= 0 ) // how many inlinks, external and internal, we have // to this page not filtered in any way!!! sb->safePrintf("\t\t%li" "\n" ,mr->m_pageNumInlinks ); // how many inlinking ips we got, including our own if // we link to ourself sb->safePrintf("\t\t%li" "\n" ,mr->m_pageNumUniqueIps ); // how many inlinking cblocks we got, including our own if // we link to ourself sb->safePrintf("\t\t%li" "\n" ,mr->m_pageNumUniqueCBlocks ); // how many "good" inlinks. i.e. inlinks whose linktext we // count and index. 
sb->safePrintf("\t\t%li" "\n" "\t\t%lu" "\n" ,mr->m_pageNumGoodInlinks ,mr->m_pageInlinksLastUpdated ); float score = msg40->getScore (ix); sb->safePrintf("\t\t%f\n", score ); sb->safePrintf ("\t\t" "" "\n" , SITERANKDIVISOR , SAMELANGMULT , ff2 ); sb->safePrintf ("\t\t" "%.03f = (%li/%.01f+1) " // * %s(" , dp->m_finalScore , (long)dp->m_siteRank , SITERANKDIVISOR //, ff ); // then language weight if ( si->m_queryLangId == 0 || mr->m_language == 0 || si->m_queryLangId == mr->m_language ) sb->safePrintf(" * %.01f", SAMELANGMULT);//FOREIGNLANGDIVISOR); // the actual min then sb->safePrintf(" * %.03f",minScore); // no longer list all the scores //sb->safeMemcpy ( &ft ); sb->safePrintf(//")" "]]>" "\n"); sb->safePrintf ("\t\n\n"); return true; } if ( si->m_format != FORMAT_HTML ) return true; char *cc = getCountryCode ( mr->m_country ); if ( mr->m_country == 0 ) cc = "Unknown"; sb->safePrintf("" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "
" "final score
" "
docId%lli
site%s
hopcount%li
language%s
country%s
siteRank%li
" , dp->m_docId , mr->ptr_site , (long)mr->m_hopcount //, getLanguageString(mr->m_summaryLanguage) , getLanguageString(mr->m_language) // use page language , cc , (long)dp->m_siteRank ); // list all final scores starting with pairs sb->safePrintf("%f = " "(%li/%.01f+1)" , dp->m_finalScore , (long)dp->m_siteRank , SITERANKDIVISOR ); // if lang is different if ( si->m_queryLangId == 0 || mr->m_language == 0 || si->m_queryLangId == mr->m_language ) sb->safePrintf(" * %.01f", SAMELANGMULT);//FOREIGNLANGDIVISOR); // list all final scores starting with pairs sb->safePrintf(" * %s(" , ff ); sb->safeMemcpy ( &ft ); sb->safePrintf(")

"); // put in a hidden div so you can unhide it sb->safePrintf("\n"); // result is in a table so we can put the result # in its own column sb->safePrintf("
"); /* // UN-indent it if level is 1 if ( si->m_format == FORMAT_HTML && si->m_doIpClustering ) { sb->safePrintf (" - [ " "More from this ip ]", iptoa ( mr->m_ip ) , st->m_qe , coll ); if ( indent ) sb->safePrintf ( "

\n"); else sb->safePrintf ( "

\n"); } else if ( si->m_format == FORMAT_HTML && si->m_doSiteClustering ) { char hbuf [ MAX_URL_LEN ]; long hlen = uu.getHostLen(); memcpy ( hbuf , uu.getHost() , hlen ); hbuf [ hlen ] = '\0'; sb->safePrintf (" - " "More from this site", hbuf , st->m_qe , coll ); if ( indent ) sb->safePrintf ( "
\n"); else sb->safePrintf ( "

\n"); } */ // space out 0000 backlinks char *p = sb->getBufStart() + placeHolder; long plen = placeHolderLen; if ( numInlinks == 0 ) memset ( p , ' ' , plen ); if ( numInlinks > 0 && numInlinks < 99999 ) { char *ss = strstr ( p, "00000" ); if ( ss ) { char c = ss[5]; sprintf(ss,"%5li",numInlinks); ss[5] = c; } } // print "1 backlink" not "1 backlinks" if ( numInlinks == 1 ) { char *xx = strstr(p,"backlinks"); if ( xx ) xx[8] = ' '; } return true; } bool printPairScore ( SafeBuf *sb , SearchInput *si , PairScore *ps , Msg20Reply *mr , Msg40 *msg40 , bool first ) { // shortcut Query *q = &si->m_q; //SafeBuf ft; // store in final score calc //if ( ft.length() ) ft.safePrintf(" + "); //ft.safePrintf("%f",ps->m_finalScore); long qtn1 = ps->m_qtermNum1; long qtn2 = ps->m_qtermNum2; /* unsigned char drl1 = ps->m_diversityRankLeft1; unsigned char drl2 = ps->m_diversityRankLeft2; float dvwl1 = getDiversityWeight(dr1); float dvwl2 = getDiversityWeight(dr2); unsigned char drr1 = ps->m_diversityRankRight1; unsigned char drr2 = ps->m_diversityRankRight2; float dvwr1 = getDiversityWeight(dr1); float dvwr2 = getDiversityWeight(dr2); */ unsigned char de1 = ps->m_densityRank1; unsigned char de2 = ps->m_densityRank2; float dnw1 = getDensityWeight(de1); float dnw2 = getDensityWeight(de2); long hg1 = ps->m_hashGroup1; long hg2 = ps->m_hashGroup2; float hgw1 = getHashGroupWeight(hg1); float hgw2 = getHashGroupWeight(hg2); long wp1 = ps->m_wordPos1; long wp2 = ps->m_wordPos2; unsigned char wr1 = ps->m_wordSpamRank1; float wsw1 = getWordSpamWeight(wr1); unsigned char wr2 = ps->m_wordSpamRank2; float wsw2 = getWordSpamWeight(wr2); // HACK for inlink text! 
if ( hg1 == HASHGROUP_INLINKTEXT ) wsw1 = getLinkerWeight(wr1); if ( hg2 == HASHGROUP_INLINKTEXT ) wsw2 = getLinkerWeight(wr2); char *syn1 = "no"; char *syn2 = "no"; float sw1 = 1.0; float sw2 = 1.0; if ( ps->m_isSynonym1 ) { syn1 = "yes"; sw1 = SYNONYM_WEIGHT; } if ( ps->m_isSynonym2 ) { syn2 = "yes"; sw2 = SYNONYM_WEIGHT; } //char bf1 = ps->m_bflags1; //char bf2 = ps->m_bflags2; char *bs1 = "no"; char *bs2 = "no"; //if ( bf1 & BF_HALFSTOPWIKIBIGRAM ) bs1 = "yes"; //if ( bf2 & BF_HALFSTOPWIKIBIGRAM ) bs2 = "yes"; if ( ps->m_isHalfStopWikiBigram1 ) bs1 = "yes"; if ( ps->m_isHalfStopWikiBigram2 ) bs2 = "yes"; float wbw1 = 1.0; float wbw2 = 1.0; if ( ps->m_isHalfStopWikiBigram1 ) wbw1 = WIKI_BIGRAM_WEIGHT; if ( ps->m_isHalfStopWikiBigram2 ) wbw2 = WIKI_BIGRAM_WEIGHT; //long long sz1 = ps->m_listSize1; //long long sz2 = ps->m_listSize2; //long long tf1 = ps->m_termFreq1;//sz1 / 10; //long long tf2 = ps->m_termFreq2;//sz2 / 10; long long tf1 = msg40->m_msg3a.m_termFreqs[qtn1]; long long tf2 = msg40->m_msg3a.m_termFreqs[qtn2]; float tfw1 = ps->m_tfWeight1; float tfw2 = ps->m_tfWeight2; char *wp = "no"; float wiw = 1.0; if ( ps->m_inSameWikiPhrase ) { wp = "yes"; wiw = WIKI_WEIGHT; // 0.50; } long a = ps->m_wordPos2; long b = ps->m_wordPos1; char *es = ""; char *bes = ""; if ( a < b ) { a = ps->m_wordPos1; b = ps->m_wordPos2; // out of query order penalty! 
es = "+ 1.0"; bes = "+ 1.0"; } if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t\n"); /* sb->safePrintf("\t\t\t%li" "\n", (long)drl1); sb->safePrintf("\t\t\t%li" "\n", (long)drr1); sb->safePrintf("\t\t\t%f" "\n", dvwl1); sb->safePrintf("\t\t\t%f" "\n", dvwr1); sb->safePrintf("\t\t\t%li" "\n", (long)drl2); sb->safePrintf("\t\t\t%li" "\n", (long)drr2); sb->safePrintf("\t\t\t%f" "\n", dvwl2); sb->safePrintf("\t\t\t%f" "\n", dvwr2); */ sb->safePrintf("\t\t\t%li" "\n", (long)de1); sb->safePrintf("\t\t\t%li" "\n", (long)de2); sb->safePrintf("\t\t\t%f" "\n", dnw1); sb->safePrintf("\t\t\t%f" "\n", dnw2); sb->safePrintf("\t\t\tsafeMemcpy ( q->m_qterms[qtn1].m_term , q->m_qterms[qtn1].m_termLen ); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\tsafeMemcpy ( q->m_qterms[qtn2].m_term , q->m_qterms[qtn2].m_termLen ); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\t" "\n", getHashGroupString(hg1)); sb->safePrintf("\t\t\t" "\n", getHashGroupString(hg2)); sb->safePrintf("\t\t\t%.01f" "\n", hgw1 ); sb->safePrintf("\t\t\t%.01f" "\n", hgw2 ); sb->safePrintf("\t\t\t%li" "\n", wp1 ); sb->safePrintf("\t\t\t%li" "\n", wp2 ); //long wordDist = wp2 - wp1; //if ( wordDist < 0 ) wordDist *= -1; //sb->safePrintf("\t\t\t%li" // "\n",wdist); sb->safePrintf("\t\t\t" "" "\n", syn1); sb->safePrintf("\t\t\t" "" "\n", syn2); sb->safePrintf("\t\t\t%.01f" "\n", sw1); sb->safePrintf("\t\t\t%.01f" "\n", sw2); // word spam / link text weight char *r1 = "wordSpamRank1"; char *r2 = "wordSpamRank2"; char *t1 = "wordSpamWeight1"; char *t2 = "wordSpamWeight2"; if ( hg1 == HASHGROUP_INLINKTEXT ) { r1 = "inlinkSiteRank1"; t1 = "inlinkTextWeight1"; } if ( hg2 == HASHGROUP_INLINKTEXT ) { r2 = "inlinkSiteRank2"; t2 = "inlinkTextWeight2"; } sb->safePrintf("\t\t\t<%s>%li\n", r1,(long)wr1,r1); sb->safePrintf("\t\t\t<%s>%li\n", r2,(long)wr2,r2); sb->safePrintf("\t\t\t<%s>%.02f\n", t1,wsw1,t1); sb->safePrintf("\t\t\t<%s>%.02f\n", t2,wsw2,t2); // if offsite inlink text show the inlinkid for matching // to an 
LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks; Inlink *k = info->getNextInlink(NULL); for (;k&&hg1==HASHGROUP_INLINKTEXT ; k=info->getNextInlink(k)){ if ( ! k->ptr_linkText ) continue; if ( k->m_wordPosStart > wp1 ) continue; if ( k->m_wordPosStart + 50 < wp1 ) continue; // got it. we HACKED this to put the id // in k->m_siteHash sb->safePrintf("\t\t\t%li" "\n", k->m_siteHash); } k = info->getNextInlink(NULL); for (;k&&hg2==HASHGROUP_INLINKTEXT ; k=info->getNextInlink(k)){ if ( ! k->ptr_linkText ) continue; if ( k->m_wordPosStart > wp2 ) continue; if ( k->m_wordPosStart + 50 < wp2 ) continue; // got it. we HACKED this to put the id // in k->m_siteHash sb->safePrintf("\t\t\t%li" "\n", k->m_siteHash); } // term freq sb->safePrintf("\t\t\t%lli" "\n",tf1); sb->safePrintf("\t\t\t%lli" "\n",tf2); sb->safePrintf("\t\t\t%f" "\n",tfw1); sb->safePrintf("\t\t\t%f" "\n",tfw2); sb->safePrintf("\t\t\t" "%li\n", (long)(ps->m_isHalfStopWikiBigram1)); sb->safePrintf("\t\t\t" "%li\n", (long)(ps->m_isHalfStopWikiBigram2)); sb->safePrintf("\t\t\t%.01f" "\n", wbw1); sb->safePrintf("\t\t\t%.01f" "\n", wbw2); sb->safePrintf("\t\t\t" "" "\n", wp); sb->safePrintf("\t\t\t" "%li" "\n", ps->m_qdist ); sb->safePrintf("\t\t\t" "%.01f" "\n", wiw ); sb->safePrintf("\t\t\t%f\n", ps->m_finalScore); sb->safePrintf("\t\t\t" "" "\n" , t1 , t2 ); sb->safePrintf("\t\t\t" "%.1f
"//hashgroupweight "*" "%.1f"//hashgroupweight "*" "%.1f" // syn weight "*" "%.1f" // syn weight "*" "%.1f"//wikibigramweight "*" "%.1f"//wikibigramweight "*" "%.02f"//density weight "*" "%.02f"//density weight "*" "%.02f" // wordspam weight "*" "%.02f" // wordspam weight "*" "%.02f"//tf weight "*" "%.02f"//tf weight , ps->m_finalScore , hgw1 , hgw2 , sw1 , sw2 , wbw1 , wbw2 , dnw1 , dnw2 , wsw1 , wsw2 , tfw1 , tfw2 ); if ( ps->m_fixedDistance ) sb->safePrintf( "/%li " , (long)FIXED_DISTANCE ); else sb->safePrintf( "/" "(((%li" "-%li" ")-%li)+1.0%s)" , a,b,ps->m_qdist,bes); // wikipedia weight if ( wiw != 1.0 ) sb->safePrintf("*%.01f", wiw ); sb->safePrintf("]]>" "\n" ); sb->safePrintf("\t\t\n"); return true; // continue; } // print out the entire details i guess //sb->safePrintf("
%.02f
" // ""); // then print the details just like the // single term table below //sb->safePrintf("" // "" // "" // "" // "" // "" // //"" // "" // "" // "" // "" // "" // "" // ); // // print first term in first row // sb->safePrintf("",ps->m_finalScore); //sb->safeMemcpy ( q->m_qterms[qtn1].m_term , // q->m_qterms[qtn1].m_termLen ); //sb->safePrintf(""); sb->safePrintf("" , getHashGroupString(hg1) , hgw1 ); // the word position sb->safePrintf("" "" ,mr->m_docId ,(long)ps->m_wordPos1 ,si->m_cr->m_coll ,(long)ps->m_wordPos1); // is synonym? //if ( sw1 != 1.00 ) sb->safePrintf("",syn1,sw1); //else // sb->safePrintf(""); // wikibigram?/weight //if ( wbw1 != 1.0 ) sb->safePrintf("",bs1,wbw1); //else // sb->safePrintf(""); // diversity - // not needed for term pair algo //sb->safePrintf("", // (long)dr1,dvw1); // density sb->safePrintf("", (long)de1,dnw1); // word spam if ( hg1 == HASHGROUP_INLINKTEXT ) { sb->safePrintf(""); sb->safePrintf("", (long)wr1,wsw1); } else { sb->safePrintf(""); sb->safePrintf(""); } // term freq sb->safePrintf("", tf1,tfw1); // insamewikiphrase? sb->safePrintf("", wp,ps->m_qdist,wiw); // end the row sb->safePrintf(""); // // print 2nd term in 2nd row // sb->safePrintf(""); sb->safePrintf(//"" , getHashGroupString(hg2) , hgw2 ); // the word position sb->safePrintf("" "" ,mr->m_docId ,(long)ps->m_wordPos2 ,si->m_cr->m_coll ,(long)ps->m_wordPos2); // is synonym? //if ( sw2 != 1.00 ) sb->safePrintf("",syn2,sw2); //else // sb->safePrintf(""); // wikibigram?/weight //if ( wbw2 != 1.0 ) sb->safePrintf("",bs2,wbw2); //else // sb->safePrintf(""); // diversity //sb->safePrintf("", // (long)dr2,dvw2); // density sb->safePrintf("", (long)de2,dnw2); // word spam if ( hg2 == HASHGROUP_INLINKTEXT ) { sb->safePrintf(""); sb->safePrintf("", (long)wr2,wsw2); } else { sb->safePrintf(""); sb->safePrintf(""); } // term freq sb->safePrintf("", tf2,tfw2); // insamewikiphrase? 
sb->safePrintf("", wp,ps->m_qdist,wiw); // end the row sb->safePrintf(""); sb->safePrintf("" //"
" // "
"); //if ( q->m_qterms[qtn1].m_isPhrase ) // sb->pushChar('\"'); //sb->safeMemcpy ( q->m_qterms[qtn1].m_term , // q->m_qterms[qtn1].m_termLen ); //if ( q->m_qterms[qtn1].m_isPhrase ) // sb->pushChar('\"'); //sb->safePrintf(" vs "); //if ( q->m_qterms[qtn2].m_isPhrase ) // sb->pushChar('\"'); //sb->safeMemcpy ( q->m_qterms[qtn2].m_term , // q->m_qterms[qtn2].m_termLen ); //if ( q->m_qterms[qtn2].m_isPhrase ) // sb->pushChar('\"'); //sb->safePrintf("
termlocationwordPossynonymwikibigramdiversityRank/weightdensityRankwordSpamRankinlinkSiteRanktermFreqinWikiPhrase/qdist
"); sb->safePrintf("" ); sb->safePrintf("%.04f" "%s " "%.01f"); //"safePrintf("safePrintf("safePrintf("%lli" "&page=4" //"&page=sections&" "&hipos=%li" "&c=%s#hipos\">" "%li%s %.02f" " %s %.02f" " %li/" // "%f%li " "%.02f %li " "%.02f%li", (long)wr1); //if ( wsw1 != 1.0 ) sb->safePrintf( " " "%.02f", wsw1); sb->safePrintf(" %lli " "%.02f%s %li/%.01f
"); //sb->safeMemcpy ( q->m_qterms[qtn2].m_term , // q->m_qterms[qtn2].m_termLen ); //sb->safePrintf("" "%s " "%.01f"); //"safePrintf("safePrintf("safePrintf("%lli" "&page=4&" "hipos=%li&c=%s#hipos\">" "%li%s %.02f" " %s %.02f" " %li/" // "%f%li " "%.02f %li " "%.02f%li", (long)wr2); //if ( wsw2 != 1.0 ) sb->safePrintf( " " "%.02f", wsw2); sb->safePrintf(" %lli " "%.02f%s/%li %.01f
safePrintf("id=poo%li ",s_count); } sb->safePrintf("colspan=50>" // style=\"display:none\">" "%.03f " "= " //" ( " "100*" "%.1f" "" "*" "%.1f" "" "*" //"(%li - " , ps->m_finalScore //, idstr , hgw1 , hgw2 //, (long)MAXWORDPOS+1 ); sb->safePrintf("%.1f" "*" " %.1f" "*" // wiki bigram weight "%.02f" "*" "%.02f" "*" "%.02f" "*" "%.02f" "*" "%.02f" "*" " %.02f" "*" "%.02f" "*" "%.02f" , sw1 , sw2 , wbw1 , wbw2 , dnw1 , dnw2 , wsw1 , wsw2 , tfw1 , tfw2 ); if ( ps->m_fixedDistance ) sb->safePrintf( "/%li " , (long)FIXED_DISTANCE ); else sb->safePrintf( "/" "(((%li" "-%li)-" "%li) + 1.0%s)" , a,b,ps->m_qdist,bes); // wikipedia weight if ( wiw != 1.0 ) sb->safePrintf("*%.01f", wiw ); sb->safePrintf( // end formula "
" //"
"); ); return true; } bool printSingleTerm ( SafeBuf *sb , Query *q , SingleScore *ss ) { long qtn = ss->m_qtermNum; sb->safePrintf(""); sb->safePrintf(""); return true; } bool printTermPairs ( SafeBuf *sb , Query *q , PairScore *ps ) { // print pair text long qtn1 = ps->m_qtermNum1; long qtn2 = ps->m_qtermNum2; sb->safePrintf("
"); // link to rainbow page //sb->safePrintf("urlEncode( mr->ptr_ubuf ); //sb->safePrintf("&page=4&recycle=1&c=%s\">",coll); if ( q->m_qterms[qtn].m_isPhrase ) sb->pushChar('\"'); sb->safeMemcpy ( q->m_qterms[qtn].m_term , q->m_qterms[qtn].m_termLen ); if ( q->m_qterms[qtn].m_isPhrase ) sb->pushChar('\"'); //sb->safePrintf(""); sb->safePrintf("
" "" "" "" "" "" "" //"" "" "" "" // nlinkSiteRank" "" "" ); return true; } bool printSingleScore ( SafeBuf *sb , SearchInput *si , SingleScore *ss , Msg20Reply *mr , Msg40 *msg40 ) { // shortcut Query *q = &si->m_q; //SafeBuf ft; // store in final score calc //if ( ft.length() ) ft.safePrintf(" + "); //ft.safePrintf("%f",ss->m_finalScore); char *syn = "no"; float sw = 1.0; if ( ss->m_isSynonym ) { syn = "yes"; sw = SYNONYM_WEIGHT; // Posdb.h } //char bf = ss->m_bflags; float wbw = 1.0; char *bs = "no"; if ( ss->m_isHalfStopWikiBigram ) { bs = "yes"; wbw = WIKI_BIGRAM_WEIGHT; } float hgw = getHashGroupWeight(ss->m_hashGroup); //float dvw = getDiversityWeight(ss->m_diversityRank); float dnw = getDensityWeight(ss->m_densityRank); float wsw = getWordSpamWeight(ss->m_wordSpamRank); // HACK for inlink text! if ( ss->m_hashGroup == HASHGROUP_INLINKTEXT ) wsw = getLinkerWeight(ss->m_wordSpamRank); //long long tf = ss->m_termFreq;//ss->m_listSize; long qtn = ss->m_qtermNum; long long tf = msg40->m_msg3a.m_termFreqs[qtn]; float tfw = ss->m_tfWeight; if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t\n"); /* sb->safePrintf("\t\t\t%li" "\n", (long)ss->m_diversityRank); sb->safePrintf("\t\t\t%f" "\n", dvw); */ sb->safePrintf("\t\t\t%li" "\n", (long)ss->m_densityRank); sb->safePrintf("\t\t\t%f" "\n", dnw); sb->safePrintf("\t\t\tsafeMemcpy ( q->m_qterms[qtn].m_term , q->m_qterms[qtn].m_termLen ); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\t" "\n", getHashGroupString(ss->m_hashGroup)); sb->safePrintf("\t\t\t%.01f" "\n", hgw ); sb->safePrintf("\t\t\t%li" "\n", (long)ss->m_wordPos ); sb->safePrintf("\t\t\t" "" "\n", syn); sb->safePrintf("\t\t\t%.01f" "\n", sw); sb->safePrintf("\t\t\t%li" "\n", (long)(ss->m_isHalfStopWikiBigram) ); sb->safePrintf("\t\t\t%.01f" "\n", (float)WIKI_BIGRAM_WEIGHT); // word spam if ( ss->m_hashGroup == HASHGROUP_INLINKTEXT ) { sb->safePrintf("\t\t\t%li" "\n", (long)ss->m_wordSpamRank); sb->safePrintf("\t\t\t%.02f" "\n", wsw); } else { 
sb->safePrintf("\t\t\t%li" "\n", (long)ss->m_wordSpamRank); sb->safePrintf("\t\t\t%.02f" "\n", wsw); } // if offsite inlink text show the inlinkid for matching // to an LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks; Inlink *k = info->getNextInlink(NULL); for ( ; k && ss->m_hashGroup==HASHGROUP_INLINKTEXT ; k=info->getNextInlink(k)){ if ( ! k->ptr_linkText ) continue; if ( k->m_wordPosStart > ss->m_wordPos ) continue; if ( k->m_wordPosStart + 50 < ss->m_wordPos ) continue; // got it. we HACKED this to put the id // in k->m_siteHash sb->safePrintf("\t\t\t%li" "\n", k->m_siteHash); } // term freq sb->safePrintf("\t\t\t%lli" "\n",tf); sb->safePrintf("\t\t\t%f" "\n",tfw); sb->safePrintf("\t\t\t%f\n", ss->m_finalScore); sb->safePrintf("\t\t\t" "" "\n" ); sb->safePrintf("\t\t\t" "" "\n" , ss->m_finalScore , hgw , hgw , sw , sw , wbw , wbw , dnw , dnw , wsw , wsw , tfw , tfw ); sb->safePrintf("\t\t\n"); return true; } sb->safePrintf("" "" "" // wordpos "" "" // syn // wikibigram?/weight "" //"" // diversity "" // density , (long)ss->m_wordPos , syn , sw // synonym weight , bs , wbw //, (long)ss->m_diversityRank //, dvw , (long)ss->m_densityRank , dnw ); if ( ss->m_hashGroup == HASHGROUP_INLINKTEXT ) { sb->safePrintf("" "" // wordspam , (long)ss->m_wordSpamRank , wsw ); } else { sb->safePrintf("" // wordspam "" , (long)ss->m_wordSpamRank , wsw ); } sb->safePrintf("" // termfreq "" , tf , tfw ); // last row is the computation of score sb->safePrintf("" , ss->m_finalScore //, (long)MAXWORDPOS+1 , hgw , hgw , sw , sw , wbw , wbw //, dvw //, dvw , dnw , dnw , wsw , wsw , tfw , tfw ); //sb->safePrintf("
"); if ( q->m_qterms[qtn1].m_isPhrase ) sb->pushChar('\"'); sb->safeMemcpy ( q->m_qterms[qtn1].m_term , q->m_qterms[qtn1].m_termLen ); if ( q->m_qterms[qtn1].m_isPhrase ) sb->pushChar('\"'); sb->safePrintf(" vs "); if ( q->m_qterms[qtn2].m_isPhrase ) sb->pushChar('\"'); sb->safeMemcpy ( q->m_qterms[qtn2].m_term , q->m_qterms[qtn2].m_termLen ); if ( q->m_qterms[qtn2].m_isPhrase ) sb->pushChar('\"'); return true; } bool printScoresHeader ( SafeBuf *sb ) { sb->safePrintf("
scorelocationwordPossynonymwikibigramdiversityRankdensityspaminlnkPRtermFreq
%.03f%s %.1f" "" "m_finalScore , getHashGroupString(ss->m_hashGroup) , hgw ); //sb->urlEncode( mr->ptr_ubuf ); sb->safePrintf("%lli",mr->m_docId ); sb->safePrintf("&page=4&" "hipos=%li&c=%s#hipos\">" ,(long)ss->m_wordPos ,si->m_cr->m_coll); sb->safePrintf("%li%s %.1f" "%s %.02f%li/%f" //"%li " "%.02f %li %.02f" "%li %.02f" " %lli " "%.02f
" "%.03f " " = " //" %li * " "100 * " " %.1f" " * " " %.1f" " * " " %.1f" " * " " %.1f" " * " " %.02f"//wikibigramwght " * " " %.02f" " * " "%.02f" " * " "%.02f" " * " "%.02f" " * " "%.02f" " * " "%.02f" " * " "%.02f" //" / ( 3.0 )" // end formula "
" // "
"); return true; } //////// // // . print the directory subtopics // . show these when we are in a directory topic browsing dmoz // . just a list of all the topics/categories // //////// bool printDMOZSubTopics ( SafeBuf *sb, long catId, bool inXml ) { if ( catId <= 0 ) return true; long currType; bool first; bool nextColumn; long maxPerColumn; long currInColumn; long currIndex; char *prefixp; long prefixLen; char *catName; long catNameLen; char encodedName[2048]; //SearchInput *si = &st->m_si; bool isRTL = g_categories->isIdRTL ( catId ); SafeBuf subCatBuf; // stores a list of SubCategories into "subCatBuf" long numSubCats = g_categories->generateSubCats ( catId , &subCatBuf ); // . get the subcategories for a given categoriy // . msg2b::gernerateDirectory() was launched in Msg40.cpp //long numSubCats = st->m_msg40.m_msg2b.m_numSubCats; //SubCategory *subCats = st->m_msg40.m_msg2b.m_subCats; //char *catBuffer = st->m_msg40.m_msg2b.m_catBuffer; //bool showAdultOnTop = st->m_si.m_cr->m_showAdultCategoryOnTop; // just print
if no sub categories if (inXml) { sb->safePrintf ( "\t\n" "\t\t%li\n" "\t\tprintPathFromId ( sb, catId, // st->m_si.m_cat_dirId, true ); sb->safePrintf ( "]]>\n"); sb->safePrintf ( "\t\t%li\n", (long)isRTL); } char *p = subCatBuf.getBufStart(); char *pend = subCatBuf.getBuf(); SubCategory *ptrs[MAX_SUB_CATS]; long count = 0; if (numSubCats <= 0) goto dirEnd; // print out the cats currType = 0; // first make ptrs to them for ( ; p < pend ; ) { SubCategory *cat = (SubCategory *)p; ptrs[count++] = cat; p += cat->getRecSize(); // do not breach if ( count >= MAX_SUB_CATS ) break; } for (long i = 0; i < count ; i++ ) { SubCategory *cat = ptrs[i]; first = false; catName = cat->getName();//&catBuffer[subCats[i].m_nameOffset]; catNameLen = cat->m_nameLen;//subCats[i].m_nameLen; // this is the last topic in the dmoz dir path // so if the dmoz topic is Top/Arts/Directories then // the prefixp is "Directories" prefixp = cat->getPrefix();//&catBuffer[subCats[i].m_prefixOffset]; prefixLen = cat->m_prefixLen;//subCats[i].m_prefixLen; // skip bad categories currIndex=g_categories->getIndexFromPath(catName,catNameLen); if (currIndex < 0) continue; // skip top adult category if we're supposed to /* if ( !inXml && st->m_si.m_catId == 1 && si->m_familyFilter && g_categories->isIndexAdultStart ( currIndex ) ) continue; */ // check for room //if (p + subCats[i].m_prefixLen*2 + // subCats[i].m_nameLen*2 + // 512 > pend){ // goto diroverflow; //} // print simple xml tag for inXml if (inXml) { switch ( cat->m_type ) { case SUBCAT_LETTERBAR: sb->safePrintf ( "\t\tsafePrintf ( "]]>" ); sb->safePrintf ( "%li", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_NARROW2: sb->safePrintf ( "\t\tutf8Encode2 ( catName, catNameLen ); sb->safePrintf ( "]]>"); sb->safePrintf ( "%li", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_NARROW1: sb->safePrintf ( "\t\tutf8Encode2 ( catName, catNameLen ); sb->safePrintf ( 
"]]>" ); sb->safePrintf ( "%li", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_NARROW: sb->safePrintf ( "\t\tutf8Encode2 ( catName, catNameLen ); sb->safePrintf ( "]]>" ); sb->safePrintf ( "%li", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_SYMBOLIC2: sb->safePrintf ( "\t\tutf8Encode2 ( prefixp, prefixLen ); sb->safePrintf ( ":" ); sb->utf8Encode2 ( catName, catNameLen ); sb->safePrintf ( "]]>" ); sb->safePrintf ( "%li", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_SYMBOLIC1: sb->safePrintf ( "\t\tutf8Encode2 ( prefixp, prefixLen ); sb->safePrintf ( ":" ); sb->utf8Encode2 ( catName, catNameLen ); sb->safePrintf ( "]]>" ); sb->safePrintf ( "%li", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_SYMBOLIC: sb->safePrintf ( "\t\tutf8Encode2 ( prefixp, prefixLen ); sb->safePrintf ( ":" ); sb->utf8Encode2 ( catName, catNameLen ); sb->safePrintf ( "]]>" ); sb->safePrintf ( "%li", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_RELATED: sb->safePrintf ( "\t\tutf8Encode2 ( catName, catNameLen ); sb->safePrintf ( "]]>" ); sb->safePrintf ( "%li", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_ALTLANG: sb->safePrintf ( "\t\tutf8Encode2 ( prefixp, prefixLen ); sb->safePrintf ( ":" ); sb->utf8Encode2 ( catName, catNameLen ); sb->safePrintf ( "]]>" ); sb->safePrintf ( "%li", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n"); break; } continue; } // print type header if ( cat->m_type - currType >= 10) { // end the last type if (currType == SUBCAT_LETTERBAR) sb->safePrintf(" ]\n"); else if (currType != 0) sb->safePrintf ( "\n
\n" ); // start the new type switch (cat->m_type) { case SUBCAT_LETTERBAR: sb->safePrintf ( "" "
[ " ); break; case SUBCAT_NARROW2: case SUBCAT_SYMBOLIC2: case SUBCAT_NARROW1: case SUBCAT_SYMBOLIC1: case SUBCAT_NARROW: case SUBCAT_SYMBOLIC: sb->safePrintf("
\n"); break; case SUBCAT_RELATED: if (currType == 0 || currType == SUBCAT_LETTERBAR) sb->safePrintf("
"); else sb->safePrintf("
"); if (isRTL) sb->safePrintf(""); sb->safePrintf ( "Related Categories:" "" ); if (isRTL) sb->safePrintf(""); break; case SUBCAT_ALTLANG: if (currType == 0 || currType == SUBCAT_LETTERBAR) sb->safePrintf("
"); else sb->safePrintf("
"); if (isRTL) sb->safePrintf(""); sb->safePrintf ( "This category in other" " languages:"); if (isRTL) sb->safePrintf(""); break; } currType = ( cat->m_type/10)*10; first = true; nextColumn = false; currInColumn = 0; if (currType == SUBCAT_LETTERBAR || currType == SUBCAT_RELATED) maxPerColumn = 999; else { // . check how many columns we'll use for this // type long numInType = 1; for (long j = i+1; j < numSubCats; j++) { if ( ptrs[j]->m_type - currType >= 10) break; numInType++; } // column for every 5, up to 3 columns long numColumns = numInType/5; if ( numInType%5 > 0 ) numColumns++; if ( currType == SUBCAT_ALTLANG && numColumns > 4) numColumns = 4; else if (numColumns > 3) numColumns = 3; // max number of links per column maxPerColumn = numInType/numColumns; if (numInType%numColumns > 0) maxPerColumn++; } } // start the sub cat if (first) { if (currType != SUBCAT_LETTERBAR) sb->safePrintf ( "" "
" "
    " "\n
  • "); } // check for the next column else if (nextColumn) { sb->safePrintf ( "\n
" "
\n"); } dirEnd: if (inXml) sb->safePrintf("\t\n"); else { sb->safePrintf(""); sb->safePrintf("
\n");//
\n"); } return true; }

// . print the dmoz "bread crumb" (the Top/Arts/... path) for category
//   "catId" into "sb"
// . "xml" selects the xml feed format, otherwise the html page format
// . always returns true; an invalid catId (<= 0) just prints nothing
// . NOTE(review): the html/xml tag content of many string literals below
//   was destroyed by a tag-stripping extraction pass; remaining bytes are
//   preserved as-is
bool printDMOZCrumb ( SafeBuf *sb , long catId , bool xml ) {
	// catid -1 means error
	if ( catId <= 0 ) return true;
	// map the dmoz category id to its index in the category table
	long dirIndex = g_categories->getIndexFromId(catId);
	// dirIndex = g_categories->getIndexFromId(si->m_cat_sdir);
	// fall back to the root topic on an unknown id
	if (dirIndex < 0) dirIndex = 0;
	// display the directory bread crumb
	//if( (si->m_cat_dirId > 0 && si->m_isAdmin && !si->m_isFriend)
	// || (si->m_cat_sdir > 0 && si->m_cat_sdirt != 0) )
	// sb->safePrintf("

");
	// shortcut. rtl=Right To Left language format.
	bool rtl = g_categories->isIdRTL ( catId ) ;
	//st->m_isRTL = rtl;
	// html: open the crumb container, honoring right-to-left layout
	if ( ! xml ) {
		sb->safePrintf("\n");
		if ( rtl ) sb->safePrintf("");
		//sb->safePrintf("Top: ");
	}
	// put crumb in xml?
	if ( xml ) sb->safePrintf("printPathCrumbFromIndex(sb,dirIndex,rtl);
	if ( xml ) sb->safePrintf("]]>\n" );
	// how many urls/entries in this topic?
	long nu =g_categories->getNumUrlsFromIndex(dirIndex);
	// print the num
	if ( ! xml ) {
		sb->safePrintf("  ");
		if ( rtl ) sb->safePrintf("(%li)",nu);
		else sb->safePrintf("(%li)", nu);
		sb->safePrintf("

\n");
	}
	return true;
}

// forward declaration; defined further below
bool printDmozRadioButtons ( SafeBuf *sb , long catId ) ;

// if catId >= 1 then print the dmoz radio button
// . emits the logo/menu table, the search form and the search-filters bar
bool printLogoAndSearchBox ( SafeBuf *sb , HttpRequest *hr , long catId ,
			     SearchInput *si ) {
	char *root = "";
	if ( g_conf.m_isMattWells ) root = "http://www.gigablast.com";
	// now make a TABLE, left PANE contains gigabits and stuff
	sb->safePrintf( // logo and menu table
		""
		//"style=color:blue;>"
		""
		// take out logo now that we have the circle rocket
		// ""
		""
		""
		"
" // "" // "" // "" // "" //, root ); /* // menu above search box sb->safePrintf( "
" "   " ); if ( catId <= 0 ) sb->safePrintf("web"); else sb->safePrintf("web"); sb->safePrintf("      " ); if ( g_conf.m_isMattWells ) { // SEO functionality not included yet - so redir to gigablast. if ( g_conf.m_isMattWells ) sb->safePrintf(""); else sb->safePrintf(""); sb->safePrintf( "seo" "      " ); } if (catId <= 0 ) sb->safePrintf("" "directory" "" ); else sb->safePrintf("" "directory"); */ char *coll = hr->getString("c"); if ( ! coll ) coll = ""; // if there's a ton of sites use the post method otherwise // they won't fit into the http request, the browser will reject // sending such a large request with "GET" char *method = "GET"; if ( si && si->m_sites && gbstrlen(si->m_sites)>800 ) method = "POST"; sb->safePrintf( //"      " // i'm not sure why this was removed. perhaps // because it is not working yet because of // some bugs... // "" // "advanced" // "" // "     " // "" // "add url" // "" /* "   |   " "" "blog" "" "   |   " "" "about" "" */ //"

" // // search box // "
\n\n" // propagate the collection if they re-search "" , method , coll ); // propagate prepend char *prepend = hr->getString("prepend"); if ( prepend ) { sb->safePrintf("htmlEncode ( prepend, gbstrlen(prepend), false); sb->safePrintf("\">"); } // put search box in a box sb->safePrintf("
"); sb->safePrintf ( //"
"getString("q",&qlen,"",NULL); sb->htmlEncode ( qstr , qlen , false ); sb->safePrintf ("\">" //"" "   " "
" "GO" "
" "
" "
" "
" ); if ( catId >= 0 ) { printDmozRadioButtons(sb,catId); } /* else { sb->safePrintf("Try your search on: " "   " "google      " "bing"); } */ printSearchFiltersBar ( sb , hr ); sb->safePrintf( "\n" "
\n" ); return true; } bool printDmozRadioButtons ( SafeBuf *sb , long catId ) { sb->safePrintf("Search " " sites " " pages " "in this topic or below" , catId , catId ); return true; } /* // print the search options under a dmoz search box bool printDirectorySearchType ( SafeBuf& sb, long sdirt ) { // default to entire directory if (sdirt < 1 || sdirt > 4) sdirt = 3; // by default search the whole thing sb->safePrintf("safePrintf(" checked>"); else sb->safePrintf(">"); sb->safePrintf("Entire Directory
\n"); // entire category sb->safePrintf("safePrintf(" checked>"); else sb->safePrintf(">"); sb->safePrintf("Entire Category
\n"); // base category only sb->safePrintf("
safePrintf(" checked>"); else sb->safePrintf(">"); sb->safePrintf("Pages in Base Category
\n"); // sites in base category sb->safePrintf("safePrintf(" checked>"); else sb->safePrintf(">"); sb->safePrintf("Sites in Base Category
\n"); // sites in entire category sb->safePrintf("safePrintf(" checked>"); else sb->safePrintf(">"); sb->safePrintf("Sites in Entire Category
\n"); // end it return true; } */ // return 1 if a should be before b int csvPtrCmp ( const void *a, const void *b ) { //JsonItem *ja = (JsonItem **)a; //JsonItem *jb = (JsonItem **)b; char *pa = *(char **)a; char *pb = *(char **)b; if ( strcmp(pa,"type") == 0 ) return -1; if ( strcmp(pb,"type") == 0 ) return 1; // force title on top if ( strcmp(pa,"product.title") == 0 ) return -1; if ( strcmp(pb,"product.title") == 0 ) return 1; if ( strcmp(pa,"title") == 0 ) return -1; if ( strcmp(pb,"title") == 0 ) return 1; // otherwise string compare int val = strcmp(pa,pb); return val; } #include "Json.h" // // print header row in csv // bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) { Msg40 *msg40 = &st->m_msg40; long numResults = msg40->getNumResults(); char tmp1[1024]; SafeBuf tmpBuf (tmp1 , 1024); char tmp2[1024]; SafeBuf nameBuf (tmp2, 1024); char nbuf[27000]; HashTableX nameTable; if ( ! nameTable.set ( 8,4,2048,nbuf,27000,false,0,"ntbuf") ) return false; long niceness = 0; // . scan every fucking json item in the search results. // . we still need to deal with the case when there are so many // search results we have to dump each msg20 reply to disk in // order. then we'll have to update this code to scan that file. for ( long i = 0 ; i < numResults ; i++ ) { // get the msg20 reply for search result #i Msg20 *m20 = msg40->m_msg20[i]; Msg20Reply *mr = m20->m_r; if ( ! mr ) { log("results: missing msg20 reply for result #%li",i); continue; } // get content char *json = mr->ptr_content; // how can it be empty? if ( ! json ) continue; // parse it up Json jp; jp.parseJsonStringIntoJsonItems ( json , niceness ); // scan each json item for ( JsonItem *ji = jp.getFirstItem(); ji ; ji = ji->m_next ){ // skip if not number or string if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING ) continue; // if in an array, do not print! csv is not // good for arrays... like "media":[....] . 
that // one might be ok, but if the elements in the // array are not simple types, like, if they are // unflat json objects then it is not well suited // for csv. if ( ji->isInArray() ) continue; // reset length of buf to 0 tmpBuf.reset(); // . get the name of the item into "nameBuf" // . returns false with g_errno set on error if ( ! ji->getCompoundName ( tmpBuf ) ) return false; // is it new? long long h64 = hash64n ( tmpBuf.getBufStart() ); if ( nameTable.isInTable ( &h64 ) ) continue; // record offset of the name for our hash table long nameBufOffset = nameBuf.length(); // store the name in our name buffer if ( ! nameBuf.safeStrcpy ( tmpBuf.getBufStart() ) ) return false; if ( ! nameBuf.pushChar ( '\0' ) ) return false; // it's new. add it if ( ! nameTable.addKey ( &h64 , &nameBufOffset ) ) return false; } } // . make array of ptrs to the names so we can sort them // . try to always put title first regardless char *ptrs [ 1024 ]; long numPtrs = 0; for ( long i = 0 ; i < nameTable.m_numSlots ; i++ ) { if ( ! nameTable.m_flags[i] ) continue; long off = *(long *)nameTable.getValueFromSlot(i); char *p = nameBuf.getBufStart() + off; ptrs[numPtrs++] = p; if ( numPtrs >= 1024 ) break; } // sort them qsort ( ptrs , numPtrs , 4 , csvPtrCmp ); // set up table to map field name to column for printing the json items HashTableX *columnTable = &st->m_columnTable; if ( ! columnTable->set ( 8,4, numPtrs * 4,NULL,0,false,0,"coltbl" ) ) return false; // now print them out as the header row for ( long i = 0 ; i < numPtrs ; i++ ) { if ( i > 0 && ! sb->pushChar(',') ) return false; if ( ! sb->safeStrcpy ( ptrs[i] ) ) return false; // record the hash of each one for printing out further json // objects in the same order so columns are aligned! long long h64 = hash64n ( ptrs[i] ); if ( ! columnTable->addKey ( &h64 , &i ) ) return false; } st->m_numCSVColumns = numPtrs; if ( ! sb->pushChar('\n') ) return false; if ( ! 
sb->nullTerm() ) return false; return true; } // returns false and sets g_errno on error bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) { long niceness = 0; // parse the json Json jp; jp.parseJsonStringIntoJsonItems ( json , niceness ); HashTableX *columnTable = &st->m_columnTable; long numCSVColumns = st->m_numCSVColumns; // make buffer space that we need char ttt[1024]; SafeBuf ptrBuf(ttt,1024); long need = numCSVColumns * sizeof(JsonItem *); if ( ! ptrBuf.reserve ( need ) ) return false; JsonItem **ptrs = (JsonItem **)ptrBuf.getBufStart(); // reset json item ptrs for csv columns. all to NULL memset ( ptrs , 0 , need ); char tmp1[1024]; SafeBuf tmpBuf (tmp1 , 1024); JsonItem *ji; /////// // // print json item in csv // /////// for ( ji = jp.getFirstItem(); ji ; ji = ji->m_next ) { // skip if not number or string if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING ) continue; // skip if not well suited for csv (see above comment) if ( ji->isInArray() ) continue; // . get the name of the item into "nameBuf" // . returns false with g_errno set on error if ( ! ji->getCompoundName ( tmpBuf ) ) return false; // is it new? long long h64 = hash64n ( tmpBuf.getBufStart() ); long slot = columnTable->getSlot ( &h64 ) ; // MUST be in there if ( slot < 0 ) { char *xx=NULL;*xx=0;} // get col # long column = *(long *)columnTable->getValueFromSlot ( slot ); // sanity if ( column >= numCSVColumns ) { char *xx=NULL;*xx=0; } // set ptr to it for printing when done parsing every field // for this json item ptrs[column] = ji; } // now print out what we got for ( long i = 0 ; i < numCSVColumns ; i++ ) { // , delimeted if ( i > 0 ) sb->pushChar(','); // get it ji = ptrs[i]; // skip if none if ( ! ji ) continue; // skip "html" field... too spammy for csv and > 32k causes // libreoffice calc to truncate it and break its parsing if ( ji->m_name && //! 
ji->m_parent && strcmp(ji->m_name,"html")==0) continue; // // get value and print otherwise // if ( ji->m_type == JT_NUMBER ) { // print numbers without double quotes if ( ji->m_valueDouble *10000000.0 == (double)ji->m_valueLong * 10000000.0 ) sb->safePrintf("%li",ji->m_valueLong); else sb->safePrintf("%f",ji->m_valueDouble); continue; } // print the value sb->pushChar('\"'); // get the json item to print out long vlen = ji->getValueLen(); // truncate char *truncStr = NULL; if ( vlen > 32000 ) { vlen = 32000; truncStr = " ... value truncated because " "Excel can not handle it. Download the " "JSON to get untruncated data."; } // print it out sb->csvEncode ( ji->getValue() , vlen ); // print truncate msg? if ( truncStr ) sb->safeStrcpy ( truncStr ); // end the CSV sb->pushChar('\"'); } sb->pushChar('\n'); sb->nullTerm(); return true; } /* RIP: OLD IFRAME WIDGET CODE HACK bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) { // // begin print controls // sb->safePrintf("" "" "Widget Creator" ); //char *coll = "GLOBAL-INDEX"; CollectionRec *cr = NULL; if ( coll ) cr = g_collectiondb.getRec(coll); // if admin clicks "edit" in the live widget itself put up // some simpler content editing boxes. token required! long edit = hr->getLong("inlineedit",0); if ( edit ) { // get widget sites char *sites = cr->m_siteListBuf.getBufStart(); sb->safePrintf("" , sites); sb->safePrintf("
" "" "
" "" ); return true; } sb->safePrintf("\n"); char *c1 = ""; char *c2 = ""; char *c3 = ""; long x1 = hr->getLong("dates" ,0); long x2 = hr->getLong("summaries",0); long x3 = hr->getLong("border" ,0); if ( x1 ) c1 = " checked"; if ( x2 ) c2 = " checked"; if ( x3 ) c3 = " checked"; long width = hr->getLong("width",250); long height = hr->getLong("height",400); long refresh = hr->getLong("refresh",15); char *def = "";//

News

"; long len1,len2,len3,len4; char *header = hr->getString("header",&len1,def); char *sites = hr->getString("sites",&len2,""); char *token = hr->getString("token",&len3,""); //char*query=hr->getString("query",&len4, //"type:article gbsortbyint:date"); char *query =hr->getString("query",&len4, "type:article gbsortbyint:gbspiderdate"); sb->safePrintf("
" "" "" , coll ); sb->safePrintf( "
" "" "" "" "" "" "" , sites , token , query , c1 , c2 , c3 , width , height , refresh , header ); // // end print controls // // // begin print widget // sb->safePrintf ( "" "" "
" "" "" "W" "idget Creator" "
" "" "" "Harness the power of Diffbot." "" "" "
" "Websites to crawl:" "
" "" "
" "Token:" "
" "" "
" "Query:" "
" "" "
" "Show Dates " "" "
" "Show Summaries " "" "
" "Frame border " "" "
" "Width " "" "
" "Height " "" "
" "Refresh in seconds " "" "
" "Custom widget header:" "
" "" "
" "" "" "
" "
" "

" //"
" //, RESULTSWIDTHSTR //,width ); //printTabs ( sb , st ); //printRedBoxes ( sb , st ); #define SHADOWCOLOR "#000000" sb->safePrintf ( // end widget div "
" // end widget column in table "
" // begin div with source in it // "
" // , SHADOWCOLOR // //"
" ); // space widget to the right using this table sb->safePrintf( //class=grad3 " //"style=\"" //"border:2px solid black;" //"padding-bottom:10px;" //"padding-top:10px;" //"padding-left:10px;" //"\"" //">" "
" "
" "" "

" ); long start = sb->length(); char *border = "frameborder=no "; if ( x3 ) border = ""; // this iframe contains the WIDGET sb->safePrintf ( // "
" "\n" //"
" //, si->m_urlParms); //, wp ); long end = sb->length(); sb->reserve ( end - start + 1000 ); char *wdir = "on the left"; long cols = 32; //if ( width <= 240 ) sb->safePrintf("
  "); //else { // sb->safePrintf("


"); // wdir = "above"; // cols = 60; // } sb->safePrintf ( "\n\n" "
" //"


" "" "Insert the following code into your webpage to " "generate the widget %s. " //"
" //"" //"" //"Make $1 per click!" //"
" "

" , wdir ); char *p = sb->getBufStart() + start; sb->safePrintf(""); sb->safePrintf(""); // space widget to the right using this table sb->safePrintf("
"); sb->safePrintf("
"); sb->safePrintf("
"); sb->safePrintf(""); sb->safePrintf(""); return true; } bool sendPageWidget ( TcpSocket *s , HttpRequest *hr ) { SafeBuf sb; char *token = hr->getString("token",NULL); if ( token && ! token[0] ) token = NULL; long edit = hr->getLong("inlineedit",0); if ( ! token && ! edit ) { g_errno = ENOTOKEN; char *msg = mstrerror(g_errno); return g_httpServer.sendErrorReply(s,g_errno,msg); } long tlen = 0; if ( token ) tlen = gbstrlen(token); if ( tlen > 64 ) { g_errno = ENOCOLLREC; char *msg = mstrerror(g_errno); return g_httpServer.sendErrorReply(s,g_errno,msg); } char coll[MAX_COLL_LEN]; CollectionRec *cr = NULL; if ( token ) { sprintf(coll,"%s-widget123",token); cr = g_collectiondb.getRec(coll); } SafeBuf parmList; collnum_t cn = -1; if ( cr ) cn = cr->m_collnum; // . first update their collection with the sites to crawl // . this is NOT a custom diffbot crawl, just a regular one using // the new crawl filters logic, "siteList" char *sites = hr->getString("sites",NULL); // add the collection if does not exist if ( sites && ! cr && token ) { // we need to add the new collnum, so reserve it collnum_t newCollnum = g_collectiondb.reserveCollNum(); // use that cn = newCollnum; // add the new colection named -widget123 g_parms.addNewParmToList1 ( &parmList,cn,coll,0,"addColl"); // note it log("widget: adding new widget coll %s",coll); } if ( cn >= 0 && token ) { // use special url filters profile that spiders sites // shallowly and frequently to pick up new news stories // "1" = (long)UFP_NEWS char ttt[12]; sprintf(ttt,"%li",(long)UFP_NEWS); // urlfiltersprofile g_parms.addNewParmToList1 ( &parmList,cn,ttt,0,"ufp"); // use diffbot analyze char durl[1024]; sprintf(durl, "http://api.diffbot.com/v2/analyze?mode=auto&token=%s", token); // TODO: ensure we call diffbot ok g_parms.addNewParmToList1 ( &parmList,cn,durl,0,"apiUrl"); } if ( ! sites ) sites = ""; // . update the list of sites to crawl and search and show in widget // . 
if they give an empty list then allow that, it will stop crawling if ( cn >= 0 && token ) g_parms.addNewParmToList1 ( &parmList,cn,sites,0,"sitelist"); if ( parmList.length() ) { // send the parms to all hosts in the network g_parms.broadcastParmList ( &parmList , NULL,//s,// state is socket i guess NULL);//doneBroadcastingParms2 ); } // now display the widget controls and the widget and the iframe code printWidgetPage ( &sb , hr , coll ); return g_httpServer.sendDynamicPage(s, sb.getBufStart(), sb.length(), -1,//cacheTime -1 means not tocache false, // POST? "text/html", 200, // httpstatus NULL, // cookie "UTF-8"); // charset } */ bool printDmozEntry ( SafeBuf *sb , long catId , bool direct , char *dmozTitle , char *dmozSummary , char *dmozAnchor , SearchInput *si ) { // assign shit if we match the dmoz cat we are showing //if ( catIds[i] == si->m_catId) break; if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t\n"); sb->safePrintf("\t\t\t%li" "\n",catId); sb->safePrintf("\t\t\t%li\n", (long)direct); // print the name of the dmoz category sb->safePrintf("\t\t\tprintPathFromId(&xb, catId, false, si->m_isRTL); sb->cdataEncode(xb.getBufStart()); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\tcdataEncode(dmozTitle); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\tcdataEncode(dmozSummary); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\tcdataEncode(dmozAnchor); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\n"); return true; } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"dmozEntry\":{\n"); sb->safePrintf("\t\t\t\"dmozCatId\":%li,\n", catId); sb->safePrintf("\t\t\t\"directCatId\":%li,\n",(long)direct); // print the name of the dmoz category sb->safePrintf("\t\t\t\"dmozCatStr\":\""); char xbuf[256]; SafeBuf xb(xbuf,256,0,false); g_categories->printPathFromId(&xb, catId, false, si->m_isRTL); sb->jsonEncode(xb.getBufStart()); sb->safePrintf("\",\n"); sb->safePrintf("\t\t\t\"dmozTitle\":\""); sb->jsonEncode(dmozTitle); sb->safePrintf("\",\n"); 
sb->safePrintf("\t\t\t\"dmozSum\":\""); sb->jsonEncode(dmozSummary); sb->safePrintf("\",\n"); sb->safePrintf("\t\t\t\"dmozAnchor\":\""); sb->jsonEncode(dmozAnchor); sb->safePrintf("\"\n"); sb->safePrintf("\t\t},\n"); return true; } return true; } class MenuItem { public: long m_menuNum; char *m_title; // we append this to the url char *m_cgiField; char *m_cgiVal; char m_tmp[10]; }; static MenuItem s_mi[200]; static long s_num = 0; bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) { SafeBuf cu; hr->getCurrentUrl ( cu ); sb->safePrintf("" ); static bool s_init = false; if ( ! s_init ) { long n = 0; s_mi[n].m_menuNum = 0; s_mi[n].m_title = "Any time"; s_mi[n].m_cgiField = "secsback"; s_mi[n].m_cgiVal = "0"; n++; s_mi[n].m_menuNum = 0; s_mi[n].m_title = "Past 24 hours"; s_mi[n].m_cgiField = "secsback"; s_mi[n].m_cgiVal = "86400"; n++; s_mi[n].m_menuNum = 0; s_mi[n].m_title = "Past week"; s_mi[n].m_cgiField = "secsback"; s_mi[n].m_cgiVal = "604800"; n++; s_mi[n].m_menuNum = 0; s_mi[n].m_title = "Past month"; s_mi[n].m_cgiField = "secsback"; s_mi[n].m_cgiVal = "2592000"; n++; s_mi[n].m_menuNum = 0; s_mi[n].m_title = "Past year"; s_mi[n].m_cgiField = "secsback"; s_mi[n].m_cgiVal = "31536000"; n++; // sort by s_mi[n].m_menuNum = 1; s_mi[n].m_title = "Sorted by relevance"; s_mi[n].m_cgiField = "sortby"; s_mi[n].m_cgiVal = "0"; n++; s_mi[n].m_menuNum = 1; s_mi[n].m_title = "Sorted by date"; s_mi[n].m_cgiField = "sortby"; s_mi[n].m_cgiVal = "1"; n++; s_mi[n].m_menuNum = 1; s_mi[n].m_title = "Reverse sorted by date"; s_mi[n].m_cgiField = "sortby"; s_mi[n].m_cgiVal = "2"; n++; // languages s_mi[n].m_menuNum = 2; s_mi[n].m_title = "Any language"; s_mi[n].m_cgiField = "qlang"; s_mi[n].m_cgiVal = ""; n++; for ( long i = 0 ; i < langLast ; i++ ) { s_mi[n].m_menuNum = 2; s_mi[n].m_title = getLanguageString(i); s_mi[n].m_cgiField = "qlang"; snprintf(s_mi[n].m_tmp,10,"%s",getLangAbbr(i)); s_mi[n].m_cgiVal = s_mi[n].m_tmp; n++; } // filetypes s_mi[n].m_menuNum = 3; 
s_mi[n].m_title = "Any filetype"; s_mi[n].m_cgiField = "filetype"; s_mi[n].m_cgiVal = ""; n++; s_mi[n].m_menuNum = 3; s_mi[n].m_title = "PDF"; s_mi[n].m_cgiField = "filetype"; s_mi[n].m_cgiVal = "pdf"; n++; s_mi[n].m_menuNum = 3; s_mi[n].m_title = "Microsoft Word"; s_mi[n].m_cgiField = "filetype"; s_mi[n].m_cgiVal = "doc"; n++; s_mi[n].m_menuNum = 3; s_mi[n].m_title = "Excel"; s_mi[n].m_cgiField = "filetype"; s_mi[n].m_cgiVal = "xls"; n++; s_mi[n].m_menuNum = 3; s_mi[n].m_title = "PostScript"; s_mi[n].m_cgiField = "filetype"; s_mi[n].m_cgiVal = "ps"; n++; // facets s_mi[n].m_menuNum = 4; s_mi[n].m_title = "Facets"; s_mi[n].m_cgiField = "prepend"; s_mi[n].m_cgiVal = ""; n++; s_mi[n].m_menuNum = 4; s_mi[n].m_title = "Language facet"; s_mi[n].m_cgiField = "prepend"; s_mi[n].m_cgiVal = "gbfacetstr:gblangid"; n++; s_mi[n].m_menuNum = 4; s_mi[n].m_title = "Content type facet"; s_mi[n].m_cgiField = "prepend"; s_mi[n].m_cgiVal = "gbfacetstr:gbcontenttypeid"; n++; s_mi[n].m_menuNum = 4; s_mi[n].m_title = "Spider date facet"; s_mi[n].m_cgiField = "prepend"; s_mi[n].m_cgiVal = "gbfacetint:gbspiderdate"; n++; s_mi[n].m_menuNum = 4; s_mi[n].m_title = "Site rank facet"; s_mi[n].m_cgiField = "prepend"; s_mi[n].m_cgiVal = "gbfacetstr:gbsiterank"; n++; s_mi[n].m_menuNum = 4; s_mi[n].m_title = "Domains facet"; s_mi[n].m_cgiField = "prepend"; s_mi[n].m_cgiVal = "gbfacetint:gbdomhash"; n++; s_mi[n].m_menuNum = 4; s_mi[n].m_title = "Hopcount facet"; s_mi[n].m_cgiField = "prepend"; s_mi[n].m_cgiVal = "gbfacetstr:gbhopcount"; n++; // output s_mi[n].m_menuNum = 5; s_mi[n].m_title = "Output HTML"; s_mi[n].m_cgiField = "format"; s_mi[n].m_cgiVal = "html"; n++; s_mi[n].m_menuNum = 5; s_mi[n].m_title = "Output XML"; s_mi[n].m_cgiField = "format"; s_mi[n].m_cgiVal = "xml"; n++; s_mi[n].m_menuNum = 5; s_mi[n].m_title = "Output JSON"; s_mi[n].m_cgiField = "format"; s_mi[n].m_cgiVal = "json"; n++; s_num = n; if ( n > 200 ) { char *xx=NULL;*xx=0; } } // we'll print the admin menu custom since it's 
mostly off-page links // bar of drop down menus sb->safePrintf("
"); for ( long i = 0 ; i <= 5 ; i++ ) printMenu ( sb , i ); sb->safePrintf("
\n"); return true; }

// . print one drop-down menu of the search-filters bar (menuNum 0-5: time
//   range, sort order, language, filetype, facets, output format) from the
//   static s_mi[] table built by printSearchFiltersBar()
// . NOTE(review): the html tag content of the string literals below was
//   destroyed by a tag-stripping extraction pass; remaining bytes are
//   preserved as-is
// . NOTE(review): if no s_mi[] entry matches menuNum then "first" stays
//   NULL and the final safePrintf dereferences it -- all six menus are
//   populated today, but a guard would be safer
bool printMenu ( SafeBuf *sb , long menuNum ) {
	bool firstOne = true;
	// first item of this menu; its title becomes the visible header
	MenuItem *first = NULL;
	for ( long i = 0 ; i < s_num ; i++ ) {
		// shortcut
		MenuItem *mi = &s_mi[i];
		// skip if not our item
		if ( mi->m_menuNum != menuNum ) continue;
		if ( ! first ) first = mi;
		// only emit the menu-opening markup once, for the first item
		if ( ! firstOne ) goto skip;
		firstOne = false;
		// for centering the dropdown
		sb->safePrintf("");
		// print hidden drop down menu
		sb->safePrintf( "" , mi->m_menuNum );
	skip:
		// print each item in there
		sb->safePrintf("
" "  %s" "
" , mi->m_title );
		//sb->safePrintf("

");
	}
	// wrap up the drop down
	sb->safePrintf("
");
	// print heading or current selection i guess
	sb->safePrintf(
		// separate menus with these two spaces
		"     "
		// print the menu header that when clicked
		// will show the drop down
		""
		"%s %c%c%c"
		""
		, first->m_menuNum
		, first->m_title
		// 0xe2 0x96 0xbc is the utf-8 encoding of a down-pointing
		// triangle glyph used as the drop-down indicator
		,0xe2
		,0x96
		,0xbc );
	return true;
}