#include "gb-include.h"
#include "Collectiondb.h"
//#include "CollectionRec.h"
#include "Stats.h"
#include "Statsdb.h"
#include "Ads.h"
#include "Query.h"
#include "Speller.h"
#include "Msg40.h"
#include "Pages.h"
#include "Highlight.h"
#include "SearchInput.h"
#include // NOTE(review): the header name of this directive is missing in this copy of the file (apparently lost with the stripped markup); restore it from version control
#include "SafeBuf.h"
#include "iana_charset.h"
#include "Pos.h"
#include "Bits.h"
#include "AutoBan.h"
#include "sort.h"
#include "LanguageIdentifier.h"
#include "LanguagePages.h"
#include "LangList.h"
#include "CountryCode.h"
#include "Unicode.h"
#include "XmlDoc.h" // GigabitInfo class
#include "Posdb.h" // MAX_TOP definition
#include "PageResults.h"
#include "Proxy.h"

static bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) ;
static bool printMenu ( SafeBuf *sb , int32_t menuNum , HttpRequest *hr ) ;

//static void gotSpellingWrapper ( void *state ) ;
static void gotResultsWrapper  ( void *state ) ;
//static void gotAdsWrapper      ( void *state ) ;
static void gotState           ( void *state ) ;
static bool gotResults         ( void *state ) ;

bool replaceParm ( char *cgi , SafeBuf *newUrl , HttpRequest *hr ) ;
bool replaceParm2 ( char *cgi , SafeBuf *newUrl ,
		    char *oldUrl , int32_t oldUrlLen ) ;

bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) ;

bool printJsonItemInCSV ( char *json , SafeBuf *sb , class State0 *st ) ;

bool printPairScore ( SafeBuf *sb , SearchInput *si , PairScore *ps ,
		      Msg20Reply *mr , Msg40 *msg40 , bool first ) ;

bool printScoresHeader ( SafeBuf *sb ) ;

bool printMetaContent ( Msg40 *msg40 , int32_t i ,State0 *st, SafeBuf *sb );

bool printSingleScore ( SafeBuf *sb , SearchInput *si , SingleScore *ss ,
			Msg20Reply *mr , Msg40 *msg40 ) ;

bool printDmozEntry ( SafeBuf *sb ,
		      int32_t catId ,
		      bool direct ,
		      char *dmozTitle ,
		      char *dmozSummary ,
		      char *dmozAnchor ,
		      SearchInput *si );

// . hands the finished page, or an error page, back to the client on
//   st->m_socket and then frees "st"
// . "reply" is the NUL-terminated page to send and may be NULL. callers pass
//   the buffer of st->m_sb, which is why st is only freed after the send
// . the value of g_errno on entry selects the success or the error path
// . records the query time in g_stats and g_statsdb on both paths
// . always returns true
bool sendReply ( State0 *st , char *reply ) {

	// snapshot the error first, the calls below are free to touch g_errno
	int32_t savedErr = g_errno;

	TcpSocket *sock = st->m_socket;
	// a state without a socket is a coding error: core on purpose
	if ( ! sock ) { char *xx=NULL;*xx=0; }

	// "si" is the address of a member of st, so it is never NULL
	SearchInput *si = &st->m_si;

	// the content type follows the requested output format
	char *ct = "text/html";
	if ( si->m_format == FORMAT_XML  ) ct = "text/xml";
	if ( si->m_format == FORMAT_JSON ) ct = "application/json";
	if ( si->m_format == FORMAT_CSV  ) ct = "text/csv";
	char *charset = "utf-8";

	char format = si->m_format;

	// . keep the xml legal: blank out every control char below 0x20
	// . tab, newline and carriage return are allowed to stay
	if ( format == FORMAT_XML && reply ) {
		for ( unsigned char *p = (unsigned char *)reply ; *p ; p++ ) {
			if ( *p >= 0x20 ) continue;
			if ( *p == '\t' || *p == '\n' || *p == '\r' ) continue;
			*p = 0x20;
		}
	}

	int32_t rlen = reply ? gbstrlen(reply) : 0;
	logf(LOG_DEBUG,"gb: sending back %"INT32" bytes",rlen);

	// . light brown in the stats graph for html (end user) queries
	// . darker brown for every other format (feeds)
	int32_t color = ( si->m_format != FORMAT_HTML ) ?
		0x00753d30 : 0x00b58869;

	int64_t nowms = gettimeofdayInMilliseconds();
	int64_t took  = nowms - st->m_startTime ;
	g_stats.addStat_r ( took            ,
			    st->m_startTime ,
			    nowms           ,
			    color           ,
			    STAT_QUERY      );

	// statsdb gets the number of query terms as the value/qty
	g_statsdb.addStat ( 0,
			    "query",
			    st->m_startTime,
			    nowms,
			    si->m_q.m_numTerms);

	// . log slow queries
	// . skipped when g_errno is set lest m_sbuf1 be bogus b/c it failed
	//   to allocate its buf to hold the terminating \0 in
	//   SearchInput::setQueryBuffers()
	if ( ! g_errno && st->m_took >= g_conf.m_logQueryTimeThreshold ) {
		logf(LOG_TIMING,"query: Took %"INT64" ms for %s. results=%"INT32"",
		     st->m_took,
		     si->m_sbuf1.getBufStart(),
		     st->m_msg40.getNumResults());
	}

	//bool xml = si->m_xml;

	g_stats.logAvgQueryTime(st->m_startTime);

	//
	// error path
	//
	if ( savedErr ) {
		// permission failures are not counted as failed queries
		if ( savedErr != ENOPERM ) g_stats.m_numFails++;

		// "format", "sock" and "savedErr" are locals, so the state
		// can go away before the error page is sent
		mdelete(st, sizeof(State0), "PageResults2");
		delete st;

		/*
		if ( format == FORMAT_XML ) {
			SafeBuf sb;
			sb.safePrintf("\n"
				      "\n"
				      "\t%"INT32"\n"
				      "\t%s\n"
				      "\n"
				      ,(int32_t)savedErr
				      ,mstrerror(savedErr)
				      );
			// clear it for sending back
			g_errno = 0;
			// send back as normal reply
			g_httpServer.sendDynamicPage(s,
						     sb.getBufStart(),
						     sb.length(),
						     0, // cachetime in secs
						     false, // POSTReply?
						     ct,
						     -1, // httpstatus -1 -> 200
						     NULL, // cookieptr
						     charset );
			return true;
		}
		*/

		// errors caused by the request itself are 400s, the rest 500s
		bool badRequest = ( savedErr == ETOOMANYOPERANDS ||
				    savedErr == EBADREQUEST      ||
				    savedErr == ENOPERM          ||
				    savedErr == ENOCOLLREC       );
		int32_t status = 500;
		if ( badRequest ) status = 400;

		g_httpServer.sendQueryErrorReply(sock,
						 status,
						 mstrerror(savedErr),
						 format,//xml,
						 savedErr,
						 "There was an error!");
		return true;
	}

	//
	// success path: send back the actual search results
	//
	g_stats.m_numSuccess++;

	// do not let the page be cached
	int32_t cacheTime = -1;

	// the "Check it" link on add url uses &usecache=0 to tell
	// the browser not to use its cache...
	//if ( hr->getLong("usecache",-1) == 0 ) cacheTime = 0;

	g_httpServer.sendDynamicPage(sock,
				     reply,
				     rlen,      //gbstrlen(reply),
				     cacheTime, // cachetime in secs
				     false,     // POSTReply?
				     ct,
				     -1,        // httpstatus -1 -> 200
				     NULL,      // cookieptr
				     charset );

	// free st after sending reply since "st->m_sb" = "reply"
	mdelete(st, sizeof(State0), "PageResults2");
	delete st;
	return true;
}

// . prints the head of the html results page into "sb"
// . "format" is not used
// . always returns true
// . NOTE(review): the markup inside these string literals is missing in this
//   copy of the file; the literals are reproduced exactly as found, assuming
//   the "//" comment covered only the one literal that directly follows it.
//   restore the original markup from version control.
bool printCSSHead ( SafeBuf *sb , char format ) {
	sb->safePrintf(
		       "\n"
		       //"\n"
		       "\n"
		       "\n"
		       "Gigablast Search Results\n"
		       "\n"
		       "\n"
		       );
	return true;
}

// . returns false if blocked, true otherwise
// . sets g_errno on error
// . "msg" will be inserted into the access log for this request
bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
	// . check for sdirt=4, this a site search on the given directory id
	// . need to pre-query the directory first to get the sites to search
	//   this will likely have just been cached so it should be quick
	// . 
then need to construct a site search query //int32_t rawFormat = hr->getLong("xml", 0); // was "raw" //int32_t xml = hr->getLong("xml",0); // what format should search results be in? default is html char format = hr->getReplyFormat();//getFormatFromRequest ( hr ); // get the dmoz catid if given //int32_t searchingDmoz = hr->getLong("dmoz",0); // // DO WE NEED TO ALTER cr->m_siteListBuf for a widget? // // when a wordpress user changes the "Websites to Include" for // her widget, it should send a /search?sites=xyz.com&wpid=xxx // request here... // so we need to remove her old sites and add in her new ones. // /* MDW TURN BACK ON IN A DAY. do indexing or err pages first. // get wordpressid supplied with all widget requests char *wpid = hr->getString("wpid"); // we have to add set &spidersites=1 which all widgets should do if ( wpid ) { // this returns NULL if cr->m_siteListBuf would be unchanged // because we already have the whiteListBuf sites in there // for this wordPressId (wpid) SafeBuf newSiteListBuf; makeNewSiteList( &si->m_whiteListBuf, cr->m_siteListBuf , wpid , &newSiteListBuf); // . update the list of sites to crawl/search & show in widget // . if they give an empty list then allow that, stops crawling SafeBuf parmList; g_parms.addNewParmToList1 ( &parmList, cr->m_collnum, newSiteListBuf, 0, "sitelist"); // send the parms to all hosts in the network g_parms.broadcastParmList ( &parmList , NULL,//s,// state is socket i guess NULL);//doneBroadcastingParms2 ); // nothing left to do now return g_httpServer.sendDynamicPage(s, "OK",//sb.getBufStart(), 2,//sb.length(), cacheTime,//0, false, // POST? "text/html", 200, // httpstatus NULL, // cookie "UTF-8"); // charset } */ // // . send back page frame with the ajax call to get the real // search results. do not do this if a "&dir=" (dmoz category) // is given. // . if not matt wells we do not do ajax // . the ajax is just there to prevent bots from slamming me // with queries. 
// if ( hr->getLong("id",0) == 0 && format == FORMAT_HTML && g_conf.m_isMattWells ) { SafeBuf sb; printCSSHead ( &sb ,format ); sb.safePrintf( "" , h32 , rand64 ); // // . login bar // . proxy will replace it byte by byte with a login/logout // link etc. // //g_proxy.insertLoginBarDirective(&sb); // // logo header // printLogoAndSearchBox ( &sb , hr , -1,NULL ); // catId = -1 // // script to populate search results // sb.safePrintf("\n" // put search results into this div "
" "" "
" "
" "" "Waiting for results... " "" "
" "
" "Please be a little " "patient I am trying to get more servers." "
\n" "
" "
" "" "Copyright © 2014. " "All Rights Reserved.
" "Powered by the " "" "GigaBlast open source search engine." "
" "
\n" "\n" "\n" ); // one hour cache time... no 1000 hours, basically infinite int32_t cacheTime = 3600; // *1000; //if ( hr->getLong("usecache",-1) == 0 ) cacheTime = 0; // // send back the parent stub containing the ajax // return g_httpServer.sendDynamicPage(s, sb.getBufStart(), sb.length(), cacheTime,//0, false, // POST? "text/html", 200, // httpstatus NULL, // cookie "UTF-8"); // charset } // make a new state State0 *st; try { st = new (State0); } catch ( ... ) { g_errno = ENOMEM; log("query: Query failed. " "Could not allocate %"INT32" bytes for query. " "Returning HTTP status of 500.",(int32_t)sizeof(State0)); g_stats.m_numFails++; return g_httpServer.sendQueryErrorReply (s,500,mstrerror(g_errno), format, g_errno, "Query failed. " "Could not allocate memory to execute a search. " "Please try later." ); } mnew ( st , sizeof(State0) , "PageResults2" ); // init some stuff st->m_didRedownload = false; st->m_xd = NULL; st->m_oldContentHash32 = 0; // copy yhits if ( ! st->m_hr.copy ( hr ) ) return sendReply ( st , NULL ); // set this in case SearchInput::set fails! st->m_socket = s; // save this count so we know if TcpServer.cpp calls destroySocket(s) st->m_numDestroys = s->m_numDestroys; // you have to say "&header=1" to get back the header for json now. // later on maybe it will default to on. st->m_header = hr->getLong("header",0); // . parse it up // . this returns false and sets g_errno and, maybe, g_msg on error SearchInput *si = &st->m_si; if ( ! si->set ( s , // si just copies the ptr into the httprequest // into stuff like SearchInput::m_defaultSortLanguage // so do not use the "hr" on the stack. SearchInput:: // m_hr points to the hr we pass into // SearchInput::set &st->m_hr ) ) { //&st->m_q ) ) { log("query: set search input: %s",mstrerror(g_errno)); if ( ! 
g_errno ) g_errno = EBADENGINEER; return sendReply ( st, NULL ); } int32_t codeLen = 0; char *code = hr->getString("code", &codeLen, NULL); // allow up to 1000 results per query for paying clients CollectionRec *cr = si->m_cr; // save collnum now if ( cr ) st->m_collnum = cr->m_collnum; else st->m_collnum = -1; // turn this on for json output, unless diffbot collection if ( format == FORMAT_JSON && ! cr->m_isCustomCrawl ) st->m_header = 1; // take this out here as well! // limit here // int32_t maxpp = cr->m_maxSearchResultsPerQuery ; // if ( si->m_docsWanted > maxpp && // // disable serp max per page for custom crawls // ! cr->m_isCustomCrawl ) // si->m_docsWanted = maxpp; st->m_numDocIds = si->m_docsWanted; // watch out for cowboys //if(si->m_firstResultNum>=si->m_maxResults) return sendReply(st,NULL); // save state in TcpSocket's m_tmp ptr for debugging. in case // we lose our string of control and Msg40::getResults() never // comes back. s->m_tmp = (char *)st; // add query stat st->m_startTime = gettimeofdayInMilliseconds(); // reset st->m_errno = 0; // debug msg log ( LOG_DEBUG , "query: Getting search results for q=%s", st->m_si.m_displayQuery); // assume we'll block st->m_gotResults = false; st->m_gotAds = false; st->m_gotSpell = false; // reset st->m_printedHeaderRow = false; int32_t ip = s->m_ip; int32_t uipLen; char *uip = hr->getString("uip", &uipLen, NULL); char testBufSpace[2048]; SafeBuf testBuf(testBufSpace, 1024); if( g_conf.m_doAutoBan && !g_autoBan.hasPerm(ip, code, codeLen, uip, uipLen, s, hr, &testBuf, false)) { // just check? no incrementing counts if ( uip ) log("results: returning EBUYFEED for uip=%s",uip); g_errno = EBUYFEED; return sendReply(st,NULL); } // LAUNCH ADS // . now get the ad space for this query // . don't get ads if we're not on the first page of results // . query must be NULL terminated st->m_gotAds = true; /* if (si->m_adFeedEnabled && ! 
si->m_xml && si->m_docsWanted > 0) { int32_t pageNum = (si->m_firstResultNum/si->m_docsWanted) + 1; st->m_gotAds = st->m_ads. getAds(si->m_displayQuery , //query si->m_displayQueryLen , //q len pageNum , //page num si->m_queryIP , si->m_coll2 , //coll st , //state gotAdsWrapper );//clbk } */ // LAUNCH SPELLER // get our spelling correction if we should (spell checker) st->m_gotSpell = true; st->m_spell[0] = '\0'; /* if ( si->m_spellCheck && cr->m_spellCheck && g_conf.m_doSpellChecking ) { st->m_gotSpell = g_speller. getRecommendation( &st->m_q, // Query si->m_spellCheck, // spellcheck st->m_spell, // Spell buffer MAX_FRAG_SIZE, // spell buf size false, // narrow search? NULL,//st->m_narrow // narrow buf MAX_FRAG_SIZE, // narrow buf size NULL,// num of narrows ptr st, // state gotSpellingWrapper );// callback } */ // LAUNCH RESULTS // . get some results from it // . this returns false if blocked, true otherwise // . it also sets g_errno on error // . use a niceness of 0 for all queries so they take precedence // over the indexing process // . this will copy our passed "query" and "coll" to it's own buffer // . we print out matching docIds to int32_t if m_isDebug is true // . no longer forward this, since proxy will take care of evenly // distributing its msg 0xfd "forward" requests now st->m_gotResults=st->m_msg40.getResults(si,false,st,gotResultsWrapper); // save error st->m_errno = g_errno; // wait for ads and spellcheck and results? 
if ( !st->m_gotAds || !st->m_gotSpell || !st->m_gotResults ) return false; // otherwise call gotResults which returns false if blocked, true else // and sets g_errno on error bool status2 = gotResults ( st ); return status2; } // if returned json result is > maxagebeforedownload then we redownload the // page and if its checksum has changed we return empty results void doneRedownloadingWrapper ( void *state ) { // cast our State0 class from this State0 *st = (State0 *) state; // resume gotResults ( st ); } /* void gotSpellingWrapper( void *state ){ // cast our State0 class from this State0 *st = (State0 *) state; // log the error first if ( g_errno ) log("query: speller: %s.",mstrerror(g_errno)); // clear any error cuz spellchecks aren't needed g_errno = 0; st->m_gotSpell = true; gotState(st); } */ void gotResultsWrapper ( void *state ) { // cast our State0 class from this State0 *st = (State0 *) state; // save error st->m_errno = g_errno; // mark as gotten st->m_gotResults = true; gotState (st); } /* void gotAdsWrapper ( void *state ) { // cast our State0 class from this State0 *st = (State0 *) state; // mark as gotten st->m_gotAds = true; // log the error first if ( g_errno ) log("query: adclient: %s.",mstrerror(g_errno)); // clear any error cuz ads aren't needed g_errno = 0; gotState (st);; } */ void gotState ( void *state ){ // cast our State0 class from this State0 *st = (State0 *) state; if ( !st->m_gotAds || !st->m_gotSpell || !st->m_gotResults ) return; // we're ready to go gotResults ( state ); } // print all sentences containing this gigabit (fast facts) (nuggabits) static bool printGigabitContainingSentences ( State0 *st, SafeBuf *sb , Msg40 *msg40 , Gigabit *gi , SearchInput *si , Query *gigabitQuery , int32_t gigabitId ) { //static int32_t s_gigabitCount = 0; char format = si->m_format; HttpRequest *hr = &st->m_hr; CollectionRec *cr = si->m_cr;//g_collectiondb.getRec(collnum ); int32_t numOff; int32_t revert; int32_t spaceOutOff; if ( format == 
FORMAT_HTML ) { sb->safePrintf(""); //""); // make a new query sb->safePrintf("m_coll); sb->urlEncode(gi->m_term,gi->m_termLen); sb->safeMemcpy("+|+",3); char *q = hr->getString("q",NULL,""); sb->urlEncode(q); sb->safePrintf("\">"); sb->safeMemcpy(gi->m_term,gi->m_termLen); sb->safePrintf(""); sb->safePrintf(" "); numOff = sb->m_length; sb->safePrintf(" ");//,gi->m_numPages); sb->safePrintf(""); sb->safePrintf(""); if ( si->m_isMasterAdmin && 1 == 2 ) sb->safePrintf("[%.0f]{%"INT32"}", gi->m_gbscore, gi->m_minPop); revert = sb->length(); sb->safePrintf("" "" , gigabitId // s_gigabitCount ); spaceOutOff = sb->length(); sb->safePrintf( "%c%c%c", 0xe2, 0x87, 0x93); sb->safePrintf(//"[more]" ""); sb->safePrintf(""); //
} if ( format == FORMAT_XML ) { sb->safePrintf("\t\t\n"); sb->safePrintf("\t\t\tcdataEncode(gi->m_term,gi->m_termLen); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\t%f\n",gi->m_gbscore); sb->safePrintf("\t\t\t%"INT32"\n",gi->m_minPop); } if ( format == FORMAT_JSON ) { sb->safePrintf("\t\"gigabit\":{\n"); sb->safePrintf("\t\t\"term\":\""); sb->jsonEncode(gi->m_term,gi->m_termLen); sb->safePrintf("\",\n"); sb->safePrintf("\t\t\"score\":%f,\n",gi->m_gbscore); sb->safePrintf("\t\t\"minPop\":%"INT32",\n",gi->m_minPop); } // get facts int32_t numNuggets = 0; int32_t numFacts = msg40->m_factBuf.length() / sizeof(Fact); Fact *facts = (Fact *)msg40->m_factBuf.getBufStart(); bool first = true; bool second = false; bool printedSecond = false; //int64_t lastDocId = -1LL; int32_t saveOffset = 0; for ( int32_t i = 0 ; i < numFacts ; i++ ) { Fact *fi = &facts[i]; // if printed for a higher scoring gigabit, skip if ( fi->m_printed ) continue; // check gigabit match int32_t k; for ( k = 0 ; k < fi->m_numGigabits ; k++ ) if ( fi->m_gigabitPtrs[k] == gi ) break; // skip this fact/sentence if does not contain gigabit if ( k >= fi->m_numGigabits ) continue; // do not print if no period at end char *s = fi->m_fact; char *e = s + fi->m_factLen; if ( e[-1] != '*' ) continue; e--; again: // first time, print in the single fact div if ( first && format == FORMAT_HTML ) { sb->safePrintf("
",gigabitId);//s_gigabitCount); } if ( second && format == FORMAT_HTML ) { sb->safePrintf("
",gigabitId);//s_gigabitCount); printedSecond = true; } Msg20Reply *reply = fi->m_reply; // ok, print it out if ( ! first && ! second && format == FORMAT_HTML ) { //if ( reply->m_docId != lastDocId ) sb->safePrintf("

\n"); //else { // sb->setLength ( saveOffset ); // sb->safePrintf("

\n"); //} } else { //sb->safePrintf(""); } numNuggets++; // print the fast fact (sentence) //sb->safeMemcpy ( s , e-s ); // let's highlight with gigabits and query terms SafeBuf tmpBuf; Highlight h; h.set ( &tmpBuf , // print it out here s , // content e - s , // len si->m_queryLangId , // from m_defaultSortLang gigabitQuery , // the gigabit "query" in quotes true , // stemming? -- unused false , // use anchors? NULL , // baseurl "", // front tag "", // back tag 0 , // fieldCode 0 ); // niceness // now highlight the original query as well but in black bold SafeBuf tmpBuf2; h.set ( &tmpBuf2 , // print it out here tmpBuf.getBufStart() , // content tmpBuf.length() , // len si->m_queryLangId , // from m_defaultSortLang &si->m_q , // the regular query true , // stemming? -- unused false , // use anchors? NULL , // baseurl "" , // front tag "", // back tag 0 , // fieldCode 0 ); // niceness int32_t dlen; char *dom = getDomFast(reply->ptr_ubuf,&dlen); // print the sentence if ( format == FORMAT_HTML ) sb->safeStrcpy(tmpBuf2.getBufStart()); if ( format == FORMAT_XML ) { sb->safePrintf("\t\t\t\n" "\t\t\t\tcdataEncode(tmpBuf2.getBufStart()); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\t\tcdataEncode(reply->ptr_ubuf); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\t\tcdataEncode(dom,dlen); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\t\n"); } if ( format == FORMAT_JSON ) { sb->safePrintf("\t\t\"instance\":{\n" "\t\t\t\"sentence\":\""); sb->jsonEncode(tmpBuf2.getBufStart()); sb->safePrintf("\",\n"); sb->safePrintf("\t\t\t\"url\":\""); sb->jsonEncode(reply->ptr_ubuf); sb->safePrintf("\",\n"); sb->safePrintf("\t\t\t\"domain\":\""); sb->jsonEncode(dom,dlen); sb->safePrintf("\"\n"); sb->safePrintf("\t\t},\n"); } fi->m_printed = 1; saveOffset = sb->length(); if ( format == FORMAT_HTML ) sb->safePrintf(" ", cr->m_coll,reply->m_docId); if ( format == FORMAT_HTML ) sb->safeMemcpy(dom,dlen); if ( format == FORMAT_HTML ) sb->safePrintf("\n"); //lastDocId = reply->m_docId; if ( first && 
format == FORMAT_HTML ) { sb->safePrintf("
"); } if ( second ) { second = false; } if ( first ) { first = false; second = true; // print first gigabit all over again but in 2nd div goto again; } } if ( format == FORMAT_XML ) sb->safePrintf("\t\t\n"); if ( format == FORMAT_JSON ) { // remove last ,\n sb->m_length -= 2; // replace with just \n sb->safePrintf("\n\t},\n"); } // all done if not html if ( format != FORMAT_HTML ) return true; // we counted the first one twice since we had to throw it into // the hidden div too! if ( numNuggets > 1 ) numNuggets--; // do not print the double down arrow if no nuggets printed if ( numNuggets <= 0 ) { sb->m_length = revert; sb->safePrintf(""); } // just remove down arrow if only 1... else if ( numNuggets == 1 ) { char *dst = sb->getBufStart()+spaceOutOff; dst[0] = ' '; dst[1] = ' '; dst[2] = ' '; } // store the # of nuggets in ()'s like (10 ) else { char tmp[10]; sprintf(tmp,"(%"INT32")",numNuggets); char *src = tmp; // starting storing digits after "( " char *dst = sb->getBufStart()+numOff; int32_t srcLen = gbstrlen(tmp); if ( srcLen > 5 ) srcLen = 5; for ( int32_t k = 0 ; k < srcLen ; k++ ) dst[k] = src[k]; } //s_gigabitCount++; if ( printedSecond ) { sb->safePrintf("
"); } return true; } /* // print all sentences containing this gigabit static bool printGigabit ( State0 *st, SafeBuf *sb , Msg40 *msg40 , Gigabit *gi , SearchInput *si ) { //static int32_t s_gigabitCount = 0; sb->safePrintf(""); //""); HttpRequest *hr = &st->m_hr; // make a new query sb->safePrintf("urlEncode(gi->m_term,gi->m_termLen); sb->safeMemcpy("+|+",3); char *q = hr->getString("q",NULL,""); sb->urlEncode(q); sb->safePrintf("\">"); sb->safeMemcpy(gi->m_term,gi->m_termLen); sb->safePrintf(""); sb->safePrintf(" "); //int32_t numOff = sb->m_length; // now the # of pages not nuggets sb->safePrintf("(%"INT32")",gi->m_numPages); sb->safePrintf(""); sb->safePrintf(""); if ( si->m_isMasterAdmin ) sb->safePrintf("[%.0f]{%"INT32"}", gi->m_gbscore, gi->m_minPop); // that's it for the gigabit sb->safePrintf("
"); return true; } */ class StateAU { public: SafeBuf m_metaListBuf; Msg4 m_msg4; }; void freeMsg4Wrapper( void *st ) { StateAU *stau = (StateAU *)st; mdelete(stau, sizeof(StateAU), "staud"); delete stau; } // . make a web page from results stored in msg40 // . send it on TcpSocket "s" when done // . returns false if blocked, true otherwise // . sets g_errno on error bool gotResults ( void *state ) { // cast our State0 class from this State0 *st = (State0 *) state; int64_t nowMS = gettimeofdayInMilliseconds(); // log the time int64_t took = nowMS - st->m_startTime; // record that st->m_took = took; // grab the query Msg40 *msg40 = &(st->m_msg40); //char *q = msg40->getQuery(); //int32_t qlen = msg40->getQueryLen(); SearchInput *si = &st->m_si; // if in streaming mode and we never sent anything and we had // an error, then send that back. we never really entered streaming // mode in that case. this happens when someone deletes a coll // and queries it immediately, then each shard reports ENOCOLLREC. // it was causing a socket to be permanently stuck open. if ( g_errno && si->m_streamResults && st->m_socket->m_totalSent == 0 ) return sendReply(st,NULL); // if already printed from Msg40.cpp, bail out now if ( si->m_streamResults ) { // this will be our final send if ( st->m_socket->m_streamingMode ) { log("res: socket still in streaming mode. wtf?"); st->m_socket->m_streamingMode = false; } log("msg40: done streaming. nuking state=%"PTRFMT" q=%s. 
" "msg20sin=%i msg20sout=%i sendsin=%i sendsout=%i " "numrequests=%i numreplies=%i " ,(PTRTYPE)st ,si->m_q.m_orig , msg40->m_numMsg20sIn , msg40->m_numMsg20sOut , msg40->m_sendsIn , msg40->m_sendsOut , msg40->m_numRequests , msg40->m_numReplies ); mdelete(st, sizeof(State0), "PageResults2"); delete st; return true; } // int16_tcuts //char *coll = si->m_coll2; //int32_t collLen = si->m_collLen2; //collnum_t collnum = si->m_firstCollnum; // collection rec must still be there since SearchInput references // into it, and it must be the SAME ptr too! CollectionRec *cr = si->m_cr;//g_collectiondb.getRec ( collnum ); if ( ! cr ) { // || cr != si->m_cr ) { g_errno = ENOCOLLREC; return sendReply(st,NULL); } //char *coll = cr->m_coll; /* // // BEGIN REDOWNLOAD LOGIC // //////////// // // if caller wants a certain freshness we might have to redownload the // parent url to get the new json // //////////// // get the first result Msg20 *m20first = msg40->m_msg20[0]; int32_t mabr = st->m_hr.getLong("maxagebeforeredownload",-1); if ( mabr >= 0 && numResults > 0 && // only do this once ! st->m_didRedownload && // need at least one result m20first && // get the last spidered time from the msg20 reply of that result m20first->m_r->m_lastSpidered - now > mabr ) { // make a new xmldoc to do the redownload XmlDoc *xd; try { xd = new (XmlDoc); } catch ( ... ) { g_errno = ENOMEM; log("query: Failed to alloc xmldoc."); } if ( g_errno ) return sendReply (st,NULL); mnew ( xd , sizeof(XmlDoc) , "mabrxd"); // save it st->m_xd = xd; // get this st->m_oldContentHash32 = m20rep->m_contentHash32; // do not re-do redownload st->m_didRedownload = true; // set it xd->setUrl(parentUrl); xd->setCallback ( st , doneRedownloadingWrapper ); // get the checksum if ( xd->getContentChecksum32Fast() == (void *)-1 ) // return false if it blocked return false; // error? 
if ( g_errno ) return sendReply (st,NULL); // how did this not block log("page: redownload did not would block adding parent"); } // if we did the redownload and checksum changed, return 0 results if ( st->m_didRedownload ) { // get the doc we downloaded XmlDoc *xd = st->m_xd; // get it int32_t newHash32 = xd->getContentHash32(); // log it if ( newHash32 != st->m_oldContentHash32 ) // note it in logs for now log("results: content changed for %s",xd->m_firstUrl.m_url); // free it mdelete(xd, sizeof(XmlDoc), "mabrxd" ); delete xd; // null it out so we don't try to re-free st->m_xd = NULL; // if content is significantly different, return 0 results if ( newHash32 != st->m_oldContentHash32 ) { SafeBuf sb; // empty json i guess sb.safePrintf("[]\n"); return sendReply(st,sb.getBufStart()); } // otherwise, print the diffbot json results, they are still valid } // // END REDOWNLOAD LOGIC // */ // // BEGIN ADDING URL // ////////// // // if its a special request to get diffbot json objects for // a given parent url, it often contains the same url in "addurl" // to add as a spider request to spiderdb so that // it gets spidered and processed through diffbot. // ////////// char *addUrl = st->m_hr.getString("addurl",NULL); if ( addUrl ) { // && cr->m_isCustomCrawl ) { Url norm; norm.set ( addUrl ); SpiderRequest sreq; // returns false and sets g_errno on error if ( ! sreq.setFromAddUrl ( norm.getUrl() ) ) { //addUrl ) ) { log("addurl: url had problem: %s",mstrerror(g_errno)); return true; } // addurl state StateAU *stau; try { stau = new(StateAU); } catch ( ... ) { g_errno = ENOMEM; return true; } mnew ( stau , sizeof(StateAU) , "stau"); // fill it up SafeBuf *mlist = &stau->m_metaListBuf; if ( ! mlist->pushChar(RDB_SPIDERDB) ) return true; if ( ! 
mlist->safeMemcpy ( &sreq , sreq.getRecSize() ) ) return true; Msg4 *msg4 = &stau->m_msg4; // this should copy the recs from list into the buffers if ( msg4->addMetaList ( mlist->getBufStart() , mlist->getLength() , cr->m_collnum, stau , freeMsg4Wrapper , MAX_NICENESS ) ) { // if it copied everything ok, nuke our msg4 // otherwise it will call freeMsg4Wraper when it // completes! freeMsg4Wrapper( stau ); } } // // DONE ADDING URL // int32_t numResults = msg40->getNumResults(); // if user is doing ajax widget we need to know the current docid // that is listed at the top of their widget display so we can // hide the new docids above that and scroll them down slowly. /* //int32_t topDocIdPos = -1; bool hasInvisibleResults = false; //int32_t numInvisible = 0; int32_t numAbove = 0; HttpRequest *hr = &st->m_hr; int64_t oldTop = 0LL; int64_t lastDocId = 0LL; double lastSerpScore = 0.0; if ( si->m_format == FORMAT_WIDGET_AJAX ) { // sanity, no stream mode here, it won't work if ( si->m_streamResults ) log("results: do not use stream=1 for widget"); // get current top docid int64_t topDocId = hr->getLongLong("topdocid",0LL); // DEBUG: force it on for now //topDocId = 4961990748LL; // scan results. this does not support &stream=1 streaming // mode. it doesn't make sense that it needs to. for ( int32_t i = 0 ; i < numResults ; i++ ) { // skip if already invisible if ( msg40->m_msg3a.m_clusterLevels[i] != CR_OK ) continue; // get it Msg20 *m20 = msg40->m_msg20[i]; if ( ! m20 ) continue; // checkdocid Msg20Reply *mr = m20->m_r; if ( ! mr ) continue; // save this lastDocId = mr->m_docId; lastSerpScore = msg40->m_msg3a.m_scores[i]; // set "oldTop" to first docid we encounter if ( ! oldTop ) oldTop = mr->m_docId; // stop if no topdocid otherwise. oldTop is now set if ( ! 
topDocId ) continue; // == 0 ) break; if ( mr->m_docId != topDocId ) { hasInvisibleResults = true; // count # of docids above top docid numAbove++; continue; } // we match it, so set this if not already set //if ( topDocIdPos != -1 ) topDocIdPos = i; //break; } } */ SafeBuf *sb = &st->m_sb; // print javascript for scrolling down invisible div for // ajax based widgets // MDW: this does not execute because it is loaded via ajax... // so i moved logic into diffbot.php for now. /* if ( si->m_format == FORMAT_WIDGET_AJAX && numInvisible ) { sb->safePrintf("" , numInvisible * (int32_t)RESULT_HEIGHT ); } */ // print logo, search box, results x-y, ... into st->m_sb printSearchResultsHeader ( st ); // propagate "topdocid" so when he does another query every 30 secs // or so we know what docid was on top for scrolling purposes //if ( si->m_format == FORMAT_WIDGET_AJAX ) // sb->safePrintf("\n", // oldTop); // report how many results we added above the topdocid provided, if any // so widget can scroll down automatically //if ( si->m_format == FORMAT_WIDGET_AJAX && numAbove ) // sb->safePrintf("\n",numAbove); // we often can add 100s of things to the widget's result set per // second especially when sorting by last spidered time and spidering // a lot. setting the maxserpscore of the serp score of the last result // allows us to append new search results to what we have in a // consistent manner. 
// if ( si->m_format == FORMAT_WIDGET_AJAX ) { // // let's make this ascii encoded crap // sb->safePrintf("\n", // lastSerpScore); // // let's make this ascii encoded crap // sb->safePrintf("\n", // lastDocId); // } // then print each result // don't display more than docsWanted results int32_t count = msg40->getDocsWanted(); bool hadPrintError = false; int32_t numPrintedSoFar = 0; //int32_t widgetHeight = hr->getLong("widgetheight",400); //int32_t widgetwidth = hr->getLong("widgetwidth",250); for ( int32_t i = 0 ; count > 0 && i < numResults ; i++ ) { /* if ( hasInvisibleResults ) { // // MAKE THESE RESULTS INVISIBLE! // // if doing a widget, we initially hide the new results // and scroll them down in time so it looks cool. if ( i == 0 ) sb->safePrintf("
" , (-1* (RESULT_HEIGHT+ SERP_SPACER+ PADDING*2)* numInvisible)); // // END INSIVISBILITY // // to test scrolling, hide the first result and // scroll it out if ( i == topDocIdPos ) sb->safePrintf("
" "
" ); } */ ////////// // // prints in xml or html // ////////// if ( ! printResult ( st , i , &numPrintedSoFar ) ) { hadPrintError = true; break; } // limit it count--; } if ( hadPrintError ) { if ( ! g_errno ) g_errno = EBADENGINEER; log("query: had error: %s",mstrerror(g_errno)); //return sendReply ( st , sb.getBufStart() ); } // wrap it up with Next 10 etc. printSearchResultsTail ( st ); // if we split the serps into 2 divs for scrolling purposes // then close up the 2nd one //if ( hasInvisibleResults ) sb->safePrintf("
"); // END SERP DIV if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) sb->safePrintf(""); // send it off sendReply ( st , st->m_sb.getBufStart() ); return true; } // defined in PageRoot.cpp bool expandHtml ( SafeBuf& sb, char *head , int32_t hlen , char *q , int32_t qlen , HttpRequest *r , SearchInput *si, char *method , CollectionRec *cr ) ; bool printLeftColumnRocketAndTabs ( SafeBuf *sb, bool isSearchResultsPage , CollectionRec *cr , char *tabName ); bool printLeftNavColumn ( SafeBuf &sb, State0 *st ) { SearchInput *si = &st->m_si; Msg40 *msg40 = &st->m_msg40; CollectionRec *cr = si->m_cr; char format = si->m_format; if ( format == FORMAT_HTML ) { char *title = "Search Results"; sb.safePrintf("Gigablast - %s\n",title); sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); // // DIVIDE INTO TWO PANES, LEFT COLUMN and MAIN COLUMN // sb.safePrintf("" "\n\n"); // // first the nav column // // . also prints . true=isSearchresults // . tabName = "search" printLeftColumnRocketAndTabs ( &sb , true , cr , "search" ); // sb.safePrintf("" "" "
..." // "
" // "
" // "" // "
" // "
" // "" // "
" // "
" // "
" // "
" // "
" // ,cr->m_coll // ); } /* // home link sb.safePrintf( "" "
" "              " "      " "HOME    " "
" "
" "
" ); */ // // BEGIN FACET PRINTING // // // . print out one table for each gbfacet: term in the query // . LATER: show the text string corresponding to the hash // by looking it up in the titleRec // if ( format == FORMAT_HTML ) msg40->printFacetTables ( &sb ); // // END FACET PRINTING // // // BEGIN PRINT GIGABITS // SafeBuf *gbuf = &msg40->m_gigabitBuf; int32_t numGigabits = gbuf->length()/sizeof(Gigabit); // MDW: support gigabits in xml/json format again //if ( format != FORMAT_HTML ) numGigabits = 0; // print gigabits Gigabit *gigabits = (Gigabit *)gbuf->getBufStart(); //int32_t numCols = 5; //int32_t perRow = numGigabits / numCols; if ( numGigabits && format == FORMAT_XML ) sb.safePrintf("\t\n"); if ( numGigabits && format == FORMAT_JSON ) sb.safePrintf("\"gigabits\":{\n"); if ( numGigabits && format == FORMAT_HTML ) // gigabit unhide function sb.safePrintf ( "\n" ); if ( numGigabits && format == FORMAT_HTML ) sb.safePrintf("
" "" "
" "
" "
" "
" ); Query gigabitQuery; SafeBuf ttt; // limit it to 40 gigabits for now for ( int32_t i = 0 ; i < numGigabits && i < 40 ; i++ ) { Gigabit *gi = &gigabits[i]; ttt.pushChar('\"'); ttt.safeMemcpy(gi->m_term,gi->m_termLen); ttt.pushChar('\"'); ttt.pushChar(' '); } // term on it ttt.nullTerm(); if ( numGigabits > 0 ) gigabitQuery.set2 ( ttt.getBufStart() , si->m_queryLangId , true , // queryexpansion? true ); // usestopwords? // log("results: gigabitquery=%s landid=%"INT32"" // ,ttt.getBufStart() // ,si->m_queryLangId); for ( int32_t i = 0 ; i < numGigabits ; i++ ) { //if ( i > 0 && format == FORMAT_HTML ) // sb.safePrintf("
"); //if ( perRow && (i % perRow == 0) ) // sb.safePrintf("
"); // print all sentences containing this gigabit Gigabit *gi = &gigabits[i]; // after the first 3 hide them with a more link if ( i == 1 && format == FORMAT_HTML ) { sb.safePrintf("" "Show more"); sb.safePrintf("" "

"); } //printGigabit ( st,sb , msg40 , gi , si ); //sb.safePrintf("
"); printGigabitContainingSentences(st,&sb,msg40,gi,si, &gigabitQuery, i); if ( format == FORMAT_HTML ) sb.safePrintf("

"); } //if ( numGigabits >= 1 && format == FORMAT_HTML ) if ( numGigabits && format == FORMAT_HTML ) sb.safePrintf("

"); if ( numGigabits && format == FORMAT_XML ) sb.safePrintf("\t
\n"); if ( numGigabits && format == FORMAT_JSON ) { // remove ,\n sb.m_length -=2; // add back just \n sb.safePrintf("\n},\n"); } // // now print various knobs // // // print sort by date options // /* if ( format == FORMAT_HTML ) sb.safePrintf( "
" "" "SEARCH TOOLS    " "" "
" "
" */ /* "
" "" "NEWSET FIRST    " "" "
" "
" "
" "" "OLDEST FIRST    " "" "
" "
" */ // // print date constraint functions now // if ( format == FORMAT_HTML && 1 == 2) sb.safePrintf( "
" "" "ANYTIME    " "" "
" "
" "
" "" "LAST 24 HOURS    " "" "
" "
" "
" "" "LAST 7 DAYS    " "" "
" "
" "
" "" "LAST 30 DAYS    " "" "
" "
" ); // // now the MAIN column // if ( format == FORMAT_HTML ) sb.safePrintf("\n
\n"); return true; } bool printSearchResultsHeader ( State0 *st ) { SearchInput *si = &st->m_si; // grab the query Msg40 *msg40 = &(st->m_msg40); char *q = msg40->getQuery(); int32_t qlen = msg40->getQueryLen(); //char local[ 128000 ]; //SafeBuf sb(local, 128000); SafeBuf *sb = &st->m_sb; // reserve 1.5MB now! if ( ! sb->reserve(1500000 ,"pgresbuf" ) ) // 128000) ) return false; // just in case it is empty, make it null terminated sb->nullTerm(); // print first [ for json if ( si->m_format == FORMAT_JSON ) { if ( st->m_header ) sb->safePrintf("{\n"); else sb->safePrintf("[\n"); } CollectionRec *cr = si->m_cr; HttpRequest *hr = &st->m_hr; // if there's a ton of sites use the post method otherwise // they won't fit into the http request, the browser will reject // sending such a large request with "GET" char *method = "GET"; if ( si->m_sites && gbstrlen(si->m_sites)>800 ) method = "POST"; if ( si->m_format == FORMAT_HTML && cr->m_htmlHead.length() ) { return expandHtml ( *sb , cr->m_htmlHead.getBufStart(), cr->m_htmlHead.length(), q, qlen, hr, si, method, cr); } // . if not matt wells we do not do ajax // . the ajax is just there to prevent bots from slamming me // with queries. if ( ! g_conf.m_isMattWells && si->m_format == FORMAT_HTML ) { printCSSHead ( sb ,si->m_format ); sb->safePrintf(""); } if ( ! g_conf.m_isMattWells && si->m_format==FORMAT_WIDGET_IFRAME ) { printCSSHead ( sb ,si->m_format ); sb->safePrintf(""); } if ( si->m_format == FORMAT_WIDGET_IFRAME ) { int32_t refresh = hr->getLong("refresh",0); if ( refresh ) sb->safePrintf("",refresh); } // lead with user's widget header which usually has custom style tags if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) { char *header = hr->getString("header",NULL); if ( header ) sb->safeStrcpy ( header ); } // this also prints gigabits and nuggabits // if we are xml/json we call this below otherwise we lose // the header of or whatever if ( ! 
g_conf.m_isMattWells && si->m_format == FORMAT_HTML ) { printLeftNavColumn ( *sb,st ); } if ( ! g_conf.m_isMattWells && si->m_format == FORMAT_HTML ) { printLogoAndSearchBox ( sb,&st->m_hr,-1,si); // catId = -1 } // the calling function checked this so it should be non-null char *coll = cr->m_coll; int32_t collLen = gbstrlen(coll); if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) { char *pos = "relative"; if ( si->m_format == FORMAT_WIDGET_IFRAME ) pos = "absolute"; int32_t widgetwidth = hr->getLong("widgetwidth",150); int32_t widgetHeight = hr->getLong("widgetheight",400); //int32_t iconWidth = 25; // put image in this div which will have top:0px JUST like // the div holding the search results we print out below // so that the image does not scroll when you use the // scrollbar. holds the magifying glass img and searchbox. sb->safePrintf("
"); //int32_t refresh = hr->getLong("refresh",15); char *oq = hr->getString("q",NULL); if ( ! oq ) oq = ""; char *prepend = hr->getString("prepend"); if ( ! prepend ) prepend = ""; char *displayStr = "none"; if ( prepend && prepend[0] ) displayStr = ""; // to do a search we need to re-call the ajax, // just call reload like the one that is called every 15s or so sb->safePrintf("
"); sb->safePrintf("" ); //char *origq = hr->getString("q"); // we sort all results by spider date now so PREPEND // the actual user query char *origq = hr->getString("prepend"); if ( ! origq ) origq = ""; sb->safePrintf("
" // the box that holds the query "" , displayStr , widgetwidth / 23 , origq ); sb->safePrintf("
" "
\n" ); // . BEGIN SERP DIV // . div to hold the search results // . this will have the scrollbar to just scroll the serps // and not the magnifying glass sb->safePrintf("
" "
" , widgetwidth , widgetHeight); } // xml if ( si->m_format == FORMAT_XML ) sb->safePrintf("\n" "\n" ); int64_t nowMS = gettimeofdayInMillisecondsLocal(); // show current time if ( si->m_format == FORMAT_XML ) { int64_t globalNowMS = localToGlobalTimeMilliseconds(nowMS); sb->safePrintf("\t%"UINT32"\n", (uint32_t)(globalNowMS/1000)); } else if ( st->m_header && si->m_format == FORMAT_JSON ) { int64_t globalNowMS = localToGlobalTimeMilliseconds(nowMS); sb->safePrintf("\"currentTimeUTC\":%"UINT32",\n", (uint32_t)(globalNowMS/1000)); } // show response time if not doing Quality Assurance if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t%"INT64"\n", st->m_took); else if ( st->m_header && si->m_format == FORMAT_JSON ) sb->safePrintf("\"responseTimeMS\":%"INT64",\n", st->m_took); // out of memory allocating msg20s? if ( st->m_errno ) { log("query: Query failed. Had error processing query: %s", mstrerror(st->m_errno)); g_errno = st->m_errno; //return sendReply(st,sb->getBufStart()); return false; } if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t%"INT32"" "\n", msg40->m_omitCount); if ( st->m_header && si->m_format == FORMAT_JSON ) sb->safePrintf("\"numResultsOmitted\":%"INT32",\n", msg40->m_omitCount); //bool xml = si->m_xml; // if they are doing a search in dmoz, catId will be > 0. //if ( si->m_directCatId >= 0 ) { // printDMOZCrumb ( sb , si->m_directCatId , xml ); //} /////////// // // show DMOZ subcategories if doing either a // "gbpcatid: |" (Search restricted to category) // "gbcatid:" (DMOZ urls in that topic) // // The search gbcatid: results should be sorted by siterank i guess // since it is only search a single term: gbcatid: so we can // put our stars back onto that and should be sorted by them. // /////////// /* if ( si->m_catId >= 0 ) { // print the subtopcis in this topic. 
show as links above // the search results printDMOZSubTopics ( sb, si->m_catId , xml );//st, xml ); // ok, for now just print the dmoz topics since our search // results will be empty... until populated! //g_categories->printUrlsInTopic ( &sb , si->m_catId ); } */ // save how many docs are in this collection int64_t docsInColl = -1; //RdbBase *base = getRdbBase ( RDB_CHECKSUMDB , si->m_coll ); RdbBase *base = getRdbBase ( (uint8_t)RDB_CLUSTERDB , st->m_collnum ); //if ( base ) docsInColl = base->getNumGlobalRecs(); //docsInColl = g_hostdb.getNumGlobalRecs ( ); // estimate it if ( base ) docsInColl = base->getNumGlobalRecs(); // multiply by # of *unique* shards // no because it already does this i think //docsInColl *= g_hostdb.getNumShards(); // include number of docs in the collection corpus if ( docsInColl >= 0LL ) { if ( si->m_format == FORMAT_XML) sb->safePrintf ( "\t%"INT64"" "\n", docsInColl ); else if ( st->m_header && si->m_format == FORMAT_JSON) sb->safePrintf("\"docsInCollection\":%"INT64",\n", docsInColl); } int32_t numResults = msg40->getNumResults(); bool moreFollow = msg40->moreResultsFollow(); // an estimate of the # of total hits int64_t totalHits = msg40->getNumTotalHits(); // only adjust upwards for first page now so it doesn't keep chaning if ( totalHits < numResults ) totalHits = numResults; if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t%"INT64"\n",(int64_t)totalHits); else if ( st->m_header && si->m_format == FORMAT_JSON ) sb->safePrintf("\"hits\":%"INT64",\n", (int64_t)totalHits); // if streaming results we just don't know if we will require // a "Next 10" link or not! we can print that after we print out // the results i guess... if ( ! si->m_streamResults ) { if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t%"INT32"" "\n" ,(int32_t)moreFollow); else if ( st->m_header && si->m_format == FORMAT_JSON ) sb->safePrintf("\"moreResultsFollow\":%"INT32",\n", (int32_t)moreFollow); } // . did he get a spelling recommendation? // . 
do not use htmlEncode() on this anymore since receiver // of the XML feed usually does not want that. if ( si->m_format == FORMAT_XML && st->m_spell[0] ) { sb->safePrintf ("\tsafeStrcpy(st->m_spell); sb->safePrintf ("]]>\n"); } if ( si->m_format == FORMAT_JSON && st->m_spell[0] ) { sb->safePrintf ("\t\"spell\":\""); sb->jsonEncode(st->m_spell); sb->safePrintf ("\"\n,"); } // print individual query term info if ( si->m_format == FORMAT_XML ) { Query *q = &si->m_q; sb->safePrintf("\t\n"); sb->safePrintf("\t\tcdataEncode(q->m_orig); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t" "" "\n" , getLangAbbr(si->m_queryLangId) ); sb->safePrintf("\t\t" "" "\n" , getLanguageString(si->m_queryLangId) ); for ( int i = 0 ; i < q->m_numTerms ; i++ ) { sb->safePrintf("\t\t\n"); QueryTerm *qt = &q->m_qterms[i]; sb->safePrintf("\t\t\t%i\n",i); char *term = qt->m_term; char c = term[qt->m_termLen]; term[qt->m_termLen] = '\0'; sb->safePrintf("\t\t\tm_term; if ( is_wspace_a(term[0])) printTerm++; sb->cdataEncode(printTerm); sb->safePrintf("]]>" "\n"); term[qt->m_termLen] = c; // syn? QueryTerm *sq = qt->m_synonymOf; // what language did synonym come from? if ( sq ) { // language map from wiktionary sb->safePrintf("\t\t\t" "m_langIdBits&bit))continue; char *str = getLangAbbr(i); if ( ! 
first ) sb->pushChar(','); first = false; sb->safeStrcpy ( str ); } sb->safePrintf("]]>\n"); } if ( sq ) { char *term = sq->m_term; char c = term[sq->m_termLen]; term[sq->m_termLen] = '\0'; char *printTerm = term; if ( is_wspace_a(term[0])) printTerm++; sb->safePrintf("\t\t\t" "" "\n" ,printTerm); term[sq->m_termLen] = c; } int64_t tf = msg40->m_msg3a.m_termFreqs[i]; sb->safePrintf("\t\t\t%"INT64"\n" ,tf); sb->safePrintf("\t\t\t%"INT64"\n" ,qt->m_termId); sb->safePrintf("\t\t\t%"UINT64"\n" ,qt->m_rawTermId); QueryWord *qw = qt->m_qword; sb->safePrintf("\t\t\t%"UINT64"\n" ,qw->m_prefixHash); sb->safePrintf("\t\t\n"); } sb->safePrintf("\t\n"); } // print individual query term info if ( si->m_format == FORMAT_JSON && st->m_header ) { Query *q = &si->m_q; sb->safePrintf("\"queryInfo\":{\n"); sb->safePrintf("\t\"fullQuery\":\""); sb->jsonEncode(q->m_orig); sb->safePrintf("\",\n"); sb->safePrintf("\t\"queryLanguageAbbr\":\""); sb->jsonEncode ( getLangAbbr(si->m_queryLangId) ); sb->safePrintf("\",\n"); sb->safePrintf("\t\"queryLanguage\":\""); sb->jsonEncode ( getLanguageString(si->m_queryLangId) ); sb->safePrintf("\",\n"); sb->safePrintf("\t\"terms\":[\n"); for ( int i = 0 ; i < q->m_numTerms ; i++ ) { sb->safePrintf("\t\t{\n"); QueryTerm *qt = &q->m_qterms[i]; sb->safePrintf("\t\t\"termNum\":%i,\n",i); char *term = qt->m_term; char c = term[qt->m_termLen]; term[qt->m_termLen] = '\0'; sb->safePrintf("\t\t\"termStr\":\""); sb->jsonEncode (qt->m_term); sb->safePrintf("\",\n"); term[qt->m_termLen] = c; // syn? QueryTerm *sq = qt->m_synonymOf; // what language did synonym come from? if ( sq ) { // language map from wiktionary sb->safePrintf("\t\t\"termLang\":\""); bool first = true; for ( int i = 0 ; i < langLast ; i++ ) { uint64_t bit = (uint64_t)1 << i; if ( ! (qt->m_langIdBits&bit))continue; char *str = getLangAbbr(i); if ( ! 
first ) sb->pushChar(','); first = false; sb->jsonEncode ( str ); } sb->safePrintf("\",\n"); } if ( sq ) { char *term = sq->m_term; char c = term[sq->m_termLen]; term[sq->m_termLen] = '\0'; sb->safePrintf("\t\t\"synonymOf\":\""); sb->jsonEncode(sq->m_term); sb->safePrintf("\",\n"); term[sq->m_termLen] = c; } int64_t tf = msg40->m_msg3a.m_termFreqs[i]; sb->safePrintf("\t\t\"termFreq\":%"INT64",\n" ,tf); sb->safePrintf("\t\t\"termId48\":%"INT64",\n" ,qt->m_termId); sb->safePrintf("\t\t\"termId64\":%"UINT64",\n" ,qt->m_rawTermId); // don't end last query term attr on a omma QueryWord *qw = qt->m_qword; sb->safePrintf("\t\t\"prefixHash64\":%"UINT64"\n" ,qw->m_prefixHash); sb->safePrintf("\t}"); if ( i + 1 < q->m_numTerms ) sb->pushChar(','); sb->pushChar('\n'); } sb->safePrintf("\t]\n"); // end "terms":[] sb->safePrintf("},\n"); } // when streaming results we lookup the facets last if ( si->m_format != FORMAT_HTML && ! si->m_streamResults ) msg40->printFacetTables ( sb ); // now print gigabits if we are xml/json if ( si->m_format != FORMAT_HTML ) { // this will print gigabits printLeftNavColumn ( *sb,st ); } // global-index is not a custom crawl but we should use "objects" bool isDiffbot = cr->m_isCustomCrawl; if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) isDiffbot = true; // for diffbot collections only... if ( st->m_header && si->m_format == FORMAT_JSON && isDiffbot ) { sb->safePrintf("\"objects\":[\n"); return true; } if ( si->m_format == FORMAT_JSON && ! cr->m_isCustomCrawl ) { sb->safePrintf("\"results\":[\n"); return true; } // debug if ( si->m_debug ) logf(LOG_DEBUG,"query: Displaying up to %"INT32" results.", numResults); // tell browser again //if ( si->m_format == FORMAT_HTML ) // sb->safePrintf("\n"); // get some result info from msg40 int32_t firstNum = msg40->getFirstResultNum() ; // numResults may be more than we requested now! int32_t n = msg40->getDocsWanted(); if ( n > numResults ) n = numResults; // . make the query class here for highlighting // . 
keepAllSingles means to convert all individual words into // QueryTerms even if they're in quotes or in a connection (cd-rom). // we use this for highlighting purposes Query qq; qq.set2 ( si->m_displayQuery, langUnknown , si->m_queryExpansion ); // si->m_boolFlag, // true ); // keepAllSingles? if ( g_errno ) return false;//sendReply (st,NULL); DocIdScore *dpx = NULL; if ( numResults > 0 ) dpx = msg40->getScoreInfo(0); if ( si->m_format == FORMAT_XML && dpx ) { // # query terms used! //int32_t nr = dpx->m_numRequiredTerms; float max = 0.0; // max pairwise float lw = getHashGroupWeight(HASHGROUP_INLINKTEXT); // square that location weight lw *= lw; // assume its an inlinker's text, who has rank 15!!! lw *= getLinkerWeight(MAXSITERANK); // double loops /* for ( int32_t i = 0 ; i< nr ; i++ ) { SingleScore *ssi = &dpx->m_singleScores[i]; float tfwi = getTermFreqWeight(ssi->m_listSize); for ( int32_t j = i+1; j< nr ; j++ ) { SingleScore *ssj = &dpx->m_singleScores[j]; float tfwj =getTermFreqWeight(ssj->m_listSize); max += (lw * tfwi * tfwj)/3.0; } } */ // single weights float maxtfw1 = 0.0; int32_t maxi1; // now we can have multiple SingleScores for the same term! // because we take the top MAX_TOP now and add them to // get the term's final score. for ( int32_t i = 0 ; i< dpx->m_numSingles ; i++ ) { SingleScore *ssi = &dpx->m_singleScores[i]; float tfwi = ssi->m_tfWeight; if ( tfwi <= maxtfw1 ) continue; maxtfw1 = tfwi; maxi1 = i; } float maxtfw2 = 0.0; int32_t maxi2; for ( int32_t i = 0 ; i< dpx->m_numSingles ; i++ ) { if ( i == maxi1 ) continue; SingleScore *ssi = &dpx->m_singleScores[i]; float tfwi = ssi->m_tfWeight; if ( tfwi <= maxtfw2 ) continue; maxtfw2 = tfwi; maxi2 = i; } // only 1 term? 
if ( maxtfw2 == 0.0 ) maxtfw2 = maxtfw1; // best term freqs max *= maxtfw1 * maxtfw2; // site rank effect max *= MAXSITERANK/SITERANKDIVISOR + 1; sb->safePrintf ("\t\t%f" "\n", max ); } // debug msg log ( LOG_TIMING , "query: Got %"INT32" search results in %"INT64" ms for q=%s", numResults,gettimeofdayInMilliseconds()-st->m_startTime, qq.getQuery()); //Highlight h; st->m_qe[0] = '\0'; // encode query buf //char qe[MAX_QUERY_LEN+1]; char *dq = si->m_displayQuery; //int32_t dqlen = si->m_displayQueryLen; if ( dq ) urlEncode(st->m_qe,MAX_QUERY_LEN*2,dq,gbstrlen(dq)); // how many results were requested? //int32_t docsWanted = msg40->getDocsWanted(); // store html head into p, but stop at %q //char *head = cr->m_htmlHead; //int32_t hlen = cr->m_htmlHeadLen; //if ( ! si->m_xml ) sb->safeMemcpy ( head , hlen ); // ignore imcomplete or invalid multibyte or wide characters errors //if ( g_errno == EILSEQ ) { // log("query: Query error: %s. Ignoring.", mstrerror(g_errno)); // g_errno = 0; //} // secret search backdoor if ( qlen == 7 && q[0]=='3' && q[1]=='b' && q[2]=='Y' && q[3]=='6' && q[4]=='u' && q[5]=='2' && q[6]=='Z' ) { sb->safePrintf ( "
You owe me!

" ); } // print it with commas into "thbuf" and null terminate it char thbuf[64]; ulltoa ( thbuf , totalHits ); char inbuf[128]; ulltoa ( inbuf , docsInColl ); Query qq3; Query *qq2; bool firstIgnored; //bool isAdmin = si->m_isMasterAdmin; bool isAdmin = (si->m_isMasterAdmin || si->m_isCollAdmin); if ( si->m_format != FORMAT_HTML ) isAdmin = false; // otherwise, we had no error if ( numResults == 0 && si->m_format == FORMAT_HTML ) { sb->safePrintf ( "No results found in %s collection.", cr->m_coll); } // the token is currently in the collection name so do not show that else if ( numResults == 0 && ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) ) { sb->safePrintf ( "No results found. Wait for spider to " "kick in."); } else if ( moreFollow && si->m_format == FORMAT_HTML ) { if ( isAdmin && si->m_docsToScanForReranking > 1 ) sb->safePrintf ( "PQR'd " ); sb->safePrintf ("Results %"INT32" to %"INT32" of " "exactly %s from an index " "of about %s pages" , firstNum + 1 , firstNum + n , thbuf , inbuf ); } // otherwise, we didn't get enough results to show this page else if ( si->m_format == FORMAT_HTML ) { if ( isAdmin && si->m_docsToScanForReranking > 1 ) sb->safePrintf ( "PQR'd " ); sb->safePrintf ("Results %"INT32" to %"INT32" of " "exactly %s from an index " "of about %s pages" , firstNum + 1 , firstNum + n , thbuf , inbuf ); } if ( si->m_format == FORMAT_HTML ) sb->safePrintf(" in %.02f seconds",((float)st->m_took)/1000.0); // // if query was a url print add url msg // char *url = NULL; if ( !strncmp(q,"url:" ,4) && qlen > 4 ) url = q+4; if ( !strncmp(q,"http://" ,7) && qlen > 7 ) url = q; if ( !strncmp(q,"https://",8) && qlen > 8 ) url = q; if ( !strncmp(q,"www." ,4) && qlen > 4 ) url = q; // find end of url char *ue = url; for ( ; ue && *ue && ! is_wspace_a(*ue) ; ue++ ) ; if ( numResults == 0 && si->m_format == FORMAT_HTML && url ) { sb->safePrintf("

" "Could not find that url in the " "index. Try urlEncode(url,ue-url,false,false); sb->safePrintf(">Adding it."); } // sometimes ppl search for "www.whatever.com" so ask them if they // want to search for url:www.whatever.com if ( numResults > 0 && si->m_format == FORMAT_HTML && url && url ==q){ sb->safePrintf("

" "Did you mean to " "search for the url " "urlEncode(url,ue-url,false,false); sb->safePrintf(">"); sb->safeMemcpy(url,ue-url); sb->safePrintf(" itself?"); } // is it the main collection? bool isMain = false; if ( collLen == 4 && strncmp ( coll, "main", 4) == 0 ) isMain = true; // print "in collection ***" if we had a collection if (collLen>0 && numResults>0 && !isMain && si->m_format==FORMAT_HTML ) sb->safePrintf (" in collection %s",coll); //char *pwd = si->m_pwd; //if ( ! pwd ) pwd = ""; /* if ( si->m_format == FORMAT_HTML ) sb->safePrintf("   " "[show scores]" " ", numResults ); */ /* // convenient admin link if ( isAdmin ) { sb->safePrintf("   " "" "" "admin" "",coll); // print reindex link // get the filename directly char *langStr = si->m_defaultSortLang; if ( numResults>0 ) sb->safePrintf ("   " "" "" "respider these results" "" " ",coll, langStr , st->m_qe ); sb->safePrintf ("   " "" "" "scrape google/bing" " ", coll , st->m_qe ); sb->safePrintf ("   " "" "" "show banned results" " ", coll , langStr , st->m_qe ); sb->safePrintf ("   " "" "api" , coll ); sb->safePrintf ("   " "" "" "xml" " ", coll , langStr , st->m_qe ); sb->safePrintf ("   " "" "" "json" " ", coll , langStr , st->m_qe ); sb->safePrintf ("   " "" "" "hide admin links" " ", coll , langStr , st->m_qe ); } // if its an ip: or site: query, print ban link if ( isAdmin && strncmp(si->m_displayQuery,"ip:",3)==0) { // get the ip char *ips = si->m_displayQuery + 3; // copy to buf, append a ".0" if we need to char buf [ 32 ]; int32_t i ; int32_t np = 0; for ( i = 0 ; i<29 && (is_digit(ips[i])||ips[i]=='.'); i++ ){ if ( ips[i] == '.' 
) np++; buf[i]=ips[i]; } // if not enough periods bail if ( np <= 1 ) goto skip2; if ( np == 2 ) { buf[i++]='.'; buf[i++]='0'; } buf[i] = '\0'; // search ip back or forward int32_t ip = atoip(buf,i); sb->safePrintf ("  " "" "[prev %s]" , iptoa(ip-0x01000000),coll,docsWanted, iptoa(ip-0x01000000)); sb->safePrintf ("  " "" "[next %s]" , iptoa(ip+0x01000000),coll,docsWanted, iptoa(ip+0x01000000)); } // if its an ip: or site: query, print ban link if ( isAdmin && strncmp(si->m_displayQuery,"site:",5)==0) { // get the ip char *start = si->m_displayQuery + 5; char *sp = start; while ( *sp && ! is_wspace_a(*sp) ) sp++; char c = *sp; // get the filename directly sb->safePrintf ("   " "" "" "[ban %s]" " ",coll , start ); *sp = c; } if ( isAdmin && strncmp(si->m_displayQuery,"gbad:",5)==0) { // get the ip char *start = si->m_displayQuery + 5; char *sp = start; while ( *sp && ! is_wspace_a(*sp) ) sp++; char c = *sp; *sp = '\0'; sb->safePrintf ("   " "" "" "[ban %s]" " ", coll , start , start ); *sp = c; } skip2: // cache switch for admin if ( isAdmin && msg40->getCachedTime() > 0 ) { // get the filename directly sb->safePrintf("   " "" "safePrintf("&q=%s&rcache=0&seq=0&rtq=0\">" "[cache off]" " ", st->m_qe ); } */ // mention ignored query terms // we need to set another Query with "keepAllSingles" set to false qq2 = &si->m_q; //qq2.set ( q , qlen , NULL , 0 , si->m_boolFlag , false ); firstIgnored = true; for ( int32_t i = 0 ; i < qq2->m_numWords ; i++ ) { //if ( si->m_xml ) break; QueryWord *qw = &qq2->m_qwords[i]; // only print out words ignored cuz they were stop words if ( qw->m_ignoreWord != IGNORE_QSTOP ) continue; // print header -- we got one if ( firstIgnored ) { if ( si->m_format == FORMAT_XML ) sb->safePrintf ("\tm_format == FORMAT_HTML ) sb->safePrintf ("   The " "following query words " "were ignored: " ""); firstIgnored = false; } // print the word char *t = qw->m_word; int32_t tlen = qw->m_wordLen; sb->utf8Encode2 ( t , tlen ); sb->safePrintf (" "); } // print 
tail if we had ignored terms if ( ! firstIgnored ) { sb->incrementLength(-1); if ( si->m_format == FORMAT_XML ) sb->safePrintf("]]>\n"); else if ( si->m_format == FORMAT_HTML ) sb->safePrintf (". Preceed each with a '+' or " "wrap in " "quotes to not ignore."); } if ( si->m_format == FORMAT_HTML ) sb->safePrintf("

"); if ( si->m_format == FORMAT_HTML ) sb->safePrintf("" "" // ,ps->m_finalScore // ); // . print out the breakout tables then // . they should pop-up when the user // mouses over a cell in the distance matrix //sb->safePrintf("
"); // two pane table //if ( si->m_format == FORMAT_HTML ) // sb->safePrintf(""); // did we get a spelling recommendation? if ( si->m_format == FORMAT_HTML && st->m_spell[0] ) { // encode the spelling recommendation int32_t len = gbstrlen ( st->m_spell ); char qe2[MAX_FRAG_SIZE]; urlEncode(qe2, MAX_FRAG_SIZE, st->m_spell, len); sb->safePrintf ("Did you mean:" " " "safePrintf ("\">"); sb->utf8Encode2(st->m_spell, len); // then finish it off sb->safePrintf ("\n

\n"); } // . Wrap results in a table if we are using ads. Easier to display. //Ads *ads = &st->m_ads; //if ( ads->hasAds() ) // sb->safePrintf("\n" // "
\n"); // debug if ( si->m_debug ) logf(LOG_DEBUG,"query: Printing up to %"INT32" results. " "bufStart=0x%"PTRFMT"", numResults, (PTRTYPE)sb->getBuf()); // // BEGIN PRINT THE RESULTS // //sb->safePrintf(""); //sb->safePrintf(""); /* sb->safePrintf( "\n" "
CLICK ME
\n" ); */ /* if ( si->m_format == FORMAT_HTML ) sb->safePrintf("incrementLength(-2); //} if ( si->m_format == FORMAT_JSON ) { // remove last },\n if there and replace with just \n char *e = sb->getBuf() - 2; if ( sb->length()>=2 && e[0]==',' && e[1]=='\n') { sb->m_length -= 2; sb->safePrintf("\n"); } // print ending ] for json sb->safePrintf("]\n"); // when streaming results we lookup the facets last if ( si->m_streamResults ) msg40->printFacetTables ( sb ); if ( st->m_header ) sb->safePrintf("}\n"); // all done for json return true; } // grab the query char *q = msg40->getQuery(); int32_t qlen = msg40->getQueryLen(); HttpRequest *hr = &st->m_hr; // get some result info from msg40 int32_t firstNum = msg40->getFirstResultNum() ; // end the two-pane table if ( si->m_format == FORMAT_HTML) sb->safePrintf("
"); // for storing a list of all of the sites we displayed, now we print a // link at the bottom of the page to ban all of the sites displayed // with one click SafeBuf banSites; //int32_t tailLen = 0; //char *tail = NULL; // // PRINT PREV 10 NEXT 10 links! // // center everything below here if ( si->m_format == FORMAT_HTML ) sb->safePrintf ( "
" ); int32_t remember = sb->length(); // now print "Prev X Results" if we need to if ( firstNum < 0 ) firstNum = 0; char abuf[300]; SafeBuf args(abuf,300); // show banned? if ( si->m_showBanned && ! si->m_isMasterAdmin ) args.safePrintf("&sb=1"); if ( ! si->m_showBanned && si->m_isMasterAdmin ) args.safePrintf("&sb=0"); //HttpRequest *hr = &st->m_hr; // collection args.safePrintf("&c=%s",coll); // formatting info if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) { args.safePrintf("&format=widget"); int32_t widgetwidth = hr->getLong("widgetwidth",250); args.safePrintf("&widgetwidth=%"INT32"",widgetwidth); } // carry over the sites we are restricting the search results to if ( si->m_sites ) //whiteListBuf.getBufStart()); args.safePrintf("&sites=%s",si->m_sites); if ( si->m_format == FORMAT_HTML && msg40->m_omitCount ) { // && firstNum == 0 ) { // . add our cgi to the original url // . so if it has &qlang=de and they select &qlang=en // we have to replace it... etc. SafeBuf newUrl; // show banned results replaceParm2 ("sb=1", &newUrl, hr->m_origUrlRequest, hr->m_origUrlRequestLen ); // no deduping by summary or content hash etc. SafeBuf newUrl2; replaceParm2("dr=0",&newUrl2,newUrl.getBufStart(), newUrl.length()); // and no site clustering SafeBuf newUrl3; replaceParm2 ( "sc=0", &newUrl3 , newUrl2.getBufStart(), newUrl2.length()); // start at results #0 again SafeBuf newUrl4; replaceParm2 ( "s=0", &newUrl4 , newUrl3.getBufStart(), newUrl3.length()); // show errors SafeBuf newUrl5; replaceParm2 ( "showerrors=1", &newUrl5 , newUrl4.getBufStart(), newUrl4.length()); sb->safePrintf("
" "" "%"INT32" results were omitted because they " "were considered duplicates, banned, errors " "
" "or " "from the same site as other results. " "Click here to show all results." "
" "
" "

" , msg40->m_omitCount , newUrl5.getBufStart() ); } if ( firstNum > 0 && (si->m_format == FORMAT_HTML || si->m_format == FORMAT_WIDGET_IFRAME //|| //si->m_format == FORMAT_WIDGET_AJAX ) ) { int32_t ss = firstNum - msg40->getDocsWanted(); //sb->safePrintf("safeStrcpy ( st->m_qe ); // print other args if not zero //sb->safeMemcpy ( &args ); // make the cgi parm to add to the original url char nsbuf[128]; sprintf(nsbuf,"s=%"INT32"",ss); // get the original url and add/replace in &s=xxx SafeBuf newUrl; replaceParm ( nsbuf , &newUrl , hr ); // close it up sb->safePrintf ("" "Prev %"INT32" Results" "" , newUrl.getBufStart() , msg40->getDocsWanted() ); } // now print "Next X Results" if ( msg40->moreResultsFollow() && (si->m_format == FORMAT_HTML || si->m_format == FORMAT_WIDGET_IFRAME //si->m_format == FORMAT_WIDGET_AJAX )) { int32_t ss = firstNum + msg40->getDocsWanted(); // print a separator first if we had a prev results before us if ( sb->length() > remember ) sb->safePrintf ( "   " ); // add the query //sb->safePrintf ("safeStrcpy ( st->m_qe ); // print other args if not zero //sb->safeMemcpy ( &args ); // make the cgi parm to add to the original url char nsbuf[128]; sprintf(nsbuf,"s=%"INT32"",ss); // get the original url and add/replace in &s=xxx SafeBuf newUrl; replaceParm ( nsbuf , &newUrl , hr ); // close it up sb->safePrintf("" "Next %"INT32" Results" "" , newUrl.getBufStart() , msg40->getDocsWanted() ); } // print try this search on... // an additional
if we had a Next or Prev results link if ( sb->length() > remember && si->m_format == FORMAT_HTML ) sb->safeMemcpy ("
" , 4 ); // // END PRINT PREV 10 NEXT 10 links! // // end results table cell... and print calendar at top //tail = cr->m_htmlTail; //tailLen = gbstrlen (tail ); //if ( si->m_format == FORMAT_HTML ) sb->safeMemcpy ( tail , tailLen ); if ( si->m_format == FORMAT_HTML ) { /* sb->safePrintf("" "" "" "" ); sb->safePrintf("
" "" "htmlEncode ( si->m_sbuf1.getBufStart() , si->m_sbuf1.length() , false ); sb->safePrintf("\">" "" "
" "
"); */ sb->safePrintf("",coll); } bool isAdmin = (si->m_isMasterAdmin || si->m_isCollAdmin); if ( si->m_format != FORMAT_HTML ) isAdmin = false; if ( isAdmin && banSites.length() > 0 ) sb->safePrintf ("

" "
\n ", coll, banSites.getBufStart()); // TODO: print cache line in light gray here // TODO: "these results were cached X minutes ago" if ( msg40->getCachedTime() > 0 && si->m_format == FORMAT_HTML ) { sb->safePrintf("

" "
"); sb->safePrintf ( " These results were cached " ); // this cached time is this local cpu's time int32_t diff = getTime() - msg40->getCachedTime(); if ( diff < 60 ) sb->safePrintf ("%"INT32" seconds", diff ); else if ( diff < 2*60 ) sb->safePrintf ("1 minute"); else sb->safePrintf ("%"INT32" minutes",diff/60); sb->safePrintf ( " ago. [" "Info]"); sb->safePrintf ( "
"); } if ( si->m_format == FORMAT_XML ) { // when streaming results we lookup the facets last if ( si->m_streamResults ) msg40->printFacetTables ( sb ); sb->safePrintf("\n"); } if ( si->m_format == FORMAT_HTML && ! g_conf.m_isMattWells && cr->m_htmlTail.length() == 0 ) { sb->safePrintf ( "
" "
" "" "Copyright © 2014. All Rights " "Reserved.
" "Powered by the GigaBlast open source " "search engine." "
" "
\n" "
\n" ); } // if we did not use ajax, print this tail here now if ( si->m_format == FORMAT_HTML && ! g_conf.m_isMattWells ) { sb->safePrintf( "\n" "\n" ); } // ajax widgets will have this outside the downloaded content if ( si->m_format == FORMAT_WIDGET_IFRAME ) { sb->safePrintf ( "
" "
" "" // link to edit the list of widget sites // or various other widget content properties // because we can't edit the width/height // of the widget like this. "edit " "• " //"Copyright © 2014. All Rights " //"Reserved.
" "Powered by " "Diffbot." "
" "
\n" "\n" "\n" ); } if ( sb->length() == 0 && si && si->m_format == FORMAT_JSON ) sb->safePrintf("[]\n"); if ( sb->length() == 0 ) { sb->pushChar('\n'); sb->nullTerm(); } if ( si->m_format == FORMAT_HTML && cr->m_htmlTail.length() && ! expandHtml ( *sb , cr->m_htmlTail.getBufStart(), cr->m_htmlTail.length(), q, qlen, hr, si, NULL, // method, cr) ) return false; return true; } bool printTimeAgo ( SafeBuf *sb, time_t ts , char *prefix , SearchInput *si ) { // Jul 23, 1971 sb->reserve2x(200); int32_t now = getTimeGlobal(); // for printing int32_t mins = 1000; int32_t hrs = 1000; int32_t days ; if ( ts > 0 ) { mins = (int32_t)((now - ts)/60); hrs = (int32_t)((now - ts)/3600); days = (int32_t)((now - ts)/(3600*24)); if ( mins < 0 ) mins = 0; if ( hrs < 0 ) hrs = 0; if ( days < 0 ) days = 0; } // print the time ago if ( mins ==1) sb->safePrintf(" - %s: %"INT32" minute ago",prefix,mins); else if (mins<60) sb->safePrintf ( " - %s: %"INT32" minutes ago",prefix,mins); else if ( hrs == 1 ) sb->safePrintf ( " - %s: %"INT32" hour ago",prefix,hrs); else if ( hrs < 24 ) sb->safePrintf ( " - %s: %"INT32" hours ago",prefix,hrs); else if ( days == 1 ) sb->safePrintf ( " - %s: %"INT32" day ago",prefix,days); else if (days< 7 ) sb->safePrintf ( " - %s: %"INT32" days ago",prefix,days); // do not show if more than 1 wk old! we want to seem as // fresh as possible else if ( ts > 0 ) { // && si->m_isMasterAdmin ) { struct tm *timeStruct = localtime ( &ts ); sb->safePrintf(" - %s: ",prefix); char tmp[100]; strftime(tmp,100,"%b %d %Y",timeStruct); sb->safeStrcpy(tmp); } return true; } int linkSiteRankCmp (const void *v1, const void *v2) { Inlink *i1 = *(Inlink **)v1; Inlink *i2 = *(Inlink **)v2; return i2->m_siteRank - i1->m_siteRank; } bool printInlinkText ( SafeBuf *sb , Msg20Reply *mr , SearchInput *si , int32_t *numPrinted ) { *numPrinted = 0; // . show the "LinkInfo" // . 
Msg20.cpp will have "computed" the LinkInfo if we set // Msg20Request::m_computeLinkInfo to true, but if we set // Msg20Request::m_getLinkInfo to true it will just get it // from the TitleRec, which is much faster but more stale. // . "&inlinks=1" is slow and fresh, "&inlinks=2" is fast // and stale. Both are really only for BuzzLogic. LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks; // sanity if ( info && mr->size_linkInfo!=info->m_lisize ){char *xx=NULL;*xx=0; } // NULLify if empty if ( mr->size_linkInfo <= 0 ) info = NULL; // do not both if none if ( info && ! info->m_numStoredInlinks ) info = NULL; // bail? if ( ! info ) return true; // now sort them up Inlink *k = info->getNextInlink(NULL); // #define from Linkdb.h Inlink *ptrs[MAX_LINKERS]; int32_t numLinks = 0; for ( ; k ; k = info->getNextInlink(k) ) { ptrs[numLinks++] = k; if ( numLinks >= MAX_LINKERS ) break; } // sort them gbsort ( ptrs , numLinks , sizeof(Inlink *) , linkSiteRankCmp ); // print xml starter if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t\n"); // loop through the inlinks bool printedInlinkText = false; bool firstTime = true; int32_t inlinkId = 0; int64_t starttime = gettimeofdayInMillisecondsLocal(); //int32_t icount = 0; //int32_t ecount = 0; //int32_t absSum = 0; for ( int32_t i = 0 ; i < numLinks ; i++ ) { k = ptrs[i]; if ( ! k->getLinkText() ) continue; if ( ! si->m_doQueryHighlighting && si->m_format == FORMAT_HTML ) continue; char *str = k->getLinkText();//ptr_linkText; int32_t strLen = k->size_linkText; //char tt[1024*3]; //char *ttend = tt + 1024*3; char *frontTag = "" ; char *backTag = ""; if ( si->m_format == FORMAT_XML ) { frontTag = ""; backTag = ""; } if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX ) { frontTag = "" ; } Highlight hi; SafeBuf hb; int32_t hlen = hi.set ( &hb,//tt , //ttend - tt , str, strLen , mr->m_language, // docLangId &si->m_hqq , // highlight query CLASS false , // doStemming? false , // use click&scroll? 
NULL , // base url frontTag, backTag, 0, 0 ); // niceness if ( hlen <= 0 ) continue; // skip it if nothing highlighted if ( hi.getNumMatches() == 0 ) continue; if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t\tm_docId ); // encode it for xml sb->htmlEncode ( k->getUrl(),//ptr_urlBuf, k->size_urlBuf - 1 , false ); sb->safePrintf("\" " //"hostId=\"%"UINT32"\" " "firstindexed=\"%"UINT32"\" " // not accurate! //"lastspidered=\"%"UINT32"\" " "wordposstart=\"%"INT32"\" " "id=\"%"INT32"\" " "siterank=\"%"INT32"\" " "text=\"", //hh , //(int32_t)k->m_datedbDate, (uint32_t)k->m_firstIndexedDate, //(uint32_t)k->m_lastSpidered, (int32_t)k->m_wordPosStart, inlinkId, //linkScore); (int32_t)k->m_siteRank ); // HACK!!! k->m_siteHash = inlinkId; // inc it inlinkId++; // encode it for xml if ( !sb->htmlEncode ( hb.getBufStart(), hb.length(), false)) return false; sb->safePrintf("\"/>\n"); continue; } if ( firstTime ) { sb->safePrintf(""); sb->safePrintf("" "" "" "" "" "" "" ); } firstTime = false; sb->safePrintf("",(int32_t)k->m_siteRank); //sb->safePrintf("
"); printedInlinkText = true; *numPrinted = *numPrinted + 1; } int64_t took = gettimeofdayInMillisecondsLocal() - starttime; if ( took > 2 ) log("timing: took %"INT64" ms to highlight %"INT32" links." ,took,numLinks); // closer for xml if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t\n"); //if ( printedInlinkText ) sb->safePrintf("
\n"); if ( printedInlinkText ) sb->safePrintf("" "
" "
" "Inlinks with Query Terms" "
" "
Inlink TextFromSite Rank
" "" //"" //k->getUrl()); ,si->m_cr->m_coll ,k->m_docId); if ( ! sb->safeMemcpy(&hb) ) return false; int32_t hostLen = 0; char *host = getHostFast(k->getUrl(),&hostLen,NULL); sb->safePrintf(""); if ( host ) sb->safeMemcpy(host,hostLen); sb->safePrintf("%"INT32"
" "
"); return true; } // // . print a dmoz topic for the given numeric catid UNDER search result // . print "Search in Category" link as well // static bool printDMOZCategoryUnderResult ( SafeBuf *sb , SearchInput *si, int32_t catid , State0 *st ) { char format = si->m_format; // these are handled in the logic below now if ( format != FORMAT_HTML ) return true; // if ( format == FORMAT_XML ) { // sb->safePrintf("\t\t\n" // "\t\t\t%"INT32"\n" // "\t\t\tprintPathFromId(&xb, catid, false,si->m_isRTL); // sb->cdataEncode(xb.getBufStart()); // sb->safePrintf("]]>\n"); // sb->safePrintf("\t\t\n"); // return true; // } // if ( format == FORMAT_JSON ) { // sb->safePrintf("\t\t\"dmozCat\":{\n" // "\t\t\t\"dmozCatId\":%"INT32",\n" // "\t\t\t\"dmozCatStr\":\"" // ,catid); // // print the name of the dmoz category // char xbuf[256]; // SafeBuf xb(xbuf,256,0,false); // g_categories->printPathFromId(&xb, catid, false,si->m_isRTL); // sb->jsonEncode(xb.getBufStart()); // sb->safePrintf("\"\n" // "\t\t},\n"); // return true; // } //uint8_t queryLanguage = langUnknown; uint8_t queryLanguage = si->m_queryLangId; // Don't print category if not in native language category // Note that this only trims out "World" cats, not all // of them. Some of them may still sneak in. 
//if(si->m_langHint) // queryLanguage = si->m_langHint; if(queryLanguage != langUnknown) { char tmpbuf[1024]; SafeBuf langsb(tmpbuf, 1024); g_categories->printPathFromId(&langsb, catid, false); char *ptr = langsb.getBufStart(); uint8_t lang = g_langId.findLangFromDMOZTopic(ptr + 7); if(!strncmp("World: ", ptr, 6) && lang != langUnknown && lang != queryLanguage) // do not print it if not in our language return true; } ////// // // print a link to apply your query to this DMOZ category // ////// sb->safePrintf("urlEncode("|",1); sb->urlEncode(si->m_sbuf1.getBufStart(),si->m_sbuf1.length()); sb->safePrintf("\">Search in Category: "); // setup the host of the url //if ( dmozHost ) // sb->safePrintf("safePrintf("printPathFromId(sb, catid, true,si->m_isRTL); sb->safePrintf("/\">"); // print the name of the dmoz category sb->safePrintf(""); g_categories->printPathFromId(sb, catid, false,si->m_isRTL); sb->safePrintf("
"); //++tr.brCount; return true; } // use this for xml as well as html bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) { SafeBuf *sb = &st->m_sb; HttpRequest *hr = &st->m_hr; CollectionRec *cr = NULL; cr = g_collectiondb.getRec ( st->m_collnum ); if ( ! cr ) { log("query: printResult: collnum %"INT32" gone", (int32_t)st->m_collnum); return true; } // int16_tcuts SearchInput *si = &st->m_si; Msg40 *msg40 = &st->m_msg40; // ensure not all cluster levels are invisible if ( si->m_debug ) logf(LOG_DEBUG,"query: result #%"INT32" clusterlevel=%"INT32"", ix, (int32_t)msg40->getClusterLevel(ix)); int64_t d = msg40->getDocId(ix); // do not print if it is a summary dup or had some error // int32_t level = (int32_t)msg40->getClusterLevel(ix); // if ( level != CR_OK && // level != CR_INDENT ) // return true; if ( si->m_docIdsOnly ) { if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\n" "\t\t%"INT64"\n" "\t\n", d ); else if ( si->m_format == FORMAT_JSON ) sb->safePrintf("\t\{\n" "\t\t\"docId\":%"INT64"\n" "\t},\n", d ); else sb->safePrintf("%"INT64"
\n", d ); // inc it *numPrintedSoFar = *numPrintedSoFar + 1; return true; } Msg20 *m20 ; if ( si->m_streamResults ) m20 = msg40->getCompletedSummary(ix); else m20 = msg40->m_msg20[ix]; // get the reply Msg20Reply *mr = m20->m_r; // . sometimes the msg20reply is NULL so prevent it coring // . i think this happens if all hosts in a shard are down or timeout // or something if ( ! mr ) { sb->safePrintf("getting summary for docid %"INT64" had " "error: %s

" ,d,mstrerror(m20->m_errno)); return true; } // . if section voting info was request, display now, it's in json // . so if in csv it will mess things up!!! if ( mr->ptr_sectionVotingInfo ) // it is possible this is just "\0" sb->safeStrcpy ( mr->ptr_sectionVotingInfo ); // each "result" is the actual cached page, in this case, a json // object, because we were called with &icc=1. in that situation // ptr_content is set in the msg20reply. if ( si->m_format == FORMAT_CSV && mr->ptr_content && mr->m_contentType == CT_JSON ) { // parse it up char *json = mr->ptr_content; // only print header row once, so pass in that flag if ( ! st->m_printedHeaderRow ) { sb->reset(); printCSVHeaderRow ( sb , st ); st->m_printedHeaderRow = true; } printJsonItemInCSV ( json , sb , st ); // inc it *numPrintedSoFar = *numPrintedSoFar + 1; return true; } // just print cached web page? if ( mr->ptr_content && si->m_format == FORMAT_JSON && strstr(mr->ptr_ubuf,"-diffbotxyz") ) { // for json items separate with \n,\n if ( si->m_format != FORMAT_HTML && *numPrintedSoFar > 0 ) sb->safePrintf(",\n"); // a dud? just print empty {}'s if ( mr->size_content == 1 ) sb->safePrintf("{}"); // if it's a diffbot object just print it out directly // into the json. it is already json. else sb->safeStrcpy ( mr->ptr_content ); // . let's hack the spidertime onto the end // . so when we sort by that using gbsortby:spiderdate // we can ensure it is ordered correctly // As of the update on 5/13/2014, the end of sb may have whitespace, so first move away from that int distance; // distance from end to first non-whitespace char char *end; for (distance = 1; distance < sb->getLength(); distance++) { end = sb->getBuf() - distance; if (!is_wspace_a(*end)) break; } if ( si->m_format == FORMAT_JSON && end > sb->getBufStart() && *end == '}' ) { // replace trailing } with spidertime} sb->incrementLength(-distance); // comma? 
if ( mr->size_content>1 ) sb->pushChar(','); sb->safePrintf("\"docId\":%"INT64"", mr->m_docId); sb->safePrintf(",\"gburl\":\""); sb->jsonEncode(mr->ptr_ubuf); sb->safePrintf("\""); // for deduping //sb->safePrintf(",\"crc\":%"UINT32"",mr->m_contentHash32); // crap, we lose resolution storing as a float // so fix that shit here... //float f = mr->m_lastSpidered; //sb->safePrintf(",\"lastCrawlTimeUTC\":%.0f}",f); // MDW: this is VERY convenient for debugging pls // leave in. we can easily see if a result // should be there for a query like // gbmin:gbspiderdate:12345678 sb->safePrintf(",\"lastCrawlTimeUTC\":%"INT32"", mr->m_lastSpidered); // also include a timestamp field with an RFC 1123 formatted date char timestamp[50]; struct tm *ptm =gmtime((time_t *)&mr->m_lastSpidered ); strftime(timestamp, 50, "%a, %d %b %Y %X %Z", ptm); sb->safePrintf(",\"timestamp\":\"%s\"}\n", timestamp); } //mr->size_content ); if ( si->m_format == FORMAT_HTML ) sb->safePrintf("\n\n

\n\n"); // inc it *numPrintedSoFar = *numPrintedSoFar + 1; // just in case sb->nullTerm(); return true; } int32_t cursor = -1; if ( si->m_format == FORMAT_XML ) cursor = sb->length(); if ( si->m_format == FORMAT_JSON ) cursor = sb->length(); if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\n" ); if ( si->m_format == FORMAT_JSON ) { if ( *numPrintedSoFar != 0 ) sb->safePrintf(",\n"); sb->safePrintf("\t{\n" ); } if ( mr->ptr_content && si->m_format == FORMAT_XML ) { sb->safePrintf("\t\tcdataEncode ( mr->ptr_content ); sb->safePrintf("]]>\n"); } if ( mr->ptr_content && si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"content\":\"" ); sb->jsonEncode ( mr->ptr_content ); sb->safePrintf("\",\n"); } Highlight hi; // get the url char *url = mr->ptr_ubuf ; int32_t urlLen = mr->size_ubuf - 1 ; int32_t err = mr->m_errno ; // . remove any session ids from the url // . for speed reasons, only check if its a cgi url Url uu; uu.set ( url , urlLen, false, true ); url = uu.getUrl(); urlLen = uu.getUrlLen(); // get my site hash uint64_t siteHash = 0; if ( uu.getHostLen() > 0 ) siteHash = hash64(uu.getHost(),uu.getHostLen()); // indent it if level is 2 bool indent = false; bool isAdmin = (si->m_isMasterAdmin || si->m_isCollAdmin); if ( si->m_format == FORMAT_XML ) isAdmin = false; //uint64_t lastSiteHash = siteHash; if ( indent && si->m_format == FORMAT_HTML ) sb->safePrintf("
"); // print the rank. it starts at 0 so add 1 if ( si->m_format == FORMAT_HTML && si->m_streamResults ) //sb->safePrintf("
%"INT32".", // ix+1 ); sb->safePrintf("
"); else if ( si->m_format == FORMAT_HTML ) //sb->safePrintf("
%"INT32".", // ix+1 + si->m_firstResultNum ); sb->safePrintf("
"); if ( si->m_showBanned ) { if ( err == EDOCBANNED ) err = 0; if ( err == EDOCFILTERED ) err = 0; } // if this msg20 had an error print "had error" if ( err || urlLen <= 0 || ! url ) { // revert back so we do not break the json/xml if ( cursor >= 0 ) sb->m_length = cursor; // it's unprofessional to display this in browser // so just let admin see it if ( isAdmin && si->m_format == FORMAT_HTML ) { sb->safePrintf("docId %"INT64" had error: " "%s

", mr->m_docId,//msg40->getDocId(i), mstrerror(err)); } // log it too! log("query: docId %"INT64" had error: %s.", mr->m_docId,mstrerror(err)); // wrap it up if clustered if ( indent && si->m_format == FORMAT_HTML) sb->safeMemcpy("",13); // DO NOT inc it otherwise puts a comma in there and // screws up the json //*numPrintedSoFar = *numPrintedSoFar + 1; return true; } // the score if admin /* if ( isAdmin ) { int32_t level = (int32_t)msg40->getClusterLevel(ix); // print out score sb->safePrintf ( "s=%.03f " "docid=%"UINT64" " "sitenuminlinks=%"INT32"%% " "hop=%"INT32" " "cluster=%"INT32" " "summaryLang=%s " "(%s)
", (float)msg40->getScore(ix) , mr->m_docId, (int32_t )mr->m_siteNumInlinks, (int32_t)mr->m_hopcount, level , getLanguageString(mr->m_summaryLanguage), g_crStrings[level]); } */ char *diffbotSuffix = strstr(url,"-diffbotxyz"); // print youtube and metacafe thumbnails here // http://www.youtube.com/watch?v=auQbi_fkdGE // http://img.youtube.com/vi/auQbi_fkdGE/2.jpg // get the thumbnail url if ( mr->ptr_imgUrl && si->m_format == FORMAT_HTML && // if we got thumbnail use that not this ! mr->ptr_imgData ) sb->safePrintf ("", url,mr->ptr_imgUrl); // if we have a thumbnail show it next to the search result, // base64 encoded. do NOT do this for the WIDGET, only for search // results in html/xml. if ( (si->m_format == FORMAT_HTML || si->m_format == FORMAT_XML ) && //! mr->ptr_imgUrl && si->m_showImages && mr->ptr_imgData ) { ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData; ThumbnailInfo *ti = ta->getThumbnailInfo(0); if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t"); ti->printThumbnailInHtml ( sb , 100 , // max width 100 , // max height true , // add NULL , " style=\"margin:10px;\" ", si->m_format ); if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t%"INT32"\n", ti->m_dy); sb->safePrintf("\t\t%"INT32"\n", ti->m_dx); sb->safePrintf("\t\t%"INT32"" "\n", ti->m_origDY); sb->safePrintf("\t\t%"INT32"" "\n", ti->m_origDX); sb->safePrintf("\t\tcdataEncode(ti->getUrl()); sb->safePrintf("]]>\n"); } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"imageHeight\":%"INT32",\n", ti->m_dy); sb->safePrintf("\t\t\"imageWidth\":%"INT32",\n", ti->m_dx); sb->safePrintf("\t\t\"origImageHeight\":%"INT32",\n", ti->m_origDY); sb->safePrintf("\t\t\"origImageWidth\":%"INT32",\n", ti->m_origDX); sb->safePrintf("\t\t\"imageUrl\":\""); sb->jsonEncode(ti->getUrl()); sb->safePrintf("\",\n"); } } bool isWide = false; int32_t newdx = 0; // print image for widget if ( //mr->ptr_imgUrl && ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_AJAX || 
si->m_format == FORMAT_WIDGET_APPEND ) ) { int32_t widgetWidth = hr->getLong("widgetwidth",200); // prevent coring if ( widgetWidth < 1 ) widgetWidth = 1; // char *bg1 = "lightgray"; // char *bg2 = "white"; // char *bgcolor = bg1; // if ( (ix % 1) == 1 ) bgcolor = bg2; // each search result in widget has a div around it sb->safePrintf("
" , mr->m_docId // this is a double now. this won't work // for streaming... , msg40->m_msg3a.m_scores[ix] // subtract 8 for scrollbar on right , widgetWidth - 2*8 - 8 // padding is 8px , (int32_t)RESULT_HEIGHT , (int32_t)RESULT_HEIGHT , (int32_t)PADDING //, bgcolor ); // if ( mr->ptr_imgUrl ) // sb->safePrintf("background-repeat:no-repeat;" // "background-size:%"INT32"px 140px;" // "background-image:url('%s');" // , widgetwidth - 2*8 // padding is 8px // , mr->ptr_imgUrl); if ( mr->ptr_imgData ) { ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData; ThumbnailInfo *ti = ta->getThumbnailInfo(0); // account for scrollbar on the right int32_t maxWidth = widgetWidth - (int32_t)SCROLLBAR_WIDTH; int32_t maxHeight = (int32_t)RESULT_HEIGHT; // false = do not print link on image ti->printThumbnailInHtml ( sb , maxWidth , maxHeight , false , // add &newdx ); } // end the div style attribute and div tag //sb->safePrintf("\">"); sb->safePrintf ( " .5 * widgetWidth ) { isWide = true; sb->safePrintf("position:absolute;" "bottom:%"INT32";" "left:%"INT32";" , (int32_t) PADDING , (int32_t) PADDING ); } // to align the text verticall we gotta make a textbox div // otherwise it wraps below image! 
mdw //else // sb->safePrintf("vertical-align:middle;"); else sb->safePrintf("position:absolute;" "bottom:%"INT32";" "left:%"INT32";" , (int32_t) PADDING , (int32_t) PADDING + newdx + 10 ); // close the style and begin the url sb->safePrintf( "\" " "href=\"" ); // truncate off -diffbotxyz%"INT32" int32_t newLen = urlLen; if ( diffbotSuffix ) newLen = diffbotSuffix - url; // print the url in the href tag sb->safeMemcpy ( url , newLen ); // then finish the a href tag and start a bold for title sb->safePrintf ( "\">");//" ); sb->safePrintf(""); //sb->safePrintf ("", // mr->ptr_imgUrl); // then title over image } // only do link here if we have no thumbnail so no bg image if ( (si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) && ! mr->ptr_imgData ) { sb->safePrintf ( "safeMemcpy ( url , newLen ); // then finish the a href tag and start a bold for title sb->safePrintf ( ">");//" ); } // the a href tag if ( si->m_format == FORMAT_HTML ) sb->safePrintf ( "\n\n" ); // then if it is banned if ( mr->m_isBanned && si->m_format == FORMAT_HTML ) sb->safePrintf("BANNED "); /////// // // PRINT THE TITLE // /////// // the a href tag if ( si->m_format == FORMAT_HTML ) { sb->safePrintf ( "safeMemcpy ( url , newLen ); // then finish the a href tag and start a bold for title sb->safePrintf ( ">");//" ); } // . then the title (should be NULL terminated) // . the title can be NULL // . highlight it first // . the title itself should not have any tags in it! char *str = mr->ptr_tbuf;//msg40->getTitle(i); int32_t strLen = mr->size_tbuf - 1;// msg40->getTitleLen(i); if ( ! str || strLen < 0 ) strLen = 0; ///// // // are we printing a dmoz category page? // get the appropriate dmoz title/summary to use since the same // url can exist in multiple topics (catIds) with different // titles summaries. // ///// char *dmozSummary2 = NULL; // TODO: just get the catid from httprequest directly? 
if ( si->m_catId > 0 ) { // si->m_cat_dirId > 0) { // . get the dmoz title and summary // . if empty then just a bunch of \0s, except for catIds Msg20Reply *mr = m20->getReply(); char *dmozTitle = mr->ptr_dmozTitles; dmozSummary2 = mr->ptr_dmozSumms; char *dmozAnchor = mr->ptr_dmozAnchors; int32_t *catIds = mr->ptr_catIds; int32_t numCats = mr->size_catIds / 4; // loop through looking for the right ID for (int32_t i = 0; i < numCats ; i++ ) { // assign shit if we match the dmoz cat we are showing if ( catIds[i] == si->m_catId) break; dmozTitle +=gbstrlen(dmozTitle)+1; dmozSummary2 +=gbstrlen(dmozSummary2)+1; dmozAnchor += gbstrlen(dmozAnchor)+1; } // now make the title the dmoz title str = dmozTitle; strLen = gbstrlen(str); } int32_t hlen; //copy all summary and title excerpts for this result into here //char tt[1024*32]; //char *ttend = tt + 1024*32; char *frontTag = "" ; char *backTag = ""; if ( si->m_format == FORMAT_XML ) { frontTag = ""; backTag = ""; } if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) { frontTag = "" ; } int32_t cols = 80; cols = si->m_summaryMaxWidth; SafeBuf hb; if ( str && strLen && si->m_doQueryHighlighting ) { hlen = hi.set ( &hb, //tt , //ttend - tt , str, strLen , mr->m_language, // docLangId &si->m_hqq , // highlight query CLASS false , // doStemming? false , // use click&scroll? NULL , // base url frontTag, backTag, 0, 0 ); // niceness // reassign! str = hb.getBufStart(); strLen = hb.getLength(); //if (!sb->utf8Encode2(tt, hlen)) return false; // if ( si->m_format != FORMAT_JSON ) // if ( ! sb->brify ( hb.getBufStart(), // hb.getLength(), // 0, // cols) ) return false; } // . use "UNTITLED" if no title // . 
msg20 should supply the dmoz title if it can if ( strLen == 0 && si->m_format != FORMAT_XML && si->m_format != FORMAT_JSON ) { str = "UNTITLED"; strLen = gbstrlen(str); } if ( str && strLen && ( si->m_format == FORMAT_HTML || si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) ) { // determine if TiTle wraps, if it does add a
count for // each wrap //if (!sb->utf8Encode2(str , strLen )) return false; if ( ! sb->brify ( str,strLen,0,cols) ) return false; } // close up the title tag if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t<![CDATA["); if ( str ) sb->cdataEncode(str); sb->safePrintf("]]>\n"); } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"title\":\""); if ( str ) sb->jsonEncode(str); sb->safePrintf("\",\n"); } if ( si->m_format == FORMAT_HTML ) sb->safePrintf ("

\n" ) ; // close the title tag stuf if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) sb->safePrintf("
\n"); // // print

tag contents. hack for client. // char *hp = mr->ptr_htag; char *hpend = hp + mr->size_htag; for ( ; hp && hp < hpend ; ) { if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\tcdataEncode(hp); sb->safePrintf("]]>\n"); } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"h1Tag\":\""); sb->jsonEncode(hp); sb->safePrintf("\",\n"); } // it is a \0 separated list of headers generated from // XmlDoc::getHeaderTagBuf() hp += gbstrlen(hp) + 1; } // print all dmoz info for xml/json. // seems like both direct and indirect dmoz entries here. if ( mr->size_catIds > 0 && ( si->m_format == FORMAT_JSON || si->m_format == FORMAT_XML ) ) { char *dmozTitle = mr->ptr_dmozTitles; char *dmozSummary = mr->ptr_dmozSumms; char *dmozAnchor = mr->ptr_dmozAnchors; int32_t *catIds = mr->ptr_catIds; int32_t numCats = mr->size_catIds / 4; // loop through looking for the right ID for (int32_t i = 0; i < numCats ; i++ ) { printDmozEntry ( sb, catIds[i], true, dmozTitle, dmozSummary, dmozAnchor , si ); dmozTitle += gbstrlen(dmozTitle ) + 1; dmozSummary += gbstrlen(dmozSummary) + 1; dmozAnchor += gbstrlen(dmozAnchor ) + 1; } } if ( mr->size_indCatIds > 0 && ( si->m_format == FORMAT_JSON || si->m_format == FORMAT_XML ) ) { // print INDIRECT dmoz entries as well int32_t nIndCatids = mr->size_indCatIds / 4; for ( int32_t i = 0; i < nIndCatids; i++ ) { int32_t catId = ((int32_t *)(mr->ptr_indCatIds))[i]; if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t" "%"INT32"\n", catId); if ( si->m_format == FORMAT_JSON ) sb->safePrintf("\t\t\"indirectDmozCatId\":" "%"INT32",\n",catId); } // print INDIRECT dmoz entries as well // int32_t nIndCatids = mr->size_indCatIds / 4; // dmozTitle = mr->ptr_indDmozTitles; // dmozSummary = mr->ptr_dmozSumms; // dmozAnchor = mr->ptr_dmozAnchors; // for ( int32_t i = 0; i < nIndCatids; i++ ) { // int32_t catId = ((int32_t *)(mr->ptr_indCatIds))[i]; // printDmozEntry ( sb , // catId , // false, // dmozTitle, // dmozSummary, // dmozAnchor , // si ); // dmozTitle 
+= gbstrlen(dmozTitle ) + 1; // dmozSummary += gbstrlen(dmozSummary) + 1; // dmozAnchor += gbstrlen(dmozAnchor ) + 1; // } } ///// // // print content type after title // ///// unsigned char ctype = mr->m_contentType; char *cs = g_contentTypeStrings[ctype]; if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t" "" "\n", cs); if ( si->m_format == FORMAT_JSON ) sb->safePrintf("\t\t\"contentType\":\"%s\",\n",cs); if ( si->m_format == FORMAT_HTML && ctype != CT_HTML && ctype != CT_UNKNOWN ){ sb->safePrintf(" "); char *p = cs; for ( ; *p ; p++ ) { char c = to_upper_a(*p); sb->pushChar(c); } sb->safePrintf("  "); } //////////// // // print the summary // //////////// // . then the summary // . "s" is a string of null terminated strings //char *send; // do the normal summary str = mr->ptr_displaySum; // sometimes the summary is longer than requested because for // summary deduping purposes (see "pss" parm in Parms.cpp) we do not // get it as int16_t as request. so use mr->m_sumPrintSize here // not mr->size_sum strLen = mr->size_displaySum - 1;//-1; // this includes the terminating \0 or \0\0 so back up if ( strLen < 0 ) strLen = 0; //send = str + strLen; // dmoz summary might override if we are showing a dmoz topic page if ( dmozSummary2 && (si->m_catId>0 || strLen<=0) ) { str = dmozSummary2; strLen = gbstrlen(dmozSummary2); } bool printSummary = true; // do not print summaries for widgets by default unless overridden // with &summary=1 int32_t defSum = 0; // if no image then default the summary to on if ( ! mr->ptr_imgData ) defSum = 1; if ( (si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) && hr->getLong("summaries",defSum) == 0 ) printSummary = false; if ( printSummary && (si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) ) { int32_t sumLen = strLen; if ( sumLen > 150 ) sumLen = 150; if ( sumLen ) { sb->safePrintf("
"); sb->safeTruncateEllipsis ( str , sumLen ); } } if ( printSummary && si->m_format == FORMAT_HTML ) sb->brify ( str , strLen, 0 , cols ); // niceness = 0 if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\tcdataEncode(str); sb->safePrintf("]]>\n"); } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"sum\":\""); sb->jsonEncode(str); sb->safePrintf("\",\n"); } // new line if not xml. even summary is empty we need it too like // when showing xml docs - MDW 9/28/2014 if ( si->m_format == FORMAT_HTML ) // && strLen ) sb->safePrintf("
\n"); ///////// // // meta tag values for &dt=keywords ... // ///////// if ( mr->ptr_dbuf && mr->size_dbuf>1 ) printMetaContent ( msg40 , ix,st,sb); //////////// // // . print DMOZ topics under the summary // . will print the "Search in Category" link too // //////////// //Msg20Reply *mr = m20->getMsg20Reply(); int32_t nCatIds = mr->getNumCatIds(); for (int32_t i = 0; i < nCatIds; i++) { int32_t catid = ((int32_t *)(mr->ptr_catIds))[i]; printDMOZCategoryUnderResult(sb,si,catid,st); } // skipCatsPrint: // print the indirect category Ids int32_t nIndCatids = mr->size_indCatIds / 4; //if ( !cr->m_displayIndirectDmozCategories ) // goto skipCatsPrint2; for ( int32_t i = 0; i < nIndCatids; i++ ) { int32_t catid = ((int32_t *)(mr->ptr_indCatIds))[i]; // skip it if it's a regular category //bool skip = false; int32_t d; for ( d = 0; d < nCatIds; d++) { if ( catid == mr->ptr_catIds[i] ) break; } // skip if the indirect catid matched a directed catid if ( d < nCatIds ) continue; // otherwise print it printDMOZCategoryUnderResult(sb,si,catid,st); } /////////// // // print facet field/values // // if there was a gbfacet*: term (gbfacetstr, gbfacetfloat, gbfacetint) // this should be non-NULL and have the facet field/value pairs // and every string ends in a \0 // ////////// char *fp = mr->ptr_facetBuf; char *fpEnd = fp + mr->size_facetBuf; for ( ; fp && fp < fpEnd ; ) { if ( si->m_format == FORMAT_HTML ) { // print first one sb->safePrintf(""); sb->safeStrcpy(fp); sb->safePrintf(""); sb->safePrintf("   :   "); sb->safePrintf(""); fp += gbstrlen(fp) + 1; sb->htmlEncode(fp); // begin a new pair sb->safePrintf(""); sb->safeStrcpy("
\n"); fp += gbstrlen(fp) + 1; } else if ( si->m_format == FORMAT_XML ) { // print first one sb->safePrintf("\t\t\n" "\t\t\tcdataEncode(fp); sb->safePrintf("]]>\n"); fp += gbstrlen(fp) + 1; sb->safePrintf("\t\t\tcdataEncode(fp); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\n"); fp += gbstrlen(fp) + 1; } else if ( si->m_format == FORMAT_JSON ) { // print first one sb->safePrintf("\t\t\"facet\":{\n"); sb->safePrintf("\t\t\t\"field\":\""); sb->jsonEncode(fp); sb->safePrintf("\",\n"); fp += gbstrlen(fp) + 1; sb->safePrintf("\t\t\t\"value\":\""); sb->jsonEncode(fp); sb->safePrintf("\"\n"); fp += gbstrlen(fp) + 1; sb->safePrintf("\t\t},\n"); } } //////////// // // print the URL // //////////// // hack off the http:// if any for displaying it on screen if ( urlLen > 8 && strncmp ( url , "http://" , 7 )==0 ) { url += 7; urlLen -= 7; } // . remove trailing / // . only remove from root urls in case user cuts and // pastes it for link: search if ( url [ urlLen - 1 ] == '/' ) { // see if any other slash before us int32_t j; for ( j = urlLen - 2 ; j >= 0 ; j-- ) if ( url[j] == '/' ) break; // if there wasn't, we must have been a root url // so hack off the last slash if ( j < 0 ) urlLen--; } if ( si->m_format == FORMAT_HTML ) { sb->safePrintf ("" ); //sb->htmlEncode ( url , gbstrlen(url) , false ); // 20 for the date after it sb->safeTruncateEllipsis ( url , 50 ); // cols - 30 ); // turn off the color sb->safePrintf ( "\n" ); } // print url for widgets now if ( (si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) ) { //sb->safePrintf ("
" ); // print url for widgets in top left if we have a wide image // otherwise it gets truncated below the title for some reason if ( isWide ) sb->safePrintf ("
" ); else if ( mr->ptr_imgData ) sb->safePrintf ("
" , (int32_t) PADDING + newdx + 10 ); else sb->safePrintf ("
"); // print the url now, truncated to 50 chars sb->safeTruncateEllipsis ( url , 50 ); // cols - 30 ); sb->safePrintf ( "\n" ); } if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\tsafeMemcpy ( url , urlLen ); sb->safePrintf("]]>\n"); } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"url\":\""); sb->jsonEncode ( url , urlLen ); sb->safePrintf("\",\n"); } if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t%"INT32"\n", (int32_t)mr->m_hopcount); if ( si->m_format == FORMAT_JSON ) sb->safePrintf("\t\t\"hopCount\":%"INT32",\n",(int32_t)mr->m_hopcount); // now the last spidered date of the document time_t ts = mr->m_lastSpidered; if ( si->m_format == FORMAT_HTML ) printTimeAgo ( sb , ts , "indexed" , si ); // the date it was last modified ts = mr->m_lastModified; if ( si->m_format == FORMAT_HTML ) printTimeAgo ( sb , ts , "modified" , si ); // // more xml stuff // if ( si->m_format == FORMAT_XML ) { // doc size in Kilobytes sb->safePrintf ( "\t\t\n", (float)mr->m_contentLen/1024.0); sb->safePrintf ( "\t\t%"INT32"\n", mr->m_contentLen); // . docId for possible cached link // . might have merged a bunch together sb->safePrintf("\t\t%"INT64"\n",mr->m_docId ); // . show the site root // . for hompages.com/users/fred/mypage.html this will be // homepages.com/users/fred/ // . for www.xyz.edu/~foo/burp/ this will be // www.xyz.edu/~foo/ etc. int32_t siteLen = 0; char *site = NULL; // seems like this isn't the way to do it, cuz Tagdb.cpp // adds the "site" tag itself and we do not always have it // in the XmlDoc::ptr_tagRec... 
so do it this way: site = mr->ptr_site; siteLen = mr->size_site-1; //char *site=uu.getSite( &siteLen , si->m_coll, false, tagRec); sb->safePrintf("\t\t 0 ) sb->safeMemcpy ( site , siteLen ); sb->safePrintf("]]>\n"); //int32_t sh = hash32 ( site , siteLen ); //sb->safePrintf ("\t\t%"UINT32"\n",sh); //int32_t dh = uu.getDomainHash32 (); //sb->safePrintf ("\t\t%"UINT32"\n",dh); // spider date sb->safePrintf ( "\t\t%"UINT32"\n", (uint32_t)mr->m_lastSpidered); // backwards compatibility for buzz sb->safePrintf ( "\t\t%"UINT32"" "\n", (uint32_t)mr->m_firstIndexedDate); sb->safePrintf( "\t\t%"UINT32"" "\n", (uint32_t)mr->m_contentHash32); // pub date int32_t datedbDate = mr->m_datedbDate; // show the datedb date as "" for now if ( datedbDate != -1 ) sb->safePrintf ( "\t\t%"UINT32"\n", (uint32_t)datedbDate); } if ( si->m_format == FORMAT_JSON ) { // doc size in Kilobytes sb->safePrintf ( "\t\t\"size\":\"%4.0fk\",\n", (float)mr->m_contentLen/1024.0); sb->safePrintf ( "\t\t\"sizeInBytes\":%"INT32",\n", mr->m_contentLen); // . docId for possible cached link // . might have merged a bunch together sb->safePrintf("\t\t\"docId\":%"INT64",\n",mr->m_docId ); // . show the site root // . for hompages.com/users/fred/mypage.html this will be // homepages.com/users/fred/ // . for www.xyz.edu/~foo/burp/ this will be // www.xyz.edu/~foo/ etc. int32_t siteLen = 0; char *site = NULL; // seems like this isn't the way to do it, cuz Tagdb.cpp // adds the "site" tag itself and we do not always have it // in the XmlDoc::ptr_tagRec... 
so do it this way: site = mr->ptr_site; siteLen = mr->size_site-1; //char *site=uu.getSite( &siteLen , si->m_coll, false, tagRec); sb->safePrintf("\t\t\"site\":\""); if ( site && siteLen > 0 ) sb->safeMemcpy ( site , siteLen ); sb->safePrintf("\",\n"); //int32_t sh = hash32 ( site , siteLen ); //sb->safePrintf ("\t\t%"UINT32"\n",sh); //int32_t dh = uu.getDomainHash32 (); //sb->safePrintf ("\t\t%"UINT32"\n",dh); // spider date sb->safePrintf ( "\t\t\"spidered\":%"UINT32",\n", (uint32_t)mr->m_lastSpidered); // backwards compatibility for buzz sb->safePrintf ( "\t\t\"firstIndexedDateUTC\":%"UINT32",\n" , (uint32_t) mr->m_firstIndexedDate); sb->safePrintf( "\t\t\"contentHash32\":%"UINT32",\n" , (uint32_t)mr->m_contentHash32); // pub date int32_t datedbDate = mr->m_datedbDate; // show the datedb date as "" for now if ( datedbDate != -1 ) sb->safePrintf ( "\t\t\"pubdate\":%"UINT32",\n", (uint32_t)datedbDate); } // . we also store the outlinks in a linkInfo structure // . we can call LinkInfo::set ( Links *outlinks ) to set it // in the msg20 LinkInfo *outlinks = (LinkInfo *)mr->ptr_outlinks; // NULLify if empty if ( mr->size_outlinks <= 0 ) outlinks = NULL; // only for xml for now if ( si->m_format == FORMAT_HTML ) outlinks = NULL; Inlink *k; // do we need absScore2 for outlinks? 
//k = NULL; while ( outlinks && (k =outlinks->getNextInlink(k))) // print it out sb->safePrintf("\t\tm_docId , (uint32_t)k->m_ip,//hostHash, but use ip for now (int32_t)k->m_firstIndexedDate , (int32_t)k->m_datedbDate ); if ( si->m_format == FORMAT_XML ) { // result sb->safePrintf("\t\t" "\n", getLanguageString(mr->m_language)); sb->safePrintf("\t\t%s\n", getLangAbbr(mr->m_language)); char *charset = get_charset_str(mr->m_charset); if(charset) sb->safePrintf("\t\t" "\n", charset); } if ( si->m_format == FORMAT_JSON ) { // result sb->safePrintf("\t\t\"language\":\"%s\",\n", getLanguageString(mr->m_language)); sb->safePrintf("\t\t\"langAbbr\":\"%s\",\n", getLangAbbr(mr->m_language)); char *charset = get_charset_str(mr->m_charset); if(charset) sb->safePrintf("\t\t\"charset\":\"%s\",\n",charset); } // // end more xml stuff // if ( si->m_format == FORMAT_HTML ) { int32_t lang = mr->m_language; if ( lang ) sb->safePrintf(" - %s",getLanguageString(lang)); uint16_t cc = mr->m_computedCountry; if( cc ) sb->safePrintf(" - %s", g_countryCode.getName(cc)); char *charset = get_charset_str(mr->m_charset); if ( charset ) sb->safePrintf(" - %s ", charset); } if ( si->m_format == FORMAT_HTML ) sb->safePrintf("
\n"); //char *coll = si->m_cr->m_coll; // print the [cached] link? bool printCached = true; if ( mr->m_noArchive ) printCached = false; if ( isAdmin ) printCached = true; if ( mr->m_contentLen <= 0 ) printCached = false; if ( si->m_format != FORMAT_HTML ) printCached = false; // get collnum result is from //collnum_t collnum = si->m_cr->m_collnum; // if searching multiple collections - federated search CollectionRec *scr = g_collectiondb.getRec ( mr->m_collnum ); char *coll = "UNKNOWN"; if ( scr ) coll = scr->m_coll; if ( printCached && cr->m_clickNScrollEnabled ) sb->safePrintf ( " - " "cached\n", st->m_qe , coll , mr->m_docId ); else if ( printCached ) sb->safePrintf ( "" "cached\n", st->m_qe , // "qlang" parm si->m_defaultSortLang, coll , mr->m_docId ); // the new links if ( si->m_format == FORMAT_HTML && g_conf.m_isMattWells && 1 == 0 ) { //sb->safePrintf(" - scoring", // coll ); //sb->safePrintf(" - safePrintf(" - safePrintf(" - safePrintf("d=%"INT64"",mr->m_docId); sb->safePrintf("u="); sb->urlEncode ( url , gbstrlen(url) , false ); //sb->safePrintf("&page=1\">seo" ); sb->safePrintf("\">seo" ); } // only display re-spider link if addurl is enabled //if ( isAdmin && // g_conf.m_addUrlEnabled && // cr->m_addUrlEnabled ) { /* if ( si->m_format == FORMAT_HTML ) { // the [respider] link // save this for seo iframe! 
sb->safePrintf (" - urlEncode ( url , urlLen ); // then collection if ( coll ) { sb->safeMemcpy ( "&c=" , 3 ); sb->safeMemcpy ( coll , gbstrlen(coll) ); } //sb->safePrintf ( "&force=1\">reindex" ); sb->safePrintf ( "\">reindex" ); } */ // unhide the divs on click int32_t placeHolder = -1; int32_t placeHolderLen = 0; if ( si->m_format == FORMAT_HTML && si->m_getDocIdScoringInfo ) { // place holder for backlink table link placeHolder = sb->length(); sb->safePrintf (" - " "00000 backlinks" "\n" , ix ); placeHolderLen = sb->length() - placeHolder; } if ( si->m_format == FORMAT_HTML && si->m_getDocIdScoringInfo ) { // unhide the scoring table on click sb->safePrintf (" - " "scoring" "\n" ,ix ); } if ( si->m_format == FORMAT_HTML ) { // reindex sb->safePrintf(" - respider\n", coll,rand64); } if ( si->m_format == FORMAT_HTML ) { sb->safePrintf (" - " "urlEncode ( url , gbstrlen(url) , false ); sb->safePrintf ( "\">" "spider info\n" ); } // // show rainbow sections link // if ( si->m_format == FORMAT_HTML ) { sb->safePrintf ( " - " "sections\n", st->m_qe , // "qlang" parm si->m_defaultSortLang, coll , mr->m_docId ); } if ( si->m_format == FORMAT_HTML ) { sb->safePrintf ( " - " "page info\n", //st->m_qe , // "qlang" parm //si->m_defaultSortLang, coll , mr->m_docId ); } // this stuff is secret just for local guys! not any more if ( si->m_format == FORMAT_HTML ) { // now the ip of url //int32_t urlip = msg40->getIp(i); // don't combine this with the sprintf above cuz // iptoa uses a static local buffer like ctime() sb->safePrintf(//"
" " - %s\n", coll,iptoa(mr->m_ip), iptoa(mr->m_ip) ); // ip domain link unsigned char *us = (unsigned char *)&mr->m_ip;//urlip; sb->safePrintf (" - %"INT32".%"INT32".%"INT32"\n", coll, (int32_t)us[0],(int32_t)us[1],(int32_t)us[2], (int32_t)us[0],(int32_t)us[1],(int32_t)us[2]); /* // . now the info link // . if it's local, don't put the hostname/port in // there cuz it will mess up Global Spec's machine //if ( h->m_groupId == g_hostdb.m_groupId ) sb.safePrintf(" - m_docId); // then the [info] link to show the TitleRec sb->safePrintf ( "\">[info]" ); // now the analyze link sb.safePrintf (" - m_hopcount); // encode the url now sb->urlEncode ( url , urlLen ); // then the [analyze] link sb->safePrintf ("\">[analyze]" ); // and links: query link sb->safePrintf( " - urlEncode ( url , urlLen ); sb->safeMemcpy ("\">linkers" , 14 ); */ } char dbuf [ MAX_URL_LEN ]; int32_t dlen = uu.getDomainLen(); if ( si->m_format == FORMAT_HTML ) { memcpy ( dbuf , uu.getDomain() , dlen ); dbuf [ dlen ] = '\0'; // newspaperarchive urls have no domain if ( dlen == 0 ) { dlen = uu.getHostLen(); memcpy ( dbuf , uu.getHost() , dlen ); dbuf [ dlen ] = '\0'; } } // admin always gets the site: option so he can ban if ( si->m_format == FORMAT_HTML ) { sb->safePrintf (" - " " " "domain\n" , dbuf , coll );//, dbuf ); } if ( si->m_format == FORMAT_HTML && ( isAdmin || cr->m_isCustomCrawl)){ char *un = ""; int32_t banVal = 1; if ( mr->m_isBanned ) { un = "UN"; banVal = 0; } sb->safePrintf("
" " " "%sBAN %s" "\n" , banVal , dbuf , coll , un , dbuf ); //banSites->safePrintf("%s+", dbuf); dlen = uu.getHostLen(); memcpy ( dbuf , uu.getHost() , dlen ); dbuf [ dlen ] = '\0'; sb->safePrintf(" - " " " "%sBAN %s\n" , banVal , dbuf , coll , un , dbuf ); // take similarity out until working again /* sb->safePrintf (" - [similar -" " " "tag " , (int32_t)mr->m_tagVectorHash, coll); sb->safePrintf ("" "topic " , (int32_t)mr->m_gigabitVectorHash, coll); */ if ( mr->size_gbAdIds > 0 ) sb->safePrintf ("" "Ad Id\n" , mr->ptr_gbAdIds, coll); //sb->safePrintf ("] "); /* put this on 'page info' int32_t urlFilterNum = (int32_t)mr->m_urlFilterNum; if(urlFilterNum != -1) { sb->safePrintf (" - " "UrlFilter:%"INT32"", coll , urlFilterNum); } */ } /* // print the help SafeBuf help; help.safePrintf("The distance matrix uses the " "following formula to calculate " "a score in a table cell for a pair of query terms: " "
" "" "SCORE = (%"INT32" - |pos1-pos2|) * " "locationWeight * " "densityWeight * " "synWeight1 * " "synWeight2 * " "spamWeight1 * " "spamWeight2 * " "tfWeight1 * " "tfWeight2" "" "
" "
" , (int32_t)MAXWORDPOS+1 ); help.safePrintf("" "" "" "
pos1The word position of " "query term 1
pos2The word position of " "query term 2
" ); help.safePrintf( //"where
" //"locationWeight is based on where " //"the two terms occur in the document " //"and uses the following table:
" "" "" "" ); for ( int32_t i = 0 ; i < HASHGROUP_END ; i++ ) { char *hs = getHashGroupString(i); float hw = s_hashGroupWeights[i]; help.safePrintf("" ,hs,hw ); } help.safePrintf("
term locationlocationWeight
%s%.0f
"); help.safePrintf("

"); help.safePrintf( "" "" "" "" "" ); for ( int32_t i = 0 ; i < MAXDENSITYRANK ; i++ ) { help.safePrintf("" "" "" "" "" ,maxw,i,dweight ); } help.safePrintf("
max # alphanumeric words in locationdensityRankdensityWeight
%"INT32"%"INT32"%.0f
"); help.safePrintf("

" */ // end serp div if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) sb->safePrintf("


"); if ( si->m_format == FORMAT_HTML ) sb->safePrintf ( "

\n"); // search result spacer if ( si->m_format == FORMAT_WIDGET_IFRAME || si->m_format == FORMAT_WIDGET_APPEND || si->m_format == FORMAT_WIDGET_AJAX ) sb->safePrintf("

", (int32_t)SERP_SPACER); // inc it *numPrintedSoFar = *numPrintedSoFar + 1; // done? DocIdScore *dp = msg40->getScoreInfo(ix); if ( ! dp ) { if ( si->m_format == FORMAT_XML ) sb->safePrintf ("\t\n\n"); if ( si->m_format == FORMAT_JSON ) { // remove last ,\n sb->m_length -= 2; sb->safePrintf ("\n\t}\n\n"); } // wtf? //char *xx=NULL;*xx=0; // at least close up the table if ( si->m_format != FORMAT_HTML ) return true; sb->safePrintf("
\n"); return true; } // // scoring info tables // int32_t nr = dp->m_numRequiredTerms; if ( nr == 1 ) nr = 0; // print breakout tables here for distance matrix //SafeBuf bt; // final score calc SafeBuf ft; // int16_tcut //Query *q = si->m_q; // put in a hidden div so you can unhide it if ( si->m_format == FORMAT_HTML ) sb->safePrintf(""); sb->safePrintf("
"); // print the breakout tables //if ( nr ) { // //sb->safePrintf("
"); // sb->safeMemcpy ( &bt ); //} // the singles --- TODO: make it ALL query terms //nr = dp->m_numRequiredTerms; //for ( int32_t i = 0 ; i < nr && nr == 1 ; i++ ) { int32_t lastTermNum = -1; int32_t numSingles = dp->m_numSingles; // do not print this if we got pairs if ( dp->m_numPairs ) numSingles = 0; for ( int32_t i = 0 ; i < numSingles ; i++ ) { float totalSingleScore = 0.0; // print all the top winners for this single SingleScore *fss = &dp->m_singleScores[i]; // if same combo as last time skip if ( fss->m_qtermNum == lastTermNum ) continue; // do not reprint for this query term num lastTermNum = fss->m_qtermNum; bool firstTime = true; // print all singles for this combo for ( int32_t j = i ; j < dp->m_numSingles ; j++ ) { // get it SingleScore *ss = &dp->m_singleScores[j]; // stop if different single now if ( ss->m_qtermNum != fss->m_qtermNum ) break; // skip if 0. skip neighborhoods i guess if ( ss->m_finalScore == 0.0 ) continue; // first time? if ( firstTime && si->m_format == FORMAT_HTML ) { Query *q = &si->m_q; printSingleTerm ( sb , q , ss ); printScoresHeader ( sb ); firstTime = false; } // print it printSingleScore ( sb , si , ss , mr , msg40 ); // add up totalSingleScore += ss->m_finalScore; } if ( ft.length() ) ft.safePrintf(" , "); ft.safePrintf("%f",totalSingleScore); // min? if ( minScore < 0.0 || totalSingleScore < minScore ) minScore = totalSingleScore; // we need to set "ft" for xml stuff below if ( si->m_format != FORMAT_HTML ) continue; //sb->safePrintf(""); sb->safePrintf("" "" "", totalSingleScore); // close table from printScoresHeader if ( ! firstTime ) sb->safePrintf("
"); // print pair text //int32_t qtn = fss->m_qtermNum; //sb->safeMemcpy(q->m_qterms[qtn].m_term , // q->m_qterms[qtn].m_termLen ); //sb->safePrintf("
%.04ftotal of above scores

"); } char *ff = ""; if ( si->m_useMinAlgo ) ff = "MIN "; char *ff2 = "sum"; if ( si->m_useMinAlgo ) ff2 = "min"; //if ( nr ) sb->safePrintf("
"); //sb->safePrintf("
"); // final score!!! if ( si->m_format == FORMAT_XML ) { sb->safePrintf ("\t\t%"INT32"\n", (int32_t)dp->m_siteRank ); sb->safePrintf ("\t\t%"INT32"" "\n", (int32_t)mr->m_siteNumInlinks ); sb->safePrintf ("\t\t%"INT32"" "\n", (int32_t)mr->m_siteNumInlinksTotal ); sb->safePrintf ("\t\t%"INT32"" "\n", (int32_t)mr->m_siteNumUniqueIps ); sb->safePrintf ("\t\t%"INT32"" "\n", (int32_t)mr->m_siteNumUniqueCBlocks ); struct tm *timeStruct3; timeStruct3 = gmtime((time_t *)&mr->m_pageInlinksLastUpdated); char tmp3[64]; strftime ( tmp3 , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct3 ); // -1 means unknown if ( mr->m_pageNumInlinks >= 0 ) // how many inlinks, external and internal, we have // to this page not filtered in any way!!! sb->safePrintf("\t\t%"INT32"" "\n" ,mr->m_pageNumInlinks ); // how many inlinking ips we got, including our own if // we link to ourself sb->safePrintf("\t\t%"INT32"" "\n" ,mr->m_pageNumUniqueIps ); // how many inlinking cblocks we got, including our own if // we link to ourself sb->safePrintf("\t\t%"INT32"" "\n" ,mr->m_pageNumUniqueCBlocks ); // how many "good" inlinks. i.e. inlinks whose linktext we // count and index. 
sb->safePrintf("\t\t%"INT32"" "\n" "\t\t%"UINT32"" "\n" ,mr->m_pageNumGoodInlinks ,(uint32_t)mr->m_pageInlinksLastUpdated ); float score = msg40->getScore (ix); sb->safePrintf("\t\t%f\n", score ); sb->safePrintf ("\t\t" "" "\n" , SITERANKDIVISOR , si->m_sameLangWeight //SAMELANGMULT , ff2 ); sb->safePrintf ("\t\t" "%.03f = (%"INT32"/%.01f+1) " // * %s(" , dp->m_finalScore , (int32_t)dp->m_siteRank , SITERANKDIVISOR //, ff ); // then language weight if ( si->m_queryLangId == 0 || mr->m_language == 0 || si->m_queryLangId == mr->m_language ) sb->safePrintf(" * %.01f", si->m_sameLangWeight);//SAMELANGMULT); // the actual min then sb->safePrintf(" * %.03f",minScore); // no longer list all the scores //sb->safeMemcpy ( &ft ); sb->safePrintf(//")" "]]>" "\n"); sb->safePrintf ("\t\n\n"); return true; } if ( si->m_format != FORMAT_HTML ) return true; char *cc = getCountryCode ( mr->m_country ); if ( mr->m_country == 0 ) cc = "Unknown"; sb->safePrintf("" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "
" "final score
" "
docId%"INT64"
site%s
hopcount%"INT32"
language%s
country%s
siteRank%"INT32"
" , dp->m_docId , mr->ptr_site , (int32_t)mr->m_hopcount //, getLanguageString(mr->m_summaryLanguage) , getLanguageString(mr->m_language) // use page language , cc , (int32_t)dp->m_siteRank ); // list all final scores starting with pairs sb->safePrintf("%f = " "(%"INT32"/%.01f+1)" , dp->m_finalScore , (int32_t)dp->m_siteRank , SITERANKDIVISOR ); // if lang is different if ( si->m_queryLangId == 0 || mr->m_language == 0 || si->m_queryLangId == mr->m_language ) sb->safePrintf(" * %.01f", si->m_sameLangWeight);//SAMELANGMULT); // list all final scores starting with pairs sb->safePrintf(" * %s(" , ff ); sb->safeMemcpy ( &ft ); sb->safePrintf(")

"); // put in a hidden div so you can unhide it sb->safePrintf("\n"); // result is in a table so we can put the result # in its own column sb->safePrintf("
"); /* // UN-indent it if level is 1 if ( si->m_format == FORMAT_HTML && si->m_doIpClustering ) { sb->safePrintf (" - [ " "More from this ip ]", iptoa ( mr->m_ip ) , st->m_qe , coll ); if ( indent ) sb->safePrintf ( "

\n"); else sb->safePrintf ( "

\n"); } else if ( si->m_format == FORMAT_HTML && si->m_doSiteClustering ) { char hbuf [ MAX_URL_LEN ]; int32_t hlen = uu.getHostLen(); memcpy ( hbuf , uu.getHost() , hlen ); hbuf [ hlen ] = '\0'; sb->safePrintf (" - " "More from this site", hbuf , st->m_qe , coll ); if ( indent ) sb->safePrintf ( "
\n"); else sb->safePrintf ( "

\n"); } */ // space out 0000 backlinks char *p = sb->getBufStart() + placeHolder; int32_t plen = placeHolderLen; if ( numInlinks == 0 ) memset ( p , ' ' , plen ); if ( numInlinks > 0 && numInlinks < 99999 ) { char *ss = strstr ( p, "00000" ); if ( ss ) { char c = ss[5]; sprintf(ss,"%5"INT32"",numInlinks); ss[5] = c; } } // print "1 backlink" not "1 backlinks" if ( numInlinks == 1 ) { char *xx = strstr(p,"backlinks"); if ( xx ) xx[8] = ' '; } return true; } bool printPairScore ( SafeBuf *sb , SearchInput *si , PairScore *ps , Msg20Reply *mr , Msg40 *msg40 , bool first ) { // int16_tcut Query *q = &si->m_q; //SafeBuf ft; // store in final score calc //if ( ft.length() ) ft.safePrintf(" + "); //ft.safePrintf("%f",ps->m_finalScore); int32_t qtn1 = ps->m_qtermNum1; int32_t qtn2 = ps->m_qtermNum2; /* unsigned char drl1 = ps->m_diversityRankLeft1; unsigned char drl2 = ps->m_diversityRankLeft2; float dvwl1 = getDiversityWeight(dr1); float dvwl2 = getDiversityWeight(dr2); unsigned char drr1 = ps->m_diversityRankRight1; unsigned char drr2 = ps->m_diversityRankRight2; float dvwr1 = getDiversityWeight(dr1); float dvwr2 = getDiversityWeight(dr2); */ unsigned char de1 = ps->m_densityRank1; unsigned char de2 = ps->m_densityRank2; float dnw1 = getDensityWeight(de1); float dnw2 = getDensityWeight(de2); int32_t hg1 = ps->m_hashGroup1; int32_t hg2 = ps->m_hashGroup2; float hgw1 = getHashGroupWeight(hg1); float hgw2 = getHashGroupWeight(hg2); int32_t wp1 = ps->m_wordPos1; int32_t wp2 = ps->m_wordPos2; unsigned char wr1 = ps->m_wordSpamRank1; float wsw1 = getWordSpamWeight(wr1); unsigned char wr2 = ps->m_wordSpamRank2; float wsw2 = getWordSpamWeight(wr2); // HACK for inlink text! 
if ( hg1 == HASHGROUP_INLINKTEXT ) wsw1 = getLinkerWeight(wr1); if ( hg2 == HASHGROUP_INLINKTEXT ) wsw2 = getLinkerWeight(wr2); char *syn1 = "no"; char *syn2 = "no"; float sw1 = 1.0; float sw2 = 1.0; if ( ps->m_isSynonym1 ) { syn1 = "yes"; sw1 = SYNONYM_WEIGHT; } if ( ps->m_isSynonym2 ) { syn2 = "yes"; sw2 = SYNONYM_WEIGHT; } //char bf1 = ps->m_bflags1; //char bf2 = ps->m_bflags2; char *bs1 = "no"; char *bs2 = "no"; //if ( bf1 & BF_HALFSTOPWIKIBIGRAM ) bs1 = "yes"; //if ( bf2 & BF_HALFSTOPWIKIBIGRAM ) bs2 = "yes"; if ( ps->m_isHalfStopWikiBigram1 ) bs1 = "yes"; if ( ps->m_isHalfStopWikiBigram2 ) bs2 = "yes"; float wbw1 = 1.0; float wbw2 = 1.0; if ( ps->m_isHalfStopWikiBigram1 ) wbw1 = WIKI_BIGRAM_WEIGHT; if ( ps->m_isHalfStopWikiBigram2 ) wbw2 = WIKI_BIGRAM_WEIGHT; //int64_t sz1 = ps->m_listSize1; //int64_t sz2 = ps->m_listSize2; //int64_t tf1 = ps->m_termFreq1;//sz1 / 10; //int64_t tf2 = ps->m_termFreq2;//sz2 / 10; int64_t tf1 = msg40->m_msg3a.m_termFreqs[qtn1]; int64_t tf2 = msg40->m_msg3a.m_termFreqs[qtn2]; float tfw1 = ps->m_tfWeight1; float tfw2 = ps->m_tfWeight2; char *wp = "no"; float wiw = 1.0; if ( ps->m_inSameWikiPhrase ) { wp = "yes"; wiw = WIKI_WEIGHT; // 0.50; } int32_t a = ps->m_wordPos2; int32_t b = ps->m_wordPos1; char *es = ""; char *bes = ""; if ( a < b ) { a = ps->m_wordPos1; b = ps->m_wordPos2; // out of query order penalty! 
es = "+ 1.0"; bes = "+ 1.0"; } if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t\n"); /* sb->safePrintf("\t\t\t%"INT32"" "\n", (int32_t)drl1); sb->safePrintf("\t\t\t%"INT32"" "\n", (int32_t)drr1); sb->safePrintf("\t\t\t%f" "\n", dvwl1); sb->safePrintf("\t\t\t%f" "\n", dvwr1); sb->safePrintf("\t\t\t%"INT32"" "\n", (int32_t)drl2); sb->safePrintf("\t\t\t%"INT32"" "\n", (int32_t)drr2); sb->safePrintf("\t\t\t%f" "\n", dvwl2); sb->safePrintf("\t\t\t%f" "\n", dvwr2); */ sb->safePrintf("\t\t\t%"INT32"" "\n", (int32_t)de1); sb->safePrintf("\t\t\t%"INT32"" "\n", (int32_t)de2); sb->safePrintf("\t\t\t%f" "\n", dnw1); sb->safePrintf("\t\t\t%f" "\n", dnw2); sb->safePrintf("\t\t\tsafeMemcpy ( q->m_qterms[qtn1].m_term , q->m_qterms[qtn1].m_termLen ); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\tsafeMemcpy ( q->m_qterms[qtn2].m_term , q->m_qterms[qtn2].m_termLen ); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\t" "\n", getHashGroupString(hg1)); sb->safePrintf("\t\t\t" "\n", getHashGroupString(hg2)); sb->safePrintf("\t\t\t%.01f" "\n", hgw1 ); sb->safePrintf("\t\t\t%.01f" "\n", hgw2 ); sb->safePrintf("\t\t\t%"INT32"" "\n", wp1 ); sb->safePrintf("\t\t\t%"INT32"" "\n", wp2 ); //int32_t wordDist = wp2 - wp1; //if ( wordDist < 0 ) wordDist *= -1; //sb->safePrintf("\t\t\t%"INT32"" // "\n",wdist); sb->safePrintf("\t\t\t" "" "\n", syn1); sb->safePrintf("\t\t\t" "" "\n", syn2); sb->safePrintf("\t\t\t%.01f" "\n", sw1); sb->safePrintf("\t\t\t%.01f" "\n", sw2); // word spam / link text weight char *r1 = "wordSpamRank1"; char *r2 = "wordSpamRank2"; char *t1 = "wordSpamWeight1"; char *t2 = "wordSpamWeight2"; if ( hg1 == HASHGROUP_INLINKTEXT ) { r1 = "inlinkSiteRank1"; t1 = "inlinkTextWeight1"; } if ( hg2 == HASHGROUP_INLINKTEXT ) { r2 = "inlinkSiteRank2"; t2 = "inlinkTextWeight2"; } sb->safePrintf("\t\t\t<%s>%"INT32"\n", r1,(int32_t)wr1,r1); sb->safePrintf("\t\t\t<%s>%"INT32"\n", r2,(int32_t)wr2,r2); sb->safePrintf("\t\t\t<%s>%.02f\n", t1,wsw1,t1); sb->safePrintf("\t\t\t<%s>%.02f\n", 
t2,wsw2,t2); // if offsite inlink text show the inlinkid for matching // to an LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks; Inlink *k = info->getNextInlink(NULL); for (;k&&hg1==HASHGROUP_INLINKTEXT ; k=info->getNextInlink(k)){ if ( ! k->getLinkText() ) continue; if ( k->m_wordPosStart > wp1 ) continue; if ( k->m_wordPosStart + 50 < wp1 ) continue; // got it. we HACKED this to put the id // in k->m_siteHash sb->safePrintf("\t\t\t%"INT32"" "\n", k->m_siteHash); } k = info->getNextInlink(NULL); for (;k&&hg2==HASHGROUP_INLINKTEXT ; k=info->getNextInlink(k)){ if ( ! k->getLinkText() ) continue; if ( k->m_wordPosStart > wp2 ) continue; if ( k->m_wordPosStart + 50 < wp2 ) continue; // got it. we HACKED this to put the id // in k->m_siteHash sb->safePrintf("\t\t\t%"INT32"" "\n", k->m_siteHash); } // term freq sb->safePrintf("\t\t\t%"INT64"" "\n",tf1); sb->safePrintf("\t\t\t%"INT64"" "\n",tf2); sb->safePrintf("\t\t\t%f" "\n",tfw1); sb->safePrintf("\t\t\t%f" "\n",tfw2); sb->safePrintf("\t\t\t" "%"INT32"\n", (int32_t)(ps->m_isHalfStopWikiBigram1)); sb->safePrintf("\t\t\t" "%"INT32"\n", (int32_t)(ps->m_isHalfStopWikiBigram2)); sb->safePrintf("\t\t\t%.01f" "\n", wbw1); sb->safePrintf("\t\t\t%.01f" "\n", wbw2); sb->safePrintf("\t\t\t" "" "\n", wp); sb->safePrintf("\t\t\t" "%"INT32"" "\n", ps->m_qdist ); sb->safePrintf("\t\t\t" "%.01f" "\n", wiw ); sb->safePrintf("\t\t\t%f\n", ps->m_finalScore); sb->safePrintf("\t\t\t" "" "\n" , t1 , t2 ); sb->safePrintf("\t\t\t" "%.1f
"//hashgroupweight "*" "%.1f"//hashgroupweight "*" "%.1f" // syn weight "*" "%.1f" // syn weight "*" "%.1f"//wikibigramweight "*" "%.1f"//wikibigramweight "*" "%.02f"//density weight "*" "%.02f"//density weight "*" "%.02f" // wordspam weight "*" "%.02f" // wordspam weight "*" "%.02f"//tf weight "*" "%.02f"//tf weight , ps->m_finalScore , hgw1 , hgw2 , sw1 , sw2 , wbw1 , wbw2 , dnw1 , dnw2 , wsw1 , wsw2 , tfw1 , tfw2 ); if ( ps->m_fixedDistance ) sb->safePrintf( "/%"INT32" " , (int32_t)FIXED_DISTANCE ); else sb->safePrintf( "/" "(((%"INT32"" "-%"INT32"" ")-%"INT32")+1.0%s)" , a,b,ps->m_qdist,bes); // wikipedia weight if ( wiw != 1.0 ) sb->safePrintf("*%.01f", wiw ); sb->safePrintf("]]>" "\n" ); sb->safePrintf("\t\t\n"); return true; // continue; } // print out the entire details i guess //sb->safePrintf("
%.02f
" // ""); // then print the details just like the // single term table below //sb->safePrintf("" // "" // "" // "" // "" // "" // //"" // "" // "" // "" // "" // "" // "" // ); // // print first term in first row // sb->safePrintf("",ps->m_finalScore); //sb->safeMemcpy ( q->m_qterms[qtn1].m_term , // q->m_qterms[qtn1].m_termLen ); //sb->safePrintf(""); sb->safePrintf("" , getHashGroupString(hg1) , hgw1 ); // the word position sb->safePrintf("" "" ,mr->m_docId ,(int32_t)ps->m_wordPos1 ,si->m_cr->m_coll ,(int32_t)ps->m_wordPos1); // is synonym? //if ( sw1 != 1.00 ) sb->safePrintf("",syn1,sw1); //else // sb->safePrintf(""); // wikibigram?/weight //if ( wbw1 != 1.0 ) sb->safePrintf("",bs1,wbw1); //else // sb->safePrintf(""); // diversity - // not needed for term pair algo //sb->safePrintf("", // (int32_t)dr1,dvw1); // density sb->safePrintf("", (int32_t)de1,dnw1); // word spam if ( hg1 == HASHGROUP_INLINKTEXT ) { sb->safePrintf(""); sb->safePrintf("", (int32_t)wr1,wsw1); } else { sb->safePrintf(""); sb->safePrintf(""); } // term freq sb->safePrintf("", tf1,tfw1); // insamewikiphrase? sb->safePrintf("", wp,ps->m_qdist,wiw); // end the row sb->safePrintf(""); // // print 2nd term in 2nd row // sb->safePrintf(""); sb->safePrintf(//"" , getHashGroupString(hg2) , hgw2 ); // the word position sb->safePrintf("" "" ,mr->m_docId ,(int32_t)ps->m_wordPos2 ,si->m_cr->m_coll ,(int32_t)ps->m_wordPos2); // is synonym? //if ( sw2 != 1.00 ) sb->safePrintf("",syn2,sw2); //else // sb->safePrintf(""); // wikibigram?/weight //if ( wbw2 != 1.0 ) sb->safePrintf("",bs2,wbw2); //else // sb->safePrintf(""); // diversity //sb->safePrintf("", // (int32_t)dr2,dvw2); // density sb->safePrintf("", (int32_t)de2,dnw2); // word spam if ( hg2 == HASHGROUP_INLINKTEXT ) { sb->safePrintf(""); sb->safePrintf("", (int32_t)wr2,wsw2); } else { sb->safePrintf(""); sb->safePrintf(""); } // term freq sb->safePrintf("", tf2,tfw2); // insamewikiphrase? 
sb->safePrintf("", wp,ps->m_qdist,wiw); // end the row sb->safePrintf(""); sb->safePrintf("" //"
" // "
"); //if ( q->m_qterms[qtn1].m_isPhrase ) // sb->pushChar('\"'); //sb->safeMemcpy ( q->m_qterms[qtn1].m_term , // q->m_qterms[qtn1].m_termLen ); //if ( q->m_qterms[qtn1].m_isPhrase ) // sb->pushChar('\"'); //sb->safePrintf(" vs "); //if ( q->m_qterms[qtn2].m_isPhrase ) // sb->pushChar('\"'); //sb->safeMemcpy ( q->m_qterms[qtn2].m_term , // q->m_qterms[qtn2].m_termLen ); //if ( q->m_qterms[qtn2].m_isPhrase ) // sb->pushChar('\"'); //sb->safePrintf("
termlocationwordPossynonymwikibigramdiversityRank/weightdensityRankwordSpamRankinlinkSiteRanktermFreqinWikiPhrase/qdist
"); sb->safePrintf("" ); sb->safePrintf("%.04f" "%s " "%.01f"); //"safePrintf("safePrintf("safePrintf("%"INT64"" "&page=4" //"&page=sections&" "&hipos=%"INT32"" "&c=%s#hipos\">" "%"INT32"%s %.02f" " %s %.02f" " %"INT32"/" // "%f%"INT32" " "%.02f %"INT32" " "%.02f%"INT32"", (int32_t)wr1); //if ( wsw1 != 1.0 ) sb->safePrintf( " " "%.02f", wsw1); sb->safePrintf(" %"INT64" " "%.02f%s %"INT32"/%.01f
"); //sb->safeMemcpy ( q->m_qterms[qtn2].m_term , // q->m_qterms[qtn2].m_termLen ); //sb->safePrintf("" "%s " "%.01f"); //"safePrintf("safePrintf("safePrintf("%"INT64"" "&page=4&" "hipos=%"INT32"&c=%s#hipos\">" "%"INT32"%s %.02f" " %s %.02f" " %"INT32"/" // "%f%"INT32" " "%.02f %"INT32" " "%.02f%"INT32"", (int32_t)wr2); //if ( wsw2 != 1.0 ) sb->safePrintf( " " "%.02f", wsw2); sb->safePrintf(" %"INT64" " "%.02f%s/%"INT32" %.01f
safePrintf("id=poo%"INT32" ",s_count); } sb->safePrintf("colspan=50>" // style=\"display:none\">" "%.03f " "= " //" ( " "100*" "%.1f" "" "*" "%.1f" "" "*" //"(%"INT32" - " , ps->m_finalScore //, idstr , hgw1 , hgw2 //, (int32_t)MAXWORDPOS+1 ); sb->safePrintf("%.1f" "*" " %.1f" "*" // wiki bigram weight "%.02f" "*" "%.02f" "*" "%.02f" "*" "%.02f" "*" "%.02f" "*" " %.02f" "*" "%.02f" "*" "%.02f" , sw1 , sw2 , wbw1 , wbw2 , dnw1 , dnw2 , wsw1 , wsw2 , tfw1 , tfw2 ); if ( ps->m_fixedDistance ) sb->safePrintf( "/%"INT32" " , (int32_t)FIXED_DISTANCE ); else sb->safePrintf( "/" "(((%"INT32"" "-%"INT32")-" "%"INT32") + 1.0%s)" , a,b,ps->m_qdist,bes); // wikipedia weight if ( wiw != 1.0 ) sb->safePrintf("*%.01f", wiw ); sb->safePrintf( // end formula "
" //"
"); ); return true; } bool printSingleTerm ( SafeBuf *sb , Query *q , SingleScore *ss ) { int32_t qtn = ss->m_qtermNum; sb->safePrintf(""); sb->safePrintf(""); return true; } bool printTermPairs ( SafeBuf *sb , Query *q , PairScore *ps ) { // print pair text int32_t qtn1 = ps->m_qtermNum1; int32_t qtn2 = ps->m_qtermNum2; sb->safePrintf("
"); // link to rainbow page //sb->safePrintf("urlEncode( mr->ptr_ubuf ); //sb->safePrintf("&page=4&recycle=1&c=%s\">",coll); if ( q->m_qterms[qtn].m_isPhrase ) sb->pushChar('\"'); sb->safeMemcpy ( q->m_qterms[qtn].m_term , q->m_qterms[qtn].m_termLen ); if ( q->m_qterms[qtn].m_isPhrase ) sb->pushChar('\"'); //sb->safePrintf(""); sb->safePrintf("
" "" "" "" "" "" "" //"" "" "" "" // nlinkSiteRank" "" "\n" ); return true; } bool printSingleScore ( SafeBuf *sb , SearchInput *si , SingleScore *ss , Msg20Reply *mr , Msg40 *msg40 ) { // int16_tcut Query *q = &si->m_q; //SafeBuf ft; // store in final score calc //if ( ft.length() ) ft.safePrintf(" + "); //ft.safePrintf("%f",ss->m_finalScore); char *syn = "no"; float sw = 1.0; if ( ss->m_isSynonym ) { syn = "yes"; sw = SYNONYM_WEIGHT; // Posdb.h } //char bf = ss->m_bflags; float wbw = 1.0; char *bs = "no"; if ( ss->m_isHalfStopWikiBigram ) { bs = "yes"; wbw = WIKI_BIGRAM_WEIGHT; } float hgw = getHashGroupWeight(ss->m_hashGroup); //float dvw = getDiversityWeight(ss->m_diversityRank); float dnw = getDensityWeight(ss->m_densityRank); float wsw = getWordSpamWeight(ss->m_wordSpamRank); // HACK for inlink text! if ( ss->m_hashGroup == HASHGROUP_INLINKTEXT ) wsw = getLinkerWeight(ss->m_wordSpamRank); //int64_t tf = ss->m_termFreq;//ss->m_listSize; int32_t qtn = ss->m_qtermNum; int64_t tf = msg40->m_msg3a.m_termFreqs[qtn]; float tfw = ss->m_tfWeight; if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t\n"); /* sb->safePrintf("\t\t\t%"INT32"" "\n", (int32_t)ss->m_diversityRank); sb->safePrintf("\t\t\t%f" "\n", dvw); */ sb->safePrintf("\t\t\t%"INT32"" "\n", (int32_t)ss->m_densityRank); sb->safePrintf("\t\t\t%f" "\n", dnw); sb->safePrintf("\t\t\tsafeMemcpy ( q->m_qterms[qtn].m_term , q->m_qterms[qtn].m_termLen ); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\t" "\n", getHashGroupString(ss->m_hashGroup)); sb->safePrintf("\t\t\t%.01f" "\n", hgw ); sb->safePrintf("\t\t\t%"INT32"" "\n", (int32_t)ss->m_wordPos ); sb->safePrintf("\t\t\t" "" "\n", syn); sb->safePrintf("\t\t\t%.01f" "\n", sw); sb->safePrintf("\t\t\t%"INT32"" "\n", (int32_t)(ss->m_isHalfStopWikiBigram) ); sb->safePrintf("\t\t\t%.01f" "\n", (float)WIKI_BIGRAM_WEIGHT); // word spam if ( ss->m_hashGroup == HASHGROUP_INLINKTEXT ) { sb->safePrintf("\t\t\t%"INT32"" "\n", (int32_t)ss->m_wordSpamRank); 
sb->safePrintf("\t\t\t%.02f" "\n", wsw); } else { sb->safePrintf("\t\t\t%"INT32"" "\n", (int32_t)ss->m_wordSpamRank); sb->safePrintf("\t\t\t%.02f" "\n", wsw); } // if offsite inlink text show the inlinkid for matching // to an LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks; Inlink *k = info->getNextInlink(NULL); for ( ; k && ss->m_hashGroup==HASHGROUP_INLINKTEXT ; k=info->getNextInlink(k)){ if ( ! k->getLinkText() ) continue; if ( k->m_wordPosStart > ss->m_wordPos ) continue; if ( k->m_wordPosStart + 50 < ss->m_wordPos ) continue; // got it. we HACKED this to put the id // in k->m_siteHash sb->safePrintf("\t\t\t%"INT32"" "\n", k->m_siteHash); } // term freq sb->safePrintf("\t\t\t%"INT64"" "\n",tf); sb->safePrintf("\t\t\t%f" "\n",tfw); sb->safePrintf("\t\t\t%f\n", ss->m_finalScore); sb->safePrintf("\t\t\t" "" "\n" ); sb->safePrintf("\t\t\t" "" "\n" , ss->m_finalScore , hgw , hgw , sw , sw , wbw , wbw , dnw , dnw , wsw , wsw , tfw , tfw ); sb->safePrintf("\t\t\n"); return true; } sb->safePrintf("" "\n" "\n" "\n" // syn // wikibigram?/weight "\n" //"" // diversity "\n" // density , (int32_t)ss->m_wordPos , syn , sw // synonym weight , bs , wbw //, (int32_t)ss->m_diversityRank //, dvw , (int32_t)ss->m_densityRank , dnw ); if ( ss->m_hashGroup == HASHGROUP_INLINKTEXT ) { sb->safePrintf("" "\n" // wordspam , (int32_t)ss->m_wordSpamRank , wsw ); } else { sb->safePrintf("" // wordspam "\n" , (int32_t)ss->m_wordSpamRank , wsw ); } sb->safePrintf("\n" // termfreq "\n" , tf , tfw ); // last row is the computation of score sb->safePrintf("\n" , ss->m_finalScore //, (int32_t)MAXWORDPOS+1 , hgw , hgw , sw , sw , wbw , wbw //, dvw //, dvw , dnw , dnw , wsw , wsw , tfw , tfw ); //sb->safePrintf("
"); if ( q->m_qterms[qtn1].m_isPhrase ) sb->pushChar('\"'); sb->safeMemcpy ( q->m_qterms[qtn1].m_term , q->m_qterms[qtn1].m_termLen ); if ( q->m_qterms[qtn1].m_isPhrase ) sb->pushChar('\"'); sb->safePrintf(" vs "); if ( q->m_qterms[qtn2].m_isPhrase ) sb->pushChar('\"'); sb->safeMemcpy ( q->m_qterms[qtn2].m_term , q->m_qterms[qtn2].m_termLen ); if ( q->m_qterms[qtn2].m_isPhrase ) sb->pushChar('\"'); return true; } bool printScoresHeader ( SafeBuf *sb ) { sb->safePrintf("
scorelocationwordPossynonymwikibigramdiversityRankdensityspaminlnkPRtermFreq
%.03f%s %.1f" "" // wordpos "" "m_finalScore , getHashGroupString(ss->m_hashGroup) , hgw ); //sb->urlEncode( mr->ptr_ubuf ); sb->safePrintf("%"INT64"",mr->m_docId ); sb->safePrintf("&page=4&" "hipos=%"INT32"&c=%s#hipos\">" ,(int32_t)ss->m_wordPos ,si->m_cr->m_coll); sb->safePrintf("%"INT32"%s %.1f" "%s %.02f%"INT32"/%f" //"%"INT32" " "%.02f %"INT32" %.02f" "%"INT32" %.02f" " %"INT64" " "%.02f
" "%.03f " " = " //" %"INT32" * " "100 * " " %.1f" " * " " %.1f" " * " " %.1f" " * " " %.1f" " * " " %.02f"//wikibigramwght " * " " %.02f" " * " "%.02f" " * " "%.02f" " * " "%.02f" " * " "%.02f" " * " "%.02f" " * " "%.02f" //" / ( 3.0 )" // end formula "
" // "
"); return true; } //////// // // . print the directory subtopics // . show these when we are in a directory topic browsing dmoz // . just a list of all the topics/categories // //////// bool printDMOZSubTopics ( SafeBuf *sb, int32_t catId, bool inXml ) { if ( catId <= 0 ) return true; int32_t currType; bool first; bool nextColumn; int32_t maxPerColumn; int32_t currInColumn; int32_t currIndex; char *prefixp; int32_t prefixLen; char *catName; int32_t catNameLen; char encodedName[2048]; //SearchInput *si = &st->m_si; bool isRTL = g_categories->isIdRTL ( catId ); SafeBuf subCatBuf; // stores a list of SubCategories into "subCatBuf" int32_t numSubCats = g_categories->generateSubCats ( catId , &subCatBuf ); // . get the subcategories for a given categoriy // . msg2b::gernerateDirectory() was launched in Msg40.cpp //int32_t numSubCats = st->m_msg40.m_msg2b.m_numSubCats; //SubCategory *subCats = st->m_msg40.m_msg2b.m_subCats; //char *catBuffer = st->m_msg40.m_msg2b.m_catBuffer; //bool showAdultOnTop = st->m_si.m_cr->m_showAdultCategoryOnTop; // just print
if no sub categories if (inXml) { sb->safePrintf ( "\t\n" "\t\t%"INT32"\n" "\t\tprintPathFromId ( sb, catId, // st->m_si.m_cat_dirId, true ); sb->safePrintf ( "]]>\n"); sb->safePrintf ( "\t\t%"INT32"\n", (int32_t)isRTL); } char *p = subCatBuf.getBufStart(); char *pend = subCatBuf.getBuf(); SubCategory *ptrs[MAX_SUB_CATS]; int32_t count = 0; if (numSubCats <= 0) goto dirEnd; // print out the cats currType = 0; // first make ptrs to them for ( ; p < pend ; ) { SubCategory *cat = (SubCategory *)p; ptrs[count++] = cat; p += cat->getRecSize(); // do not breach if ( count >= MAX_SUB_CATS ) break; } for (int32_t i = 0; i < count ; i++ ) { SubCategory *cat = ptrs[i]; first = false; catName = cat->getName();//&catBuffer[subCats[i].m_nameOffset]; catNameLen = cat->m_nameLen;//subCats[i].m_nameLen; // this is the last topic in the dmoz dir path // so if the dmoz topic is Top/Arts/Directories then // the prefixp is "Directories" prefixp = cat->getPrefix();//&catBuffer[subCats[i].m_prefixOffset]; prefixLen = cat->m_prefixLen;//subCats[i].m_prefixLen; // skip bad categories currIndex=g_categories->getIndexFromPath(catName,catNameLen); if (currIndex < 0) continue; // skip top adult category if we're supposed to /* if ( !inXml && st->m_si.m_catId == 1 && si->m_familyFilter && g_categories->isIndexAdultStart ( currIndex ) ) continue; */ // check for room //if (p + subCats[i].m_prefixLen*2 + // subCats[i].m_nameLen*2 + // 512 > pend){ // goto diroverflow; //} // print simple xml tag for inXml if (inXml) { switch ( cat->m_type ) { case SUBCAT_LETTERBAR: sb->safePrintf ( "\t\tsafePrintf ( "]]>" ); sb->safePrintf ( "%"INT32"", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_NARROW2: sb->safePrintf ( "\t\tutf8Encode2 ( catName, catNameLen ); sb->safePrintf ( "]]>"); sb->safePrintf ( "%"INT32"", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_NARROW1: sb->safePrintf ( "\t\tutf8Encode2 ( catName, 
catNameLen ); sb->safePrintf ( "]]>" ); sb->safePrintf ( "%"INT32"", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_NARROW: sb->safePrintf ( "\t\tutf8Encode2 ( catName, catNameLen ); sb->safePrintf ( "]]>" ); sb->safePrintf ( "%"INT32"", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_SYMBOLIC2: sb->safePrintf ( "\t\tutf8Encode2 ( prefixp, prefixLen ); sb->safePrintf ( ":" ); sb->utf8Encode2 ( catName, catNameLen ); sb->safePrintf ( "]]>" ); sb->safePrintf ( "%"INT32"", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_SYMBOLIC1: sb->safePrintf ( "\t\tutf8Encode2 ( prefixp, prefixLen ); sb->safePrintf ( ":" ); sb->utf8Encode2 ( catName, catNameLen ); sb->safePrintf ( "]]>" ); sb->safePrintf ( "%"INT32"", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_SYMBOLIC: sb->safePrintf ( "\t\tutf8Encode2 ( prefixp, prefixLen ); sb->safePrintf ( ":" ); sb->utf8Encode2 ( catName, catNameLen ); sb->safePrintf ( "]]>" ); sb->safePrintf ( "%"INT32"", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_RELATED: sb->safePrintf ( "\t\tutf8Encode2 ( catName, catNameLen ); sb->safePrintf ( "]]>" ); sb->safePrintf ( "%"INT32"", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n" ); break; case SUBCAT_ALTLANG: sb->safePrintf ( "\t\tutf8Encode2 ( prefixp, prefixLen ); sb->safePrintf ( ":" ); sb->utf8Encode2 ( catName, catNameLen ); sb->safePrintf ( "]]>" ); sb->safePrintf ( "%"INT32"", g_categories->getNumUrlsFromIndex( currIndex) ); sb->safePrintf ( "\n"); break; } continue; } // print type header if ( cat->m_type - currType >= 10) { // end the last type if (currType == SUBCAT_LETTERBAR) sb->safePrintf(" ]\n"); else if (currType != 0) sb->safePrintf ( "\n
\n" ); // start the new type switch (cat->m_type) { case SUBCAT_LETTERBAR: sb->safePrintf ( "" "
[ " ); break; case SUBCAT_NARROW2: case SUBCAT_SYMBOLIC2: case SUBCAT_NARROW1: case SUBCAT_SYMBOLIC1: case SUBCAT_NARROW: case SUBCAT_SYMBOLIC: sb->safePrintf("
\n"); break; case SUBCAT_RELATED: if (currType == 0 || currType == SUBCAT_LETTERBAR) sb->safePrintf("
"); else sb->safePrintf("
"); if (isRTL) sb->safePrintf(""); sb->safePrintf ( "Related Categories:" "" ); if (isRTL) sb->safePrintf(""); break; case SUBCAT_ALTLANG: if (currType == 0 || currType == SUBCAT_LETTERBAR) sb->safePrintf("
"); else sb->safePrintf("
"); if (isRTL) sb->safePrintf(""); sb->safePrintf ( "This category in other" " languages:"); if (isRTL) sb->safePrintf(""); break; } currType = ( cat->m_type/10)*10; first = true; nextColumn = false; currInColumn = 0; if (currType == SUBCAT_LETTERBAR || currType == SUBCAT_RELATED) maxPerColumn = 999; else { // . check how many columns we'll use for this // type int32_t numInType = 1; for (int32_t j = i+1; j < numSubCats; j++) { if ( ptrs[j]->m_type - currType >= 10) break; numInType++; } // column for every 5, up to 3 columns int32_t numColumns = numInType/5; if ( numInType%5 > 0 ) numColumns++; if ( currType == SUBCAT_ALTLANG && numColumns > 4) numColumns = 4; else if (numColumns > 3) numColumns = 3; // max number of links per column maxPerColumn = numInType/numColumns; if (numInType%numColumns > 0) maxPerColumn++; } } // start the sub cat if (first) { if (currType != SUBCAT_LETTERBAR) sb->safePrintf ( "" "
" "
    " "\n
  • "); } // check for the next column else if (nextColumn) { sb->safePrintf ( "\n
" "
\n"); } dirEnd: if (inXml) sb->safePrintf("\t\n"); else { sb->safePrintf(""); sb->safePrintf("
\n");//
\n"); } return true; } bool printDMOZCrumb ( SafeBuf *sb , int32_t catId , bool xml ) { // catid -1 means error if ( catId <= 0 ) return true; int32_t dirIndex = g_categories->getIndexFromId(catId); // dirIndex = g_categories->getIndexFromId(si->m_cat_sdir); if (dirIndex < 0) dirIndex = 0; // display the directory bread crumb //if( (si->m_cat_dirId > 0 && si->m_isMasterAdmin && !si->m_isFriend) // || (si->m_cat_sdir > 0 && si->m_cat_sdirt != 0) ) // sb->safePrintf("

"); // int16_tcut. rtl=Right To Left language format. bool rtl = g_categories->isIdRTL ( catId ) ; //st->m_isRTL = rtl; if ( ! xml ) { sb->safePrintf("\n"); if ( rtl ) sb->safePrintf(""); //sb->safePrintf("Top: "); } // put crumbin xml? if ( xml ) sb->safePrintf("printPathCrumbFromIndex(sb,dirIndex,rtl); if ( xml ) sb->safePrintf("]]>\n" ); // how many urls/entries in this topic? int32_t nu =g_categories->getNumUrlsFromIndex(dirIndex); // print the num if ( ! xml ) { sb->safePrintf("  "); if ( rtl ) sb->safePrintf("(%"INT32")",nu); else sb->safePrintf("(%"INT32")", nu); sb->safePrintf("

\n"); } return true; } bool printDmozRadioButtons ( SafeBuf *sb , int32_t catId ) ; bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr, bool printGigablast ) ; // if catId >= 1 then print the dmoz radio button bool printLogoAndSearchBox ( SafeBuf *sb , HttpRequest *hr , int32_t catId , SearchInput *si ) { char *root = ""; if ( g_conf.m_isMattWells ) root = "http://www.gigablast.com"; // now make a TABLE, left PANE contains gigabits and stuff /* sb->safePrintf( // logo and menu table "" //"style=color:blue;>" "" // take out logo now that we have the circle rocket // "" "" // "" // "
" // "" // "" // "" // "" //, root ); */ if ( catId >= 0 ) { CollectionRec *cr = g_collectiondb.getRec ( hr ); printFrontPageShell ( sb , "directory",cr,true);//PAGE_DIRECTOR } /* // menu above search box sb->safePrintf( "
" "   " ); if ( catId <= 0 ) sb->safePrintf("web"); else sb->safePrintf("web"); sb->safePrintf("      " ); if ( g_conf.m_isMattWells ) { // SEO functionality not included yet - so redir to gigablast. if ( g_conf.m_isMattWells ) sb->safePrintf(""); else sb->safePrintf(""); sb->safePrintf( "seo" "      " ); } if (catId <= 0 ) sb->safePrintf("" "directory" "" ); else sb->safePrintf("" "directory"); */ char *coll = hr->getString("c"); if ( ! coll ) coll = ""; // if there's a ton of sites use the post method otherwise // they won't fit into the http request, the browser will reject // sending such a large request with "GET" char *method = "GET"; if ( si && si->m_sites && gbstrlen(si->m_sites)>800 ) method = "POST"; sb->safePrintf( //"      " // i'm not sure why this was removed. perhaps // because it is not working yet because of // some bugs... // "" // "advanced" // "" // "     " // "" // "add url" // "" /* "   |   " "" "blog" "" "   |   " "" "about" "" */ //"

" // // search box // "
\n\n" // propagate the collection if they re-search "" , method , coll ); // propagate prepend char *prepend = hr->getString("prepend"); if ( prepend ) { sb->safePrintf("htmlEncode ( prepend, gbstrlen(prepend), false); sb->safePrintf("\">"); } // put search box in a box sb->safePrintf( "
" "
" "
" "
"); sb->safePrintf ( //"
"getString("q",&qlen,"",NULL); sb->htmlEncode ( qstr , qlen , false ); // if it was an advanced search, this can be empty if ( qlen == 0 && si && si->m_displayQuery ) sb->htmlEncode ( si->m_displayQuery ); sb->safePrintf ("\">" //"" "   " "
" "GO" "
" ); // print "Search [ ] sites [ ] pages in this topic or below" if ( catId >= 0 ) { sb->safePrintf("
"); printDmozRadioButtons(sb,catId); } sb->safePrintf( "
" "
" "
" ); /* else { sb->safePrintf("Try your search on: " "   " "google      " "bing"); } */ // do not print filter bar if showing a dmoz topic if ( catId < 0 ) printSearchFiltersBar ( sb , hr ); sb->safePrintf( "\n" // "
\n" ); return true; } bool printDmozRadioButtons ( SafeBuf *sb , int32_t catId ) { sb->safePrintf("Search " " sites " " pages " "in this topic or below" , catId , catId ); return true; } /* // print the search options under a dmoz search box bool printDirectorySearchType ( SafeBuf& sb, int32_t sdirt ) { // default to entire directory if (sdirt < 1 || sdirt > 4) sdirt = 3; // by default search the whole thing sb->safePrintf("safePrintf(" checked>"); else sb->safePrintf(">"); sb->safePrintf("Entire Directory
\n"); // entire category sb->safePrintf("safePrintf(" checked>"); else sb->safePrintf(">"); sb->safePrintf("Entire Category
\n"); // base category only sb->safePrintf("
safePrintf(" checked>"); else sb->safePrintf(">"); sb->safePrintf("Pages in Base Category
\n"); // sites in base category sb->safePrintf("safePrintf(" checked>"); else sb->safePrintf(">"); sb->safePrintf("Sites in Base Category
\n"); // sites in entire category sb->safePrintf("safePrintf(" checked>"); else sb->safePrintf(">"); sb->safePrintf("Sites in Entire Category
\n"); // end it return true; } */ // return 1 if a should be before b int csvPtrCmp ( const void *a, const void *b ) { //JsonItem *ja = (JsonItem **)a; //JsonItem *jb = (JsonItem **)b; char *pa = *(char **)a; char *pb = *(char **)b; if ( strcmp(pa,"type") == 0 ) return -1; if ( strcmp(pb,"type") == 0 ) return 1; // force title on top if ( strcmp(pa,"product.title") == 0 ) return -1; if ( strcmp(pb,"product.title") == 0 ) return 1; if ( strcmp(pa,"title") == 0 ) return -1; if ( strcmp(pb,"title") == 0 ) return 1; // otherwise string compare int val = strcmp(pa,pb); return val; } #include "Json.h" // // print header row in csv // bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) { Msg40 *msg40 = &st->m_msg40; int32_t numResults = msg40->getNumResults(); char tmp1[1024]; SafeBuf tmpBuf (tmp1 , 1024); char tmp2[1024]; SafeBuf nameBuf (tmp2, 1024); char nbuf[27000]; HashTableX nameTable; if ( ! nameTable.set ( 8,4,2048,nbuf,27000,false,0,"ntbuf") ) return false; int32_t niceness = 0; // . scan every fucking json item in the search results. // . we still need to deal with the case when there are so many // search results we have to dump each msg20 reply to disk in // order. then we'll have to update this code to scan that file. for ( int32_t i = 0 ; i < numResults ; i++ ) { // get the msg20 reply for search result #i Msg20 *m20 = msg40->m_msg20[i]; Msg20Reply *mr = m20->m_r; if ( ! mr ) { log("results: missing msg20 reply for result #%"INT32"",i); continue; } // get content char *json = mr->ptr_content; // how can it be empty? if ( ! json ) continue; // parse it up Json jp; jp.parseJsonStringIntoJsonItems ( json , niceness ); // scan each json item for ( JsonItem *ji = jp.getFirstItem(); ji ; ji = ji->m_next ){ // skip if not number or string if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING ) continue; // if in an array, do not print! csv is not // good for arrays... like "media":[....] . 
that // one might be ok, but if the elements in the // array are not simple types, like, if they are // unflat json objects then it is not well suited // for csv. if ( ji->isInArray() ) continue; // skip "html" field... too spammy for csv and > 32k // causes libreoffice calc to truncate it and break // its parsing if ( ji->m_name && //! ji->m_parent && strcmp(ji->m_name,"html")==0) continue; // reset length of buf to 0 tmpBuf.reset(); // . get the name of the item into "nameBuf" // . returns false with g_errno set on error if ( ! ji->getCompoundName ( tmpBuf ) ) return false; // is it new? int64_t h64 = hash64n ( tmpBuf.getBufStart() ); if ( nameTable.isInTable ( &h64 ) ) continue; // record offset of the name for our hash table int32_t nameBufOffset = nameBuf.length(); // store the name in our name buffer if ( ! nameBuf.safeStrcpy ( tmpBuf.getBufStart() ) ) return false; if ( ! nameBuf.pushChar ( '\0' ) ) return false; // it's new. add it if ( ! nameTable.addKey ( &h64 , &nameBufOffset ) ) return false; } } // . make array of ptrs to the names so we can sort them // . try to always put title first regardless char *ptrs [ 1024 ]; int32_t numPtrs = 0; for ( int32_t i = 0 ; i < nameTable.m_numSlots ; i++ ) { if ( ! nameTable.m_flags[i] ) continue; int32_t off = *(int32_t *)nameTable.getValueFromSlot(i); char *p = nameBuf.getBufStart() + off; ptrs[numPtrs++] = p; if ( numPtrs >= 1024 ) break; } // sort them qsort ( ptrs , numPtrs , sizeof(char *) , csvPtrCmp ); // set up table to map field name to column for printing the json items HashTableX *columnTable = &st->m_columnTable; if ( ! columnTable->set ( 8,4, numPtrs * 4,NULL,0,false,0,"coltbl" ) ) return false; // now print them out as the header row for ( int32_t i = 0 ; i < numPtrs ; i++ ) { if ( i > 0 && ! sb->pushChar(',') ) return false; if ( ! sb->safeStrcpy ( ptrs[i] ) ) return false; // record the hash of each one for printing out further json // objects in the same order so columns are aligned! 
int64_t h64 = hash64n ( ptrs[i] ); if ( ! columnTable->addKey ( &h64 , &i ) ) return false; } st->m_numCSVColumns = numPtrs; if ( ! sb->pushChar('\n') ) return false; if ( ! sb->nullTerm() ) return false; return true; } // returns false and sets g_errno on error bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) { int32_t niceness = 0; // parse the json Json jp; jp.parseJsonStringIntoJsonItems ( json , niceness ); HashTableX *columnTable = &st->m_columnTable; int32_t numCSVColumns = st->m_numCSVColumns; // make buffer space that we need char ttt[1024]; SafeBuf ptrBuf(ttt,1024); int32_t need = numCSVColumns * sizeof(JsonItem *); if ( ! ptrBuf.reserve ( need ) ) return false; JsonItem **ptrs = (JsonItem **)ptrBuf.getBufStart(); // reset json item ptrs for csv columns. all to NULL memset ( ptrs , 0 , need ); char tmp1[1024]; SafeBuf tmpBuf (tmp1 , 1024); JsonItem *ji; /////// // // print json item in csv // /////// for ( ji = jp.getFirstItem(); ji ; ji = ji->m_next ) { // skip if not number or string if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING ) continue; // skip if not well suited for csv (see above comment) if ( ji->isInArray() ) continue; // . get the name of the item into "nameBuf" // . returns false with g_errno set on error if ( ! ji->getCompoundName ( tmpBuf ) ) return false; // skip "html" field... too spammy for csv and > 32k causes // libreoffice calc to truncate it and break its parsing if ( ji->m_name && //! ji->m_parent && strcmp(ji->m_name,"html")==0) continue; // is it new? int64_t h64 = hash64n ( tmpBuf.getBufStart() ); int32_t slot = columnTable->getSlot ( &h64 ) ; // MUST be in there if ( slot < 0 ) { // do not core on this anymore... 
log("serps: json column not in table : %s",ji->m_name); continue; //char *xx=NULL;*xx=0;} } // get col # int32_t column = *(int32_t *)columnTable->getValueFromSlot ( slot ); // sanity if ( column >= numCSVColumns ) { char *xx=NULL;*xx=0; } // set ptr to it for printing when done parsing every field // for this json item ptrs[column] = ji; } // now print out what we got for ( int32_t i = 0 ; i < numCSVColumns ; i++ ) { // get it ji = ptrs[i]; // skip "html" field... too spammy for csv and > 32k causes // libreoffice calc to truncate it and break its parsing if ( ji && ji->m_name && //! ji->m_parent && strcmp(ji->m_name,"html")==0) continue; // , delimeted if ( i > 0 ) sb->pushChar(','); // skip if none if ( ! ji ) continue; // // get value and print otherwise // if ( ji->m_type == JT_NUMBER ) { // print numbers without double quotes if ( ji->m_valueDouble *10000000.0 == (double)ji->m_valueLong * 10000000.0 ) sb->safePrintf("%"INT32"",ji->m_valueLong); else sb->safePrintf("%f",ji->m_valueDouble); continue; } // print the value sb->pushChar('\"'); // get the json item to print out int32_t vlen = ji->getValueLen(); // truncate char *truncStr = NULL; if ( vlen > 32000 ) { vlen = 32000; truncStr = " ... value truncated because " "Excel can not handle it. Download the " "JSON to get untruncated data."; } // print it out sb->csvEncode ( ji->getValue() , vlen ); // print truncate msg? if ( truncStr ) sb->safeStrcpy ( truncStr ); // end the CSV sb->pushChar('\"'); } sb->pushChar('\n'); sb->nullTerm(); return true; } /* RIP: OLD IFRAME WIDGET CODE HACK bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) { // // begin print controls // sb->safePrintf("" "" "Widget Creator" ); //char *coll = "GLOBAL-INDEX"; CollectionRec *cr = NULL; if ( coll ) cr = g_collectiondb.getRec(coll); // if admin clicks "edit" in the live widget itself put up // some simpler content editing boxes. token required! 
int32_t edit = hr->getLong("inlineedit",0); if ( edit ) { // get widget sites char *sites = cr->m_siteListBuf.getBufStart(); sb->safePrintf("" , sites); sb->safePrintf("
" "" "
" "" ); return true; } sb->safePrintf("\n"); char *c1 = ""; char *c2 = ""; char *c3 = ""; int32_t x1 = hr->getLong("dates" ,0); int32_t x2 = hr->getLong("summaries",0); int32_t x3 = hr->getLong("border" ,0); if ( x1 ) c1 = " checked"; if ( x2 ) c2 = " checked"; if ( x3 ) c3 = " checked"; int32_t width = hr->getLong("width",250); int32_t height = hr->getLong("height",400); int32_t refresh = hr->getLong("refresh",15); char *def = "";//

News

"; int32_t len1,len2,len3,len4; char *header = hr->getString("header",&len1,def); char *sites = hr->getString("sites",&len2,""); char *token = hr->getString("token",&len3,""); //char*query=hr->getString("query",&len4, //"type:article gbsortbyint:date"); char *query =hr->getString("query",&len4, "type:article gbsortbyint:gbspiderdate"); sb->safePrintf("
" "" "" , coll ); sb->safePrintf( "
" "" "" "" "" "" "" , sites , token , query , c1 , c2 , c3 , width , height , refresh , header ); // // end print controls // // // begin print widget // sb->safePrintf ( "" "" "
" "" "" "W" "idget Creator" "
" "" "" "Harness the power of Diffbot." "" "" "
" "Websites to crawl:" "
" "" "
" "Token:" "
" "" "
" "Query:" "
" "" "
" "Show Dates " "" "
" "Show Summaries " "" "
" "Frame border " "" "
" "Width " "" "
" "Height " "" "
" "Refresh in seconds " "" "
" "Custom widget header:" "
" "" "
" "" "" "
" "
" "

" //"
" //, RESULTSWIDTHSTR //,width ); //printTabs ( sb , st ); //printRedBoxes ( sb , st ); #define SHADOWCOLOR "#000000" sb->safePrintf ( // end widget div "
" // end widget column in table "
" // begin div with source in it // "
" // , SHADOWCOLOR // //"
" ); // space widget to the right using this table sb->safePrintf( //class=grad3 " //"style=\"" //"border:2px solid black;" //"padding-bottom:10px;" //"padding-top:10px;" //"padding-left:10px;" //"\"" //">" "
" "
" "" "

" ); int32_t start = sb->length(); char *border = "frameborder=no "; if ( x3 ) border = ""; // this iframe contains the WIDGET sb->safePrintf ( // "
" "\n" //"
" //, si->m_urlParms); //, wp ); int32_t end = sb->length(); sb->reserve ( end - start + 1000 ); char *wdir = "on the left"; int32_t cols = 32; //if ( width <= 240 ) sb->safePrintf("
  "); //else { // sb->safePrintf("


"); // wdir = "above"; // cols = 60; // } sb->safePrintf ( "\n\n" "
" //"


" "" "Insert the following code into your webpage to " "generate the widget %s. " //"
" //"" //"" //"Make $1 per click!" //"
" "

" , wdir ); char *p = sb->getBufStart() + start; sb->safePrintf(""); sb->safePrintf(""); // space widget to the right using this table sb->safePrintf("
"); sb->safePrintf("
"); sb->safePrintf("
"); sb->safePrintf(""); sb->safePrintf(""); return true; } bool sendPageWidget ( TcpSocket *s , HttpRequest *hr ) { SafeBuf sb; char *token = hr->getString("token",NULL); if ( token && ! token[0] ) token = NULL; int32_t edit = hr->getLong("inlineedit",0); if ( ! token && ! edit ) { g_errno = ENOTOKEN; char *msg = mstrerror(g_errno); return g_httpServer.sendErrorReply(s,g_errno,msg); } int32_t tlen = 0; if ( token ) tlen = gbstrlen(token); if ( tlen > 64 ) { g_errno = ENOCOLLREC; char *msg = mstrerror(g_errno); return g_httpServer.sendErrorReply(s,g_errno,msg); } char coll[MAX_COLL_LEN]; CollectionRec *cr = NULL; if ( token ) { sprintf(coll,"%s-widget123",token); cr = g_collectiondb.getRec(coll); } SafeBuf parmList; collnum_t cn = -1; if ( cr ) cn = cr->m_collnum; // . first update their collection with the sites to crawl // . this is NOT a custom diffbot crawl, just a regular one using // the new crawl filters logic, "siteList" char *sites = hr->getString("sites",NULL); // add the collection if does not exist if ( sites && ! cr && token ) { // we need to add the new collnum, so reserve it collnum_t newCollnum = g_collectiondb.reserveCollNum(); // use that cn = newCollnum; // add the new colection named -widget123 g_parms.addNewParmToList1 ( &parmList,cn,coll,0,"addColl"); // note it log("widget: adding new widget coll %s",coll); } if ( cn >= 0 && token ) { // use special url filters profile that spiders sites // shallowly and frequently to pick up new news stories // "1" = (int32_t)UFP_NEWS char ttt[12]; sprintf(ttt,"%"INT32"",(int32_t)UFP_NEWS); // urlfiltersprofile g_parms.addNewParmToList1 ( &parmList,cn,ttt,0,"ufp"); // use diffbot analyze char durl[1024]; sprintf(durl, "http://api.diffbot.com/v2/analyze?mode=auto&token=%s", token); // TODO: ensure we call diffbot ok g_parms.addNewParmToList1 ( &parmList,cn,durl,0,"apiUrl"); } if ( ! sites ) sites = ""; // . update the list of sites to crawl and search and show in widget // . 
if they give an empty list then allow that, it will stop crawling if ( cn >= 0 && token ) g_parms.addNewParmToList1 ( &parmList,cn,sites,0,"sitelist"); if ( parmList.length() ) { // send the parms to all hosts in the network g_parms.broadcastParmList ( &parmList , NULL,//s,// state is socket i guess NULL);//doneBroadcastingParms2 ); } // now display the widget controls and the widget and the iframe code printWidgetPage ( &sb , hr , coll ); return g_httpServer.sendDynamicPage(s, sb.getBufStart(), sb.length(), -1,//cacheTime -1 means not tocache false, // POST? "text/html", 200, // httpstatus NULL, // cookie "UTF-8"); // charset } */ bool printDmozEntry ( SafeBuf *sb , int32_t catId , bool direct , char *dmozTitle , char *dmozSummary , char *dmozAnchor , SearchInput *si ) { // assign shit if we match the dmoz cat we are showing //if ( catIds[i] == si->m_catId) break; if ( si->m_format == FORMAT_XML ) { sb->safePrintf("\t\t\n"); sb->safePrintf("\t\t\t%"INT32"" "\n",catId); sb->safePrintf("\t\t\t%"INT32"\n", (int32_t)direct); // print the name of the dmoz category sb->safePrintf("\t\t\tprintPathFromId(&xb, catId, false, si->m_isRTL); sb->cdataEncode(xb.getBufStart()); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\tcdataEncode(dmozTitle); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\tcdataEncode(dmozSummary); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\tcdataEncode(dmozAnchor); sb->safePrintf("]]>\n"); sb->safePrintf("\t\t\n"); return true; } if ( si->m_format == FORMAT_JSON ) { sb->safePrintf("\t\t\"dmozEntry\":{\n"); sb->safePrintf("\t\t\t\"dmozCatId\":%"INT32",\n", catId); sb->safePrintf("\t\t\t\"directCatId\":%"INT32",\n",(int32_t)direct); // print the name of the dmoz category sb->safePrintf("\t\t\t\"dmozCatStr\":\""); char xbuf[256]; SafeBuf xb(xbuf,256,0,false); g_categories->printPathFromId(&xb, catId, false, si->m_isRTL); sb->jsonEncode(xb.getBufStart()); sb->safePrintf("\",\n"); sb->safePrintf("\t\t\t\"dmozTitle\":\""); sb->jsonEncode(dmozTitle); 
sb->safePrintf("\",\n"); sb->safePrintf("\t\t\t\"dmozSum\":\""); sb->jsonEncode(dmozSummary); sb->safePrintf("\",\n"); sb->safePrintf("\t\t\t\"dmozAnchor\":\""); sb->jsonEncode(dmozAnchor); sb->safePrintf("\"\n"); sb->safePrintf("\t\t},\n"); return true; } return true; } class MenuItem { public: int32_t m_menuNum; char *m_title; // we append this to the url char *m_cgi; char m_tmp[25]; char *m_icon; // for languages - the language flag char m_iconWidth; char m_iconHeight; }; static MenuItem s_mi[200]; static int32_t s_num = 0; bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) { // 1-1 with the langs in Lang.h char *g_flagBytes[] = { // base64 encoding NULL, // langunknown // english }; /* langUnknown = 0, langEnglish = 1, langFrench = 2, langSpanish = 3, langRussian = 4, langTurkish = 5, langJapanese = 6, langChineseTrad = 7, // cantonese langChineseSimp = 8, // mandarin langKorean = 9, langGerman = 10, langDutch = 11, langItalian = 12, langFinnish = 13, langSwedish = 14, langNorwegian = 15, langPortuguese = 16, langVietnamese = 17, langArabic = 18, langHebrew = 19, langIndonesian = 20, langGreek = 21, langThai = 22, langHindi = 23, langBengala = 24, langPolish = 25, langTagalog = 26, // added for wiktionary langLatin = 27, langEsperanto = 28, langCatalan = 29, langBulgarian = 30, langTranslingual = 31, // used by multiple langs in wiktionary langSerboCroatian = 32, langHungarian = 33, langDanish = 34, langLithuanian = 35, langCzech = 36, langGalician = 37, langGeorgian = 38, langScottishGaelic = 39, langGothic = 40, langRomanian = 41, langIrish = 42, langLatvian = 43, langArmenian = 44, langIcelandic = 45, langAncientGreek = 46, langManx = 47, langIdo = 48, langPersian = 49, langTelugu = 50, langVenetian = 51, langMalgasy = 52, langKurdish = 53, langLuxembourgish = 54, langEstonian = 55, langLast = 56 }; */ SafeBuf cu; hr->getCurrentUrl ( cu ); sb->safePrintf("" ); static bool s_init = false; if ( ! 
s_init ) { int32_t n = 0; s_mi[n].m_menuNum = 0; s_mi[n].m_title = "Any time"; s_mi[n].m_cgi = "secsback=0"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 0; s_mi[n].m_title = "Past 24 hours"; s_mi[n].m_cgi = "secsback=86400"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 0; s_mi[n].m_title = "Past week"; s_mi[n].m_cgi = "secsback=604800"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 0; s_mi[n].m_title = "Past month"; s_mi[n].m_cgi = "secsback=2592000"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 0; s_mi[n].m_title = "Past year"; s_mi[n].m_cgi = "secsback=31536000"; n++; s_mi[n].m_icon = NULL; // sort by s_mi[n].m_menuNum = 1; s_mi[n].m_title = "Sorted by relevance"; s_mi[n].m_cgi = "sortby=0"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 1; s_mi[n].m_title = "Sorted by date"; s_mi[n].m_cgi = "sortby=1"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 1; s_mi[n].m_title = "Reverse sorted by date"; s_mi[n].m_cgi = "sortby=2"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 1; s_mi[n].m_title = "Sorted by site inlinks"; s_mi[n].m_cgi = "sortby=3"; s_mi[n].m_icon = NULL; n++; // languages s_mi[n].m_menuNum = 2; s_mi[n].m_title = "Any language"; s_mi[n].m_cgi = "qlang=xx"; s_mi[n].m_icon = NULL; n++; for ( int32_t i = 0 ; i < langLast ; i++ ) { s_mi[n].m_menuNum = 2; s_mi[n].m_title = getLanguageString(i); char *abbr = getLangAbbr(i); snprintf(s_mi[n].m_tmp,10,"qlang=%s",abbr); s_mi[n].m_cgi = s_mi[n].m_tmp; s_mi[n].m_icon = g_flagBytes[i]; //base64encoded n++; } // filetypes s_mi[n].m_menuNum = 3; s_mi[n].m_title = "Any filetype"; s_mi[n].m_cgi = "filetype=any"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 3; s_mi[n].m_title = "HTML"; s_mi[n].m_cgi = "filetype=html"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 3; s_mi[n].m_title = "TEXT"; s_mi[n].m_cgi = "filetype=txt"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 3; s_mi[n].m_title = "PDF"; s_mi[n].m_cgi = "filetype=pdf"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 3; s_mi[n].m_title = 
"Microsoft Word"; s_mi[n].m_cgi = "filetype=doc"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 3; s_mi[n].m_title = "XML"; s_mi[n].m_cgi = "filetype=xml"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 3; s_mi[n].m_title = "JSON"; s_mi[n].m_cgi = "filetype=json"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 3; s_mi[n].m_title = "Excel"; s_mi[n].m_cgi = "filetype=xls"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 3; s_mi[n].m_title = "PostScript"; s_mi[n].m_cgi = "filetype=ps"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 3; s_mi[n].m_title = "Spider Status"; s_mi[n].m_cgi = "filetype=status"; s_mi[n].m_icon = NULL; n++; // facets s_mi[n].m_menuNum = 4; s_mi[n].m_title = "No Facets"; s_mi[n].m_cgi = "facet="; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 4; s_mi[n].m_title = "Language facet"; s_mi[n].m_cgi = "facet=gbfacetint:gblang"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 4; s_mi[n].m_title = "Content type facet"; s_mi[n].m_cgi = "facet=gbfacetstr:type"; s_mi[n].m_icon = NULL; n++; // s_mi[n].m_menuNum = 4; // s_mi[n].m_title = "Ip address"; // s_mi[n].m_cgi = "facet=gbfacetstr:ip"; // n++; s_mi[n].m_menuNum = 4; s_mi[n].m_title = "Url path depth"; s_mi[n].m_cgi = "facet=gbfacetint:gbpathdepth"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 4; s_mi[n].m_title = "Spider date facet"; s_mi[n].m_cgi = "facet=gbfacetint:gbspiderdate"; s_mi[n].m_icon = NULL; n++; // everything in tagdb is hashed s_mi[n].m_menuNum = 4; s_mi[n].m_title = "Site num inlinks facet"; s_mi[n].m_cgi = "facet=gbfacetint:gbtagsitenuminlinks"; s_mi[n].m_icon = NULL; n++; // s_mi[n].m_menuNum = 4; // s_mi[n].m_title = "Domains facet"; // s_mi[n].m_cgi = "facet=gbfacetint:gbdomhash"; // n++; s_mi[n].m_menuNum = 4; s_mi[n].m_title = "Hopcount facet"; s_mi[n].m_cgi = "facet=gbfacetint:gbhopcount"; s_mi[n].m_icon = NULL; n++; // output s_mi[n].m_menuNum = 5; s_mi[n].m_title = "Output HTML"; s_mi[n].m_cgi = "format=html"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 5; 
s_mi[n].m_title = "Output XML"; s_mi[n].m_cgi = "format=xml"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 5; s_mi[n].m_title = "Output JSON"; s_mi[n].m_cgi = "format=json"; s_mi[n].m_icon = NULL; n++; // show/hide banned s_mi[n].m_menuNum = 6; s_mi[n].m_title = "Hide banned results"; s_mi[n].m_cgi = "sb=0"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 6; s_mi[n].m_title = "Show banned results"; s_mi[n].m_cgi = "sb=1"; s_mi[n].m_icon = NULL; n++; // spider status s_mi[n].m_menuNum = 7; s_mi[n].m_title = "Hide Spider Log"; s_mi[n].m_cgi = "splog=0"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 7; s_mi[n].m_title = "Show Spider Log"; s_mi[n].m_cgi = "q=type:status"; s_mi[n].m_icon = NULL; n++; // family filter s_mi[n].m_menuNum = 8; s_mi[n].m_title = "Family Filter Off"; s_mi[n].m_cgi = "ff=0"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 8; s_mi[n].m_title = "Family Filter On"; s_mi[n].m_cgi = "ff=1"; s_mi[n].m_icon = NULL; n++; // META TAGS s_mi[n].m_menuNum = 9; s_mi[n].m_title = "No Meta Tags"; s_mi[n].m_cgi = "dt="; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 9; s_mi[n].m_title = "Show Meta Tags"; s_mi[n].m_cgi = "dt=keywords+description"; s_mi[n].m_icon = NULL; n++; // ADMIN s_mi[n].m_menuNum = 10; s_mi[n].m_title = "Show Admin View"; s_mi[n].m_cgi = "admin=1"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 10; s_mi[n].m_title = "Show User View"; s_mi[n].m_cgi = "admin=0"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 11; s_mi[n].m_title = "Action"; s_mi[n].m_cgi = ""; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 11; s_mi[n].m_title = "Respider all results"; s_mi[n].m_cgi = "/admin/reindex"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 11; s_mi[n].m_title = "Delete all results"; s_mi[n].m_cgi = "/admin/reindex"; s_mi[n].m_icon = NULL; n++; s_mi[n].m_menuNum = 11; s_mi[n].m_title = "Scrape from google/bing"; s_mi[n].m_cgi = "/admin/inject"; s_mi[n].m_icon = NULL; n++; s_num = n; if ( n > 200 ) { char *xx=NULL;*xx=0; } } // we'll print 
the admin menu custom since it's mostly off-page links // bar of drop down menus sb->safePrintf("
"); for ( int32_t i = 0 ; i <= s_mi[s_num-1].m_menuNum ; i++ ) { // after 4 make a new line if ( i == 5 ) sb->safePrintf("

"); if ( i == 9 ) sb->safePrintf("

"); printMenu ( sb , i , hr ); } sb->safePrintf("
\n"); sb->safePrintf("
\n"); return true; } bool printMenu ( SafeBuf *sb , int32_t menuNum , HttpRequest *hr ) { bool firstOne = true; MenuItem *first = NULL; char *src = hr->m_origUrlRequest; int32_t srcLen = hr->m_origUrlRequestLen; char *frontTag = ""; char *backTag = ""; bool isDefaultHeader = true; // try to set first based on what's in the url for ( int32_t i = 0 ; i < s_num ; i++ ) { // int16_tcut MenuItem *mi = &s_mi[i]; // skip if not our item if ( mi->m_menuNum != menuNum ) continue; // admin menu is special // if ( menuNum == s_num - 1 ) { // first = mi; // frontTag = ""; // backTag = ""; // break; // } // is it in the url char *match = strnstr ( src , mi->m_cgi , srcLen ); // or if empty quotes it is the true header like // for 'hide spider log' option if ( ! match ) { isDefaultHeader = false; continue; } // ensure ? or & preceeds if ( match > src && match[-1] != '?' && match[-1] != '&' ) continue; // and \0 or & follows int32_t milen = gbstrlen(mi->m_cgi); if ( match+milen > src+srcLen ) continue; if ( ! is_wspace_a(match[milen]) && match[milen] != '&' ) continue; // got it first = mi; // do not highlight the orig header if ( isDefaultHeader ) break; frontTag = ""; backTag = ""; break; } for ( int32_t i = 0 ; i < s_num ; i++ ) { // int16_tcut MenuItem *mi = &s_mi[i]; // skip if not our item if ( mi->m_menuNum != menuNum ) continue; if ( ! first ) first = mi; if ( ! firstOne ) goto skip; firstOne = false; // for centering the dropdown sb->safePrintf(""); // print hidden drop down menu sb->safePrintf( "" , mi->m_menuNum ); skip: // . add our cgi to the original url // . so if it has &qlang=de and they select &qlang=en // we have to replace it... etc. SafeBuf newUrl; replaceParm ( mi->m_cgi , &newUrl , hr ); // print each item in there sb->safePrintf("" "
" "" , newUrl.getBufStart() ); // print checkmark (check mark) next to selected one // if not the default (trueHeader) if ( mi == first ) // ! isDefaultHeader && mi == first ) sb->safePrintf("%c%c%c", 0xe2,0x9c,0x93); else sb->safePrintf("    "); sb->safePrintf(" %s" "
" "
" , mi->m_title ); //sb->safePrintf("

"); } // wrap up the drop down sb->safePrintf("
"); // print heading or current selection i guess sb->safePrintf( // separate menus with these two spaces "     " // print the menu header that when clicked // will show the drop down "" "%s%s%s %c%c%c" "" , first->m_menuNum , first->m_menuNum , first->m_menuNum , frontTag , first->m_title , backTag // print triangle ,0xe2 ,0x96 ,0xbc ); return true; } bool replaceParm ( char *cgi , SafeBuf *newUrl , HttpRequest *hr ) { // get original request url. this is not \0 terminated char *src = hr->m_origUrlRequest; int32_t srcLen = hr->m_origUrlRequestLen; return replaceParm2 ( cgi ,newUrl, src, srcLen ); } bool replaceParm2 ( char *cgi , SafeBuf *newUrl , char *oldUrl , int32_t oldUrlLen ) { char *src = oldUrl; int32_t srcLen = oldUrlLen; char *srcEnd = src + srcLen; char *equal = strstr(cgi,"="); if ( ! equal ) return log("results: %s has no equal sign",cgi); int32_t cgiLen = equal - cgi; char *found = NULL; char *p = src; tryagain: found = strncasestr ( p , cgi , srcEnd - p , cgiLen ); // if no ? or & before it it is bogus! if ( found && found[-1] != '&' && found[-1] != '?' ) { // try again p = found + 1; goto tryagain; } // if no collision, just append it if ( ! found ) { if ( ! newUrl->safeMemcpy ( src , srcLen ) ) return false; if ( ! newUrl->pushChar('&') ) return false; if ( ! newUrl->safeStrcpy ( cgi ) ) return false; if ( ! newUrl->nullTerm() ) return false; return true; } // . otherwise we have to replace it // . copy up to where it starts if ( ! newUrl->safeMemcpy ( src , found-src ) ) return false; // then insert our new cgi there if ( ! newUrl->safeStrcpy ( cgi ) ) return false; // then resume it char *foundEnd = strncasestr ( found , "&" , srcEnd - found ); // if nothing came after... if ( ! foundEnd ) { if ( ! newUrl->nullTerm() ) return false; return true; } // copy over what came after if ( ! newUrl->safeMemcpy ( foundEnd, srcEnd-foundEnd ) ) return false; if ( ! 
newUrl->nullTerm() ) return false; return true; } bool printMetaContent ( Msg40 *msg40 , int32_t i , State0 *st, SafeBuf *sb ) { // store the user-requested meta tags content SearchInput *si = &st->m_si; char *pp = si->m_displayMetas; char *ppend = pp + gbstrlen(si->m_displayMetas); Msg20 *m = msg40->m_msg20[i];//getMsg20(i); Msg20Reply *mr = m->m_r; char *dbuf = mr->ptr_dbuf;//msg40->getDisplayBuf(i); int32_t dbufLen = mr->size_dbuf-1;//msg40->getDisplayBufLen(i); char *dbufEnd = dbuf + (dbufLen-1); char *dptr = dbuf; //bool printedSomething = false; // loop over the names of the requested meta tags while ( pp < ppend && dptr < dbufEnd ) { // . assure last byte of dbuf is \0 // provided dbufLen > 0 // . this insures sprintf and gbstrlen won't // crash on dbuf/dptr if ( dbuf [ dbufLen ] != '\0' ) { log(LOG_LOGIC,"query: Meta tag buffer has no \\0."); break; } // skip initial spaces while ( pp < ppend && is_wspace_a(*pp) ) pp++; // break if done if ( ! *pp ) break; // that's the start of the meta tag name char *ss = pp; // . find end of that meta tag name // . can end in : -- specifies max len while ( pp < ppend && ! is_wspace_a(*pp) && *pp != ':' ) pp++; // save current char char c = *pp; char *cp = pp; // NULL terminate the name *pp++ = '\0'; // if ':' was specified, skip the rest if ( c == ':' ) while ( pp < ppend && ! 
is_wspace_a(*pp)) pp++; // print the name //int32_t sslen = gbstrlen ( ss ); //int32_t ddlen = gbstrlen ( dptr ); int32_t ddlen = dbufLen; //if ( p + sslen + ddlen + 100 > pend ) continue; // newspaperarchive wants tags printed even if no value // make sure the meta tag isn't fucked up for ( int32_t ti = 0; ti < ddlen; ti++ ) { if ( dptr[ti] == '"' || dptr[ti] == '>' || dptr[ti] == '<' || dptr[ti] == '\r' || dptr[ti] == '\n' || dptr[ti] == '\0' ) { ddlen = ti; break; } } if ( ddlen > 0 ) { // ship it out if ( si->m_format == FORMAT_XML ) { sb->safePrintf ( "\t\t" "cdataEncode ( dptr, ddlen ); sb->safePrintf ( "]]>\n" ); } else if ( si->m_format == FORMAT_JSON ) { sb->safePrintf ( "\t\t\"display.%s\":\"",ss); sb->jsonEncode ( dptr, ddlen ); sb->safePrintf ( "\",\n"); } // otherwise, print in light gray else { sb->safePrintf("" "%s: ", ss ); sb->safeMemcpy ( dptr, ddlen ); sb->safePrintf ( "
" ); } } // restore tag name buffer *cp = c; // point to next content of tag to display dptr += ddlen + 1; } return true; }