#include "gb-include.h"

#include "SearchInput.h"
#include "Parms.h"         // g_parms
//#include "CollectionRec.h" // cr
#include "Pages.h"         // g_msg
#include "LanguageIdentifier.h"
#include "CountryCode.h"
#include "geo_ip_table.h"
#include "Users.h"
#include "Address.h" // getLatLonFromUserInput
#include "Timedb.h"
#include "PageResults.h"

// Construction just delegates to reset().
SearchInput::SearchInput() {
	reset();
}

// Destruction just delegates to reset().
SearchInput::~SearchInput() {
	reset();
}

// . currently a no-op: every assignment that used to live here is
//   commented out
// . kept so the constructor, destructor and setToDefaults() have a
//   single hook to call
void SearchInput::reset ( ) {
	/*
	m_langHint = 0;
	m_languageWeightFactor = 0.33;
	m_enableLanguageSorting = 0;
	m_queryIP = 0;
	m_hr = NULL;
	m_gbcountry = NULL;
	m_gbcountryLen = 0;
	m_country = 0;
	m_language = 0;
	m_sq = NULL;
	m_sqLen = 0;
	m_noDocIds = NULL;
	m_noSiteIds = NULL;
	m_noDocIdsLen = 0;
	m_noSiteIdsLen = 0;
	*/
}

// . zero the plain-data members and install the few hard-coded defaults
// . "cr" is not read here
// . "niceness" is stored into m_niceness
void SearchInput::setToDefaults ( CollectionRec *cr , long niceness ) {
	// reset it first
	reset();
	// . set all to 0 just to avoid any inconsistencies
	// . the old code used the distance from m_START to m_END_TEST as
	//   the length but started clearing at "this", so if m_START is not
	//   the very first member the bytes just before m_END_TEST were
	//   never cleared
	// . clear from "this" up to m_END_TEST instead. that is a superset
	//   of what was cleared before and identical when m_START is at
	//   offset 0
	// . NOTE(review): assumes no non-POD member lives before
	//   m_END_TEST - confirm against SearchInput.h
	long size = (char *)&m_END_TEST - (char *)this;
	memset ( this , 0x00 , size );
	// empty the three query buffers
	m_sbuf1.reset();
	m_sbuf2.reset();
	m_sbuf3.reset();
	// set these
	m_numLinesInSummary      = 2;
	m_docsWanted             = 10;
	m_boolFlag               = 2;
	m_maxQueryTerms          = 1000;
	m_niceness               = niceness;
	m_defaultSortLanguageLen = 0;
}

// . make a key for caching the search results page based on this input
do not use all vars, like the m_*ToDisplay should not be included key_t SearchInput::makeKey ( ) { // hash the query long n = m_q->getNumTerms (); long long *termIds = m_q->getTermIds (); char *signs = m_q->getTermSigns (); key_t k; k.n1 = 0; k.n0 = hash64 ( (char *)termIds , n * sizeof(long long) ); k.n0 = hash64 ( (char *)signs , n , k.n0 ); // user defined weights, for weighting each query term separately for ( long i = 0 ; i < n ; i++ ) { k.n0 = hash64 ((char *)&m_q->m_qterms[i].m_userWeight,4, k.n0); k.n0 = hash64 ((char *)&m_q->m_qterms[i].m_userType ,1, k.n0); } // space separated, NULL terminated, list of meta tag names to display if ( m_displayMetas ) k.n0 = hash64b ( m_displayMetas , k.n0 ); // name of collection in external cluster to get titleRecs for // related pages from if ( m_rp_getExternalPages && m_rp_externalColl ) k.n0 = hash64b ( m_rp_externalColl , k.n0 ); // collection e import from if ( m_importColl ) k.n0 = hash64b ( m_importColl , k.n0 ); // the special query parm if ( m_sq && m_sqLen > 0 ) k.n0 = hash64 ( m_sq , m_sqLen , k.n0 ); if ( m_noDocIds && m_noDocIdsLen ) k.n0 = hash64 ( m_noDocIds , m_noDocIdsLen , k.n0 ); if ( m_noSiteIds && m_noSiteIdsLen ) k.n0 = hash64 ( m_noSiteIds , m_noSiteIdsLen , k.n0 ); // no need to hash these again separately, they are in between // m_START and m_END_HASH // language //if ( m_language ) // k.n0 = hash64 ( m_language , k.n0 ); //if ( m_gblang ) // k.n0 = hash64 ( m_gblang , k.n0 ); // . now include the hash of the search parameters // . 
nnot incuding m_docsToScanForTopics since since we got TopicGroups char *a = ((char *)&m_START) + 4 ; // msg40->m_dpf; char *b = (char *)&m_END_HASH ; // msg40->m_topicGroups; long size = b - a; // push and flush some parms that should not contribute //long save1 = m_refs_numToDisplay; //long save2 = m_rp_numToDisplay; //long save3 = m_numTopicsToDisplay; //m_refs_numToDisplay = 0; //m_rp_numToDisplay = 0; //m_numTopicsToDisplay = 0; // and hash it all up k.n0 = hash64 ( a , size , k.n0 ); // and pop out the parms that did not contribute //m_refs_numToDisplay = save1; //m_rp_numToDisplay = save2; //m_numTopicsToDisplay = save3; // hash each topic group for ( long i = 0 ; i < m_numTopicGroups ; i++ ) { TopicGroup *t = &m_topicGroups[i]; //k.n0 = hash64 ( t->m_numTopics , k.n0 ); k.n0 = hash64 ( t->m_maxTopics , k.n0 ); k.n0 = hash64 ( t->m_docsToScanForTopics , k.n0 ); k.n0 = hash64 ( t->m_minTopicScore , k.n0 ); k.n0 = hash64 ( t->m_maxWordsPerTopic , k.n0 ); k.n0 = hash64b( t->m_meta , k.n0 ); k.n0 = hash64 ( t->m_delimeter , k.n0 ); k.n0 = hash64 ( t->m_useIdfForTopics , k.n0 ); k.n0 = hash64 ( t->m_dedup , k.n0 ); } // . boolean queries have operators (AND OR NOT ( ) ) that we need // to consider in this hash as well. so // . 
so just hash the whole damn query if ( m_q->m_isBoolean ) { char *q = m_q->getQuery(); long qlen = m_q->getQueryLen(); k.n0 = hash64 ( q , qlen , k.n0 ); } // Language stuff k.n0 = hash64(m_defaultSortLanguage, m_defaultSortLanguageLen, k.n0); k.n0 = hash64(m_defaultSortCountry , m_defaultSortCountryLen , k.n0); // debug //logf(LOG_DEBUG,"query: q=%s k.n0=%llu",m_q->getQuery(),k.n0); //Msg1aParms* m1p = msg40->getReferenceParms(); //if( m1p ) { // k.n0=hash64(((char*)m1p)+sizeof(long), // sizeof(Msg1aParms)-8,k.n0); //} return k; } void SearchInput::test ( ) { // set all to 0 just to avoid any inconsistencies char *a = ((char *)&m_START) + 4 ; // msg40->m_dpf; char *b = (char *)&m_END_TEST; long size = b - a; memset ( a , 0x00 , size ); // loop through all possible cgi parms to set SearchInput for ( long i = 0 ; i < g_parms.m_numSearchParms ; i++ ) { Parm *m = g_parms.m_searchParms[i]; char *x = (char *)this + m->m_soff; if ( m->m_type != TYPE_BOOL ) *(long *)x = 0xffffffff; else *(char *)x = 0xff; } // ensure we're all zeros now! long fix = a - (char *)this; unsigned char *p = (unsigned char *)a; for ( long i = 0 ; i < size ; i++ ) { if ( p[i] == 0xff ) continue; // find it long off = i + fix; char *name = NULL; // "unknown"; for ( long k = 0 ; k < g_parms.m_numSearchParms ; k++ ) { Parm *m = g_parms.m_searchParms[k]; if ( m->m_soff != off ) continue; name = m->m_title; break; } if ( ! name ) continue; log("query: Got uncovered SearchInput parm at offset " "%li in SearchInput. name=%s.",off,name); } } void SearchInput::copy ( class SearchInput *si ) { memcpy ( (char *)this , (char *)si , sizeof(SearchInput) ); } class SearchInput *g_si = NULL; bool SearchInput::set ( TcpSocket *sock , HttpRequest *r , Query *q ) { // save it now m_socket = sock; // get coll rec long collLen9; char *coll9 = r->getString ( "c" , &collLen9 ); //if (! coll){coll = g_conf.m_defaultColl; collLen = gbstrlen(coll); } //if ( ! 
coll ) // coll = g_conf.getDefaultColl(r->getHost(), r->getHostLen()); //if ( ! coll || ! coll[0] ) // coll = "main"; //if ( ! coll ) { g_errno = ENOCOLLREC; return false; } //collLen = gbstrlen(coll); CollectionRec *cr = g_collectiondb.getRec ( coll9 ); if ( ! cr ) { g_errno = ENOCOLLREC; g_msg = " (error: no such collection)"; return false; } // set all to 0 just to avoid any inconsistencies //long size = (char *)&m_END_TEST - (char *)&m_START; //memset ( this , 0x00 , size ); setToDefaults( cr , 0 ); // niceness m_cr = cr; m_coll2 = m_cr->m_coll; m_collLen2 = gbstrlen(m_coll2); // from ::reset() m_languageWeightFactor = 0.33; // Set IP for language detection. // (among other things) if ( sock ) m_queryIP = sock->m_ip; else m_queryIP = 0; m_hr = r; // keep ptr to the query class to use m_q = q; // set this here since its size can be variable m_sq = r->getString("sq",&m_sqLen); // negative docids m_noDocIds = r->getString("nodocids",&m_noDocIdsLen); // negative sites m_noSiteIds = r->getString("nositeids",&m_noSiteIdsLen); // Msg5e calls Msg40 with this set to true in the searchInput // so it can analyze the entire pages of each search result so it // can find the article start/end tag sequence indicators m_getTitleRec = r->getLong("gettrs",0); m_getSitePops = r->getLong("getsitepops",0 ); // does this collection ban this IP? /* long encapIp = 0; m if (! 
cr->hasSearchPermission ( sock, encapIp ) ) { g_errno = ENOPERM; g_msg = " (error: permission denied)"; return false; } */ // set all search parms in SearchInput to defaults for ( long i = 0 ; i < g_parms.m_numSearchParms ; i++ ) { Parm *m = g_parms.m_searchParms[i]; // sanity if ( m->m_soff < 0 ) { char *xx=NULL;*xx=0; } char *x = (char *)this + m->m_soff; // what is the def val ptr char *def = NULL; if ( m->m_off >= 0 && m->m_obj == OBJ_COLL ) def = ((char *)cr) + m->m_off; else if ( m->m_off >= 0 && m->m_obj == OBJ_CONF ) def = ((char *)&g_conf) + m->m_off; // set it based on type if ( m->m_type == TYPE_LONG ) { long v = 0; if ( def ) v = *(long *)def; else if ( m->m_def ) v = atol(m->m_def); *(long *)x = v; } else if ( m->m_type == TYPE_BOOL ) { long v = 0; if ( def ) v = *(char *)def; else if ( m->m_def ) v = atol(m->m_def); // sanity test! if ( v != 0 && v != 1 ) log("query: got non-bool default " "for bool parm %s",m->m_title); if ( v ) *(char *)x = 1; else *(char *)x = 0; } else if ( m->m_type == TYPE_CHAR ) { if ( def ) *(char *)x = *(char *)def; else if ( m->m_def ) *(char *)x = atol(m->m_def); } else if ( m->m_type == TYPE_FLOAT ) { float v = 0; if ( def ) v = *(float *)def; else if ( m->m_def ) v = atof(m->m_def); *(float *)x = (float)v; } else if ( m->m_type == TYPE_STRING || m->m_type == TYPE_STRINGBOX ) { //if ( m->m_cgi && strcmp ( m->m_cgi, "erpc" ) == 0 ) // log("hey1"); //if ( m->m_cgi && strcmp ( m->m_scgi, "q" ) == 0 ) // log("hey1"); char *v = NULL; if ( def ) v = (char *)def; else if ( m->m_def ) v = m->m_def; *(char **)x = v; // set the length if ( ! v ) *(long *)(x-4) = 0; else *(long *)(x-4) = gbstrlen(v); } } // this is just used to determine in PageResults.cpp if we should // show admin knobs next to each result... // default to off for now. default back on. m_isAdmin = r->getLong("admin",1); //if ( m_isAdmin ) m_isAdmin = g_users.hasPermission ( r,PAGE_MASTER); // local ip? if ( ! 
r->isLocal() ) m_isAdmin = 0; // default set does not take into account g_conf, // so we will take care of that here ourselves... m_adFeedEnabled = g_conf.m_adFeedEnabled; //m_excludeLinkText = g_conf.m_excludeLinkText; //m_excludeMetaText = g_conf.m_excludeMetaText; // we need to get some cgi values in order to correct the defaults // based on if we're doing an xml feed, have a site: query, etc. //long xml = r->getLong ( "xml" , 0 ); // was "raw" long siteLen = 0; r->getString ("site",&siteLen); long sitesLen = 0; char *sites = r->getString ("sites",&sitesLen,NULL); // save it if there if ( sites && sitesLen > 0 && ( ! m_whiteListBuf.safeStrcpy(sites)|| ! m_whiteListBuf.nullTerm() ) ) return log("query: unable to strcpy whitelist"); char format = getFormatFromRequest ( r ); // now override automatic defaults for special cases if ( format != FORMAT_HTML ) { m_familyFilter = 0; // this is causing me a headache when on when i dont know it m_restrictIndexdbForQuery = false; // this is hackish if ( r->getLong("rt",0) ) m_restrictIndexdbForQuery=false; m_numTopicsToDisplay = 0; m_doQueryHighlighting = 0; m_spellCheck = 0; m_refs_numToGenerate = 0; m_refs_docsToScan = 0; // default scoring info to off m_getDocIdScoringInfo = false; } else if ( m_siteLen > 0 ) { m_restrictIndexdbForQuery = false; m_doSiteClustering = false; m_ipRestrictForTopics = false; } else if ( m_whiteListBuf.length() > 0 ) { m_ipRestrictForTopics = false; } m_doIpClustering = false; //m_sitesQueryLen = 0; // set the user ip, "uip" long uip = m_queryIP; char *uipStr = m_hr->getString ("uip" , NULL ); long tmpIp = 0; if ( uipStr ) tmpIp = atoip(uipStr); if ( tmpIp ) uip = tmpIp; // // // BEGIN MAIN PARM SETTING LOOP // // // loop through all possible cgi parms to set SearchInput for ( long i = 0 ; i < g_parms.m_numSearchParms ; i++ ) { Parm *m = g_parms.m_searchParms[i]; char *x = (char *)this + m->m_soff; // what is the parm's cgi name? char *cgi = m->m_scgi; if ( ! 
cgi ) cgi = m->m_cgi; // sanity check if ( ! m->m_sparm ) { log("query: Failed search input sanity check."); char *xx = NULL; *xx = 0; } // . break it down by type now // . get it from request and store it in SearchInput if ( m->m_type == TYPE_LONG ) { // default was set above long def = *(long *)x; // assume default long v = def; // but cgi parms override cookie v = r->getLong ( cgi , v ); // but if its a privledged parm and we're not an admin // then do not allow overrides, but m_priv of 3 means // to not display for clients, but to allow overrides if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def; // bounds checks if ( v < m->m_smin ) v = m->m_smin; if ( v > m->m_smax ) v = m->m_smax; if ( m->m_sminc >= 0 ) { long vmin = *(long *)((char *)cr+m->m_sminc); if ( v < vmin ) v = vmin; } if ( m->m_smaxc >= 0 ) { long vmax = *(long *)((char *)cr+m->m_smaxc); if ( v > vmax ) v = vmax; } // set it *(long *)x = v; // do not print start result num (m->m_sprop is 0 for // "s" now) //if ( cgi[0] == 's' && cgi[1] == '\0' ) continue; // should we propagate it? true by default //if ( ! m->m_sprop ) continue; // if it is the same as its default, and the default is // always from m_def and never from the CollectionRec, // then do not both storing it in here! what's the // point? if ( v == def && m->m_off < 0 ) continue; // if not default do not propagate if ( v == def ) continue; // . include for sure if explicitly provided // . vp will be NULL if "cgi" is not explicitly listed // as a cgi parm. otherwise, even if *vp == '\0', vp // is non-NULL. // . crap, it can be in the cookie now //char *vp = r->getValue(cgi, NULL, NULL); // if not given at all, do not propagate //if ( ! 
vp ) continue; // store in up if different from default, even if // same as default ("def") because default may be // changed by the admin since m->m_off >= 0 //if ( m->m_sprpg && up + gbstrlen(cgi) + 20 < upend ) // up += sprintf ( up , "%s=%li&", cgi , v ); //if ( m->m_sprpp && pp + gbstrlen(cgi) + 80 < ppend ) // pp += sprintf ( pp , "\n", // cgi , v ); } else if ( m->m_type == TYPE_FLOAT ) { // default was set above float def = *(float *)x; // get overriding from http request, if any float v; // but if its a privledged parm and we're not an admin // then do not allow overrides if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def; else v = r->getFloat( cgi , def ); // bounds checks if ( v < m->m_smin ) v = m->m_smin; if ( v > m->m_smax ) v = m->m_smax; if ( m->m_sminc >= 0 ) { float vmin = *(float *)((char *)cr+m->m_sminc); if ( v < vmin ) v = vmin; } if ( m->m_smaxc >= 0 ) { float vmax = *(float *)((char *)cr+m->m_smaxc); if ( v > vmax ) v = vmax; } // set it *(float *)x = v; // do not print start result num //if ( cgi[0] == 's' && cgi[1] == '\0' ) continue; // include for sure if explicitly provided char *vp = r->getValue(cgi, NULL, NULL); if ( ! vp ) continue; // unchanged from default? if ( v == def ) continue; // store in up different from default //if ((vp||v!= def) && up + gbstrlen(cgi)+20 < upend ) // up += sprintf ( up , "%s=%f&", cgi , v ); //if ((vp||v!= def) && pp + gbstrlen(cgi)+20 < ppend ) // pp += sprintf ( pp , "\n", // cgi , v ); } else if ( m->m_type == TYPE_BOOL ) { // default was set above long def = *(char *)x; if ( def != 0 ) def = 1; // normalize // assume default long v = def; // cgi parms override cookie v = r->getBool ( cgi , v ); // but if no perm, use default if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def; if ( v != 0 ) v = 1; // normalize *(char *)x = v; // don't propagate rcache //if ( ! strcmp(cgi,"rcache") ) continue; // should we propagate it? true by default //if ( ! 
m->m_sprop ) continue; // if it is the same as its default, and the default is // always from m_def and never from the CollectionRec, // then do not both storing it in here! what's the // point? if ( v == def && m->m_off < 0 ) continue; // if not default do not propagate if ( v == def ) continue; // . include for sure if explicitly provided // . vp will be NULL if "cgi" is not explicitly listed // as a cgi parm. otherwise, even if *vp == '\0', vp // is non-NULL. // . crap, it can be in the cookie now! //char *vp = r->getValue(cgi, NULL, NULL); // if not given at all, do not propagate //if ( ! vp ) continue; // store in up if different from default, even if // same as default ("def") because default may be // changed by the admin since m->m_off >= 0 //if ( m->m_sprpg && up + gbstrlen(cgi) + 10 < upend ) // up += sprintf ( up , "%s=%li&", cgi , v ); //if ( m->m_sprpp && pp + gbstrlen(cgi) + 80 < ppend ) // pp += sprintf ( pp , "\n", // cgi , v ); } else if ( m->m_type == TYPE_CHAR ) { // default was set above char def = *(char *)x; *(char *)x = r->getLong ( cgi, def ); // use this long v = *(char *)x; // store in up if different from default, even if // same as default ("def") because default may be // changed by the admin since m->m_off >= 0. nah, // let's try to reduce cgi parm pollution... if ( v == def ) continue; //if ( m->m_sprpg && up + gbstrlen(cgi) + 10 < upend ) // up += sprintf ( up , "%s=%li&", cgi , v ); //if ( m->m_sprpp && pp + gbstrlen(cgi) + 80 < ppend ) // pp += sprintf ( pp , "\n", // cgi , v ); } else if ( m->m_type == TYPE_STRING || m->m_type == TYPE_STRINGBOX ) { //if ( m->m_cgi && strcmp ( m->m_cgi, "qlang" ) == 0 ) // log("hey2"); char *def = *(char **)x; // get overriding from http request, if any long len = 0; char *v = NULL; // . cgi parms override cookie // . is this url encoded? v = r->getString ( cgi , &len , v ); // if not specified explicitly, default it and continue if ( ! v ) { // sanity if ( ! 
def ) def = ""; *(char **)x = def; // length preceeds char ptr in SearchInput *(long *)(x - 4) = gbstrlen(def); continue; } // if something was specified, override, it might // be length zero, too *(char **)x = v; // length preceeds char ptr in SearchInput *(long *)(x - 4) = len; // do not store if query, that needs to be last so // related topics can append to it //if ( cgi[0] == 'q' && cgi[1] == '\0' ) continue; // should we propagate it? true by default //if ( ! m->m_sprop ) continue; // if not given at all, do not propagate //if ( ! vp ) continue; // if it is the same as its default, and the default is // always from m_def and never from the CollectionRec, // then do not both storing it in here! what's the // point? //if ( v && v == def && !strcmp(def,v) && m->m_off < 0) // continue; // Need to set qcs based on page encoding... // not propagated if (!strncmp(cgi, "qcs", 3)) continue; // do not propagate defaults if ( v == def ) continue; // store in up if different from default, even if // same as default ("def") because default may be // changed by the admin since m->m_off >= 0 //if( m->m_sprpg && up+gbstrlen(cgi)+len+6 < upend ) { // up += sprintf ( up , "%s=", cgi ); // up += urlEncode ( up , upend-up-2 , v , len ); // *up++ = '&'; //} // propogate hidden inputs //if ( m->m_sprpp && up+gbstrlen(cgi)+len+80 < upend ) // pp += sprintf ( pp , "\n", // cgi , v ); } } // now add the special "qh" parm whose default value changes // depending on if we are widget related or not long qhDefault = 1; m_doQueryHighlighting = r->getLong("qh",qhDefault); // // TODO: use Parms.cpp defaults // TopicGroup *tg = &m_topicGroups[0]; // // // gigabits // // tg->m_numTopics = 50; tg->m_maxTopics = 50; tg->m_docsToScanForTopics = m_docsToScanForTopics; tg->m_minTopicScore = 0; tg->m_maxWordsPerTopic = 6; tg->m_meta[0] = '\0'; tg->m_delimeter = '\0'; tg->m_useIdfForTopics = false; tg->m_dedup = true; // need to be on at least 2 pages! 
tg->m_minDocCount = 2; tg->m_ipRestrict = true; tg->m_dedupSamplePercent = 80; tg->m_topicRemoveOverlaps = true; tg->m_topicSampleSize = 4096; // max sequential punct chars allowedin a topic tg->m_topicMaxPunctLen = 1; m_numTopicGroups = 1; // use "&dg=1" to debug gigabits m_debugGigabits = r->getLong("dg",0); // override m_format = format; // . omit scoring info from the xml feed for now // . we have to roll this out to gk144 net i think //if ( m_format != FORMAT_HTML ) // m_getDocIdScoringInfo = 0; // turn off by default! if ( ! r->getLong("gigabits",0) ) { m_numTopicGroups = 0; } ////////////////////////////////////// // // transform input into classes // ////////////////////////////////////// // USER_ADMIN, ... m_username = g_users.getUsername(r); // if collection is NULL default to one in g_conf if ( ! m_coll2 || ! m_coll2[0] ) { //m_coll = g_conf.m_defaultColl; m_coll2 = g_conf.getDefaultColl(r->getHost(), r->getHostLen()); m_collLen2 = gbstrlen(m_coll2); } // reset this m_gblang = 0; // use gblang then! long gglen; char *gg = r->getString ( "clang" , &gglen , NULL ); if ( gg && gglen > 1 ) m_gblang = getLanguageFromAbbr(gg); // allow for "qlang" if still don't have it //long gglen2; //char *gg2 = r->getString ( "qlang" , &gglen2 , NULL ); //if ( m_gblang == 0 && gg2 && gglen2 > 1 ) // m_gblang = getLanguageFromAbbr(gg2); // fix query by removing lang:xx from ask.com queries //char *end = m_query + m_queryLen -8; //if ( m_queryLen > 8 && m_query && end > m_query && // strncmp(end," lang:",6)==0 ) { // char *asklang = m_query+m_queryLen - 2; // m_gblang = getLanguageFromAbbr(asklang); // m_queryLen -= 8; // m_query[m_queryLen] = 0; // //} // . returns false and sets g_errno on error // . sets m_qbuf1 and m_qbuf2 if ( ! 
setQueryBuffers (r) ) return log("query: setQueryBuffers: %s",mstrerror(g_errno)); /* --- Virtual host language detection --- */ if(r->getHost()) { bool langset = getLanguageFromAbbr(m_defaultSortLanguage); char *cp; if(!langset && (cp = strrchr(r->getHost(), '.'))) { uint8_t lang = getLanguageFromUserAgent(++cp); if(lang) { // char langbuf[128]; // sprintf(langbuf, "qlang=%s\0", getLanguageAbbr(lang)); //m_defaultSortLanguage = getLanguageAbbr(lang); char *tmp = getLanguageAbbr(lang); strncpy(m_defaultSortLanguage, tmp, 6); // log(LOG_INFO, // getLanguageString(lang), r->getHost(), this); } } } /* --- End Virtual host language detection --- */ char *qs1 = m_defaultSortLanguage; // this overrides though //long qlen2; //char *qs2 = r->getString ("qlang",&qlen2,NULL); //if ( qs2 ) qs1 = qs2; m_queryLang = getLanguageFromAbbr ( qs1 ); if ( qs1 && qs1[0] && ! m_queryLang ) log("query: qlang of \"%s\" is NOT SUPPORTED",qs1); // . the query to use for highlighting... can be overriden with "hq" // . we need the language id for doing synonyms if ( m_highlightQuery && m_highlightQuery[0] ) m_hqq.set2 ( m_highlightQuery , m_queryLang , true ); else if ( m_query && m_query[0] ) m_hqq.set2 ( m_query , m_queryLang , true ); // log it here log("query: got query %s",m_sbuf1.getBufStart()); // . now set from m_qbuf1, the advanced/composite query buffer // . returns false and sets g_errno on error (ETOOMANYOPERANDS) if ( ! m_q->set2 ( m_sbuf1.getBufStart(), m_queryLang , m_queryExpansion ) ) { g_msg = " (error: query has too many operands)"; return false; } if ( m_q->m_truncated && m_q->m_isBoolean ) { g_errno = ETOOMANYOPERANDS; g_msg = " (error: query has too many operands)"; return false; } // do not allow querier to use the links: query operator unless they // are admin or the search controls explicitly allow links: //if ( m_q->m_hasLinksOperator && ! 
m_isAdmin && // !cr->m_allowLinksSearch ) { // g_errno = ENOPERM; // g_msg = " (error: permission denied)"; // return false; //} // miscellaneous m_showBanned = false; //if ( m_isAdmin ) m_showBanned = true; // admin can say &sb=0 explicitly to not show banned results // . if you are searching a diffbot collection, you are the admin // i guess... if ( m_isAdmin || cr->m_isCustomCrawl ) m_showBanned = r->getLong("sb",m_showBanned); if ( m_q->m_hasUrlField ) m_ipRestrictForTopics = false; if ( m_q->m_hasIpField ) { m_ipRestrictForTopics = false; //if( m_isAdmin ) m_showBanned = true; } if ( m_q->m_hasPositiveSiteField ) { m_ipRestrictForTopics = false; m_doSiteClustering = false; } if ( m_q->m_hasQuotaField ) { m_doSiteClustering = false; m_doDupContentRemoval = false; } m_familyFilter = r->getLong("ff",0); long codeLen; char *code = r->getString ("code",&codeLen,NULL); // set m_endUser if ( ! codeLen || ! code || strcmp(code,"gbfront")==0 ) m_endUser = true; else m_endUser = false; if(codeLen && !m_endUser) { m_maxResults = cr->m_maxSearchResultsForClients; } else { m_maxResults = cr->m_maxSearchResults; } // don't let admin bewilder himself if ( m_maxResults < 1 ) m_maxResults = 500; // we can't get this kind of constraint from generic Parms routines if ( m_firstResultNum + m_docsWanted > m_maxResults ) m_firstResultNum = m_maxResults - m_docsWanted; if(m_firstResultNum < 0) m_firstResultNum = 0; // if useCache is -1 then pick a default value if ( m_useCache == -1 ) { // assume yes as default m_useCache = 1; // . if query has url: or site: term do NOT use cache by def. // . 
however, if spider is off then use the cache by default if ( g_conf.m_spideringEnabled ) { if ( m_q->m_hasPositiveSiteField ) m_useCache = 0; else if ( m_q->m_hasIpField ) m_useCache = 0; else if ( m_q->m_hasUrlField ) m_useCache = 0; else if ( m_siteLen > 0 ) m_useCache = 0; else if ( m_whiteListBuf.length() ) m_useCache = 0; else if ( m_urlLen > 0 ) m_useCache = 0; } } // never use cache if doing a rerank (msg3b) //if ( m_rerankRuleset >= 0 ) m_useCache = 0; bool readFromCache = false; if ( m_useCache == 1 ) readFromCache = true; if ( m_rcache == 0 ) readFromCache = false; if ( m_useCache == 0 ) readFromCache = false; // if useCache is false, don't write to cache if it was not specified if ( m_wcache == -1 ) { if ( m_useCache == 0 ) m_wcache = 0; else m_wcache = 1; } // save it m_rcache = readFromCache; /* m_language = 0; // convert m_languageCode to a number for m_language if ( m_languageCode ) { m_language = (unsigned char)atoi(m_languageCode); if ( m_language == 0 ) m_language = getLanguageFromAbbr(m_languageCode); } */ // a hack for buzz for backwards compatibility //if ( strstr ( m_q->m_orig,"gbkeyword:r36p1" ) ) // m_ruleset = 36; // // . turn this off for now // . it is used in setClusterLevels() to use clusterdb to filter our // search results via Msg39, so it is not the most efficient. // . plus i am deleting most foreign language pages from the index // so we can just focus on english and that will give us more english // pages that we could normally get. we don't have resources to // de-spam the other languages, etc. // . turn it back on, i took out the setClusterLevels() use of that // because we got the langid in the posdb keys now // //m_language = 0; // convert m_defaultSortCountry to a number for m_countryHint m_countryHint = g_countryCode.getIndexOfAbbr(m_defaultSortCountry); return true; } // . sets m_qbuf1[] and m_qbuf2[] // . m_qbuf1[] is the advanced query // . m_qbuf2[] is the query to be used for spell checking // . 
returns false and set g_errno on error bool SearchInput::setQueryBuffers ( HttpRequest *hr ) { m_sbuf1.reset(); m_sbuf2.reset(); m_sbuf3.reset(); short qcs = csUTF8; if (m_queryCharset && m_queryCharsetLen){ // we need to convert the query string to utf-8 qcs = get_iana_charset(m_queryCharset, m_queryCharsetLen); if (qcs == csUnknown) { //g_errno = EBADCHARSET; //g_msg = "(error: unknown query charset)"; //return false; qcs = csUTF8; } } // prepend sites terms long numSites = 0; char *csStr = NULL; numSites = 0; csStr = get_charset_str(qcs); /* if ( m_sites && m_sites[0] ) { char *s = m_sites; char *t; long len; m_sbuf1.pushChar('(');// *p++ = '('; loop: // skip white space while ( *s && ! is_alnum_a(*s) ) s++; // bail if done if ( ! *s ) goto done; // get length of it t = s; while ( *t && ! is_wspace_a(*t) ) t++; len = t - s; // add site: term //if ( p + 12 + len >= pend ) goto toobig; if ( numSites > 0 ) m_sbuf1.safeStrcpy ( " UOR " ); m_sbuf1.safeStrcpy ( "site:" ); //p += ucToUtf8(p, pend-p,s, len, csStr, 0,0); m_sbuf1.safeMemcpy ( s , len ); //memcpy ( p , s , len ); p += len; // *p++ = ' '; m_sbuf1.pushChar(' '); s = t; numSites++; goto loop; done: m_sbuf1.safePrintf(") | "); // inc totalLen m_sitesQueryLen = m_sitesLen + (numSites * 10); } */ // prepend char *qp = hr->getString("prepend",NULL,NULL); if( qp && qp[0] ) { //if( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //p += sprintf( p, "+gblang:%li |", m_gblang ); m_sbuf1.safePrintf( "%s", qp ); } // append site: term if ( m_siteLen > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //memcpy ( p , "+site:" , 6 ); p += 6; m_sbuf1.safePrintf("+site:"); //memcpy ( p , m_site , m_siteLen ); p += m_siteLen; m_sbuf1.safeMemcpy(m_site,m_siteLen); } if ( m_familyFilter ) { if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); m_sbuf1.safePrintf("gbisadult:0 | "); } // append gblang: term if( m_gblang > 0 ) { //if( p > pstart ) *p++ = ' '; if ( 
m_sbuf1.length() ) m_sbuf1.pushChar(' '); //p += sprintf( p, "+gblang:%li |", m_gblang ); m_sbuf1.safePrintf( "+gblang:%li |", m_gblang ); } // bookmark here so we can copy into st->m_displayQuery below //long displayQueryOffset = m_sbuf1.length(); // append url: term if ( m_urlLen > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //memcpy ( p , "+url:" , 5 ); p += 5; m_sbuf1.safeStrcpy ( "+url:"); //memcpy ( p , m_url , m_urlLen ); p += m_urlLen; m_sbuf1.safeMemcpy ( m_url , m_urlLen ); } // append url: term if ( m_linkLen > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //memcpy ( p , "+link:" , 6 ); p += 6; m_sbuf1.safeStrcpy ( "+link:"); //memcpy ( p , m_link , m_linkLen ); p += m_linkLen; m_sbuf1.safeMemcpy ( m_link , m_linkLen ); } // append the natural query if ( m_queryLen > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //p += ucToUtf8(p, pend-p, m_query, m_queryLen, csStr, 0,0); m_sbuf1.safeMemcpy ( m_query , m_queryLen ); //memcpy ( p , m_query , m_queryLen ); p += m_queryLen; // add to spell checked buf, too //if ( p2 > pstart2 ) *p2++ = ' '; if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); //p2 +=ucToUtf8(p2, pend2-p2, m_query, m_queryLen, csStr, 0,0); m_sbuf2.safeMemcpy ( m_query , m_queryLen ); //memcpy ( p2 , m_query , m_queryLen ); p2 += m_queryLen; } if ( m_query2Len > 0 ) { //if ( p3 > pstart3 ) *p3++ = ' '; if ( m_sbuf3.length() ) m_sbuf3.pushChar(' '); //p3+=ucToUtf8(p3, pend3-p3, m_query2, m_query2Len, csStr,0,0); m_sbuf3.safeMemcpy ( m_query2 , m_query2Len ); } //if (g_errno == EILSEQ){ // illegal character seq // log("query: bad char set"); // g_errno = 0; // if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;} // if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;} //} // append quoted phrases to query if ( m_quoteLen1 > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //*p++ = '+'; //*p++ = '\"'; 
m_sbuf1.safeStrcpy("+\""); //p += ucToUtf8(p, pend-p, m_quote1, m_quoteLen1, csStr, 0,0); m_sbuf1.safeMemcpy ( m_quote1 , m_quoteLen1 ); //memcpy ( p , m_quote1 , m_quoteLen1 ); p += m_quoteLen1 ; //*p++ = '\"'; m_sbuf1.safeStrcpy("\""); // add to spell checked buf, too //if ( p2 > pstart2 ) *p2++ = ' '; if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); //*p2++ = '+'; //*p2++ = '\"'; m_sbuf2.safeStrcpy("+\""); //p2+=ucToUtf8(p2, pend2-p2, m_quote1, m_quoteLen1, csStr,0,0); m_sbuf2.safeMemcpy ( m_quote1 , m_quoteLen1 ); //memcpy ( p2 , m_quote1 , m_quoteLen1 ); p2 += m_quoteLen1 ; //*p2++ = '\"'; m_sbuf2.safeStrcpy("\""); } //if (g_errno == EILSEQ){ // illegal character seq // g_errno = 0; // if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;} // if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;} //} if ( m_quoteLen2 > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //*p++ = '+'; //*p++ = '\"'; m_sbuf1.safeStrcpy("+\""); //p += ucToUtf8(p, pend-p, m_quote2, m_quoteLen2, csStr, 0,0); m_sbuf1.safeMemcpy ( m_quote2 , m_quoteLen2 ); //memcpy ( p , m_quote2 , m_quoteLen2 ); p += m_quoteLen2 ; //*p++ = '\"'; m_sbuf1.safeStrcpy("\""); // add to spell checked buf, too //if ( p2 > pstart2 ) *p2++ = ' '; if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); //*p2++ = '+'; //*p2++ = '\"'; m_sbuf2.safeStrcpy("+\""); //p2+=ucToUtf8(p2, pend2-p2, m_quote2, m_quoteLen2, csStr,0,0); m_sbuf2.safeMemcpy ( m_quote2 , m_quoteLen2 ); //memcpy ( p2 , m_quote2 , m_quoteLen2 ); p2 += m_quoteLen2 ; //*p2++ = '\"'; m_sbuf2.safeStrcpy("\""); } //if (g_errno == EILSEQ){ // illegal character seq // g_errno = 0; // if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;} // if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;} //} // append plus terms if ( m_plusLen > 0 ) { char *s = m_plus; char *send = m_plus + m_plusLen; //if ( p > pstart && p < pend ) *p++ = ' '; //if ( p2 > pstart2 && p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( 
m_sbuf2.length() ) m_sbuf2.pushChar(' '); while (s < send) { while (isspace(*s) && s < send) s++; char *s2 = s+1; if (*s == '\"') { // if there's no closing quote just treat // the end of the line as such while (*s2 != '\"' && s2 < send) s2++; if (s2 < send) s2++; } else { while (!isspace(*s2) && s2 < send) s2++; } if (s2 < send) break; //if (p < pend) *p++ = '+'; //if (p2 < pend2) *p2++ = '+'; m_sbuf1.pushChar('+'); m_sbuf2.pushChar('+'); //p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0); //p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0); m_sbuf1.safeMemcpy ( s , s2 - s ); m_sbuf2.safeMemcpy ( s , s2 - s ); /* if (g_errno == EILSEQ) { // illegal character seq g_errno = 0; if (qcs == csUTF8) { qcs = csISOLatin1; goto doOver; } if (qcs != csISOLatin1) { qcs = csUTF8; goto doOver; } } */ s = s2 + 1; if (s < send) { //if (p < pend) *p++ = ' '; //if (p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); } } } // append minus terms if ( m_minusLen > 0 ) { char *s = m_minus; char *send = m_minus + m_minusLen; //if ( p > pstart && p < pend ) *p++ = ' '; //if ( p2 > pstart2 && p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); while (s < send) { while (isspace(*s) && s < send) s++; char *s2 = s+1; if (*s == '\"') { // if there's no closing quote just treat // the end of the line as such while (*s2 != '\"' && s2 < send) s2++; if (s2 < send) s2++; } else { while (!isspace(*s2) && s2 < send) s2++; } if (s2 < send) break; //if (p < pend) *p++ = '-'; //if (p2 < pend2) *p2++ = '-'; m_sbuf1.pushChar('-'); m_sbuf2.pushChar('-'); //p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0); //p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0); m_sbuf1.safeMemcpy ( s , s2 - s ); m_sbuf2.safeMemcpy ( s , s2 - s ); /* if (g_errno == EILSEQ) { // illegal character seq g_errno = 0; if (qcs == csUTF8) { qcs = csISOLatin1; goto doOver; } if (qcs != csISOLatin1) { qcs = 
csUTF8; goto doOver; } } */ s = s2 + 1; if (s < send) { //if (p < pend) *p++ = ' '; //if (p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); } } } // append gbkeyword:numinlinks if they have &mininlinks=X, X>0 long minInlinks = m_hr->getLong("mininlinks",0); if ( minInlinks > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //char *str = "gbkeyword:numinlinks"; //long len = gbstrlen(str); //memcpy ( p , str , len ); //p += len; m_sbuf1.safePrintf ( "gbkeyword:numinlinks"); } // null terms if ( ! m_sbuf1.pushChar('\0') ) return false; if ( ! m_sbuf2.pushChar('\0') ) return false; if ( ! m_sbuf3.pushChar('\0') ) return false; // the natural query m_displayQuery = m_sbuf2.getBufStart();// + displayQueryOffset; if ( ! m_displayQuery ) m_displayQuery = ""; while ( *m_displayQuery == ' ' ) m_displayQuery++; m_displayQueryLen = gbstrlen(m_displayQuery);//p-m_displayQuery //log("query: got query %s",m_sbuf1.getBufStart()); //log("query: got display query %s",m_displayQuery); // urlencoded display query urlEncode(m_qe, MAX_QUERY_LEN*2, m_displayQuery, m_displayQueryLen); ////////// // // show DMOZ BREADCRUMB if doing a // "gbpcatid: |" (Search restricted to category) // "gbcatid:" (DMOZ urls in that topic, c=dmoz3) // ////////// long pcatId = -1; long dcatId = -1; // get the final query char *q =m_sbuf1.getBufStart(); if ( q ) sscanf(q,"gbpcatid:%li",&pcatId); if ( q ) sscanf(q,"gbcatid:%li",&dcatId); // pick the one that is valid long catId = -1; if ( pcatId >= 0 ) catId = pcatId; if ( dcatId >= 0 ) catId = dcatId; ////// // // save catid into the state m_catId = catId; // /////// // are we a right to left language like hebrew? if ( catId > 0 && g_categories->isIdRTL(catId) ) m_isRTL = true; else m_isRTL = false; return true; } uint8_t SearchInput::detectQueryLanguage(void) { uint8_t lang = 0; // Check to see if default language is set. 
// This should override everything else. if(m_defaultSortLanguage) lang = getLanguageFromAbbr(m_defaultSortLanguage); // Set query language from User Agent string, if possible if(!lang && m_hr->getUserAgent()) lang = g_langId.guessLanguageFromUserAgent(m_hr->getUserAgent()); // guess from query terms if(!lang && m_q) lang = g_langId.guessLanguageFromQuery(m_q); // guess from IP addr of the requester if(!lang && m_queryIP) lang = g_langId.guessLanguageFromIP(m_queryIP); // Save for later m_langHint = lang; if(m_gbcountry && m_gbcountryLen > 0) m_country = g_countryCode.getIndexOfAbbr(m_gbcountry); if(!m_country) { // Now guess country of the query. char *codep = g_langId.findGeoIP(m_queryIP, geoIPNumRows - 1, 0); if(codep) m_country = g_countryCode.getIndexOfAbbr(codep); // Many doofuses just download firefox and don't set it // up properly, so this takes second place to the IP search. if(!m_country) m_country = g_langId.guessCountryFromUserAgent(m_hr->getUserAgent()); } return(lang); } char getFormatFromRequest ( HttpRequest *r ) { char format = FORMAT_HTML; // what format should search results be in? default is html char *formatStr = r->getString("format", NULL ); if ( formatStr && strcmp(formatStr,"html") == 0 ) format = FORMAT_HTML; if ( formatStr && strcmp(formatStr,"json") == 0 ) format = FORMAT_JSON; if ( formatStr && strcmp(formatStr,"xml") == 0 ) format = FORMAT_XML; if ( formatStr && strcmp(formatStr,"csv") == 0 ) format = FORMAT_CSV; // support old api &xml=1 to mean &format=1 if ( r->getLong("xml",0) ) { format = FORMAT_XML; } // also support &json=1 if ( r->getLong("json",0) ) { format = FORMAT_JSON; } if ( r->getLong("csv",0) ) { format = FORMAT_CSV; } return format; }