#include "gb-include.h"

#include "SearchInput.h"
#include "Parms.h"         // g_parms
//#include "CollectionRec.h" // cr
#include "Pages.h"         // g_msg
#include "LanguageIdentifier.h"
#include "CountryCode.h"
#include "geo_ip_table.h"
#include "Users.h"
#include "Address.h" // getLatLonFromUserInput
#include "Timedb.h"
#include "PageResults.h"

//char getFormatFromRequest ( class HttpRequest *hr ) ;

// . construction and destruction both just defer to reset()
SearchInput::SearchInput() {
	reset();
}

SearchInput::~SearchInput() {
	reset();
}

// . currently a no-op; called by the ctor, the dtor and clear()
void SearchInput::reset ( ) {
}

// . wipe this object and then install the built-in defaults
// . "niceness" is stored into m_niceness after the wipe
//void SearchInput::setToDefaults ( CollectionRec *cr , int32_t niceness ) {
void SearchInput::clear ( int32_t niceness ) {
	reset();

	// . number of bytes between the m_START and m_END_TEST markers
	// . NOTE(review): the memset below starts at "this", not at
	//   &m_START, so it only covers exactly the marked region if
	//   m_START is the first member of the class -- confirm against
	//   SearchInput.h
	char *regionStart = (char *)&m_START;
	char *regionEnd   = (char *)&m_END_TEST;
	int32_t numBytes  = regionEnd - regionStart;
	memset ( this , 0x00 , numBytes );

	// the query buffers are reset after the raw zeroing
	m_sbuf1.reset();
	m_sbuf2.reset();
	m_sbuf3.reset();

	// hard-coded defaults
	m_niceness          = niceness;
	m_maxQueryTerms     = 1000;
	m_boolFlag          = 2;
	m_docsWanted        = 10;
	m_numLinesInSummary = 2;
	//m_defaultSortLanguageLen = 0;
}

// . make a key for caching the search results page based on this input
do not use all vars, like the m_*ToDisplay should not be included key_t SearchInput::makeKey ( ) { // hash the query int32_t n = m_q.getNumTerms (); int64_t *termIds = m_q.getTermIds (); char *signs = m_q.getTermSigns (); key_t k; k.n1 = 0; k.n0 = hash64 ( (char *)termIds , n * sizeof(int64_t) ); k.n0 = hash64 ( (char *)signs , n , k.n0 ); // user defined weights, for weighting each query term separately for ( int32_t i = 0 ; i < n ; i++ ) { k.n0 = hash64 ((char *)&m_q.m_qterms[i].m_userWeight,4, k.n0); k.n0 = hash64 ((char *)&m_q.m_qterms[i].m_userType ,1, k.n0); } // space separated, NULL terminated, list of meta tag names to display if ( m_displayMetas ) k.n0 = hash64b ( m_displayMetas , k.n0 ); // name of collection in external cluster to get titleRecs for // related pages from //if ( m_rp_getExternalPages && m_rp_externalColl ) // k.n0 = hash64b ( m_rp_externalColl , k.n0 ); // collection e import from //if ( m_importColl ) // k.n0 = hash64b ( m_importColl , k.n0 ); // the special query parm //if ( m_sq && m_sqLen > 0 ) // k.n0 = hash64 ( m_sq , m_sqLen , k.n0 ); //if ( m_noDocIds && m_noDocIdsLen ) // k.n0 = hash64 ( m_noDocIds , m_noDocIdsLen , k.n0 ); //if ( m_noSiteIds && m_noSiteIdsLen ) // k.n0 = hash64 ( m_noSiteIds , m_noSiteIdsLen , k.n0 ); // no need to hash these again separately, they are in between // m_START and m_END_HASH // language //if ( m_language ) // k.n0 = hash64 ( m_language , k.n0 ); //if ( m_gblang ) // k.n0 = hash64 ( m_gblang , k.n0 ); // . now include the hash of the search parameters // . 
nnot incuding m_docsToScanForTopics since since we got TopicGroups char *a = ((char *)&m_START) + 4 ; // msg40->m_dpf; char *b = (char *)&m_END_HASH ; // msg40->m_topicGroups; int32_t size = b - a; // push and flush some parms that should not contribute //int32_t save1 = m_refs_numToDisplay; //int32_t save2 = m_rp_numToDisplay; //int32_t save3 = m_numTopicsToDisplay; //m_refs_numToDisplay = 0; //m_rp_numToDisplay = 0; //m_numTopicsToDisplay = 0; // and hash it all up k.n0 = hash64 ( a , size , k.n0 ); // and pop out the parms that did not contribute //m_refs_numToDisplay = save1; //m_rp_numToDisplay = save2; //m_numTopicsToDisplay = save3; // hash each topic group for ( int32_t i = 0 ; i < m_numTopicGroups ; i++ ) { TopicGroup *t = &m_topicGroups[i]; //k.n0 = hash64 ( t->m_numTopics , k.n0 ); k.n0 = hash64 ( t->m_maxTopics , k.n0 ); k.n0 = hash64 ( t->m_docsToScanForTopics , k.n0 ); k.n0 = hash64 ( t->m_minTopicScore , k.n0 ); k.n0 = hash64 ( t->m_maxWordsPerTopic , k.n0 ); k.n0 = hash64b( t->m_meta , k.n0 ); k.n0 = hash64 ( t->m_delimeter , k.n0 ); k.n0 = hash64 ( t->m_useIdfForTopics , k.n0 ); k.n0 = hash64 ( t->m_dedup , k.n0 ); } // . boolean queries have operators (AND OR NOT ( ) ) that we need // to consider in this hash as well. so // . 
so just hash the whole damn query if ( m_q.m_isBoolean ) { char *q = m_q.getQuery(); int32_t qlen = m_q.getQueryLen(); k.n0 = hash64 ( q , qlen , k.n0 ); } // Language stuff //k.n0 = hash64(m_defaultSortLanguage, m_defaultSortLanguageLen, k.n0); //k.n0 = hash64(m_defaultSortCountry , m_defaultSortCountryLen , k.n0); // debug //logf(LOG_DEBUG,"query: q=%s k.n0=%"UINT64"",m_q.getQuery(),k.n0); //Msg1aParms* m1p = msg40->getReferenceParms(); //if( m1p ) { // k.n0=hash64(((char*)m1p)+sizeof(int32_t), // sizeof(Msg1aParms)-8,k.n0); //} return k; } void SearchInput::test ( ) { // set all to 0 just to avoid any inconsistencies char *a = ((char *)&m_START) + 4 ; // msg40->m_dpf; char *b = (char *)&m_END_TEST; int32_t size = b - a; memset ( a , 0x00 , size ); // loop through all possible cgi parms to set SearchInput for ( int32_t i = 0 ; i < g_parms.m_numSearchParms ; i++ ) { Parm *m = g_parms.m_searchParms[i]; char *x = (char *)this + m->m_off; if ( m->m_type != TYPE_BOOL ) *(int32_t *)x = 0xffffffff; else *(char *)x = 0xff; } // ensure we're all zeros now! int32_t fix = a - (char *)this; unsigned char *p = (unsigned char *)a; for ( int32_t i = 0 ; i < size ; i++ ) { if ( p[i] == 0xff ) continue; // find it int32_t off = i + fix; char *name = NULL; // "unknown"; for ( int32_t k = 0 ; k < g_parms.m_numSearchParms ; k++ ) { Parm *m = g_parms.m_searchParms[k]; if ( m->m_off != off ) continue; name = m->m_title; break; } if ( ! name ) continue; log("query: Got uncovered SearchInput parm at offset " "%"INT32" in SearchInput. name=%s.",off,name); } } void SearchInput::copy ( class SearchInput *si ) { gbmemcpy ( (char *)this , (char *)si , sizeof(SearchInput) ); } class SearchInput *g_si = NULL; bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) { // store list of collection #'s to search here. usually just one. m_collnumBuf.reset(); // zero out everything, set niceness to 0 clear ( 0 ) ; // save it now m_sock = sock; // still his buffer. 
m_hr will free the stuff, but "r" can // still access it for the time being, and not free it m_hr.stealBuf ( r ); char *coll = g_collectiondb.getDefaultColl ( r ); ////// // // build "m_collnumBuf" to consist of all the collnums we should // be searching. // /////// m_firstCollnum = -1; // set this to the collrec of the first valid collnum we encounter CollectionRec *cr = NULL; // now convert list of space-separated coll names into list of collnums char *p = r->getString("c",NULL); // if no collection list was specified look for "token=" and // use those to make collections. hack for diffbot. char *token = r->getString("token",NULL); // find all collections under this token for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { // must not have a "&c=" if ( p ) break; // must have a "&token=" if ( ! token ) break; // skip if empty CollectionRec *tmpcr = g_collectiondb.m_recs[i]; if ( ! tmpcr ) continue; // skip if does not match token if ( strcmp(token,tmpcr->m_diffbotToken.getBufStart()) ) continue; // . we got a match // . set initial junk if ( ! cr ) { cr = tmpcr; m_firstCollnum = tmpcr->m_collnum; } // save the collection # if ( ! m_collnumBuf.safeMemcpy ( &tmpcr->m_collnum, sizeof(collnum_t) ) ) return false; } // if we had a "&c=..." in the GET request process that if ( p ) { loop: char *end = p; for ( ; *end && ! is_wspace_a(*end) ; end++ ); // temp null char c = *end; *end = '\0'; CollectionRec *tmpcr = g_collectiondb.getRec ( p ); // set defaults from the FIRST one if ( tmpcr && ! cr ) { cr = tmpcr; m_firstCollnum = tmpcr->m_collnum; } if ( ! 
tmpcr ) { g_errno = ENOCOLLREC; log("query: missing collection %s",p); g_msg = " (error: no such collection)"; return false; } // add to our list if (!m_collnumBuf.safeMemcpy(&tmpcr->m_collnum, sizeof(collnum_t))) return false; // restore the \0 character we wrote in there *end = c; // advance p = end; // skip to next collection name if there is one while ( *p && is_wspace_a(*p) ) p++; // now add it's collection # to m_collnumBuf if there if ( *p ) goto loop; } // use default collection if none provided if ( ! p && ! token && m_collnumBuf.length() <= 0 ) { // get default collection rec cr = g_collectiondb.getRec (coll); // add to our list if ( cr && !m_collnumBuf.safeMemcpy(&cr->m_collnum, sizeof(collnum_t))) return false; } ///// // // END BUILDING m_collnumBuf // ///// // save the collrec m_cr = cr; // must have had one if ( ! cr ) { log("si: si. collection does not exist"); // if we comment the below out then it cores in setToDefault! g_errno = ENOCOLLREC; return false; } // and set from the http request. will set m_coll, etc. g_parms.setToDefault ( (char *)this , OBJ_SI , cr ); /////// // // set defaults of some things based on format language // ////// // get the format. "xml" "html" "json" --> FORMAT_HTML, FORMAT_CSV ... char tmpFormat = m_hr.getReplyFormat();//getFormatFromRequest ( &m_hr); // now override automatic defaults for special cases if ( tmpFormat != FORMAT_HTML ) { m_familyFilter = 0; m_numTopicsToDisplay = 0; m_doQueryHighlighting = 0; //m_spellCheck = 0; m_getDocIdScoringInfo = false; // turn gigabits off by default if not html //m_docsToScanForTopics = 0; } // if they have a list of sites... if ( m_sites && m_sites[0] ) { m_doSiteClustering = false; m_ipRestrictForTopics = false; } // and set from the http request. will set m_coll, etc. g_parms.setFromRequest ( &m_hr , sock , cr , (char *)this , OBJ_SI ); if ( m_streamResults && tmpFormat != FORMAT_XML && tmpFormat != FORMAT_JSON ) { log("si: streamResults only supported for " "json/html. 
disabling"); m_streamResults = false; } m_coll = coll; // it sets m_formatStr above, but we gotta set this... m_format = tmpFormat; ////// // // fix some parms // ////// // set m_isMasterAdmin to zero if no correct ip or password if ( ! g_conf.isMasterAdmin ( sock , &m_hr ) ) m_isMasterAdmin = 0; // collection admin? m_isCollAdmin = g_conf.isCollAdmin ( sock , &m_hr ); ////////////////////////////////////// // // transform input into classes // ////////////////////////////////////// // allow for "qlang" if still don't have it //int32_t gglen2; //char *gg2 = r->getString ( "qlang" , &gglen2 , NULL ); //if ( m_gblang == 0 && gg2 && gglen2 > 1 ) // m_gblang = getLanguageFromAbbr(gg2); // fix query by removing lang:xx from ask.com queries //char *end = m_query + m_queryLen -8; //if ( m_queryLen > 8 && m_query && end > m_query && // strncmp(end," lang:",6)==0 ) { // char *asklang = m_query+m_queryLen - 2; // m_gblang = getLanguageFromAbbr(asklang); // m_queryLen -= 8; // m_query[m_queryLen] = 0; // //} // . returns false and sets g_errno on error // . sets m_qbuf1 and m_qbuf2 // . sets: // m_sbuf1 // m_sbuf2 // m_sbuf3 // m_displayQuery // m_qe (encoded query) // m_rtl (right to left like hebrew) // m_highlightQuery if ( ! 
setQueryBuffers (r) ) return log("query: setQueryBuffers: %s",mstrerror(g_errno)); /* --- Virtual host language detection --- */ /* if(r->getHost()) { bool langset = getLanguageFromAbbr(m_defaultSortLanguage); char *cp; if(!langset && (cp = strrchr(r->getHost(), '.'))) { uint8_t lang = getLanguageFromUserAgent(++cp); if(lang) { // char langbuf[128]; // sprintf(langbuf, "qlang=%s\0", getLanguageAbbr(lang)); //m_defaultSortLanguage = getLanguageAbbr(lang); char *tmp = getLanguageAbbr(lang); strncpy(m_defaultSortLanguage, tmp, 6); // log(LOG_INFO, // getLanguageString(lang), r->getHost(), this); } } } */ /* --- End Virtual host language detection --- */ //char *qs1 = m_defaultSortLanguage; // this overrides though //int32_t qlen2; //char *qs2 = r->getString ("qlang",&qlen2,NULL); //if ( qs2 ) qs1 = qs2; //m_queryLang = getLanguageFromAbbr ( qs1 ); //m_queryLang = detectQueryLanguage(); //char *qs1 = getLangAbbr(m_queryLang); // this parm is in Parms.cpp and should be set char *langAbbr = m_defaultSortLang; // Parms.cpp sets it to an empty string, so make that null // if Parms.cpp set it to NULL it seems it comes out as "(null)" // i guess because we sprintf it or something. if ( langAbbr && langAbbr[0] == '\0' ) langAbbr = NULL; // if &qlang was not given explicitly fall back to coll rec if ( cr && ! langAbbr ) langAbbr = cr->m_defaultSortLanguage2; // if no coll rec use language unknown if ( ! langAbbr ) langAbbr = "xx"; log(LOG_INFO,"query: using default lang of %s", langAbbr ); // get code m_queryLangId = getLangIdFromAbbr ( langAbbr ); // allow for 'xx', which means langUnknown if ( m_queryLangId == langUnknown && langAbbr && langAbbr[0] && langAbbr[0]!='x' ) log("query: qlang of \"%s\" is NOT SUPPORTED. using " "langUnknown, \"xx\".",langAbbr); // . the query to use for highlighting... can be overriden with "hq" // . 
we need the language id for doing synonyms if ( m_prepend && m_prepend[0] ) m_hqq.set2 ( m_prepend , m_queryLangId , true ); else if ( m_highlightQuery && m_highlightQuery[0] ) m_hqq.set2 ( m_highlightQuery , m_queryLangId , true ); else if ( m_query && m_query[0] ) m_hqq.set2 ( m_query , m_queryLangId , true ); // log it here log(LOG_INFO, "query: got query %s (len=%i)" ,m_sbuf1.getBufStart() ,m_sbuf1.length()); // . now set from m_qbuf1, the advanced/composite query buffer // . returns false and sets g_errno on error (ETOOMANYOPERANDS) if ( ! m_q.set2 ( m_sbuf1.getBufStart(), m_queryLangId , m_queryExpansion ) ) { g_msg = " (error: query has too many operands)"; return false; } m_q.m_containingParent = (void *)this; if ( m_q.m_truncated && m_q.m_isBoolean ) { g_errno = EQUERYTOOBIG; g_msg = " (error: query is too long)"; return false; } if ( m_hideAllClustered ) m_doSiteClustering = true; // turn off some parms if ( m_q.m_hasUrlField ) m_ipRestrictForTopics = false; if ( m_q.m_hasIpField ) m_ipRestrictForTopics = false; if ( m_q.m_hasPositiveSiteField ) { m_ipRestrictForTopics = false; m_doSiteClustering = false; } if ( cr && ! cr->m_ipRestrict ) m_ipRestrictForTopics = false; if ( m_q.m_hasQuotaField ) { m_doSiteClustering = false; m_doDupContentRemoval = false; } if ( ! m_doSiteClustering ) m_hideAllClustered = false; // sanity check if(m_firstResultNum < 0) m_firstResultNum = 0; // DEBUG: temp hack // static bool first = true; // if ( first ) { // first = false; // m_firstResultNum = 10; // } // . if query has url: or site: term do NOT use cache by def. // . 
however, if spider is off then use the cache by default if ( m_useCache == -1 && g_conf.m_spideringEnabled ) { if ( m_q.m_hasPositiveSiteField ) m_useCache = 0; else if ( m_q.m_hasIpField ) m_useCache = 0; else if ( m_q.m_hasUrlField ) m_useCache = 0; else if ( m_sites && m_sites[0] ) m_useCache = 0; //else if ( m_whiteListBuf.length() ) m_useCache = 0; else if ( m_url && m_url[0] ) m_useCache = 0; } // if useCache is still -1 then turn it on if ( m_useCache == -1 ) m_useCache = 1; // never use cache if doing a rerank (msg3b) //if ( m_rerankRuleset >= 0 ) m_useCache = 0; bool readFromCache = false; if ( m_useCache == 1 ) readFromCache = true; if ( m_rcache == 0 ) readFromCache = false; if ( m_useCache == 0 ) readFromCache = false; // if useCache is false, don't write to cache if it was not specified if ( m_wcache == -1 ) { if ( m_useCache == 0 ) m_wcache = 0; else m_wcache = 1; } // save it m_rcache = readFromCache; // // TODO: use Parms.cpp defaults // TopicGroup *tg = &m_topicGroups[0]; // // // gigabits // // tg->m_numTopics = 50; tg->m_maxTopics = 50; tg->m_docsToScanForTopics = m_docsToScanForTopics; tg->m_minTopicScore = 0; tg->m_maxWordsPerTopic = 6; tg->m_meta[0] = '\0'; tg->m_delimeter = '\0'; tg->m_useIdfForTopics = false; tg->m_dedup = true; // need to be on at least 2 pages! tg->m_minDocCount = 2; tg->m_ipRestrict = m_ipRestrictForTopics; tg->m_dedupSamplePercent = 80; tg->m_topicRemoveOverlaps = true; tg->m_topicSampleSize = 4096; // max sequential punct chars allowedin a topic tg->m_topicMaxPunctLen = 1; m_numTopicGroups = 1; return true; } // . sets m_qbuf1[] and m_qbuf2[] // . m_qbuf1[] is the advanced query // . m_qbuf2[] is the query to be used for spell checking // . 
returns false and set g_errno on error bool SearchInput::setQueryBuffers ( HttpRequest *hr ) { m_sbuf1.reset(); m_sbuf2.reset(); m_sbuf3.reset(); int16_t qcs = csUTF8; if (m_queryCharset && m_queryCharset[0]){ // we need to convert the query string to utf-8 int32_t qclen = gbstrlen(m_queryCharset); qcs = get_iana_charset(m_queryCharset, qclen ); if (qcs == csUnknown) { //g_errno = EBADCHARSET; //g_msg = "(error: unknown query charset)"; //return false; qcs = csUTF8; } } // prepend sites terms int32_t numSites = 0; char *csStr = NULL; numSites = 0; csStr = get_charset_str(qcs); /* if ( m_sites && m_sites[0] ) { char *s = m_sites; char *t; int32_t len; m_sbuf1.pushChar('(');// *p++ = '('; loop: // skip white space while ( *s && ! is_alnum_a(*s) ) s++; // bail if done if ( ! *s ) goto done; // get length of it t = s; while ( *t && ! is_wspace_a(*t) ) t++; len = t - s; // add site: term //if ( p + 12 + len >= pend ) goto toobig; if ( numSites > 0 ) m_sbuf1.safeStrcpy ( " UOR " ); m_sbuf1.safeStrcpy ( "site:" ); //p += ucToUtf8(p, pend-p,s, len, csStr, 0,0); m_sbuf1.safeMemcpy ( s , len ); //gbmemcpy ( p , s , len ); p += len; // *p++ = ' '; m_sbuf1.pushChar(' '); s = t; numSites++; goto loop; done: m_sbuf1.safePrintf(") | "); // inc totalLen m_sitesQueryLen = m_sitesLen + (numSites * 10); } */ // prepend char *qp = hr->getString("prepend",NULL,NULL); if( qp && qp[0] ) { //if( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //p += sprintf( p, "+gblang:%"INT32" |", m_gblang ); m_sbuf1.safePrintf( "%s", qp ); } // boolean OR terms bool boolq = false; char *any = hr->getString("any",NULL); bool first = true; if ( any ) { char *s = any; char *send = any + gbstrlen(any); if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); while (s < send) { while (isspace(*s) && s < send) s++; char *s2 = s+1; if (*s == '\"') { // if there's no closing quote just treat // the end of the line as such while (*s2 != '\"' && s2 < 
send) s2++; if (s2 < send) s2++; } else { while (!isspace(*s2) && s2 < send) s2++; } if ( first ) m_sbuf1.safeStrcpy("("); if ( first ) m_sbuf2.safeStrcpy("("); if ( ! first ) m_sbuf1.safeStrcpy(" OR "); if ( ! first ) m_sbuf2.safeStrcpy(" OR "); first = false; m_sbuf1.safeMemcpy ( s , s2 - s ); m_sbuf2.safeMemcpy ( s , s2 - s ); s = s2 + 1; } } if ( ! first ) m_sbuf1.safeStrcpy(") AND "); if ( ! first ) m_sbuf2.safeStrcpy(") AND "); if ( ! first ) boolq = true; // and this if ( m_secsBack > 0 ) { int32_t timestamp = getTimeGlobalNoCore(); timestamp -= m_secsBack; if ( timestamp <= 0 ) timestamp = 0; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); m_sbuf1.safePrintf("gbminint:gbspiderdate:%"UINT32"",timestamp); } if ( m_sortBy == 1 ) { if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); m_sbuf1.safePrintf("gbsortbyint:gbspiderdate"); } if ( m_sortBy == 2 ) { if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); m_sbuf1.safePrintf("gbrevsortbyint:gbspiderdate"); } if ( m_sortBy == 3 ) { if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); m_sbuf1.safePrintf("gbsortbyint:gbsitenuminlinks"); } char *ft = m_filetype; if ( ft && strcasecmp(ft,"any")==0 ) ft = NULL; if ( ft && ! ft[0] ) ft = NULL; if ( ft ) { if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); m_sbuf1.safePrintf("filetype:%s",ft); } // facet prepend en masse // for ( int32_t i = 1 ; i <= 6 ; i++ ) { // char tmp[12]; // sprintf(tmp,"facet%"INT32"",i); // char *ff = hr->getString(tmp,NULL); // if ( ! 
ff ) continue; // if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); // m_sbuf1.safePrintf("%s",ff); // } // one at a time for now char *ff = hr->getString("facet",NULL); if ( ff ) { if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); m_sbuf1.safePrintf("%s",ff); } // append site: term // if ( m_sites && m_sites[0] ) { // if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); // m_sbuf1.safePrintf("+site:"); // m_sbuf1.safeStrcpy(m_sites); // } if ( m_familyFilter ) { if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); m_sbuf1.safePrintf( "+gbisadult:0"); //m_sbuf2.safePrintf( "+gbisadult:0"); if ( ! boolq ) { m_sbuf1.safeStrcpy(" |"); //m_sbuf2.safeStrcpy(" |"); } else { m_sbuf1.safeStrcpy(" AND "); //m_sbuf2.safeStrcpy(" AND "); } } // PRE-pend gblang: term int32_t gblang = hr->getLong("gblang",-1); if( gblang >= 0 ) { if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); m_sbuf1.safePrintf( "+gblang:%"INT32"", gblang ); m_sbuf2.safePrintf( "+gblang:%"INT32"", gblang ); if ( ! boolq ) { m_sbuf1.safeStrcpy(" |"); m_sbuf2.safeStrcpy(" |"); } else { m_sbuf1.safeStrcpy(" AND "); m_sbuf2.safeStrcpy(" AND "); } } // bookmark here so we can copy into st->m_displayQuery below //int32_t displayQueryOffset = m_sbuf1.length(); // append url: term // if ( m_url && m_url[0] ) { // //if ( p > pstart ) *p++ = ' '; // if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); // //gbmemcpy ( p , "+url:" , 5 ); p += 5; // m_sbuf1.safeStrcpy ( "+url:"); // //gbmemcpy ( p , m_url , m_urlLen ); p += m_urlLen; // m_sbuf1.safeStrcpy ( m_url ); // } // append url: term if ( m_link && m_link[0] ) { if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); m_sbuf1.safeStrcpy ( "+link:"); m_sbuf2.safeStrcpy ( "+link:"); m_sbuf1.safeStrcpy ( m_link ); m_sbuf2.safeStrcpy ( m_link ); if ( ! 
boolq ) { m_sbuf1.safeStrcpy(" |"); m_sbuf2.safeStrcpy(" |"); } else { m_sbuf1.safeStrcpy(" AND "); m_sbuf2.safeStrcpy(" AND "); } } // append the natural query if ( m_query && m_query[0] ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //p += ucToUtf8(p, pend-p, m_query, m_queryLen, csStr, 0,0); m_sbuf1.safeStrcpy ( m_query ); //gbmemcpy ( p , m_query , m_queryLen ); p += m_queryLen; // add to spell checked buf, too //if ( p2 > pstart2 ) *p2++ = ' '; if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); //p2 +=ucToUtf8(p2, pend2-p2, m_query, m_queryLen, csStr, 0,0); m_sbuf2.safeStrcpy ( m_query ); //gbmemcpy ( p2 , m_query , m_queryLen ); p2 += m_queryLen; } // if ( m_query2 && m_query2[0] ) { // //if ( p3 > pstart3 ) *p3++ = ' '; // if ( m_sbuf3.length() ) m_sbuf3.pushChar(' '); // //p3+=ucToUtf8(p3, pend3-p3, m_query2, m_query2Len, csStr,0,0); // m_sbuf3.safeStrcpy ( m_query2 ); // } //if (g_errno == EILSEQ){ // illegal character seq // log("query: bad char set"); // g_errno = 0; // if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;} // if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;} //} // append quoted phrases to query if ( m_quote1 && m_quote1[0] ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //*p++ = '+'; //*p++ = '\"'; if ( ! 
boolq ) { m_sbuf1.safeStrcpy(" +\""); m_sbuf2.safeStrcpy(" +\""); } else { m_sbuf1.safeStrcpy(" AND \""); m_sbuf2.safeStrcpy(" AND \""); } //p += ucToUtf8(p, pend-p, m_quote1, m_quoteLen1, csStr, 0,0); m_sbuf1.safeStrcpy ( m_quote1 ); //gbmemcpy ( p , m_quote1 , m_quoteLen1 ); p += m_quoteLen1 ; //*p++ = '\"'; m_sbuf1.safeStrcpy("\""); // add to spell checked buf, too //if ( p2 > pstart2 ) *p2++ = ' '; if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); //*p2++ = '+'; //*p2++ = '\"'; //p2+=ucToUtf8(p2, pend2-p2, m_quote1, m_quoteLen1, csStr,0,0); m_sbuf2.safeStrcpy ( m_quote1 ); //gbmemcpy ( p2 , m_quote1 , m_quoteLen1 ); p2 += m_quoteLen1 ; //*p2++ = '\"'; m_sbuf2.safeStrcpy("\""); } //if (g_errno == EILSEQ){ // illegal character seq // g_errno = 0; // if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;} // if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;} //} if ( m_quote2 && m_quote2[0] ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //*p++ = '+'; //*p++ = '\"'; if ( ! 
boolq ) { m_sbuf1.safeStrcpy(" +\""); m_sbuf2.safeStrcpy(" +\""); } else { m_sbuf1.safeStrcpy(" AND \""); m_sbuf2.safeStrcpy(" AND \""); } //m_sbuf1.safeStrcpy("+\""); //p += ucToUtf8(p, pend-p, m_quote2, m_quoteLen2, csStr, 0,0); m_sbuf1.safeStrcpy ( m_quote2 ); //gbmemcpy ( p , m_quote2 , m_quoteLen2 ); p += m_quoteLen2 ; //*p++ = '\"'; m_sbuf1.safeStrcpy("\""); // add to spell checked buf, too //if ( p2 > pstart2 ) *p2++ = ' '; if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); //*p2++ = '+'; //*p2++ = '\"'; //m_sbuf2.safeStrcpy("+\""); //p2+=ucToUtf8(p2, pend2-p2, m_quote2, m_quoteLen2, csStr,0,0); m_sbuf2.safeStrcpy ( m_quote2 ); //gbmemcpy ( p2 , m_quote2 , m_quoteLen2 ); p2 += m_quoteLen2 ; //*p2++ = '\"'; m_sbuf2.safeStrcpy("\""); } //if (g_errno == EILSEQ){ // illegal character seq // g_errno = 0; // if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;} // if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;} //} // append plus terms if ( m_plus && m_plus[0] ) { char *s = m_plus; char *send = m_plus + gbstrlen(m_plus); //if ( p > pstart && p < pend ) *p++ = ' '; //if ( p2 > pstart2 && p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); while (s < send) { while (isspace(*s) && s < send) s++; char *s2 = s+1; if (*s == '\"') { // if there's no closing quote just treat // the end of the line as such while (*s2 != '\"' && s2 < send) s2++; if (s2 < send) s2++; } else { while (!isspace(*s2) && s2 < send) s2++; } //if (s2 < send) break; //if (p < pend) *p++ = '+'; //if (p2 < pend2) *p2++ = '+'; //m_sbuf1.pushChar('+'); //m_sbuf2.pushChar('+'); if ( ! 
boolq ) { m_sbuf1.safeStrcpy("+"); m_sbuf2.safeStrcpy("+"); } else { m_sbuf1.safeStrcpy(" AND "); m_sbuf2.safeStrcpy(" AND "); } //p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0); //p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0); m_sbuf1.safeMemcpy ( s , s2 - s ); m_sbuf2.safeMemcpy ( s , s2 - s ); /* if (g_errno == EILSEQ) { // illegal character seq g_errno = 0; if (qcs == csUTF8) { qcs = csISOLatin1; goto doOver; } if (qcs != csISOLatin1) { qcs = csUTF8; goto doOver; } } */ s = s2 + 1; if (s < send) { //if (p < pend) *p++ = ' '; //if (p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); } } } // append minus terms if ( m_minus && m_minus[0] ) { char *s = m_minus; char *send = m_minus + gbstrlen(m_minus); //if ( p > pstart && p < pend ) *p++ = ' '; //if ( p2 > pstart2 && p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); while (s < send) { while (isspace(*s) && s < send) s++; char *s2 = s+1; if (*s == '\"') { // if there's no closing quote just treat // the end of the line as such while (*s2 != '\"' && s2 < send) s2++; if (s2 < send) s2++; } else { while (!isspace(*s2) && s2 < send) s2++; } if (s2 < send) break; //if (p < pend) *p++ = '-'; //if (p2 < pend2) *p2++ = '-'; // m_sbuf1.pushChar('-'); // m_sbuf2.pushChar('-'); if ( ! 
boolq ) { m_sbuf1.safeStrcpy("-"); m_sbuf2.safeStrcpy("-"); } else { m_sbuf1.safeStrcpy(" AND NOT "); m_sbuf2.safeStrcpy(" AND NOT "); } //p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0); //p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0); m_sbuf1.safeMemcpy ( s , s2 - s ); m_sbuf2.safeMemcpy ( s , s2 - s ); /* if (g_errno == EILSEQ) { // illegal character seq g_errno = 0; if (qcs == csUTF8) { qcs = csISOLatin1; goto doOver; } if (qcs != csISOLatin1) { qcs = csUTF8; goto doOver; } } */ s = s2 + 1; if (s < send) { //if (p < pend) *p++ = ' '; //if (p2 < pend2) *p2++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); } } } // append gbkeyword:numinlinks if they have &mininlinks=X, X>0 int32_t minInlinks = m_hr.getLong("mininlinks",0); if ( minInlinks > 0 ) { //if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); //char *str = "gbkeyword:numinlinks"; //int32_t len = gbstrlen(str); //gbmemcpy ( p , str , len ); //p += len; m_sbuf1.safePrintf ( "gbkeyword:numinlinks"); } // null terms if ( ! m_sbuf1.nullTerm() ) return false; if ( ! m_sbuf2.nullTerm() ) return false; if ( ! m_sbuf3.nullTerm() ) return false; // the natural query m_displayQuery = m_sbuf2.getBufStart();// + displayQueryOffset; if ( ! 
m_displayQuery ) m_displayQuery = ""; while ( *m_displayQuery == ' ' ) m_displayQuery++; //m_displayQueryLen = gbstrlen(m_displayQuery);//p-m_displayQuery //log("query: got query %s",m_sbuf1.getBufStart()); //log("query: got display query %s",m_displayQuery); // urlencoded display query m_qe.urlEncode ( m_displayQuery ); // urlEncode(m_qe, // MAX_QUERY_LEN*2, // m_displayQuery, // gbstrlen(m_displayQuery)); ////////// // // show DMOZ BREADCRUMB if doing a // "gbpcatid: |" (Search restricted to category) // "gbcatid:" (DMOZ urls in that topic, c=dmoz3) // ////////// int32_t pcatId = -1; int32_t dcatId = -1; // get the final query char *q =m_sbuf1.getBufStart(); if ( q ) sscanf(q,"gbpcatid:%"INT32"",&pcatId); if ( q ) sscanf(q,"gbcatid:%"INT32"",&dcatId); // pick the one that is valid int32_t catId = -1; if ( pcatId >= 0 ) catId = pcatId; if ( dcatId >= 0 ) catId = dcatId; ////// // // save catid into the state m_catId = catId; // /////// // are we a right to left language like hebrew? if ( catId > 0 && g_categories->isIdRTL(catId) ) m_isRTL = true; else m_isRTL = false; return true; } /* uint8_t SearchInput::detectQueryLanguage(void) { uint8_t lang = 0; // Check to see if default language is set. // This should override everything else. //if(m_defaultSortLanguage) // lang = getLanguageFromAbbr(m_defaultSortLanguage); // Set query language from User Agent string, if possible if(!lang && m_hr.getUserAgent()) lang= g_langId.guessLanguageFromUserAgent(m_hr.getUserAgent()); // guess from query terms if(!lang && m_q) lang = g_langId.guessLanguageFromQuery(m_q); // guess from IP addr of the requester if(!lang && m_queryIP) lang = g_langId.guessLanguageFromIP(m_queryIP); // Save for later m_langHint = lang; if(m_gbcountry && m_gbcountryLen > 0) m_country = g_countryCode.getIndexOfAbbr(m_gbcountry); if(!m_country) { // Now guess country of the query. 
char *codep = g_langId.findGeoIP(m_queryIP, geoIPNumRows - 1, 0); if(codep) m_country = g_countryCode.getIndexOfAbbr(codep); // Many doofuses just download firefox and don't set it // up properly, so this takes second place to the IP search. if(!m_country) m_country = g_langId.guessCountryFromUserAgent(m_hr.getUserAgent()); } return(lang); } */ //char getFormatFromRequest ( HttpRequest *r ) { // //}