// open-source-search-engine/SearchInput.h
// Copyright Apr 2005 Matt Wells
// . parameters from CollectionRec that can be overridden on a per query basis
// . use m_qbuf1 for the query. it has all the min/plus/quote1/quote2 advanced
// search cgi parms appended to it. it is the complete query, which is
// probably what you are looking for. m_q should also be set to that, too.
// m_query is simply the "query=" string in the url request.
// . use m_qbuf2 for the spell checker. it is missing url: link: etc fields.
// . use m_displayQuery for the query displayed in the text box. it is missing
// values from the "sites=" and "site=" cgi parms, since they show up with
// radio buttons below the search text box on the web page.
#include "Query.h" // MAX_QUERY_LEN
//#include "Msg24.h" // MAX_TOPIC_GROUPS
#ifndef _SEARCHINPUT_H_
#define _SEARCHINPUT_H_
//#define MAX_URLPARMS_LEN (MAX_URL_LEN + MAX_QUERY_LEN+ PASSWORD_MAX_LEN + 5000)
#define SI_TMPBUF_SIZE (16*1024)
#define MAX_TOPIC_GROUPS 10
// . derive the output format from the given http request
// . NOTE(review): presumably returns one of the FORMAT_* codes that
//   SearchInput::m_format documents (html/xml/json/csv) -- confirm against
//   the definition, which is not in this header
char getFormatFromRequest ( class HttpRequest *hr ) ;
// . parameters used to generate a set of related topics (gigabits)
// . you can have Msg24 generate multiple sets of related topics in one call
class TopicGroup {
 public:
	// . plain data holder: one instance per set of related topics
	// . SearchInput keeps an array of these (m_topicGroups)
	// . do not reorder or resize members without checking every user of
	//   this class; the layout may be relied upon elsewhere
	long m_numTopics;
	long m_maxTopics;
	long m_docsToScanForTopics;
	long m_minTopicScore;
	long m_maxWordsPerTopic;
	// fixed-size buffer, 32 bytes including any terminator
	char m_meta[32];
	// (sic) spelling kept because it is part of the public interface
	char m_delimeter;
	bool m_useIdfForTopics;
	bool m_dedup;
	long m_minDocCount ;
	bool m_ipRestrict ;
	// . -1 means no deduping
	// . explicitly "signed char": the signedness of plain "char" is
	//   implementation-defined, and where it is unsigned a stored -1
	//   reads back as 255, so the "no deduping" sentinel would be lost
	// . same size and layout as the plain char it replaces
	signed char m_dedupSamplePercent;
	bool m_topicRemoveOverlaps;
	long m_topicSampleSize;
	long m_topicMaxPunctLen;
};
class SearchInput {
public:
// why provide query here, it is in "hr"
bool set ( class TcpSocket *s , class HttpRequest *hr );
2013-08-03 00:12:24 +04:00
void test ( );
key_t makeKey ( ) ;
// private
void setTopicGroups ( class HttpRequest *r ,
class CollectionRec *cr ) ;
bool setQueryBuffers ( class HttpRequest *hr ) ;
2013-08-03 00:12:24 +04:00
//void setToDefaults ( class CollectionRec *cr , long niceness ) ;
void clear ( long niceness ) ;
2013-08-03 00:12:24 +04:00
// Msg40 likes to use this to pass the parms to a remote host
SearchInput ( );
~SearchInput ( );
void reset ( );
long getStoredSizeForMsg40 ( ) ;
//char *serializeForMsg40 ( long *size ) ;
//void deserializeForMsg40 ( char *buf, long bufSize, bool ownBuf ) ;
void copy ( class SearchInput *si ) ;
// Language support for Msg40
uint8_t detectQueryLanguage(void);
//bool addFacebookInterests ( char *list ) ;
//bool addInterests ( char *list , char delim, bool hasNums ) ;
//bool addInterest (char *s, long slen, char valc, bool overwrite );
2013-08-03 00:12:24 +04:00
///////////
//
// BEGIN COMPUTED THINGS
//
///////////
// we basically steal the original HttpRequest buffer and keep
// here since the original one is on the stack
HttpRequest m_hr;
TcpSocket *m_sock;
long m_niceness; // msg40
// array of collnum_t's to search... usually just one
SafeBuf m_collnumBuf;
// first collection # listed in m_collnumBuf
collnum_t m_firstCollnum;
// reset this
long m_numTopicGroups; // msg40
char *m_displayQuery; // pts into m_qbuf1
//class Hostdb *m_hostdb;
// urlencoded display query
//char m_qe [ MAX_QUERY_LEN *2 + 1 ];
// urlencoded display query. everything is compiled into this.
SafeBuf m_qe;
CollectionRec *m_cr;
// the final compiled query goes here
Query m_q;
Query *m_q2;
char m_isAdmin;
// these are set from things above
TopicGroup m_topicGroups [ MAX_TOPIC_GROUPS ];// msg40
SafeBuf m_sbuf1;
SafeBuf m_sbuf2;
SafeBuf m_sbuf3;
long m_catId;
bool m_isRTL;
// make a cookie from parms with m_flags of PF_COOKIE set
SafeBuf m_cookieBuf;
// we convert m_defaultSortLang to this number, like langEnglish
// or langFrench or langUnknown...
long m_queryLangId;
// can be 1 for FORMAT_HTML, 2 = FORMAT_XML, 3=FORMAT_JSON, 4=csv
long m_format;
2013-08-03 00:12:24 +04:00
// used as indicator by SearchInput::makeKey() for generating a
// key by hashing the parms between m_START and m_END
long m_START;
//////
//
// BEGIN USER PARMS set from HttpRequest, m_hr
//
//////
char *m_coll;
char *m_query;
char *m_prepend;
// ip address of searcher used for banning abusive IPs "uip"
char *m_userIpStr;
char m_showImages;
// general parms, not part of makeKey(), but should be serialized
char m_useCache; // msg40
char m_rcache; // msg40
char m_wcache; // msg40
char m_debug; // msg40
char m_debugGigabits;
char m_spiderResults;
char m_spiderResultRoots;
char m_spellCheck;
char *m_displayMetas; // msg40
// do not include these in makeKey()
long m_numTopicsToDisplay;
long m_refs_numToDisplay;
long m_rp_numToDisplay;
// these should all be hashed in makeKey()
//char *m_rp_externalColl; // msg40
//char *m_importColl; // msg40
char *m_queryCharset;
char *m_gbcountry;
uint8_t m_country;
//char *m_query2;
// advanced query parms
char *m_url; // for url: search
char *m_sites;
char *m_plus;
char *m_minus;
char *m_link;
char *m_quote1;
char *m_quote2;
// co-branding parms
char *m_imgUrl;
char *m_imgLink;
long m_imgWidth;
long m_imgHeight;
long m_titleMaxLen;
// for limiting results by score in the widget
double m_maxSerpScore;
long long m_minSerpDocId;
// prefer what lang in the results. it gets a 20x boost. "en" "xx" "fr"
char *m_defaultSortLang;
// prefer what country in the results. currently unused. support later.
char *m_defaultSortCountry;
2013-08-03 00:12:24 +04:00
// general parameters
char m_dedupURL;
long m_percentSimilarSummary; // msg40
char m_showBanned;
char m_allowPunctInPhrase;
char m_excludeLinkText;
char m_excludeMetaText;
char m_doBotDetection;
long m_includeCachedCopy;
char m_getSectionVotingInfo;
2013-08-03 00:12:24 +04:00
char m_familyFilter; // msg40
//char m_restrictIndexdbForQuery; // msg40
2013-08-03 00:12:24 +04:00
char m_doSiteClustering; // msg40
char m_doDupContentRemoval; // msg40
char m_getDocIdScoringInfo;
char m_hideAllClustered;
2013-08-03 00:12:24 +04:00
// ranking algos
char m_useMinAlgo;
char m_useNewAlgo;
char m_doMaxScoreAlgo;
char m_adFeedEnabled;
// intersection speed up shortcut? "&fi=1", defaults to on
char m_fastIntersection;
// stream results back on socket in streaming mode, usefule when
// thousands of results are requested
2014-02-05 05:05:43 +04:00
char m_streamResults;
2013-08-03 00:12:24 +04:00
// . related topic (gigabits) parameters
// . TODO: prepend m_top_ to these var names
long m_docsToScanForTopics; // msg40
long m_minTopicScore; // msg40
long m_minDocCount; // msg40
long m_dedupSamplePercent; // msg40
long m_maxWordsPerTopic; // msg40
long m_ipRestrictForTopics; // msg40
char m_returnDocIdCount; // msg40
char m_returnDocIds; // msg40
char m_returnPops; // msg40
// . reference page parameters
// . copied from CollectionRec.h
long m_refs_numToGenerate; // msg40
long m_refs_docsToScan; // msg40
long m_refs_minQuality; // msg40
long m_refs_minLinksPerReference; // msg40
long m_refs_maxLinkers; // msg40
float m_refs_additionalTRFetch; // msg40
long m_refs_numLinksCoefficient; // msg40
long m_refs_qualityCoefficient; // msg40
long m_refs_linkDensityCoefficient; // msg40
char m_refs_multiplyRefScore; // msg40
// . related page parameters
// . copied from CollectionRec.h
long m_rp_numToGenerate; // msg40
long m_rp_numLinksPerDoc; // msg40
long m_rp_minQuality; // msg40
long m_rp_minScore; // msg40
long m_rp_minLinks; // msg40
long m_rp_numLinksCoeff; // msg40
long m_rp_avgLnkrQualCoeff; // msg40
long m_rp_qualCoeff; // msg40
long m_rp_srpLinkCoeff; // msg40
long m_rp_numSummaryLines; // msg40
long m_rp_titleTruncateLimit; // msg40
char m_rp_useResultsAsReferences; // msg40
char m_rp_getExternalPages; // msg40
// unused pqr stuff
int8_t m_langHint;
float m_languageUnknownWeight;
float m_languageWeightFactor;
char m_enableLanguageSorting;
uint8_t m_countryHint;
char m_useLanguagePages;
2013-08-03 00:12:24 +04:00
//char m_getTitleRec;
2013-08-03 00:12:24 +04:00
// buzz uses this
//char m_getSitePops;
2013-08-03 00:12:24 +04:00
// search result knobs
long m_realMaxTop;
// general parameters
long m_numLinesInSummary; // msg40
long m_summaryMaxWidth; // msg40
long m_summaryMaxNumCharsPerLine;
2013-08-03 00:12:24 +04:00
long m_docsWanted; // msg40
long m_firstResultNum; // msg40
long m_boolFlag; // msg40
long m_numResultsToImport; // msg40
float m_importWeight;
long m_numLinkerWeight;
long m_minLinkersPerImportedResult; // msg40
char m_doQueryHighlighting; // msg40
char *m_highlightQuery;
Query m_hqq;
long m_queryMatchOffsets;
long m_summaryMode;
2014-04-05 22:33:42 +04:00
// are we doing a QA query for quality assurance consistency
char m_qa;
long m_docsToScanForReranking;
2013-08-03 00:12:24 +04:00
float m_pqr_demFactSubPhrase;
float m_pqr_demFactCommonInlinks;
float m_pqr_demFactLocTitle;
float m_pqr_demFactLocSummary;
float m_pqr_demFactLocDmoz;
float m_pqr_demFactProximity;
float m_pqr_demFactInSection;
float m_pqr_demFactOrigScore;
bool m_pqr_demInTopics;
// . buzz stuff (buzz)
// . these controls the set of results, so should be in the makeKey()
// as it is, in between the start and end hash vars
long m_displayInlinks;
long m_displayOutlinks;
char m_displayTermFreqs;
char m_justMarkClusterLevels;
//char m_queryPrepend[41];
2013-08-03 00:12:24 +04:00
// new sort/constrain by date stuff
char m_useDateLists;
long m_maxQueryTerms;
// for the news collection really
char m_considerTitlesFromBody;
long m_maxClusterByTopicResults;
long m_numExtraClusterByTopicResults;
// we do not do summary deduping, and other filtering with docids
// only, so can change the result and should be part of the key
char m_docIdsOnly; // msg40
char *m_formatStr;
2013-08-03 00:12:24 +04:00
// this should be part of the key because it will affect the results!
char m_queryExpansion;
long m_maxRealTimeInlinks;
////////
//
// END USER PARMS
//
////////
2013-08-03 00:12:24 +04:00
// . end the section we hash in SearchInput::makeKey()
// . we also hash displayMetas, TopicGroups and Query into the key
long m_END_HASH;
//////
//
// STUFF NOT REALLY USED NWO
//
//////
// for selecting a language
//char m_queryLang;
//long m_gblang;
// post query reranking
//long m_docsToScanForReranking;
2013-08-03 00:12:24 +04:00
//char *m_url;
//SafeBuf m_whiteListBuf;
2013-08-03 00:12:24 +04:00
// password is for all
//char *m_pwd;
2013-08-03 00:12:24 +04:00
// for /addurl?u=www.foo.com
//char *m_url2;
2013-08-03 00:12:24 +04:00
2014-05-06 21:47:57 +04:00
2013-08-03 00:12:24 +04:00
// for /get?d=xxxxx&strip=0&ih=1&qh=1
//long long m_docId;
//long m_strip;
//char m_includeHeader;
//char m_queryHighlighting;
//char m_doDateHighlighting;
//long m_useAdFeedNum;
2013-08-03 00:12:24 +04:00
//char *m_username;
2013-08-03 00:12:24 +04:00
// true if query is directly from an end-user
//char m_endUser;
2013-08-03 00:12:24 +04:00
//long m_maxResults; // msg40
2013-08-03 00:12:24 +04:00
// a marker for SearchInput::test()
long m_END_TEST;
};
#endif