mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
Index numbers as integers too, not just floats,
so that we can sort by spider date without losing 128 seconds of resolution.
This commit is contained in:
parent
63e95c3b2d
commit
2d4af1aefe
@ -1096,7 +1096,7 @@ void Msg39::estimateHits ( ) {
|
||||
|
||||
// convenience ptrs. we will store the docids/scores into these arrays
|
||||
long long *topDocIds;
|
||||
float *topScores;
|
||||
double *topScores;
|
||||
key_t *topRecs;
|
||||
|
||||
// numDocIds counts docs in all tiers when using toptree.
|
||||
@ -1163,7 +1163,7 @@ void Msg39::estimateHits ( ) {
|
||||
mr.ptr_clusterRecs = NULL;
|
||||
// this is how much space to reserve
|
||||
mr.size_docIds = 8 * numDocIds; // long long
|
||||
mr.size_scores = 4 * numDocIds; // float
|
||||
mr.size_scores = sizeof(double) * numDocIds; // float
|
||||
// if not doing site clustering, we won't have these perhaps...
|
||||
if ( m_gotClusterRecs )
|
||||
mr.size_clusterRecs = sizeof(key_t) *numDocIds;
|
||||
@ -1191,7 +1191,7 @@ void Msg39::estimateHits ( ) {
|
||||
return ;
|
||||
}
|
||||
topDocIds = (long long *) mr.ptr_docIds;
|
||||
topScores = (float *) mr.ptr_scores;
|
||||
topScores = (double *) mr.ptr_scores;
|
||||
topRecs = (key_t *) mr.ptr_clusterRecs;
|
||||
}
|
||||
|
||||
@ -1225,6 +1225,8 @@ void Msg39::estimateHits ( ) {
|
||||
//add it to the reply
|
||||
topDocIds [docCount] = t->m_docId;
|
||||
topScores [docCount] = t->m_score;
|
||||
if ( m_tt.m_useIntScores )
|
||||
topScores[docCount] = (double)t->m_intScore;
|
||||
// supply clusterdb rec? only for full splits
|
||||
if ( m_gotClusterRecs )
|
||||
topRecs [docCount] = t->m_clusterRec;
|
||||
|
2
Msg39.h
2
Msg39.h
@ -158,7 +158,7 @@ public:
|
||||
long m_errno;
|
||||
|
||||
char *ptr_docIds ; // the results, long long
|
||||
char *ptr_scores; ; // floats
|
||||
char *ptr_scores; ; // now doubles! so we can have intScores
|
||||
char *ptr_scoreInfo ; // transparency info
|
||||
char *ptr_pairScoreBuf ; // transparency info
|
||||
char *ptr_singleScoreBuf ; // transparency info
|
||||
|
31
Msg3a.cpp
31
Msg3a.cpp
@ -277,8 +277,8 @@ bool Msg3a::gotCacheReply ( ) {
|
||||
m_docIds = (long long *)p;
|
||||
p += 8 * m_numDocIds;
|
||||
// scores
|
||||
m_scores = (float *)p;
|
||||
p += sizeof(float) * m_numDocIds;
|
||||
m_scores = (double *)p;
|
||||
p += sizeof(double) * m_numDocIds;
|
||||
// site hashes
|
||||
m_siteHashes26 = (long *)p;
|
||||
p += 4 * m_numDocIds;
|
||||
@ -727,20 +727,20 @@ bool Msg3a::gotAllSplitReplies ( ) {
|
||||
if ( ! m_debug ) continue;
|
||||
// cast these for printing out
|
||||
long long *docIds = (long long *)mr->ptr_docIds;
|
||||
score_t *scores = (score_t *)mr->ptr_scores;
|
||||
double *scores = (double *)mr->ptr_scores;
|
||||
// print out every docid in this split reply
|
||||
for ( long j = 0; j < mr->m_numDocIds ; j++ ) {
|
||||
// print out score_t
|
||||
logf( LOG_DEBUG,
|
||||
"query: msg3a: [%lu] %03li) "
|
||||
"split=%li docId=%012llu domHash=0x%02lx "
|
||||
"score=%lu" ,
|
||||
"score=%f" ,
|
||||
(unsigned long)this ,
|
||||
j ,
|
||||
i ,
|
||||
docIds [j] ,
|
||||
(long)g_titledb.getDomHash8FromDocId(docIds[j]),
|
||||
(long)scores[j] );
|
||||
(float)scores[j] );
|
||||
}
|
||||
}
|
||||
|
||||
@ -772,7 +772,7 @@ bool Msg3a::gotAllSplitReplies ( ) {
|
||||
for ( long i = 0 ; i < max ; i++ )
|
||||
cr.pushLongLong(m_docIds[i] );
|
||||
for ( long i = 0 ; i < max ; i++ )
|
||||
cr.pushFloat(m_scores[i]);
|
||||
cr.pushDouble(m_scores[i]);
|
||||
for ( long i = 0 ; i < max ; i++ )
|
||||
cr.pushLong(getSiteHash26(i));
|
||||
// sanity
|
||||
@ -849,7 +849,7 @@ bool Msg3a::mergeLists ( ) {
|
||||
// . tcPtr = term count. how many required query terms does the doc
|
||||
// have? formerly called topExplicits in IndexTable2.cpp
|
||||
long long *diPtr [MAX_INDEXDB_SPLIT];
|
||||
float *rsPtr [MAX_INDEXDB_SPLIT];
|
||||
double *rsPtr [MAX_INDEXDB_SPLIT];
|
||||
key_t *ksPtr [MAX_INDEXDB_SPLIT];
|
||||
long long *diEnd [MAX_INDEXDB_SPLIT];
|
||||
for ( long j = 0; j < m_numHosts ; j++ ) {
|
||||
@ -863,7 +863,7 @@ bool Msg3a::mergeLists ( ) {
|
||||
continue;
|
||||
}
|
||||
diPtr [j] = (long long *)mr->ptr_docIds;
|
||||
rsPtr [j] = (float *)mr->ptr_scores;
|
||||
rsPtr [j] = (double *)mr->ptr_scores;
|
||||
ksPtr [j] = (key_t *)mr->ptr_clusterRecs;
|
||||
diEnd [j] = (long long *)(mr->ptr_docIds +
|
||||
mr->m_numDocIds * 8);
|
||||
@ -919,7 +919,8 @@ bool Msg3a::mergeLists ( ) {
|
||||
|
||||
// . how much do we need to store final merged docids, etc.?
|
||||
// . docid=8 score=sizeof(double) clusterRecs=key_t scoreInfo=sizeof(DocIdScore *) clusterLevels=1
|
||||
long need = m_docsToGet * (8+4+sizeof(key_t)+sizeof(DocIdScore *)+1);
|
||||
long need = m_docsToGet * (8+sizeof(double)+
|
||||
sizeof(key_t)+sizeof(DocIdScore *)+1);
|
||||
// allocate it
|
||||
m_finalBuf = (char *)mmalloc ( need , "finalBuf" );
|
||||
m_finalBufSize = need;
|
||||
@ -928,7 +929,7 @@ bool Msg3a::mergeLists ( ) {
|
||||
// hook into it
|
||||
char *p = m_finalBuf;
|
||||
m_docIds = (long long *)p; p += m_docsToGet * 8;
|
||||
m_scores = (float *)p; p += m_docsToGet * sizeof(float);
|
||||
m_scores = (double *)p; p += m_docsToGet * sizeof(double);
|
||||
m_clusterRecs = (key_t *)p; p += m_docsToGet * sizeof(key_t);
|
||||
m_clusterLevels = (char *)p; p += m_docsToGet * 1;
|
||||
m_scoreInfos = (DocIdScore **)p;p+=m_docsToGet*sizeof(DocIdScore *);
|
||||
@ -1078,7 +1079,7 @@ bool Msg3a::mergeLists ( ) {
|
||||
|
||||
// store it as a double (scores used to be floats, i.e. rscore_t).
|
||||
// we do this to make it easier for PostQueryRerank.cpp
|
||||
m_scores [m_numDocIds]=(float)*rsPtr[maxj];
|
||||
m_scores [m_numDocIds]=(double)*rsPtr[maxj];
|
||||
if ( m_r->m_doSiteClustering )
|
||||
m_clusterRecs[m_numDocIds]= *ksPtr[maxj];
|
||||
// clear this out
|
||||
@ -1142,7 +1143,7 @@ bool Msg3a::mergeLists ( ) {
|
||||
long Msg3a::getStoredSize ( ) {
|
||||
// docId=8, scores=sizeof(rscore_t), clusterLevel=1 bitScores=1
|
||||
// eventIds=1
|
||||
long need = m_numDocIds * ( 8 + sizeof(rscore_t) + 1 ) +
|
||||
long need = m_numDocIds * ( 8 + sizeof(double) + 1 ) +
|
||||
4 + // m_numDocIds
|
||||
8 ; // m_numTotalEstimatedHits (estimated # of results)
|
||||
return need;
|
||||
@ -1158,8 +1159,8 @@ long Msg3a::serialize ( char *buf , char *bufEnd ) {
|
||||
// store each docid, 8 bytes each
|
||||
memcpy ( p , m_docIds , m_numDocIds * 8 ); p += m_numDocIds * 8;
|
||||
// store scores
|
||||
memcpy ( p , m_scores , m_numDocIds * sizeof(rscore_t) );
|
||||
p += m_numDocIds * sizeof(rscore_t) ;
|
||||
memcpy ( p , m_scores , m_numDocIds * sizeof(double) );
|
||||
p += m_numDocIds * sizeof(double) ;
|
||||
// store cluster levels
|
||||
memcpy ( p , m_clusterLevels , m_numDocIds ); p += m_numDocIds;
|
||||
// sanity check
|
||||
@ -1178,7 +1179,7 @@ long Msg3a::deserialize ( char *buf , char *bufEnd ) {
|
||||
// get each docid, 8 bytes each
|
||||
m_docIds = (long long *)p; p += m_numDocIds * 8;
|
||||
// get scores
|
||||
m_scores = (rscore_t *)p; p += m_numDocIds * sizeof(rscore_t) ;
|
||||
m_scores = (double *)p; p += m_numDocIds * sizeof(double) ;
|
||||
// get cluster levels
|
||||
m_clusterLevels = (char *)p; p += m_numDocIds;
|
||||
// sanity check
|
||||
|
4
Msg3a.h
4
Msg3a.h
@ -61,7 +61,7 @@ public:
|
||||
// we basically turn the scores we get from each msg39 split into
|
||||
// doubles (formerly floats, rscore_t) and store them as doubles so that PostQueryRerank
|
||||
// has an easier time
|
||||
float *getScores ( ) { return m_scores; };
|
||||
double *getScores ( ) { return m_scores; };
|
||||
long getNumDocIds ( ) { return m_numDocIds; };
|
||||
|
||||
long getSiteHash26 ( long i ) {
|
||||
@ -160,7 +160,7 @@ public:
|
||||
|
||||
// final merged lists go here
|
||||
long long *m_docIds ;
|
||||
float *m_scores ;
|
||||
double *m_scores ;
|
||||
class DocIdScore **m_scoreInfos ;
|
||||
//key_t *m_recs ; // clusterdb recs
|
||||
key_t *m_clusterRecs ;
|
||||
|
@ -3047,9 +3047,9 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
"stream=1&" // stream results back as we get them
|
||||
"q="
|
||||
// put NEWEST on top
|
||||
"gbsortby%%3Agbspiderdate+"
|
||||
"gbsortbyint%%3Agbspiderdate+"
|
||||
// min spider date = now - 10 mins
|
||||
"gbmin%%3Agbspiderdate%%3A%li&"
|
||||
"gbminint%%3Agbspiderdate%%3A%li&"
|
||||
//"debug=1"
|
||||
"prepend=type%%3Ajson"
|
||||
">"
|
||||
|
@ -2100,8 +2100,10 @@ bool printResult ( State0 *st, long ix ) {
|
||||
sb->incrementLength(-1);
|
||||
// crap, we lose resolution storing as a float
|
||||
// so fix that shit here...
|
||||
float f = mr->m_lastSpidered;
|
||||
sb->safePrintf(",\"lastSpiderTimeUTC\":%.0f}",f);
|
||||
//float f = mr->m_lastSpidered;
|
||||
//sb->safePrintf(",\"lastCrawlTimeUTC\":%.0f}",f);
|
||||
sb->safePrintf(",\"lastCrawlTimeUTC\":%li}",
|
||||
mr->m_lastSpidered);
|
||||
}
|
||||
|
||||
//mr->size_content );
|
||||
|
69
Posdb.cpp
69
Posdb.cpp
@ -4118,11 +4118,16 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
|
||||
// assume not sorting by a numeric termlist
|
||||
m_sortByTermNum = -1;
|
||||
m_sortByTermNumInt = -1;
|
||||
|
||||
// now we have score ranges for gbmin:price:1.99 etc.
|
||||
m_minScoreTermNum = -1;
|
||||
m_maxScoreTermNum = -1;
|
||||
|
||||
// for gbminint:count:99 etc.
|
||||
m_minScoreTermNumInt = -1;
|
||||
m_maxScoreTermNumInt = -1;
|
||||
|
||||
//for ( long i = 0 ; i < m_msg2->getNumLists() ; i++ ) {
|
||||
for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
|
||||
QueryTerm *qt = &m_q->m_qterms[i];
|
||||
@ -4141,6 +4146,14 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
if ( qt->m_fieldCode == FIELD_GBSORTBY ||
|
||||
qt->m_fieldCode == FIELD_GBREVSORTBY )
|
||||
m_sortByTermNum = i;
|
||||
|
||||
if ( qt->m_fieldCode == FIELD_GBSORTBYINT ||
|
||||
qt->m_fieldCode == FIELD_GBREVSORTBYINT ) {
|
||||
m_sortByTermNumInt = i;
|
||||
// tell topTree to use int scores
|
||||
m_topTree->m_useIntScores = true;
|
||||
}
|
||||
|
||||
// is it gbmin:price:1.99?
|
||||
if ( qt->m_fieldCode == FIELD_GBNUMBERMIN ) {
|
||||
m_minScoreTermNum = i;
|
||||
@ -4150,6 +4163,14 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
m_maxScoreTermNum = i;
|
||||
m_maxScoreVal = qt->m_qword->m_float;
|
||||
}
|
||||
if ( qt->m_fieldCode == FIELD_GBNUMBERMININT ) {
|
||||
m_minScoreTermNumInt = i;
|
||||
m_minScoreValInt = qt->m_qword->m_int;
|
||||
}
|
||||
if ( qt->m_fieldCode == FIELD_GBNUMBERMAXINT ) {
|
||||
m_maxScoreTermNumInt = i;
|
||||
m_maxScoreValInt = qt->m_qword->m_int;
|
||||
}
|
||||
// count
|
||||
long nn = 0;
|
||||
// also add in bigram lists
|
||||
@ -4277,6 +4298,15 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
if (qt->m_fieldCode == FIELD_GBNUMBERMAX )
|
||||
qti->m_bigramFlags[nn]|=BF_NUMBER;
|
||||
|
||||
if (qt->m_fieldCode == FIELD_GBSORTBYINT )
|
||||
qti->m_bigramFlags[nn]|=BF_NUMBER;
|
||||
if (qt->m_fieldCode == FIELD_GBREVSORTBYINT )
|
||||
qti->m_bigramFlags[nn]|=BF_NUMBER;
|
||||
if (qt->m_fieldCode == FIELD_GBNUMBERMININT )
|
||||
qti->m_bigramFlags[nn]|=BF_NUMBER;
|
||||
if (qt->m_fieldCode == FIELD_GBNUMBERMAXINT )
|
||||
qti->m_bigramFlags[nn]|=BF_NUMBER;
|
||||
|
||||
// only really add if useful
|
||||
// no, because when inserting NEW (related) terms that are
|
||||
// not currently in the document, this list may initially
|
||||
@ -5295,6 +5325,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
char siteRank =0;
|
||||
char docLang =0;
|
||||
float score;
|
||||
long intScore;
|
||||
float minScore;
|
||||
float minPairScore;
|
||||
float minSingleScore;
|
||||
@ -5365,6 +5396,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
|
||||
// do not do it if we got a gbsortby: field
|
||||
if ( m_sortByTermNum >= 0 ) nnn = 0;
|
||||
if ( m_sortByTermNumInt >= 0 ) nnn = 0;
|
||||
|
||||
/*
|
||||
// skip all this if getting score of just one docid on special
|
||||
@ -5653,6 +5685,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
pass0++;
|
||||
|
||||
if ( m_sortByTermNum >= 0 ) goto skipScoringFilter;
|
||||
if ( m_sortByTermNumInt >= 0 ) goto skipScoringFilter;
|
||||
|
||||
// test why we are slow
|
||||
//if ( (s_sss++ % 8) != 0 ) { docIdPtr += 6; fail0++; goto docIdLoop;}
|
||||
@ -6493,11 +6526,18 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
score = g_posdb.getFloat ( miniMergedList[m_sortByTermNum] );
|
||||
}
|
||||
|
||||
if ( m_sortByTermNumInt >= 0 ) {
|
||||
// no term?
|
||||
if ( ! miniMergedList[m_sortByTermNumInt] ) goto advance;
|
||||
intScore = g_posdb.getInt( miniMergedList[m_sortByTermNumInt]);
|
||||
}
|
||||
|
||||
// skip docid if outside of range
|
||||
if ( m_minScoreTermNum >= 0 ) {
|
||||
// no term?
|
||||
if ( ! miniMergedList[m_minScoreTermNum] ) goto advance;
|
||||
float score2 = g_posdb.getFloat ( miniMergedList[m_minScoreTermNum] );
|
||||
float score2 ;
|
||||
score2= g_posdb.getFloat ( miniMergedList[m_minScoreTermNum] );
|
||||
if ( score2 < m_minScoreVal ) goto advance;
|
||||
}
|
||||
|
||||
@ -6505,10 +6545,29 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
if ( m_maxScoreTermNum >= 0 ) {
|
||||
// no term?
|
||||
if ( ! miniMergedList[m_maxScoreTermNum] ) goto advance;
|
||||
float score2 = g_posdb.getFloat ( miniMergedList[m_maxScoreTermNum] );
|
||||
float score2 ;
|
||||
score2= g_posdb.getFloat ( miniMergedList[m_maxScoreTermNum] );
|
||||
if ( score2 > m_maxScoreVal ) goto advance;
|
||||
}
|
||||
|
||||
// skip docid if outside of range
|
||||
if ( m_minScoreTermNumInt >= 0 ) {
|
||||
// no term?
|
||||
if ( ! miniMergedList[m_minScoreTermNumInt] ) goto advance;
|
||||
long score3;
|
||||
score3=g_posdb.getInt(miniMergedList[m_minScoreTermNumInt]);
|
||||
if ( score3 < m_minScoreValInt ) goto advance;
|
||||
}
|
||||
|
||||
// skip docid if outside of range
|
||||
if ( m_maxScoreTermNumInt >= 0 ) {
|
||||
// no term?
|
||||
if ( ! miniMergedList[m_maxScoreTermNumInt] ) goto advance;
|
||||
long score3 ;
|
||||
score3= g_posdb.getInt ( miniMergedList[m_maxScoreTermNumInt]);
|
||||
if ( score3 > m_maxScoreValInt ) goto advance;
|
||||
}
|
||||
|
||||
|
||||
// . seoDebug hack so we can set "dcs"
|
||||
// . we only come here if we actually made it into m_topTree
|
||||
@ -6606,6 +6665,12 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
// set the score and docid ptr
|
||||
t->m_score = score;
|
||||
t->m_docId = m_docId;
|
||||
// use an integer score like lastSpidered timestamp?
|
||||
if ( m_sortByTermNumInt >= 0 ) {
|
||||
t->m_intScore = intScore;
|
||||
t->m_score = 0.0;
|
||||
if ( ! m_topTree->m_useIntScores){char *xx=NULL;*xx=0;}
|
||||
}
|
||||
// . this will not add if tree is full and it is less than the
|
||||
// m_lowNode in score
|
||||
// . if it does get added to a full tree, lowNode will be
|
||||
|
15
Posdb.h
15
Posdb.h
@ -208,10 +208,16 @@ class Posdb {
|
||||
// Write the float "f" into the key at "vkp", starting 2 bytes past the
// beginning of the key. getFloat() reads it back from the same offset.
void setFloat ( void *vkp , float f ) {
	char *slot = (char *)vkp + 2;
	*(float *)slot = f;
};
|
||||
|
||||
// Write the integer "x" into the key at "vkp", starting 2 bytes past the
// beginning of the key, i.e. the same offset setFloat() uses.
// NOTE(review): this stores sizeof(long) bytes. The callers' comments
// describe a 4-byte integer, which holds where long is 32 bits -- confirm
// before building with a 64-bit long.
void setInt ( void *vkp , long x ) {
	char *slot = (char *)vkp + 2;
	*(long *)slot = x;
};
|
||||
|
||||
// Read back the float stored 2 bytes into the key at "vkp"
// (the counterpart of setFloat()).
float getFloat ( void *vkp ) {
	char *slot = (char *)vkp + 2;
	return *(float *)slot;
};
|
||||
|
||||
// Read back the integer stored 2 bytes into the key at "vkp"
// (the counterpart of setInt()).
// NOTE(review): reads sizeof(long) bytes, mirroring setInt() -- confirm the
// intended width if long is not 32 bits.
long getInt ( void *vkp ) {
	char *slot = (char *)vkp + 2;
	return *(long *)slot;
};
|
||||
|
||||
void setAlignmentBit ( void *vkp , char val ) {
|
||||
char *p = (char *)vkp;
|
||||
if ( val ) p[1] = p[1] | 0x02;
|
||||
@ -610,6 +616,7 @@ class PosdbTable {
|
||||
|
||||
// for gbsortby:item.price ...
|
||||
long m_sortByTermNum;
|
||||
long m_sortByTermNumInt;
|
||||
|
||||
// for gbmin:price:1.99
|
||||
long m_minScoreTermNum;
|
||||
@ -619,6 +626,14 @@ class PosdbTable {
|
||||
float m_minScoreVal;
|
||||
float m_maxScoreVal;
|
||||
|
||||
// term numbers for gbminint:count:99 and gbmaxint:count:99
|
||||
long m_minScoreTermNumInt;
|
||||
long m_maxScoreTermNumInt;
|
||||
|
||||
// the integer bounds given by gbminint:count:99 and gbmaxint:count:99
|
||||
long m_minScoreValInt;
|
||||
long m_maxScoreValInt;
|
||||
|
||||
|
||||
// the new intersection/scoring algo
|
||||
void intersectLists10_r ( );
|
||||
|
60
Query.cpp
60
Query.cpp
@ -2187,6 +2187,11 @@ bool Query::setQWords ( char boolFlag ,
|
||||
if ( fieldCode == FIELD_GBNUMBERMAX )
|
||||
ph = hash64 ("gbsortby", 8);
|
||||
|
||||
if ( fieldCode == FIELD_GBNUMBERMININT )
|
||||
ph = hash64 ("gbsortbyint", 11);
|
||||
if ( fieldCode == FIELD_GBNUMBERMAXINT )
|
||||
ph = hash64 ("gbsortbyint", 11);
|
||||
|
||||
// ptr to field, if any
|
||||
|
||||
qw->m_fieldCode = fieldCode;
|
||||
@ -2213,8 +2218,14 @@ bool Query::setQWords ( char boolFlag ,
|
||||
// gbmin:price:1.23
|
||||
fieldCode == FIELD_GBNUMBERMIN ||
|
||||
fieldCode == FIELD_GBNUMBERMAX ||
|
||||
|
||||
fieldCode == FIELD_GBSORTBYINT ||
|
||||
fieldCode == FIELD_GBREVSORTBYINT ||
|
||||
fieldCode == FIELD_GBNUMBERMININT ||
|
||||
fieldCode == FIELD_GBNUMBERMAXINT ||
|
||||
|
||||
fieldCode == FIELD_GBAD ) {
|
||||
// . find first space -- that terminates the field value
|
||||
// . find 1st space -- that terminates the field value
|
||||
// . make "end" point to the end of the entire query
|
||||
char *end =
|
||||
(words.m_words[words.m_numWords-1] +
|
||||
@ -2222,13 +2233,14 @@ bool Query::setQWords ( char boolFlag ,
|
||||
// use this for gbmin:price:1.99 etc.
|
||||
long firstColonLen = -1;
|
||||
// "w" points to the first alnumword after the field,
|
||||
// so for site:xyz.com "w" points to the 'x' and wlen would
|
||||
// be 3 in that case sinze xyz is a word of 3 chars. so advance
|
||||
// so for site:xyz.com "w" points to the 'x' and wlen
|
||||
// would be 3 in that case since xyz is a word of 3
|
||||
// chars. so advance
|
||||
// wlen until we hit a space.
|
||||
while ( w + wlen < end ) {
|
||||
// stop at first white space
|
||||
if ( is_wspace_utf8(w+wlen) ) break;
|
||||
// in the case of gbmin:price:1.99 record first ':'
|
||||
// in case of gbmin:price:1.99 record first ':'
|
||||
if ( w[wlen]==':' ) firstColonLen = wlen;
|
||||
wlen++;
|
||||
}
|
||||
@ -2238,21 +2250,28 @@ bool Query::setQWords ( char boolFlag ,
|
||||
unsigned long long wid = hash64 ( w , wlen, 0LL );
|
||||
|
||||
// i've decided not to make
|
||||
// gbsortby:products.offerPrice gbmin:price:1.23 case insensitive
|
||||
// gbsortby:products.offerPrice
|
||||
// gbmin:price:1.23 case insensitive
|
||||
if ( fieldCode == FIELD_GBSORTBY ||
|
||||
fieldCode == FIELD_GBREVSORTBY )
|
||||
fieldCode == FIELD_GBREVSORTBY ||
|
||||
fieldCode == FIELD_GBSORTBYINT ||
|
||||
fieldCode == FIELD_GBREVSORTBYINT )
|
||||
wid = hash64Lower_utf8 ( w , wlen , 0LL );
|
||||
|
||||
// gbmin:price:1.23
|
||||
if ( firstColonLen>0 &&
|
||||
( fieldCode == FIELD_GBNUMBERMIN ||
|
||||
fieldCode == FIELD_GBNUMBERMAX ) ) {
|
||||
fieldCode == FIELD_GBNUMBERMAX ||
|
||||
// gbminint:count:99 also carries "field:value", so it must be split at the
// first colon here just like gbmin/gbmax/gbmaxint; otherwise qw->m_int is
// never set for it and the whole "count:99" string gets hashed as the field.
fieldCode == FIELD_GBNUMBERMININT ||
|
||||
fieldCode == FIELD_GBNUMBERMAXINT ) ) {
|
||||
// record the field
|
||||
wid = hash64Lower_utf8 ( w , firstColonLen , 0LL );
|
||||
wid = hash64Lower_utf8(w,firstColonLen , 0LL );
|
||||
// and also the floating point after that
|
||||
qw->m_float = atof ( w + firstColonLen + 1 );
|
||||
qw->m_int = (long)atoll( w + firstColonLen+1);
|
||||
}
|
||||
|
||||
|
||||
// should we have normalized before hashing?
|
||||
if ( fieldCode == FIELD_URL ||
|
||||
fieldCode == FIELD_GBPARENTURL ||
|
||||
@ -3078,9 +3097,12 @@ struct QueryField g_fields[] = {
|
||||
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,""},
|
||||
{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,""},
|
||||
{"gbcontenthash", FIELD_GBCONTENTHASH, false,""},
|
||||
{"gbsortby", FIELD_GBSORTBY, false,"Example: gbsortby:price. Fields can be "
|
||||
|
||||
{"gbsortby", FIELD_GBSORTBY, false,
|
||||
"Example: gbsortby:price. Fields can be "
|
||||
"in JSON or in meta tag."},
|
||||
{"gbrevsortby", FIELD_GBREVSORTBY, false,"Example: gbrevsortby:item.price . "
|
||||
{"gbrevsortby", FIELD_GBREVSORTBY, false,
|
||||
"Example: gbrevsortby:item.price . "
|
||||
"Fields can be in JSON or in meta tag."},
|
||||
|
||||
// gbmin:price:1.23
|
||||
@ -3088,6 +3110,20 @@ struct QueryField g_fields[] = {
|
||||
"fields can be in JSON or in meta tag."},
|
||||
{"gbmax", FIELD_GBNUMBERMAX, false,"Usage: gbmax:price:1.99"},
|
||||
|
||||
|
||||
{"gbsortbyint", FIELD_GBSORTBYINT, false,
|
||||
"Example: gbsortbyint:intfield . Fields can be "
|
||||
"in JSON or in meta tag."},
|
||||
{"gbrevsortbyint", FIELD_GBREVSORTBYINT, false,
|
||||
"Example: gbrevsortbyint:item.count . "
|
||||
"Fields can be in JSON or in meta tag."},
|
||||
{"gbminint", FIELD_GBNUMBERMININT, false,
|
||||
"Usage: gbminint:count:99 . Numeric "
|
||||
"fields can be in JSON or in meta tag."},
|
||||
{"gbmaxint", FIELD_GBNUMBERMAXINT, false,
|
||||
"Usage: gbmaxint:count:99"},
|
||||
|
||||
|
||||
{"gbcountry",FIELD_GBCOUNTRY,false,""},
|
||||
{"gbad",FIELD_GBAD,false,""},
|
||||
|
||||
@ -3108,7 +3144,9 @@ struct QueryField g_fields[] = {
|
||||
|
||||
{"gbpermalink",FIELD_GBPERMALINK,false,""},
|
||||
//{"gbcsenum",FIELD_GBCSENUM,false,""},
|
||||
{"gbparenturl", FIELD_GBPARENTURL, true,"Match the json urls that were extract from this parent url. Example: gbparenturl:www.gigablast.com/addurl.htm"},
|
||||
{"gbparenturl", FIELD_GBPARENTURL, true,"Match the json urls that "
|
||||
"were extract from this parent url. Example: "
|
||||
"gbparenturl:www.gigablast.com/addurl.htm"},
|
||||
{"gbdocid",FIELD_GBDOCID,false,"restrict results to this docid"}
|
||||
|
||||
};
|
||||
|
8
Query.h
8
Query.h
@ -110,6 +110,12 @@ typedef unsigned long long qvec_t;
|
||||
#define FIELD_GBNUMBERMAX 57
|
||||
#define FIELD_GBPARENTURL 58
|
||||
|
||||
#define FIELD_GBSORTBYINT 59
|
||||
#define FIELD_GBREVSORTBYINT 60
|
||||
#define FIELD_GBNUMBERMININT 61
|
||||
#define FIELD_GBNUMBERMAXINT 62
|
||||
|
||||
|
||||
#define FIELD_GBOTHER 92
|
||||
|
||||
// returns a FIELD_* code above, or FIELD_GENERIC if not in the list
|
||||
@ -365,6 +371,8 @@ class QueryWord {
|
||||
|
||||
// for min/max score ranges like gbmin:price:1.99
|
||||
float m_float;
|
||||
// for gbminint:count:99 etc.; uses integers instead of floats for better resolution
|
||||
long m_int;
|
||||
};
|
||||
|
||||
// . we filter the QueryWords and turn them into QueryTerms
|
||||
|
@ -220,6 +220,15 @@ bool SafeBuf::pushFloat ( float i) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SafeBuf::pushDouble ( double i) {
|
||||
if ( m_length + (long)sizeof(double) > m_capacity )
|
||||
if(!reserve(sizeof(double)))
|
||||
return false;
|
||||
*(double *)(m_buf+m_length) = i;
|
||||
m_length += sizeof(double);
|
||||
return true;
|
||||
}
|
||||
|
||||
long SafeBuf::popLong ( ) {
|
||||
if ( m_length < 4 ) { char *xx=NULL;*xx=0; }
|
||||
long ret = *(long *)(m_buf+m_length-4);
|
||||
|
@ -306,6 +306,7 @@ struct SafeBuf {
|
||||
bool pushLong (long i);
|
||||
bool pushLongLong (long long i);
|
||||
bool pushFloat (float i);
|
||||
bool pushDouble (double i);
|
||||
long popLong();
|
||||
float popFloat();
|
||||
|
||||
|
52
TopTree.cpp
52
TopTree.cpp
@ -36,6 +36,7 @@ TopTree::~TopTree() { reset(); }
|
||||
void TopTree::reset ( ) {
|
||||
if ( m_nodes ) mfree(m_nodes,m_allocSize,"TopTree");
|
||||
m_nodes = NULL;
|
||||
m_useIntScores = false;
|
||||
//m_sampleVectors = NULL;
|
||||
m_numNodes = 0;
|
||||
m_numUsedNodes = 0;
|
||||
@ -200,9 +201,18 @@ bool TopTree::addNode ( TopNode *t , long tnn ) {
|
||||
if ( m_vcount >= m_docsWanted ) {
|
||||
long i = m_lowNode;
|
||||
|
||||
if ( t->m_score < m_nodes[i].m_score ) {
|
||||
m_kickedOutDocIds = true; return false; }
|
||||
if ( t->m_score > m_nodes[i].m_score ) goto addIt;
|
||||
if ( m_useIntScores ) {
|
||||
if ( t->m_intScore < m_nodes[i].m_intScore ) {
|
||||
m_kickedOutDocIds = true; return false; }
|
||||
if ( t->m_intScore > m_nodes[i].m_intScore) goto addIt;
|
||||
}
|
||||
|
||||
else {
|
||||
if ( t->m_score < m_nodes[i].m_score ) {
|
||||
m_kickedOutDocIds = true; return false; }
|
||||
if ( t->m_score > m_nodes[i].m_score ) goto addIt;
|
||||
}
|
||||
|
||||
// . finally, compare docids, store lower ones first
|
||||
// . docids should not tie...
|
||||
if ( t->m_docId >= m_nodes[i].m_docId ) {
|
||||
@ -243,11 +253,23 @@ bool TopTree::addNode ( TopNode *t , long tnn ) {
|
||||
// . if a node exists with our key then do NOT replace it
|
||||
else while ( i >= 0 ) {
|
||||
iparent = i;
|
||||
|
||||
// . compare to the ith node
|
||||
if ( t->m_score < m_nodes[i].m_score ) {
|
||||
i = LEFT(i); dir = 0; continue; }
|
||||
if ( t->m_score > m_nodes[i].m_score ) {
|
||||
i = RIGHT(i); dir = 1; continue; }
|
||||
if ( m_useIntScores ) {
|
||||
if ( t->m_intScore < m_nodes[i].m_intScore ) {
|
||||
i = LEFT(i); dir = 0; continue; }
|
||||
if ( t->m_intScore > m_nodes[i].m_intScore ) {
|
||||
i = RIGHT(i); dir = 1; continue; }
|
||||
|
||||
}
|
||||
else {
|
||||
if ( t->m_score < m_nodes[i].m_score ) {
|
||||
i = LEFT(i); dir = 0; continue; }
|
||||
if ( t->m_score > m_nodes[i].m_score ) {
|
||||
i = RIGHT(i); dir = 1; continue; }
|
||||
}
|
||||
|
||||
|
||||
// . finally, compare docids, store lower ones first
|
||||
// . docids should not tie...
|
||||
if ( t->m_docId > m_nodes[i].m_docId ) {
|
||||
@ -293,7 +315,13 @@ bool TopTree::addNode ( TopNode *t , long tnn ) {
|
||||
// . WARNING: if t->m_score is fractional, the fraction will be
|
||||
// dropped and could result in the lower scoring of the two docids
|
||||
// being kept.
|
||||
uint32_t cs = ((uint32_t)t->m_score);
|
||||
uint32_t cs ;
|
||||
|
||||
if ( m_useIntScores )
|
||||
cs = (uint32_t) t->m_intScore;
|
||||
else
|
||||
cs = ((uint32_t)t->m_score);
|
||||
|
||||
key_t k;
|
||||
k.n1 = domHash << 24; // 1 byte domHash
|
||||
//k.n1 |= (t->m_bscore & ~0xc0) << 16; // 1 byte bscore
|
||||
@ -421,7 +449,13 @@ bool TopTree::addNode ( TopNode *t , long tnn ) {
|
||||
// WARNING: if t->m_score is fractional, the fraction will be
|
||||
// dropped and could result in the lower scoring of the two
|
||||
// docids being kept.
|
||||
uint32_t cs = ((uint32_t)t->m_score);
|
||||
uint32_t cs ;
|
||||
|
||||
if ( m_useIntScores )
|
||||
cs = (uint32_t) t->m_intScore;
|
||||
else
|
||||
cs = ((uint32_t)t->m_score);
|
||||
|
||||
k.n1 = domHash2 << 24; // 1 byte domHash
|
||||
//k.n1 |= (t->m_bscore & ~0xc0) << 16; // 1 byte bscore
|
||||
k.n1 |= cs >> 16; // 4 byte score
|
||||
|
@ -30,6 +30,10 @@ class TopNode {
|
||||
//unsigned char m_tier ;
|
||||
float m_score ;
|
||||
long long m_docId;
|
||||
|
||||
// option for using int scores
|
||||
long m_intScore;
|
||||
|
||||
// clustering info
|
||||
//long m_kid ; // result from our same site below us
|
||||
//unsigned long m_siteHash ;
|
||||
@ -124,6 +128,7 @@ class TopTree {
|
||||
long m_cap ;
|
||||
float m_partial ;
|
||||
bool m_doSiteClustering;
|
||||
bool m_useIntScores;
|
||||
long m_docsWanted;
|
||||
long m_ridiculousMax;
|
||||
char m_kickedOutDocIds;
|
||||
|
124
XmlDoc.cpp
124
XmlDoc.cpp
@ -29871,6 +29871,23 @@ bool XmlDoc::hashNumber ( char *beginBuf ,
|
||||
if ( ! hashNumber2 ( f , hi , "gbrevsortby" ) )
|
||||
return false;
|
||||
|
||||
//
|
||||
// also hash as an int, 4 byte-integer so our lastSpidered timestamps
|
||||
// dont lose 128 seconds of resolution
|
||||
//
|
||||
|
||||
long i = (long) atoll2 ( p , bufEnd - p );
|
||||
|
||||
if ( ! hashNumber3 ( i , hi , "gbsortbyint" ) )
|
||||
return false;
|
||||
|
||||
// also hash in reverse order for sorting from low to high
|
||||
i = -1 * i;
|
||||
|
||||
if ( ! hashNumber3 ( i , hi , "gbrevsortbyint" ) )
|
||||
return false;
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -29979,6 +29996,113 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi , char *sortByStr ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool XmlDoc::hashNumber3 ( long n , HashInfo *hi , char *sortByStr ) {
|
||||
|
||||
// prefix is something like price. like the meta "name" or
|
||||
// the json name with dots in it like "product.info.price" or something
|
||||
long long nameHash = 0LL;
|
||||
long nameLen = 0;
|
||||
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
|
||||
if ( hi->m_prefix && nameLen )
|
||||
nameHash = hash64Lower_utf8 ( hi->m_prefix , nameLen );
|
||||
// need a prefix for hashing numbers... for now
|
||||
else { char *xx=NULL; *xx=0; }
|
||||
|
||||
// combine prefix hash with a special hash to make it unique to avoid
|
||||
// collisions. this is the "TRUE" prefix.
|
||||
long long truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
|
||||
// hash with the "TRUE" prefix
|
||||
long long ph2 = hash64 ( nameHash , truePrefix64 );
|
||||
|
||||
// . now store it
|
||||
// . use field hash as the termid. normally this would just be
|
||||
// a prefix hash
|
||||
// . use mostly fake value otherwise
|
||||
key144_t k;
|
||||
g_posdb.makeKey ( &k ,
|
||||
ph2 ,
|
||||
0,//docid
|
||||
0,// word pos #
|
||||
0,// densityRank , // 0-15
|
||||
0 , // MAXDIVERSITYRANK
|
||||
0 , // wordSpamRank ,
|
||||
0 , //siterank
|
||||
0 , // hashGroup,
|
||||
// we set to docLang final hash loop
|
||||
//langUnknown, // langid
|
||||
// unless already set. so set to english here
|
||||
// so it will not be set to something else
|
||||
// otherwise our floats would be ordered by langid!
|
||||
// somehow we have to indicate that this is a float
|
||||
// termlist so it will not be mangled any more.
|
||||
//langEnglish,
|
||||
langUnknown,
|
||||
0 , // multiplier
|
||||
false, // syn?
|
||||
false , // delkey?
|
||||
hi->m_shardByTermId );
|
||||
|
||||
//long long final = hash64n("products.offerprice",0);
|
||||
//long long prefix = hash64n("gbsortby",0);
|
||||
//long long h64 = hash64 ( final , prefix);
|
||||
//if ( ph2 == h64 )
|
||||
// log("hey: got offer price");
|
||||
|
||||
// now set the float in that key
|
||||
//g_posdb.setFloat ( &k , f );
|
||||
g_posdb.setInt ( &k , n );
|
||||
|
||||
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
|
||||
// so that we can b-step into a posdb list and make sure
|
||||
// we are aligned on a 6 byte or 12 byte key, since they come
|
||||
// in both sizes. but for this, hack it off to tell
|
||||
// addTable144() that we are a special posdb key, a "numeric"
|
||||
// key that has a float stored in it. then it will NOT
|
||||
// set the siterank and langid bits which throw our sorting
|
||||
// off!!
|
||||
g_posdb.setAlignmentBit ( &k , 0 );
|
||||
|
||||
// sanity
|
||||
//float t = g_posdb.getFloat ( &k );
|
||||
long x = g_posdb.getInt ( &k );
|
||||
if ( x != n ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
HashTableX *dt = hi->m_tt;
|
||||
|
||||
// the key may indeed collide, but that's ok for this application
|
||||
if ( ! dt->addTerm144 ( &k ) )
|
||||
return false;
|
||||
|
||||
if ( ! m_wts )
|
||||
return true;
|
||||
|
||||
// store in buffer
|
||||
char buf[128];
|
||||
long bufLen = sprintf(buf,"%li",n);
|
||||
|
||||
// add to wts for PageParser.cpp display
|
||||
// store it
|
||||
if ( ! storeTerm ( buf,
|
||||
bufLen,
|
||||
truePrefix64,
|
||||
hi,
|
||||
0, // word#, i,
|
||||
0, // wordPos
|
||||
0,// densityRank , // 0-15
|
||||
0, // MAXDIVERSITYRANK,//phrase
|
||||
0, // ws,
|
||||
0, // hashGroup,
|
||||
//true,
|
||||
&m_wbuf,
|
||||
m_wts,
|
||||
// a hack for display in wts:
|
||||
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
|
||||
langUnknown ) )
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// . many many websites got hijacked pages in them...
|
||||
// . revkim.org/mcdrt/mgntf/sata/sata.htm
|
||||
// . collegefootballweekly.net/hswsj/riime/sata/sata.htm
|
||||
|
Loading…
Reference in New Issue
Block a user