mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 20:27:43 +03:00
92413001fb
a word-based function, isDirtyWord() which IDs single words, bigrams and trigrams. it'll be much faster than the current approach and won't slow down when the list of dirty words gets big. we then need isDirtyUrl() to use the logic in Speller.cpp to split a url up into composing words to run through isDirtyWord().
518 lines
15 KiB
C++
518 lines
15 KiB
C++
#include "gb-include.h"
|
|
|
|
#include "matches2.h"
|
|
#include "Unicode.h"
|
|
#include "Titledb.h"
|
|
#include "HashTableT.h"
|
|
|
|
//make the key, it is just the needles ptr
|
|
//static HashTableT<unsigned long long , char*> s_quickTables;
|
|
static HashTableX s_quickTables;
|
|
|
|
/*
|
|
// returns false and sets g_errno on error
|
|
bool fast_highlight ( // highlight these query terms:
|
|
Query *q ,
|
|
// highlight query terms in this string
|
|
char *stringToHighlight ,
|
|
// store highlighted string here:
|
|
SafeBuf *dstBuf ) {
|
|
|
|
|
|
// make them into needles first
|
|
SafeBuf needles;
|
|
long need = q->m_numTerms * sizeof(Needle);
|
|
if ( ! needles.reserve(need) ) return false;
|
|
|
|
char *p = needles.getBufStart();
|
|
for ( long i = 0 ; i < q->m_numTerms ; i++ ) {
|
|
QueryTerm *qt = &q->m_qterms[i];
|
|
Needle *ne = (Needle *)p;
|
|
p += sizeof(Needle);
|
|
ne->m_string = qt->m_term;
|
|
ne->m_stringSize = qt->m_termLen;
|
|
ne->m_id = i;
|
|
ne->m_isSection = false;
|
|
}
|
|
|
|
Needle *nbuf = needles.getBufStart();
|
|
|
|
getMatches2 ( nbuf ,
|
|
q->m_numTerms,
|
|
stringToHighlight,
|
|
gbstrlen(stringToHighlight),
|
|
NULL, // linkpos
|
|
&needleNum,
|
|
false, // stopatfirstmatch?
|
|
NULL, // hadprematch?
|
|
true,// save quick tables?
|
|
niceness );
|
|
|
|
}
|
|
*/
|
|
|
|
|
|
// . get the first substring in "haystack" that matches a string in "needles"
|
|
// . set *needleNum to the # of the needle in "needles" that matched
|
|
// . return a ptr into "haystack" that matches, NULL if none
|
|
// . if "linkPos" is non-NULL then certain types of matches must occur
|
|
// BEFORE "linkPos" in order to be counted. those certain types are indicated
|
|
// by a Needle::m_section of 1. this is for allowing links before comment
|
|
// sections to be able to vote, and links in/after comment section to not
|
|
// vote. (see LinkText.cpp's call to getMatch() to better understand this)
|
|
// . ALL needles.m_strings MUST BE IN LOWER CASE!! otherwise, we should convert
|
|
// to lower and store into tmp[]. TODO.
|
|
// . a space (includes \r \n) in a needle will match a consecutive sequence
|
|
// of spaces in the haystack
|
|
|
|
#define BITVEC unsigned long long
|
|
|
|
char *getMatches2 ( Needle *needles ,
|
|
long numNeedles ,
|
|
char *haystack ,
|
|
long haystackSize ,
|
|
char *linkPos ,
|
|
long *needleNum ,
|
|
bool stopAtFirstMatch ,
|
|
bool *hadPreMatch ,
|
|
bool saveQuickTables ,
|
|
long niceness ) {
|
|
|
|
// assume not
|
|
if ( hadPreMatch ) *hadPreMatch = false;
|
|
// empty haystack? then no matches
|
|
if ( ! haystack || haystackSize <= 0 ) return NULL;
|
|
// JAB: no needles? then no matches
|
|
if ( ! needles || numNeedles <= 0 ) return NULL;
|
|
|
|
//char tmp[8192];
|
|
//char *t = tmp;
|
|
//char *tend = tmp + 8192;
|
|
|
|
// reset counts to 0
|
|
//if ( ! stopAtFirstMatch )
|
|
// for ( long i=0 ; i < numNeedles ; i++ )
|
|
// needles[i].m_count = 0;
|
|
|
|
// are we responsible for init'ing string lengths? this is much
|
|
// faster than having to specify lengths manually.
|
|
for ( long i=0 ; i < numNeedles; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// clear
|
|
needles[i].m_count = 0;
|
|
needles[i].m_firstMatch = NULL;
|
|
// set the string size in bytes if not provided
|
|
if ( needles[i].m_stringSize == 0 )
|
|
needles[i].m_stringSize = gbstrlen(needles[i].m_string);
|
|
}
|
|
|
|
// . set up the quick tables.
|
|
// . utf16 is not as effective here because half the bytes are zeroes!
|
|
// . TODO: use a static cache of like 4 of these tables where the key
|
|
// is the Needles ptr ... done
|
|
long numNeedlesToInit = numNeedles;
|
|
char space[256 * 4 * sizeof(BITVEC)];
|
|
char *buf = NULL;
|
|
|
|
BITVEC *s0;
|
|
BITVEC *s1;
|
|
BITVEC *s2;
|
|
BITVEC *s3;
|
|
|
|
/*
|
|
static bool s_quickTableInit = false;
|
|
static char s_qtbuf[128*(12+1)*2];
|
|
|
|
long slot = -1;
|
|
if(saveQuickTables) {
|
|
if ( ! s_quickTableInit ) {
|
|
s_quickTableInit = true;
|
|
s_quickTables.set(8,4,128,s_qtbuf,256*13,false,0,"qx");
|
|
}
|
|
uint64_t key = (uint32_t)needles;
|
|
slot = s_quickTables.getSlot(&key);
|
|
if ( slot >= 0 ) {
|
|
buf = s_quickTables.getValueFromSlot(slot);
|
|
numNeedlesToInit = 0;
|
|
}
|
|
}
|
|
*/
|
|
|
|
if(!buf) {
|
|
buf = space;
|
|
memset ( buf , 0 , sizeof(BITVEC)*256*4);
|
|
}
|
|
|
|
/*
|
|
if( useQuickTables && slot == -1 ) {
|
|
//buf = (char*)mcalloc(sizeof(unsigned long)*256*5,
|
|
// "matches");
|
|
if(buf) s_quickTables.addKey(&key, &buf);
|
|
//sanity check, no reason why there needs to be a
|
|
//limit, I just don't expect there to be this many
|
|
//static needles at this point.
|
|
if(s_quickTables.getNumSlotsUsed() > 32){
|
|
char *xx=NULL; *xx = 0;
|
|
}
|
|
}
|
|
*/
|
|
|
|
// try 64 bit bit vectors now since we doubled # of needles
|
|
long offset = 0;
|
|
s0 = (BITVEC *)(buf + offset);
|
|
offset += sizeof(BITVEC)*256;
|
|
s1 = (BITVEC *)(buf + offset);
|
|
offset += sizeof(BITVEC)*256;
|
|
s2 = (BITVEC *)(buf + offset);
|
|
offset += sizeof(BITVEC)*256;
|
|
s3 = (BITVEC *)(buf + offset);
|
|
offset += sizeof(BITVEC)*256;
|
|
|
|
BITVEC mask;
|
|
|
|
// set the letter tables, s0[] through sN[], for each needle
|
|
for ( long i = 0 ; i < numNeedlesToInit ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
unsigned char *w = (unsigned char *)needles[i].m_string;
|
|
unsigned char *wend = w + needles[i].m_stringSize;
|
|
// BITVEC is now 64 bits
|
|
mask = (1<<(i&0x3f)); // (1<<(i%64));
|
|
// if the needle is small, fill up the remaining letter tables
|
|
// with its mask... so it matches any character in haystack.
|
|
s0[(unsigned char)to_lower_a(*w)] |= mask;
|
|
s0[(unsigned char)to_upper_a(*w)] |= mask;
|
|
w += 1;//step;
|
|
if ( w >= wend ) {
|
|
for ( long j = 0 ; j < 256 ; j++ ) {
|
|
s1[j] |= mask;
|
|
s2[j] |= mask;
|
|
s3[j] |= mask;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
s1[(unsigned char)to_lower_a(*w)] |= mask;
|
|
s1[(unsigned char)to_upper_a(*w)] |= mask;
|
|
w += 1;//step;
|
|
if ( w >= wend ) {
|
|
for ( long j = 0 ; j < 256 ; j++ ) {
|
|
s2[j] |= mask;
|
|
s3[j] |= mask;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
s2[(unsigned char)to_lower_a(*w)] |= mask;
|
|
s2[(unsigned char)to_upper_a(*w)] |= mask;
|
|
w += 1;//step;
|
|
if ( w >= wend ) {
|
|
for ( long j = 0 ; j < 256 ; j++ ) {
|
|
s3[j] |= mask;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
s3[(unsigned char)to_lower_a(*w)] |= mask;
|
|
s3[(unsigned char)to_upper_a(*w)] |= mask;
|
|
w += 1;//step;
|
|
}
|
|
|
|
// return a ptr to the first match if we should, this is it
|
|
char *retVal = NULL;
|
|
// debug vars
|
|
//long debugCount = 0;
|
|
//long pp = 0;
|
|
// now find the first needle in the haystack
|
|
unsigned char *p = (unsigned char *)haystack;
|
|
unsigned char *pend = (unsigned char *)haystack + haystackSize;
|
|
char *dend = (char *)pend;
|
|
|
|
// do not breach!
|
|
pend -= 4;
|
|
|
|
for ( ; p < pend ; p++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
//if ( (char *)p - (char *)haystack >= 12508 )
|
|
// log("hey");
|
|
// analytics...
|
|
|
|
// is this a possible match? (this should be VERY fast)
|
|
mask = s0[*(p+0)];
|
|
if ( ! mask ) continue;
|
|
mask &= s1[*(p+1)];
|
|
if ( ! mask ) continue;
|
|
mask &= s2[*(p+2)];
|
|
if ( ! mask ) continue;
|
|
mask &= s3[*(p+3)];
|
|
if ( ! mask ) continue;
|
|
//debugCount++;
|
|
/*
|
|
// display
|
|
char oo[148];
|
|
char *xx ;
|
|
xx = oo;
|
|
//memcpy ( xx , p , 8 );
|
|
for ( long k = 0 ; k < 5 ; k++ ) {
|
|
*xx++ = p[k];
|
|
}
|
|
memcpy ( xx , "..." , 3 );
|
|
xx += 3;
|
|
*/
|
|
//
|
|
// XXX: do a hashtable lookup here so we have the candidate
|
|
// matches in a chain...
|
|
// XXX: for small needles which match frequently let's have
|
|
// a single char hash table, a 2 byte char hash table,
|
|
// etc. so if we have small needles we check the hash
|
|
// in those tables first, but only if mask & SMALL_NEEDLE
|
|
// is true! the single byte needle hash table can just
|
|
// be a lookup table. just XOR the bytes together for
|
|
// the hash.
|
|
// XXX: just hash the mask into a table to get candidate
|
|
// matches in a chain? but there's 4B hashes!!
|
|
// we got a good candidate, loop through all the needles
|
|
for ( long j = 0 ; j < numNeedles ; j++ ) {
|
|
// skip if does not match mask, will save time
|
|
if ( ! ((1<<(j&0x3f)) & mask) ) continue;
|
|
if( needles[j].m_stringSize > 3) {
|
|
// ensure first 4 bytes matches this needle's
|
|
if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
|
|
continue;
|
|
if (needles[j].m_string[1]!=to_lower_a(*(p+1)))
|
|
continue;
|
|
if (needles[j].m_string[2]!=to_lower_a(*(p+2)))
|
|
continue;
|
|
if (needles[j].m_string[3]!=to_lower_a(*(p+3)))
|
|
continue;
|
|
}
|
|
// get needle size
|
|
long msize = needles[j].m_stringSize;
|
|
// can p possibly be big enough?
|
|
if ( pend - p < msize ) continue;
|
|
// needle is "m" now
|
|
char *m = needles[j].m_string;
|
|
char *mend = needles[j].m_stringSize + m;
|
|
// use a tmp ptr for ptr into haystack
|
|
char *d = (char *)p;
|
|
// skip first 4 bytes since we know they match
|
|
if(msize > 3) {
|
|
d += 4;
|
|
m += 4;
|
|
}
|
|
// loop over each char in "m"
|
|
//for ( ; *m ; m++ ) {
|
|
for ( ; m < mend ; m++ ) {
|
|
//while ( ! *d && d < dend ) d++;
|
|
//while ( ! *m && m < mend ) m++;
|
|
// if we are a non alnum, that will match
|
|
// any string of non-alnums, like a space
|
|
// for instance. the 0 byte does not count
|
|
// because it is used in utf16 a lot. this
|
|
// may trigger some false matches in utf16
|
|
// but, oh well... this way "link partner"
|
|
// will match "link - partner" in the haystk
|
|
if ( is_wspace_a(*m) && m < mend ) {
|
|
// skip all in "d" then.
|
|
while (d<dend&&is_wspace_a(*d)) d++;
|
|
// advance m then
|
|
continue;
|
|
}
|
|
// make sure we match otherwise
|
|
if ( *m != to_lower_a(*d) ) break;
|
|
// ok, we matched, go to next
|
|
d++;
|
|
}
|
|
// if not null, keep going
|
|
if ( m < mend ) continue;
|
|
// if this needle is "special" AND it occurs AFTER
|
|
// linkPos, then do not consider it a match. this is
|
|
// if we have a comment section indicator, like
|
|
// "div id=\"comment" AND it occurs AFTER linkPos
|
|
// (the char ptr to our link in the haystack) then
|
|
// the match does not count.
|
|
if ( linkPos && needles[j].m_isSection &&
|
|
(char *)p>linkPos ) {
|
|
// record this for LinkText.cpp
|
|
if ( hadPreMatch ) *hadPreMatch = true;
|
|
continue;
|
|
}
|
|
// store ptr if NULL
|
|
if ( ! needles[j].m_firstMatch )
|
|
needles[j].m_firstMatch = (char *)p;
|
|
// return ptr to needle in "haystack"
|
|
if ( stopAtFirstMatch ) {
|
|
// ok, we got a match
|
|
if ( needleNum ) *needleNum = j;
|
|
//return (char *)p;
|
|
retVal = (char *)p;
|
|
p = pend;
|
|
break;
|
|
}
|
|
// otherwise, just count it
|
|
needles[j].m_count++;
|
|
// see if we match another needle, fixes bug
|
|
// of matching "anal" but not "analy[tics]"
|
|
continue;
|
|
// advance to next char in the haystack
|
|
break;
|
|
}
|
|
// ok, we did not match any needles, advance p and try again
|
|
}
|
|
|
|
//
|
|
// HACK:
|
|
//
|
|
// repeat above loop but for the last 4 characters in haystack!!
|
|
// this fixes a electric fence mem breach core
|
|
//
|
|
// it is slower because we check for \0
|
|
//
|
|
pend += 4;
|
|
|
|
for ( ; p < pend ; p++ ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
//if ( (char *)p - (char *)haystack >= 12508 )
|
|
// log("hey");
|
|
// is this a possible match? (this should be VERY fast)
|
|
mask = s0[*(p+0)];
|
|
if ( ! mask ) continue;
|
|
if ( p+1 < pend ) {
|
|
mask &= s1[*(p+1)];
|
|
if ( ! mask ) continue;
|
|
}
|
|
if ( p+2 < pend ) {
|
|
mask &= s2[*(p+2)];
|
|
if ( ! mask ) continue;
|
|
}
|
|
if ( p+3 < pend ) {
|
|
mask &= s3[*(p+3)];
|
|
if ( ! mask ) continue;
|
|
}
|
|
//debugCount++;
|
|
/*
|
|
// display
|
|
char oo[148];
|
|
char *xx ;
|
|
xx = oo;
|
|
//memcpy ( xx , p , 8 );
|
|
for ( long k = 0 ; k < 5 ; k++ ) {
|
|
*xx++ = p[k];
|
|
}
|
|
memcpy ( xx , "..." , 3 );
|
|
xx += 3;
|
|
*/
|
|
//
|
|
// XXX: do a hashtable lookup here so we have the candidate
|
|
// matches in a chain...
|
|
// XXX: for small needles which match frequently let's have
|
|
// a single char hash table, a 2 byte char hash table,
|
|
// etc. so if we have small needles we check the hash
|
|
// in those tables first, but only if mask & SMALL_NEEDLE
|
|
// is true! the single byte needle hash table can just
|
|
// be a lookup table. just XOR the bytes together for
|
|
// the hash.
|
|
// XXX: just hash the mask into a table to get candidate
|
|
// matches in a chain? but there's 4B hashes!!
|
|
// we got a good candidate, loop through all the needles
|
|
for ( long j = 0 ; j < numNeedles ; j++ ) {
|
|
// skip if does not match mask, will save time
|
|
if ( ! ((1<<(j&0x3f)) & mask) ) continue;
|
|
if( needles[j].m_stringSize > 3) {
|
|
// ensure first 4 bytes matches this needle's
|
|
if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
|
|
continue;
|
|
if (!p[1] ||
|
|
needles[j].m_string[1]!=to_lower_a(*(p+1)))
|
|
continue;
|
|
if (!p[2] ||
|
|
needles[j].m_string[2]!=to_lower_a(*(p+2)))
|
|
continue;
|
|
if (!p[3] ||
|
|
needles[j].m_string[3]!=to_lower_a(*(p+3)))
|
|
continue;
|
|
}
|
|
// get needle size
|
|
long msize = needles[j].m_stringSize;
|
|
// can p possibly be big enough?
|
|
if ( pend - p < msize ) continue;
|
|
// needle is "m" now
|
|
char *m = needles[j].m_string;
|
|
char *mend = needles[j].m_stringSize + m;
|
|
// use a tmp ptr for ptr into haystack
|
|
char *d = (char *)p;
|
|
// skip first 4 bytes since we know they match
|
|
if(msize > 3) {
|
|
d += 4;
|
|
m += 4;
|
|
}
|
|
// loop over each char in "m"
|
|
//for ( ; *m ; m++ ) {
|
|
for ( ; m < mend ; m++ ) {
|
|
//while ( ! *d && d < dend ) d++;
|
|
//while ( ! *m && m < mend ) m++;
|
|
// if we are a non alnum, that will match
|
|
// any string of non-alnums, like a space
|
|
// for instance. the 0 byte does not count
|
|
// because it is used in utf16 a lot. this
|
|
// may trigger some false matches in utf16
|
|
// but, oh well... this way "link partner"
|
|
// will match "link - partner" in the haystk
|
|
if ( is_wspace_a(*m) && m < mend ) {
|
|
// skip all in "d" then.
|
|
while (d<dend&&is_wspace_a(*d)) d++;
|
|
// advance m then
|
|
continue;
|
|
}
|
|
// make sure we match otherwise
|
|
if ( *m != to_lower_a(*d) ) break;
|
|
// ok, we matched, go to next
|
|
d++;
|
|
}
|
|
// if not null, keep going
|
|
if ( m < mend ) continue;
|
|
// if this needle is "special" AND it occurs AFTER
|
|
// linkPos, then do not consider it a match. this is
|
|
// if we have a comment section indicator, like
|
|
// "div id=\"comment" AND it occurs AFTER linkPos
|
|
// (the char ptr to our link in the haystack) then
|
|
// the match does not count.
|
|
if ( linkPos && needles[j].m_isSection &&
|
|
(char *)p>linkPos ) {
|
|
// record this for LinkText.cpp
|
|
if ( hadPreMatch ) *hadPreMatch = true;
|
|
continue;
|
|
}
|
|
// store ptr if NULL
|
|
if ( ! needles[j].m_firstMatch )
|
|
needles[j].m_firstMatch = (char *)p;
|
|
// return ptr to needle in "haystack"
|
|
if ( stopAtFirstMatch ) {
|
|
// ok, we got a match
|
|
if ( needleNum ) *needleNum = j;
|
|
//return (char *)p;
|
|
retVal = (char *)p;
|
|
p = pend;
|
|
break;
|
|
}
|
|
// otherwise, just count it
|
|
needles[j].m_count++;
|
|
// advance to next char in the haystack
|
|
break;
|
|
}
|
|
// ok, we did not match any needles, advance p and try again
|
|
}
|
|
|
|
|
|
//if ( debugCount > 0 ) pp = haystackSize / debugCount;
|
|
//log("build: debug count = %li uc=%li hsize=%li "
|
|
// "1 in %li chars matches.",
|
|
// debugCount,(long)isHaystackUtf16,haystackSize,pp);
|
|
|
|
// before we exit, clean up
|
|
return retVal;
|
|
}
|