open-source-search-engine/matches2.cpp
mwells 92413001fb dirty word detector revisions. we need
a word-based function, isDirtyWord()
which IDs single words, bigrams and trigrams.
it'll be much faster than the current approach
and won't slow down when the list of dirty
words gets big. we then need isDirtyUrl() to
use the logic in Speller.cpp to split
a url up into composing words to run through
isDirtyWord().
2013-10-16 20:19:49 -07:00

518 lines
15 KiB
C++

#include "gb-include.h"
#include "matches2.h"
#include "Unicode.h"
#include "Titledb.h"
#include "HashTableT.h"
// Cache of precomputed quick tables; per the original note the key is meant
// to be just the needles ptr.
// Previous declaration, kept for reference:
//static HashTableT<unsigned long long , char*> s_quickTables;
// NOTE(review): no live code in the visible part of this file uses this
// table -- confirm against the rest of the file before removing it.
static HashTableX s_quickTables;
/*
// returns false and sets g_errno on error
bool fast_highlight ( // highlight these query terms:
Query *q ,
// highlight query terms in this string
char *stringToHighlight ,
// store highlighted string here:
SafeBuf *dstBuf ) {
// make them into needles first
SafeBuf needles;
long need = q->m_numTerms * sizeof(Needle);
if ( ! needles.reserve(need) ) return false;
char *p = needles.getBufStart();
for ( long i = 0 ; i < q->m_numTerms ; i++ ) {
QueryTerm *qt = &q->m_qterms[i];
Needle *ne = (Needle *)p;
p += sizeof(Needle);
ne->m_string = qt->m_term;
ne->m_stringSize = qt->m_termLen;
ne->m_id = i;
ne->m_isSection = false;
}
Needle *nbuf = needles.getBufStart();
getMatches2 ( nbuf ,
q->m_numTerms,
stringToHighlight,
gbstrlen(stringToHighlight),
NULL, // linkpos
&needleNum,
false, // stopatfirstmatch?
NULL, // hadprematch?
true,// save quick tables?
niceness );
}
*/
// . get the first substring in "haystack" that matches a string in "needles"
// . set *needleNum to the # of the needle in "needles" that matched
// . return a ptr into "haystack" that matches, NULL if none
// . if "linkPos" is non-NULL then certain types of matches must occur
// BEFORE "linkPos" in order to be counted. those certain types are indicated
// by a Needle::m_section of 1. this is for allowing links before comment
// sections to be able to vote, and links in/after comment section to not
// vote. (see LinkText.cpp's call to getMatch() to better understand this)
// . ALL needles.m_strings MUST BE IN LOWER CASE!! otherwise, we should convert
// to lower and store into tmp[]. TODO.
// . a space (includes \r \n) in a needle will match a consecutive sequence
// of spaces in the haystack
#define BITVEC unsigned long long
char *getMatches2 ( Needle *needles ,
long numNeedles ,
char *haystack ,
long haystackSize ,
char *linkPos ,
long *needleNum ,
bool stopAtFirstMatch ,
bool *hadPreMatch ,
bool saveQuickTables ,
long niceness ) {
// assume not
if ( hadPreMatch ) *hadPreMatch = false;
// empty haystack? then no matches
if ( ! haystack || haystackSize <= 0 ) return NULL;
// JAB: no needles? then no matches
if ( ! needles || numNeedles <= 0 ) return NULL;
//char tmp[8192];
//char *t = tmp;
//char *tend = tmp + 8192;
// reset counts to 0
//if ( ! stopAtFirstMatch )
// for ( long i=0 ; i < numNeedles ; i++ )
// needles[i].m_count = 0;
// are we responsible for init'ing string lengths? this is much
// faster than having to specify lengths manually.
for ( long i=0 ; i < numNeedles; i++ ) {
// breathe
QUICKPOLL(niceness);
// clear
needles[i].m_count = 0;
needles[i].m_firstMatch = NULL;
// set the string size in bytes if not provided
if ( needles[i].m_stringSize == 0 )
needles[i].m_stringSize = gbstrlen(needles[i].m_string);
}
// . set up the quick tables.
// . utf16 is not as effective here because half the bytes are zeroes!
// . TODO: use a static cache of like 4 of these tables where the key
// is the Needles ptr ... done
long numNeedlesToInit = numNeedles;
char space[256 * 4 * sizeof(BITVEC)];
char *buf = NULL;
BITVEC *s0;
BITVEC *s1;
BITVEC *s2;
BITVEC *s3;
/*
static bool s_quickTableInit = false;
static char s_qtbuf[128*(12+1)*2];
long slot = -1;
if(saveQuickTables) {
if ( ! s_quickTableInit ) {
s_quickTableInit = true;
s_quickTables.set(8,4,128,s_qtbuf,256*13,false,0,"qx");
}
uint64_t key = (uint32_t)needles;
slot = s_quickTables.getSlot(&key);
if ( slot >= 0 ) {
buf = s_quickTables.getValueFromSlot(slot);
numNeedlesToInit = 0;
}
}
*/
if(!buf) {
buf = space;
memset ( buf , 0 , sizeof(BITVEC)*256*4);
}
/*
if( useQuickTables && slot == -1 ) {
//buf = (char*)mcalloc(sizeof(unsigned long)*256*5,
// "matches");
if(buf) s_quickTables.addKey(&key, &buf);
//sanity check, no reason why there needs to be a
//limit, I just don't expect there to be this many
//static needles at this point.
if(s_quickTables.getNumSlotsUsed() > 32){
char *xx=NULL; *xx = 0;
}
}
*/
// try 64 bit bit vectors now since we doubled # of needles
long offset = 0;
s0 = (BITVEC *)(buf + offset);
offset += sizeof(BITVEC)*256;
s1 = (BITVEC *)(buf + offset);
offset += sizeof(BITVEC)*256;
s2 = (BITVEC *)(buf + offset);
offset += sizeof(BITVEC)*256;
s3 = (BITVEC *)(buf + offset);
offset += sizeof(BITVEC)*256;
BITVEC mask;
// set the letter tables, s0[] through sN[], for each needle
for ( long i = 0 ; i < numNeedlesToInit ; i++ ) {
// breathe
QUICKPOLL(niceness);
unsigned char *w = (unsigned char *)needles[i].m_string;
unsigned char *wend = w + needles[i].m_stringSize;
// BITVEC is now 64 bits
mask = (1<<(i&0x3f)); // (1<<(i%64));
// if the needle is small, fill up the remaining letter tables
// with its mask... so it matches any character in haystack.
s0[(unsigned char)to_lower_a(*w)] |= mask;
s0[(unsigned char)to_upper_a(*w)] |= mask;
w += 1;//step;
if ( w >= wend ) {
for ( long j = 0 ; j < 256 ; j++ ) {
s1[j] |= mask;
s2[j] |= mask;
s3[j] |= mask;
}
continue;
}
s1[(unsigned char)to_lower_a(*w)] |= mask;
s1[(unsigned char)to_upper_a(*w)] |= mask;
w += 1;//step;
if ( w >= wend ) {
for ( long j = 0 ; j < 256 ; j++ ) {
s2[j] |= mask;
s3[j] |= mask;
}
continue;
}
s2[(unsigned char)to_lower_a(*w)] |= mask;
s2[(unsigned char)to_upper_a(*w)] |= mask;
w += 1;//step;
if ( w >= wend ) {
for ( long j = 0 ; j < 256 ; j++ ) {
s3[j] |= mask;
}
continue;
}
s3[(unsigned char)to_lower_a(*w)] |= mask;
s3[(unsigned char)to_upper_a(*w)] |= mask;
w += 1;//step;
}
// return a ptr to the first match if we should, this is it
char *retVal = NULL;
// debug vars
//long debugCount = 0;
//long pp = 0;
// now find the first needle in the haystack
unsigned char *p = (unsigned char *)haystack;
unsigned char *pend = (unsigned char *)haystack + haystackSize;
char *dend = (char *)pend;
// do not breach!
pend -= 4;
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL(niceness);
//if ( (char *)p - (char *)haystack >= 12508 )
// log("hey");
// analytics...
// is this a possible match? (this should be VERY fast)
mask = s0[*(p+0)];
if ( ! mask ) continue;
mask &= s1[*(p+1)];
if ( ! mask ) continue;
mask &= s2[*(p+2)];
if ( ! mask ) continue;
mask &= s3[*(p+3)];
if ( ! mask ) continue;
//debugCount++;
/*
// display
char oo[148];
char *xx ;
xx = oo;
//memcpy ( xx , p , 8 );
for ( long k = 0 ; k < 5 ; k++ ) {
*xx++ = p[k];
}
memcpy ( xx , "..." , 3 );
xx += 3;
*/
//
// XXX: do a hashtable lookup here so we have the candidate
// matches in a chain...
// XXX: for small needles which match frequently let's have
// a single char hash table, a 2 byte char hash table,
// etc. so if we have small needles we check the hash
// in those tables first, but only if mask & SMALL_NEEDLE
// is true! the single byte needle hash table can just
// be a lookup table. just XOR the bytes together for
// the hash.
// XXX: just hash the mask into a table to get candidate
// matches in a chain? but there's 4B hashes!!
// we got a good candidate, loop through all the needles
for ( long j = 0 ; j < numNeedles ; j++ ) {
// skip if does not match mask, will save time
if ( ! ((1<<(j&0x3f)) & mask) ) continue;
if( needles[j].m_stringSize > 3) {
// ensure first 4 bytes matches this needle's
if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
continue;
if (needles[j].m_string[1]!=to_lower_a(*(p+1)))
continue;
if (needles[j].m_string[2]!=to_lower_a(*(p+2)))
continue;
if (needles[j].m_string[3]!=to_lower_a(*(p+3)))
continue;
}
// get needle size
long msize = needles[j].m_stringSize;
// can p possibly be big enough?
if ( pend - p < msize ) continue;
// needle is "m" now
char *m = needles[j].m_string;
char *mend = needles[j].m_stringSize + m;
// use a tmp ptr for ptr into haystack
char *d = (char *)p;
// skip first 4 bytes since we know they match
if(msize > 3) {
d += 4;
m += 4;
}
// loop over each char in "m"
//for ( ; *m ; m++ ) {
for ( ; m < mend ; m++ ) {
//while ( ! *d && d < dend ) d++;
//while ( ! *m && m < mend ) m++;
// if we are a non alnum, that will match
// any string of non-alnums, like a space
// for instance. the 0 byte does not count
// because it is used in utf16 a lot. this
// may trigger some false matches in utf16
// but, oh well... this way "link partner"
// will match "link - partner" in the haystk
if ( is_wspace_a(*m) && m < mend ) {
// skip all in "d" then.
while (d<dend&&is_wspace_a(*d)) d++;
// advance m then
continue;
}
// make sure we match otherwise
if ( *m != to_lower_a(*d) ) break;
// ok, we matched, go to next
d++;
}
// if not null, keep going
if ( m < mend ) continue;
// if this needle is "special" AND it occurs AFTER
// linkPos, then do not consider it a match. this is
// if we have a comment section indicator, like
// "div id=\"comment" AND it occurs AFTER linkPos
// (the char ptr to our link in the haystack) then
// the match does not count.
if ( linkPos && needles[j].m_isSection &&
(char *)p>linkPos ) {
// record this for LinkText.cpp
if ( hadPreMatch ) *hadPreMatch = true;
continue;
}
// store ptr if NULL
if ( ! needles[j].m_firstMatch )
needles[j].m_firstMatch = (char *)p;
// return ptr to needle in "haystack"
if ( stopAtFirstMatch ) {
// ok, we got a match
if ( needleNum ) *needleNum = j;
//return (char *)p;
retVal = (char *)p;
p = pend;
break;
}
// otherwise, just count it
needles[j].m_count++;
// see if we match another needle, fixes bug
// of matching "anal" but not "analy[tics]"
continue;
// advance to next char in the haystack
break;
}
// ok, we did not match any needles, advance p and try again
}
//
// HACK:
//
// repeat above loop but for the last 4 characters in haystack!!
// this fixes a electric fence mem breach core
//
// it is slower because we check for \0
//
pend += 4;
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL(niceness);
//if ( (char *)p - (char *)haystack >= 12508 )
// log("hey");
// is this a possible match? (this should be VERY fast)
mask = s0[*(p+0)];
if ( ! mask ) continue;
if ( p+1 < pend ) {
mask &= s1[*(p+1)];
if ( ! mask ) continue;
}
if ( p+2 < pend ) {
mask &= s2[*(p+2)];
if ( ! mask ) continue;
}
if ( p+3 < pend ) {
mask &= s3[*(p+3)];
if ( ! mask ) continue;
}
//debugCount++;
/*
// display
char oo[148];
char *xx ;
xx = oo;
//memcpy ( xx , p , 8 );
for ( long k = 0 ; k < 5 ; k++ ) {
*xx++ = p[k];
}
memcpy ( xx , "..." , 3 );
xx += 3;
*/
//
// XXX: do a hashtable lookup here so we have the candidate
// matches in a chain...
// XXX: for small needles which match frequently let's have
// a single char hash table, a 2 byte char hash table,
// etc. so if we have small needles we check the hash
// in those tables first, but only if mask & SMALL_NEEDLE
// is true! the single byte needle hash table can just
// be a lookup table. just XOR the bytes together for
// the hash.
// XXX: just hash the mask into a table to get candidate
// matches in a chain? but there's 4B hashes!!
// we got a good candidate, loop through all the needles
for ( long j = 0 ; j < numNeedles ; j++ ) {
// skip if does not match mask, will save time
if ( ! ((1<<(j&0x3f)) & mask) ) continue;
if( needles[j].m_stringSize > 3) {
// ensure first 4 bytes matches this needle's
if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
continue;
if (!p[1] ||
needles[j].m_string[1]!=to_lower_a(*(p+1)))
continue;
if (!p[2] ||
needles[j].m_string[2]!=to_lower_a(*(p+2)))
continue;
if (!p[3] ||
needles[j].m_string[3]!=to_lower_a(*(p+3)))
continue;
}
// get needle size
long msize = needles[j].m_stringSize;
// can p possibly be big enough?
if ( pend - p < msize ) continue;
// needle is "m" now
char *m = needles[j].m_string;
char *mend = needles[j].m_stringSize + m;
// use a tmp ptr for ptr into haystack
char *d = (char *)p;
// skip first 4 bytes since we know they match
if(msize > 3) {
d += 4;
m += 4;
}
// loop over each char in "m"
//for ( ; *m ; m++ ) {
for ( ; m < mend ; m++ ) {
//while ( ! *d && d < dend ) d++;
//while ( ! *m && m < mend ) m++;
// if we are a non alnum, that will match
// any string of non-alnums, like a space
// for instance. the 0 byte does not count
// because it is used in utf16 a lot. this
// may trigger some false matches in utf16
// but, oh well... this way "link partner"
// will match "link - partner" in the haystk
if ( is_wspace_a(*m) && m < mend ) {
// skip all in "d" then.
while (d<dend&&is_wspace_a(*d)) d++;
// advance m then
continue;
}
// make sure we match otherwise
if ( *m != to_lower_a(*d) ) break;
// ok, we matched, go to next
d++;
}
// if not null, keep going
if ( m < mend ) continue;
// if this needle is "special" AND it occurs AFTER
// linkPos, then do not consider it a match. this is
// if we have a comment section indicator, like
// "div id=\"comment" AND it occurs AFTER linkPos
// (the char ptr to our link in the haystack) then
// the match does not count.
if ( linkPos && needles[j].m_isSection &&
(char *)p>linkPos ) {
// record this for LinkText.cpp
if ( hadPreMatch ) *hadPreMatch = true;
continue;
}
// store ptr if NULL
if ( ! needles[j].m_firstMatch )
needles[j].m_firstMatch = (char *)p;
// return ptr to needle in "haystack"
if ( stopAtFirstMatch ) {
// ok, we got a match
if ( needleNum ) *needleNum = j;
//return (char *)p;
retVal = (char *)p;
p = pend;
break;
}
// otherwise, just count it
needles[j].m_count++;
// advance to next char in the haystack
break;
}
// ok, we did not match any needles, advance p and try again
}
//if ( debugCount > 0 ) pp = haystackSize / debugCount;
//log("build: debug count = %li uc=%li hsize=%li "
// "1 in %li chars matches.",
// debugCount,(long)isHaystackUtf16,haystackSize,pp);
// before we exit, clean up
return retVal;
}