open-source-search-engine/matches2.cpp

#include "gb-include.h"

#include "matches2.h"
#include "Unicode.h"
#include "Titledb.h"
#include "HashTableT.h"

//make the key, it is just the needles ptr
//static HashTableT<unsigned long long , char*> s_quickTables;
static HashTableX s_quickTables;

/*
// returns false and sets g_errno on error
bool fast_highlight ( // highlight these query terms:
		      Query *q ,
		      // highlight query terms in this string
		      char *stringToHighlight ,
		      // store highlighted string here:
		      SafeBuf *dstBuf ) {


	// make them into needles first
	SafeBuf needles;
	long need = q->m_numTerms * sizeof(Needle);
	if ( ! needles.reserve(need) ) return false;

	char *p = needles.getBufStart();
	for ( long i = 0 ; i < q->m_numTerms ; i++ ) {
		QueryTerm *qt = &q->m_qterms[i];
		Needle *ne = (Needle *)p;
		p += sizeof(Needle);
		ne->m_string = qt->m_term;
		ne->m_stringSize = qt->m_termLen;
		ne->m_id = i;
		ne->m_isSection = false;
	}

	Needle *nbuf = needles.getBufStart();

	getMatches2 ( nbuf ,
		      q->m_numTerms,
		      stringToHighlight,
		      gbstrlen(stringToHighlight),
		      NULL, // linkpos
		      &needleNum,
		      false, // stopatfirstmatch?
		      NULL, // hadprematch?
		      true,// save quick tables?
		      niceness );

}
*/


// . get the first substring in "haystack" that matches a string in "needles"
// . set *needleNum to the # of the needle in "needles" that matched
// . return a ptr into "haystack" that matches, NULL if none
// . if "linkPos" is non-NULL then certain types of matches must occur
//   BEFORE "linkPos" in order to be counted. those certain types are indicated
//   by a Needle::m_section of 1. this is for allowing links before comment
//   sections to be able to vote, and links in/after comment section to not
//   vote. (see LinkText.cpp's call to getMatch() to better understand this)
// . ALL needles.m_strings MUST BE IN LOWER CASE!! otherwise, we should convert
//   to lower and store into tmp[]. TODO.
// . a space (includes \r \n) in a needle will match a consecutive sequence
//   of spaces in the haystack

#define BITVEC unsigned long long

char *getMatches2 ( Needle *needles          ,
		    long    numNeedles       ,
		    char   *haystack         ,
		    long    haystackSize     ,
		    char   *linkPos          ,
		    long   *needleNum        ,
		    bool    stopAtFirstMatch ,
		    bool   *hadPreMatch      ,
		    bool    saveQuickTables  ,
		    long    niceness         ) {

	// assume not
	if ( hadPreMatch ) *hadPreMatch = false;
	// empty haystack? then no matches
	if ( ! haystack || haystackSize <= 0 ) return NULL;
	// JAB: no needles? then no matches
	if ( ! needles  || numNeedles   <= 0 ) return NULL;

	//char tmp[8192];
	//char *t    = tmp;
	//char *tend = tmp + 8192;

	// reset counts to 0
	//if ( ! stopAtFirstMatch )
	//	for ( long i=0 ; i < numNeedles ; i++ )
	//		needles[i].m_count = 0;

	// are we responsible for init'ing string lengths? this is much
	// faster than having to specify lengths manually.
	for ( long i=0 ; i < numNeedles; i++ ) {
		// breathe
		QUICKPOLL(niceness);
		// clear
		needles[i].m_count      = 0;
		needles[i].m_firstMatch = NULL;
		// set the string size in bytes if not provided
		if ( needles[i].m_stringSize == 0 )
			needles[i].m_stringSize = gbstrlen(needles[i].m_string);
	}

	// . set up the quick tables.
	// . utf16 is not as effective here because half the bytes are zeroes!
	// . TODO: use a static cache of like 4 of these tables where the key
	//         is the Needles ptr ... done
	long numNeedlesToInit = numNeedles;
	char space[256 * 4 * sizeof(BITVEC)];
	char *buf = NULL;

	BITVEC *s0;
	BITVEC *s1;
	BITVEC *s2;
	BITVEC *s3;

	/*
	static bool s_quickTableInit = false;
	static char s_qtbuf[128*(12+1)*2];

	long slot = -1;
	if(saveQuickTables) {
		if ( ! s_quickTableInit ) {
			s_quickTableInit = true;
			s_quickTables.set(8,4,128,s_qtbuf,256*13,false,0,"qx");
		}
		uint64_t key = (uint32_t)needles;
		slot = s_quickTables.getSlot(&key);
		if ( slot >= 0 ) {
			buf = s_quickTables.getValueFromSlot(slot);
			numNeedlesToInit = 0;
		}
	}
	*/

	if(!buf) {
		buf = space;
		memset ( buf , 0 , sizeof(BITVEC)*256*4);
	}

	/*
	if( useQuickTables && slot == -1 ) {
		//buf = (char*)mcalloc(sizeof(unsigned long)*256*5,
		//		     "matches");
		if(buf) s_quickTables.addKey(&key, &buf);
		//sanity check, no reason why there needs to be a
		//limit, I just don't expect there to be this many
		//static needles at this point.
		if(s_quickTables.getNumSlotsUsed() > 32){
			char *xx=NULL; *xx = 0;
		}
	}
	*/

	// try 64 bit bit vectors now since we doubled # of needles
	long offset = 0;
	s0 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;
	s1 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;
	s2 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;
	s3 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;

	BITVEC mask;

	// set the letter tables, s0[] through sN[], for each needle
	for ( long i = 0 ; i < numNeedlesToInit ; i++ ) {
		// breathe
		QUICKPOLL(niceness);
		unsigned char *w    = (unsigned char *)needles[i].m_string;
		unsigned char *wend = w + needles[i].m_stringSize;
		// BITVEC is now 64 bits
		mask = (1<<(i&0x3f)); // (1<<(i%64));
		// if the needle is small, fill up the remaining letter tables
		// with its mask... so it matches any character in haystack.
		s0[(unsigned char)to_lower_a(*w)] |= mask;
		s0[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
		if ( w >= wend ) {
			for ( long j = 0 ; j < 256 ; j++ )  {
				s1[j] |= mask;
				s2[j] |= mask;
				s3[j] |= mask;
			}
			continue;
		}

		s1[(unsigned char)to_lower_a(*w)] |= mask;
		s1[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
		if ( w >= wend ) {
			for ( long j = 0 ; j < 256 ; j++ )  {
				s2[j] |= mask;
				s3[j] |= mask;
			}
			continue;
		}

		s2[(unsigned char)to_lower_a(*w)] |= mask;
		s2[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
		if ( w >= wend ) {
			for ( long j = 0 ; j < 256 ; j++ )  {
				s3[j] |= mask;
			}
			continue;
		}

		s3[(unsigned char)to_lower_a(*w)] |= mask;
		s3[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
	}

	// return a ptr to the first match if we should, this is it
	char *retVal = NULL;
	// debug vars
	//long debugCount = 0;
	//long pp = 0;
	// now find the first needle in the haystack
	unsigned char *p    = (unsigned char *)haystack;
	unsigned char *pend = (unsigned char *)haystack + haystackSize;
	char          *dend = (char *)pend;

	// do not breach!
	pend -= 4;

	for ( ; p < pend ; p++ ) {
		// breathe
		QUICKPOLL(niceness);
		//if ( (char *)p - (char *)haystack >= 12508 )
		//	log("hey");
		// analytics...

		// is this a possible match? (this should be VERY fast)
		mask  = s0[*(p+0)];
		if ( ! mask ) continue;
		mask &= s1[*(p+1)];
		if ( ! mask ) continue;
		mask &= s2[*(p+2)];
		if ( ! mask ) continue;
		mask &= s3[*(p+3)];
		if ( ! mask ) continue;
		//debugCount++;
		/*
		// display
		char oo[148];
		char *xx ;
		xx = oo;
		//memcpy ( xx , p , 8 );
		for ( long k = 0 ; k < 5 ; k++ ) {
			*xx++ = p[k];
		}
		memcpy ( xx , "..." , 3 );
		xx += 3;
		*/
		//
		// XXX: do a hashtable lookup here so we have the candidate
		//      matches in a chain...
		// XXX: for small needles which match frequently let's have
		//      a single char hash table, a 2 byte char hash table,
		//      etc. so if we have small needles we check the hash
		//      in those tables first, but only if mask & SMALL_NEEDLE
		//      is true! the single byte needle hash table can just
		//      be a lookup table. just XOR the bytes together for
		//      the hash.
		// XXX: just hash the mask into a table to get candidate
		//      matches in a chain? but there's 4B hashes!!
		// we got a good candidate, loop through all the needles
		for ( long j = 0 ; j < numNeedles ; j++ ) {
			// skip if does not match mask, will save time
			if ( ! ((1<<(j&0x3f)) & mask) ) continue;
			if( needles[j].m_stringSize > 3) {
				// ensure first 4 bytes matches this needle's
				if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
					continue;
				if (needles[j].m_string[1]!=to_lower_a(*(p+1)))
					continue;
				if (needles[j].m_string[2]!=to_lower_a(*(p+2)))
					continue;
				if (needles[j].m_string[3]!=to_lower_a(*(p+3)))
					continue;
			}
			// get needle size
			long msize = needles[j].m_stringSize;
			// can p possibly be big enough?
			if ( pend - p < msize ) continue;
			// needle is "m" now
			char *m    = needles[j].m_string;
			char *mend = needles[j].m_stringSize + m;
			// use a tmp ptr for ptr into haystack
			char *d = (char *)p;
			// skip first 4 bytes since we know they match
			if(msize > 3) {
				d += 4;
				m += 4;
			}
			// loop over each char in "m"
			//for ( ; *m ; m++ ) {
			for ( ; m < mend ; m++ ) {
				//while ( ! *d && d < dend ) d++;
				//while ( ! *m && m < mend ) m++;
				// if we are a non alnum, that will match
				// any string of non-alnums, like a space
				// for instance. the 0 byte does not count
				// because it is used in utf16 a lot. this
				// may trigger some false matches in utf16
				// but, oh well... this way "link partner"
				// will match "link  - partner" in the haystk
				if ( is_wspace_a(*m) && m < mend ) {
					// skip all in "d" then.
					while (d<dend&&is_wspace_a(*d)) d++;
					// advance m then
					continue;
				}
				// make sure we match otherwise
				if ( *m != to_lower_a(*d) ) break;
				// ok, we matched, go to next
				d++;
			}
			// if not null, keep going
			if ( m < mend ) continue;
			// if this needle is "special" AND it occurs AFTER
			// linkPos, then do not consider it a match. this is
			// if we have a comment section indicator, like
			// "div id=\"comment" AND it occurs AFTER linkPos
			// (the char ptr to our link in the haystack) then
			// the match does not count.
			if ( linkPos && needles[j].m_isSection &&
			     (char *)p>linkPos ) {
				// record this for LinkText.cpp
				if ( hadPreMatch ) *hadPreMatch = true;
				continue;
			}
			// store ptr if NULL
			if ( ! needles[j].m_firstMatch )
				needles[j].m_firstMatch = (char *)p;
			// return ptr to needle in "haystack"
			if ( stopAtFirstMatch ) {
				// ok, we got a match
				if ( needleNum ) *needleNum = j;
				//return (char *)p;
				retVal = (char *)p;
				p = pend;
				break;
			}
			// otherwise, just count it
			needles[j].m_count++;
			// see if we match another needle, fixes bug
			// of matching "anal" but not "analy[tics]"
			continue;
			// advance to next char in the haystack
			break;
		}
		// ok, we did not match any needles, advance p and try again
	}

	//
	// HACK:
	//
	// repeat above loop but for the last 4 characters in haystack!!
	// this fixes a electric fence mem breach core
	//
	// it is slower because we check for \0
	//
	pend += 4;

	for ( ; p < pend ; p++ ) {
		// breathe
		QUICKPOLL(niceness);
		//if ( (char *)p - (char *)haystack >= 12508 )
		//	log("hey");
		// is this a possible match? (this should be VERY fast)
		mask  = s0[*(p+0)];
		if ( ! mask ) continue;
		if ( p+1 < pend ) {
			mask &= s1[*(p+1)];
			if ( ! mask ) continue;
		}
		if ( p+2 < pend ) {
			mask &= s2[*(p+2)];
			if ( ! mask ) continue;
		}
		if ( p+3 < pend ) {
			mask &= s3[*(p+3)];
			if ( ! mask ) continue;
		}
		//debugCount++;
		/*
		// display
		char oo[148];
		char *xx ;
		xx = oo;
		//memcpy ( xx , p , 8 );
		for ( long k = 0 ; k < 5 ; k++ ) {
			*xx++ = p[k];
		}
		memcpy ( xx , "..." , 3 );
		xx += 3;
		*/
		//
		// XXX: do a hashtable lookup here so we have the candidate
		//      matches in a chain...
		// XXX: for small needles which match frequently let's have
		//      a single char hash table, a 2 byte char hash table,
		//      etc. so if we have small needles we check the hash
		//      in those tables first, but only if mask & SMALL_NEEDLE
		//      is true! the single byte needle hash table can just
		//      be a lookup table. just XOR the bytes together for
		//      the hash.
		// XXX: just hash the mask into a table to get candidate
		//      matches in a chain? but there's 4B hashes!!
		// we got a good candidate, loop through all the needles
		for ( long j = 0 ; j < numNeedles ; j++ ) {
			// skip if does not match mask, will save time
			if ( ! ((1<<(j&0x3f)) & mask) ) continue;
			if( needles[j].m_stringSize > 3) {
				// ensure first 4 bytes matches this needle's
				if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
					continue;
				if (!p[1] ||
				    needles[j].m_string[1]!=to_lower_a(*(p+1)))
					continue;
				if (!p[2] ||
				    needles[j].m_string[2]!=to_lower_a(*(p+2)))
					continue;
				if (!p[3] ||
				    needles[j].m_string[3]!=to_lower_a(*(p+3)))
					continue;
			}
			// get needle size
			long msize = needles[j].m_stringSize;
			// can p possibly be big enough?
			if ( pend - p < msize ) continue;
			// needle is "m" now
			char *m    = needles[j].m_string;
			char *mend = needles[j].m_stringSize + m;
			// use a tmp ptr for ptr into haystack
			char *d = (char *)p;
			// skip first 4 bytes since we know they match
			if(msize > 3) {
				d += 4;
				m += 4;
			}
			// loop over each char in "m"
			//for ( ; *m ; m++ ) {
			for ( ; m < mend ; m++ ) {
				//while ( ! *d && d < dend ) d++;
				//while ( ! *m && m < mend ) m++;
				// if we are a non alnum, that will match
				// any string of non-alnums, like a space
				// for instance. the 0 byte does not count
				// because it is used in utf16 a lot. this
				// may trigger some false matches in utf16
				// but, oh well... this way "link partner"
				// will match "link  - partner" in the haystk
				if ( is_wspace_a(*m) && m < mend ) {
					// skip all in "d" then.
					while (d<dend&&is_wspace_a(*d)) d++;
					// advance m then
					continue;
				}
				// make sure we match otherwise
				if ( *m != to_lower_a(*d) ) break;
				// ok, we matched, go to next
				d++;
			}
			// if not null, keep going
			if ( m < mend ) continue;
			// if this needle is "special" AND it occurs AFTER
			// linkPos, then do not consider it a match. this is
			// if we have a comment section indicator, like
			// "div id=\"comment" AND it occurs AFTER linkPos
			// (the char ptr to our link in the haystack) then
			// the match does not count.
			if ( linkPos && needles[j].m_isSection &&
			     (char *)p>linkPos ) {
				// record this for LinkText.cpp
				if ( hadPreMatch ) *hadPreMatch = true;
				continue;
			}
			// store ptr if NULL
			if ( ! needles[j].m_firstMatch )
				needles[j].m_firstMatch = (char *)p;
			// return ptr to needle in "haystack"
			if ( stopAtFirstMatch ) {
				// ok, we got a match
				if ( needleNum ) *needleNum = j;
				//return (char *)p;
				retVal = (char *)p;
				p = pend;
				break;
			}
			// otherwise, just count it
			needles[j].m_count++;
			// advance to next char in the haystack
			break;
		}
		// ok, we did not match any needles, advance p and try again
	}


	//if ( debugCount > 0 ) pp = haystackSize / debugCount;
	//log("build: debug count = %li uc=%li hsize=%li "
	//    "1 in %li chars matches.",
	//    debugCount,(long)isHaystackUtf16,haystackSize,pp);

	// before we exit, clean up
	return retVal;
}