// open-source-search-engine/Bits.cpp

#include "gb-include.h"

#include "Bits.h"
#include "StopWords.h"
#include "fctypes.h"
#include "Abbreviations.h"
#include "Mem.h"

// Construct an empty Bits object that owns no buffer.
// Every member that reset(), setInLinkBits() or setInUrlBits() reads is
// given a defined value here so those methods never look at garbage,
// even if they are called before set()/setForSummary().
Bits::Bits() {
	m_bits          = NULL;
	m_swbits        = NULL;
	m_words         = NULL;
	m_bitsSize      = 0;
	m_swbitsSize    = 0;
	m_needsFree     = false;
	m_inLinkBitsSet = false;
	m_inUrlBitsSet  = false;
}

// Destructor: reset() does all the cleanup, including freeing the bit
// arrays when they were mmalloc'd.
Bits::~Bits() {
	this->reset();
}

// . return this object to its "nothing set" state
// . the bit arrays are only handed to mfree() when m_needsFree says they
//   were mmalloc'd; otherwise they point into m_localBuf or into a buffer
//   supplied by the caller and must not be freed here
void Bits::reset() {
	// decide what to free up front. the pointer is tested first so
	// m_needsFree is only consulted when there is something to free.
	const bool freeBits   = ( m_bits   && m_needsFree );
	const bool freeSwbits = ( m_swbits && m_needsFree );

	if ( freeBits   ) mfree ( m_bits   , m_bitsSize   , "Bits" );
	if ( freeSwbits ) mfree ( m_swbits , m_swbitsSize , "Bits" );

	// forget the arrays
	m_bits   = NULL;
	m_swbits = NULL;

	// the lazily computed link/url bits must be recomputed next time
	m_inLinkBitsSet = false;
	m_inUrlBitsSet  = false;
}

// . compute one wbit_t of phrasing bits for every word in "words"
// . these bits are used for phrasing and by the spam detector
// . also marks the tags/punct words that end a sentence with
//   D_BREAKS_SENTENCE, which Sections.cpp uses to carve out sentences
// . storage is m_localBuf if it is big enough, else the caller's
//   "buf"/"bufSize", else we mmalloc
// . returns false and sets errno on error
bool Bits::set ( Words *words , char titleRecVersion , long niceness ,
		 char *buf , long bufSize ) {
	// drop whatever a previous call left behind
	reset();

	// keep these around: printBits() needs the words, the helpers
	// need the rest
	m_words           = words;
	m_titleRecVersion = titleRecVersion;
	m_niceness        = niceness;

	// one wbit_t per word
	long nw    = words->getNumWords();
	long bytes = nw * sizeof(wbit_t);

	// choose the storage, cheapest option first
	m_needsFree = false;
	if ( bytes < BITS_LOCALBUFSIZE ) {
		m_bits = (wbit_t *)m_localBuf;
	}
	else if ( bytes < bufSize ) {
		m_bits = (wbit_t *)buf;
	}
	else {
		m_bitsSize  = bytes;
		m_bits      = (wbit_t *)mmalloc ( bytes , "Bits1" );
		m_needsFree = true;
	}
	if ( ! m_bits )
		return log("build: Could not allocate "
			   "Bits table used to parse words: "
			   "%s",
			   mstrerror(g_errno));

	// breathe
	QUICKPOLL ( m_niceness );

	// shortcuts into the Words arrays
	nodeid_t   *tids  = words->getTagIds();
	char      **wptrs = words->getWords();
	long long  *wids  = words->getWordIds();

	// state carried from one word to the next
	wbit_t    lastBits   = 0;     // bits of the previous word
	long long lastWid    = 0LL;   // id of the last alnum word seen
	long      numBrs     = 0;     // <br> tags since the last alnum word
	bool      inSentence = false; // seen an alnum word since last break?

	for ( long i = 0 ; i < nw ; i++ ) {
		// breathe
		QUICKPOLL ( m_niceness );

		// is word #i an html tag?
		const bool isTag = ( tids && tids[i] );

		//
		// part 1: the phrasing bits
		//
		wbit_t cur;
		if ( isTag ) {
			// strip the back bit to get the plain tag id
			nodeid_t tid = tids[i] & BACKBITCOMP;
			// most tags can be paired across...
			cur = D_CAN_PAIR_ACROSS;
			// ...but not breaking tags...
			if ( g_nodes[tid].m_isBreaking ) cur = 0;
			// ...and only the first of consecutive <br> tags
			else if ( tid == TAG_BR ) {
				if ( numBrs > 0 ) cur = 0;
				else              numBrs++;
			}
		}
		else if ( is_alnum_utf8 ( wptrs[i] ) ) {
			cur    = getAlnumBits ( i , lastBits );
			numBrs = 0;
		}
		else {
			// . any punctuation can be paired across now
			// . the curved quote in utf8 is 3 bytes long and,
			//   with a space before it, was causing issues
			cur = D_CAN_PAIR_ACROSS;
		}

		// every word may be period-preceeded so that "project S"
		// does not phrase to "projects" nor "the rapist" to
		// "therapist"
		cur |= D_CAN_PERIOD_PRECEED;

		// store, and remember for the next word
		m_bits[i] = cur;
		lastBits  = cur;

		//
		// part 2: does this tag or punct word break a sentence?
		//

		// an alnum word never breaks a sentence, it starts or
		// continues one
		if ( wids[i] ) {
			inSentence = true;
			lastWid    = wids[i];
			continue;
		}

		// nothing to break if we are not in a sentence
		if ( ! inSentence ) continue;

		if ( isTag ) {
			// non-breaking tags like <font> do not end it.
			// br tags are assumed to break sentences until
			// we can tell if the page is ms front page or not.
			if ( ! isBreakingTagId(tids[i]) ) continue;
		}
		else {
			// punctuation only breaks if it starts with a
			// sentence terminator
			const char c = wptrs[i][0];
			if ( c != '.' && c != '!' && c != '?' ) continue;
			// an alnum char right after it means it is
			// probably a hostname, ip or phone #
			if ( is_alnum_utf8(wptrs[i]+1) ) continue;
			// a period after an abbreviation does not break
			if ( c == '.' && isAbbr(lastWid) ) continue;
		}

		// this one ends the sentence
		m_bits[i] |= D_BREAKS_SENTENCE;
		inSentence = false;

		// idea (unimplemented): pick the longest line in a hard
		// section which ends in a period and contains a br tag.
		// then any line that is 80%+ of that line's number of
		// chars is also a line where the br should not terminate
		// it as a sentence. ?????
	}

	return true;
}

#include "Sections.h"

// . OR D_IN_LINK into the bits of every word in [m_a,m_b) of each
//   section whose m_baseHash is TAG_A
// . a no-op if it already ran since the last reset()
void Bits::setInLinkBits ( Sections *ss ) {
	// only do the work once
	if ( m_inLinkBitsSet ) return;
	m_inLinkBitsSet = true;

	// no sections, no links
	if ( ss->m_numSections == 0 ) return;

	// walk the section list starting at the root
	Section *sec = ss->m_rootSection;
	while ( sec ) {
		// breathe
		QUICKPOLL ( m_niceness );
		// only href sections matter
		if ( sec->m_baseHash == TAG_A ) {
			long first = sec->m_a;
			long end   = sec->m_b;
			for ( long k = first ; k < end ; k++ )
				m_bits[k] |= D_IN_LINK;
		}
		sec = sec->m_next;
	}
}

// . set D_IS_IN_URL on the words that make up a url written out in the text
// . a url is found by looking for a punct word starting with "://"; the
//   word before it (the protocol) and every following word up to the first
//   tag or word containing a space get the bit
// . a no-op if it already ran since the last reset()
// . "niceness" is passed to QUICKPOLL()
void Bits::setInUrlBits ( long niceness ) {
	if ( m_inUrlBitsSet ) return;
	m_inUrlBitsSet = true;
	// . tids may be NULL; set() and setForSummary() both guard against
	//   that, so do the same here rather than dereference it blindly
	nodeid_t *tids  = m_words->getTagIds();
	long long *wids = m_words->getWordIds();
	char **wptrs    = m_words->getWords();
	long nw = m_words->getNumWords();
	for ( long i = 0 ; i < nw; i++ ) {
		// breathe
		QUICKPOLL(niceness);
		// look for protocol separator: a punct word, not a tag,
		// beginning with "://"
		if ( wids[i] ) continue;
		if ( tids && tids[i] ) continue;
		if ( wptrs[i][0] != ':' ) continue;
		if ( wptrs[i][1] != '/' ) continue;
		if ( wptrs[i][2] != '/' ) continue;
		// need a word before it to serve as the protocol
		if ( i<= 0 ) continue;
		// scan for end of it, starting at the protocol word.
		// stop at tag or space
		long j = i - 1;
		for ( ; j < nw ; j++ ) {
			// breathe
			QUICKPOLL(niceness);
			// check if end
			if ( m_words->hasSpace(j) ) break;
			// or tag
			if ( tids && tids[j] ) break;
			// include it
			m_bits[j] |= D_IS_IN_URL;
		}
		// . resume the outer scan after the url
		// . only ever move i forward so we cannot loop forever
		if ( j > i ) i = j;
	}
}

// Debug dump: for every word, print the word followed by the names of
// its bits (see printBit()) and a newline, to stderr.
void Bits::printBits ( ) {
	long n = 0;
	while ( n < m_words->getNumWords() ) {
		m_words->printWord(n);
		fprintf(stderr," ");
		printBit(n);
		fprintf(stderr,"\n");
		n++;
	}
}

void Bits::printBit ( long i ) {
	if (m_bits[i]&D_CAN_BE_IN_PHRASE ) fprintf(stderr," canBeInPhrse");
	else                               fprintf(stderr,"             ");
	if (m_bits[i]&D_IS_STOPWORD ) fprintf(stderr," stopword");
	else                          fprintf(stderr,"         ");
	if (m_bits[i]&D_CAN_PERIOD_PRECEED)fprintf(stderr," periodCanPreceed");
	else                               fprintf(stderr,"                 ");
	//if (m_bits[i]&D_IS_INDEXABLE) fprintf(stderr," indexable");
	//else                          fprintf(stderr,"          ");
	if (m_bits[i]&D_CAN_START_PHRASE) fprintf(stderr," canStartPhrase");
	else                              fprintf(stderr,"               ");
	if (m_bits[i]&D_CAN_PAIR_ACROSS ) fprintf(stderr," canPairAcross");
	else                              fprintf(stderr,"              ");
}

// . compute the phrasing bits for alnum word #i
// . "prevBits" are the bits of the word right before it
// . non-stop words can always be in and start a phrase
// . stop words can be in a phrase and be paired across, but only start a
//   phrase in the special cases listed below
wbit_t Bits::getAlnumBits ( long i , wbit_t prevBits ) {

	char      *word = m_words->getWord    ( i );
	long       wlen = m_words->getWordLen ( i );
	long long  wid  = m_words->getWordId  ( i );

	wbit_t flags = 0;

	// this is used by Weights.cpp
	if ( is_cap_utf8 ( word , wlen ) ) flags |= D_IS_CAP;

	// this is not case sensitive -- all non-stop words can start phrases
	if ( ! ::isStopWord ( word , wlen , wid ) ) {
		flags |= D_CAN_BE_IN_PHRASE | D_CAN_START_PHRASE;
		return flags;
	}

	// it is a stop word
	flags |= D_CAN_BE_IN_PHRASE   |
		 D_CAN_PAIR_ACROSS    |
		 D_IS_STOPWORD        |
		 D_CAN_PERIOD_PRECEED ;

	// a stop word may start a phrase if, checked in this order:
	// 1. it immediately preceeds a hyphen and an alnum (i-phone)
	// 2. it is capitalized ( kick Him in the *** )
	// 3. the previous word could not be paired across
	//    ( short end.  it happened yesterday. )
	// 4. it is the first alnum word. prevBits may not be zero then
	//    if the very first word was punctuation.
	const bool canStart =
		( word[wlen]=='-' && is_alnum_utf8(word+wlen+1) ) ||
		is_upper_utf8(word)                               ||
		(prevBits & D_CAN_PAIR_ACROSS) == 0               ||
		i <= 1;

	if ( canStart ) flags |= D_CAN_START_PHRASE;

	return flags;
}

// . true if punct char "c" may NOT be paired across
// . '!' is only blocking when "bangBlocks" is true, since we do want to
//   pair across the "! " in "Yahoo! games"
static inline bool isPairBlockingPunct ( char c , bool bangBlocks ) {
	if ( c == '!' ) return bangBlocks;
	return  c == '?' || c == ';' ||
		c == '{' || c == '}' ||
		c == '<' || c == '>' ;
}

// . return D_CAN_PAIR_ACROSS if a phrase may span the punctuation word
//   "s" of "len" bytes, 0 otherwise
// . the only call to this visible in this file is commented out in set()
// . TODO: ms frontpage puts long sequences of spaces between words that
//   are next to each other
wbit_t Bits::getPunctuationBits ( char *s , long len ) {

	//
	// two bytes
	//
	if ( len == 2 ) {
		if (s[0]==',' && (s[1]=='\n' || s[1]==' '))
			return D_CAN_PAIR_ACROSS;
		if (s[0]=='/' && s[1]=='~')
			return D_CAN_PAIR_ACROSS;
		// byte size of the first utf8 char
		uint8_t cs = getUtf8CharSize ( s );
		// allow double spaces for version 6 or more
		if ( is_wspace_utf8(s) && is_wspace_utf8(s+cs) )
			return D_CAN_PAIR_ACROSS;
		// punct then space. "! " is ok here.
		if ( is_wspace_utf8(s+cs) && is_punct_utf8(s) ) {
			if ( isPairBlockingPunct ( s[0] , false ) ) return 0;
			return D_CAN_PAIR_ACROSS;
		}
		// space then punct
		if ( is_wspace_utf8(s) && is_punct_utf8(s+cs) ) {
			if ( isPairBlockingPunct ( s[cs] , true ) ) return 0;
			return D_CAN_PAIR_ACROSS;
		}
		return 0;
	}

	//
	// one byte
	//
	if ( len == 1 ) {
		if ( isPairBlockingPunct ( s[0] , true ) ) return 0;
		return D_CAN_PAIR_ACROSS;
	}

	//
	// everything else. we can pair across:
	// "://"
	// " , "
	// " - "
	// " & "
	// " + "
	// and any run of 3+ ascii spaces
	//

	// . pair across any number of spaces, it will only show up as one
	//   space in html and Microsoft Front Page separates lines by a
	//   bunch of spaces
	// . require len >= 3 so we never read past an empty word
	if ( len >= 3 &&
	     is_wspace_a(s[0]) && is_wspace_a(s[1]) && is_wspace_a(s[2]) ) {
		// keep the increment out of the is_wspace_a() argument:
		// if that is a macro, "s[k++]" could advance k more
		// than once per char
		for ( long k = 3 ; k < len ; k++ )
			if ( ! is_wspace_a(s[k]) ) return 0;
		return D_CAN_PAIR_ACROSS;
	}

	if ( len != 3 ) return 0;

	if ( s[0]==':' && s[1]=='/' && s[2]=='/' ) return D_CAN_PAIR_ACROSS;

	// a connector with a space on each side
	if ( is_wspace_a(s[0]) && is_wspace_a(s[2]) ) {
		const char mid = s[1];
		if ( mid==',' || mid=='-' || mid=='+' || mid=='&' )
			return D_CAN_PAIR_ACROSS;
	}
	return 0;
}

//
// Summary.cpp sets its own bits.
//

// . this table maps a tagId to a #define'd bit from Bits.h which describes
//   the format of the following text in the page. like bold or italics, etc.
// . indexed by tag id with the back bit masked off; 0 means the tag has
//   no associated bit
// . filled in by Bits::setForSummary(), which deliberately cores if
//   getNumXmlNodes() exceeds the 1000 entries available here
nodeid_t s_bt [ 1000 ];

// . set the summary bits (one swbit_t per word) for "words"
// . each word gets the formatting flags of the tags it is inside of
//   (title, hyperlink, bold/italics, ...), D_IN_PARENS, and alnum words
//   may get D_STARTS_FRAG, D_STARTS_SENTENCE and D_IN_QUOTES. some punct
//   words get D_IS_STRONG_CONNECTOR.
// . storage is m_localBuf if it is big enough, else the caller's
//   "buf"/"bufSize", else we mmalloc
// . returns false and sets errno on error
bool Bits::setForSummary ( Words *words , char *buf , long bufSize ) {
	// clear the mem
	reset();

	// . set our s_bt[] table, but only on the first call
	// . the flag has to be static or the table would be rebuilt
	//   on every single call
	static bool s_init = false;
	if ( ! s_init ) {
		// core if the table cannot hold every tag id
		if ( 1000 < getNumXmlNodes() ) { char *xx=NULL;*xx=0; }
		// clear table
		memset ( s_bt , 0 , 1000 * sizeof(nodeid_t) );
		// set just those that have bits #defined in Bits.h
		s_bt [ TAG_TITLE      ] = D_IN_TITLE;
		s_bt [ TAG_A          ] = D_IN_HYPERLINK;
		s_bt [ TAG_B          ] = D_IN_BOLDORITALICS;
		s_bt [ TAG_I          ] = D_IN_BOLDORITALICS;
		s_bt [ TAG_LI         ] = D_IN_LIST;
		s_bt [ TAG_SUP        ] = D_IN_SUP;
		s_bt [ TAG_P          ] = D_IN_PARAGRAPH;
		s_bt [ TAG_BLOCKQUOTE ] = D_IN_BLOCKQUOTE;
		// flag it as done only once the table is complete
		s_init = true;
	}

	// save words so printBits works
	m_words = words;
	// how many words?
	long numBits = words->getNumWords();
	// how much space do we need?
	long need = sizeof(swbit_t) * numBits;
	// assume no malloc
	m_needsFree = false;

	// use local buf?
	if ( need < BITS_LOCALBUFSIZE ) m_swbits = (swbit_t *)m_localBuf;
	// use provided buf?
	else if ( need < bufSize ) m_swbits = (swbit_t *)buf;
	// i guess need to malloc
	else {
		m_swbitsSize = need;
		m_swbits = (swbit_t *)mmalloc ( need , "BitsW" );
		m_needsFree = true;
	}
	if ( ! m_swbits ) return log("build: Could not allocate "
				     "Bits table used to parse words: "
				     "%s",
				     mstrerror(g_errno));

	nodeid_t   *tagIds = words->getTagIds();
	char      **w      = words->getWords();
	long       *wlens  = words->getWordLens();
	long long  *wids   = words->getWordIds();

	// the next alnum word starts a sentence / fragment / is quoted?
	char          startSent = 1;
	char          startFrag = 1;
	char          inQuote   = 0;

	long          wlen;
	char         *wp;

	// the ongoing accumulation flag we apply to each word
	swbit_t flags = 0;

	for ( long i = 0 ; i < numBits ; i++ ) {
		// assume none are set
		m_swbits[i] = 0;
		// if a breaking tag, next guy can "start a sentence"
		if ( tagIds && tagIds[i] ) {
			// get the tag id minus the high "back bit"
			long tid = tagIds[i] & BACKBITCOMP;
			// is it a "breaking tag"?
			if ( g_nodes[tid].m_isBreaking ) {
				startSent = 1;
				inQuote   = 0;
			}
			// . adjust flags if this tag has a bit
			// . a back tag (back bit set, so tid differs from
			//   the raw id) turns it off, a front tag turns it on
			if ( s_bt[tid] ) {
				if   ( tid != tagIds[i] ) flags &= ~s_bt[tid];
				else                      flags |=  s_bt[tid];
			}
			// apply flag
			m_swbits[i] |= flags;
			continue;
		}
		// if alnum, might start sentence or fragment
		if ( wids[i] ) {
			if ( startFrag ) {
				m_swbits[i] |= D_STARTS_FRAG   ; startFrag =0;}
			if ( startSent ) {
				m_swbits[i] |= D_STARTS_SENTENCE;startSent =0;}
			if ( inQuote ) {
				m_swbits[i] |= D_IN_QUOTES      ;inQuote = 0;}
			// apply any other flags we got. D_IN_PARENS
			// arrives this way too.
			m_swbits[i] |= flags;
			continue;
		}

		//
		// from here on word #i is punctuation
		//

		// fast ptrs
		wlen = wlens[i];
		wp   = w    [i];

		// this is not 100%
		if      ( words->hasChar (i, '(' ) ) flags |=  D_IN_PARENS;
		else if ( words->hasChar (i, ')' ) ) flags &= ~D_IN_PARENS;

		// apply current flags
		m_swbits[i] |= flags;

		// . does it END in a quote?
		// . check wlen so an empty word cannot make us read wp[-1]
		if      ( wlen > 0 && wp[wlen-1]=='\"' )
			inQuote = 1;
		else if ( wlen >= 6 &&
			  strncmp(wp,"&quot;",6)== 0 )
			inQuote = 1;

		// . but double spaces are not starters
		// . MDW: we kinda force ourselves to only use ascii spaces
		//   here
		if ( wlen==2 && is_wspace_a(*wp)&&is_wspace_a(wp[1])) continue;
		// it can start a fragment if not a single space char
		if ( wlen!=1 || ! is_wspace_utf8(wp) )
			startFrag = 1;
		// ". " denotes end of sentence
		if ( wlen>=2 && wp[0]=='.' && is_wspace_utf8(wp+1)){
			// but not if preceeded by an initial
			if ( i>0 && wlens[i-1]==1 && wids[i-1] )
				continue;
			// ok, really the end of a sentence
			startSent = 1;
		}

		// are we a "strong connector", meaning that
		// Summary.cpp should not split on us if possible

		// apostrophe html encoded?
		if ( wlen == 6 && strncmp(wp,"&#146;",6) == 0 ) {
			m_swbits[i] |= D_IS_STRONG_CONNECTOR;
			continue;
		}
		if ( wlen == 7 && strncmp(wp,"&#8217;",7) == 0 ) {
			m_swbits[i] |= D_IS_STRONG_CONNECTOR;
			continue;
		}

		// otherwise, strong connectors must be single char
		if ( wlen != 1 ) continue;
		// is it apostrophe? - & . * (M*A*S*H)
		char c = wp[0];
		if      ( c == '\'')m_swbits[i]|=D_IS_STRONG_CONNECTOR;
		else if ( c == '-' )m_swbits[i]|=D_IS_STRONG_CONNECTOR;
		else if ( c == '&' )m_swbits[i]|=D_IS_STRONG_CONNECTOR;
		else if ( c == '.' )m_swbits[i]|=D_IS_STRONG_CONNECTOR;
		else if ( c == '*' )m_swbits[i]|=D_IS_STRONG_CONNECTOR;
		else if ( c == '/' )m_swbits[i]|=D_IS_STRONG_CONNECTOR;
	}

	return true;
}