open-source-search-engine/Scores.cpp

#include "gb-include.h"

#include "Scores.h"
#include "Words.h"

// . explicit article body indicator tags:
//   <div class=blogbody,storycontent,body,article_body,story-body
//    <div/td/span class=blogbody,storycontent,body,article_body,
//     story,body-content,entry,story-body,mainarttxt>
//   <td class=story>
//   <span class="body-content">
//   <div class="entry">  --- although has "entry" for ads, etc.
//   reuters: <!-- Article Text Begins -->  and Ends -->
// . forbes have a bunch of <span class=mainartext> strewn together. they
//   are neighbor sections.

// Capacity of the per-section stacks (ids/scores/starts/previs) in
// Scores::setScoresBySection(). Section-opening tags nested deeper than
// this are not pushed and a single "Exceeded max levels" message is logged.
#define MAX_LEVELS 200

// Start out with no score buffer attached; set() selects or allocates one.
Scores::Scores () {
	m_buf       = NULL;
	m_bufSize   = 0;
	m_scores    = NULL;
	// reset() consults this flag before freeing m_buf, so give it a
	// defined value instead of leaving it uninitialized until set()
	m_needsFree = false;
	//m_rerankScores = NULL;
}

// Tear-down only has to release whatever buffer is currently attached,
// which is exactly what reset() does.
Scores::~Scores() {
	this->reset();
}

// . detach from the current score buffer, freeing it if we allocated it
// . leaves the object in the same state the constructor produces, so no
//   stale size or ownership flag outlives the buffer it described
void Scores::reset() {
	// m_needsFree is only true when set() obtained m_buf from mmalloc();
	// the local buffer and caller-provided buffers are never freed here
	if ( m_buf && m_needsFree )
		mfree ( m_buf , m_bufSize , "Scores" );
	m_buf       = NULL;
	m_bufSize   = 0;
	m_needsFree = false;
	m_scores    = NULL;
}

// . convenience entry point: picks fixed values for all the scoring knobs
//   and forwards to the full set() defined below in this file
// . "eliminateMenus" selects the "menu elimination" preset, which only keeps
//   the single top-scoring content section and requires at least 40
//   indexable words; otherwise the caller's "minIndexableWords" is used
// . "buf"/"bufSize" is an optional caller-provided buffer that is handed
//   through so the full set() can avoid a malloc
// . returns whatever the full set() returns
bool Scores::set ( Words    *words             ,
		   Sections *sections          ,
		   long      titleRecVersion   ,
		   bool      eliminateMenus    ,
		   // provide it with a buffer to prevent a malloc
		   char     *buf               ,
		   long      bufSize           ,
		   long      minIndexableWords ) {

	// The knobs accepted by the full set() and the values used here.
	// All of them only apply to the body of the document and were
	// designed for news articles.
	//
	// "scoreBySection" (true)
	// Break the document into sections and score the words in sections
	// with mostly link text lower than words in sections without much
	// link text. This helps to reduce the effects of menu spam.
	//
	// "indexContentSectionOnly" (false, true when eliminating menus)
	// Attempt to isolate just the single most-relevant content section
	// of the document and zero out the scores of everything else.
	//
	// "minSectionScore" (-1000000000)
	// The minimum score an entire section of the document needs to have
	// its words indexed. In setScoresBySection() each plain word adds
	// NORM_WORD_SCORE to the section score and each word in a hyperlink
	// subtracts 4 * NORM_WORD_SCORE.
	//
	// "minAvgWordScore" (0)
	// Each word gets the average pre-score of a window of neighboring
	// words in the same section, where a plain word has a pre-score of
	// NORM_WORD_SCORE and a word in a link NORM_WORD_SCORE / 8. Words
	// whose average is below this value get a score of 0.
	// scoreBySection must be enabled for this to work.
	//
	// "minIndexableWords" (caller's value, 40 when eliminating menus)
	// If the number of indexable words with a score above 1 is below
	// this value, then no words will be indexed. Used to just index
	// beefy news articles. Values <= 0 disable this constraint.
	//
	// "numTopWords" (0)
	// Weight the first X words higher. 0 disables all top weighting.
	//
	// "topWordsWeight" (3.0)
	// Weight the first X words by this much.
	//
	// "topSentenceWeight" (1.0)
	// Weight the first sentence by this much.
	//
	// "maxWordsInSentence" (30)
	// Do not weight more than this many words in the first sentence.
	//
	// These used to be read from the ruleset ("index.scoreBySection",
	// "index.minSectionScore", ...); that lookup is no longer done and
	// the fixed values above are always used.

	// the only two knobs that differ between the presets
	bool indexContentSectionOnly = false;
	if ( eliminateMenus ) {
		// "menu elimination technology": zero out scores of terms
		// not in the single content section
		indexContentSectionOnly = true;
		minIndexableWords       = 40;
	}

	// . forward everything, including "sections" and the caller's
	//   buffer, so the call matches the full overload's parameter list
	//   and the provided buffer can actually be used
	return set ( words                   ,
		     sections                ,
		     titleRecVersion         ,
		     true                    , // scoreBySection
		     indexContentSectionOnly ,
		     -1000000000             , // minSectionScore
		     0                       , // minAvgWordScore
		     minIndexableWords       ,
		     0                       , // numTopWords
		     3.0                     , // topWordsWeight
		     1.0                     , // topSentenceWeight
		     30                      , // maxWordsInSentence
		     buf                     ,
		     bufSize                 );
}

// . returns false and sets g_errno on error
// . scores the words in the Words.cpp class, which is set from an Xml pointer
// . Words.cpp must contain tags cuz that's what we look at to divide the
//   words up into sections
// . most docs are divided up into sections based on div, and table/tr/td tags
// . look at each section independently and score words in each section based
//   on the density of words in hyperlinks in their vicinity.
// . if a particular section has a lot of hyperlinked text it should score
//   low, while a section of a lot of pure text should score high.
// . small sections with not much plain text, but no hyperlinks, will not score
//   very high either, usually they are like copyright notices and stuff,
//   although they could be a small message on a message board.
// . sets m_scores[i] to word #i's score: 0 for tags, -1 for anything inside
//   <script>, <style>, <select> or <marquee>
// . if indexContentSectionOnly is true the scores of all words that are not
//   in the top-scoring section are set to 0. this is used for just indexing
//   simple news articles which are mostly just contained in a single section.
// . if we have less than minIndexableWords positive scoring words, then do
//   not index any words, set their scores to 0
// . "buf"/"bufSize" is an optional caller-owned buffer used for m_scores
//   when the scores do not fit in m_localBuf; it is never freed by us
// . "sections" and "titleRecVersion" are currently not used in here
bool Scores::set ( Words    *words                   ,
		   Sections *sections                ,
		   long      titleRecVersion         ,
		   bool      scoreBySection          ,
		   bool      indexContentSectionOnly ,
		   long      minSectionScore         ,
		   long      minAvgWordScore         ,
		   long      minIndexableWords       ,
		   // these are for weighting top part of news articles
		   long      numTopWords             ,
		   float     topWordsWeight          ,
		   float     topSentenceWeight       ,
		   long      maxWordsInSentence      ,
		   char     *buf                     ,
		   long      bufSize                 ) {

	// release any buffer left over from a previous call
	reset();

	// save for printing into g_pbuf in TermTable.cpp
	m_scoreBySection            = scoreBySection          ;
	m_indexContentSectionOnly   = indexContentSectionOnly ;
	m_minSectionScore           = minSectionScore         ;
	m_minAvgWordScore           = minAvgWordScore         ;
	m_minIndexableWords         = minIndexableWords       ;
	m_numTopWords               = numTopWords             ;
	m_topWordsWeight            = topWordsWeight          ;
	m_topSentenceWeight         = topSentenceWeight       ;
	m_maxWordsInSentence        = maxWordsInSentence      ;

	// . pick the m_scores buffer, one "long" score per word
	// . size it with sizeof(long) rather than a literal 4 so it is big
	//   enough on platforms where long is 8 bytes
	m_scores = NULL;
	long nw   = words->getNumWords();
	long need = nw * (long)sizeof(long);
	// assume no malloc
	m_needsFree = false;
	if      ( need < SCORES_LOCALBUFSIZE ) m_buf = m_localBuf;
	// only use the caller's buffer if there really is one
	else if ( buf && need < bufSize      ) m_buf = buf;
	else {
		m_buf = (char *)mmalloc ( need , "Scores" );
		m_needsFree = true;
	}
	m_bufSize = need;
	if ( ! m_buf ) return false;
	m_scores = (long *)m_buf;
	//m_rerankScores = (long *) p;

	// all words start with a default normal score, NORM_WORD_SCORE
	for ( long i = 0 ; i < nw ; i++ ) m_scores[i] = NORM_WORD_SCORE;

	nodeid_t   *tids  = words->getTagIds  ();
	long long  *wids  = words->getWordIds ();
	char      **w     = words->m_words;
	long       *wlens = words->m_wordLens;

	// . give every tag a score of 0
	// . give everything inside <script>, <style>, <select> and <marquee>
	//   a score of -1
	// . MATCHES.CPP check if the score is -1, and ignores it if so!!!!
	//   so if you modify this, keep that in mind
	// . no tag ids means there is nothing to mark and no sections either
	if ( ! tids ) return true;
	char inScript  = 0;
	char inStyle   = 0;
	char inSelect  = 0;
	char inMarquee = 0;
	for ( long i = 0 ; i < nw ; i++ ) {
		// are we inside one of the four "hidden" containers?
		bool hidden = inScript || inStyle || inSelect || inMarquee;
		// plain word or punctuation
		if ( ! tids[i] ) {
			if ( hidden ) m_scores[i] = -1;
			continue;
		}
		// give all tags score of 0 by default
		m_scores[i] = 0;
		// a front tag enters the container, a back tag leaves it
		nodeid_t tid     = tids[i] & BACKBITCOMP;
		char     opening = ( tids[i] & BACKBIT ) ? 0 : 1;
		if ( tid == TAG_SCRIPT  ) { inScript  = opening; continue; }
		if ( tid == TAG_STYLE   ) { inStyle   = opening; continue; }
		if ( tid == TAG_SELECT  ) { inSelect  = opening; continue; }
		if ( tid == TAG_MARQUEE ) { inMarquee = opening; continue; }
		// any other tag nested inside a hidden container is hidden too
		if ( hidden ) m_scores[i] = -1;
	}

	// . setScoresBySection() gives each word a pre-score depending on
	//   whether it is in a link or not and then sets the word's score to
	//   the average pre-score of a window of neighboring words in the
	//   same section
	// . punctuation and tag words are ignored
	// . words scored -1 above are left alone
	if ( scoreBySection ) {
		if ( ! setScoresBySection ( words,
					    indexContentSectionOnly ,
					    minSectionScore ,
					    minAvgWordScore ) )
			return false;
	}
	// . give all indexable words with a positive score the default normal
	//   score
	// . NOTE(review): the "else" that used to guard this loop is
	//   commented out, so it also runs after setScoresBySection() and
	//   resets every positive averaged score to NORM_WORD_SCORE; only
	//   the 0 and -1 scores survive. confirm that this is intended.
	//else if (titleRecVersion >= 60){
	for ( long i = 0 ; i < nw ; i++ )
		if ( wids[i] && m_scores[i] > 0 )
			m_scores[i] = NORM_WORD_SCORE; // 128;
	//}

	// . we need at least this many positive scoring, indexable words
	// . this is -1 if unused
	if ( minIndexableWords > 0 ) {
		long count = 0;
		for ( long i = 0 ; i < nw ; i++ )
			if ( wids[i] && m_scores[i] > 1 ) count++;
		if ( count < minIndexableWords )
			for ( long i = 0 ; i < nw ; i++ ) m_scores[i] = 0 ;
	}

	// . now weight the words in the top of the document more
	// . news articles and other docs put the most important info first
	// . numTopWords of 0 turns off the sentence weighting as well
	if ( numTopWords == 0 ) return true;

	// number of indexable words weighted so far
	long count = 0;
	for ( long i = 0 ; i < nw ; i++ ) {
		// skip if not indexed (even though it may have a score > 0)
		if ( wids[i] == 0 ) continue;
		// skip over anything with a weight of 0 (ignored) or 1
		// which means in a <select> tag or something else that should
		// be indexed with minimum possible score.
		if ( m_scores[i] <= 1 ) {
			// . end of sentence?
			// . NOTE(review): punctuation (wids[i] == 0) was
			//   already skipped above, so this check can not be
			//   reached right now and the first "sentence" is
			//   only limited by maxWordsInSentence. left as is
			//   to not change scoring; confirm the intent.
			if ( wids[i] != 0            ) continue;
			if ( maxWordsInSentence == 0 ) continue;
			// scan the characters of this word; this has to
			// advance k, not i, to stay within word #i
			for ( long k = 0 ; k < wlens[i] ; k++ )
				if ( w[i][k] == '.' ||
				     w[i][k] == '!'   )
					maxWordsInSentence = 0;
			continue;
		}
		if ( count < numTopWords        )
			m_scores[i] =
				(long)((float)m_scores[i] * topWordsWeight);
		if ( count < maxWordsInSentence )
			m_scores[i] =
				(long)((float)m_scores[i] * topSentenceWeight);
		count++;
		// a limit that has been reached is zeroed out, and once both
		// are zero there is nothing left to weight
		if ( count >= maxWordsInSentence ) maxWordsInSentence = 0;
		if ( count >= numTopWords        ) numTopWords        = 0;
		if ( maxWordsInSentence > 0 ) continue;
		if ( numTopWords        > 0 ) continue;
		break;
	}

	return true;
}

// Number of consecutive words of a section that Scores::setScoresBySection()
// averages over when it computes a word's neighbor-influenced score.
#define RADIUS 16

// . divides the words into sections delimited by <div>, <textarea>, <tr>,
//   <td> and <table> tags, the words outside of any such tag forming the
//   root section, and scores each section and each word in it
// . a section's score goes up by NORM_WORD_SCORE for each plain word and
//   down by 4 * NORM_WORD_SCORE for each word in a hyperlink and for each
//   tag other than <br> and <p>
// . each indexable word gets a pre-score (NORM_WORD_SCORE if plain,
//   NORM_WORD_SCORE / 8 if in a link) and its final score in m_scores[] is
//   the average pre-score of up to RADIUS neighboring words of its own
//   section, not counting words of embedded sections
// . averages below "minAvgWordScore" become 0, an average of 1 becomes 2
// . if "indexContentSectionOnly" is true, only the words of the highest
//   scoring section, which must also score above "minSectionScore", keep
//   their scores, all other scored words are set to 0
// . words already scored -1 in m_scores[] are skipped
// . returns false if the scratch buffer could not be allocated, true
//   otherwise
bool Scores::setScoresBySection ( Words *words,
				  bool indexContentSectionOnly ,
				  long minSectionScore         ,
				  long minAvgWordScore         ) {

	long       nw     = words->getNumWords();
	// with no words there is nothing to score, and the root section pass
	// at the bottom would otherwise read wids[0]
	if ( nw <= 0 ) return true;

	// NOTE: every local is declared up here because "hookin" below is
	// also jumped to from outside of the loop and a goto must not skip
	// over an initialization
	long long *wids   = words->getWordIds ();
	nodeid_t  *tids   = words->getTagIds  ();
	bool       inLink = false;
	long       score  = 0; // score of the section we are scanning
	long       level  = 0; // # of open sections on the stacks
	long       i;
	nodeid_t   ids    [ MAX_LEVELS ]; // tag ids on stack
	long       scores [ MAX_LEVELS ]; // scores on stack
	long       starts [ MAX_LEVELS ]; // section start positions on stack
	long       previs [ MAX_LEVELS ]; // linked list end
	// . last word linked into the current section's list
	// . word #0 doubles as the list head of the root section
	long       previ =  0;
	// for storing the winning section
	long       max   =  -2000000000;
	long       maxa  = -1; // first word of winning section, -1 if none
	char       flag  =  0; // logged the "max levels" message yet?

	// . get the scratch vectors, 1-1 with the words
	// . wnext[i] is the next word in the same section as word #i, or -1
	// . wscores[i] is the pre-score of word #i
	// . size them by element type rather than with literal 4 and 2 so
	//   they do not overlap or overflow where long is 8 bytes
	long need = nw * (long)( sizeof(long) + sizeof(short) );
	char  tstack[1024*100];
	char *tmp = tstack;
	if ( need > (long)sizeof(tstack) )
		tmp = (char *)mmalloc(need,"Scoress");
	// bail if alloc failed
	if ( ! tmp ) return log("build: Scores failed to alloc %li bytes.",
				  need);
	char *p = tmp;
	long   *wnext   = (long  *)p ; p += sizeof(long) * nw;
	short  *wscores = (short *)p ;
	// init, the root section's list is empty
	wnext[0] = -1;
	// point to our score buffer
	long *fscores = m_scores;
	// a plain text word is worth this many times a hyperlink word when
	// averaging the pre-scores. make this fixed for now.
	float ratio = 8.0;
	// how much to score a plain text word?
	long plain = NORM_WORD_SCORE; // 128;
	// how much to score a word in hypertext?
	long hyper = (long)((float)plain / ratio);
	// how much a hyperlink word or a tag costs the section
	long neg = plain * 4;
	// misc vars
	long mid,k,j,sj,bscore,count,cumscore;
	nodeid_t tid;

	for ( i = 0 ; i < nw ; i++ ) {
		// get the tag id
		tid = tids[i] & BACKBITCOMP;
		// . we have to know what words are in hyperlinks
		// . tag id 2 is treated as the hyperlink tag here
		if ( tid == 2 ) {
			if ( tids[i] & BACKBIT ) inLink = false;
			else                     inLink = true;
			continue;
		}
		// did we have a non section delimiting word?
		if ( tid != TAG_DIV  &&    // <div>
		     tid != TAG_TEXTAREA  &&    // <textarea>
		     tid != TAG_TR  &&    // <tr>
		     tid != TAG_TD  &&    // <td>
		     tid != TAG_TABLE    ) { // <table>
			// skip if punct or tag
			if ( wids[i] == 0 ) {
				// . punish if tag, except for <br> or <p>
				// . generally, taggy sections are not very
				//   good content.
				if ( ! tid ) continue;
				if ( tid == TAG_BR ) continue; // <br>
				if ( tid == TAG_P ) continue; // <p>
				score -= neg;
				continue;
			}
			// . in a <script>, <style> or <select> tag?
			// . these must keep their -1, otherwise we were
			//   getting summaries with no spaces in them!
			if ( m_scores[i] == -1 )
				continue;
			// subtract points if in link, otherwise add points
			if ( inLink ) {
				score -= neg    ; wscores[i] = hyper; }
			else          {
				score += plain  ; wscores[i] = plain; }
			// keep linked list up to date
			wnext[previ] =  i;
			wnext[    i] = -1;
			previ = i;
			continue;
		}

		// . we got a section delimiting tag
		// . this should break any hyper link right?
		inLink = false;

		// front tag? did we start a new section?
		if ( !(tids[i] & BACKBIT) ) {
			// no more sections until we pop this one off
			if ( level >= MAX_LEVELS ) {
				// only log once
				if ( flag == 0 ) {
					log("build: Exceeded max levels.");
					flag = 1;
				}
				continue;
			}
			if ( g_conf.m_logDebugBuild )
				log(LOG_DEBUG,"build: Scored section %ld: %ld",
				    level, score);
			// push the parent section's info onto the stack
			ids    [level] = tids[i];
			scores [level] = score;
			starts [level] = i;
			previs [level] = previ;
			level++;
			score = 0;
			// . start another linked list
			// . the tag itself serves as the list head
			previ = i;
			// assume no linked list of words for this section
			wnext[i] = -1;
			continue;
		}

		// . it's a back tag
		// . bail if no corresponding front tag on stack
		if ( level == 0 ) continue;
		// or if did not match what was on top of stack
		if ( tid != ids[level-1] ) continue;
		// . recycle code
		// . also jumped to from below the loop to close out whatever
		//   is still open at the end of the document. in that case
		//   level may drop to -1, which denotes the root section.
	hookin:
		// pop stack
		level--;

		//
		// this part scores the individual words based on the scores of
		// their neighbors. it is just like a moving average of the
		// past and future.
		//

		// get the head of the section's linked list
		if ( level == -1 ) sj = 0;
		else               sj = starts[level];
		// often, the first in level is not an indexable word, but
		// just a start of the linked list
		if ( wids[sj] == 0 ) sj = wnext[sj];
		// now start with that
		j = sj;
		// bail if nothing in the list
		if ( j == -1 ) goto empty;
		// compute left Boundary score, bscore
		bscore = 0;
		// accumulate score of first RADIUS words
		for (count=0;count<RADIUS&&j>=0;count++,j=wnext[j])
			bscore += wscores[j];
		// save accumulation, not average
		cumscore = bscore;
		// make it an average score. count is at least 1 here.
		bscore /= count;
		// 1 is reserved for <select> tag et al
		if ( bscore == 1 ) bscore = 2;
		// must be above this. all or nothing.
		if ( bscore < minAvgWordScore ) bscore = 0;
		// set score of first RADIUS words to that average
		j = sj;
		for (count=0;count<RADIUS&&j>=0;count++,j=wnext[j])
			fscores[j] = bscore;
		// bail if no more words in section
		if ( j == -1 ) goto skip;

		// . set up right/mid/left ptr info for the sliding window
		// . k is the first word to the right of the window
		k = j;
		// advance j to the 8th word, almost exactly in the middle
		j = sj;
		for (count=0;count<(RADIUS/2-1)/*7*/;count++,j=wnext[j]);
		// score words in middle now
		mid = j;
		// j is the left most word of the window
		j = sj;

		// . cumscore is always the sum of the RADIUS words from j up
		//   to, but not including, k
		// . k is a valid word whenever we enter the loop body
		do {
			// set centroid's FINAL score, the average of the
			// window around it
			fscores[mid] = cumscore / RADIUS;
			// 1 is reserved for <select> tag et al
			if ( fscores[mid] == 1 ) fscores[mid] = 2;
			// must be above this. all or nothing.
			if ( fscores[mid] < minAvgWordScore ) fscores[mid] = 0;
			// advance left end and drop its score
			cumscore -= wscores[j];
			j         = wnext  [j];
			// advance middle
			mid       = wnext  [mid];
			// . take in word k and then advance the right end
			// . adding before advancing keeps the window
			//   contiguous and never reads wscores[-1] when the
			//   list ends
			cumscore += wscores[k];
			k         = wnext  [k];
			// loop if more left
		} while ( k >= 0 );

		// score words on right end, from mid on, with their average
		bscore = 0;
		count  = 0;
		for (k=mid;k>=0;k=wnext[k],count++) bscore+=wscores[k];
		// get the average score of them
		bscore /= count;
		// 1 is reserved for <select> tag et al
		if ( bscore == 1 ) bscore = 2;
		// must be above this. all or nothing.
		if ( bscore < minAvgWordScore ) bscore = 0;
		// and set
		for (k=mid;k>=0;k=wnext[k]) fscores[k]=bscore;

		//
		// end neighbor-influenced scoring
		//

	skip:
		if ( (score > max || maxa==-1) && score > minSectionScore ) {
			// zero out the previous winning section
			if ( indexContentSectionOnly && maxa >= 0 ) {
				// zero out all in list
				for ( j = maxa; j >= 0 ; j = wnext[j] )
					fscores[j] = 0;
			}
			// this section is the new winning section
			log(LOG_DEBUG, "build: Winning section: %ld, "
			    "score: %ld", level, score);
			max  = score;
			maxa = sj;
		}
		// if we were not the winning section, zero ourselves out
		else if ( indexContentSectionOnly ) {
			for ( j = sj ; j >= 0 ; j = wnext[j] )
				fscores[j] = 0;
		}
	empty:
		// pop old score et al back to be resumed
		if ( level >= 0 ) {
			score  = scores[level];
			previ  = previs[level];
			// . a section whose tag is word #0 used the same list
			//   head as the root section
			// . detach its words so the root section pass does
			//   not score, and possibly zero out, them again
			if ( starts[level] == previ ) wnext[previ] = -1;
		}
		// get next node
	}

	// . set scores of anything still on the stack at completion, and of
	//   the root section, which is scored when level drops to -1
	// . i is set to the last word so that the loop above exits right
	//   after the jumped-to section is done and we end up back here
	if ( level >= 0 ) {
		i = nw - 1;
		goto hookin;
	}

	// done with the scratch space, if we had to allocate it
	if ( tmp != tstack ) mfree ( tmp , need , "Scores" );

	// success
	return true;
}