open-source-search-engine/Weights.cpp
2021-05-06 01:52:55 +10:00

2825 lines
90 KiB
C++

// TODO: pass spam class to weight class and modify weights based on the spam
// then we can just serialize the weight vector in the title rec along
// with ptr offsets to the words that we index. carver can just scan
// through the word ptrs rather than 1 char at a time. summary generator
// can just use the weights to score each sample then.
#include "gb-include.h"
#include "Weights.h"
#include "Words.h"
#include "Bits.h"
#include "Phrases.h"
#include "Titledb.h"
#include "HashTableX.h"
#include "Abbreviations.h"
#include "XmlNode.h" // g_nodes[]
#include "HashTable.h"
#include "Sections.h"
// . file-local hash table for the punctuation weights (see RULE #4)
// . presumably populated by initPunctWeights(), whose body is not in this
//   part of the file -- TODO confirm
static HashTable s_punctTable;
// . Weights::set() calls this once, guarded by a static flag, and returns
//   false if this returns false
bool initPunctWeights ( ) ;
// . start out empty: no buffer, no weight vectors, no count table
// . m_ww/m_pw stay NULL until set() carves them out of m_buf
Weights::Weights () {
	m_buf           = NULL;
	m_bufSize       = 0;
	m_ww            = NULL;
	m_pw            = NULL;
	// the per-rule debug vectors are otherwise only assigned inside set(),
	// so give them a defined value here as well
	m_rvw           = NULL;
	m_rvp           = NULL;
	m_countTablePtr = NULL;
}
// all cleanup lives in reset(), which frees m_buf if it was mmalloc'd
Weights::~Weights () {
	this->reset();
}
void Weights::reset() {
if ( m_buf && m_buf != m_localBuf )
mfree ( m_buf , m_bufSize , "Weights" );
m_buf = NULL;
m_bufSize = 0;
m_ww = NULL;
m_pw = NULL;
m_countTablePtr = NULL;
}
// RULE #1 (apply to meta tags, too)
// these rules should be applied when indexing/hashing the incoming linktext,
// title, document body and meta tags.
// RULE #2 (promote rather than demote)
// prefer to promote rather than demote because then we do not have to reindex
// the low quality pages, just the good ones, which are fewer, and the good
// ones will bubble to the top.
// RULE #3 (in parentheses)
// if word in ()'s []'s or {}'s then demote word and phrase score equally.
// after the first 40 words in the parentheses, do not demote any more.
// RULE #4 (next to bad punct)
// if we have a sequence of punctuation that is not just spaces on the left
// or the right of the word, demote it. Right now we just use
// Bits::canPairAcross(), but we should get more accurate...
// We compute a word and phrase weight for each punctuation "word", i.e.
// sequence of punctuation, and we use initPunctWeights() to set that table.
// Apply these weights differently to both the word and the phrase. The
// phrase weight from this is averaged across the
// RULE #5 (in hyperlink)
// if word or phrase is in a hyperlink multiply score by .15.
// RULE #6 (in content section)
// if word is in a section with lots of plain text words that are not
// in hyperlinks, boost it. If section is skimpy demote each word, but only
// demote a word if NOT the first occurrence of that word in that section.
// That way we allow for headers in their own section, like a table row, <tr>.
// Sections are delimited by "breaking tags" defined in XmlNode.cpp, g_nodes[].
// RULE #7 (in header, italic, bold or title tag)
// boost words and phrases in header, italic, bold or title tags.
// Only do this for the first 20 words in header tags to
// prevent abuse. Italic and bold tags also have a combined limit of 40.
// up to the first 20 alnum words in a title tag are boosted.
// RULE #8 (in a ul list)
// demote if in a list, under the <ul> tag.
// RULE #9 (repeated in sentence/fragment)
// if word/phrase repeated in same sentence, demote it.
// RULE #10 (comma-separated list)
// multiply score by .3 if in a comma-separated list. only
// count one occurrence in all such lists.
// RULE #11 (capitalized words)
// multiply score by 1.3 if the word is capitalized and is NOT immediately
// adjacent to another capitalized word, and not at the beginning of a
// sentence. Demote word score if it is immediately next to a capitalized
// word on the left or right, it is part of a phrase probably.
// If word is capitalized and so is adjacent word, but they are separated
// by a punct word that has less than DW of word weight, do not demote
// the word for being adjacent to a cap word.
// RULE #12 (repeated sentence frags)
// Demote the weight of the words and phrases in repeated sentence fragments.
// Fixes message boards which include the same msg over again in the reply.
// The first title and first header tag have amnesty, those often repeat
// anyway. What about long titles? The demotion will be more the longer
// the repeated fragment. TODO: Fragments have to have a minimum length of
// 5 words unless they are surrounded by breaking tags. Hey, but we will demote
// those words for being in a small section via RULE #6.
// RULE #13 (promote if well distributed)
// If word is preceded by the same word, but in a different section, boost
// the word for being well distributed.
// RULE #14 (common phrases) (UNIMPLEMENTED)
// if word is part of a common fragment in the dictionary files then multiply
// its score by .15. Have a constant fixed file of just the common fragments
// in order to keep parser consistent for now, until we store the
// scores/termids in the title rec. Fixes 'new mexico' results coming up for
// 'mexico' query. Apply this rule to incoming link text as well. Names like
// "Tom Cruise" should be an exception. Regenerate the dictionary because
// fragments are often bad, like a lot start with just 's ' and some do not
// break over tags like they should it seems, and we have "states of america"
// in there, too. Let's just hand filter the top ones out for now and put them
// into a file.
// RULE #15 (word to phrase ratio)
// if a word repeats in different phrases, promote the word and
// demote the phrase. BUT if a word repeats in pretty much the same phrase,
// promote the phrase and demote the word
// RULE #16 (beginning of the sentence)
// weight word at the beginning of the sentence, 1.2, and at the end, .8.
// interpolate linearly in between. It is more likely to be the subject
// matter if at the start of the sentence.
// RULE #17
// if phrase is preceded immediately by a single or double quote, promote it.
// but only if not promoted from Rule #18 below.
// RULE #18 (capitalized phrase)
// if the word is capitalized and the last word in its phrase is capitalized
// then promote the phrase score.
// TODO: Do not consider fragments that start or end with 'the' or 'a' or
// fragments with punctuation in them, like "Sparks, OK".
// RULE #19 (hyphenated phrases)
// if word occurs in a hyphenated phrase, boost the phrase score and demote
// the word score.
// RULE #20 (inherit score)
// the base score is the average of the word scores in the phrase. HOWEVER, do
// not take the punishment received in SOME of the above rules, like for the
// word being in a common phrase or something. that should not apply to us!
// RULE #21 (UNIMPLEMENTED)
// index capitalized words with the capitalized letter and with it
// uncapitalized. then if someone types in "Sparks" they get the one that
// have that word capitalized.
// RULE #22 (phrase weight adjustment)
// the score of a phrase should be the phrase score of the first word in the
// phrase but weighted by the scores of the phrase scores of the contained
// punctuation words. certain sequences of punctuation contained in the phrase
// will drastically reduce its score.
// RULE #23 (words in special tags)
// do not index words in style, script or marquee tags. Index words and phrases
// in select tags, but with minimal weight.
// RULE #24 (primitive numbered list detector)
// if previous word began with a digit demote the word that follows it, but not
// if there is a breaking tag in between, however. demote the word if a number
// word follows the word as well, with no tag in between.
// TODO: make this more accurate.
// RULE #25 (demote phrases splitting hyphenated phrases)
// if one of the boundary words in a phrase is next to one and only one hyphen,
// and that hyphen is not in the phrase, then demote the phrase. if we have a
// phrase like "spam-free email" we should demote the phrase "free email".
// it splits a STRONG_CONNECTOR.
// RULE #26 ("long" phrases) (not implemented -- using PQR, post query rerank)
// if phrase is very popular we usually weight it high and the individual words
// low. however, if the phrase term to the left or right of the phrase term
// has a very similar count, we should cut each phrase term's weights in half
// because it is probably a long phrase. the more phrases in the chain that
// have the similar count, the more we should dock the phrase term weights
// of all involved. this is very similar to the anti-spam logic we have in
// set3(), repeated sentence frags, rule #12...
// RULE #27 reduce words/phrases in menu-y sections.
// RULE #28 repetitive spam detector (was Spam.cpp class)
// . lookup tables for RULE #15 (word to phrase ratio)
// . first index is the word count, second index is the phrase count
// . g_wtab holds the word weight, g_ptab holds the phrase weight
// . filled in by getWordToPhraseRatioWeights() whenever the slider parm
//   it caches differs from g_conf.m_sliderParm
float g_wtab[30][30];
float g_ptab[30][30];
// . returns false and sets g_errno on error
// . determines weights that are 1-1 with the words in the Words.cpp class
// . Words.cpp must contain tags cuz that's what we look at to divide the
//   words up into sections
// . most docs are divided up into sections based on div, and table/tr/td tags
// . "words" supplies the word count and word ids. "phrases" and "bits" are
//   handed on to set1() and set2()
// . "sections" may be NULL or empty, in which case the section based pass
//   at the end (RULE #23 and RULE #27) is skipped
// . if "pbuf" is not NULL we also allocate the per-rule debug vectors
//   m_rvw/m_rvp, MAX_RULES floats per word each, all starting at 1.0
// . if "eliminateMenus" is true every word that reaches the section pass
//   gets a word and phrase weight of 0
// . "isPreformattedText" is just forwarded to set1()
// . "countTablePtr" must not be NULL unless "isCountTable" is true, because
//   set2() cores without it
// . if "isLinkText" or "isCountTable" is true all weights start out at DW,
//   otherwise they start out at 0
bool Weights::set ( Words      *words              ,
		    Phrases    *phrases            ,
		    Bits       *bits               ,
		    Sections   *sections           ,
		    SafeBuf    *pbuf               , // debug?
		    bool        eliminateMenus     ,
		    bool        isPreformattedText ,
		    int32_t     titleRecVersion    ,
		    int32_t     titleWeight        ,
		    int32_t     headerWeight       ,
		    HashTableX *countTablePtr      ,
		    bool        isLinkText         ,
		    bool        isCountTable       ,
		    int32_t     siteNumInlinks     ,
		    int32_t     niceness           ) {

	reset();

	m_version        = titleRecVersion;
	m_titleWeight    = titleWeight;
	m_headerWeight   = headerWeight;
	m_niceness       = niceness;
	m_isLinkText     = isLinkText;
	m_isCountTable   = isCountTable;
	m_words          = words;
	m_bits           = bits;
	m_siteNumInlinks = siteNumInlinks;
	// use the provided count table, if not NULL
	m_countTablePtr  = countTablePtr;

	// init the punct weight hash table
	static bool s_needsInit = true;
	if ( s_needsInit ) {
		// if out of memory, we can't do much...
		if ( ! initPunctWeights() ) return false;
		s_needsInit = false;
	}

	// . allocate memory
	// . one int32_t word weight and one int32_t phrase weight per word
	int32_t nw   = words->getNumWords();
	int32_t need = nw * (4 + 4 );
	// one float for each rule per word so we can print out the weight
	// associated with each algorithm (RULE #) for XmlDoc::print() as
	// it is called by PageParser.cpp for debug purposes
	if ( pbuf ) need += 2 * nw * MAX_RULES * sizeof(float);
	if ( need < LOCAL_BUF_SIZE ) m_buf = m_localBuf;
	else m_buf = (char *)mmalloc ( need , "Weights" );
	if ( ! m_buf ) return false;
	// only record the size once we actually own a buffer of that size
	m_bufSize = need;

	// assign ptrs
	char *p = m_buf;
	m_ww = (int32_t *)p; p += 4 * nw;
	m_pw = (int32_t *)p; p += 4 * nw;
	// the rule weight vector used for debugging by XmlDoc::print()
	m_rvw = NULL;
	m_rvp = NULL;
	if ( pbuf ) {
		float *f = (float *)p;
		m_rvw = (float *)p; p += MAX_RULES * 4 * nw;
		m_rvp = (float *)p; p += MAX_RULES * 4 * nw;
		// init all to 1.0, they get multiplied by each rule's weight
		for ( ; f < (float *)p ; f++ ) *f = 1.0;
		// sanity check, core on purpose if we overran the buffer
		if ( p > m_buf + need ) { char *xx=NULL;*xx=0; }
	}

	// . set all weights to default if weighting incoming link text
	// . we only want to apply the word to phrase ratio weights really
	if ( m_isLinkText || m_isCountTable ) {
		for ( int32_t i = 0 ; i < nw ;i++ ) {
			m_ww[i] = DW;
			m_pw[i] = DW;
		}
	}
	// . for each word, set a phrase weight and a word weight
	// . each word can be a tag, alnum word or punct word
	else {
		memset ( m_pw , 0 , 4 * nw );
		memset ( m_ww , 0 , 4 * nw );
	}

	// yield cpu
	QUICKPOLL ( m_niceness );

	m_wids = words->getWordIds ();
	m_nw   = words->getNumWords();

	// . TODO?
	// . if url is cgi, hash the values of the cgi parms into this table
	// . demote those words in the doc because chances are we are a
	//   search results page, or content generated from those words
	// . but increase the weight if the word is in the mid-domain of the
	//   url. that should override being in the cgi val in case of ties
	// . do not do this promotion for urls with hyphens, but only for
	//   urls like gigablast.com where there is only one word "gigablast"
	//m_urlTable

	QUICKPOLL ( m_niceness );

	// . set pass 1
	// . 7 of the 14 ms is this
	if ( ! set1 ( words, phrases, bits, isPreformattedText) ) return false;

	QUICKPOLL ( m_niceness );

	// . set pass 2
	// . just adjusts the phrase scores per RULE #22 and word scores
	//   per RULE #4
	// . 2 of the 14 ms is this
	if ( ! set2 ( words, phrases, bits ) ) return false;

	QUICKPOLL ( m_niceness );

	// . set pass 3
	// . look for repeating fragments
	// . 1 of the 14 ms is this
	//if ( ! set3 ( words , bits ) ) return false;

	QUICKPOLL ( m_niceness );

	// the old Spam.cpp spam detector
	//if ( ! set4 ( ) ) return false;

	QUICKPOLL ( m_niceness );

	// . Sections replaces Scores for menu elimination technology
	// . eliminate words in script, style, etc. tags
	// . google seems to index marquee junk so i took SEC_MARQUEE out
	int32_t badFlags  = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
	int32_t goodFlags = SEC_IN_TITLE|SEC_IN_HEADER;//SEC_ARTICLE

	// if not enough voters do not eliminate menus, cuz we are not
	// that sure...
	//if ( sections && sections->m_totalSimilarLayouts < 5 )
	//	eliminateMenus = false;

	// . only do the section pass if we have sections
	// . might have no identified sections
	if ( ! sections || sections->m_numSections <= 0 ) return true;

	// shortcut
	Section *sn;

	// loop over all words
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// breathe
		QUICKPOLL ( m_niceness );
		// get section
		sn = sections->m_sectionPtrs[i];
		// get the flags of the section containing this word
		int32_t flags = 0;
		// set 'em if we got 'em. a root level meta tag has NULL sect.
		if ( sn ) flags = sn->m_flags;
		// . RULE #23
		// . is it in select, script or style tag?
		// . OR are we supposed to eliminate menus? the original note
		//   here says: if we are not valid and we are supposed to
		//   eliminate menus then we should not index anything, since
		//   we are not sure what section is what! being invalid just
		//   means we could not get enough voters to make a decision
		//   on what was a menu section and what wasn't.
		if ( ( flags & badFlags ) || eliminateMenus ) {
			// nukey!
			m_ww[i] = 0;
			m_pw[i] = 0;
			// debug purposes
			if ( m_rvw ) {
				m_rvw[i*MAX_RULES+23] = 0.0;
				m_rvp[i*MAX_RULES+23] = 0.0;
			}
			continue;
		}
		// if an article, or non-menu, do not reduce it
		if ( flags & goodFlags ) continue;
		// . no section at all (root level meta tag)? then there are
		//   no dup votes to look at, so leave the weights alone
		// . without this check we deref a NULL ptr just below
		if ( ! sn ) continue;
		// if not a repeat section skip as well
		//if ( ! ( flags & SEC_DUP ) ) continue;
		if ( sn->m_votesForNotDup > sn->m_votesForDup ) continue;
		// . RULE #27
		// . reduce dynamic, non-unique sections (top stories)
		// . reduce static sections (menus)
		// . also reduces EVERYTHING if we are "invalid"...
		m_ww[i] = (int32_t)(m_ww[i] * .10);
		m_pw[i] = (int32_t)(m_pw[i] * .10);
		// debug purposes
		if ( m_rvw ) {
			m_rvw[i*MAX_RULES+27] = 0.10;
			m_rvp[i*MAX_RULES+27] = 0.10;
		}
	}

	/*
	// we now use the scores for its menu elimination tech
	if ( ! scores ) return true;
	// now if the score is 0, make the m_ww and m_pw BOTH 0
	for ( int32_t i = 0 ; i < m_nw ; i++ ) {
		// skip if score is good
		if ( scores->m_scores[i] > 0 ) continue;
		// otherwise nukey!
		m_ww[i] = 0;
		m_pw[i] = 0;
	}
	*/

	return true;
}
/*
// match word sequences of NUMWORDS or more words
#define NUMWORDS 5
// . RULE #12 (repeated sentence frags)
bool Weights::set3 ( Words *words , Bits *bits ) {
if ( m_isLinkText ) return true;
// ez vars
int64_t *wids = words->getWordIds ();
int32_t nw = words->getNumWords();
// if no words, nothing to do
if ( nw == 0 ) return true;
// truncate for performance reasons. i've seen this be over 4M
// and it was VERY VERY SLOW... over 10 minutes...
// - i saw this tak over 200MB for an alloc for
// WeightsSet3 below, so lower from 200k to 50k. this will probably
// make parsing inconsistencies for really large docs...
if ( nw > 80000 ) nw = 80000;
int64_t ringWids [ NUMWORDS ];
int32_t ringPos [ NUMWORDS ];
int32_t ringi = 0;
int32_t count = 0;
int64_t h = 0;
// . make the hash table
// . make it big enough so there are gaps, so chains are not too long
int32_t minBuckets = (int32_t)(nw * 1.5);
int32_t nb = 2 * getHighestLitBitValue ( minBuckets ) ;
int32_t need = nb * (8+4);
char *buf = NULL;
char tmpBuf[50000];
if ( need < 50000 ) buf = tmpBuf;
else buf = (char *)mmalloc ( need , "WeightsSet3" );
char *ptr = buf;
int64_t *hashes = (int64_t *)ptr; ptr += nb * 8;
int32_t *vals = (int32_t *)ptr; ptr += nb * 4;
if ( ! buf ) return false;
// make the mask
uint32_t mask = nb - 1;
// clear the hash table
memset ( hashes , 0 , nb * 8 );
// clear ring of hashes
memset ( ringWids , 0 , NUMWORDS * 8 );
// for sanity check
int32_t lastStart = -1;
// . hash EVERY NUMWORDS-word sequence in the document
// . if we get a match look and see what sequences it matches
// . we allow multiple instances of the same hash to be stored in
// the hash table, so keep checking for a matching hash until you
// chain to a 0 hash, indicating the chain ends
// . check each matching hash to see if more than NUMWORDS words match
// . get the max words that matched from all of the candidates
// . demote the word and phrase weights based on the total/max
// number of words matching
for ( int32_t i = 0 ; i < nw ; i++ ) {
// skip if not alnum word
if ( ! wids[i] ) continue;
// yield
QUICKPOLL ( m_niceness );
// add new to the 5 word hash
h ^= wids[i];
// . remove old from 5 word hash before adding new...
// . initial ring wids are 0, so should be benign at startup
h ^= ringWids[ringi];
// add to ring
ringWids[ringi] = wids[i];
// save our position
ringPos[ringi] = i;
// wrap the ring ptr if we need to, that is why we are a ring
if ( ++ringi >= NUMWORDS ) ringi = 0;
// this 5-word sequence starts with word # "start"
int32_t start = ringPos[ringi];
// need at least NUMWORDS words in ring buffer to do analysis
if ( ++count < NUMWORDS ) continue;
// . skip if it starts with a word which can not start phrases
// . that way "a new car" being repeated a lot will not
// decrease the weight of the phrase term "new car"
// . setCountTable() calls set3() with this set to NULL
//if ( bits && ! bits->canStartPhrase(start) ) continue;
// sanity check
if ( start <= lastStart ) { char *xx = NULL; *xx = 0; }
// reset max matched
int32_t max = 0;
// look up in the hash table
int32_t n = h & mask;
loop:
// all done if empty
if ( ! hashes[n] ) {
// sanity check
//if ( n >= nb ) { char *xx = NULL; *xx = 0; }
// add ourselves to the hash table now
hashes[n] = h;
// sanity check
//if ( wids[start] == 0 ) { char *xx = NULL; *xx = 0; }
// this is where the 5-word sequence starts
vals [n] = start;
// save it
lastStart = start;
// debug point
//if ( start == 7948 )
// log("heystart");
// do not demote words if less than NUMWORDS matched
if ( max < NUMWORDS ) continue;
// . how much we should we demote
// . 10 matching words pretty much means 0 weights
float demote = 1.0 - ((max-5)*.10);
if ( demote >= 1.0 ) continue;
if ( demote < 0.0 ) demote = 0.0;
// . RULE #26 ("long" phrases)
// . if we got 3, 4 or 5 in our matching sequence
// . basically divide by the # of *phrase* terms
// . multiply by 1/(N-1)
// . HOWEVER, should we also look at HOW MANY other
// sequences matches this too!???
//float demote = 1.0 / ((float)max-1.0);
// set3() is still called from setCountTable() to
// discount the effects of repeated fragments, and
// the count table only understands score or no score
//if ( max >= 15 ) demote = 0.0;
// demote the next "max" words
int32_t mc = 0;
int32_t j;
for ( j = start ; mc < max ; j++ ) {
// skip if not an alnum word
if ( ! wids[j] ) continue;
// count it
mc++;
// demote it
m_ww[j] = (int32_t)(m_ww[j] * demote);
m_pw[j] = (int32_t)(m_pw[j] * demote);
if ( m_ww[j] <= 0 ) m_ww[j] = 2;
if ( m_pw[j] <= 0 ) m_pw[j] = 2;
// debug purposes
if ( m_rvw ) {
m_rvw[i*MAX_RULES+26] *= demote;
m_rvp[i*MAX_RULES+26] *= demote;
}
}
// save the original i
int32_t mini = i;
// advance i, it will be incremented by 1 immediately
// after hitting the "continue" statement
i = j - 1;
// must be at least the original i, we are monotinic
// otherwise ringPos[] will not be monotonic and core
// dump eventually cuz j and k will be equal below
// and we increment matched++ forever.
if ( i < mini ) i = mini;
// get next word
continue;
}
// get next in chain if hash does not match
if ( hashes[n] != h ) {
// wrap around the hash table if we hit the end
if ( ++n >= nb ) n = 0;
// check out bucket #n now
goto loop;
}
// how many words match so far
int32_t matched = 0;
// . we have to check starting at the beginning of each word
// sequence since the XOR compositional hash is order
// independent
// . see what word offset this guy has
int32_t j = vals[n] ;
// k becomes the start of the current 5-word sequence
int32_t k = start;
// sanity check
if ( j == k ) { char *xx = NULL; *xx = 0; }
// skip to next in chain to check later
if ( ++n >= nb ) n = 0;
// keep advancing k and j as long as the words match
matchLoop:
// get next wid for k and j
while ( k < nw && ! wids[k] ) k++;
while ( j < nw && ! wids[j] ) j++;
if ( k < nw && wids[k] == wids[j] ) {
matched++;
k++;
j++;
goto matchLoop;
}
// keep track of the max matched for i0
if ( matched > max ) max = matched;
// get another matching string of words, if possible
goto loop;
}
if ( buf != tmpBuf ) mfree ( buf , need , "WeightsSet3" );
return true;
}
*/
// . RULE #22
// . adjust the phrase weights so that the phrase weight, m_pw[], of the first
//   word in the phrase is the weight of the whole phrase
// . on the way it also applies RULE #15 (word to phrase ratio) to every
//   alnum word and, unless we are link text, RULE #4 (next to bad punct)
// . does nothing if m_isCountTable is true
// . always returns true. the sanity checks core rather than return false
// . "bits" is not used by the live code, only by commented-out code
bool Weights::set2 ( Words   *words   ,
		     Phrases *phrases ,
		     Bits    *bits    ) {

	if ( m_isCountTable ) return true;

	// . m_countTablePtr is a ptr to the count table handed to set()
	// . it is required from here on, so core if the caller left it NULL
	// . 3 of the 14 ms is this
	if ( ! m_countTablePtr ) { char *xx = NULL; *xx = 0; }

	// ez var
	int64_t    *wids = words->getWordIds ();
	nodeid_t   *tids = words->getTagIds  ();
	//char    **wptrs = words->m_words;
	//int32_t  *wlens = words->m_wordLens;
	int32_t     nw   = words->getNumWords();
	int64_t    *pids = phrases->getPhraseIds();
	HashTableX *tt1  = m_countTablePtr;

	//int32_t phrcountLast = 0;
	// word # that ends the last phrase we saw, and that phrase's id
	int32_t nexti   = -10;
	int64_t pidLast = 0;

	//logf(LOG_DEBUG,"build: still doing int32_t sanity check in here");

	// . now consider ourselves the last word in a phrase
	// . adjust the score of the first word in the phrase to be
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// skip if not alnum word
		if ( ! wids[i] ) continue;
		//if ( i == 38 )
		//	log("hey2");
		//if(wptrs[i][0]=='S' && wptrs[i][1]=='e' && wptrs[i][2]=='r' )
		//	log("hey2");
		// yield
		QUICKPOLL ( m_niceness );

		// . RULE #15
		// . try to inline this
		int64_t nextWid = 0;
		int64_t lastPid = 0;
		int32_t nwp = phrases->getNumWordsInPhrase(i);
		// the word that ends the phrase we start, if we start one
		if ( nwp > 0 ) nextWid = wids [i + nwp - 1] ;
		// are we the word that ended the previous phrase?
		if ( i == nexti ) lastPid = pidLast;
		// get current pid
		int64_t pid = pids[i];

		// PHRASE WEIGHT HACK:
		// if it is 0, force a phrase so that "News &amp; Events"
		// is seen as a phrase as far as Weights.cpp is concerned
		// NO LONGER NEEDED since we de-entityize all html docs now!!
		/*
		if ( pid == 0LL ) {
			// get the next wid in line
			int32_t j = i+ 2;
			int32_t maxj = i+15;
			if ( maxj >= nw ) maxj = nw;
			for ( ; j < maxj ; j++ ) if ( wids[j] ) break;
			if ( j < maxj ) {
				pid = hash64 ( pid , wids[i] );
				pid = hash64 ( pid , wids[j] );
			}
		}
		*/

		// get the word and phrase weights for term #i
		float ww;
		float pw;
		getWordToPhraseRatioWeights ( lastPid , // pids[i-1],
					      wids[i] ,
					      pid     ,
					      nextWid , // wids[i+1] ,
					      &ww     ,
					      &pw     ,
					      tt1     ,
					      m_version );
		// assigne for debugging
		//if ( m_rvw ) {
		//	m_rvw[i*MAX_RULES+15] *= ww;
		//	m_rvp[i*MAX_RULES+15] *= pw;
		//}

		// save the last phrase id
		if ( nwp > 0 ) {
			nexti   = i + nwp - 1;
			pidLast = pid; // pids[i] ;
		}

		// . apply the weights
		// . do not hit all the way down to zero though...
		// . Words.cpp::hash() will not index it then...
		if ( m_ww[i] > 0 ) {
			m_ww[i] = (int32_t)(m_ww[i] * ww);
			if ( m_ww[i] <= 0 ) m_ww[i] = 1;
			// debug purposes
			if ( m_rvw ) m_rvw[i*MAX_RULES+15] *= ww;
		}
		if ( m_pw[i] > 0 ) {
			m_pw[i] = (int32_t)(m_pw[i] * pw);
			if ( m_pw[i] <= 0 ) m_pw[i] = 1;
			// debug purposes. check the vector we actually write
			if ( m_rvp ) m_rvp[i*MAX_RULES+15] *= pw;
		}

		// link text only gets the word to phrase ratio weights
		if ( m_isLinkText ) continue;

		//
		// sanity check
		//
		/*
		// . RULE #15
		// . we copy getWordToPhraseRatioWeights() here in an effort
		//   to verify that the function works properly. if so,
		//   then we should end up with the same output here.
		// . this will be commented out later.
		// . if a word repeats in different phrases, promote the word
		//   and demote the phrase
		// . if a word repeats in pretty much the same phrase, promote
		//   the phrase and demote the word
		int32_t phrcount = 0 ;
		if ( pids[i] )
			phrcount=tt1->getScore (pids[i]);
		int32_t wrdcount = tt1->getScore (wids[i]);
		// if we are always ending the same phrase, like "Mexico"
		// in "New Mexico"... get the most popular phrase this word is
		// in...
		int32_t phrcountMax;
		if ( i == lastj && phrcountLast > phrcount )
			phrcountMax = phrcountLast;
		else
			phrcountMax = phrcount;
		// save wordcount before possibly truncating it
		int32_t wrdcountMin = wrdcount;
		// watch out for breeches
		if ( wrdcount > 29 ) {
			float ratio = (float)phrcountMax / (float)wrdcount;
			phrcountMax = (int32_t)((29.0 * ratio) + 0.5);
			wrdcount = 29;
		}
		if ( phrcountMax > 29 ) {
			float ratio = (float)wrdcount / (float)phrcountMax;
			wrdcount = (int32_t)((29.0 * ratio) + 0.5);
			phrcountMax = 29;
		}
		// . sanity check
		// . countLinkInfo() below may just add a phrase term without
		//   its corresponding word term, thereby breeching this
		//   sanity check, so let's comment it out
		//if ( phrcountMax > wrdcount ) { char *xx = NULL; *xx = 0; }
		// apply the weights from the table we computed above
		m_ww[i] = (int32_t)(m_ww[i] * g_wtab[wrdcount][phrcountMax]);
		// get length of phrase in words
		//int32_t nwp = phrases->getNumWords(i);
		// save the last phrase id
		if ( nwp > 0 ) {
			lastj = i + nwp - 1;
			phrcountLast = phrcount;
			pidLast = pids[i] ;
		}
		// . if the word is Mexico in 'New Mexico good times' then
		//   phrase term #i which is, say, "Mexico good" needs to
		//   get the min word count when doings its word to phrase
		//   ratio.
		// . it has two choices, it can use the word count of
		//   "Mexico" or it can use the word count of "good".
		// . say, each is pretty high in the document so the phrase
		//   ends up getting penalized heavily, which is good because
		//   it is a nonsense phrase.
		// . if we had "united socialist soviet republic" repeated
		//   a lot, the phrase "socialist soviet" would score high
		//   and the individual words would score low. that is good.
		if ( nwp > 0 ) {
			// get the word count of the next word
			nextWid = wids [i + nwp - 1] ;
			int32_t wc = tt1->getScore(nextWid);
			// if it is smaller than current word, set wrdcountMin
			if ( wc < wrdcountMin ) wrdcountMin = wc;
		}
		// watch out for breeches
		if ( wrdcountMin > 29 ) {
			float ratio = (float)phrcount / (float)wrdcountMin;
			phrcount = (int32_t)((29.0 * ratio) + 0.5);
			wrdcountMin = 29;
		}
		if ( phrcount > 29 ) {
			float ratio = (float)wrdcountMin / (float)phrcount;
			wrdcountMin = (int32_t)((29.0 * ratio) + 0.5);
			phrcount = 29;
		}
		// try to seek the highest weight possible for this phrase
		// by choosing the lowest word count possible
		m_pw[i] = (int32_t)(m_pw[i] * g_ptab[wrdcountMin][phrcount]);
		// temp sanity checks until the inlined function works
		if ( g_wtab[wrdcount][phrcountMax] != ww ) {
			char *xx = NULL; *xx = 0; }
		if ( g_ptab[wrdcountMin][phrcount] != pw ) {
			char *xx = NULL; *xx = 0; }
		*/
		//
		// end sanity check
		//

		// . RULE #4
		// . if word adjacent to bad punct, demote it
		// . or if adjacent to a high phrase weight punct word then
		//   likewise, we should demote it, like a hyphen
		// . set1() must have been called for this to work
		// . apply word weight based on punct to our left, if any
		// . a neighbor with no word id and no tag id is a punct word
		if ( i >0 && !wids[i-1] && (!tids || !tids[i-1]) ) {
			m_ww[i] = ((int64_t)m_ww[i]*(int64_t)m_ww[i-1])/DW;
			// debug purposes here
			if ( m_rvw ) m_rvw [i*MAX_RULES+4] *=
					     ((float)m_ww[i-1])/(float)DW;
		}
		// apply word weight based on punct to our right, if any
		if ( i+1<nw && !wids[i+1] && (!tids || !tids[i+1]) ) {
			m_ww[i] = ((int64_t)m_ww[i]*(int64_t)m_ww[i+1])/DW;
			// debug purposes here
			if ( m_rvw ) m_rvw [i*MAX_RULES+4] *=
					     ((float)m_ww[i+1])/(float)DW;
		}
		// do not demote all the way to 0
		if ( m_ww[i] <= 0 ) m_ww[i] = 2;

		// RULE #25
		// . if one of the boundary words in a phrase is next to one
		//   and only one hyphen, and that hyphen is not in the phrase
		//   then demote the phrase, it is splitting a hyphenated
		//   phrase, like for "spam-free email" we should demote the
		//   phrase "free email". it splits a STRONG_CONNECTOR. (mdw)
		// . if we had a tight hyphen right before us this should
		//   have the 1.1 weight specified in the PunctWeights array
		//if ( m_pw[i-1] == 1.10 * DW && m_pw[i+1] != 1.10 * DW ) {
		//	// down to 25%
		//	m_pw[i] = m_pw[i] * .25;
		//}
		//if ( i >0 && !wids[i-1] &&
		//     (!tids || !tids[i-1]) && m_pw[i-1]!=DW )
		//	m_ww[i] = (int32_t)(m_ww[i] * .75);
		//if ( i+1<nw && !wids[i+1] &&
		//     (!tids || !tids[i+1]) && m_pw[i+1]!=DW )
		//	m_ww[i] = (int32_t)(m_ww[i] * .75);

		// skip if phrase score is 0
		if ( ! m_pw[i] ) continue;
		// skip if cannot start a phrase
		//if ( ! bits->canStartPhrase(i) ) { m_pw[i] = 0; continue; }
		//if ( pids[i] == 0 ) { m_pw[i] = 0; continue; }
		if ( pid == 0 ) { m_pw[i] = 0; continue; }
		// skip if does not start phrase
		if ( nwp <= 0 ) continue;
		// sanity check
		if ( nwp == 99 ) { char *xx = NULL; *xx = 0; }

		// . RULE #22
		// . now mod the score
		// . use 64 bits so the running product can not overflow
		int64_t avg = m_pw[i];
		// weight by punct in between
		for ( int32_t j = i+1 ; j < i+nwp ; j++ ) {
			// alnum words do not contribute, only the punct
			if ( wids[j] ) continue;
			avg = (avg * (int64_t)m_pw[j]) / DW;
		}
		// do not demote all the way to zero, we still want to index it
		// and when normalized on a 100 point scale, like when printed
		// out by PageParser.cpp, a score of 1 here gets normalized to
		// 0, so make sure it is at least 2.
		if ( avg < 2 )
			avg = 2;
		// . record the factor we applied for debug
		// . both operands are integers, so go through float or the
		//   ratio gets truncated, to 0 whenever the phrase was demoted
		// . m_pw[i] can not be 0 here, we skipped that case above
		if ( m_rvp ) m_rvp [i*MAX_RULES+22] *=
				     (float)avg / (float)m_pw[i] ;
		// set that as our new score
		m_pw[i] = avg;
	}

	return true;
}
// . inline this for speed
// . RULE #15
// . if a word repeats in different phrases, promote the word
// and demote the phrase
// . if a word repeats in pretty much the same phrase, promote
// the phrase and demote the word
// . if you have the window of text "new mexico good times"
// and word #i is mexico, then:
// pid1 is "new mexico"
// wid1 is "mexico"
// pid2 is "mexico good"
// wid2 is "good"
// . we store sliderParm in titleRec so we can update it along
// with title and header weights on the fly from the spider controls
void getWordToPhraseRatioWeights ( int64_t pid1 , // pre phrase
int64_t wid1 ,
int64_t pid2 ,
int64_t wid2 , // post word
float *ww ,
float *pw ,
HashTableX *tt1 ,
int32_t titleRecVersion ) {
static float s_fsp;
// from 0 to 100
char sliderParm = g_conf.m_sliderParm;
// i'm not too keen on putting this as a parm in the CollectionRec
// because it is so cryptic...
//static char sliderParm = 25;
// . to support RULE #15 (word to phrase ratio)
// . these weights are based on the ratio of word to phrase count
// for a particular word
static char s_sp = -1;
if ( s_sp != sliderParm ) {
// . set it to the newly updated value
// . should range from 0 up to 100
s_sp = sliderParm;
// the float version
s_fsp = (float)sliderParm / 100.0;
// sanity test
if ( s_fsp < 0.0 || s_fsp > 1.0 ) { char *xx = NULL; *xx = 0; }
// i is the word count, how many times a particular word
// occurs in the document
for ( int32_t i = 0 ; i < 30 ; i++ ) {
// . k is the phrase count, how many times a particular phrase
// occurs in the document
// . k can be GREATER than i because we index only phrase terms
// sometimes when indexing neighborhoods, and not the
// single words that compose them
for ( int32_t k = 0 ; k < 30 ; k++ ) {
// do not allow phrase count to be greater than
// word count, even though it can happen since we
// add imported neighborhood pwids to the count table
int32_t j = k;
if ( k > i ) j = i;
// get ratio
//float ratio = (float)phrcount / (float)wrdcount;
float ratio = (float)j/(float)i;
// it should be impossible that this can be over 1.0
// but might happen due to hash collisions
if ( ratio > 1.0 ) ratio = 1.0;
// restrict the range we can weight a word or phrase
// based on the word count
//float r = 1.0;
//if ( i >= 20 ) r = 2.1;
//else if ( i >= 10 ) r = 1.8;
//else if ( i >= 4 ) r = 1.5;
//else r = 1.3;
g_ptab[i][k] = 1.00;
g_wtab[i][k] = 1.00;
if ( i <= 1 ) continue;
// . we used to have a sliding bar between 0.0 and 1.0.
// word is weighted (1.0 - x) and phrase is weighted
// by (x). however, x could go all the way to 1.0
// even when i = 2, so we need to restrict x.
// . x is actually "ratio"
// . when we have 8 or less word occurrences, do not
// remove more than 80% of its score, a 1/5 penalty
// is good enough for now. but for words that occur
// a lot in the link text or pwids, go to town...
if ( i <= 2 && ratio >= .50 ) ratio = .50;
else if ( i <= 4 && ratio >= .60 ) ratio = .60;
else if ( i <= 8 && ratio >= .80 ) ratio = .80;
else if ( i <= 12 && ratio >= .95 ) ratio = .95;
// round up, so many "new mexico" phrases but only
// make it up to 95%...
if ( ratio >= .95 ) ratio = 1.00;
// if word's phrase is repeated 3 times or more then
// is a pretty good indication that we should weight
// the phrase more and the word itself less
//if ( k >= 3 && ratio < .90 ) ratio = .90;
// compute the weights
float pw = 2.0 * ratio;
float ww = 2.0 * (1.0 - ratio);
// . punish words a little more
// . if we got 50% ratio, words should not get as much
// weight as the phrase
//ww *= .45;
// do not weight to 0, no less than .15
if ( ww < 0.0001 ) ww = 0.0001;
if ( pw < 0.0001 ) pw = 0.0001;
// do not overpromote either
if ( ww > 2.50 ) ww = 2.50;
if ( pw > 2.50 ) pw = 2.50;
// . do a sliding weight of the weight
// . a "ww" of 1.0 means to do no weight
// . can't do this for ww cuz we use "mod" below
//float newWW = s_fsp*ww + (1.0-s_fsp)*1.00;
float newPW = s_fsp*pw + (1.0-s_fsp)*1.00;
// limit how much we promote a word because it
// may occur 30 times total, but have a phrase count
// of only 1. however, the other 29 times it occurs it
// is in the same phrase, just not this particular
// phrase.
//if ( ww > 2.0 ) ww = 2.0;
g_wtab[i][k] = ww;
g_ptab[i][k] = newPW;
//logf(LOG_DEBUG,"build: wc=%"INT32" pc=%"INT32" ww=%.2f "
//"pw=%.2f",i,k,g_wtab[i][k],g_ptab[i][k]);
}
}
}
int32_t phrcount1 = 0;
int32_t phrcount2 = 0;
int32_t wrdcount1 = 0;
int32_t wrdcount2 = 0;
if ( tt1->m_numSlotsUsed > 0 ) {
if (pid1) phrcount1 = tt1->getScore(&pid1);
if (pid2) phrcount2 = tt1->getScore(&pid2);
if (wid1) wrdcount1 = tt1->getScore(&wid1);
if (wid2) wrdcount2 = tt1->getScore(&wid2);
}
// if we are always ending the same phrase, like "Mexico"
// in "New Mexico"... get the most popular phrase this word is
// in...
int32_t phrcountMax = phrcount1;
int32_t wrdcountMin = wrdcount1;
// these must actually exist to be part of the selection
if ( pid2 && phrcount2 > phrcountMax ) phrcountMax = phrcount2;
if ( wid2 && wrdcount2 < wrdcountMin ) wrdcountMin = wrdcount2;
// . but if we are 'beds' and in a popular phrase like 'dog beds'
// there maybe a lot of other phrases mentioned that have 'beds'
// in them like 'pillow beds', 'pet beds', but we need to assume
// that is phrcountMax is high enough, do not give much weight to
// the word... otherwise you can subvert this algorithm by just
// adding other random phrases with the word 'bed' in them.
// . BUT, if a page has 'X beds' with a lot of different X's then you
// still want to index 'beds' with a high score!!! we are trying to
// balance those 2 things.
// . do this up here before you truncate phrcountMax below!!
float mod = 1.0;
if ( phrcountMax <= 6 ) mod = 0.50;
else if ( phrcountMax <= 8 ) mod = 0.20;
else if ( phrcountMax <= 10 ) mod = 0.05;
else if ( phrcountMax <= 15 ) mod = 0.03;
else mod = 0.01;
// scale wrdcount1/phrcountMax down for the g_wtab table
if ( wrdcount1 > 29 ) {
float ratio = (float)phrcountMax / (float)wrdcount1;
phrcountMax = (int32_t)((29.0 * ratio) + 0.5);
wrdcount1 = 29;
}
if ( phrcountMax > 29 ) {
float ratio = (float)wrdcount1 / (float)phrcountMax;
wrdcount1 = (int32_t)((29.0 * ratio) + 0.5);
phrcountMax = 29;
}
// . sanity check
// . neighborhood.cpp does not always have wid/pid pairs
// that match up right for some reason... so we can't do this
//if ( phrcount1 > wrdcount1 ) { char *xx = NULL; *xx = 0; }
//if ( phrcount2 > wrdcount2 ) { char *xx = NULL; *xx = 0; }
// apply the weights from the table we computed above
*ww = mod * g_wtab[wrdcount1][phrcountMax];
// slide it
*ww = s_fsp*(*ww) + (1.0-s_fsp)*1.00;
// ensure we do not punish too hard
if ( *ww <= 0.0 ) *ww = 0.01;
/*
if ( phrcountMax >= 0 ) {
int64_t sh = getPrefixHash ( (char *)NULL , 0 , NULL , 0 );
int64_t tid = g_indexdb.getTermId ( sh , wid1 );
logf(LOG_DEBUG,"build: phrcountMax=%"INT32" wrdCount1=%"INT32" "
"*ww=%.4f for word with tid=%"UINT64"",
phrcountMax,wrdcount1,(float)*ww,tid);
//if ( phrcountMax < 10 && tid == 16944700235015LL )
// log("hey");
}
*/
// sanity check
//if ( *ww == 0.0 ) { char *xx = NULL; *xx = 0; }
// scale wrdcountMin/phrcount down for the g_ptab table
if ( wrdcountMin > 29 ) {
float ratio = (float)phrcount2 / (float)wrdcountMin;
phrcount2 = (int32_t)((29.0 * ratio) + 0.5);
wrdcountMin = 29;
}
if ( phrcount2 > 29 ) {
float ratio = (float)wrdcountMin / (float)phrcount2;
wrdcountMin = (int32_t)((29.0 * ratio) + 0.5);
phrcount2 = 29;
}
// . if the word is Mexico in 'New Mexico good times' then
// phrase term #i which is, say, "Mexico good" needs to
// get the min word count when doings its word to phrase
// ratio.
// . it has two choices, it can use the word count of
// "Mexico" or it can use the word count of "good".
// . say, each is pretty high in the document so the phrase
// ends up getting penalized heavily, which is good because
// it is a nonsense phrase.
// . if we had "united socialist soviet republic" repeated
// a lot, the phrase "socialist soviet" would score high
// and the individual words would score low. that is good.
// . try to seek the highest weight possible for this phrase
// by choosing the lowest word count possible
// . NO LONGER AFFECT phrase weights because just because the
// words occur a lot in the document and this may be the only
// occurrence of this phrase, does not mean we should punish
// the phrase. -- MDW
if ( titleRecVersion >= 88 ) {
*pw = 1.0;
return;
}
// do it the old way...
*pw = g_ptab[wrdcountMin][phrcount2];
// sanity check
if ( *pw == 0.0 ) { char *xx = NULL; *xx = 0; }
}
// . compute the word weight, m_ww[i], and the phrase weight, m_pw[i], of
//   every one of the m_nw "words" by applying the numbered RULEs below
// . a "word" is either a tag (tids[i] != 0), an alnum word (wids[i] != 0)
//   or a run of punctuation (neither)
// . first pass : call setPunctWeights() on every punctuation word (RULE #4)
// . second pass: scan the words in order while tracking which tags, parens,
//   sections and sentence fragments we are in, and scale the weights
// . "words"   supplies the tag ids, word ids, word ptrs and word lengths
// . "phrases" supplies the phrase ids and the # of words in each phrase
// . "bits"    is only used to ask whether a word is capitalized
// . "isPreformattedText" starts us off as if inside a <pre> tag, in which
//   case two or more newlines in a punctuation word end the section
// . returns true right away, doing nothing, if m_isLinkText or
//   m_isCountTable is set
// . cores if m_countTablePtr is NULL
// . returns false if a hash table could not be initialized or added to,
//   true otherwise
// . if m_rvw/m_rvp are set we also record, per word and per rule, the
//   factor that rule applied (for debugging)
bool Weights::set1 ( Words   *words              ,
		     Phrases *phrases            ,
		     Bits    *bits               ,
		     bool     isPreformattedText ) {

	// nothing to do for these
	if ( m_isLinkText   ) return true;
	if ( m_isCountTable ) return true;
	// we need the count table for RULE #18 below
	if ( ! m_countTablePtr ) { char *xx = NULL; *xx = 0; }

	nodeid_t  *tids  = words->getTagIds ();
	int64_t   *wids  = words->getWordIds ();
	char     **wptrs = words->m_words;
	int32_t   *wlens = words->m_wordLens;
	int64_t   *pids  = phrases->getPhraseIds();
	int32_t    tid   = 0;

	// these punct weights are used for RULE #4
	for ( int32_t i = 0 ; i < m_nw ; i++ ) {
		// skip alnum words
		if ( wids[i] ) continue;
		// skip tags
		if ( tids && tids[i] ) continue;
		// just set weight for punct "words"
		setPunctWeights ( i , wptrs[i] , wlens[i] );
		// yield
		QUICKPOLL ( m_niceness );
	}

	// . remembers which word ids already got the title boost so that
	//   a word repeated in the title is only boosted once
	// . it uses the stack buffer below for its storage
	//HashTableT<uint64_t,char> titleRepeatTable;
	HashTableX titleRepeatTable;
	// const so that "buf" is a plain fixed-size array and not a
	// variable-length array, which is not standard C++
	const int32_t bufSize = 1024 * 20; // 10 bytes per record
	char buf[bufSize];
	// false = allowDupKeys
	titleRepeatTable.set(8,0,2048,buf,bufSize,false,m_niceness,"titlerpt");

	// . state flags for the scan
	// . the tag-driven ones are set through tagPtrs[] below and hold a
	//   non-zero value while we are inside that kind of tag
	char inScript  = 0;
	char inStyle   = 0;
	char inSelect  = 0;
	char inTitle   = 0;
	char inMarquee = 0;
	char inLink    = 0;
	char inQuotes  = 0;
	char inParens  = 0;
	char inHeader  = 0;
	char inItalic  = 0;
	char inBold    = 0;
	char inList    = 0;
	char inPre     = 0;
	char afterNum  = 0;
	char afterQuote= 0;
	// how many words we have seen in headers, title, bold/italic, parens
	int32_t hcount  = 0;
	int32_t tcount  = 0;
	int32_t bcount  = 0;
	int32_t pcount  = 0;
	// how many consecutive <br> tags we have seen
	int32_t brcount = 0;
	int32_t qcount  = 0;
	// start with fragNum 1 since TermTable::addScore() does not like
	// scores of 0
	int32_t fragNum = 1;
	int32_t fragPos = 0;

	if ( isPreformattedText ) inPre = 1;

	// section vars
	int32_t sectionStart     = 0;
	int32_t sectionWordCount = 0;
	int32_t sectionWeightSum = 0;
	// start with section 1 since TermTable::addScore() does not like
	// scores of 0
	int32_t section = 1;

	// . comma vars
	// . NOTE(review): commaWordCount is never incremented anywhere in
	//   this function, so it is always 0 -- confirm that is intended
	int32_t commaWordCount     = 0;
	int32_t lastCommaWordCount = 0;
	int32_t lastCommai         = 0;

	// a lone hyphen "word" has this length
	char charLen = 1;

	// . we use a hash table for:
	// . 1. for hashing all the phrase termids (phrase ids) into the term
	//      table so we can see if the phrase is unique or not. words that
	//      are in phrases that are repeated a lot will have their scores
	//      demoted, likewise, such phrase scores will be promoted. Now
	//      we also hash the wordids into here so we can look at the
	//      phrase count to word count ratio. (tt1)
	// . 2. discounting multiple occurrences of a word in the same
	//      section. (tt2)
	// . 3. for seeing in what section a word last occurred in, because
	//      we boost the words score when it is spread out over many
	//      different sections (tt3)
	// . 4. discounting multiple occurrences of a word in the same
	//      sentence/fragment. (tt4)
	//TermTable *tt1 = m_countTablePtr;
	HashTableX tt2;
	HashTableX tt3;
	HashTableX tt4;
	// allow for 33% of the slots to be empty to avoid excessive chaining
	if ( ! tt2.set ( 8,4,(int32_t)(m_nw * 4),NULL,0,false,m_niceness,
			 "weight2") )
		return false;
	if ( ! tt3.set ( 8,4,(int32_t)(m_nw * 4),NULL,0,false,m_niceness,
			 "weight3") )
		return false;
	// this is for all words and phrases
	if ( ! tt4.set ( 8,4,(int32_t)(m_nw * 4),NULL,0,false,m_niceness,
			 "weight4") )
		return false;

	// . set the tag ptrs
	// . tagPtrs[tagId] points to the in* flag that tag turns on and off,
	//   or is NULL if the tag has no such flag
	char *tagPtrs[256];
	memset ( tagPtrs , 0, 256 * sizeof(char *) );
	// sanity check - let user know to increase this over 256
	if ( getNumXmlNodes() > 256 ) { char *xx = NULL; *xx = 0; }
	// set our quickie array
	tagPtrs[ 43] = &inHeader;
	tagPtrs[ 44] = &inHeader;
	tagPtrs[ 45] = &inHeader;
	tagPtrs[ 46] = &inHeader;
	tagPtrs[ 47] = &inHeader;
	tagPtrs[ 48] = &inHeader;
	tagPtrs[105] = &inList;
	tagPtrs[ 79] = &inPre;
	tagPtrs[ 83] = &inScript;
	tagPtrs[111] = &inStyle;
	tagPtrs[ 84] = &inSelect;
	tagPtrs[ 65] = &inMarquee;
	tagPtrs[101] = &inTitle;
	tagPtrs[  2] = &inLink;
	tagPtrs[ 52] = &inItalic;
	tagPtrs[ 10] = &inBold;
	tagPtrs[ 89] = &inBold; // strong

	int32_t i = -1;
	// . working copies of the title and header weights
	// . they are decayed below as we see more title/header words
	int32_t tiw = m_titleWeight;
	int32_t hdw = m_headerWeight;

	// sanity check, we cannot support this as we are now since we
	// subtract 42 from the tagId before setting tagPtrs[x] to it
	// (see below)
	if ( tagPtrs[42] ) { char *xx = NULL; *xx = 0; }

	// we do not use a *for* loop in order to save a level of indentation
 loop:
	if ( ++i >= m_nw )
		return true;

	// yield
	QUICKPOLL ( m_niceness );

	//
	// . is the word a "tag"?
	// . 2.3 ms of the 7 ms
	//
	if ( tids && tids[i] ) {
		// get the tag id of the ith "word"
		tid = tids[i];
		// it it a front or back tag? (back tags start with "</")
		char isFront;
		if ( tid & BACKBIT ) isFront = 0;
		else                 isFront = 1;
		// clear that hi bit
		tid &= BACKBITCOMP;
		// debug
		//if ( tid == 43 ) {
		//	log("got it");
		//	sleep(5);
		//}
		// set the in* thang
		if ( tagPtrs[tid] ) {
			// . this - 42 is to support inHeader
			// . it makes tag ids 43..48 map to inHeader 1..6
			// . for the other flags it is just a non-zero value
			if ( isFront ) *tagPtrs[tid] = tid - 42;
			// exit the tag
			else           *tagPtrs[tid] = 0;
		}
		// is it a <br> tag?
		else if ( tid == 20 ) {
			// cancel this flag
			afterNum = false;
			// is there a 2nd, 3rd ... consecutive <br> tag?
			if ( brcount > 0 ) {
				// if so, any phrase containing this tag "word"
				// will have a score of 0
				m_pw[i] = 0;
				goto loop;
			}
			// ok, we only have one <br> so far
			brcount++;
			// do not affect the score of the phrase containing us
			m_pw[i] = DW;
			goto loop;
		}
		// . a phrase can contain any "non-breaking" tag, so make sure
		//   that m_pw[i], the phrase weight, is the normal 128, so
		//   we can include this tag in the phrase without penalty
		// . see XmlNode.cpp for the list of g_nodes
		if ( ! g_nodes[tid].m_isBreaking ) {
			m_pw[i] = DW; goto loop; }
		// hey, if we just got a header tag
		// cancel this flag with any breaking tag
		afterNum = false;

		// . come here when we hit a word that ends our current section
		// . the punctuation code below jumps here too when it sees
		//   two or more newlines in preformatted text
	newSection:
		// reset the attribute flags if we encounter a breaking tag
		inBold   = 0;
		inItalic = 0;
		// . do not reset this! we might have just had a header tag
		//   right above and just got done setting this to 1!!!!
		// . we have to reset to delete old stuff cleanly!
		if ( m_version <= 85 ) inHeader = 0;
		// and reset the thing for comma-separated list detection
		lastCommaWordCount = -999;
		// . RULE #6
		// . adjust word/phrase scores of this section that just ended
		// . do not count words in links for "sectionWordCount"
		// . these are based on default weights of 128
		// . it is not necessarily bad for the word to be in a short
		//   fragment, like when searching for 'denver weather', for
		//   example, you want to find those words in short header
		//   fragments. so do not demote weight for short frags so much
		float sw;
		if      ( sectionWordCount <  10 ) sw = 0.8;
		else if ( sectionWordCount <  30 ) sw = 1.1;
		else if ( sectionWordCount <  50 ) sw = 1.2;
		else if ( sectionWordCount < 100 ) sw = 1.3;
		else if ( sectionWordCount < 150 ) sw = 1.4;
		else if ( sectionWordCount < 200 ) sw = 1.5;
		else                               sw = 1.6;
		// TODO: get the average word score of the lowest scoring
		// half of all words in this section
		for ( int32_t j = sectionStart ; j < i ; j++ ) {
			// skip if not a word
			if ( ! wids[j] ) continue;
			// hash the word and phrase ids with the section num
			int64_t hw = wids[j] * (int64_t)section;
			int64_t hp = pids[j] * (int64_t)section;
			// . do not demote first occurrence of word in a
			//   low-scoring section, but only demote successive
			//   occurrences in low-scoring sections.
			//   This allows for headers in separate sections.
			// . get the special hash
			// . NOTE(review): sw is between 0.8 and 1.6 here, so
			//   whether "sw >= DW" can ever be true depends on
			//   the value of DW, which is defined elsewhere --
			//   confirm this is not meant to be "sw >= 1.0"
			if ( tt2.getScore(&hw) > 1 || sw >= DW ) {
				m_ww[j] = (int32_t)(m_ww[j] * sw);
				// . debug purposes
				// . index by j, the word we just scaled, and
				//   not by i, the word that ended the section
				if ( m_rvw ) m_rvw [j*MAX_RULES+6]*=(float)sw;
			}
			if ( tt2.getScore(&hp) > 1 || sw >= DW ) {
				m_pw[j] = (int32_t)(m_pw[j] * sw);
				// debug purposes, index by j here too
				if ( m_rvp ) m_rvp [j*MAX_RULES+6]*=(float)sw;
			}
			// add to the hash table
			if ( ! tt2.addTerm(&hw) ) return false;
			if ( ! tt2.addTerm(&hp) ) return false;
		}
		// start a new section with word #i
		sectionStart = i;
		// count words in this section
		sectionWordCount = 0;
		// accumulate their scores so we can average them
		sectionWeightSum = 0;
		// new fragment as well
		fragNum++;
		fragPos = 0;
		// new section
		section++;
		goto loop;
	}

	// . RULE #23
	// . do not index anything in these tags
	if ( inScript  ) goto loop;
	if ( inStyle   ) goto loop;
	if ( inMarquee ) goto loop;

	char    *w    = wptrs[i];
	int32_t  wlen = wlens[i];

	//
	// is the word a "punct word"? if so, wids[i] will be zero.
	// . 0.7 ms of the 7 ms
	//
	if ( ! wids[i] ) {
		// . these are similar to Bits.cpp functions
		// . this is now called above
		//setPunctWeights ( i , w , wlen );
		//m_pw[i] = getPunctPhraseWeight (i, w,wlen);
		// point to the word
		char *p ;
		char *wend = w + wlen;
		char  hasComma = false;
		int32_t nlcount = 0;
		// . scan the chars in the word
		// . this might not work super well for unicode... :(
		for ( p = w ; p < wend ; p++ ) {
			// RULE #3
			if ( *p == '\n' ) { nlcount++ ; continue;}
			if ( *p == ' '  ) continue;
			//if ( ! specialTab[*p] ) continue;
			if ( *p == '('  ) { inParens = 1; pcount =0; continue;}
			if ( *p == '['  ) { inParens = 1; pcount =0; continue;}
			if ( *p == '{'  ) { inParens = 1; pcount =0; continue;}
			if ( *p == ')'  ) { inParens = 0; continue;}
			if ( *p == ']'  ) { inParens = 0; continue;}
			if ( *p == '}'  ) { inParens = 0; continue;}
			if ( *p == '\"' ) {
				if ( inQuotes ) { inQuotes = 0; qcount = 0; }
				else            { inQuotes = 1; qcount = 0; }
				continue;
			}
			// . do not do single quote yet, it could be apostrophe
			// . NOTE(review): this test can never be true because
			//   the identical test right above always does a
			//   "continue", so afterQuote is never set to true in
			//   this function and RULE #17 below never fires.
			//   left as is because enabling it changes the scores
			if ( *p == '\"' ) { afterQuote = true; continue; }
			if ( *p == ','  ) { hasComma = true; continue; }
			// . did we end a fragment? just check for a period
			// . this should also check for !'s and ?'s
			if ( *p != '.' && *p != '!' && *p != '?' ) continue;
			// do not allow abbreviations to break sentences/frags
			if ( *w == '.' && i>0 && wids[i-1] &&isAbbr(wids[i-1]))
				break;
			fragNum++;
			fragPos = 0;
		}
		// in pre-formatted text we have no tags, so consider
		// two new lines a breaking tag
		if ( inPre && nlcount >= 2 ) goto newSection;
		//
		// demote words/phrases in comma-separated lists
		//
		// . RULE #10
		// . if we have no comma get the next "word"
		if ( ! hasComma ) goto loop;
		// . if previous comma was nearby, tell all the words in
		//   between they are in a comma separated list
		// . TODO: reset these to 0 when section/fragment changes
		//   or when we hit a breaking tag
		if ( commaWordCount - lastCommaWordCount > 5 ) goto loop;
		// reduce the scores of all in between these two commas
		int32_t j = lastCommai - 2;
		if ( j < 0 ) j = 0;
		for ( ; j < i ; j++ ) {
			// only demote scores of alnum words
			if ( ! wids[j] ) continue;
			// do not hit it too hard since we also demote words
			// and phrase weights that are next to punctuation
			// words in the set2() function above
			m_ww[j] = (int32_t)(m_ww[j] * .75);
			//m_pw[j]=(int32_t)(m_ww[j] * .75);
			m_pw[j] = (int32_t)(m_pw[j] * .75);
			// debug purposes
			if ( m_rvw ) {
				m_rvw [j*MAX_RULES+10] *= (float).75;
				m_rvp [j*MAX_RULES+10] *= (float).75;
			}
		}
		// save the new comma state
		lastCommaWordCount = commaWordCount;
		lastCommai         = i;
		goto loop;
	}

	//
	// the word is an "alnum word"
	//

	// reset brcount
	brcount = 0;

	//if ( w[0]=='T' && w[1]=='h' && w[2]=='e' ) // && w[3]=='l' )
	//	log("hey");

	// assume a normal weight
	m_ww[i] = DW;
	// and this
	if ( m_rvw ) {
		// init all to 1.0
		float *start = &m_rvw[i*MAX_RULES];
		float *end   = &m_rvw[i*MAX_RULES+MAX_RULES];
		for ( ; start < end ; start++ ) *start = 1.0;
	}

	// . RULE #23
	// . minimize the weight if in a select tag
	if ( inSelect ) {
		m_ww[i] = 2;
		// debug purposes
		if ( m_rvw ) m_rvw[i*MAX_RULES+23] = 2.0 / (float)DW;
	}

	// . RULE #3
	// . i don't like search results that have the matching words in a
	//   parenthetical usually
	// . but only demote the first 40 words in ()'s, otherwise we might
	//   end up demoting everything because we have an open parentheses bug
	//   in the html source
	if ( inParens && pcount++ < 40 ) {
		m_ww[i] /= 4;
		// debug purposes
		if ( m_rvw ) m_rvw[i*MAX_RULES+3] *= 0.25;
	}

	// . RULE #8
	// . or in a <ul> unnumbered list...
	// . actually if each bullet point in the list is beefy it is pretty
	//   good, so be careful...
	if ( inList ) {
		m_ww[i] = (int32_t)(m_ww[i] * 0.75);
		// debug purposes
		if ( m_rvw ) m_rvw[i*MAX_RULES+8] *= 0.75;
	}

	// in quotes is usually not that good, unless it is
	// a long quotation!!! TODO
	//if ( inQuotes && qcount++ < 40 ) m_ww[i] /= 2;

	// . RULE #7
	// . in a header tag?
	// . only boost first 40 words, though
	// support version 85 and below for backwards compatibility
	/*
	if ( inHeader && m_version <= 85 && hcount++ < 40 ) {
		// <h1> tag
		if      ( inHeader == 1 ) m_ww[i] *= 3;
		// <h2> tag
		else if ( inHeader == 2 ) m_ww[i] *= 2;
		// <h3> tag
		else if ( inHeader == 3 ) m_ww[i] = (int32_t)(m_ww[i] * 1.5);
		// others
		else                      m_ww[i] = (int32_t)(m_ww[i] * 1.2);
	}
	*/
	// . hdw is a percentage
	// . the deeper the header level the bigger hdw has to be before
	//   the word gets any boost from it
	if ( inHeader && m_version > 85 ) { // && hcount++ < 40 ) {
		// <h1> tag
		if ( inHeader == 1 ) {
			if ( hdw > 100 ) m_ww[i] = (m_ww[i] * hdw) / 100;
			// debug purposes
			if ( m_rvw && hdw > 100 )
				m_rvw[i*MAX_RULES+7] *= (float)hdw/100.0;
		}
		// <h2> tag
		else if ( inHeader == 2 ) {
			if ( hdw > 150 ) m_ww[i] = (m_ww[i] * hdw) / 150;
			// debug purposes
			if ( m_rvw && hdw > 150 )
				m_rvw[i*MAX_RULES+7] *= (float)hdw/150.0;
		}
		// <h3> tag
		else if ( inHeader == 3 ) {
			if ( hdw > 200 ) m_ww[i] = (m_ww[i] * hdw) / 200;
			// debug purposes
			if ( m_rvw && hdw > 200 )
				m_rvw[i*MAX_RULES+7] *= (float)hdw/200.0;
		}
		// others
		else {
			if ( hdw > 300 ) m_ww[i] = (m_ww[i] * hdw) / 300;
			// debug purposes
			if ( m_rvw && hdw > 300 )
				m_rvw[i*MAX_RULES+7] *= (float)hdw/300.0;
		}
		// . degrade it by .90 each time
		// . but only start degrading once we have seen 30 words
		//   in headers
		if ( ++hcount >= 30 ) hdw = (int32_t)((float)hdw * .90);
	}

	// . RULE #7
	// . titleWeight is a percentage
	if ( inTitle ) {
		// support old rule for backwards compatibility
		//if ( m_version <= 85 && tcount++ < 20 )
		//	m_ww[i] = ( m_ww[i] * m_titleWeight ) / 100;
		// is it a title repeat?
		bool repeated = false;
		if ( m_titleWeight>100 && !titleRepeatTable.isEmpty(&wids[i]))
			repeated=true;
		// only use this for newer versions, 89 and above
		//if ( m_version < 89 ) repeated = false;
		// only add to repeat table
		if ( tiw>100 && ! repeated )
			if ( ! titleRepeatTable.addKey (&wids[i]) )
				return false;
		//m_ww[i] = ( m_ww[i] * m_titleWeight ) / 100;
		// do not allow the same term to be repeated and receive the
		// title boost both times!
		if ( tiw > 100 && ! repeated ) {
			m_ww[i] = (m_ww[i] * tiw) / 100;
			// debug purposes
			if ( m_rvw ) m_rvw[i*MAX_RULES+7] *=((float)tiw)/100.0;
		}
		// . degrade it by .8 each time
		// . but only start degrading the 4th term in title
		if ( ++tcount >= 3 ) tiw = (int32_t)((float)tiw * .80);
		// do not weight for italics or bold now either
		goto skipBoldWeight;
	}

	// . RULE #5
	// . or in a link
	// . words in links are done after this, none of the rules below
	//   are applied to them
	if ( inLink ) {
		//if ( m_version <= 85 ) m_ww[i] /= 3;
		m_ww[i] /= 5;
		// save that
		if ( m_rvw ) m_rvw[i*MAX_RULES+5] *= .20;
		// inherit the weight here
		m_pw[i] = m_ww[i];
		// debug purposes
		if ( m_rvw ) {
			gbmemcpy ( &m_rvp[i*MAX_RULES] ,
				   &m_rvw[i*MAX_RULES] ,
				   MAX_RULES * sizeof(float) );
		}
		// reset this, it is only for immediate quotes
		afterQuote = false;
		fragPos++;
		goto loop;
	}

	// . RULE #7
	// . use same count, bcount, for both these
	if ( inBold && bcount++ < 20 ) {
		m_ww[i] = (int32_t)(m_ww[i] * 1.5);
		// debug purposes
		if ( m_rvw ) m_rvw[i*MAX_RULES+7] *= 1.5;
	}
	else if ( inItalic && bcount++ < 20 ) {
		m_ww[i] = (int32_t)(m_ww[i] * 1.5);
		// debug purposes
		if ( m_rvw ) m_rvw[i*MAX_RULES+7] *= 1.5;
	}

	// apply body weight here
 skipBoldWeight:

	// . RULE #24
	// . if we are following a number, demote us
	// . TODO: make this a numbered list detector!! more accurate.
	if ( afterNum && ! is_digit(wptrs[i][0]) ) {
		m_ww[i] = (int32_t)(m_ww[i] * 0.5);
		// debug purposes
		if ( m_rvw ) m_rvw[i*MAX_RULES+24] *= 0.5;
	}
	// are we a number?
	if ( is_digit(wptrs[i][0]) ) afterNum = true;
	else                         afterNum = false;
	// if a number follow us, demote us, too
	if ( i+2<m_nw && is_digit(wptrs[i+2][0]) ) {
		m_ww[i] = (int32_t)(m_ww[i] * 0.5);
		// debug purposes
		if ( m_rvw ) m_rvw[i*MAX_RULES+24] *= 0.5;
	}

	// . RULE #13
	// . is previous occurrence in a different section?
	// . this is -1 if no previous occurrence
	int32_t lastSection = tt3.getScore ( &wids[i] );
	// . if this is the first occurrence ever, boost it
	// . if the last occurrence was not this section, boost it
	// . this rewards uniformly distributed words.
	if ( lastSection != section ) {
		m_ww[i] = (int32_t)(m_ww[i]*1.3);
		// debug purposes
		if ( m_rvw ) m_rvw[i*MAX_RULES+13] *= 1.3;
		// subtract lastSection from the score since TermTable will
		// accumulate the score
		if (!tt3.addTerm(&wids[i],section-lastSection))
			return false;
	}

	// . RULE #16
	// . if at beginning of sentence/fragment, boost
	// . the factor slides from 200/150 for the first word of the
	//   fragment down to 151/150 for the 50th
	if ( fragPos < 50 ) {
		m_ww[i] = (int32_t)(m_ww[i] * ((200.0 - (float)fragPos)/150.0));
		// debug purposes
		if ( m_rvw ) m_rvw[i*MAX_RULES+16] *=
				     ((200.0 - (float)fragPos)/150.0);
	}
	//if      ( fragPos <=  4 ) m_ww[i] = (int32_t)(m_ww[i] * 1.30);
	//else if ( fragPos <=  7 ) m_ww[i] = (int32_t)(m_ww[i] * 1.20);
	//else if ( fragPos <= 13 ) m_ww[i] = (int32_t)(m_ww[i] * 1.10);

	// . RULE #4
	// . if word adjacent to "bad" punct, demote it
	//if ( i >0 && !wids[i-1] && !tids[i-1] && !bits->canPairAcross(i-1))
	//	m_ww[i] = (int32_t)(m_ww[i] * .75);
	//if ( i+1<nw && !wids[i+1] && !tids[i+1] && !bits->canPairAcross(i+1))
	//	m_ww[i] = (int32_t)(m_ww[i] * .75);

	//
	// phrase weight inherits word weight at this point
	//
	m_pw[i] = m_ww[i];
	// debug purposes
	if ( m_rvw )
		gbmemcpy ( &m_rvp[i*MAX_RULES] ,
			   &m_rvw[i*MAX_RULES] ,
			   MAX_RULES * sizeof(float) );

	// . RULE #9
	// . if this is NOT the first time this word has occurred in this
	//   particular fragment/sentence, then quarter its weight
	// . int64_t h = hash64 ( wids[i] , fragNum );
	int64_t h = wids[i] * (int64_t)fragNum;
	if ( tt4.getScore(&h)) {
		m_ww[i] = (int32_t)(m_ww[i]*.25);
		// debug purposes
		if ( m_rvw ) m_rvw[i*MAX_RULES+9] *= .25;
	}
	if ( ! tt4.addTerm ( &h ) ) return false;
	// same thing for the phrase this word starts, if any
	if ( pids[i] ) {
		h = pids[i] * (int64_t)fragNum;
		if ( tt4.getScore(&h)) {
			m_pw[i] = (int32_t)(m_pw[i]*.25);
			if ( m_rvw ) m_rvp[i*MAX_RULES+9] *= .25;
		}
		if ( ! tt4.addTerm ( &h ) ) return false;
	}

	// . keep track of the average word weight for this section
	// . do not count weight influencers below since they are mostly
	//   phrase specific
	if ( ! inLink ) {
		sectionWordCount++;
		sectionWeightSum += m_ww[i];
	}

	// does word have a hyphen immediately on left or right of it?
	char leftHyphen  = false;
	char rightHyphen = false;
	char promoted    = false;
	if (i-1>=0   && wptrs[i-1][0]=='-' && wlens[i-1]==charLen )
		leftHyphen  = true;
	if (i+1<m_nw && wptrs[i+1][0]=='-' && wlens[i+1]==charLen )
		rightHyphen = true;

	// . are we capitalized?
	// . only alnum words can be capitalized
	// . returns false if any letter besides the first is capitalized
	char iscap  = bits->isCap(i) ;
	// is word to the left capitalized?
	char iscap1 = 0;
	// is word to the right capitalized?
	char iscap2 = 0;
	// do not set if not well connected (phrase weight is low)
	if ( i-2 >= 0   && m_pw[i-1] >= DW && wids[i-2] )
		iscap1 = bits->isCap(i-2);
	// m_pw[i+1] should have been set at the top of set1()
	if ( i+2 < m_nw && m_pw[i+1] >= DW && wids[i+2] )
		iscap2 = bits->isCap(i+2);

	// RULE #11
	// demote word if immediate left word if it is capitalized but so
	// is the word on the immediate left or right
	if ( iscap && (iscap1 || iscap2) ) {
		m_ww[i] /= 2;
		// debug purposes
		if ( m_rvw ) m_rvw[i*MAX_RULES+11] *= .50;
	}
	// . RULE #11
	// . if an "isolated" cap promote the word
	// . only do this if NOT the beginning of a sentence, it
	//   has to have a space and then word on the immediate left
	else if ( iscap && i-2>=0 &&wids[i-2] && wlens[i-1]==1) {
		m_ww[i] *= 2;
		// debug purposes
		if ( m_rvw ) m_rvw[i*MAX_RULES+11] *= 2.0;
	}

	// . RULE #18
	// . if the phrase this word starts is capitalized and
	//   so is the ending word of the phrase, promote the phrase.
	// . if word has hyphen to right, do not demote.
	// . if phrase contains junk like commas, it will be demoted
	//   via RULE #4
	int32_t nwp = phrases->getNumWordsInPhrase(i);
	if ( iscap && pids[i] && nwp>0 && bits->isCap(i + nwp - 1) ) {
		m_pw[i] = (int32_t)(m_pw[i] * 2.0);
		if ( m_rvw ) m_rvp[i*MAX_RULES+18] *= 2.0;
		promoted = true;
	}
	// . RULE #18
	// . if phrase is not all in caps, and not after a quote, and does
	//   not contain a hyphen after the first word, demote it
	else if ( ! afterQuote && ! rightHyphen ) {
		// only demote if phrase occurred only once or twice
		int32_t count = m_countTablePtr->getScore ( &pids[i] );
		if ( count <= 1 ) {
			m_pw[i] = (int32_t)(m_pw[i] * 0.1);
			if ( m_rvw ) m_rvp[i*MAX_RULES+18] *= 0.1;
		}
		else if ( count == 2 ) {
			m_pw[i] = (int32_t)(m_pw[i] * 0.4);
			if ( m_rvw ) m_rvp[i*MAX_RULES+18] *= 0.4;
		}
	}

	// . RULE #17
	// . if this phrase is immediately preceeded by double quote then
	//   promote it. but only if not promoted from rule #18 above.
	if ( afterQuote && ! promoted ) {
		m_pw[i] = (int32_t)(m_pw[i] * 2.0);
		if ( m_rvw ) m_rvp[i*MAX_RULES+17] *= 2.0;
	}
	// reset this, it is only for immediate quotes
	afterQuote = false;

	// RULE #25
	// if one of the boundary words in a phrase is next to one and only one
	// hyphen, and that hyphen is not in the phrase, then demote the
	// phrase. if we have a phrase like like "spam-free email" we should
	// demote the phrase "free email". it splits a STRONG_CONNECTOR.
	// '22-year-old man' should demote "old man".
	if ( leftHyphen && ! rightHyphen ) {
		m_pw[i] = (int32_t)(m_pw[i] * 0.25);
		if ( m_rvw ) m_rvp[i*MAX_RULES+25] *= 0.25;
	}

	// is word part of a common phrase? like "san francisco"?
	// we have a file of these commonPhrases.dat
	//if ( isInCommonPhrase(i) ) {
	//	m_ww[i] /= 3;
	//	m_pw[i] *= 3;
	//}

	// TODO: REMOVE REPEATED FRAGMENTS SO THE BELOW CODE IS BETTER!!
	// we are getting the headline repeated...

	// do not demote all the way to zero, we still want to index it
	// and when normalized on a 100 point scale, like when printed
	// out by PageParser.cpp, a score of 1 here gets normalized to
	// 0, so make sure it is at least 2.
	if ( m_ww[i] < 2 ) m_ww[i] = 2;

	//if ( m_pw[i] > m_ww[i] )
	//	log("hey");

	//if ( isInSpammedFrag(i) ) {
	//	m_ww[i] = 0;
	//	m_pw[i] = 0;
	//}

	// this is the alnum word # in the current sentence fragment
	fragPos++;

	goto loop;
}
// . multiply the phrase score by the minimum phrase connector modifier of
// all punct words in the phrase
// . phrases that contain certain sequences of punctuation can have their
// scores lowered all the way to 0, and sometimes even boosted
// . normal modifier is 128, to keep things fast and float-free
/*
int32_t Weights::getPunctPhraseWeight ( int32_t i , char *s , int32_t len ) {
if ( len != 2 ) goto tryLen1;
// a comma is not usually a good thing to have in a phrase
if ( s[0]==',' && (s[1]=='\n' || s[1]==' ')) return DW/8;
// this is a url path and is not usually good either
if ( s[0]=='/' && s[1]=='~') return DW/8;
// double spaces, are ok, but not as good as a single space
if ( is_space(s[0]) && is_space(s[1]) ) return DW;//(int32_t)(DW*1.8);
if ( s[0]=='.' && is_space(s[1]) && i+1 < m_nw && m_wids[i+1] ) {
if ( i == 0 ) return 0;
if ( isAbbr ( m_wids[i-1] ) ) return DW;
return 1;
}
// a punct follow by a space:
if ( is_space(s[1]) && is_punct(s[0]) ) {
switch ((unsigned char)s[1]) {
case '?': return 0;
case '!': return 0;
case ';': return 0;
case '<': return 0;
case '>': return 0;
case 171: return 0; // << left shift operator
case 187: return 0; // >> right shift operator
case 191: return 0; // upsidedown question mark
case 161: return 0; // upsidedown exclamation point
case '(': return 1; // parens
case ')': return 1; // parens
case '[': return 1; // parens
case ']': return 1; // parens
case '{': return 1; // parens
case '}': return 1; // parens
case ',': return DW/3; // usually bad, but could be "Abq, NM"
default : return DW/8;
}
}
if ( is_space(s[0]) && is_punct(s[1]) ) {
switch ((unsigned char)s[1]) {
case '?': return 0;
case '!': return 0;
case ';': return 0;
case '<': return 0;
case '>': return 0;
case '.': return DW/16; // the .exe file extension
case 171: return 0; // << left shift operator
case 187: return 0; // >> right shift operator
case 191: return 0; // upsidedown question mark
case 161: return 0; // upsidedown exclamation point
case '(': return 1; // parens
case ')': return 1; // parens
case '[': return 1; // parens
case ']': return 1; // parens
case '{': return 1; // parens
case '}': return 1; // parens
case '$': return DW;
default : return DW/8;
}
}
return 0;
tryLen1:
if (len != 1) goto tryLen3;
switch ((unsigned char)s[0]) {
case '?' : return 0;
case '!' : return 0;
case ';' : return 0;
case '{' : return 0;
case '}' : return 0;
case '<' : return 0;
case '>' : return 0;
case '.' : return DW;
case ',' : return (int32_t)(DW*0.25);
case '_' : return (int32_t)(DW*0.25);
case '@' : return (int32_t)(DW*0.25);
case '&' : return (int32_t)(DW*0.80);
case '\t': return (int32_t)(DW*0.80);
case '\n': return DW;
case '\r': return DW;
case ' ' : return DW;
case '\'': return (int32_t)(DW*1.5); // tom's
case '-' : return (int32_t)(DW*2.00);
case 171 : return 0;
case 187 : return 0;
case 191 : return 0;
case 161 : return 0;
default : return (int32_t)(DW*0.25);
}
tryLen3:
// pair across any number of spaces, it will only show up as one
// space in html and Microsoft Front Page separates lines by a
// bunch of spaces
if ( is_space(s[0]) && is_space(s[1]) && is_space(s[2]) ) {
int32_t k = 3;
while ( k < len ) if ( ! is_space(s[k++]) ) return 0;
return DW;
}
// any other sequence of 4 or more punct chars we do not phrase across
if (len != 3) return 0;
// "://" is indicative of a url
if (s[0]==':' && s[1]=='/'&&s[2]=='/') return (int32_t)(DW*.025);
// we can pair across:
// "://"
// " , "
// " - "
// " & "
// " + "
if ( is_space(s[0]) && is_space(s[2]) )
switch (s[1]) {
case ',': return DW/8;
case '-': return DW/8;
case '+': return DW/8;
case '&': return DW;
}
return 0;
}
*/
// . one row of the punctuation weight table
// . holds the weights for one particular sequence of punctuation chars
// . the table rows are brace-initialized in this member order, so do not
//   reorder the members
struct PunctWeight {
	// weight for the alnum words next to this punctuation, as a float
	float    m_wordWeightFloat;
	// weight for a phrase that spans this punctuation, as a float
	float    m_phraseWeightFloat;
	// the punctuation sequence itself
	char    *m_str;
	// . these three are 0 in the static table
	// . presumably filled in at init time with the hash of m_str and
	//   the integer versions of the two float weights -- TODO confirm
	int32_t  m_hash;
	int32_t  m_wordWeight;
	int32_t  m_phraseWeight;
};
// . each PunctWeight contains both a word and phrase weight
// . here are all the sequences of punct that do not have a weight of DW/8,
// which is the default weight assigned below when nothing matches
// . we add a space to the end of the punct word if a tag immediately follows
// it, so when it is hashed, it is hashed as if a space follows it
// . a single space represents any number of \t or \n or space chars
// . before hashing your punct word to look it up in this table, we condense
// any sequence of spaces (\t \n or space) into a single space in
// setPunctWeights()'s hash routine
// . make another table for version incrementing?
// . an upside-down ? or ! is hashed the same as a regular ? or !
static PunctWeight s_punctWeights[] = {
// 1 char
{1.0, 1.00, " " ,0,0,0},
{0.2, 1.10, "-" ,0,0,0}, // cd-rom, i-pod
{0.2, 1.00, "." ,0,0,0}, // www.ibm
{0.2, 1.00, "&" ,0,0,0}, // m&m
{1.0, 1.00, "\'" ,0,0,0}, // tom's
{0.2, 0.20, "/" ,0,0,0}, // this/that
{0.8, 0.10, "," ,0,0,0}, // meta keywords?
// 2 chars
{1.0, 0.00, ". " ,0,0,0}, // mr. tom (assume not abbr here)
// do not punish word weight for ", ", it could be
// "kanoodle, an ad company, ..."
// and we punish for comma-separated lists above...
// also, it could be "Abq, NM"
{1.0, 0.20, ", " ,0,0,0},
{0.8, 0.20, " ," ,0,0,0}, // probably a typo for ", "
{0.2, 0.16, "/~" ,0,0,0}, // usually bad, url path component
{1.0, 0.00, "? " ,0,0,0}, // hello? anyone
{1.0, 0.10, "! " ,0,0,0}, // yahoo! games
{1.0, 0.00, ": " ,0,0,0},
{1.0, 0.00, "; " ,0,0,0},
{1.0, 0.00, "< " ,0,0,0},
{1.0, 0.00, "> " ,0,0,0},
{1.0, 0.00, " <" ,0,0,0},
{1.0, 0.00, " >" ,0,0,0},
{1.0, 0.00, "( " ,0,0,0},
{1.0, 0.00, ") " ,0,0,0},
{1.0, 0.00, " (" ,0,0,0},
{1.0, 0.00, " )" ,0,0,0},
{1.0, 0.00, "[ " ,0,0,0},
{1.0, 0.00, "] " ,0,0,0},
{1.0, 0.00, " [" ,0,0,0},
{1.0, 0.00, " ]" ,0,0,0},
{1.0, 0.00, "{ " ,0,0,0},
{1.0, 0.00, "} " ,0,0,0},
{1.0, 0.00, " {" ,0,0,0},
{1.0, 0.00, " }" ,0,0,0},
{1.0, 1.00, " $" ,0,0,0},
{1.0, 1.00, " %" ,0,0,0},
{1.0, 0.00, "\" " ,0,0,0},
{1.0, 0.00, " \"" ,0,0,0},
{1.0, 1.00, "\' " ,0,0,0}, // plural possessive or single quote
{1.0, 0.00, " \'" ,0,0,0}, // single quote
// 3 chars
{1.0, 0.30, " , " ,0,0,0}, // like ", "
{1.0, 0.05, " : " ,0,0,0},
{1.0, 0.00, " ( " ,0,0,0},
{1.0, 0.00, " ) " ,0,0,0},
{1.0, 0.00, " [ " ,0,0,0},
{1.0, 0.00, " ] " ,0,0,0},
{1.0, 0.00, " { " ,0,0,0},
{1.0, 0.00, " } " ,0,0,0},
{0.5, 1.00, " & " ,0,0,0}, // M & M
{1.0, 0.10, " - " ,0,0,0},
{1.0, 0.00, ", \"" ,0,0,0},
{1.0, 0.00, ", \'" ,0,0,0},
{1.0, 0.00, "\"! " ,0,0,0},
{1.0, 0.00, "\"? " ,0,0,0},
{1.0, 0.00, "\": " ,0,0,0},
// 4 chars
{1.0, 0.00, " -- " ,0,0,0},
{1.0, 0.00, "... " ,0,0,0},
// 5 chars
{1.0, 0.00, " ... ",0,0,0},
{1.0, 0.00, ".... ",0,0,0},
// special
{1.0, 1.00, "&nbsp;" ,0,0,0},
{1.0, 1.00, "&nbsp; ",0,0,0} // can have tag right after it
};
bool initPunctWeights ( ) {
PunctWeight *pws = s_punctWeights;
int32_t npw = sizeof(s_punctWeights) / sizeof(PunctWeight);
if ( ! s_punctTable.set ( 1024 , NULL , 0 ) ) return false;
for ( int32_t i = 0 ; i < npw ; i++ ) {
PunctWeight *PW = &pws[i];
// set hash
//int32_t len = gbstrlen ( PW->m_str );
int32_t h = hash32n ( PW->m_str );//, len );
// store the hash
PW->m_hash = h;
// set int32_t weights from float weights
int32_t xww = (int32_t)(PW->m_wordWeightFloat * DW);
int32_t xpw = (int32_t)(PW->m_phraseWeightFloat * DW);
// ensure minimums, do not go all the way to 0 because we
// want to still index a phrase or word with minimal score
// just in case someone searches for it. it will come up, but
// at the very bottom of the results.
if ( xww == 0 ) xww = 2;
if ( xpw == 0 ) xpw = 2;
// assign
PW->m_wordWeight = xww;
PW->m_phraseWeight = xpw;
// add this PunctWeight class to the hash table
if ( ! s_punctTable.addKey ( h , (int32_t)PW ) ) return false;
}
return true;
}
// . set m_ww[k] and m_pw[k] for a punctuation word which is a sequence of
// non-alnum chars (as defined in Words.cpp)
// . caller can divide the returned weight by DW to make it a proper percentage
void Weights::setPunctWeights ( int32_t k , char *s , int32_t len ) {
// convert the punct word into a normalized hash for look-up
unsigned char c;
int32_t j = 0;
uint32_t h = 0;
char lastSpace = false;
uint8_t *p = (uint8_t *)s;
uint8_t *pend = p + len;
char cs ;
for ( ; p < pend ; p += cs ) {
// get char size in bytes
cs = getUtf8CharSize ( p );
// assume ascii i guess
c = *p;
// normalize it
if ( is_wspace_utf8 (p) ) c = ' ';
// assume all all other multi byte punct is bad
else if ( cs >= 2 ) break;
// . normalize upside down ? and ! to ? and !
// . see http://www.utf8-chartable.de/ table latin1 to utf8
if ( p[0]==0xc2 && p[1]==0xbf ) c = '?' ; // inverted qmark
if ( p[0]==0xc2 && p[1]==0xa1 ) c = '?' ; // inverted excl mark
if ( p[0]==0xc2 && p[1]==0xb4 ) c = '\''; // grave mark
// if we are space and last was a space, skip
if ( c == ' ' && lastSpace ) continue;
if ( c == ' ' ) lastSpace = true;
else lastSpace = false;
// . incorporate it into hash
// . taken from hash32() in hash.cpp so it matches with the
// hash32() function in initPunctWeights() above
h ^= (uint32_t) g_hashtab
[(unsigned char)j]
[(unsigned char)c] ;
// advance j for next char
j++;
}
// tag follows? if so, append a space to hash if does not end in one
if ( k+1<m_nw && s[len] == '<' && ! lastSpace )
h ^= (uint32_t) g_hashtab
[(unsigned char)j++]
[(unsigned char)' '] ;
// lookup in the hash table, try to get the PunctWeight
PunctWeight *PW = NULL;
// only consult table if we had no unrecognized utf8 punct
if ( p >= pend ) PW = (PunctWeight *) s_punctTable.getValue ( h );
// if not there, use minimal weights
if ( ! PW ) {
// just make a really low weight by default, but not 0, just
// to be on the safe side
m_ww[k] = DW/8;
m_pw[k] = DW/8;
// TODO: put in code for rvw and rvp????
// debug, see if we are missing anything special
/*
char dbuf[256];
int32_t dlen = len;
if ( dlen > 100 ) dlen = 100;
gbmemcpy ( dbuf , s , dlen );
dbuf[dlen]=0;
logf(LOG_DEBUG,"build: missed \"%s\"",dbuf);
*/
return;
}
// was it an abbreviation? that is a special case
if ( PW->m_str[0]=='.' && PW->m_str[1]==' ' && PW->m_str[2]=='\0' &&
isAbbr(m_wids[k-1]) ) {
// give normal weights if so
m_ww[k] = DW;
m_pw[k] = DW;
return;
}
// otherwise grab the weight for this known punctuation sequence
m_ww[k] = PW->m_wordWeight;
m_pw[k] = PW->m_phraseWeight;
return;
}
/*
float Weights::getPunctWordWeight ( char *s , int32_t len ) {
// must be a tag right after, or possible EOF
if ( len == 0 ) return 1.0;
if ( len != 2 ) goto tryLen1;
tryLen2:
if ( is_space(s[0]) && is_space(s[1]) ) return 1.0;
if ( is_space(s[1]) ) {
skippy:
switch ((unsigned char)s[0]) {
case '?': return 1.0;
case '!': return 1.0;
case ';': return 1.0;
case '(': return 1.0;
case ')': return 1.0;
case '{': return 1.0;
case '}': return 1.0;
case '<': return 1.0;
case '>': return 1.0;
case '.': return 1.0;
case '\"': return 1.0;
case '\'': return 1.0;
case 191: return 1.0; // upsidedown question mark
case 161: return 1.0; // upsidedown exclamation point
default : return 0.5;
}
}
if ( is_space(s[0]) ) {
switch ((unsigned char)s[1]) {
case '?': return 0.8;
case '!': return 0.8;
case ';': return 0.8;
case '(': return 1.0;
case ')': return 1.0;
case '{': return 1.0;
case '}': return 1.0;
case '<': return 1.0;
case '>': return 1.0;
case '.': return 0.8;
case '\"': return 1.0;
case '\'': return 1.0;
case 191: return 1.0; // upsidedown question mark
case 161: return 1.0; // upsidedown exclamation point
default : return 0.5;
}
}
return 0.5;
tryLen1:
if ( len != 1 ) goto tryLen3;
// this is probably the most common case
if ( is_space(s[0]) ) return 1.0;
if ( s[0]=='-' ) return 0.5;
// if a tag follows, treat as a space
if ( s[1] == '<' ) goto skippy;
// apostrophe is ok
if ( s[1] == '\'' ) return 1.0;
// otherwise, a word follows, so <word><punctChar><word> pretty
// much always is a low weight for the word
return 0.5;
tryLen3:
// if first char is a legit ending of a sentence, skip that
// if .!?!? followed by string of spaces
char skipped = false;
if ( s[0] == '.' ||
s[0] == '?' ||
s[0] == '!' ||
s[0] == ':' ||
(unsigned char)s[0] == 191 || // upsidedown question mark
(unsigned char)s[0] == 161 ) { // upsidedown exclamation point
if ( len == 2 ) {
if ( ! is_space(s[1]) ) return 0.5;
if ( ! is_space(s[2]) ) return 0.5;
return 1.0;
}
s++;
len--;
skipped = true;
}
// pair across any number of spaces, it will only show up as one
// space in html and Microsoft Front Page separates lines by a
// bunch of spaces
if ( is_space(s[0]) && is_space(s[1]) && is_space(s[2]) ) {
int32_t k = 3;
// a long string of more than just spaces sucks...
while ( k < len ) if ( ! is_space(s[k++] ) ) return 0.3;
// it was all spaces, do not hurt it
return 1.0;
}
// if we skipped a starting .?! no need to go any further
if ( skipped ) return 0.5;
// a glorified breaking hyphen
if ( len == 4 &&
is_space(s[0]) &&
s[1] == '-' &&
s[2] == '-' &&
is_space(s[3]) ) return 1.0;
// ...
if ( len == 4 &&
s[0] = '.' &&
s[1] = '.' &&
s[2] = '.' &&
is_space(s[3]) ) return 1.0;
if ( len != 3 ) return 0.5;
if ( s[0] = ',' &&
s[1] == '\"' &&
is_space(s[2] ) return 1.0;
if ( is_space(s[0]) && is_space(s[2]) )
switch (s[1]) {
case ',': return 0.8;
case '&': return 0.7; // better as a phrase
}
return 0.5;
}
*/
/*
//
//
// BEGIN OLD SPAM.CPP class
//
//
#define WTMPBUFSIZE (MAX_WORDS *21*3)
// . RULE #28, repetitive word/phrase spam detector
// . set's the "spam" member of each word from 0(no spam) to 100(100% spam)
// . "bits" describe each word in phrasing terminology
// . if more than maxPercent of the words are spammed to some degree then we
// consider all of the words to be spammed, and give each word the minimum
// score possible when indexing the document.
// . returns false and sets g_errno on error
bool Weights::set4 ( ) {
// assume not the repeat spammer
m_isRepeatSpammer = false;
if ( m_isLinkText ) return true;
if ( m_isCountTable ) return true;
// shortcuts
Words *words = m_words;
Bits *bits = m_bits;
// if 20 words totally spammed, call it all spam?
m_numRepeatSpam = 20;
// shortcut
int32_t sni = m_siteNumInlinks;
// set "m_maxPercent"
int32_t maxPercent = 6;
if ( sni > 10 ) maxPercent = 8;
if ( sni > 30 ) maxPercent = 10;
if ( sni > 100 ) maxPercent = 20;
if ( sni > 500 ) maxPercent = 30;
// assume not totally spammed
m_totallySpammed = false;
// get # of words we have to set spam for
int32_t numWords = words->getNumWords();
// set up the size of the hash table (number of buckets)
int32_t size = numWords * 3;
// . add a tmp buf as a scratch pad -- will be freed right after
// . allocate this second to avoid mem fragmentation more
// . * 2 for double the buckets
char tmpBuf [ WTMPBUFSIZE ];
char *tmp = tmpBuf;
int32_t need = (numWords * 21) * 3 + numWords;
if ( need > WTMPBUFSIZE ) {
tmp = (char *) mmalloc ( need , "Spam" );
if ( ! tmp )
return log("build: Failed to allocate %"INT32" more "
"bytes for spam detection: %s.",
need,mstrerror(g_errno));
}
QUICKPOLL(m_niceness);
// set up ptrs
char *p = tmp;
// first this
unsigned char *spam = (unsigned char *)p; p += numWords ;
// . this allows us to make linked lists of indices of words
// . i.e. next[13] = 23--> word #23 FOLLOWS word #13 in the linked list
int32_t *next = (int32_t *)p; p += size * 4;
// hash of this word's stem (or word itself if useStem if false)
int64_t *bucketHash = (int64_t *)p; p += size * 8;
// that word's position in document
int32_t *bucketWordPos = (int32_t *)p; p += size * 4;
// profile of a word
int32_t *profile = (int32_t *)p; p += size * 4;
// is it a common word?
char *commonWords = (char *)p; p += size * 1;
// sanity check
if ( p - tmp > need ) { char *xx=NULL;*xx=0; }
// clear all our spam percentages for these words
memset ( spam , 0 , numWords );
int32_t np;
// clear the hash table
int32_t i;
for ( i = 0 ; i < size ; i++ ) {
bucketHash [i] = 0;
bucketWordPos[i] = -1;
commonWords [i] = 0;
}
// count position since Words class can now have tags in it
//
//int32_t pos = 0;
//bool usePos = false;
//if ( words->m_tagIds ) usePos = true;
int64_t *wids = words->getWordIds();
// . loop through each word
// . hash their stems and place in linked list
// . if no stemming then don't do stemming
for ( i = 0 ; i < numWords ; i++ ) {
// . skip punctuation
// . this includes tags now , too i guess
//if ( words->isPunct(i) ) continue;
if ( wids[i] == 0 ) continue;
// skip if will not be indexed cuz score is too low
//if ( wscores && wscores[i] <= 0 ) continue;
QUICKPOLL(m_niceness);
// TODO: get phrase stem if stemming is on
// store this word's phrase stem into the buffer
// blen = words->getPhraseStem(i,buf,100);
// if (blen<=0) continue;
// get the hash of the ith word
int64_t h = words->getWordId(i);
// use secondary wordId if available
//if ( words->getStripWordId(i) )
// h = words->getStripWordId(i);
// "j" is the bucket index
int32_t j = (uint64_t)h % size;
// make sure j points to the right bucket
while (bucketHash[j]) {
if ( h == bucketHash[j] ) break;
if (++j == size) j = 0;
}
// if this bucket is occupied by a word then replace it but
// make sure it adds onto the "linked list"
if (bucketHash[j]) {
// if Words class contain tags as words, do this
//if ( usePos ) {
// next [pos] = bucketWordPos[j];
// bucketWordPos[ j] = pos++;
//}
//else {
// add onto linked list for the ith word
next[i] = bucketWordPos[j];
// replace bucket with index to this word
bucketWordPos[j] = i;
//}
}
// otherwise, we have a new occurrence of this word
else {
bucketHash [j] = h;
// if Words class contain tags as words, do this
//if ( usePos ) {
// bucketWordPos[ j] = pos++;
// next [pos] = -1;
//}
//else {
// store our position # (i) in bucket
bucketWordPos[j] = i;
// no next occurrence of the ith word yet
next[i] = -1;
//}
}
// if stop word or number then mark it
if ( bits->isStopWord(i) ) commonWords[j] = 1;
if ( words->isNum ( i ) ) commonWords[j] = 1;
}
// count distinct candidates that had spam and did not have spam
int32_t spamWords = 0;
int32_t goodWords = 0;
// . now cruise down the hash table looking for filled buckets
// . grab the linked list of indices and make a "profile"
for ( i = 0 ; i < size ; i++ ) {
// skip empty buckets
if (bucketHash[i] == 0) continue;
np=0;
// word #j is in bucket #i
int32_t j = bucketWordPos[i];
// . cruise down the linked list for this word
while ( j!=-1) {
// store position of occurrence of this word in profile
profile [ np++ ] = j;
// get the position of next occurrence of this word
j = next[ j ];
}
// if 2 or less occurrences of this word, don't check for spam
if ( np < 3 ) { goodWords++; continue; }
//
// set m_isRepeatSpammer
//
// look for a word repeated in phrases, in a big list,
// where each phrase is different
//
int32_t max = 0;
int32_t count = 0;
int32_t knp = np;
// must be 3+ letters, not a stop word, not a number
if ( words->m_wordLens[profile[0]] <= 2 || commonWords[i] )
knp = 0;
// scan to see if they are a tight list
for ( int32_t k = 1 ; k < knp ; k++ ) {
// breathe
QUICKPOLL(m_niceness);
// are they close together? if not, bail
if ( profile[k-1] - profile[k] >= 25 ) {
count = 0;
continue;
}
// otherwise inc it
count++;
// must have another word in between or tag
int32_t a = profile[k];
int32_t b = profile[k-1];
bool gotSep = false;
bool inLink = false;
for ( int32_t j = a+1 ; j <b ; j++ ) {
// if in link do not count, chinese spammer
// does not have his crap in links
if ( words->m_words[j][0] == '<' &&
words->m_wordLens[j]>=3 ) {
// get the next char after the <
char nc;
nc=to_lower_a(words->m_words[j][1]);
// now check it for anchor tag
if ( nc == 'a' ) {
inLink = true; break; }
}
if ( words->m_words[j][0] == '<' )
gotSep = true;
if ( is_alnum_a(words->m_words[j][0]) )
gotSep = true;
}
// . the chinese spammer always has a separator,
// usually another tag
// . and fix "BOW BOW BOW..." which has no separators
if ( ! gotSep ) count--;
else if ( inLink ) count--;
// get the max
if ( count > max ) max = count;
}
// a count of 50 such monsters indicates the chinese spammer
if ( max >= 50 )
m_isRepeatSpammer = true;
//
// end m_isRepeatSpammer detection
//
// . determine the probability this word was spammed by looking
// at the distribution of it's positions in the document
// . sets "spam" member of each word in this profile
// . don't check if word occurred 2 or less times
// . TODO: what about TORA! TORA! TORA!
// . returns true if 1+ occurrences were considered spam
QUICKPOLL(m_niceness);
bool isSpam = setSpam ( profile , np , numWords , spam );
// don't count stop words or numbers towards this threshold
if ( commonWords[i] ) continue;
// tally them up
if ( isSpam ) spamWords++;
else goodWords++;
}
// what percent of distinct candidate words were spammed?
int32_t totalWords = spamWords + goodWords;
// if there are no words or very few words, skip the percentage check
int32_t percent;
if ( totalWords <= 10 ) goto done;
percent = ( spamWords * 100 ) / totalWords;
// if 20% of words we're spammed punish everybody now to 100% spam
// if we had < 100 candidates and < 20% spam, don't bother
//if ( percent < 5 ) goto done;
if ( percent <= maxPercent ) goto done;
// set flag so linkspam.cpp can see if all is spam and will not allow
// this page to vote
m_totallySpammed = true;
// now only set to 99 so each singleton usually gets hashed
for ( i = 0 ; i < numWords ; i++ )
if ( words->getWordId(i) && spam[i] < 99 )
spam[i] = 99;
done:
// update the weights for the words
for ( i = 0 ; i < numWords ; i++ ) {
m_ww[i] = ( m_ww[i] * (100 - spam[i]) ) / 100;
if ( m_rvw ) m_rvw[i*MAX_RULES+28] *= (100 - spam[i]) / 100;
}
// TODO: use the min word spam algo as in Phrases.cpp for this!
for ( i = 0 ; i < numWords ; i++ ) {
m_pw[i] = ( m_pw[i] * (100 - spam[i]) ) / 100;
if ( m_rvp ) m_rvp[i*MAX_RULES+28] *= (100 - spam[i]) / 100;
}
// free our temporary table stuff
if ( tmp != tmpBuf ) mfree ( tmp , need , "Spam" );
return true;
}
// . a "profile" is an array of all the positions of a word in the document
// . a "position" is just the word #, like first word, word #8, etc...
// . we map "each" subProfile to a probability of spam (from 0 to 100)
// . if the profile is really big we get really slow (O(n^2)) iterating through
// many subProfiles
// . so after the first 25 words, it's automatically considered spam
// . return true if one word was spammed w/ probability > 20%
bool Weights::setSpam ( int32_t *profile, int32_t plen , int32_t numWords ,
unsigned char *spam ) {
// don't bother detecting spam if 2 or less occurrences of the word
if ( plen < 3 ) return false;
int32_t i;
// if we have more than 10 words and this word is 20% or more of
// them then all but the first occurrence is spammed
//log(LOG_INFO,"setSpam numRepeatSpam = %f", m_numRepeatSpam);
if (numWords > 10 && (plen*100)/numWords >= m_numRepeatSpam) {
for (i=1; i<plen; i++) spam[profile[i]] = 100;
return true ;
}
// . over 50 repeated words is ludicrous
// . set all past 50 to spam and continue detecting
// . no, our doc length based weight takes care of that kind of thing
//if (plen > 50 && m_version < 93 ) {
// // TODO: remember, profile[i] is in reverse order!! we should
// // really do i=0;i<plen-50, but this is obsolete anyway...
// for (i=50; i<plen;i++) m_spam[profile[i]] = 100;
// plen = 50;
//}
// we have to do this otherwise it takes FOREVER to do for plens in
// the thousands, like i saw a plen of 8338!
if ( plen > 50 ) { // && m_version >= 93 ) {
// . set all but the last 50 to a spam of 100%
// . the last 50 actually occur as the first 50 in the doc
for (i=0; i<plen-50;i++) spam[profile[i]] = 100;
// we now have only 50 occurrences
plen = 50;
// we want to skip the first plen-50 because they actually
// occur at the END of the document
profile += plen - 50;
}
QUICKPOLL(m_niceness);
// higher quality docs allow more "freebies", but only starting with
// version 93... (see Titledb.h)
// profile[i] is actually in reverse order so we subtract off from wlen
//int32_t off ;
//if ( m_version >= 93 ) {
// off = (m_docQuality - 30) / 3;
// if ( off < 0 ) off = 0;
//}
// just use 40% "quality"
int32_t off = 3;
// . now the nitty-gritty part
// . compute all sub sequences of the profile
// . similar to a compression scheme (wavelets?)
// . TODO: word positions should count by two's since punctuation is
// not included so start step @ 2 instead of 1
// . if "step" is 1 we look at every word position in the profile
// . if "step" is 2 we look at every other word position
// . if "step" is 3 we look at every 3rd word position, etc...
int32_t maxStep = plen / 4;
if ( maxStep > 4 ) maxStep = 4;
// . loop through all possible tuples
int32_t window, wlen, step, prob;
for ( step = 1 ; step <= maxStep ; step++ ) {
for ( window = 0 ; window + 3 < plen ; window+=1) {
for (wlen = 3; window+wlen <= plen ; wlen+=1) {
// continue if step isn't aligned with window
// length
if (wlen % step != 0) continue;
// . get probability that this tuple is spam
// . returns 0 to 100
prob = getProbSpam ( profile + window ,
wlen , step);
// printf("(%i,%i,%i)=%i\n",step,window,
// wlen,prob);
// . if the probability is too low continue
// . was == 100
if ( prob <= 20 ) continue;
// set the spammed words spam to "prob"
// only if it's bigger than their current spam
for (i=window; i<window+wlen;i++) {
// first occurrences can have immunity
// due to doc quality being high
if ( i >= plen - off ) break;
if (spam[profile[i]] < prob)
spam[profile[i]] = prob;
}
QUICKPOLL(m_niceness);
}
}
}
// was this word spammed at all?
bool hadSpam = false;
for (i=0;i<plen;i++) if ( spam[profile[i]] > 20 ) hadSpam = true;
// make sure at least one word survives
for (i=0;i<plen;i++) if ( spam[profile[i]] == 0) return hadSpam;
// clear the spam level on this guy
spam[profile[0]] = 0;
// return true if we had spam, false if not
return hadSpam;
}
*/