// 2013-08-03 00:12:24 +04:00
# include "gb-include.h"
# include "Title.h"
# include "Words.h"
# include "Sections.h"
# include "Pops.h"
# include "Pos.h"
# include "Titledb.h" // TITLEREC_CURRENT_VERSION
# include "Profiler.h"
# include "sort.h"
# include "HashTable.h"
// 2013-12-11 03:28:04 +04:00
//#include "CollectionRec.h"
// 2013-08-03 00:12:24 +04:00
# include "Indexdb.h"
# include "XmlDoc.h"
// test urls
// http://www.thehindu.com/2009/01/05/stories/2009010555661000.htm
// http://xbox360.ign.com/objects/142/14260912.html
// http://www.scmp.com/portal/site/SCMP/menuitem.2c913216495213d5df646910cba0a0a0?vgnextoid=edeb63a0191ae110VgnVCM100000360a0a0aRCRD&vgnextfmt=teaser&ss=Markets&s=Business
// http://www.legacy.com/shelbystar/Obituaries.asp?Page=LifeStory&PersonId=122245831
// http://web.me.com/bluestocking_bb/The_Bluestocking_Guide/Book_Reviews/Entries/2009/1/6_Hamlet.html
// http://larvatusprodeo.net/2009/01/07/partisanship-politics-and-participation/
// http://content-uk.cricinfo.com/ausvrsa2008_09/engine/current/match/351682.html
// www4.gsb.columbia.edu/cbs-directory/detail/6335554/Schoenberg
// http://www.washingtonpost.com/wp-dyn/content/article/2008/10/29/AR2008102901960.html
// http://www.w3.org/2008/12/wcag20-pressrelease.html
// http://www.usnews.com/articles/business/best-careers/2008/12/11/best-careers-2009-librarian.html
// http://www.verysmartbrothas.com/2008/12/09/
// http://www.slashgear.com/new-palm-nova-handset-to-have-touchscreen-and-qwerty-keyboard-0428710/
// still bad
// http://66.231.188.171:8500/search?k3j=668866&c=main&n=20&ldays=1&q=url%3Ahttp%3A%2F%2Fmichellemalkin.com%2F2008%2F12%2F29%2Fgag-worthy%2F selects
// "gag-worthy" instead of
// "Gag-worthy: Bipartisan indignance over .Barack the Magic Negro. parody"
// http://www.1800pocketpc.com/2009/01/09/web-video-downloader-00160-download-videos-from-youtube-on-your-pocket-pc.html : need to fix the numbers in the
// path somehow so similarity is higher
/*
static long isHeadlineClass ( Xml * xml , Words * words , long wordIndex ) ;
// . List of title tags
// . do not include bold cuz
// http://www.groovanauts.com/board/showthread.php?threadid=41718
// gets "Username" as the title!
static char s_titleTags [ ] = { TAG_TITLE , TAG_H1 , TAG_H2 , TAG_H3 } ; //,TAG_B };
static inline int s_min ( const int x , const int y ) {
if ( x < y ) return x ;
return y ;
}
static inline int s_max ( const int x , const int y ) {
if ( x > y ) return x ;
return y ;
}
*/
// Construct an empty Title. Every member that reset(), the destructor,
// or matches.cpp may read is initialized here, so destroying (or
// reset()ing) a never-used Title is always safe.
Title::Title() {
	m_title          = NULL;
	m_titleBytes     = 0;
	m_titleAllocSize = 0;    // nothing heap-allocated yet
	m_query          = NULL;
	// -1 is the "no <title> tag seen" sentinel used by matches.cpp
	m_titleTagStart  = -1;
	m_titleTagEnd    = -1;
}
// Destructor: hands cleanup off to reset(), which frees any
// heap-allocated title buffer.
Title::~Title() {
	reset();
}
void Title : : reset ( ) {
if ( m_title & & m_title ! = m_localBuf )
mfree ( m_title , m_titleAllocSize , " Title " ) ;
m_title = NULL ;
m_titleBytes = 0 ;
2014-03-28 19:01:01 +04:00
m_titleAllocSize = 0 ;
2013-08-03 00:12:24 +04:00
m_query = NULL ;
m_titleTagStart = - 1 ;
m_titleTagEnd = - 1 ;
}
// returns false and sets g_errno on error
bool Title : : setTitle ( XmlDoc * xd ,
Xml * xml ,
Words * words ,
Sections * sections ,
Pos * pos ,
long maxTitleChars ,
long maxTitleWords ,
SafeBuf * pbuf ,
Query * q ,
CollectionRec * cr ,
long niceness ) {
// if this is too big the "first line" algo can be huge!!!
// and really slow everything way down with a huge title candidate
if ( maxTitleWords > 128 ) maxTitleWords = 128 ;
m_niceness = niceness ;
// make Msg20.cpp faster if it is just has
// Msg20Request::m_setForLinkInfo set to true, no need
// to extricate a title.
if ( maxTitleChars < = 0 ) return true ;
if ( maxTitleWords < = 0 ) return true ;
long long startTime = gettimeofdayInMilliseconds ( ) ;
2013-09-13 20:23:18 +04:00
// . reset so matches.cpp using this does not core
// . assume no title tag
m_titleTagStart = - 1 ;
m_titleTagEnd = - 1 ;
// if we are a json object
if ( ! xd - > m_contentTypeValid ) { char * xx = NULL ; * xx = 0 ; }
char * val = NULL ;
// look for the "title:" field in json then use that
2013-11-09 04:16:13 +04:00
SafeBuf jsonTitle ;
2014-03-28 19:01:01 +04:00
long vlen = 0 ;
2013-11-09 04:16:13 +04:00
if ( xd - > m_contentType = = CT_JSON ) {
2014-04-29 01:14:18 +04:00
// shortcut
char * s = xd - > ptr_utf8Content ;
2013-11-09 04:16:13 +04:00
char * jt ;
2014-04-29 01:14:18 +04:00
jt = getJSONFieldValue ( s , " title " , & vlen ) ;
2013-11-09 04:16:13 +04:00
if ( jt & & vlen > 0 ) {
2013-11-23 02:16:14 +04:00
jsonTitle . safeDecodeJSONToUtf8 ( jt , vlen , m_niceness ) ;
2013-11-09 04:16:13 +04:00
jsonTitle . nullTerm ( ) ;
2014-04-29 01:14:18 +04:00
}
// if we got a product, try getting price
long oplen ;
char * op = getJSONFieldValue ( s , " offerPrice " , & oplen ) ;
if ( op & & oplen ) {
if ( ! is_digit ( op [ 0 ] ) ) { op + + ; oplen - - ; }
float price = atof2 ( op , oplen ) ;
// print without decimal point if ends in .00
if ( ( float ) ( long ) price = = price )
jsonTitle . safePrintf ( " , $%li " ,
( long ) price ) ;
else
jsonTitle . safePrintf ( " , $%.02f " , price ) ;
}
if ( jsonTitle . length ( ) ) {
2013-11-09 04:16:13 +04:00
val = jsonTitle . getBufStart ( ) ;
2014-02-28 20:18:09 +04:00
vlen = jsonTitle . length ( ) ;
2013-11-09 04:16:13 +04:00
}
}
2013-09-13 20:23:18 +04:00
// if we had a title: field in the json...
2014-02-28 20:18:09 +04:00
if ( val & & vlen > 0 ) {
2013-09-13 20:23:18 +04:00
char * dst = NULL ;
m_titleBytes = vlen ;
if ( m_titleBytes + 1 < TITLE_LOCAL_SIZE )
dst = m_localBuf ;
else {
dst = ( char * ) mmalloc ( m_titleBytes + 1 , " titdst " ) ;
if ( ! dst ) return false ;
2014-03-28 19:01:01 +04:00
m_titleAllocSize = m_titleBytes + 1 ;
2013-09-13 20:23:18 +04:00
}
m_title = dst ;
memcpy ( dst , val , m_titleBytes ) ;
dst [ m_titleBytes ] = ' \0 ' ;
return true ;
}
2014-03-28 19:01:01 +04:00
// json content, if has no explicit title field, has no title then
if ( xd - > m_contentType = = CT_JSON ) {
m_localBuf [ 0 ] = ' \0 ' ;
m_title = m_localBuf ;
m_titleBytes = 0 ;
return true ;
}
2013-09-13 20:23:18 +04:00
2013-08-03 00:12:24 +04:00
bool status = setTitle4 ( xd ,
xml ,
words ,
sections ,
pos ,
maxTitleChars ,
maxTitleWords ,
pbuf ,
q ,
cr ) ;
long long took = gettimeofdayInMilliseconds ( ) - startTime ;
if ( took > 5 ) log ( " query: Title set took %lli ms for %s " , took ,
xd - > getFirstUrl ( ) - > getUrl ( ) ) ;
return status ;
}
// types of titles. indicates where they came from.
# define TT_LINKTEXTLOCAL 1
# define TT_LINKTEXTREMOTE 2
# define TT_RSSITEMLOCAL 3
# define TT_RSSITEMREMOTE 4
# define TT_BOLDTAG 5
# define TT_HTAG 6
# define TT_TITLETAG 7
# define TT_DMOZ 8
# define TT_FIRSTLINE 9
# define TT_DIVTAG 10
# define TT_FONTTAG 11
# define TT_ATAG 12
# define TT_TDTAG 13
# define TT_PTAG 14
# define TT_URLPATH 15
# define TT_TITLEATT 16
// hard cap on the candidate arrays sized in setTitle4()
# define MAX_TIT_CANDIDATES 100
// Does this (punctuation) word qualify as a subtitle delimiter?
// A word qualifies only when it is not alphanumeric, is longer than one
// byte, and contains at least one non-space character.
bool isWordQualified ( char *wp , long wlen ) {
	// alnum words can never delimit subtitles
	if ( is_alnum_utf8 ( wp ) ) return false;
	// fix amazon.com from splitting on period
	if ( wlen == 1 ) return false;
	// qualified iff the punct run holds something besides spaces
	for ( long k = 0 ; k < wlen ; k++ )
		if ( wp[k] != ' ' ) return true;
	return false;
}
//
// TODO: do not accumulate boosts from a parent
// and its kids, subtitles...
//
bool Title : : setTitle4 ( XmlDoc * xd ,
Xml * XML ,
Words * WW ,
Sections * sections ,
Pos * POS ,
long maxTitleChars ,
long maxTitleWords ,
SafeBuf * pbuf ,
Query * q ,
CollectionRec * cr ) {
m_maxTitleChars = maxTitleChars ;
m_maxTitleWords = maxTitleWords ;
// assume no title
reset ( ) ;
long NW = WW - > getNumWords ( ) ;
if ( pbuf ) {
//pbuf->safePrintf("<div stype=\"border:1px solid black\">");
//pbuf->safePrintf("<b>***Finding Title***</b><br>\n");
}
// array of candidate tags
static char s_candTags [ 512 ] ;
static char s_flag = 0 ;
if ( s_flag = = 0 ) {
// do not re-do
s_flag = 1 ;
// reset
memset ( s_candTags , 0 , 512 ) ;
}
// set each time since we "unset" below if we've no "article content"
s_candTags [ TAG_B ] = 1 ;
s_candTags [ TAG_H1 ] = 1 ;
s_candTags [ TAG_H2 ] = 1 ;
s_candTags [ TAG_H3 ] = 1 ;
s_candTags [ TAG_DIV ] = 1 ;
s_candTags [ TAG_TD ] = 1 ;
s_candTags [ TAG_P ] = 1 ;
s_candTags [ TAG_FONT ] = 1 ;
s_candTags [ TAG_TITLE ] = 1 ;
// we only allow candidates in <a> tags if it is a self link!
s_candTags [ TAG_A ] = 1 ;
//
// now get all the candidates
//
// . allow up to 100 title CANDIDATES
// . "as" is the word # of the first word in the candidate
// . "bs" is the word # of the last word IN the candidate PLUS ONE
long n = 0 ;
long as [ MAX_TIT_CANDIDATES ] ;
long bs [ MAX_TIT_CANDIDATES ] ;
float scores [ MAX_TIT_CANDIDATES ] ;
Words * cptrs [ MAX_TIT_CANDIDATES ] ;
long types [ MAX_TIT_CANDIDATES ] ;
char htmlEnc [ MAX_TIT_CANDIDATES ] ;
long numAlnum [ MAX_TIT_CANDIDATES ] ;
long parent [ MAX_TIT_CANDIDATES ] ;
// record the scoring algos effects
float baseScore [ MAX_TIT_CANDIDATES ] ;
float noCapsBoost [ MAX_TIT_CANDIDATES ] ;
float qtermsBoost [ MAX_TIT_CANDIDATES ] ;
float inCommonCandBoost [ MAX_TIT_CANDIDATES ] ;
float inCommonBodyBoost [ MAX_TIT_CANDIDATES ] ;
// reset these
for ( long i = 0 ; i < MAX_TIT_CANDIDATES ; i + + )
// assume no parent
parent [ i ] = - 1 ;
// xml and words class for each link info, rss item
Xml tx [ MAX_TIT_CANDIDATES ] ;
Words tw [ MAX_TIT_CANDIDATES ] ;
long ti = 0 ;
// restrict how many link texts and rss blobs we check for titles
// because title recs like www.google.com have hundreds and can
// really slow things down to like 50ms for title generation
long kcount = 0 ;
long rcount = 0 ;
// only allow 4 internal inlink titles
//long didHost = 0;
LinkInfo * info = xd - > getLinkInfo1 ( ) ;
// a flag to control subloop jumping
char didit = false ;
// come back to top of loop after switching "info" to point to
// the imported link info from another collection, linkInfo2...
fooloop :
//long long x = gettimeofdayInMilliseconds();
// . get every link text
// . TODO: repeat for linkInfo2, the imported link text
for ( Inlink * k = NULL ; info & & ( k = info - > getNextInlink ( k ) ) ; ) {
// breathe
QUICKPOLL ( m_niceness ) ;
// fast skip check for link text
if ( k - > size_linkText > = 3 & & + + kcount > = 20 ) continue ;
// fast skip check for rss item
if ( k - > size_rssItem > 10 & & + + rcount > = 20 ) continue ;
// set Url
Url u ;
u . set ( k - > ptr_urlBuf , k - > size_urlBuf ) ;
// is it the same host as us?
bool sh = true ;
// the title url
Url * tu = xd - > getFirstUrl ( ) ;
// skip if not from same host and should be
if ( tu - > getHostLen ( ) ! = u . getHostLen ( ) ) sh = false ;
// skip if not from same host and should be
if ( strncmp ( tu - > getHost ( ) , u . getHost ( ) , u . getHostLen ( ) ) ) sh = false ;
// get the link text
if ( k - > size_linkText > = 3 ) {
// if same host and it already "voted" skip it
//if ( sh && didHost >= 4 ) continue;
// only one vote for this host
//if ( sh ) didHost++;
// set the words to it
//if ( ! k->setXmlFromLinkText ( &tx[ti] ) )
// return false;
char * p = k - > ptr_linkText ;
long plen = k - > size_linkText - 1 ;
if ( ! verifyUtf8 ( p , plen ) ) {
log ( " title: set4 bad link text from url=%s " ,
k - > ptr_urlBuf ) ;
continue ;
}
// now the words.
if ( ! tw [ ti ] . set ( k - > ptr_linkText ,
k - > size_linkText - 1 , // len
TITLEREC_CURRENT_VERSION ,
true , // computeIds
0 ) ) // niceness
return false ;
// set the bookends, it is the whole thing
cptrs [ n ] = & tw [ ti ] ;
as [ n ] = 0 ;
bs [ n ] = tw [ ti ] . getNumWords ( ) ;
htmlEnc [ n ] = false ;
// score higher if same host
if ( sh ) scores [ n ] = 1.05 ;
// do not count so high if remote!
else scores [ n ] = 0.80 ;
// set the type
if ( sh ) types [ n ] = TT_LINKTEXTLOCAL ;
else types [ n ] = TT_LINKTEXTREMOTE ;
// another candidate
n + + ;
// use xml and words
ti + + ;
// break out if too many already. save some for below.
if ( n + 30 > = MAX_TIT_CANDIDATES ) break ;
}
// get the rss item
if ( k - > size_rssItem < = 10 ) continue ;
// . returns false and sets g_errno on error
// . use a 0 for niceness
if ( ! k - > setXmlFromRSS ( & tx [ ti ] , 0 ) ) return false ;
// if same host and it already "voted" skip it
//if ( sh && didHost >= 4 ) continue;
// only one vote for this host
//if ( sh ) didHost++;
// get the word range
long tslen ;
bool isHtmlEnc ;
char * ts = tx [ ti ] . getRSSTitle ( & tslen , & isHtmlEnc ) ;
// skip if not in the rss
if ( ! ts ) continue ;
// skip if empty
if ( tslen < = 0 ) continue ;
// now set words to that
if ( ! tw [ ti ] . set ( ts ,
tslen ,
TITLEREC_CURRENT_VERSION ,
true , // compute wordIds?
0 ) ) // niceness
return false ;
// point to that
cptrs [ n ] = & tw [ ti ] ;
as [ n ] = 0 ;
bs [ n ] = tw [ ti ] . getNumWords ( ) ;
htmlEnc [ n ] = isHtmlEnc ;
// increment since we are using it
ti + + ;
// base score for rss title
if ( sh ) scores [ n ] = 5.0 ;
// if not same host, treat like link text
else scores [ n ] = 2.0 ;
// set the type
if ( sh ) types [ n ] = TT_RSSITEMLOCAL ;
else types [ n ] = TT_RSSITEMREMOTE ;
// advance
n + + ;
// break out if too many already. save some for below.
if ( n + 30 > = MAX_TIT_CANDIDATES ) break ;
}
//logf(LOG_DEBUG,"title: took1=%lli",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// process the imported link info
info = * xd - > getLinkInfo2 ( ) ;
// only process it once though, use the flag "didit" to control that
if ( ! didit & & info ) { didit = true ; goto fooloop ; }
// . set the flags array
// . indicates what words are in title candidates already, but
// that is set below
// . up here we set words that are not allowed to be in candidates,
// like words that are in a link that is not a self link
// . alloc for it
char * flags = NULL ;
char localBuf [ 10000 ] ;
long need = WW - > getNumWords ( ) ;
if ( need < = 10000 ) flags = ( char * ) localBuf ;
else flags = ( char * ) mmalloc ( need , " TITLEflags " ) ;
if ( ! flags ) return false ;
// clear it
memset ( flags , 0 , need ) ;
// check tags in body
nodeid_t * tids = WW - > getTagIds ( ) ;
// scan to set link text flags
// loop over all "words" in the html body
char inLink = false ;
char selfLink = false ;
for ( long i = 0 ; i < NW ; i + + ) {
// breathe
QUICKPOLL ( m_niceness ) ;
// if in a link that is not self link, cannot be in a candidate
if ( inLink & & ! selfLink ) flags [ i ] | = 0x02 ;
// out of a link
if ( tids [ i ] = = ( TAG_A | BACKBIT ) ) inLink = false ;
// if not start of <a> tag, skip it
if ( tids [ i ] ! = TAG_A ) continue ;
// flag it
inLink = true ;
// get the node in the xml
long xn = WW - > m_nodes [ i ] ;
// is it a self link?
long len ;
char * link = XML - > getString ( xn , " href " , & len ) ;
// . set the url class to this
// . TODO: use the base url in the doc
Url u ; u . set ( link , len , true , false ) ;
// compare
if ( u . equals ( xd - > getFirstUrl ( ) ) )
selfLink = true ;
else
selfLink = false ;
// skip if not selfLink
if ( ! selfLink ) continue ;
// if it is a selflink , check for an "onClock" tag in the
// anchor tag to fix that Mixx issue for:
// http://www.npr.org/templates/story/story.php?storyId=5417137
long oclen ;
char * oc = NULL ;
if ( ! oc ) oc = XML - > getString ( xn , " onclick " , & oclen ) ;
if ( ! oc ) oc = XML - > getString ( xn , " onClick " , & oclen ) ;
// assume not a self link if we see that...
if ( oc ) selfLink = false ;
// if this <a href> link has a "title" attribute, use that
// instead! that thing is solid gold.
long atlen ;
char * atitle = XML - > getString ( xn , " title " , & atlen ) ;
// stop and use that, this thing is gold!
if ( ! atitle | | atlen < = 0 ) continue ;
// craziness? ignore it...
if ( atlen > 400 ) continue ;
// if it contains permanent or permalink, ignore it!
if ( strncasestr ( atitle , " permalink " , atlen ) ) continue ;
if ( strncasestr ( atitle , " permanent " , atlen ) ) continue ;
// do not count the link text as viable
selfLink = false ;
// aw, dammit
if ( ti > = MAX_TIT_CANDIDATES ) continue ;
// other dammit
if ( n > = MAX_TIT_CANDIDATES ) break ;
// ok, process it
if ( ! tw [ ti ] . set ( atitle ,
atlen , // len
TITLEREC_CURRENT_VERSION ,
true , // computeIds
0 ) ) // niceness
return false ;
// set the bookends, it is the whole thing
cptrs [ n ] = & tw [ ti ] ;
as [ n ] = 0 ;
bs [ n ] = tw [ ti ] . getNumWords ( ) ;
htmlEnc [ n ] = false ;
scores [ n ] = 3.0 ; // not ALWAYS solid gold!
types [ n ] = TT_TITLEATT ;
// we are using the words class
ti + + ;
// advance
n + + ;
// break out if too many already. save some for below.
if ( n + 20 > = MAX_TIT_CANDIDATES ) break ;
}
//logf(LOG_DEBUG,"title: took2=%lli",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
//long long *wids = WW->getWordIds();
// . find the last positive scoring guy
// . do not consider title candidates after "r" if "r" is non-zero
// . FIXES http://larvatusprodeo.net/2009/01/07/partisanship-politics-
// and-participation/
/*
long r = NW - 1 ;
if ( ! SS ) r = 0 ;
for ( ; r > 0 ; r - - ) {
// skip if no word
if ( wids [ r ] = = 0LL ) continue ;
if ( SS - > m_scores [ r ] > 0 ) break ;
}
// if it is zero that means we had none! so consider all titles!
if ( r = = 0 ) r = NW ;
*/
// . Sections class obsoletes Scores class
// . this sets r to -1 if no words in article
//long seca,secb;
//sections->getArticleRange ( &seca , &secb );
// do we have a valid article even?
//bool validArticle = false;
// this must be something
//if ( seca < secb ) validArticle = true;
// the candidate # of the title tag
long tti = - 1 ;
// if no "article content", ignore these tags
//if ( secb == -1 ) {
s_candTags [ TAG_B ] = 0 ;
s_candTags [ TAG_H1 ] = 0 ;
s_candTags [ TAG_H2 ] = 0 ;
s_candTags [ TAG_H3 ] = 0 ;
s_candTags [ TAG_DIV ] = 0 ;
s_candTags [ TAG_TD ] = 0 ;
s_candTags [ TAG_P ] = 0 ;
s_candTags [ TAG_FONT ] = 0 ;
//}
// allow up to 4 tags from each type
char table [ 512 ] ;
// sanity check
if ( getNumXmlNodes ( ) > 512 ) { char * xx = NULL ; * xx = 0 ; }
// clear table counts
memset ( table , 0 , 512 ) ;
// ignore "titles" in script or style tags
bool ignore = false ;
// the first word
char * wstart = NULL ; if ( NW > 0 ) wstart = WW - > getWord ( 0 ) ;
// loop over all "words" in the html body
for ( long i = 0 ; i < NW ; i + + ) {
// come back up here if we encounter another "title-ish" tag
// within our first alleged "title-ish" tag
subloop :
// get the tag id minus the back tag bit
nodeid_t tid = tids [ i ] & BACKBITCOMP ;
// pen up and pen down for these comment like tags
if ( tid = = TAG_SCRIPT | | tid = = TAG_STYLE ) {
// if start of it flag it
if ( tids [ i ] & BACKBIT ) ignore = false ;
else ignore = true ;
}
// stop after 30k of text
if ( WW - > getWord ( i ) - wstart > 200000 )
break ; // 1106
// keep going if in script or style tag
if ( ignore ) continue ;
// skip if not a good tag.
if ( ! s_candTags [ tid ] ) continue ;
// must NOT be a back tag
if ( tids [ i ] & BACKBIT ) continue ;
// skip if we hit our limit
if ( table [ tid ] > = 4 ) continue ;
// after the document body we can only have "self link" titles
//if ( validArticle && i >= secb && tid != 2 ) continue;
// when using pdftohtml, the title tag is the filename
if ( tid = = TAG_TITLE & & * xd - > getContentType ( ) = = CT_PDF )
continue ;
// skip over tag/word #i
i + + ;
// no words in links, unless it is a self link
if ( i < NW & & ( flags [ i ] & 0x02 ) ) continue ;
// the start should be here
long start = - 1 ;
// do not go too far
long max = i + 200 ;
// find the corresponding back tag for it
for ( ; i < NW & & i < max ; i + + ) {
// hey we got it, BUT we got no alnum word first
// so the thing was empty, so loop back to subloop
if ( ( tids [ i ] & BACKBITCOMP ) = = tid & &
( tids [ i ] & BACKBIT ) & &
start = = - 1 )
goto subloop ;
// if we hit another title-ish tag, loop back up
if ( s_candTags [ tids [ i ] & BACKBITCOMP ] ) {
// if no alnum text, restart at the top
if ( start = = - 1 )
goto subloop ;
// otherwise, break out and see if title works
break ;
}
// if we hit a breaking tag...
if ( isBreakingTagId ( tids [ i ] & BACKBITCOMP ) & &
// do not consider <span> tags breaking for
// our purposes. i saw a <h1><span> setup before.
tids [ i ] ! = TAG_SPAN )
break ;
// skip if not alnum word
if ( ! WW - > isAlnum ( i ) ) continue ;
// if in link and score is 0 stop
//if ( SS && SS->m_scores[i] <= 0 ) break;
// if we hit an alnum word, break out
if ( start = = - 1 ) start = i ;
}
// if no start was found, must have had a 0 score in there
if ( start = = - 1 ) continue ;
// if we exhausted the doc, we are done
if ( i > = NW )
break ;
// skip if way too big!
if ( i > = max ) continue ;
// if was too long do not consider a title
if ( i - start > 300 ) continue ;
// if not a back tag, that is bad too
//if ( ! WW->isBackTag(i) ) continue;
// . skip if too many bytes
// . this does not include the length of word #i, but #(i-1)
if ( WW - > getStringSize ( start , i ) > 1000 ) continue ;
// count it
table [ tid ] + + ;
// max it out if we are positive scoring. stop after the
// first positive scoring guy in a section. this might
// hurt the "Hamlet" thing though...
// MDW: well we now uses Sections, so commented this out
//if ( SS && SS->m_scores[start] > 0 ) table[tid] = 100;
// store a point to the title tag guy. Msg20.cpp needs this
// because the zak's proximity algo uses it in Summary.cpp
// and in Msg20.cpp
if ( tid = = TAG_TITLE & &
// only get the first one! often the 2nd on is in
// an iframe!! which we now expand into here.
m_titleTagStart = = - 1 ) {
m_titleTagStart = start ;
m_titleTagEnd = i ;
// save the candidate # because we always use this
// as the title if we are a root
if ( tti < 0 ) tti = n ;
}
// point to words class of the body that was passed in to us
cptrs [ n ] = WW ;
as [ n ] = start ;
bs [ n ] = i ;
htmlEnc [ n ] = true ;
if ( tid = = TAG_B ) types [ n ] = TT_BOLDTAG ;
if ( tid = = TAG_H1 ) types [ n ] = TT_HTAG ;
if ( tid = = TAG_H2 ) types [ n ] = TT_HTAG ;
if ( tid = = TAG_H3 ) types [ n ] = TT_HTAG ;
if ( tid = = TAG_TITLE ) types [ n ] = TT_TITLETAG ;
if ( tid = = TAG_DIV ) types [ n ] = TT_DIVTAG ;
if ( tid = = TAG_TD ) types [ n ] = TT_TDTAG ;
if ( tid = = TAG_P ) types [ n ] = TT_PTAG ;
if ( tid = = TAG_FONT ) types [ n ] = TT_FONTTAG ;
if ( tid = = TAG_A ) types [ n ] = TT_ATAG ;
// the score
if ( tid = = TAG_B ) scores [ n ] = 1.0 ;
else if ( tid = = TAG_H1 ) scores [ n ] = 1.8 ;
else if ( tid = = TAG_H2 ) scores [ n ] = 1.7 ;
else if ( tid = = TAG_H3 ) scores [ n ] = 1.6 ;
else if ( tid = = TAG_TITLE ) scores [ n ] = 3.0 ;
else if ( tid = = TAG_DIV ) scores [ n ] = 1.0 ;
else if ( tid = = TAG_TD ) scores [ n ] = 1.0 ;
else if ( tid = = TAG_P ) scores [ n ] = 1.0 ;
else if ( tid = = TAG_FONT ) scores [ n ] = 1.0 ;
// . self link is very powerful
// . BUT http://www.npr.org/templates/story/
// story.php?storyId=5417137 doesn't use it right! so use
// 1.3 instead of 3.0. that has an "onClick" thing in the
// <a> tag, so check for that!
// this was bad for http://www.spiritualwoman.net/?cat=191
// so i am demoting from 3.0 to 1.5
else if ( tid = = TAG_A ) scores [ n ] = 1.5 ;
// count it
n + + ;
// start loop over at tag #i, for loop does an i++, so negate
// that so this will work
i - - ;
// break out if too many already. save some for below.
if ( n + 10 > = MAX_TIT_CANDIDATES ) break ;
}
//logf(LOG_DEBUG,"title: took3=%lli",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
/*
// add in the dmoz title
const unsigned char numCatids = xd - > size_catIds / 4 ;
//char *dmozTitles = xd->ptr_dmozTitles;
//long *dmozTitleLens = tr->getDmozTitleLens();
// dmoz titles are always stored in UTF-8 format
Xml dxml ;
Words dwords ;
if ( numCatids & & xd - > ptr_dmozTitles & & n < MAX_TIT_CANDIDATES ) {
// point to buffer of NULL terminated titles
char * dt = xd - > ptr_dmozTitles ;
// set the xml
if ( ! dxml . set ( dt ,
gbstrlen ( dt ) ,
false , // ownData?
0 , // allocSize
false , // pureXml?
xd - > m_version ) )
return false ;
// . set the words
// . javier says he doesn't htmldecode() the dmoz titles
// so they should have html entities in them
if ( ! dwords . set ( & dxml ,
true , // compute word ids
true ) ) // has html entities
return false ;
// set the ptrs
cptrs [ n ] = & dwords ;
htmlEnc [ n ] = true ;
scores [ n ] = 3.0 ;
types [ n ] = TT_DMOZ ;
as [ n ] = 0 ;
bs [ n ] = dwords . getNumWords ( ) ;
n + + ;
}
*/
// sanity check
if ( ! xd - > m_contentTypeValid ) { char * xx = NULL ; * xx = 0 ; }
// to handle text documents, throw in the first line of text
// as a title candidate, just make the score really low
bool textDoc = false ;
if ( xd - > m_contentType = = CT_UNKNOWN ) textDoc = true ;
if ( xd - > m_contentType = = CT_TEXT ) textDoc = true ;
// make "i" point to first alphabetical word in the document
long i ; for ( i = 0 ; textDoc & & i < NW & & ! WW - > isAlpha ( i ) ; i + + ) ;
// if we got a first alphabetical word, then assume that to be the
// start of our title
if ( textDoc & & i < NW & & n < MAX_TIT_CANDIDATES ) {
// first word in title is "t0"
long t0 = i ;
// find end of first line
long numWords = 0 ;
// set i to the end now. we MUST find a \n to terminate the
// title, otherwise we will not have a valid title
while ( i < NW & &
numWords < maxTitleWords & &
( WW - > isAlnum ( i ) | |
! WW - > hasChar ( i , ' \n ' ) ) ) {
if ( WW - > isAlnum ( i ) ) numWords + + ;
i + + ;
}
// "t1" is the end
long t1 = - 1 ;
// we must have found our \n in order to set "t1"
if ( i < = NW & & numWords < maxTitleWords ) t1 = i ;
// set the ptrs
cptrs [ n ] = WW ;
htmlEnc [ n ] = true ;
// this is the last resort i guess...
scores [ n ] = 0.5 ;
types [ n ] = TT_FIRSTLINE ;
as [ n ] = t0 ;
bs [ n ] = t1 ;
// add it as a candidate if t0 and t1 were valid
if ( t0 > = 0 & & t1 > t0 ) n + + ;
}
//logf(LOG_DEBUG,"title: took4=%lli",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// now add the last url path to contain underscores or hyphens
char * pstart = xd - > getFirstUrl ( ) - > getPath ( ) ;
// get first url
Url * fu = xd - > getFirstUrl ( ) ;
// start at the end
char * p = fu - > getUrl ( ) + fu - > getUrlLen ( ) ;
// end pointer
char * pend = NULL ;
// come up here for each path component
while ( p > = pstart ) {
// save end
pend = p ;
// skip over /
if ( * p = = ' / ' ) p - - ;
// now go back to next /
long count = 0 ;
for ( ; p > = pstart & & * p ! = ' / ' ; p - - )
if ( * p = = ' _ ' | | * p = = ' - ' ) count + + ;
// did we get it?
if ( count > 0 ) break ;
}
// did we get any?
if ( p > pstart & & n < MAX_TIT_CANDIDATES ) {
// now set words to that
if ( ! tw [ ti ] . set ( p , // string
pend - p , // len
TITLEREC_CURRENT_VERSION ,
true , // compute wordIds?
0 ) ) // niceness
return false ;
// point to that
cptrs [ n ] = & tw [ ti ] ;
as [ n ] = 0 ;
bs [ n ] = tw [ ti ] . getNumWords ( ) ;
htmlEnc [ n ] = false ;
scores [ n ] = 1.0 ;
types [ n ] = TT_URLPATH ;
// increment since we are using it
ti + + ;
// advance
n + + ;
}
// save old n
long oldn = n ;
// . do not split titles if we are a root url
// . maps.yahoo.com was getting "Maps" for the title
Url * tu = xd - > getFirstUrl ( ) ;
if ( tu - > isRoot ( ) ) oldn = - 2 ;
// point to list of \0 separated titles
char * rootTitleBuf = NULL ;
char * rootTitleBufEnd = NULL ;
bool doRootTitleRemoval = false ;
if ( ! xd - > ptr_rootTitleBuf ) doRootTitleRemoval = false ;
// get the root title if we are not root!
if ( doRootTitleRemoval ) { // xd->ptr_rootTitleBuf ) {
// it should not block
char * * px = xd - > getFilteredRootTitleBuf ( ) ;
// error?
if ( ! px ) return false ;
// should never block! should be set from title rec basically
if ( px = = ( void * ) - 1 ) { char * xx = NULL ; * xx = 0 ; }
// point to list of \0 separated titles
rootTitleBuf = * px ;
rootTitleBufEnd = * px + xd - > m_filteredRootTitleBufSize ;
}
Matches m ;
if ( rootTitleBuf & & q ) m . setQuery ( q ) ;
// debug hack for 'spiritual books for women query'
//rootTitleBuf = "Forbes.com";
//rootTitleBufEnd = rootTitleBuf + gbstrlen(rootTitleBuf);
// convert into an array
long nr = 0 ;
char * pr = rootTitleBuf ;
char * rootTitles [ 20 ] ;
long rootTitleLens [ 20 ] ;
// loop over each root title segment
for ( ; pr & & pr < rootTitleBufEnd ; pr + = gbstrlen ( pr ) + 1 ) {
// if we had a query...
if ( q ) {
// reset it
m . reset ( ) ;
// see if root title segment has query terms in it
m . addMatches ( pr ,
gbstrlen ( pr ) ,
MF_TITLEGEN ,
xd - > m_docId ,
m_niceness ) ;
// if matches query, do NOT add it, we only add it for
// removing from the title of the page...
if ( m . getNumMatches ( ) ) continue ;
}
// point to it. it should start with an alnum already
// since it is the "filtered" list of root titles...
// if not, fix it in xmldoc then.
rootTitles [ nr ] = pr ;
rootTitleLens [ nr ] = gbstrlen ( pr ) ;
// advance
nr + + ;
// no breaching
if ( nr > = 20 ) break ;
}
// TODO: fix this... put the isSiteRoot bit in title rec?
//if ( tu->isSiteRoot(xd->m_coll) ) oldn = -2;
// now split up candidates in children candidates by tokenizing
// using :, | and - as delimters.
// the hyphen must have a space on at least one side, so "cd-rom" does
// not create a pair of tokens...
// FIX: for the title:
// Best Careers 2009: Librarian - US News and World Report
// we need to recognize "Best Careers 2009: Librarian" as a subtitle
// otherwise we don't get it as the title. so my question is are we
// going to have to do all the permutations at some point? for now
// let's just add in pairs...
for ( long i = 0 ; i < oldn & & n + 3 < MAX_TIT_CANDIDATES ; i + + ) {
// stop if no root title segments
if ( nr < = 0 ) break ;
// get the word info
Words * w = cptrs [ i ] ;
long a = as [ i ] ;
long b = bs [ i ] ;
// init
long lasta = a ;
char prev = false ;
// char length in bytes
//long charlen = 1;
// see how many we add
long added = 0 ;
char * skipTo = NULL ;
bool qualified = true ;
// . scan the words looking for a token
// . sometimes the candidates end in ": " so put in "k < b-1"
// . made this from k<b-1 to k<b to fix
// "Hot Tub Time Machine (2010) - IMDb" to strip IMDb
for ( long k = a ; k < b & & n + 3 < MAX_TIT_CANDIDATES ; k + + ) {
// get word
char * wp = w - > getWord ( k ) ;
// skip if not alnum
if ( ! w - > isAlnum ( k ) ) {
// in order for next alnum word to
// qualify for "clipping" if it matches
// the root title, there has to be more
// than just spaces here, some punct.
// otherwise title
// "T. D. Jakes: Biography from Answers.com"
// becomes
// "T. D. Jakes: Biography from"
qualified = isWordQualified ( wp , w - > getWordLen ( k ) ) ;
continue ;
}
// gotta be qualified!
if ( ! qualified ) continue ;
// skip if in root title
if ( skipTo & & wp < skipTo ) continue ;
// does this match any root page title segments?
long j ;
for ( j = 0 ; j < nr ; j + + ) {
// . compare to root title
// . break out if we matched!
if ( ! strncmp ( wp ,
rootTitles [ j ] ,
rootTitleLens [ j ] ) )
break ;
}
// if we did not match a root title segment,
// keep on chugging
if ( j > = nr ) continue ;
// . we got a root title match!
// . skip over
skipTo = wp + rootTitleLens [ j ] ;
// must land on qualified punct then!!
long e = k + 1 ;
for ( ; e < b & & w - > m_words [ e ] < skipTo ; e + + ) ;
// ok, word #e must be a qualified punct
if ( e < b & &
! isWordQualified ( w - > getWord ( e ) , w - > getWordLen ( e ) ) )
// assume no match then!!
continue ;
// if we had a previous guy, reset the end of the
// previous candidate
if ( prev ) {
bs [ n - 2 ] = k ;
bs [ n - 1 ] = k ;
}
// . ok, we got two more candidates
// . well, only one more if this is not the 1st time
if ( ! prev ) {
cptrs [ n ] = cptrs [ i ] ;
htmlEnc [ n ] = htmlEnc [ i ] ;
scores [ n ] = scores [ i ] ;
types [ n ] = types [ i ] ;
as [ n ] = lasta ;
bs [ n ] = k ;
parent [ n ] = i ;
n + + ;
added + + ;
}
// the 2nd one
cptrs [ n ] = cptrs [ i ] ;
htmlEnc [ n ] = htmlEnc [ i ] ;
scores [ n ] = scores [ i ] ;
types [ n ] = types [ i ] ;
as [ n ] = e + 1 ;
bs [ n ] = bs [ i ] ;
parent [ n ] = i ;
n + + ;
added + + ;
// now add in the last pair as a whole token
cptrs [ n ] = cptrs [ i ] ;
htmlEnc [ n ] = htmlEnc [ i ] ;
scores [ n ] = scores [ i ] ;
types [ n ] = types [ i ] ;
as [ n ] = lasta ;
bs [ n ] = bs [ i ] ;
parent [ n ] = i ;
n + + ;
added + + ;
// nuke the current candidate then since it got
// split up to not contain the root title...
//cptrs[i] = NULL;
// update this
lasta = k + 1 ;
// if we encounter another delimeter we will have
// to revise bs[n-1], so note that
prev = true ;
}
// nuke the current candidate then since it got
// split up to not contain the root title...
if ( added ) {
scores [ i ] = 0.001 ;
//cptrs[i] = NULL;
}
// erase the pair if that there was only one token
if ( added = = 3 ) n - - ;
}
//logf(LOG_DEBUG,"title: took5=%lli",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// set base score
for ( long i = 0 ; i < n ; i + + ) baseScore [ i ] = scores [ i ] ;
// set # alnum words
for ( long i = 0 ; i < n ; i + + ) {
// point to the words
Words * w = cptrs [ i ] ;
// skip if got nuked above
if ( ! w ) continue ;
// get the word boundaries
long a = as [ i ] ;
long b = bs [ i ] ;
long count = 0 ;
// scan the words in this title candidate
for ( long j = a ; j < b ; j + + )
if ( w - > isAlnum ( j ) ) count + + ;
// store it
numAlnum [ i ] = count ;
}
//
// . now punish by 0.85 for every lower case non-stop word it has
// . reward by 1.1 if has a non-stopword in the query
//
for ( long i = 0 ; i < n ; i + + ) {
// point to the words
Words * w = cptrs [ i ] ;
// skip if got nuked above
if ( ! w ) continue ;
// the word ptrs
char * * wptrs = w - > getWordPtrs ( ) ;
// skip if empty
if ( w - > getNumWords ( ) < = 0 ) continue ;
// get the word boundaries
long a = as [ i ] ;
long b = bs [ i ] ;
// record the boosts
float ncb = 1.0 ;
float qtb = 1.0 ;
// a flag
char uncapped = false ;
// scan the words in this title candidate
for ( long j = a ; j < b ; j + + ) {
// skip stop words
if ( w - > isQueryStopWord ( j ) ) continue ;
// punish if uncapitalized non-stopword
if ( ! w - > isCapitalized ( j ) ) uncapped = true ;
// skip if no query
if ( ! q ) continue ;
// convert the word id into a term id
//int64_ttermid=g_indexdb.getTermId(0,w->getWordId(j));
long long wid = w - > getWordId ( j ) ;
// reward if in the query
if ( q - > getWordNum ( wid ) > = 0 ) {
qtb * = 1.5 ;
scores [ i ] * = 1.5 ;
}
}
// . only punish once if missing a capitalized word
// . hurts us for:
// http://content-uk.cricinfo.com/ausvrsa2008_09/engine/
// current/match/351682.html
if ( uncapped ) {
ncb * = 1.00 ; //0.85;
scores [ i ] * = 1.00 ; //0.85;
}
// punish if a http:// title thingy
char * s = wptrs [ a ] ; //w->getWord(a);
long size = w - > getStringSize ( a , b ) ;
if ( size > 9 & & memcmp ( " http:// " , s , 7 ) = = 0 )
ncb * = .10 ;
if ( size > 14 & & memcmp ( " h \0 t \0 t \0 p \0 : \0 / \0 / " , s , 14 ) = = 0 )
ncb * = .10 ;
// set these guys
scores [ i ] * = ncb ;
noCapsBoost [ i ] = ncb ;
qtermsBoost [ i ] = qtb ;
}
//logf(LOG_DEBUG,"title: took6=%lli",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// . now compare each candidate to the other candidates
// . give a boost if matches
for ( long i = 0 ; i < n ; i + + ) {
// point to the words
Words * w1 = cptrs [ i ] ;
// skip if got nuked above
if ( ! w1 ) continue ;
long a1 = as [ i ] ;
long b1 = bs [ i ] ;
//long nw1 = b1 - a1;
// reset our array
//long found[512];
// sanity check
//if ( nw1 > 512 ) { char *xx=NULL;*xx=0; };
//memset ( found , 0 , 4*512);
// reset some flags
char localFlag1 = 0 ;
char localFlag2 = 0 ;
// record the boost
float iccb = 1.0 ;
// total boost
float total = 1.0 ;
//long count = 0;
// to each other candidate
for ( long j = 0 ; j < n ; j + + ) {
// not to ourselves
if ( j = = i ) continue ;
// or our derivatives
if ( parent [ j ] = = i ) continue ;
// or derivates to their parent
if ( parent [ i ] = = j ) continue ;
// only check parents now. do not check kids.
// this was only for when doing percent contained
// not getSimilarity() per se
//if ( parent[j] != -1 ) continue;
//
// TODO: do not accumulate boosts from a parent
// and its kids, subtitles...
//
// do not compare type X to type Y
if ( types [ i ] = = TT_TITLETAG ) {
if ( types [ j ] = = TT_TITLETAG ) continue ;
}
// do not compare a div candidate to another div cand
// http://friendfeed.com/foxiewire?start=30
// likewise, a TD to another TD
// http://content-uk.cricinfo.com/ausvrsa2008_09/
// engine/match/351681.html ... etc.
if ( types [ i ] = = TT_BOLDTAG | |
types [ i ] = = TT_HTAG | |
types [ i ] = = TT_DIVTAG | |
types [ i ] = = TT_TDTAG | |
types [ i ] = = TT_FONTTAG ) {
if ( types [ j ] = = types [ i ] ) continue ;
}
// . do not compare one kid to another kid
// . i.e. if we got "x | y" as a title and "x | z"
// as a link text, it will emphasize "x" too much
// http://content-uk.cricinfo.com/ausvrsa2008_09/
// engine/current/match/351682.html
if ( parent [ j ] ! = - 1 & & parent [ i ] ! = - 1 ) continue ;
// . body type tags are mostly mutually exclusive
// . for the legacy.com url mentioned below, we have
// good stuff in <td> tags, so this hurts us...
// . but for the sake of
// http://larvatusprodeo.net/2009/01/07/partisanship
// -politics-and-participation/ i put bold tags back
//if ( types[i] == TT_BOLDTAG )
// if ( types[j] == TT_BOLDTAG ) continue;
/*
if ( types [ i ] = = TT_BOLDTAG | |
types [ i ] = = TT_HTAG | |
types [ i ] = = TT_DIVTAG | |
types [ i ] = = TT_TDTAG | |
types [ i ] = = TT_FONTTAG ) {
if ( types [ j ] = = TT_HTAG ) continue ;
if ( types [ j ] = = TT_BOLDTAG ) continue ;
if ( types [ j ] = = TT_DIVTAG ) continue ;
if ( types [ j ] = = TT_TDTAG ) continue ;
if ( types [ j ] = = TT_FONTTAG ) continue ;
}
*/
if ( types [ i ] = = TT_LINKTEXTLOCAL ) {
if ( types [ j ] = = TT_LINKTEXTLOCAL ) continue ;
}
if ( types [ i ] = = TT_RSSITEMLOCAL ) {
if ( types [ j ] = = TT_RSSITEMLOCAL ) continue ;
}
// only compare to one local link text for each i
if ( types [ j ] = = TT_LINKTEXTLOCAL & & localFlag1 )
continue ;
if ( types [ j ] = = TT_RSSITEMLOCAL & & localFlag2 )
continue ;
if ( types [ j ] = = TT_LINKTEXTLOCAL ) localFlag1 = 1 ;
if ( types [ j ] = = TT_RSSITEMLOCAL ) localFlag2 = 1 ;
// not link title attr to link title attr either
// fixes http://www.spiritualwoman.net/?cat=191
if ( types [ i ] = = TT_TITLEATT & &
types [ j ] = = TT_TITLEATT )
continue ;
// get our words
Words * w2 = cptrs [ j ] ;
// skip if got nuked above
if ( ! w2 ) continue ;
long a2 = as [ j ] ;
long b2 = bs [ j ] ;
// use body scores if we can
//Scores *scores1 = NULL;
//Scores *scores2 = NULL;
//if ( w1 == WW ) scores1 = SS;
//if ( w2 == WW ) scores2 = SS;
/*
// make his hashtable
HashTable ht ;
char hbuf [ 5000 ] ;
// but we cannot have more than 1024 slots then
if ( ! ht . set ( 256 , hbuf , 5000 ) ) return false ;
// and table auto grows when 90% full, so limit us here
long count = 0 ;
// loop over all words in "w1" and hash them
for ( long k = a2 ; k < b2 & & count < 128 ; k + + ) {
// the word id
long wid = ( long ) w2 - > m_wordIds [ k ] ;
// skip if not indexable
if ( wid = = 0 ) continue ;
// count it
count + + ;
// add to table
if ( ! ht . addKey ( ( long ) wid , 1 , NULL ) )
return false ;
}
// which words are found in another candidate
for ( long k = 0 ; k < nw1 ; k + + ) {
// get word id
long wid = ( long ) w1 - > m_wordIds [ a1 + k ] ;
// skip if punct. set it to -1
if ( wid = = 0LL ) { found [ k ] = - 1 ; continue ; }
// see if in table
long slot = ht . getSlot ( wid ) ;
// this word was found in another candidate
if ( slot > = 0 ) found [ k ] + + ;
}
*/
// how similar is title #i to title #j ?
float fp = getSimilarity ( w2 , a2 , b2 ,
w1 , a1 , b1 ) ;
// TODO: scores1 , scores2 );
// error?
if ( fp = = - 1.0 ) return false ;
// give a 1.1 boost per word i guess
//float boost = 1.0;
// get # of "matched words" in the two titles
//long nw1 = (long)(fp * (float)numAlnum[i]);
//for ( long v = 0 ; v < nw1 ; v++ )
// boost *= 1.1;
// custom boosting...
float boost = 1.0 ;
if ( fp > = .95 ) boost = 3.0 ;
else if ( fp > = .90 ) boost = 2.0 ;
else if ( fp > = .85 ) boost = 1.5 ;
else if ( fp > = .80 ) boost = 1.4 ;
else if ( fp > = .75 ) boost = 1.3 ;
else if ( fp > = .70 ) boost = 1.2 ;
else if ( fp > = .60 ) boost = 1.1 ;
else if ( fp > = .50 ) boost = 1.08 ;
else if ( fp > = .40 ) boost = 1.04 ;
// limit total
total * = boost ;
if ( total > 100.0 ) break ;
// if you are matching the url path, that is pretty
// good so give more!
// actually, that would hurt:
// http://michellemalkin.com/2008/12/29/gag-worthy/
/*
if ( types [ j ] = = TT_URLPATH ) {
float delta = boost - 1.0 ;
// double the delta boost
boost = boost + delta ;
}
*/
// . boost by that!
// . if 100% similar give x3.0
// . if 0% similar x1.0
//float boost = 1.0 + (2.0 * fp);
//float boost = ((1.0 + fp)*(1.0 + fp));
// custom boosting!
if ( fp > 0.0 & & g_conf . m_logDebugTitle )
logf ( LOG_DEBUG , " title: i=%li j=%li fp=%.02f "
" b=%.02f " , i , j , fp , boost ) ;
// apply it
scores [ i ] * = boost ;
iccb * = boost ;
}
// . boost from words that word found in other candidates
// . TODO: dedup the found vector so we don't count the same
// word twice!!
/*
float boost = 1.0 ;
for ( long k = 0 ; k < nw1 ; k + + ) {
// skip punct
if ( found [ k ] = = - 1 ) continue ;
// boost or punish
if ( found [ k ] ) boost * = 1.20 ;
else boost * = 0.85 ;
}
// assigne
scores [ i ] = boost ;
inCommonCandBoost [ i ] = boost ;
*/
inCommonCandBoost [ i ] = iccb ;
}
//logf(LOG_DEBUG,"title: took7=%lli",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// loop over all n candidates
for ( long i = 0 ; i < n ; i + + ) {
// skip if not in the document body
if ( cptrs [ i ] ! = WW ) continue ;
// point to the words
long a1 = as [ i ] ;
long b1 = bs [ i ] ;
// . loop through this candidates words
// . TODO: use memset here?
for ( long j = a1 ; j < = b1 & & j < NW ; j + + )
// flag it
flags [ j ] | = 0x01 ;
}
Section * * sp = NULL ;
if ( sections ) sp = sections - > m_sectionPtrs ;
//logf(LOG_DEBUG,"title: took8=%lli",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
/*
MDW : removed since SEC_ARTICLE was removed - - - -
// . now compare each candidate to the words in the positive scoring
// body of the document.
// . hash each word in the document with a positive score
// . go up to the first 5000 "words"
// . hash up to 1000 "words"
HashTableT < long long , long > ht ;
inLink = false ;
for ( long i = 0 ; i < NW & & i < 5000 ; i + + ) {
// see whose in a link tag
if ( tids [ i ] = = TAG_A ) inLink = true ;
if ( tids [ i ] = = ( TAG_A | BACKBIT ) ) inLink = false ;
// must be alnum word
if ( wids [ i ] = = 0LL ) continue ;
// skip if not in article section
if ( sp & & ! ( sp [ i ] - > m_flags & SEC_ARTICLE ) ) continue ;
// skip if 0 score
//if ( SS && SS->m_scores[i] <= 0 ) continue;
// . skip if this word is in a candidate title
// . for http://www.legacy.com/shelbystar/Obituaries.asp?Pa
// ge=LifeStory&PersonId=122245831
// the body is actually a <td> candidate and the first
// td candidate is a good title and is unable to get boost
// from the body because it is a <td> candidate! so remove
// this logic for now
if ( flags [ i ] & 0x01 ) continue ;
// or in a link as determined with the flags
if ( flags [ i ] & 0x02 ) continue ;
// skip if in a link
if ( inLink ) continue ;
// skip if stop word
if ( WW - > isQueryStopWord ( i ) ) continue ;
// . hash it. return false if error adding it.
// . store the word # so we can avoid comparing to ourselves
// in case the title candidate intersect this part of the doc
if ( ! ht . addKey ( wids [ i ] , i ) ) {
if ( flags ! = localBuf ) mfree ( flags , need , " TITLEflags " ) ;
return false ;
}
}
*/
// free our stuff
if ( flags ! = localBuf ) mfree ( flags , need , " TITLEflags " ) ;
//logf(LOG_DEBUG,"title: took9=%lli",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// ok, now compare each candidate to that hash table
for ( long i = 0 ; i < n ; i + + ) {
// record the boost
float icbb = 1.0 ;
/*
MDW : removed since SEC_ARTICLE was removed - - - -
// point to the words
Words * w1 = cptrs [ i ] ;
long a1 = as [ i ] ;
long b1 = bs [ i ] ;
long nw1 = w1 - > getNumWords ( ) ;
long long * wids1 = w1 - > getWordIds ( ) ;
// loop through this candidates words
for ( long j = a1 ; j < = b1 & & j < nw1 ; j + + ) {
// skip if not alnum
if ( wids1 [ j ] = = 0LL ) continue ;
// is it in the positive scoring body?
if ( ! ht . getValuePtr ( wids1 [ j ] ) ) continue ;
// boost score by 20% for every term we have that
// is also in the positive scoring body
icbb * = 1.20 ;
scores [ i ] * = 1.20 ;
}
*/
inCommonBodyBoost [ i ] = icbb ;
}
//logf(LOG_DEBUG,"title: took10=%lli",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// now get the highest scoring candidate title
float max = - 1.0 ;
long winner = - 1 ;
for ( long i = 0 ; i < n ; i + + ) {
// skip if got nuked
if ( ! cptrs [ i ] ) continue ;
if ( winner ! = - 1 & & scores [ i ] < = max ) continue ;
// url path's cannot be titles in and of themselves
if ( types [ i ] = = TT_URLPATH ) continue ;
// skip if empty basically, like if title was exact
// copy of root, then the whole thing got nuked and
// some empty string added, where a > b
if ( as [ i ] > = bs [ i ] ) continue ;
// got one
max = scores [ i ] ;
// save it
winner = i ;
}
//logf(LOG_DEBUG,"title: took11=%lli",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// if we are a root, always pick the title tag as the title
if ( oldn = = - 2 & & tti > = 0 ) winner = tti ;
// if no winner, all done. no title
if ( winner = = - 1 ) return true ;
// point to the words class of the winner
Words * w = cptrs [ winner ] ;
// skip if got nuked above
if ( ! w ) { char * xx = NULL ; * xx = 0 ; }
// make the Pos class of the winner, and point "pp" to it
Pos * pp = POS ;
//Scores *ss = SS;
// need to make our own Pos class if title not from body
Pos tp ;
if ( w ! = WW ) {
// use the temp Pos class, "tp"
pp = & tp ;
// use no scores then
//ss = NULL;
// set "Scores" ptr to NULL. we assume all are positive scores
if ( ! tp . set ( w , NULL ) ) return false ;
}
// the string ranges from word #a up to and including word #b
long a = as [ winner ] ;
long b = bs [ winner ] ;
// sanity check
if ( a < 0 | | b > w - > getNumWords ( ) ) { char * xx = NULL ; * xx = 0 ; }
// save the title
if ( ! copyTitle ( w , pp , a , b , sections ) )
return false ;
// save these
m_htmlEncoded = htmlEnc [ winner ] ;
//logf(LOG_DEBUG,"title: took12=%lli",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// return now if no need to log this stuff
//SafeBuf sb;
//pbuf = &sb;
if ( ! pbuf ) return true ;
//log("title: candidates for %s",xd->getFirstUrl()->getUrl() );
pbuf - > safePrintf ( " <table cellpadding=5 border=2><tr> "
" <td colspan=20><center><b>Title Generation</b> "
" </center></td> "
" </tr> \n <tr> "
" <td>#</td> "
" <td>type</td> "
" <td>parent</td> "
" <td>base score</td> "
" <td>format penalty</td> "
" <td>query term boost</td> "
" <td>candidate intersection boost</td> "
" <td>body intersection boost</td> "
" <td>FINAL SCORE</td> "
" <td>title</td> "
" </tr> \n " ) ;
// print out all candidates
for ( long i = 0 ; i < n ; i + + ) {
char * ts = " unknown " ;
if ( types [ i ] = = TT_LINKTEXTLOCAL ) ts = " local inlink text " ;
if ( types [ i ] = = TT_LINKTEXTREMOTE ) ts = " remote inlink text " ;
if ( types [ i ] = = TT_RSSITEMLOCAL ) ts = " local rss title " ;
if ( types [ i ] = = TT_RSSITEMREMOTE ) ts = " remote rss title " ;
if ( types [ i ] = = TT_BOLDTAG ) ts = " bold tag " ;
if ( types [ i ] = = TT_HTAG ) ts = " header tag " ;
if ( types [ i ] = = TT_TITLETAG ) ts = " title tag " ;
if ( types [ i ] = = TT_DMOZ ) ts = " dmoz title " ;
if ( types [ i ] = = TT_FIRSTLINE ) ts = " first line in text " ;
if ( types [ i ] = = TT_FONTTAG ) ts = " font tag " ;
if ( types [ i ] = = TT_ATAG ) ts = " anchor tag " ;
if ( types [ i ] = = TT_DIVTAG ) ts = " div tag " ;
if ( types [ i ] = = TT_TDTAG ) ts = " td tag " ;
if ( types [ i ] = = TT_PTAG ) ts = " p tag " ;
if ( types [ i ] = = TT_URLPATH ) ts = " url path " ;
if ( types [ i ] = = TT_TITLEATT ) ts = " title attribute " ;
// get the title
pbuf - > safePrintf (
" <tr> "
" <td>#%li</td> "
" <td><nobr>%s</nobr></td> "
" <td>%li</td> "
" <td>%0.2f</td> " // baseScore
" <td>%0.2f</td> "
" <td>%0.2f</td> "
" <td>%0.2f</td> "
" <td>%0.2f</td> "
" <td>%0.2f</td> "
" <td> " ,
i ,
ts ,
parent [ i ] ,
baseScore [ i ] ,
noCapsBoost [ i ] ,
qtermsBoost [ i ] ,
inCommonCandBoost [ i ] ,
inCommonBodyBoost [ i ] ,
scores [ i ] ) ;
// ptrs
Words * w = cptrs [ i ] ;
long a = as [ i ] ;
long b = bs [ i ] ;
// skip if no words
if ( w - > getNumWords ( ) < = 0 ) continue ;
// the word ptrs
char * * wptrs = w - > getWordPtrs ( ) ;
// string ptrs
char * ptr = wptrs [ a ] ; //w->getWord(a);
long size = w - > getStringSize ( a , b ) ;
// it is utf8
pbuf - > safeMemcpy ( ptr , size ) ;
// end the line
pbuf - > safePrintf ( " </td></tr> \n " ) ;
}
pbuf - > safePrintf ( " </table> \n <br> \n " ) ;
//logf(LOG_DEBUG,"title: took13=%lli",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// log these for now
//log("title: %s",sb.getBufStart());
return true ;
}
// . returns 0.0 to 1.0
// . what percent of the alnum words in "w1" are in "w2" from words in [t0,t1)
// . gets 50% points if has all single words, and the other 50% if all phrases
// . Scores class applies to w1 only, use NULL if none
// . use word popularity information for scoring rarer term matches more
// . ONLY CHECKS FIRST 1000 WORDS of w2 for speed
float Title : : getSimilarity ( Words * w1 , long i0 , long i1 ,
Words * w2 , long t0 , long t1 ) {
// if either empty, that's 0% contained
if ( w1 - > getNumWords ( ) < = 0 ) return 0 ;
if ( w2 - > getNumWords ( ) < = 0 ) return 0 ;
if ( i0 > = i1 ) return 0 ;
if ( t0 > = t1 ) return 0 ;
// invalids vals
if ( i0 < 0 ) return 0 ;
if ( t0 < 0 ) return 0 ;
// . for this to be useful we must use idf
// . get the popularity of each word in w1
// . w1 should only be a few words since it is a title candidate
// . does not add pop for word #i if scores[i] <= 0
// . take this out for now since i removed the unified dict,
// we could use this if we added popularity to g_wiktionary
// but it would have to be language dependent
Pops pops1 ;
Pops pops2 ;
if ( ! pops1 . set ( w1 , i0 , i1 ) ) return - 1.0 ;
if ( ! pops2 . set ( w2 , t0 , t1 ) ) return - 1.0 ;
// now hash the words in w1, the needle in the haystack
long nw1 = w1 - > getNumWords ( ) ;
if ( i1 > nw1 ) i1 = nw1 ;
HashTable table ;
//long *ss1 = NULL;
//long *ss2 = NULL;
//if ( scores1 ) ss1 = scores1->m_scores;
//if ( scores2 ) ss2 = scores2->m_scores;
// this augments the hash table
//long long lastWids[1024];
long long lastWid = - 1 ;
float lastScore = 0.0 ;
// but we cannot have more than 1024 slots then
if ( ! table . set ( 1024 ) ) return - 1.0 ;
// and table auto grows when 90% full, so limit us here
long count = 0 ;
long maxCount = 20 ; // (1024 * 90) / 100 - 1;
// sum up everything we add
float sum = 0.0 ;
// loop over all words in "w1" and hash them
for ( long i = i0 ; i < i1 ; i + + ) {
// the word id
long long wid = ( long ) w1 - > m_wordIds [ i ] ;
// skip if not indexable
if ( wid = = 0 ) continue ;
// or score is 0
//if ( ss && ss[i] <= 0 ) continue;
// no room left in table!
if ( count + + > maxCount ) {
//logf(LOG_DEBUG, "query: Hash table for title "
// "generation too small. Truncating words from w1.");
break ;
}
// . map pop to a score, "pscore"
// . the least popular something is the more points it is worth
//val = MAX_POP - pops.m_pops[i];
// . make this a float. it ranges from 0.0 to 1.0
// . 1.0 means the word occurs in 100% of documents sampled
// . 0.0 means it occurs in none of them
// . but "val" is the complement of those two statements!
float score = 1.0 - pops1 . getNormalizedPop ( i ) ;
// accumulate
sum + = score ;
// debug
//logf(LOG_DEBUG,"adding wid=%li score=%.02f sum=%.02f",
// (long)wid,score,sum);
// accumulate for scoring phrases too! (adjacent words)
//psum += val;
// update the linked list
//if ( oldi < 1024 ) next[oldi] = i;
// prepare for next link, it may never come if we're last one!
//oldi = i;
// add to table
2013-09-16 08:10:53 +04:00
if ( ! table . addKey ( ( long ) wid , ( long ) score , NULL ) )
2013-08-03 00:12:24 +04:00
return - 1.0 ;
// if no last wid, continue
if ( lastWid = = - 1LL ) { lastWid = wid ; lastScore = score ; continue ; }
// keep this 1-1 with the hash table slots
//lastWids [ slot ] = lastWid;
// . what was his val?
// . the "val" of the phrase:
float phrScore = score + lastScore ;
// do not count as much as single words
phrScore * = 0.5 ;
// accumulate
sum + = phrScore ;
// get the phrase id
long long pid = hash64 ( wid , lastWid ) ;
// debug
//logf(LOG_DEBUG,
// "adding pid=%li score=%.02f sum=%.02f",
// (long)pid,phrScore,sum);
// now add that
2013-09-16 08:10:53 +04:00
if ( ! table . addKey ( ( long ) pid , ( long ) phrScore , NULL ) )
2013-08-03 00:12:24 +04:00
return - 1.0 ;
// we are now the last wid
lastWid = wid ;
lastScore = score ;
}
// sanity check. it can't grow cuz we keep lastWids[] 1-1 with it
if ( table . getNumSlots ( ) ! = 1024 ) {
log ( LOG_LOGIC , " query: Title has logic bug. " ) ;
return - 1.0 ;
}
// reset score sum to get "percent contained" functionality back
//sum = 0.0;
// accumulate scores of words that are found
float found = 0.0 ;
// reset
lastWid = - 1LL ;
// loop over all words in "w1" and hash them
for ( long i = t0 ; i < t1 ; i + + ) {
// the word id
long long wid = ( long ) w2 - > m_wordIds [ i ] ;
// skip if not indexable
if ( wid = = 0 ) continue ;
// or score is 0
//if ( ss && ss[i] <= 0 ) continue;
// . make this a float. it ranges from 0.0 to 1.0
// . 1.0 means the word occurs in 100% of documents sampled
// . 0.0 means it occurs in none of them
// . but "val" is the complement of those two statements!
float score = 1.0 - pops2 . getNormalizedPop ( i ) ;
// accumulate
sum + = score ;
// is it in table?
long slot = table . getSlot ( ( long ) wid ) ;
// . if in table, add that up to "found"
// . we essentially find his wid AND our wid, so 2.0 times
if ( slot > = 0 ) found + = 2.0 * score ;
// use percent contained functionality now
//if ( slot >= 0 ) found += score;
// debug
//logf(LOG_DEBUG,"checking wid=%li score=%.02f sum=%.02f "
// "found=%.02f slot=%li", (long)wid,score,sum,found,slot);
// now the phrase
if ( lastWid = = - 1LL ) { lastWid = wid ; lastScore = score ; continue ; }
// . what was his val?
// . the "val" of the phrase:
float phrScore = score + lastScore ;
// do not count as much as single words
phrScore * = 0.5 ;
// accumulate
sum + = phrScore ;
// get the phrase id
long long pid = hash64 ( wid , lastWid ) ;
// is it in table?
slot = table . getSlot ( ( long ) pid ) ;
// . accumulate if in there
// . we essentially find his wid AND our wid, so 2.0 times
if ( slot > = 0 ) found + = 2.0 * phrScore ;
// use percent contained functionality now
//if ( slot >= 0 ) found += score;
// we are now the last wid
lastWid = wid ;
lastScore = score ;
// debug
//logf(LOG_DEBUG,
// "checking pid=%li score=%.02f sum=%.02f found=%.02f "
// "slot=%li",
// (long)pid,phrScore,sum,found,slot);
}
// do not divide by zero
if ( sum = = 0.0 ) return 0.0 ;
// sanity check
//if ( found > sum ) { char *xx=NULL;*xx=0; }
if ( found < 0.0 | | sum < 0.0 ) { char * xx = NULL ; * xx = 0 ; }
// . return the percentage matched
// . will range from 0.0 to 1.0
return found / sum ;
}
// . copy just words in [t0,t1) from "w" into m_title
// . strips a trailing run of ':', ' ', '-' and '|' chars, includes a
//   leading quote char if one immediately precedes word #t0, escapes
//   '<'/'>' as html entities, and truncates with "..." at the last
//   punctuation boundary if the text does not fit
// . "pos" and "sections" are not referenced in this routine
// . returns false on error and sets g_errno
bool Title::copyTitle ( Words *w , Pos *pos ,
			long t0 , long t1 ,
			Sections *sections ) {
	// skip initial punct
	//long long *wids = w->m_wordIds;
	//nodeid_t *tids = w->m_tagIds;
	// raw word ptrs/lens from the Words class
	char **wp    = w->m_words;
	long  *wlens = w->m_wordLens;
	long   nw    = w->m_numWords;
	// sanity check
	if ( t1 < t0 ) { char *xx=NULL;*xx=0; }
	// don't breech number of words
	if ( t1 > nw ) t1 = nw;
	// no title? reset any old one and call it success
	if ( nw == 0 || t0 == t1 ) { reset(); return true; }
	// one byte past the end of the last word in the title
	char *end = wp[t1-1] + wlens[t1-1];
	// allocate title
	long need = end - wp[t0];
	// . max bytes we'll need
	// . no, all "chars" could be encoded so they take up like 5 bytes each
	//long max = m_maxTitleChars;
	// truncate the bytes to allocate if we can, based on m_maxTitleChars
	//if ( need > max ) need = max;
	// add 3 bytes for "..." and 1 for \0
	need += 5;
	// assume we can use our local buf
	m_title = m_localBuf;
	// if it is too small, then we must allocate
	if ( need >= TITLE_LOCAL_SIZE ) {
		m_title = (char *)mmalloc ( need , "Title" );
		m_titleAllocSize = need;
	}
	// return false if could not alloc mem to hold the title
	if ( ! m_title ) {
		m_titleBytes = 0;
		log("query: Could not alloc %li bytes for title.",need);
		return false;
	}
	// save for freeing later (also set when using the local buf, so
	// this assignment makes the one in the if-block above redundant)
	m_titleAllocSize = need;
	// point to the title to transcribe
	char *src    = wp[t0];
	char *srcEnd = end;
	// include a \" or \' that immediately precedes the first word
	if ( t0 > 0 &&
	     ( src[-1] == '\'' || src[-1] == '\"' ) )
		src--;
	// and remove terminating | or : (also '-' and spaces)
	for ( ;
	      srcEnd > src &&
		      ( srcEnd[-1] == ':' ||
			srcEnd[-1] == ' ' ||
			srcEnd[-1] == '-' ||
			srcEnd[-1] == '|' ) ;
	      srcEnd-- );
	// store in here
	char *dst = m_title;
	// leave room for "...\0"
	char *dstEnd = m_title + m_titleAllocSize - 4;
	// size of character in bytes, usually 1
	char cs;
	// point to last punct char copied, for clean truncation below
	char *lastp = NULL;
	// convert them always for now
	bool convertHtmlEntities = true;
	// copy [src,srcEnd) into "dst", utf8-character at a time
	for ( ; src < srcEnd ; src += cs , dst += cs ) {
		// get src size
		cs = getUtf8CharSize ( src );
		// break if we are full!
		if ( dst + cs >= dstEnd ) break;
		// remember last punct for cutting purposes
		if ( ! is_alnum_utf8 ( src ) ) lastp = dst;
		// encode it as an html entity if asked to
		if ( *src == '<' && convertHtmlEntities ) {
			if ( dst + 4 >= dstEnd ) break;
			memcpy ( dst , "&lt;" , 4 );
			// net advance is 4 after the loop's own "dst += cs"
			dst += 4 - cs;
			continue;
		}
		// encode it as an html entity if asked to
		if ( *src == '>' && convertHtmlEntities ) {
			if ( dst + 4 >= dstEnd ) break;
			memcpy ( dst , "&gt;" , 4 );
			dst += 4 - cs;
			continue;
		}
		// if more than 1 byte in char, use memcpy
		if ( cs == 1 ) *dst = *src;
		else           memcpy ( dst , src , cs );
	}
	// null term always
	*dst = '\0';
	// . do not split a word in the middle!
	// . if we broke out early, back up to the last punct char we
	//   copied (if any) and overwrite from there with "..."
	if ( src < srcEnd ) {
		if ( lastp ) {
			memcpy ( lastp , "...\0" , 4 );
			dst = lastp + 3;
		}
		else {
			memcpy ( dst , "...\0" , 4 );
			dst += 3;
		}
	}
	// set size. does not include the terminating \0
	m_titleBytes = dst - m_title;
	return true;
}