// open-source-search-engine/Msgaa.cpp

#include "Msgaa.h"
#include "Tagdb.h"
#include "Msg40.h"
// test sites:
// blogs.ubc.ca/wetsocks/feed/
// my.donews.com/comboatme/2008/12/17/t-think-they-could-be-hydroxycut-hepatic-failure-upon/
// blogs.kaixo.com/maildepa/2008/12/23/by-thom-patterson-cnn-the-adorable-shemale-wave-of-millions-of-early/
// blog.seamolec.org/tim980/2008/12/30/purchase-thief-of-hearts-online/
// www.homeschoolblogger.com/sagerats/623595/
// blogs.x7web.com/lamonblog2427/
// blogs.msdn.com/predos_spot/
//
// POSSIBLE IMPROVEMENTS
//
// check whether usernames are compounds that, when split the right way,
// contain a common name like "dave", "pedro" or "williams" (see the
// sketch below). possibly also discard subsites whose path contains
// categories or other names found in the dictionary.
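// a minimal sketch of the compound-username check above, using a tiny
// hardcoded dictionary; a real implementation would consult a proper
// name dictionary. hypothetical helper, not called anywhere yet.
static bool containsCommonName ( char *user , long userLen ) {
        // common names to probe for (illustrative only)
        static const char *s_names[] = { "dave", "pedro", "williams", NULL };
        for ( long i = 0 ; s_names[i] ; i++ ) {
                long nlen = (long)strlen ( s_names[i] );
                // try every offset in the username for a case-insensitive hit
                for ( long j = 0 ; j + nlen <= userLen ; j++ )
                        if ( strncasecmp ( user+j, s_names[i], nlen ) == 0 )
                                return true;
        }
        return false;
}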
//
// POTENTIAL ARTICLE CONTENT IDENTIFICATION ALGORITHM
//
// identify all tag hashes (includes hashes of all parents) on the page
// which precede unique content, found only on that page. minimize the list
// of all such tag hashes, and store into tagdb for that SITE. also store
// for that domain and hostname if those tag recs have fewer than 10-20 such
// tag hashes already. (a tag hash hashes all the alpha chars in the tag
// attributes as well; see the DateParse.cpp ALGORITHM description and the
// sketch below.)
// PROBLEM: duplicated pages, or printer-friendly pages! if a page's
// content (not its tags) is very similar to another's (test using vectors)
// then toss it!! keep a count for each tag of how many pages consider the
// content following it repeated vs. unique.
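// a minimal sketch of the tag-hash idea above, assuming the tag's raw text
// (name plus attributes) sits in a buffer; only alphabetic chars get
// hashed, so numeric attribute values do not perturb the hash. it reuses
// hash32() from below. hypothetical helper, not wired in anywhere.
static long hashTagAlpha ( char *tag , long tagLen ) {
        // keep only the alpha chars, then hash32() the result
        char tmp[256];
        long n = 0;
        for ( long i = 0 ; i < tagLen && n < 256 ; i++ ) {
                char ch = tag[i];
                if ( (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') )
                        tmp[n++] = ch;
        }
        return hash32 ( tmp , n );
}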
static void gotResultsAAWrapper ( Msg40 *msg40 , void *state ) ;
Msgaa::Msgaa ( ) {
        m_msg40 = NULL;
}
Msgaa::~Msgaa ( ) {
        if ( m_msg40 ) {
                // free him, we sent his reply
                mdelete ( m_msg40 , sizeof(Msg40) , "msgaa" );
                delete ( m_msg40 );
        }
        m_msg40 = NULL;
}
#define SAMPLESIZE 100
// . also sets m_sitePathDepth to what it should be
// . -1 indicates unknown (not enough data, etc.) or that the host/domain
//   is the site
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
// . see the usage sketch after this function for the calling convention
bool Msgaa::addSitePathDepth ( TagRec *gr ,
                               Url *url ,
                               char *coll ,
                               void *state ,
                               void (* callback)(void *state) ) {
        // save it
        m_gr       = gr;
        m_url      = url;
        m_coll     = coll;
        m_state    = state;
        m_callback = callback;
        // set this to unknown for now
        m_sitePathDepth    = -1;
        m_oldSitePathDepth = -1;
        // reset this just in case
        g_errno = 0;
        // skip this for now!
        return true;
        CollectionRec *cr = g_collectiondb.getRec ( coll );
        if ( ! cr ) return true;
        if ( ! cr->m_subsiteDetectionEnabled ) return true;
        // check the current tag for an age
        Tag *tag = gr->getTag("sitepathdepth");
        // if there and the age is young, skip it
        long age = -1;
        long now = getTimeGlobal();
        if ( tag ) age = now - tag->m_timestamp;
        // if there, at least get it
        if ( tag ) m_oldSitePathDepth = (long)tag->m_data[0];
        // if older than 30 days, we need to redo it
        if ( age > 30*24*60*60 ) age = -1;
        // if age is valid, skip it
        if ( age >= 0 ) {
                // just use what we had, it is not expired
                m_sitePathDepth = (long)tag->m_data[0];
                // all done, we did not block
                return true;
        }
        // right now we only run on host #0 so we do not flood the cluster
        // with queries...
        if ( g_hostdb.m_hostId != 0 ) return true;
        // TODO: HASH all these things together into one termlist, i.e. a
        // gbsitesample:<pathdepth> term; everything else is constant, so
        // use Msg0 then, not Msg40. also: rename Msgaa to Site and use
        // Site::set(Url *u) to do all this logic, or Site::set(TitleRec *tr)
        // to set it exactly from the title rec.
        // make a new Msg40 to get search results with
        try { m_msg40 = new (Msg40); }
        catch ( ... ) {
                g_errno = ENOMEM;
                log("msgaa: new(%i): %s",
                    (int)sizeof(Msg40), mstrerror(g_errno));
                return true;
        }
        mnew ( m_msg40 , sizeof(Msg40) , "Msgaa" );
        // initial path depth
        m_pathDepth = 1;
        // see how many urls are non-cgi with a path depth of 1
        char *p = m_qbuf;
        strcpy ( p , "site:" );
        p += 5;
        memcpy ( p , m_url->getHost() , m_url->getHostLen() );
        p += m_url->getHostLen();
        // sort them by the random score term, gbrandscore (see XmlDoc.cpp)
        p += sprintf ( p ,
                       " gbpathdepth:%li"
                       " gbiscgi:0"
                       " gbhasfilename:0"
                       // www.xyz.com/viacom-extends-brad-greys-contract/ not!
                       " gbpermalink:0 "
                       "| gbrandscore:1" ,
                       m_pathDepth );
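        // for example, with m_url of "http://blogs.ubc.ca/wetsocks/feed/"
        // and m_pathDepth of 1, m_qbuf ends up holding:
        //   site:blogs.ubc.ca gbpathdepth:1 gbiscgi:0 gbhasfilename:0
        //   gbpermalink:0 | gbrandscore:1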
        // set our SearchInput
        m_si.reset();
        // a sample of 100 results should be good!
        m_si.m_docsWanted             = SAMPLESIZE;
        m_si.m_requireAllTerms        = true;
        m_si.m_firstResultNum         = 0;
        m_si.m_coll                   = m_coll;
        m_si.m_doDupContentRemoval    = false;
        m_si.m_doSiteClustering      = false;
        m_si.m_docsToScanForReranking = 0;
        // the query
        m_si.m_query    = m_qbuf;
        m_si.m_queryLen = gbstrlen(m_qbuf);
        // sanity check
        if ( m_si.m_queryLen + 1 > MAX_QBUF_SIZE ) { char *xx=NULL; *xx=0; }
        // do not generate titles or summaries, to save time; we just need
        // the url to check for bushiness. Msg20 will already not set the
        // title if "getSummary" is false...
        m_si.m_numLinesInSummary = 0;
        // perform the Msg40 query
        if ( ! m_msg40->getResults ( &m_si ,
                                     false , // forward?
                                     this  , // state
                                     gotResultsAAWrapper ) )
                // return false if that blocks
                return false;
        // did not block
        return gotResults();
}
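// a minimal usage sketch of the blocking convention documented above
// addSitePathDepth(). the caller, gr, url, coll and myWrapper here are
// hypothetical, so this stays in a comment block rather than compiled code:
/*
        Msgaa *ma = new Msgaa();
        if ( ! ma->addSitePathDepth ( gr, &url, coll, ma, myWrapper ) )
                return false; // blocked; myWrapper(state) is called later
        // did not block: check g_errno, then read the result directly
        if ( g_errno ) log("msgaa: %s", mstrerror(g_errno));
        long depth = ma->m_sitePathDepth; // -1 means host/domain is the site
*/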
void gotResultsAAWrapper ( Msg40 *msg40 , void *state ) {
        Msgaa *THIS = (Msgaa *)state;
        THIS->gotResults();
}
// . look at the urls we got in the results
// . they should be of the form abc.xyz.com/yyyyyy/
// . returns true and sets g_errno on error
bool Msgaa::gotResults ( ) {
        // how many results did we get?
        long n = m_msg40->m_numMsg20s;
        // we need more than half of the requested sample to make a good
        // estimate
        if ( n <= SAMPLESIZE/2 ) return true;
        // make a hashtable for counting path components
        HashTable ht;
        // loop over each result
        for ( long i = 0 ; i < n ; i++ ) {
                // get the ith result
                Msg20Reply *r = m_msg40->m_msg20[i]->m_r;
                // get the url string
                char *us    = r->ptr_ubuf;
                long  uslen = r->size_ubuf - 1;
                Url u; u.set ( us , uslen );
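                // e.g. for blogs.ubc.ca/wetsocks/feed/ at m_pathDepth 1
                // the component of interest is "wetsocks"; components are
                // assumed zero-based, hence the -1 below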
                // get path component # m_pathDepth
                long clen;
                char *c = u.getPathComponent ( m_pathDepth - 1 , &clen );
                // must be there
                if ( ! c || clen <= 0 ) {
                        log("msgaa: path component is empty");
                        continue;
                }
                // now hash it
                long h = hash32 ( c , clen );
                // count in table
                long slot = ht.getSlot ( h );
                // how many times has this occurred in a result's url?
                long count = 0;
                // inc if there
                if ( slot >= 0 ) count = ht.getValueFromSlot ( slot );
                // inc it
                count++;
                // put it back
                if ( slot >= 0 ) ht.setValue ( slot , count );
                // otherwise, add it new
                else if ( ! ht.addKey ( h , count ) ) return true;
        }
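        // e.g. results /dave/a.html, /dave/b.html and /pedro/c.html would
        // leave the table holding { hash("dave") -> 2 , hash("pedro") -> 1 },
        // i.e. two unique path components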
        // now scan the hash table and see how many unique path components
        // we got
        long unique = 0;
        long ns     = ht.getNumSlots();
        for ( long i = 0 ; i < ns ; i++ ) {
                // get the slot's value, 0 if empty
                long val = ht.getValueFromSlot ( i );
                // count if non-empty
                if ( val > 0 ) unique++;
        }
        // require the number of distinct path components to be at least
        // 50% of the results before we consider this to be a subsite host
        // with a path depth of m_pathDepth
        long required = n / 2;
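        // e.g. with n of 100 results, required is 50; if the sampled urls
        // yield 80 distinct components at this depth then unique is
        // 80 >= 50 and we declare subsites at depth m_pathDepth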
        if ( unique < required ) {
                // ok, do not set m_sitePathDepth, leave it at -1
                log("msgaa: only have %li unique path components at path "
                    "depth %li out of %li results. %s does not have "
                    "subsites.", unique,m_pathDepth,n,m_url->getUrl());
                return true;
        }
        // i guess we got it
        log("msgaa: have %li unique path components at path "
            "depth %li out of %li results. Enough to declare subsites at "
            "this depth for %s.", unique,m_pathDepth,n,m_url->getUrl());
        // ok, set it
        m_sitePathDepth = m_pathDepth;
        // we are done
        return true;
}