#include "Msgaa.h" #include "Tagdb.h" #include "Msg40.h" // test sites: // blogs.ubc.ca/wetsocks/feed/ // my.donews.com/comboatme/2008/12/17/t-think-they-could-be-hydroxycut-hepatic-failure-upon/ // blogs.kaixo.com/maildepa/2008/12/23/by-thom-patterson-cnn-the-adorable-shemale-wave-of-millions-of-early/ // blog.seamolec.org/tim980/2008/12/30/purchase-thief-of-hearts-online/ // www.homeschoolblogger.com/sagerats/623595/ // blogs.x7web.com/lamonblog2427/ // blogs.msdn.com/predos_spot/ // // POSSIBLE IMPROVEMENTS // // check usernames to be compound that when split the right way contain // a common name, like "dave", "pedro" or "williams". and possibly discard // subsites whose path contains categories or other names in the dictionary. // // POTENTIAL ARTICLE CONTENT IDENTIFCATION ALGORITHM // // identify all tag hashes (includes hashes of all parents) on the page // which preceed unique content, only found on that page. minimize the list // of all such tag hashes, and store into tagdb for that SITE. also store // for that domain and hostname if those tag recs have less than 10-20 such // tag hashes already. (tag hash hashes all the alpha chars in the tag // attributes as well. see DateParse.cpp ALGORITHM description) // PROBLEM: duplicated pages, or printer-friendly pages! if pages is very // similar content (not tags) to another (test using vectors) then toss it!! // keep a count for each tag as how many pages think it is a repeated vs. // unique content. static void gotResultsAAWrapper ( Msg40 *msg40 , void *state ) ; Msgaa::Msgaa ( ) { m_msg40 = NULL; } Msgaa::~Msgaa ( ) { if ( m_msg40 ) { // free him, we sent his reply mdelete ( m_msg40 , sizeof(Msg40),"msgaa"); delete ( m_msg40 ); } m_msg40 = NULL; } #define SAMPLESIZE 100 // . also sets m_sitePathDepth to what it should be // . -1 indicates unknown (not enough data, etc.) or host/domain is the site // . returns false if blocked, true otherwise // . returns true and sets g_errno on error bool Msgaa::addSitePathDepth ( TagRec *gr , Url *url , char *coll , void *state , void (* callback)(void *state) ) { // save it m_gr = gr; m_url = url; m_coll = coll; m_state = state; m_callback = callback; // set this to unknown for now m_sitePathDepth = -1; m_oldSitePathDepth = -1; // reset this just in case g_errno = 0; // skip this for now! return true; CollectionRec *cr = g_collectiondb.getRec ( coll ); if ( ! cr ) return true; if ( ! cr->m_subsiteDetectionEnabled ) return true; // check the current tag for an age Tag *tag = gr->getTag("sitepathdepth"); // if there and the age is young, skip it long age = -1; long now = getTimeGlobal(); if ( tag ) age = now - tag->m_timestamp; // if there, at least get it if ( tag ) m_oldSitePathDepth = (long)tag->m_data[0]; // if older than 30 days, we need to redo it if ( age > 30*24*60*60 ) age = -1; // if age is valid, skip it if ( age >= 0 ) { // just use what we had, it is not expired m_sitePathDepth = (long)tag->m_data[0]; // all done, we did not block return true; } // right now we only run on host #0 so we do not flood the cluster // with queries... if ( g_hostdb.m_hostId != 0 ) return true; HASH all these things together into 1 termlist!!! gbsitesample: term. everything else is constant. use msg0 then, not msg40 rename msgaa to Site. and use Site::set(Url *u) to do all this logic. or Site::set(TitleRec *tr) to set it exactly from title rec // make a new Msg40 to get search results with try { m_msg40 = new (Msg40); } catch ( ... ) { g_errno = ENOMEM; log("msgaa: new(%i): %s", sizeof(Msg40), mstrerror(g_errno)); return true; } mnew ( m_msg40 , sizeof(Msg40) , "Msgaa" ); // initial path depth m_pathDepth = 1; // see how many urls are non-cgi with a pathdepth of 1 char *p = m_qbuf; strcpy ( p , "site:" ); p += 5; memcpy ( p , m_url->getHost() , m_url->getHostLen() ); p += m_url->getHostLen(); // sort them by the random score term, gbrandscore (see XmlDoc.cpp) p += sprintf (p , " gbpathdepth:%li" " gbiscgi:0" " gbhasfilename:0" // www.xyz.com/viacom-extends-brad-greys-contract/ not! " gbpermalink:0 " "| gbrandscore:1", m_pathDepth); // set our SearchInput m_si.reset(); // a sample of 100 results should be good! m_si.m_docsWanted = SAMPLESIZE; m_si.m_requireAllTerms = true; m_si.m_firstResultNum = 0; m_si.m_coll = m_coll; m_si.m_doDupContentRemoval = false; m_si.m_doSiteClustering = false; m_si.m_docsToScanForReranking = 0; // the query m_si.m_query = m_qbuf; m_si.m_queryLen = gbstrlen(m_qbuf); // sanity check if ( m_si.m_queryLen + 1 > MAX_QBUF_SIZE ) {char*xx=NULL;*xx=0; } // do not generate titles or summaries to save time, we just need // the url to check for bushiness. Msg20 if "getSummary" is false // will already not set the title... m_si.m_numLinesInSummary = 0; // perform the Msg40 query if ( ! m_msg40->getResults ( &m_si , false , // forward? this , // state gotResultsAAWrapper ) ) // return false if that blocks return false; // did not block return gotResults(); } void gotResultsAAWrapper ( Msg40 *msg40 , void *state ) { Msgaa *THIS = (Msgaa *)state; THIS->gotResults(); } // . look at the urls we got in the results // . they should be of the for abc.xyz.com/yyyyyy/ // . returns true and sets g_errno on error bool Msgaa::gotResults ( ) { // loop over each one long n = m_msg40->m_numMsg20s; // we need at least half of requested to make a good estimation if ( n <= SAMPLESIZE/2 ) return true; // make a hashtable HashTable ht; // get the url for ( long i = 0 ; i < n ; i++ ) { // get the ith result Msg20Reply *r = m_msg40->m_msg20[i]->m_r; // get the url string char *us = r->ptr_ubuf; long uslen = r->size_ubuf - 1; Url u; u.set ( us , uslen ); // get path component # m_pathDepth long clen; char *c = u.getPathComponent ( i - 1 , &clen ); // must be there if ( ! c || clen <= 0 ) { log("msgaa: c is empty"); continue; } // now hash it long h = hash32 ( c , clen ); // count in table long slot = ht.getSlot ( h ); // how many times has this occurred in a result's url? long count = 0; // inc if there if ( slot >= 0 ) count = ht.getValueFromSlot ( slot ); // inc it count++; // put it back if ( slot >= 0 ) ht.setValue ( slot , count ); // otherwise, add it new else if ( ! ht.addKey ( h , count ) ) return true; } // now scan the hash table and see how many unique path components long unique = 0; long ns = ht.getNumSlots(); for ( long i = 0 ;i < ns ; i++ ) { // is empty long val = ht.getValueFromSlot ( i ); // count if non-empty if ( val > 0 ) unique++; } // i'd say 50% of the SAMPLESIZE+ search results are required to have // unique path components in order for us to consider this to be a // subsite with a path depth of m_pathDepth. long required = n / 2; if ( unique < required ) { // ok, do not set m_sitePathDepth, leave it -1 log("msgaa: only have %li unique path components at path " "depth %li out of %li results. %s does not have subsites.", unique,m_pathDepth,n,m_url->getUrl()); return true; } // i guess we got it log("msgaa: have %li unique path components at path " "depth %li out of %li results. Enough to declare this as a " "subsite for %s .",unique,m_pathDepth,n,m_url->getUrl()); // ok set it m_sitePathDepth = m_pathDepth; // we are done return true; }