// http://www.propeciauk.co.uk/links.htm // http://www.hendersonvillehomepro.com/FavoriteLinks/Default.aspx // http://www.viacreme-viacream-viagra.com/health/pharmacies.htm // are the same description for viagrapunch.com. why did they not cancel? #include "linkspam.h" #include "Url.h" #include "Linkdb.h" //#include "TitleRec.h" #include "Unicode.h" #include "matches2.h" #include "Categories.h" bool isLinkChain ( Xml *xml , Url *linker , Url *linkee , long linkNode , char **note ) ; // . here's some additional things to mark it as a log page, but these // depend on the content of the page, not the url itself. // . fields: string, stringLen, id, section? // . section is "1" if the substring identifies the start of a comment // section, so that any links above that identifier should be // consider good, and any below, should be considered bad links. // Otherwise, if section is 0, if the match occurs anywhere on the // page then all links on the page should be considered bad. static Needle s_needles1[] = { {"open.thumbshots.org" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , //{"google-ad" , 0 , 0 , 0 , 0 , NULL } , // indicates search results page // this often directly precedes the comment section {"[trackback" , 0 , 1 , 1 , 0 , NULL , 0 , NULL } , {"class=\"comtext" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"class=\"comment" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"class=\"coment" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"class=\"trackback" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"class=\"ping" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"class=\"followup" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"class=\"response" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , // this can signify a blog entry, not just a comment //{"class=\"entry" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , // these seem to be more indicative of posted comments {"class=\"posted" , 0 , 8 , 1 , 0 , NULL , 0 }, {"id=\"posted" , 0 , 8 , 1 , 0 , NULL , 0 }, {"name=\"posted" , 0 , 8 , 1 , 0 , NULL , 0 }, // annoying little textbox thingy {"class=\"shoutbox" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"id=\"comment" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"id=\"coment" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"id=\"trackback" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"id=\"ping" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"id=\"followup" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"id=\"response" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"name=\"comment" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"name=\"coment" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"name=\"trackback" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"name=\"ping" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"name=\"followup" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"name=\"response" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , // a lot of the comment boards can be identified because // they have a bunch of mailto links, one before each comment //{"href=\"mailto" , 0 , 8 , 1 , 0 , NULL , 0 }, //{"href=mailto" , 0 , 8 , 1 , 0 , NULL , 0 }, // wikipedias {"div class=\"editsection" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"action=edit" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , // message boards {"anonymous user" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"anonymer user" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"date posted" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"post your notice" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"edit this page" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , // edit
{"editeditpost a reply" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"post reply" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"submit post" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">post message" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">post a comment" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">leave a comment" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">post comments" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , // Comments (0) after each posted entry... //{">comments<" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"comments: <" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"comments:<" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , //{"comment:" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"reacties:" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"comentarios:" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"comentários:" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {">message:" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">mensagem:" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">faca seu comentario" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">faça seu comentário" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , // comment add in german {">Kommentar hinzuf" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"rate this link" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"link submit" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"links directory" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">add my comment" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , // title of the text area box {">your comment" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"your comment<" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">comment by" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {">scrivi un commento" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">scrivi il tuo commento" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"add comment" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"trackbacks for the art" , 0 , 12, 1 , 0 , NULL , 0 , NULL } , {"these trackbacks have been re", 0 , 13, 1 , 0 , NULL , 0 , NULL } , {"trackback pings" , 0 , 13, 1 , 0 , NULL , 0 , NULL } , {"read the rest of this com" , 0 , 13, 1 , 0 , NULL , 0 , NULL } , // that was the opinion of ... {"das war die meinung von" , 0 , 13, 1 , 0 , NULL , 0 , NULL } , {"resource partner" , 0 , 49, 0 , 0 , NULL , 0 , NULL } , {"partner link" , 0 , 50, 0 , 0 , NULL , 0 , NULL } , {"partner site" , 0 , 51, 0 , 0 , NULL , 0 , NULL } , {"sign the guestbook" , 0 , 43, 0 , 0 , NULL , 0 , NULL } , //{"add new comment" , 0 , 14, 0 , 0 , NULL , 0 }, //{"add message" , 0 , 14, 0 , 0 , NULL , 0 }, // tagboard software allows free submits. it has this in // an html comment tag... {"2002 natali ardianto" , 0 , 14, 0 , 0 , NULL , 0 , NULL } , // guestbooks {"guestbooksponsors<" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {">sponsor<" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {">sponsored<" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {">submit site<" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {": sponsor" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"/sponsor/" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"*sponsors*" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {">payperpost" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"sponsored post" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"sponsored flag" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"sponsoredflag" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"sponsored listing" , 0 , 48, 1 , 0 , NULL , 0 , NULL } , {"sponsored link" , 0 , 48, 1 , 0 , NULL , 0 , NULL } , {"post is sponsor" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"paid post" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"powered by" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , // wordpress {"suggest your website" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"advertisement:" , 0 , 48, 1 , 0 , NULL , 0 , NULL } }; // now check outlinks on the page for these substrings static Needle s_needles2[] = { {"cyber-robotics.com" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"cyberspacehq.com" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"links4trade.com" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"searchfeed.com" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"marketnex.com" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"partnersignup" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"publisher-network" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , //{"amazon.com" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , //{"dmoz.org" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , //{"dmoz.com" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"linksmanager" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"changinglinks" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } }; // . we set the bit in linkdb for a doc if this returns true // . it precludes a doc from voting if its bits is set in linkdb // . this saves resources // . the isLinkSpam() function is used when we have the linkee url // . note is only set if the whole doc can not vote for some reason // . otherwise, each outlink in "links" is assigned a "note" to indicate if // the outlink is a spam link or not // . returns true on success, false on error bool setLinkSpam ( long ip , long *indCatIds , long numIndCatIds , Url *linker , long siteNumInlinks , Xml *xml , Links *links , bool isContentTruncated , long niceness ) { // get our url //Url *linker = tr->getUrl(); // it is critical to get inlinks from all pingserver xml // pages regardless if they are often large pages. we // have to manually hard-code the ping servers in for now. if ( linker->isPingServer() ) return false; // if the doc got truncated we may be missing valuable identifiers // that identify the doc as a guestbook or something if ( isContentTruncated ) return links->setAllSpamBits("doc too big"); // get linker quality //long q = tr->getDocQuality(); // do not allow .info or .biz to vote ever for now char *tld = linker->getTLD(); long tldLen = linker->getTLDLen(); if ( tldLen == 4 && strncmp ( tld, "info" , tldLen) == 0 && //q < 55 ) siteNumInlinks < 20 ) return links->setAllSpamBits("low quality .info linker"); if ( tldLen == 3 && strncmp ( tld, "biz" , tldLen) == 0 && //q < 55 ) siteNumInlinks < 20 ) return links->setAllSpamBits("low quality .biz linker"); // if has an outlink to dmoz-identified porn, all outlinks are spam long *ids = indCatIds;//NULL long nids = numIndCatIds;//0; //if ( tr ) ids = tr->getIndCatids(); //if ( tr ) nids = tr->getNumIndCatids(); for ( long j = 0 ; j < nids ; j++ ) if ( g_categories && g_categories->isIdAdult ( ids[j] ) ) return links->setAllSpamBits("dmoz porn"); QUICKPOLL( niceness ); // do we contain a dmoz subpath in our url? that would indicate that // we are probably a dmoz mirror! char *zstart = linker->getPath(); long zlen = linker->getPathLen(); char *zend = zstart + zlen; // start at the end of the path char *z = zend-1; // back up to previous / for ( ; z > zstart && *z != '/' ; z-- ); // make that the new end zend = z + 1; // need at least 2 path components before checking... keep count long zcount = 2; // begin the loop while ( z > zstart ) { // . backup until we hit the previous / for ( z-- ; z > zstart && *z != '/' ; z-- ); // debug //char tmp[2000]; //memcpy(tmp,z,zend-z); //tmp[zend-z]=0; //log("build: path=%s",tmp); // look it up // "/Arts/" --> 1 if ( --zcount > 0 ) continue; if ( g_categories->getIndexFromPath (z, zend-z) < 0 ) continue; // consider ourselves a dmoz mirror and discount all outlinks return links->setAllSpamBits("dmoz subpath in url"); } QUICKPOLL( niceness ); // guestbook in hostname - domain? char *hd = linker->getHost(); char *hd2 = linker->getDomain(); long hdlen = hd2 - hd; if ( hd && hd2 && hdlen < 30 ) { char c = hd[hdlen]; hd[hdlen] = '\0'; bool hasIt = false; if ( strstr ( hd , "guestbook" ) ) hasIt = true; hd[hdlen] = c; if ( hasIt ) return links->setAllSpamBits("guestbook in hostname"); } // do not allow any cgi url to vote if ( linker->isCgi() ) return links->setAllSpamBits("path is cgi"); long plen = linker->getPathLen(); // if the page has just one rel=nofollow tag then we know they // are not a guestbook //if ( links->hasRelNoFollow() ) plen = 0; if ( plen > 1 ) { char *p = linker->getPath(); //char c = p[plen-1]; //p[plen-1] = '\0'; //bool val = false; char *note = NULL; if ( strncasestr ( p , "guest",plen,5) ) note = "path has guest" ; else if ( strncasestr ( p , "cgi",plen,3) ) note = "path has cgi" ; else if ( strncasestr ( p , "gast",plen,4) ) note = "path has gast" ; // german else if ( strncasestr ( p , "gaest",plen,5) ) note = "path has gaest" ; else if ( strncasestr ( p , "gbook",plen,5) ) note = "path has gbook" ; // vietnamese? else if ( strncasestr ( p , "akobook",plen,7) ) note = "path has akobook" ; else if ( strncasestr ( p , "/gb",plen,3) ) note = "path has /gb" ; else if ( strncasestr ( p , "msg",plen,3 ) ) note = "path has msg" ; else if ( strncasestr ( p , "messag",plen,6) ) note = "path has messag" ; else if ( strncasestr ( p , "board",plen,5) ) note = "path has board" ; else if ( strncasestr ( p , "coment",plen,6) ) note = "path has coment" ; else if ( strncasestr ( p , "comment",plen,7) ) note = "path has comment" ; else if ( strncasestr ( p , "linktrader",plen,10) ) note = "path has linktrader" ; else if ( strncasestr ( p , "tradelinks",plen,10) ) note = "path has tradelinks" ; else if ( strncasestr ( p , "trade-links",plen,11) ) note = "path has trade-links" ; else if ( strncasestr ( p , "linkexchange",plen,12) ) note = "path has linkexchange" ; else if ( strncasestr ( p , "link-exchange",plen,13 ) ) note = "path has link-exchange" ; else if ( strncasestr ( p , "reciprocal-link",plen,15) ) note = "path has reciprocal-link"; else if ( strncasestr ( p , "reciprocallink",plen, 14) ) note = "path has reciprocallink" ; else if ( strncasestr ( p , "/trackbacks/",plen,12 ) ) note = "path has /trackbacks/" ; //if ( gb_strcasestr ( p , "archive" ) ) val = true; //if ( gb_strcasestr ( p , ".asp" ) ) val = true; //if ( gb_strcasestr ( p , ".aspx" ) ) val = true; // these are mostly link exchange pages. no they are not! //if ( gb_strcasestr ( p , "link" ) ) val = true; //p[plen-1] = c; //if ( val ) { note = "cgi or guestbook url"; return true; } if ( note ) return links->setAllSpamBits(note); } QUICKPOLL( niceness ); // does title contain "web statistics for"? long tlen ; char *title = xml->getString ( "title" , &tlen ); if ( title && tlen > 0 ) { // normalize title into buffer, remove non alnum chars char buf[256]; char *d = buf; char *dend = buf + 250; char *s = title; char *send = title + tlen; while ( d < dend && s < send ) { // remove punct if ( ! is_alnum_a(*s) ) { s++; continue; } *d = to_lower_a ( *s ); d++; s++; } *d = '\0'; // see if it matches some catch phrases bool val = false; if ( strstr (buf,"webstatisticsfor" )) val = true; if ( strstr (buf,"webserverstatisticsfor")) val = true; else if ( strstr (buf,"usagestatisticsfor" )) val = true; else if ( strstr (buf,"siteusageby" )) val = true; else if ( strstr (buf,"surfstatsloganal" )) val = true; else if ( strstr (buf,"webstarterhelpstats" )) val = true; else if ( strstr (buf,"sitestatistics" )) val = true; if ( val ) return links->setAllSpamBits("stats page"); } QUICKPOLL( niceness ); ///////////////////////////////////////////////////// // // check content for certain keywords and phrases // ///////////////////////////////////////////////////// //char *haystack = tr->getContent(); //long haystackSize = tr->getContentLen(); char *haystack = xml->getContent(); long haystackSize = xml->getContentLen(); // get our page quality, it serves as a threshold for some algos //char quality = tr->getNewQuality(); //char *linkPos = NULL; //if ( linkNode >= 0 ) linkPos = xml->getNode ( linkNode ); //if ( strstr ( linker->getUrl() , "usa_apartments1.htm") ) { // log("hey"); // sleep(7); //} // loop: // do not call them "bad links" if our link occurs before any // comment section. our link's position therefore needs to be known, // that is why we pass in linkPos. // "n" is the number it matches. long numNeedles1 = sizeof(s_needles1)/sizeof(Needle); bool hadPreMatch; getMatches2 ( s_needles1 , numNeedles1 , haystack , haystackSize , NULL , // linkPos , NULL , // &n , false , // stopAtFirstMatch &hadPreMatch , true , // save quicktables niceness ); QUICKPOLL( niceness ); // see if we got a hit char *minPtr = NULL; char *note = NULL; for ( long i = 0 ; i < numNeedles1 ; i++ ) { // open.thumbshots.org needs multiple counts if ( i == 0 && s_needles1[i].m_count < 5 ) continue; // skip if no matches on this string if ( s_needles1[i].m_count <= 0 ) continue; // ok, if it had its section bit set to 0 that means the // whole page is link spam! if ( s_needles1[i].m_isSection == 0 ) return links->setAllSpamBits(s_needles1[i].m_string ); // get the char ptr char *ptr = s_needles1[i].m_firstMatch; // set to the min if ( ! minPtr || ptr < minPtr ) { note = s_needles1[i].m_string; minPtr = ptr; } } QUICKPOLL( niceness ); // convert the char ptr into a link node following it long aa = 0; if ( minPtr ) aa = links->getNumLinks(); long mini = -1; for ( long i = 0 ; i < aa ; i++ ) { // get the link's char ptr into the content long linkNode = links->getNodeNum(i); char *linkPos = NULL; if ( linkNode >= 0 ) linkPos = xml->getNode ( linkNode ); // now we can compare, if BEFORE this comment section // indicating tag, we are NOT link spam, so continue if ( linkPos < minPtr ) continue; // otherwise, we are the first, stop. mini = i; break; } QUICKPOLL( niceness ); // now count all the links BELOW this match as link spam // but everyone else is ok! if ( minPtr && mini >= 0 ) links->setSpamBits ( note , mini ); // now check outlinks on the page for these substrings haystack = links->getLinkBuf(); haystackSize = links->getLinkBufLen(); long numNeedles2 = sizeof(s_needles2)/sizeof(Needle); getMatches2 ( s_needles2 , numNeedles2 , haystack , haystackSize , NULL , // linkPos, NULL , // &n , false , // stopAtFirstMatch? NULL , true , // save quicktables niceness ); QUICKPOLL( niceness ); // see if we got a hit for ( long i = 0 ; i < numNeedles2 ; i++ ) { // skip if did not match if ( s_needles2[i].m_count <= 0 ) continue; // the whole doc is considered link spam return links->setAllSpamBits(s_needles2[i].m_string); } QUICKPOLL( niceness ); //skiplinks: // check for certain post tag, indicative of a comment-friendly blog //
// // long nn = xml->getNumNodes(); bool gotTextArea = false; bool gotSubmit = false; for ( long i=0; i < nn ; i++ ) { //