// http://www.propeciauk.co.uk/links.htm
// http://www.hendersonvillehomepro.com/FavoriteLinks/Default.aspx
// http://www.viacreme-viacream-viagra.com/health/pharmacies.htm
// are the same description for viagrapunch.com. why did they not cancel?
#include "linkspam.h"
#include "Url.h"
#include "Linkdb.h"
//#include "TitleRec.h"
#include "Unicode.h"
#include "matches2.h"
#include "Categories.h"
// . forward declaration only; the definition is not in this part of the file
// . NOTE(review): judging by the name and parameters this presumably decides
//   whether the link at node "linkNode" from "linker" to "linkee" sits in a
//   chain of links, and reports a reason through "*note" -- confirm against
//   the definition before relying on this
bool isLinkChain ( Xml *xml , Url *linker , Url *linkee , long linkNode ,
char **note ) ;
// . content-based needles: setLinkSpam() runs getMatches2() with this table
//   over the linker page's content, so these depend on the content of the
//   page, not on the url itself
// . initializer order per row: string, stringLen, id, isSection, followed by
//   four slots that are always 0/NULL here. setLinkSpam() reads m_count and
//   m_firstMatch after the search, so presumably those are among the trailing
//   slots -- confirm against the Needle declaration (presumably matches2.h)
// . every row spells out all eight initializers so no member is left to
//   implicit zero-initialization
// . isSection is 1 if the substring identifies the start of a comment
//   section: links above the earliest such match are considered good, and
//   links at or below it are considered bad
// . if isSection is 0 and the needle matches anywhere on the page, then all
//   links on the page are considered bad
// . "open.thumbshots.org" MUST stay at index 0: setLinkSpam() special-cases
//   index 0 and ignores it unless it matched at least 5 times
static Needle s_needles1[] = {
	{"open.thumbshots.org", 0, 0, 0, 0, NULL, 0, NULL},
	//{"google-ad", 0, 0, 0, 0, NULL},
	// indicates search results page
	// this often directly precedes the comment section
	{"[trackback", 0, 1, 1, 0, NULL, 0, NULL},
	{"class=\"comtext", 0, 8, 1, 0, NULL, 0, NULL},
	{"class=\"comment", 0, 8, 1, 0, NULL, 0, NULL},
	{"class=\"coment", 0, 8, 1, 0, NULL, 0, NULL},
	{"class=\"trackback", 0, 8, 1, 0, NULL, 0, NULL},
	{"class=\"ping", 0, 8, 1, 0, NULL, 0, NULL},
	{"class=\"followup", 0, 8, 1, 0, NULL, 0, NULL},
	{"class=\"response", 0, 8, 1, 0, NULL, 0, NULL},
	// this can signify a blog entry, not just a comment
	//{"class=\"entry", 0, 8, 1, 0, NULL, 0, NULL},
	// these seem to be more indicative of posted comments
	{"class=\"posted", 0, 8, 1, 0, NULL, 0, NULL},
	{"id=\"posted", 0, 8, 1, 0, NULL, 0, NULL},
	{"name=\"posted", 0, 8, 1, 0, NULL, 0, NULL},
	// annoying little textbox thingy
	{"class=\"shoutbox", 0, 8, 1, 0, NULL, 0, NULL},
	{"id=\"comment", 0, 8, 1, 0, NULL, 0, NULL},
	{"id=\"coment", 0, 8, 1, 0, NULL, 0, NULL},
	{"id=\"trackback", 0, 8, 1, 0, NULL, 0, NULL},
	{"id=\"ping", 0, 8, 1, 0, NULL, 0, NULL},
	{"id=\"followup", 0, 8, 1, 0, NULL, 0, NULL},
	{"id=\"response", 0, 8, 1, 0, NULL, 0, NULL},
	{"name=\"comment", 0, 8, 1, 0, NULL, 0, NULL},
	{"name=\"coment", 0, 8, 1, 0, NULL, 0, NULL},
	{"name=\"trackback", 0, 8, 1, 0, NULL, 0, NULL},
	{"name=\"ping", 0, 8, 1, 0, NULL, 0, NULL},
	{"name=\"followup", 0, 8, 1, 0, NULL, 0, NULL},
	{"name=\"response", 0, 8, 1, 0, NULL, 0, NULL},
	// a lot of the comment boards can be identified because
	// they have a bunch of mailto links, one before each comment
	//{"href=\"mailto", 0, 8, 1, 0, NULL, 0, NULL},
	//{"href=mailto", 0, 8, 1, 0, NULL, 0, NULL},
	// wikipedias
	{"div class=\"editsection", 0, 10, 1, 0, NULL, 0, NULL},
	{"action=edit", 0, 10, 1, 0, NULL, 0, NULL},
	// message boards
	{"anonymous user", 0, 10, 1, 0, NULL, 0, NULL},
	{"anonymer user", 0, 10, 1, 0, NULL, 0, NULL},
	{"date posted", 0, 10, 1, 0, NULL, 0, NULL},
	{"post your notice", 0, 10, 1, 0, NULL, 0, NULL},
	{"edit this page", 0, 10, 1, 0, NULL, 0, NULL},
	// edit
	{"editeditpost a reply", 0, 10, 0, 0, NULL, 0, NULL},
	{"post reply", 0, 10, 0, 0, NULL, 0, NULL},
	{"submit post", 0, 10, 0, 0, NULL, 0, NULL},
	{">post message", 0, 10, 0, 0, NULL, 0, NULL},
	{">post a comment", 0, 10, 0, 0, NULL, 0, NULL},
	{">leave a comment", 0, 10, 0, 0, NULL, 0, NULL},
	{">post comments", 0, 10, 0, 0, NULL, 0, NULL},
	// Comments (0) after each posted entry...
	//{">comments<", 0, 10, 1, 0, NULL, 0, NULL},
	{"comments: <", 0, 10, 1, 0, NULL, 0, NULL},
	{"comments:<", 0, 10, 1, 0, NULL, 0, NULL},
	//{"comment:", 0, 10, 1, 0, NULL, 0, NULL},
	{"reacties:", 0, 10, 1, 0, NULL, 0, NULL},
	{"comentarios:", 0, 10, 1, 0, NULL, 0, NULL},
	{"comentários:", 0, 10, 1, 0, NULL, 0, NULL},
	{">message:", 0, 10, 0, 0, NULL, 0, NULL},
	{">mensagem:", 0, 10, 0, 0, NULL, 0, NULL},
	{">faca seu comentario", 0, 10, 0, 0, NULL, 0, NULL},
	{">faça seu comentário", 0, 10, 0, 0, NULL, 0, NULL},
	// comment add in german
	// NOTE(review): this is the only needle containing an uppercase
	// letter -- confirm getMatches2() matches case-insensitively
	{">Kommentar hinzuf", 0, 10, 0, 0, NULL, 0, NULL},
	{"rate this link", 0, 10, 0, 0, NULL, 0, NULL},
	{"link submit", 0, 10, 0, 0, NULL, 0, NULL},
	{"links directory", 0, 10, 0, 0, NULL, 0, NULL},
	{">add my comment", 0, 10, 0, 0, NULL, 0, NULL},
	// title of the text area box
	{">your comment", 0, 10, 0, 0, NULL, 0, NULL},
	{"your comment<", 0, 10, 0, 0, NULL, 0, NULL},
	{">comment by", 0, 10, 1, 0, NULL, 0, NULL},
	{">scrivi un commento", 0, 10, 0, 0, NULL, 0, NULL},
	{">scrivi il tuo commento", 0, 10, 0, 0, NULL, 0, NULL},
	{"add comment", 0, 10, 0, 0, NULL, 0, NULL},
	{"trackbacks for the art", 0, 12, 1, 0, NULL, 0, NULL},
	{"these trackbacks have been re", 0, 13, 1, 0, NULL, 0, NULL},
	{"trackback pings", 0, 13, 1, 0, NULL, 0, NULL},
	{"read the rest of this com", 0, 13, 1, 0, NULL, 0, NULL},
	// that was the opinion of ...
	{"das war die meinung von", 0, 13, 1, 0, NULL, 0, NULL},
	{"resource partner", 0, 49, 0, 0, NULL, 0, NULL},
	{"partner link", 0, 50, 0, 0, NULL, 0, NULL},
	{"partner site", 0, 51, 0, 0, NULL, 0, NULL},
	{"sign the guestbook", 0, 43, 0, 0, NULL, 0, NULL},
	//{"add new comment", 0, 14, 0, 0, NULL, 0, NULL},
	//{"add message", 0, 14, 0, 0, NULL, 0, NULL},
	// tagboard software allows free submits. it has this in
	// an html comment tag...
	{"2002 natali ardianto", 0, 14, 0, 0, NULL, 0, NULL},
	// guestbooks
	{"guestbooksponsors<", 0, 48, 0, 0, NULL, 0, NULL},
	{">sponsor<", 0, 48, 0, 0, NULL, 0, NULL},
	{">sponsored<", 0, 48, 0, 0, NULL, 0, NULL},
	{">submit site<", 0, 48, 0, 0, NULL, 0, NULL},
	{": sponsor", 0, 48, 0, 0, NULL, 0, NULL},
	{"/sponsor/", 0, 48, 0, 0, NULL, 0, NULL},
	{"*sponsors*", 0, 48, 0, 0, NULL, 0, NULL},
	{">payperpost", 0, 48, 0, 0, NULL, 0, NULL},
	{"sponsored post", 0, 48, 0, 0, NULL, 0, NULL},
	{"sponsored flag", 0, 48, 0, 0, NULL, 0, NULL},
	{"sponsoredflag", 0, 48, 0, 0, NULL, 0, NULL},
	{"sponsored listing", 0, 48, 1, 0, NULL, 0, NULL},
	{"sponsored link", 0, 48, 1, 0, NULL, 0, NULL},
	{"post is sponsor", 0, 48, 0, 0, NULL, 0, NULL},
	{"paid post", 0, 48, 0, 0, NULL, 0, NULL},
	{"powered by", 0, 48, 0, 0, NULL, 0, NULL}, // wordpress
	{"suggest your website", 0, 48, 0, 0, NULL, 0, NULL},
	{"advertisement:", 0, 48, 1, 0, NULL, 0, NULL}
};
// . outlink needles: setLinkSpam() runs getMatches2() with this table over
//   the buffer returned by links->getLinkBuf()
// . if any one of these substrings matches, the whole doc is treated as
//   link spam and the matching string is used as the note
static Needle s_needles2[] = {
	{ "cyber-robotics.com", 0, 0, 0, 0, NULL, 0, NULL },
	{ "cyberspacehq.com",   0, 0, 0, 0, NULL, 0, NULL },
	{ "links4trade.com",    0, 0, 0, 0, NULL, 0, NULL },
	{ "searchfeed.com",     0, 0, 0, 0, NULL, 0, NULL },
	{ "marketnex.com",      0, 0, 0, 0, NULL, 0, NULL },
	{ "partnersignup",      0, 0, 0, 0, NULL, 0, NULL },
	{ "publisher-network",  0, 0, 0, 0, NULL, 0, NULL },
	// disabled entries, kept for reference:
	//{ "amazon.com",       0, 0, 0, 0, NULL, 0, NULL },
	//{ "dmoz.org",         0, 0, 0, 0, NULL, 0, NULL },
	//{ "dmoz.com",         0, 0, 0, 0, NULL, 0, NULL },
	{ "linksmanager",       0, 0, 0, 0, NULL, 0, NULL },
	{ "changinglinks",      0, 0, 0, 0, NULL, 0, NULL }
};
// . we set the bit in linkdb for a doc if this returns true
// . it precludes a doc from voting if its bit is set in linkdb
// . this saves resources
// . the isLinkSpam() function is used when we have the linkee url
// . note is only set if the whole doc can not vote for some reason
// . otherwise, each outlink in "links" is assigned a "note" to indicate if
// the outlink is a spam link or not
// . returns true on success, false on error
bool setLinkSpam ( long ip ,
long *indCatIds ,
long numIndCatIds ,
Url *linker ,
long siteNumInlinks ,
Xml *xml ,
Links *links ,
bool isContentTruncated ,
long niceness ) {
// get our url
//Url *linker = tr->getUrl();
// it is critical to get inlinks from all pingserver xml
// pages regardless if they are often large pages. we
// have to manually hard-code the ping servers in for now.
if ( linker->isPingServer() ) return false;
// if the doc got truncated we may be missing valuable identifiers
// that identify the doc as a guestbook or something
if ( isContentTruncated )
return links->setAllSpamBits("doc too big");
// get linker quality
//long q = tr->getDocQuality();
// do not allow .info or .biz to vote ever for now
char *tld = linker->getTLD();
long tldLen = linker->getTLDLen();
if ( tldLen == 4 && strncmp ( tld, "info" , tldLen) == 0 && //q < 55 )
siteNumInlinks < 20 )
return links->setAllSpamBits("low quality .info linker");
if ( tldLen == 3 && strncmp ( tld, "biz" , tldLen) == 0 && //q < 55 )
siteNumInlinks < 20 )
return links->setAllSpamBits("low quality .biz linker");
// if has an outlink to dmoz-identified porn, all outlinks are spam
long *ids = indCatIds;//NULL
long nids = numIndCatIds;//0;
//if ( tr ) ids = tr->getIndCatids();
//if ( tr ) nids = tr->getNumIndCatids();
for ( long j = 0 ; j < nids ; j++ )
if ( g_categories && g_categories->isIdAdult ( ids[j] ) )
return links->setAllSpamBits("dmoz porn");
QUICKPOLL( niceness );
// do we contain a dmoz subpath in our url? that would indicate that
// we are probably a dmoz mirror!
char *zstart = linker->getPath();
long zlen = linker->getPathLen();
char *zend = zstart + zlen;
// start at the end of the path
char *z = zend-1;
// back up to previous /
for ( ; z > zstart && *z != '/' ; z-- );
// make that the new end
zend = z + 1;
// need at least 2 path components before checking... keep count
long zcount = 2;
// begin the loop
while ( z > zstart ) {
// . backup until we hit the previous /
for ( z-- ; z > zstart && *z != '/' ; z-- );
// debug
//char tmp[2000];
//memcpy(tmp,z,zend-z);
//tmp[zend-z]=0;
//log("build: path=%s",tmp);
// look it up
// "/Arts/" --> 1
if ( --zcount > 0 ) continue;
if ( g_categories->getIndexFromPath (z, zend-z) < 0 ) continue;
// consider ourselves a dmoz mirror and discount all outlinks
return links->setAllSpamBits("dmoz subpath in url");
}
QUICKPOLL( niceness );
// guestbook in hostname - domain?
char *hd = linker->getHost();
char *hd2 = linker->getDomain();
long hdlen = hd2 - hd;
if ( hd && hd2 && hdlen < 30 ) {
char c = hd[hdlen];
hd[hdlen] = '\0';
bool hasIt = false;
if ( strstr ( hd , "guestbook" ) ) hasIt = true;
hd[hdlen] = c;
if ( hasIt )
return links->setAllSpamBits("guestbook in hostname");
}
// do not allow any cgi url to vote
if ( linker->isCgi() )
return links->setAllSpamBits("path is cgi");
long plen = linker->getPathLen();
// if the page has just one rel=nofollow tag then we know they
// are not a guestbook
//if ( links->hasRelNoFollow() ) plen = 0;
if ( plen > 1 ) {
char *p = linker->getPath();
//char c = p[plen-1];
//p[plen-1] = '\0';
//bool val = false;
char *note = NULL;
if ( strncasestr ( p , "guest",plen,5) )
note = "path has guest" ;
else if ( strncasestr ( p , "cgi",plen,3) )
note = "path has cgi" ;
else if ( strncasestr ( p , "gast",plen,4) )
note = "path has gast" ;
// german
else if ( strncasestr ( p , "gaest",plen,5) )
note = "path has gaest" ;
else if ( strncasestr ( p , "gbook",plen,5) )
note = "path has gbook" ;
// vietnamese?
else if ( strncasestr ( p , "akobook",plen,7) )
note = "path has akobook" ;
else if ( strncasestr ( p , "/gb",plen,3) )
note = "path has /gb" ;
else if ( strncasestr ( p , "msg",plen,3 ) )
note = "path has msg" ;
else if ( strncasestr ( p , "messag",plen,6) )
note = "path has messag" ;
else if ( strncasestr ( p , "board",plen,5) )
note = "path has board" ;
else if ( strncasestr ( p , "coment",plen,6) )
note = "path has coment" ;
else if ( strncasestr ( p , "comment",plen,7) )
note = "path has comment" ;
else if ( strncasestr ( p , "linktrader",plen,10) )
note = "path has linktrader" ;
else if ( strncasestr ( p , "tradelinks",plen,10) )
note = "path has tradelinks" ;
else if ( strncasestr ( p , "trade-links",plen,11) )
note = "path has trade-links" ;
else if ( strncasestr ( p , "linkexchange",plen,12) )
note = "path has linkexchange" ;
else if ( strncasestr ( p , "link-exchange",plen,13 ) )
note = "path has link-exchange" ;
else if ( strncasestr ( p , "reciprocal-link",plen,15) )
note = "path has reciprocal-link";
else if ( strncasestr ( p , "reciprocallink",plen, 14) )
note = "path has reciprocallink" ;
else if ( strncasestr ( p , "/trackbacks/",plen,12 ) )
note = "path has /trackbacks/" ;
//if ( gb_strcasestr ( p , "archive" ) ) val = true;
//if ( gb_strcasestr ( p , ".asp" ) ) val = true;
//if ( gb_strcasestr ( p , ".aspx" ) ) val = true;
// these are mostly link exchange pages. no they are not!
//if ( gb_strcasestr ( p , "link" ) ) val = true;
//p[plen-1] = c;
//if ( val ) { note = "cgi or guestbook url"; return true; }
if ( note ) return links->setAllSpamBits(note);
}
QUICKPOLL( niceness );
// does title contain "web statistics for"?
long tlen ;
char *title = xml->getString ( "title" , &tlen );
if ( title && tlen > 0 ) {
// normalize title into buffer, remove non alnum chars
char buf[256];
char *d = buf;
char *dend = buf + 250;
char *s = title;
char *send = title + tlen;
while ( d < dend && s < send ) {
// remove punct
if ( ! is_alnum_a(*s) ) { s++; continue; }
*d = to_lower_a ( *s );
d++;
s++;
}
*d = '\0';
// see if it matches some catch phrases
bool val = false;
if ( strstr (buf,"webstatisticsfor" )) val = true;
if ( strstr (buf,"webserverstatisticsfor")) val = true;
else if ( strstr (buf,"usagestatisticsfor" )) val = true;
else if ( strstr (buf,"siteusageby" )) val = true;
else if ( strstr (buf,"surfstatsloganal" )) val = true;
else if ( strstr (buf,"webstarterhelpstats" )) val = true;
else if ( strstr (buf,"sitestatistics" )) val = true;
if ( val ) return links->setAllSpamBits("stats page");
}
QUICKPOLL( niceness );
/////////////////////////////////////////////////////
//
// check content for certain keywords and phrases
//
/////////////////////////////////////////////////////
//char *haystack = tr->getContent();
//long haystackSize = tr->getContentLen();
char *haystack = xml->getContent();
long haystackSize = xml->getContentLen();
// get our page quality, it serves as a threshold for some algos
//char quality = tr->getNewQuality();
//char *linkPos = NULL;
//if ( linkNode >= 0 ) linkPos = xml->getNode ( linkNode );
//if ( strstr ( linker->getUrl() , "usa_apartments1.htm") ) {
// log("hey");
// sleep(7);
//}
// loop:
// do not call them "bad links" if our link occurs before any
// comment section. our link's position therefore needs to be known,
// that is why we pass in linkPos.
// "n" is the number it matches.
long numNeedles1 = sizeof(s_needles1)/sizeof(Needle);
bool hadPreMatch;
getMatches2 ( s_needles1 ,
numNeedles1 ,
haystack ,
haystackSize ,
NULL , // linkPos ,
NULL , // &n ,
false , // stopAtFirstMatch
&hadPreMatch ,
true , // save quicktables
niceness );
QUICKPOLL( niceness );
// see if we got a hit
char *minPtr = NULL;
char *note = NULL;
for ( long i = 0 ; i < numNeedles1 ; i++ ) {
// open.thumbshots.org needs multiple counts
if ( i == 0 && s_needles1[i].m_count < 5 ) continue;
// skip if no matches on this string
if ( s_needles1[i].m_count <= 0 ) continue;
// ok, if it had its section bit set to 0 that means the
// whole page is link spam!
if ( s_needles1[i].m_isSection == 0 )
return links->setAllSpamBits(s_needles1[i].m_string );
// get the char ptr
char *ptr = s_needles1[i].m_firstMatch;
// set to the min
if ( ! minPtr || ptr < minPtr ) {
note = s_needles1[i].m_string;
minPtr = ptr;
}
}
QUICKPOLL( niceness );
// convert the char ptr into a link node following it
long aa = 0;
if ( minPtr ) aa = links->getNumLinks();
long mini = -1;
for ( long i = 0 ; i < aa ; i++ ) {
// get the link's char ptr into the content
long linkNode = links->getNodeNum(i);
char *linkPos = NULL;
if ( linkNode >= 0 ) linkPos = xml->getNode ( linkNode );
// now we can compare, if BEFORE this comment section
// indicating tag, we are NOT link spam, so continue
if ( linkPos < minPtr ) continue;
// otherwise, we are the first, stop.
mini = i;
break;
}
QUICKPOLL( niceness );
// now count all the links BELOW this match as link spam
// but everyone else is ok!
if ( minPtr && mini >= 0 )
links->setSpamBits ( note , mini );
// now check outlinks on the page for these substrings
haystack = links->getLinkBuf();
haystackSize = links->getLinkBufLen();
long numNeedles2 = sizeof(s_needles2)/sizeof(Needle);
getMatches2 ( s_needles2 ,
numNeedles2 ,
haystack ,
haystackSize ,
NULL , // linkPos,
NULL , // &n ,
false , // stopAtFirstMatch?
NULL ,
true , // save quicktables
niceness );
QUICKPOLL( niceness );
// see if we got a hit
for ( long i = 0 ; i < numNeedles2 ; i++ ) {
// skip if did not match
if ( s_needles2[i].m_count <= 0 ) continue;
// the whole doc is considered link spam
return links->setAllSpamBits(s_needles2[i].m_string);
}
QUICKPOLL( niceness );
//skiplinks:
// check for certain post tag, indicative of a comment-friendly blog
//