// http://www.propeciauk.co.uk/links.htm // http://www.hendersonvillehomepro.com/FavoriteLinks/Default.aspx // http://www.viacreme-viacream-viagra.com/health/pharmacies.htm // are the same description for viagrapunch.com. why did they not cancel? #include "linkspam.h" #include "Url.h" #include "Linkdb.h" //#include "TitleRec.h" #include "Unicode.h" #include "matches2.h" #include "Categories.h" bool isLinkChain ( Xml *xml , Url *linker , Url *linkee , long linkNode , char **note ) ; // . here's some additional things to mark it as a log page, but these // depend on the content of the page, not the url itself. // . fields: string, stringLen, id, section? // . section is "1" if the substring identifies the start of a comment // section, so that any links above that identifier should be // consider good, and any below, should be considered bad links. // Otherwise, if section is 0, if the match occurs anywhere on the // page then all links on the page should be considered bad. static Needle s_needles1[] = { {"open.thumbshots.org" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , //{"google-ad" , 0 , 0 , 0 , 0 , NULL } , // indicates search results page // this often directly precedes the comment section {"[trackback" , 0 , 1 , 1 , 0 , NULL , 0 , NULL } , {"class=\"comtext" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"class=\"comment" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"class=\"coment" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"class=\"trackback" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"class=\"ping" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"class=\"followup" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"class=\"response" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , // this can signify a blog entry, not just a comment //{"class=\"entry" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , // these seem to be more indicative of posted comments {"class=\"posted" , 0 , 8 , 1 , 0 , NULL , 0 }, {"id=\"posted" , 0 , 8 , 1 , 0 , NULL , 0 }, {"name=\"posted" , 0 , 8 , 1 , 0 , NULL , 0 }, // annoying little textbox thingy 
{"class=\"shoutbox" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"id=\"comment" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"id=\"coment" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"id=\"trackback" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"id=\"ping" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"id=\"followup" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"id=\"response" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"name=\"comment" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"name=\"coment" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"name=\"trackback" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"name=\"ping" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"name=\"followup" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , {"name=\"response" , 0 , 8 , 1 , 0 , NULL , 0 , NULL } , // a lot of the comment boards can be identified because // they have a bunch of mailto links, one before each comment //{"href=\"mailto" , 0 , 8 , 1 , 0 , NULL , 0 }, //{"href=mailto" , 0 , 8 , 1 , 0 , NULL , 0 }, // wikipedias {"div class=\"editsection" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"action=edit" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , // message boards {"anonymous user" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"anonymer user" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"date posted" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"post your notice" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"edit this page" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , // edit
{"editeditpost a reply" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"post reply" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"submit post" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">post message" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">post a comment" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">leave a comment" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">post comments" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , // Comments (0) after each posted entry... //{">comments<" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"comments: <" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"comments:<" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , //{"comment:" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"reacties:" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"comentarios:" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {"comentários:" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {">message:" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">mensagem:" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">faca seu comentario" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">faça seu comentário" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , // comment add in german {">Kommentar hinzuf" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"rate this link" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"link submit" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"links directory" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">add my comment" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , // title of the text area box {">your comment" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"your comment<" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">comment by" , 0 , 10, 1 , 0 , NULL , 0 , NULL } , {">scrivi un commento" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {">scrivi il tuo commento" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"add comment" , 0 , 10, 0 , 0 , NULL , 0 , NULL } , {"trackbacks for the art" , 0 , 12, 1 , 0 , NULL , 0 , NULL } , {"these trackbacks have been re", 0 , 13, 1 , 0 , NULL , 0 , NULL } , {"trackback pings" , 0 , 13, 1 , 0 , NULL , 0 , NULL } , {"read the rest of this com" , 0 , 13, 1 , 0 , NULL , 0 , NULL } , // that was the opinion of 
... {"das war die meinung von" , 0 , 13, 1 , 0 , NULL , 0 , NULL } , {"resource partner" , 0 , 49, 0 , 0 , NULL , 0 , NULL } , {"partner link" , 0 , 50, 0 , 0 , NULL , 0 , NULL } , {"partner site" , 0 , 51, 0 , 0 , NULL , 0 , NULL } , {"sign the guestbook" , 0 , 43, 0 , 0 , NULL , 0 , NULL } , //{"add new comment" , 0 , 14, 0 , 0 , NULL , 0 }, //{"add message" , 0 , 14, 0 , 0 , NULL , 0 }, // tagboard software allows free submits. it has this in // an html comment tag... {"2002 natali ardianto" , 0 , 14, 0 , 0 , NULL , 0 , NULL } , // guestbooks {"guestbooksponsors<" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {">sponsor<" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {">sponsored<" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {">submit site<" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {": sponsor" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"/sponsor/" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"*sponsors*" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {">payperpost" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"sponsored post" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"sponsored flag" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"sponsoredflag" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"sponsored listing" , 0 , 48, 1 , 0 , NULL , 0 , NULL } , {"sponsored link" , 0 , 48, 1 , 0 , NULL , 0 , NULL } , {"post is sponsor" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"paid post" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"powered by" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , // wordpress {"suggest your website" , 0 , 48, 0 , 0 , NULL , 0 , NULL } , {"advertisement:" , 0 , 48, 1 , 0 , NULL , 0 , NULL } }; // now check outlinks on the page for these substrings static Needle s_needles2[] = { {"cyber-robotics.com" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"cyberspacehq.com" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"links4trade.com" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"searchfeed.com" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"marketnex.com" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"partnersignup" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"publisher-network" 
, 0 , 0 , 0 , 0 , NULL , 0 , NULL } , //{"amazon.com" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , //{"dmoz.org" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , //{"dmoz.com" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"linksmanager" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } , {"changinglinks" , 0 , 0 , 0 , 0 , NULL , 0 , NULL } }; // . we set the bit in linkdb for a doc if this returns true // . it precludes a doc from voting if its bits is set in linkdb // . this saves resources // . the isLinkSpam() function is used when we have the linkee url // . note is only set if the whole doc can not vote for some reason // . otherwise, each outlink in "links" is assigned a "note" to indicate if // the outlink is a spam link or not // . returns true on success, false on error bool setLinkSpam ( long ip , long *indCatIds , long numIndCatIds , Url *linker , long siteNumInlinks , Xml *xml , Links *links , bool isContentTruncated , long niceness ) { // get our url //Url *linker = tr->getUrl(); // it is critical to get inlinks from all pingserver xml // pages regardless if they are often large pages. we // have to manually hard-code the ping servers in for now. 
if ( linker->isPingServer() ) return false; // if the doc got truncated we may be missing valuable identifiers // that identify the doc as a guestbook or something if ( isContentTruncated ) return links->setAllSpamBits("doc too big"); // get linker quality //long q = tr->getDocQuality(); // do not allow .info or .biz to vote ever for now char *tld = linker->getTLD(); long tldLen = linker->getTLDLen(); if ( tldLen == 4 && strncmp ( tld, "info" , tldLen) == 0 && //q < 55 ) siteNumInlinks < 20 ) return links->setAllSpamBits("low quality .info linker"); if ( tldLen == 3 && strncmp ( tld, "biz" , tldLen) == 0 && //q < 55 ) siteNumInlinks < 20 ) return links->setAllSpamBits("low quality .biz linker"); // if has an outlink to dmoz-identified porn, all outlinks are spam long *ids = indCatIds;//NULL long nids = numIndCatIds;//0; //if ( tr ) ids = tr->getIndCatids(); //if ( tr ) nids = tr->getNumIndCatids(); for ( long j = 0 ; j < nids ; j++ ) if ( g_categories && g_categories->isIdAdult ( ids[j] ) ) return links->setAllSpamBits("dmoz porn"); QUICKPOLL( niceness ); // do we contain a dmoz subpath in our url? that would indicate that // we are probably a dmoz mirror! char *zstart = linker->getPath(); long zlen = linker->getPathLen(); char *zend = zstart + zlen; // start at the end of the path char *z = zend-1; // back up to previous / for ( ; z > zstart && *z != '/' ; z-- ); // make that the new end zend = z + 1; // need at least 2 path components before checking... keep count long zcount = 2; // begin the loop while ( z > zstart ) { // . 
backup until we hit the previous / for ( z-- ; z > zstart && *z != '/' ; z-- ); // debug //char tmp[2000]; //memcpy(tmp,z,zend-z); //tmp[zend-z]=0; //log("build: path=%s",tmp); // look it up // "/Arts/" --> 1 if ( --zcount > 0 ) continue; if ( g_categories->getIndexFromPath (z, zend-z) < 0 ) continue; // consider ourselves a dmoz mirror and discount all outlinks return links->setAllSpamBits("dmoz subpath in url"); } QUICKPOLL( niceness ); // guestbook in hostname - domain? char *hd = linker->getHost(); char *hd2 = linker->getDomain(); long hdlen = hd2 - hd; if ( hd && hd2 && hdlen < 30 ) { char c = hd[hdlen]; hd[hdlen] = '\0'; bool hasIt = false; if ( strstr ( hd , "guestbook" ) ) hasIt = true; hd[hdlen] = c; if ( hasIt ) return links->setAllSpamBits("guestbook in hostname"); } // do not allow any cgi url to vote if ( linker->isCgi() ) return links->setAllSpamBits("path is cgi"); long plen = linker->getPathLen(); // if the page has just one rel=nofollow tag then we know they // are not a guestbook //if ( links->hasRelNoFollow() ) plen = 0; if ( plen > 1 ) { char *p = linker->getPath(); //char c = p[plen-1]; //p[plen-1] = '\0'; //bool val = false; char *note = NULL; if ( strncasestr ( p , "guest",plen,5) ) note = "path has guest" ; else if ( strncasestr ( p , "cgi",plen,3) ) note = "path has cgi" ; else if ( strncasestr ( p , "gast",plen,4) ) note = "path has gast" ; // german else if ( strncasestr ( p , "gaest",plen,5) ) note = "path has gaest" ; else if ( strncasestr ( p , "gbook",plen,5) ) note = "path has gbook" ; // vietnamese? 
else if ( strncasestr ( p , "akobook",plen,7) ) note = "path has akobook" ; else if ( strncasestr ( p , "/gb",plen,3) ) note = "path has /gb" ; else if ( strncasestr ( p , "msg",plen,3 ) ) note = "path has msg" ; else if ( strncasestr ( p , "messag",plen,6) ) note = "path has messag" ; else if ( strncasestr ( p , "board",plen,5) ) note = "path has board" ; else if ( strncasestr ( p , "coment",plen,6) ) note = "path has coment" ; else if ( strncasestr ( p , "comment",plen,7) ) note = "path has comment" ; else if ( strncasestr ( p , "linktrader",plen,10) ) note = "path has linktrader" ; else if ( strncasestr ( p , "tradelinks",plen,10) ) note = "path has tradelinks" ; else if ( strncasestr ( p , "trade-links",plen,11) ) note = "path has trade-links" ; else if ( strncasestr ( p , "linkexchange",plen,12) ) note = "path has linkexchange" ; else if ( strncasestr ( p , "link-exchange",plen,13 ) ) note = "path has link-exchange" ; else if ( strncasestr ( p , "reciprocal-link",plen,15) ) note = "path has reciprocal-link"; else if ( strncasestr ( p , "reciprocallink",plen, 14) ) note = "path has reciprocallink" ; else if ( strncasestr ( p , "/trackbacks/",plen,12 ) ) note = "path has /trackbacks/" ; //if ( gb_strcasestr ( p , "archive" ) ) val = true; //if ( gb_strcasestr ( p , ".asp" ) ) val = true; //if ( gb_strcasestr ( p , ".aspx" ) ) val = true; // these are mostly link exchange pages. no they are not! //if ( gb_strcasestr ( p , "link" ) ) val = true; //p[plen-1] = c; //if ( val ) { note = "cgi or guestbook url"; return true; } if ( note ) return links->setAllSpamBits(note); } QUICKPOLL( niceness ); // does title contain "web statistics for"? long tlen ; char *title = xml->getString ( "title" , &tlen ); if ( title && tlen > 0 ) { // normalize title into buffer, remove non alnum chars char buf[256]; char *d = buf; char *dend = buf + 250; char *s = title; char *send = title + tlen; while ( d < dend && s < send ) { // remove punct if ( ! 
is_alnum_a(*s) ) { s++; continue; } *d = to_lower_a ( *s ); d++; s++; } *d = '\0'; // see if it matches some catch phrases bool val = false; if ( strstr (buf,"webstatisticsfor" )) val = true; if ( strstr (buf,"webserverstatisticsfor")) val = true; else if ( strstr (buf,"usagestatisticsfor" )) val = true; else if ( strstr (buf,"siteusageby" )) val = true; else if ( strstr (buf,"surfstatsloganal" )) val = true; else if ( strstr (buf,"webstarterhelpstats" )) val = true; else if ( strstr (buf,"sitestatistics" )) val = true; if ( val ) return links->setAllSpamBits("stats page"); } QUICKPOLL( niceness ); ///////////////////////////////////////////////////// // // check content for certain keywords and phrases // ///////////////////////////////////////////////////// //char *haystack = tr->getContent(); //long haystackSize = tr->getContentLen(); char *haystack = xml->getContent(); long haystackSize = xml->getContentLen(); // get our page quality, it serves as a threshold for some algos //char quality = tr->getNewQuality(); //char *linkPos = NULL; //if ( linkNode >= 0 ) linkPos = xml->getNode ( linkNode ); //if ( strstr ( linker->getUrl() , "usa_apartments1.htm") ) { // log("hey"); // sleep(7); //} // loop: // do not call them "bad links" if our link occurs before any // comment section. our link's position therefore needs to be known, // that is why we pass in linkPos. // "n" is the number it matches. 
long numNeedles1 = sizeof(s_needles1)/sizeof(Needle); bool hadPreMatch; getMatches2 ( s_needles1 , numNeedles1 , haystack , haystackSize , NULL , // linkPos , NULL , // &n , false , // stopAtFirstMatch &hadPreMatch , true , // save quicktables niceness ); QUICKPOLL( niceness ); // see if we got a hit char *minPtr = NULL; char *note = NULL; for ( long i = 0 ; i < numNeedles1 ; i++ ) { // open.thumbshots.org needs multiple counts if ( i == 0 && s_needles1[i].m_count < 5 ) continue; // skip if no matches on this string if ( s_needles1[i].m_count <= 0 ) continue; // ok, if it had its section bit set to 0 that means the // whole page is link spam! if ( s_needles1[i].m_isSection == 0 ) return links->setAllSpamBits(s_needles1[i].m_string ); // get the char ptr char *ptr = s_needles1[i].m_firstMatch; // set to the min if ( ! minPtr || ptr < minPtr ) { note = s_needles1[i].m_string; minPtr = ptr; } } QUICKPOLL( niceness ); // convert the char ptr into a link node following it long aa = 0; if ( minPtr ) aa = links->getNumLinks(); long mini = -1; for ( long i = 0 ; i < aa ; i++ ) { // get the link's char ptr into the content long linkNode = links->getNodeNum(i); char *linkPos = NULL; if ( linkNode >= 0 ) linkPos = xml->getNode ( linkNode ); // now we can compare, if BEFORE this comment section // indicating tag, we are NOT link spam, so continue if ( linkPos < minPtr ) continue; // otherwise, we are the first, stop. mini = i; break; } QUICKPOLL( niceness ); // now count all the links BELOW this match as link spam // but everyone else is ok! if ( minPtr && mini >= 0 ) links->setSpamBits ( note , mini ); // now check outlinks on the page for these substrings haystack = links->getLinkBuf(); haystackSize = links->getLinkBufLen(); long numNeedles2 = sizeof(s_needles2)/sizeof(Needle); getMatches2 ( s_needles2 , numNeedles2 , haystack , haystackSize , NULL , // linkPos, NULL , // &n , false , // stopAtFirstMatch? 
NULL , true , // save quicktables niceness ); QUICKPOLL( niceness ); // see if we got a hit for ( long i = 0 ; i < numNeedles2 ; i++ ) { // skip if did not match if ( s_needles2[i].m_count <= 0 ) continue; // the whole doc is considered link spam return links->setAllSpamBits(s_needles2[i].m_string); } QUICKPOLL( niceness ); //skiplinks: // check for certain post tag, indicative of a comment-friendly blog //

// // long nn = xml->getNumNodes(); bool gotTextArea = false; bool gotSubmit = false; for ( long i=0; i < nn ; i++ ) { //

tags are bad... but only if we have not
		// matched "track" or whatever from above... check for that
		// if you uncommment this... otherwise you disable all blogs!
		// Only do this check if we did match a comment related phrase
		// in s_needles1[] BUT it was BEFORE our outlink. That 
		// basically means that we do *not* recognize the format of 
		// the comment page and so therefore need to be more 
		// restrictive about allowing this page to vote.
		if ( ! hadPreMatch ) {
			// is it a <textarea> tag?
			if ( xml->getNodeId ( i ) == TAG_TEXTAREA ) 
				gotTextArea = true;
			// is it an <input> tag?
			long len = 0;
			if ( xml->getNodeId ( i ) == TAG_INPUT &&
			     xml->getString(i,"submit",&len)) gotSubmit = true;
		}
		// check for script tag
		/*
		if ( xml->getNodeId(i) == TAG_SCRIPT && quality < 80 ) {
			// <script src=blah.com/fileparse.js" 
			// type="text/javascript"> is used to hide google
			// ads, so don't allow those pages to vote either
			long  slen; xml->getString(i,"src",&slen);
			if ( slen > 0 ) { *note = "script src"; return true; }
		}
		*/
		if ( xml->getNodeId ( i ) != TAG_FORM ) continue;
			
		// get the method field of this base tag
		long  slen;
		char *s = (char *) xml->getString(i,"method",&slen);
		// if not thee, skip it
		if ( ! s || slen <= 0 ) continue;
		//if ( slen != 4 ) continue;
		// if not a post, skip it
		//if ( strncasecmp ( s , "post" , 4 ) ) continue;
		// get the action url
		s = (char *) xml->getString(i,"action",&slen);
		if ( ! s || slen <= 0 ) continue;
		char c = s[slen];
		s[slen]='\0';
		bool val = false;
		// this is a bit too strong, but i'ev seen an action of
		// "cgi-bin/mt-leaveone.cgi" so we can't rely on "mt-comment"
		if      ( strstr ( s , "comment" ) ) val = true;
		else if ( strstr ( s , "/MT/" ) ) val = true;
		else if ( strstr ( s , "/mt/" ) ) val = true;
		// they can have these search boxes though
		if ( val && strstr ( s , "/mt/mt-search" ) ) val = false;
		//else if ( strstr ( s , "cgi"     ) ) val = true;
		// eliminate some false positives
		//if ( val && strstr ( s , "search" ) ) val = false;
		s[slen] = c;
		if ( val ) return links->setAllSpamBits("post page");
		QUICKPOLL( niceness );
	}

if ( gotTextArea && gotSubmit )
		return links->setAllSpamBits("textarea tag");

QUICKPOLL( niceness );

// edu, gov, etc. can have link chains
	if ( tldLen >= 3 && strncmp ( tld, "edu" , 3) == 0 ) return true;
	if ( tldLen >= 3 && strncmp ( tld, "gov" , 3) == 0 ) return true;
	// . allow sites in dmoz to have link chains, too
	// . no, there are too many spam porn sites in dmoz
	//if ( tr->getNumIndCatids() || tr->getNumCatids()   ) return false;

// if linker is naughty, he cannot vote... how did he make it in?
	if ( linker->isSpam() )
		return links->setAllSpamBits("linker is sporny");

// . if they link to any adult site, consider them link spam
	// . just consider a 100 link radius around linkNode
	long nl = links->getNumLinks();
	for ( long i = 0 ; i < nl ; i++ ) {
		// skip if this link is internal, we will add it to linkdb
		// anyway... this will save us some processing time
		if ( links->isInternalDom(i) ) continue;
		// otherwise, normalize it...
		Url uu; uu.set ( links->getLink(i), links->getLinkLen(i) );
		char          *h     = uu.getUrl();
		// include the trailing /
		char          *hend  = uu.getHost() + uu.getHostLen() + 1;
		unsigned long  hhash = hash32 ( h , hend - h );
		if ( g_categories && g_categories->isInBadCat ( hhash ) ) {
			links->setAllSpamBits("links to dmoz filth");
			log(LOG_DEBUG,"build: %s is filthy.",uu.getUrl());
			return true;
		}
		// take a break
		QUICKPOLL ( niceness );
		// . is it near sporny links? (naughty domains or lotsa -'s)
		// . if we are in a list of ads, chances are good the true
		//   nature of the ads will emerge...
		if ( uu.isSpam() ) {
			links->setAllSpamBits("has sporny outlinks");
			log(LOG_DEBUG,"build: %s has sporny outlinks.",
			    uu.getUrl());
			return true;
		}

// check if this url is a link chain
		//if ( q >= 60 ) continue;
		if ( siteNumInlinks >= 50 ) continue;
		char *np = NULL;
		//if ( strncmp("http://www.reliant.com",uu.getUrl(),18)==0 )
		//     log("hey");
		// get the xml node of link #i
		long xmlNode = links->getNodeNum ( i );
		if ( isLinkChain ( xml , linker, &uu, xmlNode, &np ))
			links->setSpamBit ( np , i );
		// take a break
		QUICKPOLL ( niceness );
	}
	return true;
}

bool isLinkSpam ( Url *linker, 
		  long ip ,
		  long *indCatIds ,
		  long  numIndCatIds ,
		  long siteNumInlinks ,
		  //TitleRec *tr, 
		  Xml *xml, 
		  Links *links ,
		  long maxDocLen , 
		  char **note , 
		  Url *linkee , 
		  // node position of the linkee in the linker's content
		  long  linkNode ,
		  char *coll     ,
		  long  niceness ) {
	// it is critical to get inlinks from all pingserver xml
	// pages regardless if they are often large pages. we
	// have to manually hard-code the ping servers in for now.
	if ( linker->isPingServer() ) return false;
	// same host linkers can be link spam (TODO: make same ip block)
	// because we only allow up to 10 to vote as a single voter
	if ( linkee ) {
		char *h1    = linkee->getHost();
		long  h1len = linkee->getHostLen();
		char *h2    = NULL;
		long  h2len = 0;
		//if ( tr ) h2    = tr->getUrl()->getHost();
		//if ( tr ) h2len = tr->getUrl()->getHostLen();
		h2    = linker->getHost();
		h2len = linker->getHostLen();
		if ( h1len == h2len && strncmp ( h1 , h2 , h1len ) == 0 ) 
			return false;
	}
	// do not allow .info or .biz to vote ever for now
	char *tld    = linker->getTLD();
	long  tldLen = linker->getTLDLen();
	if ( tldLen == 4 && strncmp ( tld, "info" , tldLen) == 0 ) {
		*note = ".info tld";
		return true;
	}
	if ( tldLen == 3 && strncmp ( tld, "biz" , tldLen) == 0 ) {
		*note = ".biz tld";
		return true;
	}
	//long ip = tr->getIp();
	char *ips = (char *)&ip;
	if ( ips[0]==70 && ips[1]>=80 && ips[1]<=89 ) {
		*note = "70.8*. ip";
		return true;
	}

// if this page fails zak's page quality algo, do not let it vote
	// like if from .biz or lots of hyphens in the url or an ip-based 
	// url.
	// now we do this on a site by site basis, more accurate that way.
	//  --z
	//unsigned char spamScore = tr->getSpamScore();
	// 	unsigned char spamScore ;
	// 	spamScore = getNegativeQualityWeight ( linker ,
	// 					       xml   ,
	// 					       links ,
	// 					       NULL  , // words
	// 					       coll  ,
	// 					       NULL  , // sr    ,
	// 					       NULL  ,// safebuf
	// 					       0  /*niceness*/); 
	// 	if ( spamScore >= 30 ) {
	// 		*note = "had big spam score";
	// 		return true;
	// 	}

// big pages may have keywords identifying them as log pages cutoff
	// so assume the worst
	//if ( tr->getContentLen() > 100*1024 ) return true;
	// i saw a german doc get its textarea cut out because of this, so
	// we need this here
	//if ( tr && tr->getContentLen() > maxDocLen ) {
	if ( xml && xml->getContentLen() > maxDocLen ) {
		*note ="doc too big";
		return true; 
	}
	// if it has a link to dmoz.org then we are probably a dmoz mirror 
	// because we are required to have a link to dmoz if a mirror
	bool checkForDmoz = true;
	bool checkForAmazon = true;
	if ( linkee && 
	     linkee->getDomainLen() == 8 &&
	     linkee->getDomain() )
		if ( strncmp ( linkee->getDomain() , "dmoz.org" , 8 ) == 0 ||
		     strncmp ( linkee->getDomain() , "dmoz.com" , 8 ) == 0 )
			checkForDmoz = false;

if ( linker && 
	     linker->getDomainLen() == 8 &&
	     linker->getDomain() )
		if ( strncmp ( linker->getDomain() , "dmoz.org" , 8 ) == 0 ||
		     strncmp ( linker->getDomain() , "dmoz.com" , 8 ) == 0 )
			checkForDmoz = false;

// does the url of the linker have a dmoz path in it?
	char *zstart = linker->getPath();
	long  zlen   = linker->getPathLen();
	char *zend   = zstart + zlen;
	// start at the end of the path
	char *z      = zend-1;
	// back up to previous /
	for ( ; z > zstart && *z != '/' ; z-- );
	// make that the new end
	zend = z + 1;
	// need at least 2 path components before checking... keep count
	long  zcount = 2;
	// begin the loop
	while ( checkForDmoz && z > zstart ) {
		// . backup until we hit the previous /
		for ( z-- ; z > zstart && *z != '/' ; z-- );
		// debug
		//char tmp[2000];
		//memcpy(tmp,z,zend-z);
		//tmp[zend-z]=0;
		//log("build: path=%s",tmp);
		// look it up
		// "/Arts/" --> 1
		if ( --zcount > 0 ) continue;
		if ( g_categories->getIndexFromPath (z, zend-z) < 0 ) continue;
		*note = "has dmoz path";
		return true;
	}

QUICKPOLL( niceness );

// if it has a link to amazon.com
	// because we are required to have a link to dmoz if a mirror
	if ( linkee && linkee->getDomain() &&
	     strncmp ( linkee->getDomain(), "amazon.com" , 10 ) == 0 )
		checkForAmazon = false;

if ( linker && linker->getDomain() &&
	     strncmp ( linker->getDomain(), "amazon.com" , 10 ) == 0 )
		checkForAmazon = false;

// guestbook in hostname - domain?
	char *hd  = linker->getHost();
	char *hd2 = linker->getDomain();
	long  hdlen = hd2 - hd;
	if ( hd && hd2 && hdlen < 30 ) {
		char c = hd[hdlen];
		hd[hdlen] = '\0';
		bool hasIt = false;
		if ( strstr ( hd , "guestbook" ) ) hasIt = true;
		hd[hdlen] = c;
		if ( hasIt ) { 
			*note = "guestbook in hostname"; 
			return true; 
		}
	}

// do not allow any cgi url to vote
	if ( linker->isCgi() ) { *note = "path is cgi"; return true; }

// breathe
	QUICKPOLL(niceness);

long plen = linker->getPathLen();

// if very spammy!!
	// if ( spam

// if the page has just one rel=nofollow tag then we know they
	// are not a guestbook
	//if ( links->hasRelNoFollow() ) plen = 0;
	if ( plen > 1 ) {
		char *p    = linker->getPath();
		//char  c    = p[plen-1];
		//p[plen-1] = '\0';
		//bool val = false;
		if ( strncasestr ( p , "guest",plen,5) ) {
			*note = "path has guest"          ; return true; }
		else if ( strncasestr ( p , "cgi",plen,3) ) { 
			*note = "path has cgi"            ; return true; }
		else if ( strncasestr ( p , "gast",plen,4) ) { 
			*note = "path has gast"           ; return true; }
		// german
		else if ( strncasestr ( p , "gaest",plen,5) ) {
			*note = "path has gaest"          ; return true; }
		else if ( strncasestr ( p , "gbook",plen,5) ) { 
			*note = "path has gbook"          ; return true; }
		// vietnamese?
		else if ( strncasestr ( p , "akobook",plen,7) ) { 
			*note = "path has akobook"        ; return true; }
		else if ( strncasestr ( p , "/gb",plen,3) ) { 
			*note = "path has /gb"            ; return true; }
		else if ( strncasestr ( p , "msg",plen,3 ) ) { 
			*note = "path has msg"            ; return true; }
		else if ( strncasestr ( p , "messag",plen,6) ) { 
			*note = "path has messag"         ; return true; }
		else if ( strncasestr ( p , "board",plen,5) ) { 
			*note = "path has board"          ; return true; }
		else if ( strncasestr ( p , "coment",plen,6) ) { 
			*note = "path has coment"         ; return true; }
		else if ( strncasestr ( p , "comment",plen,7) ) { 
			*note = "path has comment"        ; return true; }
		else if ( strncasestr ( p , "linktrader",plen,10) ) { 
			*note = "path has linktrader"     ; return true; }
		else if ( strncasestr ( p , "tradelinks",plen,10) ) { 
			*note = "path has tradelinks"     ; return true; }
		else if ( strncasestr ( p , "trade-links",plen,11) ) { 
			*note = "path has trade-links"    ; return true; }
		else if ( strncasestr ( p , "linkexchange",plen,12) ) { 
			*note = "path has linkexchange"   ; return true; }
		else if ( strncasestr ( p , "link-exchange",plen,13  ) ) {
			*note = "path has link-exchange"  ; return true; }
		else if ( strncasestr ( p , "reciprocal-link",plen,15) ) {
			*note = "path has reciprocal-link"; return true; }
		else if ( strncasestr ( p , "reciprocallink",plen, 14) ) {
			*note = "path has reciprocallink" ; return true; }
		else if ( strncasestr ( p , "/trackbacks/",plen,12 ) ) { 
			*note = "path has /trackbacks/"   ; return true; }

//if ( gb_strcasestr ( p , "archive"   ) ) val = true;
		//if ( gb_strcasestr ( p , ".asp"      ) ) val = true;
		//if ( gb_strcasestr ( p , ".aspx"     ) ) val = true;
		// these are mostly link exchange pages. no they are not!
		//if ( gb_strcasestr ( p , "link"   ) ) val = true;
		//p[plen-1] = c;
		//if ( val ) { *note = "cgi or guestbook url"; return true; }
	}

QUICKPOLL( niceness );

// scan through the content as fast as possible
	char  *content    = xml->getContent(); 
	long   contentLen = xml->getContentLen();

// does title contain "web statistics for"?
	long  tlen ;
	char *title = xml->getString ( "title" , &tlen );
	if ( title && tlen > 0 ) {
		// normalize title into buffer, remove non alnum chars
		char buf[256];
		char *d    = buf;
		char *dend = buf + 250;
		char *s    = title;
		char *send = title + tlen;
		while ( d < dend && s < send ) {
			// remove punct
			if ( ! is_alnum_a(*s) ) { s++; continue; }
			*d = to_lower_a ( *s );
			d++;
			s++;
		}
		*d = '\0';
		// see if it matches some catch phrases
		bool val = false;
		if      ( strstr (buf,"webstatisticsfor"      )) val = true;
		if      ( strstr (buf,"webserverstatisticsfor")) val = true;
		else if ( strstr (buf,"usagestatisticsfor"    )) val = true;
		else if ( strstr (buf,"siteusageby"           )) val = true;
		else if ( strstr (buf,"surfstatsloganal"      )) val = true;
		else if ( strstr (buf,"webstarterhelpstats"   )) val = true;
		else if ( strstr (buf,"sitestatistics"        )) val = true;
		if ( val ) { *note = "stats page"; return true; }
	}

QUICKPOLL( niceness );

/////////////////////////////////////////////////////
	//
	// check content for certain keywords and phrases
	//
	/////////////////////////////////////////////////////

char *haystack     = content;
	long  haystackSize = contentLen;

// get our page quality, it serves as a threshold for some algos
	//char quality = tr->getNewQuality();

char *linkPos = NULL;
	if ( linkNode >= 0 ) linkPos = xml->getNode ( linkNode );

//if ( strstr ( linker->getUrl() , "usa_apartments1.htm") ) {
	//	log("hey");
	//	sleep(7);
	//}

// loop:
	// do not call them "bad links" if our link occurs before any
	// comment section. our link's position therefore needs to be known,
	// that is why we pass in linkPos. 
	// "n" is the number it matches.
	long  n;
	long numNeedles1 = sizeof(s_needles1)/sizeof(Needle);
	bool hadPreMatch;
	getMatches2 ( s_needles1       ,
		      numNeedles1      ,
		      haystack         ,
		      haystackSize     ,
		      linkPos          ,
		      &n               ,
		      false            , // stopAtFirstMatch
		      &hadPreMatch     ,
		      true             , // save quicktables
		      niceness         );

QUICKPOLL( niceness );

// see if we got a hit
	for ( long i = 0 ; i < numNeedles1 ; i++ ) {
		long need = 1;
		// open.thumbshots.org needs multiple counts
		if ( i == 0 ) need = 5;
		if ( s_needles1[i].m_count < need ) continue;
		*note = s_needles1[i].m_string;
		return true;
	}

// now check outlinks on the page for these substrings
	haystack     = links->getLinkBuf();
	haystackSize = links->getLinkBufLen();
	long numNeedles2 = sizeof(s_needles2)/sizeof(Needle);
	getMatches2 ( s_needles2   , 
		      numNeedles2  , 
		      haystack     , 
		      haystackSize , 
		      NULL         ,  // linkPos, 
		      NULL         ,  // &n ,
		      false        ,  // stopAtFirstMatch?
		      NULL         ,  // hadPreMatch?
		      true         ,  // save quicktables
		      niceness     );

QUICKPOLL( niceness );

// see if we got a hit
	for ( long i = 0 ; i < numNeedles2 ; i++ ) {
		long need = 1;
		// open.thumbshots.org needs multiple counts
		//if ( i == 9 ) need = 5;
		if ( s_needles2[i].m_count < need ) continue;
		*note = s_needles2[i].m_string;
		return true;
	}

QUICKPOLL( niceness );

//skiplinks:
	// check for certain post tag, indicative of a comment-friendly blog
	// <form method=post ... action=*comments*cgi-bin>
	// <form method="post" 
	//       action="http://www.mydomain.com/cgi-bin/mt-comments.cgi" 
	//       name="comments_form" ...>
	// <form method=POST 
	//  action="http://peaceaction.org/wboard/wwwboard.cgi">
	long nn = xml->getNumNodes();
	bool gotTextArea = false;
	bool gotSubmit   = false;
	for ( long i=0; i < nn ; i++ ) {
		// <textarea> tags are bad... but only if we have not
		// matched "track" or whatever from above... check for that
		// if you uncommment this... otherwise you disable all blogs!
		// Only do this check if we did match a comment related phrase
		// in s_needles1[] BUT it was BEFORE our outlink. That 
		// basically means that we do *not* recognize the format of 
		// the comment page and so therefore need to be more 
		// restrictive about allowing this page to vote.
		if ( ! hadPreMatch ) {
			// is it a <textarea> tag?
			if ( xml->getNodeId ( i ) == TAG_TEXTAREA ) 
				gotTextArea = true;
			// is it an <input> tag?
			long len = 0;
			if ( xml->getNodeId ( i ) == TAG_INPUT &&
			     xml->getString(i,"submit",&len)) gotSubmit = true;
		}
		// check for script tag
		/*
		if ( xml->getNodeId(i) == TAG_SCRIPT && quality < 80 ) {
			// <script src=blah.com/fileparse.js" 
			// type="text/javascript"> is used to hide google
			// ads, so don't allow those pages to vote either
			long  slen; xml->getString(i,"src",&slen);
			if ( slen > 0 ) { *note = "script src"; return true; }
		}
		*/
		if ( xml->getNodeId ( i ) != TAG_FORM ) continue;
			
		// get the method field of this base tag
		long  slen;
		char *s = (char *) xml->getString(i,"method",&slen);
		// if not thee, skip it
		if ( ! s || slen <= 0 ) continue;
		//if ( slen != 4 ) continue;
		// if not a post, skip it
		//if ( strncasecmp ( s , "post" , 4 ) ) continue;
		// get the action url
		s = (char *) xml->getString(i,"action",&slen);
		if ( ! s || slen <= 0 ) continue;
		char c = s[slen];
		s[slen]='\0';
		bool val = false;
		// this is a bit too strong, but i'ev seen an action of
		// "cgi-bin/mt-leaveone.cgi" so we can't rely on "mt-comment"
		if      ( strstr ( s , "comment" ) ) val = true;
		else if ( strstr ( s , "/MT/" ) ) val = true;
		else if ( strstr ( s , "/mt/" ) ) val = true;
		// they can have these search boxes though
		if ( val && strstr ( s , "/mt/mt-search" ) ) val = false;
		//else if ( strstr ( s , "cgi"     ) ) val = true;
		// eliminate some false positives
		//if ( val && strstr ( s , "search" ) ) val = false;
		s[slen] = c;
		if ( val ) { *note = "post page"; return true; }
	}

if ( gotTextArea && gotSubmit ) {
		*note = "textarea tag";
		return true;
	}

QUICKPOLL( niceness );

// edu, gov, etc. can have link chains
	if ( tldLen >= 3 && strncmp ( tld, "edu" , 3) == 0 ) return false;
	if ( tldLen >= 3 && strncmp ( tld, "gov" , 3) == 0 ) return false;

// if linker is naughty, he cannot vote
	if ( linker->isSpam() ) return true;

// if being called from PageTitledb.cpp for displaying a titlerec, 
	// then do not call this, because no linkee is provided in that case.
	if ( !linkee ) {
		*note = "linkee not found";
		return false;//true;
	}

// . allow sites in dmoz to have it too
	// . no, there are too many spam porn sites in dmoz
	long *ids  = indCatIds;
	long  nids = numIndCatIds;
	//if ( tr ) ids  = indCatIds;//tr->getIndCatids();
	//if ( tr ) nids = numIndCatids;//tr->getNumIndCatids();
	for ( long j = 0 ; j < nids ; j++ ) 
		if ( g_categories && g_categories->isIdAdult ( ids[j] ) ) {
			*note = "dmoz porn"; 
			return true;
		}

QUICKPOLL( niceness );
	// . if they link to any adult site, consider them link spam
	// . just consider a 100 link radius around linkNode
	long nl = links->getNumLinks();
	long linkNum =links->findLinkNum(linkee->getUrl(),linkee->getUrlLen());
	long i0 = linkNum - 100;
	long i1 = linkNum + 100;
	if ( i0 < 0  ) i0 = 0;
	if ( i1 > nl ) i1 = nl;
	for ( long i = i0 ; i < i1 ; i++ ) {
		if ( ! g_categories ) continue;
		Url uu; uu.set ( links->getLink(i), links->getLinkLen(i) );
		char *h    = uu.getUrl();
		// include the trailing /
		char *hend = uu.getHost() + uu.getHostLen() + 1;
		unsigned long hhash = hash32 ( h , hend - h );
		if ( g_categories->isInBadCat ( hhash ) ) {
			*note = "links to dmoz filth";
			log(LOG_DEBUG,"build: %s is filthy.",uu.getUrl());
			return true;
		}
		QUICKPOLL( niceness );

// . is it near sporny links? (naughty domains or lotsa -'s)
		// . if we are in a list of ads, chances are good the true
		//   nature of the ads will emerge...
		if ( i == linkNum      ) continue;
		if ( i - linkNum >=  4 ) continue;
		if ( i - linkNum <= -4 ) continue;
		if ( ! uu.isSpam()     ) continue;
		*note = "near sporny outlink";
		log(LOG_DEBUG,"build: %s is sporny.",uu.getUrl());
		return true;
	}

// . allow sites in dmoz to have it too
	// . no, there are too many spam porn sites in dmoz
	//if ( tr->getNumIndCatids() || tr->getNumCatids()   ) return false;

// init these before the loop
	long  hlen  = linkee->getHostLen();
	char *host  = linkee->getHost();
	char *uu    = linkee->getUrl();
	char *uuend = host + hlen;
	long  uulen = uuend - uu;
	long  x     = linkNode;
 loop:

QUICKPOLL( niceness );

// return true right away if it is a link chain
	//if ( tr->getDocQuality() < 60 &&
	if ( siteNumInlinks < 1000 && 
	     isLinkChain ( xml , linker, linkee , x , note ) ) 
		return true;

// if no domain, that's it
	if ( ! uu || uulen <= 0 ) return false;

// . see if this domain is linked to in other areas of the document.
	// . if any of those areas are not link chains, then assume we are
	//   not a link chain
	for ( x++ ; x < nl ; x++ ) {
		char *link    = links->getLink    (x);
		long  linkLen = links->getLinkLen (x);
		if ( ! link          ) continue;
		if ( linkLen <= 0    ) continue;
		if ( linkLen > uulen ) continue;
		if ( strncmp ( link , uu , uulen ) ) continue;
		// got a match, is it a link chain? if not, them we are not
		goto loop;
	}

return false;

}

// . returns true if the bytes in [p,pend) contain three periods in a row
// . an ellipsis in the plain text next to a link is treated by isLinkChain()
//   as a sign that the page is a search results page
static bool hasEllipsis ( char *p , char *pend ) {
	for ( char *s = p ; pend - s >= 3 ; s++ )
		if ( s[0] == '.' && s[1] == '.' && s[2] == '.' ) return true;
	return false;
}

// Criteria for being a link chain:
//
// 1. the "linkee" is in a chain of outlinks to external domains
// 2. all outlinks to the same hostname as "linkee" are in link chains
// 3. no plain text is present between "linkee" and one of the other
//    outlinks in the chain
// 4. this might hurt blogrolls, and resource pages, but such links
//    are kind of low quality anyway.
//
// . "xml" is the parsed linker page, "linkNode" is the node number of the
//   <a> tag that links to "linkee"
// . "linker" is compared by domain against "linkee" and is also handed to
//   Url::set() when normalizing the hrefs of the neighboring anchors
// . returns false if linkee is on the same domain as linker, or if plain
//   text (text outside of any anchor that has an alpha char) was found on
//   both sides of the link
// . otherwise returns true and points *note to a static string giving the
//   reason. *note is only written when true is returned.
// . the scan to the left stops after 1500 bytes or 90 nodes, the scan to
//   the right after 1580 bytes or 95 nodes. both stop at a <title> or
//   <table> tag, or at the first anchor to a different domain than linkee.
bool isLinkChain ( Xml *xml , Url *linker , Url *linkee , long linkNode ,
		   char **note ) {

	// if the linkee is internal (by domain) then not a link chain
	if ( linkee->m_dlen == linker->m_dlen &&
	     strncmp ( linkee->m_domain , linker->m_domain,linkee->m_dlen)==0)
		return false;

	char *linkPos = NULL;
	if ( linkNode >= 0 ) linkPos = xml->getNode ( linkNode );

	// did we have text to the left/right of this link and after/before
	// the neighboring link? assume not.
	bool leftText  = false;
	bool rightText = false;

	// the links on the left and right
	Url  leftUrl;
	Url  rightUrl;
	// these do not have constructors so we must reset them
	leftUrl.reset();
	rightUrl.reset();
	long i ;
	// . see if we are alone in a table or not
	// . table must occur before/after our left/right neighbor link
	bool tableLeft   = false;
	bool tableRight  = false;

	// going backwards from linkNode we are not in a link
	bool inLink = false;

	// get the start of an anchor tag on our immediate left
	for ( i = linkNode - 1 ; i >= 0 ; i-- ) {
		// do not look too far
		if ( linkPos - xml->getNode(i) >= 1500 ) break;
		if ( linkNode - i >= 90                ) break;
		// NOTE: if you add more tags to this list, then also add
		// to Vector::setPairHashes() as well
		// stop at <title> or </title> tags
		if ( xml->getNodeId(i) == TAG_TITLE ) break;
		// stop at <ul> or </ul> tags
		// no, otherwise, these lists are always "link chain left"
		//if ( xml->getNodeId(i) == TAG_UL ) break;
		// stop at <table> or </table> tags
		if ( xml->getNodeId(i) == TAG_TABLE ) {
			// only an opening <table> counts for table ads
			if ( ! xml->isBackTag(i) ) tableLeft = true;
			break;
		}

		// check for *plain* text
		if ( ! inLink && xml->getNodeId(i) == TAG_TEXTNODE ) {
			// get the node as a string
			char *p    = xml->getNode(i);
			char *pend = p + xml->getNodeLen(i);
			// check for ellipsis, that is a sign that we are a serp
			if ( hasEllipsis ( p , pend ) ) {
				// this is the scan to the LEFT of the link.
				// it used to report "search result right",
				// copied from the right-hand scan below.
				*note = "search result left";
				return true;
			}
			// if we already got text, but searching still for ...
			if ( leftText ) continue;
			// does it have alnum
			if ( ! has_alpha_utf8 ( p , pend ) ) continue;
			leftText = true;
			// do not break yet, cont search for ellipsis!
		}

		// keep chugging if not an anchor tag, <a> or </a>
		if ( xml->getNodeId(i) != TAG_A ) continue;
		// if we are </a> then we are now in a link since we are moving
		// backwards
		if ( xml->isBackTag(i) ) { inLink = true; continue; }
		// . if we hit a forward tag and inLink was false... we had
		//   no corresponding back tag, so disconsider any text
		// . this used to clear rightText, which is always false at
		//   this point, so the text was never actually discarded
		if ( ! inLink ) leftText = false;
		// no longer in an <a> tag
		inLink = false;

		// . ok, get the url from this anchor tag
		// . if we do not get one it could have been malformed, like
		//   the href had a quote right b4 it. leftUrl then keeps
		//   whatever it held before.
		long  ulen = 0;
		char *u = (char *) xml->getString ( i, "href", &ulen );
		// normalize
		if ( ulen > 0 )
			leftUrl.set ( linker , u , ulen );
		// . if NOT from the same domain, break out, otherwise continue
		// . this helps us find the <table> tag in ad tables with
		//   multiple links to the same domain
		// . this helps us accept a list of links to the same domain if
		//   there is left/right text, like the guy that had a list
		//   to 3 different gigablast.com links in a row with no
		//   text in between
		if ( leftUrl.m_dlen != linkee->m_dlen  ) break;
		if ( strncmp ( leftUrl.m_domain ,
			       linkee->m_domain ,
			       linkee->m_dlen   )       ) break;
	}

	// we start off in link text, since linkNode is an <a> tag
	inLink = true;
	// now loop through all the nodes after us
	long numNodes = xml->getNumNodes();
	for ( i = linkNode + 1 ; i < numNodes ; i++ ) {
		// stop if we've gone too far
		if ( xml->getNode(i) - linkPos >= 1580 ) break;
		if ( i - linkNode >= 95                ) break;
		// stop at <title> or </title> tags
		if ( xml->getNodeId(i) == TAG_TITLE ) break;
		// stop at <table> or </table> tags
		if ( xml->getNodeId(i) == TAG_TABLE ) {
			// only a closing </table> counts for table ads
			if ( xml->isBackTag(i) ) tableRight = true;
			break;
		}

		// check for *plain* text
		if ( ! inLink && xml->getNodeId(i) == TAG_TEXTNODE ) {
			// get the node as a string
			char *p    = xml->getNode(i);
			char *pend = p + xml->getNodeLen(i);
			// check for ellipsis, that is a sign that we are a serp
			if ( hasEllipsis ( p , pend ) ) {
				*note = "search result right";
				return true;
			}
			// if we already got text, but searching still for ...
			if ( rightText ) continue;
			// does it have alnum
			if ( ! has_alpha_utf8 ( p , pend ) ) continue;
			rightText = true;
			// do not break yet, cont search for ellipsis!
		}

		// keep chugging if not an anchor tag, <a> or </a>
		if ( xml->getNodeId(i) != TAG_A ) continue;
		// a </a> ends the link text, nothing more to do for it
		if ( xml->isBackTag(i) ) { inLink = false; continue; }
		// we are now in a link
		inLink = true;
		// . ok, get the url
		// . if we do not get one it could have been malformed, like
		//   the href had a quote right b4 it. rightUrl then keeps
		//   whatever it held before.
		long  ulen = 0;
		char *u = (char *) xml->getString ( i, "href", &ulen );
		// normalize
		if ( ulen > 0 )
			rightUrl.set ( linker, u , ulen );
		// . if NOT from the same domain, break out, otherwise continue
		// . this helps us find the <table> tag in ad tables with
		//   multiple links to the same domain
		// . this helps us accept a list of links to the same domain if
		//   there is left/right text, like the guy that had a list
		//   to 3 different gigablast.com links in a row with no
		//   text in between
		if ( rightUrl.m_dlen != linkee->m_dlen  ) break;
		if ( strncmp ( rightUrl.m_domain ,
			       linkee->m_domain  ,
			       linkee->m_dlen    )       ) break;
	}

	// enclosed by <table> on the left and </table> on the right
	if ( tableLeft && tableRight ) {
		*note = "ad table";
		return true;
	}

	// if we had text on both sides of us, we are not a link chain
	if ( leftText && rightText ) return false;

	// say which side(s) had no plain text
	if      ( ! leftText  && rightText ) *note = "link chain left";
	else if ( ! rightText && leftText  ) *note = "link chain right";
	else                                 *note = "link chain middle";

	return true;
}

/*
//we want to find the position of the linkee in the linker's xml class
//if it is in a series of links without vertical space in the end of
//the document, then we won't count it
bool isLinkSpam2 ( Url *linkee, TitleRec *tr, Xml *xml, Links *links ,
		   long maxDocLen ) {
	long linkNum = links->findLinkNum(linkee->getUrl(), 
					  linkee->getUrlLen());
	if(linkNum < 0) return false;

long thisXmlNode;
	long nextXmlNode;
	long numInARow = 1;
	long needInARow = 3;

long thisLink = linkNum;
	long nextLink = thisLink + 1;
	short nodeId;
	long i;
	long len;
	char *c;
	char *cend;

goRight:
	if(nextLink >= links->getNumLinks()) {
		//we can only go left from here.
		//log(LOG_WARN, "links: endotheline");
		goto goLeft;
	}
	thisXmlNode = links->getNodeNum(thisLink);
	nextXmlNode = links->getNodeNum(nextLink);
	 
	 //skip link text
	 for ( i=thisXmlNode; i < nextXmlNode ; i++ ) {
		 if(xml->isBackTag(i)) break;
	 }
	//could we find link text close tag?
	if(i == nextXmlNode) {
		//log(LOG_WARN, "links: couldn't find it");
		goto goLeft;
	}
	for (; i < nextXmlNode ; i++ ) {
		nodeId = xml->getNodeId(i);
		if(nodeId == TAG_BR  ||//br
		   nodeId == TAG_TR ||//tr
		   nodeId == TAG_LI) {//li
			//log(LOG_WARN, "links: breaking tag");
			goto goLeft;
		}
		//do we have a rel="nofollow" link which isn't in the 
		//links class.  If so, skip past it's link text.
		if(nodeId == TAG_A) {
			for ( ; i < nextXmlNode ; i++ ) {
				if(xml->isBackTag(i)) break;
			}
			if(i == nextXmlNode) goto goLeft;
		}

if(nodeId != TAG_TEXTNODE && nodeId != TAG_B) continue;
		c = xml->getNode(i); 
		len = xml->getNodeLen(i); 
		cend  = c + len;

while (c < cend) {
			if(*c == '&') 
				while (c < cend && *c != ';') c++;
			if(is_alnum_utf8(c)) {
				//log(LOG_WARN, "links: stupid %c", *c);
				goto goLeft;
			}
			c++;
		}
	}

//ok, so we got to the next link without
	//breaking, try getting the next link
	numInARow++;
	if(numInARow >= needInARow) goto goLeft;

thisLink++;
	nextLink++;
	goto goRight;

goLeft:
	//log(LOG_WARN, "links: numinarow %li linkee %s , linker %s",
	//numInARow, linkee->getUrl(), tr->getUrl()->getUrl());
	if(numInARow >= needInARow) return true;

//get out if we've already tried going left
	if(thisLink < linkNum) return false;

needInARow = needInARow - numInARow;
	thisLink = linkNum - needInARow;
	nextLink = thisLink + 1;
	if(thisLink < 0) return false;
	goto goRight;

return false;
}
*/

// . the substrings getCommentSection() passes to getMatches2() to find
//   where a comment section starts in a page
// . each entry is a positional Needle initializer. per the note above
//   s_needles1[] the leading fields are: string, stringLen, id, section?
//   the meaning of the remaining fields is not visible from this file.
// . a few entries below list only seven initializers, so their last member
//   is left zero-initialized, which matches the explicit NULL used by the
//   other entries
static Needle s_needles3[] = {
	// this often directly precedes the comment section
	{"[trackback"                   , 0 , 1 , 1 , 0 , NULL , 0 , NULL } ,
	{"class=\"comtext"              , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"class=\"comment"              , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"class=\"coment"               , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"class=\"trackback"            , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"class=\"ping"                 , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"class=\"followup"             , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"class=\"response"             , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	// this can signify a blog entry, not just a comment
	//{"class=\"entry"              , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	// these seem to be more indicative of posted comments
	{"class=\"posted"               , 0 , 8 , 1 , 0 , NULL , 0 },
	{"id=\"posted"                  , 0 , 8 , 1 , 0 , NULL , 0 },
	{"name=\"posted"                , 0 , 8 , 1 , 0 , NULL , 0 },
	// annoying little textbox thingy
	{"class=\"shoutbox"             , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"id=\"comment"                 , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"id=\"coment"                  , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"id=\"trackback"               , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"id=\"ping"                    , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"id=\"followup"                , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"id=\"response"                , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"name=\"comment"               , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"name=\"coment"                , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"name=\"trackback"             , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"name=\"ping"                  , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"name=\"followup"              , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	{"name=\"response"              , 0 , 8 , 1 , 0 , NULL , 0 , NULL } ,
	// message boards
	{"anonymous user"               , 0 , 10, 1 , 0 , NULL , 0 , NULL } ,
	{"anonymer user"                , 0 , 10, 1 , 0 , NULL , 0 , NULL } ,
	{"date posted"                  , 0 , 10, 1 , 0 , NULL , 0 , NULL } ,
	{"post your notice"             , 0 , 10, 1 , 0 , NULL , 0 , NULL } ,
	//{"edit this page"               , 0 , 10, 1 , 0 , NULL , 0 , NULL } ,
	// edit</a><br>
	// NOTE(review): the needle below does not literally match the
	// "edit</a><br>" it is commented as targeting. confirm whether
	// getMatches2() gives '<' / ']' a special meaning or whether the
	// string got garbled.
	{"edit<a]br"                    , 0 , 10, 1 , 0 , NULL , 0 , NULL } ,
	// link to edit a comment
	{">edit</a"                     , 0 , 10, 1 , 0 , NULL , 0 , NULL } ,
	// these often indicate blog entries, not just comments
	//{"reply with quote"             , 0 , 9 , 0 , 0 , NULL , 0 , NULL } ,
	//{">post a reply"                , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	//{"post reply"                   , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	//{"submit post"                  , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	//{">post message"                , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	//{">post a comment"              , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	//{">leave a comment"             , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	//{">post comments"               , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	// Comments</font> (0) after each posted entry...
	//{">comments<"                 , 0 , 10, 1 , 0 , NULL , 0 , NULL } ,
	{"comments: <"                  , 0 , 10, 1 , 0 , NULL , 0 , NULL } ,
	{"comments:<"                   , 0 , 10, 1 , 0 , NULL , 0 , NULL } ,
	//{"comment:"                   , 0 , 10, 1 , 0 , NULL , 0 , NULL } ,
	// "comments:" in dutch, then spanish/portuguese without and with
	// the accent
	{"reacties:"                    , 0 , 10, 1 , 0 , NULL , 0 , NULL } ,
	{"comentarios:"                 , 0 , 10, 1 , 0 , NULL , 0 , NULL } ,
	{"comentários:"                 , 0 , 10, 1 , 0 , NULL , 0 , NULL } ,
	{">message:"                    , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	{">mensagem:"                   , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	{">faca seu comentario"         , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	{">faça seu comentário"         , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	// comment add in german
	// NOTE(review): this is the only needle with an upper case letter.
	// confirm getMatches2() matches case-insensitively, otherwise it
	// may behave differently from the all-lowercase needles.
	{">Kommentar hinzuf"            , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	{">add my comment"              , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	// title of the text area box
	{">your comment"                , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	{"your comment<"                , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	{">comment by"                  , 0 , 10, 1 , 0 , NULL , 0 , NULL } ,
	{">scrivi un commento"          , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	{">scrivi il tuo commento"      , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	{"add comment"                  , 0 , 10, 0 , 0 , NULL , 0 , NULL } ,
	{"trackbacks for the art"       , 0 , 12, 1 , 0 , NULL , 0 , NULL } ,
	{"these trackbacks have been re", 0 , 13, 1 , 0 , NULL , 0 , NULL } ,
	{"trackback pings"              , 0 , 13, 1 , 0 , NULL , 0 , NULL } ,
	{"read the rest of this com"    , 0 , 13, 1 , 0 , NULL , 0 , NULL } ,
	// that was the opinion of ...
	{"das war die meinung von"      , 0 , 13, 1 , 0 , NULL , 0 , NULL } ,
	//{"add new comment"              , 0 , 14, 0 , 0 , NULL , 0 },
	//{"add message"                  , 0 , 14, 0 , 0 , NULL , 0 },
	// tagboard software allows free submits. it has this in 
	// an html comment tag...
	{"2002 natali ardianto"         , 0 , 14, 0 , 0 , NULL , 0 , NULL } ,
	// guestbooks
	//{"guestbook</title"             , 0 , 13, 0 , 0 , NULL , 0 , NULL } ,
	//{"gastenboek</title"            , 0 , 13, 0 , 0 , NULL , 0 , NULL } ,
};

// . scans "haystack" (haystackSize bytes) for the comment-section markers
//   listed in s_needles3[] and returns whatever getMatches2() returns
// . presumably that is a ptr to where the first comment section starts, or
//   NULL if no marker was found -- confirm against getMatches2()
// . the scan stops at the first match, so only the first comment section
//   is ever looked for
// . "niceness" is passed straight through to getMatches2()
char *getCommentSection ( char *haystack     ,
			  long  haystackSize ,
			  long  niceness     ){
	// how many markers are in the table
	long numNeedles3 = sizeof(s_needles3) / sizeof(Needle);
	// we have no link position to compare against, and we do not care
	// which needle matched or whether a match came before the link
	char *linkPos     = NULL;
	long *matchNum    = NULL;
	bool *hadPreMatch = NULL;
	char *sectionStart = getMatches2 ( s_needles3   ,
					   numNeedles3  ,
					   haystack     ,
					   haystackSize ,
					   linkPos      ,
					   matchNum     ,
					   true         , // stopAtFirstMatch
					   hadPreMatch  ,
					   false        , // save quicktables?
					   niceness     );
	return sectionStart;
}