#include "gb-include.h"
#include "CatRec.h"
//#include "SiteBonus.h"
#include "Lang.h"
//#include "DateParse.h"
//static long getY(Xml *xml, long n0,long n1,long X,
// char *strx,char *stry,long def);
// Default constructor: bring the new object into the cleared state
// that reset() establishes.
CatRec::CatRec ( ) {
	reset ( );
}
// Destructor: intentionally empty, no cleanup is performed here.
CatRec::~CatRec ( ) {
}
void CatRec::reset() {
m_hadRec = false;
//m_xml = NULL;
m_catids = NULL;
m_numCatids = 0;
m_numIndCatids = 0;
m_dataSize = 0;
//m_siteQuality = 0;
//m_spamBits = 0;
//m_adultLevel = 0;
//m_numTypes = 0;
//m_numLangs = 0;
}
// . used by Msg8 to parse a serialized site rec into this CatRec class
// . we copy the info we need from "data" so caller can free it
// . if "data" is NULL or "dataSize" is <= 0 we build a default rec from the
//   url's hostname instead
// . returns false and sets g_errno on error
// . a CatRec has the format: (like a record in an RdbList)
// . kkkkkkkk kkkkkkkk kkkkkkkk kkkkkkkk k = 96bit key (typical)
// . kkkkkkkk kkkkkkkk kkkkkkkk kkkkkkkk
// . kkkkkkkk kkkkkkkk kkkkkkkk kkkkkkkk
// . dddddddd dddddddd dddddddd dddddddd d = dataSize of data below here
// .[nnnnnnnn cccccccc cccccccc cccccccc n = number of catids, Catdb only
// . cccccccc cccccccc ........ ........] c = series of catids, longs [Catdb]
// . ffffffff ffffffff ffffffff vvvvvvvv v = version f = site fileNum (must be >= 0)
// . uuuuuuuu uuuuuuuu uuuuuuuu ........ u = var length site url
// version >= 2:
// . ppssxxxx s = spam bits
// . p = adultLevel
// . x = unused
// version >= 3:
// . qqqqqqqq q = site quality
// . parse the serialized record "data" (dataSize bytes) into this CatRec
// . "url" supplies the fallback hostname/ip and is only used for that and
//   for log messages; a NULL url is tolerated
// . "gotByIp" is stored in m_gotByIp when a record was parsed
// . returns true on success (including the "no data" fallback); returns
//   false if the record does not fit in m_data (g_errno = EBUFTOOSMALL)
//   or if it fails the structural checks below (logged, g_errno untouched)
bool CatRec::set ( Url *url , char *data , long dataSize , bool gotByIp ) {
	//char rdbId ) {
	// assume url does not have a rec in tagdb
	m_hadRec = false;
	// set our collection
	//if ( coll ) memcpy ( m_coll , coll , collLen );
	//m_collLen = collLen;
	// . if "data" is NULL or empty i guess the rec did not exist... so
	//   make a dummy rec
	// . MDW: why?
	if ( ! data || dataSize <= 0 ) {
		// the parsing path below treats "url" as optional, so do not
		// dereference a NULL url here either
		if ( url ) {
			// default m_site to the hostname
			m_site.set ( url->getHost() ,
				     url->getHostLen() ,
				     false/*addwww?*/);
			// steal ip from url
			m_site.setIp ( url->getIp() );
		}
		// default xml for this collection
		//m_xml = g_tagdb.getSiteXml ( 0,/*filenum*/
		// coll, collLen); //, NULL , 0 );
		m_filenum = 0 ;
		//if ( m_xml ) return true;
		//g_errno = ENODATA;
		//return log("db: Could not find the ruleset file "
		// "%stagdb0.xml.",g_hostdb.m_dir);
		return true;
	}
	// return false and set g_errno if buf too small
	if ( dataSize >= CATREC_BUF_SIZE ) {
		g_errno = EBUFTOOSMALL;
		return false;
	}
	// copy the raw data
	memcpy(m_data, data, dataSize);
	m_dataSize = dataSize;
	// . dataSize < CATREC_BUF_SIZE (checked above) so this byte is free
	// . it guarantees the gbstrlen() below stops inside m_data even if
	//   the record is missing its own terminator
	m_data[dataSize] = '\0';
	// used only for logging; url may be NULL
	const char *urlStr = "";
	if ( url ) urlStr = url->getUrl();
	// set up a parsing ptr into "data"
	//char *p = data;
	char *p = m_data;
	// get the catids if using catdb
	//if (rdbId == RDB_CATDB) {
	m_numCatids = *(unsigned char*)p;
	p++;
	// . count byte + catids + the 4-byte filenum/version word must all
	//   lie inside the record before we read any of them
	// . NOTE(review): catids are laid out as 4 bytes each; reading them
	//   through a long* assumes sizeof(long) == 4 -- TODO confirm
	long headerSize = 1 + 4*(long)m_numCatids + 4;
	if ( headerSize > dataSize ) {
		log ( "tagdb: Record of %li bytes is too small for %li "
		      "catids for url %s so ignoring tagdb record.",
		      (long)dataSize, (long)m_numCatids , urlStr );
		return false;
	}
	m_catids = (long*)p;
	p += 4*m_numCatids;
	//}
	// point to the filenum so we can mod it!
	//m_filenumPtr = p;
	// get the filenum (0 is default)
	//m_filenum = *(long *) p ; p += 4;
	m_filenum = *(long *) p ; p += 3;
	// get the version
	if ( m_filenum == -1 ) {
		// all four bytes set: treated as a version 0 rec
		m_version = 0;
		p++;
	}
	else {
		// low 3 bytes are the filenum, 4th byte is the version
		m_filenum &= 0x00FFFFFF;
		m_version = *p;
		p++;
	}
	// calc site url length
	if ( m_version == 0 )
		// version 0 has no terminator: url is the rest of the rec
		m_urlLen = dataSize - headerSize;
	else
		m_urlLen = gbstrlen(p);
	// set our site url
	m_url = p;
	m_site.set ( p , m_urlLen , false/*addwww?*/);
	// move p to end of url
	p += m_urlLen;
	// skip the terminating \0 that version >= 1 recs carry
	if ( m_version >= 1 )
		p++;
	// add time stamp, comment, username
	/*
	if ( m_version >= 2 && rdbId != RDB_CATDB ) {
		// time stamp
		m_timeStamp = *(long*)p;
		p += 4;
		// comment
		m_comment = p;
		p += gbstrlen(m_comment) + 1;
		// username
		m_username = p;
		p += gbstrlen(m_username) + 1;
	}
	unsigned char siteFlags = 0;
	m_spamBits = 0;
	m_adultLevel = 0;
	if ( m_version >= 3 && rdbId != RDB_CATDB ) {
		siteFlags = *p++;
		m_spamBits = siteFlags & 0xc0;
	}
	//we've added a 1 byte quality and 2 bits for adult content level.
	if ( m_version >= 4 && rdbId != RDB_CATDB ) {
		m_siteQuality = *p++;
		m_adultLevel = (siteFlags & 0x30);
	}
	m_incHere = NULL;
	m_addHere = NULL;
	if ( m_version >= 5 && rdbId != RDB_CATDB ) {
		// a marker for addSiteType() function below
		m_incHere = (long *)p;
		m_numTypes = *(uint8_t*)p;
		p += sizeof(uint8_t);
		for(long i = 0; i < m_numTypes; i++) {
			m_siteTypes[i].m_type = *(uint8_t*)p;
			p += sizeof(uint8_t);
			// version 6 adds 32-bit scores to site type
			if (m_version >= 6 &&
			    SiteType::isType4Bytes(m_siteTypes[i].m_type)) {
				m_siteTypes[i].m_score = *(uint32_t*)p;
				p += sizeof(uint32_t);
			}
			else {
				m_siteTypes[i].m_score = (uint32_t)*(uint8_t*)p;
				p += sizeof(uint8_t);
			}
		}
		// save ptr for addSiteTypes()
		m_addHere = p;
		//now for the languages
		m_numLangs = *(uint8_t*)p;
		p += sizeof(uint8_t);
		for(long i = 0; i < m_numLangs; i++) {
			m_siteLangs[i].m_type = *(uint8_t*)p;
			p += sizeof(uint8_t);
			m_siteLangs[i].m_score = (uint32_t)*(uint8_t*)p;
			p += sizeof(uint8_t);
		}
	}
	*/
	// sanity check: we must have consumed exactly the whole record
	if ( p - m_data != m_dataSize ) {
		log ( "tagdb: Deserialized datasize %i != %li for url %s so "
		      "ignoring tagdb record.",
		      (int)(p - m_data), (long)m_dataSize , urlStr );
		return false;
	}
	// if hostname is same as url we can use the ip from url
	if ( url && m_site.getHostLen() == url->getHostLen() )
		m_site.setIp ( url->getIp() );
	// . this url had it's own rec in the db
	// . Msg16 needs to know this so it won't auto-detect porn/spam in
	//   this url itself and delete it from tfndb
	m_hadRec = true;
	// if rec was in tagdb, data will be non-null.. did we get the rec
	// from tagdb by matching an IP? (as opposed to canonical name)
	m_gotByIp = gotByIp;
	// get the xml for this filenum
	//m_xml = g_tagdb.getSiteXml ( m_filenum , coll , collLen );
	//if ( m_xml ) return true;
	// should NEVER be NULL
	//g_errno = ENODATA;
	//return log("db: Could not find the ruleset file %stagdb%li.xml.",
	// g_hostdb.m_dir,m_filenum);
	return true;
}
// . build a serialized record in m_data from its parts
// . layout written: 1 count byte, the catids (4 bytes each), 3 bytes of
//   "filenum", 1 version byte, the site url and (version >= 1) a \0
// . at most MAX_CATIDS catids are stored; extra ones are dropped
// . a NULL "catids" is treated as "no catids"
// . returns false and sets g_errno to EBUFTOOSMALL if the record would
//   not fit in m_data, true otherwise
// . NOTE(review): m_filenum is not updated here (the assignment further
//   down is commented out) -- confirm that is intended
bool CatRec::set ( Url *site ,
		   long filenum ,
		   //char version , char rdbId ,
		   //long timeStamp, char *comment , char *username ,
		   long *catids , unsigned char numCatids
		   //unsigned char spamBits, char siteQuality,
		   //char adultLevel,
		   //SiteType *siteTypes,
		   //uint8_t numTypes,
		   //SiteType *siteLangs,
		   //uint8_t numLangs) {
		   ) {
	// version
	m_version = CATREC_CURRENT_VERSION; // version;
	// nothing to copy from if the caller gave us no array
	if ( ! catids ) numCatids = 0;
	// . decide how many catids we really store BEFORE sizing the rec
	// . the count byte, the bytes copied and the bytes skipped must all
	//   use this same truncated number, otherwise the parser (which
	//   trusts the count byte) reads the filenum from the wrong offset
	m_numCatids = numCatids;
	if ( m_numCatids > MAX_CATIDS )
		m_numCatids = MAX_CATIDS;
	// how big should the site rec be? filenum+version plus the url
	m_dataSize = 4 + site->getUrlLen() ;
	// null termination
	if ( CATREC_CURRENT_VERSION >= 1 )
		m_dataSize++;
	// add time stamp, comment, username
	//if ( version >= 2 && rdbId != RDB_CATDB ) {
	// m_dataSize += 6;
	// if (comment)
	// m_dataSize += gbstrlen(comment);
	// if (username)
	// m_dataSize += gbstrlen(username);
	//}
	//the spam bits.
	//if ( version >= 3 && rdbId != RDB_CATDB) {
	// m_dataSize++;
	//}
	//the site quality
	//if ( version >= 4 && rdbId != RDB_CATDB) {
	// m_dataSize++;
	//}
	//if ( version >= 5 && rdbId != RDB_CATDB) {
	// m_dataSize += sizeof(uint8_t);
	// m_dataSize += numTypes * (sizeof(uint8_t) + sizeof(uint8_t));
	// m_dataSize += sizeof(uint8_t);
	// m_dataSize += numLangs * (sizeof(uint8_t) + sizeof(uint8_t));
	//}
	// . beginning with version 6, SiteType scores can be either 8-bit or
	//   32-bit, so add the extra bytes to the data size
	//if ( version >= 6 ) {
	// for ( long i = 0; i < numTypes; i++ ) {
	// if ( SiteType::isType4Bytes(siteTypes[i].m_type) ) {
	// m_dataSize += (sizeof(uint32_t) -
	// sizeof(uint8_t));
	// }
	// }
	//}
	// catids and numcatids
	//if (rdbId == RDB_CATDB)
	m_dataSize += 1 + (m_numCatids * 4);
	// . return false and set g_errno if buf too small
	// . NOTE(review): this accepts m_dataSize == CATREC_BUF_SIZE while
	//   the parsing set() rejects dataSize >= CATREC_BUF_SIZE -- confirm
	//   which bound is intended
	if ( m_dataSize > CATREC_BUF_SIZE ) {
		g_errno = EBUFTOOSMALL;
		return false;
	}
	// how about the actual dataSize?
	//m_dataSize = 4 + site->getUrlLen();
	// serialize into m_data
	char *p = m_data;
	// get our key
	//key_t key = g_tagdb.makeKey (site, coll, collLen, false/*del?*/);
	//m_numTypes = numTypes;
	//sanity check:
	//if(m_numTypes > MAX_SITE_TYPES) {
	// char *xx = NULL; *xx = 0;}
	// store numCatids and catids if exist
	//if (catids) {
	//if (rdbId == RDB_CATDB) {
	// add the count
	memcpy(p, &m_numCatids, 1);
	p++;
	// add the ids
	m_catids = (long*)p;
	if ( m_numCatids > 0 )
		memcpy(p, catids, 4*m_numCatids);
	// skip exactly what we stored
	p += 4*m_numCatids;
	//}
	// point to the filenum so we can mod it!
	//m_filenumPtr = p;
	// store the filenum (3 bytes)
	//*(long *) p = filenum ; p += 4;
	//long filenum = 0; // make this 0 for catdb rec: MDW
	memcpy(p, &filenum, 3); p += 3;
	// store the version (1 byte)
	*p = m_version; p++;
	// the site
	m_url = p;
	m_urlLen = site->getUrlLen();
	memcpy ( p , site->getUrl() , site->getUrlLen() );
	p += site->getUrlLen();
	// NULL terminate the site
	if ( m_version >= 1 ) {
		*p = '\0'; p++;
	}
	// add time stamp, comment, username
	/*
	if ( m_version >= 2 && rdbId != RDB_CATDB ) {
		// time stamp
		m_timeStamp = timeStamp;
		memcpy(p, &timeStamp, 4);
		p += 4;
		// comment
		m_comment = p;
		if (comment) {
			strcpy(p, comment);
			p += gbstrlen(comment) + 1;
		}
		else {
			*p = '\0';
			p++;
		}
		// username
		m_username = p;
		if (username) {
			strcpy(p, username);
			p += gbstrlen(username) + 1;
		}
		else {
			*p = '\0';
			p++;
		}
	}
	m_adultLevel = adultLevel;
	m_spamBits = spamBits;
	unsigned char siteFlags = 0;
	siteFlags |= m_adultLevel;
	siteFlags |= m_spamBits;
	if ( m_version >= 3 && rdbId != RDB_CATDB ) {
		*p = siteFlags;
		p++;
	}
	if ( m_version >= 4 && rdbId != RDB_CATDB ) {
		*p = siteQuality;
		p++;
	}
	// reset this
	m_addHere = NULL;
	m_incHere = NULL;
	if ( m_version >= 5 && rdbId != RDB_CATDB ) {
		// a marker for addSiteType() function below
		m_incHere = (long *)p;
		*(uint8_t*)p = numTypes;
		p += sizeof(uint8_t);
		for(long i = 0; i < numTypes; i++) {
			*(uint8_t*)p = siteTypes[i].m_type;
			p += sizeof(uint8_t);
			// version 6 adds 32-bit scores to site type
			if ( m_version >= 6 &&
			     SiteType::isType4Bytes( siteTypes[i].m_type ) ) {
				*(uint32_t*)p = siteTypes[i].m_score;
				p += sizeof(uint32_t);
			}
			else {
				*(uint8_t*)p = (uint8_t)siteTypes[i].m_score;
				p += sizeof(uint8_t);
			}
		}
		// this is a marker where to add site types from
		// addSiteType() function below
		m_addHere = p;
		*(uint8_t*)p = numLangs;
		p += sizeof(uint8_t);
		for(long i = 0; i < numLangs; i++) {
			*(uint8_t*)p = siteLangs[i].m_type;
			p += sizeof(uint8_t);
			*(uint8_t*)p = siteLangs[i].m_score;
			p += sizeof(uint8_t);
		}
	}
	*/
	// sanity check: bytes written must match the size computed above;
	// a mismatch is a programming error so crash on purpose
	if ( p - m_data != m_dataSize ) {
		log ( "catrec: Serialized datasize %i != %li",
		      (int)(p - m_data), (long)m_dataSize );
		char *xx = NULL; *xx = 0;
	}
	// set our member vars correctly in addition to the site rec
	m_site.set ( site->getUrl(), site->getUrlLen(), false/*addwww?*/);
	// steal ip from "site"
	m_site.setIp ( site->getIp() );
	// save the collection into m_coll
	//memcpy ( m_coll , coll , collLen );
	//m_collLen = collLen;
	// save the fileNum as well
	//m_filenum = filenum;
	// make sure xml is set
	//m_xml = g_tagdb.getSiteXml ( m_filenum , coll , collLen );
	//if ( m_xml ) return true;
	// should NEVER be NULL
	//g_errno = ENODATA;
	//return log("db: Could not find the ruleset file %stagdb%li.xml.",
	// g_hostdb.m_dir,m_filenum);
	return true;
}
// keep everything else the same
/*
bool CatRec::set ( long filenum ) {
// save the fileNum
m_filenum = filenum;
// make sure xml is set
m_xml = g_tagdb.getSiteXml ( m_filenum , m_coll , m_collLen );
if ( m_xml ) return true;
// should NEVER be NULL
g_errno = ENODATA;
return log("db: Could not find the ruleset file %stagdb%li.xml.",
g_hostdb.m_dir,m_filenum);
}
*/
// . this set method just sets the site records filenum, version,
// url and url len
// . this method is added to skip the getSiteXml and other
// overheads
bool CatRec::set ( char *data, long dataSize ) {//, char rdbId ){
if ( !data || dataSize <= 0 )
return false;
//if (rdbId == RDB_CATDB) {
m_numCatids = *(unsigned char*)data;
data++;
m_catids = (long*)data;
data += 4*m_numCatids;
//}
// get the filenum (0 is default)
//m_filenum = *(long *) p ; p += 4;
m_filenum = *(long *) data ; data += 3;
// get the version
if ( m_filenum == -1 ) {
m_version = 0;
data++;
}
else {
m_filenum &= 0x00FFFFFF;
m_version = *data;
data++;
}
// calc site url length
if ( m_version == 0 ) {
m_urlLen = dataSize - 4;
//if (rdbId == RDB_CATDB)
m_urlLen -= (4*m_numCatids) + 1;
}
else
m_urlLen = gbstrlen(data);
// set our site url
m_url = data;
m_site.set ( data , m_urlLen , false);
return true;
}
// set the indirect catids
// . copy up to MAX_IND_CATIDS indirect catids into m_indCatids
// . "indCatids" points to "numIndCatids" ids; extra ids are dropped
// . a NULL array or a negative count is treated as "no indirect catids"
void CatRec::setIndirectCatids ( long *indCatids, long numIndCatids ) {
	// a negative count would turn into a huge memcpy size below
	if ( ! indCatids || numIndCatids < 0 )
		numIndCatids = 0;
	// store the number of ids
	m_numIndCatids = numIndCatids;
	if ( m_numIndCatids > MAX_IND_CATIDS )
		m_numIndCatids = MAX_IND_CATIDS;
	// . store the ids
	// . NOTE(review): 4 bytes per id assumes sizeof(long) == 4 -- TODO
	//   confirm against the declaration of m_indCatids
	if ( m_numIndCatids > 0 )
		memcpy ( m_indCatids, indCatids, m_numIndCatids*4 );
}
/*
long CatRec::getMaxLenFromQuality ( long n0, long n1, long quality ) {
return getY (n0,n1, quality, "index.quality1","index.maxLen1",64000);}
long CatRec::getMaxScoreFromQuality ( long n0, long n1, long quality ) {
long max=getY (n0,n1,quality,"index.quality2","index.maxScore2",100);
if ( max > 100 ) {
log("db: Encountered maxScore from quality > 100 in ruleset "
"file. Truncating to 100.");
max = 100;
}
return max;
}
//bool CatRec::hasMaxCountFromQualityTag ( long n0, long n1 ) {
// long max=getY (n0,n1,50,"index.quality4","index.maxCount4",-9321);
// if ( max == -9321 ) return false;
// return true;
//}
//
//long CatRec::getMaxCountFromQuality ( long n0, long n1, long quality ) {
// // 100 in this sense is not a percentage, but an actual word count
// long max=getY (n0,n1,quality,"index.quality4","index.maxCount4",
// 9999999);
// if ( max < 0 ) {
// log("db: Encountered maxScore from quality of %li in ruleset "
// "file. Setting to 0.",max);
// max = 0;
// }
// return max;
//}
long CatRec::getScoreWeightFromQuality ( long n0, long n1, long quality ) {
return getY (n0,n1,quality,"index.quality3","index.scoreWeight3",100);}
long CatRec::getScoreWeightFromQuality2( long quality ) {
return getY (0,999999,quality,"quality3","scoreWeight3",100);}
long CatRec::getScoreWeightFromLen ( long n0, long n1, long len ) {
return getY (n0,n1, len , "index.len4" ,"index.scoreWeight4",100);}
long CatRec::getScoreWeightFromLen2 ( long len ) {
return getY (0,999999, len , "len4" ,"scoreWeight4",100);}
long CatRec::getScoreWeightFromNumWords( long n0, long n1, long len ) {
return getY (n0,n1, len , "index.numWords6","index.scoreWeight6",100);}
long CatRec::getMaxScoreFromLen ( long n0, long n1, long len ) {
long max = getY (n0,n1, len, "index.len5" ,"index.maxScore5",100);
if ( max > 100 ) {
log("db: Encountered maxScore from length > 100 in ruleset "
"file. Truncating to 100.");
max = 100;
}
return max;
}
long CatRec::getMaxScoreFromNumWords ( long n0, long n1, long len ) {
long max = getY (n0,n1, len, "index.numWords7","index.maxScore7",100);
if ( max > 100 ) {
log("db: Encountered maxScore from length > 100 in ruleset "
"file. Truncating to 100.");
max = 100;
}
return max;
}
long CatRec::getQualityBoostFromNumLinks ( long numLinks ) {
return getY (0,99999, numLinks,"numLinks1" ,"qualityBoost1",100); }
long CatRec::getQualityBoostFromLinkQualitySum ( long sum ) {
return getY (0,99999, sum ,"linkQualitySum2","qualityBoost2",100);}
long CatRec::getQualityBoostFromRootQuality ( long rootQuality ) {
return getY (0,99999,rootQuality,"rootQuality3","qualityBoost3",100); }
long CatRec::getLinkTextScoreWeightFromLinkerQuality ( long quality ) {
return getY (0,99999,quality ,"quality4","linkTextScoreWeight4",100);}
long getLinkTextScoreWeightFromLinkerQuality ( Xml *xml , long quality ) {
return getY (xml,0,99999,quality ,"quality4","linkTextScoreWeight4",100);}
long CatRec::getLinkTextScoreWeightFromLinkeeQuality ( long quality ) {
return getY (0,99999,quality ,"quality7","linkTextScoreWeight7",100);}
long getLinkTextScoreWeightFromLinkeeQuality ( Xml *xml , long quality ) {
return getY (xml,0,99999,quality ,"quality7","linkTextScoreWeight7",100);}
long CatRec::getLinkTextScoreWeightFromNumWords( long numWords ) {
return getY (0,99999,numWords ,"linkTextNumWords6",
"linkTextScoreWeight6", 100); }
long CatRec::getQuotaBoostFromRootQuality ( long rootQuality ) {
return getY (0,99999,rootQuality,"rootQuality7","quotaBoost7",100); }
long CatRec::getQuotaBoostFromQuality ( long quality ) {
return getY (0,99999,quality,"quality8","quotaBoost8",100); }
long CatRec::getLinkTextMaxScoreFromQuality ( long quality ) {
long max = getY(0,99999,quality,"quality5","linkTextMaxScore5",100);
if ( max > 100 ) {
log("db: Encountered linkText maxScore from quality > 100 in "
"ruleset file. Truncating to 100.");
max = 100;
}
return max;
}
long CatRec::getMaxPercentForSpamFromQuality ( long quality ) {
// old ruleset files (tagdb*.xml) do not have this, so it *has* to
// default to 4 to preserve the old method... so we can properly
// delete docs.
long max = getY(0,99999,quality,"quality6","maxPercentSpammed6",4);
// a safety catch
if ( max < 4 ) {
max = 4;
static char s_flag = 0;
if ( s_flag == 0 ) {
log("db: Encountered max percent threshold for spam "
"that is less than 4. Setting to 4. This message "
"will not be repeated.");
s_flag = 1;
}
}
return max;
}
// . grab the Y value given the X
// . assumes a graph like:
//