open-source-search-engine/CatRec.cpp
mwells 87285ba3cd use gbmemcpy not memcpy so we can get profiler working again
since memcpy can't be interrupted and backtrace() called.
2015-01-13 12:25:42 -07:00

937 lines
28 KiB
C++

#include "gb-include.h"
#include "CatRec.h"
//#include "SiteBonus.h"
#include "Lang.h"
//#include "DateParse.h"
//static int32_t getY(Xml *xml, int32_t n0,int32_t n1,int32_t X,
// char *strx,char *stry,int32_t def);
// . construct an empty CatRec
// . all member initialization is delegated to reset()
CatRec::CatRec ( ) {
	reset ( );
}
// . the destructor body is empty: it performs no cleanup of its own
CatRec::~CatRec ( ) {
}
// . put this CatRec back into its "no record loaded" state
// . called by the constructor
// . only the members assigned below are touched; everything else
//   (m_site, m_filenum, m_version, ...) keeps whatever value it had
void CatRec::reset ( ) {
	// no serialized bytes are held in m_data
	m_dataSize     = 0;
	// no direct catids, and no pointer into a record buffer
	m_catids       = NULL;
	m_numCatids    = 0;
	// no indirect catids either
	m_numIndCatids = 0;
	// we have not been set from a rec found in the db
	m_hadRec       = false;
}
// . used by Msg8 to parse a serialized site rec into this CatRec class
// . we copy "data" into m_data so the caller can free it afterwards
// . if "data" is NULL or "dataSize" is <= 0 there is no rec for this url:
//   we make a dummy rec from the url's hostname and ip, set m_filenum to 0
//   and return true, leaving m_hadRec false. "url" is dereferenced in that
//   case so it must not be NULL.
// . returns false on error
// . g_errno is set to EBUFTOOSMALL if the rec does not fit into m_data
// . NOTE(review): a rec that fails the sanity checks below returns false
//   WITHOUT setting g_errno. that is the historical behavior and was kept
//   because no suitable errno is visible from this file. confirm callers
//   cope with a false return and g_errno of 0.
// . the bytes in "data" have the format:
// . nnnnnnnn cccccccc cccccccc cccccccc  n = number of catids (1 byte)
// . cccccccc cccccccc ........ ........  c = series of catids, int32_ts
// . ffffffff ffffffff ffffffff vvvvvvvv  f = site fileNum (3 bytes)
// .                                      v = version (1 byte)
// . uuuuuuuu uuuuuuuu uuuuuuuu ........  u = var length site url
// . 00000000                             NUL after the url if version >= 1
// . if all four f/v bytes are 0xff (i.e. read as int32_t -1) the rec is
//   treated as version 0 and m_filenum is left at -1
// . version 0 recs have no NUL, the url runs to the end of the data
// . the extra fields later tagdb versions appended after the url (time
//   stamp, comment, username, spam bits, quality, site types, languages)
//   are no longer parsed here; a rec carrying them fails the final size
//   check and is ignored
bool CatRec::set ( Url *url , char *data , int32_t dataSize , bool gotByIp ) {
	// assume url does not have a rec in the db
	m_hadRec = false;
	// . no data means the rec did not exist, so make a dummy rec
	// . MDW: why?
	if ( ! data || dataSize <= 0 ) {
		// default m_site to the hostname
		m_site.set (url->getHost(),url->getHostLen(),false/*addwww?*/);
		// steal ip from url
		m_site.setIp ( url->getIp() );
		// the default filenum
		m_filenum = 0 ;
		return true;
	}
	// return false and set g_errno if buf too small
	if ( dataSize >= CATREC_BUF_SIZE ) {
		g_errno = EBUFTOOSMALL;
		return false;
	}
	// copy the raw data
	gbmemcpy(m_data, data, dataSize);
	m_dataSize = dataSize;
	// . dataSize < CATREC_BUF_SIZE so there is room for a sentinel NUL
	// . this keeps gbstrlen() below from running off the end of m_data
	//   when a corrupt rec has no NUL after its url
	m_data[dataSize] = '\0';
	// set up a parsing ptr into our copy of the data
	char *p = m_data;
	// get the catids
	m_numCatids = *(unsigned char*)p;
	p++;
	m_catids = (int32_t*)p;
	// . count byte + catids + 3 byte filenum + 1 byte version
	// . make sure all of that is really there before we read it
	int32_t headerSize = 1 + 4*(int32_t)m_numCatids + 4;
	if ( headerSize > dataSize ) {
		log ( "tagdb: Record of %" INT32 " bytes is too small for "
		      "%" INT32 " catids for url %s so "
		      "ignoring tagdb record.",
		      dataSize , (int32_t)m_numCatids ,
		      url ? url->getUrl() : "(null)" );
		// do not leave the catids pointing past the data
		m_numCatids = 0;
		m_catids    = NULL;
		return false;
	}
	p += 4*m_numCatids;
	// . get the filenum (0 is default)
	// . we read 4 bytes, the top one is really the version, but only
	//   skip the 3 that belong to the filenum
	m_filenum = *(int32_t *) p ; p += 3;
	// get the version
	if ( m_filenum == -1 ) {
		m_version = 0;
		p++;
	}
	else {
		// mask out the version byte
		m_filenum &= 0x00FFFFFF;
		m_version = *p;
		p++;
	}
	// . calc site url length
	// . version 0 urls are not NUL terminated, they fill the rest
	if ( m_version == 0 )
		m_urlLen = dataSize - headerSize;
	else
		m_urlLen = gbstrlen(p);
	// set our site url
	m_url = p;
	m_site.set ( p , m_urlLen , false/*addwww?*/);
	// move p to end of url
	p += m_urlLen;
	// skip the NUL
	if ( m_version >= 1 )
		p++;
	// sanity check, we should have consumed exactly the whole rec
	if ( p - m_data != m_dataSize ) {
		// "url" may be NULL, see the check below
		log ( "tagdb: Deserialized datasize %" INT32 " != %" INT32
		      " for url %s so "
		      "ignoring tagdb record.",
		      (int32_t)(p - m_data), (int32_t)m_dataSize ,
		      url ? url->getUrl() : "(null)" );
		return false;
	}
	// if hostname is same as url we can use the ip from url
	if ( url && m_site.getHostLen() == url->getHostLen() )
		m_site.setIp ( url->getIp() );
	// . this url had its own rec in the db
	// . Msg16 needs to know this so it won't auto-detect porn/spam in
	//   this url itself and delete it from tfndb
	m_hadRec = true;
	// did we get the rec by matching an IP? (as opposed to the
	// canonical name)
	m_gotByIp = gotByIp;
	return true;
}
// . serialize a rec for "site" into m_data and set our members to match
// . "filenum"   only its first 3 bytes in memory are stored in the rec
// . "catids"    the catids to store, may be NULL which means none
// . "numCatids" how many int32_ts "catids" points to. at most MAX_CATIDS
//               are stored, the rest are dropped
// . returns false and sets g_errno to EBUFTOOSMALL if the serialized rec
//   would not fit into m_data, true otherwise
// . the rec is always written as version CATREC_CURRENT_VERSION:
//   1 byte catid count, the catids, 3 byte filenum, 1 byte version, the
//   site url and, if version >= 1, a NUL
// . the fields older tagdb versions appended after the url (time stamp,
//   comment, username, spam bits, quality, site types, languages) are no
//   longer written
// . NOTE(review): m_filenum is NOT updated here (the assignment was
//   commented out long ago), so it can disagree with the filenum bytes
//   in m_data until the rec is parsed again. confirm that is intended.
bool CatRec::set ( Url *site ,
		   int32_t filenum ,
		   int32_t *catids , unsigned char numCatids
		   ) {
	// version
	m_version = CATREC_CURRENT_VERSION;
	// sanity check
	if ( m_version > CATREC_CURRENT_VERSION ) {
		char *xx = NULL; *xx = 0; }
	// . how many catids will we really store?
	// . truncate BEFORE sizing the rec. we used to size the rec and
	//   advance the write ptr by the untruncated "numCatids" while
	//   storing the truncated count, which made a rec whose filenum,
	//   version and url could not be found again when parsing it.
	unsigned char count = numCatids;
	if ( count > MAX_CATIDS )
		count = MAX_CATIDS;
	// no array means no catids, do not copy from NULL
	if ( ! catids )
		count = 0;
	int32_t urlLen = site->getUrlLen();
	// . how big should the site rec be?
	// . count byte + catids + 3 byte filenum + 1 byte version + url
	m_dataSize = 1 + (count * 4) + 4 + urlLen;
	// null termination
	if ( CATREC_CURRENT_VERSION >= 1 )
		m_dataSize++;
	// . return false and set g_errno if buf too small
	// . use >= like the parsing set() above does, it refuses recs of
	//   exactly CATREC_BUF_SIZE bytes so we must not make one
	if ( m_dataSize >= CATREC_BUF_SIZE ) {
		g_errno = EBUFTOOSMALL;
		return false;
	}
	// serialize into m_data
	char *p = m_data;
	// add the count
	m_numCatids = count;
	*p = (char)count;
	p++;
	// add the ids
	m_catids = (int32_t*)p;
	if ( count > 0 ) {
		gbmemcpy(p, catids, 4*count);
	}
	p += 4*count;
	// store the filenum (3 bytes)
	gbmemcpy(p, &filenum, 3); p += 3;
	// store the version (1 byte)
	*p = m_version; p++;
	// the site
	m_url    = p;
	m_urlLen = urlLen;
	gbmemcpy ( p , site->getUrl() , urlLen );
	p += urlLen;
	// NULL terminate the site
	if ( m_version >= 1 ) {
		*p = '\0'; p++;
	}
	// sanity check, we must have written exactly m_dataSize bytes
	if ( p - m_data != m_dataSize ) {
		log ( "catrec: Serialized datasize %" INT32 " != %" INT32 ,
		      (int32_t)(p - m_data), (int32_t)m_dataSize );
		char *xx = NULL; *xx = 0;
	}
	// set our member vars correctly in addition to the site rec
	m_site.set ( site->getUrl(), site->getUrlLen(), false/*addwww?*/);
	// steal ip from "site"
	m_site.setIp ( site->getIp() );
	return true;
}
// keep everything else the same
/*
bool CatRec::set ( int32_t filenum ) {
// save the fileNum
m_filenum = filenum;
// make sure xml is set
m_xml = g_tagdb.getSiteXml ( m_filenum , m_coll , m_collLen );
if ( m_xml ) return true;
// should NEVER be NULL
g_errno = ENODATA;
return log("db: Could not find the ruleset file %stagdb%"INT32".xml.",
g_hostdb.m_dir,m_filenum);
}
*/
// . this set method just sets the site records filenum, version,
// url and url len
// . this method is added to skip the getSiteXml and other
// overheads
bool CatRec::set ( char *data, int32_t dataSize ) {//, char rdbId ){
if ( !data || dataSize <= 0 )
return false;
//if (rdbId == RDB_CATDB) {
m_numCatids = *(unsigned char*)data;
data++;
m_catids = (int32_t*)data;
data += 4*m_numCatids;
//}
// get the filenum (0 is default)
//m_filenum = *(int32_t *) p ; p += 4;
m_filenum = *(int32_t *) data ; data += 3;
// get the version
if ( m_filenum == -1 ) {
m_version = 0;
data++;
}
else {
m_filenum &= 0x00FFFFFF;
m_version = *data;
data++;
}
// calc site url length
if ( m_version == 0 ) {
m_urlLen = dataSize - 4;
//if (rdbId == RDB_CATDB)
m_urlLen -= (4*m_numCatids) + 1;
}
else
m_urlLen = gbstrlen(data);
// set our site url
m_url = data;
m_site.set ( data , m_urlLen , false);
return true;
}
// . set the indirect catids
// . copies up to MAX_IND_CATIDS ids from "indCatids" into m_indCatids and
//   stores how many were copied in m_numIndCatids
// . a NULL "indCatids" or a negative "numIndCatids" means no ids, so we
//   never hand a bad ptr or a negative size to gbmemcpy
void CatRec::setIndirectCatids ( int32_t *indCatids, int32_t numIndCatids ) {
	// how many ids will we really store?
	int32_t count = numIndCatids;
	if ( ! indCatids || count < 0 )
		count = 0;
	if ( count > MAX_IND_CATIDS )
		count = MAX_IND_CATIDS;
	// store the number of ids
	m_numIndCatids = count;
	// store the ids
	if ( count > 0 ) {
		gbmemcpy ( m_indCatids, indCatids, count*4 );
	}
}
/*
int32_t CatRec::getMaxLenFromQuality ( int32_t n0, int32_t n1, int32_t quality ) {
return getY (n0,n1, quality, "index.quality1","index.maxLen1",64000);}
int32_t CatRec::getMaxScoreFromQuality ( int32_t n0, int32_t n1, int32_t quality ) {
int32_t max=getY (n0,n1,quality,"index.quality2","index.maxScore2",100);
if ( max > 100 ) {
log("db: Encountered maxScore from quality > 100 in ruleset "
"file. Truncating to 100.");
max = 100;
}
return max;
}
//bool CatRec::hasMaxCountFromQualityTag ( int32_t n0, int32_t n1 ) {
// int32_t max=getY (n0,n1,50,"index.quality4","index.maxCount4",-9321);
// if ( max == -9321 ) return false;
// return true;
//}
//
//int32_t CatRec::getMaxCountFromQuality ( int32_t n0, int32_t n1, int32_t quality ) {
// // 100 in this sense is not a percentage, but an actual word count
// int32_t max=getY (n0,n1,quality,"index.quality4","index.maxCount4",
// 9999999);
// if ( max < 0 ) {
// log("db: Encountered maxScore from quality of %"INT32" in ruleset "
// "file. Setting to 0.",max);
// max = 0;
// }
// return max;
//}
int32_t CatRec::getScoreWeightFromQuality ( int32_t n0, int32_t n1, int32_t quality ) {
return getY (n0,n1,quality,"index.quality3","index.scoreWeight3",100);}
int32_t CatRec::getScoreWeightFromQuality2( int32_t quality ) {
return getY (0,999999,quality,"quality3","scoreWeight3",100);}
int32_t CatRec::getScoreWeightFromLen ( int32_t n0, int32_t n1, int32_t len ) {
return getY (n0,n1, len , "index.len4" ,"index.scoreWeight4",100);}
int32_t CatRec::getScoreWeightFromLen2 ( int32_t len ) {
return getY (0,999999, len , "len4" ,"scoreWeight4",100);}
int32_t CatRec::getScoreWeightFromNumWords( int32_t n0, int32_t n1, int32_t len ) {
return getY (n0,n1, len , "index.numWords6","index.scoreWeight6",100);}
int32_t CatRec::getMaxScoreFromLen ( int32_t n0, int32_t n1, int32_t len ) {
int32_t max = getY (n0,n1, len, "index.len5" ,"index.maxScore5",100);
if ( max > 100 ) {
log("db: Encountered maxScore from length > 100 in ruleset "
"file. Truncating to 100.");
max = 100;
}
return max;
}
int32_t CatRec::getMaxScoreFromNumWords ( int32_t n0, int32_t n1, int32_t len ) {
int32_t max = getY (n0,n1, len, "index.numWords7","index.maxScore7",100);
if ( max > 100 ) {
log("db: Encountered maxScore from length > 100 in ruleset "
"file. Truncating to 100.");
max = 100;
}
return max;
}
int32_t CatRec::getQualityBoostFromNumLinks ( int32_t numLinks ) {
return getY (0,99999, numLinks,"numLinks1" ,"qualityBoost1",100); }
int32_t CatRec::getQualityBoostFromLinkQualitySum ( int32_t sum ) {
return getY (0,99999, sum ,"linkQualitySum2","qualityBoost2",100);}
int32_t CatRec::getQualityBoostFromRootQuality ( int32_t rootQuality ) {
return getY (0,99999,rootQuality,"rootQuality3","qualityBoost3",100); }
int32_t CatRec::getLinkTextScoreWeightFromLinkerQuality ( int32_t quality ) {
return getY (0,99999,quality ,"quality4","linkTextScoreWeight4",100);}
int32_t getLinkTextScoreWeightFromLinkerQuality ( Xml *xml , int32_t quality ) {
return getY (xml,0,99999,quality ,"quality4","linkTextScoreWeight4",100);}
int32_t CatRec::getLinkTextScoreWeightFromLinkeeQuality ( int32_t quality ) {
return getY (0,99999,quality ,"quality7","linkTextScoreWeight7",100);}
int32_t getLinkTextScoreWeightFromLinkeeQuality ( Xml *xml , int32_t quality ) {
return getY (xml,0,99999,quality ,"quality7","linkTextScoreWeight7",100);}
int32_t CatRec::getLinkTextScoreWeightFromNumWords( int32_t numWords ) {
return getY (0,99999,numWords ,"linkTextNumWords6",
"linkTextScoreWeight6", 100); }
int32_t CatRec::getQuotaBoostFromRootQuality ( int32_t rootQuality ) {
return getY (0,99999,rootQuality,"rootQuality7","quotaBoost7",100); }
int32_t CatRec::getQuotaBoostFromQuality ( int32_t quality ) {
return getY (0,99999,quality,"quality8","quotaBoost8",100); }
int32_t CatRec::getLinkTextMaxScoreFromQuality ( int32_t quality ) {
int32_t max = getY(0,99999,quality,"quality5","linkTextMaxScore5",100);
if ( max > 100 ) {
log("db: Encountered linkText maxScore from quality > 100 in "
"ruleset file. Truncating to 100.");
max = 100;
}
return max;
}
int32_t CatRec::getMaxPercentForSpamFromQuality ( int32_t quality ) {
// old ruleset files (tagdb*.xml) do not have this, so it *has* to
// default to 4 to preserve the old method... so we can properly
// delete docs.
int32_t max = getY(0,99999,quality,"quality6","maxPercentSpammed6",4);
// a safety catch
if ( max < 4 ) {
max = 4;
static char s_flag = 0;
if ( s_flag == 0 ) {
log("db: Encountered max percent threshold for spam "
"that is less than 4. Setting to 4. This message "
"will not be repeated.");
s_flag = 1;
}
}
return max;
}
// . grab the Y value given the X
// . assumes a graph like:
// <index>
// <quality21> %uc </>
// <quality22> %uc </>
// <quality23> %uc </>
// <quality24> %uc </>
// <quality25> %uc </>
// <maxScore21> %ul </>
// <maxScore22> %ul </>
// <maxScore23> %ul </>
// <maxScore24> %ul </>
// <maxScore25> %ul </>
// </index>
// . where strx = "index.quality2"
// . stry = "index.maxScore2"
// . this example maps a quality to a maxScore
int32_t CatRec::getY(int32_t n0,int32_t n1,int32_t X,char *strx,char *stry,int32_t def){
return ::getY(m_xml,n0,n1,X,strx,stry,def);
}
int32_t getY(Xml *xml, int32_t n0,int32_t n1,int32_t X,char *strx,char *stry,int32_t def){
// . make the name buffers
// . generates labels for the (x,y) points
// . we can have up to 32 points
char buf[64];
int32_t x[32], y[32];
int32_t i;
for ( i = 0 ; i < 32 ; i++ ) {
// get the x value (i.e. "quality23")
sprintf ( buf, "%s%"INT32"", strx , i+1 );
x[i] = xml->getLong ( n0, n1, buf , -1 );
// break if this x point ain't present
if ( x[i] == -1 ) break;
// get the y value (i.e. "maxScore23")
sprintf ( buf, "%s%"INT32"", stry , i+1 );
y[i] = xml->getLong ( n0, n1, buf , -1 );
// break if this y point ain't present
if ( y[i] == -1 ) break;
}
// n is our number of (x,y) points
int32_t n = i;
// bitch if no points present and return 0
if ( n == 0 ) {
static char s_flag = 0;
// only print out once if it is the quality6/maxPercentSpammed
// map because that is a new thing
if ( s_flag == 1 && strx && ! strcmp ( "quality6" , strx ) )
return def;
// ok, there's other missing things, too
if ( s_flag == 1 )
return def;
s_flag = 1;
log("db: No map present in a ruleset file (tagdb*.xml) for "
"%s/%s. Using default of %"INT32".",strx,stry,def);
return def;
}
// if we only have one point then there'll be no interpolation
if ( n == 1 ) return y[0];
// find the first x after our "X"
int32_t j;
for ( j = 0 ; j < n; j++ ) if ( x[j] >= X ) break;
// before/after first/last point means we don't have to interpolate
if ( j <= 0 ) return y[0 ];
if ( j >= n ) return y[n-1];
// linear interpolate between our 2 points (x0,y0) and (x1,y1)
int64_t x0 = x[j-1];
int64_t x1 = x[j ];
int64_t y0 = y[j-1];
int64_t y1 = y[j ];
// error if x1 less than x0
if ( x1 <= x0 ) {
log("db: X coordinates are not in ascending order for map "
"(%s,%s) in a ruleset file (tagdb*.xml).",strx,stry);
return def;
}
// otherwise we have a sloping line
return y0 + ( ((int64_t)X - x0) * (y1-y0) ) /(x1-x0) ;
}
void CatRec::printFormattedRec(SafeBuf *sb) {
struct tm *timeStruct = localtime ( &m_timeStamp );
char tbuf[64];
strftime ( tbuf, 64 , "%b-%d-%Y(%H:%M:%S) ", timeStruct );
sb->safePrintf("<tr><td>Site: </td><td>%s</td></tr>\n"
"<tr><td>Site File Number:</td><td>%"INT32"</td></tr>\n"
"<tr><td>Had Rec: </td><td>%s</td></tr>\n"
"<tr><td>Version: </td><td>%"INT32"</td></tr>\n"
"<tr><td>Timestamp: </td><td>%s</td></tr>\n"
"<tr><td>Comment: </td><td>%s</td></tr>\n"
"<tr><td>Username: </td><td>%s</td></tr>\n"
"<tr><td>Site Quality: </td><td>%"INT32"</td></tr>\n"
"<tr><td>Spam Status: </td><td>%s</td></tr>\n"
"<tr><td>Adult Level: </td><td>%s</td></tr>\n"
"<tr><td>Alexa Rank: </td><td>%"INT32"</td></tr>\n",
m_site.getUrl(),
(int32_t)m_filenum,
m_hadRec?"YES":"NO",
(int32_t)m_version,
tbuf,//m_timeStamp,
m_comment,
m_username,
(int32_t)m_siteQuality,
getSpamStr(),
getAdultStr(),
g_siteBonus.getAlexaRanking(&m_site));
for(int32_t i = 0;i < m_numTypes; i++) {
sb->safePrintf("<tr><td>%s:</td><td>%"INT32"</td></tr>\n",
SiteType::getSiteTypeStr(m_siteTypes[i].m_type),
(int32_t)m_siteTypes[i].m_score);
}
for(int32_t i = 0;i < m_numLangs; i++) {
sb->safePrintf("<tr><td>%s:</td><td>%"INT32"</td></tr>\n",
getLanguageString(m_siteLangs[i].m_type),
(int32_t)m_siteLangs[i].m_score);
}
}
char* CatRec::printFormattedRec(char* p) {
p += sprintf(p,
"<tr><td>Site: </td><td>%s</td></tr>\n"
"<tr><td>Site File Number:</td><td>%"INT32"</td></tr>\n"
"<tr><td>Had Rec: </td><td>%s</td></tr>\n"
"<tr><td>Version: </td><td>%"INT32"</td></tr>\n"
"<tr><td>Timestamp: </td><td>%"INT32"</td></tr>\n"
"<tr><td>Comment: </td><td>%s</td></tr>\n"
"<tr><td>Username: </td><td>%s</td></tr>\n"
"<tr><td>Spam Status: </td><td>%s</td></tr>\n"
"<tr><td>Adult Level: </td><td>%s</td></tr>\n"
"<tr><td>Alexa Rank: </td><td>%"INT32"</td></tr>\n",
m_site.getUrl(),
(int32_t)m_filenum,
m_hadRec?"YES":"NO",
(int32_t)m_version,
m_timeStamp,
m_comment,
m_username,
getSpamStr(),
getAdultStr(),
g_siteBonus.getAlexaRanking(&m_site));
for(int32_t i = 0;i < m_numTypes; i++) {
p += sprintf(p,
"<tr><td>%s:</td><td>%"INT32"</td></tr>\n",
SiteType::getSiteTypeStr(m_siteTypes[i].m_type),
(int32_t)m_siteTypes[i].m_score);
}
for(int32_t i = 0;i < m_numLangs; i++) {
p += sprintf("<tr><td>%s:</td><td>%"INT32"</td></tr>\n",
getLanguageString(m_siteLangs[i].m_type),
(int32_t)m_siteLangs[i].m_score);
}
return p;
}
uint32_t CatRec::getScoreForType(uint8_t type) {
for(int32_t i = 0; i < m_numTypes; i++) {
if(m_siteTypes[i].m_type == type) return m_siteTypes[i].m_score;
}
return 0;
}
void CatRec::setFilenum ( int32_t filenum ) {
m_filenum = filenum;
// gotta update the m_data[] buffer too!
gbmemcpy(m_filenumPtr, &filenum, 3);
}
void CatRec::addSiteType ( uint8_t type, uint32_t score ) {
if ( m_numTypes >= MAX_SITE_TYPES ) {
log("build: hit max site types!");return;}
// this is NOT supported for older version that had no site types!
if ( m_version < 5 ) return;
// score of 0 means none i guess? a reserved value!
if ( score == 0 ) {
log("build: adding site type with zero score!");
char *xx = NULL; *xx = 0;
}
m_siteTypes[m_numTypes].m_type = type;
m_siteTypes[m_numTypes].m_score = score;
m_numTypes++;
// the type size!
int32_t scoreSize = SiteType::getScoreSize(type);
// size of site type and score combined
int32_t totalSize = 1 + scoreSize;
// shift the data in m_data!
char *p = m_addHere;
// how much to shift down
int32_t toShift = m_data + m_dataSize - p;
// shift it
gbmemcpy ( p + totalSize , p , toShift );
// store new type
*(uint8_t *)p = type; p++;
// store new score
gbmemcpy ( p , &score , scoreSize );
// inc data size
m_dataSize += totalSize;
// inc this guy
m_addHere += totalSize;
// inc this too!
*m_incHere = *m_incHere + 1;
}
char* CatRec::printXmlRec(char* p) {
p += sprintf(p,
"\t<site><![CDATA[%s]]></site>\n"
"\t<siteFileNumber>%"INT32"</siteFileNumber>\n"
"\t<hadRec><![CDATA[%s]]></hadRec>\n"
"\t<version>%"INT32"</version>\n"
"\t<timestamp>%"INT32"</timestamp>\n"
"\t<comment><![CDATA[%s]]></comment>\n"
"\t<username><![CDATA[%s]]></username>\n"
"\t<siteQuality>%"INT32"</siteQuality>\n"
"\t<spamStatus><![CDATA[%s]]></spamStatus>\n"
"\t<adultLevel><![CDATA[%s]]></adultLevel>\n"
"\t<alexaRank>%"INT32"</alexaRank>\n"
"\t<banned>%i</banned>\n",
m_site.getUrl(),
(int32_t)m_filenum,
m_hadRec?"YES":"NO",
(int32_t)m_version,
m_timeStamp,
m_comment,
m_username,
(int32_t)m_siteQuality,
getSpamStr(),
getAdultStr(),
g_siteBonus.getAlexaRanking(&m_site),
m_xml->getBool("isBanned", false));
return p;
}
void CatRec::printXmlRec( SafeBuf *sb ) {
sb->safePrintf("\t<site><![CDATA[%s]]></site>\n"
"\t<siteFileNumber>%"INT32"</siteFileNumber>\n"
"\t<hadRec><![CDATA[%s]]></hadRec>\n"
"\t<version>%"INT32"</version>\n"
"\t<timestamp>%"INT32"</timestamp>\n"
"\t<comment><![CDATA[%s]]></comment>\n"
"\t<username><![CDATA[%s]]></username>\n"
"\t<siteQuality>%"INT32"</siteQuality>\n"
"\t<spamStatus><![CDATA[%s]]></spamStatus>\n"
"\t<adultLevel><![CDATA[%s]]></adultLevel>\n"
"\t<alexaRank>%"INT32"</alexaRank>\n"
"\t<banned>%i</banned>\n",
m_site.getUrl(),
(int32_t)m_filenum,
m_hadRec?"YES":"NO",
(int32_t)m_version,
m_timeStamp,
m_comment,
m_username,
(int32_t)m_siteQuality,
getSpamStr(),
getAdultStr(),
g_siteBonus.getAlexaRanking(&m_site),
m_xml->getBool("isBanned", false));
}
char* CatRec::getSpamStr() {
if ( m_version >= 3 ) {
switch (m_spamBits) {
case SPAM_BIT:
return "spam";
case NOT_SPAM:
return "not spam"; break;
case SPAM_UNKNOWN:
return "unknown"; break;
default:
return "corrupt"; break;
}
}
return "unknown";
}
char* CatRec::getAdultStr() {
if ( m_version >= 4 ) {
switch (m_adultLevel) {
case RATED_G:
return "kid safe";
case RATED_R:
return "adult, not porn"; break;
case RATED_X:
return "porn"; break;
default:
return "not rated"; break;
}
}
return "not rated";
}
char *CatRec::getPubDateFmtStr() {
int32_t fmt = getScoreForType(SiteType::DATE_FORMAT);
switch (fmt) {
case DateParse::DATE_FMT_AMER:
return "American";
case DateParse::DATE_FMT_EURO:
return "European";
}
return "Unknown/Ambiguous";
}
*/