#include "gb-include.h"

#include "CatRec.h"
//#include "SiteBonus.h"
#include "Lang.h"
//#include "DateParse.h"

//static long getY(Xml *xml, long n0,long n1,long X,
//		 char *strx,char *stry,long def);

// . parse the serialized catdb record held in "buf" ("bufSize" bytes)
// . nothing is copied: "*catids" and "*url" point into "buf"
// . layout parsed here:
//     1 byte   number of catids (n)
//     n*4      the catids
//     3 bytes  filenum
//     1 byte   version
//     ...      site url (NUL terminated when version >= 1)
// . if the four filenum/version bytes read as -1 the record is treated as
//   version 0 and "*filenum" is left at -1
// . for version 0 the url is everything that remains in the record
// . returns false, leaving the outputs unspecified, if the record is too
//   short to hold its own header or if a version >= 1 url has no NUL
//   inside the record
// . NOTE(review): the 4-byte catid stride and the "*(long *)" read below
//   look like they assume a 4-byte, little-endian long -- confirm before
//   building for a target where that does not hold
static bool parseCatRecData ( char           *buf        ,
			      long            bufSize    ,
			      unsigned char  *numCatids  ,
			      long          **catids     ,
			      long           *filenum    ,
			      char           *version    ,
			      char          **url        ,
			      long           *urlLen     ) {
	// need at least the count byte plus filenum (3) and version (1)
	if ( ! buf || bufSize < 1 + 4 ) return false;
	char *p    = buf;
	char *pend = buf + bufSize;
	// the catid count
	*numCatids = *(unsigned char *)p;
	p++;
	// all the catids, and the filenum/version that follow them, must
	// lie inside the record
	long catidBytes = 4 * (long)*numCatids;
	if ( pend - p < catidBytes + 4 ) return false;
	*catids = (long *)p;
	p += catidBytes;
	// the filenum (0 is default) shares its 4 bytes with the version
	*filenum = *(long *)p;
	p += 3;
	if ( *filenum == -1 ) {
		*version = 0;
	}
	else {
		*filenum &= 0x00FFFFFF;
		*version  = *p;
	}
	p++;
	// the site url
	*url = p;
	if ( *version == 0 ) {
		// no terminator in version 0, the url runs to the end
		*urlLen = pend - p;
		return true;
	}
	// never scan beyond the end of the record for the terminator
	char *nul = (char *)memchr ( p , '\0' , (size_t)(pend - p) );
	if ( ! nul ) return false;
	*urlLen = nul - p;
	return true;
}

// an empty CatRec; see reset() for the initial state
CatRec::CatRec ( ) {
	reset();
}

CatRec::~CatRec ( ) {
}

// . forget any record we were set from
// . only the members below are cleared; m_data, m_site, m_filenum and
//   m_version keep whatever they held
void CatRec::reset ( ) {
	m_hadRec       = false;
	m_catids       = NULL;
	m_numCatids    = 0;
	m_numIndCatids = 0;
	m_dataSize     = 0;
}

// . used by Msg8 to parse a serialized site rec into this CatRec class
// . we copy "data" into m_data so the caller can free it
// . if "data" is NULL or "dataSize" is <= 0 the url had no record: we
//   default m_site to the url's hostname, set m_filenum to 0 and return
//   true with m_hadRec left false
// . returns false with g_errno set to EBUFTOOSMALL if "data" does not fit
//   in m_data
// . returns false WITHOUT setting g_errno if the record is corrupt (it is
//   logged and ignored)
// . "url" may be NULL, in which case no hostname/ip is taken from it
// . a CatRec has the format: (like a record in an RdbList)
// . kkkkkkkk kkkkkkkk kkkkkkkk kkkkkkkk  k = 96bit key (typical)
// . kkkkkkkk kkkkkkkk kkkkkkkk kkkkkkkk
// . kkkkkkkk kkkkkkkk kkkkkkkk kkkkkkkk
// . dddddddd dddddddd dddddddd dddddddd  d = dataSize of data below here
// .[nnnnnnnn cccccccc cccccccc cccccccc  n = number of catids, Catdb only
// . cccccccc cccccccc ........ ........] c = series of catids, longs [Catdb]
// . ffffffff ffffffff ffffffff vvvvvvvv  v = version
//                                        f = site fileNum (must be >= 0)
// . uuuuuuuu uuuuuuuu uuuuuuuu ........  u = var length site url
// version >= 2:
// . ppssxxxx                             s = spam bits
// .                                      p = adultLevel
// .                                      x = unused
// version >= 3:
// . qqqqqqqq                             q = site quality
// . the version >= 2 fields (and the timestamp, comment, username, site
//   types and languages that the old parser also read) are no longer
//   parsed here: the record must end right after the url
bool CatRec::set ( Url *url , char *data , long dataSize , bool gotByIp ) {
	// assume url does not have a rec in the db
	m_hadRec = false;
	// . no data means the rec did not exist, so make a dummy rec
	// . MDW: why?
	if ( ! data || dataSize <= 0 ) {
		// the tail of this function tolerates a NULL url, so do
		// the same here rather than dereferencing it blindly
		if ( url ) {
			// default m_site to the hostname
			m_site.set ( url->getHost() ,
				     url->getHostLen() ,
				     false/*addwww?*/ );
			// steal ip from url
			m_site.setIp ( url->getIp() );
		}
		m_filenum = 0;
		return true;
	}
	// return false and set g_errno if buf too small
	if ( dataSize >= CATREC_BUF_SIZE ) {
		g_errno = EBUFTOOSMALL;
		return false;
	}
	// copy the raw data so everything below points into our own buffer
	memcpy ( m_data , data , dataSize );
	m_dataSize = dataSize;
	// only used for logging
	const char *urlStr = url ? url->getUrl() : "(null)";
	// parse the copy, never stepping outside of it
	unsigned char  numCatids ;
	long          *catids    ;
	long           filenum   ;
	char           version   ;
	char          *siteUrl   ;
	long           siteUrlLen;
	if ( ! parseCatRecData ( m_data , dataSize ,
				 &numCatids , &catids ,
				 &filenum , &version ,
				 &siteUrl , &siteUrlLen ) ) {
		log ( "tagdb: Record of %li bytes for url %s is truncated "
		      "or its site is not NULL terminated so "
		      "ignoring tagdb record.",
		      (long)dataSize , urlStr );
		return false;
	}
	m_numCatids = numCatids;
	m_catids    = catids;
	m_filenum   = filenum;
	m_version   = version;
	m_urlLen    = siteUrlLen;
	// set our site url
	m_url = siteUrl;
	m_site.set ( siteUrl , siteUrlLen , false/*addwww?*/ );
	// move to the end of the url, and over its terminator if it has one
	char *p = siteUrl + siteUrlLen;
	if ( m_version >= 1 ) p++;
	// sanity check, we should have consumed the record exactly
	if ( p - m_data != m_dataSize ) {
		log ( "tagdb: Deserialized datasize %li != %li for url %s so "
		      "ignoring tagdb record.",
		      (long)(p - m_data) , (long)m_dataSize , urlStr );
		return false;
	}
	// if hostname is same as url we can use the ip from url
	if ( url && m_site.getHostLen() == url->getHostLen() )
		m_site.setIp ( url->getIp() );
	// . this url had its own rec in the db
	// . Msg16 needs to know this so it won't auto-detect porn/spam in
	//   this url itself and delete it from tfndb
	m_hadRec = true;
	// did we get the rec by matching an IP (as opposed to the
	// canonical name)?
	m_gotByIp = gotByIp;
	return true;
}

// . serialize "site", "filenum" and the catids into m_data using
//   CATREC_CURRENT_VERSION, and point m_catids/m_url into m_data
// . at most MAX_CATIDS catids are stored; extra ones are dropped
// . a NULL "catids" is treated as zero catids
// . returns false and sets g_errno to EBUFTOOSMALL if the record would not
//   fit in m_data
// . the timestamp, comment, username, spam/adult flags, site quality, site
//   types and languages of the older format are no longer written: the
//   record ends right after the site url
// . NOTE(review): m_filenum is not updated here, only the serialized copy
//   in m_data is -- confirm callers do not rely on getting it back
bool CatRec::set ( Url           *site      ,
		   long           filenum   ,
		   long          *catids    ,
		   unsigned char  numCatids ) {
	// version
	m_version = CATREC_CURRENT_VERSION;
	// . decide how many catids we really store BEFORE sizing the record
	// . the count byte, the size and the write pointer must all agree or
	//   the record can not be parsed back
	long numStored = catids ? (long)numCatids : 0;
	if ( numStored > MAX_CATIDS ) numStored = MAX_CATIDS;
	long urlLen = site->getUrlLen();
	// filenum (3) + version (1) + the site
	m_dataSize = 4 + urlLen;
	// null termination
	if ( CATREC_CURRENT_VERSION >= 1 ) m_dataSize++;
	// numcatids and catids
	m_dataSize += 1 + ( numStored * 4 );
	// return false and set g_errno if buf too small
	if ( m_dataSize > CATREC_BUF_SIZE ) {
		g_errno = EBUFTOOSMALL;
		return false;
	}
	// serialize into m_data
	char *p = m_data;
	// add the count
	m_numCatids = numStored;
	*(unsigned char *)p = (unsigned char)numStored;
	p++;
	// add the ids
	m_catids = (long *)p;
	if ( numStored > 0 ) memcpy ( p , catids , 4 * numStored );
	p += 4 * numStored;
	// store the filenum (3 bytes)
	memcpy ( p , &filenum , 3 );
	p += 3;
	// store the version (1 byte)
	*p = m_version;
	p++;
	// the site
	m_url    = p;
	m_urlLen = urlLen;
	memcpy ( p , site->getUrl() , urlLen );
	p += urlLen;
	// NULL terminate the site
	if ( m_version >= 1 ) {
		*p = '\0';
		p++;
	}
	// sanity check, core if we miscounted above
	if ( p - m_data != m_dataSize ) {
		log ( "catrec: Serialized datasize %li != %li",
		      (long)(p - m_data) , (long)m_dataSize );
		char *xx = NULL; *xx = 0;
	}
	// set our member vars correctly in addition to the site rec
	m_site.set ( site->getUrl() , site->getUrlLen() , false/*addwww?*/ );
	// steal ip from "site"
	m_site.setIp ( site->getIp() );
	return true;
}

// keep everything else the same
/*
bool CatRec::set ( long filenum ) {
	// save the fileNum
	m_filenum = filenum;
	// make sure xml is set
	m_xml = g_tagdb.getSiteXml ( m_filenum , m_coll , m_collLen );
	if ( m_xml ) return true;
	// should NEVER be NULL
	g_errno = ENODATA;
	return log("db: Could not find the ruleset file %stagdb%li.xml.",
		   g_hostdb.m_dir,m_filenum);
}
*/

// . this set method just sets the site record's catids, filenum, version,
//   url and url len
// . this method is added to skip the getSiteXml and other overheads
// . "data" is NOT copied: m_catids and m_url point into it, so it must
//   outlive this CatRec's use of them
// . returns false if there is no data or the record is truncated or its
//   version >= 1 url is not NUL terminated inside "dataSize" bytes
bool CatRec::set ( char *data, long dataSize ) {
	if ( ! data || dataSize <= 0 ) return false;
	unsigned char  numCatids ;
	long          *catids    ;
	long           filenum   ;
	char           version   ;
	char          *siteUrl   ;
	long           siteUrlLen;
	if ( ! parseCatRecData ( data , dataSize ,
				 &numCatids , &catids ,
				 &filenum , &version ,
				 &siteUrl , &siteUrlLen ) )
		return false;
	m_numCatids = numCatids;
	m_catids    = catids;
	m_filenum   = filenum;
	m_version   = version;
	m_urlLen    = siteUrlLen;
	// set our site url
	m_url = siteUrl;
	m_site.set ( siteUrl , siteUrlLen , false );
	return true;
}

// . set the indirect catids, copying at most MAX_IND_CATIDS of them
// . a NULL array or a negative count stores zero ids
void CatRec::setIndirectCatids ( long *indCatids, long numIndCatids ) {
	// a negative count would become a huge size in the memcpy below
	if ( ! indCatids || numIndCatids < 0 ) numIndCatids = 0;
	if ( numIndCatids > MAX_IND_CATIDS ) numIndCatids = MAX_IND_CATIDS;
	// store the number of ids
	m_numIndCatids = numIndCatids;
	// store the ids
	if ( numIndCatids > 0 )
		memcpy ( m_indCatids , indCatids , numIndCatids * 4 );
}

/*
long CatRec::getMaxLenFromQuality ( long n0, long n1, long quality ) {
	return getY (n0,n1, quality, "index.quality1","index.maxLen1",64000);}

long CatRec::getMaxScoreFromQuality ( long n0, long n1, long quality ) {
	long max=getY (n0,n1,quality,"index.quality2","index.maxScore2",100);
	if ( max > 100 ) {
		log("db: Encountered maxScore from quality > 100 in ruleset "
		    "file. Truncating to 100.");
		max = 100;
	}
	return max;
}

//bool CatRec::hasMaxCountFromQualityTag ( long n0, long n1 ) {
//	long max=getY (n0,n1,50,"index.quality4","index.maxCount4",-9321);
//	if ( max == -9321 ) return false;
//	return true;
//}
//
//long CatRec::getMaxCountFromQuality ( long n0, long n1, long quality ) {
//	// 100 in this sense is not a percentage, but an actual word count
//	long max=getY (n0,n1,quality,"index.quality4","index.maxCount4",
//		       9999999);
//	if ( max < 0 ) {
//		log("db: Encountered maxScore from quality of %li in ruleset "
//		    "file. Setting to 0.",max);
//		max = 0;
//	}
//	return max;
//}

long CatRec::getScoreWeightFromQuality ( long n0, long n1, long quality ) {
	return getY (n0,n1,quality,"index.quality3","index.scoreWeight3",100);}

long CatRec::getScoreWeightFromQuality2( long quality ) {
	return getY (0,999999,quality,"quality3","scoreWeight3",100);}

long CatRec::getScoreWeightFromLen ( long n0, long n1, long len ) {
	return getY (n0,n1, len , "index.len4" ,"index.scoreWeight4",100);}

long CatRec::getScoreWeightFromLen2 ( long len ) {
	return getY (0,999999, len , "len4" ,"scoreWeight4",100);}

long CatRec::getScoreWeightFromNumWords( long n0, long n1, long len ) {
	return getY (n0,n1, len , "index.numWords6","index.scoreWeight6",100);}

long CatRec::getMaxScoreFromLen ( long n0, long n1, long len ) {
	long max = getY (n0,n1, len, "index.len5" ,"index.maxScore5",100);
	if ( max > 100 ) {
		log("db: Encountered maxScore from length > 100 in ruleset "
		    "file. Truncating to 100.");
		max = 100;
	}
	return max;
}

long CatRec::getMaxScoreFromNumWords ( long n0, long n1, long len ) {
	long max = getY (n0,n1, len, "index.numWords7","index.maxScore7",100);
	if ( max > 100 ) {
		log("db: Encountered maxScore from length > 100 in ruleset "
		    "file. Truncating to 100.");
		max = 100;
	}
	return max;
}

long CatRec::getQualityBoostFromNumLinks ( long numLinks ) {
	return getY (0,99999, numLinks,"numLinks1" ,"qualityBoost1",100); }

long CatRec::getQualityBoostFromLinkQualitySum ( long sum ) {
	return getY (0,99999, sum ,"linkQualitySum2","qualityBoost2",100);}

long CatRec::getQualityBoostFromRootQuality ( long rootQuality ) {
	return getY (0,99999,rootQuality,"rootQuality3","qualityBoost3",100); }

long CatRec::getLinkTextScoreWeightFromLinkerQuality ( long quality ) {
	return getY (0,99999,quality ,"quality4","linkTextScoreWeight4",100);}

long getLinkTextScoreWeightFromLinkerQuality ( Xml *xml , long quality ) {
	return getY (xml,0,99999,quality ,"quality4","linkTextScoreWeight4",100);}

long CatRec::getLinkTextScoreWeightFromLinkeeQuality ( long quality ) {
	return getY (0,99999,quality ,"quality7","linkTextScoreWeight7",100);}

long getLinkTextScoreWeightFromLinkeeQuality ( Xml *xml , long quality ) {
	return getY (xml,0,99999,quality ,"quality7","linkTextScoreWeight7",100);}

long CatRec::getLinkTextScoreWeightFromNumWords( long numWords ) {
	return getY (0,99999,numWords ,"linkTextNumWords6",
		     "linkTextScoreWeight6", 100); }

long CatRec::getQuotaBoostFromRootQuality ( long rootQuality ) {
	return getY (0,99999,rootQuality,"rootQuality7","quotaBoost7",100); }

long CatRec::getQuotaBoostFromQuality ( long quality ) {
	return getY (0,99999,quality,"quality8","quotaBoost8",100); }

long CatRec::getLinkTextMaxScoreFromQuality ( long quality ) {
	long max = getY(0,99999,quality,"quality5","linkTextMaxScore5",100);
	if ( max > 100 ) {
		log("db: Encountered linkText maxScore from quality > 100 in "
		    "ruleset file. Truncating to 100.");
		max = 100;
	}
	return max;
}

long CatRec::getMaxPercentForSpamFromQuality ( long quality ) {
	// old ruleset files (tagdb*.xml) do not have this, so it *has* to
	// default to 4 to preserve the old method... so we can properly
	// delete docs.
	long max = getY(0,99999,quality,"quality6","maxPercentSpammed6",4);
	// a safety catch
	if ( max < 4 ) {
		max = 4;
		static char s_flag = 0;
		if ( s_flag == 0 ) {
			log("db: Encountered max percent threshold for spam "
			    "that is less than 4. Setting to 4. This message "
			    "will not be repeated.");
			s_flag = 1;
		}
	}
	return max;
}

// . grab the Y value given the X
// . assumes a graph like:
//
//  %uc
//  %uc
//  %uc
//  %uc
//  %uc
//  %ul
//  %ul
//  %ul
//  %ul
//  %ul
//
// . where strx = "index.quality2"
// .       stry = "index.maxScore2"
// . this example maps a quality to a maxScore
long CatRec::getY(long n0,long n1,long X,char *strx,char *stry,long def){
	return ::getY(m_xml,n0,n1,X,strx,stry,def);
}

long getY(Xml *xml, long n0,long n1,long X,char *strx,char *stry,long def){
	// . make the name buffers
	// . generates labels for the (x,y) points
	// . we can have up to 32 points
	char buf[64];
	long x[32], y[32];
	long i;
	for ( i = 0 ; i < 32 ; i++ ) {
		// get the x value (i.e. "quality23")
		sprintf ( buf, "%s%li", strx , i+1 );
		x[i] = xml->getLong ( n0, n1, buf , -1 );
		// break if this x point ain't present
		if ( x[i] == -1 ) break;
		// get the y value (i.e. "maxScore23")
		sprintf ( buf, "%s%li", stry , i+1 );
		y[i] = xml->getLong ( n0, n1, buf , -1 );
		// break if this y point ain't present
		if ( y[i] == -1 ) break;
	}
	// n is our number of (x,y) points
	long n = i;
	// bitch if no points present and return 0
	if ( n == 0 ) {
		static char s_flag = 0;
		// only print out once if it is the quality6/maxPercentSpammed
		// map because that is a new thing
		if ( s_flag == 1 && strx && ! strcmp ( "quality6" , strx ) )
			return def;
		// ok, there's other missing things, too
		if ( s_flag == 1 ) return def;
		s_flag = 1;
		log("db: No map present in a ruleset file (tagdb*.xml) for "
		    "%s/%s. Using default of %li.",strx,stry,def);
		return def;
	}
	// if we only have one point then there'll be no interpolation
	if ( n == 1 ) return y[0];
	// find the first x after our "X"
	long j;
	for ( j = 0 ; j < n; j++ ) if ( x[j] >= X ) break;
	// before/after first/last point means we don't have to interpolate
	if ( j <= 0 ) return y[0 ];
	if ( j >= n ) return y[n-1];
	// linear interpolate between our 2 points (x0,y0) and (x1,y1)
	long long x0 = x[j-1];
	long long x1 = x[j ];
	long long y0 = y[j-1];
	long long y1 = y[j ];
	// error if x1 less than x0
	if ( x1 <= x0 ) {
		log("db: X coordinates are not in ascending order for map "
		    "(%s,%s) in a ruleset file (tagdb*.xml).",strx,stry);
		return def;
	}
	// otherwise we have a sloping line
	return y0 + ( ((long long)X - x0) * (y1-y0) ) /(x1-x0) ;
}

void CatRec::printFormattedRec(SafeBuf *sb) {
	struct tm *timeStruct = localtime ( &m_timeStamp );
	char tbuf[64];
	strftime ( tbuf, 64 , "%b-%d-%Y(%H:%M:%S) ", timeStruct );
	sb->safePrintf("Site: %s\n"
		       "Site File Number:%li\n"
		       "Had Rec: %s\n"
		       "Version: %li\n"
		       "Timestamp: %s\n"
		       "Comment: %s\n"
		       "Username: %s\n"
		       "Site Quality: %li\n"
		       "Spam Status: %s\n"
		       "Adult Level: %s\n"
		       "Alexa Rank: %li\n",
		       m_site.getUrl(),
		       (long)m_filenum,
		       m_hadRec?"YES":"NO",
		       (long)m_version,
		       tbuf,//m_timeStamp,
		       m_comment,
		       m_username,
		       (long)m_siteQuality,
		       getSpamStr(),
		       getAdultStr(),
		       g_siteBonus.getAlexaRanking(&m_site));
	for(long i = 0;i < m_numTypes; i++) {
		sb->safePrintf("%s:%li\n",
			       SiteType::getSiteTypeStr(m_siteTypes[i].m_type),
			       (long)m_siteTypes[i].m_score);
	}
	for(long i = 0;i < m_numLangs; i++) {
		sb->safePrintf("%s:%li\n",
			       getLanguageString(m_siteLangs[i].m_type),
			       (long)m_siteLangs[i].m_score);
	}
}

char* CatRec::printFormattedRec(char* p) {
	p += sprintf(p,
		     "Site: %s\n"
		     "Site File Number:%li\n"
		     "Had Rec: %s\n"
		     "Version: %li\n"
		     "Timestamp: %li\n"
		     "Comment: %s\n"
		     "Username: %s\n"
		     "Spam Status: %s\n"
		     "Adult Level: %s\n"
		     "Alexa Rank: %li\n",
		     m_site.getUrl(),
		     (long)m_filenum,
		     m_hadRec?"YES":"NO",
		     (long)m_version,
		     m_timeStamp,
		     m_comment,
		     m_username,
		     getSpamStr(),
		     getAdultStr(),
		     g_siteBonus.getAlexaRanking(&m_site));
	for(long i = 0;i < m_numTypes; i++) {
		p += sprintf(p, "%s:%li\n",
			     SiteType::getSiteTypeStr(m_siteTypes[i].m_type),
			     (long)m_siteTypes[i].m_score);
	}
	for(long i = 0;i < m_numLangs; i++) {
		p += sprintf("%s:%li\n",
			     getLanguageString(m_siteLangs[i].m_type),
			     (long)m_siteLangs[i].m_score);
	}
	return p;
}

uint32_t CatRec::getScoreForType(uint8_t type) {
	for(long i = 0; i < m_numTypes; i++) {
		if(m_siteTypes[i].m_type == type)
			return m_siteTypes[i].m_score;
	}
	return 0;
}

void CatRec::setFilenum ( long filenum ) {
	m_filenum = filenum;
	// gotta update the m_data[] buffer too!
	memcpy(m_filenumPtr, &filenum, 3);
}

void CatRec::addSiteType ( uint8_t type, uint32_t score ) {
	if ( m_numTypes >= MAX_SITE_TYPES ) {
		log("build: hit max site types!");return;}
	// this is NOT supported for older version that had no site types!
	if ( m_version < 5 ) return;
	// score of 0 means none i guess? a reserved value!
	if ( score == 0 ) {
		log("build: adding site type with zero score!");
		char *xx = NULL; *xx = 0;
	}
	m_siteTypes[m_numTypes].m_type = type;
	m_siteTypes[m_numTypes].m_score = score;
	m_numTypes++;
	// the type size!
	long scoreSize = SiteType::getScoreSize(type);
	// size of site type and score combined
	long totalSize = 1 + scoreSize;
	// shift the data in m_data!
	char *p = m_addHere;
	// how much to shift down
	long toShift = m_data + m_dataSize - p;
	// shift it
	memcpy ( p + totalSize , p , toShift );
	// store new type
	*(uint8_t *)p = type; p++;
	// store new score
	memcpy ( p , &score , scoreSize );
	// inc data size
	m_dataSize += totalSize;
	// inc this guy
	m_addHere += totalSize;
	// inc this too!
	*m_incHere = *m_incHere + 1;
}

char* CatRec::printXmlRec(char* p) {
	p += sprintf(p,
		     "\t\n"
		     "\t%li\n"
		     "\t\n"
		     "\t%li\n"
		     "\t%li\n"
		     "\t\n"
		     "\t\n"
		     "\t%li\n"
		     "\t\n"
		     "\t\n"
		     "\t%li\n"
		     "\t%i\n",
		     m_site.getUrl(),
		     (long)m_filenum,
		     m_hadRec?"YES":"NO",
		     (long)m_version,
		     m_timeStamp,
		     m_comment,
		     m_username,
		     (long)m_siteQuality,
		     getSpamStr(),
		     getAdultStr(),
		     g_siteBonus.getAlexaRanking(&m_site),
		     m_xml->getBool("isBanned", false));
	return p;
}

void CatRec::printXmlRec( SafeBuf *sb ) {
	sb->safePrintf("\t\n"
		       "\t%li\n"
		       "\t\n"
		       "\t%li\n"
		       "\t%li\n"
		       "\t\n"
		       "\t\n"
		       "\t%li\n"
		       "\t\n"
		       "\t\n"
		       "\t%li\n"
		       "\t%i\n",
		       m_site.getUrl(),
		       (long)m_filenum,
		       m_hadRec?"YES":"NO",
		       (long)m_version,
		       m_timeStamp,
		       m_comment,
		       m_username,
		       (long)m_siteQuality,
		       getSpamStr(),
		       getAdultStr(),
		       g_siteBonus.getAlexaRanking(&m_site),
		       m_xml->getBool("isBanned", false));
}

char* CatRec::getSpamStr() {
	if ( m_version >= 3 ) {
		switch (m_spamBits) {
		case SPAM_BIT:
			return "spam";
		case NOT_SPAM:
			return "not spam";
			break;
		case SPAM_UNKNOWN:
			return "unknown";
			break;
		default:
			return "corrupt";
			break;
		}
	}
	return "unknown";
}

char* CatRec::getAdultStr() {
	if ( m_version >= 4 ) {
		switch (m_adultLevel) {
		case RATED_G:
			return "kid safe";
		case RATED_R:
			return "adult, not porn";
			break;
		case RATED_X:
			return "porn";
			break;
		default:
			return "not rated";
			break;
		}
	}
	return "not rated";
}

char *CatRec::getPubDateFmtStr() {
	long fmt = getScoreForType(SiteType::DATE_FORMAT);
	switch (fmt) {
	case DateParse::DATE_FMT_AMER:
		return "American";
	case DateParse::DATE_FMT_EURO:
		return "European";
	}
	return "Unknown/Ambiguous";
}
*/