#include "gb-include.h" #include "Categories.h" #include "Catdb.h" #include "Loop.h" #include "sort.h" #include "LanguageIdentifier.h" using namespace std; Categories g_categories1; Categories g_categories2; Categories *g_categories; static int sortCatHash ( const void *h1, const void *h2 ); // properly read from file long Categories::fileRead ( int fileid, void *buf, size_t count ) { char *p = (char*)buf; long n = 0; unsigned long sizeRead = 0; while ( sizeRead < count ) { n = read ( fileid, p, count - sizeRead ); if ( n <= 0 || n > (long)count ) return n; sizeRead += n; p += n; } return sizeRead; } Categories::Categories() { m_cats = NULL; m_numCats = 0; m_nameBuffer = NULL; m_nameBufferSize = 0; m_buffer = NULL; m_bufferSize = 0; } Categories::~Categories() { reset(); } void Categories::reset() { if (m_buffer) { mfree ( m_buffer, m_bufferSize, "Categories" ); m_buffer = NULL; } } // filename usually ./catdb/gbdmoz.structure.dat long Categories::loadCategories ( char *filename ) { //ifstream inStream; int inStream; // open the structure file inStream = open(filename, O_RDONLY); // make sure it opened okay if ( inStream < 0 ) { log("cat: Error opening structure file: %s", filename); return 1; } // read the size of the name buffer if ( fileRead ( inStream, &m_nameBufferSize, sizeof(long) ) != sizeof(long) ) { log("cat: Error reading structure file: %s", filename); close(inStream); return 1; } // read in the number of cats // filename usually ./catdb/gbdmoz.structure.dat if ( fileRead ( inStream, &m_numCats, sizeof(long) ) != sizeof(long) ) { log("cat: Error reading structure file: %s", filename); close(inStream); return 1; } // create the name buffer m_bufferSize = m_nameBufferSize + sizeof(Category)*m_numCats + sizeof(CategoryHash)*m_numCats; m_buffer = (char*)mmalloc(m_bufferSize, "Categories"); if (!m_buffer) { log("cat: Could not allocate %li bytes for Category Buffer", m_bufferSize); close(inStream); g_errno = ENOMEM; return 1; } // assign the buffers m_nameBuffer = m_buffer; m_cats = (Category*)(m_buffer + (sizeof(char)*m_nameBufferSize)); m_catHash = (CategoryHash*)(m_buffer + (sizeof(char)*m_nameBufferSize) + (sizeof(Category)*m_numCats)); //(sizeof(long)*m_numSymParents)); /* // read and fill the name buffer if ( fileRead ( inStream, m_nameBuffer, m_nameBufferSize ) != m_nameBufferSize ) { log("cat: Error reading structure file: %s", filename); close(inStream); return 1; } */ // temp buffer to read the whole file first long readSize = m_nameBufferSize + (m_numCats * 30); char *tempBuffer = (char*)mmalloc(readSize, "Categories"); if ( !tempBuffer ) { log("cat: Could not allocate %li bytes for File Temp Buffer", readSize); close(inStream); g_errno = ENOMEM; return 1; } // . read the rest of the file into the temp buffer // . 
// filename is usually ./catdb/gbdmoz.structure.dat
long Categories::loadCategories ( char *filename ) {
	//ifstream inStream;
	int inStream;
	// open the structure file
	inStream = open(filename, O_RDONLY);
	// make sure it opened okay
	if ( inStream < 0 ) {
		log("cat: Error opening structure file: %s", filename);
		return 1;
	}
	// read the size of the name buffer
	if ( fileRead ( inStream, &m_nameBufferSize, sizeof(long) ) !=
	     sizeof(long) ) {
		log("cat: Error reading structure file: %s", filename);
		close(inStream);
		return 1;
	}
	// read in the number of cats
	if ( fileRead ( inStream, &m_numCats, sizeof(long) ) !=
	     sizeof(long) ) {
		log("cat: Error reading structure file: %s", filename);
		close(inStream);
		return 1;
	}
	// create one buffer to hold the names, the Category array and
	// the CategoryHash array
	m_bufferSize = m_nameBufferSize +
		       sizeof(Category)     * m_numCats +
		       sizeof(CategoryHash) * m_numCats;
	m_buffer = (char*)mmalloc(m_bufferSize, "Categories");
	if (!m_buffer) {
		log("cat: Could not allocate %li bytes for Category Buffer",
		    m_bufferSize);
		close(inStream);
		g_errno = ENOMEM;
		return 1;
	}
	// assign the buffers
	m_nameBuffer = m_buffer;
	m_cats    = (Category*)(m_buffer + m_nameBufferSize);
	m_catHash = (CategoryHash*)(m_buffer + m_nameBufferSize +
				    (sizeof(Category)*m_numCats));
				    //(sizeof(long)*m_numSymParents));
	/*
	// read and fill the name buffer
	if ( fileRead ( inStream, m_nameBuffer, m_nameBufferSize ) !=
	     m_nameBufferSize ) {
		log("cat: Error reading structure file: %s", filename);
		close(inStream);
		return 1;
	}
	*/
	// temp buffer to read the whole file into first
	long readSize = m_nameBufferSize + (m_numCats * 30);
	char *tempBuffer = (char*)mmalloc(readSize, "Categories");
	if ( !tempBuffer ) {
		log("cat: Could not allocate %li bytes for File Temp Buffer",
		    readSize);
		close(inStream);
		g_errno = ENOMEM;
		return 1;
	}
	// read the rest of the file into the temp buffer
	if ( fileRead ( inStream, tempBuffer, readSize ) != readSize ) {
		log("cat: Error reading structure file: %s", filename);
		close(inStream);
		return 1;
	}
	char *p = tempBuffer;
	memcpy ( m_nameBuffer, p, m_nameBufferSize );
	p += m_nameBufferSize;
	// read and fill the cats
	for (long i = 0; i < m_numCats; i++) {
		memcpy(&m_cats[i].m_catid, p, sizeof(long));
		p += sizeof(long);
		memcpy(&m_cats[i].m_parentid, p, sizeof(long));
		p += sizeof(long);
		memcpy(&m_cats[i].m_nameOffset, p, sizeof(long));
		p += sizeof(long);
		memcpy(&m_cats[i].m_nameLen, p, sizeof(short));
		p += sizeof(short);
		memcpy(&m_cats[i].m_structureOffset, p, sizeof(long));
		p += sizeof(long);
		memcpy(&m_cats[i].m_contentOffset, p, sizeof(long));
		p += sizeof(long);
		memcpy(&m_cats[i].m_numUrls, p, sizeof(long));
		p += sizeof(long);
		/*
		if ( fileRead ( inStream, &m_cats[i].m_catid, sizeof(long) ) != sizeof(long) ) {
			log("cat: Error reading structure file: %s", filename);
			close(inStream); return 1; }
		if ( fileRead ( inStream, &m_cats[i].m_parentid, sizeof(long) ) != sizeof(long) ) {
			log("cat: Error reading structure file: %s", filename);
			close(inStream); return 1; }
		if ( fileRead ( inStream, &m_cats[i].m_nameOffset, sizeof(long) ) != sizeof(long) ) {
			log("cat: Error reading structure file: %s", filename);
			close(inStream); return 1; }
		if ( fileRead ( inStream, &m_cats[i].m_nameLen, sizeof(short) ) != sizeof(short) ) {
			log("cat: Error reading structure file: %s", filename);
			close(inStream); return 1; }
		if ( fileRead ( inStream, &m_cats[i].m_structureOffset, sizeof(long) ) != sizeof(long) ) {
			log("cat: Error reading structure file: %s", filename);
			close(inStream); return 1; }
		if ( fileRead ( inStream, &m_cats[i].m_contentOffset, sizeof(long) ) != sizeof(long) ) {
			log("cat: Error reading structure file: %s", filename);
			close(inStream); return 1; }
		if ( fileRead ( inStream, &m_cats[i].m_numUrls, sizeof(long) ) != sizeof(long) ) {
			log("cat: Error reading structure file: %s", filename);
			close(inStream); return 1; }
		*/
	}
	// read the category hash
	for (long i = 0; i < m_numCats; i++) {
		// read the hash
		/*
		if ( fileRead ( inStream, &m_catHash[i].m_hash, sizeof(long) ) != sizeof(long) ) {
			log("cat: Error reading structure file: %s", filename);
			close(inStream); return 1; }
		*/
		memcpy(&m_catHash[i].m_hash, p, sizeof(long));
		p += sizeof(long);
		// assign the index
		m_catHash[i].m_catIndex = i;
	}
	// is this a bottleneck? shouldn't it be stored sorted on disk?
	long long start = gettimeofdayInMilliseconds();
	// sort the category hash by hash value
	gbsort(m_catHash, m_numCats, sizeof(CategoryHash), sortCatHash);
	// sanity check - no dups allowed
	unsigned long last = 0xffffffff;
	for ( long i = 0 ; i < m_numCats ; i++ ) {
		if ( m_catHash[i].m_hash == last )
			log("dmoz: hash collision on %lu",last);
		last = m_catHash[i].m_hash;
	}
	// time it
	long long end = gettimeofdayInMilliseconds();
	if ( end - start > 100 )
		log(LOG_INIT,"admin: Took %lli ms to sort cat hashes.",
		    end - start);
	// close the file
	close(inStream);
	// free the temp buffer
	mfree(tempBuffer, readSize, "Categories");
	// now create the "bad" hash table, so we can quickly see if a url
	// is in the adult, gambling or online pharmacies categories
	if ( ! makeBadHashTable() ) return 1;
	// success
	return 0;
}
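// Illustrative sketch, not part of the original file: as far as
// loadCategories() above shows, gbdmoz.structure.dat starts with two
// longs (name-buffer size, category count), then the name buffer, then
// per-category fixed-width fields, then one hash per category. A tiny
// header dumper under those assumptions (hypothetical guard):
#ifdef CATEGORIES_EXAMPLES
static void exampleDumpStructureHeader ( const char *fname ) {
	int fd = open ( fname , O_RDONLY );
	if ( fd < 0 ) return;
	long nameBufferSize = 0;
	long numCats        = 0;
	if ( read ( fd , &nameBufferSize , sizeof(long) ) == sizeof(long) &&
	     read ( fd , &numCats        , sizeof(long) ) == sizeof(long) )
		log("cat: nameBufferSize=%li numCats=%li",
		    nameBufferSize , numCats );
	close ( fd );
}
#endif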
// returns false and sets g_errno on error
bool Categories::makeBadHashTable ( ) {
	m_badTable.reset();
	// . if it is on disk, load it
	// . returns false and sets g_errno on load error
	// . returns true if the file does not exist
	if ( ! m_badTable.load ( g_hostdb.m_dir , "badcattable.dat" ) )
		return false;
	// if it existed, we are done
	if ( m_badTable.getNumSlotsUsed() > 0 ) return true;
	log(LOG_INFO,"cat: Generating hash table of bad url hashes.");
	for ( long i = 0 ; i < m_numCats ; i++ ) {
		// skip if not a bad catid
		if ( ! isIdBad ( m_cats[i].m_catid ) ) continue;
		// it is, add the url hashes to the table
		addUrlsToBadHashTable ( m_cats[i].m_catid ) ;
		//log(LOG_INIT,"cat: Error making bad hash table: %s.",
		//    mstrerror(g_errno));
		//return false;
		//}
	}
	//log(LOG_INFO,"cat: Saving hash table to badtable.dat.");
	// now try to save it to make it faster next time around
	m_badTable.save ( g_hostdb.m_dir , "badcattable.dat" ) ;
	return true;
}

bool Categories::isInBadCat ( Url *u ) {
	// hash it
	unsigned long h = hash32 ( u->getUrl() , u->getUrlLen() );
	// if it is in there, it is in a bad catid
	if ( m_badTable.getSlot ( h ) >= 0 ) return true;
	// otherwise, not
	return false;
}

bool Categories::isInBadCat ( unsigned long h ) {
	// if it is in there, it is in a bad catid
	if ( m_badTable.getSlot ( h ) >= 0 ) return true;
	// otherwise, not
	return false;
}

int sortCatHash ( const void *h1, const void *h2 ) {
	if (((CategoryHash*)h1)->m_hash < ((CategoryHash*)h2)->m_hash)
		return -1;
	else if (((CategoryHash*)h1)->m_hash > ((CategoryHash*)h2)->m_hash)
		return 1;
	else
		return 0;
}

// do a binary search to get a cat from an id
long Categories::getIndexFromId ( long catid ) {
	long low  = 0;
	long high = m_numCats - 1;
	long currCat;
	// binary search
	while (low <= high) {
		// next check spot
		currCat = (low + high)/2;
		// check for hit
		if (m_cats[currCat].m_catid == catid)
			return currCat;
		// shift search range
		else if (m_cats[currCat].m_catid > catid)
			high = currCat - 1;
		else
			low = currCat + 1;
	}
	// not found
	return -1;
}

// do a binary search to get a cat from a path
long Categories::getIndexFromPath ( char *str, long strLen ) {
	long low  = 0;
	long high = m_numCats - 1;
	long currCat;
	if (!str || strLen <= 0)
		return -1;
	// remove any leading /
	if (str[0] == '/') {
		str++;
		strLen--;
	}
	// remove any trailing /
	if (str[strLen-1] == '/')
		strLen--;
	// check for top
	if (strLen == 3 && strncasecmp(str, "Top", 3) == 0)
		// it is catid 2 right? but index zero is symbolic for us!
		return 0;
	// get the hash
	unsigned long hash = hash32Lower_a(str, strLen, 0);
	// debug
	//char c = str[strLen];
	//str[strLen] = '\0';
	//log("dmoz: looking up hash %lu for %s",hash,str);
	//str[strLen] = c;
	// binary search over the sorted hash array
	while (low <= high) {
		// next check spot
		currCat = (low + high)/2;
		// check for hit
		if (m_catHash[currCat].m_hash == hash)
			return m_catHash[currCat].m_catIndex;
		// shift search range
		else if (m_catHash[currCat].m_hash > hash)
			high = currCat - 1;
		else
			low = currCat + 1;
	}
	// not found
	return -1;
}
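// Illustrative sketch, not part of the original file: both lookups above
// are O(log n) binary searches, one over catids in m_cats and one over
// path hashes in m_catHash, so resolving a path is cheap (hypothetical
// guard; "Top/Arts/Movies" is a hypothetical category):
#ifdef CATEGORIES_EXAMPLES
static void exampleLookupPath ( ) {
	char path[] = "Top/Arts/Movies";
	long index  = g_categories->getIndexFromPath ( path ,
							gbstrlen(path) );
	if ( index < 0 ) { log("cat: path not found"); return; }
	long catid  = g_categories->getIdFromPath ( path , gbstrlen(path) );
	log("cat: index=%li catid=%li", index , catid );
}
#endif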
// return the catid from the given path
long Categories::getIdFromPath ( char *str, long strLen ) {
	if ( ! m_cats ) return -1;
	long index = getIndexFromPath(str, strLen);
	// don't dereference m_cats[-1] on a miss
	if ( index < 0 ) return -1;
	return m_cats[index].m_catid;
}

// check this ID for an RTL starter
bool Categories::isIdRTLStart ( long catid ) {
	if ( catid == 88070   || // Top:World:Arabic
	     catid == 39341   || // Top:World:Farsi
	     catid == 118215  || // Top:World:Hebrew
	     catid == 1214070 || // Top:K&T:Inter:Arabic
	     catid == 1262316 || // Top:K&T:Inter:Farsi
	     catid == 910298  )  // Top:K&T:Inter:Hebrew
		return true;
	else
		return false;
}

// check this index for an RTL starter
bool Categories::isIndexRTLStart ( long catIndex ) {
	if ( catIndex > 0 )
		return isIdRTLStart(m_cats[catIndex].m_catid);
	return false;
}

// determine if a category is RTL from an Id
bool Categories::isIdRTL ( long catid ) {
	long index = getIndexFromId(catid);
	if (index < 0)
		return false;
	return isIndexRTL(index);
}

// determine if a category is RTL from an Index
bool Categories::isIndexRTL ( long catIndex ) {
	long currIndex = catIndex;
	while (currIndex > 0) {
		// check if this is one of the RTLs
		if (isIdRTLStart(m_cats[currIndex].m_catid))
			return true;
		// otherwise check the parent
		currIndex = getIndexFromId(m_cats[currIndex].m_parentid);
	}
	return false;
}

// check this ID for the top Adult category
bool Categories::isIdAdultStart ( long catid ) {
	if ( catid == 17 ) // Top:Adult
		return true;
	else
		return false;
}

bool Categories::isIdBadStart ( long catid ) {
	// Top:Adult
	if ( catid == 17 ) return true;
	// Top:Games:Gambling
	if ( catid == 144 ) return true;
	// Top:Shopping:Health:Pharmacy:Online_Pharmacies
	if ( catid == 128206 ) return true;
	return false;
}

// check this index for the top Adult category
bool Categories::isIndexAdultStart ( long catIndex ) {
	if (catIndex > 0)
		return isIdAdultStart(m_cats[catIndex].m_catid);
	return false;
}

// check if a category is Adult from an Id
bool Categories::isIdAdult ( long catid ) {
	long index = getIndexFromId(catid);
	if (index < 0)
		return false;
	return isIndexAdult(index);
}

// check if a category is "bad" from an Id
bool Categories::isIdBad ( long catid ) {
	long index = getIndexFromId(catid);
	if (index < 0)
		return false;
	return isIndexBad(index);
}

// check if a category is Adult from an Index
bool Categories::isIndexAdult ( long catIndex ) {
	long currIndex = catIndex;
	while (currIndex > 0) {
		// check if this is the Adult category
		if ( isIdAdultStart(m_cats[currIndex].m_catid) )
			return true;
		// otherwise check the parent
		currIndex = getIndexFromId(m_cats[currIndex].m_parentid);
	}
	return false;
}

// check if a category is adult, gambling or online pharmacy from an Index
bool Categories::isIndexBad ( long catIndex ) {
	long currIndex = catIndex;
	while (currIndex > 0) {
		// check if this is a "bad" category
		if ( isIdBadStart(m_cats[currIndex].m_catid) )
			return true;
		// otherwise check the parent
		currIndex = getIndexFromId(m_cats[currIndex].m_parentid);
	}
	return false;
}
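// Illustrative sketch, not part of the original file: the isIndex*()
// checks above walk up the parent chain until they reach index 0 (Top)
// or a flagged ancestor, so any descendant of Top/Adult etc. is caught.
// A lineage printer under the assumption (common in this codebase) that
// m_cats is publicly accessible (hypothetical guard):
#ifdef CATEGORIES_EXAMPLES
static void examplePrintLineage ( long catid ) {
	long currIndex = g_categories->getIndexFromId ( catid );
	while ( currIndex > 0 ) {
		long id = g_categories->m_cats[currIndex].m_catid;
		log("cat: lineage catid=%li badStart=%li",
		    id , (long)g_categories->isIdBadStart(id) );
		// climb to the parent
		currIndex = g_categories->getIndexFromId (
			g_categories->m_cats[currIndex].m_parentid );
	}
}
#endif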
// print cat information
void Categories::printCats ( long start, long end ) {
	for (long i = start; i < end; i++) {
		char str[512];
		char *s = str;
		s += sprintf(s, "Cat %li:\n", i);
		s += sprintf(s, "  CatID: %li\n", m_cats[i].m_catid);
		s += sprintf(s, "  Name: ");
		for (long n = m_cats[i].m_nameOffset;
		     n < m_cats[i].m_nameOffset + m_cats[i].m_nameLen; n++)
			s += sprintf(s, "%c", m_nameBuffer[n]);
		s += sprintf(s, "\n");
		s += sprintf(s, "  Name Offset: %li\n",
			     m_cats[i].m_nameOffset);
		s += sprintf(s, "  Structure Offset: %li\n",
			     m_cats[i].m_structureOffset);
		s += sprintf(s, "  Content Offset: %li\n",
			     m_cats[i].m_contentOffset);
		s += sprintf(s, "  Parent: %li\n", m_cats[i].m_parentid);
		s += sprintf(s, "\n");
		log ( LOG_INFO, "%s", str );
	}
}

void Categories::printPathFromId ( SafeBuf *sb , long catid,
				   bool raw, bool isRTL ) {
	// get the index
	long catIndex = getIndexFromId(catid);
	//if (catIndex < 1) return;
	printPathFromIndex(sb, catIndex, raw, isRTL);
}

void Categories::printPathFromIndex ( SafeBuf *sb , long catIndex,
				      bool raw, bool isRTL ) {
	long parentId;
	if (catIndex < 1) return;
	// get the parent
	parentId   = m_cats[catIndex].m_parentid;
	long catid = m_cats[catIndex].m_catid;
	// include Top now. in newer dmoz it is catid 2.
	//if ( catid == 2 ) {
	//	sb->safePrintf("Top");
	//	return;
	//}
	// . print the parent(s) first
	// . the new dmoz data dumps signify a parentless topic by
	//   having its parentid equal its catid, so avoid infinite
	//   loops by checking for that here now. mdw oct 2013.
	// . the new DMOZ has Top as catid 2 now, even though it is
	//   mistakenly labelled as Top/World, which is really catid 3.
	//   so make this parentId > 2...
	if (parentId >= 1 && parentId != catid ) {
		bool isParentRTL = isIdRTLStart(parentId);
		// print spacing here if RTL
		//if (isRTL && !raw)
		//	p += sprintf(p, " :");
		printPathFromId(sb, parentId, raw, isRTL);
		// print a spacing
		//if (!isRTL && !raw)
		//	p += sprintf(p, ": ");
		//else if (raw)
		//	p += sprintf(p, "/");
		if (!raw)
			sb->safePrintf(": ");
		else
			sb->safePrintf("/");
		// if the parent was the start of RTL, print a
		// right-to-left mark
if (isParentRTL && !raw) sb->safePrintf("
"); } // print this category name long nameLen = m_cats[catIndex].m_nameLen; long nameOffset = m_cats[catIndex].m_nameOffset; if (raw) { sb->safeMemcpy(&m_nameBuffer[nameOffset], nameLen); } else { // html encode the name char encodedName[2048]; char *encodeEnd = htmlEncode ( encodedName, encodedName + 2047, &m_nameBuffer[nameOffset], &m_nameBuffer[nameOffset] + nameLen ); nameLen = encodeEnd - encodedName; // fill it, replace _ with space for (long i = 0; i < nameLen; i++) { if (encodedName[i] == '_') sb->safePrintf(" "); else sb->safePrintf("%c", encodedName[i]); } } } void Categories::printPathCrumbFromId ( SafeBuf *sb , long catid, bool isRTL ) { long catIndex; // get the index catIndex = getIndexFromId(catid); //if (catIndex < 1) return; printPathCrumbFromIndex(sb, catIndex, isRTL); } void Categories::printPathCrumbFromIndex ( SafeBuf *sb, long catIndex, bool isRTL ) { long parentId; if (catIndex < 1) return; // get the parent parentId = m_cats[catIndex].m_parentid; long catid = m_cats[catIndex].m_catid; // include Top now. in newer dmoz it is catid2. // seems to already be included below... because you made it // parentId>1 not parentId>2 //if ( catid == 2 ) { // sb->safePrintf("Top"); // return; //} // . print the parent(s) first // . the new dmoz has Top has parentid 2 now, and Top/World is // catid 3. so make this parentId > 2 not parentId > 1 if (parentId > 1 && parentId != catid ) { bool isParentRTL = isIdRTLStart(parentId); printPathCrumbFromId(sb, parentId, isRTL); // print a spacing sb->safePrintf(": "); // if parent starts RTL,
if (isParentRTL && isRTL) sb->safePrintf("
"); } // print this category's link sb->safePrintf("safePrintf("/\">"); long nameLen = m_cats[catIndex].m_nameLen; long nameOffset = m_cats[catIndex].m_nameOffset; // fill it, replace _ with space { // html encode the name char encodedName[2048]; char *encodeEnd = htmlEncode ( encodedName, encodedName + 2047, &m_nameBuffer[nameOffset], &m_nameBuffer[nameOffset] + nameLen ); nameLen = encodeEnd - encodedName; for (long i = 0; i < nameLen; i++) { if (encodedName[i] == '_') sb->safePrintf(" "); else sb->safePrintf("%c", encodedName[i]); } } sb->safePrintf(""); } // increment the ptr into the file, possibly reading the next chunk char* Categories::incRdfPtr( long skip ) { long n; for (long i = 0; i < skip; i++) { m_rdfPtr++; m_currOffset++; // pull the next chunk if we're at the end if (m_rdfPtr == m_rdfEnd) { // if nothing left, return NULL //if (!m_rdfStream.good()) // return NULL; // get the next chunk //m_rdfStream.read(m_rdfBuffer, m_rdfBufferSize); //n = m_rdfStream.gcount(); n = read ( m_rdfStream, m_rdfBuffer, m_rdfBufferSize ); if ( n <= 0 || n > m_rdfBufferSize ) return NULL; m_rdfPtr = m_rdfBuffer; m_rdfEnd = &m_rdfBuffer[n]; } } return m_rdfPtr; } // parse the rdf file up past a given start tag long Categories::rdfParse ( char *tagName ) { bool inQuote = false; do { long matchPos = 0; // move to the next tag while (*m_rdfPtr != '<' || inQuote ) { // check for quotes if (*m_rdfPtr == '"') inQuote = !inQuote; // next char if (!incRdfPtr()) return -1; } // check if the tag is good do { if (!incRdfPtr()) return -1; if (*m_rdfPtr != tagName[matchPos]) break; matchPos++; } while (tagName[matchPos]); // matched if we're at the end of the tagName if (!tagName[matchPos]) { if (!incRdfPtr()) return -1; return 0; } // otherwise it's not a match, keep going matchPos = 0; } while (true); } // move to the next tag in the file long Categories::rdfNextTag ( ) { bool inQuote = false; // move to the next tag while (*m_rdfPtr != '<' || inQuote ) { // check for quotes if (*m_rdfPtr == '"') inQuote = !inQuote; // next char if (!incRdfPtr()) return -1; } // skip the < if (!incRdfPtr()) return -1; // put the tag name in a buffer m_tagLen = 0; while ( *m_rdfPtr != ' ' && *m_rdfPtr != '>' ) { // insert the current char if (m_tagLen < MAX_TAG_LEN) { m_tagRecfer[m_tagLen] = *m_rdfPtr; m_tagLen++; } // next char if (!incRdfPtr()) return -1; } m_tagRecfer[m_tagLen] = '\0'; // success return 0; } // fill the next quoted string into the buffer long Categories::fillNextString(char *str, long max) { // get the next string, skip to the next quote while (*m_rdfPtr != '"') { if (!incRdfPtr()) return -1; } // skip the quote if (!incRdfPtr()) return -1; // . pointing at the string now // dump it in the buffer long strLen = 0; while (*m_rdfPtr != '"') { // fill the next character if (strLen < max) { str[strLen] = *m_rdfPtr; strLen++; } if (!incRdfPtr()) return -1; } // step past the quote if (!incRdfPtr()) return -1; // return the length return strLen; } // fill the next tag body into the buffer long Categories::fillNextTagBody(char *str, long max) { // get the next string, skip to the next quote while (*m_rdfPtr != '>') { if (!incRdfPtr()) return -1; } // skip the > if (!incRdfPtr()) return -1; // . 
// fill the next tag body into the buffer
long Categories::fillNextTagBody(char *str, long max) {
	// skip to the end of the current tag
	while (*m_rdfPtr != '>') {
		if (!incRdfPtr())
			return -1;
	}
	// skip the >
	if (!incRdfPtr())
		return -1;
	// . pointing at the body now
	// . dump it into the buffer until the next tag opens
	long strLen = 0;
	while (*m_rdfPtr != '<') {
		// fill the next character
		if (strLen < max) {
			str[strLen] = *m_rdfPtr;
			strLen++;
		}
		if (!incRdfPtr())
			return -1;
	}
	// return the length
	return strLen;
}

// normalize a url in place: collapse doubled slashes, strip whitespace
// and drop any trailing /
long Categories::fixUrl ( char *url, long urlLen ) {
	// get past the first ://
	long slashi    = 0;
	long newUrlLen = urlLen;
	// too short to even hold "://"
	if (urlLen < 3)
		return urlLen;
	while (url[slashi]   != ':' ||
	       url[slashi+1] != '/' ||
	       url[slashi+2] != '/') {
		slashi++;
		if (slashi + 2 >= urlLen)
			return urlLen;
	}
	slashi += 3;
	// remove a www.
	/*
	if (newUrlLen - slashi >= 4 &&
	    strncasecmp(&url[slashi], "www.", 4) == 0) {
		memmove(&url[slashi], &url[slashi+4],
			newUrlLen - (slashi+4));
		newUrlLen -= 4;
	}
	*/
	// look for //, cut down to a single /
	for (; slashi < newUrlLen; slashi++) {
		if (url[slashi-1] == '/' && url[slashi] == '/') {
			memmove(&url[slashi-1], &url[slashi],
				newUrlLen - slashi);
			newUrlLen--;
		}
		if (is_wspace_a(url[slashi])) {
			memmove(&url[slashi], &url[slashi+1],
				newUrlLen - (slashi+1));
			newUrlLen--;
		}
	}
	// remove any trailing /
	if (url[newUrlLen-1] == '/')
		newUrlLen--;
	// return the new length
	return newUrlLen;
}

bool Categories::addUrlsToBadHashTable ( long catid ) {
	return getTitleAndSummary ( NULL  , // urlorig
				    0     , // urloriglen
				    catid ,
				    NULL  , // title
				    0     , // titleLen
				    0     , // maxTitleLen
				    NULL  , // summ
				    0     , // summLen
				    0     , // maxSummLen
				    NULL  , // anchor
				    0     , // anchorLen
				    0     , // maxAnchorLen
				    0     , // niceness
				    true  );// just add to table
}
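// Illustrative sketch, not part of the original file: fixUrl() rewrites
// the url in place and returns the new length, so a caller terminates
// the buffer itself (hypothetical guard and input):
#ifdef CATEGORIES_EXAMPLES
static void exampleFixUrl ( ) {
	char u[] = "http://example.com//a/b/ ";
	long n = g_categories->fixUrl ( u , gbstrlen(u) );
	u[n] = '\0';
	// doubled slashes after the scheme are collapsed, whitespace is
	// stripped and the trailing slash is dropped:
	// "http://example.com/a/b"
	log("cat: fixed url '%s'", u);
}
#endif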
// just show the urls in dmoz for a given topic
bool Categories::printUrlsInTopic ( SafeBuf *sb, long catid ) {
	long catIndex;
	unsigned long fileOffset;
	long n;
	char *p;
	unsigned long readSize;
	char title[1024];
	char summ[5000];
	long maxTitleLen = 1024;
	long maxSummLen  = 5000;
	long titleLen;
	long summLen;
	long urlStrLen;
	char urlStr[MAX_URL_LEN];
	long niceness = 0;
	bool printedStart = false;
	// lookup the index for this catid
	catIndex = getIndexFromId(catid);
	if (catIndex < 0)
		goto errEnd;
	// get the file offset
	fileOffset = m_cats[catIndex].m_contentOffset;
	QUICKPOLL( niceness );
	// open the file
	char filename[512];
	sprintf(filename, "%scatdb/%s", g_hostdb.m_dir, RDFCONTENT_FILE);
	m_rdfStream = open(filename, O_RDONLY | O_NONBLOCK);
	if ( m_rdfStream < 0 ) {
		log("cat: Error Opening %s\n", filename);
		goto errEnd;
	}
	// seek to the offset
	if ( lseek ( m_rdfStream, fileOffset, SEEK_SET ) !=
	     (off_t)fileOffset ) {
		log("cat: Error seeking to Content Offset %li", fileOffset);
		goto errEnd;
	}
	// read in a chunk
	m_rdfBuffer     = m_rdfSmallBuffer;
	m_rdfBufferSize = RDFSMALLBUFFER_SIZE;
	p        = m_rdfBuffer;
	readSize = m_rdfBufferSize;
 readLoop:
	// "n" must be signed here so the EAGAIN check below can actually
	// see a -1 from read()
	n = read ( m_rdfStream, p, readSize );
	if ( n > 0 && (unsigned long)n != readSize ) {
		p        += n;
		readSize -= n;
	}
	//log(LOG_WARN,"build: reading %li bytes out of %li",
	//    n,m_rdfBufferSize);
	QUICKPOLL(niceness);
	if ( n < 0 && errno == EAGAIN )
		goto readLoop;
	if ( n <= 0 || n > m_rdfBufferSize ) {
		log("cat: Error Reading Content");
		goto errEnd;
	}
	m_rdfPtr     = m_rdfBuffer;
	m_rdfEnd     = &m_rdfBuffer[n];
	m_currOffset = fileOffset;
	// . parse to the correct url
	// . parse the first topic and catid
	if (rdfNextTag() < 0)
		goto errEnd;
	if (rdfNextTag() < 0)
		goto errEnd;
	// parse until "ExternalPage"
 nextTag:
	QUICKPOLL((niceness));
	if (rdfNextTag() < 0)
		goto errEnd;
	// check for the catid of the next topic to stop looking
	if (m_tagLen == 5 && strncmp(m_tagRecfer, "catid", 5) == 0)
		goto errEnd;
	if (m_tagLen != 12 )
		goto nextTag;
	if ( strncmp(m_tagRecfer, "ExternalPage", 12) != 0)
		goto nextTag;
	//
	// got one
	//
	// get the url from the quoted about="..." attribute
	urlStrLen = fillNextString(urlStr, MAX_URL_LEN-1);
	if (urlStrLen < 0)
		goto errEnd;
	// html decode the url
	/*
	urlStrLen = htmlDecode(decodedUrl, urlStr, urlStrLen,false,
			       niceness);
	memcpy(urlStr, decodedUrl, urlStrLen);
	normUrl.set(urlStr, urlStrLen, true);
	g_catdb.normalizeUrl(&normUrl, &normUrl);
	// copy it back
	urlStrLen = normUrl.getUrlLen();
	memcpy(urlStr, normUrl.getUrl(), urlStrLen);
	// make sure there's a trailing / on root urls and no www.
	//urlStrLen = fixUrl(urlStr, urlStrLen);
	// check for an anchor
	urlAnchor = NULL;
	urlAnchorLen = 0;
	//for (long i = 0; i < urlStrLen; i++) {
	//if (urlStr[i] == '#') {
	if (normUrl.getAnchorLen() > 0) {
		//urlAnchor = &urlStr[i];
		//urlAnchorLen = urlStrLen - i;
		//urlStrLen = i;
		urlAnchor = normUrl.getAnchor();
		urlAnchorLen = normUrl.getAnchorLen();
		//break;
	}
	*/
	// parse out the title
	if (rdfParse("d:Title") < 0)
		goto errEnd;
	titleLen = fillNextTagBody(title, maxTitleLen);
	QUICKPOLL(niceness);
	// parse out the summary
	if (rdfParse("d:Description") < 0)
		goto errEnd;
	summLen = fillNextTagBody(summ, maxSummLen);
	if ( ! printedStart ) {
		printedStart = true;
		// (the markup literals from here through errEnd were
		// lost in extraction; a plain <ul>/<li> listing is
		// assumed)
		sb->safePrintf("<ul>");
	}
	// print this url with its title and summary, then look for the
	// next ExternalPage
	sb->safePrintf("<li><a href=\"");
	sb->safeMemcpy(urlStr, urlStrLen);
	sb->safePrintf("\">");
	sb->safeMemcpy(title, titleLen);
	sb->safePrintf("</a> - ");
	sb->safeMemcpy(summ, summLen);
	sb->safePrintf("</li>\n");
	goto nextTag;
 errEnd:
	if ( printedStart )
		sb->safePrintf("</ul>");
	close(m_rdfStream);
	return false;
}

// . get the title and summary for a specific url and catid
bool Categories::getTitleAndSummary ( char *urlOrig,
				      long  urlOrigLen,
				      long  catid,
				      char *title,
				      long *titleLen,
				      long  maxTitleLen,
				      char *summ,
				      long *summLen,
				      long  maxSummLen,
				      char *anchor,
				      unsigned char *anchorLen,
				      long  maxAnchorLen ,
				      long  niceness ,
				      bool  justAddToTable ) {
	long catIndex;
	unsigned long fileOffset;
	long n;
	char url[MAX_URL_LEN];
	long urlLen;
	char urlStr[MAX_URL_LEN];
	long urlStrLen = 0;
	char decodedUrl[MAX_URL_LEN];
	char *urlAnchor    = NULL;
	long  urlAnchorLen = 0;
	Url normUrl;
	char *p;
	unsigned long readSize;
	// fix the original url
	//memcpy(url, urlOrig, urlOrigLen);
	//urlLen = fixUrl(url, urlOrigLen);
	normUrl.set(urlOrig, urlOrigLen, true);
	g_catdb.normalizeUrl(&normUrl, &normUrl);
	memcpy(url, normUrl.getUrl(), normUrl.getUrlLen());
	urlLen = normUrl.getUrlLen();
	// lookup the index for this catid
	catIndex = getIndexFromId(catid);
	if (catIndex < 0)
		goto errEnd;
	// get the file offset
	fileOffset = m_cats[catIndex].m_contentOffset;
	QUICKPOLL( niceness );
	// open the file
	char filename[512];
	sprintf(filename, "%scatdb/%s", g_hostdb.m_dir, RDFCONTENT_FILE);
	//m_rdfStream.clear();
	//m_rdfStream.open(filename, ifstream::in);
	m_rdfStream = open(filename, O_RDONLY | O_NONBLOCK);
	//if (!m_rdfStream.is_open()) {
	if ( m_rdfStream < 0 ) {
		log("cat: Error Opening %s\n", filename);
		goto errEnd;
	}
	// seek to the offset
	//m_rdfStream.seekg(fileOffset, ios::beg);
	//if (!m_rdfStream.good()) {
	if ( lseek ( m_rdfStream, fileOffset, SEEK_SET ) !=
	     (off_t)fileOffset ) {
		log("cat: Error seeking to Content Offset %li", fileOffset);
		goto errEnd;
	}
	// read in a chunk
	m_rdfBuffer     = m_rdfSmallBuffer;
	m_rdfBufferSize = RDFSMALLBUFFER_SIZE;
	//m_rdfStream.read(m_rdfBuffer, m_rdfBufferSize);
	//n = m_rdfStream.gcount();
	p        = m_rdfBuffer;
	readSize = m_rdfBufferSize;
 readLoop:
	// "n" is signed so the EAGAIN check below can see a -1 from read()
	n = read ( m_rdfStream, p, readSize );
	if ( n > 0 && (unsigned long)n != readSize ) {
		p        += n;
		readSize -= n;
	}
	//log(LOG_WARN,"build: reading %li bytes out of %li",
	//    n,m_rdfBufferSize);
	QUICKPOLL(niceness);
	if ( n < 0 && errno == EAGAIN )
		goto readLoop;
	if ( n <= 0 || n > m_rdfBufferSize ) {
		log("cat: Error Reading Content");
		goto errEnd;
	}
	m_rdfPtr     = m_rdfBuffer;
	m_rdfEnd     = &m_rdfBuffer[n];
	m_currOffset = fileOffset;
	// . parse to the correct url
	// . parse the first topic and catid
	if (rdfNextTag() < 0)
		goto errEnd;
	if (rdfNextTag() < 0)
		goto errEnd;
	// parse until "ExternalPage" with the correct url, or "Topic"
 nextTag:
	QUICKPOLL((niceness));
	if (rdfNextTag() < 0)
		goto errEnd;
	// check for the catid of the next topic to stop looking
	if (m_tagLen == 5 && strncmp(m_tagRecfer, "catid", 5) == 0)
		goto errEnd;
	if (m_tagLen == 12 &&
	    strncmp(m_tagRecfer, "ExternalPage", 12) == 0) {
		// get the url from the quoted about="..." attribute
		urlStrLen = fillNextString(urlStr, MAX_URL_LEN-1);
		if (urlStrLen < 0)
			goto errEnd;
		// html decode the url
		urlStrLen = htmlDecode(decodedUrl, urlStr, urlStrLen,
				       false, niceness);
		memcpy(urlStr, decodedUrl, urlStrLen);
		// normalize with Url
		//normUrl.set(urlStr, urlStrLen, false, false, false, true);
		normUrl.set(urlStr, urlStrLen, true);
		g_catdb.normalizeUrl(&normUrl, &normUrl);
		// if we just want the hashes of all the urls, add them
		if ( justAddToTable ) {
			// but skip if not a root url... because
			// LinkText::isBadCatUrl() only checks roots...
			if ( ! normUrl.isRoot() ) goto nextTag;
			unsigned long h = hash32 ( normUrl.getUrl() ,
						   normUrl.getUrlLen() );
			m_badTable.addKey ( h , 1 );
			goto nextTag;
		}
		// copy it back
		urlStrLen = normUrl.getUrlLen();
		memcpy(urlStr, normUrl.getUrl(), urlStrLen);
		// make sure there's a trailing / on root urls and no www.
		//urlStrLen = fixUrl(urlStr, urlStrLen);
		// check for an anchor
		urlAnchor    = NULL;
		urlAnchorLen = 0;
		//for (long i = 0; i < urlStrLen; i++) {
		//if (urlStr[i] == '#') {
		if (normUrl.getAnchorLen() > 0) {
			//urlAnchor = &urlStr[i];
			//urlAnchorLen = urlStrLen - i;
			//urlStrLen = i;
			urlAnchor    = normUrl.getAnchor();
			urlAnchorLen = normUrl.getAnchorLen();
			//break;
		}
		//}
		//urlStr[urlStrLen] = '\0';
		// check against the url
		if (urlStrLen == urlLen &&
		    strncasecmp(url, urlStr, urlLen) == 0)
			goto foundTag;
	}
	// miss, goto the next tag
	goto nextTag;
 foundTag:
	// parse out the title
	if (rdfParse("d:Title") < 0)
		goto errEnd;
	if (title && titleLen)
		*titleLen = fillNextTagBody(title, maxTitleLen);
	QUICKPOLL(niceness);
	// parse out the summary
	if (rdfParse("d:Description") < 0)
		goto errEnd;
	if (summ && summLen)
		*summLen = fillNextTagBody(summ, maxSummLen);
	// fill the anchor
	if (anchor) {
		if (urlAnchor) {
			if (urlAnchorLen > maxAnchorLen)
				urlAnchorLen = maxAnchorLen;
			memcpy(anchor, urlAnchor, urlAnchorLen);
			*anchorLen = urlAnchorLen;
		}
		else
			*anchorLen = 0;
	}
	// close the file
	//m_rdfStream.clear();
	//m_rdfStream.close();
	close(m_rdfStream);
	return true;
 errEnd:
	if (titleLen) *titleLen  = 0;
	if (summLen)  *summLen   = 0;
	if (anchor)   *anchorLen = 0;
	//m_rdfStream.close();
	//m_rdfStream.clear();
	close(m_rdfStream);
	return false;
}
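// Illustrative sketch, not part of the original file: getTitleAndSummary()
// scans the content dump from the topic's contentOffset until the
// ExternalPage whose normalized url matches. A typical call looks like
// this (hypothetical guard; buffer sizes are arbitrary):
#ifdef CATEGORIES_EXAMPLES
static void exampleTitleSummary ( char *url , long urlLen , long catid ) {
	char title[1024];
	char summ[4096];
	long titleLen = 0 , summLen = 0;
	unsigned char anchorLen = 0;
	if ( g_categories->getTitleAndSummary ( url , urlLen , catid ,
						title , &titleLen , 1024 ,
						summ  , &summLen  , 4096 ,
						NULL  , &anchorLen , 0 ,
						0     ,   // niceness
						false ) ) // justAddToTable
		log("cat: title=%.*s", (int)titleLen , title );
}
#endif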
// . generate the sub categories for a given catid
// . store the list of SubCategories into "subCatBuf"; return # stored
long Categories::generateSubCats ( long catid, SafeBuf *subCatBuf
				   //SubCategory *subCats,
				   //char **catBuffer,
				   //long *catBufferSize,
				   //long *catBufferLen,
				   //bool allowRealloc
				   ) {
	long catIndex;
	unsigned long fileOffset;
	long n;
	long numSubCats = 0;
	long currType;
	char catStr[MAX_CATNAME_LEN];
	long catStrLen;
	long prefixStart;
	long prefixLen;
	long nameStart;
	long nameLen;
	long need ;
	SubCategory *cat;
	char *p ;
	//long catp = 0;
	//long catBufferInc = *catBufferSize;
	// . lookup the index for this catid
	// . binary step, guessing to approximate the place, then
	//   scanning from there
	catIndex = getIndexFromId(catid);
	if (catIndex < 0)
		goto errEnd;
	// get the file offset
	fileOffset = m_cats[catIndex].m_structureOffset;
	// open the structure file, catdb/structure.rdf.u8, in utf8
	char filename[512];
	sprintf(filename, "%scatdb/%s", g_hostdb.m_dir, RDFSTRUCTURE_FILE);
	//m_rdfStream.clear();
	//m_rdfStream.open(filename, ifstream::in);
	m_rdfStream = open(filename, O_RDONLY);
	//if (!m_rdfStream.is_open()) {
	if ( m_rdfStream < 0 ) {
		log("cat: Error Opening %s\n", filename);
		goto errEnd;
	}
	// seek to the offset
	//m_rdfStream.seekg(fileOffset, ios::beg);
	//if (!m_rdfStream.good()) {
	if ( lseek ( m_rdfStream, fileOffset, SEEK_SET ) !=
	     (off_t)fileOffset ) {
		log("cat: Error seeking to Structure Offset %li",
		    fileOffset);
		goto errEnd;
	}
	// read in a chunk
	m_rdfBuffer     = m_rdfSmallBuffer;
	m_rdfBufferSize = RDFSMALLBUFFER_SIZE;
	//m_rdfStream.read(m_rdfBuffer, m_rdfBufferSize);
	//n = m_rdfStream.gcount();
	n = read ( m_rdfStream, m_rdfBuffer, m_rdfBufferSize );
	if ( n <= 0 || n > m_rdfBufferSize ) {
		log("cat: Error Reading Structure Offset");
		goto errEnd;
	}
	// point to the buffer we just read with m_rdfPtr
	m_rdfPtr     = m_rdfBuffer;
	m_rdfEnd     = &m_rdfBuffer[n];
	m_currOffset = fileOffset;
	// parse tags for the sub categories, or until we hit /Topic
 nextTag:
	// . this increments m_rdfPtr until it points to the beginning of
	//   a tag; it may end up reading another chunk from disk
	// . it memcopies the name of the tag it points to into
	//   m_tagRecfer
	if (rdfNextTag() < 0)
		goto gotSubCats;
	// check for /Topic
	if      (m_tagLen == 6 &&
		 strncmp(m_tagRecfer, "/Topic", 6) == 0)
		goto gotSubCats;
	else if (m_tagLen == 7 &&
		 strncmp(m_tagRecfer, "altlang", 7) == 0)
		currType = SUBCAT_ALTLANG;
	else if (m_tagLen == 7 &&
		 strncmp(m_tagRecfer, "related", 7) == 0)
		currType = SUBCAT_RELATED;
	else if (m_tagLen == 8 &&
		 strncmp(m_tagRecfer, "symbolic", 8) == 0)
		currType = SUBCAT_SYMBOLIC;
	else if (m_tagLen == 6 &&
		 strncmp(m_tagRecfer, "narrow", 6) == 0)
		currType = SUBCAT_NARROW;
	else if (m_tagLen == 9 &&
		 strncmp(m_tagRecfer, "symbolic1", 9) == 0)
		currType = SUBCAT_SYMBOLIC1;
	else if (m_tagLen == 7 &&
		 strncmp(m_tagRecfer, "narrow1", 7) == 0)
		currType = SUBCAT_NARROW1;
	else if (m_tagLen == 9 &&
		 strncmp(m_tagRecfer, "symbolic2", 9) == 0)
		currType = SUBCAT_SYMBOLIC2;
	else if (m_tagLen == 7 &&
		 strncmp(m_tagRecfer, "narrow2", 7) == 0)
		currType = SUBCAT_NARROW2;
	else if (m_tagLen == 9 &&
		 strncmp(m_tagRecfer, "letterbar", 9) == 0)
		currType = SUBCAT_LETTERBAR;
	else
		goto nextTag;
	// read the name for this category
	catStrLen = fillNextString(catStr, MAX_CATNAME_LEN-1);
	if (catStrLen < 0)
		goto gotSubCats;
	// html decode it first
	char htmlDecoded[MAX_HTTP_FILENAME_LEN*2];
	if (catStrLen > MAX_HTTP_FILENAME_LEN*2)
		catStrLen = MAX_HTTP_FILENAME_LEN*2;
	catStrLen = htmlDecode ( htmlDecoded, catStr, catStrLen , false, 0);
	memcpy(catStr, htmlDecoded, catStrLen);
	// reset this offset
	nameStart = 0;
	nameLen   = catStrLen;
	// get the prefix and name position/length
	switch (currType) {
	case SUBCAT_ALTLANG:
	case SUBCAT_SYMBOLIC:
	case SUBCAT_SYMBOLIC1:
	case SUBCAT_SYMBOLIC2:
		// the prefix is at the start
		prefixStart = 0;
		prefixLen   = 0;
		//nameStart = 0;
		// go to the end of the prefix
		while (catStr[nameStart] != ':') {
			nameStart++;
			prefixLen++;
		}
		// skip the : in :Top/
		nameStart += 1;
		nameLen = catStrLen - nameStart;
		break;
	case SUBCAT_LETTERBAR:
		// the prefix is the very last letter
		prefixStart = catStrLen - 1;
		prefixLen   = 1;
		// skip the Top/ for the name
		//nameStart = 4;
		// lose the Top/, keep the end letter
		//nameLen = catStrLen - 4;
		break;
	// . don't do this because of rtl?
	//case SUBCAT_RELATED:
	//	// prefix the entire path, minus Top
	//	prefixStart = 4;
	//	prefixLen = catStrLen - 4;
	//	// name skips Top/
	//	nameStart = 4;
	//	nameLen = catStrLen - 4;
	//	break;
	default:
		// the prefix is the last folder
		prefixStart = catStrLen;
		prefixLen   = 0;
		while (catStr[prefixStart-1] != '/' && prefixStart > 0) {
			prefixStart--;
			prefixLen++;
		}
		// name skips Top/ ... no! we include Top now because
		// PageResults.cpp calls
		// currIndex=g_categories->getIndexFromPath(catName,catNameLen)
		// on this name, and it needs "Top/" because that was part
		// of the hash of the full name for the category, and we
		// look the Category record up by that hash in
		// getIndexFromPath().
		//nameStart = 4;
		//nameLen = catStrLen - 4;
		break;
	}
	// . fill the next sub category
	// . fill the prefix and name in the buffer and subcat
	need = sizeof(SubCategory) + prefixLen + 1 + nameLen + 1;
	// reserve space in the safebuf for it
	if ( ! subCatBuf->reserve(need) )
		goto errEnd;
	// point to it in the safebuf
	cat = (SubCategory *)(subCatBuf->getBuf());
	cat->m_prefixLen = prefixLen;
	cat->m_nameLen   = nameLen;
	cat->m_type      = currType;
	p = cat->m_buf;
	memcpy ( p , catStr + prefixStart , prefixLen );
	p += prefixLen;
	*p++ = '\0';
	memcpy ( p , catStr + nameStart , nameLen );
	p += nameLen;
	*p++ = '\0';
	// update the safebuf length
	subCatBuf->incrementLength ( cat->getRecSize() );
	/*
	subCats[numSubCats].m_prefixOffset = catp;
	subCats[numSubCats].m_prefixLen = prefixLen;
	if (prefixLen > 0) {
		memcpy(&((*catBuffer)[catp]), &catStr[prefixStart],
		       prefixLen);
		catp += prefixLen;
	}
	subCats[numSubCats].m_nameOffset = catBuf->length();//catp;
	subCats[numSubCats].m_nameLen = nameLen;
	if (nameLen > 0) {
		memcpy(&((*catBuffer)[catp]), &catStr[nameStart], nameLen);
		catp += nameLen;
	}
	subCats[numSubCats].m_type = currType;
	*/
	// next sub cat
	numSubCats++;
	if (numSubCats >= MAX_SUB_CATS) {
		log ( LOG_WARN, "categories: Attempted to load too many"
		      " sub-categories, truncating." );
		goto gotSubCats;
	}
	// next tag
	goto nextTag;
 gotSubCats:
	//*catBufferLen = catp;
	//m_rdfStream.close();
	//m_rdfStream.clear();
	close(m_rdfStream);
	return numSubCats;
 errEnd:
	//*catBufferLen = 0;
	//m_rdfStream.close();
	//m_rdfStream.clear();
	close(m_rdfStream);
	return 0;
}

// creates a directory search request url
//void Categories::createDirectorySearchUrl ( Url *url,
long Categories::createDirSearchRequest ( char *requestBuf,
					  long  requestBufSize,
					  long  catid,
					  char *hostname,
					  long  hostnameLen,
					  char *coll,
					  long  collLen,
					  char *cgi,
					  long  cgiLen,
					  bool  cgiFromRequest ,
					  HttpRequest *r ) {
	// setup the request Url
	//char buffer[1024+MAX_COLL_LEN];
	//long bufferLen;
	//char *p = buffer;
	char *p = requestBuf;
	//char *pend = buffer + 1024+MAX_COLL_LEN;
	char *pend = requestBuf + requestBufSize;
	if ( p + (hostnameLen + collLen + 128 ) >= pend )
		return 0;
	// GET
	//p += sprintf(p, "GET ");
	// dammit, keep the ZET if that's what we had; that's how we know
	// the sender requires a compressed reply (qcproxy = query
	// compression proxy)
	char *cmd = "GET";
	char *rrr = r->m_reqBuf.getBufStart();
	if ( rrr && rrr[0] == 'Z' ) cmd = "ZET";
	// request
	//p += sprintf(p, "%s /search?dir=%li&dr=0&sc=0&sdir=%li&sdirt=0&c=",
	//	       cmd, catid, catid);
	p += sprintf(p, "%s /search?q=gbcatid%%3A%li&dir=%li&dr=0&sc=0&c=" ,
		     cmd , catid , catid);
	// coll
	memcpy(p, coll, collLen);
	p += collLen;
	// add extra cgi if we have it and have room
	if ( cgi && cgiLen > 0 && p + cgiLen + 76 < pend ) {
		// if it's from the request, we need to add &'s and ='s
		if ( cgiFromRequest ) {
			//p += sprintf(p, "&");
			*p = '&';
			p++;
			bool ampToggle = false;
			//for (long i = cgiPos; i < cgiPos + cgiLen; i++) {
			//if ( p + 10 >= pend ) break;
			for (long i = 0; i < cgiLen; i++) {
				//*p = decodedPath[i];
				*p = cgi[i];
				if (*p == '\0') {
					if (ampToggle)
						*p = '&';
					else
						*p = '=';
					ampToggle = !ampToggle;
				}
				p++;
			}
		}
		else {
			memcpy(p, cgi, cgiLen);
			p += cgiLen;
		}
	}
	// http version and hostname (a Host header takes a bare
	// hostname, no scheme)
	p += sprintf(p, " HTTP/1.0\r\nHost: ");
	memcpy(p, hostname, hostnameLen);
	p += hostnameLen;
	// rest of the request
	p += sprintf(p, "\r\n"
		     "Accept-Language: en\r\n"
		     "Accept: text/html\r\n\r\n" );
	//buffer[p - buffer] = '\0';
	// set the Url
	//url->set(buffer, p - buffer);
	return p - requestBuf;
}
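// Illustrative sketch, not part of the original file: the request built
// above looks roughly like
//
//	GET /search?q=gbcatid%3A17&dir=17&dr=0&sc=0&c=main HTTP/1.0
//	Host: host0.example.com
//	Accept-Language: en
//	Accept: text/html
//
// where "main" and the hostname are hypothetical. A caller would do
// (hypothetical guard):
#ifdef CATEGORIES_EXAMPLES
static long exampleBuildDirRequest ( char *buf , long bufSize ,
				     HttpRequest *r ) {
	return g_categories->createDirSearchRequest (
		buf , bufSize ,
		17 ,                                 // catid (Top/Adult)
		(char *)"host0.example.com" , 17 ,   // hostname
		(char *)"main" , 4 ,                 // collection
		NULL , 0 , false ,                   // no extra cgi
		r );
}
#endif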
log(LOG_INFO, "cat: could not open content file.\n"); return(false); } while(!feof(content) && fgets(line, 10239, content)) { lineno++; if(lineno % 1000000 == 0) log(LOG_INFO, "cat: Parsing line %ld\n", lineno); if(!strncmp(line, "", 14)) { h = 0L; // end tag, clear hash continue; } if(!strncmp(line, "Top/World/", 18)) { for(register int i = 2; i <= langTagalog; i++) { if(!memcmp(line + 19, langToTopic[i], gbstrlen((char *)langToTopic[i]))) { langTables[i].addKey(h, 1); entries++; h = 0; // paranoia, clear hash } } } } log(LOG_INFO, "cat: Added %ld total entries.\n", entries); fclose(content); // Save all the tables for later for(register int i = 2; i <= langTagalog; i++) { sprintf(line, "catlang%03d.dat", i); langTables[i].save(g_hostdb.m_dir, line); if(langTables[i].getNumSlotsUsed() <= 0 ) { log(LOG_INFO, "cat: Don't seem to have any data in table %d\n", i); } } return(true); } bool Categories::initLangTables(void) { char name[512]; register int i; // long long memory = g_mem.m_used; unsigned long long start; unsigned long long stop; for(i = 2; i <= MAX_LANGUAGES; i++) { // There is no language 5! if(i == 5) continue; /* langTables[i] = (HashTable *) mmalloc(sizeof(HashTable), "LangHashTable"); if(!langTables[i]) { log(LOG_INFO, "cat: Could not allocate memory for category language tables.\n"); return(false); } */ langTables[i].set(10); // paranoia snprintf(name, 511, "lang%03d.dat", i); langTables[i].load(g_hostdb.m_dir, name); } // check for any empty tables for(i = 2; i <= langTagalog; i++) { // There is no language 5! if(i == 5) continue; if(langTables[i].getNumSlotsUsed() <= 0 ) { log(LOG_INFO, "cat: Starting language load.\n"); start = gettimeofdayInMicroseconds(); loadLangTables(); stop = gettimeofdayInMicroseconds(); log(LOG_INFO, "cat: Parsing content took %lld microseconds\n", stop - start); break; } } return(true); } uint8_t Categories::findLanguage(char *addr) { unsigned long h; char *cp = addr; if(!strncmp(cp, "http://", 7)) cp += 7; h = hash32(cp, gbstrlen(cp)); for(register int i = 2; i <= langTagalog; i++) { if(i == 5) continue; // There is no language 5! if(langTables[i].getNumSlotsUsed() > 0 && langTables[i].getSlot(h) >= 0) return((uint8_t)i); } return(0); }