//
// Gigablast, Copyright March 2005
// Author: Javier Olivares <jolivares@gigablast.com>
//
// DMOZ RDF file parser into proprietary format
// See the "usage" note in the main function for usage and features.
// I apologize to anyone who must maintain or even simply read this code.
//
#include "gb-include.h"
#include <iostream>
#include <fstream>
#include "Url.h"
#include "HttpRequest.h"
#include "sort.h"
#undef malloc
#undef calloc
#undef realloc
bool closeAll ( void *state , void (* callback)(void *state) ) { return true; }
bool allExit ( ) { return true; }
bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {return true;}
//int32_t g_qbufNeedSave = false;
//SafeBuf g_qbuf;
bool g_recoveryMode;
int32_t g_recoveryLevel;
int g_inMemcpy;
#define RDFBUFFER_SIZE (1024*1024*10)
#define RDFSTRUCTURE_FILE "structure.rdf.u8"
#define RDFCONTENT_FILE "content.rdf.u8"
#define STRUCTURE_OUTPUT_FILE "gbdmoz.structure.dat"
#define CONTENT_OUTPUT_FILE "gbdmoz.content.dat"
#define URL_OUTPUT_FILE "gbdmoz.urls.dat"
#define URLTEXT_OUTPUT_FILE "gbdmoz.urls.txt"
#define DIFFURLTEXT_OUTPUT_FILE "gbdmoz.diffurls.txt"
#define CATEGORY_OUTPUT_FILE "gbdmoz.categories.txt"
#define NAME_BUFFER_SIZE 24*1024*1024
#define CAT_BUFFER_SIZE 256*1024
#define URL_BUFFER_SIZE 32*1024*1024
#define URLINFO_BUFFER_SIZE 1024*1024
#define MAX_CATID_LEN 63
#define MAX_TAG_LEN 127
#define MAX_URL_CATIDS 32
#define MAX_URLTXT_SIZE 500000
#define HASHTABLE_SIZE (1024*1024)
#define URLHASHTABLE_SIZE (10*1024*1024)
#define MODE_NONE 0
#define MODE_NEW 1
#define MODE_UPDATE 2
#define MODE_URLDUMP 3
#define MODE_DIFFURLDUMP 4
#define MODE_CATDUMP 5
#define OLDURL_BUFFER_SIZE (32*1024*1024)
#define OLDCATID_BUFFER_SIZE (1024*1024)
using namespace std;
// struct for a link list hash table
struct HashLink {
int32_t m_keyOffset;
int32_t m_keyLen;
int32_t m_data;
HashLink *m_next;
};
// another hash, for urls
struct UrlHashLink {
uint64_t m_key;
//uint32_t m_key2;
//int32_t m_urlOffset;
//int32_t m_urlLen;
int32_t m_index;
UrlHashLink *m_next;
};
// structure to store url info
struct UrlInfo {
//uint64_t m_hash;
//int16_t m_urlLen;
//int32_t m_urlOffset;
unsigned char m_numCatids;
//int32_t m_catids[MAX_URL_CATIDS];
int32_t *m_catids;
char m_changed;
};
// struct for storing categories and their related info
struct RdfCat {
int32_t m_catid;
int32_t m_parentid;
//int16_t m_numSymParents;
//int32_t *m_symParents;
int32_t m_nameOffset;
int16_t m_nameLen;
uint32_t m_structureOffset;
uint32_t m_contentOffset;
uint32_t m_catHash;
int32_t m_numUrls;
};
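//
// on-disk layouts, as implied by the read/write code below
// (a sketch, not a formal spec; all fields are written raw,
// in host byte order):
//
// gbdmoz.structure.dat:
//   int32_t  size of the truncated name buffer
//   int32_t  number of categories (numRdfCats)
//   char[]   truncated category names, back to back
//   for each category:
//     int32_t m_catid, int32_t m_parentid, int32_t m_nameOffset,
//     int16_t m_nameLen, int32_t m_structureOffset,
//     int32_t m_contentOffset, int32_t m_numUrls
//   for each category:
//     int32_t m_catHash
//
// gbdmoz.content.dat:
//   int32_t  number of urls (numUrlInfos)
//   for each url:
//     int16_t urlLen, char url[urlLen]
//   for each url:
//     unsigned char numCatids, int32_t catids[numCatids]
//
// gbdmoz.content.dat.new.diff:
//   int32_t  number of urls to update/add
//   int32_t  number of urls to remove
//   int32_t  index of each url to update/add
//   for each url to remove:
//     int16_t urlLen, char url[urlLen]
//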
// hash tables
HashLink *hashTable[HASHTABLE_SIZE];
UrlHashLink *urlHashTable[URLHASHTABLE_SIZE];
// url buffer
char *urlBuffer = NULL;
int32_t urlBufferSize = 0;
int32_t urlBufferLen = 0;
// url info array
UrlInfo *urlInfos = NULL;
int32_t urlInfosSize = 0;
int32_t numUrlInfos = 0;
// categories
RdfCat *rdfCats = NULL;
int32_t rdfCatsSize = 0;
int32_t numRdfCats = 0;
// rdf file stream
//ifstream rdfStream;
int rdfStream;
char *rdfBuffer = NULL;
char *rdfPtr = NULL;
char *rdfEnd = NULL;
// output file stream for serialization
//ofstream outStream;
//ofstream outStream2;
int outStream;
int outStream2;
// offset into the file
uint32_t currOffset = 0;
// cat name buffer
char *nameBuffer = NULL;
int32_t nameBufferSize = 0;
int32_t nameBufferLen = 0;
// catid buffer
char catidBuffer[MAX_CATID_LEN+1];
int32_t catidLen = 0;
// tag buffer
char tagRecfer[MAX_TAG_LEN+1];
int32_t tagLen = 0;
bool mainShutdown ( bool urgent ) { return true; }
// increment the ptr into the file, possibly reading the next chunk
char* incRdfPtr( int32_t skip = 1 ) {
int32_t n;
for (int32_t i = 0; i < skip; i++) {
rdfPtr++;
currOffset++;
// pull the next chunk if we're at the end
if (rdfPtr >= rdfEnd) {
// if nothing left, return NULL
//if (!rdfStream.good())
// return NULL;
// get the next chunk
//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
//n = rdfStream.gcount();
n = read(rdfStream, rdfBuffer, RDFBUFFER_SIZE);
if ( n <= 0 || n > RDFBUFFER_SIZE )
return NULL;
rdfPtr = rdfBuffer;
rdfEnd = &rdfBuffer[n];
}
}
return rdfPtr;
}
// parse the rdf file up past a given start tag
int32_t rdfParse ( char *tagName ) {
//bool inQuote = false;
do {
int32_t matchPos = 0;
// move to the next tag
// . quotes are no longer escaped out in the newer
// dmoz files as of oct 2013... so that logic was taken
// out. i do think '<' is encoded as &lt; though...
// perhaps only check for quotes when inside a tag?
while (*rdfPtr != '<' ) { // || inQuote ) {
// check for quotes
//if (*rdfPtr == '"')
// inQuote = !inQuote;
// next char
if (!incRdfPtr())
return -1;
}
// check if the tag is good
do {
if (!incRdfPtr())
return -1;
if (*rdfPtr != tagName[matchPos])
break;
matchPos++;
} while (tagName[matchPos]);
// matched if we're at the end of the tagName
if (!tagName[matchPos]) {
if (!incRdfPtr())
return -1;
return 0;
}
// otherwise it's not a match, keep going
matchPos = 0;
} while (true);
}
// move to the next tag in the file
int32_t rdfNextTag ( ) {
//bool inQuote = false;
// move to the next tag
while (*rdfPtr != '<' ) { // || inQuote ) {
// check for quotes
// NO! too many unbalanced quotes all over the place!
// and i think quotes in tags do not have < or > in them
// because they should be encoded as &gt; and &lt;
//if (*rdfPtr == '"')
// inQuote = !inQuote;
// next char
if (!incRdfPtr())
return -1;
}
// skip the <
if (!incRdfPtr())
return -1;
// put the tag name in a buffer
tagLen = 0;
while ( *rdfPtr != ' ' &&
*rdfPtr != '>' ) {
// insert the current char
if (tagLen < MAX_TAG_LEN) {
tagRecfer[tagLen] = *rdfPtr;
tagLen++;
}
// next char
if (!incRdfPtr())
return -1;
}
tagRecfer[tagLen] = '\0';
// success
return 0;
}
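// example (a sketch): with rdfPtr anywhere before
// <Topic r:id="Top/Arts">, one call to rdfNextTag() skips to
// the '<' and leaves tagRecfer = "Topic", tagLen = 5, with
// rdfPtr on the space before the attributes; the attribute
// values themselves are pulled later by fillNextString() or
// fillNextUrl().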
// compare two cats, for gbsort
int catcomp ( const void *c1, const void *c2 ) {
return (((RdfCat*)c1)->m_catid - ((RdfCat*)c2)->m_catid);
}
// hash a string
uint32_t catHash ( char *key, int32_t keyLen ) {
// simple hash
uint32_t hash = 0;
for (int32_t i = 0; i < keyLen; i++)
hash ^= key[i]*i;
return (hash % HASHTABLE_SIZE);
}
// NOTE: these hash functions assume the name buffer
// and key offset are preserved throughout the
// use of the hash
// init the hash table
void initHashTable ( ) {
for (int32_t i = 0; i < HASHTABLE_SIZE; i++)
hashTable[i] = NULL;
}
// clear the hash table
void clearHashTable ( ) {
for (int32_t i = 0; i < HASHTABLE_SIZE; i++) {
while (hashTable[i]) {
HashLink *next = hashTable[i]->m_next;
free(hashTable[i]);
hashTable[i] = next;
}
hashTable[i] = NULL;
}
}
// add a string to a hash table with the given data
int32_t addCatHash ( int32_t keyOffset, int32_t keyLen, int32_t data ) {
// get the hash value
uint32_t hashKey = catHash(&nameBuffer[keyOffset], keyLen);
// get the first node
HashLink **currLink = &hashTable[hashKey];
// go to the first empty node
while (*currLink)
currLink = &((*currLink)->m_next);
// fill the node
*currLink = (HashLink*)malloc(sizeof(HashLink));
if (!(*currLink))
return -1;
(*currLink)->m_keyOffset = keyOffset;
(*currLink)->m_keyLen = keyLen;
(*currLink)->m_data = data;
(*currLink)->m_next = NULL;
return 0;
}
// get the data in the hash using a string key
int32_t getCatHash ( char *key, int32_t keyLen ) {
// get the hash value
uint32_t hashKey = catHash(key, keyLen);
// get the first node
HashLink *currLink = hashTable[hashKey];
// go to the correct node
while ( currLink &&
( currLink->m_keyLen != keyLen ||
strncmp(&nameBuffer[currLink->m_keyOffset], key, keyLen) != 0 ) )
currLink = currLink->m_next;
// return -1 if not found
if (!currLink)
return -1;
else
return currLink->m_data;
}
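// usage sketch: keys for addCatHash() must live in nameBuffer
// and stay put, since HashLink stores only an offset into it:
//
//   int32_t off = nameBufferLen;
//   gbmemcpy(&nameBuffer[off], "Top/Arts", 8);
//   nameBufferLen += 8;
//   addCatHash(off, 8, catid); // keys off nameBuffer[off..off+7]
//   int32_t id = getCatHash("Top/Arts", 8); // compares against it
//
// reallocs of nameBuffer are fine because lookups index it
// afresh, but overwriting or truncating it breaks the table.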
// init the hash table
void initUrlHashTable ( ) {
for (int32_t i = 0; i < URLHASHTABLE_SIZE; i++)
urlHashTable[i] = NULL;
}
// clear the hash table
void clearUrlHashTable ( ) {
for (int32_t i = 0; i < URLHASHTABLE_SIZE; i++) {
while (urlHashTable[i]) {
UrlHashLink *next = urlHashTable[i]->m_next;
free(urlHashTable[i]);
urlHashTable[i] = next;
}
urlHashTable[i] = NULL;
}
}
// add a url hash to the hash table with the given index
int32_t addUrlHash ( uint64_t key,
//uint32_t key2,
int32_t index ) {
//int32_t index,
//int32_t urlOffset,
//int32_t urlLen ) {
// get the hash value
uint32_t hashKey = (key%(uint64_t)URLHASHTABLE_SIZE);
// get the first node
UrlHashLink **currLink = &urlHashTable[hashKey];
// go to the first empty node
while (*currLink)
currLink = &((*currLink)->m_next);
// fill the node
*currLink = (UrlHashLink*)malloc(sizeof(UrlHashLink));
if (!(*currLink))
return -1;
(*currLink)->m_key = key;
//(*currLink)->m_key2 = key2;
(*currLink)->m_index = index;
//(*currLink)->m_urlOffset = urlOffset;
//(*currLink)->m_urlLen = urlLen;
(*currLink)->m_next = NULL;
return 0;
}
// get the index in the hash using hash key
int32_t getUrlHash ( uint64_t key ) {
//uint32_t key2 ) {
//uint32_t key2,
//int32_t urlOffset,
//int32_t urlLen ) {
// get the hash value
uint32_t hashKey = (key%(uint64_t)URLHASHTABLE_SIZE);
// get the first node
UrlHashLink *currLink = urlHashTable[hashKey];
// go to the correct node
while ( currLink && currLink->m_key != key )
//( currLink->m_key != key || currLink->m_key2 != key2 ) )
//( currLink->m_key != key || currLink->m_key2 != key2 ||
//currLink->m_urlLen != urlLen ||
//strncasecmp(&urlBuffer[currLink->m_urlOffset],
// &urlBuffer[urlOffset], urlLen) != 0) )
currLink = currLink->m_next;
// return -1 if not found
if (!currLink)
return -1;
else
return currLink->m_index;
}
// do a binary search to get a cat from an id
int32_t getIndexFromId ( int32_t catid ) {
int32_t low = 0;
int32_t high = numRdfCats-1;
int32_t currCat;
// binary search
//while (rdfCats[currCat].m_catid != catid) {
while (low <= high) {
// next check spot
currCat = (low + high)/2;
// check for hit
if (rdfCats[currCat].m_catid == catid)
return currCat;
// shift search range
else if (rdfCats[currCat].m_catid > catid)
high = currCat-1;
else
low = currCat+1;
}
//printf("catid %"INT32" not found. sanity checking.\n",catid);
// sanity check our algo
//for ( int32_t i = 0 ; i < numRdfCats ; i++ ) {
// if ( rdfCats[i].m_catid == catid ) { char *xx=NULL;*xx=0;}
//}
// not found
return -1;
}
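// note: this requires rdfCats to be sorted by m_catid (see the
// gbsort() call with catcomp in main). e.g. with catids {2,5,9},
// getIndexFromId(5) probes the middle and returns index 1, while
// getIndexFromId(4) narrows to an empty range and returns -1.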
// print cat information
void printCats ( int32_t start, int32_t end ) {
for (int32_t i = start; i < end; i++) {
printf("Cat %"INT32":\n", i);
printf(" CatID: %"INT32"\n", rdfCats[i].m_catid);
printf(" Name: ");
for (int32_t n = rdfCats[i].m_nameOffset;
n < rdfCats[i].m_nameOffset + rdfCats[i].m_nameLen; n++)
printf("%c", nameBuffer[n]);
printf("\n");
printf(" Name Offset: %"INT32"\n", rdfCats[i].m_nameOffset);
printf(" Structure Offset: %"INT32"\n", rdfCats[i].m_structureOffset);
printf(" Content Offset: %"INT32"\n", rdfCats[i].m_contentOffset);
printf(" Parent: %"INT32"\n", rdfCats[i].m_parentid);
printf("\n");
}
}
// parse out the next catid
int32_t parseNextCatid() {
// parse for <catid, this will be the next cat
if (rdfParse("catid") == -1)
return -1;
// go to the catid, skip '>'
if (!incRdfPtr())
return -1;
catidLen = 0;
while (*rdfPtr != '<') {
if (catidLen < MAX_CATID_LEN) {
catidBuffer[catidLen] = *rdfPtr;
catidLen++;
}
if (!incRdfPtr())
return -1;
}
catidBuffer[catidLen] = '\0';
// translate the id
return atol(catidBuffer);
}
// fill the next quoted string in the name buffer
int32_t fillNextString() {
// get the next string, skip to the next quote
while (*rdfPtr != '"') {
if (!incRdfPtr())
return -1;
}
// skip the quote
if (!incRdfPtr())
return -1;
// . pointing at the string now
// dump it in the buffer
int32_t nameLen = 0;
while (*rdfPtr != '"') {
// make sure there's room in the buffer
if (nameBufferLen+nameLen >= nameBufferSize) {
nameBufferSize += NAME_BUFFER_SIZE;
nameBuffer = (char*)realloc((void*)nameBuffer,
sizeof(char)*nameBufferSize);
printf("nameBuffer: %"INT32" bytes\n", nameBufferSize);
if (!nameBuffer)
return -2;
}
// fill the next character
nameBuffer[nameBufferLen+nameLen] = *rdfPtr;
nameLen++;
if (!incRdfPtr())
return -1;
}
// step past the quote
if (!incRdfPtr())
return -1;
// return the length
return nameLen;
}
// fill the next quoted url in the name buffer
int32_t fillNextUrl() {
// get the next string, skip to the next quote
while (*rdfPtr != '"') {
if (!incRdfPtr())
return -1;
}
// skip the quote
if (!incRdfPtr())
return -1;
// . pointing at the string now
// dump it in the buffer
int32_t urlLen = 0;
while (*rdfPtr != '"') {
// make sure there's room in the buffer
if (urlBufferLen+urlLen+10 >= urlBufferSize) {
urlBufferSize += URL_BUFFER_SIZE;
urlBuffer = (char*)realloc((void*)urlBuffer,
sizeof(char)*urlBufferSize);
printf("urlBuffer: %"INT32" bytes\n", urlBufferSize);
if (!urlBuffer)
return -2;
}
// fill the next character
urlBuffer[urlBufferLen+urlLen] = *rdfPtr;
urlLen++;
if (!incRdfPtr())
return -1;
}
// step past the quote
if (!incRdfPtr())
return -1;
// return the length
return urlLen;
}
// check the url for all valid characters
bool isGoodUrl ( char *url, int32_t urlLen ) {
// . all we're going to check for right now are
// characters that show up as spaces
if ( urlLen <= 0 )
return false;
for (int32_t i = 0; i < urlLen; i++) {
if (is_wspace_a(url[i]))
return false;
}
// check for [prot]://[url]
int32_t bef = 0;
char *p = url;
char *pend = url + urlLen;
while ( p < pend && *p != ':' ) {
p++;
bef++;
}
if ( bef == 0 || pend - p < 3 || p[1] != '/' || p[2] != '/' )
return false;
// good url
return true;
}
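// examples of what this accepts and rejects (a sketch):
//   "http://example.com/" -> good (has "://", no whitespace)
//   "example.com" -> rejected, nothing before a "://"
//   "http//example.com" -> rejected, no ':' at all
//   "http://exa mple.com" -> rejected, contains whitespace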
// print the category path
int32_t printCatPath ( char *str, int32_t catid, bool raw ) {
int32_t catIndex;
int32_t parentId;
char *p = str;
// get the index
catIndex = getIndexFromId(catid);
if (catIndex < 1)
return 0;
// get the parent
parentId = rdfCats[catIndex].m_parentid;
// . print the parent(s) first
// . in NEWER DMOZ dumps, "Top" is catid 2 and catid 1 is an
// empty title. really catid 2 is Top/World but that is an
// error that we correct below. (see "Top/World" below).
// but do not include the "Top/" as part of the path name
if ( catid == 2 ) {
// no! we now include Top as part of the path. let's
// be consistent. i'd rather have www.gigablast.com/Top
// and www.gigablast.com/Top/Arts etc. then i know if the
// path starts with /Top that it is dmoz!!
sprintf(p,"Top");
return 3;
}
if (parentId > 1 &&
// the newer dmoz files have catid == parentId for what i
// guess are the top-most categories, like "Top/Arts"... i
// would think they should have a parentId of 1 like in the
// old dmoz files, so it's probably a bug on dmoz's end
parentId != catid ) {
p += printCatPath(p, parentId, raw);
// print spacing
if (!raw) p += sprintf(p, " / ");
else p += sprintf(p, "/");
}
// print this category name
int32_t nameLen = rdfCats[catIndex].m_nameLen;
gbmemcpy ( p,
&nameBuffer[rdfCats[catIndex].m_nameOffset],
nameLen );
p += nameLen;
// null terminate
*p = '\0';
// return length
return (p - str);
}
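// example output (a sketch): for the catid of Top/Arts/Music,
// printCatPath(buf, catid, true) yields "Top/Arts/Music" and
// printCatPath(buf, catid, false) yields "Top / Arts / Music".
// catid 2 is special-cased above to print plain "Top".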
int32_t fixUrl ( char *url, int32_t urlLen ) {
int32_t slashi = 0;
int32_t newUrlLen = urlLen;
// check for a bad protocol, something:
while (url[slashi] != ':') {
slashi++;
// if no :, throw it out
if (slashi >= newUrlLen)
return 0;
}
// check for a ://
if (newUrlLen - slashi < 3)
return 0;
if (url[slashi] != ':' ||
url[slashi+1] != '/' ||
url[slashi+2] != '/') {
// fix news: to news://
if (strncasecmp(url, "news:", 5) == 0) {
// shift the tail right two bytes in place to make room
// for the "//". (copying through a fixed 1024-byte stack
// buffer, as before, could overflow on longer urls; the
// callers reserve slack past urlLen for the two bytes.)
memmove(&url[7], &url[5], newUrlLen - 5);
url[5] = '/';
url[6] = '/';
newUrlLen += 2;
}
// otherwise throw it out
else
return 0;
}
slashi += 3;
// . jump over http:// if it starts with http://http://
// . generic for any protocol
char prot[1024];
gbmemcpy(prot, url, slashi);
// build the doubled protocol with two copies; the old
// sprintf(prot, "%s%s", prot, prot) was undefined behavior
// because the destination overlapped its own source args
gbmemcpy(&prot[slashi], prot, slashi);
prot[slashi*2] = '\0';
while ( newUrlLen > slashi*2 &&
strncasecmp(url, prot, slashi*2) == 0 ) {
// remove the extra protocol
memmove(url, &url[slashi], newUrlLen - slashi);
newUrlLen -= slashi;
}
/*
// remove a www.
if (newUrlLen - slashi >= 4 &&
strncasecmp(&url[slashi], "www.", 4) == 0) {
memmove(&url[slashi], &url[slashi+4], newUrlLen - (slashi+4));
newUrlLen -= 4;
}
*/
// look for //, cut down to single /, remove any spaces
for (; slashi < newUrlLen; slashi++) {
if (url[slashi-1] == '/' && url[slashi] == '/') {
memmove(&url[slashi-1], &url[slashi], newUrlLen - slashi);
newUrlLen--;
}
if (is_wspace_a(url[slashi])) {
memmove(&url[slashi], &url[slashi+1], newUrlLen - (slashi+1));
newUrlLen--;
}
}
// remove any anchor
// mdw, sep 2013, no because there is twitter.com/#!/ronpaul
// and others...
/*
for (int32_t i = 0; i < newUrlLen; i++) {
if (url[i] == '#') {
newUrlLen = i;
break;
}
}
*/
// remove any trailing /
if (url[newUrlLen-1] == '/')
newUrlLen--;
// return the new length
return newUrlLen;
}
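// worked examples of the fixes above (a sketch):
//   "news:alt.foo" -> "news://alt.foo" (protocol repaired)
//   "http://http://a.com" -> "http://a.com" (dup protocol cut)
//   "http://a.com//x//y/" -> "http://a.com/x/y" (slashes
//       collapsed, trailing '/' dropped)
//   "http://a.com/b c" -> "http://a.com/bc" (whitespace removed)
//   "a.com" -> returns 0, i.e. thrown out (no protocol)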
// properly read from file
int32_t fileRead ( int fileid, void *buf, size_t count ) {
char *p = (char*)buf;
int32_t n = 0;
uint32_t sizeRead = 0;
while ( sizeRead < count ) {
n = read ( fileid, p, count - sizeRead );
if ( n <= 0 || n > (int32_t)count )
return n;
sizeRead += n;
p += n;
}
return sizeRead;
}
// properly write to file
int32_t fileWrite ( int fileid, void *buf, size_t count ) {
char *p = (char*)buf;
int32_t n = 0;
uint32_t sizeWrote = 0;
while ( sizeWrote < count ) {
n = write ( fileid, p, count - sizeWrote );
if ( n <= 0 || n > (int32_t)count )
return n;
sizeWrote += n;
p += n;
}
return sizeWrote;
}
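// these loops matter because a single read()/write() may move
// fewer bytes than requested (signals, pipes, large counts).
// usage sketch, matching the update-mode loader below:
//
//   int32_t oldNumUrls;
//   if ( fileRead ( rdfStream, &oldNumUrls, sizeof(int32_t) ) !=
//        sizeof(int32_t) )
//       printf("short read or error\n");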
// print special meta tags to tell gigablast to only spider/index
// the links and not the links of the links. b/c we only want
// to index the dmoz urls. AND ignore any external error like
// ETCPTIMEDOUT when indexing a dmoz url so we can be sure to index
// all of them under the proper category so our gbcatid:xxx search
// works and we can replicate dmoz accurately. see XmlDoc.cpp
// addOutlinksSpiderRecsToMetaList() and indexDoc() to see
// where these meta tags come into play.
void writeMetaTags ( int outStream2 ) {
char *str =
"<!-- do not spider the links of the links -->\n"
"<meta name=spiderlinkslinks content=0>\n"
"<!--ignore tcp timeouts, dns timeouts, etc.-->\n"
"<meta name=ignorelinksexternalerrors content=1>\n"
"<!--do not index this document, but get links from it-->\n"
"<meta name=noindex content=1>\n"
// tell gigablast to not do a dns lookup on every
// outlink when adding spiderRequests to spiderdb
// for each outlink. will save time up front but
// will have to be done when spidering the doc.
"<!-- do not lookup the ip address of every outlink, "
"but use hash of the subdomain as the ip -->\n"
"<meta name=usefakeips content=1>\n"
;
int32_t len = gbstrlen(str);
if ( write ( outStream2, str , len ) != len )
printf("Error writing to outStream2b\n");
}
// main parser
int main ( int argc, char *argv[] ) {
int32_t n;
int32_t t = 0;
int32_t ti = 0;
int32_t m = 0;
int32_t newNameBufferSize = 0;
int32_t newOffset = 0;
char filename[1256];
int32_t urlTxtCount = 0;
int32_t urlTxtFile = 0;
Url normUrl;
char decodedUrl[MAX_URL_LEN];
char htmlDecoded[MAX_HTTP_FILENAME_LEN];
//int32_t numSymParents = 0;
//int32_t endpos;
// url diff stuff
int32_t numUpdateIndexes = 0;
int32_t *updateIndexes = NULL;
int32_t currUrl = 0;
int32_t currDiffIndex = 0;
// options
bool splitUrls = false;
char mode = MODE_NONE;
int32_t totalNEC = 0;
char *dir="";
bool firstTime;
// check the options and mode
for (int32_t i = 0; i < argc; i++) {
if (strcmp(argv[i], "-s") == 0)
splitUrls = true;
else if (strcmp(argv[i], "urldump") == 0)
mode = MODE_URLDUMP;
else if (strcasecmp(argv[i], "update") == 0)
mode = MODE_UPDATE;
else if (strcasecmp(argv[i], "new") == 0)
mode = MODE_NEW;
else if (strcasecmp(argv[i], "diffurldump") == 0)
mode = MODE_DIFFURLDUMP;
else if (strcasecmp(argv[i], "catdump") == 0)
mode = MODE_CATDUMP;
}
// check for correct call
if (mode == MODE_NONE) {
printf("\n"
"Usage: dmozparse [OPTIONS] [MODE]\n"
"\n"
"Modes:\n"
" new Generate new .dat files.\n"
"\n"
" update Generate new .dat.new files, updating\n"
" existing .dat files. Changes will be\n"
" written to gbdmoz.changes.dat.new.\n"
" Catdb will update using these files\n"
" when told to update.\n"
"\n"
" urldump Dump urls to file only. This will not\n"
" create any .dat files, only url txt \n"
" files.\n"
"\n"
" diffurldump Dump urls that are new, changed, or\n"
" removed in the latest update. (Uses\n"
" gbdmoz.content.dat.new.diff)\n"
"\n"
" catdump Dump categories to file only.\n"
"\n"
"Options:\n"
" -s Split url output into multiple files.\n"
" This is used for adding urls to gb\n"
" which has a limit to the file size.\n"
"\n"
"\n" );
exit(0);
}
// init the hash table for hashing urls
if (!hashinit()) {
printf("Hash Init Failed!\n");
goto errExit;
}
// init the hash table
initHashTable();
printf("\n");
// . create a large buffer for reading chunks
// of the rdf files
rdfBuffer = (char*)malloc(sizeof(char)*(RDFBUFFER_SIZE+1));
if (!rdfBuffer) {
printf("Out of memory!!\n");
goto errExit;
}
// skip hierarchy stuff for url dump
if ( mode == MODE_URLDUMP || mode == MODE_DIFFURLDUMP )
goto contentParse;
// create the cat array
rdfCatsSize = CAT_BUFFER_SIZE;
rdfCats = (RdfCat*)malloc(sizeof(RdfCat)*rdfCatsSize);
if (!rdfCats) {
printf("Out of memory!!\n");
goto errExit;
}
// create the name buffer
nameBufferSize = NAME_BUFFER_SIZE;
nameBuffer = (char*)malloc(sizeof(char)*nameBufferSize);
if (!nameBuffer) {
printf("Out of memory!!\n");
goto errExit;
}
dir = "";
retry:
// open the structure file
if ( mode == MODE_NEW || mode == MODE_CATDUMP )
sprintf(filename, "%s%s", dir,RDFSTRUCTURE_FILE);
else
sprintf(filename, "%s%s.new", dir,RDFSTRUCTURE_FILE);
//rdfStream.open(filename, ifstream::in);
rdfStream = open ( filename, O_RDONLY );
// make sure it opened okay
//if (!rdfStream.is_open()) {
if ( rdfStream < 0 ) {
// try ./catdb/ subdir if not found
if ( ! dir[0] ) {
dir = "./catdb/";
goto retry;
}
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("Opened Structure File: %s\n", filename);
// take the first chunk
//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
//n = rdfStream.gcount();
n = read ( rdfStream, rdfBuffer, RDFBUFFER_SIZE );
if ( n <= 0 || n > RDFBUFFER_SIZE ) {
printf("Error Reading %s\n", filename);
goto errExit;
}
rdfPtr = rdfBuffer;
rdfEnd = &rdfBuffer[n];
currOffset = 0;
firstTime = true;
// read and parse the file
printf("Parsing Topics...\n");
while (true) {
// parse for <Topic...
if (rdfParse("Topic") == -1)
goto fileEnd;
// the offset for this cat is 6 chars back
uint32_t catOffset = currOffset - 6;
// get the topic name, preserve it on the buffer
int32_t nameOffset = nameBufferLen;
// the name inserted by this function into "nameBuffer"
// does not seem to contain "Top/" at the beginning,
// even though it does appear in structure.rdf.u8.
// yeah, later on we hack the name buffer and nameOffset
// so it is just the last word in the directory to save
// mem. then we print out all the parent names to
// reconstruct.
int32_t nameLen = fillNextString();
if (nameLen == -1)
goto fileEnd;
if (nameLen == -2) {
printf("Out of Memory!\n");
goto errExit1;
}
// fix <Topic r:id=\"\"> in the newer content.rdf.u8
if ( nameLen == 0 ) {
// only do this once!
if ( ! firstTime ) {
printf("Encountered zero length name");
continue;
}
gbmemcpy(nameBuffer+nameOffset,"Top\0",4);
nameLen = 3;
firstTime = false;
}
// html decode it
if (nameLen > MAX_HTTP_FILENAME_LEN)
nameLen = MAX_HTTP_FILENAME_LEN;
nameLen = htmlDecode ( htmlDecoded,
&nameBuffer[nameOffset],
nameLen ,
false,
0);
// parse the catid
int32_t catid = parseNextCatid();
if (catid == -1)
goto fileEnd;
// crap, in the new dmoz structure.rdf.u8 catid 1 is an
// empty name and catid 2 has Topic tag "Top/World" but
// Title tag "Top".
// it should probably be "Top" and not "Top/World": there
// is a separate entry, catid 3, in structure.rdf.u8 with
// <Topic r:id="Top/World">, and that one is the real
// Top/World, so catid 2 is just "Top". this is a bug in
// the dmoz output i think, so fix it here.
if ( catid == 2 ) {
nameLen = 3;
gbmemcpy(&nameBuffer[nameOffset],"Top",nameLen);
nameBufferLen += nameLen;
}
else {
gbmemcpy(&nameBuffer[nameOffset], htmlDecoded, nameLen);
nameBufferLen += nameLen;
}
// . fill the current cat
// make sure there's room
if (numRdfCats >= rdfCatsSize) {
rdfCatsSize += CAT_BUFFER_SIZE;
rdfCats = (RdfCat*)realloc((void*)rdfCats,
sizeof(RdfCat)*rdfCatsSize);
printf("rdfCats: %"INT32" bytes\n", rdfCatsSize);
if (!rdfCats) {
printf("Out of Memory\n");
goto errExit1;
}
}
// hash the name to the catid
if (addCatHash ( nameOffset, nameLen, catid ) == -1) {
printf("Out of Memory!\n");
goto errExit1;
}
// debug
//printf("gbcat=");
//for ( int32_t i = 0 ; i < nameLen ; i++ )
// printf("%c",htmlDecoded[i]);
//printf("\n");
// fill it
rdfCats[numRdfCats].m_catid = catid;
rdfCats[numRdfCats].m_parentid = 0;
//rdfCats[numRdfCats].m_numSymParents = 0;
//rdfCats[numRdfCats].m_symParents = NULL;
rdfCats[numRdfCats].m_nameLen = nameLen;
rdfCats[numRdfCats].m_nameOffset = nameOffset;
rdfCats[numRdfCats].m_structureOffset = catOffset;
rdfCats[numRdfCats].m_contentOffset = 0;
rdfCats[numRdfCats].m_catHash = 0;
rdfCats[numRdfCats].m_numUrls = 0;
numRdfCats++;
}
fileEnd:
// sort the cats by catid
gbsort(rdfCats, numRdfCats, sizeof(RdfCat), catcomp);
// dump out categories for category dump
if ( mode == MODE_CATDUMP ) {
char catTemp[16384];
for ( int32_t i = 0; i < numRdfCats; i++ ) {
//for (int32_t n = rdfCats[i].m_nameOffset;
// n < rdfCats[i].m_nameOffset +
// rdfCats[i].m_nameLen; n++)
// printf("%c", nameBuffer[n]);
//printf("\n");
int32_t encLen = urlEncode(catTemp, 16383,
&nameBuffer[rdfCats[i].m_nameOffset],
rdfCats[i].m_nameLen);
catTemp[encLen] = '\0';
printf("http://dir.gigablast.com%s\n", &catTemp[3]);
}
close(rdfStream);
goto goodEnd;
}
// . now we need to reparse the whole file again and
// parse out the children of each topic, this includes:
// <narrow> hard links
// <narrow1> hard links
// <narrow2> hard links
// <letterbar> hard links
// <symbolic> sym links
// <symbolic1> sym links
// <symbolic2> sym links
// </Topic> ends the topic
// reset to the beginning of the file
//rdfStream.clear();
//rdfStream.seekg(0, ios::beg);
if ( lseek(rdfStream, 0, SEEK_SET) < 0 ) {
printf ( "Error Reseting RDF File\n" );
goto errExit1;
}
// reset the buffer to the first block
//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
//n = rdfStream.gcount();
n = read(rdfStream, rdfBuffer, RDFBUFFER_SIZE);
if ( n <= 0 || n > RDFBUFFER_SIZE ) {
printf("Error Reading %s\n", filename);
goto errExit1;
}
rdfPtr = rdfBuffer;
rdfEnd = &rdfBuffer[n];
currOffset = 0;
//
// set m_parentid using structure.rdf.u8
//
// read and parse the file again
printf("Building Hierarchy...\n");
while (true) {
// parse the next catid in the file, sequentially
//if ( currOffset == 545468935 )
// printf("shit\n");
int32_t catid = parseNextCatid();
if (catid == -1)
goto fileEnd1;
nextChildTag:
// now go through the tags looking for what we want
if (rdfNextTag() == -1)
goto fileEnd1;
// check it for one of the tags we're looking for
int32_t parentType;
if ( tagLen == 6 &&
strncmp ( tagRecfer, "/Topic", 6 ) == 0 )
continue;
else if ( tagLen == 6 &&
strncmp ( tagRecfer, "narrow", 6 ) == 0 )
parentType = 1;
else if ( tagLen == 7 &&
strncmp ( tagRecfer, "narrow1", 7 ) == 0 )
parentType = 1;
else if ( tagLen == 7 &&
strncmp ( tagRecfer, "narrow2", 7 ) == 0 )
parentType = 1;
else if ( tagLen == 9 &&
strncmp ( tagRecfer, "letterbar", 9 ) == 0 )
parentType = 1;
// else if ( tagLen == 8 &&
// strncmp ( tagRecfer, "symbolic", 8 ) == 0 )
// parentType = 2;
// else if ( tagLen == 9 &&
// strncmp ( tagRecfer, "symbolic1", 9 ) == 0 )
// parentType = 2;
// else if ( tagLen == 9 &&
// strncmp ( tagRecfer, "symbolic2", 9 ) == 0 )
// parentType = 2;
else
goto nextChildTag;
// will only reach here if we're at a child cat
// get the name, use the end of nameBuffer
char *childName = &nameBuffer[nameBufferLen];
int32_t childNameLen = fillNextString();
if (childNameLen == -1)
goto fileEnd1;
if (childNameLen == -2) {
printf("Out of Memory!\n");
goto errExit1;
}
// html decode it
if (childNameLen > MAX_HTTP_FILENAME_LEN)
childNameLen = MAX_HTTP_FILENAME_LEN;
childNameLen = htmlDecode ( htmlDecoded,
childName,
childNameLen ,
false,
0);
gbmemcpy(childName, htmlDecoded, childNameLen);
// debug log
//if ( currOffset >= 506362430 ) // 556362463
// printf("off=%"INT32"\n",currOffset);
// debug point
//if ( currOffset == 545467573 )
// printf("GOT DEBUG POINT before giant skip\n");
// cut off the leading label if symbolic
// if (parentType == 2) {
// while (*childName != ':') {
// childName++;
// childNameLen--;
// }
// childName++;
// childNameLen--;
// }
// debug point
//if (strcmp(childName,"Top/World/Català/Arts") == 0 )
// printf("hey\n");
// get the catid for the child
int32_t childid = getCatHash(childName, childNameLen);
// get the cat for this id
int32_t cat = getIndexFromId(childid);
// make sure we have a match
if (cat == -1) {
// debug. why does Top/World/Catala/Arts
// not have a parent??
printf("Warning: Child Topic Not Found: ");
for (int32_t i = 0; i < childNameLen; i++)
printf("%c", childName[i]);
printf("\n");
m++;
goto nextChildTag;
}
// . assign the parent to the cat
// . this means we are in a "child" tag within the "catid"
// . catid 84192
if (parentType == 1) {
if (rdfCats[cat].m_parentid != 0)
printf("Warning: Overwriting Parent Id!\n");
rdfCats[cat].m_parentid = catid;
t++;
}
// assign symbolic parent to the cat
// else if (parentType == 2) {
// // grow the buffer
// rdfCats[cat].m_numSymParents++;
// rdfCats[cat].m_symParents = (int32_t*)realloc(
// rdfCats[cat].m_symParents,
// sizeof(int32_t)*rdfCats[cat].m_numSymParents);
// if (!rdfCats[cat].m_symParents) {
// printf("Out of Memory!\n");
// goto errExit1;
// }
// // assign the sym parent
// rdfCats[cat].m_symParents[rdfCats[cat].m_numSymParents-1] = catid;
// // inc overall number of sym parents
// numSymParents++;
// }
// go to the next tag
goto nextChildTag;
}
fileEnd1:
printf("Completed Structure:\n");
printf(" Total Topics: %"INT32"\n", numRdfCats);
printf(" Topics with Parents: %"INT32"\n", t);
printf(" Topics Linked but Nonexistent: %"INT32"\n", m);
if ( t != numRdfCats ) {
printf("\n"
" *Topics without parents is bad because they\n"
" can not have their entired rawPath printed out\n"
" in order to get their proper hash\n");
}
//printf(" Number of Symbolic Links: %"INT32"\n", numSymParents);
printf("\n");
// clear the hash table
clearHashTable();
// close the structure file
//rdfStream.clear();
//rdfStream.close();
close(rdfStream);
printf("Truncating Category Names...\n");
// . truncate the category names to the last directory
// . also calculate the size of the truncated buffer
for (int32_t i = 0; i < numRdfCats; i++) {
// find the position of the last /
newOffset = rdfCats[i].m_nameOffset +
rdfCats[i].m_nameLen - 1;
while ( newOffset != rdfCats[i].m_nameOffset &&
nameBuffer[newOffset-1] != '/' )
newOffset--;
// assign the new length and offset
rdfCats[i].m_nameLen -= newOffset - rdfCats[i].m_nameOffset;
rdfCats[i].m_nameOffset = newOffset;
newNameBufferSize += rdfCats[i].m_nameLen;
}
printf("Creating Category Hashes...\n");
// make the hashes
char rawPath[4096];
int32_t rawPathLen;
for (int32_t i = 0; i < numRdfCats; i++) {
// get the hash of the path
rawPathLen = printCatPath(rawPath, rdfCats[i].m_catid, true);
// crap, this rawpath contains "Top/" in the beginning
// but the rdfCats[i].m_nameOffset refers to a name
// that does not include "Top/"
rdfCats[i].m_catHash = hash32Lower_a(rawPath, rawPathLen, 0);
// fix. so that xyz/Arts does not just hash "Arts"
// because it has no parent...
if ( rdfCats[i].m_parentid == 0 ) {
printf("Missing parent for catid %"INT32". Will be "
"excluded from DMOZ so we avoid hash "
"collisions.\n",rdfCats[i].m_catid);
}
//
// DEBUG!
// comment out this "continue" to print each cat's hash
// below and hunt for collisions
//
continue;
printf("hash32=%"UINT32" catid=%"INT32" parentid=%"INT32" path=%s\n",
rdfCats[i].m_catHash,
rdfCats[i].m_catid,
rdfCats[i].m_parentid,
rawPath);
}
// . now we want to serialize the needed data into
// one (or more?) file(s) to be quickly read by gb
if ( mode == MODE_NEW )
sprintf(filename, "%s%s", dir,STRUCTURE_OUTPUT_FILE);
else
sprintf(filename, "%s%s.new", dir,STRUCTURE_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::trunc);
outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpened %s for writing.\n", filename);
// write the size of the truncated name buffer
//outStream.write((char*)&newNameBufferSize, sizeof(int32_t));
if (write(outStream, &newNameBufferSize, sizeof(int32_t)) !=
sizeof(int32_t)) {
printf("Error writing to %s\n", filename);
goto errExit;
}
// write the number of cats
//outStream.write((char*)&numRdfCats, sizeof(int32_t));
if (write(outStream, &numRdfCats, sizeof(int32_t)) !=
sizeof(int32_t)) {
printf("Error writing to %s\n", filename);
goto errExit;
}
// write the number of symbolic parents
//outStream.write((char*)&numSymParents, sizeof(int32_t));
// write the truncated buffer and further reassign the offsets
newOffset = 0;
for (int32_t i = 0; i < numRdfCats; i++) {
int32_t writeSize = rdfCats[i].m_nameLen;
//outStream.write((char*)&nameBuffer[rdfCats[i].m_nameOffset],
// sizeof(char)*rdfCats[i].m_nameLen);
if ( write ( outStream, &nameBuffer[rdfCats[i].m_nameOffset],
writeSize ) != writeSize ) {
printf("Error writing to %s\n", filename);
goto errExit;
}
rdfCats[i].m_nameOffset = newOffset;
newOffset += rdfCats[i].m_nameLen;
}
// close the output file
//outStream.clear();
//outStream.close();
close(outStream);
printf("Completed Writing File.\n");
// clear up the name buffer
free(nameBuffer);
nameBuffer = NULL;
contentParse:
// . now we need to parse up the content file,
// hash the url's with a gb hash, and store the
// catid associated with each
t = 0;
m = 0;
// create the url buffer
urlBufferSize = URL_BUFFER_SIZE;
urlBuffer = (char*)malloc(sizeof(char)*urlBufferSize);
if (!urlBuffer) {
printf("Out of Memory!\n");
goto errExit;
}
// create the url info buffer
urlInfosSize = URLINFO_BUFFER_SIZE;
urlInfos = (UrlInfo*)malloc(sizeof(UrlInfo)*urlInfosSize);
if (!urlInfos) {
printf("Out of Memory!\n");
goto errExit;
}
again:
// open the content file
if ( mode == MODE_NEW || mode == MODE_URLDUMP )
sprintf(filename, "%s%s", dir,RDFCONTENT_FILE);
else
sprintf(filename, "%s%s.new", dir,RDFCONTENT_FILE);
//rdfStream.open(filename, ifstream::in);
rdfStream = open ( filename, O_RDONLY );
// make sure it opened okay
//if (!rdfStream.is_open()) {
if ( rdfStream < 0 ) {
if ( ! dir[0] ) {
dir = "./catdb/";
goto again;
}
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpened Content File: %s\n", filename);
// take the first chunk
//rdfStream.read(rdfBuffer, RDFBUFFER_SIZE);
//n = rdfStream.gcount();
n = read ( rdfStream, rdfBuffer, RDFBUFFER_SIZE );
if ( n <= 0 || n > RDFBUFFER_SIZE ) {
printf("Error Reading %s\n", filename);
goto errExit;
}
rdfPtr = rdfBuffer;
rdfEnd = &rdfBuffer[n];
currOffset = 0;
// init hash tables for indexing urls
initUrlHashTable();
if ( mode == MODE_URLDUMP || mode == MODE_DIFFURLDUMP ) {
// write another file for the urls
if ( mode == MODE_URLDUMP ) {
if (!splitUrls)
sprintf(filename, "html/%s", URLTEXT_OUTPUT_FILE);
else
// put them directly into html/ now for
// easy add url'ing
sprintf(filename, "html/%s.0", URLTEXT_OUTPUT_FILE);
}
else {
if (!splitUrls)
sprintf(filename, "html/%s",
DIFFURLTEXT_OUTPUT_FILE);
else
sprintf(filename, "html/%s.0",
DIFFURLTEXT_OUTPUT_FILE);
}
//outStream2.open(filename, ofstream::out|ofstream::trunc);
outStream2 = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it opened okay
//if (!outStream2.is_open()) {
if ( outStream2 < 0 ) {
printf("Error Opening %s\n", filename);
goto errExit1;
}
printf("Opened %s for writing.\n", filename);
writeMetaTags ( outStream2 );
// if we're doing a diffurldump, load up the diff file first
if ( mode == MODE_DIFFURLDUMP ) {
char diffUrl[MAX_URL_LEN*2];
int32_t numRemoveUrls = 0;
// open the new diff file
//ifstream diffInStream;
int diffInStream;
sprintf(filename, "gbdmoz.content.dat.new.diff");
//diffInStream.open(filename, ifstream::in);
diffInStream = open(filename, O_RDONLY);
//if (!diffInStream.is_open()) {
if ( diffInStream < 0 ) {
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("Opened Diff File: %s\n", filename);
// read in the number of urls to update/add
//diffInStream.read((char*)&numUpdateIndexes,
// sizeof(int32_t));
if ( fileRead ( diffInStream,
&numUpdateIndexes,
sizeof(int32_t) ) != sizeof(int32_t) ) {
printf("Error Reading %s\n", filename);
goto errExit;
}
// read in the number of urls to remove
//diffInStream.read((char*)&numRemoveUrls, sizeof(int32_t));
if ( fileRead ( diffInStream,
&numRemoveUrls,
sizeof(int32_t) ) != sizeof(int32_t) ) {
printf("Error Reading %s\n", filename);
goto errExit;
}
// create the buffer for the update/add indexes
updateIndexes = (int32_t*)malloc(
sizeof(int32_t)*numUpdateIndexes);
if ( !updateIndexes ) {
printf("Out of Memory!\n");
//diffInStream.clear();
//diffInStream.close();
close(diffInStream);
goto errExit;
}
// read in the update/add indexes
//for ( int32_t i = 0; i < numUpdateIndexes &&
// diffInStream.good(); i++ ) {
for ( int32_t i = 0; i < numUpdateIndexes; i++ ) {
//diffInStream.read((char*)&updateIndexes[i],
// sizeof(int32_t));
int32_t n = fileRead ( diffInStream,
&updateIndexes[i],
sizeof(int32_t) );
if ( n < 0 || n > (int32_t)sizeof(int32_t) ) {
printf("Error Reading%s\n", filename);
goto errExit;
}
if ( n == 0 )
break;
}
// read in the urls to remove
//for ( int32_t i = 0; i < numRemoveUrls &&
// diffInStream.good(); i++ ) {
for ( int32_t i = 0; i < numRemoveUrls; i++ ) {
int16_t urlLen;
//diffInStream.read((char*)&urlLen,
// sizeof(int16_t));
if ( fileRead(diffInStream, &urlLen,
sizeof(int16_t)) != sizeof(int16_t) ) {
printf("Error reading diffInStream\n");
goto errExit;
}
if ( urlLen <= 0 ) {
printf("WARNING: Found %"INT32" length"
"url exiting!", (int32_t)urlLen);
//diffInStream.clear();
//diffInStream.close();
close(diffInStream);
goto errExit;
}
// read it in
//diffInStream.read(diffUrl, urlLen);
if ( fileRead(diffInStream, diffUrl, urlLen) !=
urlLen ) {
printf("Error reading diffInStream\n");
goto errExit;
}
// normalize it
urlLen = fixUrl(diffUrl, urlLen);
// write it out to the diffurl file
//outStream2.write(diffUrl, urlLen);
if ( write(outStream2, diffUrl, urlLen) !=
urlLen ) {
printf("Error writing to outStream2\n");
goto errExit;
}
//outStream2.write("\n", 1);
if ( write(outStream2, "\n", 1) != 1 ) {
printf("Error writing to outStream2\n");
goto errExit;
}
urlTxtCount++;
if ( splitUrls &&
urlTxtCount >= MAX_URLTXT_SIZE) {
//outStream2.clear();
//outStream2.close();
close(outStream2);
printf("Completed Writing File.\n");
// write another file for the urls
urlTxtFile++;
sprintf(filename, "html/%s.%"INT32"",
URLTEXT_OUTPUT_FILE,
urlTxtFile);
//outStream2.open(filename,
// ofstream::out|ofstream::trunc);
outStream2 = open ( filename,
O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it opened okay
//if (!outStream2.is_open()) {
if ( outStream2 < 0 ) {
printf("Error Opening %s\n",
filename);
goto errExit1;
}
printf("Opened %s for writing.\n",
filename);
urlTxtCount = 0;
}
}
// close up the diff file
//diffInStream.clear();
//diffInStream.close();
close(diffInStream);
printf("Successfully Built Diff\n");
}
}
else {
if ( mode == MODE_NEW )
sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
else
sprintf(filename, "%s%s.new", dir,CONTENT_OUTPUT_FILE);
// stream the urls into the content
//outStream.open(filename, ofstream::out|ofstream::trunc);
outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("Opened %s for writing.\n", filename);
// store a space for the number of urls at the start of the file
//outStream.write((char*)&numUrlInfos, sizeof(int32_t));
if ( write(outStream, &numUrlInfos, sizeof(int32_t)) !=
sizeof(int32_t) ) {
printf("Error writing to %s", filename);
goto errExit;
}
}
// read and parse the file again
printf("Building Links...\n");
while (true) {
// parse for <Topic...
if (rdfParse("Topic") == -1)
goto fileEnd2;
// the offset for this cat is 6 chars back
uint32_t catOffset = currOffset - 6;
// parse the next catid
int32_t catid = parseNextCatid();
if (catid == -1)
goto fileEnd2;
int32_t cat;
// skip ahead for url dump
if ( mode == MODE_URLDUMP || mode == MODE_DIFFURLDUMP )
goto nextLink;
// . set the content offset for this cat
// . it's missing catid 425187... why? because it had
// a double quote in it like '4"'!! so i took out inQuotes
// logic above.
cat = getIndexFromId(catid);
if (cat == -1) {
totalNEC++;
printf("Warning: Nonexistent Category, %"INT32", found in "
"Content\n", catid );
continue;
}
rdfCats[cat].m_contentOffset = catOffset;
nextLink:
// get the next tag
if (rdfNextTag() == -1)
goto fileEnd2;
// check it for one of the tags we're looking for
if ( tagLen == 6 &&
strncmp ( tagRecfer, "/Topic", 6 ) == 0 )
continue;
else if ( tagLen == 4 &&
strncmp ( tagRecfer, "link", 4 ) == 0 )
goto hashLink;
else if ( tagLen == 5 &&
strncmp ( tagRecfer, "link1", 5 ) == 0 )
goto hashLink;
else if ( tagLen == 4 &&
strncmp ( tagRecfer, "atom", 4 ) == 0 )
goto hashLink;
else if ( tagLen == 3 &&
strncmp ( tagRecfer, "pdf", 3 ) == 0 )
goto hashLink;
else if ( tagLen == 4 &&
strncmp ( tagRecfer, "pdf1", 4 ) == 0 )
goto hashLink;
else if ( tagLen == 3 &&
strncmp ( tagRecfer, "rss", 3 ) == 0 )
goto hashLink;
else if ( tagLen == 4 &&
strncmp ( tagRecfer, "rss1", 4 ) == 0 )
goto hashLink;
else
goto nextLink;
hashLink:
// . hash the link with the catid
// get the link url
int32_t urlOffset = urlBufferLen;
int16_t urlLen = fillNextUrl();
if (urlLen == -1)
goto fileEnd2;
if (urlLen == -2) {
printf("Out of Memory!\n");
goto errExit1;
}
// html decode the url
if (urlLen > MAX_URL_LEN)
urlLen = MAX_URL_LEN;
urlLen = htmlDecode(decodedUrl, &urlBuffer[urlOffset], urlLen,
false,0);
// debug point
//if ( strcmp(decodedUrl,"http://twitter.com/#!/ronpaul")==0)
// printf("hey\n");
// ignore any url with # in it for now like
// http://twitter.com/#!/ronpaul because it bastardizes
// the meaning of the # (hashtag) and we need to protest that
if ( strchr ( decodedUrl , '#' ) )
goto nextLink;
gbmemcpy(&urlBuffer[urlOffset], decodedUrl, urlLen);
// fix up bad urls
urlLen = fixUrl(&urlBuffer[urlOffset], urlLen);
if (urlLen == 0)
goto nextLink;
// . normalize with Url
// . watch out for
// http://twitter.com/#!/ronpaul to http://www.twitter.com/
// so do not strip # hashtags
normUrl.set(&urlBuffer[urlOffset],
urlLen,
true, // addwww?
false, // stripsessionid
false, // strippound?
true); // stripcommonfile? (i.e. index.htm)
// debug print
//printf("gburl %s -> %s\n",decodedUrl,normUrl.getUrl());
// put it back
urlLen = normUrl.getUrlLen();
if (urlBufferLen+urlLen+10 >= urlBufferSize) {
urlBufferSize += URL_BUFFER_SIZE;
urlBuffer = (char*)realloc((void*)urlBuffer,
sizeof(char)*urlBufferSize);
printf("urlBuffer: %"INT32" bytes\n", urlBufferSize);
if (!urlBuffer)
goto errExit1;
}
gbmemcpy(&urlBuffer[urlOffset], normUrl.getUrl(), urlLen);
// run it through the fixer once more
urlLen = fixUrl(&urlBuffer[urlOffset], urlLen);
if (urlLen == 0)
goto nextLink;
// check the url to make sure it is all valid characters
if (!isGoodUrl(&urlBuffer[urlOffset], urlLen))
goto nextLink;
// if good, add it to the buffer and add the cat
// note: urlBufferLen is never advanced (the line below is
// commented out), so the url buffer is reused for each url;
// we only need the url long enough to hash it and write it
//urlBufferLen += urlLen;
// get the hash value
uint64_t urlHash =
hash64Lower_a(&urlBuffer[urlOffset], urlLen, 0);
//uint32_t urlHash2 =
// hash32Lower(&urlBuffer[urlOffset], urlLen, 0);
// see if it's already indexed
//int32_t urlIndex = getUrlHash(urlHash, urlOffset, urlLen);
//int32_t urlIndex = getUrlHash(urlHash, urlHash2);
//int32_t urlIndex = getUrlHash(urlHash, urlHash2
// urlOffset, urlLen);
int32_t urlIndex = getUrlHash(urlHash);
if (urlIndex == -1) {
if ( mode == MODE_URLDUMP ||
mode == MODE_DIFFURLDUMP ) {
//outStream2.write((char*)&urlLen,
// sizeof(int16_t));
if ( mode != MODE_DIFFURLDUMP ||
currUrl == updateIndexes[currDiffIndex] ) {
//outStream2.write(&urlBuffer[urlOffset],
// urlLen);
// print it in an anchor tag
// now so gigablast can spider
// these links
write ( outStream2,"<a href=\"",9);
if ( write ( outStream2,
&urlBuffer[urlOffset],
urlLen ) != urlLen ) {
printf("Error writing to "
"outStream2\n");
goto errExit1;
}
write ( outStream2,"\"></a>",6);
//outStream2.write("\n", 1);
if (write(outStream2, "\n", 1) != 1) {
printf("Error writing to "
"outStream2\n");
goto errExit1;
}
urlTxtCount++;
currDiffIndex++;
}
currUrl++;
if ( splitUrls &&
urlTxtCount >= MAX_URLTXT_SIZE) {
//outStream2.clear();
//outStream2.close();
close(outStream2);
printf("Completed Writing File.\n");
// write another file for the urls
urlTxtFile++;
if ( mode == MODE_URLDUMP )
sprintf(filename, "html/%s.%"INT32"",
URLTEXT_OUTPUT_FILE,
urlTxtFile);
else
sprintf(filename, "html/%s.%"INT32"",
DIFFURLTEXT_OUTPUT_FILE,
urlTxtFile);
//outStream2.open(filename,
// ofstream::out|ofstream::trunc);
outStream2 = open ( filename,
O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it opened okay
//if (!outStream2.is_open()) {
if ( outStream2 < 0 ) {
printf("Error Opening %s\n",
filename);
goto errExit1;
}
printf("Opened %s for writing.\n",
filename);
writeMetaTags ( outStream2 );
urlTxtCount = 0;
}
}
else {
// write the url to the content file
//outStream.write((char*)&urlLen, sizeof(int16_t));
if ( write(outStream, &urlLen, sizeof(int16_t)) !=
sizeof(int16_t) ) {
printf("Error writing to outStream");
goto errExit1;
}
//outStream.write(&urlBuffer[urlOffset], urlLen);
if ( write ( outStream,
&urlBuffer[urlOffset],
urlLen ) != urlLen ) {
printf("Error writing to outStream");
goto errExit1;
}
}
// add the url info to the buffer
if (numUrlInfos >= urlInfosSize) {
urlInfosSize += URLINFO_BUFFER_SIZE;
urlInfos = (UrlInfo*)realloc((void*)urlInfos,
sizeof(UrlInfo)*urlInfosSize);
printf("urlInfos: %"INT32" bytes\n",
(int32_t)(urlInfosSize*sizeof(UrlInfo)));
if (!urlInfos) {
printf("Out of Memory!\n");
goto errExit1;
}
}
// fill the url info
//urlInfos[numUrlInfos].m_hash = urlHash;
//urlInfos[numUrlInfos].m_urlLen = urlLen;
//urlInfos[numUrlInfos].m_urlOffset = urlOffset;
urlInfos[numUrlInfos].m_numCatids = 1;
urlInfos[numUrlInfos].m_catids =
(int32_t*)malloc(sizeof(int32_t));
if (!urlInfos[numUrlInfos].m_catids) {
printf("Out of memory!\n");
goto errExit1;
}
urlInfos[numUrlInfos].m_catids[0] = catid;
// set changed to true so new urls get in the diff
urlInfos[numUrlInfos].m_changed = 1;
// add it to the hash
//if (addUrlHash(urlHash, numUrlInfos,
// urlOffset, urlLen) == -1) {
//if (addUrlHash ( urlHash,
// urlHash2,
// numUrlInfos) == -1) {
//if (addUrlHash(urlHash, urlHash2, numUrlInfos,
// urlOffset, urlLen) == -1) {
if (addUrlHash(urlHash, numUrlInfos) == -1) {
printf("Out of Memory!\n");
goto errExit1;
}
// next url info
numUrlInfos++;
}
else {
// make sure we aren't duping the catid
for (int32_t i = 0;
i < urlInfos[urlIndex].m_numCatids; i++)
if (urlInfos[urlIndex].m_catids[i] == catid)
goto nextLink;
// add the catid
int32_t numCatids = urlInfos[urlIndex].m_numCatids;
//if (numCatids < MAX_URL_CATIDS) {
urlInfos[urlIndex].m_catids = (int32_t*)realloc(
urlInfos[urlIndex].m_catids,
sizeof(int32_t) *
(urlInfos[urlIndex].m_numCatids+1));
if (!urlInfos[urlIndex].m_catids) {
printf("Out of Memory!\n");
goto errExit1;
}
urlInfos[urlIndex].m_catids[numCatids] = catid;
urlInfos[urlIndex].m_numCatids++;
if (urlInfos[urlIndex].m_numCatids > t) {
t = urlInfos[urlIndex].m_numCatids;
ti = urlIndex;
}
//}
m++;
}
// skip increment for url dump
if ( mode == MODE_URLDUMP || mode == MODE_DIFFURLDUMP )
goto nextLink;
// increment the url count for this cat and its parents
int32_t currIndex = getIndexFromId(catid);
while (currIndex >= 0) {
rdfCats[currIndex].m_numUrls++;
// the new dmoz files have catids whose parents
// are the same cat id! so stop infinite loops
if ( rdfCats[currIndex].m_parentid ==
rdfCats[currIndex].m_catid )
break;
// otherwise, make "currIndex" point to the parent
currIndex = getIndexFromId(
rdfCats[currIndex].m_parentid );
// in the newer dmoz files 0 seems to be a bad catid,
// not -1 any more? unconfirmed.
}
goto nextLink;
}
fileEnd2:
// close the output file
if ( mode == MODE_URLDUMP || mode == MODE_DIFFURLDUMP ) {
//outStream2.clear();
//outStream2.close();
close(outStream2);
printf("Completed Writing File.\n");
}
else {
//outStream.clear();
//outStream.close();
close(outStream);
printf("Completed Writing File.\n");
}
printf("Completed Content:\n");
printf(" Total Links: %"INT32"\n", numUrlInfos);
printf(" Duplicated Links: %"INT32"\n", m);
printf(" Max Link Duplicated: %"INT32"\n", t);
printf(" Nonexistant Categories: %"INT32"\n", totalNEC );
//printf(" ");
//for (int32_t i = 0; i < urlInfos[ti].m_urlLen; i++)
// printf("%c", urlBuffer[urlInfos[ti].m_urlOffset + i]);
printf("\n");
printf("\n");
// close the content file
//rdfStream.clear();
//rdfStream.close();
close(rdfStream);
// if we're updating, load up the old content here
if ( mode == MODE_UPDATE ) {
//if ( false ) {
// fill the buffers
int32_t currUrl = 0;
int32_t urlp = 0;
int32_t catidp = 0;
bool oldErr = false;
int32_t oldNumUrls;
char *oldUrls = NULL;
int32_t oldUrlsBufferSize = OLDURL_BUFFER_SIZE;
uint64_t *oldUrlHashes;
char *removeOldUrl;
//char oldUrl[MAX_URL_LEN*2];
int32_t *oldCatids = NULL;
int32_t oldCatidsBufferSize = OLDCATID_BUFFER_SIZE;
unsigned char *oldNumCatids = NULL;
int32_t numUpdateUrls = numUrlInfos;
int32_t numRemoveUrls = 0;
int32_t numChangedUrls = 0;
int32_t updateIndexesWritten = 0;
int32_t numIdsToUpdate = 0;
// load the content and url files
// url info (content) file
sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
//rdfStream.open(filename, ifstream::in);
rdfStream = open ( filename, O_RDONLY );
//if (!rdfStream.is_open()) {
if ( rdfStream < 0 ) {
printf("Error Opening %s\n", filename);
goto oldErrExit;
}
// read in the number of urls
//rdfStream.read((char*)&oldNumUrls, sizeof(int32_t));
if (fileRead(rdfStream, &oldNumUrls, sizeof(int32_t)) !=
sizeof(int32_t)) {
printf("Error Reading %s\n", filename);
goto oldErrExit;
}
// create the buffer for the urls and catids
oldUrls = (char*)malloc(oldUrlsBufferSize);
if (!oldUrls) {
printf("Out of Memory!\n");
goto oldErrExit;
}
oldUrlHashes = (uint64_t*)malloc (
sizeof(int64_t)*oldNumUrls );
if (!oldUrlHashes) {
printf("Out of Memory!\n");
goto oldErrExit;
}
removeOldUrl = (char*)malloc(oldNumUrls);
if (!removeOldUrl) {
printf("Out of Memory!\n");
goto oldErrExit;
}
oldCatids = (int32_t*)malloc(sizeof(int32_t)*oldCatidsBufferSize);
if (!oldCatids) {
printf("Out of Memory!\n");
goto oldErrExit;
}
oldNumCatids = (unsigned char*)malloc(oldNumUrls);
if (!oldNumCatids) {
printf("Out of Memory!\n");
goto oldErrExit;
}
printf("Loading Old Content Data...\n");
//while ( rdfStream.good() && currUrl < oldNumUrls ) {
while ( currUrl < oldNumUrls ) {
// read the next url
int16_t urlLen = 0;
//rdfStream.read((char*)&urlLen, sizeof(int16_t));
int32_t n = fileRead(rdfStream, &urlLen, sizeof(int16_t));
if ( n < 0 || n > (int32_t)sizeof(int16_t) ) {
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
break;
// make sure there's room in the buffer
if (urlp + urlLen + 4 >= oldUrlsBufferSize) {
char *re_urls = (char*)realloc(
oldUrls,
oldUrlsBufferSize +
OLDURL_BUFFER_SIZE );
if (!re_urls) {
printf("Out of Memory!\n");
goto oldErrExit;
}
oldUrls = re_urls;
oldUrlsBufferSize += OLDURL_BUFFER_SIZE;
}
// insert a space between urls
//oldUrls[urlp] = '\n';
//urlp++;
//char *url = &m_urls[urlp];
//rdfStream.read(&oldUrls[urlp], urlLen);
if (urlLen <= 0) {
printf("WARNING: FOUND %"INT32" LENGTH URL, "
"WILL BE SKIPPED (1)\n",
(int32_t)urlLen );
}
n = fileRead(rdfStream, &oldUrls[urlp], urlLen);
if ( n < 0 || n > urlLen ) {
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
break;
//rdfStream.read(oldUrl, urlLen);
// normalize it
urlLen = fixUrl(&oldUrls[urlp], urlLen);
// make the hash
oldUrlHashes[currUrl] =
hash64Lower_a(&oldUrls[urlp], urlLen, 0);
removeOldUrl[currUrl] = 0;
// increment the buffer pointer
if (urlLen <= 0) {
printf("WARNING: FOUND %"INT32" LENGTH URL, "
"WILL BE SKIPPED (2)\n",
(int32_t)urlLen );
}
urlp += urlLen;
//urlLen = fixUrl(oldUrl, urlLen);
// null terminate
oldUrls[urlp] = '\0';
urlp++;
currUrl++;
}
currUrl = 0;
//while ( rdfStream.good() && currUrl < oldNumUrls ) {
while ( currUrl < oldNumUrls ) {
// get the number of catids
oldNumCatids[currUrl] = 0;
//rdfStream.read((char*)&oldNumCatids[currUrl], 1);
int32_t n = fileRead(rdfStream, &oldNumCatids[currUrl], 1);
if ( n < 0 || n > 1 ) {
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
break;
// make sure there's room
if ( catidp + oldNumCatids[currUrl] + 1 >=
oldCatidsBufferSize ) {
int32_t *re_catids = (int32_t*)realloc(
oldCatids,
sizeof(int32_t)*(oldCatidsBufferSize+
OLDCATID_BUFFER_SIZE) );
if (!re_catids) {
printf("Out of Memory!\n");
goto oldErrExit;
}
oldCatids = re_catids;
oldCatidsBufferSize += OLDCATID_BUFFER_SIZE;
}
//rdfStream.read((char*)&oldCatids[catidp],
// sizeof(int32_t)*oldNumCatids[currUrl]);
int32_t readSize = sizeof(int32_t)*oldNumCatids[currUrl];
n = fileRead(rdfStream, &oldCatids[catidp], readSize);
if ( n < 0 || n > readSize ) {
printf("Error Reading %s\n",filename);
//CONTENT_OUTPUT_FILE);
goto oldErrExit;
}
if ( n == 0 )
break;
// next url
catidp += oldNumCatids[currUrl];
currUrl++;
}
// now check the old urls against the new for changes
catidp = 0;
for ( int32_t i = 0; i < oldNumUrls; i++ ) {
// check the new url hash for the old url
int32_t n = oldNumCatids[i];
// skip bad urls
if ( oldUrlHashes[i] == 0 ) {
printf("WARNING: FOUND 0 LENGTH URL, "
"SKIPPING\n" );
catidp += n;
continue;
}
int32_t urlIndex = getUrlHash(oldUrlHashes[i]);
// check for a removed url
if ( urlIndex == -1 ) {
removeOldUrl[i] = 1;
numRemoveUrls++;
catidp += n;
continue;
}
// check if we have the same number of catids
if ( urlInfos[urlIndex].m_numCatids != n )
goto oldIsDifferent;
// check if all the catids match
for ( int32_t co = 0; co < n; co++ ) {
bool catMatch = false;
for ( int32_t cn = 0; cn < n; cn++ ) {
if ( urlInfos[urlIndex].m_catids[cn] ==
oldCatids[catidp + co] ) {
catMatch = true;
break;
}
}
if ( !catMatch )
goto oldIsDifferent;
}
// exact match, mark it unchanged and goto the next
catidp += n;
urlInfos[urlIndex].m_changed = 0;
numUpdateUrls--;
continue;
oldIsDifferent:
// just go on, this is already marked as changed
catidp += n;
numChangedUrls++;
continue;
}
printf(" Urls to Update: %"INT32"\n", numChangedUrls);
printf(" Urls to Add: %"INT32"\n",
numUpdateUrls - numChangedUrls);
printf(" Urls to Remove: %"INT32"\n", numRemoveUrls);
//
// . write out the diff file, contains new and changed urls and
// also urls to remove
//
// open the new diff file for writing
sprintf(filename, "%s%s.new.diff", dir,CONTENT_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::trunc);
outStream = open ( filename, O_CREAT|O_WRONLY|O_TRUNC,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Opening %s\n", filename);
goto oldErrExit;
}
printf("\nOpened %s for writing.\n", filename);
// write out the number of urls to update/add
//outStream.write(&numUpdateUrls, sizeof(int32_t));
if ( write(outStream, &numUpdateUrls, sizeof(int32_t)) !=
sizeof(int32_t) ) {
printf("Error writing to %s\n", filename);
goto oldErrExit;
}
// write out the number of urls to delete
//outStream.write(&numRemoveUrls, sizeof(int32_t));
if ( write(outStream, &numRemoveUrls, sizeof(int32_t)) !=
sizeof(int32_t) ) {
printf("Error writing to %s\n", filename);
goto oldErrExit;
}
// write out the urls to update/add
for ( int32_t i = 0; i < numUrlInfos; i++ ) {
if ( urlInfos[i].m_changed == 0 ) {
continue;
}
// write the changed url info
//outStream.write((char*)&urlInfos[i].m_urlLen,
// sizeof(int16_t));
//outStream.write(&urlBuffer[urlInfos[i].m_urlOffset],
// sizeof(char)*urlInfos[i].m_urlLen);
//outStream.write((char*)&urlInfos[i].m_numCatids,
// sizeof(char));
//outStream.write((char*)urlInfos[i].m_catids,
// sizeof(int32_t)*urlInfos[i].m_numCatids);
//outStream.write((char*)&i, sizeof(int32_t));
if ( write(outStream, &i, sizeof(int32_t)) !=
sizeof(int32_t) ) {
printf("Error writing to outStream\n");
goto oldErrExit;
}
updateIndexesWritten++;
numIdsToUpdate += urlInfos[i].m_numCatids;
}
printf ( "Wrote %"INT32" urls and %"INT32" catids to update/add.\n",
updateIndexesWritten, numIdsToUpdate );
if ( updateIndexesWritten != numUpdateUrls )
printf ( "WARNING: Wrote %"INT32" Update Indexes, Should be"
"%"INT32"!", updateIndexesWritten, numUpdateUrls );
// write out the urls to delete
urlp = 0;
for ( int32_t i = 0; i < oldNumUrls; i++ ) {
int16_t oldUrlLen = gbstrlen(&oldUrls[urlp]);
if ( removeOldUrl[i] == 0 ) {
urlp += oldUrlLen + 1;
continue;
}
// write the url to remove
if ( oldUrlLen <= 0 )
printf("WARNING: ATTEMPTING TO WRITE %"INT32" "
"LENGTH URL.\n", (int32_t)oldUrlLen );
//outStream.write((char*)&oldUrlLen, sizeof(int16_t));
if ( write(outStream, &oldUrlLen, sizeof(int16_t)) !=
sizeof(int16_t) ) {
printf("Error writing to outStream\n");
goto oldErrExit;
}
//outStream.write((char*)&oldUrls[urlp], oldUrlLen);
if ( write(outStream, &oldUrls[urlp], oldUrlLen) !=
oldUrlLen ) {
printf("Error writing to outStream\n");
goto oldErrExit;
}
urlp += oldUrlLen + 1;
}
// close the file
//outStream.clear();
//outStream.close();
close(outStream);
printf("Completed Writing File.\n");
printf("\n");
// no error
oldErr = false;
goto oldGoodExit;
oldErrExit:
// set error
oldErr = true;
oldGoodExit:
// close the file
//rdfStream.clear();
//rdfStream.close();
close(rdfStream);
// free the buffers
if (oldUrls) free(oldUrls);
if (oldUrlHashes) free(oldUrlHashes);
if (removeOldUrl) free(removeOldUrl);
if (oldCatids) free(oldCatids);
if (oldNumCatids) free(oldNumCatids);
if (oldErr) goto errExit;
}
printf("Clearing Url Hash Table...\n");
// clear the url index hash
clearUrlHashTable();
// finish up if we're just dumping urls
if ( mode == MODE_URLDUMP || mode == MODE_DIFFURLDUMP )
goto goodEnd;
// . now we want to serialize the needed data into
// one (or more?) file(s) to be quickly read by gb
if ( mode == MODE_NEW )
sprintf(filename, "%s%s", dir,STRUCTURE_OUTPUT_FILE);
else
sprintf(filename, "%s%s.new", dir,STRUCTURE_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::ate);
outStream = open ( filename, O_WRONLY|O_APPEND,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpened %s for writing.\n", filename);
// write the cats
//outStream.write((char*)rdfCats, sizeof(RdfCat)*numRdfCats);
for (int32_t i = 0; i < numRdfCats; i++) {
//outStream.write((char*)&rdfCats[i].m_catid, sizeof(int32_t));
if ( write(outStream, &rdfCats[i].m_catid, sizeof(int32_t)) !=
sizeof(int32_t) ) {
printf("Error writing cats to outStream.\n");
goto errExit;
}
//outStream.write((char*)&rdfCats[i].m_parentid, sizeof(int32_t));
if ( write(outStream, &rdfCats[i].m_parentid, sizeof(int32_t)) !=
sizeof(int32_t) ) {
printf("Error writing cats to outStream.\n");
goto errExit;
}
//outStream.write((char*)&rdfCats[i].m_numSymParents, sizeof(int16_t));
//outStream.write((char*)&rdfCats[i].m_nameOffset, sizeof(int32_t));
if ( write(outStream, &rdfCats[i].m_nameOffset, sizeof(int32_t)) !=
sizeof(int32_t) ) {
printf("Error writing cats to outStream.\n");
goto errExit;
}
//outStream.write((char*)&rdfCats[i].m_nameLen, sizeof(int16_t));
if ( write(outStream, &rdfCats[i].m_nameLen, sizeof(int16_t)) !=
sizeof(int16_t) ) {
printf("Error writing cats to outStream.\n");
goto errExit;
}
//outStream.write((char*)&rdfCats[i].m_structureOffset, sizeof(int32_t));
if ( write(outStream, &rdfCats[i].m_structureOffset,
sizeof(int32_t)) != sizeof(int32_t) ) {
printf("Error writing cats to outStream.\n");
goto errExit;
}
//outStream.write((char*)&rdfCats[i].m_contentOffset, sizeof(int32_t));
if ( write(outStream, &rdfCats[i].m_contentOffset,
sizeof(int32_t)) != sizeof(int32_t) ) {
printf("Error writing cats to outStream.\n");
goto errExit;
}
//outStream.write((char*)&rdfCats[i].m_numUrls, sizeof(int32_t));
if ( write(outStream, &rdfCats[i].m_numUrls, sizeof(int32_t)) !=
sizeof(int32_t) ) {
printf("Error writing cats to outStream.\n");
goto errExit;
}
}
// write the symbolic parents
//for (int32_t i = 0; i < numRdfCats; i++)
// for (int32_t s = 0; s < rdfCats[i].m_numSymParents; s++)
// outStream.write((char*)&rdfCats[i].m_symParents[s], sizeof(int32_t));
// write the cat hashes
for (int32_t i = 0; i < numRdfCats; i++) {
//outStream.write((char*)&rdfCats[i].m_catHash, sizeof(int32_t));
if ( write(outStream, &rdfCats[i].m_catHash, sizeof(int32_t)) !=
sizeof(int32_t) ) {
printf("Error writing cats to outStream.\n");
goto errExit;
}
}
// close the output file
//outStream.clear();
//outStream.close();
close(outStream);
printf("Completed Writing File.\n");
// write another file for the urls
if ( mode == MODE_NEW )
sprintf(filename, "%s%s", dir,CONTENT_OUTPUT_FILE);
else
sprintf(filename, "%s%s.new", dir,CONTENT_OUTPUT_FILE);
//outStream.open(filename, ofstream::out|ofstream::ate);
outStream = open ( filename, O_WRONLY,
S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP );
//outStream.open(filename, ofstream::out|ofstream::trunc);
//endpos = outStream.tellp();
// make sure it opened okay
//if (!outStream.is_open()) {
if ( outStream < 0 ) {
printf("Error Opening %s\n", filename);
goto errExit;
}
printf("\nOpened %s for writing.\n", filename);
//outStream.seekp(0);
lseek(outStream, 0, SEEK_SET);
// write the number of urls at the start of the file
//outStream.write((char*)&numUrlInfos, sizeof(int32_t));
if ( write(outStream, &numUrlInfos, sizeof(int32_t)) != sizeof(int32_t) ) {
printf("Error writing to outStream\n");
goto errExit;
}
// seek to the end
//outStream.seekp(endpos);
lseek(outStream, 0, SEEK_END);
// write the urls
for (int32_t i = 0; i < numUrlInfos; i++) {
//outStream.write((char*)&urlInfos[i].m_hash, sizeof(int64_t));
//outStream.write((char*)&urlInfos[i].m_urlLen, sizeof(int16_t));
//outStream.write(&urlBuffer[urlInfos[i].m_urlOffset],
// sizeof(char)*urlInfos[i].m_urlLen);
//outStream.write((char*)&urlInfos[i].m_numCatids, sizeof(char));
if ( write(outStream, &urlInfos[i].m_numCatids, sizeof(char)) !=
sizeof(char) ) {
printf("Error writing to outStream\n");
goto errExit;
}
//outStream.write((char*)urlInfos[i].m_catids, sizeof(int32_t)*
// urlInfos[i].m_numCatids);
int32_t writeSize = sizeof(int32_t)*urlInfos[i].m_numCatids;
if ( write(outStream, urlInfos[i].m_catids, writeSize) !=
writeSize ) {
printf("Error writing to outStream\n");
goto errExit;
}
}
// close the output file
//outStream.clear();
//outStream.close();
close(outStream);
printf("Completed Writing File.\n\n");
goodEnd:
// free up the buffers
if (urlBuffer)
free(urlBuffer);
if (urlInfos) {
for (int32_t i = 0; i < numUrlInfos; i++) {
if (urlInfos[i].m_catids)
free(urlInfos[i].m_catids);
}
free(urlInfos);
}
//free(nameBuffer);
if (rdfCats)
free(rdfCats);
if (rdfBuffer)
free(rdfBuffer);
// success
return 0;
// error exit points
errExit1:
clearUrlHashTable();
clearHashTable();
//rdfStream.clear();
//rdfStream.close();
close(rdfStream);
errExit:
if (updateIndexes)
free(updateIndexes);
if (urlBuffer)
free(urlBuffer);
if (urlInfos) {
for (int32_t i = 0; i < numUrlInfos; i++) {
if (urlInfos[i].m_catids)
free(urlInfos[i].m_catids);
}
free(urlInfos);
}
if (nameBuffer)
free(nameBuffer);
if (rdfCats)
free(rdfCats);
if (rdfBuffer)
free(rdfBuffer);
// failure
return 1;
}