open-source-search-engine/Categories.h

//
// Gigablast, Copyright March 2005
// Author: Javier Olivares <jolivares@gigablast.com>
//
// Stores Categories in a Hierarchy
// Based on DMOZ
//

#ifndef _CATEGORY_H_
#define _CATEGORY_H_

#include "Mem.h"
#include "HashTable.h"

// Buffer sizes used while parsing the rdf dumps.
// RDFBUFFER_SIZE evaluates to 100*1024*1024; RDFSMALLBUFFER_SIZE is the
// size in bytes of the Categories::m_rdfSmallBuffer member array.
#define RDFBUFFER_SIZE      (1024*1024*100)
#define RDFSMALLBUFFER_SIZE (32*1024)
// names of the rdf input files (presumably the raw DMOZ dumps -- confirm
// against the code that opens them)
#define RDFSTRUCTURE_FILE "structure.rdf.u8"
#define RDFCONTENT_FILE   "content.rdf.u8"

// names of the generated "gbdmoz" data files
#define STRUCTURE_OUTPUT_FILE  "gbdmoz.structure.dat"
#define CONTENT_OUTPUT_FILE    "gbdmoz.content.dat"
#define URL_OUTPUT_FILE        "gbdmoz.urls.dat"
#define URLTEXT_OUTPUT_FILE    "gbdmoz.urls.txt"

// Parsing limits.
// MAX_TAG_LEN sizes the Categories tag buffer member (MAX_TAG_LEN+1 chars).
// NOTE(review): MAX_URL_CATIDS and MAX_CATIDS are both 64; whether they are
// meant to stay in sync is not visible from this header.
#define MAX_CATID_LEN    63
#define MAX_TAG_LEN      127
#define MAX_URL_CATIDS   64
#define MAX_URLTXT_SIZE  500000
#define MAX_CATIDS       64
#define MAX_CATNAME_LEN  1024

// hash table sizes (units/slots vs bytes not visible here -- TODO confirm)
#define HASHTABLE_SIZE    (1024*1024)
#define URLHASHTABLE_SIZE (10*1024*1024)

// Sub category limits and type codes.
// NOTE(review): the SUBCAT_* values are presumably what gets stored in
// SubCategory::m_type (a char; every value here is <= 127) -- confirm
// against generateSubCats().
#define MAX_SUB_CATS       1024
#define SUBCAT_LETTERBAR   10
#define SUBCAT_NARROW2     30
#define SUBCAT_SYMBOLIC2   31
#define SUBCAT_NARROW1     50
#define SUBCAT_SYMBOLIC1   51
#define SUBCAT_NARROW      70
#define SUBCAT_SYMBOLIC    71
#define SUBCAT_RELATED     90
#define SUBCAT_ALTLANG     110

// One category record; Categories::m_cats is an array of these.
// NOTE(review): field order and types presumably mirror the layout of the
// loaded category data, so do not reorder or resize fields without
// checking loadCategories().
struct Category {
	// id of this category and of its parent category
	long  m_catid;
	long  m_parentid;
	//short m_numSymParents;
	//long  m_symParentsOffset;
	// location and length of the category name
	// (presumably an offset into Categories::m_nameBuffer -- confirm)
	long  m_nameOffset;
	short m_nameLen;
	// offsets of this category's data (presumably into the structure and
	// content files respectively -- confirm)
	unsigned long m_structureOffset;
	unsigned long m_contentOffset;
	// url count, returned by Categories::getNumUrlsFromIndex()
	long  m_numUrls;
};

// One entry of the Categories::m_catHash array: pairs a hash value with a
// category index.
// NOTE(review): presumably m_hash is a hash of the category path and
// m_catIndex an index into Categories::m_cats -- confirm against
// getIndexFromPath().
struct CategoryHash {
	unsigned long  m_hash;
	long m_catIndex;
};

// Variable-length sub category record.
// The fixed fields are followed in memory (starting at m_buf) by the
// prefix string, one extra byte, the name string and one more extra byte.
// NOTE(review): the two extra bytes are presumably NUL terminators --
// confirm against the code that fills these records.
struct SubCategory {
	//long  m_prefixOffset;
	long  m_prefixLen;
	//long  m_nameOffset;
	long  m_nameLen;
	char  m_type;

	// size of the fixed fields plus both strings plus the 2 extra bytes
	long getRecSize () {
		return sizeof(*this) + m_prefixLen + m_nameLen + 2;
	}

	// the prefix string sits at the very start of the trailing buffer
	char *getPrefix () {
		return &m_buf[0];
	}

	// the name string starts one byte past the end of the prefix
	char *getName () {
		return getPrefix() + m_prefixLen + 1;
	}

	// zero-length trailing array; must remain the last data member
	char  m_buf[0];
};

// Holds the category hierarchy (based on DMOZ) and offers lookups on it:
// id/path -> index, RTL/adult/bad classification, path printing, and
// title/summary retrieval for urls.
// NOTE(review): Url, SafeBuf and HttpRequest are not declared in this
// header; they are presumably provided by the includes above -- confirm.
class Categories {
public:
	Categories();
	~Categories();

	// read "count" bytes from file descriptor "fileid" into "buf"
	// NOTE(review): meaning of the return value is not visible here
	long fileRead ( int fileid, void *buf, size_t count );

	void reset();

	// load the hierarchy from a file
	long loadCategories ( char *filename );

	// . this is called by loadCategories() and constructs m_badTable
	// . it will load/save it from/to disk, too
	bool makeBadHashTable ( ) ;
	bool addUrlsToBadHashTable ( long catid ) ;

	// get the index of a cat from its id
	// -1 if not found
	long getIndexFromId   ( long catid );
	long getIndexFromPath ( char *str, long strLen );
	long getIdFromPath    ( char *str, long strLen );

	// determine if a category should be printed RTL
	// NOTE(review): how the "...Start" variants differ from the plain
	// ones is not visible from this header -- see the implementation
	bool isIdRTLStart    ( long catid );
	bool isIndexRTLStart ( long catIndex );
	bool isIdRTL         ( long catid );
	bool isIndexRTL      ( long catIndex );

	// see if the category is Adult
	bool isIdAdultStart    ( long catid );
	bool isIndexAdultStart ( long catIndex );
	bool isIdAdult         ( long catid );
	bool isIndexAdult      ( long catIndex );

	// is it in a bad cat, like adult, gambling, online pharmacies
	bool isIdBadStart    ( long catid );
	bool isIndexBadStart ( long catIndex );
	bool isIdBad         ( long catid );
	bool isIndexBad      ( long catIndex );
	// is this url directly in a dmoz adult category?
	bool isInBadCat      ( Url *u ) ;
	bool isInBadCat      ( unsigned long urlHash );

	// print info of cats
	void printCats ( long start, long end );

	// print the path of this category
	void printPathFromId ( SafeBuf *sb ,
			       long  catid,
			       bool  raw = false,
			       bool  isRTL = false );
	void printPathFromIndex ( SafeBuf *sb ,
				  long  catIndex,
				  bool  raw = false,
				  bool  isRTL = false );

	// print the path bread crumb links for this category
	void printPathCrumbFromId    ( SafeBuf *sb ,
				       long  catid,
				       bool  isRTL = false );
	// the "FromIndex" variant takes a category index, so the parameter
	// is named catIndex like in printPathFromIndex()
	void printPathCrumbFromIndex ( SafeBuf *sb ,
				       long  catIndex,
				       bool  isRTL = false );

	// print the dmoz urls for the given catid into "sb"
	bool printUrlsInTopic ( class SafeBuf *sb , long catid  ) ;

	// . get the title and summary for a specific url
	//   and catid
	// . title/summ/anchor are optional output buffers, each with a
	//   length out-parameter and a maximum length
	//   (presumably in bytes -- confirm)
	bool getTitleAndSummary ( char  *url,
				  long   urlLen,
				  long   catid,
				  char  *title        = NULL,
				  long  *titleLen     = NULL,
				  long   maxTitleLen  = 0,
				  char  *summ         = NULL,
				  long  *summLen      = NULL,
				  long   maxSummLen   = 0,
				  char  *anchor       = NULL,
				  unsigned char *anchorLen    = NULL,
				  long   maxAnchorLen = 0 ,
				  long   niceness     = 0 ,
				  bool   justAddToTable = false );

	// normalize a url string
	long fixUrl ( char *url, long urlLen );

	// . generate sub categories for a given catid
	// . store list of SubCategories into "subCatBuf" return # stored
	// . hits disk without using threads... so kinda sucks...
	long generateSubCats ( long catid, SafeBuf *subCatBuf );

	// . number of urls recorded for the category at "catIndex"
	// . returns 0 when no categories are loaded or when catIndex is out
	//   of range; the lookup functions above return -1 for "not found"
	//   and that value must not be used to index m_cats
	long getNumUrlsFromIndex ( long catIndex ) {
		if ( ! m_cats || catIndex < 0 || catIndex >= m_numCats )
			return 0;
		return m_cats[catIndex].m_numUrls;
	}

	// creates a directory search request url
	//void createDirectorySearchUrl ( Url  *url,
	long createDirSearchRequest ( char *requestBuf,
				      long  requestBufSize,
				      long  catid,
				      char *hostname,
				      long  hostnameLen,
				      char *coll,
				      long  collLen,
				      char *cgi ,//= NULL,
				      long  cgiLen ,//= 0,
				      bool  cgiFromRequest ,//= false ,
				      class HttpRequest *r );

	bool initLangTables(void);
	bool loadLangTables(void);
	uint8_t findLanguage(char *addr);

	// Categories
	Category *m_cats;
	long      m_numCats;

	// name buffer
	char *m_nameBuffer;
	long  m_nameBufferSize;

	// symbolic parent buffer
	//long *m_symParents;
	//long  m_numSymParents;

	// hash buffer
	CategoryHash *m_catHash;

	// full buffer
	char *m_buffer;
	long  m_bufferSize;

protected:
	// for parsing the original dmoz files
	char* incRdfPtr   ( long skip = 1 );
	long  rdfParse    ( char *tagName );
	long  rdfNextTag  ( );
	long  fillNextString  ( char *str, long max );
	long  fillNextTagBody ( char *str, long max );

	// rdf stream
	char *m_rdfPtr;
	char *m_rdfEnd;
	//std::ifstream m_rdfStream;
	int   m_rdfStream;
	char *m_rdfBuffer;
	long  m_rdfBufferSize;
	long  m_currOffset;
	// static rdf buffer
	char  m_rdfSmallBuffer[RDFSMALLBUFFER_SIZE];
	// tag buffer
	// NOTE(review): the name looks like a search-and-replace casualty of
	// "m_tagBuffer"; left unchanged because code outside this header
	// presumably refers to it
	char  m_tagRecfer[MAX_TAG_LEN+1];
	long  m_tagLen;

	// table built by makeBadHashTable()
	HashTable m_badTable;


	// sub category buffer
	//SubCategory m_subCats[MAX_SUB_CATS];
	//long m_numSubCats;
};

// Global Categories instances, defined in another translation unit.
// NOTE(review): presumably g_categories points at whichever of the two
// instances is currently in use -- confirm where it is assigned.
extern class Categories  g_categories1;
extern class Categories  g_categories2;
extern class Categories *g_categories;

#endif
Initial file population. 2013-08-03 00:12:24 +04:00			`//`
			`// Gigablast, Copyright March 2005`
			`// Author: Javier Olivares <jolivares@gigablast.com>`
			`//`
			`// Stores Categories in a Hierarchy`
			`// Based on DMOZ`
			`//`

			`#ifndef _CATEGORY_H_`
			`#define _CATEGORY_H_`

			`#include "Mem.h"`
			`#include "HashTable.h"`

			`#define RDFBUFFER_SIZE (1024*1024*100)`
			`#define RDFSMALLBUFFER_SIZE (32*1024)`
			`#define RDFSTRUCTURE_FILE "structure.rdf.u8"`
			`#define RDFCONTENT_FILE "content.rdf.u8"`

			`#define STRUCTURE_OUTPUT_FILE "gbdmoz.structure.dat"`
			`#define CONTENT_OUTPUT_FILE "gbdmoz.content.dat"`
			`#define URL_OUTPUT_FILE "gbdmoz.urls.dat"`
			`#define URLTEXT_OUTPUT_FILE "gbdmoz.urls.txt"`

			`#define MAX_CATID_LEN 63`
			`#define MAX_TAG_LEN 127`
			`#define MAX_URL_CATIDS 64`
			`#define MAX_URLTXT_SIZE 500000`
			`#define MAX_CATIDS 64`
			`#define MAX_CATNAME_LEN 1024`

			`#define HASHTABLE_SIZE (1024*1024)`
			`#define URLHASHTABLE_SIZE (10*1024*1024)`

			`#define MAX_SUB_CATS 1024`
			`#define SUBCAT_LETTERBAR 10`
			`#define SUBCAT_NARROW2 30`
			`#define SUBCAT_SYMBOLIC2 31`
			`#define SUBCAT_NARROW1 50`
			`#define SUBCAT_SYMBOLIC1 51`
			`#define SUBCAT_NARROW 70`
			`#define SUBCAT_SYMBOLIC 71`
			`#define SUBCAT_RELATED 90`
			`#define SUBCAT_ALTLANG 110`

			`struct Category {`
			`long m_catid;`
			`long m_parentid;`
			`//short m_numSymParents;`
			`//long m_symParentsOffset;`
			`long m_nameOffset;`
			`short m_nameLen;`
			`unsigned long m_structureOffset;`
			`unsigned long m_contentOffset;`
			`long m_numUrls;`
			`};`

			`struct CategoryHash {`
			`unsigned long m_hash;`
			`long m_catIndex;`
			`};`

			`struct SubCategory {`
trying to bring back dmoz integration. 2013-10-03 08:34:21 +04:00			`//long m_prefixOffset;`
Initial file population. 2013-08-03 00:12:24 +04:00			`long m_prefixLen;`
trying to bring back dmoz integration. 2013-10-03 08:34:21 +04:00			`//long m_nameOffset;`
Initial file population. 2013-08-03 00:12:24 +04:00			`long m_nameLen;`
			`char m_type;`
trying to bring back dmoz integration. 2013-10-03 08:34:21 +04:00			`long getRecSize () { return sizeof(SubCategory)+m_prefixLen+m_nameLen+2;};`
			`char *getPrefix() { return m_buf; };`
			`char *getName () { return m_buf+m_prefixLen+1;};`
			`char m_buf[0];`
Initial file population. 2013-08-03 00:12:24 +04:00			`};`

			`class Categories {`
			`public:`
			`Categories();`
			`~Categories();`

			`long fileRead ( int fileid, void *buf, size_t count );`

			`void reset();`

			`// load the hierarchy from a file`
			`long loadCategories ( char *filename );`

			`// . this is called by loadCategories() and constructs m_adultTable`
			`// . it will load/save it from/to disk, too`
			`bool makeBadHashTable ( ) ;`
			`bool addUrlsToBadHashTable ( long catid ) ;`

			`// get the index of a cat from its id`
			`// -1 if not found`
			`long getIndexFromId ( long catid );`
			`long getIndexFromPath ( char *str, long strLen );`
			`long getIdFromPath ( char *str, long strLen );`

			`// determine if a category should be printed RTL`
			`bool isIdRTLStart ( long catid );`
			`bool isIndexRTLStart ( long catIndex );`
			`bool isIdRTL ( long catid );`
			`bool isIndexRTL ( long catIndex );`

			`// see if the category is Adult`
			`bool isIdAdultStart ( long catid );`
			`bool isIndexAdultStart ( long catIndex );`
			`bool isIdAdult ( long catid );`
			`bool isIndexAdult ( long catIndex );`

			`// is it in a bad cat, like adult, gambling, online pharmacies`
			`bool isIdBadStart ( long catid );`
			`bool isIndexBadStart ( long catIndex );`
			`bool isIdBad ( long catid );`
			`bool isIndexBad ( long catIndex );`
			`// is this url directly in a dmoz adult category?`
			`bool isInBadCat ( Url *u ) ;`
			`bool isInBadCat ( unsigned long urlHash );`

			`// print info of cats`
			`void printCats ( long start, long end );`

			`// print the path of this category`
			`void printPathFromId ( SafeBuf *sb ,`
			`long catid,`
			`bool raw = false,`
			`bool isRTL = false );`
			`void printPathFromIndex ( SafeBuf *sb ,`
			`long catIndex,`
			`bool raw = false,`
			`bool isRTL = false );`

			`// print the path bread crumb links for this category`
			`void printPathCrumbFromId ( SafeBuf *sb ,`
			`long catid,`
			`bool isRTL = false );`
			`void printPathCrumbFromIndex ( SafeBuf *sb ,`
			`long catid,`
			`bool isRTL = false );`

added Categories::printUrlsInTopic() to print the dmoz urls for a catid. can replace us doing a search for the time being. 2013-10-11 10:03:07 +04:00			`bool printUrlsInTopic ( class SafeBuf *sb , long catid ) ;`

Initial file population. 2013-08-03 00:12:24 +04:00			`// . get the title and summary for a specific url`
			`// and catid`
			`bool getTitleAndSummary ( char *url,`
			`long urlLen,`
			`long catid,`
			`char *title = NULL,`
			`long *titleLen = NULL,`
			`long maxTitleLen = 0,`
			`char *summ = NULL,`
			`long *summLen = NULL,`
			`long maxSummLen = 0,`
			`char *anchor = NULL,`
			`unsigned char *anchorLen = NULL,`
			`long maxAnchorLen = 0 ,`
			`long niceness = 0 ,`
			`bool justAddToTable = false );`

			`// normalize a url string`
			`long fixUrl ( char *url, long urlLen );`

trying to bring back dmoz integration. 2013-10-03 08:34:21 +04:00			`// . generate sub categories for a given catid`
			`// . store list of SubCategories into "subCatBuf" return # stored`
			`// . hits disk without using threads... so kinda sucks...`
			`long generateSubCats ( long catid, SafeBuf *subCatBuf );`
Initial file population. 2013-08-03 00:12:24 +04:00
			`long getNumUrlsFromIndex ( long catIndex ) {`
			`return m_cats[catIndex].m_numUrls; };`

			`// creates a directory search request url`
			`//void createDirectorySearchUrl ( Url *url,`
			`long createDirSearchRequest ( char *requestBuf,`
			`long requestBufSize,`
			`long catid,`
			`char *hostname,`
			`long hostnameLen,`
			`char *coll,`
			`long collLen,`
			`char *cgi ,//= NULL,`
			`long cgiLen ,//= 0,`
			`bool cgiFromRequest ,//= false ,`
			`class HttpRequest *r );`

			`bool initLangTables(void);`
			`bool loadLangTables(void);`
			`uint8_t findLanguage(char *addr);`

			`// Categories`
			`Category *m_cats;`
			`long m_numCats;`

			`// name buffer`
			`char *m_nameBuffer;`
			`long m_nameBufferSize;`

			`// symbolic parent buffer`
			`//long *m_symParents;`
			`//long m_numSymParents;`

			`// hash buffer`
			`CategoryHash *m_catHash;`

			`// full buffer`
			`char *m_buffer;`
			`long m_bufferSize;`

			`protected:`
			`// for parsing the original dmoz files`
			`char* incRdfPtr ( long skip = 1 );`
			`long rdfParse ( char *tagName );`
			`long rdfNextTag ( );`
			`long fillNextString ( char *str, long max );`
			`long fillNextTagBody ( char *str, long max );`

			`// rdf stream`
			`char *m_rdfPtr;`
			`char *m_rdfEnd;`
			`//std::ifstream m_rdfStream;`
			`int m_rdfStream;`
			`char *m_rdfBuffer;`
			`long m_rdfBufferSize;`
			`long m_currOffset;`
			`// static rdf buffer`
			`char m_rdfSmallBuffer[RDFSMALLBUFFER_SIZE];`
			`// tag buffer`
			`char m_tagRecfer[MAX_TAG_LEN+1];`
			`long m_tagLen;`

			`HashTable m_badTable;`


			`// sub category buffer`
			`//SubCategory m_subCats[MAX_SUB_CATS];`
			`//long m_numSubCats;`
			`};`

			`extern class Categories g_categories1;`
			`extern class Categories g_categories2;`
			`extern class Categories *g_categories;`

			`#endif`