//
// Gigablast, Copyright March 2005
// Author: Javier Olivares <jolivares@gigablast.com>
//
// Stores Categories in a Hierarchy
// Based on DMOZ
//

#ifndef _CATEGORY_H_
#define _CATEGORY_H_

#include "Mem.h"
#include "HashTable.h"

// buffer sizes, in bytes, for reading the rdf input
// . RDFBUFFER_SIZE is 100MB
// . RDFSMALLBUFFER_SIZE is 32KB and sizes Categories::m_rdfSmallBuffer
#define RDFBUFFER_SIZE      (1024*1024*100)
#define RDFSMALLBUFFER_SIZE (32*1024)
// file names of the rdf input
// NOTE(review): presumably the DMOZ structure and content dumps -- confirm
#define RDFSTRUCTURE_FILE "structure.rdf.u8"
#define RDFCONTENT_FILE   "content.rdf.u8"

// file names of the generated output files
#define STRUCTURE_OUTPUT_FILE  "gbdmoz.structure.dat"
#define CONTENT_OUTPUT_FILE    "gbdmoz.content.dat"
#define URL_OUTPUT_FILE        "gbdmoz.urls.dat"
#define URLTEXT_OUTPUT_FILE    "gbdmoz.urls.txt"

// parsing and storage limits
// . MAX_TAG_LEN sizes Categories::m_tagRecfer (plus one extra byte)
// . NOTE(review): the users of the other limits are not visible in this
//   header -- confirm their meaning against the .cpp
#define MAX_CATID_LEN    63
#define MAX_TAG_LEN      127
#define MAX_URL_CATIDS   64
#define MAX_URLTXT_SIZE  500000
#define MAX_CATIDS       96
#define MAX_CATNAME_LEN  1024

// hash table sizes
// NOTE(review): presumably slot counts for the category and url hash
// tables -- confirm
#define HASHTABLE_SIZE    (1024*1024)
#define URLHASHTABLE_SIZE (10*1024*1024)

// sub category limit and type codes
// NOTE(review): the SUBCAT_* values are presumably what gets stored in
// SubCategory::m_type -- confirm
#define MAX_SUB_CATS       1024
#define SUBCAT_LETTERBAR   10
#define SUBCAT_NARROW2     30
#define SUBCAT_SYMBOLIC2   31
#define SUBCAT_NARROW1     50
#define SUBCAT_SYMBOLIC1   51
#define SUBCAT_NARROW      70
#define SUBCAT_SYMBOLIC    71
#define SUBCAT_RELATED     90
#define SUBCAT_ALTLANG     110

// . one category record; Categories::m_cats points to an array of these
// . NOTE(review): field meanings marked "presumably" are inferred from the
//   names only -- confirm against the .cpp before relying on them
struct Category {
	// id of this category and id of its parent
	long  m_catid;
	long  m_parentid;
	//short m_numSymParents;
	//long  m_symParentsOffset;
	// presumably the offset and length of this category's name within
	// Categories::m_nameBuffer
	long  m_nameOffset;
	short m_nameLen;
	// presumably byte offsets of this category's entry in the structure
	// and content files
	unsigned long m_structureOffset;
	unsigned long m_contentOffset;
	// url count, returned by Categories::getNumUrlsFromIndex()
	long  m_numUrls;
};

// . pairs a hash value with a category index
// . Categories::m_catHash points to these
// . NOTE(review): presumably m_hash is a hash of the category path and
//   m_catIndex is an index into Categories::m_cats -- confirm
struct CategoryHash {
	unsigned long  m_hash;
	long m_catIndex;
};

// . variable-length record describing one sub category
// . the fixed fields are followed in memory by the prefix string, one
//   extra byte, then the name string; both are reached through m_buf
// . the accessors come in const and non-const flavors so a record can be
//   read through a "const SubCategory *" as well
struct SubCategory {
	//long  m_prefixOffset;
	// length of the prefix string in bytes
	long  m_prefixLen;
	//long  m_nameOffset;
	// length of the name string in bytes
	long  m_nameLen;
	// NOTE(review): presumably one of the SUBCAT_* codes -- confirm
	char  m_type;
	// . size of the whole record: fixed part plus both strings plus 2
	// . NOTE(review): the 2 extra bytes are presumably the terminators
	//   after the prefix and after the name -- confirm
	long getRecSize () const {
		return sizeof(SubCategory)+m_prefixLen+m_nameLen+2; }
	// the prefix string starts right at m_buf
	char       *getPrefix()       { return m_buf; }
	const char *getPrefix() const { return m_buf; }
	// the name string starts one byte past the end of the prefix
	char       *getName  ()       { return m_buf+m_prefixLen+1; }
	const char *getName  () const { return m_buf+m_prefixLen+1; }
	// zero-length trailing array (compiler extension) marking where
	// the string data begins
	char  m_buf[0];
};

// . holds the category hierarchy (based on DMOZ) and the lookup, printing
//   and rdf parsing routines that work on it
// . only getNumUrlsFromIndex() is defined inline; everything else lives in
//   the .cpp
class Categories {
public:
	Categories();
	~Categories();

	// read up to "count" bytes from file descriptor "fileid" into "buf"
	// NOTE(review): return value convention is set in the .cpp -- confirm
	long fileRead ( int fileid, void *buf, size_t count );

	void reset();

	// load the hierarchy from a file
	long loadCategories ( char *filename );

	// . this is called by loadCategories() and constructs the table of
	//   bad categories
	// . NOTE(review): an older comment called this m_adultTable; the only
	//   table member declared here is m_badTable -- confirm
	// . it will load/save it from/to disk, too
	bool makeBadHashTable ( ) ;
	bool addUrlsToBadHashTable ( long catid ) ;

	// get the index of a cat from its id
	// -1 if not found
	long getIndexFromId   ( long catid );
	long getIndexFromPath ( char *str, long strLen );
	long getIdFromPath    ( char *str, long strLen );

	// determine if a category should be printed RTL
	bool isIdRTLStart    ( long catid );
	bool isIndexRTLStart ( long catIndex );
	bool isIdRTL         ( long catid );
	bool isIndexRTL      ( long catIndex );

	// see if the category is Adult
	bool isIdAdultStart    ( long catid );
	bool isIndexAdultStart ( long catIndex );
	bool isIdAdult         ( long catid );
	bool isIndexAdult      ( long catIndex );

	// is it in a bad cat, like adult, gambling, online pharmacies
	bool isIdBadStart    ( long catid );
	bool isIndexBadStart ( long catIndex );
	bool isIdBad         ( long catid );
	bool isIndexBad      ( long catIndex );
	// is this url directly in a dmoz adult category?
	bool isInBadCat      ( Url *u ) ;
	bool isInBadCat      ( unsigned long urlHash );

	// print info of cats
	void printCats ( long start, long end );

	// print the path of this category
	void printPathFromId ( SafeBuf *sb ,
			       long  catid,
			       bool  raw = false,
			       bool  isRTL = false );
	void printPathFromIndex ( SafeBuf *sb ,
				  long  catIndex,
				  bool  raw = false,
				  bool  isRTL = false );

	// print the path bread crumb links for this category
	// NOTE(review): printPathCrumbFromIndex() names its second parameter
	// "catid" although, going by the function name, it presumably takes
	// a category index -- confirm
	void printPathCrumbFromId    ( SafeBuf *sb ,
				       long  catid,
				       bool  isRTL = false );
	void printPathCrumbFromIndex ( SafeBuf *sb ,
				       long  catid,
				       bool  isRTL = false );

	bool printUrlsInTopic ( class SafeBuf *sb , long catid  ) ;

	// . get the title and summary for a specific url
	//   and catid
	bool getTitleAndSummary ( char  *url,
				  long   urlLen,
				  long   catid,
				  char  *title        = NULL,
				  long  *titleLen     = NULL,
				  long   maxTitleLen  = 0,
				  char  *summ         = NULL,
				  long  *summLen      = NULL,
				  long   maxSummLen   = 0,
				  char  *anchor       = NULL,
				  unsigned char *anchorLen    = NULL,
				  long   maxAnchorLen = 0 ,
				  long   niceness     = 0 ,
				  bool   justAddToTable = false );

	// normalize a url string
	long fixUrl ( char *url, long urlLen );

	// . generate sub categories for a given catid
	// . store list of SubCategories into "subCatBuf" return # stored
	// . hits disk without using threads... so kinda sucks...
	long generateSubCats ( long catid, SafeBuf *subCatBuf );

	// . number of urls recorded for the category at index "catIndex"
	// . returns 0 if no categories are loaded
	// . also returns 0 if "catIndex" is out of range, like the -1 that
	//   getIndexFromId() hands back when a category is not found, rather
	//   than reading outside of m_cats
	long getNumUrlsFromIndex ( long catIndex ) {
		if ( ! m_cats ) return 0;
		if ( catIndex < 0 || catIndex >= m_numCats ) return 0;
		return m_cats[catIndex].m_numUrls;
	}

	// creates a directory search request url
	//void createDirectorySearchUrl ( Url  *url,
	long createDirSearchRequest ( char *requestBuf,
				      long  requestBufSize,
				      long  catid,
				      char *hostname,
				      long  hostnameLen,
				      char *coll,
				      long  collLen,
				      char *cgi ,//= NULL,
				      long  cgiLen ,//= 0,
				      bool  cgiFromRequest ,//= false ,
				      class HttpRequest *r );

	// language table helpers
	// NOTE(review): what findLanguage() expects in "addr" is not visible
	// in this header -- confirm against the .cpp
	bool initLangTables(void);
	bool loadLangTables(void);
	uint8_t findLanguage(char *addr);

	// Categories
	// . m_numCats is the number of entries in m_cats
	Category *m_cats;
	long      m_numCats;

	// name buffer
	char *m_nameBuffer;
	long  m_nameBufferSize;

	// symbolic parent buffer
	//long *m_symParents;
	//long  m_numSymParents;

	// hash buffer
	CategoryHash *m_catHash;

	// full buffer
	char *m_buffer;
	long  m_bufferSize;

protected:
	// for parsing the original dmoz files
	char* incRdfPtr   ( long skip = 1 );
	long  rdfParse    ( char *tagName );
	long  rdfNextTag  ( );
	long  fillNextString  ( char *str, long max );
	long  fillNextTagBody ( char *str, long max );

	// rdf stream
	char *m_rdfPtr;
	char *m_rdfEnd;
	//std::ifstream m_rdfStream;
	// an int now that the ifstream is gone
	// NOTE(review): presumably a file descriptor -- confirm
	int   m_rdfStream;
	char *m_rdfBuffer;
	long  m_rdfBufferSize;
	long  m_currOffset;
	// static rdf buffer
	char  m_rdfSmallBuffer[RDFSMALLBUFFER_SIZE];
	// tag buffer
	// NOTE(review): the name looks like a search-and-replace casualty of
	// "m_tagBuffer"; left as is because the .cpp refers to it
	char  m_tagRecfer[MAX_TAG_LEN+1];
	long  m_tagLen;

	// table of bad categories, see makeBadHashTable()
	HashTable m_badTable;


	// sub category buffer
	//SubCategory m_subCats[MAX_SUB_CATS];
	//long m_numSubCats;
};

// . global Categories instances, defined elsewhere
// . NOTE(review): presumably g_categories points at whichever of
//   g_categories1/g_categories2 is currently in use -- confirm
extern class Categories  g_categories1;
extern class Categories  g_categories2;
extern class Categories *g_categories;

#endif