open-source-search-engine/Categories.h

242 lines
6.2 KiB
C
Raw Normal View History

2013-08-03 00:12:24 +04:00
//
// Gigablast, Copyright March 2005
// Author: Javier Olivares <jolivares@gigablast.com>
//
// Stores Categories in a Hierarchy
// Based on DMOZ
//
#ifndef _CATEGORY_H_
#define _CATEGORY_H_
#include "Mem.h"
#include "HashTable.h"
#define RDFBUFFER_SIZE (1024*1024*100)
#define RDFSMALLBUFFER_SIZE (32*1024)
#define RDFSTRUCTURE_FILE "structure.rdf.u8"
#define RDFCONTENT_FILE "content.rdf.u8"
#define STRUCTURE_OUTPUT_FILE "gbdmoz.structure.dat"
#define CONTENT_OUTPUT_FILE "gbdmoz.content.dat"
#define URL_OUTPUT_FILE "gbdmoz.urls.dat"
#define URLTEXT_OUTPUT_FILE "gbdmoz.urls.txt"
#define MAX_CATID_LEN 63
#define MAX_TAG_LEN 127
#define MAX_URL_CATIDS 64
#define MAX_URLTXT_SIZE 500000
#define MAX_CATIDS 64
#define MAX_CATNAME_LEN 1024
#define HASHTABLE_SIZE (1024*1024)
#define URLHASHTABLE_SIZE (10*1024*1024)
#define MAX_SUB_CATS 1024
#define SUBCAT_LETTERBAR 10
#define SUBCAT_NARROW2 30
#define SUBCAT_SYMBOLIC2 31
#define SUBCAT_NARROW1 50
#define SUBCAT_SYMBOLIC1 51
#define SUBCAT_NARROW 70
#define SUBCAT_SYMBOLIC 71
#define SUBCAT_RELATED 90
#define SUBCAT_ALTLANG 110
struct Category {
long m_catid;
long m_parentid;
//short m_numSymParents;
//long m_symParentsOffset;
long m_nameOffset;
short m_nameLen;
unsigned long m_structureOffset;
unsigned long m_contentOffset;
long m_numUrls;
};
struct CategoryHash {
unsigned long m_hash;
long m_catIndex;
};
struct SubCategory {
2013-10-03 08:34:21 +04:00
//long m_prefixOffset;
2013-08-03 00:12:24 +04:00
long m_prefixLen;
2013-10-03 08:34:21 +04:00
//long m_nameOffset;
2013-08-03 00:12:24 +04:00
long m_nameLen;
char m_type;
2013-10-03 08:34:21 +04:00
long getRecSize () { return sizeof(SubCategory)+m_prefixLen+m_nameLen+2;};
char *getPrefix() { return m_buf; };
char *getName () { return m_buf+m_prefixLen+1;};
char m_buf[0];
2013-08-03 00:12:24 +04:00
};
class Categories {
public:
Categories();
~Categories();
long fileRead ( int fileid, void *buf, size_t count );
void reset();
// load the hierarchy from a file
long loadCategories ( char *filename );
// . this is called by loadCategories() and constructs m_adultTable
// . it will load/save it from/to disk, too
bool makeBadHashTable ( ) ;
bool addUrlsToBadHashTable ( long catid ) ;
// get the index of a cat from its id
// -1 if not found
long getIndexFromId ( long catid );
long getIndexFromPath ( char *str, long strLen );
long getIdFromPath ( char *str, long strLen );
// determine if a category should be printed RTL
bool isIdRTLStart ( long catid );
bool isIndexRTLStart ( long catIndex );
bool isIdRTL ( long catid );
bool isIndexRTL ( long catIndex );
// see if the category is Adult
bool isIdAdultStart ( long catid );
bool isIndexAdultStart ( long catIndex );
bool isIdAdult ( long catid );
bool isIndexAdult ( long catIndex );
// is it in a bad cat, like adult, gambling, online pharmacies
bool isIdBadStart ( long catid );
bool isIndexBadStart ( long catIndex );
bool isIdBad ( long catid );
bool isIndexBad ( long catIndex );
// is this url directly in a dmoz adult category?
bool isInBadCat ( Url *u ) ;
bool isInBadCat ( unsigned long urlHash );
// print info of cats
void printCats ( long start, long end );
// print the path of this category
void printPathFromId ( SafeBuf *sb ,
long catid,
bool raw = false,
bool isRTL = false );
void printPathFromIndex ( SafeBuf *sb ,
long catIndex,
bool raw = false,
bool isRTL = false );
// print the path bread crumb links for this category
void printPathCrumbFromId ( SafeBuf *sb ,
long catid,
bool isRTL = false );
void printPathCrumbFromIndex ( SafeBuf *sb ,
long catid,
bool isRTL = false );
bool printUrlsInTopic ( class SafeBuf *sb , long catid ) ;
2013-08-03 00:12:24 +04:00
// . get the title and summary for a specific url
// and catid
bool getTitleAndSummary ( char *url,
long urlLen,
long catid,
char *title = NULL,
long *titleLen = NULL,
long maxTitleLen = 0,
char *summ = NULL,
long *summLen = NULL,
long maxSummLen = 0,
char *anchor = NULL,
unsigned char *anchorLen = NULL,
long maxAnchorLen = 0 ,
long niceness = 0 ,
bool justAddToTable = false );
// normalize a url string
long fixUrl ( char *url, long urlLen );
2013-10-03 08:34:21 +04:00
// . generate sub categories for a given catid
// . store list of SubCategories into "subCatBuf" return # stored
// . hits disk without using threads... so kinda sucks...
long generateSubCats ( long catid, SafeBuf *subCatBuf );
2013-08-03 00:12:24 +04:00
long getNumUrlsFromIndex ( long catIndex ) {
return m_cats[catIndex].m_numUrls; };
// creates a directory search request url
//void createDirectorySearchUrl ( Url *url,
long createDirSearchRequest ( char *requestBuf,
long requestBufSize,
long catid,
char *hostname,
long hostnameLen,
char *coll,
long collLen,
char *cgi ,//= NULL,
long cgiLen ,//= 0,
bool cgiFromRequest ,//= false ,
class HttpRequest *r );
bool initLangTables(void);
bool loadLangTables(void);
uint8_t findLanguage(char *addr);
// Categories
Category *m_cats;
long m_numCats;
// name buffer
char *m_nameBuffer;
long m_nameBufferSize;
// symbolic parent buffer
//long *m_symParents;
//long m_numSymParents;
// hash buffer
CategoryHash *m_catHash;
// full buffer
char *m_buffer;
long m_bufferSize;
protected:
// for parsing the original dmoz files
char* incRdfPtr ( long skip = 1 );
long rdfParse ( char *tagName );
long rdfNextTag ( );
long fillNextString ( char *str, long max );
long fillNextTagBody ( char *str, long max );
// rdf stream
char *m_rdfPtr;
char *m_rdfEnd;
//std::ifstream m_rdfStream;
int m_rdfStream;
char *m_rdfBuffer;
long m_rdfBufferSize;
long m_currOffset;
// static rdf buffer
char m_rdfSmallBuffer[RDFSMALLBUFFER_SIZE];
// tag buffer
char m_tagRecfer[MAX_TAG_LEN+1];
long m_tagLen;
HashTable m_badTable;
// sub category buffer
//SubCategory m_subCats[MAX_SUB_CATS];
//long m_numSubCats;
};
extern class Categories g_categories1;
extern class Categories g_categories2;
extern class Categories *g_categories;
#endif