open-source-search-engine/fctypes.h
Matt 6e7b329cef speed up gb by fixing excessive calling to
gettimeofday() system call.
2015-01-20 16:06:01 -07:00

621 lines
20 KiB
C++

// Matt Wells, copyright Jun 2001
#ifndef _FCTYPES_H_
#define _FCTYPES_H_
#include <sys/time.h> // gettimeofday()
#include <math.h> // floor()
#include "Unicode.h"
// . flag defined outside this header
// . NOTE(review): presumably marks the cached clock value as stale so the
//   next time lookup re-reads the system clock -- confirm in fctypes.cpp
extern bool g_clockNeedsUpdate;
// we have to leave this as 32 bits for now because the termlists store
// the hash value as 32 bits in posdb
typedef uint32_t FacetValHash_t;
// . utf8 validation; the second form takes an explicit byte length "tlen"
// . NOTE(review): the first form presumably expects NUL-terminated text
bool verifyUtf8 ( char *txt ) ;
bool verifyUtf8 ( char *txt , int32_t tlen ) ;
// . debug printers for keys, given either as raw bytes or as a key object
// . NOTE(review): 96/128 is presumably the key width in bits -- confirm
bool print96 ( char *k ) ;
bool print96 ( key_t *kp ) ;
bool print128 ( char *k ) ;
bool print128 ( key128_t *kp ) ;
// print it to stdout for debugging Dates.cpp
int32_t printTime ( int32_t ttt );
// NOTE(review): presumably mktime() that treats "ttt" as UTC -- confirm
time_t mktime_utc ( struct tm *ttt ) ;
// forward declaration; this header only uses SafeBuf through a pointer
// (see the SafeBuf overload of htmlEncode below)
class SafeBuf;
// . seems like this should be defined, but it isn't
// . note the length is an int32_t here, not a size_t
int32_t strnlen ( const char *s , int32_t maxLen );
// . this should be defined too, but isn't
// . "haylen" bounds the haystack; "needle" has no length parameter
char *strncasestr( char *haystack, int32_t haylen, char *needle);
// . same parameter layout as strncasestr() directly above
// . a strnstr() with a different parameter order is declared further below
char *strnstr2( char *haystack, int32_t haylen, char *needle);
// just like sprintf(s,"%"UINT64"",n), but we insert commas
int32_t ulltoa ( char *s , uint64_t n ) ;
// . convert < to &lt; and > to &gt; and & to &amp;
// . store "t" into "s"
// . returns bytes stored into "s"
// . NULL terminates "s"
int32_t saftenTags ( char *s , int32_t slen , char *t , int32_t tlen ) ;
// . basically just converts "'s to &#34;'s
// . store "src" into "dest" and return bytes stored
// . does not do bounds checking in "dest"
//   NOTE(review): a "dend" parameter is accepted, so this remark may be
//   stale -- confirm against the implementation
// . used to encode things as form input variables, like query in HttpPage0.cpp
int32_t dequote ( char *dest , char *dend , char *src , int32_t srcLen ) ;
// . entity-ize a string so it's safe for html output
// . converts "'s to &#34;'s, &'s to &amp;, <'s to &lt; and >'s to &gt;
// . store "src" into "dest" and return bytes stored
// . does not do bounds checking on "dest"
//   NOTE(review): the char* form takes "send", so this remark may be stale
// . encode t into s
// . "pound" and "niceness" default to false and 0
char *htmlEncode ( char *s , char *send , char *t , char *tend ,
bool pound = false , int32_t niceness = 0) ;
// same, but the encoded output goes into a SafeBuf
bool htmlEncode ( SafeBuf* s , char *t , char *tend ,
bool pound = false , int32_t niceness = 0 );
// . returns length of string stored into "dest"
// . decode html entities like &amp; and &gt;
// . NOTE(review): an older remark here said "src is NULL terminated", but
//   htmlDecode() takes an explicit srcLen -- confirm which one holds
// . "doSpecial" used to default to false; callers must now pass it
int32_t htmlDecode ( char *dst, char *src, int32_t srcLen,
bool doSpecial ,//=false);
int32_t niceness);
// NOTE(review): no length is passed, so "src" is presumably NUL-terminated
int32_t cdataDecode ( char *dst , char *src , int32_t niceness ) ;
// . convert " to %22 , & to %26, is that it?
// . urlEncode() stores the encoded, NULL-terminated URL in "dest"
// . requestPath leaves \0 and ? characters intact, for encoding requests
// . "requestPath" defaults to false
int32_t urlEncode ( char *dest , int32_t destLen , char *src , int32_t srcLen ,
bool requestPath = false ) ;
// determine the length of the encoded url, does NOT include NULL
int32_t urlEncodeLen ( char *s , int32_t slen , bool requestPath = false ) ;
// . decode a url -- decode ALL %XX's
// . no destination size is passed, so "dest" must be sized by the caller
int32_t urlDecode ( char *dest , char *t , int32_t tlen ) ;
// . normalize the encoding
// . like urlDecode() but only decodes chars that should not have been encoded
// . also, will encode characters that should have been encoded
int32_t urlNormCode ( char *dest , int32_t destLen , char *src , int32_t srcLen ) ;
// . NOTE: is_digit and is_hex are also #define'd as table-lookup macros
//   further down in this header; after that point the macro form is what
//   callers get, and these two prototypes have no definition in this header
bool is_digit(unsigned char c) ;
// is character, "s", used in textual hexadecimal representation?
bool is_hex ( char s ) ;
bool is_urlchar(char s);
// convert hex digit to value (defined inline below)
int32_t htob ( char s ) ;
// convert a value in [0,15] to a lower-case hex digit (defined inline below)
char btoh ( char s ) ;
// . convert hex ascii string into binary
// . no size is passed for "dst"; the caller must size it
void hexToBin ( char *src , int32_t srcLen , char *dst );
// convert binary number of size srcLen bytes into hex string in "dst"
void binToHex ( unsigned char *src , int32_t srcLen , char *dst );
// . the _a suffix denotes an ascii string
// . NOTE(review): several names declared here (is_lower2_a, is_lower1_a,
//   is_ascii2, to_lower1, to_lower_alnum, to_cap_alnum, to_alnum, to_cap)
//   have no definition in this header; the inline versions further below
//   are named is_lower_as, is_ascii2_a, to_lower1_a, to_lower_alnum_a,
//   to_cap_alnum_a, to_alnum_a and to_cap_a -- confirm the prototypes here
//   are still defined somewhere
bool is_lower2_a (char *s,int32_t len) ;
bool is_lower1_a (char *s) ;
bool is_ascii2 (char *s,int32_t len) ;
bool is_alnum2_a (char *s,int32_t len) ;
bool has_alpha_a (char *s , char *send ) ;
bool has_alpha_utf8(char *s, char *send ) ;
bool is_alpha2_a (char *s,int32_t len) ;
bool is_num (char *s,int32_t len) ;
bool is_upper2_a (char *s,int32_t len) ;
bool is_cap_a (char *s,int32_t len) ;
bool is_cap_utf8 (char *s,int32_t len) ;
// is_vowel_a is also #define'd as a table-lookup macro further below
bool is_vowel_a ( char s );
bool has_vowel_a ( char *s , int32_t slen );
// does it have at least one upper case character in it?
bool has_upper_a (char *s,int32_t len) ;
bool has_binary_a (char *s,int32_t len) ;
// the to_* routines taking "buf" write their output there
void to_lower3_a (char *s,int32_t len, char *buf) ;
void to_lower1 (char *s) ;
int32_t to_lower_alnum (char *s,int32_t len, char *buf) ;
// utf8 lower-casing; all three overloads are defined inline below
int32_t to_lower_utf8 (char *dst , char *src ) ;
int32_t to_lower_utf8 (char *dst , char *dstEnd, char *src ) ;
int32_t to_lower_utf8 (char *dst , char *dstEnd, char *src, char *srcEnd) ;
int32_t to_lower_utf8_32 (char *src ) ;
int32_t to_cap_alnum (char *s,int32_t len, char *buf) ;
int32_t to_alnum (char *s,int32_t len, char *buf) ;
void to_upper3_a (char *s,int32_t len, char *buf) ;
void to_cap (char *s,int32_t len, char *buf) ;
// . approximate # of non-punct words
// . s must be NULL terminated
// . used by LinkInfo.cpp to weight link text based on # of words
int32_t getNumWords ( char *s );
// true if character should be stripped from the end/beginning of the title
// error! make a map of this
bool is_title_junk(char c) ;
// . get the # of words in this string
// . this form takes an explicit length and a "titleVersion"
int32_t getNumWords ( char *s , int32_t len, int32_t titleVersion ) ;
// . length-bounded numeric parsers: "s" need not be NUL-terminated, except
//   for atoll1() which takes no length
// . NOTE(review): error/overflow behavior is not visible from this header
int32_t atol2 ( const char *s, int32_t len ) ;
int64_t atoll1 ( const char *s ) ;
int64_t atoll2 ( const char *s, int32_t len ) ;
double atof2 ( const char *s, int32_t len ) ;
double atod2 ( char *s, int32_t len ) ;
bool atob ( const char *s, int32_t len ) ;
// . like strstr but haystack need not be NULL terminated
// . note the parameter order: (haystack, needle, haystackSize), which
//   differs from the (haystack, haylen, needle) overload declared earlier
char *strncasestr ( char *haystack , char *needle , int32_t haystackSize ) ;
// same, but the needle length is passed too
char *strncasestr ( char *haystack , char *needle ,
int32_t haystackSize, int32_t needleSize ) ;
char *strnstr ( char *haystack , char *needle , int32_t haystackSize ) ;
// independent of case
char *gb_strcasestr ( char *haystack , char *needle );
char *gb_strncasestr ( char *haystack , int32_t haystackSize , char *needle ) ;
// . updates our static var, s_adjustment to keep our clock in sync to hostId #0
// . NOTE(review): this header exports the adjustment as g_adjustment, so the
//   name "s_adjustment" here may be stale
void settimeofdayInMillisecondsGlobal ( int64_t newTime ) ;
// convert global to local time in milliseconds
int64_t globalToLocalTimeMilliseconds ( int64_t global ) ;
// convert local to global time in milliseconds
int64_t localToGlobalTimeMilliseconds ( int64_t local ) ;
// the same thing but in seconds
int32_t globalToLocalTimeSeconds ( int32_t global ) ;
int32_t localToGlobalTimeSeconds ( int32_t local ) ;
// we now default this to local time to avoid jumpiness associated with
// having to sync with host #0. most routines calling this usually are just
// taking deltas.
int64_t gettimeofdayInMillisecondsGlobal() ; // synced with host #0
int64_t gettimeofdayInMillisecondsGlobalNoCore() ; // synced with host #0
int64_t gettimeofdayInMillisecondsSynced() ; // synced with host #0
int64_t gettimeofdayInMillisecondsLocal () ;// this is local now
int64_t gettimeofdayInMilliseconds() ;// this is local now
// microsecond resolution; note the unsigned return type
uint64_t gettimeofdayInMicroseconds(void) ;
// . get time in seconds since epoch
// . use this instead of call to time(NULL) cuz it uses adjustment
time_t getTime (); // this is local now
time_t getTimeLocal ();
time_t getTimeGlobal (); // synced with host #0's system clock
time_t getTimeGlobalNoCore (); // synced with host #0's system clock
time_t getTimeSynced (); // synced with host #0's system clock
// NOTE(review): takes no destination buffer, so it presumably rewrites
// "content" in place and returns the new length -- confirm
int32_t stripHtml( char *content, int32_t contentLen, int32_t version, int32_t strip );
// . lookup tables behind the is_*_a() / to_*_a() macros defined below
// . the macros index them with a value cast to unsigned char
// . defined outside this header
extern const char g_map_is_vowel[];
extern const unsigned char g_map_to_lower[];
extern const unsigned char g_map_to_upper[];
extern const unsigned char g_map_to_ascii[];
extern const char g_map_is_upper[];
extern const char g_map_canBeInTagName[];
extern const char g_map_is_control[];
extern const char g_map_is_binary[];
extern const char g_map_is_wspace[];
extern const char g_map_is_vspace[];
extern const char g_map_is_hspace[];
extern const char g_map_is_lower[];
extern const char g_map_is_ascii[];
extern const char g_map_is_ascii3[];
extern const char g_map_is_iso[];
extern const char g_map_is_punct[];
extern const char g_map_is_alnum[];
extern const char g_map_is_alpha[];
extern const char g_map_is_digit[];
extern const char g_map_is_hex[];
extern const char g_map_is_tagname_char[];
extern const char g_map_is_tag_control_char[];
//extern bool g_clockInSync;
// clock adjustment value; NOTE(review): units and sign convention are not
// visible from this header
extern int64_t g_adjustment;
// replaces the commented-out g_clockInSync flag above
bool isClockInSync();
// persistence of the clock adjustment; the file location is set first
bool setTimeAdjustmentFilename ( char *dir, char *filename ) ;
bool loadTimeAdjustment ( ) ;
bool saveTimeAdjustment ( ) ;
// . table-driven character classification and conversion macros
// . each macro indexes one of the g_map_* lookup tables declared above
// . the argument is cast to unsigned char first so that a negative (signed)
//   char can never produce a negative table index
// . the argument is parenthesized inside the cast so that an expression
//   argument, e.g. is_digit(c+1), is cast as a whole; without the
//   parentheses the cast would bind to "c" only and the sum could index
//   past the end of the table
// . the argument appears exactly once in every expansion
#define is_vowel_a(c) g_map_is_vowel[(unsigned char)(c)]
#define is_lower_a(c) g_map_is_lower[(unsigned char)(c)]
#define to_lower_a(c) g_map_to_lower[(unsigned char)(c)]
#define is_upper_a(c) g_map_is_upper[(unsigned char)(c)]
#define to_upper_a(c) g_map_to_upper[(unsigned char)(c)]
// c is latin1 in this case:
#define to_ascii(c) g_map_to_ascii[(unsigned char)(c)]
#define canBeInTagName(c) g_map_canBeInTagName[(unsigned char)(c)]
#define is_control_a(c) g_map_is_control[(unsigned char)(c)]
#define is_binary_a(c) g_map_is_binary[(unsigned char)(c)]
#define is_wspace_a(c) g_map_is_wspace[(unsigned char)(c)]
#define is_vspace_a(c) g_map_is_vspace[(unsigned char)(c)]
#define is_hspace_a(c) g_map_is_hspace[(unsigned char)(c)]
// is_ascii and is_ascii9 both use the same table
#define is_ascii(c) g_map_is_ascii[(unsigned char)(c)]
#define is_ascii9(c) g_map_is_ascii[(unsigned char)(c)]
#define is_ascii3(c) g_map_is_ascii3[(unsigned char)(c)]
#define is_punct_a(c) g_map_is_punct[(unsigned char)(c)]
#define is_alnum_a(c) g_map_is_alnum[(unsigned char)(c)]
#define is_alpha_a(c) g_map_is_alpha[(unsigned char)(c)]
#define is_digit(c) g_map_is_digit[(unsigned char)(c)]
#define is_hex(c) g_map_is_hex[(unsigned char)(c)]
#define is_tagname_char(c) g_map_is_tagname_char[(unsigned char)(c)]
#define is_tag_control_char(c) g_map_is_tag_control_char[(unsigned char)(c)]
// NOTE(review): g_map_is_matchskip is not declared in this header -- confirm
// it is declared elsewhere before using this macro
#define is_matchskip_a(c) g_map_is_matchskip[(unsigned char)(c)]
// forward declaration: is_cap_utf8() below calls is_upper_utf8() before its
// inline definition appears later in this header
inline bool is_upper_utf8 ( char *s );
// . returns true if any of the first "slen" bytes of "s" is flagged as a
//   vowel by the is_vowel_a() table lookup
// . returns false when slen <= 0
inline bool has_vowel_a ( char *s , int32_t slen ) {
	for ( int32_t i = 0 ; i < slen ; i++ ) {
		if ( is_vowel_a(s[i]) ) return true;
	}
	return false;
}
/*
// is character, "s", used in textual hexadecimal representation?
inline bool is_hex ( char s ) {
if ( is_digit(s)) return true;
if ( s >= 'a' && s <= 'f' ) return true;
if ( s >= 'A' && s <= 'F' ) return true;
return false;
}
*/
// . convert one hex digit ('0'-'9', 'a'-'f', 'A'-'F') to its value 0..15
// . any other character yields 0, so a bad digit can't be told from '0'
inline int32_t htob ( char s ) {
	int32_t val = 0;
	if      ( is_digit(s) )          val = s - '0';
	else if ( s >= 'a' && s <= 'f' ) val = 10 + ( s - 'a' );
	else if ( s >= 'A' && s <= 'F' ) val = 10 + ( s - 'A' );
	return val;
}
// . convert a value in [0,15] to its lower-case hex digit
// . a value >= 16 is a caller bug: we write through NULL on purpose to
//   force a core dump
inline char btoh ( char s ) {
	if ( s >= 16 ) { char *xx=NULL;*xx=0; }
	// 0..9 map onto '0'..'9', 10..15 onto 'a'..'f'
	return ( s < 10 ) ? (char)( '0' + s ) : (char)( 'a' + ( s - 10 ) );
}
// . true only if every one of the first "len" bytes of "s" passes
//   is_lower_a(); vacuously true when len <= 0
// . the trailing "s" in the name avoids a clash with the is_lower_a macro
inline bool is_lower_as(char *s,int32_t len) {
	int32_t i = 0;
	// advance while bytes keep passing the test
	while ( i < len && is_lower_a(s[i]) ) i++;
	// we reached the end only if no byte failed
	return i >= len;
}
// . NUL-terminated variant: true only if every byte before the terminating
//   \0 passes is_lower_a(); true for the empty string
// . the trailing "s" in the name avoids a clash with the is_lower_a macro
inline bool is_lower_as(char *s) {
	for ( char *p = s ; *p ; p++ ) {
		if ( ! is_lower_a(*p) ) return false;
	}
	return true;
}
// true only if every one of the first "len" bytes of "s" passes is_ascii();
// vacuously true when len <= 0
inline bool is_ascii2_a(char *s,int32_t len) {
	int32_t pos = 0;
	while ( pos < len ) {
		if ( ! is_ascii(s[pos]) ) return false;
		pos++;
	}
	return true;
}
// true only if every one of the first "len" bytes of "s" passes
// is_alnum_a(); vacuously true when len <= 0
inline bool is_alnum2_a(char *s,int32_t len) {
	int32_t i = 0;
	while ( i < len && is_alnum_a(s[i]) ) i++;
	// stopping short of "len" means some byte was not alnum
	return i >= len;
}
// true only if every one of the first "len" bytes of "s" passes
// is_alpha_a(); vacuously true when len <= 0
inline bool is_alpha2_a(char *s,int32_t len) {
	// "left" counts the bytes still to check, front to back
	for ( int32_t left = len ; left > 0 ; left-- , s++ ) {
		if ( ! is_alpha_a(*s) ) return false;
	}
	return true;
}
// . true only if every one of the first "len" bytes of "s" passes is_digit()
// . no sign or decimal point is accepted; vacuously true when len <= 0
inline bool is_num(char *s,int32_t len) {
	int32_t k = 0;
	while ( k < len ) {
		if ( ! is_digit(s[k]) ) return false;
		k++;
	}
	return true;
}
// true only if every one of the first "len" bytes of "s" passes
// is_upper_a(); vacuously true when len <= 0
inline bool is_upper2_a (char *s,int32_t len) {
	int32_t n = 0;
	while ( n < len && is_upper_a(s[n]) ) n++;
	// we got through the whole range only if no byte failed
	return n >= len;
}
// . true if "s" is capitalized: the first byte passes is_upper_a() and each
//   of the remaining len-1 bytes passes is_lower_a()
// . an empty range (len <= 0) is not capitalized; we return false without
//   reading s[0], which lies outside the caller's range in that case
inline bool is_cap_a (char *s,int32_t len) {
	if ( len <= 0 ) return false;
	if (!is_upper_a(s[0]))
		return false;
	for (int32_t i=1;i<len;i++)
		if (!is_lower_a(s[i]))
			return false;
	return true;
}
// . true if the utf8 string "s" of "len" bytes is capitalized: the first
//   character is upper case and no later character is
// . later characters only need to be "not upper", so digits and
//   punctuation after the first character are accepted
// . an empty range (len <= 0) is not capitalized
inline bool is_cap_utf8 (char *s,int32_t len) {
	if ( len <= 0 ) return false;
	if ( ! is_upper_utf8 ( s ) ) return false;
	char *send = s + len;
	// step past the first character before scanning: it was just verified
	// to be upper case, so testing it again would reject every input
	s += getUtf8CharSize ( s );
	for ( ; s < send ; s += getUtf8CharSize ( s ) )
		if ( is_upper_utf8 ( s ) ) return false;
	return true;
}
// true if at least one of the first "len" bytes of "s" passes is_upper_a();
// false when len <= 0
inline bool has_upper_a (char *s,int32_t len) {
	int32_t i = 0;
	// skip bytes that are not upper case
	while ( i < len && ! is_upper_a(s[i]) ) i++;
	// stopping before "len" means we hit an upper-case byte
	return i < len;
}
// true if at least one of the first "len" bytes of "s" passes
// is_binary_a(); false when len <= 0
inline bool has_binary_a (char *s,int32_t len) {
	int32_t pos = 0;
	while ( pos < len ) {
		if ( is_binary_a(s[pos]) ) return true;
		pos++;
	}
	return false;
}
// . store the to_lower_a() mapping of the first "len" bytes of "s" into "buf"
// . exactly "len" bytes are written; "buf" is not NUL-terminated
inline void to_lower3_a(char *s,int32_t len, char *buf) {
	int32_t i = 0;
	while ( i < len ) {
		buf[i] = to_lower_a(s[i]);
		i++;
	}
}
// apply the to_lower_a() mapping in place to the NUL-terminated string "s"
inline void to_lower1_a(char *s) {
	for ( char *p = s ; *p ; p++ ) {
		*p = to_lower_a(*p);
	}
}
// . copy only the bytes of "s" that pass is_alnum_a() into "buf", applying
//   to_lower_a() to each one
// . returns the number of bytes stored; "buf" is not NUL-terminated
inline int32_t to_lower_alnum_a(char *s,int32_t len, char *buf) {
	int32_t stored = 0;
	for ( int32_t i = 0 ; i < len ; i++ ) {
		// drop anything that is not alnum
		if ( ! is_alnum_a(s[i]) ) continue;
		buf[stored] = to_lower_a(s[i]);
		stored++;
	}
	return stored;
}
// . copy only the bytes of "s" that pass is_alnum_a() into "buf"; the first
//   byte kept gets to_upper_a(), every later one gets to_lower_a()
// . returns the number of bytes stored; "buf" is not NUL-terminated
// . s[0] is filtered like every other byte, so a leading non-alnum byte no
//   longer leaks into the output
// . nothing is read or written when len <= 0 (returns 0)
inline int32_t to_cap_alnum_a(char *s,int32_t len, char *buf) {
	int32_t j = 0;
	for ( int32_t i = 0 ; i < len ; i++ ) {
		if ( ! is_alnum_a(s[i]) ) continue;
		// j == 0 means this is the first byte we keep: capitalize it
		buf[j] = ( j == 0 ) ? to_upper_a(s[i]) : to_lower_a(s[i]);
		j++;
	}
	return j;
}
// . copy only the bytes of "s" that pass is_alnum_a() into "buf", unchanged
// . returns the number of bytes stored; "buf" is not NUL-terminated
inline int32_t to_alnum_a(char *s,int32_t len, char *buf) {
	int32_t kept = 0;
	int32_t i    = 0;
	while ( i < len ) {
		if ( is_alnum_a(s[i]) ) {
			buf[kept] = s[i];
			kept++;
		}
		i++;
	}
	return kept;
}
// . store the to_upper_a() mapping of the first "len" bytes of "s" into "buf"
// . exactly "len" bytes are written; "buf" is not NUL-terminated
inline void to_upper3_a(char *s,int32_t len, char *buf) {
	int32_t i = 0;
	while ( i < len ) {
		buf[i] = to_upper_a(s[i]);
		i++;
	}
}
// . store a capitalized copy of the first "len" bytes of "s" into "buf":
//   to_upper_a() on the first byte, to_lower_a() on the rest
// . exactly "len" bytes are written; "buf" is not NUL-terminated
// . nothing is read or written when len <= 0, since s[0] and buf[0] lie
//   outside the caller's range in that case
inline void to_cap_a(char *s,int32_t len, char *buf) {
	if ( len <= 0 ) return;
	buf[0]=to_upper_a(s[0]);
	for (int32_t i=1;i<len;i++)
		buf[i]=to_lower_a(s[i]);
}
// . is the utf8 character at "p" a binary character?
// . only single-byte characters can qualify; those are answered by the
//   is_binary_a() table, and any multi-byte character yields false
inline bool is_binary_utf8 ( char *p ) {
	const bool singleByte = ( getUtf8CharSize((uint8_t *)p) == 1 );
	if ( singleByte ) return is_binary_a ( *p );
	return false;
}
inline bool is_lower_utf8 ( char *src ) {
// if in ascii do it quickly
if ( is_ascii3(*src) ) return is_lower_a ( *src );
// convert to a code point
UChar32 x = utf8Decode(src);
// is this codepoint lower?
return ucIsLower ( x );
}
inline bool is_upper_utf8 ( char *src ) {
// if in ascii do it quickly
if ( is_ascii3(*src) ) return is_upper_a ( *src );
// convert to a code point
UChar32 x = utf8Decode(src);
// is this codepoint upper?
return ucIsUpper ( x );
}
inline bool is_alnum_utf8 ( char *src ) {
// if in ascii do it quickly
if ( is_ascii3(*src) ) return is_alnum_a ( *src );
// convert to a code point
UChar32 x = utf8Decode(src);
// is this codepoint lower?
return ucIsAlnum ( x );
}
inline bool is_alnum_utf8 ( unsigned char *src ) {
// if in ascii do it quickly
if ( is_ascii3(*src) ) return is_alnum_a ( *src );
// convert to a code point
UChar32 x = utf8Decode((char *)src);
// is this codepoint lower?
return ucIsAlnum ( x );
}
inline bool is_alpha_utf8 ( char *src ) {
// if in ascii do it quickly
if ( is_ascii3(*src) ) return is_alpha_a ( *src );
// convert to a code point
UChar32 x = utf8Decode(src);
// is this codepoint lower?
return ucIsAlpha ( x );
}
inline bool is_punct_utf8 ( char *src ) {
// if in ascii do it quickly
if ( is_ascii3(*src) ) return is_punct_a ( *src );
// convert to a code point
UChar32 x = utf8Decode(src);
// is this codepoint lower?
if ( ucIsAlnum ( x ) ) return false;
else return true;
}
inline bool is_wspace_utf8 ( uint8_t *src ) {
// if in ascii do it quickly
if ( is_ascii3(*src) ) return is_wspace_a ( *src );
// convert to a code point
UChar32 x = utf8Decode((char *)src);
// is this codepoint a whitespace?
return is_wspace_uc ( x );
}
inline bool is_wspace_utf8 ( char *src ) {
// if in ascii do it quickly
if ( is_ascii3((uint8_t)*src) ) return is_wspace_a ( (uint8_t)*src );
// convert to a code point
UChar32 x = utf8Decode((char *)src);
// is this codepoint a whitespace?
return is_wspace_uc ( x );
}
// . returns bytes stored into "dst" from "src"
// . just do one character, which may be from 1 to 4 bytes
// . TODO: make a native utf8 to_lower to avoid converting to a code point
inline int32_t to_lower_utf8 ( char *dst , char *src ) {
// if in ascii do it quickly
if ( is_ascii3(*src) ) { *dst = to_lower_a ( *src ); return 1; }
// convert to a code point
UChar32 x = utf8Decode(src);
// covert to lower
UChar32 y = ucToLower ( x );
// put it back to utf8. return bytes stored.
return utf8Encode ( y , dst );
}
// store answer in the int32_t and return that!
inline int32_t to_lower_utf8_32 ( char *src ) {
// if in ascii do it quickly
if ( is_ascii3(*src) ) return (int32_t) to_lower_a ( *src );
// convert to a code point
UChar32 x = utf8Decode(src);
// covert to lower
UChar32 y = ucToLower ( x );
// give that back
return y;
}
inline int32_t to_upper_utf8 ( char *dst , char *src ) {
// if in ascii do it quickly
if ( is_ascii3(*src) ) { *dst = to_upper_a ( *src ); return 1; }
// convert to a code point
UChar32 x = utf8Decode(src);
// covert to lower
UChar32 y = ucToUpper ( x );
// put it back to utf8. return bytes stored.
return utf8Encode ( y , dst );
}
// . lower-case the utf8 characters in [src,srcEnd) into "dst"
// . returns the number of bytes written; "dst" is not NUL-terminated
// . "dstEnd" is one past the last writable byte of "dst": we stop before
//   any character that would not fit entirely, so the output is never
//   split in the middle of a character
// . a NULL "dstEnd" means "no limit", which keeps callers that relied on
//   the parameter being ignored working
inline int32_t to_lower_utf8 (char *dst, char *dstEnd, char *src, char *srcEnd ){
	char *dstart = dst;
	for ( ; src < srcEnd ; src += getUtf8CharSize((uint8_t *)src) ) {
		// lower-case into scratch space first: the lowered character
		// may encode to a different number of bytes than the source.
		// 8 bytes is more than a single utf8 sequence needs.
		char tmp[8];
		int32_t n = to_lower_utf8 ( tmp , src );
		// out of room?
		if ( dstEnd && n > dstEnd - dst ) break;
		for ( int32_t k = 0 ; k < n ; k++ ) dst[k] = tmp[k];
		dst += n;
	}
	// return bytes written
	return dst - dstart;
}
// . lower-case the NUL-terminated utf8 string "src" into "dst"
// . returns the number of bytes written; the terminating \0 is NOT copied,
//   so "dst" is not NUL-terminated
// . "dstEnd" is one past the last writable byte of "dst": we stop before
//   any character that would not fit entirely, so the output is never
//   split in the middle of a character
// . a NULL "dstEnd" means "no limit", which keeps callers that relied on
//   the parameter being ignored working
inline int32_t to_lower_utf8 (char *dst, char *dstEnd, char *src ){
	char *dstart = dst;
	for ( ; *src ; src += getUtf8CharSize((uint8_t *)src) ) {
		// lower-case into scratch space first: the lowered character
		// may encode to a different number of bytes than the source.
		// 8 bytes is more than a single utf8 sequence needs.
		char tmp[8];
		int32_t n = to_lower_utf8 ( tmp , src );
		// out of room?
		if ( dstEnd && n > dstEnd - dst ) break;
		for ( int32_t k = 0 ; k < n ; k++ ) dst[k] = tmp[k];
		dst += n;
	}
	// return bytes written
	return dst - dstart;
}
// . takes a millisecond count "ms" and five output pointers
// . NOTE(review): presumably splits "ms" into whole days, hours, minutes,
//   seconds and leftover milliseconds -- inferred from the parameter names,
//   confirm against the implementation
void getCalendarFromMs(int64_t ms,
int32_t* days,
int32_t* hours,
int32_t* minutes,
int32_t* secs,
int32_t* msecs);
//inline
//int32_t u16UrlEncode(char *d, int32_t dlen, char *s, int32_t slen,
// bool requestPath = false){
// char u8Buf[2048];
// int32_t u8Len = utf16ToUtf8(u8Buf, 2048, s, slen);
// return urlEncode(d, dlen, u8Buf, u8Len, requestPath);
//}
// 32-bit checksum over the "bufLen" bytes at "buf"
// NOTE(review): the algorithm is not visible from this header
uint32_t calculateChecksum(char *buf, int32_t bufLen);
// NOTE(review): presumably finds the next number in "input", reports where
// it starts through "numPtr" and returns where to resume -- confirm
char* getNextNum(char* input, char** numPtr);
// . is code point "c" a word character?
// . use ucIsAlnum instead...
// . code points with no bits set above the low 7 are answered by the
//   is_alnum_a() table
// . anything else is looked up in g_ucProps; a code point with no entry
//   there is not a word char
inline bool ucIsWordChar(UChar32 c) {
	const bool sevenBit = ( (c & 0xffffff80) == 0 );
	if ( sevenBit ) return is_alnum_a(c);
	//if (c < 256) return is_alnum(c);
	void *props = g_ucProps.getValue(c);
	if ( ! props ) return false;
	// test the word-char bit of the property value
	return *(UCProps*)props & UC_WORDCHAR;
}
// . don't allow "> in our input boxes
// . reads "inbufLen" bytes from "inbuf"; "outbuf" has room for "outbufSize"
// . NOTE(review): the meaning of the return value is not visible here,
//   presumably the number of bytes stored -- confirm
int32_t cleanInput(char *outbuf, int32_t outbufSize, char *inbuf, int32_t inbufLen);
// . not in math.h?
// . rounds to the nearest integer by taking floor(x + 0.5)
// . exact halves therefore go toward +infinity: 2.5 gives 3, -2.5 gives -2
inline double round(double x) {
	const double shifted = x + 0.5;
	return floor(shifted);
}
// . like strcpy but return the length and always null terminates
// . copies at most "maxDstLen" bytes of the "srcLen" bytes at "src"
// . dst should be of size maxDstLen + 1 (room for the terminating \0)
// . returns the number of bytes copied, not counting the \0
// . a negative "srcLen" or "maxDstLen" is treated as 0: "dst" becomes the
//   empty string instead of the copy length and the terminator index
//   going negative
inline int32_t setstr ( char *dst,
			int32_t maxDstLen,
			char *src,
			int32_t srcLen ) {
	// get the proper length
	int32_t dstLen = srcLen;
	if ( dstLen > maxDstLen ) dstLen = maxDstLen;
	// never let a bogus length reach the copy or the index below
	if ( dstLen < 0 ) dstLen = 0;
	// copy the string; skipped when empty so "src" may be NULL then
	if ( dstLen > 0 ) gbmemcpy ( dst, src, dstLen );
	// NULL terminate
	dst[dstLen] = '\0';
	// return the proper length
	return dstLen;
}
//
// these three functions replace the Msg.cpp/.h class
//
// . all three take a [firstSizeParm,lastSizeParm) range of int32_t sizes
// . actually "lastSizeParm" points to the thing right after the last
//   size parm
// . NOTE(review): presumably returns baseSize plus the sum of those sizes
//   -- confirm against the implementation
int32_t getMsgStoredSize ( int32_t baseSize,
int32_t *firstSizeParm,
int32_t *lastSizeParm ) ;
// . return ptr to the buffer we serialize into
// . return NULL and set g_errno on error
// . the serialized size is reported through "retSize"
// . NOTE(review): "userBuf"/"userBufSize" is presumably an optional
//   caller-supplied buffer used when large enough -- confirm, including who
//   frees the returned buffer otherwise
char *serializeMsg ( int32_t baseSize ,
int32_t *firstSizeParm ,
int32_t *lastSizeParm ,
char **firstStrPtr ,
void *thisPtr ,
int32_t *retSize ,
char *userBuf ,
int32_t userBufSize ,
bool makePtrsRefNewBuf ) ;
// . convert offsets back into ptrs
// . NOTE(review): the meaning of the return value is not visible here
int32_t deserializeMsg ( int32_t baseSize ,
int32_t *firstSizeParm ,
int32_t *lastSizeParm ,
char **firstStrPtr ,
char *stringBuf ) ;
#endif