open-source-search-engine/fctypes.cpp
2014-11-17 18:13:36 -08:00

2456 lines
64 KiB
C++

#include "gb-include.h"
#include "Loop.h"
#include "Entities.h"
#include "UCWordIterator.h"
#include "SafeBuf.h"
#include "Xml.h"
#include "XmlNode.h"
#include "iana_charset.h"
// file-local flag consulted by isClockInSync()
static bool g_clockInSync = false;

// . returns true if the clock is considered in sync
// . once g_hostdb is initialized, host #0 always reports true
// . every other case falls back to the g_clockInSync flag
bool isClockInSync() {
	bool isHostZero = g_hostdb.m_initialized && g_hostdb.m_hostId == 0;
	if ( isHostZero ) return true;
	return g_clockInSync;
}
bool print96 ( char *k ) {
key_t *kp = (key_t *)k;
printf("n1=0x%"XINT32" n0=0x%"XINT64"\n",(int32_t)kp->n1,(int64_t)kp->n0);
return true;
}
// . prints the n1 and n0 fields of the key_t at "kp" in hex to stdout
// . always returns true
bool print96 ( key_t *kp ) {
	int32_t hi = (int32_t)kp->n1;
	int64_t lo = (int64_t)kp->n0;
	printf("n1=0x%"XINT32" n0=0x%"XINT64"\n",hi,lo);
	return true;
}
bool print128 ( char *k ) {
key128_t *kp = (key128_t *)k;
printf("n1=0x%"XINT64" n0=0x%"XINT64"\n",(int64_t)kp->n1,(int64_t)kp->n0);
return true;
}
// . prints the n1 and n0 fields of the key128_t at "kp" in hex to stdout
// . always returns true
bool print128 ( key128_t *kp ) {
	int64_t hi = (int64_t)kp->n1;
	int64_t lo = (int64_t)kp->n0;
	printf("n1=0x%"XINT64" n0=0x%"XINT64"\n",hi,lo);
	return true;
}
// . put all the maps here now
// . 256-entry table, indexed by byte value, giving the lower case version
// . 'A'-'Z' (65-90) fold to 'a'-'z'
// . 192-222 fold to 224-254, EXCEPT 215 which maps to itself: 215 is not
//   flagged in g_map_is_upper, so it must not be folded onto 247
// . 223 and every other byte map to themselves
const unsigned char g_map_to_lower[] = {
	// 0-63: unchanged
	  0,  1,  2,  3,  4,  5,  6,  7,   8,  9, 10, 11, 12, 13, 14, 15,
	 16, 17, 18, 19, 20, 21, 22, 23,  24, 25, 26, 27, 28, 29, 30, 31,
	 32, 33, 34, 35, 36, 37, 38, 39,  40, 41, 42, 43, 44, 45, 46, 47,
	 48, 49, 50, 51, 52, 53, 54, 55,  56, 57, 58, 59, 60, 61, 62, 63,
	// 64-95: 'A'-'Z' become 'a'-'z'
	 64,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
	'p','q','r','s','t','u','v','w', 'x','y','z', 91, 92, 93, 94, 95,
	// 96-127: already lower case
	 96,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
	'p','q','r','s','t','u','v','w', 'x','y','z',123,124,125,126,127,
	// 128-191: unchanged
	128,129,130,131,132,133,134,135, 136,137,138,139,140,141,142,143,
	144,145,146,147,148,149,150,151, 152,153,154,155,156,157,158,159,
	160,161,162,163,164,165,166,167, 168,169,170,171,172,173,174,175,
	176,177,178,179,180,181,182,183, 184,185,186,187,188,189,190,191,
	// 192-223: shifted up by 32, except 215 and 223 which stay put
	224,225,226,227,228,229,230,231, 232,233,234,235,236,237,238,239,
	240,241,242,243,244,245,246,215, 248,249,250,251,252,253,254,223,
	// 224-255: unchanged
	224,225,226,227,228,229,230,231, 232,233,234,235,236,237,238,239,
	240,241,242,243,244,245,246,247, 248,249,250,251,252,253,254,255
};
// . 256-entry table, indexed by byte value, giving the UPPER case version
// . 'a'-'z' (97-122) fold to 'A'-'Z'
// . 224-254 fold to 192-222, EXCEPT 247 which maps to itself: 247 is not
//   flagged in g_map_is_lower, so it must not be folded onto 215
// . 255 and every other byte map to themselves
const unsigned char g_map_to_upper[] = {
	// 0-63: unchanged
	  0,  1,  2,  3,  4,  5,  6,  7,   8,  9, 10, 11, 12, 13, 14, 15,
	 16, 17, 18, 19, 20, 21, 22, 23,  24, 25, 26, 27, 28, 29, 30, 31,
	 32, 33, 34, 35, 36, 37, 38, 39,  40, 41, 42, 43, 44, 45, 46, 47,
	 48, 49, 50, 51, 52, 53, 54, 55,  56, 57, 58, 59, 60, 61, 62, 63,
	// 64-95: already upper case
	 64,'A','B','C','D','E','F','G', 'H','I','J','K','L','M','N','O',
	'P','Q','R','S','T','U','V','W', 'X','Y','Z', 91, 92, 93, 94, 95,
	// 96-127: 'a'-'z' become 'A'-'Z'
	 96,'A','B','C','D','E','F','G', 'H','I','J','K','L','M','N','O',
	'P','Q','R','S','T','U','V','W', 'X','Y','Z',123,124,125,126,127,
	// 128-223: unchanged
	128,129,130,131,132,133,134,135, 136,137,138,139,140,141,142,143,
	144,145,146,147,148,149,150,151, 152,153,154,155,156,157,158,159,
	160,161,162,163,164,165,166,167, 168,169,170,171,172,173,174,175,
	176,177,178,179,180,181,182,183, 184,185,186,187,188,189,190,191,
	192,193,194,195,196,197,198,199, 200,201,202,203,204,205,206,207,
	208,209,210,211,212,213,214,215, 216,217,218,219,220,221,222,223,
	// 224-255: shifted down by 32, except 247 and 255 which stay put
	192,193,194,195,196,197,198,199, 200,201,202,203,204,205,206,207,
	208,209,210,211,212,213,214,247, 216,217,218,219,220,221,222,255
};
// . 256-entry table, indexed by byte value, giving an ascii stand-in
// . 0-159 map to themselves
// . some bytes in 160-191 are replaced by a look-alike ascii char
// . 192-255 are replaced by an unaccented ascii letter, except that
//   198/230 are written as 'A'/'a' and 222/254 map to themselves
const unsigned char g_map_to_ascii[] = {
	// 0-127: unchanged
	  0,  1,  2,  3,  4,  5,  6,  7,   8,  9, 10, 11, 12, 13, 14, 15,
	 16, 17, 18, 19, 20, 21, 22, 23,  24, 25, 26, 27, 28, 29, 30, 31,
	 32, 33, 34, 35, 36, 37, 38, 39,  40, 41, 42, 43, 44, 45, 46, 47,
	 48, 49, 50, 51, 52, 53, 54, 55,  56, 57, 58, 59, 60, 61, 62, 63,
	 64, 65, 66, 67, 68, 69, 70, 71,  72, 73, 74, 75, 76, 77, 78, 79,
	 80, 81, 82, 83, 84, 85, 86, 87,  88, 89, 90, 91, 92, 93, 94, 95,
	 96, 97, 98, 99,100,101,102,103, 104,105,106,107,108,109,110,111,
	112,113,114,115,116,117,118,119, 120,121,122,123,124,125,126,127,
	// 128-159: unchanged
	128,129,130,131,132,133,134,135, 136,137,138,139,140,141,142,143,
	144,145,146,147,148,149,150,151, 152,153,154,155,156,157,158,159,
	// 160-191: a few symbols get ascii look-alikes
	160,161,162,'#','o','Y','|','S', 168,169,'a',171,172,173,174,175,
	176,177,'2','3',180,'u',182,183, ' ','1','o',187,188,189,190,'?',
	// 192-223: upper case letters (198 -> 'A', 222 unchanged, 223 -> 's')
	'A','A','A','A','A','A','A','C', 'E','E','E','E','I','I','I','I',
	'D','N','O','O','O','O','O','x', 'O','U','U','U','U','Y',222,'s',
	// 224-255: lower case letters (230 -> 'a', 254 unchanged)
	'a','a','a','a','a','a','a','c', 'e','e','e','e','i','i','i','i',
	'd','n','o','o','o','o','o','/', 'o','u','u','u','u','y',254,'y'
};
// . 1 if the byte is flagged as upper case, 0 otherwise
// . flagged: 'A'-'Z' (65-90) and 192-223 excluding 215
// . note that 223 is flagged here as well
const char g_map_is_upper[] = {
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 32
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 48
	0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 64  ('A' = 65)
	1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0, // 80  ('Z' = 90)
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 96
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 112
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 192
	1,1,1,1,1,1,1,0, 1,1,1,1,1,1,1,1, // 208 (215 is off)
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 224
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0  // 240
};
// . can this byte appear in an html (or xml) tag name?
// . flagged: hyphen (45), 'A'-'Z' (65-90) and 'a'-'z' (97-122)
const char g_map_canBeInTagName[] = {
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	0,0,0,0,0,0,0,0, 0,0,0,0,0,1,0,0, // 32  (45 = '-')
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 48
	0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 64
	1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0, // 80
	0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 96
	1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0, // 112
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 192
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 208
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 224
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0  // 240
};
// . 1 if the byte is flagged as a control character, 0 otherwise
// . flagged: 0-31, 127 (DEL) and 128-160
const char g_map_is_control [] = {
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 16
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 32
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 48
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 64
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 80
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 96
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,1, // 112 (127 = DEL)
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 128
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 144
	1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160 (160 itself is on)
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 192
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 208
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 224
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0  // 240
};
// . 1 if the byte is flagged as binary (not text), 0 otherwise
// . people mix windows 1252 into latin-1 so we have to be less restrictive
//   here, hence several bytes in 128-159 are left off
// . flagged: 0-31 except \t (9), \n (10) and \r (13); 127 (DEL);
//   129-144, 151, 154, 155, 158 and 159
const char g_map_is_binary[] = {
	1,1,1,1,1,1,1,1, 1,0,0,1,1,0,1,1, // 0   (9, 10, 13 are off)
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 16
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 32
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 48
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 64
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 80
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 96
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,1, // 112 (127 = DEL)
	0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 128 (128 is off)
	1,0,0,0,0,0,0,1, 0,0,1,1,0,0,1,1, // 144 (145-150, 152, 153, 156, 157 off)
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 192
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 208
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 224
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0  // 240
};
// . 1 if the byte is whitespace: ' ' '\n' '\t' '\r'
// . 160 is deliberately left off, it might be a utf8 byte
const char g_map_is_wspace[] = {
	0,0,0,0,0,0,0,0, 0,1,1,0,0,1,0,0, // 0   (\t=9 \n=10 \r=13)
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 32  (space=32)
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 48
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 64
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 80
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 96
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 112
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 192
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 208
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 224
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0  // 240
};
// . 1 if the byte is vertical whitespace, 0 otherwise
// . only '\n' (10) is flagged
// . bytes 192-223 used to carry the same pattern as g_map_is_upper
//   (a copy/paste leftover), which flagged those bytes as vertical space;
//   they are now off
const char g_map_is_vspace[] = {
	0,0,0,0,0,0,0,0, 0,0,1,0,0,0,0,0, // 0   (\n=10)
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 32
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 48
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 64
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 80
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 96
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 112
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 192
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 208
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 224
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0  // 240
};
// . 1 if the byte is horizontal whitespace, 0 otherwise
// . only ' ' (32) and '\t' (9) are flagged
// . bytes 192-223 used to carry the same pattern as g_map_is_upper
//   (a copy/paste leftover), which flagged those bytes as horizontal space;
//   they are now off
const char g_map_is_hspace[] = {
	0,0,0,0,0,0,0,0, 0,1,0,0,0,0,0,0, // 0   (\t=9)
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 32  (space=32)
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 48
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 64
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 80
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 96
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 112
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 192
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 208
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 224
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0  // 240
};
// . 1 if the byte is an ascii vowel, 0 otherwise
// . flagged: A E I O U and a e i o u
const char g_map_is_vowel[] = {
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 32
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 48
	0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, // 64  (A=65 E=69 I=73 O=79)
	0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, // 80  (U=85)
	0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, // 96  (a=97 e=101 i=105 o=111)
	0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, // 112 (u=117)
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 192
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 208
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 224
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0  // 240
};
// . 1 if the byte is flagged as lower case, 0 otherwise
// . flagged: 'a'-'z' (97-122) and 224-255 excluding 247
const char g_map_is_lower[] = {
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 32
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 48
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 64
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 80
	0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 96  ('a' = 97)
	1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0, // 112 ('z' = 122)
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 192
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 208
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 224
	1,1,1,1,1,1,1,0, 1,1,1,1,1,1,1,1  // 240 (247 is off)
};
// . 1 if the byte is in 32 to 126 inclusive, 0 otherwise
const char g_map_is_ascii[] = {
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 32
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 48
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 64
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 80
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 96
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,0, // 112 (127 is off)
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 192
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 208
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 224
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0  // 240
};
// . 1 for every byte from 0-127, 0 for 128-255
// . used by the inlined *_utf8() functions in fctypes.h
const char g_map_is_ascii3[] = {
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 16
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 32
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 48
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 64
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 80
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 96
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 112
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 192
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 208
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 224
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0  // 240
};
// . 1 if the byte is in 161 to 255 inclusive, 0 otherwise
const char g_map_is_iso[] = {
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 32
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 48
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 64
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 80
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 96
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 112
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 160 (160 itself is off)
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 176
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 192
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 208
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 224
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1  // 240
};
// . 1 if the byte is flagged as punctuation, 0 otherwise
// . flagged: 33-47, 58-64, 91-96, 123-126, 161-191, 215 and 247
const char g_map_is_punct[] = {
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 32  (space is off)
	0,0,0,0,0,0,0,0, 0,0,1,1,1,1,1,1, // 48  (58-63)
	1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 64  (64)
	0,0,0,0,0,0,0,0, 0,0,0,1,1,1,1,1, // 80  (91-95)
	1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 96  (96)
	0,0,0,0,0,0,0,0, 0,0,0,1,1,1,1,0, // 112 (123-126)
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 160 (160 itself is off)
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 176
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 192
	0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,0, // 208 (215)
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 224
	0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,0  // 240 (247)
};
// . 1 if the byte is flagged as alphanumeric, 0 otherwise
// . flagged: 48-57, 65-90, 97-122 and 192-255 (excluding 215 and 247)
const char g_map_is_alnum[] = {
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 32
	1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0, // 48  ('0'-'9')
	0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 64
	1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0, // 80
	0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 96
	1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0, // 112
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 192
	1,1,1,1,1,1,1,0, 1,1,1,1,1,1,1,1, // 208 (215 is off)
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 224
	1,1,1,1,1,1,1,0, 1,1,1,1,1,1,1,1  // 240 (247 is off)
};
// . 1 if the byte is flagged as alphabetic, 0 otherwise
// . flagged: 65-90, 97-122 and 192-255 (excluding 215 and 247)
const char g_map_is_alpha[] = {
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 32
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 48
	0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 64
	1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0, // 80
	0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 96
	1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0, // 112
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 192
	1,1,1,1,1,1,1,0, 1,1,1,1,1,1,1,1, // 208 (215 is off)
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 224
	1,1,1,1,1,1,1,0, 1,1,1,1,1,1,1,1  // 240 (247 is off)
};
// . 1 if the byte is a decimal digit '0'-'9' (48-57), 0 otherwise
const char g_map_is_digit[] = {
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 32
	1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0, // 48  ('0'-'9')
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 64
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 80
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 96
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 112
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 192
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 208
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 224
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0  // 240
};
// . 1 if the byte is a hexadecimal digit, 0 otherwise
// . flagged: '0'-'9' (48-57), 'A'-'F' (65-70) and 'a'-'f' (97-102)
const char g_map_is_hex[] = {
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 32
	1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0, // 48  ('0'-'9')
	0,1,1,1,1,1,1,0, 0,0,0,0,0,0,0,0, // 64  ('A'-'F')
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 80
	0,1,1,1,1,1,1,0, 0,0,0,0,0,0,0,0, // 96  ('a'-'f')
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 112
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 192
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 208
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 224
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0  // 240
};
// . stolen from is_alnum, but turned on - and _ and :
// . flagged: '-' (45), '0'-'9' (48-57), ':' (58), 'A'-'Z' (65-90),
//   '_' (95) and 'a'-'z' (97-122)
// . the : is included for feedburner:origlink
// . 192-255 are all off, we are no longer necessarily latin-1!!
const char g_map_is_tagname_char [] = {
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	0,0,0,0,0,0,0,0, 0,0,0,0,0,1,0,0, // 32  (45 = '-')
	1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0, // 48  ('0'-'9' and ':')
	0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 64
	1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,1, // 80  (95 = '_')
	0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 96
	1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0, // 112
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 192
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 208
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 224
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0  // 240
};
// . 1 if the byte is one of the four tag control chars, 0 otherwise
// . flagged: " (34), ' (39), < (60) and > (62)
const char g_map_is_tag_control_char[] = {
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 16
	0,0,1,0,0,0,0,1, 0,0,0,0,0,0,0,0, // 32  (" and ')
	0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0, // 48  (< and >)
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 64
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 80
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 96
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 112
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 128
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 144
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 160
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 176
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 192
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 208
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 224
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0  // 240
};
// when matching query terms to words/phrases in doc skip over spaces
// or other punct so that "flypaper" in the query matches "fly paper" in the
// doc
/*
const char g_map_is_match_skip[] = { // 48-57
0,0,0,0,0,0,0,0, // 0
0,1,1,0,0,0,0,0, // \t and \n
0,0,0,0,0,0,0,0, // 16
0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,1, // 32 space and '
0,0,0,0,0,1,0,0, // 40 -
0,0,0,0,0,0,0,0, // 48
0,0,0,0,0,0,0,0, // 56
0,0,0,0,0,0,0,0, // 64
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0};
*/
// . seems like this should be defined, but it isn't
// . returns the number of bytes before the first \0 in "s"
// . never looks at more than "maxLen" bytes; returns "maxLen" if no \0
//   was seen in that range (0 if maxLen <= 0)
int32_t strnlen ( const char *s , int32_t maxLen ) {
	int32_t n = 0;
	while ( n < maxLen && s[n] ) n++;
	return n;
}
// . case-insensitive search for the \0-terminated "needle" in the first
//   "haylen" bytes of "haystack" (haystack need not be \0-terminated)
// . returns a ptr to the first match in "haystack", or NULL if none
// . an empty needle never matches and yields NULL
// . every start position is tried. the old single-pass scan reset its
//   match count on a mismatch without retrying the current char, so it
//   missed matches like "ab" in "aab"
char *strncasestr( char *haystack, int32_t haylen, char *needle){
	int32_t needleLen = gbstrlen(needle);
	if ( needleLen <= 0 ) return NULL;
	// last index at which the whole needle still fits
	int32_t lastStart = haylen - needleLen;
	for ( int32_t i = 0 ; i <= lastStart ; i++ ) {
		int32_t j = 0;
		while ( j < needleLen &&
			to_lower_a(haystack[i+j]) == to_lower_a(needle[j]) )
			j++;
		// we've matched the whole string
		if ( j == needleLen ) return haystack + i;
	}
	return NULL;
}
// . case-sensitive search for the \0-terminated "needle" in the first
//   "haylen" bytes of "haystack" (haystack need not be \0-terminated)
// . returns a ptr to the first match in "haystack", or NULL if none
// . an empty needle never matches and yields NULL
// . every start position is tried. the old single-pass scan reset its
//   match count on a mismatch without retrying the current char, so it
//   missed matches like "ab" in "aab"
char *strnstr2( char *haystack, int32_t haylen, char *needle){
	int32_t needleLen = gbstrlen(needle);
	if ( needleLen <= 0 ) return NULL;
	// last index at which the whole needle still fits
	int32_t lastStart = haylen - needleLen;
	for ( int32_t i = 0 ; i <= lastStart ; i++ ) {
		int32_t j = 0;
		while ( j < needleLen && haystack[i+j] == needle[j] ) j++;
		// we've matched the whole string
		if ( j == needleLen ) return haystack + i;
	}
	return NULL;
}
// . get the # of words in the first "len" bytes of "s"
// . a word is a maximal run of bytes that are either is_alnum_a() or
//   an apostrophe
// . "titleVersion" is not used
int32_t getNumWords ( char *s , int32_t len, int32_t titleVersion ) {
	int32_t numWords = 0;
	bool prevWasWordChar = false;
	for ( int32_t i = 0 ; i < len ; i++ ) {
		bool isWordChar = is_alnum_a ( s[i] ) || s[i] == '\'';
		// count each transition from non-word to word
		if ( isWordChar && ! prevWasWordChar ) numWords++;
		prevWasWordChar = isWordChar;
	}
	return numWords;
}
// . writes "n" in decimal into "s" with a comma between every group of
//   three digits, e.g. 1234567 becomes "1,234,567"
// . always \0-terminates "s"
// . returns the # of bytes written, not counting the \0
int32_t ulltoa ( char *s , uint64_t n ) {
	// zero is a special case, it has no non-zero group
	if ( n == 0LL ) { s[0] = '0'; s[1] = '\0'; return 1; }
	// split into groups of three digits, each in [0,999], lowest first
	int32_t groups[10];
	int32_t top = -1;
	for ( int32_t g = 0 ; g < 10 ; g++ ) {
		groups[g] = n % 1000;
		n /= 1000;
		// remember the highest group that is not zero
		if ( groups[g] != 0 ) top = g;
	}
	// emit from the highest group down to the lowest
	char *out = s;
	for ( int32_t g = top ; g >= 0 ; g-- ) {
		// only the highest group is printed without zero padding
		if ( g == top ) out += sprintf ( out , "%"INT32"" , groups[g] );
		else            out += sprintf ( out , "%03"INT32"" , groups[g] );
		// comma after every group but the lowest
		if ( g > 0 ) *out++ = ',';
	}
	*out = '\0';
	return out - s;
}
/*
int32_t atol2 ( const char *s, int32_t len ) {
char tmp[32];
if ( len > 30 ) len = 30;
memcpy ( tmp , s , len );
tmp [ len ] = '\0';
return atol ( s );
}
*/
// . parse a decimal integer out of the first "len" bytes of "s"
// . "s" need not be \0-terminated
// . skips leading whitespace, accepts one optional '-', then reads digits
//   until a non-digit or the end of the buffer
// . returns 0 if the buffer is empty or all whitespace
// . parsing is bounded by the real buffer end. the old loop compared an
//   index against "len" AFTER "s" had been advanced past the whitespace,
//   so it could read that many bytes beyond the buffer
int32_t atol2 ( const char *s, int32_t len ) {
	const char *end = s + len;
	// skip over spaces
	while ( s < end && is_wspace_a ( *s ) ) s++;
	// return 0 if all spaces
	if ( s >= end ) return 0;
	bool negative = false;
	if ( *s == '-' ) { negative = true; s++; }
	int32_t val = 0;
	for ( ; s < end && is_digit ( *s ) ; s++ )
		val = val * 10 + ( *s - '0' );
	if ( negative ) return -val;
	return val;
}
// . thin wrapper: hands the \0-terminated "s" to atoll() and returns
//   the result as an int64_t
int64_t atoll1 ( const char *s ) {
	int64_t val = atoll ( s );
	return val;
}
// . parse a decimal 64-bit integer out of the first "len" bytes of "s"
// . "s" need not be \0-terminated
// . skips leading whitespace, accepts one optional '-', then reads digits
//   until a non-digit or the end of the buffer
// . returns 0 if the buffer is empty or all whitespace
// . parsing is bounded by the real buffer end. the old loop compared an
//   index against "len" AFTER "s" had been advanced past the whitespace,
//   so it could read that many bytes beyond the buffer
int64_t atoll2 ( const char *s, int32_t len ) {
	const char *end = s + len;
	// skip over spaces
	while ( s < end && is_wspace_a ( *s ) ) s++;
	// return 0 if all spaces
	if ( s >= end ) return 0;
	bool negative = false;
	if ( *s == '-' ) { negative = true; s++; }
	int64_t val = 0LL;
	for ( ; s < end && is_digit ( *s ) ; s++ )
		val = val * 10LL + ( *s - '0' );
	if ( negative ) return -val;
	return val;
}
// . parse a floating point number out of the first "len" bytes of "s"
// . "s" need not be \0-terminated
// . leading whitespace is skipped and commas are dropped, so "1,234.5"
//   parses as 1234.5
// . at most 127 bytes after the whitespace are considered
// . returns 0 if the buffer is all whitespace
double atof2 ( const char *s, int32_t len ) {
	const char *end = s + len;
	// step past leading whitespace, shrinking "len" to match
	for ( ; s < end && is_wspace_a ( *s ) ; s++ ) len--;
	// nothing but whitespace
	if ( s == end ) return 0;
	// copy into a \0-terminated scratch buffer, minus the commas
	char tmpBuf[128];
	if ( len >= 128 ) len = 127;
	int32_t n = 0;
	for ( int32_t i = 0 ; i < len ; i++ ) {
		if ( s[i] == ',' ) continue;
		tmpBuf[n++] = s[i];
	}
	tmpBuf[n] = '\0';
	return atof ( tmpBuf );
}
// . parse a double out of the first "len" bytes of "s" using strtod()
// . temporarily overwrites s[len] with a \0 and restores it afterwards,
//   so s[len] must be a writable byte
double atod2 ( char *s, int32_t len ) {
	// stash the byte just past the number and terminate there
	char saved = s[len];
	s[len] = '\0';
	double val = strtod ( s , NULL );
	// put the original byte back
	s[len] = saved;
	return val;
}
// . parse an ascii boolean out of the first "len" bytes of "s"
// . skips leading whitespace, then looks at the first remaining byte:
//   't', 'T', 'y' or 'Y' is true, a digit other than '0' is true,
//   anything else is false
// . returns false if the buffer is empty or all whitespace
// . the upper case checks used to test s[1]=='T' and 'y' twice, so
//   "True" and "Yes" came back false
bool atob ( const char *s, int32_t len ) {
	// skip over spaces
	const char *end = s + len;
	while ( s < end && is_wspace_a ( *s ) ) s++;
	// return false if all spaces
	if ( s >= end ) return false;
	// parse the ascii bool value
	char c = *s;
	if ( c == 't' || c == 'T' ) return true;
	if ( c == 'y' || c == 'Y' ) return true;
	if ( ! is_digit ( c ) || c == '0' ) return false;
	return true;
}
// . hexadecimal ascii to a 64-bit integer
// . parses the first "len" bytes of "s", which need not be \0-terminated
// . skips leading whitespace, then reads hex digits until a non-hex byte
//   or the end of the buffer
// . returns 0 if the buffer is empty or all whitespace
// . parsing is bounded by the real buffer end. the old loop compared an
//   index against "len" AFTER "s" had been advanced past the whitespace,
//   so it could read that many bytes beyond the buffer
int64_t htoint32_tint32_t ( const char *s, int32_t len ) {
	const char *end = s + len;
	// skip over spaces
	while ( s < end && is_wspace_a ( *s ) ) s++;
	// return 0 if all spaces
	if ( s >= end ) return 0;
	int64_t val = 0;
	for ( ; s < end && is_hex ( *s ) ; s++ )
		val = val * 16 + htob ( *s );
	return val;
}
// . convert the hex ascii string "src" of "srcLen" bytes into binary at "dst"
// . every two hex chars become one output byte, high nibble first
// . "dst" is not \0-terminated
// . deliberately crashes if the loop does not stop exactly on the end of
//   "src" (e.g. an odd srcLen)
void hexToBin ( char *src , int32_t srcLen , char *dst ) {
	char *srcEnd = src + srcLen;
	while ( src && src < srcEnd ) {
		char hi = htob(*src++);
		char lo = htob(*src++);
		*dst++ = ( hi << 4 ) | lo;
	}
	// sanity check
	if ( src != srcEnd ) { char *xx=NULL;*xx=0; }
}
// . convert "srcLen" binary bytes at "src" into hex ascii at "dst"
// . every input byte becomes two chars, high nibble first
// . "dst" is always \0-terminated, so it needs 2*srcLen+1 bytes
// . deliberately crashes if the loop does not stop exactly on the end of "src"
void binToHex ( unsigned char *src , int32_t srcLen , char *dst ) {
	unsigned char *srcEnd = src + srcLen;
	for ( ; src && src < srcEnd ; src++ ) {
		unsigned char byte = *src;
		*dst++ = btoh(byte>>4);
		*dst++ = btoh(byte&15);
	}
	// always null term!
	*dst = '\0';
	// sanity check
	if ( src != srcEnd ) { char *xx=NULL;*xx=0; }
}
// . case-insensitive strstr for a haystack of "haystackSize" bytes that
//   may not be NULL terminated
// . needle, however, IS null terminated
// . returns a ptr to the first match in haystack, or NULL if none
char *strncasestr ( char *haystack , char *needle , int32_t haystackSize ) {
	int32_t needleSize = gbstrlen(needle);
	// last index at which the whole needle still fits
	int32_t lastStart = haystackSize - needleSize ;
	for ( int32_t pos = 0 ; pos <= lastStart ; pos++ ) {
		char *cand = &haystack[pos];
		// cheap reject on the first char
		if ( to_lower_a(*cand) != to_lower_a(needle[0]) )
			continue;
		// a 1-char needle is already matched, else compare it all
		if ( ! needle[1] ||
		     strncasecmp ( cand , needle , needleSize ) == 0 )
			return cand;
	}
	return NULL;
}
// . case-insensitive strstr for a haystack of "haystackSize" bytes that
//   may not be NULL terminated
// . the caller supplies the needle length as "needleSize"
// . needle[1] is still read to detect a 1-char needle
// . returns a ptr to the first match in haystack, or NULL if none
char *strncasestr ( char *haystack , char *needle , 
		    int32_t haystackSize, int32_t needleSize ) {
	// last index at which the whole needle still fits
	int32_t lastStart = haystackSize - needleSize ;
	for ( int32_t pos = 0 ; pos <= lastStart ; pos++ ) {
		char *cand = &haystack[pos];
		// cheap reject on the first char
		if ( to_lower_a(*cand) != to_lower_a(needle[0]) )
			continue;
		// a 1-char needle is already matched, else compare it all
		if ( ! needle[1] ||
		     strncasecmp ( cand , needle , needleSize ) == 0 )
			return cand;
	}
	return NULL;
}
// . case-sensitive strstr for a haystack of "haystackSize" bytes that
//   may not be NULL terminated
// . needle IS null terminated
// . returns a ptr to the first match in haystack, or NULL if none
char *strnstr ( char *haystack , char *needle , int32_t haystackSize ) {
	int32_t needleSize = gbstrlen(needle);
	// last index at which the whole needle still fits
	int32_t lastStart = haystackSize - needleSize ;
	for ( int32_t pos = 0 ; pos <= lastStart ; pos++ ) {
		char *cand = &haystack[pos];
		// cheap reject on the first char
		if ( *cand != needle[0] ) continue;
		// a 1-char needle is already matched, else compare it all
		if ( ! needle[1] ||
		     strncmp ( cand , needle , needleSize ) == 0 )
			return cand;
	}
	return NULL;
}
// . strstr that is independent of case
// . both haystack and needle are NULL terminated
// . returns a ptr to the first match in haystack, or NULL if none
char *gb_strcasestr ( char *haystack , char *needle ) {
	int32_t needleSize   = gbstrlen(needle);
	int32_t haystackSize = gbstrlen(haystack);
	// last index at which the whole needle still fits
	int32_t lastStart = haystackSize - needleSize ;
	for ( int32_t pos = 0 ; pos <= lastStart ; pos++ ) {
		char *cand = &haystack[pos];
		// cheap reject on the first char
		if ( to_lower_a(*cand) != to_lower_a(needle[0]) )
			continue;
		// a 1-char needle is already matched, else compare it all
		if ( ! needle[1] ||
		     strncasecmp ( cand , needle , needleSize ) == 0 )
			return cand;
	}
	return NULL;
}
// . case-insensitive search of the first "haystackSize" bytes of haystack
// . temporarily writes a \0 at haystack[haystackSize], runs
//   gb_strcasestr(), then restores the byte, so that byte must be writable
// . returns whatever gb_strcasestr() returned
char *gb_strncasestr ( char *haystack , int32_t haystackSize , char *needle ) {
	char *term  = haystack + haystackSize;
	// temp term
	char  saved = *term;
	*term = '\0';
	char *match = gb_strcasestr ( haystack , needle );
	// undo it
	*term = saved;
	return match;
}
// . copy "t" (tlen bytes) into "s", converting < to &lt; and > to &gt;
// . "slen" is the size of the buffer at "s"
// . stops early once fewer than 6 bytes of room remain in "s", so the
//   output may be truncated
// . NULL terminates "s" if slen > 0
// . returns # of bytes stored into "s", excluding the \0
int32_t saftenTags ( char *s , int32_t slen , char *t , int32_t tlen ) {
	// bail if slen is 0
	if ( slen <= 0 ) return 0;
	char *out = s;
	// leave a char for the \0
	char *outLimit = s + slen - 1;
	for ( int32_t i = 0 ; i < tlen && out + 4 < outLimit ; i++ ) {
		char c = t[i];
		// pick the entity for this char, if it needs one
		const char *rep = NULL;
		if      ( c == '<' ) rep = "&lt;";
		else if ( c == '>' ) rep = "&gt;";
		// ordinary chars are copied straight through
		if ( ! rep ) { *out++ = c; continue; }
		while ( *rep ) *out++ = *rep++;
	}
	// NULL terminate "s"
	*out = '\0';
	return out - s;
}
// . decode the html entities in "src" (srcLen bytes) into "dst"
// . "dst" may equal "src": a decoded entity never takes more bytes than
//   its encoded form (this is sanity checked below)
// . bytes that are not part of an entity are copied through, one utf8
//   char at a time
// . an '&' that does not start a valid entity is copied as-is
// . if "doSpecial" is true then a decoded '<' or '>' is stored as '|' and
//   a decoded '"' is stored as '\'', so that decoded text can not open,
//   close or prematurely terminate a tag
// . NULL terminates "dst" and returns the # of bytes stored, excluding
//   the \0
// . returns 0 WITHOUT touching "dst" if srcLen is 0
int32_t htmlDecode ( char *dst , char *src , int32_t srcLen , bool doSpecial ,
		     int32_t niceness ) {
	if ( srcLen == 0 ) return 0;
	char *start  = dst;
	char *srcEnd = src + srcLen;
	for ( ; src < srcEnd ; ) {
		// breathe
		QUICKPOLL(niceness);
		// utf8 support?
		char size = getUtf8CharSize(src);
		// all entities must start with '&'
		if ( *src != '&' ) {
			if ( size == 1 ) { *dst++ = *src++; continue; }
			// do not read past the end of "src" if the last
			// char is a truncated multi-byte sequence
			if ( size > srcEnd - src ) size = (char)(srcEnd - src);
			memcpy ( dst , src , size );
			src += size;
			dst += size;
			continue;
		}
		// the decoded entity char goes here
		uint32_t c;
		// "skip" is how many bytes the entity was in "src"
		int32_t skip = getEntity_a (src, srcEnd-src, &c );
		// ignore the "entity" if it was invalid
		if ( skip == 0 ) { *dst++ = *src++ ; continue; }
		// . special mapping
		// . make &lt; and &gt; special so Xml::set() still works
		if ( doSpecial ) {
			// using [ and ] looks bad in event titles...
			if ( c == '<' || c == '>' ) {
				*dst++ = '|';
				src += skip;
				continue;
			}
			// some tags have &quot; in their value strings
			// and it was causing the tags to be terminated too
			// early, so store a single quote instead
			if ( c == '\"' ) {
				*dst++ = '\'';
				src += skip;
				continue;
			}
		}
		// . otherwise it was a legit entity
		// . store it into "dst" in utf8 format
		// . "numBytes" is how many bytes it stored into "dst"
		int32_t numBytes = utf8Encode ( c , dst );
		// sanity check. do not eat our tail if dst == src
		if ( numBytes > skip ) { char *xx=NULL;*xx=0; }
		// advance dst ptr
		dst += numBytes;
		// skip over the encoded entity in the source string
		src += skip;
	}
	// NULL term
	*dst = '\0';
	return dst - start;
}
// . cdata decode: copy the NULL terminated "src" into "dst", turning each
//   5-byte sequence "]]&gt" back into "]]>"
// . see SafeBuf::cdataEncode() we do the opposite here
// . everything else is copied through one utf8 char at a time
// . NULL terminates "dst" and returns the # of bytes stored, excluding
//   the \0
// . returns 0 without touching "dst" if "src" is NULL
int32_t cdataDecode ( char *dst , char *src , int32_t niceness ) {
	if ( ! src ) return 0;
	char *out = dst;
	while ( *src ) {
		// breathe
		QUICKPOLL(niceness);
		// utf8 support?
		char size = getUtf8CharSize(src);
		// the && chain stops at the first mismatch, so it never
		// looks past the terminating \0
		bool isEncodedEnd = src[0] == ']' &&
				    src[1] == ']' &&
				    src[2] == '&' &&
				    src[3] == 'g' &&
				    src[4] == 't' ;
		if ( isEncodedEnd ) {
			// make it ]]>
			memcpy ( out , "]]>" , 3 );
			src += 5;
			out += 3;
			continue;
		}
		// plain char, single or multi byte
		if ( size == 1 ) { *out++ = *src++; continue; }
		memcpy ( out , src , size );
		src += size;
		out += size;
	}
	// NULL term
	*out = '\0';
	return out - dst;
}
// . make something safe as a form input value by translating the quotes
// . copies "t" (tlen bytes) into "s", writing each " as &#34;
// . "send" marks the end of the buffer at "s"
// . all or nothing: returns 0 if the output plus its \0 does not fit
//   (in which case "s" may hold partial, unterminated output)
// . on success NULL terminates "s" and returns the # of bytes stored,
//   excluding the \0
int32_t dequote ( char *s , char *send , char *t , int32_t tlen ) {
	char *out = s;
	for ( int32_t i = 0 ; i < tlen && out < send ; i++ ) {
		// everything but a double quote is copied as-is
		if ( t[i] != '"' ) { *out++ = t[i]; continue; }
		// need room for the 5 byte entity
		if ( out + 5 >= send ) return 0;
		const char *rep = "&#34;";
		while ( *rep ) *out++ = *rep++;
	}
	// all or nothing
	if ( out + 1 >= send ) return 0;
	*out = '\0';
	return out - s;
}
// . SafeBuf flavor of dequote(): append "t/tlen" to "sb" with every '"'
//   written as "&#34;"
// . a '\0' is appended to "sb" afterwards
// . always returns true; results of the SafeBuf appends are not checked
bool dequote ( SafeBuf* sb , char *t , int32_t tlen ) {
	for ( int32_t i = 0 ; i < tlen ; i++ ) {
		char c = t[i];
		// ordinary chars go straight through
		if ( c != '"' ) {
			*sb += c;
			continue;
		}
		// a double quote becomes its numeric entity
		sb->safeMemcpy("&#34;", 5);
	}
	*sb += '\0';
	return true;
}
//int32_t dequote ( char *s , char *t ) {
// return dequote ( s , t , gbstrlen ( t ) );
//}
// . entity-ize a string so it's safe for html output
// . copies "t..tend" into "s", encoding  "  <  >  &  (and '#' when "pound"
//   is true) as  &#34;  &lt;  &gt;  &amp;  &#035;
// . does bounds checking: encoding stops, silently truncating, as soon as
//   fewer than 8 bytes remain before "send"
// . returns a ptr to the terminating \0 it wrote into "s" (NOT a byte count)
// . if "s" has no room at all (s >= send) nothing is written, not even the
//   \0, and "s" is returned. before this guard a \0 was stored out of bounds.
char *htmlEncode ( char *s , char *send , char *t , char *tend , bool pound ,
		   int32_t niceness ) {
	// zero capacity: we cannot even terminate
	if ( s >= send ) return s;
	for ( ; t < tend ; t++ ) {
		QUICKPOLL(niceness);
		// longest entity is 6 bytes, plus the \0
		if ( s + 7 >= send ) break;
		// pick the entity for this char, if it needs one
		const char *entity = NULL;
		switch ( *t ) {
		case '"': entity = "&#34;" ; break;
		case '<': entity = "&lt;"  ; break;
		case '>': entity = "&gt;"  ; break;
		case '&': entity = "&amp;" ; break;
		case '#': if ( pound ) entity = "&#035;"; break;
		default : break;
		}
		if ( ! entity ) {
			*s++ = *t;
			continue;
		}
		while ( *entity ) *s++ = *entity++;
	}
	// s < send is guaranteed here: we started below "send" and every
	// iteration that stored bytes had at least 8 free
	*s = '\0';
	return s;
}
// . entity-ize a string so it's safe for html output
// . appends "t..tend" to SafeBuf "s", encoding  "  <  >  &  (and '#' when
//   "pound" is true) as  &#34;  &lt;  &gt;  &amp;  &#035;
// . the '+' of a "+!-" sequence is written as "&lt;" and the '-' of a "-!+"
//   sequence as "&gt;" (our own specially decoded entities); the remaining
//   two chars of the sequence are then processed normally
// . a '\0' is appended to "s" at the end
// . always returns true; results of the SafeBuf appends are not checked
bool htmlEncode ( SafeBuf* s , char *t , char *tend , bool pound ,
		  int32_t niceness ) {
	for ( ; t < tend ; t++ ) {
		QUICKPOLL(niceness);
		if ( *t == '"' ) {
			s->safeMemcpy("&#34;", 5);
			continue;
		}
		if ( *t == '<' ) {
			s->safeMemcpy("&lt;", 4);
			continue;
		}
		if ( *t == '>' ) {
			s->safeMemcpy("&gt;", 4);
			continue;
		}
		if ( *t == '&' ) {
			s->safeMemcpy("&amp;", 5);
			continue;
		}
		if ( *t == '#' && pound ) {
			s->safeMemcpy("&#035;", 6);
			continue;
		}
		// the 3-char sequences need two chars of lookahead. only
		// peek when both are inside [t,tend); the input need not be
		// NUL-terminated so t[1]/t[2] could otherwise be out of bounds
		bool canPeek = ( tend - t > 2 );
		// our own specially decoded entites!
		if ( canPeek && *t == '+' && t[1]=='!' && t[2]=='-' ) {
			s->safeMemcpy("&lt;",4);
			continue;
		}
		// our own specially decoded entites!
		if ( canPeek && *t == '-' && t[1]=='!' && t[2]=='+' ) {
			s->safeMemcpy("&gt;",4);
			continue;
		}
		*s += *t;
	}
	*s += '\0';
	return true;
}
// . convert "-->%22 , &-->%26, +-->%2B, space-->+, ?-->%3F etc.
// . convert so we can display as a cgi PARAMETER within a url
// . used by HttPage2 (cached web page) to encode the query into a url
// . used by PageRoot to do likewise
// . encoded: every non-ascii byte plus  space & " + % # > <  and, unless
//   "requestPath" is true, '?'. with "requestPath" a \0 byte in the input
//   is copied through as is.
// . "dlen" is the size of "d" INCLUDING room for the terminating \0
// . output is silently truncated if "d" is too small, but a buffer of
//   urlEncodeLen()+1 bytes always holds the complete result (a trailing
//   space used to be dropped in that case)
// . returns bytes written into "d" not including terminating \0
// . if dlen <= 0 nothing is written at all and 0 is returned
int32_t urlEncode ( char *d , int32_t dlen , char *s , int32_t slen, bool requestPath ) {
	// no room for even the \0? then "d" must not be touched
	if ( dlen <= 0 ) return 0;
	static const char s_hexDigits[] = "0123456789ABCDEF";
	char *dstart = d;
	// subtract 1 to make room for a terminating \0
	char *dend = d + dlen - 1;
	char *send = s + slen;
	for ( ; s < send && d < dend ; s++ ) {
		// decide if this byte is fit for display as is
		bool encode = false;
		if ( *s == '\0' && requestPath ) encode = false;
		else if ( ! is_ascii ( *s ) )    encode = true;
		else {
			switch ( *s ) {
			case ' ':
			case '&':
			case '"':
			case '+':
			case '%':
			case '#':
			// encoding < and > are more for displaying on an
			// html page than sending to an http server
			case '>':
			case '<':
				encode = true;
				break;
			case '?':
				encode = ! requestPath;
				break;
			default:
				break;
			}
		}
		// no need to encode
		if ( ! encode ) {
			*d++ = *s;
			continue;
		}
		// space to +. the loop condition already guarantees one
		// free byte, so this always fits
		if ( *s == ' ' ) {
			*d++ = '+';
			continue;
		}
		// break out if no room for the 3 bytes of %XX
		if ( d + 2 >= dend ) break;
		unsigned char c = (unsigned char)*s;
		*d++ = '%';
		*d++ = s_hexDigits [ c >> 4   ];
		*d++ = s_hexDigits [ c & 0x0f ];
	}
	// NULL terminate it
	*d = '\0';
	// and return the length
	return d - dstart;
}
// . how many bytes urlEncode() needs for "s/slen", NOT counting the \0
// . a byte costs 1 if it is copied as is or is a space (written as '+'),
//   and 3 if it has to be written as %XX
int32_t urlEncodeLen ( char *s , int32_t slen , bool requestPath ) {
	int32_t total = 0;
	for ( int32_t i = 0 ; i < slen ; i++ ) {
		char c = s[i];
		// does urlEncode() treat this byte specially?
		bool special = false;
		if ( c == '\0' && requestPath ) {
			// copied through untouched
			special = false;
		}
		else if ( ! is_ascii ( c ) ) {
			// not fit for display
			special = true;
		}
		else if ( c == '?' ) {
			// question marks survive in a request path
			special = ! requestPath;
		}
		else {
			// < and > are more for displaying on an html page
			// than for sending to an http server
			special = ( c == ' ' || c == '&' || c == '"' ||
				    c == '+' || c == '%' || c == '#' ||
				    c == '>' || c == '<' );
		}
		if ( ! special  ) total += 1; // as is
		else if ( c == ' ' ) total += 1; // '+'
		else                 total += 3; // %XX
	}
	// and return the length
	return total;
}
// . url-decode "s/slen" into "dest": '+' becomes a space and "%XX" (two
//   hex digits) becomes the byte it encodes
// . a '%' not followed by two hex digits inside "s/slen" is copied as is
// . "dest" is neither bounds checked nor NUL-terminated; the output is
//   never longer than "slen"
// . returns the number of bytes stored into "dest"
int32_t urlDecode ( char *dest , char *s , int32_t slen ) {
	char *out = dest;
	char *end = s + slen;
	for ( char *p = s ; p < end ; p++ ) {
		if ( *p == '+' ) {
			*out++ = ' ';
			continue;
		}
		// store it as is for now, we patch it below if it turns
		// out to start an encoding
		*out++ = *p;
		if ( *p != '%' ) continue;
		// both hex digits must be inside the input
		if ( p + 2 >= end ) continue;
		// if two chars after are not hex chars, it's not an encoding
		if ( ! is_hex ( p[1] ) ) continue;
		if ( ! is_hex ( p[2] ) ) continue;
		// convert hex chars to values
		unsigned char hi = htob ( p[1] ) * 16;
		unsigned char lo = htob ( p[2] ) ;
		// overwrite the '%' we just stored
		out[-1] = (char) (hi + lo);
		// the loop's p++ skips the '%', this skips the digits
		p += 2;
	}
	return out - dest;
}
// . normalize the %-encoding of "s/slen" into "d" (size "dlen" incl. \0)
// . non-ascii bytes are written as %XX
// . a %XX in the input is decoded, UNLESS it decodes to one of
//   space & " + % > < ? =   in which case it is copied through encoded
// . output is silently truncated if "d" is too small
// . NUL-terminates "d" and returns the bytes stored, excluding the \0
int32_t urlNormCode ( char *d , int32_t dlen , char *s , int32_t slen ) {
	char *out    = d;
	// subtract 1 for NULL termination
	char *outEnd = d + dlen - 1;
	char *inEnd  = s + slen;
	while ( s < inEnd && out < outEnd ) {
		// if its non-ascii, encode it so it displays correctly
		if ( ! is_ascii ( *s ) ) {
			// break if no room for the 3 bytes of %XX
			if ( out + 2 >= outEnd ) break;
			unsigned char hi = ((unsigned char)*s) / 16;
			unsigned char lo = ((unsigned char)*s) & 0x0f;
			if ( hi < 10 ) hi += '0';
			else           hi += 'A' - 10;
			if ( lo < 10 ) lo += '0';
			else           lo += 'A' - 10;
			*out++ = '%';
			*out++ = hi;
			*out++ = lo;
			s++;
			continue;
		}
		// store it
		*out++ = *s;
		// but it might be something encoded that should not have
		// been: a '%' with two hex digits inside the input
		bool isEncoding = ( *s == '%'        &&
				    s + 2 < inEnd    &&
				    is_hex ( s[1] )  &&
				    is_hex ( s[2] )  );
		if ( ! isEncoding ) {
			s++;
			continue;
		}
		// convert hex chars to values
		unsigned char a = htob ( s[1] ) * 16;
		unsigned char b = htob ( s[2] ) ;
		unsigned char v = a + b;
		// these have to stay encoded
		bool keepEncoded = ( v == ' ' || v == '&' || v == '"' ||
				     v == '+' || v == '%' || v == '>' ||
				     v == '<' || v == '?' || v == '=' );
		if ( keepEncoded ) {
			// the two hex digits get copied by the next rounds
			s++;
			continue;
		}
		// otherwise, it's fine to decode it over the stored '%'
		out[-1] = (char) (a + b);
		// skip the '%' and both hex digits
		s += 3;
	}
	// NULL terminate
	*out = '\0';
	// return length
	return out - d;
}
// . approximate # of non-punct words in the NUL-terminated string "s"
// . a word is a run of is_alnum_a() chars; an apostrophe followed by
//   exactly one run of such chars (like the 's in "bob's") is counted as
//   part of the preceding word
// . returns 0 for a NULL or empty string
// . the punct-skipping loop used to step right over the terminating \0
//   (which is not alnum either) and run off the end of the string
int32_t getNumWords ( char *s ) {
	int32_t count = 0;
	if ( ! s ) return count;
	for ( ; ; ) {
		// skip punct, but stop at the end of the string
		while ( *s && ! is_alnum_a(*s) ) s++;
		// bail if done
		if ( ! *s ) return count;
		// count a word
		count++;
		// skip word (relies on \0 not being alnum, as it always did)
		while ( is_alnum_a(*s) ) s++;
		// watch for ' letter punct
		if ( *s=='\'' && is_alnum_a(*(s+1)) && !is_alnum_a(*(s+2)) ) {
			// skip apostrophe
			s++;
			// skip rest of word
			while ( is_alnum_a(*s) ) s++;
		}
	}
}
// what has to be added to this machine's clock, in milliseconds, to get
// "global" time. global = local + s_adjustment.
static int64_t s_adjustment = 0;

// global milliseconds --> local milliseconds
int64_t globalToLocalTimeMilliseconds ( int64_t global ) {
	int64_t local = global;
	local -= s_adjustment;
	return local;
}

// local milliseconds --> global milliseconds
int64_t localToGlobalTimeMilliseconds ( int64_t local ) {
	int64_t global = local;
	global += s_adjustment;
	return global;
}

// . global seconds --> local seconds
// . the adjustment is cut down to whole seconds first
int32_t globalToLocalTimeSeconds ( int32_t global ) {
	int64_t adjustSecs = s_adjustment / 1000;
	int64_t local      = (int64_t)global - adjustSecs;
	return (int32_t)local;
}

// . local seconds --> global seconds
// . the adjustment is cut down to whole seconds first
int32_t localToGlobalTimeSeconds ( int32_t local ) {
	int64_t adjustSecs = s_adjustment / 1000;
	int64_t global     = (int64_t)local + adjustSecs;
	return (int32_t)global;
}
#include "Timedb.h"
// full path of the file the clock adjustment is saved to and loaded from
static char s_tafile[1024];
// has setTimeAdjustmentFilename() been called yet?
static bool s_hasFileName = false;

// . remember "dir/filename" as the clock adjustment file
// . deliberately cores if the two names together exceed 1000 chars
// . otherwise always returns true
bool setTimeAdjustmentFilename ( char *dir, char *filename ) {
	s_hasFileName = true;
	int32_t dirLen  = gbstrlen(dir);
	int32_t nameLen = gbstrlen(filename);
	// must fit into s_tafile with room to spare
	if ( dirLen + nameLen > 1000 ) { char *xx=NULL;*xx=0; }
	sprintf ( s_tafile , "%s/%s" , dir , filename );
	return true;
}
// . load the clock adjustment written by saveTimeAdjustment()
// . the file is one text line: "<local stamp time in ms> <adjustment in ms>"
// . if the stamp is no more than 2 days old we adopt the adjustment and
//   mark the clock as in sync. this prevents having to rebuild a
//   sortbydatetable, which would really slow down loadups
// . a stale or unparseable file is ignored (returns true, clock untouched)
// . returns true right away if setTimeAdjustmentFilename() was never called
// . returns false and sets g_errno if the file can not be opened or holds
//   10 bytes or less
bool loadTimeAdjustment ( ) {
	// bail if no filename to read
	if ( ! s_hasFileName ) return true;
	// read it in
	// one line in text
	int fd = open ( s_tafile , O_RDONLY );
	if ( fd < 0 ) {
		// grab errno before log() gets a chance to change it
		int openErr = errno;
		log("util: could not open %s for reading",s_tafile);
		g_errno = openErr;
		return false;
	}
	char rbuf[1024];
	// read in max bytes
	int nr = read ( fd , rbuf , 1000 );
	// grab errno before close()/log() get a chance to change it
	int readErr = errno;
	close(fd);
	if ( nr <= 10 || nr > 1000 ) {
		log("util: reading %s had error: %s",s_tafile,
		    mstrerror(readErr));
		g_errno = readErr;
		return false;
	}
	// read() does not terminate the buffer, but sscanf() needs a string
	rbuf[nr] = '\0';
	// parse the text line
	int64_t stampTime = 0LL;
	int64_t clockAdj  = 0LL;
	int nparsed = sscanf ( rbuf , "%"UINT64" %"INT64"", &stampTime, &clockAdj );
	// need both numbers or the file is of no use to us
	if ( nparsed != 2 ) {
		log("util: could not parse %s. ignoring it.",s_tafile);
		return true;
	}
	// get stamp age, in milliseconds
	int64_t local    = gettimeofdayInMillisecondsLocal();
	int64_t stampAge = local - stampTime;
	// . if too old forget about it
	// . the age is in MILLISECONDS so the 2 day limit must be too. this
	//   used to compare against 2*86400, i.e. 2 days in SECONDS, which
	//   made the real limit under 3 minutes
	if ( stampAge > 2*86400*1000LL ) return true;
	// update adjustment
	s_adjustment = clockAdj;
	// stamp in file is within 2 days old, assume its still good
	g_clockInSync = true;
	// note it
	log("util: loaded %s and put clock in sync. age=%"UINT64" adj=%"INT64"",
	    s_tafile,stampAge,clockAdj);
	return true;
}
// . write "<local time in ms> <clock adjustment in ms>\n" to the file named
//   by setTimeAdjustmentFilename(), replacing what was there
// . saved by Process::saveBlockingFiles1()
// . does nothing and returns true if no filename was set or the clock is
//   not in sync yet
// . returns false and sets g_errno on error
bool saveTimeAdjustment ( ) {
	// forget it if setTimeAdjustmentFilename never called
	if ( ! s_hasFileName ) return true;
	// must be in sync!
	if ( ! g_clockInSync ) return true;
	// store it
	int64_t local = gettimeofdayInMillisecondsLocal();
	char wbuf[1024];
	snprintf (wbuf,sizeof(wbuf),"%"UINT64" %"INT64"\n",local,s_adjustment);
	// write it out
	int fd = open ( s_tafile , O_CREAT|O_RDWR|O_TRUNC , 00666 );
	if ( fd < 0 ) {
		// grab errno before log() gets a chance to change it
		int openErr = errno;
		log("util: could not open %s for writing",s_tafile);
		g_errno = openErr;
		return false;
	}
	// how many bytes to write?
	int32_t len = gbstrlen(wbuf);
	// write them all in one go
	int nw = write ( fd , wbuf , len );
	// grab errno before close()/log() get a chance to change it
	int writeErr = errno;
	close(fd);
	if ( nw != len ) {
		log("util: writing %s had error: %s",s_tafile,
		    mstrerror(writeErr));
		g_errno = writeErr;
		return false;
	}
	// note it
	log("util: saved %s",s_tafile);
	// it was written ok
	return true;
}
// a "fake" settimeofdayInMilliseconds()
void settimeofdayInMillisecondsGlobal ( int64_t newTime ) {
// can't do this in sig handler
if ( g_inSigHandler ) return;
// this isn't async signal safe...
struct timeval tv;
gettimeofday ( &tv , NULL );
int64_t now=(int64_t)(tv.tv_usec/1000)+((int64_t)tv.tv_sec)*1000;
// bail if no change... UNLESS we need to sync clock!!
if ( s_adjustment == newTime - now && g_clockInSync ) return;
// log it, that way we know if there is another issue
// with flip-flopping (before we synced with host #0 and also
// with proxy #0)
int64_t delta = s_adjustment - (newTime - now) ;
if ( delta > 100 || delta < -100 )
logf(LOG_INFO,"gb: Updating clock adjustment from "
"%"INT64" ms to %"INT64" ms", s_adjustment , newTime - now );
// set adjustment
s_adjustment = newTime - now;
// return?
if ( g_clockInSync ) return;
// we are now in sync
g_clockInSync = true;
// log it
if ( s_hasFileName )
logf(LOG_INFO,"gb: clock is now synced with host #0. "
"saving to %s",s_tafile);
else
logf(LOG_INFO,"gb: clock is now synced with host #0.");
// save
saveTimeAdjustment();
// force timedb to load now!
//initAllSortByDateTables ( );
}
// global (synced) time in seconds
time_t getTimeGlobal() {
	int64_t nowMs = gettimeofdayInMillisecondsSynced();
	return nowMs / 1000;
}

// global time in seconds, through the "NoCore" millisecond getter
time_t getTimeGlobalNoCore() {
	int64_t nowMs = gettimeofdayInMillisecondsGlobalNoCore();
	return nowMs / 1000;
}

// same thing as getTimeGlobal()
time_t getTimeSynced() {
	int64_t nowMs = gettimeofdayInMillisecondsSynced();
	return nowMs / 1000;
}

// same thing as gettimeofdayInMillisecondsSynced()
int64_t gettimeofdayInMillisecondsGlobal() {
	int64_t nowMs = gettimeofdayInMillisecondsSynced();
	return nowMs;
}
#include "Threads.h"
int64_t gettimeofdayInMillisecondsSynced() {
// if in a sig handler then return g_now
//if ( g_inSigHandler ) return g_nowGlobal;
// i find that a pthread can call this function even though
// a signal handler is underway in the main thread!
if ( g_inSigHandler && ! g_threads.amThread() ) {
char *xx = NULL; *xx = 0; }
// sanity check
if ( ! isClockInSync() ) {
log("xml: clock not in sync with host #0 yet!!!!!!");
//char *xx = NULL; *xx = 0; }
}
//if ( ! g_clockInSync )
// log("gb: Getting global time but clock not in sync.");
// this isn't async signal safe...
struct timeval tv;
gettimeofday ( &tv , NULL );
int64_t now=(int64_t)(tv.tv_usec/1000)+((int64_t)tv.tv_sec)*1000;
// update g_nowLocal
if ( now > g_now ) g_now = now;
// adjust from Msg0x11 time adjustments
now += s_adjustment;
// update g_now if it is more accurate
//if ( now > g_nowGlobal ) g_nowGlobal = now;
return now;
}
int64_t gettimeofdayInMillisecondsGlobalNoCore() {
// if in a sig handler then return g_now
//if ( g_inSigHandler ) return g_nowGlobal;
// i find that a pthread can call this function even though
// a signal handler is underway in the main thread!
if ( g_inSigHandler && ! g_threads.amThread() ) {
char *xx = NULL; *xx = 0; }
// sanity check
//if ( ! g_clockInSync ) { char *xx = NULL; *xx = 0; }
//if ( ! g_clockInSync )
// log("gb: Getting global time but clock not in sync.");
// this isn't async signal safe...
struct timeval tv;
gettimeofday ( &tv , NULL );
int64_t now=(int64_t)(tv.tv_usec/1000)+((int64_t)tv.tv_sec)*1000;
// update g_nowLocal
if ( now > g_now ) g_now = now;
// adjust from Msg0x11 time adjustments
now += s_adjustment;
// update g_now if it is more accurate
//if ( now > g_nowGlobal ) g_nowGlobal = now;
return now;
}
// this machine's own time in milliseconds, just another name for
// gettimeofdayInMilliseconds()
int64_t gettimeofdayInMillisecondsLocal() {
	int64_t nowMs = gettimeofdayInMilliseconds();
	return nowMs;
}
// system clock, from gettimeofday(), as microseconds since the epoch
uint64_t gettimeofdayInMicroseconds(void) {
	struct timeval tv;
	gettimeofday(&tv, NULL);
	// whole seconds, scaled up to microseconds
	uint64_t micros = (uint64_t)tv.tv_sec * 1000000LL;
	// plus the microseconds into the current second
	micros += (uint64_t)tv.tv_usec;
	return micros;
}
// "local" means the time on this machine itself, NOT a timezone thing.
int64_t gettimeofdayInMilliseconds() {
// if in a sig handler then return g_now
//if ( g_inSigHandler ) return g_now;
// i find that a pthread can call this function even though
// a signal handler is underway in the main thread!
if ( g_inSigHandler && ! g_threads.amThread() ) {
char *xx = NULL; *xx = 0; }
// this isn't async signal safe...
struct timeval tv;
//g_loop.disableTimer();
gettimeofday ( &tv , NULL );
//g_loop.enableTimer();
int64_t now=(int64_t)(tv.tv_usec/1000)+((int64_t)tv.tv_sec)*1000;
// update g_nowLocal
if ( now > g_now ) g_now = now;
// adjust from Msg0x11 time adjustments
//now += s_adjustment;
// update g_now if it is more accurate
// . or don't, bad to update it here because it could be very different
// from what it should be
//if ( now > g_now ) g_now = now;
return now;
}
// shorthand for getTimeLocal()
time_t getTime () {
	time_t now = getTimeLocal();
	return now;
}
// . get this machine's time in seconds
// . this is gettimeofdayInMilliseconds() / 1000; NO clock adjustment is
//   applied here
// . cores if the main thread calls this from within a signal handler
time_t getTimeLocal () {
	// i find that a pthread can call this function even though
	// a signal handler is underway in the main thread!
	if ( g_inSigHandler && ! g_threads.amThread() ) {
		char *xx = NULL; *xx = 0; }
	// . get time now
	// . keep it in a time_t. going through a uint32_t, like this used
	//   to, would needlessly throw away everything above 32 bits
	time_t now = (time_t)( gettimeofdayInMilliseconds() / 1000 );
	return now;
}
// . make it so we can display the ascii string on an html browser
// . copies "t/tlen" into "s" (capacity "slen", \0 included), writing
//   < > & as &lt; &gt; &amp;
// . stops copying, silently truncating, once 6 or fewer bytes are left
//   before the slot kept for the \0
// . NUL-terminates "s" and returns the bytes stored, excluding the \0
// . returns 0 without touching "s" if slen <= 0
int32_t saftenTags2 ( char *s , int32_t slen , char *t , int32_t tlen ) {
	// bail if slen is 0
	if ( slen <= 0 ) return 0;
	char *out    = s;
	// leave a char for the \0
	char *outEnd = s + slen - 1;
	for ( int32_t i = 0 ; i < tlen && out + 6 < outEnd ; i++ ) {
		const char *entity;
		switch ( t[i] ) {
		case '<': entity = "&lt;" ; break;
		case '>': entity = "&gt;" ; break;
		case '&': entity = "&amp;"; break;
		default :
			// everything else is copied as is
			*out++ = t[i];
			continue;
		}
		size_t entityLen = strlen ( entity );
		memcpy ( out , entity , entityLen );
		out += entityLen;
	}
	// NULL terminate "s"
	*out = '\0';
	// return # of bytes, excluding \0, stored into s
	return out - s;
}
// . split a duration of "ms" milliseconds into whole days, hours, minutes,
//   seconds and left over milliseconds
// . division rounds toward zero, so a negative "ms" yields parts that are
//   all negative or zero
void getCalendarFromMs(int64_t ms,
		       int32_t* days,
		       int32_t* hours,
		       int32_t* minutes,
		       int32_t* secs,
		       int32_t* msecs) {
	enum {
		MS_PER_SEC  = 1000,
		MS_PER_MIN  = 60 * MS_PER_SEC,
		MS_PER_HOUR = 60 * MS_PER_MIN,
		MS_PER_DAY  = 24 * MS_PER_HOUR
	};
	// peel the units off from the biggest down
	int64_t rest = ms;
	*days    = rest / MS_PER_DAY;
	rest    %= MS_PER_DAY;
	*hours   = rest / MS_PER_HOUR;
	rest    %= MS_PER_HOUR;
	*minutes = rest / MS_PER_MIN;
	rest    %= MS_PER_MIN;
	*secs    = rest / MS_PER_SEC;
	*msecs   = rest % MS_PER_SEC;
}
// . sum, wrapping at 32 bits, of all the whole native-endian 32-bit words
//   in "buf/bufLen"
// . the 1 to 3 bytes after the last whole word are NOT part of the sum
// . returns 0 if bufLen is below 4
// . "buf" may have any alignment. each word is fetched with memcpy(), so
//   we do not read the char buffer through a uint32_t ptr
uint32_t calculateChecksum(char *buf, int32_t bufLen){
	uint32_t sum = 0;
	// whole words only
	int32_t numWords = bufLen >> 2;
	for ( int32_t i = 0 ; i < numWords ; i++ ) {
		uint32_t word;
		memcpy ( &word , buf + (size_t)i * sizeof(word) , sizeof(word) );
		sum += word;
	}
	return sum;
}
// does the tag text "tag/tagLen" contain "href" or "onclick", as matched
// by strncasestr()?
bool anchorIsLink( char *tag, int32_t tagLen){
	// neither one in there? then it is not a link
	if ( ! strncasestr(tag, tagLen, "href") &&
	     ! strncasestr(tag, tagLen, "onclick") )
		return false;
	return true;
}
// is any byte in [s,send) alphabetic according to is_alpha_a()?
bool has_alpha_a ( char *s , char *send ) {
	while ( s < send ) {
		if ( is_alpha_a ( *s ) ) return true;
		s++;
	}
	// looked at them all
	return false;
}
// . is any utf8 character in [s,send) alphabetic?
// . one byte chars are checked with is_alpha_a(), longer ones with
//   is_alpha_utf8()
bool has_alpha_utf8 ( char *s , char *send ) {
	while ( s < send ) {
		// how many bytes make up this char?
		char charLen = getUtf8CharSize ( s );
		if ( charLen == 1 ) {
			if ( is_alpha_a ( *s ) ) return true;
		}
		else {
			if ( is_alpha_utf8 ( s ) ) return true;
		}
		// on to the next char
		s += charLen;
	}
	return false;
}
// . skips the leading whitespace of the NUL-terminated string "input"
// . sets *numPtr to the first non-space char (or to the terminating \0 if
//   there is none)
// . returns a ptr to the next whitespace char after that, or to the
//   terminating \0 if there is none
// . chars are cast to unsigned char for isspace(): handing it a negative
//   char value is undefined
char* getNextNum(char* input, char** numPtr) {
	char *p = input;
	// skip leading spaces
	while ( *p && isspace ( (unsigned char)*p ) ) p++;
	*numPtr = p;
	// then find where the token ends
	char *nextspace = p;
	while ( *nextspace && ! isspace ( (unsigned char)*nextspace ) )
		nextspace++;
	return nextspace;
}
#include "HttpMime.h" // CT_HTML
// . strip html tags out of "content", how aggressively depends on "strip":
//     0 : nothing to do, "contentLen" is returned as is
//     1 : keep a tag according to its m_filterKeep1 setting in g_nodes[]
//     2 : same but using m_filterKeep2, and <link> tags are kept when they
//         have rel=stylesheet
//     3 : remove ALL tags and whatever is between script or style tags,
//         but img tags are replaced with their alt (or title) text
//     4 : only remove script tags and what is between them
// . the kept nodes are packed down, in place, to the front of the buffer
//   that the Xml parse of "content" refers to (presumably "content" itself,
//   see the note in the code) and the result is NUL-terminated
// . returns length of stripped content, but will set g_errno and return -1
//   on error
// . returns 0 if "content" is NULL or empty
// . the packing uses memmove(), not memcpy(): a kept node is copied to a
//   lower address inside the same buffer, so source and destination overlap
//   unless the gap is at least as big as the node
int32_t stripHtml( char *content, int32_t contentLen, int32_t version, int32_t strip ) {
	if ( !strip ) {
		log( LOG_WARN, "query: html stripping not required!" );
		return contentLen;
	}
	if ( ! content )
		return 0;
	if ( contentLen == 0 )
		return 0;

	// filter content if we should
	// keep this on the big stack so "content" still references something
	Xml tmpXml;
	// . get the content as xhtml (should be NULL terminated)
	// . parse as utf8 since all we are doing is messing with
	//   the tags...content manipulation comes later
	if ( ! tmpXml.set ( content , contentLen,
			    false, 0, false, version , true , 0 , CT_HTML ) )
		return -1;

	// go tag by tag
	int32_t     n     = tmpXml.getNumNodes();
	XmlNode *nodes = tmpXml.getNodes();
	// Xml class may have converted to utf16
	content    = tmpXml.getContent();
	contentLen = tmpXml.getContentLen();
	// "x" is where the next kept node gets written
	char *x    = content;
	char *xend = content + contentLen;
	// id of the tag whose contents we are currently dropping, and how
	// deeply it is nested. stackc is 0 when we are not dropping.
	int32_t stackid = -1;
	int32_t stackc  =  0;
	// toggled by script (and, for strip 3, style) tags
	char skipIt  =  0;
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// get id of this node, text nodes have id 0
		int32_t id = nodes[i].m_nodeId;

		// if strip is 4, just remove the script tag
		if( strip == 4 ){
			if ( id ){
				if ( id == 83 ){
					skipIt ^= 1;
					continue;
				}
			}
			else if ( skipIt ) continue;
			goto keepit;
		}

		// if strip is 3, ALL tags will be removed!
		if( strip == 3 ) {
			if( id ) {
				// . we dont want anything in between:
				//   - script tags (83)
				//   - style tags  (111)
				if ((id == 83) || (id == 111)) skipIt ^= 1;
				// save img to have alt text kept.
				if ( id == 54  ) goto keepit;
				continue;
			}
			else {
				if( skipIt ) continue;
				goto keepit;
			}
		}
		// get the filter setting of this tag
		int32_t fk;
		if   ( strip == 1 ) fk = g_nodes[id].m_filterKeep1;
		else                fk = g_nodes[id].m_filterKeep2;
		// if tag is <link ...> only keep it if it has
		// rel="stylesheet" or rel=stylesheet
		if ( strip == 2 && id == 62 ) { // <link> tag id
			int32_t  fflen;
			char *ff = nodes[i].getFieldValue ( "rel" , &fflen );
			if ( ff && fflen == 10 &&
			     strncmp(ff,"stylesheet",10) == 0 )
				goto keepit;
		}
		// just remove just the tag if this is 2
		if ( fk == 2 ) continue;
		// keep it if not in a stack
		if ( ! stackc && fk ) goto keepit;
		// if no front/back for tag, just skip it
		if ( ! nodes[i].m_hasBackTag ) continue;
		// start stack if none
		if ( stackc == 0 ) {
			// but not if this is a back tag
			if ( nodes[i].m_node[1] == '/' ) continue;
			// now start the stack
			stackid = id;
			stackc  =  1;
			continue;
		}
		// skip if this tag does not match what is on stack
		if ( id != stackid ) continue;
		// if ANOTHER front tag, inc stack
		if ( nodes[i].m_node[1] != '/' ) stackc++;
		// otherwise, dec the stack count
		else                             stackc--;
		// . ensure not negative from excess back tags
		// . reset stackid to -1 to indicate no stack
		if ( stackc <= 0 ) { stackid= -1; stackc = 0; }
		// skip it
		continue;

	keepit:
		// replace images with their alt text
		int32_t vlen;
		char *v;
		if ( id == 54 ) {
			v = nodes[i].getFieldValue("alt", &vlen );
			// try title if no alt text
			if ( ! v )
				v = nodes[i].getFieldValue("title", &vlen );
			// the value sits inside the node, at or after "x"
			if ( v ) { memmove ( x, v, vlen ); x += vlen; }
			continue;
		}
		// remove background image from body,table,td tags
		if ( id == 19 || id == 93 || id == 95 ) {
			v = nodes[i].getFieldValue("background", &vlen);
			// remove background, just sabotage it by clobbering
			// the byte 4 before the value (presumably part of
			// the attribute name -- TODO confirm)
			if ( v ) v[-4] = 'x';
		}
		// store it. the node lives at or after "x" in the same buffer
		memmove ( x , nodes[i].m_node , nodes[i].m_nodeLen );
		x += nodes[i].m_nodeLen;
		// sanity check
		if ( x > xend ) { char *xx=NULL;*xx=0;}
	}
	contentLen = x - content;
	content [ contentLen ] = '\0';
	return contentLen;
}
// . is "s" a char we accept inside a url?
// . that is anything isalnum() accepts plus  / : _ - . ? $ , ~ = # & % + @
// . the char is cast to unsigned char for isalnum(): handing it a negative
//   char value (any byte >= 0x80 where char is signed) is undefined
bool is_urlchar(char s) {
	// [a-z0-9/:_-.?$,~=#&%+@]
	if ( isalnum ( (unsigned char)s ) ) return true;
	switch ( s ) {
	case '/':
	case ':':
	case '_':
	case '-':
	case '.':
	case '?':
	case '$':
	case ',':
	case '~':
	case '=':
	case '#':
	case '&':
	case '%':
	case '+':
	case '@':
		return true;
	default:
		return false;
	}
}
// . don't allow "> in our input boxes
// . copies "inbuf/inbufLen" into "outbuf" (capacity "outbufSize", \0
//   included), truncating if it does not fit
// . if a '>' shows up while an odd number of double quotes has been seen,
//   it could be an xss attempt: the output is cut off right before the
//   last double quote and copying stops
// . NUL-terminates "outbuf" and returns the bytes stored, excluding the \0
// . if outbufSize <= 0 nothing is written and 0 is returned (a \0 used to
//   be stored into the zero-size buffer)
int32_t cleanInput(char *outbuf, int32_t outbufSize, char *inbuf, int32_t inbufLen){
	// no room for even the \0
	if ( outbufSize <= 0 ) return 0;
	char *p = outbuf;
	int32_t numQuotes = 0;
	// input offset of the last quote. output is a 1:1 copy of the
	// input, so this is its output offset too
	int32_t lastQuote = 0;
	for ( int32_t i = 0 ; i < inbufLen ; i++ ) {
		// always keep one byte for the \0
		if ( p - outbuf >= outbufSize - 1 ) break;
		if ( inbuf[i] == '"' ) {
			numQuotes++;
			lastQuote = i;
		}
		// if we have an odd number of quotes and a close angle bracket
		// it could be an xss attempt
		if ( inbuf[i] == '>' && (numQuotes & 1) ) {
			p = outbuf + lastQuote;
			break;
		}
		*p++ = inbuf[i];
	}
	*p = '\0';
	return p - outbuf;
}
//
// get rid of the virtual Msg class because it screws up how we
// serialize/deserialize everytime we compile gb it seems
//
// . how many bytes a serialized msg takes up
// . that is "baseSize" plus every int32_t size field from "firstSizeParm"
//   up to AND INCLUDING "lastSizeParm"; the size fields are laid out as
//   one array
int32_t getMsgStoredSize ( int32_t baseSize,
			   int32_t *firstSizeParm,
			   int32_t *lastSizeParm ) {
	int32_t total = baseSize;
	// add up string buffer sizes
	int32_t *cur = firstSizeParm;
	while ( cur <= lastSizeParm ) {
		total += *cur;
		cur++;
	}
	return total;
}
// . serialize a msg: its first "baseSize" bytes (taken from "thisPtr")
//   followed by each of its strings, back to back
// . the int32_t sizes firstSizeParm..lastSizeParm (inclusive) and the
//   char ptrs starting at "firstStrPtr" are parallel arrays, one entry
//   per string
// . "userBuf" is used if it is big enough ("userBufSize"), otherwise a
//   buffer is allocated with mmalloc()
// . a NULL string ptr is a "bookmark": its space is reserved but nothing
//   is copied into it
// . if "makePtrsRefNewBuf" is true every string ptr is changed to point
//   at its slot in the new buffer
// . *retSize is set to the serialized size
// . return ptr to the buffer we serialized into
// . return NULL if the allocation fails, g_errno should be set then
char *serializeMsg ( int32_t  baseSize ,
		     int32_t *firstSizeParm ,
		     int32_t *lastSizeParm ,
		     char **firstStrPtr ,
		     void *thisPtr ,
		     int32_t *retSize     ,
		     char *userBuf     ,
		     int32_t  userBufSize ,
		     bool  makePtrsRefNewBuf ) {
	// how big is it going to be?
	int32_t need = getMsgStoredSize(baseSize,firstSizeParm,lastSizeParm);
	// use the caller's buffer if it is big enough
	char *buf = NULL;
	if ( userBufSize >= need ) buf = userBuf;
	// alloc if we should
	if ( ! buf ) buf = (char *)mmalloc ( need , "Ra" );
	// bail on error, g_errno should be set
	if ( ! buf ) return NULL;
	// set how many bytes we will serialize into
	*retSize = need;
	// copy the easy stuff
	memcpy ( buf , (char *)thisPtr , baseSize );
	// the strings go right behind it
	char *dst = buf + baseSize;
	int32_t numStrings = lastSizeParm - firstSizeParm + 1;
	for ( int32_t i = 0 ; i < numStrings ; i++ ) {
		char *str = firstStrPtr[i];
		// NULL means "bookmark": space is reserved but filled in
		// by the caller after we return
		if ( str ) {
			// sanity check -- cannot copy onto ourselves
			if ( dst > str && dst < str + firstSizeParm[i] ) {
				char *xx = NULL; *xx = 0; }
			// copy the string into the buffer
			memcpy ( dst , str , firstSizeParm[i] );
		}
		// . make it point into the buffer now
		// . MDW: why? that is causing problems for the re-call in
		//   Msg3a, it calls this twice with the same "m_r"
		if ( makePtrsRefNewBuf ) firstStrPtr[i] = dst;
		// advance our destination ptr
		dst += firstSizeParm[i];
	}
	return buf;
}
// . convert offsets back into ptrs
// . the strings of a serialized msg sit back to back in "stringBuf"; make
//   each char ptr, starting at "firstStrPtr", point at its string there
// . the length of string #i is the i-th int32_t size, the sizes running
//   from "firstSizeParm" up to AND INCLUDING "lastSizeParm"
// . a string of size 0 gets a NULL ptr
// . cores on a negative size
// . returns "baseSize" plus the bytes of "stringBuf" walked over
int32_t deserializeMsg ( int32_t  baseSize ,
			 int32_t *firstSizeParm ,
			 int32_t *lastSizeParm ,
			 char **firstStrPtr ,
			 char *stringBuf ) {
	// where the next string starts
	char *cursor = stringBuf;
	int32_t numStrings = lastSizeParm - firstSizeParm + 1;
	for ( int32_t i = 0 ; i < numStrings ; i++ ) {
		int32_t len = firstSizeParm[i];
		// empty strings are NULL, not a ptr to nothing
		if ( len == 0 ) firstStrPtr[i] = NULL;
		else            firstStrPtr[i] = cursor;
		// sanity check
		if ( len < 0 ) { char *xx = NULL; *xx =0; }
		// advance to the next string
		cursor += len;
	}
	// return how many bytes we processed
	return baseSize + (cursor - stringBuf);
}
// . debug helper for Dates.cpp: print "ttt" in UTC, followed by " UTC"
//   and a newline
// . the output goes to stderr
// . always returns 0
int32_t printTime ( time_t ttt ) {
	// print in UTC!
	struct tm *utc  = gmtime ( &ttt );
	char      *text = asctime ( utc );
	// strip \n
	int32_t textLen = gbstrlen(text);
	text [ textLen - 1 ] = '\0';
	fprintf(stderr,"%s UTC\n",text);
	return 0;
}
// . mktime() for a "struct tm" whose fields are meant as UTC
// . mktime() itself uses our local timezone, so we take the global
//   "timezone" back off its result
// . a negative mktime() result is taken to mean failure and returned as is
time_t mktime_utc ( struct tm *ttt ) {
	time_t asLocal = mktime ( ttt );
	// bad?
	if ( asLocal < 0 ) return asLocal;
	// mod that by our timezone
	time_t asUtc = asLocal - timezone;
	return asUtc;
}
// . does "txt/tlen" look like well-formed utf8?
// . every byte with the hi bit set must start a sequence: both hi bits of
//   the lead byte set, a length (from getUtf8CharSize()) of 2 or more, and
//   each of up to 3 following bytes of the form 10xxxxxx
// . a sequence that does not fit into the buffer makes the text invalid.
//   we check for that BEFORE looking at the continuation bytes, which used
//   to be read even when they were beyond txt+tlen
// . a NULL or empty text counts as valid
bool verifyUtf8 ( char *txt , int32_t tlen ) {
	if ( ! txt || tlen <= 0 ) return true;
	char size;
	char *p    = txt;
	char *pend = txt + tlen;
	for ( ; p < pend ; p += size ) {
		size = getUtf8CharSize(p);
		// skip if ascii
		if ( ! (p[0] & 0x80) ) continue;
		// ok, it's a utf8 char, it must have both hi bits set
		if ( (p[0] & 0xc0) != 0xc0 ) return false;
		// if only one byte, we are done.. how can that be?
		if ( size <= 1 ) return false;
		// truncated sequence? do not read beyond the buffer
		if ( size > pend - p ) return false;
		// successive utf8 chars must have & 0xc0 be equal to 0x80
		// but the first char it must equal 0xc0, both set
		if ( (p[1] & 0xc0) != 0x80 ) return false;
		if ( size == 2 ) continue;
		if ( (p[2] & 0xc0) != 0x80 ) return false;
		if ( size == 3 ) continue;
		if ( (p[3] & 0xc0) != 0x80 ) return false;
	}
	// we have to end up exactly at the end
	if ( p != pend ) return false;
	return true;
}
// . like verifyUtf8(txt,tlen) but for a NUL-terminated string
// . a NULL "txt" counts as valid, same as in the two argument version.
//   it used to get handed to gbstrlen() first.
bool verifyUtf8 ( char *txt ) {
	if ( ! txt ) return true;
	int32_t tlen = gbstrlen(txt);
	return verifyUtf8(txt,tlen);
}