mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 20:27:43 +03:00
8a49e87a61
now we store a "sharded by termid" bit in posdb key for checksums, etc keys that are not sharded by docid. save having to do disk seeks on every host in the cluster to do a dup check, etc.
2415 lines
62 KiB
C++
2415 lines
62 KiB
C++
#include "gb-include.h"
|
|
|
|
#include "Loop.h"
|
|
#include "Entities.h"
|
|
#include "UCWordIterator.h"
|
|
#include "SafeBuf.h"
|
|
#include "Xml.h"
|
|
#include "XmlNode.h"
|
|
#include "iana_charset.h"
|
|
|
|
static bool g_clockInSync = false;
|
|
|
|
bool isClockInSync() {
|
|
if ( g_hostdb.m_initialized && g_hostdb.m_hostId == 0 ) return true;
|
|
return g_clockInSync;
|
|
}
|
|
|
|
|
|
bool print96 ( char *k ) {
|
|
key_t *kp = (key_t *)k;
|
|
printf("n1=0x%lx n0=0x%llx\n",(long)kp->n1,(long long)kp->n0);
|
|
return true;
|
|
}
|
|
|
|
bool print96 ( key_t *kp ) {
|
|
printf("n1=0x%lx n0=0x%llx\n",(long)kp->n1,(long long)kp->n0);
|
|
return true;
|
|
}
|
|
|
|
bool print128 ( char *k ) {
|
|
key128_t *kp = (key128_t *)k;
|
|
printf("n1=0x%llx n0=0x%llx\n",(long long)kp->n1,(long long)kp->n0);
|
|
return true;
|
|
}
|
|
|
|
bool print128 ( key128_t *kp ) {
|
|
printf("n1=0x%llx n0=0x%llx\n",(long long)kp->n1,(long long)kp->n0);
|
|
return true;
|
|
}
|
|
|
|
// . put all the maps here now
|
|
// . convert "c" to lower case
|
|
const char g_map_to_lower[] = {
|
|
0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 ,
|
|
8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 ,
|
|
16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 ,
|
|
24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 ,
|
|
32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 ,
|
|
40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 ,
|
|
48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 ,
|
|
56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 ,
|
|
64 , 'a','b' ,'c' ,'d' ,'e' ,'f' ,'g' ,
|
|
'h', 'i','j' ,'k' ,'l' ,'m' ,'n' ,'o' ,
|
|
'p', 'q','r' ,'s' ,'t' ,'u' ,'v' ,'w' ,
|
|
'x', 'y','z' , 91 , 92 ,93 ,94 ,95 ,
|
|
96 , 'a','b' ,'c' ,'d' ,'e' ,'f' ,'g' ,
|
|
'h', 'i','j' ,'k' ,'l' ,'m' ,'n' ,'o' ,
|
|
'p', 'q','r' ,'s' ,'t' ,'u' ,'v' ,'w' ,
|
|
'x', 'y','z' ,123 ,124 ,125 ,126 ,127 ,
|
|
128,129,130,131,132,133,134,135,
|
|
136,137,138,139,140,141,142,143,
|
|
144,145,146,147,148,149,150,151,
|
|
152,153,154,155,156,157,158,159,
|
|
160,161,162,163,164,165,166,167,
|
|
168,169,170,171,172,173,174,175,
|
|
176,177,178,179,180,181,182,183,
|
|
184,185,186,187,188,189,190,191,
|
|
224,225,226,227,228,229,230,231,
|
|
232,233,234,235,236,237,238,239,
|
|
240,241,242,243,244,245,246,247,
|
|
248,249,250,251,252,253,254,223,
|
|
224,225,226,227,228,229,230,231,
|
|
232,233,234,235,236,237,238,239,
|
|
240,241,242,243,244,245,246,247,
|
|
248,249,250,251,252,253,254,255
|
|
};
|
|
|
|
|
|
// converts ascii chars and IS_O chars to their lower case versions
|
|
const char g_map_to_upper[] = {
|
|
0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 ,
|
|
8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 ,
|
|
16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 ,
|
|
24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 ,
|
|
32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 ,
|
|
40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 ,
|
|
48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 ,
|
|
56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 ,
|
|
64 , 'A','B' ,'C' ,'D' ,'E' ,'F' ,'G' ,
|
|
'H', 'I','J' ,'K' ,'L' ,'M' ,'N' ,'O' ,
|
|
'P', 'Q','R' ,'S' ,'T' ,'U' ,'V' ,'W' ,
|
|
'X', 'Y','Z' , 91 , 92 ,93 ,94 ,95 ,
|
|
96 , 'A','B' ,'C' ,'D' ,'E' ,'F' ,'G' ,
|
|
'H', 'I','J' ,'K' ,'L' ,'M' ,'N' ,'O' ,
|
|
'P', 'Q','R' ,'S' ,'T' ,'U' ,'V' ,'W' ,
|
|
'X', 'Y','Z' ,123 ,124 ,125 ,126 ,127 ,
|
|
128,129,130,131,132,133,134,135,
|
|
136,137,138,139,140,141,142,143,
|
|
144,145,146,147,148,149,150,151,
|
|
152,153,154,155,156,157,158,159,
|
|
160,161,162,163,164,165,166,167,
|
|
168,169,170,171,172,173,174,175,
|
|
176,177,178,179,180,181,182,183,
|
|
184,185,186,187,188,189,190,191,
|
|
192,193,194,195,196,197,198,199,
|
|
200,201,202,203,204,205,206,207,
|
|
208,209,210,211,212,213,214,215,
|
|
216,217,218,219,220,221,222,223,
|
|
192,193,194,195,196,197,198,199,
|
|
200,201,202,203,204,205,206,207,
|
|
208,209,210,211,212,213,214,215,
|
|
216,217,218,219,220,221,222,255
|
|
};
|
|
|
|
const char g_map_to_ascii[] = {
|
|
0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 ,
|
|
8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 ,
|
|
16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 ,
|
|
24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 ,
|
|
32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 ,
|
|
40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 ,
|
|
48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 ,
|
|
56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 ,
|
|
64 , 'A','B' ,'C' ,'D' ,'E' ,'F' ,'G' ,
|
|
'H', 'I','J' ,'K' ,'L' ,'M' ,'N' ,'O' ,
|
|
'P', 'Q','R' ,'S' ,'T' ,'U' ,'V' ,'W' ,
|
|
'X', 'Y','Z' , 91 , 92 ,93 ,94 ,95 ,
|
|
96 , 'a','b' ,'c' ,'d' ,'e' ,'f' ,'g' ,
|
|
'h', 'i','j' ,'k' ,'l' ,'m' ,'n' ,'o' ,
|
|
'p', 'q','r' ,'s' ,'t' ,'u' ,'v' ,'w' ,
|
|
'x', 'y','z' ,123 ,124 ,125 ,126 ,127 ,
|
|
128,129,130,131, 132,133,134,135,
|
|
136,137,138,139, 140,141,142,143,
|
|
144,145,146,147, 148,149,150,151,
|
|
152,153,154,155, 156,157,158,159,
|
|
160,161,162,'#', 'o','Y','|','S',
|
|
168,169,'a',171, 172,173,174,175,
|
|
176,177,'2','3', 180,'u',182,183,
|
|
' ','1','o',187, 188,189,190,'?',
|
|
'A','A','A','A', 'A','A','A'/*198-AE*/,'C',
|
|
'E','E','E','E', 'I','I','I','I',
|
|
'D','N','O','O', 'O','O','O','x',
|
|
'O','U','U','U', 'U','Y',222/*TH*/,'s'/*changed from B*/,
|
|
'a','a','a','a', 'a','a','a'/*230-ae*/,'c',
|
|
'e','e','e','e', 'i','i','i','i',
|
|
'd','n','o','o', 'o','o','o','/',
|
|
'o','u','u','u', 'u','y',254/*th*/,'y'
|
|
};
|
|
|
|
|
|
const char g_map_is_upper[] = {
|
|
0,0,0,0,0,0,0,0, // 0 -7
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,1,1,1,1,1,1,1, // 64
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,0,0,0,0,0, // 88
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 15*8
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 20*8
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
1,1,1,1,1,1,1,1, // 192
|
|
1,1,1,1,1,1,1,1, // 200
|
|
1,1,1,1,1,1,1,0, // 208
|
|
1,1,1,1,1,1,1,1, // 216
|
|
0,0,0,0,0,0,0,0, // 224
|
|
0,0,0,0,0,0,0,0, // 232
|
|
0,0,0,0,0,0,0,0, // 240
|
|
0,0,0,0,0,0,0,0}; // 248
|
|
|
|
|
|
// can this character be in an html (or xml) tag name??
|
|
const char g_map_canBeInTagName[] = {
|
|
0,0,0,0,0,0,0,0, // 0 -7
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,1,0,0, // 40 -- hyphen can be in tag name
|
|
0,0,0,0,0,0,0,0, // 48
|
|
0,0,0,0,0,0,0,0, // 56
|
|
0,1,1,1,1,1,1,1, // 64
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,0,0,0,0,0, // 88
|
|
0,1,1,1,1,1,1,1, // 96
|
|
1,1,1,1,1,1,1,1, // 104
|
|
1,1,1,1,1,1,1,1, // 112
|
|
1,1,1,0,0,0,0,0, // 15*8 = 120
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 20*8 = 160
|
|
0,0,0,0,0,0,0,0, // 168
|
|
0,0,0,0,0,0,0,0, // 176
|
|
0,0,0,0,0,0,0,0, // 184
|
|
0,0,0,0,0,0,0,0, // 192
|
|
0,0,0,0,0,0,0,0, // 200
|
|
0,0,0,0,0,0,0,0, // 208
|
|
0,0,0,0,0,0,0,0, // 216
|
|
0,0,0,0,0,0,0,0, // 224
|
|
0,0,0,0,0,0,0,0, // 232
|
|
0,0,0,0,0,0,0,0, // 240
|
|
0,0,0,0,0,0,0,0}; // 248
|
|
|
|
|
|
const char g_map_is_control [] = {
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 64
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 96
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,1, // 120, 127 = DEL
|
|
1,1,1,1,1,1,1,1, // 128
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,0,0,0,0,0,0,0, // 160 = backspace
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 192
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 224
|
|
0,0,0,0,0,0,0,0, // 232
|
|
0,0,0,0,0,0,0,0, // 240
|
|
0,0,0,0,0,0,0,0}; // 248
|
|
|
|
// people mix windows 1252 into latin-1 so we have to be less restrictive here...
|
|
const char g_map_is_binary[] = {
|
|
1,1,1,1,1,1,1,1,
|
|
1,0,0,1,1,0,1,1, // \t=9 \n = 10 \r = 13
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 64
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 96
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,1, // 120, 127 = DEL
|
|
0,1,1,1,1,1,1,1, // 128 (128 is a quote)
|
|
1,1,1,1,1,1,1,1, // 136
|
|
1,0,0,0,0,0,0,1, // 144 (145 146 147 are quotes, 148 is dash, 149 bullet,150 dash)
|
|
0,0,1,1,0,0,1,1, // 152 (152 & 153 are quotes, 156 & 157 are double quotes)
|
|
0,0,0,0,0,0,0,0, // 160 = backspace (some urls have this???)
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 192
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 224
|
|
0,0,0,0,0,0,0,0, // 232
|
|
0,0,0,0,0,0,0,0, // 240
|
|
0,0,0,0,0,0,0,0}; // 248
|
|
|
|
// ' ' '\n' '\t' '\r'
|
|
const char g_map_is_wspace[] = {
|
|
0,0,0,0,0,0,0,0, // 0 -7
|
|
0,1,1,0,0,1,0,0, // \t=9 \n = 10 \r = 13
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
1,0,0,0,0,0,0,0, // space=32
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 64
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 88
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 15*8
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 160 -- turn off 160, it might be utf8 byte
|
|
0,0,0,0,0,0,0,0, // 168
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 192
|
|
0,0,0,0,0,0,0,0, // 200
|
|
0,0,0,0,0,0,0,0, // 208
|
|
0,0,0,0,0,0,0,0, // 216
|
|
0,0,0,0,0,0,0,0, // 224
|
|
0,0,0,0,0,0,0,0, // 232
|
|
0,0,0,0,0,0,0,0, // 240
|
|
0,0,0,0,0,0,0,0}; // 248
|
|
|
|
|
|
// '\n'
|
|
const char g_map_is_vspace[] = {
|
|
0,0,0,0,0,0,0,0, // 0 -7
|
|
0,0,1,0,0,0,0,0, // \t=9 \n = 10
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // space=32
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 64
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 88
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 15*8
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 20*8
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
1,1,1,1,1,1,1,1, // 192
|
|
1,1,1,1,1,1,1,1, // 200
|
|
1,1,1,1,1,1,1,0, // 208
|
|
1,1,1,1,1,1,1,1, // 216
|
|
0,0,0,0,0,0,0,0, // 224
|
|
0,0,0,0,0,0,0,0, // 232
|
|
0,0,0,0,0,0,0,0, // 240
|
|
0,0,0,0,0,0,0,0}; // 248
|
|
|
|
// ' ' '\t'
|
|
const char g_map_is_hspace[] = {
|
|
0,0,0,0,0,0,0,0, // 0 -7
|
|
0,1,0,0,0,0,0,0, // \t=9 \n = 10
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
1,0,0,0,0,0,0,0, // space=32
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 64
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 88
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 15*8
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 20*8
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
1,1,1,1,1,1,1,1, // 192
|
|
1,1,1,1,1,1,1,1, // 200
|
|
1,1,1,1,1,1,1,0, // 208
|
|
1,1,1,1,1,1,1,1, // 216
|
|
0,0,0,0,0,0,0,0, // 224
|
|
0,0,0,0,0,0,0,0, // 232
|
|
0,0,0,0,0,0,0,0, // 240
|
|
0,0,0,0,0,0,0,0}; // 248
|
|
|
|
|
|
const char g_map_is_vowel[] = {
|
|
0,0,0,0,0,0,0,0, // 0 -7
|
|
0,0,0,0,0,0,0,0, // 8-15
|
|
0,0,0,0,0,0,0,0, // 16-
|
|
0,0,0,0,0,0,0,0, // 24-
|
|
0,0,0,0,0,0,0,0, // 32-
|
|
0,0,0,0,0,0,0,0, // 40-
|
|
0,0,0,0,0,0,0,0, // 48-
|
|
0,0,0,0,0,0,0,0, // 56-
|
|
0,1,0,0,0,1,0,0, // 64 (A=65)
|
|
0,1,0,0,0,0,0,1, // 72
|
|
0,0,0,0,0,1,0,0, // 80
|
|
0,0,0,0,0,0,0,0, // 88-
|
|
0,1,0,0,0,1,0,0, // 96- (a=97)
|
|
0,1,0,0,0,0,0,1,
|
|
0,0,0,0,0,1,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 128
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 160
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 192
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0};
|
|
|
|
|
|
|
|
// converts ascii chars and IS_O chars to their lower case versions
|
|
const char g_map_is_lower[] = { // 97-122 and 224-255 (excluding 247)
|
|
0,0,0,0,0,0,0,0, // 0 -7
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 64
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,1,1,1,1,1,1,1, // 96
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,0,0,0,0,0, // 120
|
|
0,0,0,0,0,0,0,0, // 128
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 160
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 192
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
1,1,1,1,1,1,1,1, // 224
|
|
1,1,1,1,1,1,1,1, // 232
|
|
1,1,1,1,1,1,1,0, // 240
|
|
1,1,1,1,1,1,1,1}; // 248
|
|
|
|
const char g_map_is_ascii[] = { // 32 to 126
|
|
0,0,0,0,0,0,0,0, // 0
|
|
0,0,0,0,0,0,0,0, // 8
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
1,1,1,1,1,1,1,1, // 32
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,0,
|
|
0,0,0,0,0,0,0,0, // 128
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0};
|
|
|
|
// just from 0-127, used by the inlined *_utf8() functions in fctypes.h
|
|
const char g_map_is_ascii3[] = { // 32 to 126
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1, // 32
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
0,0,0,0,0,0,0,0, // 128
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0};
|
|
|
|
|
|
|
|
const char g_map_is_iso[] = { // 32 to 126
|
|
0,0,0,0,0,0,0,0, // 0
|
|
0,0,0,0,0,0,0,0, // 8
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 32
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 128
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 152
|
|
0,1,1,1,1,1,1,1, // 160
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1};
|
|
|
|
const char g_map_is_punct[] = { // 33-47, 58-64, 91-96, 123-126, 161-191, 215,247
|
|
0,0,0,0,0,0,0,0, // 0
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 16
|
|
0,0,0,0,0,0,0,0,
|
|
0,1,1,1,1,1,1,1, // 32
|
|
1,1,1,1,1,1,1,1,
|
|
0,0,0,0,0,0,0,0, // 48
|
|
0,0,1,1,1,1,1,1,
|
|
1,0,0,0,0,0,0,0, // 64
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 80
|
|
0,0,0,1,1,1,1,1,
|
|
1,0,0,0,0,0,0,0, // 96
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 112
|
|
0,0,0,1,1,1,1,0,
|
|
0,0,0,0,0,0,0,0, // 128
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 144
|
|
0,0,0,0,0,0,0,0,
|
|
0,1,1,1,1,1,1,1, // 160
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1, // 176
|
|
1,1,1,1,1,1,1,1,
|
|
0,0,0,0,0,0,0,0, // 192
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,1, // 208
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 224
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,1, // 240
|
|
0,0,0,0,0,0,0,0}; // 248
|
|
|
|
const char g_map_is_alnum[] = { // 48-57, 65-90,97-122,192-255(excluding 215,247)
|
|
0,0,0,0,0,0,0,0, // 0
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 16
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 32
|
|
0,0,0,0,0,0,0,0,
|
|
1,1,1,1,1,1,1,1, // 48
|
|
1,1,0,0,0,0,0,0,
|
|
0,1,1,1,1,1,1,1, // 64
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1, // 80
|
|
1,1,1,0,0,0,0,0,
|
|
0,1,1,1,1,1,1,1, // 96
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1, // 112
|
|
1,1,1,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 128
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 144
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 160
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 176
|
|
0,0,0,0,0,0,0,0,
|
|
|
|
1,1,1,1,1,1,1,1, // 192
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,0, // 208
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1, // 224
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,0, // 240
|
|
1,1,1,1,1,1,1,1};
|
|
|
|
const char g_map_is_alpha[] = { // 65-90, 97-122, 192-255 (excluding 215, 247)
|
|
0,0,0,0,0,0,0,0, // 0
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 16
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 32
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 48
|
|
0,0,0,0,0,0,0,0,
|
|
0,1,1,1,1,1,1,1, // 64
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1, // 80
|
|
1,1,1,0,0,0,0,0,
|
|
0,1,1,1,1,1,1,1, // 96
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1, // 112
|
|
1,1,1,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 128
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 144
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 160
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 176
|
|
0,0,0,0,0,0,0,0,
|
|
|
|
1,1,1,1,1,1,1,1, // 192
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,0, // 208
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1, // 224
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,0, // 240
|
|
1,1,1,1,1,1,1,1};
|
|
|
|
const char g_map_is_digit[] = { // 48-57
|
|
0,0,0,0,0,0,0,0, // 0
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 16
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 32
|
|
0,0,0,0,0,0,0,0,
|
|
1,1,1,1,1,1,1,1, // 48
|
|
1,1,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0};
|
|
|
|
|
|
const char g_map_is_hex[] = { // 48-57
|
|
0,0,0,0,0,0,0,0, // 0
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 16
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 32
|
|
0,0,0,0,0,0,0,0,
|
|
1,1,1,1,1,1,1,1, // 48
|
|
1,1,0,0,0,0,0,0, // 56
|
|
0,1,1,1,1,1,1,0, // 64 (65='A')
|
|
0,0,0,0,0,0,0,0, // 72
|
|
0,0,0,0,0,0,0,0, // 80
|
|
0,0,0,0,0,0,0,0, // 88
|
|
0,1,1,1,1,1,1,0, // 96 (97='a')
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0};
|
|
|
|
// stolen from is_alnum, but turned on - and _
|
|
const char g_map_is_tagname_char [] = { // 48-57, 65-90,97-122,192-255(excluding 215,247)
|
|
0,0,0,0,0,0,0,0, // 0
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 16
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 32
|
|
0,0,0,0,0,1,0,0, // -
|
|
1,1,1,1,1,1,1,1, // 48
|
|
1,1,1,0,0,0,0,0, // we include the : for feedburner:origlink
|
|
0,1,1,1,1,1,1,1, // 64
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1, // 80
|
|
1,1,1,0,0,0,0,1, // _
|
|
0,1,1,1,1,1,1,1, // 96
|
|
1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1, // 112
|
|
1,1,1,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 128
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 144
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 160
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 176
|
|
0,0,0,0,0,0,0,0,
|
|
|
|
// we are no longer necessarily latin-1!!
|
|
0,0,0,0,0,0,0,0, // 192
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 208
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 224
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 240
|
|
0,0,0,0,0,0,0,0};
|
|
|
|
const char g_map_is_tag_control_char[] = { // 48-57
|
|
0,0,0,0,0,0,0,0, // 0
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 16
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,1,0,0,0,0,1, // 32 " and '
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0, // 48
|
|
0,0,0,0,1,0,1,0, // 56 < and >
|
|
0,0,0,0,0,0,0,0, // 64
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0};
|
|
|
|
// when matching query terms to words/phrases in doc skip over spaces
|
|
// or other punct so that "flypaper" in the query matches "fly paper" in the
|
|
// doc
|
|
const char g_map_is_match_skip[] = { // 48-57
|
|
0,0,0,0,0,0,0,0, // 0
|
|
0,1,1,0,0,0,0,0, // \t and \n
|
|
0,0,0,0,0,0,0,0, // 16
|
|
0,0,0,0,0,0,0,0,
|
|
1,0,0,0,0,0,0,1, // 32 space and '
|
|
0,0,0,0,0,1,0,0, // 40 -
|
|
0,0,0,0,0,0,0,0, // 48
|
|
0,0,0,0,0,0,0,0, // 56
|
|
0,0,0,0,0,0,0,0, // 64
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0};
|
|
|
|
// seems like this should be defined, but it isn't
|
|
long strnlen ( const char *s , long maxLen ) {
|
|
long i ;
|
|
for ( i = 0 ; i < maxLen ; i++ ) if ( ! s[i] ) return i;
|
|
return i;
|
|
}
|
|
|
|
char *strncasestr( char *haystack, long haylen, char *needle){
|
|
long matchLen = 0;
|
|
long needleLen = gbstrlen(needle);
|
|
for (long i = 0; i < haylen;i++){
|
|
char c1 = to_lower_a(haystack[i]);
|
|
char c2 = to_lower_a(needle[matchLen]);
|
|
if ( c1 != c2 ){
|
|
// no match
|
|
matchLen = 0;
|
|
continue;
|
|
}
|
|
// we matched another character
|
|
matchLen++;
|
|
if (matchLen < needleLen) continue;
|
|
|
|
// we've matched the whole string
|
|
return haystack + i - matchLen + 1;
|
|
}
|
|
return NULL;
|
|
}
|
|
char *strnstr( char *haystack, long haylen, char *needle){
|
|
long matchLen = 0;
|
|
long needleLen = gbstrlen(needle);
|
|
for (long i = 0; i < haylen;i++){
|
|
char c1 = (haystack[i]);
|
|
char c2 = (needle[matchLen]);
|
|
if ( c1 != c2 ){
|
|
// no match
|
|
matchLen = 0;
|
|
continue;
|
|
}
|
|
// we matched another character
|
|
matchLen++;
|
|
if (matchLen < needleLen) continue;
|
|
|
|
// we've matched the whole string
|
|
return haystack + i - matchLen + 1;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
// . get the # of words in this string
|
|
long getNumWords ( char *s , long len, long titleVersion ) {
|
|
|
|
long wordCount = 0;
|
|
bool inWord = false;
|
|
for ( long i = 0 ; i < len ; i++ ) {
|
|
if ( ! is_alnum_a ( s[i] ) && s[i]!='\'' ) {
|
|
inWord = false;
|
|
continue;
|
|
}
|
|
if ( ! inWord ) {
|
|
inWord = true;
|
|
wordCount++;
|
|
}
|
|
}
|
|
return wordCount;
|
|
}
|
|
|
|
// . this stores a "n" into "s" and returns the # of bytes written into "s"
|
|
// . it also puts commas into the number
|
|
// . it now also NULL terminates bytes written into "s"
|
|
long ulltoa ( char *s , unsigned long long n ) {
|
|
// if n is zero, it's easy
|
|
if ( n == 0LL ) { *s++='0'; *s='\0'; return 1; }
|
|
// a hunk is a number in [0,999]
|
|
long hunks[10];
|
|
long lastHunk = -1;
|
|
// . get the hunks
|
|
// . the first hunk we get is called the "lowest hunk"
|
|
// . "lastHunk" is called the "highest hunk"
|
|
for ( long i = 0 ; i < 10 ; i++ ) {
|
|
hunks[i] = n % 1000;
|
|
n /= 1000;
|
|
if ( hunks[i] != 0 ) lastHunk = i;
|
|
}
|
|
// remember start of buf for calculating # bytes written
|
|
char *start = s;
|
|
// print the hunks separated by comma
|
|
for ( long i = lastHunk ; i >= 0 ; i-- ) {
|
|
// pad all hunks except highest hunk with zeroes
|
|
if ( i != lastHunk ) sprintf ( s , "%03li" , hunks[i] );
|
|
else sprintf ( s , "%li" , hunks[i] );
|
|
s += gbstrlen(s);
|
|
// comma after all hunks but lowest hunk
|
|
if ( i != 0 ) *s++ = ',';
|
|
}
|
|
// null terminate it
|
|
*s = '\0';
|
|
// return # of bytes stored into "s"
|
|
return s - start;
|
|
}
|
|
|
|
/*
|
|
long atol2 ( const char *s, long len ) {
|
|
char tmp[32];
|
|
if ( len > 30 ) len = 30;
|
|
memcpy ( tmp , s , len );
|
|
tmp [ len ] = '\0';
|
|
return atol ( s );
|
|
}
|
|
*/
|
|
|
|
long atol2 ( const char *s, long len ) {
|
|
// skip over spaces
|
|
const char *end = s + len;
|
|
while ( s < end && is_wspace_a ( *s ) ) s++;
|
|
// return 0 if all spaces
|
|
if ( s == end ) return 0;
|
|
long i = 0;
|
|
long val = 0;
|
|
bool negative = false;
|
|
if ( s[0] == '-' ) { negative = true; i++; }
|
|
while ( i < len && is_digit(s[i]) ) val = val * 10 + ( s[i++] - '0' );
|
|
if ( negative ) return -val;
|
|
return val;
|
|
}
|
|
|
|
long long atoll1 ( const char *s ) {
|
|
return atoll ( s );
|
|
}
|
|
|
|
long long atoll2 ( const char *s, long len ) {
|
|
// skip over spaces
|
|
const char *end = s + len;
|
|
while ( s < end && is_wspace_a ( *s ) ) s++;
|
|
// return 0 if all spaces
|
|
if ( s == end ) return 0;
|
|
long i = 0;
|
|
long long val = 0LL;
|
|
bool negative = false;
|
|
if ( s[0] == '-' ) { negative = true; i++; }
|
|
while ( i < len && is_digit(s[i]) ) val = val * 10LL + ( s[i++] - '0');
|
|
if ( negative ) return -val;
|
|
return val;
|
|
}
|
|
|
|
double atof2 ( const char *s, long len ) {
|
|
// skip over spaces
|
|
const char *end = s + len;
|
|
while ( s < end && is_wspace_a ( *s ) ) { s++; len--; }
|
|
// return 0 if all spaces
|
|
if ( s == end ) return 0;
|
|
char tmpBuf[128];
|
|
if ( len >= 128 ) len = 127;
|
|
//strncpy ( dst , s , len );
|
|
|
|
const char *p = s;
|
|
const char *srcEnd = s + len;
|
|
char *dst = tmpBuf;
|
|
// remove commas
|
|
for ( ; p < srcEnd ; p++ ) {
|
|
// skip commas
|
|
if ( *p == ',' ) continue;
|
|
// otherwise store it
|
|
*dst++ = *p;
|
|
}
|
|
// null term
|
|
*dst = '\0';
|
|
//buf[len] = '\0';
|
|
return atof ( tmpBuf );
|
|
}
|
|
|
|
double atod2 ( char *s, long len ) {
|
|
// point to end
|
|
char *end = s + len;
|
|
// null term temp
|
|
char c = *end;
|
|
*end = '\0';
|
|
// get it
|
|
double ret = strtod ( s , NULL );
|
|
// undo it
|
|
*end = c;
|
|
return ret;
|
|
}
|
|
|
|
|
|
bool atob ( const char *s, long len ) {
|
|
// skip over spaces
|
|
const char *end = s + len;
|
|
while ( s < end && is_wspace_a ( *s ) ) s++;
|
|
// return false if all spaces
|
|
if ( s == end ) return false;
|
|
// parse the ascii bool value
|
|
if ( s[0] == 't' || s[1] == 'T' ) return true;
|
|
if ( s[0] == 'y' || s[0] == 'y' ) return true;
|
|
if ( ! is_digit ( *s ) || *s == '0' ) return false;
|
|
return true;
|
|
}
|
|
|
|
// hexadecimal ascii to key_t
|
|
long long htolonglong ( const char *s, long len ) {
|
|
// skip over spaces
|
|
const char *end = s + len;
|
|
while ( s < end && is_wspace_a ( *s ) ) s++;
|
|
// return 0 if all spaces
|
|
if ( s == end ) return 0;
|
|
long i = 0;
|
|
long long val = 0;
|
|
while ( i < len && is_hex(s[i]) )
|
|
val = val * 16 + htob ( s[i++] );
|
|
return val;
|
|
}
|
|
|
|
// convert hex ascii string into binary at "dst"
|
|
void hexToBin ( char *src , long srcLen , char *dst ) {
|
|
char *srcEnd = src + srcLen;
|
|
for ( ; src && src < srcEnd ; ) {
|
|
*dst = htob(*src++);
|
|
*dst <<= 4;
|
|
*dst |= htob(*src++);
|
|
dst++;
|
|
}
|
|
// sanity check
|
|
if ( src != srcEnd ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
void binToHex ( unsigned char *src , long srcLen , char *dst ) {
|
|
unsigned char *srcEnd = src + srcLen;
|
|
for ( ; src && src < srcEnd ; ) {
|
|
*dst++ = btoh(*src>>4);
|
|
*dst++ = btoh(*src&15);
|
|
src++;
|
|
}
|
|
// sanity check
|
|
if ( src != srcEnd ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
|
|
|
|
// . like strstr but haystack may not be NULL terminated
|
|
// . needle, however, IS null terminated
|
|
char *strncasestr ( char *haystack , char *needle , long haystackSize ) {
|
|
long needleSize = gbstrlen(needle);
|
|
long n = haystackSize - needleSize ;
|
|
for ( long i = 0 ; i <= n ; i++ ) {
|
|
// keep looping if first chars do not match
|
|
if ( to_lower_a(haystack[i]) != to_lower_a(needle[0]) )
|
|
continue;
|
|
// if needle was only 1 char it's a match
|
|
if ( ! needle[1] ) return &haystack[i];
|
|
// compare the whole strings now
|
|
if ( strncasecmp ( &haystack[i] , needle , needleSize ) == 0 )
|
|
return &haystack[i];
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
// . like strstr but haystack may not be NULL terminated
|
|
// . needle, however, IS null terminated
|
|
char *strncasestr ( char *haystack , char *needle ,
|
|
long haystackSize, long needleSize ) {
|
|
long n = haystackSize - needleSize ;
|
|
for ( long i = 0 ; i <= n ; i++ ) {
|
|
// keep looping if first chars do not match
|
|
if ( to_lower_a(haystack[i]) != to_lower_a(needle[0]) )
|
|
continue;
|
|
// if needle was only 1 char it's a match
|
|
if ( ! needle[1] ) return &haystack[i];
|
|
// compare the whole strings now
|
|
if ( strncasecmp ( &haystack[i] , needle , needleSize ) == 0 )
|
|
return &haystack[i];
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
char *strnstr ( char *haystack , char *needle , long haystackSize ) {
|
|
long needleSize = gbstrlen(needle);
|
|
long n = haystackSize - needleSize ;
|
|
for ( long i = 0 ; i <= n ; i++ ) {
|
|
// keep looping if first chars do not match
|
|
if ( haystack[i] != needle[0] ) continue;
|
|
// if needle was only 1 char it's a match
|
|
if ( ! needle[1] ) return &haystack[i];
|
|
// compare the whole strings now
|
|
if ( strncmp ( &haystack[i] , needle , needleSize ) == 0 )
|
|
return &haystack[i];
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
// independent of case
|
|
char *gb_strcasestr ( char *haystack , char *needle ) {
|
|
long needleSize = gbstrlen(needle);
|
|
long haystackSize = gbstrlen(haystack);
|
|
long n = haystackSize - needleSize ;
|
|
for ( long i = 0 ; i <= n ; i++ ) {
|
|
// keep looping if first chars do not match
|
|
if ( to_lower_a(haystack[i]) != to_lower_a(needle[0]) )
|
|
continue;
|
|
// if needle was only 1 char it's a match
|
|
if ( ! needle[1] ) return &haystack[i];
|
|
// compare the whole strings now
|
|
if ( strncasecmp ( &haystack[i] , needle , needleSize ) == 0 )
|
|
return &haystack[i];
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
|
|
char *gb_strncasestr ( char *haystack , long haystackSize , char *needle ) {
|
|
// temp term
|
|
char c = haystack[haystackSize];
|
|
haystack[haystackSize] = '\0';
|
|
char *res = gb_strcasestr ( haystack , needle );
|
|
haystack[haystackSize] = c;
|
|
return res;
|
|
}
|
|
|
|
// . convert < to < and > to >
|
|
// . store "t" into "s"
|
|
// . returns bytes stored into "s"
|
|
// . NULL terminates "s" if slen > 0
|
|
long saftenTags ( char *s , long slen , char *t , long tlen ) {
|
|
char *start = s ;
|
|
// bail if slen is 0
|
|
if ( slen <= 0 ) return 0;
|
|
// leave a char for the \0
|
|
char *send = s + slen - 1;
|
|
char *tend = t + tlen;
|
|
for ( ; t < tend && s + 4 < send ; t++ ) {
|
|
if ( *t == '<' ) {
|
|
*s++ = '&';
|
|
*s++ = 'l';
|
|
*s++ = 't';
|
|
*s++ = ';';
|
|
continue;
|
|
}
|
|
if ( *t == '>' ) {
|
|
*s++ = '&';
|
|
*s++ = 'g';
|
|
*s++ = 't';
|
|
*s++ = ';';
|
|
continue;
|
|
}
|
|
*s++ = *t;
|
|
}
|
|
// NULL terminate "s"
|
|
*s = '\0';
|
|
// return # of bytes, excluding \0, stored into s
|
|
return s - start;
|
|
}
|
|
|
|
// . if "doSpecial" is true, then we change <, > and & to
|
|
// the following:
|
|
// UnicodeData.txt:22E6;LESS-THAN BUT NOT EQUIVALENT TO;Sm;0;ON;;;;;Y;
|
|
// UnicodeData.txt:22E7;GREATER-THAN BUT NOT EQUIVALENT TO;Sm;0;ON;;;;;Y;
|
|
// UnicodeData.txt:E0026;TAG AMPERSAND;Cf;0;BN;;;;;N;;;;;
|
|
// UnicodeData.txt:235E;APL FUNCTIONAL SYMBOL QUOTE QUAD;So;0;L;;;;;N;;;;;
|
|
long htmlDecode ( char *dst , char *src , long srcLen , bool doSpecial ,
|
|
long niceness ) {
|
|
if ( srcLen == 0 ) return 0;
|
|
char *start = dst;
|
|
char *srcEnd = src + srcLen;
|
|
for ( ; src < srcEnd ; ) {
|
|
// breathe
|
|
QUICKPOLL(niceness);
|
|
// utf8 support?
|
|
char size = getUtf8CharSize(src);
|
|
// all entities must start with '&'
|
|
if ( *src != '&' ) {
|
|
if ( size == 1 ) { *dst++ = *src++; continue; }
|
|
memcpy ( dst , src , size );
|
|
src += size;
|
|
dst += size;
|
|
continue;
|
|
//*dst++ = *src++; continue; }
|
|
}
|
|
// TODO: avoid doSpecial by not decoding crap in tags...
|
|
//if ( src[0] == '<' ) {
|
|
// // skip to tag end then!
|
|
//
|
|
// store decoded entity char into dst[j]
|
|
uint32_t c;
|
|
// "skip" is how many bytes the entites was in "src"
|
|
long skip = getEntity_a (src, srcEnd-src, &c );
|
|
// ignore the "entity" if it was invalid
|
|
if ( skip == 0 ) { *dst++ = *src++ ; continue; }
|
|
// force this now always since some tags contain "
|
|
// and it was causing the tags to be terminated too early
|
|
// for richmondspca.org
|
|
//if ( c == '\"' ) c = '\'';
|
|
//if ( c == '<' ) c = '[';
|
|
//if ( c == '>' ) c = ']';
|
|
// . special mapping
|
|
// . make < and > special so Xml::set() still works
|
|
// . and make & special so we do not screw up summaries
|
|
if ( doSpecial ) {
|
|
// no longer use this!
|
|
//char *xx=NULL;*xx=0;
|
|
if ( c == '<' ) {
|
|
// using [ and ] looks bad in event titles...
|
|
*dst = '|';
|
|
dst++;
|
|
src += skip;
|
|
continue;
|
|
memcpy(dst,"+!-",3);
|
|
//memcpy(dst,"<gb",3);
|
|
dst += 3;
|
|
src += skip;
|
|
continue;
|
|
// paragraph sign:
|
|
//c = 0xc2b6;
|
|
}
|
|
if ( c == '>' ) {
|
|
// using [ and ] looks bad in event titles...
|
|
*dst = '|';
|
|
dst++;
|
|
src += skip;
|
|
continue;
|
|
//memcpy(dst,"gb>",3);
|
|
memcpy(dst,"-!+",3);
|
|
dst += 3;
|
|
src += skip;
|
|
continue;
|
|
// high-rise hyphen:
|
|
//c = 0xc2af;
|
|
}
|
|
// some tags have " in their value strings
|
|
// so we have to preserve that!
|
|
// use curling quote:
|
|
//http://www.dwheeler.com/essays/quotes-test-utf-8.html
|
|
// curling double and single quotes resp:
|
|
// “ ” ‘ ”
|
|
if ( c == '\"' ) {
|
|
//c = 0x201c; // 0x235e;
|
|
*dst = '\'';
|
|
dst++;
|
|
src += skip;
|
|
continue;
|
|
}
|
|
//if ( c == '<' ) c = 0x22d6; // e6;
|
|
//if ( c == '>' ) c = 0x22d7; // e7;
|
|
// this was working ok, but just code it to an
|
|
// ampersand. when displaying a page we can code all
|
|
// ampersands back into & i guess! that way
|
|
// the check for a " & " in the place name in
|
|
// Address.cpp works out...
|
|
//if ( c == '&' ) c = 0xff06; // full width ampersand
|
|
}
|
|
// . otherwise it was a legit entity
|
|
// . store it into "dst" in utf8 format
|
|
// . "numBytes" is how many bytes it stored into 'dst"
|
|
long numBytes = utf8Encode ( c , dst );
|
|
// sanity check. do not eat our tail if dst == src
|
|
if ( numBytes > skip ) { char *xx=NULL;*xx=0; }
|
|
// advance dst ptr
|
|
dst += numBytes;
|
|
// skip over the encoded entity in the source string
|
|
src += skip;
|
|
}
|
|
// NULL term
|
|
*dst = '\0';
|
|
return dst - start;
|
|
}
|
|
|
|
// . make something safe as an form input value by translating the quotes
|
|
// . store "t" into "s" and return bytes stored
|
|
// . does not do bounds checking
|
|
long dequote ( char *s , char *send , char *t , long tlen ) {
|
|
char *start = s;
|
|
char *tend = t + tlen;
|
|
for ( ; t < tend && s < send ; t++ ) {
|
|
if ( *t == '"' ) {
|
|
if ( s + 5 >= send ) return 0;
|
|
*s++ = '&';
|
|
*s++ = '#';
|
|
*s++ = '3';
|
|
*s++ = '4';
|
|
*s++ = ';';
|
|
continue;
|
|
}
|
|
*s++ = *t;
|
|
}
|
|
// all or nothing
|
|
if ( s + 1 >= send ) return 0;
|
|
*s = '\0';
|
|
return s - start;
|
|
}
|
|
|
|
bool dequote ( SafeBuf* sb , char *t , long tlen ) {
|
|
char *tend = t + tlen;
|
|
for ( ; t < tend; t++ ) {
|
|
if ( *t == '"' ) {
|
|
sb->safeMemcpy(""", 5);
|
|
continue;
|
|
}
|
|
*sb += *t;
|
|
}
|
|
*sb += '\0';
|
|
return true;
|
|
}
|
|
|
|
//long dequote ( char *s , char *t ) {
|
|
// return dequote ( s , t , gbstrlen ( t ) );
|
|
//}
|
|
|
|
// . entity-ize a string so it's safe for html output
|
|
// . store "t" into "s" and return bytes stored
|
|
// . does bounds checking
|
|
char *htmlEncode ( char *s , char *send , char *t , char *tend , bool pound ,
|
|
long niceness ) {
|
|
for ( ; t < tend ; t++ ) {
|
|
QUICKPOLL(niceness);
|
|
if ( s + 7 >= send ) { *s = '\0'; return s; }
|
|
if ( *t == '"' ) {
|
|
*s++ = '&';
|
|
*s++ = '#';
|
|
*s++ = '3';
|
|
*s++ = '4';
|
|
*s++ = ';';
|
|
continue;
|
|
}
|
|
if ( *t == '<' ) {
|
|
*s++ = '&';
|
|
*s++ = 'l';
|
|
*s++ = 't';
|
|
*s++ = ';';
|
|
continue;
|
|
}
|
|
if ( *t == '>' ) {
|
|
*s++ = '&';
|
|
*s++ = 'g';
|
|
*s++ = 't';
|
|
*s++ = ';';
|
|
continue;
|
|
}
|
|
if ( *t == '&' ) {
|
|
*s++ = '&';
|
|
*s++ = 'a';
|
|
*s++ = 'm';
|
|
*s++ = 'p';
|
|
*s++ = ';';
|
|
continue;
|
|
}
|
|
if ( *t == '#' && pound ) {
|
|
*s++ = '&';
|
|
*s++ = '#';
|
|
*s++ = '0';
|
|
*s++ = '3';
|
|
*s++ = '5';
|
|
*s++ = ';';
|
|
continue;
|
|
}
|
|
*s++ = *t;
|
|
}
|
|
*s = '\0';
|
|
return s;
|
|
}
|
|
|
|
|
|
// . entity-ize a string so it's safe for html output
|
|
// . store "t" into "s" and return true on success
|
|
bool htmlEncode ( SafeBuf* s , char *t , char *tend , bool pound ,
|
|
long niceness ) {
|
|
for ( ; t < tend ; t++ ) {
|
|
QUICKPOLL(niceness);
|
|
if ( *t == '"' ) {
|
|
s->safeMemcpy(""", 5);
|
|
continue;
|
|
}
|
|
if ( *t == '<' ) {
|
|
s->safeMemcpy("<", 4);
|
|
continue;
|
|
}
|
|
if ( *t == '>' ) {
|
|
s->safeMemcpy(">", 4);
|
|
continue;
|
|
}
|
|
if ( *t == '&' ) {
|
|
s->safeMemcpy("&", 5);
|
|
continue;
|
|
}
|
|
if ( *t == '#' && pound ) {
|
|
s->safeMemcpy("#", 6);
|
|
continue;
|
|
}
|
|
// our own specially decoded entites!
|
|
if ( *t == '+' && t[1]=='!' && t[2]=='-' ) {
|
|
s->safeMemcpy("<",4);
|
|
continue;
|
|
}
|
|
// our own specially decoded entites!
|
|
if ( *t == '-' && t[1]=='!' && t[2]=='+' ) {
|
|
s->safeMemcpy(">",4);
|
|
continue;
|
|
}
|
|
*s += *t;
|
|
}
|
|
*s += '\0';
|
|
return true;
|
|
}
|
|
|
|
// . convert "-->%22 , &-->%26, +-->%2b, space-->+, ?-->%3f is that it?
|
|
// . convert so we can display as a cgi PARAMETER within a url
|
|
// . used by HttPage2 (cached web page) to encode the query into a url
|
|
// . used by PageRoot to do likewise
|
|
// . returns bytes written into "d" not including terminating \0
|
|
long urlEncode ( char *d , long dlen , char *s , long slen, bool requestPath ) {
|
|
char *dstart = d;
|
|
// subtract 1 to make room for a terminating \0
|
|
char *dend = d + dlen - 1;
|
|
char *send = s + slen;
|
|
for ( ; s < send && d < dend ; s++ ) {
|
|
if ( *s == '\0' && requestPath ) {
|
|
*d++ = *s;
|
|
continue;
|
|
}
|
|
// encode if not fit for display
|
|
if ( ! is_ascii ( *s ) ) goto encode;
|
|
switch ( *s ) {
|
|
case ' ': goto encode;
|
|
case '&': goto encode;
|
|
case '"': goto encode;
|
|
case '+': goto encode;
|
|
case '%': goto encode;
|
|
case '#': goto encode;
|
|
// encoding < and > are more for displaying on an
|
|
// html page than sending to an http server
|
|
case '>': goto encode;
|
|
case '<': goto encode;
|
|
case '?': if ( requestPath ) break;
|
|
goto encode;
|
|
}
|
|
// otherwise, no need to encode
|
|
*d++ = *s;
|
|
continue;
|
|
encode:
|
|
// space to +
|
|
if ( *s == ' ' && d + 1 < dend ) { *d++ = '+'; continue; }
|
|
// break out if no room to encode
|
|
if ( d + 2 >= dend ) break;
|
|
*d++ = '%';
|
|
// store first hex digit
|
|
unsigned char v = ((unsigned char)*s)/16 ;
|
|
if ( v < 10 ) v += '0';
|
|
else v += 'A' - 10;
|
|
*d++ = v;
|
|
// store second hex digit
|
|
v = ((unsigned char)*s) & 0x0f ;
|
|
if ( v < 10 ) v += '0';
|
|
else v += 'A' - 10;
|
|
*d++ = v;
|
|
}
|
|
// NULL terminate it
|
|
*d = '\0';
|
|
// and return the length
|
|
return d - dstart;
|
|
}
|
|
|
|
// determine the length of the encoded url, does NOT include NULL
|
|
long urlEncodeLen ( char *s , long slen , bool requestPath ) {
|
|
long dLen = 0;
|
|
char *send = s + slen;
|
|
for ( ; s < send ; s++ ) {
|
|
if ( *s == '\0' && requestPath ) {
|
|
dLen++;
|
|
continue;
|
|
}
|
|
// encode if not fit for display
|
|
if ( ! is_ascii ( *s ) ) goto encode;
|
|
switch ( *s ) {
|
|
case ' ': goto encode;
|
|
case '&': goto encode;
|
|
case '"': goto encode;
|
|
case '+': goto encode;
|
|
case '%': goto encode;
|
|
case '#': goto encode;
|
|
// encoding < and > are more for displaying on an
|
|
// html page than sending to an http server
|
|
case '>': goto encode;
|
|
case '<': goto encode;
|
|
case '?': if ( requestPath ) break;
|
|
goto encode;
|
|
}
|
|
// otherwise, no need to encode
|
|
dLen++;
|
|
continue;
|
|
encode:
|
|
// space to +
|
|
if ( *s == ' ' ) { dLen++; continue; }
|
|
// hex code
|
|
dLen += 3; // %XX
|
|
}
|
|
//dLen++; // NULL TERM
|
|
// and return the length
|
|
return dLen;
|
|
}
|
|
|
|
// . decodes "s/slen" and stores into "dest"
|
|
// . returns the number of bytes stored into "dest"
|
|
long urlDecode ( char *dest , char *s , long slen ) {
|
|
long j = 0;
|
|
for ( long i = 0 ; i < slen ; i++ ) {
|
|
if ( s[i] == '+' ) { dest[j++]=' '; continue; }
|
|
dest[j++] = s[i];
|
|
if ( s[i] != '%' ) continue;
|
|
if ( i + 2 >= slen ) continue;
|
|
// if two chars after are not hex chars, it's not an encoding
|
|
if ( ! is_hex ( s[i+1] ) ) continue;
|
|
if ( ! is_hex ( s[i+2] ) ) continue;
|
|
// convert hex chars to values
|
|
unsigned char a = htob ( s[i+1] ) * 16;
|
|
unsigned char b = htob ( s[i+2] ) ;
|
|
dest[j-1] = (char) (a + b);
|
|
i += 2;
|
|
}
|
|
return j;
|
|
}
|
|
|
|
// . like above, but only decodes chars that should not have been encoded
|
|
// . will also encode binary chars
|
|
long urlNormCode ( char *d , long dlen , char *s , long slen ) {
|
|
// save start of detination buffer for returning the length
|
|
char *dstart = d;
|
|
// subtract 1 for NULL termination
|
|
char *dend = d + dlen - 1;
|
|
char *send = s + slen;
|
|
for ( ; s < send && d < dend ; s++ ) {
|
|
// if its non-ascii, encode it so it displays correctly
|
|
if ( ! is_ascii ( *s ) ) {
|
|
// break if no room to encode it
|
|
if ( d + 2 >= dend ) break;
|
|
// store it encoded
|
|
*d++ = '%';
|
|
// store first hex digit
|
|
unsigned char v = ((unsigned char)*s)/16 ;
|
|
if ( v < 10 ) v += '0';
|
|
else v += 'A' - 10;
|
|
*d++ = v;
|
|
// store second hex digit
|
|
v = ((unsigned char)*s) & 0x0f ;
|
|
if ( v < 10 ) v += '0';
|
|
else v += 'A' - 10;
|
|
*d++ = v;
|
|
continue;
|
|
}
|
|
// store it
|
|
*d++ = *s;
|
|
// but it might be something encoded that should not have been
|
|
if ( *s != '%' ) continue;
|
|
// it requires to following chars to decode
|
|
if ( s + 2 >= send ) continue;
|
|
// if two chars after are not hex chars, it's not an encoding
|
|
if ( ! is_hex ( s[1] ) ) continue;
|
|
if ( ! is_hex ( s[2] ) ) continue;
|
|
// convert hex chars to values
|
|
unsigned char a = htob ( s[1] ) * 16;
|
|
unsigned char b = htob ( s[2] ) ;
|
|
unsigned char v = a + b;
|
|
// don't decode if it decodes in these chars
|
|
switch ( v ) {
|
|
case ' ': continue;
|
|
case '&': continue;
|
|
case '"': continue;
|
|
case '+': continue;
|
|
case '%': continue;
|
|
case '>': continue;
|
|
case '<': continue;
|
|
case '?': continue;
|
|
case '=': continue;
|
|
}
|
|
// otherwise, it's fine to decode it
|
|
d[-1] = (char) (a + b);
|
|
// skip over those 2 chars as well as leading '%'
|
|
s += 2;
|
|
}
|
|
// NULL terminate
|
|
*d = '\0';
|
|
// return length
|
|
return d - dstart ;
|
|
}
|
|
|
|
// approximate # of non-punct words
|
|
long getNumWords ( char *s ) {
|
|
long count = 0;
|
|
loop:
|
|
// skip punct
|
|
while ( ! is_alnum_a(*s) ) s++;
|
|
// bail if done
|
|
if ( !*s ) return count;
|
|
// count a word
|
|
count++;
|
|
// skip word
|
|
while ( is_alnum_a(*s) ) s++;
|
|
// watch for ' letter punct
|
|
if ( *s=='\'' && is_alnum_a(*(s+1)) && !is_alnum_a(*(s+2)) ) {
|
|
// skip apostrophe
|
|
s++;
|
|
// skip rest of word
|
|
while ( is_alnum_a(*s) ) s++;
|
|
}
|
|
goto loop;
|
|
}
|
|
|
|
static long long s_adjustment = 0;
|
|
|
|
long long globalToLocalTimeMilliseconds ( long long global ) {
|
|
// sanity check
|
|
//if ( ! g_clockInSync )
|
|
// log("gb: Converting global time but clock not in sync.");
|
|
return global - s_adjustment;
|
|
}
|
|
|
|
long long localToGlobalTimeMilliseconds ( long long local ) {
|
|
// sanity check
|
|
//if ( ! g_clockInSync )
|
|
// log("gb: Converting global time but clock not in sync.");
|
|
return local + s_adjustment;
|
|
}
|
|
|
|
long globalToLocalTimeSeconds ( long global ) {
|
|
// sanity check
|
|
//if ( ! g_clockInSync )
|
|
// log("gb: Converting global time but clock not in sync.");
|
|
return global - (s_adjustment/1000);
|
|
}
|
|
|
|
long localToGlobalTimeSeconds ( long local ) {
|
|
// sanity check
|
|
//if ( ! g_clockInSync )
|
|
// log("gb: Converting global time but clock not in sync.");
|
|
return local + (s_adjustment/1000);
|
|
}
|
|
|
|
#include "Timedb.h"
|
|
|
|
|
|
static char s_tafile[1024];
|
|
static bool s_hasFileName = false;
|
|
|
|
// returns false and sets g_errno on error
|
|
bool setTimeAdjustmentFilename ( char *dir, char *filename ) {
|
|
s_hasFileName = true;
|
|
long len1 = gbstrlen(dir);
|
|
long len2 = gbstrlen(filename);
|
|
if ( len1 + len2 > 1000 ) { char *xx=NULL;*xx=0; }
|
|
sprintf(s_tafile,"%s/%s",dir,filename);
|
|
return true;
|
|
}
|
|
|
|
// returns false and sets g_errno on error
|
|
bool loadTimeAdjustment ( ) {
|
|
// bail if no filename to read
|
|
if ( ! s_hasFileName ) return true;
|
|
// read it in
|
|
// one line in text
|
|
int fd = open ( s_tafile , O_RDONLY );
|
|
if ( fd < 0 ) {
|
|
log("util: could not open %s for reading",s_tafile);
|
|
g_errno = errno;
|
|
return false;
|
|
}
|
|
char rbuf[1024];
|
|
// read in max bytes
|
|
int nr = read ( fd , rbuf , 1000 );
|
|
if ( nr <= 10 || nr > 1000 ) {
|
|
log("util: reading %s had error: %s",s_tafile,
|
|
mstrerror(errno));
|
|
close(fd);
|
|
g_errno = errno;
|
|
return false;
|
|
}
|
|
close(fd);
|
|
// parse the text line
|
|
long long stampTime = 0LL;
|
|
long long clockAdj = 0LL;
|
|
sscanf ( rbuf , "%llu %lli", &stampTime, &clockAdj );
|
|
// get stamp age
|
|
long long local = gettimeofdayInMillisecondsLocal();
|
|
long long stampAge = local - stampTime;
|
|
// if too old forget about it
|
|
if ( stampAge > 2*86400 ) return true;
|
|
// update adjustment
|
|
s_adjustment = clockAdj;
|
|
// if stamp in file is within 2 days old, assume its still good
|
|
// this will prevent having to rebuild a sortbydatetable
|
|
// and really slow down loadups
|
|
g_clockInSync = true;
|
|
// note it
|
|
log("util: loaded %s and put clock in sync. age=%llu adj=%lli",
|
|
s_tafile,stampAge,clockAdj);
|
|
return true;
|
|
}
|
|
|
|
// . returns false and sets g_errno on error
|
|
// . saved by Process::saveBlockingFiles1()
|
|
bool saveTimeAdjustment ( ) {
|
|
// fortget it if setTimeAdjustmentFilename never called
|
|
if ( ! s_hasFileName ) return true;
|
|
// must be in sync!
|
|
if ( ! g_clockInSync ) return true;
|
|
// store it
|
|
long long local = gettimeofdayInMillisecondsLocal();
|
|
char wbuf[1024];
|
|
sprintf (wbuf,"%llu %lli\n",local,s_adjustment);
|
|
// write it out
|
|
int fd = open ( s_tafile , O_CREAT|O_RDWR|O_TRUNC , 00666 );
|
|
if ( fd < 0 ) {
|
|
log("util: could not open %s for writing",s_tafile);
|
|
g_errno = errno;
|
|
return false;
|
|
}
|
|
// how many bytes to write?
|
|
long len = gbstrlen(wbuf);
|
|
// read in max bytes
|
|
int nw = write ( fd , wbuf , len );
|
|
if ( nw != len ) {
|
|
log("util: writing %s had error: %s",s_tafile,
|
|
mstrerror(errno));
|
|
close(fd);
|
|
g_errno = errno;
|
|
return false;
|
|
}
|
|
close(fd);
|
|
// note it
|
|
log("util: saved %s",s_tafile);
|
|
// it was written ok
|
|
return true;
|
|
}
|
|
|
|
// a "fake" settimeofdayInMilliseconds()
|
|
void settimeofdayInMillisecondsGlobal ( long long newTime ) {
|
|
// can't do this in sig handler
|
|
if ( g_inSigHandler ) return;
|
|
// this isn't async signal safe...
|
|
struct timeval tv;
|
|
gettimeofday ( &tv , NULL );
|
|
long long now=(long long)(tv.tv_usec/1000)+((long long)tv.tv_sec)*1000;
|
|
// bail if no change... UNLESS we need to sync clock!!
|
|
if ( s_adjustment == newTime - now && g_clockInSync ) return;
|
|
// log it, that way we know if there is another issue
|
|
// with flip-flopping (before we synced with host #0 and also
|
|
// with proxy #0)
|
|
long long delta = s_adjustment - (newTime - now) ;
|
|
if ( delta > 100 || delta < -100 )
|
|
logf(LOG_INFO,"gb: Updating clock adjustment from "
|
|
"%lli ms to %lli ms", s_adjustment , newTime - now );
|
|
// set adjustment
|
|
s_adjustment = newTime - now;
|
|
// return?
|
|
if ( g_clockInSync ) return;
|
|
// we are now in sync
|
|
g_clockInSync = true;
|
|
// log it
|
|
if ( s_hasFileName )
|
|
logf(LOG_INFO,"gb: clock is now synced with host #0. "
|
|
"saving to %s",s_tafile);
|
|
else
|
|
logf(LOG_INFO,"gb: clock is now synced with host #0.");
|
|
// save
|
|
saveTimeAdjustment();
|
|
// force timedb to load now!
|
|
//initAllSortByDateTables ( );
|
|
}
|
|
|
|
long getTimeGlobal() {
|
|
return gettimeofdayInMillisecondsSynced() / 1000;
|
|
}
|
|
|
|
long getTimeGlobalNoCore() {
|
|
return gettimeofdayInMillisecondsGlobalNoCore() / 1000;
|
|
}
|
|
|
|
long getTimeSynced() {
|
|
return gettimeofdayInMillisecondsSynced() / 1000;
|
|
}
|
|
|
|
long long gettimeofdayInMillisecondsGlobal() {
|
|
return gettimeofdayInMillisecondsSynced();
|
|
}
|
|
|
|
#include "Threads.h"
|
|
|
|
long long gettimeofdayInMillisecondsSynced() {
|
|
// if in a sig handler then return g_now
|
|
//if ( g_inSigHandler ) return g_nowGlobal;
|
|
// i find that a pthread can call this function even though
|
|
// a signal handler is underway in the main thread!
|
|
if ( g_inSigHandler && ! g_threads.amThread() ) {
|
|
char *xx = NULL; *xx = 0; }
|
|
// sanity check
|
|
if ( ! isClockInSync() ) { char *xx = NULL; *xx = 0; }
|
|
//if ( ! g_clockInSync )
|
|
// log("gb: Getting global time but clock not in sync.");
|
|
// this isn't async signal safe...
|
|
struct timeval tv;
|
|
gettimeofday ( &tv , NULL );
|
|
long long now=(long long)(tv.tv_usec/1000)+((long long)tv.tv_sec)*1000;
|
|
// update g_nowLocal
|
|
if ( now > g_now ) g_now = now;
|
|
// adjust from Msg0x11 time adjustments
|
|
now += s_adjustment;
|
|
// update g_now if it is more accurate
|
|
//if ( now > g_nowGlobal ) g_nowGlobal = now;
|
|
return now;
|
|
}
|
|
|
|
long long gettimeofdayInMillisecondsGlobalNoCore() {
|
|
// if in a sig handler then return g_now
|
|
//if ( g_inSigHandler ) return g_nowGlobal;
|
|
// i find that a pthread can call this function even though
|
|
// a signal handler is underway in the main thread!
|
|
if ( g_inSigHandler && ! g_threads.amThread() ) {
|
|
char *xx = NULL; *xx = 0; }
|
|
// sanity check
|
|
//if ( ! g_clockInSync ) { char *xx = NULL; *xx = 0; }
|
|
//if ( ! g_clockInSync )
|
|
// log("gb: Getting global time but clock not in sync.");
|
|
// this isn't async signal safe...
|
|
struct timeval tv;
|
|
gettimeofday ( &tv , NULL );
|
|
long long now=(long long)(tv.tv_usec/1000)+((long long)tv.tv_sec)*1000;
|
|
// update g_nowLocal
|
|
if ( now > g_now ) g_now = now;
|
|
// adjust from Msg0x11 time adjustments
|
|
now += s_adjustment;
|
|
// update g_now if it is more accurate
|
|
//if ( now > g_nowGlobal ) g_nowGlobal = now;
|
|
return now;
|
|
}
|
|
|
|
long long gettimeofdayInMillisecondsLocal() {
|
|
return gettimeofdayInMilliseconds();
|
|
}
|
|
|
|
uint64_t gettimeofdayInMicroseconds(void) {
|
|
struct timeval tv;
|
|
gettimeofday(&tv, NULL);
|
|
return(((uint64_t)tv.tv_sec * 1000000LL) + (uint64_t)tv.tv_usec);
|
|
}
|
|
|
|
// "local" means the time on this machine itself, NOT a timezone thing.
|
|
long long gettimeofdayInMilliseconds() {
|
|
// if in a sig handler then return g_now
|
|
//if ( g_inSigHandler ) return g_now;
|
|
// i find that a pthread can call this function even though
|
|
// a signal handler is underway in the main thread!
|
|
if ( g_inSigHandler && ! g_threads.amThread() ) {
|
|
char *xx = NULL; *xx = 0; }
|
|
// this isn't async signal safe...
|
|
struct timeval tv;
|
|
//g_loop.disableTimer();
|
|
gettimeofday ( &tv , NULL );
|
|
//g_loop.enableTimer();
|
|
long long now=(long long)(tv.tv_usec/1000)+((long long)tv.tv_sec)*1000;
|
|
// update g_nowLocal
|
|
if ( now > g_now ) g_now = now;
|
|
// adjust from Msg0x11 time adjustments
|
|
//now += s_adjustment;
|
|
// update g_now if it is more accurate
|
|
// . or don't, bad to update it here because it could be very different
|
|
// from what it should be
|
|
//if ( now > g_now ) g_now = now;
|
|
return now;
|
|
}
|
|
|
|
time_t getTime () {
|
|
return getTimeLocal();
|
|
}
|
|
|
|
// . get time in seconds
|
|
// . use this instead of call to time(NULL) cuz it uses adjustment
|
|
time_t getTimeLocal () {
|
|
// if in a sig handler then return g_now/1000
|
|
//if ( g_inSigHandler ) return (time_t)(g_now / 1000);
|
|
// i find that a pthread can call this function even though
|
|
// a signal handler is underway in the main thread!
|
|
if ( g_inSigHandler && ! g_threads.amThread() ) {
|
|
char *xx = NULL; *xx = 0; }
|
|
// get time now
|
|
unsigned long now = gettimeofdayInMilliseconds() / 1000;
|
|
// and adjust it
|
|
//now += s_adjustment / 1000;
|
|
return (time_t)now;
|
|
}
|
|
|
|
// . make it so we can display the ascii string on an html browser
|
|
long saftenTags2 ( char *s , long slen , char *t , long tlen ) {
|
|
char *start = s ;
|
|
// bail if slen is 0
|
|
if ( slen <= 0 ) return 0;
|
|
// leave a char for the \0
|
|
char *send = s + slen - 1;
|
|
char *tend = t + tlen;
|
|
for ( ; t < tend && s + 6 < send ; t++ ) {
|
|
if ( *t == '<' ) {
|
|
*s++ = '&';
|
|
*s++ = 'l';
|
|
*s++ = 't';
|
|
*s++ = ';';
|
|
continue;
|
|
}
|
|
if ( *t == '>' ) {
|
|
*s++ = '&';
|
|
*s++ = 'g';
|
|
*s++ = 't';
|
|
*s++ = ';';
|
|
continue;
|
|
}
|
|
if ( *t == '&' ) {
|
|
*s++ = '&';
|
|
*s++ = 'a';
|
|
*s++ = 'm';
|
|
*s++ = 'p';
|
|
*s++ = ';';
|
|
continue;
|
|
}
|
|
*s++ = *t;
|
|
}
|
|
// return NULL if we broke out because there was not enough room
|
|
//if ( s + 6 >= send ) return NULL;
|
|
// NULL terminate "s"
|
|
*s = '\0';
|
|
// return # of bytes, excluding \0, stored into s
|
|
return s - start;
|
|
}
|
|
|
|
void getCalendarFromMs(long long ms,
|
|
long* days,
|
|
long* hours,
|
|
long* minutes,
|
|
long* secs,
|
|
long* msecs) {
|
|
long s = 1000;
|
|
long m = s * 60;
|
|
long h = m * 60;
|
|
long d = h * 24;
|
|
|
|
*days = ms / d;
|
|
long long tmp = ms % d;
|
|
*hours = tmp / h;
|
|
tmp = tmp % h;
|
|
*minutes = tmp / m;
|
|
tmp = tmp % m;
|
|
*secs = tmp / s;
|
|
|
|
*msecs = tmp % s;
|
|
}
|
|
|
|
unsigned long calculateChecksum(char *buf, long bufLen){
|
|
unsigned long sum = 0;
|
|
for(long i = 0; i < bufLen>>2;i++)
|
|
sum += ((unsigned long*)buf)[i];
|
|
return sum;
|
|
}
|
|
|
|
bool anchorIsLink( char *tag, long tagLen){
|
|
if (strncasestr(tag, tagLen, "href")) return true;
|
|
if (strncasestr(tag, tagLen, "onclick")) return true;
|
|
return false;
|
|
}
|
|
|
|
bool has_alpha_a ( char *s , char *send ) {
|
|
for ( ; s < send ; s++ )
|
|
if (is_alpha_a(*s)) return true;
|
|
return false;
|
|
}
|
|
|
|
bool has_alpha_utf8 ( char *s , char *send ) {
|
|
char cs = 0;
|
|
for ( ; s < send ; s += cs ) {
|
|
cs = getUtf8CharSize ( s );
|
|
if ( cs == 1 ) {
|
|
if (is_alpha_a(*s)) return true;
|
|
continue;
|
|
}
|
|
if ( is_alpha_utf8(s) ) return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
//takes an input skips leading spaces
|
|
//puts next nonspace char* in numPtr
|
|
//an returns the next space after that.
|
|
char* getNextNum(char* input, char** numPtr) {
|
|
char* p = input;
|
|
char* nextspace;
|
|
while(*p && isspace(*p)) p++;
|
|
nextspace = p;
|
|
*numPtr = p;
|
|
while(*nextspace && !isspace(*nextspace))
|
|
nextspace++;
|
|
return nextspace;
|
|
}
|
|
|
|
|
|
// returns length of stripped content, but will set g_errno and return -1
|
|
// on error
|
|
long stripHtml( char *content, long contentLen, long version, long strip ) {
|
|
if ( !strip ) {
|
|
log( LOG_WARN, "query: html stripping not required!" );
|
|
return contentLen;
|
|
}
|
|
if ( ! content )
|
|
return 0;
|
|
if ( contentLen == 0 )
|
|
return 0;
|
|
|
|
// filter content if we should
|
|
// keep this on the big stack so "content" still references something
|
|
Xml tmpXml;
|
|
// . get the content as xhtml (should be NULL terminated)
|
|
// . parse as utf8 since all we are doing is messing with
|
|
// the tags...content manipulation comes later
|
|
if ( ! tmpXml.set ( content , contentLen,
|
|
false, 0, false, version ) )
|
|
return -1;
|
|
|
|
//if( strip == 4 )
|
|
// return tmpXml.getText( content, contentLen );
|
|
|
|
// go tag by tag
|
|
long n = tmpXml.getNumNodes();
|
|
XmlNode *nodes = tmpXml.getNodes();
|
|
// Xml class may have converted to utf16
|
|
content = tmpXml.getContent();
|
|
contentLen = tmpXml.getContentLen();
|
|
char *x = content;
|
|
char *xend = content + contentLen;
|
|
long stackid = -1;
|
|
long stackc = 0;
|
|
char skipIt = 0;
|
|
// . hack COL tag to NOT require a back tag
|
|
// . do not leave it that way as it could mess up our parsing
|
|
//g_nodes[25].m_hasBackTag = 0;
|
|
for ( long i = 0 ; i < n ; i++ ) {
|
|
// get id of this node
|
|
long id = nodes[i].m_nodeId;
|
|
|
|
// if strip is 4, just remove the script tag
|
|
if( strip == 4 ){
|
|
if ( id ){
|
|
if ( id == 83 ){
|
|
skipIt ^= 1;
|
|
continue;
|
|
}
|
|
}
|
|
else if ( skipIt ) continue;
|
|
goto keepit;
|
|
}
|
|
|
|
// if strip is 3, ALL tags will be removed!
|
|
if( strip == 3 ) {
|
|
if( id ) {
|
|
// . we dont want anything in between:
|
|
// - script tags (83)
|
|
// - style tags (111)
|
|
if ((id == 83) || (id == 111)) skipIt ^= 1;
|
|
// save img to have alt text kept.
|
|
if ( id == 54 ) goto keepit;
|
|
continue;
|
|
}
|
|
else {
|
|
if( skipIt ) continue;
|
|
goto keepit;
|
|
}
|
|
}
|
|
// get it
|
|
long fk;
|
|
if ( strip == 1 ) fk = g_nodes[id].m_filterKeep1;
|
|
else fk = g_nodes[id].m_filterKeep2;
|
|
// if tag is <link ...> only keep it if it has
|
|
// rel="stylesheet" or rel=stylesheet
|
|
if ( strip == 2 && id == 62 ) { // <link> tag id
|
|
long fflen;
|
|
char *ff = nodes[i].getFieldValue ( "rel" , &fflen );
|
|
if ( ff && fflen == 10 &&
|
|
strncmp(ff,"stylesheet",10) == 0 )
|
|
goto keepit;
|
|
}
|
|
// just remove just the tag if this is 2
|
|
if ( fk == 2 ) continue;
|
|
// keep it if not in a stack
|
|
if ( ! stackc && fk ) goto keepit;
|
|
// if no front/back for tag, just skip it
|
|
if ( ! nodes[i].m_hasBackTag ) continue;
|
|
// start stack if none
|
|
if ( stackc == 0 ) {
|
|
// but not if this is a back tag
|
|
if ( nodes[i].m_node[1] == '/' ) continue;
|
|
// now start the stack
|
|
stackid = id;
|
|
stackc = 1;
|
|
continue;
|
|
}
|
|
// skip if this tag does not match what is on stack
|
|
if ( id != stackid ) continue;
|
|
// if ANOTHER front tag, inc stack
|
|
if ( nodes[i].m_node[1] != '/' ) stackc++;
|
|
// otherwise, dec the stack count
|
|
else stackc--;
|
|
// . ensure not negative from excess back tags
|
|
// . reset stackid to -1 to indicate no stack
|
|
if ( stackc <= 0 ) { stackid= -1; stackc = 0; }
|
|
// skip it
|
|
continue;
|
|
keepit:
|
|
// replace images with their alt text
|
|
long vlen;
|
|
char *v;
|
|
if ( id == 54 ) {
|
|
v = nodes[i].getFieldValue("alt", &vlen );
|
|
// try title if no alt text
|
|
if ( ! v )
|
|
v = nodes[i].getFieldValue("title", &vlen );
|
|
if ( v ) { memcpy ( x, v, vlen ); x += vlen; }
|
|
continue;
|
|
}
|
|
// remove background image from body,table,td tags
|
|
if ( id == 19 || id == 93 || id == 95 ) {
|
|
v = nodes[i].getFieldValue("background", &vlen);
|
|
// remove background, just sabotage it
|
|
if ( v ) v[-4] = 'x';
|
|
}
|
|
// store it
|
|
memcpy ( x , nodes[i].m_node , nodes[i].m_nodeLen );
|
|
x += nodes[i].m_nodeLen;
|
|
// sanity check
|
|
if ( x > xend ) { char *xx=NULL;*xx=0;}
|
|
}
|
|
contentLen = x - content;
|
|
content [ contentLen ] = '\0';
|
|
// unhack COL tag
|
|
//g_nodes[25].m_hasBackTag = 1;
|
|
return contentLen;
|
|
}
|
|
|
|
|
|
bool is_urlchar(char s) {
|
|
// [a-z0-9/:_-.?$,~=#&%+@]
|
|
if(isalnum(s)) return true;
|
|
if(s == '/' ||
|
|
s == ':' ||
|
|
s == '_' ||
|
|
s == '-' ||
|
|
s == '.' ||
|
|
s == '?' ||
|
|
s == '$' ||
|
|
s == ',' ||
|
|
s == '~' ||
|
|
s == '=' ||
|
|
s == '#' ||
|
|
s == '&' ||
|
|
s == '%' ||
|
|
s == '+' ||
|
|
s == '@') return true;
|
|
return false;
|
|
}
|
|
// don't allow "> in our input boxes
|
|
long cleanInput(char *outbuf, long outbufSize, char *inbuf, long inbufLen){
|
|
char *p = outbuf;
|
|
long numQuotes=0;
|
|
long lastQuote = 0;
|
|
for (long i=0;i<inbufLen;i++){
|
|
if (p-outbuf >= outbufSize-1) break;
|
|
|
|
if (inbuf[i] == '"'){
|
|
numQuotes++;
|
|
lastQuote = i;
|
|
}
|
|
// if we have an odd number of quotes and a close angle bracket
|
|
// it could be an xss attempt
|
|
if (inbuf[i] == '>' && (numQuotes & 1)) {
|
|
p = outbuf+lastQuote;
|
|
break;
|
|
}
|
|
*p = inbuf[i];
|
|
p++;
|
|
}
|
|
*p = '\0';
|
|
return p-outbuf;
|
|
}
|
|
|
|
|
|
//
|
|
// get rid of the virtual Msg class because it screws up how we
|
|
// serialize/deserialize everytime we compile gb it seems
|
|
//
|
|
|
|
long getMsgStoredSize ( long baseSize,
|
|
long *firstSizeParm,
|
|
long *lastSizeParm ) {
|
|
//long size = (long)sizeof(Msg);
|
|
long size = baseSize;//getBaseSize();
|
|
// add up string buffer sizes
|
|
long *sizePtr = firstSizeParm;//getFirstSizeParm(); // &size_qbuf;
|
|
long *sizeEnd = lastSizeParm;//getLastSizeParm (); // &size_displayMeta
|
|
for ( ; sizePtr <= sizeEnd ; sizePtr++ )
|
|
size += *sizePtr;
|
|
return size;
|
|
}
|
|
|
|
// . return ptr to the buffer we serialize into
|
|
// . return NULL and set g_errno on error
|
|
char *serializeMsg ( long baseSize ,
|
|
long *firstSizeParm ,
|
|
long *lastSizeParm ,
|
|
char **firstStrPtr ,
|
|
void *thisPtr ,
|
|
long *retSize ,
|
|
char *userBuf ,
|
|
long userBufSize ,
|
|
bool makePtrsRefNewBuf ) {
|
|
// make a buffer to serialize into
|
|
char *buf = NULL;
|
|
//long need = getStoredSize();
|
|
long need = getMsgStoredSize(baseSize,firstSizeParm,lastSizeParm);
|
|
// big enough?
|
|
if ( need <= userBufSize ) buf = userBuf;
|
|
// alloc if we should
|
|
if ( ! buf ) buf = (char *)mmalloc ( need , "Ra" );
|
|
// bail on error, g_errno should be set
|
|
if ( ! buf ) return NULL;
|
|
// set how many bytes we will serialize into
|
|
*retSize = need;
|
|
// copy the easy stuff
|
|
char *p = buf;
|
|
memcpy ( p , (char *)thisPtr , baseSize );//getBaseSize() );
|
|
p += baseSize; // getBaseSize();
|
|
// then store the strings!
|
|
long *sizePtr = firstSizeParm;//getFirstSizeParm(); // &size_qbuf;
|
|
long *sizeEnd = lastSizeParm;//getLastSizeParm (); // &size_displayMet
|
|
char **strPtr = firstStrPtr;//getFirstStrPtr (); // &ptr_qbuf;
|
|
for ( ; sizePtr <= sizeEnd ; ) {
|
|
// if we are NULL, we are a "bookmark", so
|
|
// we alloc'd space for it, but don't copy into
|
|
// the space until after this call toe serialize()
|
|
if ( ! *strPtr ) goto skip;
|
|
// sanity check -- cannot copy onto ourselves
|
|
if ( p > *strPtr && p < *strPtr + *sizePtr ) {
|
|
char *xx = NULL; *xx = 0; }
|
|
// copy the string into the buffer
|
|
memcpy ( p , *strPtr , *sizePtr );
|
|
skip:
|
|
// . make it point into the buffer now
|
|
// . MDW: why? that is causing problems for the re-call in
|
|
// Msg3a, it calls this twice with the same "m_r"
|
|
if ( makePtrsRefNewBuf ) *strPtr = p;
|
|
// advance our destination ptr
|
|
p += *sizePtr;
|
|
// advance both ptrs to next string
|
|
sizePtr++;
|
|
strPtr++;
|
|
}
|
|
return buf;
|
|
}
|
|
|
|
// convert offsets back into ptrs
|
|
long deserializeMsg ( long baseSize ,
|
|
long *firstSizeParm ,
|
|
long *lastSizeParm ,
|
|
char **firstStrPtr ,
|
|
char *stringBuf ) {
|
|
// point to our string buffer
|
|
char *p = stringBuf;//getStringBuf(); // m_buf;
|
|
// then store the strings!
|
|
long *sizePtr = firstSizeParm;//getFirstSizeParm(); // &size_qbuf;
|
|
long *sizeEnd = lastSizeParm;//getLastSizeParm (); // &size_displayMet
|
|
char **strPtr = firstStrPtr;//getFirstStrPtr (); // &ptr_qbuf;
|
|
for ( ; sizePtr <= sizeEnd ; ) {
|
|
// convert the offset to a ptr
|
|
*strPtr = p;
|
|
// make it NULL if size is 0 though
|
|
if ( *sizePtr == 0 ) *strPtr = NULL;
|
|
// sanity check
|
|
if ( *sizePtr < 0 ) { char *xx = NULL; *xx =0; }
|
|
// advance our destination ptr
|
|
p += *sizePtr;
|
|
// advance both ptrs to next string
|
|
sizePtr++;
|
|
strPtr++;
|
|
}
|
|
// return how many bytes we processed
|
|
return baseSize + (p - stringBuf);//getStringBuf());
|
|
}
|
|
|
|
// print it to stdout for debugging Dates.cpp
|
|
long printTime ( long ttt ) {
|
|
//char *s = ctime(&ttt);
|
|
// print in UTC!
|
|
char *s = asctime ( gmtime(&ttt) );
|
|
// strip \n
|
|
s[gbstrlen(s)-1] = '\0';
|
|
fprintf(stderr,"%s UTC\n",s);
|
|
return 0;
|
|
}
|
|
|
|
// this uses our local timezone which is MST, so we need to tell
|
|
// it to use UTC somehow...
|
|
time_t mktime_utc ( struct tm *ttt ) {
|
|
time_t local = mktime ( ttt );
|
|
// bad?
|
|
if ( local < 0 ) return local;
|
|
/*
|
|
// sanity check
|
|
static char s_mm = 1;
|
|
static long s_localOff;
|
|
if ( s_mm ) {
|
|
s_mm = 0;
|
|
struct tm ff;
|
|
ff.tm_mon = 0;
|
|
ff.tm_year = 70;
|
|
ff.tm_mday = 1;
|
|
ff.tm_hour = 0;
|
|
ff.tm_min = 0;
|
|
ff.tm_sec = 0;
|
|
long qq = mktime ( &ff );
|
|
//fprintf(stderr,"qq=%li\n",qq);
|
|
// . set this then
|
|
// . we subtract s_localOff to further mktime() returns to
|
|
// get it into utc
|
|
s_localOff = qq;
|
|
// sanity
|
|
if ( s_localOff != timezone ) { char *xx=NULL;*xx=0; }
|
|
}
|
|
*/
|
|
// see what our timezone is!
|
|
//fprintf(stderr,"%li=tz\n",timezone);
|
|
// mod that
|
|
return local - timezone;
|
|
}
|
|
|
|
bool verifyUtf8 ( char *txt , long tlen ) {
|
|
if ( ! txt || tlen <= 0 ) return true;
|
|
char size;
|
|
char *p = txt;
|
|
char *pend = txt + tlen;
|
|
for ( ; p < pend ; p += size ) {
|
|
size = getUtf8CharSize(p);
|
|
// skip if ascii
|
|
if ( ! (p[0] & 0x80) ) continue;
|
|
// ok, it's a utf8 char, it must have both hi bits set
|
|
if ( (p[0] & 0xc0) != 0xc0 ) return false;
|
|
// if only one byte, we are done.. how can that be?
|
|
if ( size == 1 ) return false;
|
|
//if ( ! utf8IsSane ( p[0] ) ) return false;
|
|
// successive utf8 chars must have & 0xc0 be equal to 0x80
|
|
// but the first char it must equal 0xc0, both set
|
|
if ( (p[1] & 0xc0) != 0x80 ) return false;
|
|
if ( size == 2 ) continue;
|
|
if ( (p[2] & 0xc0) != 0x80 ) return false;
|
|
if ( size == 3 ) continue;
|
|
if ( (p[3] & 0xc0) != 0x80 ) return false;
|
|
}
|
|
if ( p != pend ) return false;
|
|
return true;
|
|
}
|
|
|
|
bool verifyUtf8 ( char *txt ) {
|
|
long tlen = gbstrlen(txt);
|
|
return verifyUtf8(txt,tlen);
|
|
}
|
|
|