open-source-search-engine/SafeBuf.h

412 lines
13 KiB
C
Raw Normal View History

2013-08-03 00:12:24 +04:00
#ifndef _SAFEBUF_H_
#define _SAFEBUF_H_
//#include "Mem.h"
//#include "Unicode.h"
#include "gb-include.h"
2013-08-03 00:12:24 +04:00
/**
* Safe Char Buffer, or mutable Strings.
* (for java programmers, very similar to the StringBuffer class, with all the speed that c++ allows).
* Most of strings in Gigablast are handled by those.
*/
2015-07-13 23:59:44 +03:00
#include "iana_charset.h"
class SafeBuf {
public:
2013-08-03 00:12:24 +04:00
//*TRUCTORS
SafeBuf();
2014-11-11 01:45:11 +03:00
SafeBuf(int32_t initSize, char *label = NULL);
2013-11-25 06:46:44 +04:00
void constructor();
2013-08-03 00:12:24 +04:00
//be careful with passing in a stackBuf! it could go out
//of scope independently of the safebuf.
2014-11-11 01:45:11 +03:00
SafeBuf(char* stackBuf, int32_t cap);
SafeBuf(char *heapBuf, int32_t bufMax, int32_t bytesInUse, bool ownData);
2013-08-03 00:12:24 +04:00
~SafeBuf();
void setLabel ( char *label );
2013-08-03 00:12:24 +04:00
// CAUTION: BE CAREFUL WHEN USING THE FOLLOWING TWO FUNCTIONS!!
// setBuf() allows you reset the contents of the SafeBuf to either
// a stack buffer or a dynamic buffer. Only pass in true for
// ownData if this is not a stack buffer and you are sure you
// want SafeBuf to free the data for you. Keep in mind, all
// previous content in SafeBuf will be cleared when you pass it
// a new buffer.
2015-07-13 23:59:44 +03:00
bool setBuf(char *newBuf,
int32_t bufMax,
int32_t bytesInUse,
bool ownData,
int16_t encoding = csUTF8 );
2013-08-03 00:12:24 +04:00
// yieldBuf() allows you to take over the buffer in SafeBuf.
// You may only free the data if it was originally owned by
// the SafeBuf.
// Think twice before using this function.
2014-11-11 01:45:11 +03:00
bool yieldBuf(char **bufPtr, int32_t *bufAlloc, int32_t *bytesInUse,
bool *ownData, int16_t *encoding );
2013-08-03 00:12:24 +04:00
// set buffer from another safebuf, stealing it
bool stealBuf ( SafeBuf *sb );
//ACCESSORS
char *getBuf() { return m_buf + m_length; }
char *getBufStart() { return m_buf; }
char *getBufEnd() { return m_buf + m_capacity; }
2014-11-11 01:45:11 +03:00
int32_t getCapacity() { return m_capacity; }
int32_t getAvail() { return m_capacity - m_length; }
int32_t length() { return m_length; }
int32_t getLength() { return m_length; }
int32_t getBufUsed() { return m_length; }
2013-08-03 00:12:24 +04:00
void print() {
if ( write(1,m_buf,m_length) != m_length) { char*xx=NULL;*xx=0;}; }
// . returns bytes written to file, 0 is acceptable if m_length == 0
// . returns -1 on error and sets g_errno
2014-11-11 01:45:11 +03:00
int32_t saveToFile ( char *dir , char *filename ) ;
int32_t dumpToFile(char *filename);
int32_t save ( char *dir, char *fname){return saveToFile(dir,fname); };
int32_t save ( char *fullFilename ) ;
// saves to tmp file and if that succeeds then renames to orig filename
2014-11-11 01:45:11 +03:00
int32_t safeSave (char *filename );
2014-11-11 01:45:11 +03:00
int32_t fillFromFile(char *filename);
2015-07-14 02:47:34 +03:00
int32_t fillFromFile(char *dir,char *filename, char *label=NULL);
int32_t load(char *dir,char *fname,char *label = NULL) {
return fillFromFile(dir,fname,label);};
2014-11-11 01:45:11 +03:00
int32_t load(char *fname) { return fillFromFile(fname);};
2013-08-03 00:12:24 +04:00
void filterTags();
void filterQuotes();
2014-11-11 01:45:11 +03:00
bool truncateLongWords ( char *src, int32_t srcLen , int32_t minmax );
bool safeTruncateEllipsis ( char *src , int32_t maxLen );
bool safeTruncateEllipsis ( char *src , int32_t srcLen, int32_t maxLen );
2014-07-09 06:38:54 +04:00
2014-11-11 01:45:11 +03:00
bool convertJSONtoXML ( int32_t niceness , int32_t startConvertPos );
2013-10-12 02:14:26 +04:00
bool safeDecodeJSONToUtf8 ( char *json, int32_t jsonLen,
int32_t niceness);
// bool decodeAll = false );
2013-10-12 02:14:26 +04:00
2014-11-11 01:45:11 +03:00
bool decodeJSONToUtf8 ( int32_t niceness );
bool decodeJSON ( int32_t niceness );
bool linkify ( int32_t niceness , int32_t startPos );
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
void truncLen ( int32_t newLen ) {
2013-08-03 00:12:24 +04:00
if ( m_length > newLen ) m_length = newLen; };
bool set ( char *str ) {
purge();
if ( ! str ) return true;
2013-10-15 04:19:59 +04:00
// puts a \0 at the end, but does not include it in m_length:
return safeStrcpy ( str );
};
void removeLastChar ( char lastChar ) {
if ( m_length <= 0 ) return;
if ( m_buf[m_length-1] != lastChar ) return;
m_length--;
m_buf[m_length] = '\0';
};
2013-08-03 00:12:24 +04:00
//MUTATORS
#ifdef _CHECK_FORMAT_STRING_
bool safePrintf(char *formatString, ...)
__attribute__ ((format(printf, 2, 3)));
#else
bool safePrintf(char *formatString, ...);
#endif
2014-11-11 01:45:11 +03:00
bool safeMemcpy(void *s, int32_t len){return safeMemcpy((char *)s,len);};
bool safeMemcpy(char *s, int32_t len);
bool safeMemcpy_nospaces(char *s, int32_t len);
2013-08-03 00:12:24 +04:00
bool safeMemcpy(SafeBuf *c){return safeMemcpy(c->m_buf,c->m_length);};
2014-11-11 01:45:11 +03:00
bool safeMemcpy ( class Words *w , int32_t a , int32_t b ) ;
2013-08-03 00:12:24 +04:00
bool safeStrcpy ( char *s ) ;
//bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
bool safeUtf8ToJSON ( char *utf8 ) ;
bool jsonEncode ( char *utf8 ) { return safeUtf8ToJSON(utf8); };
2014-11-11 01:45:11 +03:00
bool jsonEncode ( char *utf8 , int32_t utf8Len );
2014-11-11 01:45:11 +03:00
bool csvEncode ( char *s , int32_t len , int32_t niceness = 0 );
2014-11-11 01:45:11 +03:00
bool base64Encode ( char *s , int32_t len , int32_t niceness = 0 );
bool base64Decode ( char *src , int32_t srcLen , int32_t niceness = 0 ) ;
2015-03-05 22:10:40 +03:00
bool base64Encode( char *s ) ;
2014-11-11 01:45:11 +03:00
//bool pushLong ( int32_t val ) { return safeMemcpy((char *)&val,4); }
2013-08-03 00:12:24 +04:00
bool cat(SafeBuf& c);
// . only cat the sections/tag that start with "tagFilter"
2014-11-11 01:45:11 +03:00
// . used by Spider.cpp to dump <div class=int16_tdisplay> sections
// to parse-int16_tdisplay.uh64.runid.txt for displaying the
2013-08-03 00:12:24 +04:00
// validation checkboxes in qa.html
bool cat2 ( SafeBuf& c,char *tagFilter1,char *tagFilter2);
void reset() { m_length = 0; }
void purge(); // Clear all data and free all allocated memory
2014-11-11 01:45:11 +03:00
bool advance ( int32_t i ) ;
2014-11-11 01:45:11 +03:00
bool safePrintFilterTagsAndLines ( char *p , int32_t plen ,
bool oneWordPerLine ) ;
2013-10-16 23:12:22 +04:00
// . if clearIt is true we init the new buffer space to zeroes
// . used by Collectiondb.cpp
2014-11-11 01:45:11 +03:00
bool reserve(int32_t i, char *label=NULL , bool clearIt = false );
bool reserve2x(int32_t i, char *label = NULL );
2013-10-12 02:14:26 +04:00
2014-11-11 01:45:11 +03:00
char *makeSpace ( int32_t size ) {
2013-10-12 02:14:26 +04:00
if ( ! reserve ( size ) ) return NULL;
return m_buf + m_length;
};
2013-08-03 00:12:24 +04:00
bool inlineStyleTags();
2014-11-11 01:45:11 +03:00
void incrementLength(int32_t i) {
m_length += i;
// watch out for negative i's
if ( m_length < 0 ) m_length = 0;
};
2014-11-11 01:45:11 +03:00
void setLength(int32_t i) { m_length = i; };
2013-08-03 00:12:24 +04:00
char *getNextLine ( char *p ) ;
2014-11-11 01:45:11 +03:00
int32_t catFile(char *filename) ;
//int32_t load(char *dir,char *filename) {
2013-08-03 00:12:24 +04:00
// return fillFromFile(dir,filename);};
2014-11-11 01:45:11 +03:00
bool safeLatin1ToUtf8(char *s, int32_t len);
bool safeUtf8ToLatin1(char *s, int32_t len);
2013-08-03 00:12:24 +04:00
void detachBuf();
2014-11-11 01:45:11 +03:00
bool insert ( class SafeBuf *c , int32_t insertPos ) ;
bool insert ( char *s , int32_t insertPos ) ;
bool insert2 ( char *s , int32_t slen, int32_t insertPos ) ;
2013-08-03 00:12:24 +04:00
bool replace ( char *src , char *dst ) ; // must be same lengths!
2014-11-11 01:45:11 +03:00
bool removeChunk1 ( char *p , int32_t len ) ;
bool removeChunk2 ( int32_t pos , int32_t len ) ;
bool safeReplace(char *s, int32_t len, int32_t pos, int32_t replaceLen);
bool safeReplace2 ( char *s, int32_t slen,
char *t , int32_t tlen ,
int32_t niceness ,
int32_t startOff = 0 );
bool safeReplace3 ( char *s, char *t , int32_t niceness = 0 ) ;
void replaceChar ( char src , char dst );
2013-08-03 00:12:24 +04:00
bool copyToken(char* s);;
//output encoding
2014-11-11 01:45:11 +03:00
bool setEncoding(int16_t cs);
int16_t getEncoding() { return m_encoding; };
2013-08-03 00:12:24 +04:00
void zeroOut() { memset ( m_buf , 0 , m_capacity ); }
// insert <br>'s to make 's' no more than 'cols' chars per line
2014-11-11 01:45:11 +03:00
bool brify2 ( char *s , int32_t cols , char *sep = "<br>" ,
bool isHtml = true ) ;
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
bool brify ( char *s , int32_t slen , int32_t niceness , int32_t cols ,
char *sep = "<br>" , bool isHtml = true );
2013-08-03 00:12:24 +04:00
bool fixIsolatedPeriods ( ) ;
2014-05-31 02:05:00 +04:00
bool hasDigits();
2014-11-11 01:45:11 +03:00
// treat safebuf as an array of signed int32_ts and sort them
void sortLongs ( int32_t niceness );
2013-08-03 00:12:24 +04:00
2014-05-30 07:17:51 +04:00
// . like "1 minute ago" "5 hours ago" "3 days ago" etc.
// . "ts" is the delta-t in seconds
2014-11-11 01:45:11 +03:00
bool printTimeAgo ( int32_t ts , int32_t now , bool int16_thand = false ) ;
2014-05-30 07:17:51 +04:00
2013-08-03 00:12:24 +04:00
// . a function for adding Tags to buffer, like from Tagdb.cpp
// . if safebuf is a buffer of Tags from Tagdb.cpp
class Tag *addTag2 ( char *mysite ,
char *tagname ,
2014-11-11 01:45:11 +03:00
int32_t now ,
2013-08-03 00:12:24 +04:00
char *user ,
2014-11-11 01:45:11 +03:00
int32_t ip ,
int32_t val ,
2013-08-03 00:12:24 +04:00
char rdbId );
class Tag *addTag3 ( char *mysite ,
char *tagname ,
2014-11-11 01:45:11 +03:00
int32_t now ,
2013-08-03 00:12:24 +04:00
char *user ,
2014-11-11 01:45:11 +03:00
int32_t ip ,
2013-08-03 00:12:24 +04:00
char *data ,
char rdbId );
2014-11-11 01:45:11 +03:00
// makes the site "%"UINT64".com" where %"UINT64" is userId
2014-10-30 22:36:39 +03:00
class Tag *addFaceookTag ( int64_t userId ,
2013-08-03 00:12:24 +04:00
char *tagname ,
2014-11-11 01:45:11 +03:00
int32_t now ,
int32_t ip ,
2013-08-03 00:12:24 +04:00
char *data ,
2014-11-11 01:45:11 +03:00
int32_t dsize ,
2013-08-03 00:12:24 +04:00
char rdbId ,
bool pushRdbId ) ;
class Tag *addTag ( char *mysite ,
char *tagname ,
2014-11-11 01:45:11 +03:00
int32_t now ,
2013-08-03 00:12:24 +04:00
char *user ,
2014-11-11 01:45:11 +03:00
int32_t ip ,
2013-08-03 00:12:24 +04:00
char *data ,
2014-11-11 01:45:11 +03:00
int32_t dsize ,
2013-08-03 00:12:24 +04:00
char rdbId ,
bool pushRdbId );
bool addTag ( class Tag *tag );
//insert strings in their native encoding
2014-11-11 01:45:11 +03:00
bool encode ( char *s , int32_t len , int32_t niceness=0) {
return utf8Encode2(s,len,false,niceness); };
2013-08-03 00:12:24 +04:00
// htmlEncode default = false
2014-11-11 01:45:11 +03:00
bool utf8Encode2(char *s, int32_t len, bool htmlEncode=false,
int32_t niceness=0);
bool latin1Encode(char *s, int32_t len, bool htmlEncode=false,
int32_t niceness=0);
//bool utf16Encode(UChar *s, int32_t len, bool htmlEncode=false);
//bool utf16Encode(char *s, int32_t len, bool htmlEncode=false) {
2013-08-03 00:12:24 +04:00
// return utf16Encode((UChar*)s, len>>1, htmlEncode); };
//bool utf32Encode(UChar32 c);
2014-11-11 01:45:11 +03:00
bool htmlEncode(char *s, int32_t len,bool encodePoundSign,
int32_t niceness=0 , int32_t truncateLen = -1 );
bool javascriptEncode(char *s, int32_t len );
bool htmlEncode(char *s) ;
2013-08-03 00:12:24 +04:00
//bool convertUtf8CharsToEntity = false ) ;
// html-encode any of the last "len" bytes that need it
2014-11-11 01:45:11 +03:00
bool htmlEncode(int32_t len,int32_t niceness=0);
bool htmlDecode (char *s,
2014-11-11 01:45:11 +03:00
int32_t len,
bool doSpecial = false,
2014-11-11 01:45:11 +03:00
int32_t niceness = 0 );
2014-11-11 01:45:11 +03:00
//bool htmlEncode(int32_t niceness );
bool dequote ( char *t , int32_t tlen );
bool escapeJS ( char *s , int32_t slen ) ;
2013-08-03 00:12:24 +04:00
bool urlEncode (char *s ,
2014-11-11 01:45:11 +03:00
int32_t slen,
2013-08-03 00:12:24 +04:00
bool requestPath = false,
bool encodeApostrophes = false );
2013-10-03 08:34:21 +04:00
bool urlEncode (char *s ) {
return urlEncode ( s,strlen(s),false,false); };
bool urlEncode2 (char *s ,
bool encodeApostrophes ) { // usually false
2013-08-03 00:12:24 +04:00
return urlEncode ( s,strlen(s),false,encodeApostrophes); };
2013-10-03 08:34:21 +04:00
bool urlEncodeAllBuf ( bool spaceToPlus = true );
2014-11-11 01:45:11 +03:00
bool latin1CdataEncode(char *s, int32_t len);
bool utf8CdataEncode(char *s, int32_t len);
2013-08-03 00:12:24 +04:00
// . filter out parentheses and other query operators
// . used by SearchInput.cpp when it constructs the big UOR query
// of facebook interests
2014-11-11 01:45:11 +03:00
bool queryFilter ( char *s , int32_t len );
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
//bool utf16CdataEncode(UChar *s, int32_t len);
//bool utf16CdataEncode(char *s, int32_t len) {
2013-08-03 00:12:24 +04:00
// return utf16CdataEncode((UChar*)s, len>>1); };
2014-11-11 01:45:11 +03:00
bool latin1HtmlEncode(char *s, int32_t len, int32_t niceness=0);
//bool utf16HtmlEncode(UChar *s, int32_t len);
//bool utf16HtmlEncode(char *s, int32_t len) {
2013-08-03 00:12:24 +04:00
// return utf16HtmlEncode((UChar*)s, len>>1); };
2014-11-11 01:45:11 +03:00
bool htmlEncodeXmlTags ( char *s , int32_t slen , int32_t niceness ) ;
2013-08-03 00:12:24 +04:00
bool cdataEncode ( char *s ) ;
2014-11-11 01:45:11 +03:00
bool cdataEncode ( char *s , int32_t slen ) ;
2013-08-03 00:12:24 +04:00
// . append a \0 but do not inc m_length
// . for null terminating strings
bool nullTerm ( ) {
if(m_length >= m_capacity && !reserve(m_capacity + 1) )
return false;
m_buf[m_length] = '\0';
return true;
};
2014-11-11 01:45:11 +03:00
bool safeCdataMemcpy(char *s, int32_t len);
2013-08-03 00:12:24 +04:00
bool pushChar (char i) {
if(m_length >= m_capacity)
if(!reserve(2*m_capacity + 1))
return false;
m_buf[m_length++] = i;
// let's do this because we kinda expect it when making strings
// and i've been burned by not having this before.
// no, cause if we reserve just the right length, we end up
// doing a realloc!! sux...
//m_buf[m_length] = '\0';
return true;
};
// hack off trailing 0's
bool printFloatPretty ( float f ) ;
bool pushPtr ( void *ptr );
2014-11-11 01:45:11 +03:00
bool pushLong (int32_t i);
2014-10-30 22:36:39 +03:00
bool pushLongLong (int64_t i);
2013-08-03 00:12:24 +04:00
bool pushFloat (float i);
bool pushDouble (double i);
2014-11-11 01:45:11 +03:00
int32_t popLong();
2013-08-03 00:12:24 +04:00
float popFloat();
2014-11-11 01:45:11 +03:00
int32_t pad(const char ch, const int32_t len);
2013-08-03 00:12:24 +04:00
bool printKey(char* key, char ks);
// these use zlib
bool compress();
bool uncompress();
//OPERATORS
//copy numbers into the buffer, *in binary*
//useful for making lists.
bool operator += (uint64_t i);
2014-10-30 22:36:39 +03:00
bool operator += (int64_t i);
2014-11-11 01:45:11 +03:00
//bool operator += (int32_t i);
//bool operator += (uint32_t i);
2013-08-03 00:12:24 +04:00
bool operator += (float i);
bool operator += (double i);
bool operator += (char i);
//bool operator += (uint64_t i);
bool operator += (uint32_t i);
bool operator += (uint16_t i);
bool operator += (uint8_t i);
//bool operator += (int64_t i) { return *this += (uint64_t)i; };
bool operator += (int32_t i) { return *this += (uint32_t)i; };
bool operator += (int16_t i) { return *this += (uint16_t)i; };
bool operator += (int8_t i) { return *this += (uint8_t)i; };
//return a reference so we can use on lhs and rhs.
2014-11-11 01:45:11 +03:00
char& operator[](int32_t i);
2013-08-03 00:12:24 +04:00
2013-12-08 09:12:48 +04:00
public:
2014-11-11 01:45:11 +03:00
int32_t m_capacity;
int32_t m_length;
2013-12-08 09:12:48 +04:00
protected:
2013-08-03 00:12:24 +04:00
char *m_buf;
2013-12-08 09:12:48 +04:00
public:
char *m_label;
2013-08-03 00:12:24 +04:00
bool m_usingStack;
2014-11-11 01:45:11 +03:00
int16_t m_encoding; // output charset
2013-08-03 00:12:24 +04:00
// . a special flag used by PageParser.cpp
// . if this is true it PageParser shows the page in its html form,
// otherwise, if false, it converts the "<" to &lt; etc. so we see the html
// source view.
// . only Words.cpp looks at this flag
char m_renderHtml;
};
#define TOKENPASTE(x, y) x ## y
#define TOKENPASTE2(x, y) TOKENPASTE(x, y)
#define StackBuf(name) char TOKENPASTE2(tmpsafebuf, __LINE__)[1024]; \
SafeBuf name(TOKENPASTE2(tmpsafebuf, __LINE__), 1024)
2013-08-03 00:12:24 +04:00
#endif