open-source-search-engine/SafeBuf.h
Matt Wells 8a49e87a61 got code with shard rebalancing compiling.
now we store a "sharded by termid" bit in posdb
key for checksums, etc keys that are not sharded
by docid. save having to do disk seeks on every
host in the cluster to do a dup check, etc.
2014-01-11 16:08:42 -08:00

363 lines
11 KiB
C++

#ifndef _SAFEBUF_H_
#define _SAFEBUF_H_
//#include "Mem.h"
//#include "Unicode.h"
#include "gb-include.h"
// SafeBuf is a byte buffer (m_buf) that tracks how many bytes are in use
// (m_length) and how many are allocated (m_capacity). The inline appenders
// below call reserve() to obtain more room before writing past the end.
// Most member functions are defined out of line; only the short inline
// ones are documented in detail here.
struct SafeBuf {

    //*TRUCTORS
    SafeBuf();
    SafeBuf(long initSize, char *label = NULL);

    void constructor();

    // be careful with passing in a stackBuf! it could go out
    // of scope independently of the safebuf.
    SafeBuf(char* stackBuf, long cap);
    SafeBuf(char *heapBuf, long bufMax, long bytesInUse, bool ownData);
    ~SafeBuf();

    void setLabel ( char *label );

    // CAUTION: BE CAREFUL WHEN USING THE FOLLOWING TWO FUNCTIONS!!
    // setBuf() allows you to reset the contents of the SafeBuf to either
    // a stack buffer or a dynamic buffer. Only pass in true for
    // ownData if this is not a stack buffer and you are sure you
    // want SafeBuf to free the data for you. Keep in mind, all
    // previous content in SafeBuf will be cleared when you pass it
    // a new buffer.
    bool setBuf(char *newBuf, long bufMax, long bytesInUse, bool ownData,
                short encoding );

    // yieldBuf() allows you to take over the buffer in SafeBuf.
    // You may only free the data if it was originally owned by
    // the SafeBuf.
    // Think twice before using this function.
    bool yieldBuf(char **bufPtr, long *bufAlloc, long *bytesInUse,
                  bool *ownData, short *encoding );

    // set buffer from another safebuf, stealing it
    bool stealBuf ( SafeBuf *sb );

    //ACCESSORS

    // pointer to the first unused byte, i.e. where the next append lands
    char *getBuf()      { return m_buf + m_length; }
    // pointer to the first byte of the buffer
    char *getBufStart() { return m_buf; }
    // pointer one past the last allocated byte
    char *getBufEnd()   { return m_buf + m_capacity; }
    // number of allocated bytes
    long  getCapacity() { return m_capacity; }
    // number of allocated bytes not yet in use
    long  getAvail()    { return m_capacity - m_length; }
    // length(), getLength() and getBufUsed() are synonyms: bytes in use
    long  length()      { return m_length; }
    long  getLength()   { return m_length; }
    long  getBufUsed()  { return m_length; }

    // . writes the m_length bytes in use to file descriptor 1
    // . if write() does not report exactly m_length bytes written this
    //   deliberately stores through a NULL pointer to crash the process
    void print() {
        if ( write(1,m_buf,m_length) != m_length) {
            char*xx=NULL;*xx=0;
        }
    }

    // . returns bytes written to file, 0 is acceptable if m_length == 0
    // . returns -1 on error and sets g_errno
    long saveToFile ( char *dir , char *filename ) ;
    long dumpToFile(char *filename);
    // same as saveToFile()
    long save ( char *dir, char *fname){return saveToFile(dir,fname); }

    long fillFromFile(char *filename);
    long fillFromFile(char *dir,char *filename);
    // same as fillFromFile(dir,fname)
    long load(char *dir,char *fname) { return fillFromFile(dir,fname);}

    void filterTags();
    void filterQuotes();
    bool truncateLongWords ( char *src, long srcLen , long minmax );
    bool safeTruncateEllipsis ( char *src , long maxLen );
    bool convertJSONtoXML ( long niceness , long startConvertPos );

    bool safeDecodeJSONToUtf8 ( char *json, long jsonLen, long niceness);
    // bool decodeAll = false );

    bool decodeJSONToUtf8 ( long niceness );
    bool decodeJSON ( long niceness );
    bool linkify ( long niceness , long startPos );

    // . shrink the in-use length to "newLen" if it is currently longer
    // . never grows the length and never touches the buffer contents
    // . a negative newLen is treated as 0 so m_length can not go
    //   negative, matching what incrementLength() does
    void truncLen ( long newLen ) {
        if ( newLen < 0 ) newLen = 0;
        if ( m_length > newLen ) m_length = newLen;
    }

    // . purge() the current contents then copy in the string "str"
    // . a NULL "str" just leaves the buffer purged and returns true
    // . otherwise returns whatever safeStrcpy() returns
    bool set ( char *str ) {
        purge();
        if ( ! str ) return true;
        // puts a \0 at the end, but does not include it in m_length:
        return safeStrcpy ( str );
    }

    // . if the last byte in use equals "lastChar", drop it and write a
    //   \0 in its place
    // . does nothing if the buffer is empty or ends in another byte
    void removeLastChar ( char lastChar ) {
        if ( m_length <= 0 ) return;
        if ( m_buf[m_length-1] != lastChar ) return;
        m_length--;
        m_buf[m_length] = '\0';
    }

    //MUTATORS

    // when _CHECK_FORMAT_STRING_ is defined the compiler checks the
    // arguments against the format string (argument 1 is "this")
#ifdef _CHECK_FORMAT_STRING_
    bool safePrintf(char *formatString, ...)
        __attribute__ ((format(printf, 2, 3)));
#else
    bool safePrintf(char *formatString, ...);
#endif

    // convenience overload, forwards to the char* version
    bool safeMemcpy(void *s, long len){return safeMemcpy((char *)s,len);}
    bool safeMemcpy(char *s, long len);
    bool safeMemcpy_nospaces(char *s, long len);
    // copy in the bytes in use from another SafeBuf
    bool safeMemcpy(SafeBuf *c){return safeMemcpy(c->m_buf,c->m_length);}
    bool safeMemcpy ( class Words *w , long a , long b ) ;
    bool safeStrcpy ( char *s ) ;
    bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
    bool safeUtf8ToJSON ( char *utf8 ) ;

    bool csvEncode ( char *s , long len , long niceness = 0 );

    //bool pushLong ( long val ) { return safeMemcpy((char *)&val,4); }
    bool cat(SafeBuf& c);
    // . only cat the sections/tag that start with "tagFilter"
    // . used by Spider.cpp to dump <div class=shortdisplay> sections
    //   to parse-shortdisplay.uh64.runid.txt for displaying the
    //   validation checkboxes in qa.html
    bool cat2 ( SafeBuf& c,char *tagFilter1,char *tagFilter2);

    // forget the contents but keep the buffer: only m_length is zeroed
    void reset() { m_length = 0; }
    void purge(); // Clear all data and free all allocated memory
    bool advance ( long i ) ;

    bool safePrintFilterTagsAndLines ( char *p , long plen ,
                                       bool oneWordPerLine ) ;

    // . if clearIt is true we init the new buffer space to zeroes
    // . used by Collectiondb.cpp
    bool reserve(long i, char *label=NULL , bool clearIt = false );
    bool reserve2x(long i, char *label = NULL );

    // . calls reserve(size) and returns a pointer to the first unused
    //   byte, or NULL if reserve() failed
    // . does NOT change m_length; the caller is expected to do that
    //   itself (e.g. with incrementLength()) after writing
    char *makeSpace ( long size ) {
        if ( ! reserve ( size ) ) return NULL;
        return m_buf + m_length;
    }

    bool inlineStyleTags();

    // . adjust the in-use length by "i", which may be negative
    // . the result is clamped at 0 but is NOT checked against m_capacity
    void incrementLength(long i) {
        m_length += i;
        // watch out for negative i's
        if ( m_length < 0 ) m_length = 0;
    }
    // set the in-use length directly, no bounds checking at all
    void setLength(long i) { m_length = i; }

    char *getNextLine ( char *p ) ;
    long catFile(char *filename) ;
    //long load(char *dir,char *filename) {
    //	return fillFromFile(dir,filename);};
    bool safeLatin1ToUtf8(char *s, long len);
    bool safeUtf8ToLatin1(char *s, long len);
    void detachBuf();
    bool insert ( class SafeBuf *c , long insertPos ) ;
    bool insert ( char *s , long insertPos ) ;
    bool insert2 ( char *s , long slen, long insertPos ) ;
    bool replace ( char *src , char *dst ) ; // must be same lengths!
    bool removeChunk1 ( char *p , long len ) ;
    bool removeChunk2 ( long pos , long len ) ;
    bool safeReplace(char *s, long len, long pos, long replaceLen);
    bool safeReplace2 ( char *s, long slen,
                        char *t , long tlen ,
                        long niceness ,
                        long startOff = 0 );
    bool safeReplace3 ( char *s, char *t , long niceness = 0 ) ;
    void replaceChar ( char src , char dst );
    bool copyToken(char* s);

    //output encoding
    bool setEncoding(short cs);
    short getEncoding() { return m_encoding; }

    // . zero every allocated byte (all m_capacity of them, not just the
    //   m_length in use); m_length itself is left alone
    // . does nothing when there is no buffer, because handing memset()
    //   a NULL pointer is undefined behavior even for a zero byte count
    void zeroOut() {
        if ( ! m_buf || m_capacity <= 0 ) return;
        memset ( m_buf , 0 , m_capacity );
    }

    bool brify2 ( char *s , long cols , char *sep = "<br>" ,
                  bool isHtml = true ) ;

    bool brify ( char *s , long slen , long niceness , long cols ,
                 char *sep = "<br>" , bool isHtml = true );

    bool fixIsolatedPeriods ( ) ;

    // treat safebuf as an array of signed longs and sort them
    void sortLongs ( long niceness );

    // . a function for adding Tags to buffer, like from Tagdb.cpp
    // . if safebuf is a buffer of Tags from Tagdb.cpp
    class Tag *addTag2 ( char *mysite ,
                         char *tagname ,
                         long now ,
                         char *user ,
                         long ip ,
                         long val ,
                         char rdbId );
    class Tag *addTag3 ( char *mysite ,
                         char *tagname ,
                         long now ,
                         char *user ,
                         long ip ,
                         char *data ,
                         char rdbId );
    // makes the site "%llu.com" where %llu is userId
    // (the misspelled name is part of the existing interface)
    class Tag *addFaceookTag ( long long userId ,
                               char *tagname ,
                               long now ,
                               long ip ,
                               char *data ,
                               long dsize ,
                               char rdbId ,
                               bool pushRdbId ) ;
    class Tag *addTag ( char *mysite ,
                        char *tagname ,
                        long now ,
                        char *user ,
                        long ip ,
                        char *data ,
                        long dsize ,
                        char rdbId ,
                        bool pushRdbId );
    bool addTag ( class Tag *tag );

    //insert strings in their native encoding
    // forwards to utf8Encode2() with htmlEncode forced to false
    bool encode ( char *s , long len , long niceness=0) {
        return utf8Encode2(s,len,false,niceness); }
    // htmlEncode default = false
    bool utf8Encode2(char *s, long len, bool htmlEncode=false,
                     long niceness=0);
    bool latin1Encode(char *s, long len, bool htmlEncode=false,
                      long niceness=0);
    //bool utf16Encode(UChar *s, long len, bool htmlEncode=false);
    //bool utf16Encode(char *s, long len, bool htmlEncode=false) {
    //	return utf16Encode((UChar*)s, len>>1, htmlEncode); };
    //bool utf32Encode(UChar32 c);
    bool htmlEncode(char *s, long len,bool encodePoundSign,
                    long niceness=0);
    bool javascriptEncode(char *s, long len );
    //bool convertUtf8CharsToEntity = false ) ;
    // html-encode any of the last "len" bytes that need it
    bool htmlEncode(long len,long niceness=0);

    bool htmlDecode (char *s,
                     long len,
                     bool doSpecial = false,
                     long niceness = 0 );

    //bool htmlEncode(long niceness );
    bool dequote ( char *t , long tlen );
    bool escapeJS ( char *s , long slen ) ;

    bool urlEncode (char *s ,
                    long slen,
                    bool requestPath = false,
                    bool encodeApostrophes = false );

    // url-encode a \0-terminated string with both options off
    bool urlEncode (char *s ) {
        return urlEncode ( s,strlen(s),false,false); }

    // url-encode a \0-terminated string, requestPath off
    bool urlEncode2 (char *s ,
                     bool encodeApostrophes ) { // usually false
        return urlEncode ( s,strlen(s),false,encodeApostrophes); }

    bool urlEncodeAllBuf ( bool spaceToPlus = true );
    bool latin1CdataEncode(char *s, long len);
    bool utf8CdataEncode(char *s, long len);

    // . filter out parentheses and other query operators
    // . used by SearchInput.cpp when it constructs the big UOR query
    //   of facebook interests
    bool queryFilter ( char *s , long len );

    //bool utf16CdataEncode(UChar *s, long len);
    //bool utf16CdataEncode(char *s, long len) {
    //	return utf16CdataEncode((UChar*)s, len>>1); };

    bool latin1HtmlEncode(char *s, long len, long niceness=0);
    //bool utf16HtmlEncode(UChar *s, long len);
    //bool utf16HtmlEncode(char *s, long len) {
    //	return utf16HtmlEncode((UChar*)s, len>>1); };

    bool htmlEncodeXmlTags ( char *s , long slen , long niceness ) ;

    bool cdataEncode ( char *s ) ;

    // . append a \0 but do not inc m_length
    // . for null terminating strings
    // . only calls reserve() when the buffer is already full
    // . returns false, writing nothing, if that reserve() fails
    bool nullTerm ( ) {
        if(m_length >= m_capacity && !reserve(m_capacity + 1) )
            return false;
        m_buf[m_length] = '\0';
        return true;
    }

    bool safeCdataMemcpy(char *s, long len);

    // . append one byte, asking reserve() for more room first if full
    // . returns false, appending nothing, if that reserve() fails
    bool pushChar (char i) {
        if(m_length >= m_capacity)
            if(!reserve(2*m_capacity + 1))
                return false;
        m_buf[m_length++] = i;
        // let's do this because we kinda expect it when making strings
        // and i've been burned by not having this before.
        // no, cause if we reserve just the right length, we end up
        // doing a realloc!! sux...
        //m_buf[m_length] = '\0';
        return true;
    }

    bool pushLong (long i);
    bool pushLongLong (long long i);
    bool pushFloat (float i);
    long popLong();
    float popFloat();

    long pad(const char ch, const long len);
    bool printKey(char* key, char ks);

    // these use zlib
    bool compress();
    bool uncompress();

    //OPERATORS
    //copy numbers into the buffer, *in binary*
    //useful for making lists.
    bool operator += (uint64_t i);
    bool operator += (long long i);
    bool operator += (long i);
    bool operator += (unsigned long i);
    bool operator += (float i);
    bool operator += (double i);
    bool operator += (char i);

    //bool operator += (uint64_t i);
    bool operator += (uint32_t i);
    bool operator += (uint16_t i);
    bool operator += (uint8_t i);

    // the signed fixed-width overloads forward to the unsigned overload
    // of the same width
    //bool operator += (int64_t i) { return *this += (uint64_t)i; };
    bool operator += (int32_t i) { return *this += (uint32_t)i; }
    bool operator += (int16_t i) { return *this += (uint16_t)i; }
    bool operator += (int8_t i) { return *this += (uint8_t)i; }

    //return a reference so we can use on lhs and rhs.
    char& operator[](long i);

public:
    long m_capacity; // bytes allocated in m_buf
    long m_length;   // bytes of m_buf currently in use
protected:
    char *m_buf;     // start of the buffer
public:
    char *m_label;
    // NOTE(review): presumably true while m_buf points at a caller
    // supplied stack buffer -- confirm against SafeBuf.cpp
    bool m_usingStack;
    short m_encoding; // output charset

    // . a special flag used by PageParser.cpp
    // . if this is true PageParser shows the page in its html form,
    //   otherwise, if false, it converts the "<" to &lt; etc. so we see
    //   the html source view.
    // . only Words.cpp looks at this flag
    char m_renderHtml;
};
#endif