open-source-search-engine/Pos.cpp
2014-11-10 14:45:11 -08:00

225 lines
4.9 KiB
C++

#include "gb-include.h"
#include "Pos.h"
#include "Sections.h"
// . construct an empty Pos that owns no buffer
// . every member that set()/reset() later read or write is given a defined
//   value so that using the object before a successful set() call never
//   touches indeterminate data
Pos::Pos() {
	m_buf       = NULL;
	m_bufSize   = 0;
	m_needsFree = false;
	// no word positions are available until set() runs in non-filter mode
	m_pos       = NULL;
	m_numWords  = 0;
}
// . destructor: hand off to reset(), which releases m_buf if we own it
Pos::~Pos () {
	this->reset();
}
// . release the position buffer and forget about it
// . only buffers flagged with m_needsFree are handed to mfree(); in every
//   other case the pointer is simply dropped
void Pos::reset() {
	const bool mustFree = ( m_needsFree && m_buf );
	if ( mustFree ) {
		mfree ( m_buf , m_bufSize , "Pos" );
	}
	m_buf = NULL;
}
// . write the filtered text of words [a,b) into the buffer [p,pend)
// . the word interval is half-open: word #b itself is not included
// . thin wrapper around set(); returns the byte count that set() reported
//   through its "len" argument
// . the bool returned by set() is ignored here; the count starts at 0 and
//   stays 0 if set() never stores a length
int32_t Pos::filter( char *p, char *pend, class Words *words, int32_t a,
		     int32_t b, Sections *sections ) {
	int32_t written = 0;
	set ( words , sections , p , pend , &written , a , b );
	return written;
}
// . set the filtered position of each word
// . used by Summary.cpp to determine how many chars are in the summary,
//   be those chars single byte or utf8 chars that are 4 bytes
// . operates in one of two modes, selected by "f":
//   . f == NULL: "position" mode. (re)allocates m_pos and stores in m_pos[i]
//     the CHARACTER offset at which word #i starts, for i in [a,b), and
//     stores the final character count in m_pos[nw]. "len" is not touched.
//   . f != NULL: "filter" mode. m_buf/m_pos/m_numWords are left alone and
//     the filtered text of words [a,b) is written into [f,fend). the number
//     of BYTES written is stored in *len and the output is NUL terminated
//     whenever at least one byte of room remains.
// . filtering rules, applied identically in both modes:
//   . non-breaking tags contribute nothing
//   . TAG_LI (<li>) counts as one char and is written as '*'
//   . any other breaking tag counts as two chars and is written as ". ",
//     unless the previous thing counted was already a space/breaking tag.
//     in filter mode the ". " is not written at the very start of the output
//   . runs of whitespace collapse into a single ' '
//   . '|' is written as ','
// . "b" of -1 means "through the last word"
// . "buf"/"bufSize" is an optional caller-supplied scratch area for m_pos,
//   used when m_localBuf is too small but "buf" is big enough
// . returns false if the m_pos buffer could not be obtained, or if the
//   output had to be truncated before any whitespace inside a word was
//   written (in that case *len is 0 and the output is an empty string)
// . NOTE(review): g_errno is never assigned in this function; presumably
//   mmalloc() sets it on failure, but the truncation failure does not
// . "sections" is currently unused
bool Pos::set ( Words    *words    ,
		Sections *sections ,
		char     *f        ,
		char     *fend     ,
		int32_t  *len      ,
		int32_t   a        ,
		int32_t   b        ,
		char     *buf      ,
		int32_t   bufSize  ) {

	// free m_buf in case this is a second call
	if ( ! f ) reset();

	int32_t    nw    = words->getNumWords();
	int32_t   *wlens = words->m_wordLens;
	nodeid_t  *tids  = words->getTagIds();
	char     **wp    = words->m_words;

	// save start point for filtering
	char *fstart = f;

	// -1 is the default value
	if ( b == -1 ) b = nw;

	// do not destroy m_pos/m_numWords if only filtering into a buffer
	if ( ! f ) {
		// one int32_t per word plus one for the end-of-last-word slot
		int32_t need = (nw+1) * 4;
		m_needsFree = false;
		m_buf       = m_localBuf;
		// local buffer too small? try the caller's, else the heap
		if ( need > POS_LOCALBUFSIZE && need < bufSize )
			m_buf = buf;
		else if ( need > POS_LOCALBUFSIZE ) {
			m_buf       = (char *)mmalloc(need,"Pos");
			m_needsFree = true;
		}
		// bail on error
		if ( ! m_buf ) return false;
		m_bufSize  = need;
		m_pos      = (int32_t *)m_buf;
		m_numWords = nw;
	}

	// this is the CHARACTER count.
	int32_t pos = 0;
	// set when the output buffer ran out of room. only possible when
	// filtering, i.e. when f is non-NULL
	bool trunc = false;
	// where the last in-word space was written. a truncated output is cut
	// back to this point
	char *lastBreak = NULL;
	// flag for stopping back-to-back spaces. only count those as one char.
	bool lastSpace = false;
	// we are utf8
	const int32_t maxCharSize = 4;

	for ( int32_t i = a ; i < b ; i++ ) {
		// stop as soon as the output is full
		if ( trunc ) break;
		// set pos for the ith word to "pos"
		if ( ! f ) m_pos[i] = pos;

		// is tag?
		if ( tids && tids[i] ) {
			// if not breaking, does nothing
			if ( ! g_nodes[tids[i]&0x7f].m_isBreaking ) continue;
			// list tag? <li>
			if ( tids[i] == TAG_LI ) {
				if ( f ) {
					if ( fend - f > maxCharSize ) *f++ = '*';
					else                          trunc = true;
				}
				pos++;
				lastSpace = false;
				continue;
			}
			// if had a previous breaking tag and no non-tag
			// word after it, do not count back-to-back spaces
			if ( lastSpace ) continue;
			// every other breaking tag is treated like a sentence
			// end. (the old code had a further "count as a single
			// space" case after this one, but its guard was
			// always true here so that case could never run.)
			// are we filtering? do not lead the output with ". "
			if ( f && f != fstart ) {
				if ( fend - f > 2*maxCharSize ) {
					*f++ = '.';
					*f++ = ' ';
				}
				else trunc = true;
			}
			// counts as a period plus a space
			pos += 2;
			lastSpace = true;
			continue;
		}

		// scan through all chars discounting back-to-back spaces
		// assume filters out to the same # of chars
		char *p    = wp[i];
		char *pend = p + wlens[i];
		unsigned char cs = 0;
		for ( ; p < pend ; p += cs ) {
			// get size
			cs = getUtf8CharSize(p);
			// do not count space if one before
			if ( is_wspace_utf8 (p) ) {
				if ( lastSpace ) continue;
				lastSpace = true;
				// are we filtering?
				if ( f ) {
					// out of room. stop right here so no
					// later char can sneak in after the
					// point where we ran out
					if ( fend - f <= 1 ) {
						trunc = true;
						break;
					}
					lastBreak = f;
					*f++ = ' ';
				}
				pos++;
				continue;
			}
			if ( f ) {
				// out of room. stop right here, otherwise a
				// smaller char further on could still fit and
				// the output would silently drop this one
				if ( fend - f <= cs ) {
					trunc = true;
					break;
				}
				// change '|' to commas
				if ( *p == '|' )
					*f++ = ',';
				else if ( cs == 1 )
					*f++ = *p;
				else {
					memcpy(f,p,cs);
					f += cs;
				}
			}
			pos++;
			lastSpace = false;
		}
	}

	if ( trunc ) {
		// no in-word space to cut back to: report an empty result and
		// do not leave half-written, unterminated text behind
		if ( ! lastBreak ) {
			if ( fstart < fend ) *fstart = '\0';
			if ( len ) *len = 0;
			return false;
		}
		// drop the partial tail after the last in-word space
		f = lastBreak;
	}

	// set pos for the END of the last word here (used in Summary.cpp)
	if ( ! f ) {
		m_pos[nw] = pos;
		return true;
	}

	// filtering: report the byte count
	if ( len ) *len = (int32_t)(f - fstart);
	// NULL terminate f. every write above leaves at least one spare byte,
	// so a single byte of room is all that is needed here
	if ( f < fend ) *f = '\0';
	// Success
	return true;
}