open-source-search-engine/Words.h

401 lines
12 KiB
C++

// Matt Wells, copyright Jul 2001
// . used to parse XML/HTML (romantic char set) into words
// . TODO: ensure WordType m_types[] array is only 1 byte per entry
// . open question: should a word end at any non-alnum char, and then rely on phrasing to handle cases like "tim's"?
#ifndef _WORDS_H_
#define _WORDS_H_
// . history: we could have up to about 16,000 words per doc
// . it could be that big since we never use threads
// . this was 1024*32 but using librt's aio_read somehow enforces a 500k
//   stack onto us, even when we're not in a thread per se!!
//#define MAX_WORDS (1024*16)
// now keep this small and malloc if we need more... saves some stack
#define MAX_WORDS (1024)
// . MAX_WORDS is left alone because other classes use it...
// . the Words class itself now sizes its in-object buffer with
//   WORDS_LOCALBUFSIZE (below) rather than the old LOCALBUFSIZE
//#define LOCALBUFSIZE (MAX_WORDS*16)
// . size in bytes of the Words::m_localBuf member
// . now Matches.h has 300 Words classes handy... try to do away with this
// . make sure it does not slow us down!!
#define WORDS_LOCALBUFSIZE 80
// . free helper functions: only declared here, defined outside this header
// . countWords(): the value returned is an upper bound really, not
//   necessarily the exact number of words
// . NOTE(review): "plen" in the second overload is presumably the byte length
//   of "p" for buffers that are not NULL-terminated -- confirm in the .cpp
int32_t countWords ( char *p , int32_t niceness ) ;
int32_t countWords ( char *p , int32_t plen , int32_t niceness );
// NOTE(review): behavior of the helpers below is not visible from this
// header; see their definitions before relying on return-value conventions
int32_t printstring ( char *s , int32_t len ) ;
char *getFieldValue ( char *s ,int32_t slen, char *field , int32_t *valueLen ) ;
unsigned char getCharacterLanguage ( char *utf8Char ) ;
//#include "TermTable.h" // used in hash()
#include "Xml.h"
#include "SafeBuf.h"
#include "StopWords.h"
#include "fctypes.h"
#include "Titledb.h"
// default value of the "maxSamples" argument of Words::getLanguage()
#define NUM_LANGUAGE_SAMPLES 1000
//#define TITLEREC_CURRENT_VERSION 114
// . this bit is set in the tag id to indicate a back tag
// . Words::isBackTag() tests it on m_tagIds[n]
#define BACKBIT ((nodeid_t)0x8000)
// . mask with every bit below BACKBIT set
// . Words::getTagId() ANDs m_tagIds[n] with it to strip the back-tag bit
#define BACKBITCOMP ((nodeid_t)0x7fff)
class Words {
public:
// . set words from a string
// . s must be NULL terminated
// . NOTE: we never own the data
// . there is typically no html in "s"
// . html tags are NOT parsed out
bool set ( char *s ,
int32_t version , // = TITLEREC_CURRENT_VERSION ,
bool computeIds , // = true ,
int32_t niceness ); // = 0);
// assume TITLEREC_CURRENT_VERSION and computeIds is true
bool set9 ( char *s , int32_t niceness ) {
return set ( s , TITLEREC_CURRENT_VERSION, true , niceness);};
bool setxi ( char *s , char *buf, int32_t bufSize, int32_t niceness ) ;
bool setx ( char *s , int32_t slen , int32_t niceness ) {
return set ( s,slen,TITLEREC_CURRENT_VERSION,true,niceness);};
bool set11 ( char *s , char *send , int32_t niceness ) ;
// . similar to above
// . but we temporarily stick a \0 @ s[slen] for parsing purposes
bool set ( char *s , int32_t slen , int32_t version,
bool computeIds ,
int32_t niceness = 0);
bool set3 ( char *s ) {return set(s,TITLEREC_CURRENT_VERSION,true,0);};
// . new function to set directly from an Xml, rather than extracting
// text first
// . use range (node1,node2] and if node2 is -1 that means the last one
bool set ( Xml *xml,
bool computeIds ,
int32_t niceness = 0 ,
int32_t node1 = 0 ,
int32_t node2 = -1 );
// trying to make it faster
bool set2 ( Xml *xml, bool computeIds , int32_t niceness = 0);
// . if score == 0 then use spam modified score
// . each non-spammy occurence of a word adds "baseScore" to it's score
// . keep baseScore pretty high in case reduced by spamming
// . typically i use 100 as the baseScore to preserve fractions
/*
bool hash ( TermTable *table ,
class Spam *spam ,
class Weights *weights ,
uint32_t baseScore ,
uint32_t maxScore ,
int64_t startHash ,
char *prefix1 ,
int32_t prefixLen1 ,
char *prefix2 ,
int32_t prefixLen2 ,
bool useStems ,
bool hashUniqueOnly ,
int32_t titleRecVersion ,
class Phrases *phrases ,//= NULL ,
bool hashWordIffNotInPhrase ,//= false,
int32_t niceness );//= 0);
*/
inline bool addWords(char* s, int32_t nodeLen,
bool computeIds, int32_t niceness);
// get the spam modified score of the ith word (baseScore is the
// score if the word is not spammed)
int32_t getNumWords ( ) const { return m_numWords; };
int32_t getNumAlnumWords ( ) const { return m_numAlnumWords;};
char *getWord ( int32_t n ) const { return m_words [n];};
int32_t getWordLen ( int32_t n ) const { return m_wordLens[n];};
//int64_t getNextWid ( int32_t i , int32_t toscan , int32_t niceness ) {
// int32_t max = i + toscan;
// if ( max > m_numWords ) max = m_numWords;
// for ( ; i < max ; i++ ) {
// QUICKPOLL(niceness);
// if ( m_wids[i] ) return m_wids[i];
// }
// return 0LL;
//};
// . size of string from word #a up to and NOT including word #b
// . "b" can be m_numWords to mean up to the end of the doc
int32_t getStringSize ( int32_t a , int32_t b ) {
// do not let it exceed this
if ( b >= m_numWords ) b = m_numWords;
// pedal it back. we might equal a then. which is ok, that
// means to just return the length of word #a then
b--;
if ( b < a ) return 0;
if ( a < 0 ) return 0;
int32_t size = m_words[b] - m_words[a];
// add in size of word #b
size += m_wordLens[b];
return size;
};
int32_t getWordAt ( char *charPos ); // int32_t charPos );
// . CAUTION: don't call this for punct "words"... it's bogus for them
// . this is only for alnum "words"
int64_t getWordId ( int32_t n ) const { return m_wordIds [n];};
bool isStopWord ( int32_t n ) {
return ::isStopWord(m_words [n],
m_wordLens[n],
m_wordIds [n]);
}
bool isQueryStopWord ( int32_t n , int32_t langId ) {
return ::isQueryStopWord(m_words [n],
m_wordLens[n],
m_wordIds [n],
langId);
}
// . how many quotes in the nth word?
// . how many plusses in the nth word?
// . used exclusively by Query class for parsing query syntax
int32_t getNumQuotes ( int32_t n ) {
int32_t count = 0;
for ( int32_t i = 0 ; i < m_wordLens[n] ; i++ )
if ( m_words[n][i] == '\"' ) count++;
return count; };
int32_t getNumPlusses ( int32_t n );
// . do we have a ' ' 't' '\n' or '\r' in this word?
// . caller should not call this is isPunct(n) is false, pointless.
bool hasSpace ( int32_t n ) {
for ( int32_t i = 0 ; i < m_wordLens[n] ; i++ )
if ( is_wspace_utf8(&m_words[n][i]) ) return true;
return false;
};
bool hasChar ( int32_t n , char c ) const {
for ( int32_t i = 0 ; i < m_wordLens[n] ; i++ )
if ( m_words[n][i] == c ) return true;
return false;
};
bool hasDigit ( int32_t n ) const {
for ( int32_t i = 0 ; i < m_wordLens[n] ; i++ )
if ( is_digit(m_words[n][i]) ) return true;
return false;
};
// this doesn't really work for utf8!!!
bool hasAlpha ( int32_t n ) const {
for ( int32_t i = 0 ; i < m_wordLens[n] ; i++ )
if ( is_alpha_a(m_words[n][i]) ) return true;
return false;
};
bool isSpaces ( int32_t n ) {
for ( int32_t i = 0 ; i < m_wordLens[n] ; i++ )
if ( ! is_wspace_utf8(&m_words[n][i]) ) return false;
return true;
};
bool isSpaces2 ( int32_t n , int32_t starti ) {
for ( int32_t i = starti ; i < m_wordLens[n] ; i++ )
if ( ! is_wspace_utf8(&m_words[n][i]) ) return false;
return true;
};
//bool isSpacesOrComma ( int32_t n ) {
// for ( int32_t i = 0 ; i < m_wordLens[n] ; i++ )
// if ( ! is_wspace_utf8(&m_words[n][i]) &&
// m_words[n][i]!=',' ) return false;
// return true;
//};
//if this is set from xml, every word is either a word or an xml node
nodeid_t getTagId(int32_t n) {
if ( ! m_tagIds ) return 0;
return ( m_tagIds[n] & BACKBITCOMP );
};
bool isBackTag(int32_t n) {
if ( ! m_tagIds ) return false;
if ( m_tagIds[n] & BACKBIT ) return true;
return false;
};
bool isBackTagId ( nodeid_t tid ) {
if ( tid & BACKBIT ) return true;
return false;
};
// CAUTION!!!
//
// "BACKBIT" is set in the tagid of m_tagIds[] to indicate the tag is
// a "back tag" as opposed to a "front tag". i.e. </a> vs. <a>
// respectively. so mask it out by doing "& BACKBITCOMP" if you just
// want the pure tagid!!!!
//
// CAUTION!!!
nodeid_t *getTagIds () { return m_tagIds; };
char **getWords () { return m_words; };
char **getWordPtrs() { return m_words; };
int32_t *getWordLens() { return m_wordLens; };
int64_t *getWordIds () { return m_wordIds; };
// 2 types of "words": punctuation and alnum
// isPunct() will return true on tags, too, so they are "punct"
bool isPunct ( int32_t n ) const { return m_wordIds[n] == 0;};
bool isAlnum ( int32_t n ) const { return m_wordIds[n] != 0;};
bool isAlpha ( int32_t n ) const {
if ( m_wordIds[n] == 0LL ) return false;
if ( isNum ( n ) ) return false;
return true;
};
int32_t getAsLong ( int32_t n ) {
// skip if no digit
if ( ! is_digit ( m_words[n][0] ) ) return -1;
return atol2(m_words[n],m_wordLens[n]);
};
bool isNum ( int32_t n ) const {
if ( ! is_digit(m_words[n][0]) ) return false;
char *p = m_words[n];
char *pend = p + m_wordLens[n];
for ( ; p < pend ; p++ )
if ( ! is_digit(*p) ) return false;
return true;
};
bool isHexNum ( int32_t n ) const {
if ( ! is_hex(m_words[n][0]) ) return false;
char *p = m_words[n];
char *pend = p + m_wordLens[n];
for ( ; p < pend ; p++ )
if ( ! is_hex(*p) ) return false;
return true;
};
// include &frac12's utf8 equivalent. used by Address.cpp
bool isNum2 ( int32_t n ) const {
if ( ! is_digit(m_words[n][0]) ) return false;
char *p = m_words[n];
char *pend = p + m_wordLens[n];
for ( ; p < pend ; p++ ) {
if ( is_digit(*p) ) continue;
// this is frac14
if ( p[0] == -62 && p[1] == -68 ) { p++; continue; }
// might be that &frac12 char, 14, 34 utf8 chars
if ( p[0] == -62 && p[1] == -67 ) { p++; continue; }
// this is frac34
if ( p[0] == -62 && p[1] == -66 ) { p++; continue; }
return false;
}
return true;
};
// . used in SimpleQuery.cpp
// . are all alpha char capitalized?
bool isUpper ( int32_t n ) {
// skip if not alnum...
if ( m_wordIds[n] == 0LL ) return false;
char *p = m_words[n];
char *pend = p + m_wordLens[n];
char cs;
for ( ; p < pend ; p += cs ) {
cs = getUtf8CharSize ( p );
if ( is_digit ( *p ) ) continue;
if ( is_lower_utf8 ( p ) ) return false;
}
return true;
}
bool isCapitalized ( int32_t n ) {
if ( ! is_alpha_utf8 ( m_words[n] ) ) return false;
return is_upper_utf8 ( m_words[n] ) ;
};
//returns the number of words in the float.
int32_t isFloat ( int32_t n, float& f);
int32_t getTotalLen ( ) { return m_totalLen; };
unsigned char isBounded(int wordi);
Words ( );
~Words ( );
void reset ( );
void print ( );
void printWord ( int32_t i );
//unsigned char getLanguage() { return langUnknown; }
// returns -1 and sets g_errno on error
int32_t getLanguage ( class Sections *sections = NULL ,
int32_t maxSamples = NUM_LANGUAGE_SAMPLES,
int32_t niceness = 0,
int32_t *langScore = NULL);
int32_t getMemUsed () { return m_bufSize; };
char *getContent() {
if ( m_numWords == 0 ) return NULL;
return m_words[0];
};
char *getContentEnd() {
if ( m_numWords == 0 ) return NULL;
return m_words[m_numWords-1] + m_wordLens[m_numWords-1];
};
// private:
bool allocateWordBuffers(int32_t count, bool tagIds = false);
char m_localBuf [ WORDS_LOCALBUFSIZE ];
char *m_localBuf2;
int32_t m_localBufSize2;
char *m_buf;
int32_t m_bufSize;
Xml *m_xml ; // if the class is set from xml, rather than a string
int32_t m_preCount ; // estimate of number of words in the doc
char **m_words ; // pointers to the word
int32_t *m_wordLens ; // length of each word
int64_t *m_wordIds ; // lower ascii hash of word
int32_t *m_nodes ; // Xml.cpp node # (for tags only)
nodeid_t *m_tagIds ; // tag for xml "words"
int32_t m_numWords; // # of words we have
int32_t m_numAlnumWords;
int32_t m_totalLen; // of all words
int32_t m_version; // titlerec version
bool m_hasTags;
// sanity checkes for performance
char *m_s;
int32_t m_numTags;
};
#endif