open-source-search-engine/Phrases.h

159 lines
5.2 KiB
C
Raw Normal View History

2013-08-03 00:12:24 +04:00
// Matt Wells, copyright Jul 2001
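// . generate phrases and store their hashes into the m_phraseIds2[] and
//   m_phraseIds3[] arrays (the original m_phraseIds[] array is commented
//   out in the class below)
// . hash(), which hashed the phraseIds into the TermTable (hashtable), is
//   likewise commented out in the class below
// . will it hash a word as a phrase if it's the only word? No, it will not.
//   it only hashes 2+ word phrases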
#ifndef _PHRASES_H_
#define _PHRASES_H_
//#include "TermTable.h"
#include "Bits.h"
//#include "Spam.h"
//#include "Scores.h"
#include "Words.h"
//#include "Weights.h"
// . size in bytes of the Phrases::m_localBuf member
// . 14 bytes per word for up to MAX_WORDS words (MAX_WORDS is not defined
//   in this header)
#define PHRASE_BUF_SIZE (MAX_WORDS * 14)
// . sentinel value stored in m_phraseSpam[i]
// . means word #i cannot start a phrase, so its phraseId is invalid
// . real phraseSpam values are typically 0 to 100 (percent)
#define PSKIP 201
class Phrases {
public:
Phrases();
~Phrases();
void reset() ;
2014-11-11 01:45:11 +03:00
bool set2 ( Words *words, Bits *bits , int32_t niceness ) {
2013-08-03 00:12:24 +04:00
return set ( words,bits,true,false,TITLEREC_CURRENT_VERSION,
niceness); };
// . set the hashes (m_phraseIds) of the phrases for these words
// . a phraseSpam of PSKIP means word is not in a phrase
// . "bits" describes the words in a phrasing context
// . "spam" is % spam of each word (spam may be NULL)
bool set ( Words *words,
Bits *bits ,
//Spam *spam ,
//Scores *scores ,
bool useStopWords ,
bool useStems ,
2014-11-11 01:45:11 +03:00
int32_t titleRecVersion,
int32_t niceness);
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
//int64_t getPhraseId ( int32_t n ) { return m_phraseIds [n]; };
int64_t getPhraseId2 ( int32_t n ) { return m_phraseIds2[n]; };
2014-10-30 22:36:39 +03:00
//int64_t *getPhraseIds ( ) { return m_phraseIds ; };
int64_t *getPhraseIds2( ) { return m_phraseIds2; };
int64_t *getPhraseIds3( ) { return m_phraseIds3; };
//int64_t *getPhraseIds4( ) { return m_phraseIds4; };
//int64_t *getPhraseIds5( ) { return m_phraseIds5; };
//int64_t *getStripPhraseIds ( ) { return m_stripPhraseIds ; };
2014-11-11 01:45:11 +03:00
//int64_t getStripPhraseId ( int32_t n )
2013-08-03 00:12:24 +04:00
//{ return m_stripPhraseIds [n]; };
2014-11-11 01:45:11 +03:00
int32_t getPhraseSpam ( int32_t n ) { return m_phraseSpam[n]; };
bool hasPhraseId ( int32_t n ) { return (m_phraseSpam[n]!=PSKIP);};
bool startsAPhrase ( int32_t n ) { return (m_phraseSpam[n]!=PSKIP);};
bool isInPhrase ( int32_t n ) ;
2013-08-03 00:12:24 +04:00
// . often word #i is involved in 2 phrases
// . m_phraseIds[i] only holds the one he starts
// . this gets the one he's in the middle of or on the right of
// . used by Query.cpp for phrase-forcing
2014-11-11 01:45:11 +03:00
//int64_t getLeftPhraseId ( int32_t i ) ;
//int64_t getLeftStripPhraseId ( int32_t i ) ;
//int32_t getLeftPhraseIndex ( int32_t i ) ;
2013-08-03 00:12:24 +04:00
// . each non-spammy occurence of phrase adds "baseScore" to it's score
/*
bool hash ( TermTable *table ,
Weights *weightsPtr ,
2014-11-11 01:45:11 +03:00
uint32_t baseScore ,
uint32_t maxScore ,
2014-10-30 22:36:39 +03:00
int64_t startHash ,
2013-08-03 00:12:24 +04:00
char *prefix1 ,
2014-11-11 01:45:11 +03:00
int32_t prefixLen1 ,
2013-08-03 00:12:24 +04:00
char *prefix2 ,
2014-11-11 01:45:11 +03:00
int32_t prefixLen2 ,
2013-08-03 00:12:24 +04:00
bool hashUniqueOnly ,
2014-11-11 01:45:11 +03:00
int32_t titleRecVersion,
int32_t niceness = 0);
2013-08-03 00:12:24 +04:00
*/
// . store phrase that starts with word #i into "dest"
// . we also NULL terminated it in "dest"
// . return length
2014-11-11 01:45:11 +03:00
char *getPhrase ( int32_t i , int32_t *phrLen , int32_t npw );
//char *getNWordPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) ;
//char *getStripPhrase ( int32_t i , int32_t *phrLen );
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
//int32_t getNumWords ( int32_t i ) { return m_numWordsTotal[i]; };
//int32_t getNumWordsInPhrase ( int32_t i ) { return m_numWordsTotal [i]; };
int32_t getNumWordsInPhrase2( int32_t i ) { return m_numWordsTotal2[i]; };
2013-08-03 00:12:24 +04:00
2014-11-11 01:45:11 +03:00
int32_t getMaxWordsInPhrase( int32_t i , int64_t *pid ) ;
int32_t getMinWordsInPhrase( int32_t i , int64_t *pid ) ;
2013-08-03 00:12:24 +04:00
// . leave this public so SimpleQuery.cpp can mess with it
// . called by Phrases::set() above for each i
// . we set phraseSpam to 0 to 100% typically
// . we set phraseSpam to PSKIP if word #i cannot start a phrase
2014-11-11 01:45:11 +03:00
void setPhrase ( int32_t i ,
int32_t niceness);
2013-08-03 00:12:24 +04:00
// private:
char m_localBuf [ PHRASE_BUF_SIZE ];
char *m_buf;
2014-11-11 01:45:11 +03:00
int32_t m_bufSize;
2013-08-03 00:12:24 +04:00
// . these are 1-1 with the words in the Words class
// . phraseSpam is PSKIP if the phraseId is invalid
2014-10-30 22:36:39 +03:00
//int64_t *m_phraseIds ;
2013-08-03 00:12:24 +04:00
// the two word hash
2014-10-30 22:36:39 +03:00
int64_t *m_phraseIds2 ;
int64_t *m_phraseIds3 ;
//int64_t *m_phraseIds4 ;
//int64_t *m_phraseIds5 ;
//int64_t *m_stripPhraseIds ;
2013-08-03 00:12:24 +04:00
unsigned char *m_phraseSpam ;
// . # words in phrase TOTAL (including punct words)
// . used for printing
// . used by SimpleQuery::getTermIds() for setting word ranges
// for phrases
//unsigned char *m_numWordsTotal ;
// for the two word phrases:
unsigned char *m_numWordsTotal2 ;
unsigned char *m_numWordsTotal3 ;
//unsigned char *m_numWordsTotal4 ;
//unsigned char *m_numWordsTotal5 ;
2014-11-11 01:45:11 +03:00
int32_t m_numPhrases; // should equal the # of words
2013-08-03 00:12:24 +04:00
// placeholders to avoid passing to subroutine
Words *m_words;
2014-10-30 22:36:39 +03:00
int64_t *m_wids;
2013-08-03 00:12:24 +04:00
char **m_wptrs;
2014-11-11 01:45:11 +03:00
int32_t *m_wlens;
2013-08-03 00:12:24 +04:00
Bits *m_bits;
bool m_useStems;
bool m_useStopWords;
2014-11-11 01:45:11 +03:00
int32_t m_titleRecVersion;
2013-08-03 00:12:24 +04:00
// replaces Scores
//class Sections *m_sections;
//class Section *m_sectionPtrs;
// word scores, set in Scores.cpp
2014-11-11 01:45:11 +03:00
//int32_t *m_wordScores;
2013-08-03 00:12:24 +04:00
// the score of the phrase is the min of the scores of the words that
// make up the phrase
2014-11-11 01:45:11 +03:00
//int32_t *m_phraseScores ;
2013-08-03 00:12:24 +04:00
};
#endif