open-source-search-engine/Phrases.h

// Matt Wells, copyright Jul 2001

// . generate phrases and store their hashes into m_phraseIds[] array
// . hash() will then hash the phraseIds into the TermTable (hashtable)
// . will it hash a word as a phrase if it's the only word? No, it will not.
//   it only hashes 2+ word phrases

#ifndef _PHRASES_H_
#define _PHRASES_H_

//#include "TermTable.h"
#include "Bits.h"
//#include "Spam.h"
//#include "Scores.h"
#include "Words.h"
//#include "Weights.h"

#define PHRASE_BUF_SIZE (MAX_WORDS * 14)

#define PSKIP 201

class Phrases {

 public:

	Phrases();
	~Phrases();
	void reset() ;

	bool set2 ( Words *words, Bits *bits , int32_t niceness ) {
		return set ( words,bits,true,false,TITLEREC_CURRENT_VERSION,
			     niceness); };

	// . set the hashes (m_phraseIds) of the phrases for these words
	// . a phraseSpam of PSKIP means word is not in a phrase
	// . "bits" describes the words in a phrasing context
	// . "spam" is % spam of each word (spam may be NULL)
	bool set ( Words    *words, 
		   Bits     *bits ,
		   //Spam     *spam ,
		   //Scores   *scores ,
		   bool      useStopWords ,
		   bool      useStems     ,
		   int32_t      titleRecVersion,
		   int32_t      niceness);

	//int64_t getPhraseId   ( int32_t n ) { return m_phraseIds [n]; };
	int64_t getPhraseId2  ( int32_t n ) { return m_phraseIds2[n]; };
	//int64_t *getPhraseIds (        ) { return m_phraseIds ; };
	int64_t *getPhraseIds2(        ) { return m_phraseIds2; };
	int64_t *getPhraseIds3(        ) { return m_phraseIds3; };
	//int64_t *getPhraseIds4(        ) { return m_phraseIds4; };
	//int64_t *getPhraseIds5(        ) { return m_phraseIds5; };

	//int64_t *getStripPhraseIds (      ) { return m_stripPhraseIds ; };
	//int64_t getStripPhraseId   ( int32_t n ) 
	//{ return m_stripPhraseIds [n]; };
	int32_t      getPhraseSpam ( int32_t n ) { return m_phraseSpam[n]; };
	bool      hasPhraseId   ( int32_t n ) { return (m_phraseSpam[n]!=PSKIP);};
	bool      startsAPhrase ( int32_t n ) { return (m_phraseSpam[n]!=PSKIP);};
	bool      isInPhrase    ( int32_t n ) ;
	// . often word #i is involved in 2 phrases
	// . m_phraseIds[i] only holds the one he starts
	// . this gets the one he's in the middle of or on the right of
	// . used by Query.cpp for phrase-forcing
	//int64_t getLeftPhraseId       ( int32_t i ) ;
	//int64_t getLeftStripPhraseId  ( int32_t i ) ;
	//int32_t      getLeftPhraseIndex    ( int32_t i ) ;

	// . each non-spammy occurence of phrase adds "baseScore" to it's score
	/*
	bool hash ( TermTable      *table       ,
		    Weights        *weightsPtr  ,
		    uint32_t   baseScore   ,
		    uint32_t   maxScore    ,
		    int64_t       startHash   ,
		    char           *prefix1     ,
		    int32_t            prefixLen1  ,
		    char           *prefix2     ,
		    int32_t            prefixLen2  ,
		    bool            hashUniqueOnly ,
		    int32_t            titleRecVersion,
		    int32_t            niceness = 0);
	*/

	// . store phrase that starts with word #i into "dest"
	// . we also NULL terminated it in "dest"
	// . return length
	char *getPhrase ( int32_t i , int32_t *phrLen , int32_t npw );
	//char *getNWordPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) ;
	//char *getStripPhrase ( int32_t i , int32_t *phrLen );

	//int32_t  getNumWords         ( int32_t i ) { return m_numWordsTotal[i]; };
	//int32_t  getNumWordsInPhrase ( int32_t i ) { return m_numWordsTotal [i]; };
	int32_t  getNumWordsInPhrase2( int32_t i ) { return m_numWordsTotal2[i]; };

	int32_t  getMaxWordsInPhrase( int32_t i , int64_t *pid ) ;
	int32_t  getMinWordsInPhrase( int32_t i , int64_t *pid ) ;

	// . leave this public so SimpleQuery.cpp can mess with it
	// . called by Phrases::set() above for each i
	// . we set phraseSpam to 0 to 100% typically
	// . we set phraseSpam to PSKIP if word #i cannot start a phrase
	void setPhrase ( int32_t i ,
			 int32_t niceness);

	// private:

	char  m_localBuf [ PHRASE_BUF_SIZE ];

	char *m_buf;
	int32_t  m_bufSize;

	// . these are 1-1 with the words in the Words class
	// . phraseSpam is PSKIP if the phraseId is invalid
	//int64_t     *m_phraseIds  ;
	// the two word hash
	int64_t     *m_phraseIds2  ;
	int64_t     *m_phraseIds3  ;
	//int64_t     *m_phraseIds4  ;
	//int64_t     *m_phraseIds5  ;
	//int64_t     *m_stripPhraseIds  ;
	unsigned char *m_phraseSpam ;
	// . # words in phrase TOTAL (including punct words)
	// . used for printing
	// . used by SimpleQuery::getTermIds() for setting word ranges
	//   for phrases
	//unsigned char *m_numWordsTotal ;
	// for the two word phrases:
	unsigned char *m_numWordsTotal2 ;
	unsigned char *m_numWordsTotal3 ;
	//unsigned char *m_numWordsTotal4 ;
	//unsigned char *m_numWordsTotal5 ;
	int32_t           m_numPhrases; // should equal the # of words

	// placeholders to avoid passing to subroutine
	Words      *m_words;
	int64_t  *m_wids;
	char      **m_wptrs;
	int32_t       *m_wlens;

	Bits    *m_bits;
	bool     m_useStems;
	bool     m_useStopWords;
	int32_t     m_titleRecVersion;

	// replaces Scores
	//class Sections *m_sections;
	//class Section  *m_sectionPtrs;

	// word scores, set in Scores.cpp
	//int32_t    *m_wordScores;
	// the score of the phrase is the min of the scores of the words that
	// make up the phrase
	//int32_t    *m_phraseScores ;
};

#endif
Initial file population. 2013-08-03 00:12:24 +04:00			`// Matt Wells, copyright Jul 2001`

			`// . generate phrases and store their hashes into m_phraseIds[] array`
			`// . hash() will then hash the phraseIds into the TermTable (hashtable)`
			`// . will it hash a word as a phrase if it's the only word? No, it will not.`
			`// it only hashes 2+ word phrases`

			`#ifndef _PHRASES_H_`
			`#define _PHRASES_H_`

			`//#include "TermTable.h"`
			`#include "Bits.h"`
			`//#include "Spam.h"`
			`//#include "Scores.h"`
			`#include "Words.h"`
			`//#include "Weights.h"`

			`#define PHRASE_BUF_SIZE (MAX_WORDS * 14)`

			`#define PSKIP 201`

			`class Phrases {`

			`public:`

			`Phrases();`
			`~Phrases();`
			`void reset() ;`

now it compiles with -m32 2014-11-11 01:45:11 +03:00			`bool set2 ( Words words, Bits bits , int32_t niceness ) {`
Initial file population. 2013-08-03 00:12:24 +04:00			`return set ( words,bits,true,false,TITLEREC_CURRENT_VERSION,`
			`niceness); };`

			`// . set the hashes (m_phraseIds) of the phrases for these words`
			`// . a phraseSpam of PSKIP means word is not in a phrase`
			`// . "bits" describes the words in a phrasing context`
			`// . "spam" is % spam of each word (spam may be NULL)`
			`bool set ( Words *words,`
			`Bits *bits ,`
			`//Spam *spam ,`
			`//Scores *scores ,`
			`bool useStopWords ,`
			`bool useStems ,`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t titleRecVersion,`
			`int32_t niceness);`
Initial file population. 2013-08-03 00:12:24 +04:00
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`//int64_t getPhraseId ( int32_t n ) { return m_phraseIds [n]; };`
			`int64_t getPhraseId2 ( int32_t n ) { return m_phraseIds2[n]; };`
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`//int64_t *getPhraseIds ( ) { return m_phraseIds ; };`
			`int64_t *getPhraseIds2( ) { return m_phraseIds2; };`
			`int64_t *getPhraseIds3( ) { return m_phraseIds3; };`
			`//int64_t *getPhraseIds4( ) { return m_phraseIds4; };`
			`//int64_t *getPhraseIds5( ) { return m_phraseIds5; };`

			`//int64_t *getStripPhraseIds ( ) { return m_stripPhraseIds ; };`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`//int64_t getStripPhraseId ( int32_t n )`
Initial file population. 2013-08-03 00:12:24 +04:00			`//{ return m_stripPhraseIds [n]; };`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t getPhraseSpam ( int32_t n ) { return m_phraseSpam[n]; };`
			`bool hasPhraseId ( int32_t n ) { return (m_phraseSpam[n]!=PSKIP);};`
			`bool startsAPhrase ( int32_t n ) { return (m_phraseSpam[n]!=PSKIP);};`
			`bool isInPhrase ( int32_t n ) ;`
Initial file population. 2013-08-03 00:12:24 +04:00			`// . often word #i is involved in 2 phrases`
			`// . m_phraseIds[i] only holds the one he starts`
			`// . this gets the one he's in the middle of or on the right of`
			`// . used by Query.cpp for phrase-forcing`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`//int64_t getLeftPhraseId ( int32_t i ) ;`
			`//int64_t getLeftStripPhraseId ( int32_t i ) ;`
			`//int32_t getLeftPhraseIndex ( int32_t i ) ;`
Initial file population. 2013-08-03 00:12:24 +04:00
			`// . each non-spammy occurence of phrase adds "baseScore" to it's score`
			`/*`
			`bool hash ( TermTable *table ,`
			`Weights *weightsPtr ,`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`uint32_t baseScore ,`
			`uint32_t maxScore ,`
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`int64_t startHash ,`
Initial file population. 2013-08-03 00:12:24 +04:00			`char *prefix1 ,`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t prefixLen1 ,`
Initial file population. 2013-08-03 00:12:24 +04:00			`char *prefix2 ,`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t prefixLen2 ,`
Initial file population. 2013-08-03 00:12:24 +04:00			`bool hashUniqueOnly ,`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t titleRecVersion,`
			`int32_t niceness = 0);`
Initial file population. 2013-08-03 00:12:24 +04:00			`*/`

			`// . store phrase that starts with word #i into "dest"`
			`// . we also NULL terminated it in "dest"`
			`// . return length`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`char getPhrase ( int32_t i , int32_t phrLen , int32_t npw );`
			`//char getNWordPhrase ( int32_t i , int32_t phrLen , int32_t npw ) ;`
			`//char getStripPhrase ( int32_t i , int32_t phrLen );`
Initial file population. 2013-08-03 00:12:24 +04:00
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`//int32_t getNumWords ( int32_t i ) { return m_numWordsTotal[i]; };`
			`//int32_t getNumWordsInPhrase ( int32_t i ) { return m_numWordsTotal [i]; };`
			`int32_t getNumWordsInPhrase2( int32_t i ) { return m_numWordsTotal2[i]; };`
Initial file population. 2013-08-03 00:12:24 +04:00
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t getMaxWordsInPhrase( int32_t i , int64_t *pid ) ;`
			`int32_t getMinWordsInPhrase( int32_t i , int64_t *pid ) ;`
Initial file population. 2013-08-03 00:12:24 +04:00
			`// . leave this public so SimpleQuery.cpp can mess with it`
			`// . called by Phrases::set() above for each i`
			`// . we set phraseSpam to 0 to 100% typically`
			`// . we set phraseSpam to PSKIP if word #i cannot start a phrase`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`void setPhrase ( int32_t i ,`
			`int32_t niceness);`
Initial file population. 2013-08-03 00:12:24 +04:00
			`// private:`

			`char m_localBuf [ PHRASE_BUF_SIZE ];`

			`char *m_buf;`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t m_bufSize;`
Initial file population. 2013-08-03 00:12:24 +04:00
			`// . these are 1-1 with the words in the Words class`
			`// . phraseSpam is PSKIP if the phraseId is invalid`
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`//int64_t *m_phraseIds ;`
Initial file population. 2013-08-03 00:12:24 +04:00			`// the two word hash`
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`int64_t *m_phraseIds2 ;`
			`int64_t *m_phraseIds3 ;`
			`//int64_t *m_phraseIds4 ;`
			`//int64_t *m_phraseIds5 ;`
			`//int64_t *m_stripPhraseIds ;`
Initial file population. 2013-08-03 00:12:24 +04:00			`unsigned char *m_phraseSpam ;`
			`// . # words in phrase TOTAL (including punct words)`
			`// . used for printing`
			`// . used by SimpleQuery::getTermIds() for setting word ranges`
			`// for phrases`
			`//unsigned char *m_numWordsTotal ;`
			`// for the two word phrases:`
			`unsigned char *m_numWordsTotal2 ;`
			`unsigned char *m_numWordsTotal3 ;`
			`//unsigned char *m_numWordsTotal4 ;`
			`//unsigned char *m_numWordsTotal5 ;`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t m_numPhrases; // should equal the # of words`
Initial file population. 2013-08-03 00:12:24 +04:00
			`// placeholders to avoid passing to subroutine`
			`Words *m_words;`
replace long long with int64_t 2014-10-30 22:36:39 +03:00			`int64_t *m_wids;`
Initial file population. 2013-08-03 00:12:24 +04:00			`char **m_wptrs;`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t *m_wlens;`
Initial file population. 2013-08-03 00:12:24 +04:00
			`Bits *m_bits;`
			`bool m_useStems;`
			`bool m_useStopWords;`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`int32_t m_titleRecVersion;`
Initial file population. 2013-08-03 00:12:24 +04:00
			`// replaces Scores`
			`//class Sections *m_sections;`
			`//class Section *m_sectionPtrs;`

			`// word scores, set in Scores.cpp`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`//int32_t *m_wordScores;`
Initial file population. 2013-08-03 00:12:24 +04:00			`// the score of the phrase is the min of the scores of the words that`
			`// make up the phrase`
now it compiles with -m32 2014-11-11 01:45:11 +03:00			`//int32_t *m_phraseScores ;`
Initial file population. 2013-08-03 00:12:24 +04:00			`};`

			`#endif`