open-source-search-engine/PageParser.h

111 lines
2.3 KiB
C
Raw Normal View History

2013-08-03 00:12:24 +04:00
// Include guard for this header; the matching #endif follows the State8
// class definition.
#ifndef _PAGEPARSER_H_
#define _PAGEPARSER_H_
// global flags (declared here, defined in some other translation unit)
// NOTE(review): presumably true while the parser / inject page handlers are
// running -- confirm where these are written.
extern bool g_inPageParser ;
extern bool g_inPageInject ;
// NOTE(review): looks like the niceness (priority) value used by the page
// parser code -- confirm against the callers that pass it.
#define PP_NICENESS 2
// Project headers. The declarations below use types such as XmlDoc, TopTree,
// Msg22, SafeBuf, Query, TcpSocket and HttpRequest, which are expected to be
// provided by these includes (directly or transitively).
#include "XmlDoc.h"
#include "Pages.h"
#include "Unicode.h"
#include "Title.h"
#include "Pos.h"
#include "TopTree.h"
// Declared here, defined outside this header. Takes the client socket "s"
// and the HTTP request "r".
// NOTE(review): the meaning of the bool return value is not visible from
// this header -- confirm against the definition before relying on it.
bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) ;
bool sendPageParser2 ( TcpSocket *s ,
HttpRequest *r ,
class State8 *st ,
2014-10-30 22:36:39 +03:00
int64_t docId ,
2013-08-03 00:12:24 +04:00
Query *q ,
2014-10-30 22:36:39 +03:00
int64_t *termFreqs ,
2013-08-03 00:12:24 +04:00
float *termFreqWeights ,
float *affWeights ,
void *state ,
void (* callback)(void *state) ) ;
class State8 {
public:
TopTree m_topTree;
//Msg16 m_msg16;
//Msg14 m_msg14;
//Msg15 m_msg15;
Msg22 m_msg22;
SafeBuf m_dbuf;
//XmlDoc m_doc;
//Url m_url;
//Url m_rootUrl;
char *m_u;
2014-11-11 01:45:11 +03:00
int32_t m_ulen;
2013-08-03 00:12:24 +04:00
bool m_applyRulesetToRoot;
char m_rootQuality;
2014-11-11 01:45:11 +03:00
int32_t m_reparseRootRetries;
2013-08-03 00:12:24 +04:00
char m_coll[MAX_COLL_LEN];
2014-11-11 01:45:11 +03:00
int32_t m_collLen;
//int32_t m_sfn;
//int32_t m_urlLen;
2013-08-03 00:12:24 +04:00
TcpSocket *m_s;
bool m_isLocal;
char m_pwd[32];
HttpRequest m_r;
2014-11-11 01:45:11 +03:00
int32_t m_old;
2013-08-03 00:12:24 +04:00
// recyle the link info from the title rec?
2014-11-11 01:45:11 +03:00
int32_t m_recycle;
2013-08-03 00:12:24 +04:00
// recycle the link info that was imported from another coll?
2014-11-11 01:45:11 +03:00
int32_t m_recycle2;
int32_t m_render;
2013-08-03 00:12:24 +04:00
char m_recompute;
2014-11-11 01:45:11 +03:00
int32_t m_oips;
2013-08-03 00:12:24 +04:00
char m_linkInfoColl[11];
// char m_buf[16384 * 1024];
2014-11-11 01:45:11 +03:00
//int32_t m_page;
2013-08-03 00:12:24 +04:00
// m_pbuf now points to m_sbuf if we are showing the parsing junk
SafeBuf m_xbuf;
SafeBuf m_wbuf;
bool m_donePrinting;
//SafeBuf m_sbuf;
// this is a buffer which cats m_sbuf into it
//SafeBuf m_sbuf2;
// new state vars for Msg3b.cpp
2014-10-30 22:36:39 +03:00
int64_t m_docId;
2013-08-03 00:12:24 +04:00
void *m_state ;
void (* m_callback) (void *state);
Query m_tq;
Query *m_q;
2014-10-30 22:36:39 +03:00
int64_t *m_termFreqs;
2013-08-03 00:12:24 +04:00
float *m_termFreqWeights;
float *m_affWeights;
2014-03-14 00:09:33 +04:00
//score_t m_total;
2013-08-03 00:12:24 +04:00
bool m_freeIt;
bool m_blocked;
// these are from rearranging the code
2014-11-11 01:45:11 +03:00
int32_t m_indexCode;
//uint64_t m_chksum1;
2014-10-30 22:36:39 +03:00
int64_t m_took1;
int64_t m_took1b;
int64_t m_took2;
int64_t m_took3;
2013-08-03 00:12:24 +04:00
char m_didRootDom;
char m_didRootWWW;
char m_wasRootDom;
// call Msg16 with a versino of title rec to do
2014-11-11 01:45:11 +03:00
int32_t m_titleRecVersion;
2013-08-03 00:12:24 +04:00
char m_hopCount;
//TitleRec m_tr;
//XmlDoc m_oldDoc;
XmlDoc m_xd;
};
#endif