open-source-search-engine/PageGet.cpp
Matt 90456222b6 now we add the spider status docs as json documents.
so you can facet/sortby the various fields, etc.
2015-03-19 16:17:36 -06:00

966 lines
28 KiB
C++

#include "gb-include.h"
#include "SafeBuf.h"
#include "Collectiondb.h"
//#include "CollectionRec.h"
#include "Msg22.h"
#include "Query.h"
#include "HttpServer.h"
#include "Highlight.h"
#include "Pages.h"
#include "PageNetTest.h"
#include "Tagdb.h"
#include "XmlDoc.h"
#include "PageResults.h" // printEventAddress()...
// TODO: redirect to host that has the titleRec locally
static bool sendErrorReply ( void *state , int32_t err ) ;
static void processLoopWrapper ( void *state ) ;
static bool processLoop ( void *state ) ;
// . per-request state for the /get (cached page) handler
// . allocated with mnew() in sendPageGet() and destroyed with mdelete()
//   in sendErrorReply() or processLoop() once the HTTP reply is sent
class State2 {
public:
	// title-rec lookup helper (declared here; its use is not visible in
	// this part of the file -- NOTE(review): possibly vestigial)
	Msg22 m_msg22;
	// reply format (FORMAT_XML / FORMAT_JSON / html), set from
	// HttpRequest::getReplyFormat() in sendPageGet()
	char m_format;
	//TitleRec m_tr;
	// scheduling niceness; sendPageGet() always sets this to 0
	int32_t m_niceness;
	// the document being loaded from its title rec and rendered
	XmlDoc m_xd;
	// NOTE(review): m_tr/m_trSize are never read or written in the
	// visible code -- likely left over from an older title-rec path
	char *m_tr;
	int32_t m_trSize;
	// query language id from the "qlang" cgi parm (langUnknown if absent)
	uint8_t m_langId;
	//Msg8a m_msg8a;
	//SiteRec m_sr;
	//TagRec m_tagRec;
	// client socket the reply is sent on
	TcpSocket *m_socket;
	// private copy of the incoming request (copied via m_r.copy())
	HttpRequest m_r;
	// NUL-terminated collection name (+2 for NUL and slack)
	char m_coll[MAX_COLL_LEN+2];
	//CollectionRec *m_cr;
	// did g_conf.isCollAdmin() approve this request?
	bool m_isMasterAdmin;
	// was the request from a local host? (HttpRequest::isLocal())
	bool m_isLocal;
	//bool m_seq;
	// "rtq" cgi parm: restrict to root file?
	bool m_rtq;
	// NUL-terminated search query ("q" cgi parm) for term highlighting
	char m_q[MAX_QUERY_LEN+1];
	int32_t m_qlen;
	// "bq" cgi parm; default 2
	char m_boolFlag;
	// set true once the special "page=N" (rainbow sections) reply
	// has been generated, so processLoop() does not repeat the call
	bool m_printed;
	// docid of the page being fetched ("d" cgi parm; 0 if url-based)
	int64_t m_docId;
	// "ih" cgi parm: include the "cached by Gigablast on..." header?
	bool m_includeHeader;
	// "ibh" cgi parm: emit <base href> even when the header is off
	bool m_includeBaseHref;
	// "qh" cgi parm: highlight query terms in the page body?
	bool m_queryHighlighting;
	// "strip" cgi parm: 0=none, 1=stripHtml(), 2=raw/xml mode
	int32_t m_strip;
	// "cas" cgi parm: click-and-scroll links in the highlighter
	bool m_clickAndScroll;
	bool m_clickNScroll; // new click 'n' scroll
	bool m_cnsPage; // Are we in the click 'n' scroll page?
	// show the top disclaimer table? cleared for cns pages, stripped
	// pages, json/xml docs and xml/json reply formats
	bool m_printDisclaimer;
	// "rnettest" cgi parm (the net-test reply path is commented out)
	bool m_netTestResults;
	// NOTE(review): m_isBanned/m_noArchive are initialized to false and
	// only ever read for display -- the fresh noarchive check actually
	// uses xd->getIsNoArchive() in processLoop()
	bool m_isBanned;
	bool m_noArchive;
	// accumulates the outgoing HTTP reply body
	SafeBuf m_sb;
};
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . entry point for /get requests: fetch a cached page (title rec) by
//   docid ("d=") or url ("u=") and render it via processLoop()
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
	// get the collection, falling back to the host's default collection
	int32_t collLen = 0;
	char *coll = r->getString("c",&collLen);
	if ( ! coll || ! coll[0] ) {
		coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
		collLen = gbstrlen(coll);
	}
	// ensure collection not too big
	if ( collLen >= MAX_COLL_LEN ) {
		g_errno = ECOLLTOOBIG;
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// get the collection rec
	CollectionRec *cr = g_collectiondb.getRec ( coll );
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		log("query: Archived copy retrieval failed. "
		    "No collection record found for "
		    "collection \"%s\".",coll);
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// does this collection ban this IP?
	if ( ! cr->hasSearchPermission ( s ) ) {
		g_errno = ENOPERM;
		g_msg = " (error: permission denied)";
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// . get fields from cgi field of the requested url
	// . get the search query (used for term highlighting)
	int32_t qlen = 0;
	char *q = r->getString ( "q" , &qlen , NULL /*default*/);
	// ensure query not too big
	if ( qlen >= MAX_QUERY_LEN-1 ) {
		g_errno = EQUERYTOOBIG;
		return g_httpServer.sendErrorReply (s,500,mstrerror(g_errno));
	}
	// the docId
	int64_t docId = r->getLongLong ( "d" , 0LL /*default*/ );
	// get url -- we need at least one of docid/url to find the doc
	char *url = r->getString ( "u" , NULL );
	if ( docId == 0 && ! url ) {
		g_errno = EMISSINGINPUT;
		return g_httpServer.sendErrorReply (s,500,mstrerror(g_errno));
	}
	// restrict to root file?
	bool rtq = r->getLong ( "rtq" , false );
	// . get the titleRec
	// . TODO: redirect client to a better http server to save bandwidth
	State2 *st ;
	try { st = new (State2); }
	catch (... ) {
		g_errno = ENOMEM;
		log("PageGet: new(%i): %s",
		    (int)sizeof(State2),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	mnew ( st , sizeof(State2) , "PageGet1" );
	// save the socket and if Host: is local in the Http request Mime
	st->m_socket        = s;
	st->m_isMasterAdmin = g_conf.isCollAdmin ( s , r );
	st->m_isLocal       = r->isLocal();
	st->m_docId         = docId;
	st->m_printed       = false;
	// include header ... "this page cached by Gigablast on..."
	st->m_includeHeader     = r->getLong ("ih"    , true  );
	st->m_includeBaseHref   = r->getLong ("ibh"   , false );
	st->m_queryHighlighting = r->getLong ("qh"    , true  );
	st->m_strip             = r->getLong ("strip" , 0     );
	st->m_clickAndScroll    = r->getLong ("cas"   , true  );
	st->m_cnsPage           = r->getLong ("cnsp"  , true  );
	char *langAbbr = r->getString("qlang",NULL);
	st->m_langId = langUnknown;
	if ( langAbbr )
		st->m_langId = getLangIdFromAbbr ( langAbbr );
	// collLen < MAX_COLL_LEN was verified above, so this always
	// NUL-terminates within the buffer
	strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 );
	st->m_netTestResults = r->getLong ("rnettest", false );
	// store query for query highlighting
	if ( q && qlen > 0 ) strcpy ( st->m_q , q );
	else                 st->m_q[0] = '\0';
	st->m_qlen     = qlen;
	st->m_rtq      = rtq;
	st->m_boolFlag = r->getLong ("bq", 2 /*default is 2*/ );
	st->m_isBanned  = false;
	st->m_noArchive = false;
	st->m_format    = r->getReplyFormat();
	// default to 0 niceness
	st->m_niceness  = 0;
	st->m_r.copy ( r );
	// no disclaimer on click'n'scroll pages or stripped pages
	st->m_printDisclaimer = true;
	if ( st->m_cnsPage ) st->m_printDisclaimer = false;
	if ( st->m_strip   ) st->m_printDisclaimer = false;
	// NOTE: the old usecache/rcache/wcache/cacheAge cgi parms were
	// parsed here but never used; they have been removed
	// . fetch the TitleRec
	XmlDoc *xd = &st->m_xd;
	// url based?
	if ( url ) {
		// bound-check before copying into the fixed-size request
		// buffer; the old code did an unchecked strcpy() here which
		// could overflow sreq.m_url on an oversized "u=" parameter
		if ( gbstrlen(url) >= MAX_URL_LEN ) {
			mdelete ( st , sizeof(State2) , "PageGet1" );
			delete ( st );
			g_errno = EBADURL;
			return g_httpServer.sendErrorReply(s,500,
							mstrerror(g_errno));
		}
		SpiderRequest sreq;
		sreq.reset();
		strcpy(sreq.m_url, url );
		sreq.setDataSize();
		// this returns false if "coll" is invalid
		if ( ! xd->set4 ( &sreq , NULL , coll , NULL ,
				  st->m_niceness ) )
			goto hadSetError;
	}
	// . when getTitleRec() is called it will load the old one
	//   since XmlDoc::m_setFromTitleRec will be true
	// . use st->m_coll since XmlDoc just points to it!
	// . this returns false if "coll" is invalid
	else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) {
	hadSetError:
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete ( st );
		// preserve the real error from set3()/set4(); the old code
		// clobbered it with ENOMEM unconditionally
		if ( ! g_errno ) g_errno = EBADENGINEER;
		log("PageGet: xmldoc set: %s", mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// if it blocks while it loads title rec, it will re-call this routine
	xd->setCallback ( st , processLoopWrapper );
	// good to go!
	return processLoop ( st );
}
// returns true
// Send an HTTP error reply for "err" and destroy the request state.
// Always returns true (the request is finished, nothing is blocked).
bool sendErrorReply ( void *state , int32_t err ) {
	// a zero error code here is a caller bug; crash loudly
	if ( ! err ) { char *xx=NULL;*xx=0; }
	// get it
	State2 *st = (State2 *)state;
	// grab the client socket before we free the state
	TcpSocket *s = st->m_socket;
	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);
	// NOTE: the old code sprintf'd mstrerror(g_errno) into a dead 32KB
	// stack buffer that was never used (and inconsistently read g_errno
	// instead of "err"); that dead code has been removed
	// . now encapsulate it in html head/tail and send it off
	return g_httpServer.sendErrorReply ( s, err, mstrerror(err) );
}
// Adapter so processLoop() can be registered where a void(*)(void*)
// callback is required (XmlDoc::setCallback). The bool "blocked" result
// is meaningless in callback context and is deliberately discarded.
void processLoopWrapper ( void *state ) {
	(void) processLoop ( state );
}
// returns false if blocked, true otherwise
// . renders the cached page for the XmlDoc in "st" and sends the reply
// . returns false if blocked (waiting on disk/network), true otherwise
// . re-entered as a callback while the title rec, noarchive flag and
//   utf8 content load asynchronously; each asynchronous getter below
//   returns (void *)-1 to mean "blocked, will call us back"
// . frees "st" (via sendErrorReply() or directly) before returning true
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;
	if ( ! xd->m_loaded ) {
		// callback: re-enter this routine when the load completes
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}
	if ( g_errno ) return sendErrorReply ( st , g_errno );
	// now force it to load old title rec
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 )
		return sendErrorReply ( st , ENOTFOUND);
	// is the doc marked noarchive?
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden unless admin
	if ( ! st->m_isMasterAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );
	SafeBuf *sb = &st->m_sb;
	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}
	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML  ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";
	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}
	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	// wait if blocked
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );
	// get this host (used to build the click & scroll base url)
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %"INT32" is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}
	char *content    = xd->ptr_utf8Content;
	int32_t contentLen = xd->size_utf8Content - 1;
	// shortcut
	char strip = st->m_strip;
	// remember where the header starts so we can undo it below if the
	// caller asked to omit it
	int32_t startLen1 = sb->length();
	// we are always utf8
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
				"content=\"text/html;charset=utf8\">\n");
	// base href: prefer the redirect url if there was one
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
	}
	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
				"body{background-color:white;color:black;}\n"
				"</style>\n");
	}
	// no html header for machine formats or machine-readable docs
	if ( format == FORMAT_XML          ) sb->reset();
	if ( format == FORMAT_JSON         ) sb->reset();
	if ( xd->m_contentType == CT_JSON   ) sb->reset();
	if ( xd->m_contentType == CT_XML    ) sb->reset();
	if ( xd->m_contentType == CT_STATUS ) sb->reset();
	// for undoing the stuff below
	int32_t startLen2 = sb->length();
	// query should be NULL terminated
	char *q = st->m_q;
	int32_t qlen = st->m_qlen;
	char styleTitle[128] = "font-size:14px;font-weight:600;"
			       "color:#000000;";
	char styleText[128]  = "font-size:14px;font-weight:400;"
			       "color:#000000;";
	char styleLink[128]  = "font-size:14px;font-weight:400;"
			       "color:#0000ff;";
	char styleTell[128]  = "font-size:14px;font-weight:600;"
			       "color:#cc0000;";
	// get the url of the title rec
	Url *f = xd->getFirstUrl();
	bool printDisclaimer = st->m_printDisclaimer;
	if ( xd->m_contentType == CT_JSON   ) printDisclaimer = false;
	if ( xd->m_contentType == CT_STATUS ) printDisclaimer = false;
	if ( format == FORMAT_XML  ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;
	// spider date formatted in GMT for the disclaimer / xml / json
	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;
	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}
	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	if ( printDisclaimer ) {
		sb->safePrintf(
			  "<table border=\"1\" bgcolor=\"#"
			  BGCOLOR
			  "\" cellpadding=\"10\" "
			  "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			  "<tr"
			  "><td>"
			  "<span style=\"%s\">"
			  "This is Gigablast's cached page of </span>"
			  "<a href=\"%s\" style=\"%s\">%s</a>"
			  "" , styleTitle, f->getUrl(), styleLink,
			  f->getUrl() );
		// then the rest
		sb->safePrintf(
			  "<span style=\"%s\">. "
			  "Gigablast is not responsible for the content of "
			  "this page.</span>", styleTitle );
		sb->safePrintf ( "<br/><span style=\"%s\">"
				 "Cached: </span>"
				 "<span style=\"%s\">",
				 styleTitle, styleText );
		// then the spider date in GMT
		sb->safeStrcpy(tbuf);
		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
				"/get?"
				"q=%s&amp;c=%s&amp;rtq=%"INT32"&amp;"
				"d=%"INT64"&amp;strip=1\""
				" style=\"%s\">"
				"[stripped]</a>",
				q , st->m_coll ,
				(int32_t)st->m_rtq,
				st->m_docId, styleLink );
		// a link to the wayback machine
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					"//web.archive.org/web/*/%s\""
					" style=\"%s\">"
					"[older copies]</a>" ,
					f->getUrl(), styleLink );
		}
		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
					"[NOARCHIVE]</b></span>",
					styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				       "[BANNED]</b></span>",
				       styleTell );
		}
		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				       "These search terms have been "
				       "highlighted: ",
				       styleText );
		}
	}
	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	uint32_t ip   = h->m_ip;
	uint16_t port = h->m_httpPort;
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else            sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query url encoded
	int32_t elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	sprintf ( x, "&d=%"INT64"",st->m_docId );
	x += gbstrlen(x);
	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );
	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q ,    // content being highlighted, utf8
		 qlen , // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true , // computeIds
		 false ); // hasHtmlEntities?
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	Matches m;
	m.setQuery ( &qq );
	m.addMatches ( &qw );
	int32_t hilen = 0;
	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( sb ,
				 &qw , // words to highlight
				 &m ,  // matches relative to qw
				 false , // doSteming
				 false , // st->m_clickAndScroll ,
				 (char *)thisUrl );// base url for ClcknScrll
		// now an hr
		sb->safeStrcpy("</span></table></table>\n");
	}
	bool includeHeader = st->m_includeHeader;
	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON   ) includeHeader = false;
	if ( xd->m_contentType == CT_XML    ) includeHeader = false;
	if ( xd->m_contentType == CT_STATUS ) includeHeader = false;
	if ( format == FORMAT_XML  ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;
	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;
		else                         sb->m_length=startLen1;
	}
	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%"UINT64"</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%"INT32"</cachedTimeUTC>\n",
			       (int32_t)lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}
	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%"UINT64",\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%"INT32",\n",
			       (int32_t)lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}
	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend   = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd   = NULL;
	char ctype = (char)xd->m_contentType;
	// do not calc title or print it if doc is xml or json
	if ( ctype == CT_XML    ) sbend = sbstart;
	if ( ctype == CT_JSON   ) sbend = sbstart;
	if ( ctype == CT_STATUS ) sbend = sbstart;
	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point past the opening tag (renamed from "x" which
		// shadowed the click&scroll url cursor above)
		char *tagp = t + 5;
		// max - to keep things fast
		char *maxp = tagp + 500;
		for ( ; *tagp && *tagp != '>' && tagp < maxp ; tagp++ );
		tagp++;
		// find end
		char *e = tagp;
		for ( ; *e && e < maxp ; e++ ) {
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		if ( e < maxp ) {
			titleStart = tagp;
			titleEnd   = e;
		}
		break;
	}
	// . print title at top!
	// . consider moving
	if ( titleStart ) {
		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";
		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );
		int32_t printLinks = st->m_r.getLong("links",0);
		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       "&nbsp; "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%"INT64"&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " &nbsp; "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );
		if ( printLinks ) {
			sb->safePrintf(
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       "&nbsp; "
				       "<b>PAGE TITLE:</b> "
				       );
			int32_t tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}
		sb->safePrintf( "</table><br>\n" );
	}
	// is the content preformatted?
	bool pre = false;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
	if ( ctype == CT_PS   ) pre = true ; // filtered postscript
	if ( format == FORMAT_XML  ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;
	// if it is content-type text, add a <pre>
	if ( pre ) {
		sb->safePrintf("<pre>");
	}
	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen,
					(int32_t)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, like OOM
	if ( contentLen == -1 ) {
		return sendErrorReply ( st , g_errno );
	}
	Xml xml;
	Words ww;
	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;
	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON   ) queryHighlighting = false;
	if ( xd->m_contentType == CT_STATUS ) queryHighlighting = false;
	// for xml/json replies the page body goes into "tmp" so it can be
	// escaped into the reply afterwards; for html it goes straight in
	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML  ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;
	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		xb->nullTerm();
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			return sendErrorReply ( st , g_errno );
		}
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			return sendErrorReply ( st , g_errno );
		}
		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb ,
				 &ww , &m ,
				 false /*doStemming?*/ ,
				 st->m_clickAndScroll ,
				 thisUrl /*base url for click & scroll*/);
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}
	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}
	if ( format == FORMAT_JSON ) {
		// NOTE: the old code printed a raw newline after the opening
		// quote of the "content" string, which is invalid JSON
		// (RFC 8259 forbids unescaped control chars in strings)
		sb->safePrintf("\t\"content\":\"");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}
	// if it is content-type text, add a </pre>
	if ( pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
	}
	// now encapsulate it in html head/tail and send it off
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	if ( xd->m_contentType == CT_JSON   ) contentType = "application/json";
	if ( xd->m_contentType == CT_STATUS ) contentType = "application/json";
	// (fixed typo: this used to say "test/xml")
	if ( xd->m_contentType == CT_XML    ) contentType = "text/xml";
	if ( format == FORMAT_XML  ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";
	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						    -1, NULL, "utf8" );
	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);
	// and convey the status
	return status;
}