
6651 lines
182 KiB
Raw Normal View History

2013-08-03 00:12:24 +04:00
#include "gb-include.h"
#include "Collectiondb.h"
//#include "CollectionRec.h"
2013-08-03 00:12:24 +04:00
#include "Stats.h"
#include "Statsdb.h"
#include "Ads.h"
#include "Query.h"
#include "Speller.h"
#include "Msg40.h"
#include "Pages.h"
#include "Highlight.h"
#include "SearchInput.h"
#include <math.h>
#include "SafeBuf.h"
#include "iana_charset.h"
#include "Pos.h"
#include "Bits.h"
#include "AutoBan.h"
#include "sort.h"
#include "LanguageIdentifier.h"
#include "LanguagePages.h"
#include "LangList.h"
#include "CountryCode.h"
#include "Unicode.h"
#include "XmlDoc.h" // GigabitInfo class
#include "Posdb.h" // MAX_TOP definition
#include "PageResults.h"
#include "Proxy.h"
//static void gotSpellingWrapper ( void *state ) ;
static void gotResultsWrapper ( void *state ) ;
//static void gotAdsWrapper ( void *state ) ;
2013-08-03 00:12:24 +04:00
static void gotState ( void *state ) ;
static bool gotResults ( void *state ) ;
bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) ;
bool printJsonItemInCSV ( char *json , SafeBuf *sb , class State0 *st ) ;
2014-02-05 05:05:43 +04:00
bool printPairScore ( SafeBuf *sb , SearchInput *si , PairScore *ps ,
2013-08-03 00:12:24 +04:00
Msg20Reply *mr , Msg40 *msg40 , bool first ) ;
2014-02-05 05:05:43 +04:00
bool printScoresHeader ( SafeBuf *sb ) ;
bool printSingleScore ( SafeBuf *sb , SearchInput *si , SingleScore *ss ,
2013-08-03 00:12:24 +04:00
Msg20Reply *mr , Msg40 *msg40 ) ;
bool sendReply ( State0 *st , char *reply ) {
long savedErr = g_errno;
TcpSocket *s = st->m_socket;
if ( ! s ) { char *xx=NULL;*xx=0; }
SearchInput *si = &st->m_si;
char *ct = "text/html";
if ( si && si->m_format == FORMAT_XML ) ct = "text/xml";
if ( si && si->m_format == FORMAT_JSON ) ct = "application/json";
if ( si && si->m_format == FORMAT_CSV ) ct = "text/csv";
2013-08-03 00:12:24 +04:00
char *charset = "utf-8";
2013-12-09 02:36:23 +04:00
char format = si->m_format;
2013-08-03 00:12:24 +04:00
// . filter anything < 0x20 to 0x20 to keep XML legal
// . except \t, \n and \r, they're ok
// . gotta set "f" down here in case it realloc'd the buf
2013-12-09 02:36:23 +04:00
if ( format == FORMAT_XML && reply ) {
2013-08-03 00:12:24 +04:00
unsigned char *f = (unsigned char *)reply;
for ( ; *f ; f++ )
if ( *f < 0x20 && *f!='\t' && *f!='\n' && *f!='\r' )
*f = 0x20;
long rlen = 0;
if ( reply ) rlen = gbstrlen(reply);
logf(LOG_DEBUG,"gb: sending back %li bytes",rlen);
// . use light brown if coming directly from an end user
// . use darker brown if xml feed
long color = 0x00b58869;
if ( si->m_format != FORMAT_HTML )color = 0x00753d30 ;
2013-08-03 00:12:24 +04:00
long long nowms = gettimeofdayInMilliseconds();
long long took = nowms - st->m_startTime ;
g_stats.addStat_r ( took ,
st->m_startTime ,
color ,
// add to statsdb, use # of qterms as the value/qty
g_statsdb.addStat ( 0,
2013-08-03 00:12:24 +04:00
// . log the time
// . do not do this if g_errno is set lest m_sbuf1 be bogus b/c
// it failed to allocate its buf to hold terminating \0 in
// SearchInput::setQueryBuffers()
if ( ! g_errno && st->m_took >= g_conf.m_logQueryTimeThreshold ) {
2013-08-03 00:12:24 +04:00
logf(LOG_TIMING,"query: Took %lli ms for %s. results=%li",
//bool xml = si->m_xml;
2013-08-03 00:12:24 +04:00
if ( ! savedErr ) { // g_errno ) {
// . one hour cache time... no 1000 hours, basically infinite
// . no because if we redo the query the results are cached
long cacheTime = 3600;//*1000;
// no... do not use cache
cacheTime = -1;
2013-08-03 00:12:24 +04:00
// the "Check it" link on add url uses &usecache=0 to tell
// the browser not to use its cache...
//if ( hr->getLong("usecache",-1) == 0 ) cacheTime = 0;
// send back the actual search results
// don't let the ajax re-gen
// if they hit the back button!
// so make this 1 hour, not 0
cacheTime, // cachetime in secs
false, // POSTReply?
-1, // httpstatus -1 -> 200
NULL, // cookieptr
charset );
2014-02-06 02:56:22 +04:00
// free st after sending reply since "st->m_sb" = "reply"
mdelete(st, sizeof(State0), "PageResults2");
delete st;
2013-08-03 00:12:24 +04:00
return true;
// error otherwise
if ( savedErr != ENOPERM )
2014-02-06 02:56:22 +04:00
mdelete(st, sizeof(State0), "PageResults2");
delete st;
2013-12-09 02:36:23 +04:00
if ( format == FORMAT_XML ) {
2013-08-03 00:12:24 +04:00
SafeBuf sb;
sb.safePrintf("<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
// clear it for sending back
g_errno = 0;
// send back as normal reply
0, // cachetime in secs
false, // POSTReply?
-1, // httpstatus -1 -> 200
NULL, // cookieptr
charset );
return true;
long status = 500;
if (savedErr == ETOOMANYOPERANDS ||
savedErr == EBADREQUEST ||
savedErr == ENOPERM ||
savedErr == ENOCOLLREC)
status = 400;
2013-12-09 02:36:23 +04:00
2013-08-03 00:12:24 +04:00
"There was an error!");
return true;
2014-04-09 06:34:43 +04:00
bool printCSSHead ( SafeBuf *sb , char format ) {
2013-08-03 00:12:24 +04:00
"4.01 Transitional//EN\">\n"
//"<meta http-equiv=\"Content-Type\" "
//"content=\"text/html; charset=utf-8\">\n"
"<title>Gigablast Search Results</title>\n"
"body {"
"font-family:Arial, Helvetica, sans-serif;"
2014-04-09 06:34:43 +04:00
sb->safePrintf( "color: #000000;"
2013-08-03 00:12:24 +04:00
"font-size: 12px;"
//"margin: 20px 5px;"
"a:link {color:#00c}"
"a:visited {color:#551a8b}"
"a:active {color:#f00}"
".bold {font-weight: bold;}"
".bluetable {background:#d1e1ff;"
".url {color:#008000;}"
".cached, .cached a {font-size: 10px;"
"color: #666666;"
"table {"
"font-family:Arial, Helvetica, sans-serif;"
"color: #000000;"
"font-size: 12px;"
".directory {font-size: 16px;}"
2014-04-09 06:34:43 +04:00
return true;
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . "msg" will be inserted into the access log for this request
bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
// . check for sdirt=4, this is a site search on the given directory id
// . need to pre-query the directory first to get the sites to search
// this will likely have just been cached so it should be quick
// . then need to construct a site search query
//long rawFormat = hr->getLong("xml", 0); // was "raw"
//long xml = hr->getLong("xml",0);
// what format should search results be in? default is html
char format = getFormatFromRequest ( hr );
// get the dmoz catid if given
//long searchingDmoz = hr->getLong("dmoz",0);
// DO WE NEED TO ALTER cr->m_siteListBuf for a widget?
// when a wordpress user changes the "Websites to Include" for
// her widget, it should send a /search?
// request here...
// so we need to remove her old sites and add in her new ones.
MDW TURN BACK ON IN A DAY. do indexing or err pages first.
// get wordpressid supplied with all widget requests
char *wpid = hr->getString("wpid");
// we have to set &spidersites=1 which all widgets should do
if ( wpid ) {
// this returns NULL if cr->m_siteListBuf would be unchanged
// because we already have the whiteListBuf sites in there
// for this wordPressId (wpid)
SafeBuf newSiteListBuf;
makeNewSiteList( &si->m_whiteListBuf,
cr->m_siteListBuf ,
wpid ,
// . update the list of sites to crawl/search & show in widget
// . if they give an empty list then allow that, stops crawling
SafeBuf parmList;
g_parms.addNewParmToList1 ( &parmList,
// send the parms to all hosts in the network
g_parms.broadcastParmList ( &parmList ,
NULL,//s,// state is socket i guess
NULL);//doneBroadcastingParms2 );
// nothing left to do now
return g_httpServer.sendDynamicPage(s,
false, // POST?
200, // httpstatus
NULL, // cookie
"UTF-8"); // charset
// . send back page frame with the ajax call to get the real
// search results. do not do this if a "&dir=" (dmoz category)
// is given.
// . if not matt wells we do not do ajax
// . the ajax is just there to prevent bots from slamming me
// with queries.
if ( hr->getLong("id",0) == 0 &&
format == FORMAT_HTML &&
g_conf.m_isMattWells ) {
SafeBuf sb;
2014-04-09 06:34:43 +04:00
printCSSHead ( &sb ,format );
2013-08-03 00:12:24 +04:00
"<body "
"var client = new XMLHttpRequest();\n"
"client.onreadystatechange = handler;\n"
//"var url='"
"var url='/search?q="
long qlen;
char *qstr = hr->getString("q",&qlen,"",NULL);
// . crap! also gotta encode apostrophe since "var url='..."
// . true = encodeApostrophes?
2013-10-03 08:34:21 +04:00
sb.urlEncode2 ( qstr , true );
// propagate query language
char *qlang = hr->getString("qlang",NULL,NULL);
if ( qlang ) sb.safePrintf("&qlang=%s",qlang);
2013-08-03 00:12:24 +04:00
// propagate "admin" if set
long admin = hr->getLong("admin",-1);
if ( admin != -1 ) sb.safePrintf("&admin=%li",admin);
// propagate showing of banned results
if ( hr->getLong("sb",0) ) sb.safePrintf("&sb=1");
// propagate list of sites to restrict query to
long sitesLen;
char *sites = hr->getString("sites",&sitesLen,NULL);
if ( sites ) {
2013-10-03 08:34:21 +04:00
sb.urlEncode2 ( sites,true);
// propagate "prepend"
char *prepend = hr->getString("prepend",NULL);
if ( prepend ) {
2013-08-03 00:12:24 +04:00
// propagate "debug" if set
long debug = hr->getLong("debug",0);
if ( debug ) sb.safePrintf("&debug=%li",debug);
// propagate "s"
long ss = hr->getLong("s",-1);
if ( ss > 0 ) sb.safePrintf("&s=%li",ss);
// propagate "n"
long n = hr->getLong("n",-1);
if ( n >= 0 ) sb.safePrintf("&n=%li",n);
// Docs to Scan for Related Topics
long dsrt = hr->getLong("dsrt",-1);
if ( dsrt >= 0 ) sb.safePrintf("&dsrt=%li",dsrt);
// debug gigabits?
long dg = hr->getLong("dg",-1);
if ( dg >= 0 ) sb.safePrintf("&dg=%li",dg);
// show gigabits?
2013-12-02 02:07:06 +04:00
long gb = hr->getLong("gigabits",1);
2013-08-03 00:12:24 +04:00
if ( gb >= 1 ) sb.safePrintf("&gigabits=%li",gb);
2013-09-17 00:59:11 +04:00
// show banned results?
long showBanned = hr->getLong("sb",0);
if ( showBanned ) sb.safePrintf("&sb=1");
// propagate collection
long clen;
char *coll = hr->getString("c",&clen,"",NULL);
if ( coll ) sb.safePrintf("&c=%s",coll);
2013-09-02 04:28:49 +04:00
// forward the "ff" family filter as well
long ff = hr->getLong("ff",0);
if ( ff ) sb.safePrintf("&ff=%li",ff);
2013-08-03 00:12:24 +04:00
// provide hash of the query so clients can't just pass in
// a bogus id to get search results from us
unsigned long h32 = hash32n(qstr);
if ( h32 == 0 ) h32 = 1;
// add this timestamp so when we hit back button this
// parent page will be cached and so will this ajax url.
// but if they hit reload the parent page reloads with a
// different ajax url because "rand" is different
unsigned long long rand64 = gettimeofdayInMillisecondsLocal();
"'GET', url );\n"
, h32
, rand64
// . login bar
// . proxy will replace it byte by byte with a login/logout
// link etc.
// logo header
printLogoAndSearchBox ( &sb , hr , -1 ); // catId = -1
2013-08-03 00:12:24 +04:00
// script to populate search results
sb.safePrintf("<script type=\"text/javascript\">\n"
"function handler() {\n"
"if(this.readyState == 4 ) {\n"
// gigabit unhide function
"function ccc ( gn ) {\n"
"var e = document.getElementById('fd'+gn);\n"
"var f = document.getElementById('sd'+gn);\n"
"if ( == 'none' ){\n"
" = '';\n"
" = 'none';\n"
"else {\n"
" = 'none';\n"
" = '';\n"
// put search results into this div
"<div id=results>"
"<img height=50 width=50 "
"Waiting for results... "
"Please be a little "
"patient I am trying to get more servers."
"<font color=gray>"
2014-04-09 06:34:43 +04:00
"Copyright &copy; 2014. "
"All Rights Reserved.<br/>"
"Powered by the "
2014-05-11 23:04:10 +04:00
"<a href=\"\">"
2014-04-09 06:34:43 +04:00
"GigaBlast</a> open source search engine."
2013-08-03 00:12:24 +04:00
// one hour cache time... no 1000 hours, basically infinite
long cacheTime = 3600; // *1000;
//if ( hr->getLong("usecache",-1) == 0 ) cacheTime = 0;
// send back the parent stub containing the ajax
return g_httpServer.sendDynamicPage(s,
false, // POST?
200, // httpstatus
NULL, // cookie
"UTF-8"); // charset
// make a new state
State0 *st;
try { st = new (State0); }
catch ( ... ) {
g_errno = ENOMEM;
log("query: Query failed. "
"Could not allocate %li bytes for query. "
"Returning HTTP status of 500.",(long)sizeof(State0));
return g_httpServer.sendQueryErrorReply
format, g_errno, "Query failed. "
2013-08-03 00:12:24 +04:00
"Could not allocate memory to execute a search. "
"Please try later." );
mnew ( st , sizeof(State0) , "PageResults2" );
2014-01-28 21:46:58 +04:00
// init some stuff
st->m_didRedownload = false;
st->m_xd = NULL;
st->m_oldContentHash32 = 0;
2013-08-03 00:12:24 +04:00
// copy the http request into our state
if ( ! st->m_hr.copy ( hr ) )
return sendReply ( st , NULL );
2013-08-03 00:12:24 +04:00
// set this in case SearchInput::set fails!
st->m_socket = s;
// save this count so we know if TcpServer.cpp calls destroySocket(s)
st->m_numDestroys = s->m_numDestroys;
// you have to say "&header=1" to get back the header for json now.
// later on maybe it will default to on.
st->m_header = hr->getLong("header",0);
2013-08-03 00:12:24 +04:00
// . parse it up
// . this returns false and sets g_errno and, maybe, g_msg on error
SearchInput *si = &st->m_si;
if ( ! si->set ( s ,
// si just copies the ptr into the httprequest
// into stuff like SearchInput::m_defaultSortLanguage
// so do not use the "hr" on the stack. SearchInput::
// m_hr points to the hr we pass into
// SearchInput::set
&st->m_hr ) ) {
//&st->m_q ) ) {
log("query: set search input: %s",mstrerror(g_errno));
2014-04-09 06:34:43 +04:00
if ( ! g_errno ) g_errno = EBADENGINEER;
2013-08-03 00:12:24 +04:00
return sendReply ( st, NULL );
2013-08-03 00:12:24 +04:00
long codeLen = 0;
char *code = hr->getString("code", &codeLen, NULL);
// allow up to 1000 results per query for paying clients
CollectionRec *cr = si->m_cr;
2014-02-04 07:17:58 +04:00
// save collnum now
if ( cr ) st->m_collnum = cr->m_collnum;
else st->m_collnum = -1;
2013-08-03 00:12:24 +04:00
// take this out here as well!
2013-08-03 00:12:24 +04:00
// limit here
// long maxpp = cr->m_maxSearchResultsPerQuery ;
// if ( si->m_docsWanted > maxpp &&
// // disable serp max per page for custom crawls
// ! cr->m_isCustomCrawl )
// si->m_docsWanted = maxpp;
2013-08-03 00:12:24 +04:00
st->m_numDocIds = si->m_docsWanted;
// watch out for cowboys
//if(si->m_firstResultNum>=si->m_maxResults) return sendReply(st,NULL);
2013-08-03 00:12:24 +04:00
// save state in TcpSocket's m_tmp ptr for debugging. in case
// we lose our string of control and Msg40::getResults() never
// comes back.
s->m_tmp = (char *)st;
// add query stat
st->m_startTime = gettimeofdayInMilliseconds();
// reset
st->m_errno = 0;
// debug msg
log ( LOG_DEBUG , "query: Getting search results for q=%s",
// assume we'll block
st->m_gotResults = false;
st->m_gotAds = false;
st->m_gotSpell = false;
// reset
st->m_printedHeaderRow = false;
2013-08-03 00:12:24 +04:00
long ip = s->m_ip;
long uipLen;
char *uip = hr->getString("uip", &uipLen, NULL);
char testBufSpace[2048];
SafeBuf testBuf(testBufSpace, 1024);
if( g_conf.m_doAutoBan &&
code, codeLen,
uip, uipLen,
false)) { // just check? no incrementing counts
if ( uip )
log("results: returning EBUYFEED for uip=%s",uip);
g_errno = EBUYFEED;
return sendReply(st,NULL);
// . now get the ad space for this query
// . don't get ads if we're not on the first page of results
// . query must be NULL terminated
st->m_gotAds = true;
2013-08-03 00:12:24 +04:00
if (si->m_adFeedEnabled && ! si->m_xml && si->m_docsWanted > 0) {
long pageNum = (si->m_firstResultNum/si->m_docsWanted) + 1;
st->m_gotAds = st->m_ads.
getAds(si->m_displayQuery , //query
si->m_displayQueryLen , //q len
pageNum , //page num
si->m_queryIP ,
si->m_coll2 , //coll
st , //state
gotAdsWrapper );//clbk
2013-08-03 00:12:24 +04:00
// get our spelling correction if we should (spell checker)
st->m_gotSpell = true;
st->m_spell[0] = '\0';
if ( si->m_spellCheck &&
cr->m_spellCheck &&
g_conf.m_doSpellChecking ) {
st->m_gotSpell = g_speller.
getRecommendation( &st->m_q, // Query
si->m_spellCheck, // spellcheck
st->m_spell, // Spell buffer
MAX_FRAG_SIZE, // spell buf size
false, // narrow search?
NULL,//st->m_narrow // narrow buf
MAX_FRAG_SIZE, // narrow buf size
NULL,// num of narrows ptr
st, // state
gotSpellingWrapper );// callback
// . get some results from it
// . this returns false if blocked, true otherwise
// . it also sets g_errno on error
// . use a niceness of 0 for all queries so they take precedence
// over the indexing process
// . this will copy our passed "query" and "coll" to its own buffer
// . we print out matching docIds to long if m_isDebug is true
// . no longer forward this, since proxy will take care of evenly
// distributing its msg 0xfd "forward" requests now
// save error
st->m_errno = g_errno;
// wait for ads and spellcheck and results?
if ( !st->m_gotAds || !st->m_gotSpell || !st->m_gotResults )
return false;
// otherwise call gotResults which returns false if blocked, true else
// and sets g_errno on error
bool status2 = gotResults ( st );
return status2;
2014-01-28 21:46:58 +04:00
// . callback that gotResults() registers on its XmlDoc via
//   xd->setCallback ( st , doneRedownloadingWrapper ) when it kicks off
//   a redownload
// . if the returned json result is older than the "maxagebeforeredownload"
//   cgi parm then we redownload the page and, if its content hash has
//   changed, we return empty results
// . "state" is the State0 that gotResults() handed to setCallback()
void doneRedownloadingWrapper ( void *state ) {
// cast our State0 class from this
State0 *st = (State0 *) state;
// resume gotResults(). it set st->m_didRedownload to true before
// registering this callback. the bool it returns is ignored here.
gotResults ( st );
2013-08-03 00:12:24 +04:00
// . callback passed to g_speller.getRecommendation() by sendPageResults()
// . "state" is the State0 for this query
// . logs and then clears any g_errno, because a failed spellcheck should
//   not fail the query, and marks the spelling stage as done
// . NOTE(review): unlike gotResultsWrapper() and gotAdsWrapper() no call
//   to gotState(st) is visible after setting m_gotSpell -- confirm
//   whether that line was lost from this view
void gotSpellingWrapper( void *state ){
// cast our State0 class from this
State0 *st = (State0 *) state;
// log the error first
if ( g_errno ) log("query: speller: %s.",mstrerror(g_errno));
// clear any error cuz spellchecks aren't needed
g_errno = 0;
// gotState() and sendPageResults() test this flag before proceeding
st->m_gotSpell = true;
// . callback for when the search results have arrived
// . presumably handed to Msg40::getResults() from sendPageResults(); that
//   call is not visible in this view -- TODO confirm
// . "state" is the State0 for this query
// . records g_errno into the state, flags the results as gotten and lets
//   gotState() decide what to do next
void gotResultsWrapper ( void *state ) {
// cast our State0 class from this
State0 *st = (State0 *) state;
// save error
st->m_errno = g_errno;
// mark as gotten
st->m_gotResults = true;
// hand off to gotState(), which checks the ads/spell/results flags
gotState (st);
2013-08-03 00:12:24 +04:00
// . callback passed to st->m_ads.getAds() by sendPageResults()
// . "state" is the State0 for this query
// . flags the ads as gotten, logs and then clears any g_errno because a
//   failed ad lookup should not fail the query, then calls gotState()
void gotAdsWrapper ( void *state ) {
// cast our State0 class from this
State0 *st = (State0 *) state;
// mark as gotten
st->m_gotAds = true;
// log the error first
if ( g_errno ) log("query: adclient: %s.",mstrerror(g_errno));
// clear any error cuz ads aren't needed
g_errno = 0;
// hand off to gotState(), which checks the ads/spell/results flags
gotState (st);;
2013-08-03 00:12:24 +04:00
// . common continuation called by gotResultsWrapper() and gotAdsWrapper()
// . "state" is the State0 for this query
// . looks at the m_gotAds, m_gotSpell and m_gotResults flags and calls
//   gotResults() to build the reply
// . NOTE(review): as shown here the "if" below directly guards the
//   gotResults() call, i.e. it would fire while something is still
//   pending. sendPageResults() follows the same test with "return false;"
//   so an early "return;" was most likely lost from this view -- confirm
//   against the real file before relying on this block
void gotState ( void *state ){
// cast our State0 class from this
State0 *st = (State0 *) state;
// true while ads, spelling or results is still outstanding
if ( !st->m_gotAds || !st->m_gotSpell || !st->m_gotResults )
// we're ready to go
gotResults ( state );
// print all sentences containing this gigabit
static bool printGigabit ( State0 *st,
2014-02-05 05:05:43 +04:00
SafeBuf *sb ,
2013-08-03 00:12:24 +04:00
Msg40 *msg40 ,
Gigabit *gi ,
SearchInput *si ) {
//static long s_gigabitCount = 0;
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
//"<img src="
HttpRequest *hr = &st->m_hr;
// make a new query
2014-02-05 05:05:43 +04:00
sb->safePrintf("<a href=\"/search?gigabits=1&q=");
2013-08-03 00:12:24 +04:00
char *q = hr->getString("q",NULL,"");
2014-02-05 05:05:43 +04:00
sb->safePrintf(" <font color=gray size=-1>");
//long numOff = sb->m_length;
2013-08-03 00:12:24 +04:00
// now the # of pages not nuggets
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
if ( si->m_isAdmin )
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// that's it for the gigabit
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
return true;
// . state for the "addurl" handling in gotResults()
// . gotResults() allocates one with new/mnew and freeMsg4Wrapper() frees it
// . NOTE(review): no access specifier or closing "};" is visible in this
//   view; gotResults() reads both members directly
class StateAU {
// meta list gotResults() fills with RDB_SPIDERDB followed by the
// SpiderRequest record
SafeBuf m_metaListBuf;
// used by gotResults() to add the meta list via addMetaList()
Msg4 m_msg4;
// . frees the StateAU that gotResults() allocated for an "addurl" request
// . passed to Msg4::addMetaList() as the callback, and also called
//   directly by gotResults()
// . "st" is the StateAU pointer
// . NOTE(review): the label "staud" passed to mdelete() differs from the
//   "stau" label gotResults() passes to mnew() -- confirm whether the
//   memory accounting cares about matching labels
void freeMsg4Wrapper( void *st ) {
StateAU *stau = (StateAU *)st;
// unregister with the mem table, then destroy
mdelete(stau, sizeof(StateAU), "staud");
delete stau;
2013-08-03 00:12:24 +04:00
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotResults ( void *state ) {
// cast our State0 class from this
State0 *st = (State0 *) state;
long long nowMS = gettimeofdayInMilliseconds();
// log the time
long long took = nowMS - st->m_startTime;
// record that
st->m_took = took;
// grab the query
Msg40 *msg40 = &(st->m_msg40);
2014-02-05 05:05:43 +04:00
//char *q = msg40->getQuery();
//long qlen = msg40->getQueryLen();
2014-02-05 05:05:43 +04:00
SearchInput *si = &st->m_si;
2014-02-07 06:21:22 +04:00
// if already printed from Msg40.cpp, bail out now
if ( si->m_streamResults ) {
log("msg40: done streaming. nuking state.");
mdelete(st, sizeof(State0), "PageResults2");
delete st;
return true;
// shortcuts
//char *coll = si->m_coll2;
//long collLen = si->m_collLen2;
collnum_t collnum = si->m_firstCollnum;
// collection rec must still be there since SearchInput references
// into it, and it must be the SAME ptr too!
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr || cr != si->m_cr ) {
g_errno = ENOCOLLREC;
return sendReply(st,NULL);
//char *coll = cr->m_coll;
2014-01-28 21:46:58 +04:00
// if caller wants a certain freshness we might have to redownload the
// parent url to get the new json
// get the first result
Msg20 *m20first = msg40->m_msg20[0];
long mabr = st->m_hr.getLong("maxagebeforeredownload",-1);
if ( mabr >= 0 &&
numResults > 0 &&
// only do this once
! st->m_didRedownload &&
// need at least one result
m20first &&
// get the last spidered time from the msg20 reply of that result
m20first->m_r->m_lastSpidered - now > mabr ) {
// make a new xmldoc to do the redownload
XmlDoc *xd;
try { xd = new (XmlDoc); }
catch ( ... ) {
g_errno = ENOMEM;
log("query: Failed to alloc xmldoc.");
if ( g_errno ) return sendReply (st,NULL);
mnew ( xd , sizeof(XmlDoc) , "mabrxd");
// save it
st->m_xd = xd;
// get this
st->m_oldContentHash32 = m20rep->m_contentHash32;
// do not re-do redownload
st->m_didRedownload = true;
// set it
xd->setCallback ( st , doneRedownloadingWrapper );
// get the checksum
if ( xd->getContentChecksum32Fast() == (void *)-1 )
// return false if it blocked
return false;
// error?
if ( g_errno ) return sendReply (st,NULL);
// how did this not block
log("page: redownload did not would block adding parent");
// if we did the redownload and checksum changed, return 0 results
if ( st->m_didRedownload ) {
// get the doc we downloaded
XmlDoc *xd = st->m_xd;
// get it
long newHash32 = xd->getContentHash32();
// log it
if ( newHash32 != st->m_oldContentHash32 )
// note it in logs for now
log("results: content changed for %s",xd->m_firstUrl.m_url);
// free it
mdelete(xd, sizeof(XmlDoc), "mabrxd" );
delete xd;
// null it out so we don't try to re-free
st->m_xd = NULL;
// if content is significantly different, return 0 results
if ( newHash32 != st->m_oldContentHash32 ) {
SafeBuf sb;
// empty json i guess
return sendReply(st,sb.getBufStart());
// otherwise, print the diffbot json results, they are still valid
// if its a special request to get diffbot json objects for
// a given parent url, it often contains the same url in "addurl"
// to add as a spider request to spiderdb so that
// it gets spidered and processed through diffbot.
char *addUrl = st->m_hr.getString("addurl",NULL);
if ( addUrl ) { // && cr->m_isCustomCrawl ) {
2014-01-08 02:27:58 +04:00
Url norm;
norm.set ( addUrl );
SpiderRequest sreq;
// returns false and sets g_errno on error
2014-01-08 02:27:58 +04:00
if ( ! sreq.setFromAddUrl ( norm.getUrl() ) ) { //addUrl ) ) {
log("addurl: url had problem: %s",mstrerror(g_errno));
return true;
// addurl state
StateAU *stau;
try { stau = new(StateAU); }
catch ( ... ) {
g_errno = ENOMEM;
return true;
mnew ( stau , sizeof(StateAU) , "stau");
// fill it up
SafeBuf *mlist = &stau->m_metaListBuf;
if ( ! mlist->pushChar(RDB_SPIDERDB) )
return true;
if ( ! mlist->safeMemcpy ( &sreq , sreq.getRecSize() ) )
return true;
Msg4 *msg4 = &stau->m_msg4;
// this should copy the recs from list into the buffers
if ( msg4->addMetaList ( mlist->getBufStart() ,
mlist->getLength() ,
stau ,
freeMsg4Wrapper ,
// if it copied everything ok, nuke our msg4
// otherwise it will call freeMsg4Wrapper when it
// completes!
freeMsg4Wrapper( stau );
2014-05-06 22:33:00 +04:00
long numResults = msg40->getNumResults();
2014-04-17 08:36:28 +04:00
// if user is doing ajax widget we need to know the current docid
// that is listed at the top of their widget display so we can
// hide the new docids above that and scroll them down slowly.
2014-05-06 22:33:00 +04:00
//long topDocIdPos = -1;
2014-04-17 08:36:28 +04:00
bool hasInvisibleResults = false;
2014-05-06 21:47:57 +04:00
//long numInvisible = 0;
long numAbove = 0;
2014-04-17 08:36:28 +04:00
HttpRequest *hr = &st->m_hr;
long long oldTop = 0LL;
2014-05-06 21:47:57 +04:00
long long lastDocId = 0LL;
double lastSerpScore = 0.0;
2014-04-17 08:36:28 +04:00
if ( si->m_format == FORMAT_WIDGET_AJAX ) {
2014-05-06 21:47:57 +04:00
// sanity, no stream mode here, it won't work
if ( si->m_streamResults )
log("results: do not use stream=1 for widget");
2014-04-17 08:36:28 +04:00
// get current top docid
long long topDocId = hr->getLongLong("topdocid",0LL);
2014-04-17 23:58:27 +04:00
// DEBUG: force it on for now
//topDocId = 4961990748LL;
2014-05-06 21:47:57 +04:00
// scan results. this does not support &stream=1 streaming
// mode. it doesn't make sense that it needs to.
2014-04-17 08:36:28 +04:00
for ( long i = 0 ; i < numResults ; i++ ) {
2014-05-06 21:47:57 +04:00
// skip if already invisible
if ( msg40->m_msg3a.m_clusterLevels[i] != CR_OK )
2014-04-17 08:36:28 +04:00
// get it
2014-05-06 21:47:57 +04:00
Msg20 *m20 = msg40->m_msg20[i];
if ( ! m20 ) continue;
2014-04-17 08:36:28 +04:00
// checkdocid
Msg20Reply *mr = m20->m_r;
if ( ! mr ) continue;
2014-05-06 21:47:57 +04:00
// save this
lastDocId = mr->m_docId;
lastSerpScore = msg40->m_msg3a.m_scores[i];
// set "oldTop" to first docid we encounter
2014-04-17 08:36:28 +04:00
if ( ! oldTop ) oldTop = mr->m_docId;
2014-04-17 21:30:56 +04:00
// stop if no topdocid otherwise. oldTop is now set
2014-05-06 21:47:57 +04:00
if ( ! topDocId ) continue; // == 0 ) break;
2014-04-17 08:36:28 +04:00
if ( mr->m_docId != topDocId ) {
hasInvisibleResults = true;
2014-05-06 21:47:57 +04:00
// count # of docids above top docid
2014-04-17 08:36:28 +04:00
2014-05-06 21:47:57 +04:00
// we match it, so set this if not already set
2014-05-06 22:33:00 +04:00
//if ( topDocIdPos != -1 ) topDocIdPos = i;
2014-05-06 21:47:57 +04:00
2014-04-17 08:36:28 +04:00
2014-05-06 22:33:00 +04:00
2014-04-17 08:36:28 +04:00
2014-04-17 21:30:56 +04:00
SafeBuf *sb = &st->m_sb;
// print javascript for scrolling down invisible div for
// ajax based widgets
2014-04-17 23:58:27 +04:00
// MDW: this does not execute because it is loaded via ajax...
// so i moved logic into diffbot.php for now.
2014-04-17 21:30:56 +04:00
if ( si->m_format == FORMAT_WIDGET_AJAX && numInvisible ) {
sb->safePrintf("<script type=text/javascript>"
// call this function like 5 times a second
"function diffbot_scroll() {\n"
// get hidden div
"var hd = document.getElementById('diffbot_"
// get current bottom
// decrement by 1 pixel and reassign
" = hd +1;\n"
// we are done if height is equal to
// X * resultdivheight which is 140px i think
"if ( hd >= %li ) return;\n"
// call us again in 300ms
// on load start scrolling
2014-04-17 23:58:27 +04:00
2014-04-17 21:30:56 +04:00
, numInvisible * (long)RESULT_HEIGHT
2014-04-17 23:58:27 +04:00
2014-04-17 21:30:56 +04:00
// print logo, search box, results x-y, ... into st->m_sb
printSearchResultsHeader ( st );
2014-04-17 08:36:28 +04:00
// propagate "topdocid" so when he does another query every 30 secs
// or so we know what docid was on top for scrolling purposes
2014-05-06 22:33:00 +04:00
//if ( si->m_format == FORMAT_WIDGET_AJAX )
// sb->safePrintf("<input type=hidden "
// "id=topdocid name=topdocid value=%lli>\n",
// oldTop);
2014-04-17 08:36:28 +04:00
2014-05-06 21:47:57 +04:00
// report how many results we added above the topdocid provided, if any
// so widget can scroll down automatically
2014-05-06 22:33:00 +04:00
//if ( si->m_format == FORMAT_WIDGET_AJAX && numAbove )
// sb->safePrintf("<input type=hidden "
// "id=topadd name=topadd value=%li>\n",numAbove);
2014-05-06 21:47:57 +04:00
// we often can add 100s of things to the widget's result set per
// second especially when sorting by last spidered time and spidering
// a lot. setting the maxserpscore of the serp score of the last result
// allows us to append new search results to what we have in a
// consistent manner.
// if ( si->m_format == FORMAT_WIDGET_AJAX ) {
// // let's make this ascii encoded crap
// sb->safePrintf("<input type=hidden "
// "id=maxserpscore "
// "value=%f>\n",
// lastSerpScore);
// // let's make this ascii encoded crap
// sb->safePrintf("<input type=hidden "
// "id=maxserpdocid "
// "value=%lli>\n",
// lastDocId);
// }
2014-04-17 08:36:28 +04:00
2014-02-04 07:17:58 +04:00
// then print each result
// don't display more than docsWanted results
long count = msg40->getDocsWanted();
bool hadPrintError = false;
2014-04-26 00:46:20 +04:00
long numPrintedSoFar = 0;
2014-04-17 23:58:27 +04:00
//long widgetHeight = hr->getLong("widgetheight",400);
//long widgetwidth = hr->getLong("widgetwidth",250);
2014-02-04 07:17:58 +04:00
for ( long i = 0 ; count > 0 && i < numResults ; i++ ) {
2014-04-17 08:36:28 +04:00
2014-05-06 21:47:57 +04:00
2014-04-17 08:36:28 +04:00
if ( hasInvisibleResults ) {
// if doing a widget, we initially hide the new results
// and scroll them down in time so it looks cool.
if ( i == 0 )
sb->safePrintf("<div id=diffbot_invisible "
2014-04-17 23:58:27 +04:00
// relative to containing div
// which is position:relative!
2014-04-18 02:58:02 +04:00
2014-04-17 08:36:28 +04:00
2014-04-17 23:58:27 +04:00
// to test scrolling, hide the first result and
// scroll it out
2014-04-17 08:36:28 +04:00
if ( i == topDocIdPos )
2014-04-17 23:58:27 +04:00
"<div id=diffbot_visible"
2014-04-18 02:58:02 +04:00
" style=top:0px;"
2014-04-17 23:58:27 +04:00
2014-04-17 08:36:28 +04:00
2014-05-06 21:47:57 +04:00
2014-04-17 08:36:28 +04:00
2014-02-04 07:17:58 +04:00
// prints in xml or html
2014-04-17 08:36:28 +04:00
if ( ! printResult ( st , i , &numPrintedSoFar ) ) {
2014-02-04 07:17:58 +04:00
hadPrintError = true;
2014-04-17 08:36:28 +04:00
2014-02-04 07:17:58 +04:00
// limit it
2014-04-17 21:30:56 +04:00
2014-02-04 07:17:58 +04:00
if ( hadPrintError ) {
if ( ! g_errno ) g_errno = EBADENGINEER;
log("query: had error: %s",mstrerror(g_errno));
2014-02-05 05:05:43 +04:00
//return sendReply ( st , sb.getBufStart() );
2014-02-04 07:17:58 +04:00
// wrap it up with Next 10 etc.
printSearchResultsTail ( st );
2014-04-21 20:21:28 +04:00
// if we split the serps into 2 divs for scrolling purposes
// then close up the 2nd one
2014-05-06 22:33:00 +04:00
//if ( hasInvisibleResults ) sb->safePrintf("</div>");
2014-04-21 20:21:28 +04:00
2014-04-17 21:30:56 +04:00
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX )
// send it off
sendReply ( st , st->m_sb.getBufStart() );
2014-02-04 07:17:58 +04:00
return true;
// Prints everything that precedes the individual search results into
// st->m_sb, choosing the output by si->m_format: the opening "{" or "[" for
// JSON, the CSS head / logo / search box for HTML, the magnifying-glass form
// and scroll div for the widget formats, the <response> envelope for XML,
// then hit counts, the spelling suggestion, admin links, the ignored-words
// notice and the gigabits table.
// Returns false when the buffer cannot be reserved, when st->m_errno is set,
// or when g_errno is set after building the highlighting query. Returns true
// otherwise (including an early return for JSON when st->m_header is set).
// NOTE(review): this listing is missing lines (closing braces and parts of
// several multi-line safePrintf calls) and has blame timestamps interleaved;
// confirm against the real file before changing any code here.
bool printSearchResultsHeader ( State0 *st ) {
SearchInput *si = &st->m_si;
// grab the query
Msg40 *msg40 = &(st->m_msg40);
char *q = msg40->getQuery();
long qlen = msg40->getQueryLen();
2013-08-03 00:12:24 +04:00
//char local[ 128000 ];
//SafeBuf sb(local, 128000);
2014-02-04 07:17:58 +04:00
SafeBuf *sb = &st->m_sb;
2013-08-03 00:12:24 +04:00
// reserve 1.5MB now!
2014-02-05 05:05:43 +04:00
if ( ! sb->reserve(1500000 ,"pgresbuf" ) ) // 128000) )
2014-02-13 01:21:30 +04:00
return false;
2013-12-30 22:39:45 +04:00
// just in case it is empty, make it null terminated
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-07 03:23:53 +04:00
// print first [ for json
// (an object "{" is opened instead when the header fields are wanted)
if ( si->m_format == FORMAT_JSON ) {
if ( st->m_header ) sb->safePrintf("{\n");
else sb->safePrintf("[\n");
2014-02-07 03:23:53 +04:00
// . if not matt wells we do not do ajax
// . the ajax is just there to prevent bots from slamming me
// with queries.
2014-04-10 06:51:36 +04:00
if ( ! g_conf.m_isMattWells && si->m_format == FORMAT_HTML ) {
2014-04-09 06:34:43 +04:00
printCSSHead ( sb ,si->m_format );
2014-02-05 05:05:43 +04:00
2014-05-06 21:47:57 +04:00
if ( ! g_conf.m_isMattWells && si->m_format==FORMAT_WIDGET_IFRAME ) {
2014-04-10 06:51:36 +04:00
printCSSHead ( sb ,si->m_format );
sb->safePrintf("<body style=padding:0px;margin:0px;>");
2014-04-09 22:03:31 +04:00
HttpRequest *hr = &st->m_hr;
2014-04-17 02:35:16 +04:00
if ( si->m_format == FORMAT_WIDGET_IFRAME ) {
2014-04-10 06:51:36 +04:00
long refresh = hr->getLong("refresh",0);
if ( refresh )
sb->safePrintf("<meta http-equiv=\"refresh\" "
2014-04-09 22:03:31 +04:00
// lead with user's widget header which usually has custom style tags
2014-04-17 02:35:16 +04:00
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX ) {
2014-04-09 22:03:31 +04:00
char *header = hr->getString("header",NULL);
if ( header ) sb->safeStrcpy ( header );
2014-04-09 06:34:43 +04:00
if ( ! g_conf.m_isMattWells && si->m_format == FORMAT_HTML ) {
printLogoAndSearchBox ( sb , &st->m_hr , -1 ); // catId = -1
2014-04-11 02:08:54 +04:00
// the calling function checked this so it should be non-null
CollectionRec *cr = si->m_cr;
char *coll = cr->m_coll;
long collLen = gbstrlen(coll);
2014-04-17 02:35:16 +04:00
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX ) {
2014-04-17 08:36:28 +04:00
char *pos = "relative";
if ( si->m_format == FORMAT_WIDGET_IFRAME ) pos = "absolute";
2014-04-17 23:58:27 +04:00
long widgetwidth = hr->getLong("widgetwidth",150);
2014-04-17 21:30:56 +04:00
long widgetHeight = hr->getLong("widgetheight",400);
//long iconWidth = 25;
// put image in this div which will have top:0px JUST like
// the div holding the search results we print out below
// so that the image does not scroll when you use the
2014-05-06 21:47:57 +04:00
// scrollbar. holds the magnifying glass img and searchbox.
sb->safePrintf("<div class=magglassdiv "
2014-04-17 21:30:56 +04:00
2014-05-07 00:45:53 +04:00
//long refresh = hr->getLong("refresh",15);
2014-04-17 21:30:56 +04:00
char *oq = hr->getString("q",NULL);
if ( ! oq ) oq = "";
char *prepend = hr->getString("prepend");
if ( ! prepend ) prepend = "";
// the search box div starts hidden unless a "prepend" query was given
char *displayStr = "none";
if ( prepend && prepend[0] ) displayStr = "";
2014-05-07 00:45:53 +04:00
// to do a search we need to re-call the ajax,
// just call reload like the one that is called every 15s or so
sb->safePrintf("<form "//method=get action=/search "
// use "1" as arg to force reload
2014-05-07 01:53:01 +04:00
// let user know we are loading
"var w=document.getElementById("
// just set the widget content to the reply
"if (w) "
"w.innerHTML='<br><br><b>Loading Results..."
2014-05-07 00:45:53 +04:00
// prevent it from actually submitting
"return false;\">");
2014-04-17 21:30:56 +04:00
2014-04-17 08:36:28 +04:00
sb->safePrintf("<img "
2014-04-17 21:30:56 +04:00
//"position:absolute;" // absolute or relative?
// put it on TOP of the other stuff
2014-04-17 08:36:28 +04:00
2014-05-08 03:33:16 +04:00
2014-04-17 08:36:28 +04:00
2014-04-17 21:30:56 +04:00
// so we are to the right of the searchbox
2014-04-17 08:36:28 +04:00
"\" "
2014-04-17 21:30:56 +04:00
2014-04-11 02:08:54 +04:00
"var e=document.getElementById('sbox');"
"if( == 'none') {"
" = '';"
2014-05-07 01:38:51 +04:00
// give it focus
"var qb=document.getElementById('qbox');"
2014-04-11 02:08:54 +04:00
"} else {"
" = 'none';"
"\" " // end function
" "
2014-05-08 03:33:16 +04:00
"width=35 "
"height=31 "
2014-05-08 00:06:42 +04:00
2014-04-11 02:08:54 +04:00
2014-04-17 21:30:56 +04:00
//char *origq = hr->getString("q");
// we sort all results by spider date now so PREPEND
// the actual user query
char *origq = hr->getString("prepend");
2014-05-07 00:45:53 +04:00
if ( ! origq ) origq = "";
2014-05-07 01:53:01 +04:00
sb->safePrintf("<div id=sbox style=\"float:left;"
2014-05-08 00:28:20 +04:00
2014-05-07 01:53:01 +04:00
2014-05-07 00:45:53 +04:00
// the box that holds the query
"<input type=text id=qbox name=qbox "
"size=%li " //name=prepend "
"value=\"%s\" "
2014-05-08 00:28:20 +04:00
"border:4px solid black;"
2014-04-17 23:58:27 +04:00
2014-04-11 02:08:54 +04:00
, displayStr
2014-05-08 00:28:20 +04:00
, widgetwidth / 23
2014-05-07 00:45:53 +04:00
, origq
2014-04-11 02:08:54 +04:00
2014-04-17 21:30:56 +04:00
// . div to hold the search results
// . this will have the scrollbar to just scroll the serps
// and not the magnifying glass
2014-05-06 21:47:57 +04:00
"<div id=widget123_scrolldiv "
"onscroll=widget123_append(); "
2014-04-17 23:58:27 +04:00
2014-05-01 00:17:39 +04:00
2014-04-17 23:58:27 +04:00
2014-04-18 02:58:02 +04:00
2014-04-17 23:58:27 +04:00
, widgetwidth
2014-04-17 21:30:56 +04:00
, widgetHeight);
2014-04-11 02:08:54 +04:00
2013-08-03 00:12:24 +04:00
// xml
if ( si->m_format == FORMAT_XML )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<?xml version=\"1.0\" "
2013-08-03 00:12:24 +04:00
"encoding=\"UTF-8\" ?>\n"
"<response>\n" );
2014-02-05 05:05:43 +04:00
long long nowMS = gettimeofdayInMillisecondsLocal();
2013-08-03 00:12:24 +04:00
// show current time
if ( si->m_format == FORMAT_XML ) {
2013-08-03 00:12:24 +04:00
long long globalNowMS = localToGlobalTimeMilliseconds(nowMS);
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-04-09 06:34:43 +04:00
else if ( st->m_header && si->m_format == FORMAT_JSON ) {
long long globalNowMS = localToGlobalTimeMilliseconds(nowMS);
2014-04-09 06:34:43 +04:00
2013-08-03 00:12:24 +04:00
2014-04-05 22:33:42 +04:00
// show response time if not doing Quality Assurance
if ( si->m_format == FORMAT_XML )
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
else if ( st->m_header && si->m_format == FORMAT_JSON )
sb->safePrintf("\"responseTimeMS\":%lli,\n", st->m_took);
2013-08-03 00:12:24 +04:00
// out of memory allocating msg20s?
// the saved error is copied back into g_errno and we fail; the reply is
// no longer sent from here (see the commented-out sendReply below)
if ( st->m_errno ) {
log("query: Query failed. Had error processing query: %s",
g_errno = st->m_errno;
2014-02-13 01:21:30 +04:00
//return sendReply(st,sb->getBufStart());
return false;
2013-08-03 00:12:24 +04:00
2013-10-03 08:34:21 +04:00
2013-10-14 02:45:12 +04:00
//bool xml = si->m_xml;
2013-10-03 08:34:21 +04:00
2013-10-14 02:45:12 +04:00
// if they are doing a search in dmoz, catId will be > 0.
//if ( si->m_directCatId >= 0 ) {
// printDMOZCrumb ( sb , si->m_directCatId , xml );
2013-10-03 08:34:21 +04:00
// show DMOZ subcategories if doing either a
2013-10-14 00:15:31 +04:00
// "gbpcatid:<catid> |" (Search restricted to category)
// "gbcatid:<catid>" (DMOZ urls in that topic)
2013-10-03 08:34:21 +04:00
2013-10-14 00:15:31 +04:00
// The search gbcatid: results should be sorted by siterank i guess
// since it is only search a single term: gbcatid:<catid> so we can
2013-10-03 08:34:21 +04:00
// put our stars back onto that and should be sorted by them.
2013-10-14 02:45:12 +04:00
// NOTE(review): "xml" is passed below but its only visible declaration is
// commented out above; confirm where it is declared.
if ( si->m_catId >= 0 ) {
2013-10-03 08:34:21 +04:00
// print the subtopics in this topic. show as links above
// the search results
printDMOZSubTopics ( sb, si->m_catId , xml );//st, xml );
// ok, for now just print the dmoz topics since our search
// results will be empty... until populated!
//g_categories->printUrlsInTopic ( &sb , si->m_catId );
2013-10-14 02:45:12 +04:00
2013-10-03 08:34:21 +04:00
2014-04-09 22:03:31 +04:00
// save how many docs are in this collection
2013-08-03 00:12:24 +04:00
long long docsInColl = -1;
//RdbBase *base = getRdbBase ( RDB_CHECKSUMDB , si->m_coll );
//RdbBase *base = getRdbBase ( (uint8_t)RDB_CLUSTERDB , si->m_coll2 );
//if ( base ) docsInColl = base->getNumGlobalRecs();
docsInColl = g_hostdb.getNumGlobalRecs ( );
// include number of docs in the collection corpus
if ( docsInColl >= 0LL ) {
if ( si->m_format == FORMAT_XML)
sb->safePrintf ( "\t<docsInCollection>%lli"
"</docsInCollection>\n", docsInColl );
else if ( st->m_header && si->m_format == FORMAT_JSON)
sb->safePrintf("\"docsInCollection\":%lli,\n", docsInColl);
2013-08-03 00:12:24 +04:00
long numResults = msg40->getNumResults();
bool moreFollow = msg40->moreResultsFollow();
// an estimate of the # of total hits
long long totalHits = msg40->getNumTotalHits();
// only adjust upwards for first page now so it doesn't keep changing
if ( totalHits < numResults ) totalHits = numResults;
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t<hits>%lli</hits>\n",(long long)totalHits);
else if ( st->m_header && si->m_format == FORMAT_JSON )
sb->safePrintf("\"hits\":%lli,\n", (long long)totalHits);
// if streaming results we just don't know if we will require
// a "Next 10" link or not! we can print that after we print out
// the results i guess...
if ( ! si->m_streamResults ) {
if ( si->m_format == FORMAT_XML )
else if ( st->m_header && si->m_format == FORMAT_JSON )
2014-04-09 22:03:31 +04:00
// JSON with a header appears to be finished at this point and returns
if ( st->m_header && si->m_format == FORMAT_JSON ) {
2014-05-06 02:09:14 +04:00
2014-04-09 22:03:31 +04:00
return true;
2013-08-03 00:12:24 +04:00
// . did he get a spelling recommendation?
// . do not use htmlEncode() on this anymore since receiver
// of the XML feed usually does not want that.
if ( si->m_format == FORMAT_XML && st->m_spell[0] ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf ("\t<spell><![CDATA[");
sb->safePrintf ("]]></spell>\n");
2013-08-03 00:12:24 +04:00
// debug
if ( si->m_debug )
logf(LOG_DEBUG,"query: Displaying up to %li results.",
// tell browser again
//if ( si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
// sb->safePrintf("<meta http-equiv=\"Content-Type\" "
2013-08-03 00:12:24 +04:00
// "content=\"text/html; charset=utf-8\">\n");
// get some result info from msg40
long firstNum = msg40->getFirstResultNum() ;
// numResults may be more than we requested now!
long n = msg40->getDocsWanted();
if ( n > numResults ) n = numResults;
2013-08-03 00:12:24 +04:00
// . make the query class here for highlighting
// . keepAllSingles means to convert all individual words into
// QueryTerms even if they're in quotes or in a connection (cd-rom).
// we use this for highlighting purposes
Query qq;
qq.set2 ( si->m_displayQuery, langUnknown , si->m_queryExpansion );
// si->m_boolFlag,
// true ); // keepAllSingles?
2014-02-13 01:21:30 +04:00
if ( g_errno ) return false;//sendReply (st,NULL);
2013-08-03 00:12:24 +04:00
// score info of the first result, used only for the XML
// theoreticalMaxFinalScore estimate below
DocIdScore *dpx = NULL;
if ( numResults > 0 ) dpx = msg40->getScoreInfo(0);
if ( si->m_format == FORMAT_XML && dpx ) {
2013-08-03 00:12:24 +04:00
// # query terms used!
//long nr = dpx->m_numRequiredTerms;
float max = 0.0;
// max pairwise
float lw = getHashGroupWeight(HASHGROUP_INLINKTEXT);
// square that location weight
lw *= lw;
// assume its an inlinker's text, who has rank 15!!!
lw *= getLinkerWeight(MAXSITERANK);
// NOTE(review): "nr" is used as the loop bound below but its only
// visible declaration is commented out above; confirm where it is declared.
// double loops
for ( long i = 0 ; i< nr ; i++ ) {
SingleScore *ssi = &dpx->m_singleScores[i];
float tfwi = getTermFreqWeight(ssi->m_listSize);
for ( long j = i+1; j< nr ; j++ ) {
SingleScore *ssj = &dpx->m_singleScores[j];
2014-04-09 22:03:31 +04:00
float tfwj =getTermFreqWeight(ssj->m_listSize);
2013-08-03 00:12:24 +04:00
max += (lw * tfwi * tfwj)/3.0;
// single weights
// NOTE(review): maxi1 is only assigned when some m_tfWeight exceeds 0.0,
// but it is compared against in the second loop regardless; it may be read
// uninitialized. maxi2 is assigned but not read in the visible code.
float maxtfw1 = 0.0;
long maxi1;
// now we can have multiple SingleScores for the same term!
// because we take the top MAX_TOP now and add them to
// get the term's final score.
for ( long i = 0 ; i< dpx->m_numSingles ; i++ ) {
SingleScore *ssi = &dpx->m_singleScores[i];
float tfwi = ssi->m_tfWeight;
if ( tfwi <= maxtfw1 ) continue;
maxtfw1 = tfwi;
maxi1 = i;
float maxtfw2 = 0.0;
long maxi2;
for ( long i = 0 ; i< dpx->m_numSingles ; i++ ) {
if ( i == maxi1 ) continue;
SingleScore *ssi = &dpx->m_singleScores[i];
float tfwi = ssi->m_tfWeight;
if ( tfwi <= maxtfw2 ) continue;
maxtfw2 = tfwi;
maxi2 = i;
// only 1 term?
if ( maxtfw2 == 0.0 ) maxtfw2 = maxtfw1;
// best term freqs
max *= maxtfw1 * maxtfw2;
// site rank effect
2014-02-05 05:05:43 +04:00
sb->safePrintf ("\t\t<theoreticalMaxFinalScore>%f"
2013-08-03 00:12:24 +04:00
max );
// debug msg
log ( LOG_TIMING ,
"query: Got %li search results in %lli ms for q=%s",
//Highlight h;
st->m_qe[0] = '\0';
2013-08-03 00:12:24 +04:00
// encode query buf
2014-02-04 07:17:58 +04:00
//char qe[MAX_QUERY_LEN+1];
2013-08-03 00:12:24 +04:00
char *dq = si->m_displayQuery;
//long dqlen = si->m_displayQueryLen;
// NOTE(review): the size passed is MAX_QUERY_LEN*2; confirm st->m_qe is
// declared at least that large.
if ( dq ) urlEncode(st->m_qe,MAX_QUERY_LEN*2,dq,gbstrlen(dq));
2013-08-03 00:12:24 +04:00
// how many results were requested?
long docsWanted = msg40->getDocsWanted();
// store html head into p, but stop at %q
//char *head = cr->m_htmlHead;
//long hlen = cr->m_htmlHeadLen;
2014-02-05 05:05:43 +04:00
//if ( ! si->m_xml ) sb->safeMemcpy ( head , hlen );
2013-08-03 00:12:24 +04:00
// ignore incomplete or invalid multibyte or wide characters errors
//if ( g_errno == EILSEQ ) {
// log("query: Query error: %s. Ignoring.", mstrerror(g_errno));
// g_errno = 0;
// secret search backdoor
if ( qlen == 7 && q[0]=='3' && q[1]=='b' && q[2]=='Y' &&
q[3]=='6' && q[4]=='u' && q[5]=='2' && q[6]=='Z' ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "<br><b>You owe me!</b><br><br>" );
2013-08-03 00:12:24 +04:00
// print it with commas into "thbuf" and null terminate it
char thbuf[64];
ulltoa ( thbuf , totalHits );
char inbuf[128];
ulltoa ( inbuf , docsInColl );
Query qq3;
Query *qq2;
bool firstIgnored;
// admin-only links are only ever shown in the HTML format
bool isAdmin = si->m_isAdmin;
2013-12-30 22:39:45 +04:00
if ( si->m_format != FORMAT_HTML ) isAdmin = false;
2013-08-03 00:12:24 +04:00
// otherwise, we had no error
2014-04-09 22:03:31 +04:00
if ( numResults == 0 && si->m_format == FORMAT_HTML ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "No results found in <b>%s</b> collection.",
2013-08-03 00:12:24 +04:00
2014-04-09 22:03:31 +04:00
// the token is currently in the collection name so do not show that
2014-04-17 02:35:16 +04:00
else if ( numResults == 0 &&
( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX ) ) {
2014-05-08 00:28:20 +04:00
sb->safePrintf ( "No results found. Wait for spider to "
"kick in.");
2014-04-09 22:03:31 +04:00
else if ( moreFollow && si->m_format == FORMAT_HTML ) {
2013-08-03 00:12:24 +04:00
if ( isAdmin && si->m_docsToScanForReranking > 1 )
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "PQR'd " );
sb->safePrintf ("Results <b>%li</b> to <b>%li</b> of "
2013-08-03 00:12:24 +04:00
"exactly <b>%s</b> from an index "
"of %s pages" ,
firstNum + 1 ,
firstNum + n ,
thbuf ,
// otherwise, we didn't get enough results to show this page
else if ( si->m_format == FORMAT_HTML ) {
2013-08-03 00:12:24 +04:00
if ( isAdmin && si->m_docsToScanForReranking > 1 )
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "PQR'd " );
sb->safePrintf ("Results <b>%li</b> to <b>%li</b> of "
2013-08-03 00:12:24 +04:00
"exactly <b>%s</b> from an index "
"of %s pages" ,
firstNum + 1 ,
firstNum + n ,
thbuf ,
// if query was a url print add url msg
char *url = NULL;
if ( !strncmp(q,"url:" ,4) && qlen > 4 ) url = q+4;
if ( !strncmp(q,"http://" ,7) && qlen > 7 ) url = q;
if ( !strncmp(q,"https://",8) && qlen > 8 ) url = q;
if ( !strncmp(q,"www." ,4) && qlen > 4 ) url = q;
// find end of url
char *ue = url;
for ( ; ue && *ue && ! is_wspace_a(*ue) ; ue++ ) ;
if ( numResults == 0 && si->m_format == FORMAT_HTML && url ) {
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"Could not find that url in the "
"index. Try <a href=/addurl?u=");
2014-02-05 05:05:43 +04:00
sb->safePrintf(">Adding it.</a>");
2013-08-03 00:12:24 +04:00
// sometimes ppl search for "" so ask them if they
// want to search for
if ( numResults > 0 && si->m_format == FORMAT_HTML && url && url ==q){
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"Did you mean to "
"search for the url "
"<a href=/search?q=url%%3A");
2014-02-05 05:05:43 +04:00
sb->safePrintf("</a> itself?");
2013-08-03 00:12:24 +04:00
// is it the main collection?
bool isMain = false;
if ( collLen == 4 && strncmp ( coll, "main", 4) == 0 ) isMain = true;
// print "in collection ***" if we had a collection
2014-04-10 11:31:41 +04:00
if (collLen>0 && numResults>0 && !isMain && si->m_format==FORMAT_HTML )
2014-02-05 05:05:43 +04:00
sb->safePrintf (" in collection <b>%s</b>",coll);
2013-08-03 00:12:24 +04:00
//char *pwd = si->m_pwd;
//if ( ! pwd ) pwd = "";
2013-08-03 00:12:24 +04:00
// "[show scores]" link: its onclick script walks the elements with ids
// r0..r<numResults-1> and switches each one between shown and hidden
if ( si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
sb->safePrintf(" &nbsp; <u><b><font color=blue><a onclick=\""
2013-08-03 00:12:24 +04:00
"for (var i = 0; i < %li; i++) {"
"var nombre;"
"nombre = 'r' + i;"
"var e = document.getElementById(nombre);"
"if ( e == null ) continue;"
"if ( == 'none' ){"
" = '';"
"else {"
" = 'none';"
"[show scores]"
"</a></font></b></u> ",
numResults );
// convenient admin link
if ( isAdmin ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf(" &nbsp; "
2013-08-03 00:12:24 +04:00
"<font color=red><b>"
"<a href=\"/admin/settings?c=%s\">"
2013-08-03 00:12:24 +04:00
// print reindex link
// get the filename directly
char *langStr = si->m_defaultSortLang;
2014-04-10 11:31:41 +04:00
if ( numResults>0 )
sb->safePrintf (" &nbsp; "
"<font color=red><b>"
"<a href=\"/admin/reindex?c=%s&"
"[reindex or delete these results]"
"</font> ",coll, langStr , st->m_qe );
2014-02-05 05:05:43 +04:00
sb->safePrintf (" &nbsp; "
2013-08-03 00:12:24 +04:00
"<font color=red><b>"
"<a href=\"/inject?c=%s&qts=%s\">"
2014-02-04 07:17:58 +04:00
"</font> ", coll , st->m_qe );
2014-02-05 05:05:43 +04:00
sb->safePrintf (" &nbsp; "
"<font color=red><b>"
"<a href=\"/search?sb=1&c=%s&"
"[show banned results]</a></b>"
2014-02-04 07:17:58 +04:00
"</font> ", coll , langStr , st->m_qe );
2013-08-03 00:12:24 +04:00
// if its an ip: or site: query, print ban link
if ( isAdmin && strncmp(si->m_displayQuery,"ip:",3)==0) {
// get the ip
char *ips = si->m_displayQuery + 3;
// copy to buf, append a ".0" if we need to
char buf [ 32 ];
long i ;
long np = 0;
// stopping at i<29 leaves room in the 32-byte buf for the optional ".0"
// plus the terminating NUL written below
for ( i = 0 ; i<29 && (is_digit(ips[i])||ips[i]=='.'); i++ ){
if ( ips[i] == '.' ) np++;
// if not enough periods bail
if ( np <= 1 ) goto skip2;
if ( np == 2 ) { buf[i++]='.'; buf[i++]='0'; }
buf[i] = '\0';
// search ip back or forward
long ip = atoip(buf,i);
2014-02-05 05:05:43 +04:00
sb->safePrintf ("&nbsp <b>"
2013-08-03 00:12:24 +04:00
"<a href=\"/search?q=ip%%3A%s&c=%s&n=%li\">"
"[prev %s]</a></b>" ,
2014-02-05 05:05:43 +04:00
sb->safePrintf ("&nbsp <b>"
2013-08-03 00:12:24 +04:00
"<a href=\"/search?q=ip%%3A%s&c=%s&n=%li\">"
"[next %s]</a></b>" ,
// if its an ip: or site: query, print ban link
if ( isAdmin && strncmp(si->m_displayQuery,"site:",5)==0) {
// get the ip
// NOTE(review): the char at *sp is saved and later restored, but the
// write of the temporary '\0' is not visible in this listing.
char *start = si->m_displayQuery + 5;
char *sp = start;
while ( *sp && ! is_wspace_a(*sp) ) sp++;
char c = *sp;
// get the filename directly
2014-02-05 05:05:43 +04:00
sb->safePrintf (" &nbsp; "
2013-08-03 00:12:24 +04:00
"<font color=red><b>"
2014-02-08 11:34:45 +04:00
"<a href=\"/admin/tagdb?"
2013-08-03 00:12:24 +04:00
"[ban %s]</a></b>"
"</font> ",coll , start );
*sp = c;
if ( isAdmin && strncmp(si->m_displayQuery,"gbad:",5)==0) {
// get the ip
// the token after "gbad:" is NUL-terminated in place inside
// si->m_displayQuery for printing, then the saved char is put back
char *start = si->m_displayQuery + 5;
char *sp = start;
while ( *sp && ! is_wspace_a(*sp) ) sp++;
char c = *sp;
*sp = '\0';
2014-02-05 05:05:43 +04:00
sb->safePrintf (" &nbsp; "
2013-08-03 00:12:24 +04:00
"<font color=red><b>"
2014-02-08 11:34:45 +04:00
"<a href=\"/admin/tagdb?"
2013-08-03 00:12:24 +04:00
"[ban %s]</a></b>"
"</font> ", coll , start , start );
*sp = c;
// cache switch for admin
if ( isAdmin && msg40->getCachedTime() > 0 ) {
// get the filename directly
2014-02-05 05:05:43 +04:00
sb->safePrintf(" &nbsp; "
2013-08-03 00:12:24 +04:00
"<font color=red><b>"
"<a href=\"/search?c=%s",
coll );
// finish it
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"[cache off]</a></b>"
2014-02-04 07:17:58 +04:00
"</font> ", st->m_qe );
2013-08-03 00:12:24 +04:00
// mention ignored query terms
// we need to set another Query with "keepAllSingles" set to false
qq2 = &si->m_q;
2013-08-03 00:12:24 +04:00
//qq2.set ( q , qlen , NULL , 0 , si->m_boolFlag , false );
firstIgnored = true;
for ( long i = 0 ; i < qq2->m_numWords ; i++ ) {
//if ( si->m_xml ) break;
QueryWord *qw = &qq2->m_qwords[i];
// only print out words ignored cuz they were stop words
if ( qw->m_ignoreWord != IGNORE_QSTOP ) continue;
// print header -- we got one
if ( firstIgnored ) {
if ( si->m_format == FORMAT_XML )
2014-02-05 05:05:43 +04:00
sb->safePrintf ("\t<ignoredWords><![CDATA[");
2014-04-09 06:34:43 +04:00
else if ( si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
sb->safePrintf (" &nbsp; <font "
2013-08-03 00:12:24 +04:00
"color=\"#707070\">The "
"following query words "
"were ignored: "
firstIgnored = false;
// print the word
char *t = qw->m_word;
long tlen = qw->m_wordLen;
2014-02-05 05:05:43 +04:00
sb->utf8Encode2 ( t , tlen );
sb->safePrintf (" ");
2013-08-03 00:12:24 +04:00
// print tail if we had ignored terms
if ( ! firstIgnored ) {
2014-02-05 05:05:43 +04:00
if ( si->m_format == FORMAT_XML )
2014-02-05 05:05:43 +04:00
2014-04-09 06:34:43 +04:00
else if ( si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
sb->safePrintf ("</b>. Preceed each with a '+' or "
2013-08-03 00:12:24 +04:00
"wrap in "
"quotes to not ignore.</font>");
2014-02-05 05:05:43 +04:00
if ( si->m_format == FORMAT_HTML ) sb->safePrintf("<br><br>");
2013-08-03 00:12:24 +04:00
if ( si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<table cellpadding=0 cellspacing=0>"
2013-08-03 00:12:24 +04:00
"<tr><td valign=top>");
// the gigabit buffer is treated as a packed array of Gigabit records;
// gigabits are only printed for the HTML format
SafeBuf *gbuf = &msg40->m_gigabitBuf;
long numGigabits = gbuf->length()/sizeof(Gigabit);
2014-04-09 06:34:43 +04:00
if ( si->m_format != FORMAT_HTML ) numGigabits = 0;
2013-08-03 00:12:24 +04:00
// print gigabits
Gigabit *gigabits = (Gigabit *)gbuf->getBufStart();
//long numCols = 5;
//long perRow = numGigabits / numCols;
if ( numGigabits && si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<table cellspacing=7 bgcolor=lightgray>"
2013-08-03 00:12:24 +04:00
"<tr><td width=200px; valign=top>");
for ( long i = 0 ; i < numGigabits ; i++ ) {
if ( i > 0 && si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
//if ( perRow && (i % perRow == 0) )
2014-02-05 05:05:43 +04:00
// sb->safePrintf("</td><td valign=top>");
2013-08-03 00:12:24 +04:00
// print all sentences containing this gigabit
Gigabit *gi = &gigabits[i];
printGigabit ( st,sb , msg40 , gi , si );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
if ( numGigabits && si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// two pane table
if ( si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
sb->safePrintf("</td><td valign=top>");
2013-08-03 00:12:24 +04:00
// did we get a spelling recommendation?
if ( si->m_format == FORMAT_HTML && st->m_spell[0] ) {
2013-08-03 00:12:24 +04:00
// encode the spelling recommendation
long len = gbstrlen ( st->m_spell );
char qe2[MAX_FRAG_SIZE];
urlEncode(qe2, MAX_FRAG_SIZE, st->m_spell, len);
2014-02-05 05:05:43 +04:00
sb->safePrintf ("<font size=+0 color=\"#c62939\">Did you mean:"
2013-08-03 00:12:24 +04:00
"</font> <font size=+0>"
"<a href=\"/search?q=%s",
qe2 );
// close it up
2014-02-05 05:05:43 +04:00
sb->safePrintf ("\"><i><b>");
sb->utf8Encode2(st->m_spell, len);
2013-08-03 00:12:24 +04:00
// then finish it off
2014-02-05 05:05:43 +04:00
sb->safePrintf ("</b></i></a></font>\n<br><br>\n");
2013-08-03 00:12:24 +04:00
// . Wrap results in a table if we are using ads. Easier to display.
//Ads *ads = &st->m_ads;
//if ( ads->hasAds() )
2014-02-05 05:05:43 +04:00
// sb->safePrintf("<table width=\"100%%\">\n"
// "<tr><td style=\"vertical-align:top;\">\n");
2013-08-03 00:12:24 +04:00
// debug
if ( si->m_debug )
logf(LOG_DEBUG,"query: Printing up to %li results. "
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
//sb->safePrintf("<iframe src=\"\"></iframe>");
//sb->safePrintf("<iframe src=\"\"></iframe>");
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"<script type=\"text/javascript\">\n"
"function handler() {\n"
"if(this.readyState == 4 ) {\n"
"<div id=foobar onclick=\""
"var client = new XMLHttpRequest();\n"
"client.onreadystatechange = handler;\n"
//"var url='';\n"
//"var url='';\n"
"var url='';\n"
"'GET', url );\n"
"\">CLICK ME</div>\n"
if ( si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<a onclick=\""
2013-08-03 00:12:24 +04:00
"var e = "
"alert ('i='+e.innerHTML);"
2014-02-04 07:17:58 +04:00
return true;
2013-08-03 00:12:24 +04:00
2014-02-04 07:17:58 +04:00
// Prints everything that follows the individual search results into
// st->m_sb: the closing "}" for JSON with a header, the end of the two-pane
// HTML table, the "Prev N Results" / "Next N Results" links, the bottom
// search box, the admin "ban all of these domains" link, the "results were
// cached" notice and the page footer.
// Every visible return statement returns true.
// NOTE(review): this listing is missing lines (closing braces and parts of
// several multi-line safePrintf calls) and has blame timestamps interleaved;
// confirm against the real file before changing any code here.
bool printSearchResultsTail ( State0 *st ) {
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
SafeBuf *sb = &st->m_sb;
SearchInput *si = &st->m_si;
Msg40 *msg40 = &(st->m_msg40);
CollectionRec *cr = si->m_cr;
char *coll = cr->m_coll;
// if ended in ",\n" cuz it was json, remove that
//if ( si->m_format == FORMAT_JSON && sb->length() >= 4 ) {
// char *p = sb->getBuf() - 2;
// if ( p[0] ==',' && p[1] == '\n' ) sb->incrementLength(-2);
// NOTE(review): only the "}" for the header case is visible here; the
// closing "]" mentioned in the comment below may be on a missing line.
if ( si->m_format == FORMAT_JSON ) {
// print ending ] for json
if ( st->m_header ) sb->safePrintf("}\n");
// all done for json
return true;
2014-02-05 05:05:43 +04:00
// get some result info from msg40
long firstNum = msg40->getFirstResultNum() ;
2013-08-03 00:12:24 +04:00
// end the two-pane table
2014-04-09 06:34:43 +04:00
if ( si->m_format == FORMAT_HTML) sb->safePrintf("</td></tr></table>");
2013-08-03 00:12:24 +04:00
// for storing a list of all of the sites we displayed, now we print a
// link at the bottom of the page to ban all of the sites displayed
// with one click
// NOTE(review): nothing in the visible code appends to banSites, so the
// "ban all of these domains" link further down would never print; confirm.
SafeBuf banSites;
//long tailLen = 0;
//char *tail = NULL;
// PRINT PREV 10 NEXT 10 links!
// center everything below here
2014-02-05 05:05:43 +04:00
if ( si->m_format == FORMAT_HTML ) sb->safePrintf ( "<br><center>" );
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
// remember the buffer length so we can tell later whether a prev/next
// link was actually printed
long remember = sb->length();
2013-08-03 00:12:24 +04:00
// now print "Prev X Results" if we need to
if ( firstNum < 0 ) firstNum = 0;
// "args" collects the extra url parameters shared by the prev and next
// links; it starts out backed by the 300-byte stack buffer
char abuf[300];
SafeBuf args(abuf,300);
// show banned?
// NOTE(review): the statements guarded by the next two ifs are not visible
// in this listing.
if ( si->m_showBanned && ! si->m_isAdmin )
if ( ! si->m_showBanned && si->m_isAdmin )
2013-09-18 04:19:41 +04:00
// collection
2014-04-09 06:34:43 +04:00
// formatting info
2014-04-17 02:35:16 +04:00
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX ) {
2014-04-09 06:34:43 +04:00
2014-04-10 06:51:36 +04:00
HttpRequest *hr = &st->m_hr;
// NOTE(review): the default here is 250, while printSearchResultsHeader
// reads "widgetwidth" with a default of 150; confirm which is intended.
long widgetwidth = hr->getLong("widgetwidth",250);
// carry over the sites we are restricting the search results to
if ( si->m_sites )
2013-08-03 00:12:24 +04:00
2014-04-09 06:34:43 +04:00
if ( firstNum > 0 &&
2014-04-17 02:35:16 +04:00
(si->m_format == FORMAT_HTML ||
2014-05-01 00:17:39 +04:00
si->m_format == FORMAT_WIDGET_IFRAME //||
//si->m_format == FORMAT_WIDGET_AJAX
) ) {
2013-08-03 00:12:24 +04:00
// NOTE(review): ss goes negative when firstNum is smaller than the
// number of docs wanted; it is not clamped in the visible code.
long ss = firstNum - msg40->getDocsWanted();
2014-02-05 05:05:43 +04:00
sb->safePrintf("<a href=\"/search?s=%li&q=",ss);
2013-08-03 00:12:24 +04:00
// our current query parameters
2014-02-05 05:05:43 +04:00
sb->safeStrcpy ( st->m_qe );
2013-08-03 00:12:24 +04:00
// print other args if not zero
2014-02-05 05:05:43 +04:00
sb->safeMemcpy ( &args );
2013-08-03 00:12:24 +04:00
// close it up
2014-02-05 05:05:43 +04:00
sb->safePrintf ("\"><b>"
2013-08-03 00:12:24 +04:00
"<font size=+0>Prev %li Results</font>"
2014-02-05 05:05:43 +04:00
msg40->getDocsWanted() );
2013-08-03 00:12:24 +04:00
// now print "Next X Results"
2014-04-09 06:34:43 +04:00
if ( msg40->moreResultsFollow() &&
2014-04-17 02:35:16 +04:00
(si->m_format == FORMAT_HTML ||
2014-05-01 00:17:39 +04:00
si->m_format == FORMAT_WIDGET_IFRAME
//si->m_format == FORMAT_WIDGET_AJAX
)) {
2013-08-03 00:12:24 +04:00
long ss = firstNum + msg40->getDocsWanted();
// print a separator first if we had a prev results before us
2014-02-05 05:05:43 +04:00
if ( sb->length() > remember ) sb->safePrintf ( " &nbsp; " );
2013-08-03 00:12:24 +04:00
// add the query
2014-02-05 05:05:43 +04:00
sb->safePrintf ("<a href=\"/search?s=%li&q=",ss);
2013-08-03 00:12:24 +04:00
// our current query parameters
2014-02-05 05:05:43 +04:00
sb->safeStrcpy ( st->m_qe );
2013-08-03 00:12:24 +04:00
// print other args if not zero
2014-02-05 05:05:43 +04:00
sb->safeMemcpy ( &args );
2013-08-03 00:12:24 +04:00
// close it up
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"<font size=+0>Next %li Results</font>"
2014-02-05 05:05:43 +04:00
msg40->getDocsWanted() );
2013-08-03 00:12:24 +04:00
// print try this search on...
// an additional <br> if we had a Next or Prev results link
2014-02-05 05:05:43 +04:00
if ( sb->length() > remember ) sb->safeMemcpy ("<br>" , 4 );
2013-08-03 00:12:24 +04:00
// END PRINT PREV 10 NEXT 10 links!
// end results table cell... and print calendar at top
//tail = cr->m_htmlTail;
//tailLen = gbstrlen (tail );
2014-02-05 05:05:43 +04:00
//if ( si->m_format == FORMAT_HTML ) sb->safeMemcpy ( tail , tailLen );
2013-08-03 00:12:24 +04:00
// bottom-of-page search box, pre-filled with the html-encoded contents
// of si->m_sbuf1
if ( si->m_format == FORMAT_HTML ) {
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf("<table cellpadding=2 cellspacing=0 border=0>"
2013-08-03 00:12:24 +04:00
"<td valign=top align=center>"
"<input type=text name=q2 size=60 value=\"" );
2014-02-05 05:05:43 +04:00
sb->htmlEncode ( si->m_sbuf1.getBufStart() ,
2013-08-03 00:12:24 +04:00
si->m_sbuf1.length() ,
false );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"<input type=submit value=\"Search\" border=0>"
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf("<input name=c type=hidden value=\"%s\">",coll);
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
// admin-only links are only ever shown in the HTML format
bool isAdmin = si->m_isAdmin;
if ( si->m_format != FORMAT_HTML ) isAdmin = false;
2013-08-03 00:12:24 +04:00
if ( isAdmin && banSites.length() > 0 )
2014-02-05 05:05:43 +04:00
sb->safePrintf ("<br><br><div align=right><b>"
2014-02-08 11:34:45 +04:00
"<a href=\"/admin/tagdb?"
2013-08-03 00:12:24 +04:00
"[ban all of these domains]</a></b></div>"
"<br>\n ",
coll, banSites.getBufStart());
// TODO: print cache line in light gray here
// TODO: "these results were cached X minutes ago"
if ( msg40->getCachedTime() > 0 && si->m_format == FORMAT_HTML ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf("<br><br><font size=1 color=707070><b><center>");
sb->safePrintf ( " These results were cached " );
2013-08-03 00:12:24 +04:00
// this cached time is this local cpu's time
long diff = getTime() - msg40->getCachedTime();
2014-02-05 05:05:43 +04:00
if ( diff < 60 ) sb->safePrintf ( "%li seconds" , diff );
else if ( diff < 2*60 ) sb->safePrintf ( "1 minute");
else sb->safePrintf ( "%li minutes",diff/60);
sb->safePrintf ( " ago. [<a href=\"/pageCache.html\">"
2013-08-03 00:12:24 +04:00
"<font color=707070>Info</font></a>]");
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "</center></font>");
2013-08-03 00:12:24 +04:00
if ( si->m_format == FORMAT_XML )
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// if we did not use ajax, print this tail here now
if ( si->m_format == FORMAT_HTML && ! g_conf.m_isMattWells ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "<br>"
"<font color=gray>"
2014-04-09 06:34:43 +04:00
"Copyright &copy; 2014. All Rights "
2014-05-11 23:04:10 +04:00
"Powered by the <a href=\"http://www."
"\">GigaBlast</a> open source "
"search engine."
2014-04-17 02:35:16 +04:00
// ajax widgets will have this outside the downloaded content
if ( si->m_format == FORMAT_WIDGET_IFRAME ) {
2014-04-09 06:34:43 +04:00
sb->safePrintf ( "<br>"
"<font color=gray>"
// link to edit the list of widget sites
// or various other widget content properties
// because we can't edit the width/height
// of the widget like this.
"<a href=/widget?inlineedit=1>edit</a> "
"&bull; "
//"Copyright &copy; 2014. All Rights "
"Powered by <a href=>"
// NOTE(review): "si" is tested for NULL here although it was already
// dereferenced near the top of the function (si->m_cr).
if ( sb->length() == 0 && si && si->m_format == FORMAT_JSON )
if ( sb->length() == 0 ) {
2014-02-05 05:05:43 +04:00
2013-12-30 22:39:45 +04:00
return true;
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
// Appends a human-readable age for timestamp "ts" to "sb", in the form
// " - <prefix>: N minutes|hours|days ago". For timestamps a week or more old
// it formats the calendar date with strftime("%b %d %Y") instead.
// Every visible return statement returns true.
// "si" is not used by any line visible in this listing.
// NOTE(review): at least two code lines appear to be missing from this
// listing (after the "Jul 23, 1971" comment and after the strftime call), as
// are the closing braces; confirm against the real file.
bool printTimeAgo ( SafeBuf *sb , long ts , char *prefix , SearchInput *si ) {
2013-08-03 00:12:24 +04:00
// Jul 23, 1971
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
long now = getTimeGlobal();
// for printing
// mins and hrs default to 1000 so none of the minute/hour branches below
// match when ts <= 0
// NOTE(review): "days" has no default. When ts <= 0 the chain below falls
// through to the "days == 1" test and reads it uninitialized.
long mins = 1000;
long hrs = 1000;
long days ;
if ( ts > 0 ) {
mins = (long)((now - ts)/60);
hrs = (long)((now - ts)/3600);
days = (long)((now - ts)/(3600*24));
// a timestamp in the future is clamped to "0 ... ago"
if ( mins < 0 ) mins = 0;
if ( hrs < 0 ) hrs = 0;
if ( days < 0 ) days = 0;
// print the time ago
2014-02-05 05:05:43 +04:00
if ( mins ==1)sb->safePrintf(" - %s: %li minute ago",prefix,mins);
else if (mins<60)sb->safePrintf ( " - %s: %li minutes ago",prefix,mins);
else if ( hrs == 1 )sb->safePrintf ( " - %s: %li hour ago",prefix,hrs);
else if ( hrs < 24 )sb->safePrintf ( " - %s: %li hours ago",prefix,hrs);
else if ( days == 1 )sb->safePrintf ( " - %s: %li day ago",prefix,days);
else if (days< 7 )sb->safePrintf ( " - %s: %li days ago",prefix,days);
2013-08-03 00:12:24 +04:00
// do not show if more than 1 wk old! we want to seem as
// fresh as possible
// NOTE(review): localtime() takes a const time_t *, but &ts is a long *;
// this relies on long and time_t being the same type. The result is also
// passed to strftime without a NULL check.
else if ( ts > 0 ) { // && si->m_isAdmin ) {
struct tm *timeStruct = localtime ( &ts );
2014-02-05 05:05:43 +04:00
sb->safePrintf(" - %s: ",prefix);
2013-08-03 00:12:24 +04:00
char tmp[100];
strftime(tmp,100,"%b %d %Y",timeStruct);
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
return true;
// . sort comparator for an array of Inlink pointers (used with gbsort below)
// . "v1" and "v2" each point at an element of that array, i.e. an Inlink *
// . orders by m_siteRank descending: returns > 0 when v2's inlink has the
//   higher site rank, < 0 when v1's does, and 0 when they tie
int linkSiteRankCmp (const void *v1, const void *v2) {
	Inlink *i1 = *(Inlink **)v1;
	Inlink *i2 = *(Inlink **)v2;
	// compare explicitly instead of returning the difference so the sign
	// is right no matter how wide or unsigned m_siteRank is
	if ( i2->m_siteRank > i1->m_siteRank ) return  1;
	if ( i2->m_siteRank < i1->m_siteRank ) return -1;
	return 0;
}
2014-02-05 05:05:43 +04:00
// . print the inlinks of one search result whose link text matches the query
// . gathers up to MAX_LINKERS inlinks from the reply's LinkInfo, sorts them
//   with linkSiteRankCmp, highlights each link text against si->m_hqq and
//   emits only the inlinks that had at least one highlighted match
// . output goes to "sb"; FORMAT_XML gets <inlinks>/<inlink> tags, otherwise
//   an html table is started on the first printed inlink
// . "*numPrinted" is reset to 0 and incremented once per inlink emitted
// . returns true right away if the reply carries no usable link info
// . returns false if encoding or copying the highlighted text into "sb" fails
// . NOTE(review): this copy of the function appears to be missing lines
//   (closing braces, parts of the format strings and argument lists), so
//   the comments below describe only what is still visible
bool printInlinkText ( SafeBuf *sb , Msg20Reply *mr , SearchInput *si ,
2013-08-03 00:12:24 +04:00
long *numPrinted ) {
*numPrinted = 0;
// . show the "LinkInfo"
// . Msg20.cpp will have "computed" the LinkInfo if we set
// Msg20Request::m_computeLinkInfo to true, but if we set
// Msg20Request::m_getLinkInfo to true it will just get it
// from the TitleRec, which is much faster but more stale.
// . "&inlinks=1" is slow and fresh, "&inlinks=2" is fast
// and stale. Both are really only for BuzzLogic.
LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks;
// sanity: core on purpose (write through NULL) if the size recorded in the
// reply disagrees with the size stored inside the LinkInfo itself
if ( info && mr->size_linkInfo != info->m_size ){char *xx=NULL;*xx=0; }
// NULLify if empty
if ( mr->size_linkInfo <= 0 ) info = NULL;
// do not bother if none
if ( info && ! info->m_numStoredInlinks ) info = NULL;
// bail?
if ( ! info ) return true;
// now sort them up
Inlink *k = info->getNextInlink(NULL);
// #define from Linkdb.h
Inlink *ptrs[MAX_LINKERS];
long numLinks = 0;
// collect the inlink pointers, stopping once the array is full
for ( ; k ; k = info->getNextInlink(k) ) {
ptrs[numLinks++] = k;
if ( numLinks >= MAX_LINKERS ) break;
// sort them
// NOTE(review): the element size is hard-coded to 4 but ptrs holds
// Inlink pointers; presumably this should be sizeof(Inlink *) -- confirm
// for 64-bit builds
gbsort ( ptrs , numLinks , 4 , linkSiteRankCmp );
// print xml starter
2014-02-05 05:05:43 +04:00
if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t<inlinks>\n");
2013-08-03 00:12:24 +04:00
// loop through the inlinks
bool printedInlinkText = false;
bool firstTime = true;
long inlinkId = 0;
// wall clock at start, used for the timing log after the loop
long long starttime = gettimeofdayInMillisecondsLocal();
//long icount = 0;
//long ecount = 0;
//long absSum = 0;
for ( long i = 0 ; i < numLinks ; i++ ) {
k = ptrs[i];
// nothing to highlight if this inlink has no link text
if ( ! k->ptr_linkText ) continue;
if ( ! si->m_doQueryHighlighting &&
si->m_format == FORMAT_HTML )
2013-08-03 00:12:24 +04:00
char *str = k-> ptr_linkText;
long strLen = k->size_linkText;
//char tt[1024*3];
//char *ttend = tt + 1024*3;
2013-08-03 00:12:24 +04:00
// default highlight markup; overridden below for xml and widget formats
char *frontTag =
"<font style=\"color:black;background-color:yellow\">" ;
char *backTag = "</font>";
if ( si->m_format == FORMAT_XML ) {
2013-08-03 00:12:24 +04:00
frontTag = "<b>";
backTag = "</b>";
2014-04-17 02:35:16 +04:00
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_AJAX ) {
2014-04-09 22:03:31 +04:00
frontTag = "<font style=\"background-color:yellow\">" ;
2013-08-03 00:12:24 +04:00
// highlight the query terms of the link text into the scratch buffer "hb"
Highlight hi;
SafeBuf hb;
long hlen = hi.set ( &hb,//tt ,
//ttend - tt ,
2013-08-03 00:12:24 +04:00
strLen ,
mr->m_language, // docLangId
&si->m_hqq , // highlight query CLASS
false , // doStemming?
false , // use click&scroll?
NULL , // base url
0 ); // niceness
// skip this inlink if the highlighter returned nothing
if ( hlen <= 0 ) continue;
// skip it if nothing highlighted
if ( hi.getNumMatches() == 0 ) continue;
if ( si->m_format == FORMAT_XML ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf("\t\t\t<inlink "
2013-08-03 00:12:24 +04:00
"docId=\"%lli\" "
k->m_docId );
// encode it for xml
2014-02-05 05:05:43 +04:00
sb->htmlEncode ( k->ptr_urlBuf,
2013-08-03 00:12:24 +04:00
k->size_urlBuf - 1 , false );
2014-02-05 05:05:43 +04:00
sb->safePrintf("\" "
2013-08-03 00:12:24 +04:00
//"hostId=\"%lu\" "
"firstindexed=\"%lu\" "
// not accurate!
//"lastspidered=\"%lu\" "
"wordposstart=\"%li\" "
"id=\"%li\" "
"siterank=\"%li\" "
//hh ,
(unsigned long)k->m_firstIndexedDate,
//(unsigned long)k->m_lastSpidered,
// HACK!!!
// the inlink's m_siteHash field is overwritten with the running inlink id
k->m_siteHash = inlinkId;
// inc it
// encode it for xml
2014-02-05 05:05:43 +04:00
if ( !sb->htmlEncode ( hb.getBufStart(),
return false;
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// first printed inlink: open the table and emit its header cells
if ( firstTime ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf("<font size=-1>");
sb->safePrintf("<table border=1>"
2013-08-03 00:12:24 +04:00
"<tr><td colspan=3>"
"<b>Inlinks with Query Terms</b>"
"<td>Inlink Text</td>"
"<td>Site Rank</td>"
firstTime = false;
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"<a href=/get?c=%s&d=%lli&cnsp=0>"
//"<a href=\"/print?"
2014-02-05 05:05:43 +04:00
// append the highlighted link text; fail the whole call if that fails
if ( ! sb->safeMemcpy(&hb) ) return false;
2013-08-03 00:12:24 +04:00
// print the hostname of the linking url, when one can be extracted
long hostLen = 0;
char *host = getHostFast(k->ptr_urlBuf,&hostLen,NULL);
2014-02-05 05:05:43 +04:00
if ( host ) sb->safeMemcpy(host,hostLen);
2013-08-03 00:12:24 +04:00
printedInlinkText = true;
*numPrinted = *numPrinted + 1;
// log the highlighting pass when it took more than 2 ms
long long took = gettimeofdayInMillisecondsLocal() - starttime;
if ( took > 2 )
log("timing: took %lli ms to highlight %li links."
// closer for xml
2014-02-05 05:05:43 +04:00
if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t</inlinks>\n");
//if ( printedInlinkText ) sb->safePrintf("<br>\n");
2013-08-03 00:12:24 +04:00
if ( printedInlinkText )
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
return true;
2013-10-03 08:34:21 +04:00
// . print a dmoz topic for the given numeric catid UNDER search result
// . print "Search in Category" link as well
2014-02-05 05:05:43 +04:00
// . print one dmoz category line for "catid" into "sb": a "Search in
//   Category" link (query gbipcatid:<catid>) followed by a link whose
//   target and text both come from g_categories->printPathFromId()
// . when the query language (si->m_queryLangId) is known, a category whose
//   path is under "World" and maps to a different known language is
//   skipped: nothing is printed and true is returned
// . "st" is not referenced in the lines visible here
// . every visible return path returns true
// . NOTE(review): some lines of this function (closing braces and tags)
//   appear to be missing from this copy
static bool printDMOZCategoryUnderResult ( SafeBuf *sb ,
2013-10-03 08:34:21 +04:00
SearchInput *si,
long catid ,
State0 *st ) {
//uint8_t queryLanguage = langUnknown;
uint8_t queryLanguage = si->m_queryLangId;
2013-10-03 08:34:21 +04:00
// Don't print category if not in native language category
// Note that this only trims out "World" cats, not all
// of them. Some of them may still sneak in.
// queryLanguage = si->m_langHint;
2013-10-03 08:34:21 +04:00
if(queryLanguage != langUnknown) {
// render the category path into a stack-backed SafeBuf so its prefix
// can be inspected
char tmpbuf[1024];
SafeBuf langsb(tmpbuf, 1024);
g_categories->printPathFromId(&langsb, catid, false);
char *ptr = langsb.getBufStart();
// NOTE(review): the language lookup reads ptr + 7 before the "World: "
// prefix has been checked; confirm the path is always at least 7 chars
uint8_t lang = g_langId.findLangFromDMOZTopic(ptr + 7);
// NOTE(review): only 6 chars ("World:") are compared although the
// literal is 7 chars long and 7 are skipped above; confirm intended
if(!strncmp("World: ", ptr, 6) &&
lang != langUnknown &&
lang != queryLanguage)
// do not print it if not in our language
return true;
// print a link to apply your query to this DMOZ category
2014-02-05 05:05:43 +04:00
sb->safePrintf("<a href=\"/search?s=0&q=gbipcatid%%3A%li",catid);
sb->safePrintf("\">Search in Category</a>: ");
2013-10-03 08:34:21 +04:00
// setup the host of the url
//if ( dmozHost )
2014-02-05 05:05:43 +04:00
// sb->safePrintf("<a href=\"http://%s/", dmozHost );
2013-10-03 08:34:21 +04:00
2014-02-05 05:05:43 +04:00
// the category link is relative to this server's root
sb->safePrintf("<a href=\"/");
2013-10-03 08:34:21 +04:00
// print link
2014-02-05 05:05:43 +04:00
g_categories->printPathFromId(sb, catid, true,si->m_isRTL);
2013-10-03 08:34:21 +04:00
// print the name of the dmoz category
2014-02-05 05:05:43 +04:00
sb->safePrintf("<font color=#c62939>");
g_categories->printPathFromId(sb, catid, false,si->m_isRTL);
2013-10-03 08:34:21 +04:00
return true;
2013-08-03 00:12:24 +04:00
// use this for xml as well as html
bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
2014-02-04 07:17:58 +04:00
SafeBuf *sb = &st->m_sb;
2014-04-09 06:34:43 +04:00
HttpRequest *hr = &st->m_hr;
2014-02-04 07:17:58 +04:00
CollectionRec *cr = NULL;
cr = g_collectiondb.getRec ( st->m_collnum );
if ( ! cr ) {
2014-04-10 06:51:36 +04:00
log("query: printResult: collnum %li gone",
2014-02-04 07:17:58 +04:00
return true;
2013-08-03 00:12:24 +04:00
// shortcuts
SearchInput *si = &st->m_si;
Msg40 *msg40 = &st->m_msg40;
// ensure not all cluster levels are invisible
if ( si->m_debug )
logf(LOG_DEBUG,"query: result #%li clusterlevel=%li",
ix, (long)msg40->getClusterLevel(ix));
2013-12-06 20:53:32 +04:00
long long d = msg40->getDocId(ix);
2013-08-03 00:12:24 +04:00
if ( si->m_docIdsOnly ) {
if ( si->m_format == FORMAT_XML )
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2013-12-06 20:53:32 +04:00
d );
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-12-06 20:53:32 +04:00
d );
// inc it
*numPrintedSoFar = *numPrintedSoFar + 1;
2013-08-03 00:12:24 +04:00
return true;
2013-12-06 20:53:32 +04:00
Msg20 *m20 ;
if ( si->m_streamResults )
m20 = msg40->getCompletedSummary(ix);
m20 = msg40->m_msg20[ix];
// get the reply
Msg20Reply *mr = m20->m_r;
2013-08-03 00:12:24 +04:00
// . sometimes the msg20reply is NULL so prevent it coring
// . i think this happens if all hosts in a shard are down or timeout
// or something
if ( ! mr ) return false;
// . if section voting info was requested, display it now, it's in json
// . so if in csv it will mess things up!!!
if ( mr->ptr_sectionVotingInfo )
// it is possible this is just "\0"
2014-02-05 05:05:43 +04:00
sb->safeStrcpy ( mr->ptr_sectionVotingInfo );
// each "result" is the actual cached page, in this case, a json
// object, because we were called with &icc=1. in that situation
// ptr_content is set in the msg20reply.
if ( si->m_format == FORMAT_CSV &&
mr->ptr_content &&
mr->m_contentType == CT_JSON ) {
// parse it up
char *json = mr->ptr_content;
// only print header row once, so pass in that flag
if ( ! st->m_printedHeaderRow ) {
2014-02-05 05:05:43 +04:00
printCSVHeaderRow ( sb , st );
st->m_printedHeaderRow = true;
2014-02-05 05:05:43 +04:00
printJsonItemInCSV ( json , sb , st );
// inc it
*numPrintedSoFar = *numPrintedSoFar + 1;
return true;
// just print cached web page?
if ( mr->ptr_content ) {
// for json items separate with \n,\n
if ( si->m_format != FORMAT_HTML && *numPrintedSoFar > 0 )
// a dud? just print empty {}'s
if ( mr->size_content == 1 )
sb->safeStrcpy ( mr->ptr_content );
// . let's hack the spidertime onto the end
// . so when we sort by that using gbsortby:spiderdate
// we can ensure it is ordered correctly
2014-05-14 04:23:07 +04:00
// As of the update on 5/13/2014, the end of sb may have whitespace, so first move away from that
int distance; // distance from end to first non-whitespace char
char *end;
2014-05-14 04:25:42 +04:00
for (distance = 1; distance < sb->getLength(); distance++) {
2014-05-14 04:23:07 +04:00
end = sb->getBuf() - distance;
if (!is_wspace_a(*end))
if ( si->m_format == FORMAT_JSON &&
end > sb->getBufStart() &&
*end == '}' ) {
// replace trailing } with spidertime}
2014-05-14 04:23:07 +04:00
// comma?
if ( mr->size_content>1 ) sb->pushChar(',');
sb->safePrintf("\"docId\":%lli", mr->m_docId);
2014-03-21 02:33:37 +04:00
// for deduping
// crap, we lose resolution storing as a float
// so fix that shit here...
//float f = mr->m_lastSpidered;
// MDW: this is VERY convenient for debugging pls
// leave in. we can easily see if a result
// should be there for a query like
// gbmin:gbspiderdate:12345678
// also include a timestamp field with an RFC 1123 formatted date
char timestamp[50];
struct tm *ptm = gmtime ( &mr->m_lastSpidered );
strftime(timestamp, 50, "%a, %d %b %Y %X %Z", ptm);
sb->safePrintf(",\"timestamp\":\"%s\"}\n", timestamp);
//mr->size_content );
if ( si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
// inc it
*numPrintedSoFar = *numPrintedSoFar + 1;
// just in case
2014-02-05 05:05:43 +04:00
return true;
2014-02-05 05:05:43 +04:00
if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t<result>\n" );
2013-08-03 00:12:24 +04:00
Highlight hi;
2013-08-03 00:12:24 +04:00
// get the url
char *url = mr->ptr_ubuf ;
long urlLen = mr->size_ubuf - 1 ;
long err = mr->m_errno ;
// . remove any session ids from the url
// . for speed reasons, only check if its a cgi url
Url uu;
uu.set ( url , urlLen, false, true );
url = uu.getUrl();
urlLen = uu.getUrlLen();
// get my site hash
unsigned long long siteHash = 0;
if ( uu.getHostLen() > 0 )
siteHash = hash64(uu.getHost(),uu.getHostLen());
// indent it if level is 2
bool indent = false;
bool isAdmin = si->m_isAdmin;
if ( si->m_format == FORMAT_XML ) isAdmin = false;
2013-08-03 00:12:24 +04:00
//unsigned long long lastSiteHash = siteHash;
if ( indent && si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// print the rank. it starts at 0 so add 1
if ( si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<table><tr><td valign=top>%li.</td><td>",
2013-08-03 00:12:24 +04:00
ix+1 + si->m_firstResultNum );
if ( si->m_showBanned ) {
if ( err == EDOCBANNED ) err = 0;
if ( err == EDOCFILTERED ) err = 0;
// if this msg20 had an error print "had error"
if ( err || urlLen <= 0 || ! url ) {
// it's unprofessional to display this in browser
// so just let admin see it
if ( isAdmin ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf("<i>docId %lli had error: "
2013-08-03 00:12:24 +04:00
// log it too!
log("query: docId %lli had error: %s.",
// wrap it up if clustered
2014-02-05 05:05:43 +04:00
if ( indent ) sb->safeMemcpy("</blockquote>",13);
// inc it
*numPrintedSoFar = *numPrintedSoFar + 1;
2013-08-03 00:12:24 +04:00
return true;
// the score if admin
if ( isAdmin ) {
long level = (long)msg40->getClusterLevel(ix);
// print out score
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "s=%.03f "
2013-08-03 00:12:24 +04:00
"docid=%llu "
"sitenuminlinks=%li%% "
"hop=%li "
"cluster=%li "
"summaryLang=%s "
(float)msg40->getScore(ix) ,
(long )mr->m_siteNumInlinks,
level ,
2014-04-09 06:34:43 +04:00
char *diffbotSuffix = strstr(url,"-diffbotxyz");
2013-08-03 00:12:24 +04:00
// print youtube and metacafe thumbnails here
// get the thumbnail url
2014-05-11 01:24:13 +04:00
if ( mr->ptr_imgUrl &&
si->m_format == FORMAT_HTML &&
// if we got thumbnail use that not this
! mr->ptr_imgData )
sb->safePrintf ("<a href=%s><img src=%s></a>",
2013-08-03 00:12:24 +04:00
2014-04-09 06:34:43 +04:00
// if we have a thumbnail show it next to the search result
if ( si->m_format == FORMAT_HTML &&
2014-05-11 01:24:13 +04:00
//! mr->ptr_imgUrl &&
mr->ptr_imgData ) {
2014-04-27 22:05:30 +04:00
ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData;
ThumbnailInfo *ti = ta->getThumbnailInfo(0);
2014-05-11 01:30:30 +04:00
ti->printThumbnailInHtml ( sb ,
100 , // max width
100 , // max height
true , // add <a href>
2014-05-11 18:06:35 +04:00
" style=\"margin:10px;\" ");
2014-04-17 08:36:28 +04:00
2014-04-09 06:34:43 +04:00
// print image for widget
2014-04-21 20:21:28 +04:00
if ( //mr->ptr_imgUrl &&
2014-04-17 02:35:16 +04:00
( si->m_format == FORMAT_WIDGET_IFRAME ||
2014-05-06 21:47:57 +04:00
si->m_format == FORMAT_WIDGET_AJAX ||
si->m_format == FORMAT_WIDGET_APPEND ) ) {
2014-04-09 06:34:43 +04:00
long widgetWidth = hr->getLong("widgetwidth",200);
// prevent coring
if ( widgetWidth < 1 ) widgetWidth = 1;
2014-05-01 00:17:39 +04:00
// each search result in widget has a div around it
2014-04-09 06:34:43 +04:00
sb->safePrintf("<div "
2014-05-01 00:17:39 +04:00
"class=result "
2014-05-06 21:47:57 +04:00
// we need the docid and score of last result
// when we append new results to the end
// of the widget for infinite scrolling
// using the scripts in PageBasic.cpp
"docid=%lli "
"score=%f " // double
2014-04-09 06:34:43 +04:00
2014-04-17 21:30:56 +04:00
2014-04-18 02:58:02 +04:00
2014-05-01 00:17:39 +04:00
2014-05-06 21:47:57 +04:00
, mr->m_docId
// this is a double now. this won't work
// for streaming...
, msg40->m_msg3a.m_scores[ix]
, widgetWidth - 2*8 // padding is 8px
2014-04-17 21:30:56 +04:00
2014-04-18 02:58:02 +04:00
, (long)PADDING
2014-04-21 20:21:28 +04:00
// if ( mr->ptr_imgUrl )
// sb->safePrintf("background-repeat:no-repeat;"
// "background-size:%lipx 140px;"
// "background-image:url('%s');"
// , widgetwidth - 2*8 // padding is 8px
// , mr->ptr_imgUrl);
2014-05-01 00:17:39 +04:00
long newdx = 0;
if ( mr->ptr_imgData ) {
ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData;
ThumbnailInfo *ti = ta->getThumbnailInfo(0);
2014-05-01 00:17:39 +04:00
// account for scrollbar on the right
2014-05-11 01:30:30 +04:00
long maxWidth = widgetWidth - (long)SCROLLBAR_WIDTH;
long maxHeight = (long)RESULT_HEIGHT;
2014-05-11 01:24:13 +04:00
// false = do not print <a href> link on image
2014-05-11 01:30:30 +04:00
ti->printThumbnailInHtml ( sb ,
maxWidth ,
maxHeight ,
false , // add <a href>
&newdx );
2014-04-17 08:36:28 +04:00
// end the div style attribute and div tag
2014-05-01 00:17:39 +04:00
2014-04-09 06:34:43 +04:00
sb->safePrintf ( "<a "
"target=_blank "
2014-05-01 00:17:39 +04:00
// don't let scroll bar obscure text
// if thumbnail is wide enough put text on top of it, otherwise
// image is to the left and text is to the right of image
if ( newdx > .5 * widgetWidth )
, (long) PADDING
, (long) PADDING
// to align the text vertically we gotta make a textbox div
// otherwise it wraps below image! mdw
// sb->safePrintf("vertical-align:middle;");
, (long) PADDING
, (long) PADDING + newdx + 10 );
// close the style and begin the url
sb->safePrintf( "\" "
2014-04-09 06:34:43 +04:00
// truncate off -diffbotxyz%li
long newLen = urlLen;
if ( diffbotSuffix ) newLen = diffbotSuffix - url;
// print the url in the href tag
sb->safeMemcpy ( url , newLen );
// then finish the a href tag and start a bold for title
sb->safePrintf ( "\">");//<font size=+0>" );
2014-04-09 06:34:43 +04:00
sb->safePrintf("<b style=\""
2014-04-09 22:03:31 +04:00
"font-size: 15px;"
2014-04-09 06:34:43 +04:00
2014-05-01 00:44:19 +04:00
// add padding so shadow does not stick out
2014-04-09 06:34:43 +04:00
//"text-shadow:2px 4px 3px rgba(0,0,1,3);"
"text-shadow: 2px 2px 0 #000 "
",-2px -2px 0 #000 "
",-2px 2px 0 #000 "
", 2px -2px 0 #000 "
", 2px -2px 0 #000 "
", 0px -2px 0 #000 "
", 0px 2px 0 #000 "
", -2px 0px 0 #000 "
", 2px 0px 0 #000 "
//"-2px 2px 0 #000 "
//"2px -2px 0 #000 "
//"-2px -2px 0 #000;"
2014-05-11 01:24:13 +04:00
//sb->safePrintf ("<img width=50 height=50 src=%s></a>",
2014-04-09 06:34:43 +04:00
// mr->ptr_imgUrl);
// then title over image
// only do link here if we have no thumbnail so no bg image
if ( (si->m_format == FORMAT_WIDGET_IFRAME ||
2014-05-06 21:47:57 +04:00
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX ) &&
! mr->ptr_imgData ) {
sb->safePrintf ( "<a style=text-decoration:none;"
"color:white; "
"href=" );
// truncate off -diffbotxyz%li
long newLen = urlLen;
if ( diffbotSuffix ) newLen = diffbotSuffix - url;
// print the url in the href tag
sb->safeMemcpy ( url , newLen );
// then finish the a href tag and start a bold for title
sb->safePrintf ( ">");//<font size=+0>" );
2014-04-09 06:34:43 +04:00
2013-08-03 00:12:24 +04:00
// the a href tag
2014-02-05 05:05:43 +04:00
if ( si->m_format == FORMAT_HTML ) sb->safePrintf ( "\n\n" );
2013-08-03 00:12:24 +04:00
// then if it is banned
if ( mr->m_isBanned && si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<font color=red><b>BANNED</b></font> ");
2013-08-03 00:12:24 +04:00
2014-04-09 06:34:43 +04:00
2013-10-03 08:34:21 +04:00
2013-08-03 00:12:24 +04:00
// the a href tag
if ( si->m_format == FORMAT_HTML ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "<a href=" );
2014-04-09 06:34:43 +04:00
// truncate off -diffbotxyz%li
long newLen = urlLen;
if ( diffbotSuffix ) newLen = diffbotSuffix - url;
2013-08-03 00:12:24 +04:00
// print the url in the href tag
2014-04-09 06:34:43 +04:00
sb->safeMemcpy ( url , newLen );
2013-08-03 00:12:24 +04:00
// then finish the a href tag and start a bold for title
2014-02-05 05:05:43 +04:00
sb->safePrintf ( ">");//<font size=+0>" );
2013-08-03 00:12:24 +04:00
// . then the title (should be NULL terminated)
// . the title can be NULL
// . highlight it first
// . the title itself should not have any tags in it!
char *str = mr->ptr_tbuf;//msg40->getTitle(i);
long strLen = mr->size_tbuf - 1;// msg40->getTitleLen(i);
if ( ! str || strLen < 0 ) strLen = 0;
2013-10-03 08:34:21 +04:00
// are we printing a dmoz category page?
// get the appropriate dmoz title/summary to use since the same
// url can exist in multiple topics (catIds) with different
// titles summaries.
char *dmozSummary = NULL;
// TODO: just get the catid from httprequest directly?
if ( si->m_catId > 0 ) { // si->m_cat_dirId > 0) {
// . get the dmoz title and summary
// . if empty then just a bunch of \0s, except for catIds
Msg20Reply *mr = m20->getReply();
char *dmozTitle = mr->ptr_dmozTitles;
dmozSummary = mr->ptr_dmozSumms;
char *dmozAnchor = mr->ptr_dmozAnchors;
long *catIds = mr->ptr_catIds;
long numCats = mr->size_catIds / 4;
// loop through looking for the right ID
for (long i = 0; i < numCats ; i++ ) {
// assign shit if we match the dmoz cat we are showing
if ( catIds[i] == si->m_catId) break;
dmozTitle +=gbstrlen(dmozTitle)+1;
dmozSummary +=gbstrlen(dmozSummary)+1;
dmozAnchor += gbstrlen(dmozAnchor)+1;
// now make the title the dmoz title
str = dmozTitle;
strLen = gbstrlen(str);
2013-08-03 00:12:24 +04:00
long hlen;
//copy all summary and title excerpts for this result into here
//char tt[1024*32];
//char *ttend = tt + 1024*32;
2013-08-03 00:12:24 +04:00
char *frontTag =
"<font style=\"color:black;background-color:yellow\">" ;
char *backTag = "</font>";
if ( si->m_format == FORMAT_XML ) {
2013-08-03 00:12:24 +04:00
frontTag = "<b>";
backTag = "</b>";
2014-04-17 02:35:16 +04:00
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
2014-05-06 21:47:57 +04:00
si->m_format == FORMAT_WIDGET_APPEND ||
2014-04-17 02:35:16 +04:00
si->m_format == FORMAT_WIDGET_AJAX ) {
2014-04-09 22:03:31 +04:00
frontTag = "<font style=\"background-color:yellow\">" ;
2013-08-03 00:12:24 +04:00
long cols = 80;
if ( si->m_format == FORMAT_XML )
2014-02-05 05:05:43 +04:00
SafeBuf hb;
2013-08-03 00:12:24 +04:00
if ( str && strLen && si->m_doQueryHighlighting ) {
hlen = hi.set ( &hb,
//tt ,
//ttend - tt ,
2013-08-03 00:12:24 +04:00
strLen ,
mr->m_language, // docLangId
&si->m_hqq , // highlight query CLASS
false , // doStemming?
false , // use click&scroll?
NULL , // base url
0 ); // niceness
2014-02-05 05:05:43 +04:00
//if (!sb->utf8Encode2(tt, hlen)) return false;
if ( ! sb->brify ( hb.getBufStart(),
cols) ) return false;
2013-08-03 00:12:24 +04:00
else if ( str && strLen ) {
// determine if TiTle wraps, if it does add a <br> count for
// each wrap
2014-02-05 05:05:43 +04:00
//if (!sb->utf8Encode2(str , strLen )) return false;
if ( ! sb->brify ( str,strLen,0,cols) ) return false;
2013-08-03 00:12:24 +04:00
// . use "UNTITLED" if no title
// . msg20 should supply the dmoz title if it can
if ( strLen == 0 ) {
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
return false;
// close up the title tag
2014-02-05 05:05:43 +04:00
if ( si->m_format == FORMAT_XML ) sb->safePrintf("]]></title>\n");
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
if ( si->m_format == FORMAT_HTML ) sb->safePrintf ("</a><br>\n" ) ;
2013-08-03 00:12:24 +04:00
2014-04-09 06:34:43 +04:00
2014-05-01 00:17:39 +04:00
// close the title tag stuff
2014-04-17 02:35:16 +04:00
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
2014-05-06 21:47:57 +04:00
si->m_format == FORMAT_WIDGET_APPEND ||
2014-04-17 02:35:16 +04:00
si->m_format == FORMAT_WIDGET_AJAX )
2014-05-01 00:17:39 +04:00
2014-04-09 06:34:43 +04:00
2013-10-03 08:34:21 +04:00
2013-08-03 00:12:24 +04:00
// print content type after title
2013-10-03 08:34:21 +04:00
2013-08-03 00:12:24 +04:00
unsigned char ctype = mr->m_contentType;
if ( ctype != CT_HTML && ctype != CT_UNKNOWN ){//&&ctype <= CT_JSON ) {
2013-08-03 00:12:24 +04:00
char *cs = g_contentTypeStrings[ctype];
if ( si->m_format == FORMAT_XML )
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-04-27 22:05:30 +04:00
else if ( si->m_format == FORMAT_HTML && ctype != CT_HTML ) {
sb->safePrintf(" <b><font style=color:white;"
char *p = cs;
for ( ; *p ; p++ ) {
char c = to_upper_a(*p);
sb->safePrintf("</font></b> &nbsp;");
2013-08-03 00:12:24 +04:00
2013-10-03 08:34:21 +04:00
// print the summary
2013-08-03 00:12:24 +04:00
// . then the summary
// . "s" is a string of null terminated strings
char *send;
// do the normal summary
str = mr->ptr_sum;
strLen = mr->size_sum-1;
// this includes the terminating \0 or \0\0 so back up
if ( strLen < 0 ) strLen = 0;
send = str + strLen;
2013-10-03 08:34:21 +04:00
// dmoz summary might override if we are showing a dmoz topic page
if ( dmozSummary ) {
str = dmozSummary;
strLen = gbstrlen(dmozSummary);
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t<sum><![CDATA[");
2013-08-03 00:12:24 +04:00
2014-04-09 22:03:31 +04:00
bool printSummary = true;
// do not print summaries for widgets by default unless overridden
// with &summary=1
2014-04-17 02:35:16 +04:00
if ( (si->m_format == FORMAT_WIDGET_IFRAME ||
2014-05-06 21:47:57 +04:00
si->m_format == FORMAT_WIDGET_APPEND ||
2014-04-17 02:35:16 +04:00
si->m_format == FORMAT_WIDGET_AJAX ) &&
hr->getLong("summaries",0) == 0 )
2014-04-09 22:03:31 +04:00
printSummary = false;
if ( printSummary )
sb->brify ( str , strLen, 0 , cols ); // niceness = 0
2013-08-03 00:12:24 +04:00
// close xml tag
2014-02-05 05:05:43 +04:00
if ( si->m_format == FORMAT_XML ) sb->safePrintf("]]></sum>\n");
2013-08-03 00:12:24 +04:00
// new line if not xml
2014-02-05 05:05:43 +04:00
else if ( strLen ) sb->safePrintf("<br>\n");
2013-08-03 00:12:24 +04:00
2013-10-03 08:34:21 +04:00
// . print DMOZ topics under the summary
// . will print the "Search in Category" link too
//Msg20Reply *mr = m20->getMsg20Reply();
long nCatIds = mr->getNumCatIds();
for (long i = 0; i < nCatIds; i++) {
long catid = ((long *)(mr->ptr_catIds))[i];
// skipCatsPrint:
// print the indirect category Ids
long nIndCatids = mr->size_indCatIds / 4;
//if ( !cr->m_displayIndirectDmozCategories )
// goto skipCatsPrint2;
for ( long i = 0; i < nIndCatids; i++ ) {
long catid = ((long *)(mr->ptr_indCatIds))[i];
// skip it if it's a regular category
//bool skip = false;
long d; for ( d = 0; d < nCatIds; d++) {
if ( catid == mr->ptr_catIds[i] ) break;
// skip if the indirect catid matched a directed catid
if ( d < nCatIds ) continue;
// otherwise print it
// print the URL
2013-08-03 00:12:24 +04:00
// hack off the http:// if any for displaying it on screen
if ( urlLen > 8 && strncmp ( url , "http://" , 7 )==0 ) {
url += 7; urlLen -= 7; }
// . remove trailing /
// . only remove from root urls in case user cuts and
// pastes it for link: search
if ( url [ urlLen - 1 ] == '/' ) {
// see if any other slash before us
long j;
for ( j = urlLen - 2 ; j >= 0 ; j-- )
if ( url[j] == '/' ) break;
// if there wasn't, we must have been a root url
// so hack off the last slash
if ( j < 0 ) urlLen--;
if ( si->m_format == FORMAT_HTML ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf ("<font color=gray>" );
//sb->htmlEncode ( url , gbstrlen(url) , false );
2013-08-03 00:12:24 +04:00
// 20 for the date after it
2014-02-05 05:05:43 +04:00
sb->safeTruncateEllipsis ( url , cols - 30 );
2013-08-03 00:12:24 +04:00
// turn off the color
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "</font>\n" );
2013-08-03 00:12:24 +04:00
if ( si->m_format == FORMAT_XML ) {
2014-02-05 05:05:43 +04:00
sb->safeMemcpy ( url , urlLen );
2013-08-03 00:12:24 +04:00
// now the last spidered date of the document
time_t ts = mr->m_lastSpidered;
if ( si->m_format == FORMAT_HTML )
printTimeAgo ( sb , ts , "indexed" , si );
2013-08-03 00:12:24 +04:00
// the date it was last modified
ts = mr->m_lastModified;
if ( si->m_format == FORMAT_HTML )
printTimeAgo ( sb , ts , "modified" , si );
2013-08-03 00:12:24 +04:00
// more xml stuff
if ( si->m_format == FORMAT_XML ) {
2013-08-03 00:12:24 +04:00
// doc size in Kilobytes
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\t\t<size><![CDATA[%4.0fk]]></size>\n",
2013-08-03 00:12:24 +04:00
// . docId for possible cached link
// . might have merged a bunch together
2014-02-05 05:05:43 +04:00
sb->safePrintf("\t\t<docId>%lli</docId>\n",mr->m_docId );
2013-08-03 00:12:24 +04:00
// . show the site root
// . for this will be
// . for this will be
// etc.
long siteLen = 0;
char *site = NULL;
// seems like this isn't the way to do it, cuz Tagdb.cpp
// adds the "site" tag itself and we do not always have it
// in the XmlDoc::ptr_tagRec... so do it this way:
site = mr->ptr_site;
siteLen = mr->size_site-1;
//char *site=uu.getSite( &siteLen , si->m_coll, false, tagRec);
2014-02-05 05:05:43 +04:00
if ( site && siteLen > 0 ) sb->safeMemcpy ( site , siteLen );
2013-08-03 00:12:24 +04:00
//long sh = hash32 ( site , siteLen );
2014-02-05 05:05:43 +04:00
//sb->safePrintf ("\t\t<siteHash32>%lu</siteHash32>\n",sh);
2013-08-03 00:12:24 +04:00
//long dh = uu.getDomainHash32 ();
2014-02-05 05:05:43 +04:00
//sb->safePrintf ("\t\t<domainHash32>%lu</domainHash32>\n",dh);
2013-08-03 00:12:24 +04:00
// spider date
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\t\t<spidered>%lu</spidered>\n",
2013-08-03 00:12:24 +04:00
// backwards compatibility for buzz
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\t\t<firstIndexedDateUTC>%lu"
2013-08-03 00:12:24 +04:00
sb->safePrintf( "\t\t<contentHash32>%lu"
2013-08-03 00:12:24 +04:00
// pub date
long datedbDate = mr->m_datedbDate;
// show the datedb date as "<pubDate>" for now
if ( datedbDate != -1 )
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\t\t<pubdate>%lu</pubdate>\n",
2013-08-03 00:12:24 +04:00
// . we also store the outlinks in a linkInfo structure
// . we can call LinkInfo::set ( Links *outlinks ) to set it
// in the msg20
LinkInfo *outlinks = (LinkInfo *)mr->ptr_outlinks;
// NULLify if empty
if ( mr->size_outlinks <= 0 ) outlinks = NULL;
// only for xml for now
if ( si->m_format == FORMAT_HTML ) outlinks = NULL;
2013-08-03 00:12:24 +04:00
Inlink *k;
// do we need absScore2 for outlinks?
//k = NULL;
while ( outlinks &&
(k =outlinks->getNextInlink(k)))
// print it out
2014-02-05 05:05:43 +04:00
sb->safePrintf("\t\t<outlink "
2013-08-03 00:12:24 +04:00
"docId=\"%lli\" "
"hostId=\"%lu\" "
"indexed=\"%li\" "
"pubdate=\"%li\" ",
k->m_docId ,
k->m_ip, // hostHash, but use ip for now
(long)k->m_firstIndexedDate ,
(long)k->m_datedbDate );
if ( si->m_format == FORMAT_XML ) {
2013-08-03 00:12:24 +04:00
// result
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
char *charset = get_charset_str(mr->m_charset);
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"</charset>\n", charset);
// end more xml stuff
2014-04-09 06:34:43 +04:00
if ( isAdmin && si->m_format == FORMAT_HTML ) {
2013-08-03 00:12:24 +04:00
long lang = mr->m_language;
2014-02-05 05:05:43 +04:00
if ( lang ) sb->safePrintf(" - %s",getLanguageString(lang));
2013-08-03 00:12:24 +04:00
uint16_t cc = mr->m_computedCountry;
2014-02-05 05:05:43 +04:00
if( cc ) sb->safePrintf(" - %s", g_countryCode.getName(cc));
2013-08-03 00:12:24 +04:00
char *charset = get_charset_str(mr->m_charset);
2014-02-05 05:05:43 +04:00
if ( charset ) sb->safePrintf(" - %s ", charset);
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
if ( si->m_format == FORMAT_HTML ) sb->safePrintf("<br>\n");
2013-08-03 00:12:24 +04:00
2014-03-07 05:01:41 +04:00
//char *coll = si->m_cr->m_coll;
2013-08-03 00:12:24 +04:00
// print the [cached] link?
bool printCached = true;
if ( mr->m_noArchive ) printCached = false;
if ( isAdmin ) printCached = true;
if ( mr->m_contentLen <= 0 ) printCached = false;
2014-04-09 06:34:43 +04:00
if ( si->m_format != FORMAT_HTML ) printCached = false;
2014-03-07 05:01:41 +04:00
// get collnum result is from
//collnum_t collnum = si->m_cr->m_collnum;
// if searching multiple collections - federated search
CollectionRec *scr = g_collectiondb.getRec ( mr->m_collnum );
char *coll = "UNKNOWN";
if ( scr ) coll = scr->m_coll;
2013-08-03 00:12:24 +04:00
if ( printCached && cr->m_clickNScrollEnabled )
2014-02-05 05:05:43 +04:00
sb->safePrintf ( " - <a href=/scroll.html?page="
2013-08-03 00:12:24 +04:00
2014-02-04 07:17:58 +04:00
st->m_qe , coll ,
2013-08-03 00:12:24 +04:00
mr->m_docId );
else if ( printCached )
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "<a href=\""
2013-08-03 00:12:24 +04:00
2014-02-04 07:17:58 +04:00
st->m_qe ,
2013-08-03 00:12:24 +04:00
// "qlang" parm
2013-08-03 00:12:24 +04:00
coll ,
mr->m_docId );
// the new links
if ( si->m_format == FORMAT_HTML && g_conf.m_isMattWells && 1 == 0 ) {
2014-02-05 05:05:43 +04:00
//sb->safePrintf(" - <a href=\"/scoring?"
2013-08-03 00:12:24 +04:00
// "c=%s&\">scoring</a>",
// coll );
2014-02-05 05:05:43 +04:00
//sb->safePrintf(" - <a href=\"/print?c=%s&",coll);
if ( g_conf.m_isMattWells )
2014-02-05 05:05:43 +04:00
sb->safePrintf(" - <a href=\"/seo?");//c=%s&",coll);
2014-02-05 05:05:43 +04:00
sb->safePrintf(" - <a href=\"https://www.gigablast."
2014-02-05 05:05:43 +04:00
sb->urlEncode ( url , gbstrlen(url) , false );
//sb->safePrintf("&page=1\">seo</a>" );
sb->safePrintf("\"><font color=red>seo</font></a>" );
2013-08-03 00:12:24 +04:00
// only display re-spider link if addurl is enabled
//if ( isAdmin &&
// g_conf.m_addUrlEnabled &&
// cr->m_addUrlEnabled ) {
if ( si->m_format == FORMAT_HTML ) {
2013-08-03 00:12:24 +04:00
// the [respider] link
// save this for seo iframe!
2014-02-05 05:05:43 +04:00
sb->safePrintf (" - <a href=\"/admin/inject?u=" );
2013-08-03 00:12:24 +04:00
// encode the url now
2014-02-05 05:05:43 +04:00
sb->urlEncode ( url , urlLen );
2013-08-03 00:12:24 +04:00
// then collection
if ( coll ) {
2014-02-05 05:05:43 +04:00
sb->safeMemcpy ( "&c=" , 3 );
sb->safeMemcpy ( coll , gbstrlen(coll) );
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
//sb->safePrintf ( "&force=1\">reindex</a>" );
sb->safePrintf ( "\">reindex</a>" );
2013-08-03 00:12:24 +04:00
// unhide the divs on click
long placeHolder = -1;
long placeHolderLen;
if ( si->m_format == FORMAT_HTML ) {
2013-08-03 00:12:24 +04:00
// place holder for backlink table link
2014-02-05 05:05:43 +04:00
placeHolder = sb->length();
sb->safePrintf (" - <a onclick="
2013-08-03 00:12:24 +04:00
"var e = document.getElementById('bl%li');"
"if ( == 'none' ){"
" = '';"
"else {"
" = 'none';"
" "
"<u>00000 backlinks</u>"
, ix
2014-02-05 05:05:43 +04:00
placeHolderLen = sb->length() - placeHolder;
2013-08-03 00:12:24 +04:00
// unhide the scoring table on click
2014-02-05 05:05:43 +04:00
sb->safePrintf (" - <a onclick="
2013-08-03 00:12:24 +04:00
"var e = document.getElementById('sc%li');"
"if ( == 'none' ){"
" = '';"
"else {"
" = 'none';"
" "
// reindex
2014-02-05 05:05:43 +04:00
sb->safePrintf(" - <a style=color:red; href=\"/addurl?u=");
sb->urlEncode ( url , gbstrlen(url) , false );
2013-08-03 00:12:24 +04:00
unsigned long long rand64 = gettimeofdayInMillisecondsLocal();
2014-05-11 23:04:10 +04:00
2013-08-03 00:12:24 +04:00
// this stuff is secret just for local guys!
if ( si->m_format == FORMAT_HTML && ( isAdmin || cr->m_isCustomCrawl)){
2013-08-03 00:12:24 +04:00
// now the ip of url
//long urlip = msg40->getIp(i);
// don't combine this with the sprintf above cuz
// iptoa uses a static local buffer like ctime()
2014-02-05 05:05:43 +04:00
2013-09-17 00:59:11 +04:00
" &nbsp; - &nbsp; <a href=\"/search?"
2013-08-03 00:12:24 +04:00
coll,iptoa(mr->m_ip), iptoa(mr->m_ip) );
// ip domain link
unsigned char *us = (unsigned char *)&mr->m_ip;//urlip;
2014-02-05 05:05:43 +04:00
sb->safePrintf (" - <a href=\"/search?c=%s&sc=1&dr=0&n=100&"
2013-08-03 00:12:24 +04:00
// . now the info link
// . if it's local, don't put the hostname/port in
// there cuz it will mess up Global Spec's machine
//if ( h->m_groupId == g_hostdb.m_groupId )
2014-02-08 11:34:45 +04:00
sb.safePrintf(" - <a href=\"/admin/titledb?c=%s&"
2013-08-03 00:12:24 +04:00
// then the [info] link to show the TitleRec
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\">[info]</a>" );
2013-08-03 00:12:24 +04:00
// now the analyze link
2014-02-08 11:34:45 +04:00
sb.safePrintf (" - <a href=\"/admin/parser?c=%s&"
2013-08-03 00:12:24 +04:00
// encode the url now
2014-02-05 05:05:43 +04:00
sb->urlEncode ( url , urlLen );
2013-08-03 00:12:24 +04:00
// then the [analyze] link
2014-02-05 05:05:43 +04:00
sb->safePrintf ("\">[analyze]</a>" );
2013-08-03 00:12:24 +04:00
// and links: query link
2014-02-05 05:05:43 +04:00
sb->safePrintf( " - <a href=\"/search?c=%s&dr=0&"
2013-08-03 00:12:24 +04:00
// encode the url now
2014-02-05 05:05:43 +04:00
sb->urlEncode ( url , urlLen );
sb->safeMemcpy ("\">linkers</a>" , 14 );
2013-08-03 00:12:24 +04:00
// admin always gets the site: option so he can ban
if ( si->m_format == FORMAT_HTML && ( isAdmin || cr->m_isCustomCrawl)){
2013-08-03 00:12:24 +04:00
char dbuf [ MAX_URL_LEN ];
long dlen = uu.getDomainLen();
memcpy ( dbuf , uu.getDomain() , dlen );
dbuf [ dlen ] = '\0';
// newspaperarchive urls have no domain
if ( dlen == 0 ) {
dlen = uu.getHostLen();
memcpy ( dbuf , uu.getHost() , dlen );
dbuf [ dlen ] = '\0';
2014-02-05 05:05:43 +04:00
sb->safePrintf (" - "
2013-08-03 00:12:24 +04:00
" <a href=\"/search?"
"%s</a> " ,
dbuf ,
coll , dbuf );
2013-09-17 00:59:11 +04:00
char *un = "";
long banVal = 1;
if ( mr->m_isBanned ) {
un = "UN";
banVal = 0;
2014-02-05 05:05:43 +04:00
sb->safePrintf(" - "
2014-02-08 11:34:45 +04:00
" <a href=\"/admin/tagdb?"
2013-08-03 00:12:24 +04:00
2013-09-17 00:59:11 +04:00
2013-08-03 00:12:24 +04:00
2013-09-17 00:59:11 +04:00
"<nobr><b>%sBAN %s</b>"
"</nobr></a> "
, banVal
, dbuf
, coll
, un
, dbuf );
2013-08-03 00:12:24 +04:00
//banSites->safePrintf("%s+", dbuf);
dlen = uu.getHostLen();
memcpy ( dbuf , uu.getHost() , dlen );
dbuf [ dlen ] = '\0';
2014-02-05 05:05:43 +04:00
sb->safePrintf(" - "
2014-02-08 11:34:45 +04:00
" <a href=\"/admin/tagdb?"
2013-08-03 00:12:24 +04:00
2013-09-17 00:59:11 +04:00
2013-08-03 00:12:24 +04:00
2013-09-17 00:59:11 +04:00
"<nobr>%sBAN %s</nobr></a> "
, banVal
, dbuf
, coll
, un
, dbuf
// take similarity out until working again
2014-02-05 05:05:43 +04:00
sb->safePrintf (" - [similar -"
2013-08-03 00:12:24 +04:00
" <a href=\"/search?"
"tag</a> " ,
(long)mr->m_tagVectorHash, coll);
2014-02-05 05:05:43 +04:00
sb->safePrintf ("<a href=\"/search?"
2013-08-03 00:12:24 +04:00
"topic</a> " ,
(long)mr->m_gigabitVectorHash, coll);
2013-09-17 00:59:11 +04:00
2013-08-03 00:12:24 +04:00
if ( mr->size_gbAdIds > 0 )
2014-02-05 05:05:43 +04:00
sb->safePrintf ("<a href=\"/search?"
2013-08-03 00:12:24 +04:00
"Ad Id</a> " ,
mr->ptr_gbAdIds, coll);
2014-02-05 05:05:43 +04:00
//sb->safePrintf ("] ");
2013-08-03 00:12:24 +04:00
long urlFilterNum = (long)mr->m_urlFilterNum;
if(urlFilterNum != -1) {
2014-02-05 05:05:43 +04:00
sb->safePrintf (" - <a href=/admin/filters?c=%s>"
coll ,
2013-08-03 00:12:24 +04:00
// print the help
SafeBuf help;
help.safePrintf("The distance matrix uses the "
"following formula to calculate "
"a score in a table cell for a pair of query terms: "
"<span style=\""
"border:1px black solid;"
"SCORE = (%li - |pos1-pos2|) * "
"locationWeight * "
"densityWeight * "
"synWeight1 * "
"synWeight2 * "
"spamWeight1 * "
"spamWeight2 * "
"tfWeight1 * "
, (long)MAXWORDPOS+1
"<tr><td>pos1</td><td>The word position of "
"query term 1</td></tr>"
"<tr><td>pos2</td><td>The word position of "
"query term 2</td></tr>"
//"locationWeight is based on where "
//"the two terms occur in the document "
//"and uses the following table: <br>"
"<tr><td>term location</td>"
for ( long i = 0 ; i < HASHGROUP_END ; i++ ) {
char *hs = getHashGroupString(i);
float hw = s_hashGroupWeights[i];
,hs,hw );
"<tr><td>max # alphanumeric words in location</td>"
for ( long i = 0 ; i < MAXDENSITYRANK ; i++ ) {
,maxw,i,dweight );
2014-05-01 00:17:39 +04:00
// end serp div
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
2014-05-06 21:47:57 +04:00
si->m_format == FORMAT_WIDGET_APPEND ||
2014-05-01 00:17:39 +04:00
si->m_format == FORMAT_WIDGET_AJAX )
2014-04-10 06:51:36 +04:00
if ( si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "<br><br>\n");
2013-08-03 00:12:24 +04:00
2014-04-18 02:58:02 +04:00
// search result spacer
2014-04-17 02:35:16 +04:00
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
2014-05-06 21:47:57 +04:00
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX )
2014-04-18 02:58:02 +04:00
sb->safePrintf("<div style=line-height:%lipx;><br></div>",
2014-04-10 06:51:36 +04:00
// inc it
*numPrintedSoFar = *numPrintedSoFar + 1;
2013-08-03 00:12:24 +04:00
// done?
DocIdScore *dp = msg40->getScoreInfo(ix);
if ( ! dp ) {
if ( si->m_format == FORMAT_XML )
2014-02-05 05:05:43 +04:00
sb->safePrintf ("\t</result>\n\n");
2013-08-03 00:12:24 +04:00
// wtf?
//char *xx=NULL;*xx=0;
// at least close up the table
if ( si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
return true;
// scoring info tables
long nr = dp->m_numRequiredTerms;
if ( nr == 1 ) nr = 0;
// print breakout tables here for distance matrix
//SafeBuf bt;
// final score calc
SafeBuf ft;
// shortcut
//Query *q = si->m_q;
// put in a hidden div so you can unhide it
if ( si->m_format == FORMAT_HTML )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<div id=bl%li style=display:none;>\n", ix );
2013-08-03 00:12:24 +04:00
// print xml and html inlinks
long numInlinks = 0;
2013-08-03 00:12:24 +04:00
printInlinkText ( sb , mr , si , &numInlinks );
if ( si->m_format == FORMAT_HTML ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf("<div id=sc%li style=display:none;>\n", ix );
2013-08-03 00:12:24 +04:00
// if pair changes then display the sum
long lastTermNum1 = -1;
long lastTermNum2 = -1;
float minScore = -1;
// display all the PairScores
for ( long i = 0 ; i < dp->m_numPairs ; i++ ) {
float totalPairScore = 0.0;
// print all the top winners for this pair
PairScore *fps = &dp->m_pairScores[i];
// if same combo as last time skip
if ( fps->m_qtermNum1 == lastTermNum1 &&
fps->m_qtermNum2 == lastTermNum2 )
lastTermNum1 = fps->m_qtermNum1;
lastTermNum2 = fps->m_qtermNum2;
bool firstTime = true;
bool first = true;
// print all pairs for this combo
for ( long j = i ; j < dp->m_numPairs ; j++ ) {
// get it
PairScore *ps = &dp->m_pairScores[j];
// stop if different pair now
if ( ps->m_qtermNum1 != fps->m_qtermNum1 ) break;
if ( ps->m_qtermNum2 != fps->m_qtermNum2 ) break;
// skip if 0. neighborhood terms have weight of 0 now
if ( ps->m_finalScore == 0.0 ) continue;
// first time?
if ( firstTime && si->m_format == FORMAT_HTML ) {
Query *q = &si->m_q;
2013-08-03 00:12:24 +04:00
printTermPairs ( sb , q , ps );
printScoresHeader ( sb );
firstTime = false;
// print it
printPairScore ( sb , si , ps , mr , msg40 , first );
// not first any more!
first = false;
// add it up
totalPairScore += ps->m_finalScore;
if ( ft.length() ) ft.safePrintf(" , ");
// min?
if ( minScore < 0.0 || totalPairScore < minScore )
minScore = totalPairScore;
// we need to set "ft" for xml stuff below
if ( si->m_format == FORMAT_XML ) continue;
2014-02-05 05:05:43 +04:00
//sb->safePrintf("<table border=1><tr><td><center><b>");
2013-08-03 00:12:24 +04:00
// print pair text
//long qtn1 = fps->m_qtermNum1;
//long qtn2 = fps->m_qtermNum2;
//if ( q->m_qterms[qtn1].m_isPhrase )
2014-02-05 05:05:43 +04:00
// sb->pushChar('\"');
//sb->safeMemcpy ( q->m_qterms[qtn1].m_term ,
2013-08-03 00:12:24 +04:00
// q->m_qterms[qtn1].m_termLen );
//if ( q->m_qterms[qtn1].m_isPhrase )
2014-02-05 05:05:43 +04:00
// sb->pushChar('\"');
//sb->safePrintf("</b> vs <b>");
2013-08-03 00:12:24 +04:00
//if ( q->m_qterms[qtn2].m_isPhrase )
2014-02-05 05:05:43 +04:00
// sb->pushChar('\"');
//sb->safeMemcpy ( q->m_qterms[qtn2].m_term ,
2013-08-03 00:12:24 +04:00
// q->m_qterms[qtn2].m_termLen );
//if ( q->m_qterms[qtn2].m_isPhrase )
2014-02-05 05:05:43 +04:00
// sb->pushChar('\"');
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"<td colspan=20>total of above scores</td>"
// close table from printScoresHeader
2014-02-05 05:05:43 +04:00
if ( ! firstTime ) sb->safePrintf("</table><br>");
2013-08-03 00:12:24 +04:00
// close the distance table
2014-02-05 05:05:43 +04:00
//if ( nr ) sb->safePrintf("</table>");
2013-08-03 00:12:24 +04:00
// print the breakout tables
//if ( nr ) {
2014-02-05 05:05:43 +04:00
// //sb->safePrintf("<br>");
// sb->safeMemcpy ( &bt );
2013-08-03 00:12:24 +04:00
// the singles --- TODO: make it ALL query terms
//nr = dp->m_numRequiredTerms;
//for ( long i = 0 ; i < nr && nr == 1 ; i++ ) {
long lastTermNum = -1;
long numSingles = dp->m_numSingles;
// do not print this if we got pairs
if ( dp->m_numPairs ) numSingles = 0;
for ( long i = 0 ; i < numSingles ; i++ ) {
float totalSingleScore = 0.0;
// print all the top winners for this single
SingleScore *fss = &dp->m_singleScores[i];
// if same combo as last time skip
if ( fss->m_qtermNum == lastTermNum ) continue;
// do not reprint for this query term num
lastTermNum = fss->m_qtermNum;
bool firstTime = true;
// print all singles for this combo
for ( long j = i ; j < dp->m_numSingles ; j++ ) {
// get it
SingleScore *ss = &dp->m_singleScores[j];
// stop if different single now
if ( ss->m_qtermNum != fss->m_qtermNum ) break;
// skip if 0. skip neighborhoods i guess
if ( ss->m_finalScore == 0.0 ) continue;
// first time?
if ( firstTime && si->m_format == FORMAT_HTML ) {
Query *q = &si->m_q;
2013-08-03 00:12:24 +04:00
printSingleTerm ( sb , q , ss );
printScoresHeader ( sb );
firstTime = false;
// print it
printSingleScore ( sb , si , ss , mr , msg40 );
// add up
totalSingleScore += ss->m_finalScore;
if ( ft.length() ) ft.safePrintf(" , ");
// min?
if ( minScore < 0.0 || totalSingleScore < minScore )
minScore = totalSingleScore;
// we need to set "ft" for xml stuff below
if ( si->m_format == FORMAT_XML ) continue;
2014-02-05 05:05:43 +04:00
//sb->safePrintf("<table border=1><tr><td><center><b>");
2013-08-03 00:12:24 +04:00
// print pair text
//long qtn = fss->m_qtermNum;
2014-02-05 05:05:43 +04:00
//sb->safeMemcpy(q->m_qterms[qtn].m_term ,
2013-08-03 00:12:24 +04:00
// q->m_qterms[qtn].m_termLen );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"<td colspan=20>total of above scores</td>"
// close table from printScoresHeader
2014-02-05 05:05:43 +04:00
if ( ! firstTime ) sb->safePrintf("</table><br>");
2013-08-03 00:12:24 +04:00
char *ff = "";
if ( si->m_useMinAlgo ) ff = "MIN ";
char *ff2 = "sum";
if ( si->m_useMinAlgo ) ff2 = "min";
2014-02-05 05:05:43 +04:00
//if ( nr ) sb->safePrintf("</table>");
2013-08-03 00:12:24 +04:00
// final score!!!
if ( si->m_format == FORMAT_XML ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf ("\t\t<siteRank>%li</siteRank>\n",
2013-08-03 00:12:24 +04:00
(long)dp->m_siteRank );
2014-02-05 05:05:43 +04:00
sb->safePrintf ("\t\t<numGoodSiteInlinks>%li"
2013-08-03 00:12:24 +04:00
(long)mr->m_siteNumInlinks );
2014-02-05 05:05:43 +04:00
sb->safePrintf ("\t\t<numTotalSiteInlinks>%li"
2013-08-03 00:12:24 +04:00
(long)mr->m_siteNumInlinksTotal );
2014-02-05 05:05:43 +04:00
sb->safePrintf ("\t\t<numUniqueIpsLinkingToSite>%li"
2013-08-03 00:12:24 +04:00
(long)mr->m_siteNumUniqueIps );
2014-02-05 05:05:43 +04:00
sb->safePrintf ("\t\t<numUniqueCBlocksLinkingToSite>%li"
2013-08-03 00:12:24 +04:00
(long)mr->m_siteNumUniqueCBlocks );
struct tm *timeStruct3 = gmtime(&mr->m_pageInlinksLastUpdated);
char tmp3[64];
strftime ( tmp3 , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct3 );
// -1 means unknown
if ( mr->m_pageNumInlinks >= 0 )
// how many inlinks, external and internal, we have
// to this page not filtered in any way!!!
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// how many inlinking ips we got, including our own if
// we link to ourself
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// how many inlinking cblocks we got, including our own if
// we link to ourself
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// how many "good" inlinks. i.e. inlinks whose linktext we
// count and index.
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
float score = msg40->getScore (ix);
2014-02-05 05:05:43 +04:00
sb->safePrintf("\t\t<finalScore>%f</finalScore>\n", score );
sb->safePrintf ("\t\t<finalScoreEquationCanonical>"
2013-08-03 00:12:24 +04:00
"Final Score = (siteRank/%.01f+1) * "
"(%.01f [if not foreign language]) * "
"(%s of above matrix scores)"
, ff2
2014-02-05 05:05:43 +04:00
sb->safePrintf ("\t\t<finalScoreEquation>"
2013-08-03 00:12:24 +04:00
"<b>%.03f</b> = (%li/%.01f+1) " // * %s("
, dp->m_finalScore
, (long)dp->m_siteRank
//, ff
// then language weight
if ( si->m_queryLangId == 0 ||
mr->m_language == 0 ||
si->m_queryLangId == mr->m_language )
2014-02-05 05:05:43 +04:00
sb->safePrintf(" * %.01f",
2013-08-03 00:12:24 +04:00
// the actual min then
2014-02-05 05:05:43 +04:00
sb->safePrintf(" * %.03f",minScore);
2013-08-03 00:12:24 +04:00
// no longer list all the scores
2014-02-05 05:05:43 +04:00
//sb->safeMemcpy ( &ft );
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf ("\t</result>\n\n");
2013-08-03 00:12:24 +04:00
return true;
char *cc = getCountryCode ( mr->m_country );
if ( mr->m_country == 0 ) cc = "Unknown";
2014-02-05 05:05:43 +04:00
sb->safePrintf("<table border=1>"
2013-08-03 00:12:24 +04:00
"<tr><td colspan=10><b><center>"
"final score</center></b>"
"<td><font color=green><b>%s</b></font></td>"
"<td><font color=blue>%li</font></td>"
"<tr><td colspan=100>"
, dp->m_docId
, mr->ptr_site
, (long)mr->m_hopcount
//, getLanguageString(mr->m_summaryLanguage)
, getLanguageString(mr->m_language) // use page language
, cc
, (long)dp->m_siteRank
// list all final scores starting with pairs
2014-02-05 05:05:43 +04:00
sb->safePrintf("<b>%f</b> = "
2013-08-03 00:12:24 +04:00
"(<font color=blue>%li</font>/%.01f+1)"
, dp->m_finalScore
, (long)dp->m_siteRank
// if lang is different
if ( si->m_queryLangId == 0 ||
mr->m_language == 0 ||
si->m_queryLangId == mr->m_language )
2014-02-05 05:05:43 +04:00
sb->safePrintf(" * <font color=green><b>%.01f</b></font>",
2013-08-03 00:12:24 +04:00
// list all final scores starting with pairs
2014-02-05 05:05:43 +04:00
sb->safePrintf(" * %s("
2013-08-03 00:12:24 +04:00
, ff
2014-02-05 05:05:43 +04:00
sb->safeMemcpy ( &ft );
2013-08-03 00:12:24 +04:00
// put in a hidden div so you can unhide it
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// result is in a table so we can put the result # in its own column
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// UN-indent it if level is 1
if ( si->m_format == FORMAT_HTML && si->m_doIpClustering ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf (" - [ <a href=\"/search?"
2013-08-03 00:12:24 +04:00
"More from this ip</a> ]",
iptoa ( mr->m_ip ) ,
2014-02-04 07:17:58 +04:00
st->m_qe , coll );
2014-02-05 05:05:43 +04:00
if ( indent ) sb->safePrintf ( "</blockquote><br>\n");
else sb->safePrintf ( "<br><br>\n");
2013-08-03 00:12:24 +04:00
else if ( si->m_format == FORMAT_HTML && si->m_doSiteClustering ) {
2013-08-03 00:12:24 +04:00
char hbuf [ MAX_URL_LEN ];
long hlen = uu.getHostLen();
memcpy ( hbuf , uu.getHost() , hlen );
hbuf [ hlen ] = '\0';
2014-02-05 05:05:43 +04:00
sb->safePrintf (" - <nobr><a href=\"/search?"
2013-08-03 00:12:24 +04:00
"More from this site</a></nobr>",
hbuf ,
2014-02-04 07:17:58 +04:00
st->m_qe , coll );
2014-02-05 05:05:43 +04:00
if ( indent ) sb->safePrintf ( "</blockquote><br>\n");
else sb->safePrintf ( "<br><br>\n");
2013-08-03 00:12:24 +04:00
// space out 0000 backlinks
2014-02-05 05:05:43 +04:00
char *p = sb->getBufStart() + placeHolder;
2013-08-03 00:12:24 +04:00
long plen = placeHolderLen;
if ( numInlinks == 0 )
memset ( p , ' ' , plen );
if ( numInlinks > 0 && numInlinks < 99999 ) {
char *ss = strstr ( p, "00000" );
char c = ss[5];
ss[5] = c;
// print "1 backlink" not "1 backlinks"
if ( numInlinks == 1 ) {
char *xx = strstr(p,"backlinks");
xx[8] = ' ';
return true;
2014-02-05 05:05:43 +04:00
bool printPairScore ( SafeBuf *sb , SearchInput *si , PairScore *ps ,
2013-08-03 00:12:24 +04:00
Msg20Reply *mr , Msg40 *msg40 , bool first ) {
// shortcut
Query *q = &si->m_q;
2013-08-03 00:12:24 +04:00
//SafeBuf ft;
// store in final score calc
//if ( ft.length() ) ft.safePrintf(" + ");
long qtn1 = ps->m_qtermNum1;
long qtn2 = ps->m_qtermNum2;
unsigned char drl1 = ps->m_diversityRankLeft1;
unsigned char drl2 = ps->m_diversityRankLeft2;
float dvwl1 = getDiversityWeight(dr1);
float dvwl2 = getDiversityWeight(dr2);
unsigned char drr1 = ps->m_diversityRankRight1;
unsigned char drr2 = ps->m_diversityRankRight2;
float dvwr1 = getDiversityWeight(dr1);
float dvwr2 = getDiversityWeight(dr2);
unsigned char de1 = ps->m_densityRank1;
unsigned char de2 = ps->m_densityRank2;
float dnw1 = getDensityWeight(de1);
float dnw2 = getDensityWeight(de2);
long hg1 = ps->m_hashGroup1;
long hg2 = ps->m_hashGroup2;
float hgw1 = getHashGroupWeight(hg1);
float hgw2 = getHashGroupWeight(hg2);
long wp1 = ps->m_wordPos1;
long wp2 = ps->m_wordPos2;
unsigned char wr1 = ps->m_wordSpamRank1;
float wsw1 = getWordSpamWeight(wr1);
unsigned char wr2 = ps->m_wordSpamRank2;
float wsw2 = getWordSpamWeight(wr2);
// HACK for inlink text!
wsw1 = getLinkerWeight(wr1);
wsw2 = getLinkerWeight(wr2);
char *syn1 = "no";
char *syn2 = "no";
float sw1 = 1.0;
float sw2 = 1.0;
if ( ps->m_isSynonym1 ) {
syn1 = "yes";
if ( ps->m_isSynonym2 ) {
syn2 = "yes";
//char bf1 = ps->m_bflags1;
//char bf2 = ps->m_bflags2;
char *bs1 = "no";
char *bs2 = "no";
//if ( bf1 & BF_HALFSTOPWIKIBIGRAM ) bs1 = "yes";
//if ( bf2 & BF_HALFSTOPWIKIBIGRAM ) bs2 = "yes";
if ( ps->m_isHalfStopWikiBigram1 ) bs1 = "yes";
if ( ps->m_isHalfStopWikiBigram2 ) bs2 = "yes";
float wbw1 = 1.0;
float wbw2 = 1.0;
if ( ps->m_isHalfStopWikiBigram1 ) wbw1 = WIKI_BIGRAM_WEIGHT;
if ( ps->m_isHalfStopWikiBigram2 ) wbw2 = WIKI_BIGRAM_WEIGHT;
//long long sz1 = ps->m_listSize1;
//long long sz2 = ps->m_listSize2;
//long long tf1 = ps->m_termFreq1;//sz1 / 10;
//long long tf2 = ps->m_termFreq2;//sz2 / 10;
long long tf1 = msg40->m_msg3a.m_termFreqs[qtn1];
long long tf2 = msg40->m_msg3a.m_termFreqs[qtn2];
float tfw1 = ps->m_tfWeight1;
float tfw2 = ps->m_tfWeight2;
char *wp = "no";
float wiw = 1.0;
if ( ps->m_inSameWikiPhrase ) {
wp = "yes";
wiw = WIKI_WEIGHT; // 0.50;
long a = ps->m_wordPos2;
long b = ps->m_wordPos1;
char *es = "";
char *bes = "";
if ( a < b ) {
a = ps->m_wordPos1;
b = ps->m_wordPos2;
// out of query order penalty!
es = "+ 1.0";
bes = "+ <b>1.0</b>";
if ( si->m_format == FORMAT_XML ) {
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
sb->safeMemcpy ( q->m_qterms[qtn1].m_term ,
2013-08-03 00:12:24 +04:00
q->m_qterms[qtn1].m_termLen );
2014-02-05 05:05:43 +04:00
sb->safeMemcpy ( q->m_qterms[qtn2].m_term ,
2013-08-03 00:12:24 +04:00
q->m_qterms[qtn2].m_termLen );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
hgw1 );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
hgw2 );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"</wordPos1>\n", wp1 );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"</wordPos2>\n", wp2 );
//long wordDist = wp2 - wp1;
//if ( wordDist < 0 ) wordDist *= -1;
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// "</wordDist>\n",wdist);
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// word spam / link text weight
char *r1 = "wordSpamRank1";
char *r2 = "wordSpamRank2";
char *t1 = "wordSpamWeight1";
char *t2 = "wordSpamWeight2";
r1 = "inlinkSiteRank1";
t1 = "inlinkTextWeight1";
r2 = "inlinkSiteRank2";
t2 = "inlinkTextWeight2";
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// if offsite inlink text show the inlinkid for matching
// to an <inlink>
LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks;
Inlink *k = info->getNextInlink(NULL);
for (;k&&hg1==HASHGROUP_INLINKTEXT ; k=info->getNextInlink(k)){
if ( ! k->ptr_linkText ) continue;
if ( k->m_wordPosStart > wp1 ) continue;
if ( k->m_wordPosStart + 50 < wp1 ) continue;
// got it. we HACKED this to put the id
// in k->m_siteHash
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
k = info->getNextInlink(NULL);
for (;k&&hg2==HASHGROUP_INLINKTEXT ; k=info->getNextInlink(k)){
if ( ! k->ptr_linkText ) continue;
if ( k->m_wordPosStart > wp2 ) continue;
if ( k->m_wordPosStart + 50 < wp2 ) continue;
// got it. we HACKED this to put the id
// in k->m_siteHash
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// term freq
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
ps->m_qdist );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
wiw );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"score = "
" 100 * "
" locationWeight1" // hgw
" * "
" locationWeight2" // hgw
" * "
" synonymWeight1" // synweight
" * "
" synonymWeight2" // synweight
" * "
" wikiBigramWeight1"
" * "
" wikiBigramWeight2"
" * "
//" * "
//" * "
"densityWeight1" //density weight
" * "
"densityWeight2" //density weight
" * "
"%s" // wordspam weight
" * "
"%s" // wordspam weight
" * "
"termFreqWeight1" // tfw
" * "
"termFreqWeight2" // tfw
" / ( ||wordPos1 - wordPos2| "
" - queryDist| + 1.0 ) * "
, t1
, t2
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"<font color=orange>%.1f</font>"//hashgroupweight
"<font color=orange>%.1f</font>"//hashgroupweight
"<font color=blue>%.1f</font>" // syn weight
"<font color=blue>%.1f</font>" // syn weight
"<font color=green>%.1f</font>"//wikibigramweight
"<font color=green>%.1f</font>"//wikibigramweight
"<font color=purple>%.02f</font>"//density weight
"<font color=purple>%.02f</font>"//density weight
"<font color=red>%.02f</font>" // wordspam weight
"<font color=red>%.02f</font>" // wordspam weight
"<font color=magenta>%.02f</font>"//tf weight
"<font color=magenta>%.02f</font>"//tf weight
, ps->m_finalScore
, hgw1
, hgw2
, sw1
, sw2
, wbw1
, wbw2
, dnw1
, dnw2
, wsw1
, wsw2
, tfw1
, tfw2
if ( ps->m_fixedDistance )
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"/<b>%li</b> "
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"(((<font color=darkgreen>%li</font>"
"-<font color=darkgreen>%li</font>"
")-<font color=lime>%li</font>)+1.0%s)"
// wikipedia weight
if ( wiw != 1.0 )
2014-02-05 05:05:43 +04:00
sb->safePrintf("*%.01f", wiw );
2013-08-03 00:12:24 +04:00
"</equation>\n" );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
return true; // continue;
// print out the entire details i guess
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// ,ps->m_finalScore
// );
// . print out the breakout tables then
// . they should pop-up when the user
// mouses over a cell in the distance matrix
2014-02-05 05:05:43 +04:00
//sb->safePrintf("<table border=1>"
2013-08-03 00:12:24 +04:00
// "<tr><td colspan=100>"
// "<center><b>");
//if ( q->m_qterms[qtn1].m_isPhrase )
2014-02-05 05:05:43 +04:00
// sb->pushChar('\"');
//sb->safeMemcpy ( q->m_qterms[qtn1].m_term ,
2013-08-03 00:12:24 +04:00
// q->m_qterms[qtn1].m_termLen );
//if ( q->m_qterms[qtn1].m_isPhrase )
2014-02-05 05:05:43 +04:00
// sb->pushChar('\"');
//sb->safePrintf("</b> vs <b>");
2013-08-03 00:12:24 +04:00
//if ( q->m_qterms[qtn2].m_isPhrase )
2014-02-05 05:05:43 +04:00
// sb->pushChar('\"');
//sb->safeMemcpy ( q->m_qterms[qtn2].m_term ,
2013-08-03 00:12:24 +04:00
// q->m_qterms[qtn2].m_termLen );
//if ( q->m_qterms[qtn2].m_isPhrase )
2014-02-05 05:05:43 +04:00
// sb->pushChar('\"');
2013-08-03 00:12:24 +04:00
// then print the details just like the
// single term table below
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// "<td>term</td>"
// "<td>location</td>"
// "<td>wordPos</td>"
// "<td>synonym</td>"
// "<td>wikibigram</td>"
// //"<td>diversityRank/weight</td>"
// "<td>densityRank</td>"
// "<td>wordSpamRank</td>"
// "<td>inlinkSiteRank</td>"
// "<td>termFreq</td>"
// "<td>inWikiPhrase/qdist</td>"
// "</tr>"
// );
// print first term in first row
2014-02-05 05:05:43 +04:00
sb->safePrintf("<tr><td rowspan=3>");
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf("<a onclick=\""
2013-08-03 00:12:24 +04:00
"var e = document.getElementById('poo');"
"if ( == 'none' ){"
" = '';"
"else {"
" = 'none';"
2014-02-05 05:05:43 +04:00
//sb->safeMemcpy ( q->m_qterms[qtn1].m_term ,
2013-08-03 00:12:24 +04:00
// q->m_qterms[qtn1].m_termLen );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"%s <font color=orange>"
, getHashGroupString(hg1)
, hgw1 );
// the word position
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
//"<a href=\"/print?d="
if ( g_conf.m_isMattWells )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<a href=\"/seo?d=");
2014-02-05 05:05:43 +04:00
sb->safePrintf("<a href=\"");
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// is synonym?
//if ( sw1 != 1.00 )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%s <font color=blue>%.02f"
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
// sb->safePrintf("<td>&nbsp;</td>");
2013-08-03 00:12:24 +04:00
// wikibigram?/weight
//if ( wbw1 != 1.0 )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%s <font color=green>%.02f"
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
// sb->safePrintf("<td>&nbsp;</td>");
2013-08-03 00:12:24 +04:00
// diversity -
// not needed for term pair algo
2014-02-05 05:05:43 +04:00
//sb->safePrintf("<td>%li/<font color=green>"
2013-08-03 00:12:24 +04:00
// "%f</font></td>",
// (long)dr1,dvw1);
// density
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%li <font color=purple>"
2013-08-03 00:12:24 +04:00
// word spam
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%li <font color=red>"
2013-08-03 00:12:24 +04:00
else {
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%li", (long)wr1);
2013-08-03 00:12:24 +04:00
//if ( wsw1 != 1.0 )
2014-02-05 05:05:43 +04:00
sb->safePrintf( " <font color=red>"
2013-08-03 00:12:24 +04:00
"%.02f</font>", wsw1);
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// term freq
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%lli <font color=magenta>"
2013-08-03 00:12:24 +04:00
// insamewikiphrase?
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%s %li/%.01f</td>",
2013-08-03 00:12:24 +04:00
// end the row
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// print 2nd term in 2nd row
2014-02-05 05:05:43 +04:00
//sb->safeMemcpy ( q->m_qterms[qtn2].m_term ,
2013-08-03 00:12:24 +04:00
// q->m_qterms[qtn2].m_termLen );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"%s <font color=orange>"
, getHashGroupString(hg2)
, hgw2 );
// the word position
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
//"<a href=\"/print?d="
if ( g_conf.m_isMattWells )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<a href=\"/seo?d=");
2014-02-05 05:05:43 +04:00
sb->safePrintf("<a href=\"");
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// is synonym?
//if ( sw2 != 1.00 )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%s <font color=blue>%.02f"
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
// sb->safePrintf("<td>&nbsp;</td>");
2013-08-03 00:12:24 +04:00
// wikibigram?/weight
//if ( wbw2 != 1.0 )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%s <font color=green>%.02f"
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
// sb->safePrintf("<td>&nbsp;</td>");
2013-08-03 00:12:24 +04:00
// diversity
2014-02-05 05:05:43 +04:00
//sb->safePrintf("<td>%li/<font color=green>"
2013-08-03 00:12:24 +04:00
// "%f</font></td>",
// (long)dr2,dvw2);
// density
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%li <font color=purple>"
2013-08-03 00:12:24 +04:00
// word spam
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%li <font color=red>"
2013-08-03 00:12:24 +04:00
else {
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%li", (long)wr2);
2013-08-03 00:12:24 +04:00
//if ( wsw2 != 1.0 )
2014-02-05 05:05:43 +04:00
sb->safePrintf( " <font color=red>"
2013-08-03 00:12:24 +04:00
"%.02f</font>", wsw2);
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// term freq
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%lli <font color=magenta>"
2013-08-03 00:12:24 +04:00
// insamewikiphrase?
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%s/%li %.01f</td>",
2013-08-03 00:12:24 +04:00
// end the row
2014-02-05 05:05:43 +04:00
sb->safePrintf("<tr><td ");
2013-08-03 00:12:24 +04:00
// last row is the computation of score
//static bool s_first = true;
if ( first ) {
//static long s_count = 0;
//s_first = false;
2014-02-05 05:05:43 +04:00
//sb->safePrintf("id=poo%li ",s_count);
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf("colspan=50>" // style=\"display:none\">"
2013-08-03 00:12:24 +04:00
"%.03f "
"= "
//" ( "
"<font color=orange>%.1f"
"<font color=orange>%.1f"
//"(%li - "
, ps->m_finalScore
//, idstr
, hgw1
, hgw2
//, (long)MAXWORDPOS+1
2014-02-05 05:05:43 +04:00
sb->safePrintf("<font color=blue>%.1f</font>"
2013-08-03 00:12:24 +04:00
" <font color=blue>%.1f</font>"
// wiki bigram weight
"<font color=green>%.02f</font>"
"<font color=green>%.02f</font>"
"<font color=purple>%.02f</font>"
"<font color=purple>%.02f</font>"
"<font color=red>%.02f</font>"
" <font color=red>%.02f</font>"
"<font color=magenta>%.02f</font>"
"<font color=magenta>%.02f</font>"
, sw1
, sw2
, wbw1
, wbw2
, dnw1
, dnw2
, wsw1
, wsw2
, tfw1
, tfw2
if ( ps->m_fixedDistance )
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"/<b>%li</b> "
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"(((<font color=darkgreen>%li</font>"
"-<font color=darkgreen>%li</font>)-"
"<font color=lime>%li</font>) + 1.0%s)"
// wikipedia weight
if ( wiw != 1.0 )
2014-02-05 05:05:43 +04:00
sb->safePrintf("*%.01f", wiw );
sb->safePrintf( // end formula
2013-08-03 00:12:24 +04:00
return true;
2014-02-05 05:05:43 +04:00
bool printSingleTerm ( SafeBuf *sb , Query *q , SingleScore *ss ) {
2013-08-03 00:12:24 +04:00
long qtn = ss->m_qtermNum;
2014-02-05 05:05:43 +04:00
sb->safePrintf("<table border=1 cellpadding=3>");
sb->safePrintf("<tr><td colspan=50><center><b>");
2013-08-03 00:12:24 +04:00
// link to rainbow page
2014-02-05 05:05:43 +04:00
//sb->safePrintf("<a href=\"/print?u=");
//sb->urlEncode( mr->ptr_ubuf );
2013-08-03 00:12:24 +04:00
if ( q->m_qterms[qtn].m_isPhrase )
2014-02-05 05:05:43 +04:00
sb->safeMemcpy ( q->m_qterms[qtn].m_term ,
2013-08-03 00:12:24 +04:00
q->m_qterms[qtn].m_termLen );
if ( q->m_qterms[qtn].m_isPhrase )
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
return true;
2014-02-05 05:05:43 +04:00
bool printTermPairs ( SafeBuf *sb , Query *q , PairScore *ps ) {
2013-08-03 00:12:24 +04:00
// print pair text
long qtn1 = ps->m_qtermNum1;
long qtn2 = ps->m_qtermNum2;
2014-02-05 05:05:43 +04:00
sb->safePrintf("<table cellpadding=3 border=1>"
2013-08-03 00:12:24 +04:00
"<tr><td colspan=20><center><b>");
if ( q->m_qterms[qtn1].m_isPhrase )
2014-02-05 05:05:43 +04:00
sb->safeMemcpy ( q->m_qterms[qtn1].m_term ,
2013-08-03 00:12:24 +04:00
q->m_qterms[qtn1].m_termLen );
if ( q->m_qterms[qtn1].m_isPhrase )
2014-02-05 05:05:43 +04:00
sb->safePrintf("</b> vs <b>");
2013-08-03 00:12:24 +04:00
if ( q->m_qterms[qtn2].m_isPhrase )
2014-02-05 05:05:43 +04:00
sb->safeMemcpy ( q->m_qterms[qtn2].m_term ,
2013-08-03 00:12:24 +04:00
q->m_qterms[qtn2].m_termLen );
if ( q->m_qterms[qtn2].m_isPhrase )
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
return true;
2014-02-05 05:05:43 +04:00
bool printScoresHeader ( SafeBuf *sb ) {
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"<td>inlnkPR</td>" // nlinkSiteRank</td>"
return true;
2014-02-05 05:05:43 +04:00
bool printSingleScore ( SafeBuf *sb ,
2013-08-03 00:12:24 +04:00
SearchInput *si ,
SingleScore *ss ,
Msg20Reply *mr , Msg40 *msg40 ) {
// shortcut
Query *q = &si->m_q;
2013-08-03 00:12:24 +04:00
//SafeBuf ft;
// store in final score calc
//if ( ft.length() ) ft.safePrintf(" + ");
char *syn = "no";
float sw = 1.0;
if ( ss->m_isSynonym ) {
syn = "yes";
sw = SYNONYM_WEIGHT; // Posdb.h
//char bf = ss->m_bflags;
float wbw = 1.0;
char *bs = "no";
if ( ss->m_isHalfStopWikiBigram ) {
bs = "yes";
float hgw = getHashGroupWeight(ss->m_hashGroup);
//float dvw = getDiversityWeight(ss->m_diversityRank);
float dnw = getDensityWeight(ss->m_densityRank);
float wsw = getWordSpamWeight(ss->m_wordSpamRank);
// HACK for inlink text!
if ( ss->m_hashGroup == HASHGROUP_INLINKTEXT )
wsw = getLinkerWeight(ss->m_wordSpamRank);
//long long tf = ss->m_termFreq;//ss->m_listSize;
long qtn = ss->m_qtermNum;
long long tf = msg40->m_msg3a.m_termFreqs[qtn];
float tfw = ss->m_tfWeight;
if ( si->m_format == FORMAT_XML ) {
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
sb->safeMemcpy ( q->m_qterms[qtn].m_term ,
2013-08-03 00:12:24 +04:00
q->m_qterms[qtn].m_termLen );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
hgw );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"</wordPos>\n", (long)ss->m_wordPos );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
(long)(ss->m_isHalfStopWikiBigram) );
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// word spam
if ( ss->m_hashGroup == HASHGROUP_INLINKTEXT ) {
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
else {
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// if offsite inlink text show the inlinkid for matching
// to an <inlink>
LinkInfo *info = (LinkInfo *)mr->ptr_linkInfo;//inlinks;
Inlink *k = info->getNextInlink(NULL);
for ( ; k && ss->m_hashGroup==HASHGROUP_INLINKTEXT ;
if ( ! k->ptr_linkText ) continue;
if ( k->m_wordPosStart > ss->m_wordPos ) continue;
if ( k->m_wordPosStart + 50 < ss->m_wordPos ) continue;
// got it. we HACKED this to put the id
// in k->m_siteHash
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// term freq
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"score = "
" 100 * "
" locationWeight" // hgw
" * "
" locationWeight" // hgw
" * "
" synonymWeight" // synweight
" * "
" synonymWeight" // synweight
" * "
" wikiBigramWeight"
" * "
" wikiBigramWeight"
" * "
//" diversityWeight" // divweight
//" * "
//" diversityWeight" // divweight
//" * "
"densityWeight" // density weight
" * "
"densityWeight" // density weight
" * "
"wordSpamWeight" // wordspam weight
" * "
"wordSpamWeight" // wordspam weight
" * "
"termFreqWeight" // tfw
" * "
"termFreqWeight" // tfw
//" / ( 3.0 )"
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"%.1f" // hgw
"%.1f" // hgw
"%.1f" // synweight
"%.1f" // synweight
"%.02f" // wikibigram weight
"%.02f" // wikibigram weight
"%.02f" // density weight
"%.02f" // density weight
"%.02f" // wordspam weight
"%.02f" // wordspam weight
"%.02f" // tfw
"%.02f" // tfw
//" / ( 3.0 )"
, ss->m_finalScore
, hgw
, hgw
, sw
, sw
, wbw
, wbw
, dnw
, dnw
, wsw
, wsw
, tfw
, tfw
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
return true;
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"<td rowspan=2>%.03f</td>"
"<td>%s <font color=orange>%.1f"
// wordpos
"<a href=\""
, ss->m_finalScore
, getHashGroupString(ss->m_hashGroup)
, hgw
2014-02-05 05:05:43 +04:00
//sb->urlEncode( mr->ptr_ubuf );
sb->safePrintf("%lli",mr->m_docId );
2013-08-03 00:12:24 +04:00
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"<td>%s <font color=blue>%.1f"
"</font></td>" // syn
// wikibigram?/weight
"<td>%s <font color=green>%.02f</font></td>"
//"<td>%li/<font color=green>%f"
//"</font></td>" // diversity
"<td>%li <font color=purple>"
"%.02f</font></td>" // density
, (long)ss->m_wordPos
, syn
, sw // synonym weight
, bs
, wbw
//, (long)ss->m_diversityRank
//, dvw
, (long)ss->m_densityRank
, dnw
if ( ss->m_hashGroup == HASHGROUP_INLINKTEXT ) {
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
"<td>%li <font color=red>%.02f"
"</font></td>" // wordspam
, (long)ss->m_wordSpamRank
, wsw
else {
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%li <font color=red>%.02f"
2013-08-03 00:12:24 +04:00
"</font></td>" // wordspam
, (long)ss->m_wordSpamRank
, wsw
2014-02-05 05:05:43 +04:00
sb->safePrintf("<td>%lli <font color=magenta>"
2013-08-03 00:12:24 +04:00
"%.02f</font></td>" // termfreq
, tf
, tfw
// last row is the computation of score
2014-02-05 05:05:43 +04:00
sb->safePrintf("<tr><td colspan=50>"
2013-08-03 00:12:24 +04:00
"%.03f "
" = "
//" %li * "
"100 * "
" <font color=orange>%.1f</font>"
" * "
" <font color=orange>%.1f</font>"
" * "
" <font color=blue>%.1f</font>"
" * "
" <font color=blue>%.1f</font>"
" * "
" <font color=green>%.02f</font>"//wikibigramwght
" * "
" <font color=green>%.02f</font>"
" * "
"<font color=purple>%.02f</font>"
" * "
"<font color=purple>%.02f</font>"
" * "
"<font color=red>%.02f</font>"
" * "
"<font color=red>%.02f</font>"
" * "
"<font color=magenta>%.02f</font>"
" * "
"<font color=magenta>%.02f</font>"
//" / ( 3.0 )"
// end formula
, ss->m_finalScore
//, (long)MAXWORDPOS+1
, hgw
, hgw
, sw
, sw
, wbw
, wbw
//, dvw
//, dvw
, dnw
, dnw
, wsw
, wsw
, tfw
, tfw
2014-02-05 05:05:43 +04:00
2013-08-03 00:12:24 +04:00
// "<br>");
return true;
2013-10-03 08:34:21 +04:00
// . print the directory subtopics
// . show these when we are in a directory topic browsing dmoz
// . just a list of all the topics/categories
2014-02-05 05:05:43 +04:00
bool printDMOZSubTopics ( SafeBuf *sb, long catId, bool inXml ) {
2013-10-14 01:24:41 +04:00
if ( catId <= 0 ) return true;
2013-10-03 08:34:21 +04:00
long currType;
bool first;
bool nextColumn;
long maxPerColumn;
long currInColumn;
long currIndex;
char *prefixp;
long prefixLen;
char *catName;
long catNameLen;
char encodedName[2048];
//SearchInput *si = &st->m_si;
bool isRTL = g_categories->isIdRTL ( catId );
2013-10-03 08:34:21 +04:00
SafeBuf subCatBuf;
// stores a list of SubCategories into "subCatBuf"
long numSubCats = g_categories->generateSubCats ( catId , &subCatBuf );
2013-10-03 08:34:21 +04:00
// . get the subcategories for a given category
// . msg2b::generateDirectory() was launched in Msg40.cpp
//long numSubCats = st->m_msg40.m_msg2b.m_numSubCats;
//SubCategory *subCats = st->m_msg40.m_msg2b.m_subCats;
//char *catBuffer = st->m_msg40.m_msg2b.m_catBuffer;
//bool showAdultOnTop = st->m_si.m_cr->m_showAdultCategoryOnTop;
// just print <hr> if no sub categories
if (inXml) {
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\t<directory>\n"
2013-10-03 08:34:21 +04:00
catId);//si.m_cat_dirId );
2014-02-05 05:05:43 +04:00
g_categories->printPathFromId ( sb,
catId, // st->m_si.m_cat_dirId,
2013-10-03 08:34:21 +04:00
true );
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "]]></dirName>\n");
sb->safePrintf ( "\t\t<dirIsRTL>%li</dirIsRTL>\n",
2013-10-03 08:34:21 +04:00
char *p = subCatBuf.getBufStart();
char *pend = subCatBuf.getBuf();
SubCategory *ptrs[MAX_SUB_CATS];
long count = 0;
if (numSubCats <= 0)
goto dirEnd;
// print out the cats
currType = 0;
// first make ptrs to them
for ( ; p < pend ; ) {
SubCategory *cat = (SubCategory *)p;
ptrs[count++] = cat;
p += cat->getRecSize();
2013-10-14 03:48:59 +04:00
// do not breach
if ( count >= MAX_SUB_CATS ) break;
2013-10-03 08:34:21 +04:00
for (long i = 0; i < count ; i++ ) {
SubCategory *cat = ptrs[i];
first = false;
catName = cat->getName();//&catBuffer[subCats[i].m_nameOffset];
catNameLen = cat->m_nameLen;//subCats[i].m_nameLen;
// this is the last topic in the dmoz dir path
// so if the dmoz topic is Top/Arts/Directories then
// the prefixp is "Directories"
2013-10-03 08:34:21 +04:00
prefixp = cat->getPrefix();//&catBuffer[subCats[i].m_prefixOffset];
prefixLen = cat->m_prefixLen;//subCats[i].m_prefixLen;
// skip bad categories
2013-10-03 08:34:21 +04:00
if (currIndex < 0)
// skip top adult category if we're supposed to
2013-10-03 08:34:21 +04:00
if ( !inXml &&
st->m_si.m_catId == 1 &&
si->m_familyFilter &&
g_categories->isIndexAdultStart ( currIndex ) )
2013-10-03 08:34:21 +04:00
// check for room
//if (p + subCats[i].m_prefixLen*2 +
// subCats[i].m_nameLen*2 +
// 512 > pend){
// goto diroverflow;
// print simple xml tag for inXml
if (inXml) {
switch ( cat->m_type ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\t\t<letterbar><![CDATA[" );
sb->safePrintf ( "]]>" );
sb->safePrintf ( "<urlcount>%li</urlcount>",
2013-10-03 08:34:21 +04:00
currIndex) );
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "</letterbar>\n" );
2013-10-03 08:34:21 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\t\t<narrow2><![CDATA[" );
sb->utf8Encode2 ( catName, catNameLen );
sb->safePrintf ( "]]>");
sb->safePrintf ( "<urlcount>%li</urlcount>",
2013-10-03 08:34:21 +04:00
currIndex) );
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "</narrow2>\n" );
2013-10-03 08:34:21 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\t\t<narrow1><![CDATA[" );
sb->utf8Encode2 ( catName, catNameLen );
sb->safePrintf ( "]]>" );
sb->safePrintf ( "<urlcount>%li</urlcount>",
2013-10-03 08:34:21 +04:00
currIndex) );
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "</narrow1>\n" );
2013-10-03 08:34:21 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\t\t<narrow><![CDATA[" );
sb->utf8Encode2 ( catName, catNameLen );
sb->safePrintf ( "]]>" );
sb->safePrintf ( "<urlcount>%li</urlcount>",
2013-10-03 08:34:21 +04:00
currIndex) );
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "</narrow>\n" );
2013-10-03 08:34:21 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\t\t<symbolic2><![CDATA[" );
sb->utf8Encode2 ( prefixp, prefixLen );
sb->safePrintf ( ":" );
sb->utf8Encode2 ( catName, catNameLen );
sb->safePrintf ( "]]>" );
sb->safePrintf ( "<urlcount>%li</urlcount>",
2013-10-03 08:34:21 +04:00
currIndex) );
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "</symbolic2>\n" );
2013-10-03 08:34:21 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\t\t<symbolic1><![CDATA[" );
sb->utf8Encode2 ( prefixp, prefixLen );
sb->safePrintf ( ":" );
sb->utf8Encode2 ( catName, catNameLen );
sb->safePrintf ( "]]>" );
sb->safePrintf ( "<urlcount>%li</urlcount>",
2013-10-03 08:34:21 +04:00
currIndex) );
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "</symbolic1>\n" );
2013-10-03 08:34:21 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\t\t<symbolic><![CDATA[" );
sb->utf8Encode2 ( prefixp, prefixLen );
sb->safePrintf ( ":" );
sb->utf8Encode2 ( catName, catNameLen );
sb->safePrintf ( "]]>" );
sb->safePrintf ( "<urlcount>%li</urlcount>",
2013-10-03 08:34:21 +04:00
currIndex) );
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "</symbolic>\n" );
2013-10-03 08:34:21 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\t\t<related><![CDATA[" );
sb->utf8Encode2 ( catName, catNameLen );
sb->safePrintf ( "]]>" );
sb->safePrintf ( "<urlcount>%li</urlcount>",
2013-10-03 08:34:21 +04:00
currIndex) );
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "</related>\n" );
2013-10-03 08:34:21 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\t\t<altlang><![CDATA[" );
sb->utf8Encode2 ( prefixp, prefixLen );
sb->safePrintf ( ":" );
sb->utf8Encode2 ( catName, catNameLen );
sb->safePrintf ( "]]>" );
sb->safePrintf ( "<urlcount>%li</urlcount>",
2013-10-03 08:34:21 +04:00
currIndex) );
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "</altlang>\n");
2013-10-03 08:34:21 +04:00
// print type header
if ( cat->m_type - currType >= 10) {
// end the last type
if (currType == SUBCAT_LETTERBAR)
2014-02-05 05:05:43 +04:00
sb->safePrintf(" ]</center>\n");
2013-10-03 08:34:21 +04:00
else if (currType != 0)
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\n</span></ul></td></tr>"
2013-10-03 08:34:21 +04:00
"</table>\n" );
// start the new type
switch (cat->m_type) {
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "<span class=\"directory\">"
2013-10-03 08:34:21 +04:00
"<center>[ " );
2014-02-05 05:05:43 +04:00
2013-10-03 08:34:21 +04:00
if (currType == 0 ||
2014-02-05 05:05:43 +04:00
2013-10-03 08:34:21 +04:00
2014-02-05 05:05:43 +04:00
if (isRTL)
2014-02-05 05:05:43 +04:00
sb->safePrintf("<span dir=ltr>");
sb->safePrintf ( "<b>Related Categories:"
2013-10-03 08:34:21 +04:00
"</b>" );
if (isRTL)
2014-02-05 05:05:43 +04:00
2013-10-03 08:34:21 +04:00
if (currType == 0 ||
2014-02-05 05:05:43 +04:00
2013-10-03 08:34:21 +04:00
2014-02-05 05:05:43 +04:00
if (isRTL)
2014-02-05 05:05:43 +04:00
sb->safePrintf("<span dir=ltr>");
sb->safePrintf ( "<b>This category in other"
2013-10-03 08:34:21 +04:00
" languages:</b>");
if (isRTL)
2014-02-05 05:05:43 +04:00
2013-10-03 08:34:21 +04:00
currType = ( cat->m_type/10)*10;
first = true;
nextColumn = false;
currInColumn = 0;
if (currType == SUBCAT_LETTERBAR ||
maxPerColumn = 999;
else {
// . check how many columns we'll use for this
// type
long numInType = 1;
for (long j = i+1; j < numSubCats; j++) {
if ( ptrs[j]->m_type - currType >= 10)
// column for every 5, up to 3 columns
long numColumns = numInType/5;
if ( numInType%5 > 0 ) numColumns++;
if ( currType == SUBCAT_ALTLANG &&
numColumns > 4)
numColumns = 4;
else if (numColumns > 3)
numColumns = 3;
// max number of links per column
maxPerColumn = numInType/numColumns;
if (numInType%numColumns > 0)
// start the sub cat
if (first) {
if (currType != SUBCAT_LETTERBAR)
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "<table border=0>"
2013-10-03 08:34:21 +04:00
"<tr><td valign=top>"
"<ul><span class=\"directory\">"
// check for the next column
else if (nextColumn) {
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "\n</span></ul></td><td valign=top>"
2013-10-03 08:34:21 +04:00
"<ul><span class=\"directory\">"
nextColumn = false;
// or just next link
else {
if (currType == SUBCAT_LETTERBAR)
2014-02-05 05:05:43 +04:00
sb->safePrintf("| ");
2013-10-03 08:34:21 +04:00
2014-02-05 05:05:43 +04:00
2013-10-03 08:34:21 +04:00
// print out the prefix as a link
//if ( p + catNameLen + 16 > pend ) {
// goto diroverflow;
2014-02-05 05:05:43 +04:00
sb->safePrintf("<a href=\"/");
sb->utf8Encode2(catName, catNameLen);
2013-10-03 08:34:21 +04:00
// prefix...
//if ( p + prefixLen + 512 > pend ) {
// goto diroverflow;
if (currType != SUBCAT_ALTLANG)
2014-02-05 05:05:43 +04:00
2013-10-03 08:34:21 +04:00
else {
// check for coded <b> or <strong> tags, remove
if (prefixLen >= 19 &&
strncasecmp(prefixp, "&lt;b&gt;", 9) == 0 &&
strncasecmp(prefixp + (prefixLen-10),
"&lt;/b&gt;", 10) == 0) {
prefixp += 9;
prefixLen -= 19;
else if (prefixLen >= 29 &&
strncasecmp(prefixp, "&lt;strong&gt;", 14) == 0 &&
strncasecmp(prefixp + (prefixLen-15),
"&lt;/strong&gt;", 15) == 0) {
prefixp += 14;
prefixLen -= 29;
if (currType == SUBCAT_RELATED) {
// print the full path
if (g_categories->isIndexRTL(currIndex))
2014-02-05 05:05:43 +04:00
sb->safePrintf("<span dir=ltr>");
2013-10-03 08:34:21 +04:00
g_categories->printPathFromIndex (
2014-02-05 05:05:43 +04:00
2013-10-03 08:34:21 +04:00
2013-10-03 08:34:21 +04:00
else {
char *encodeEnd = htmlEncode ( encodedName,
encodedName + 2047,
prefixp + prefixLen );
prefixp = encodedName;
prefixLen = encodeEnd - encodedName;
//if ( p + prefixLen + 512 > pend ) {
// goto diroverflow;
for (long c = 0; c < prefixLen; c++) {
if (*prefixp == '_')
//*p = ' ';
2014-02-05 05:05:43 +04:00
sb->safePrintf(" ");
2013-10-03 08:34:21 +04:00
//*p = *prefixp;
2014-02-05 05:05:43 +04:00
sb->utf8Encode2(prefixp, 1);
2013-10-03 08:34:21 +04:00
//if ( p + 512 > pend ) {
// goto diroverflow;
// end the link
if (currType != SUBCAT_ALTLANG)
2014-02-05 05:05:43 +04:00
2013-10-03 08:34:21 +04:00
// print an @ for symbolic links
if ( (cat->m_type % 10) == 1)
2014-02-05 05:05:43 +04:00
2013-10-03 08:34:21 +04:00
// print number of urls under here
if ( cat->m_type != SUBCAT_LETTERBAR) {
2014-02-05 05:05:43 +04:00
if (isRTL)
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "<span dir=ltr>(%li)"
2013-10-03 08:34:21 +04:00
currIndex) );
2014-02-05 05:05:43 +04:00
sb->safePrintf ( "(%li)</i>",
2013-10-03 08:34:21 +04:00
currIndex) );
// next line/letter
if ( cat->m_type == SUBCAT_LETTERBAR) {
2014-02-05 05:05:43 +04:00
sb->safePrintf(" ");
2013-10-03 08:34:21 +04:00
// check for next column
if (currInColumn >= maxPerColumn) {
currInColumn = 0;
nextColumn = true;
//if ( p + 512 > pend ) {
// goto diroverflow;
// end the last type
if (!inXml) {
if (currType == SUBCAT_LETTERBAR)
2014-02-05 05:05:43 +04:00
sb->safePrintf(" ]</center>\n");
2013-10-03 08:34:21 +04:00
2014-02-05 05:05:43 +04:00
2013-10-03 08:34:21 +04:00
if (inXml)
2014-02-05 05:05:43 +04:00
2013-10-03 08:34:21 +04:00
else {
2014-02-05 05:05:43 +04:00
2013-10-03 08:34:21 +04:00
return true;
2014-02-05 05:05:43 +04:00
bool printDMOZCrumb ( SafeBuf *sb , long catId , bool xml ) {
2013-10-14 01:24:41 +04:00
// catid -1 means error
if ( catId <= 0 ) return true;
long dirIndex = g_categories->getIndexFromId(catId);
// dirIndex = g_categories->getIndexFromId(si->m_cat_sdir);
if (dirIndex < 0) dirIndex = 0;
// display the directory bread crumb
//if( (si->m_cat_dirId > 0 && si->m_isAdmin && !si->m_isFriend)
// || (si->m_cat_sdir > 0 && si->m_cat_sdirt != 0) )
2014-02-05 05:05:43 +04:00
// sb->safePrintf("<br><br>");
// shortcut. rtl=Right To Left language format.
bool rtl = g_categories->isIdRTL ( catId ) ;
//st->m_isRTL = rtl;
if ( ! xml ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf("\n<font size=4><b>");
if ( rtl ) sb->safePrintf("<span dir=ltr>");
//sb->safePrintf("<a href=\"/Top\">Top</a>: ");
// put crumbin xml?
if ( xml )
2014-02-05 05:05:43 +04:00
// display the breadcrumb in xml or html?
2014-02-05 05:05:43 +04:00
if ( xml )
2014-02-05 05:05:43 +04:00
sb->safePrintf("]]></breadcrumb>\n" );
2013-10-14 01:24:41 +04:00
// how many urls/entries in this topic?
long nu =g_categories->getNumUrlsFromIndex(dirIndex);
// print the num
if ( ! xml ) {
2014-02-05 05:05:43 +04:00
if ( rtl )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<span dir=ltr>(%li)</span>",nu);
2014-02-05 05:05:43 +04:00
sb->safePrintf("(%li)", nu);
return true;
2013-10-14 02:45:12 +04:00
2014-02-05 05:05:43 +04:00
bool printDmozRadioButtons ( SafeBuf *sb , long catId ) ;
2013-10-14 02:45:12 +04:00
// if catId >= 1 then print the dmoz radio button
2014-02-05 05:05:43 +04:00
bool printLogoAndSearchBox ( SafeBuf *sb , HttpRequest *hr , long catId ) {
2013-10-14 02:45:12 +04:00
char *root = "";
if ( g_conf.m_isMattWells )
root = "";
2014-02-05 05:05:43 +04:00
2013-10-14 02:45:12 +04:00
// logo and menu table
"<table border=0 cellspacing=5>"
"<td rowspan=2 valign=top>"
"<a href=/>"
"<img "
"border=0 "
"src=%s/logo-small.png "
2013-10-14 02:45:12 +04:00
"height=64 width=295>"
, root
2013-10-14 02:45:12 +04:00
// menu above search box
2014-02-05 05:05:43 +04:00
2013-10-14 02:45:12 +04:00
" &nbsp; "
if ( catId <= 0 )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<b title=\"Search the web\">web</b>");
2014-02-05 05:05:43 +04:00
sb->safePrintf("<a title=\"Search the web\" href=/>web</a>");
2014-01-19 00:09:33 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf(" &nbsp;&nbsp;&nbsp;&nbsp; " );
2014-01-19 00:09:33 +04:00
if ( g_conf.m_isMattWells ) {
// SEO functionality not included yet - so redir to gigablast.
if ( g_conf.m_isMattWells )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<a title=\"Rank higher in "
2014-01-19 00:09:33 +04:00
"Google\" href='/seo'>");
2014-02-05 05:05:43 +04:00
sb->safePrintf("<a title=\"Rank higher in "
2014-01-19 00:09:33 +04:00
"Google\" href='https://www.gigablast."
2013-10-14 02:45:12 +04:00
2014-02-05 05:05:43 +04:00
2014-01-19 00:09:33 +04:00
" &nbsp;&nbsp;&nbsp;&nbsp; "
if (catId <= 0 )
2014-02-05 05:05:43 +04:00
sb->safePrintf("<a title=\"Browse the DMOZ directory\" "
"</a>" );
2014-02-05 05:05:43 +04:00
sb->safePrintf("<b title=\"Browse the DMOZ directory\">"
2013-10-17 04:43:46 +04:00
char *coll = hr->getString("c");
if ( ! coll ) coll = "";
2014-02-05 05:05:43 +04:00
sb->safePrintf(" &nbsp;&nbsp;&nbsp;&nbsp; "
2013-10-14 02:45:12 +04:00
// i'm not sure why this was removed. perhaps
// because it is not working yet because of
// some bugs...
2014-01-19 00:09:33 +04:00
"<a title=\"Advanced web search\" "
2013-10-14 02:45:12 +04:00
2014-01-19 00:09:33 +04:00
" &nbsp;&nbsp;&nbsp;&nbsp;"
2013-10-14 02:45:12 +04:00
"<a title=\"Add your url to the index\" "
"add url"
" &nbsp;&nbsp;|&nbsp;&nbsp; "
"<a title=\"Words from Gigablast\" "
" &nbsp;&nbsp;|&nbsp;&nbsp; "
"<a title=\"About Gigablast\" href=/about.html>"
// search box
"<form name=f method=GET action=/search>\n\n"
2013-10-17 04:43:46 +04:00
// propagate the collection if they re-search
"<input name=c type=hidden value=\"%s\">"
, coll
// propagate prepend
char *prepend = hr->getString("prepend");
if ( prepend ) {
2014-02-05 05:05:43 +04:00
sb->safePrintf("<input name=prepend type=hidden value=\"");
sb->htmlEncode ( prepend, gbstrlen(prepend), false);
2013-10-17 04:43:46 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf (
2013-10-14 02:45:12 +04:00
// input table
//"<div style=margin-left:5px;margin-right:5px;>
"<input size=40 type=text name=q value=\""
// contents of search box
long qlen;
char *qstr = hr->getString("q",&qlen,"",NULL);
2014-02-05 05:05:43 +04:00
sb->htmlEncode ( qstr , qlen , false );
sb->safePrintf ("\">"
2013-10-14 02:45:12 +04:00
"<input type=submit value=\"Search\" border=0>"
if ( catId >= 0 ) {
else {
2014-02-05 05:05:43 +04:00
sb->safePrintf("Try your search (not secure) on: "
2013-10-14 02:45:12 +04:00
"&nbsp;&nbsp; "
"<a href="
2014-02-05 05:05:43 +04:00
sb->urlEncode ( qstr );
sb->safePrintf (">google</a> &nbsp;&nbsp;&nbsp;&nbsp; "
2013-10-14 02:45:12 +04:00
"<a href="
2014-02-05 05:05:43 +04:00
sb->urlEncode ( qstr );
sb->safePrintf (">bing</a>");
2013-10-14 02:45:12 +04:00
2014-02-05 05:05:43 +04:00
sb->safePrintf( "</form>\n"
2013-10-14 02:45:12 +04:00
return true;
2013-10-14 03:48:59 +04:00
2014-02-05 05:05:43 +04:00
bool printDmozRadioButtons ( SafeBuf *sb , long catId ) {
sb->safePrintf("Search "
2013-10-14 03:48:59 +04:00
"<input type=radio name=prepend "
"value=gbipcatid:%li checked> sites "
"<input type=radio name=prepend "
"value=gbpcatid:%li> pages "
"in this topic or below"
, catId
, catId
return true;
// . print the search options (the "sdirt" radio group) under a dmoz
//   search box
// . "sb" receives the html, "sdirt" selects which button is pre-checked
// . an sdirt outside of 1..4 falls back to 3, "Entire Directory"
// . always returns true; the results of the safePrintf() calls are not
//   checked
bool printDirectorySearchType ( SafeBuf& sb, long sdirt ) {
	// default to entire directory
	if (sdirt < 1 || sdirt > 4)
		sdirt = 3;

	// "sb" is a reference, so its members are reached with '.'

	// by default search the whole thing
	sb.safePrintf("<input type=\"radio\" name=\"sdirt\" value=\"3\"");
	if (sdirt == 3) sb.safePrintf(" checked>");
	else            sb.safePrintf(">");
	sb.safePrintf("Entire Directory<br>\n");

	// entire category
	sb.safePrintf("<input type=\"radio\" name=\"sdirt\" value=\"1\"");
	if (sdirt == 1) sb.safePrintf(" checked>");
	else            sb.safePrintf(">");
	sb.safePrintf("Entire Category<br>\n");

	// base category only
	sb.safePrintf("<nobr><input type=\"radio\" name=\"sdirt\" value=\"2\"");
	if (sdirt == 2) sb.safePrintf(" checked>");
	else            sb.safePrintf(">");
	sb.safePrintf("Pages in Base Category</nobr><br>\n");

	// NOTE(review): the range check at the top resets any sdirt above 4
	// to 3, so the two buttons below (values 7 and 6) can never be
	// printed as checked. confirm whether the upper bound should be 7.

	// sites in base category
	sb.safePrintf("<input type=\"radio\" name=\"sdirt\" value=\"7\"");
	if (sdirt == 7) sb.safePrintf(" checked>");
	else            sb.safePrintf(">");
	sb.safePrintf("Sites in Base Category<br>\n");

	// sites in entire category
	sb.safePrintf("<input type=\"radio\" name=\"sdirt\" value=\"6\"");
	if (sdirt == 6) sb.safePrintf(" checked>");
	else            sb.safePrintf(">");
	sb.safePrintf("Sites in Entire Category<br>\n");

	// end it
	return true;
}
// . rank of a csv column name for header ordering
// . "type" sorts first, then "product.title", then "title"
// . every other name gets the same (last) rank and is ordered by strcmp()
static int csvPinnedRank ( const char *name ) {
	if ( strcmp(name,"type") == 0 ) return 0;
	// force title on top
	if ( strcmp(name,"product.title") == 0 ) return 1;
	if ( strcmp(name,"title") == 0 ) return 2;
	return 3;
}

// . qsort() comparator for an array of "char *" csv column names
// . "a" and "b" point to the array elements, i.e. they are "char **"
// . returns < 0 if *a should be before *b, > 0 if after, 0 if the two
//   names are identical
// . two identical pinned names compare equal (0) so the ordering stays
//   consistent even if qsort() compares an element against itself
int csvPtrCmp ( const void *a, const void *b ) {
	const char *pa = *(const char * const *)a;
	const char *pb = *(const char * const *)b;
	int ra = csvPinnedRank ( pa );
	int rb = csvPinnedRank ( pb );
	// a pinned name always beats a lower-priority or unpinned name
	if ( ra != rb ) return ( ra < rb ) ? -1 : 1;
	// same rank: either the same pinned name (strcmp gives 0) or two
	// unpinned names, so plain string compare
	return strcmp(pa,pb);
}
#include "Json.h"
// . print the header row of the csv output into "sb"
// . scans the json content of every search result in st->m_msg40,
//   collects the distinct compound field names of all number/string
//   items that are not inside an array, sorts them with csvPtrCmp() and
//   prints them comma-separated on one line
// . also fills st->m_columnTable (hash of field name -> column number)
//   and st->m_numCSVColumns so later rows can be printed with their
//   columns aligned to this header
// . returns false on error (a buffer or hash table call failed),
//   true otherwise
bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) {

	Msg40 *msg40 = &st->m_msg40;
	long numResults = msg40->getNumResults();

	// scratch buffer for the compound name of the current json item
	char tmp1[1024];
	SafeBuf tmpBuf (tmp1 , 1024);

	// holds every distinct field name, each one \0 terminated
	char tmp2[1024];
	SafeBuf nameBuf (tmp2, 1024);

	// maps 64-bit hash of a field name to its offset in "nameBuf"
	// NOTE(review): the table is set up with 4-byte values while the
	// values stored and read back below are "long"; confirm that long
	// is 4 bytes on every build target.
	char nbuf[27000];
	HashTableX nameTable;
	if ( ! nameTable.set ( 8,4,2048,nbuf,27000,false,0,"ntbuf") )
		return false;

	long niceness = 0;

	// . scan every json item in the search results.
	// . we still need to deal with the case when there are so many
	//   search results we have to dump each msg20 reply to disk in
	//   order. then we'll have to update this code to scan that file.
	for ( long i = 0 ; i < numResults ; i++ ) {

		// get the msg20 reply for search result #i
		Msg20      *m20 = msg40->m_msg20[i];
		Msg20Reply *mr  = m20->m_r;

		if ( ! mr ) {
			log("results: missing msg20 reply for result #%li",i);
			continue;
		}

		// get content
		char *json = mr->ptr_content;
		// how can it be empty?
		if ( ! json ) continue;

		// parse it up
		Json jp;
		jp.parseJsonStringIntoJsonItems ( json , niceness );

		// scan each json item
		for ( JsonItem *ji = jp.getFirstItem(); ji ; ji = ji->m_next ){

			// skip if not number or string
			if ( ji->m_type != JT_NUMBER &&
			     ji->m_type != JT_STRING )
				continue;

			// if in an array, do not print! csv is not
			// good for arrays... like "media":[....] . that
			// one might be ok, but if the elements in the
			// array are not simple types, like, if they are
			// unflat json objects then it is not well suited
			// for csv.
			if ( ji->isInArray() ) continue;

			// reset length of buf to 0
			tmpBuf.reset();

			// . get the name of the item into "tmpBuf"
			// . returns false with g_errno set on error
			if ( ! ji->getCompoundName ( tmpBuf ) )
				return false;

			// is it new?
			long long h64 = hash64n ( tmpBuf.getBufStart() );
			if ( nameTable.isInTable ( &h64 ) ) continue;

			// record offset of the name for our hash table
			long nameBufOffset = nameBuf.length();

			// store the name in our name buffer
			if ( ! nameBuf.safeStrcpy ( tmpBuf.getBufStart() ) )
				return false;
			if ( ! nameBuf.pushChar ( '\0' ) )
				return false;

			// it's new. add it
			if ( ! nameTable.addKey ( &h64 , &nameBufOffset ) )
				return false;
		}
	}

	// . make array of ptrs to the names so we can sort them
	// . try to always put title first regardless
	char *ptrs [ 1024 ];
	long numPtrs = 0;
	for ( long i = 0 ; i < nameTable.m_numSlots ; i++ ) {
		if ( ! nameTable.m_flags[i] ) continue;
		long off = *(long *)nameTable.getValueFromSlot(i);
		char *p = nameBuf.getBufStart() + off;
		ptrs[numPtrs++] = p;
		// do not breach
		if ( numPtrs >= 1024 ) break;
	}

	// . sort them
	// . the element size must be the size of one array entry, a
	//   "char *", not a hard-coded 4, or the sort scrambles the
	//   pointers wherever pointers are wider than 4 bytes
	qsort ( ptrs , numPtrs , sizeof(ptrs[0]) , csvPtrCmp );

	// set up table to map field name to column for printing the json items
	HashTableX *columnTable = &st->m_columnTable;
	if ( ! columnTable->set ( 8,4, numPtrs * 4,NULL,0,false,0,"coltbl" ) )
		return false;

	// now print them out as the header row
	for ( long i = 0 ; i < numPtrs ; i++ ) {
		if ( i > 0 && ! sb->pushChar(',') ) return false;
		if ( ! sb->safeStrcpy ( ptrs[i] ) ) return false;
		// record the hash of each one for printing out further json
		// objects in the same order so columns are aligned!
		long long h64 = hash64n ( ptrs[i] );
		if ( ! columnTable->addKey ( &h64 , &i ) )
			return false;
	}

	st->m_numCSVColumns = numPtrs;

	if ( ! sb->pushChar('\n') )
		return false;
	if ( ! sb->nullTerm() )
		return false;

	return true;
}
// . print the items of one json object as comma-separated csv values
//   into "sb"
// . parses "json", then places each item at the column that
//   st->m_columnTable records for the hash of the item's compound name;
//   st->m_numCSVColumns is the number of columns
// . returns false and sets g_errno on error
// . NOTE(review): this copy of the function looks damaged: a bare timestamp
//   line is interleaved below and several statements and closing braces are
//   not visible. verify against version control before changing any code.
bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
long niceness = 0;
// parse the json
Json jp;
// NOTE(review): the result of this parse call is not checked here
jp.parseJsonStringIntoJsonItems ( json , niceness );
HashTableX *columnTable = &st->m_columnTable;
long numCSVColumns = st->m_numCSVColumns;
// make buffer space that we need
// ptrBuf starts out on the 1024-byte stack array "ttt"
char ttt[1024];
SafeBuf ptrBuf(ttt,1024);
// one JsonItem pointer per csv column
long need = numCSVColumns * sizeof(JsonItem *);
if ( ! ptrBuf.reserve ( need ) ) return false;
JsonItem **ptrs = (JsonItem **)ptrBuf.getBufStart();
// reset json item ptrs for csv columns. all to NULL
memset ( ptrs , 0 , need );
// scratch buffer that receives each item's compound name
char tmp1[1024];
SafeBuf tmpBuf (tmp1 , 1024);
JsonItem *ji;
// print json item in csv
for ( ji = jp.getFirstItem(); ji ; ji = ji->m_next ) {
// skip if not number or string
// NOTE(review): the statement controlled by this "if" is not visible in
// this view; as shown it would bind to the isInArray() check below.
// confirm against the full file.
if ( ji->m_type != JT_NUMBER &&
ji->m_type != JT_STRING )
// skip if not well suited for csv (see above comment)
if ( ji->isInArray() ) continue;
// . get the name of the item into "tmpBuf"
// . returns false with g_errno set on error
if ( ! ji->getCompoundName ( tmpBuf ) )
return false;
// look up the column recorded for this name's hash
long long h64 = hash64n ( tmpBuf.getBufStart() );
long slot = columnTable->getSlot ( &h64 ) ;
// MUST be in there
// sanity check: forces a crash via a NULL write if the name has no slot
if ( slot < 0 ) { char *xx=NULL;*xx=0;}
// get col #
long column = *(long *)columnTable->getValueFromSlot ( slot );
// sanity
// same forced crash if the stored column is past the end of "ptrs"
if ( column >= numCSVColumns ) { char *xx=NULL;*xx=0; }
// set ptr to it for printing when done parsing every field
// for this json item
ptrs[column] = ji;
// now print out what we got
for ( long i = 0 ; i < numCSVColumns ; i++ ) {
// comma delimited
// NOTE(review): the pushChar() result is not checked here
if ( i > 0 ) sb->pushChar(',');
// get it
ji = ptrs[i];
// skip if none
if ( ! ji ) continue;
// skip "html" field... too spammy for csv and > 32k causes
// libreoffice calc to truncate it and break its parsing
// NOTE(review): the rest of this condition is not visible in this view
if ( ji->m_name &&
//! ji->m_parent &&
// get value and print otherwise
if ( ji->m_type == JT_NUMBER ) {
// print numbers without double quotes
// compares the double value against the long value, both scaled by 1e7;
// presumably this selects integer vs. floating point output, but the
// print statements are not visible here -- TODO confirm
if ( ji->m_valueDouble *10000000.0 ==
(double)ji->m_valueLong * 10000000.0 )
// print the value
2014-04-11 22:51:12 +04:00
// get the json item to print out
long vlen = ji->getValueLen();
// truncate
// values longer than 32000 bytes are cut to 32000 and followed by a notice
char *truncStr = NULL;
if ( vlen > 32000 ) {
vlen = 32000;
truncStr = " ... value truncated because "
"Excel can not handle it. Download the "
"JSON to get untruncated data.";
// print it out
// NOTE(review): the results of csvEncode() and safeStrcpy() below are
// not checked
sb->csvEncode ( ji->getValue() , vlen );
// print truncate msg?
if ( truncStr ) sb->safeStrcpy ( truncStr );
// end the CSV
return true;
2014-05-08 00:28:20 +04:00
2014-04-09 06:34:43 +04:00
2014-04-09 22:03:31 +04:00
// . print the "Widget Creator" page into "sb": the form controls, the
//   widget itself inside an iframe, and an html-encoded copy of that iframe
//   code inside a textarea
// . "hr" supplies the parms read below: inlineedit, dates, summaries,
//   border, width, height, refresh, header, sites, token and query
// . "coll" is the collection name; it is printed as the hidden "c" form
//   field and passed to the iframe printf
// . returns true on the paths visible here
// . NOTE(review): this copy of the function looks damaged: bare timestamp
//   lines are interleaved and many string literals, safePrintf() call heads
//   and closing braces are not visible. verify against version control
//   before changing any code.
bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
2014-04-09 06:34:43 +04:00
// begin print controls
"<body bgcolor=#e8e8e8>"
2014-04-09 22:03:31 +04:00
"<title>Widget Creator</title>"
2014-04-09 06:34:43 +04:00
2014-04-09 22:03:31 +04:00
//char *coll = "GLOBAL-INDEX";
// cr stays NULL when no collection name was given
CollectionRec *cr = NULL;
if ( coll ) cr = g_collectiondb.getRec(coll);
2014-04-09 06:34:43 +04:00
// if admin clicks "edit" in the live widget itself put up
// some simpler content editing boxes. token required!
long edit = hr->getLong("inlineedit",0);
if ( edit ) {
// get widget sites
// NOTE(review): "cr" is dereferenced here without a NULL check, but it
// is still NULL at this point when "coll" is NULL
char *sites = cr->m_siteListBuf.getBufStart();
, sites);
"<input type=text name=token>"
"<input type=submit name=submit value=ok>"
return true;
// onclick of a checkbox toggle it here since we reload after
sb->safePrintf("function toggleBool ( control , id ) {\n"
"if(document.forms[0].elements[id].value == 1 ) {\n"
"document.forms[0].elements[id].value = 0;\n"
"} else {\n"
"document.forms[0].elements[id].value = 1;\n"
// construct url based on input parms
sb->safePrintf("function getFormParms ( ) {\n"
"var i;\n"
"var url = '';\n"
"for(i=0; i<document.myform.elements.length; i++){\n"
"var elm = document.myform.elements[i];\n"
// skip submit button and nameless checkboxes
"if ( == '' ) {\n"
// until we had def=%li to each input parm assume
// default is 0. i guess if it has no def= attribute
// assume default is 0
//"if ( elm.value == '0' ) {\n"
"if ( elm.value == '' ) {\n"
"url = "
"url + "
" + \"=\" + "
"elm.value + \"&\" ;\n"
"return url;\n"
sb->safePrintf("function reload() {\n"
"var url='/widget?' + getFormParms();\n"
// " checked" attribute strings for the three checkboxes; left empty when
// the matching parm is 0
char *c1 = "";
char *c2 = "";
char *c3 = "";
2014-04-09 22:03:31 +04:00
long x1 = hr->getLong("dates" ,0);
long x2 = hr->getLong("summaries",0);
2014-04-10 06:51:36 +04:00
long x3 = hr->getLong("border" ,0);
2014-04-09 06:34:43 +04:00
if ( x1 ) c1 = " checked";
if ( x2 ) c2 = " checked";
if ( x3 ) c3 = " checked";
2014-04-10 06:51:36 +04:00
// widget size and refresh interval, defaulting to 250, 400 and 15
long width = hr->getLong("width",250);
long height = hr->getLong("height",400);
long refresh = hr->getLong("refresh",15);
2014-04-10 00:31:11 +04:00
// default value of the "header" parm: a <style> tag for the widget content
char *def = "<style>html {font-size:12px;font-family:arial;background-color:transparent;color:black;}span.title { font-size:16px;font-weight:bold;}span.summary { font-size:12px;} { font-size:12px;}span.prevnext { font-size:12px;font-weight:bold;}</style>";//<h2>News</h2>";
2014-04-09 06:34:43 +04:00
long len1,len2,len3,len4;
char *header = hr->getString("header",&len1,def);
char *sites = hr->getString("sites",&len2,"");
char *token = hr->getString("token",&len3,"");
2014-04-10 06:51:36 +04:00
//"type:article gbsortbyint:date");
char *query =hr->getString("query",&len4,
"type:article gbsortbyint:gbspiderdate");
2014-04-09 06:34:43 +04:00
sb->safePrintf("<form method=GET action=/widget>"
"<input type=hidden name=c value=\"%s\">"
"<input type=hidden name=format value=\"widget\">"
2014-04-09 22:03:31 +04:00
, coll
2014-04-09 06:34:43 +04:00
"<div style=\""
"<table cellpadding=0>"
"<td "
"bottom-margin:5px; "
2014-04-09 22:03:31 +04:00
"<img align=right height=50 width=52 "
2014-04-10 06:51:36 +04:00
"<b style=font-size:22px;><font style=font-size:27px;>"
"idget <font style=font-size:27px;>C</font>reator</b>"
"<font style=font-size:12px;>"
"Harness the power of Diffbot."
2014-04-09 06:34:43 +04:00
"<td style=text-align:right;line-height:30px;>"
"Websites to crawl:"
"<textarea rows=10 name=sites style=width:100%%;>"
"<textarea name=token style=width:100%%;>"
2014-04-11 11:29:06 +04:00
"<textarea rows=4 name=query style=width:100%%;>"
2014-04-09 06:34:43 +04:00
"Show Dates "
2014-04-09 22:03:31 +04:00
"<input type=checkbox value=1 "
//"onclick=\"toggleBool(this,'dates');reload();\" "
2014-04-09 06:34:43 +04:00
"Show Summaries "
2014-04-09 22:03:31 +04:00
"<input type=checkbox value=1 "
//"onclick=\"toggleBool(this,'summaries');reload();\" "
2014-04-09 06:34:43 +04:00
"Frame border "
2014-04-09 22:03:31 +04:00
"<input type=checkbox value=1 "
//"onclick=\"toggleBool(this,'border');reload();\" "
2014-04-09 06:34:43 +04:00
"Width "
"<input size=4 type=text value=%li "
"Height "
"<input size=4 type=text value=%li "
"<nobr>Refresh in seconds "
"<input size=4 type=text value=%li "
"<nobr>Custom widget header:</nobr>"
2014-04-11 11:29:06 +04:00
"<textarea rows=10 name=header style=width:100%%;>"
2014-04-09 06:34:43 +04:00
"<input type=submit name=submit value=ok>"
, sites
, token
, query
, c1
, c2
, c3
, width
, height
, refresh
, header
// end print controls
// begin print widget
sb->safePrintf ( "<td>"
"<div style=\""
"<div style=line-height:13px;><br></div>"
//printTabs ( sb , st );
//printRedBoxes ( sb , st );
#define SHADOWCOLOR "#000000"
sb->safePrintf (
// end widget div
// end widget column in table
// begin div with source in it
2014-05-08 00:28:20 +04:00
// "<div "
// //"class=grad3 "
// "style=\""
// "border-radius:10px;"
// "box-shadow: 6px 6px 3px %s;"
// "border:2px solid black;"
// "padding:15px;"
// "width:600px;"
// //"background-image:url('/ss.jpg');"
// //"background-repeat:repeat;"
// //"background-attachment:fixed;"
// "background-color:lightgray;"
// "\">"
// //"<br>"
2014-04-09 06:34:43 +04:00
// space widget to the right using this table
//class=grad3 "
//"border:2px solid black;"
"<td valign=top>"
2014-04-09 22:03:31 +04:00
2014-04-09 06:34:43 +04:00
"<img src=/gears32.png width=64 height=64>"
// remember the offset in "sb" where the iframe code begins so that it can
// be printed a second time, html-encoded, further down
long start = sb->length();
2014-04-09 22:03:31 +04:00
// the frameborder=no attribute is dropped when the "border" parm is set
char *border = "frameborder=no ";
if ( x3 ) border = "";
2014-04-09 06:34:43 +04:00
// this iframe contains the WIDGET
sb->safePrintf (
2014-05-08 00:28:20 +04:00
// "<div "
// "id=scrollerxyz "
// "style=\""
2014-04-09 06:34:43 +04:00
//"width:%lipx;" // 200;"
//"height:%lipx;" // 400;"
2014-05-08 00:28:20 +04:00
// "padding:0px;"
// "margin:0px;"
// "background-color:white;"
2014-04-09 06:34:43 +04:00
2014-05-08 00:28:20 +04:00
2014-04-09 06:34:43 +04:00
"<iframe width=\"%lipx\" height=\"%lipx\" "
//"scrolling=yes "
2014-05-08 00:28:20 +04:00
2014-04-09 06:34:43 +04:00
//"%s\" "
2014-05-08 00:28:20 +04:00
//"scrolling=no "
//"frameborder=no "
2014-04-09 06:34:43 +04:00
2014-04-09 22:03:31 +04:00
// frameborder=no
2014-04-09 06:34:43 +04:00
2014-04-10 23:59:15 +04:00
2014-04-09 06:34:43 +04:00
2014-04-09 22:03:31 +04:00
2014-04-09 06:34:43 +04:00
// show articles sorted by newest pubdate first
, width
, height
2014-04-09 22:03:31 +04:00
, border
2014-04-10 23:59:15 +04:00
, iptoa(g_hostdb.m_myHost->m_ip)
, (long)g_hostdb.m_myHost->m_httpPort
2014-04-09 06:34:43 +04:00
, width
, height
2014-04-09 22:03:31 +04:00
, coll
, refresh
2014-04-09 06:34:43 +04:00
2014-04-09 22:03:31 +04:00
// the query and the header are appended url-encoded
sb->urlEncode ( query );
// widget content header, usually a style tag
sb->urlEncode ( header );
2014-04-09 06:34:43 +04:00
sb->safePrintf ( // do not reset the user's "where" cookie
// to NYC from looking at this widget!
"Your browser does not support iframes"
//, si->m_urlParms);
//, wp
// offset in "sb" where the iframe code ends
long end = sb->length();
// NOTE(review): the reserve() result is not checked
sb->reserve ( end - start + 1000 );
char *wdir = "on the left";
long cols = 32;
//if ( width <= 240 )
sb->safePrintf("</td><td>&nbsp;&nbsp;</td><td valign=top>");
//else {
// sb->safePrintf("</td></tr><tr><td><br><br>");
// wdir = "above";
// cols = 60;
// }
sb->safePrintf ( "\n\n"
2014-04-09 22:03:31 +04:00
2014-04-09 06:34:43 +04:00
"<font style=\"font-size:16px;\">"
2014-04-10 00:31:11 +04:00
"Insert the following code into your webpage to "
2014-04-09 06:34:43 +04:00
"generate the widget %s. "
//"<a style=color:white href=/widget.html>"
//"Make $1 per click!</a></u></b>"
"<br><br><b>" , wdir );
// NOTE(review): "p" points into sb's own buffer while the calls below
// append to "sb". presumably the reserve() above is meant to keep the
// buffer from moving; confirm that it covers the encoded output size.
char *p = sb->getBufStart() + start;
sb->safePrintf("<textarea rows=30 cols=%li "
"style=\"border:2px solid black;\">", cols);
// print the iframe code again, html-encoded, for the user to copy
sb->htmlEncode ( p ,
end - start ,
false , // bool encodePoundSign
0 ); // niceness
// space widget to the right using this table
return true;
// . handle a widget page request arriving on socket "s"
// . a non-empty "token" parm is required unless "inlineedit" is set;
//   without either an ENOTOKEN error reply is sent
// . a token longer than 64 characters gets an ENOCOLLREC error reply
// . builds a parm list (addColl, ufp, apiUrl, sitelist) and broadcasts it
//   when it is non-empty
// . finally prints the widget page into a SafeBuf and sends it as a
//   dynamic page
// . NOTE(review): this copy of the function looks damaged: bare timestamp
//   lines are interleaved and several statements and closing braces are not
//   visible. verify against version control before changing any code.
bool sendPageWidget ( TcpSocket *s , HttpRequest *hr ) {
SafeBuf sb;
2014-04-09 22:03:31 +04:00
char *token = hr->getString("token",NULL);
// treat an empty token the same as a missing one
if ( token && ! token[0] ) token = NULL;
2014-04-11 11:29:06 +04:00
long edit = hr->getLong("inlineedit",0);
if ( ! token && ! edit ) {
2014-04-09 22:03:31 +04:00
g_errno = ENOTOKEN;
char *msg = mstrerror(g_errno);
return g_httpServer.sendErrorReply(s,g_errno,msg);
long tlen = 0;
if ( token ) tlen = gbstrlen(token);
if ( tlen > 64 ) {
g_errno = ENOCOLLREC;
char *msg = mstrerror(g_errno);
return g_httpServer.sendErrorReply(s,g_errno,msg);
// NOTE(review): no statement that fills "coll" is visible in this view
// before getRec(coll) reads it; the comment further down mentions the
// name <token>-widget123. confirm the line that builds it exists in the
// real file.
char coll[MAX_COLL_LEN];
CollectionRec *cr = NULL;
if ( token ) {
cr = g_collectiondb.getRec(coll);
SafeBuf parmList;
2014-04-10 00:31:11 +04:00
// cn stays -1 when the collection does not exist yet
collnum_t cn = -1;
if ( cr ) cn = cr->m_collnum;
2014-04-09 22:03:31 +04:00
// . first update their collection with the sites to crawl
// . this is NOT a custom diffbot crawl, just a regular one using
// the new crawl filters logic, "siteList"
char *sites = hr->getString("sites",NULL);
// add the collection if does not exist
if ( sites && ! cr && token ) {
// we need to add the new collnum, so reserve it
collnum_t newCollnum = g_collectiondb.reserveCollNum();
2014-04-10 00:31:11 +04:00
// use that
cn = newCollnum;
2014-04-09 22:03:31 +04:00
// add the new collection named <token>-widget123
2014-04-10 00:31:11 +04:00
g_parms.addNewParmToList1 ( &parmList,cn,coll,0,"addColl");
// note it
log("widget: adding new widget coll %s",coll);
if ( cn >= 0 && token ) {
2014-04-09 22:03:31 +04:00
// use special url filters profile that spiders sites
// shallowly and frequently to pick up new news stories
// "1" = (long)UFP_NEWS
// NOTE(review): no statement that fills "ttt" is visible in this view
char ttt[12];
2014-04-10 06:51:36 +04:00
// urlfiltersprofile
g_parms.addNewParmToList1 ( &parmList,cn,ttt,0,"ufp");
2014-04-09 22:03:31 +04:00
// use diffbot analyze
// NOTE(review): no statement that fills "durl" is visible in this view
char durl[1024];
2014-04-10 00:31:11 +04:00
2014-04-09 22:03:31 +04:00
// TODO: ensure we call diffbot ok
2014-04-10 00:31:11 +04:00
g_parms.addNewParmToList1 ( &parmList,cn,durl,0,"apiUrl");
2014-04-09 22:03:31 +04:00
2014-04-10 00:31:11 +04:00
// a missing "sites" parm is treated as an empty list
if ( ! sites ) sites = "";
// . update the list of sites to crawl and search and show in widget
// . if they give an empty list then allow that, it will stop crawling
if ( cn >= 0 && token )
g_parms.addNewParmToList1 ( &parmList,cn,sites,0,"sitelist");
2014-04-09 22:03:31 +04:00
if ( parmList.length() ) {
// send the parms to all hosts in the network
// both the state and the callback arguments are passed as NULL
g_parms.broadcastParmList ( &parmList ,
NULL,//s,// state is socket i guess
NULL);//doneBroadcastingParms2 );
// now display the widget controls and the widget and the iframe code
// NOTE(review): the return value of printWidgetPage() is ignored
printWidgetPage ( &sb , hr , coll );
2014-04-09 06:34:43 +04:00
return g_httpServer.sendDynamicPage(s,
-1,//cacheTime -1 means not to cache
false, // POST?
200, // httpstatus
NULL, // cookie
"UTF-8"); // charset
2014-05-08 00:28:20 +04:00