support base64 generated thumbnails in serps.

This commit is contained in:
Matt Wells 2014-04-24 14:04:57 -07:00
parent 08058d4f69
commit 82726879a2
10 changed files with 323 additions and 94 deletions

View File

@ -169,6 +169,7 @@ case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
case EDOCNONCANONICAL: return "Url was dup of canonical page";
case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
case ENOTOKEN: return "Missing token";
case EBADIMG: return "Bad image";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );

View File

@ -172,6 +172,7 @@ enum {
EWAITINGTOSYNCHOSTSCONF,
EDOCNONCANONICAL,
ECUSTOMCRAWLMISMATCH, // a crawl request was made with a name that already existed for bulk request (or the other way around)
ENOTOKEN
ENOTOKEN,
EBADIMG
};
#endif

View File

@ -12,10 +12,7 @@
//static void gotTermFreqWrapper ( void *state ) ;
static void gotTermListWrapper ( void *state ) ;
static void gotImgIpWrapper ( void *state , long ip ) ;
static void gotImageWrapper ( void *state ) ;
static void *thumbStartWrapper_r ( void *state , ThreadEntry *te );
static void thumbDoneWrapper ( void *state , ThreadEntry *te );
static void getImageInfo ( char *buf, long size, long *dx, long *dy, long *it);
Images::Images ( ) {
@ -31,6 +28,8 @@ void Images::reset() {
m_imgReplyLen = 0;
m_imgReplyMaxLen = 0;
m_numImages = 0;
m_imageBufValid = false;
m_phase = 0;
}
/*
@ -242,6 +241,7 @@ bool Images::getThumbnail ( char *pageSite ,
// reset here now
m_i = 0;
m_j = 0;
m_phase = 0;
// sanity check
if ( ! m_pageUrl ) { char *xx=NULL;*xx=0; }
@ -481,16 +481,11 @@ void Images::gotTermList ( ) {
bool Images::downloadImages () {
// all done if we got a valid thumbnail
if ( m_thumbnailValid ) return true;
// if not valid free old image
if ( m_imgReply ) {
mfree ( m_imgReply , m_imgReplyMaxLen , "Image" );
m_imgReply = NULL;
}
//if ( m_thumbnailValid ) return true;
long srcLen;
char *src = NULL;
long i = 0;
long node;
// downloading an image from diffbot json reply?
if ( m_xd->m_isDiffbotJSONObject ) {
@ -506,45 +501,98 @@ bool Images::downloadImages () {
goto insertionPoint;
}
// . download each leftover image
// . stop as soon as we get one with good dimensions
// . make a thumbnail of that one
for ( i = m_j ; i < m_numImages ; i++ ) {
// advance now
m_j++;
// if we should stop, stop
if ( m_stopDownloading ) break;
// skip if bad or not unique
if ( m_errors[i] ) continue;
// set status msg
sprintf ( m_statusBuf ,"downloading image %li",i);
// point to it
if ( m_xd ) m_xd->setStatus ( m_statusBuf );
// get the url of the image
src = m_xml->getString(i,i+1,"src",&srcLen);
// construct the url to download
insertionPoint:
// set it to the full url
//Url iu;
// use "pageUrl" as the baseUrl
m_imageUrl.set ( m_pageUrl , src , srcLen );
for ( ; m_j < m_numImages ; m_j++ , m_phase = 0 ) {
// get the image ip. will also download the image.
if ( ! downloadImage () )
return false;
if ( m_phase == 0 ) {
// advance
m_phase++;
// get img tag node
node = m_imageNodes[m_j];
// get the url of the image
src = m_xml->getString(node,"src",&srcLen);
// construct the url to download
insertionPoint:
// if we should stop, stop
if ( m_stopDownloading ) break;
// skip if bad or not unique
if ( m_errors[m_j] ) continue;
// set status msg
sprintf ( m_statusBuf ,"downloading image %li",m_j);
// point to it
if ( m_xd ) m_xd->setStatus ( m_statusBuf );
// use "pageUrl" as the baseUrl
m_imageUrl.set ( m_pageUrl , src , srcLen );
}
// process the image we downloaded in case did not block,
// maybe it was in the html cache
gotImage();
// get image ip
if ( m_phase == 1 ) {
// advance
m_phase++;
// this increments phase if it should
if ( ! getImageIp() ) return false;
// error?
if ( g_errno ) continue;
}
// download the actual image
if ( m_phase == 2 ) {
// advance
m_phase++;
// download image data
if ( ! downloadImage() ) return false;
// error downloading?
if ( g_errno ) continue;
}
// get thumbnail using threaded call to netpbm stuff
if ( m_phase == 3 ) {
// advance
m_phase++;
// make the thumbnail from the downloaded image data
if ( ! makeThumb() ) return false;
// error making the thumbnail?
if ( g_errno ) continue;
}
// error making thumb or just not a good thumb size?
if ( ! m_thumbnailValid ) {
// free old image we downloaded, if any
m_msg13.reset();
// i guess do this too, it was pointing at it in msg13
m_imgReply = NULL;
}
// it's a keeper
m_imageBuf.safeStrcpy ( m_imageUrl.getUrl() );
m_imageBuf.pushChar('\0');
m_imageBuf.pushLong(m_tdx);
m_imageBuf.pushLong(m_tdy);
m_imageBuf.safeMemcpy ( m_imgData , m_thumbnailSize );
m_imageBufValid = true;
// save mem. do this after because m_imgData uses m_msg13's
// reply buf to store the thumbnail for now...
m_msg13.reset();
m_imgReply = NULL;
return true;
}
return gotImage();
return true;
}
bool Images::downloadImage ( ) {
static void gotImgIpWrapper ( void *state , long ip ) {
Images *THIS = (Images *)state;
// control loop
if ( ! THIS->downloadImages() ) return;
// call callback at this point, we are done with the download loop
THIS->m_callback ( THIS->m_state );
}
bool Images::getImageIp ( ) {
if ( ! m_msgc.getIp ( m_imageUrl.getHost () ,
m_imageUrl.getHostLen() ,
&m_latestIp ,
@ -552,21 +600,18 @@ bool Images::downloadImage ( ) {
gotImgIpWrapper ))
// we blocked
return false;
return downloadImage2 ( );
return true;
}
void gotImgIpWrapper ( void *state , long ip ) {
static void downloadImageWrapper ( void *state ) {
Images *THIS = (Images *)state;
if ( ! THIS->downloadImage2 ( ) ) return;
// if did not block return control to loop
// control loop
if ( ! THIS->downloadImages() ) return;
// call callback at this point, we are done with the download loop
// all done
THIS->m_callback ( THIS->m_state );
}
bool Images::downloadImage2 ( ) {
bool Images::downloadImage ( ) {
// error?
if ( m_latestIp == 0 || m_latestIp == -1 ) {
log(LOG_DEBUG,"images: ip of %s is %li (%s)",
@ -575,9 +620,7 @@ bool Images::downloadImage2 ( ) {
g_errno = 0;
return true;
}
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
// assume success
m_httpStatus = 200;
// set the request
@ -594,24 +637,21 @@ bool Images::downloadImage2 ( ) {
strcpy(r->m_url,m_imageUrl.getUrl());
// . try to download it
// . i guess we are ignoring hammers at this point
if ( ! m_msg13.getDoc(r,false,this,gotImageWrapper))
if ( ! m_msg13.getDoc(r,false,this,downloadImageWrapper))
return false;
// make thumbnail. this can return false if blocks, true otherwise
// because it uses a thread
return gotImage ( );
return true;
}
void gotImageWrapper ( void *state ) {
static void makeThumbWrapper ( void *state , ThreadEntry *t ) {
Images *THIS = (Images *)state;
// process/store the reply
if ( ! THIS->gotImage ( ) ) return;
// download the images. will set m_stopDownloading when we get one
// control loop
if ( ! THIS->downloadImages() ) return;
// all done
THIS->m_callback ( THIS->m_state );
}
bool Images::gotImage ( ) {
bool Images::makeThumb ( ) {
// did it have an error?
if ( g_errno ) {
// just give up on all of them if one has an error
@ -633,7 +673,7 @@ bool Images::gotImage ( ) {
// the real page.
if ( g_errno ) {
log( "ERROR? g_errno puked: %s", mstrerror(g_errno) );
g_errno = 0;
//g_errno = 0;
return true;
}
//if ( ! slot ) return true;
@ -642,12 +682,17 @@ bool Images::gotImage ( ) {
bufLen = m_msg13.m_replyBufSize;
bufMaxLen = m_msg13.m_replyBufAllocSize;
// no image?
if ( ! buf || bufLen <= 0 ) return true;
if ( ! buf || bufLen <= 0 ) {
g_errno = EBADIMG;
return true;
}
// we are image candidate #i
long i = m_j - 1;
//long i = m_j - 1;
// get img tag node
long node = m_imageNodes[m_j];
// get the url of the image
long srcLen;
char *src = m_xml->getString(i,i+1,"src",&srcLen);
char *src = m_xml->getString(node,"src",&srcLen);
// set it to the full url
Url iu;
// use "pageUrl" as the baseUrl
@ -657,6 +702,7 @@ bool Images::gotImage ( ) {
log ( "image: MIME.set() failed in gotImage()" );
// give up on the remaining images then
m_stopDownloading = true;
g_errno = EBADIMG;
return true;
}
// set the status so caller can see
@ -667,6 +713,7 @@ bool Images::gotImage ( ) {
m_httpStatus);
// give up on the remaining images then
m_stopDownloading = true;
g_errno = EBADIMG;
return true;
}
// make sure this is an image
@ -675,6 +722,7 @@ bool Images::gotImage ( ) {
log( LOG_DEBUG, "image: gotImage() states that this image is "
"not in a format we currently handle." );
// try the next image if any
g_errno = EBADIMG;
return true;
}
// get the content
@ -690,11 +738,12 @@ bool Images::gotImage ( ) {
if ( ! m_imgReply || m_imgReplyLen == 0 ) {
log( LOG_DEBUG, "image: Returned empty image reply!" );
g_errno = EBADIMG;
return true;
}
// get next if too small
if ( m_imgDataSize < 20 ) return true;
if ( m_imgDataSize < 20 ) { g_errno = EBADIMG; return true; }
long imageType;
getImageInfo ( m_imgData, m_imgDataSize, &m_dx, &m_dy, &imageType );
@ -710,6 +759,7 @@ bool Images::gotImage ( ) {
// skip if bad dimensions
if( ((m_dx < 50) || (m_dy < 50)) && ((m_dx > 0) && (m_dy > 0)) ) {
log( "image: Image is too small to represent a news article." );
g_errno = EBADIMG;
return true;
}
@ -726,23 +776,14 @@ bool Images::gotImage ( ) {
if ( g_threads.call ( FILTER_THREAD ,
MAX_NICENESS ,
this ,
thumbDoneWrapper ,
makeThumbWrapper ,
thumbStartWrapper_r ) ) return false;
// threads might be off
logf ( LOG_DEBUG, "image: Calling thumbnail gen without thread.");
thumbStartWrapper_r ( NULL , NULL );
thumbStartWrapper_r ( this , NULL );
return true;
}
void thumbDoneWrapper ( void *state , ThreadEntry *t ) {
Images *THIS = (Images *)state;
// . download another image if we ! m_thumbnailValid
// . should also free m_imgReply if ! m_thumbnailValid
if ( ! THIS->downloadImages() ) return;
// all done
THIS->m_callback ( THIS->m_state );
}
void *thumbStartWrapper_r ( void *state , ThreadEntry *t ) {
Images *THIS = (Images *)state;
THIS->thumbStart_r ( true /* am thread?*/ );
@ -855,7 +896,8 @@ void Images::thumbStart_r ( bool amThread ) {
// Call clone function for the shell to execute command
// This call WILL BLOCK . timeout is 30 seconds.
int err = my_system_r( cmd, 30 ); // m_thmbconvTimeout );
//int err = my_system_r( cmd, 30 ); // m_thmbconvTimeout );
int err = system( cmd ); // m_thmbconvTimeout );
//if( (m_dx != 0) && (m_dy != 0) )
// unlink( in );
@ -936,6 +978,11 @@ void Images::thumbStart_r ( bool amThread ) {
// MDW: this was m_imgReply
getImageInfo ( m_imgData , m_thumbnailSize , &m_tdx , &m_tdy , NULL );
// now make the meta data struct
// <imageUrl>\0<width><height><thumbnailData>
log( LOG_DEBUG, "image: Thumbnail size: %li bytes.", m_imgDataSize );
log( LOG_DEBUG, "image: Thumbnail dx=%li dy=%li.", m_tdx,m_tdy );
log( LOG_DEBUG, "image: Thumbnail generated in %lldms.", stop-start );

View File

@ -49,19 +49,26 @@ class Images {
void *state ,
void (*callback)(void *state) );
char *getImageData () { return m_imgData; };
long getImageDataSize() { return m_imgDataSize; };
//char *getImageData () { return m_imgData; };
//long getImageDataSize() { return m_imgDataSize; };
//long getImageType () { return m_imageType; };
SafeBuf m_imageBuf;
bool m_imageBufValid;
long m_phase;
bool gotTermFreq();
bool launchRequests();
void gotTermList();
bool downloadImages();
bool downloadImage ( ) ;
bool downloadImage2 ( ) ;
bool gotImage ( );
bool getImageIp();
bool downloadImage();
bool makeThumb();
//bool gotImage ( );
void thumbStart_r ( bool amThread );
long m_i;

View File

@ -2567,9 +2567,34 @@ bool printResult ( State0 *st, long ix ) {
// http://img.youtube.com/vi/auQbi_fkdGE/2.jpg
// get the thumbnail url
if ( mr->ptr_imgUrl && si->m_format == FORMAT_HTML )
sb->safePrintf ("<a href=%s><image src=%s></a>",
sb->safePrintf ("<a href=%s><img src=%s></a>",
url,mr->ptr_imgUrl);
// if we have a thumbnail show it next to the search result
if ( si->m_format == FORMAT_HTML &&
! mr->ptr_imgUrl &&
mr->ptr_imgData ) {
char *p = mr->ptr_imgData; // orig img url
p += gbstrlen(p) + 1; // dx of thumb
long tdx = *(long *)p;
p += 4;
long tdy = *(long *)p;
p += 4;
char *imgData = p;
char *pend = mr->ptr_imgData + mr->size_imgData;
long thumbBytes = pend - p;
sb->safePrintf("<a href=%s>"
"<img width=%li height=%li "
"src=\""
"data:image/jpg;base64,"
,url
,tdx
,tdy);
// encode image in base 64
sb->base64Encode ( imgData , thumbBytes , 0 ); // 0 niceness
sb->safePrintf("\"></a>");
}
// print image for widget
if ( //mr->ptr_imgUrl &&
@ -2593,12 +2618,33 @@ bool printResult ( State0 *st, long ix ) {
, (long)RESULT_HEIGHT
, (long)PADDING
);
if ( mr->ptr_imgUrl )
sb->safePrintf("background-repeat:no-repeat;"
"background-size:%lipx 140px;"
"background-image:url('%s');"
, widgetwidth - 2*8 // padding is 8px
, mr->ptr_imgUrl);
// if ( mr->ptr_imgUrl )
// sb->safePrintf("background-repeat:no-repeat;"
// "background-size:%lipx 140px;"
// "background-image:url('%s');"
// , widgetwidth - 2*8 // padding is 8px
// , mr->ptr_imgUrl);
if ( mr->ptr_imgData ) {
char *p = mr->ptr_imgData; // orig img url
p += gbstrlen(p) + 1; // dx of thumb
//long tdx = *(long *)p;
p += 4;
//long tdy = *(long *)p;
p += 4;
char *imgData = p;
char *pend = mr->ptr_imgData + mr->size_imgData;
long thumbBytes = pend - p;
sb->safePrintf("background-repeat:no-repeat;"
"background-size:%lipx 140px;"
"background-image:url('data:image/"
"jpg;base64,"
, widgetwidth - 2*8); // padding is 8px
// encode image in base 64
sb->base64Encode (imgData,thumbBytes,0); // 0 niceness
sb->safePrintf("');");
}
// end the div style attribute and div tag
sb->safePrintf("\">");
sb->safePrintf ( "<a "

View File

@ -3285,3 +3285,92 @@ bool SafeBuf::csvEncode ( char *s , long len , long niceness ) {
return true;
}
// . append the base64 encoding of the "len" bytes at "sx" to this buffer
//   and null terminate it
// . uses the standard alphabet (A-Z a-z 0-9 + /) and pads the output with
//   '=' so its length is always a multiple of 4
// . "sx" is treated as raw binary so it may contain \0 bytes
// . "niceness" is handed to QUICKPOLL() once per 3-byte input group
// . returns true on success. a NULL "sx" appends nothing and returns true
// . returns false if reserve() could not make room for the output
bool SafeBuf::base64Encode ( char *sx , long len , long niceness ) {
	// nothing to encode
	if ( ! sx ) return true;
	// treat a bogus negative length as empty so we never ask reserve()
	// for a negative amount
	if ( len < 0 ) len = 0;
	unsigned char *s    = (unsigned char *)sx;
	unsigned char *send = s + len;
	// every group of 3 input bytes, a trailing partial group included,
	// becomes exactly 4 output chars. +1 for the \0 from nullTerm()
	long need = ((len + 2) / 3) * 4 + 1;
	if ( ! reserve ( need ) ) return false;
	// the 64 symbols of the alphabet indexed by their 6-bit value
	static const char tab[] =
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		"abcdefghijklmnopqrstuvwxyz"
		"0123456789+/";
	// write straight into the space we just reserved
	char *start = m_buf + m_length;
	char *dst   = start;
	// encode all the complete 3-byte groups
	for ( ; send - s >= 3 ; s += 3 ) {
		// breathe
		QUICKPOLL ( niceness );
		*dst++ = tab [   s[0] >> 2                        ];
		*dst++ = tab [ ((s[0] & 0x03) << 4) | (s[1] >> 4) ];
		*dst++ = tab [ ((s[1] & 0x0f) << 2) | (s[2] >> 6) ];
		*dst++ = tab [   s[2] & 0x3f                      ];
	}
	// 0, 1 or 2 bytes are left over. the missing low bits are zero
	// and each missing input byte is marked with one '='
	long left = send - s;
	if ( left == 1 ) {
		*dst++ = tab [  s[0] >> 2          ];
		*dst++ = tab [ (s[0] & 0x03) << 4  ];
		*dst++ = '=';
		*dst++ = '=';
	}
	else if ( left == 2 ) {
		*dst++ = tab [   s[0] >> 2                        ];
		*dst++ = tab [ ((s[0] & 0x03) << 4) | (s[1] >> 4) ];
		*dst++ = tab [  (s[1] & 0x0f) << 2                ];
		*dst++ = '=';
	}
	// account for what we wrote and terminate
	m_length += dst - start;
	nullTerm();
	return true;
}

View File

@ -110,6 +110,8 @@ struct SafeBuf {
bool csvEncode ( char *s , long len , long niceness = 0 );
bool base64Encode ( char *s , long len , long niceness = 0 );
//bool pushLong ( long val ) { return safeMemcpy((char *)&val,4); }
bool cat(SafeBuf& c);
// . only cat the sections/tag that start with "tagFilter"

View File

@ -3340,7 +3340,7 @@ char *XmlDoc::prepareToMakeTitleRec ( ) {
//Images *images = getImages();
//if ( ! images || images == (Images *)-1 ) return (char *)images;
char **id = getImageData();
char **id = getThumbnailData();
if ( ! id || id == (void *)-1 ) return (char *)id;
int8_t *hopCount = getHopCount();
@ -17387,13 +17387,19 @@ long XmlDoc::getDomHash32( ) {
// . you can inline it in an image tag like
// <img src="...."/>
// background-image:url(...);
char **XmlDoc::getImageData ( ) {
// . FORMAT of ptr_imageData:
// <origimageUrl>\0<4bytethumbwidth><4bytethumbheight><thumbnaildatajpg>
char **XmlDoc::getThumbnailData ( ) {
if ( m_imageDataValid ) return &ptr_imageData;
Images *images = getImages();
if ( ! images || images == (Images *)-1 ) return (char **)images;
ptr_imageData = images->m_imgData;
size_imageData = images->m_thumbnailSize; // size of image in bytes
ptr_imageData = NULL;
size_imageData = 0;
m_imageDataValid = true;
if ( ! images || ! images->m_imageBufValid ) return &ptr_imageData;
if ( images->m_imageBuf.length() <= 0 ) return &ptr_imageData;
ptr_imageData = images->m_imageBuf.getBufStart();
size_imageData = images->m_imageBuf.length();
return &ptr_imageData;
}
@ -18548,6 +18554,24 @@ bool XmlDoc::logIt ( ) {
else
sb.safePrintf("addlistsize=%05li ",(long)0);
if ( size_imageData && m_imageDataValid ) {
// url is in data now
char *imgUrl = ptr_imageData;
long imgUrlLen = gbstrlen(imgUrl);
char *p = imgUrl + imgUrlLen + 1;
long tdx = *(long *)p; p += 4; // thumb width
long tdy = *(long *)p; p += 4; // thumb height
long used = p - ptr_imageData;
long remain = size_imageData - used;
//char *imgData = imgUrl + imgUrlLen + 1;
sb.safePrintf("thumbnail=%s,%libytes,%lix%li ",
imgUrl,remain,tdx,tdy);
}
else
sb.safePrintf("thumbnail=none ");
/*
if ( m_hasAddressValid && m_addressesValid )
sb.safePrintf("numaddr=%li ",(long)m_addresses.m_numValid);
@ -27574,6 +27598,12 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
if ( *iu ) reply->size_imgUrl = gbstrlen(*iu)+1;
}
// get thumbnail image DATA
if ( ! reply->ptr_imgData ) { // && m_req->m_getImageUrl ) {
reply-> ptr_imgData = ptr_imageData;
reply->size_imgData = size_imageData;
}
// . adids contained in the doc
// . get from title rec rather than generating
// . but we need to generate to store in titleRec at index time
@ -28178,6 +28208,7 @@ char **XmlDoc::getImageUrl() {
// diffbot often extracts an image in the json. but even if pure
// json it might be diffbot json that was injected and we don't know
// it so check contentType...
/*
if ( m_isDiffbotJSONObject || m_contentType == CT_JSON ) {
char *iu = strstr(ptr_utf8Content,"\"images\":[{");
if ( ! iu ) return &m_imageUrl;
@ -28211,6 +28242,7 @@ char **XmlDoc::getImageUrl() {
m_imageUrl = m_imageUrlBuf.getBufStart();
return &m_imageUrl;
}
*/
// all done if not youtube or meta cafe
char *host = f->getHost();

View File

@ -670,7 +670,7 @@ class XmlDoc {
long getHostHash32a ( ) ;
long getHostHash32b ( ) ;
long getDomHash32 ( );
char **getImageData();
char **getThumbnailData();
class Images *getImages ( ) ;
int8_t *getNextSpiderPriority ( ) ;
long *getPriorityQueueNum ( ) ;

View File

@ -947,6 +947,11 @@ int main2 ( int argc , char *argv[] ) {
return 0;
}
//SafeBuf tt;
//tt.base64Encode("any carnal pleas",16);
//fprintf(stderr,"%s\n",tt.getBufStart());
//exit(0);
// get hosts.conf file
//char *hostsConf = "./hosts.conf";
long hostId = 0;
@ -1061,7 +1066,6 @@ int main2 ( int argc , char *argv[] ) {
// return 0;
//}
// these tests do not need a hosts.conf
/*
if ( strcmp ( cmd , "trietest" ) == 0 ) {