open-source-search-engine/reindex2.cpp
2014-11-10 14:45:11 -08:00

202 lines
5.3 KiB
C++

// Matt Wells, copyright Jan 2002
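// . usage: reindex2 [this hostnum]   (hostnum must be 4 through 7)
// . moves the files produced by running "reindex" to their final locations
// . run it on each of hosts 4-7, passing that host's own number: a host only
//   rcp's the files that sit in its own local /?/new/ directories
// . assumes the files to move are on hosts 4-7 in /?/new/*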
#include "gb-include.h"
#include <ctype.h>
int main ( int argc , char *argv[] ) {
// must have big filename
if ( argc != 2 ) {
printf("reindex2 [this hostnum]\n");
exit(-1);
}
int32_t thishostnum = atoi ( argv[1] );
if ( thishostnum <4 || thishostnum >7 ) {
printf("reindex2 [this hostnum]\n");
exit(-1);
}
printf("using hostnum %"INT32"\n",thishostnum);
// map of dbname to index #
char *names[] = { "index" , "spider" , "url" , "checksum" , "title" };
int32_t numnames = 5;
//
// gather list of all hosts/files
//
int32_t hosts [600*4*4];
char drives [600*4*4];
char filenames[600*4*4][64];
//int32_t filesize [600*4*4];
int32_t prenum [600*4*4];
int32_t filenum [600*4*4];
int32_t dbnamenum [600*4*4];
char *ext [600*4*4];
int32_t next [ 8 ] [ 4 ] [ 5 ];
int32_t count = 0;
for ( int32_t i = 4 ; i <= 7 ;i++ ) {
for ( char c ='a' ; c <= 'd' ; c++ ) {
char buf[128];
sprintf ( buf , "rsh host%"INT32" ls -1 /%c/new" , i, c );
//sprintf ( buf , "ls -1 /%c/new" , c );
// open pipe to read in
FILE *fd;
fd = popen ( buf , "r" );
if ( ! fd ) {
printf("reindex2: popen failed");
return -1;
}
//char tmp[1024];
while ( fgets ( filenames[count] , 64 , fd ) ) {
// get filename
//sscanf ( tmp,"%*s %*s %*s %*s %"INT32" %*s %*s %*s %s",
// &filesize[count], filenames[count] );
// ref the filename
char *f = filenames[count];
int32_t len = gbstrlen ( f );
f[--len] = '\0';
// print it
//printf("%s\n", f);
// parse it up
hosts [ count ] = i;
drives [ count ] = c;
// parse out prenum and filenum
int32_t *p1 = &prenum[count];
int32_t *p2 = &filenum[count];
int32_t *p = p1;
for ( int32_t j = 0 ; j < len ; j++ ) {
if ( ! isdigit ( f[j] ) ) continue;
char *end = &f[j+1];
while ( isdigit ( *end ) ) end++;
// tmp null
char x = *end;
*end = '\0';
*p = atoi(&f[j]);
j += (end - &f[j]) -1;
*end = x;
if ( p == p2 ) break;
p = p2;
}
// parse out extension
int32_t j = 0;
while ( f[j] != '.' ) j++;
ext [ count ] = &f[j+1];
// parse out db name
j = 0;
while ( ! isdigit (f[j]) ) j++;
f[j] ='\0';
// map f to #
int32_t k = 0;
for ( ; k < numnames ; k++ )
if ( strcmp ( f , names[k] ) == 0 ) {
dbnamenum [ count ] = k;
break;
}
if ( k >= numnames ) {
printf("error2\n");
return -1;
}
// last # for each dbnum/host/drive
next [ i ] [ c -'a'] [ dbnamenum[count]] = 1;
// print our reconstruction to verify
sprintf(buf,
"host%"INT32":/%c/new/%s%"INT32"db%04"INT32".%s",
hosts [count] , drives[count] ,
names[dbnamenum [count]] ,
prenum[count] ,
filenum[count] , ext[count] );
//printf("%s\n",buf);
// next file
count++;
}
pclose ( fd );
}
}
// print out all file names
//for ( int32_t i = 0 ; i < count ; i++ )
// printf("host%"INT32":/%c/new/%s%"INT32"db%"INT32".%s (%"INT32",%"INT32")\n",
// hosts[i],drives[i],
// dbname[i], prenum[i],
// filenum[i] , ext[i]);
// print total
//printf("total files = %"INT32"\n", count);
printf("echo \"ls phase done. writing rcps now\"\n");
// populate base dir of each host 4-7
for ( int32_t i = 0 ; i < count ; i++ ) {
// . get all files for host #i, dir $c
// . index*db0001.dat ...
char buf[128];
sprintf(buf,
"host%"INT32":/%c/new/%s%"INT32"db%04"INT32".%s",
hosts [i] , drives[i] ,
names [ dbnamenum [i]] , prenum[i] ,
filenum[i] , ext[i] );
// map prenum to new host/drive/dbnum
int32_t newhost = 4 + prenum[i] / 4;
char newdrive = 'a' + (prenum[i] % 4);
int32_t *p = &next[newhost] [newdrive -'a'] [dbnamenum[i]];
int32_t newnext = *p;
// skip, but advance, if we're not src host
if ( thishostnum != hosts[i] ) {
// advance to next file #
if ( ext[i][0]=='m') *p = *p + 2;
continue;
}
// print new filename
char buf2[128];
sprintf(buf2,
"host%"INT32":/%c/%sdb%04"INT32".%s",
newhost , newdrive ,
names [ dbnamenum [i]] ,
newnext , ext[i] );
// first get destinate filesize if it exists
// make the ls cmd first
/*
char buf3[128];
sprintf ( buf3 , "rsh host%"INT32" ls -la /%c/%sdb%04"INT32".%s",
newhost , newdrive ,
names [ dbnamenum [i]] ,
newnext , ext[i] );
// echo command
printf("echo \"%s\"\n", buf3 );
FILE *fd = popen ( buf3 , "r" );
char ttt[1024];
int32_t dorcp = 1;
int32_t size = 0;
if ( fgets ( ttt , 1024 , fd ) ) {
char tmp[1024];
sscanf ( tmp,"%*s %*s %*s %*s %"INT32" %*s %*s %*s %*s",
&size);
if ( size == filesize[i] ) dorcp = 0;
}
pclose(fd);
// avoid rcp if we can
if ( dorcp == 0 ) continue;
*/
//
//printf ( "%s --> %s\n", buf , buf2 );
// now copy file if we're src host
char buf4[128];
sprintf ( buf4 , "rcp /%c/new/%s%"INT32"db%04"INT32".%s %s",
drives[i] ,
names [ dbnamenum [i]] , prenum[i] ,
filenum[i] , ext[i] , buf2);
// execute and wait for copy to complete
printf ("%s\n",buf4 );
system ( buf4 );
//printf ("echo \"%s\"\n",buf4 );
//system ( buf4 );
// advance to next file #
if ( ext[i][0]=='m') *p = *p + 2;
}
}