fix gb dump sitelinks

Matt Wells 2015-01-25 19:33:31 -08:00
parent 7c4a625779
commit ec55540432

@@ -12039,12 +12039,24 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
 if ( tag->m_type == typeSite ) {
     hostHash = tag->m_key.n1;
     site = tag->getTagData();
+    // make it null if too many .'s
+    if ( site ) {
+        char *p = site;
+        int count = 0;
+        // foo.bar.baz.com is ok
+        for ( ; *p ; p++ )
+            if ( *p == '.' ) count++;
+        if ( count >= 4 )
+            site = NULL;
+    }
+    if ( site && ! is_ascii2_a ( site, gbstrlen(site) ) ) {
+        site = NULL;
+        continue;
+    }
     if ( lastHostHash == hostHash && siteNumInlinks>=0) {
-        if ( siteNumInlinks > 0 && site )
+        // if we ask for 1 or 2 we end up with 100M
+        // entries, but with 3+ we get 27M
+        if ( siteNumInlinks > 2 && site )
             printf("%i %s\n",siteNumInlinks,site);
         siteNumInlinks = -1;
         site = NULL;
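
The first hunk adds two guards before a site name is kept: the name is dropped if it contains four or more dots (the in-diff comment notes that foo.bar.baz.com, with three, is fine), or, via gb's is_ascii2_a()/gbstrlen(), if it contains any non-ASCII byte. A minimal standalone sketch of the same filter, assuming plain C in place of the gb helpers; the keepSite name is hypothetical and the two checks are folded into a single pass:

#include <stdio.h>

// Return 1 to keep the site name, 0 to drop it: drop on any non-ASCII
// byte (stand-in for is_ascii2_a()) or on four or more dots.
static int keepSite ( const char *site ) {
    int dots = 0;
    for ( const char *p = site ; *p ; p++ ) {
        if ( (unsigned char)*p >= 128 ) return 0;
        if ( *p == '.' ) dots++;
    }
    // foo.bar.baz.com is ok (3 dots); 4+ means too deep a subdomain
    return dots < 4;
}

int main ( void ) {
    const char *tests[] = { "foo.bar.baz.com",
                            "a.b.c.d.com",
                            "caf\xc3\xa9.example.com" };
    for ( int i = 0 ; i < 3 ; i++ )
        printf("%s -> %s\n", tests[i],
               keepSite(tests[i]) ? "keep" : "drop");
    return 0;
}

Compiled alone, this keeps foo.bar.baz.com and drops the other two, matching the intent of the new guards.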
@@ -12059,7 +12071,9 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
     hostHash = tag->m_key.n1;
     siteNumInlinks = atoi(tag->getTagData());
     if ( lastHostHash == hostHash && site ) {
-        if ( siteNumInlinks > 0 )
+        // if we ask for 1 or 2 we end up with 100M
+        // entries, but with 3+ we get 27M
+        if ( siteNumInlinks > 2 )
             printf("%i %s\n",siteNumInlinks,sbuf);
         siteNumInlinks = -1;
         site = NULL;
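
Both hunks also raise the print threshold from siteNumInlinks > 0 to > 2; per the in-diff comment, that shrinks the dump from roughly 100M entries to 27M. A hypothetical, self-contained model of the pairing logic around that threshold, assuming a "site" tag and a "sitenuminlinks" tag for the same host hash can arrive in either order (TagRow and its field names are illustrative, not gb's API):

#include <stdio.h>
#include <stdint.h>

// Illustrative stand-in for a tagdb row: either a "site" tag (hostname
// set, inlinks == -1) or a "sitenuminlinks" tag (site == NULL).
struct TagRow {
    uint64_t hostHash;
    const char *site;
    int inlinks;
};

int main ( void ) {
    struct TagRow rows[] = {
        { 1001 , "foo.bar.baz.com" , -1 },
        { 1001 , NULL              ,  7 }, // printed: 7 > 2
        { 2002 , "example.com"     , -1 },
        { 2002 , NULL              ,  1 }, // skipped: 1 <= 2
    };
    uint64_t lastHostHash = 0;
    const char *site = NULL;
    for ( int i = 0 ; i < 4 ; i++ ) {
        struct TagRow *r = &rows[i];
        // remember the site half and wait for its inlink count
        if ( r->site ) { site = r->site; lastHostHash = r->hostHash; continue; }
        // print only when both halves match and the count clears the bar
        if ( r->hostHash == lastHostHash && site && r->inlinks > 2 )
            printf("%i %s\n", r->inlinks, site);
        site = NULL;
    }
    return 0;
}

Run as-is, this prints only the 7-inlink host, mirroring how the patched loop suppresses the long tail of 1- and 2-inlink sites.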