open-source-search-engine/AdultBit.cpp
2013-08-02 13:12:24 -07:00

166 lines
3.4 KiB
C++

#include "gb-include.h"
#include "AdultBit.h"
#include "HashTableX.h"
// . walks the NUL-terminated text "s" one alpha word at a time
// . returns true if document is adult, false otherwise
// . "niceness" is only handed to QUICKPOLL()
bool AdultBit::getBit ( char *s , long niceness) {
	// rudimentary adult detection algorithm
	//   W = non-dirty word
	//   D = dirty word
	//   . = sequence of punctuation/num and/or 1 to 2 letter words
	// every D adds 2 to the score, every W takes 1 away (floor of 0),
	// and a score of 5 or more flags the document:
	//   . D . D . D .       (score 2,4,6)
	//   . D . W . D . D .   (score 2,1,3,5)
	// i.e. if 3 out of 4 words in a subsequence are "dirty" then the
	// whole document is "adult" content
	long score = 0;
	// cursor into the text
	char *p = s;
	for ( ; ; ) {
		// advance to the first char of the next alpha run
		while ( *p && ! is_alpha_a(*p) ) p++;
		// ran off the end without tripping any rule
		if ( ! *p ) return false;
		// locate the first char past this alpha run
		char *end = p + 1;
		while ( *end && is_alpha_a(*end) ) end++;
		long wlen = end - p;
		// 1 or 2 letter words are skipped and do not touch the score
		if ( wlen > 2 ) {
			// a single obscene word is enough
			if ( isObscene ( p , wlen ) ) return true;
			if ( isDirty ( p , wlen ) ) {
				score += 2;
				if ( score >= 5 ) return true;
			}
			else {
				// a clean word cools the score off
				if ( --score < 0 ) score = 0;
				QUICKPOLL((niceness));
			}
		}
		// resume right after this word
		p = end;
	}
}
// . hash table of the "dirty" words, filled on the first call to
//   AdultBit::isDirty() and reset by resetAdultBit()
static HashTableX s_dtable;
// . true once s_dtable has been populated
// . kept at file scope (not inside isDirty()) so that resetAdultBit()
//   can clear it and force the table to be rebuilt on the next call
static bool s_dtableInitialized = false;

// . looks the word s/len up in the "dirty" word table
// . listed words are stored with score i+1, so a hit yields true
// . builds the table on first use; if that fails this returns the value
//   of log() (set() failure) or false (addTerm() failure)
// . NOTE(review): lookup hashes with hash64Lower_a() while the list is
//   hashed with hash64b() -- presumably these agree for lowercase
//   words, confirm against the hash routines
bool AdultBit::isDirty ( char *s , long len ) {
	static char *s_dirty[] = {
		"anal",
		"analsex",
		"blowjob",
		"blowjobs",
		"boob",
		"boobs",
		"clitoris",
		"cock",
		"cocks",
		"cum",
		"dick",
		"dicks",
		"gangbang",
		"gangbangs",
		"gangbanging",
		"movie",
		"movies",
		"oral",
		"oralsex",
		"porn",
		"porno",
		"pussy",
		"pussies",
		"sex",
		"sexy",
		"tit",
		"tits",
		"video",
		"videos",
		"xxx",
		"xxxx"
	};
	if ( ! s_dtableInitialized ) {
		// set up the hash table
		// NOTE(review): the third argument is derived from the
		// array's size in bytes, not its element count -- confirm
		// that is the intended sizing
		if ( ! s_dtable.set ( 8,4,sizeof(s_dirty )*2,NULL,0,false,0,
				      "adulttab"))
			return log("build: Error initializing "
				   "dirty word hash table." );
		// now add in all the dirty words
		long n = (long)sizeof(s_dirty)/ sizeof(char *);
		for ( long i = 0 ; i < n ; i++ ) {
			long long h = hash64b ( s_dirty [i] );
			if ( ! s_dtable.addTerm (&h, i+1) ) return false;
		}
		s_dtableInitialized = true;
	}
	// compute the hash of the word "s"
	long long h = hash64Lower_a ( s , len );
	// get from table
	return s_dtable.getScore ( &h );
}

// . hash table of the "obscene" words, filled on the first call to
//   AdultBit::isObscene() and reset by resetAdultBit()
static HashTableX s_otable;
// . true once s_otable has been populated
// . kept at file scope (not inside isObscene()) so that resetAdultBit()
//   can clear it and force the table to be rebuilt on the next call
static bool s_otableInitialized = false;

// . looks the word s/len up in the "obscene" word table
// . listed words are stored with score i+1, so a hit yields true
// . builds the table on first use; if that fails this returns the value
//   of log() (set() failure) or false (addTerm() failure)
bool AdultBit::isObscene ( char *s , long len ) {
	static char *s_obscene[] = {
		"clit",
		"clits",
		// "cum", magna cum laude
		"cums",
		"cumshot",
		"cunt",
		"cunts",
		"milf",
		"rimjob",
		"felch",
		"fuck",
		"fucked",
		"fucker",
		"fucking",
		"fucks",
		"whore",
		"whores"
	};
	if ( ! s_otableInitialized ) {
		// set up the hash table
		// NOTE(review): the third argument is derived from the
		// array's size in bytes, not its element count -- confirm
		// that is the intended sizing
		if ( ! s_otable.set ( 8,4,sizeof(s_obscene)*2,NULL,0,false,0,
				      "obscenetab") )
			return log("build: Error initializing "
				   "obscene word hash table." );
		// now add in all the obscene words
		long n = sizeof(s_obscene) / sizeof(char *);
		for ( long i = 0 ; i < n ; i++ ) {
			long long h = hash64b ( s_obscene[i] );
			if ( ! s_otable.addTerm ( &h, i+1 ) ) return false;
		}
		s_otableInitialized = true;
	}
	// compute the hash of the word "s"
	long long h = hash64Lower_a ( s , len );
	// get from table
	return s_otable.getScore ( &h );
}

// . resets both word tables
// . also clears the "initialized" flags; without that a later call to
//   isDirty()/isObscene() would skip the rebuild and look words up in
//   a table that had been reset
void resetAdultBit ( ) {
	s_dtable.reset();
	s_dtableInitialized = false;
	s_otable.reset();
	s_otableInitialized = false;
}