// Mirror of https://github.com/gigablast/open-source-search-engine.git
// (synced 2024-11-20 18:57:32 +03:00; 166 lines, 3.4 KiB, C++)
#include "gb-include.h"
|
|
|
|
#include "AdultBit.h"
|
|
#include "HashTableX.h"
|
|
|
|
// . returns true if document is adult, false otherwise
|
|
bool AdultBit::getBit ( char *s , int32_t niceness) {
|
|
|
|
// rudimentary adult detection algorithm
|
|
int32_t i = 0;
|
|
int32_t dirties = 0;
|
|
int32_t j;
|
|
int32_t slen;
|
|
loop:
|
|
|
|
// skip until we hit an alpha
|
|
while ( s[i] && ! is_alpha_a(s[i]) ) i++;
|
|
// return if done
|
|
if ( ! s[i] ) return false;
|
|
// . point to char after this alpha
|
|
// . return if none
|
|
j = i + 1;
|
|
// find end of the alpha char sequence
|
|
while ( s[j] && is_alpha_a(s[j]) ) j++;
|
|
// skip over 1 or 2 letter words
|
|
slen = j - i;
|
|
if ( slen <= 2 ) { i = j; goto loop; }
|
|
// it's adult content if it has just 1 obscene word
|
|
if ( isObscene ( (char *) s+i , slen ) ) return true;
|
|
|
|
// W = non-dirty word
|
|
// D = dirty word
|
|
// . = sequence of punctuation/num and/or 1 to 2 letter words
|
|
// dirty sequences:
|
|
// . D . D . D . (dirties=6)
|
|
// . D . W . D . D . (dirties=5)
|
|
// . basically, if 3 out of 4 words in a subsequence are
|
|
// "dirty" then the whole document is "adult" content
|
|
if ( isDirty ( (char *) s+i , slen ) ) {
|
|
dirties += 2;
|
|
if ( dirties >= 5 ) return true;
|
|
i = j;
|
|
goto loop;
|
|
}
|
|
|
|
dirties--;
|
|
if ( dirties < 0 ) dirties = 0;
|
|
|
|
QUICKPOLL((niceness));
|
|
i = j;
|
|
goto loop;
|
|
}
|
|
|
|
static HashTableX s_dtable;
|
|
bool AdultBit::isDirty ( char *s , int32_t len ) {
|
|
|
|
static bool s_isInitialized = false;
|
|
static char *s_dirty[] = {
|
|
"anal",
|
|
"analsex",
|
|
"blowjob",
|
|
"blowjobs",
|
|
"boob",
|
|
"boobs",
|
|
"clitoris",
|
|
"cock",
|
|
"cocks",
|
|
"cum",
|
|
"dick",
|
|
"dicks",
|
|
"gangbang",
|
|
"gangbangs",
|
|
"gangbanging",
|
|
"movie",
|
|
"movies",
|
|
"oral",
|
|
"oralsex",
|
|
"porn",
|
|
"porno",
|
|
"pussy",
|
|
"pussies",
|
|
"sex",
|
|
"sexy",
|
|
"tit",
|
|
"tits",
|
|
"video",
|
|
"videos",
|
|
"xxx",
|
|
"xxxx",
|
|
"xxxx"
|
|
};
|
|
|
|
if ( ! s_isInitialized ) {
|
|
// set up the hash table
|
|
if ( ! s_dtable.set ( 8,4,sizeof(s_dirty )*2,NULL,0,false,0,
|
|
"adulttab"))
|
|
return log("build: Error initializing "
|
|
"dirty word hash table." );
|
|
// now add in all the dirty words
|
|
int32_t n = (int32_t)sizeof(s_dirty)/ sizeof(char *);
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
int64_t h = hash64b ( s_dirty [i] );
|
|
if ( ! s_dtable.addTerm (&h, i+1) ) return false;
|
|
}
|
|
s_isInitialized = true;
|
|
}
|
|
|
|
// compute the hash of the word "s"
|
|
int64_t h = hash64Lower_a ( s , len );
|
|
|
|
// get from table
|
|
return s_dtable.getScore ( &h );
|
|
}
|
|
|
|
|
|
static HashTableX s_otable;
|
|
bool AdultBit::isObscene ( char *s , int32_t len ) {
|
|
|
|
static bool s_isInitialized = false;
|
|
static char *s_obscene[] = {
|
|
"clit",
|
|
"clits",
|
|
// "cum", magna cum laude
|
|
"cums",
|
|
"cumshot",
|
|
"cunt",
|
|
"cunts",
|
|
"milf",
|
|
"rimjob",
|
|
"felch",
|
|
"fuck",
|
|
"fucked",
|
|
"fucker",
|
|
"fucking",
|
|
"fucks",
|
|
"whore",
|
|
"whores"
|
|
};
|
|
|
|
if ( ! s_isInitialized ) {
|
|
// set up the hash table
|
|
if ( ! s_otable.set ( 8,4,sizeof(s_obscene)*2,NULL,0,false,0,
|
|
"obscenetab") )
|
|
return log("build: Error initializing "
|
|
"obscene word hash table." );
|
|
// now add in all the stop words
|
|
int32_t n = sizeof(s_obscene) / sizeof(char *);
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
int64_t h = hash64b ( s_obscene[i] );
|
|
if ( ! s_otable.addTerm ( &h, i+1 ) ) return false;
|
|
}
|
|
s_isInitialized = true;
|
|
}
|
|
|
|
// compute the hash of the word "s"
|
|
int64_t h = hash64Lower_a ( s , len );
|
|
|
|
// get from table
|
|
return s_otable.getScore ( &h );
|
|
}
|
|
|
|
// . calls reset() on both word tables used by AdultBit::isDirty() and
//   AdultBit::isObscene()
// . NOTE(review): the s_isInitialized flags are function-local statics
//   inside isDirty()/isObscene() and are not cleared here, so those
//   functions will not refill the tables after this call -- confirm this
//   is only invoked at shutdown
void resetAdultBit ( ) {
	// dirty word table
	s_dtable.reset();
	// obscene word table
	s_otable.reset();
}
|