open-source-search-engine/Domains.cpp
2013-12-10 15:28:04 -08:00

841 lines
11 KiB
C++

#include "gb-include.h"
#include "HashTableX.h"
#include "Domains.h"
#include "Mem.h"
// . return the "domain" of a dotted numeric ip: everything before its last '.'
// . e.g. for "1.2.3.4" this returns "host" itself and sets *dlen to 5 ("1.2.3")
// . only the first "hostLen" bytes of "host" are examined
// . returns NULL and sets *dlen to 0 if "host" is NULL or empty, or if the
//   backward scan reaches the first char without finding a '.' after it
char *getDomainOfIp ( char *host , long hostLen , long *dlen ) {
	// an empty host has no last char to scan from. bail out here so we
	// never form a pointer before "host" or report a negative length.
	if ( ! host || hostLen <= 0 ) { *dlen = 0; return NULL; }
	// scan backwards from the last char looking for the last '.'
	char *s = host + hostLen - 1;
	while ( s > host && *s != '.' ) s--;
	// reached the first char: there is nothing in front of a '.' to return
	if ( s == host ) { *dlen = 0; return NULL; }
	// the domain is everything before that last '.'
	*dlen = s - host;
	// return the first 3 #'s (1.2.3) as the domain
	return host;
}
// . return the domain inside "host": the label just left of the tld plus
//   the tld itself, e.g. "xyz.com" for "www.xyz.com"
// . "tld" is the start of the tld inside "host" (the caller gets it from
//   getTLD()), or NULL if the host has no valid tld
// . sets *dlen to the length of the returned domain
// . returns NULL with *dlen set to 0 if there is no domain: no tld, the
//   host is only a tld, no period before the tld, or an empty label in
//   front of the tld
char *getDomain ( char *host , long hostLen , char *tld , long *dlen ) {
	// assume no domain
	*dlen = 0;
	// return NULL if host contains no valid tld
	if ( ! tld ) return NULL;
	// if tld is host we just have a tld
	if ( tld == host ) return NULL;
	// there MUST be a period right before the tld
	char *dot = tld - 1;
	if ( *dot != '.' ) return NULL;
	// . the label in front of that period must not be empty
	// . for ".com" backing up over the period would step before "host",
	//   and for both ".com" and "a..com" we would end up handing back
	//   ".com" as the domain
	if ( dot == host || dot[-1] == '.' ) return NULL;
	// start on the last char of that label and go back until we hit
	// "host" or another period
	char *s = dot - 1;
	while ( s > host && *s != '.' ) s--;
	// . now *s=='.' or s==host
	// . if s is host then "host" is an acceptable domain w/o a hostname
	// . fix http://.xyz.com/...... by checking for period
	if ( s == host && *s != '.' ) { *dlen = hostLen; return s; }
	// skip s forward over the period to point to domain name
	s++;
	// set domain length, it runs to the end of the host
	*dlen = hostLen - ( s - host );
	return s;
}
// . return a ptr to the start of the tld inside "host", or NULL if none
// . only the first "hostLen" bytes of "host" are examined
// . tries up to three candidate suffixes, shortest first: the text after the
//   last period, then after the 2nd-to-last, then after the 3rd-to-last
// . each candidate runs to the end of the host and is checked with isTLD()
// . a longer candidate that isTLD() accepts replaces a shorter one, so
//   the longest accepted suffix is returned
char *getTLD ( char *host , long hostLen ) {
	// nothing to scan. without this the scan below would start before
	// "host" and read bytes that do not belong to it.
	if ( ! host || hostLen <= 0 ) return NULL;
	char *hostEnd = host + hostLen;
	// our current best tld ptr
	char *tld = NULL;
	// . the third pass covers suffixes with 2 periods in them, the tld
	//   table lists a couple such as "LKD.CO.IM"
	enum { MAX_PASSES = 3 };
	// "s" walks backwards over the host one label per pass
	char *s = hostEnd;
	for ( long pass = 0 ; pass < MAX_PASSES ; pass++ ) {
		// step onto the last char (1st pass) or back up over the
		// period we stopped on (later passes)
		s--;
		// back up to the previous period or the start of the host
		while ( s > host && *s != '.' ) s--;
		// point to the tld in question, just past the period
		char *t = s;
		if ( *t == '.' ) t++;
		// is t a valid tld? if so, set "tld" to "t".
		// just because a shorter one matched doesn't mean we can't
		// try going up more
		if ( isTLD ( t , hostEnd - t ) ) tld = t;
		// we ran out of host, so return whatever we have
		if ( s == host ) break;
	}
	// we must have gotten the tld by this point, if there was a valid one
	return tld;
}
// . hash table of the tld strings listed in isTLD()
// . keys are the 64-bit hash64Lower_a() hashes of those strings, no data
// . filled in by the first isTLD() call that needs it, freed by
//   resetDomains()
static HashTableX s_table;
// . true once s_table has been completely filled in
// . kept at file scope so resetDomains() can clear it. as a local static of
//   isTLD() it stayed true after a reset, and every later lookup then ran
//   against the emptied table.
static bool s_tldTableReady = false;

// . return true if the "tldLen" bytes at "tld" form a valid tld
// . anything other than '.' and chars accepted by is_alpha_a() makes it false
// . with no period at all the string is accepted as a tld without
//   consulting the table
// . with 1 or 2 periods it must be in the table below, compared
//   through hash64Lower_a()
// . with 3 or more periods it is never a tld
// . if the table can not be built the value of log() is returned
//   (presumably false -- confirm in Log.h)
// . NOTE(review): a zero-length "tld" takes the "no period" path and is
//   accepted as a tld -- confirm callers want that
// . NOTE(review): entries containing digits ("G12.BR", "K12.EC", "K12.IL",
//   "K12.TR") look unreachable if is_alpha_a() rejects digits -- confirm
bool isTLD ( char *tld , long tldLen ) {
	long pcount = 0;
	// now they are random!
	for ( long i = 0 ; i < tldLen ; i++ ) {
		// period count
		if ( tld[i] == '.' ) { pcount++; continue; }
		if ( ! is_alpha_a(tld[i]) ) return false;
	}
	if ( pcount == 0 ) return true;
	// . the table has entries with 2 periods ("LKD.CO.IM", "PLC.CO.IM")
	//   and getTLD() makes a third pass just for them
	// . rejecting everything with 2 or more periods here meant those
	//   entries could never match, so only give up past 2 periods
	if ( pcount > 2 ) return false;
	// otherwise check the table to see if qualified
	// . i shrunk this list a lot
	// . see backups for the hold list
	static char *s_tlds[] = {
		"AB.CA",
		"AC",
		"AC.AE",
		"AC.AT",
		"AC.CN",
		"AC.CR",
		"AC.CY",
		"AC.FJ",
		"AC.GG",
		"AC.ID",
		"AC.IL",
		"AC.IM",
		"AC.IN",
		"AC.JE",
		"AC.JP",
		"AC.KR",
		"AC.NZ",
		"AC.PA",
		"AC.TH",
		"AC.UG",
		"AC.UK",
		"AC.YU",
		"AC.ZA",
		"AD",
		"AD.JP",
		"AE",
		"AERO",
		"AH.CN",
		"AI",
		"ALDERNEY.GG",
		"ALT.ZA",
		"AM", // 10.am
		"ART.BR",
		"ART.DO",
		"ARTS.CO",
		"ARTS.VE",
		"ASN.AU",
		"ASN.LV",
		"AG",
		"AS",
		"AT",
		"AU",
		"AW",
		"AZ",
		"BA",
		"BB",
		"BBS.TR",
		"BC.CA",
		"BD",
		"BE",
		"BF",
		"BG",
		"BH",
		"BI",
		"BIB.VE",
		"BIZ",
		"BJ",
		"BJ.CN",
		"BM",
		"BN",
		"BO",
		"BR",
		"BS",
		"BT",
		"BV",
		"BW",
		"BY",
		"BZ",
		"CA",
		"CC",
		"CD", // mdw
		"CF",
		"CG",
		"CH",
		"CI",
		"CK",
		"CL",
		"CM",
		"CN",
		"CO",
		"CO.AT",
		"CO.AO",
		"CO.CK",
		"CO.CR",
		"CO.GG",
		"CO.HU",
		"CO.ID",
		"CO.IL",
		"CO.IM",
		"CO.IN",
		"CO.JE",
		"CO.JP",
		"CO.KR",
		"COM",
		"COM.AR",
		"COM.AU",
		"COM.AZ",
		"COM.BB",
		"COM.BM",
		"COM.BR",
		"COM.BS",
		"COM.CN",
		"COM.CO",
		"COM.CU",
		"COM.CY",
		"COM.DO",
		"COM.EC",
		"COM.EG",
		"COM.FJ",
		"COM.GE",
		"COM.GU",
		"COM.HK",
		"COM.JO",
		"COM.KH",
		"COM.LA",
		"COM.LB",
		"COM.LC",
		"COM.LV",
		"COM.LY",
		"COM.MM",
		"COM.MO",
		"COM.MT",
		"COM.MX",
		"COM.MY",
		"COM.NA",
		"COM.NC",
		"COM.NI",
		"COM.NP",
		"COM.PA",
		"COM.PE",
		"COM.PH",
		"COM.PL",
		"COM.PY",
		"COM.RU",
		"COM.SG",
		"COM.SH",
		"COM.SY",
		"COM.TN",
		"COM.TR",
		"COM.TW",
		"COM.UA",
		"COM.UY",
		"COM.VE",
		"CONF.AU",
		"CONF.LV",
		"CO.NZ",
		"COOP",
		"CO.AE",
		"CO.SV",
		"CO.TH",
		"CO.UG",
		"CO.UK",
		"CO.VE",
		"CO.VI",
		"CO.YU",
		"CO.ZA",
		"CQ.CN",
		"CR",
		"CSIRO.AU",
		"CU",
		"CV",
		"CX",
		"CY",
		"CZ",
		"DE",
		"DJ",
		"DK",
		"DM",
		"DO",
		"DZ",
		"EC",
		"ED.CR",
		"EDU",
		"EDU.BM",
		"EDU.AR",
		"EDU.CN",
		"EDU.CO",
		"EDU.DO",
		"EDU.EC",
		"EDU.EG",
		"EDU.GE",
		"EDU.GU",
		"EDU.JO",
		"EDU.LC",
		"EDU.LV",
		"EDU.MM",
		"EDU.MO",
		"EDU.MY",
		"EDUNET.TN",
		"EDU.PA",
		"EDU.PY",
		"EDU.SG",
		"EDU.SH",
		"EDU.TR",
		"EDU.TW",
		"EDU.UY",
		"EDU.VE",
		"EDU.YU",
		"EDU.ZA",
		"EE",
		"EG",
		"EH",
		"ENS.TN",
		"ER",
		"ERNET.IN",
		"ES",
		"ESP.BR",
		"ET",
		"ETC.BR",
		"EU",
		"EUN.EG",
		"FI",
		"FI.CR",
		"FIN.EC",
		"FIN.TN",
		"FIRM.CO",
		"FIRM.VE",
		"FJ",
		"FK",
		"FM",
		"FO",
		"FR",
		"FX",
		"G12.BR",
		"GA",
		"GB",
		"GD",
		"GD.CN",
		"GE",
		"GEN.NZ",
		"GF",
		"GG",
		"GH",
		"GI",
		"GL",
		"GM",
		"GN",
		"GOB.PA",
		"GO.CR",
		"GO.ID",
		"GO.KR",
		"GO.TH",
		"GO.UG",
		"GOV",
		"GOV.AE",
		"GOV.AR",
		"GOV.AU",
		"GOV.BM",
		"GOV.BR",
		"GOV.CN",
		"GOV.CO",
		"GOV.CY",
		"GOV.DO",
		"GOV.EC",
		"GOV.EG",
		"GOVE.TW",
		"GOV.FJ",
		"GOV.GE",
		"GOV.GG",
		"GOV.GU",
		"GOV.IL",
		"GOV.IM",
		"GOV.IN",
		"GOV.JE",
		"GOV.JO",
		"GOV.JP",
		"GOV.LB",
		"GOV.LC",
		"GOV.LV",
		"GOV.MM",
		"GOV.MO",
		"GOV.MY",
		"GOV.SG",
		"GOV.SH",
		"GOV.TN",
		"GOVT.NZ",
		"GOV.TR",
		"GOV.UA",
		"GOV.UK",
		"GOV.VE",
		"GOV.ZA",
		"GP",
		"GQ",
		"GR",
		"GS",
		"GS.CN",
		"GT",
		"GU",
		"GUERNSEY.GG",
		"GW",
		"GX.CN",
		"GY",
		"GZ.CN",
		"HB.CN",
		"HE.CN",
		"HI.CN",
		"HK",
		"HK.CN",
		"HL.CN",
		"HM",
		"HN",
		"HN.CN",
		"HR",
		"HT",
		"HU",
		"ID",
		"ID.AU",
		"ID.FJ",
		"ID.LV",
		"IE",
		"IL",
		"IM",
		"IN",
		"IND.BR",
		"IND.GG",
		"IND.JE",
		"IND.TN",
		"INF.BR",
		"INFO",
		"INFO.AU",
		"INFO.CO",
		"INFO.HU",
		"INFO.TN",
		"INFO.VE",
		"INT",
		"INT.CO",
		"INTL.TN",
		"INT.VE",
		"IO",
		"IQ",
		"IR",
		"IS",
		"IT",
		"JE",
		"JERSEY.JE",
		"JL.CN",
		"JM",
		"JO",
		"JP",
		"JS.CN",
		"K12.EC",
		"K12.IL",
		"K12.TR",
		"KE",
		"KG",
		"KH",
		"KI",
		"KIDS",
		"KM",
		"KN",
		"KP",
		"KR",
		"KW",
		"KY",
		"KZ",
		"LA",
		"LB",
		"LC",
		"LI",
		"LK",
		"LKD.CO.IM",
		"LN.CN",
		"LR",
		"LS",
		"LT",
		"LTD.GG",
		"LTD.JE",
		"LTD.UK",
		"LU",
		"LV",
		"LY",
		"MA",
		"MB.CA",
		"MC",
		"MD",
		"ME",
		"MED.EC",
		"MG",
		"MH",
		"MIL",
		"MIL.BR",
		"MIL.CO",
		"MIL.DO",
		"MIL.EC",
		"MIL.GE",
		"MIL.GU",
		"MIL.ID",
		"MIL.LB",
		"MIL.LV",
		"MIL.PH",
		"MIL.SH",
		"MIL.TR",
		"MIL.VE",
		"MIL.ZA",
		"MK",
		"ML",
		"MM",
		"MN",
		"MO",
		"MO.CN",
		"MOD.UK",
		"MP",
		"MQ",
		"MR",
		"MS",
		"MT",
		"MU",
		"MUNI.IL",
		"MUSEUM",
		"MV",
		"MW",
		"MX",
		"MY",
		"MZ",
		"NA",
		"NAME",
		"NAT.TN",
		"NB.CA",
		"NC",
		"NE",
		"NET",
		"NET.AR",
		"NET.AU",
		"NET.AZ",
		"NET.BB",
		"NET.BM",
		"NET.BR",
		"NET.BS",
		"NET.CN",
		"NET.CU",
		"NET.CY",
		"NET.DO",
		"NET.EC",
		"NET.EG",
		"NET.GE",
		"NET.GG",
		"NET.GU",
		"NET.HK",
		"NET.ID",
		"NET.IL",
		"NET.IM",
		"NET.IN",
		"NET.JE",
		"NET.JO",
		"NET.JP",
		"NET.KH",
		"NET.LA",
		"NET.LB",
		"NET.LC",
		"NET.LV",
		"NET.LY",
		"NET.MM",
		"NET.MO",
		"NET.MT",
		"NET.MX",
		"NET.MY",
		"NET.NA",
		"NET.NC",
		"NET.NP",
		"NET.NZ",
		"NET.PA",
		"NET.PE",
		"NET.PH",
		"NET.PL",
		"NET.PY",
		"NET.RU",
		"NET.SG",
		"NET.SH",
		"NET.SY",
		"NET.TH",
		"NET.TN",
		"NET.TR",
		"NET.TW",
		"NET.UA",
		"NET.UK",
		"NET.UY",
		"NET.VE",
		"NET.VI",
		"NET.ZA",
		"NF",
		"NF.CA",
		"NG",
		"NGO.PH",
		"NGO.ZA",
		"NHS.UK",
		"NI",
		"NIC.IM",
		"NIC.IN",
		"NL",
		"NM.CN",
		"NM.KR",
		"NO",
		"NOM.CO",
		"NOM.VE",
		"NOM.ZA",
		"NP",
		"NR",
		"NS.CA",
		"NSK.SU",
		"NT.CA",
		"NU",
		"NUI.HU",
		"NX.CN",
		"NZ",
		"OM",
		"ON.CA",
		"OR.CR",
		"ORG",
		"ORG.AE",
		"ORG.AR",
		"ORG.AU",
		"ORG.AZ",
		"ORG.BB",
		"ORG.BM",
		"ORG.BR",
		"ORG.BS",
		"ORG.CN",
		"ORG.CO",
		"ORG.CU",
		"ORG.CY",
		"ORG.DO",
		"ORG.EC",
		"ORG.EG",
		"ORG.FJ",
		"ORG.GE",
		"ORG.GG",
		"ORG.GU",
		"ORG.HK",
		"ORG.HU",
		"ORG.IL",
		"ORG.IM",
		"ORG.JE",
		"ORG.JP",
		"ORG.KH",
		"ORG.LA",
		"ORG.LB",
		"ORG.LC",
		"ORG.LV",
		"ORG.LY",
		"ORG.MM",
		"ORG.MO",
		"ORG.MT",
		"ORG.MX",
		"ORG.MY",
		"ORG.NA",
		"ORG.NC",
		"ORG.NZ",
		"ORG.PA",
		"ORG.PE",
		"ORG.PH",
		"ORG.PL",
		"ORG.PY",
		"ORG.RU",
		"ORG.SG",
		"ORG.SH",
		"ORG.SY",
		"ORG.TN",
		"ORG.TR",
		"ORG.TW",
		"ORG.UK",
		"ORG.UY",
		"ORG.VE",
		"ORG.VI",
		"ORG.YU",
		"ORG.ZA",
		"OR.ID",
		"OR.KR",
		"OR.TH",
		"ORT.NP",
		"OR.UG",
		"OZ.AU",
		"PA",
		"PE",
		"PE.CA",
		"PF",
		"PG",
		"PH",
		"PK",
		"PL",
		"PLC.CO.IM",
		"PLC.UK",
		"PM",
		"PN",
		"POLICE.UK",
		"PR",
		"PRIV.HU",
		"PRO",
		"PSI.BR",
		"PT",
		"PVT.GE",
		"PW",
		"PY",
		"QA",
		"QC.CA",
		"QH.CN",
		"RE",
		"REC.BR",
		"REC.CO",
		"REC.VE",
		"RE.KR",
		"RES.IN",
		"RNRT.TN",
		"RNS.TN",
		"RNU.TN",
		"RO",
		"RU",
		"RW",
		"SA",
		"SA.CR",
		"SARK.GG",
		"SB",
		"SC",
		"SC.CN",
		"SCH.GG",
		"SCH.JE",
		"SCHOOL.FJ",
		"SCHOOL.ZA",
		"SCH.UK",
		"SCI.EG",
		"SD",
		"SE",
		"SG",
		"SH",
		"SH.CN",
		"SI",
		"SJ",
		"SK",
		"SK.CA",
		"SL",
		"SLD.PA",
		"SM",
		"SN",
		"SN.CN",
		"SO",
		"SR",
		"ST",
		"STORE.CO",
		"STORE.VE",
		"SU",
		"SV",
		"SX.CN",
		"SY",
		"SZ",
		"TC",
		"TD",
		"TEC.VE",
		"TELEMEMO.AU",
		"TF",
		"TG",
		"TH",
		"TJ",
		"TJ.CN",
		"TK",
		"TM",
		"TM.HU",
		"TMP.BR",
		"TM.ZA",
		"TN",
		"TO",
		"TOURISM.TN",
		"TP",
		"TR",
		"TRAVEL",
		"TT",
		"TV",
		"TW",
		"TW.CN",
		"TZ",
		"UA",
		"UG",
		"UK",
		"UM",
		"US",
		"UY",
		"UZ",
		"VA",
		"VC",
		"VE",
		"VG",
		"VI",
		"VN",
		"VU",
		"WEB.CO",
		"WEB.DO",
		"WEB.VE",
		"WEB.ZA",
		"WF",
		"WS",
		"XJ.CN",
		"XZ.CN",
		"YE",
		"YK.CA",
		"YN.CN",
		"YT",
		"YU",
		"ZA",
		"ZJ.CN",
		"ZM",
		"ZR",
		"ZW" };
	// build the table the first time we get here, and again after
	// resetDomains() has thrown it away
	if ( ! s_tldTableReady ) {
		// . set up the hash table: 8 byte keys, no data
		// . NOTE(review): sizeof(s_tlds) is a byte count, not the
		//   number of entries, so if the 3rd arg is a slot count
		//   this asks for far more than 2 slots per tld -- confirm
		if ( ! s_table.set ( 8 , 0, sizeof(s_tlds)*2,NULL,0,false,0,
				     "tldtbl") )
			return log("build: Could not init table of TLDs.");
		// now add in all the tlds, keyed by their lowercased hash
		long n = (long)sizeof(s_tlds)/ sizeof(char *);
		for ( long i = 0 ; i < n ; i++ ) {
			char *d = s_tlds[i];
			long dlen = gbstrlen ( d );
			long long dh = hash64Lower_a ( d , dlen );
			if ( ! s_table.addKey (&dh,NULL) )
				return log("build: dom table failed");
		}
		// only mark it ready once every entry made it in
		s_tldTableReady = true;
	}
	// hash the candidate the same way the entries were hashed
	long long h = hash64Lower_a ( tld , tldLen );
	return s_table.isInTable ( &h );
}

// . free the tld table
// . also forget that it was built, so the next isTLD() call that needs the
//   table fills it in again instead of searching an empty one
void resetDomains ( ) {
	s_table.reset();
	s_tldTableReady = false;
}