#include "gb-include.h"

#include "HashTableX.h"
#include "Domains.h"
#include "Mem.h"

// Maximum number of trailing labels getTLD() will try as a TLD candidate.
// The longest entries in the table inside isTLD() have three labels
// (e.g. "LKD.CO.IM").
static const long MAX_TLD_LABELS = 3;

// Hash table holding the 64-bit lowercase hashes of the multi-label TLDs.
// It is filled lazily by isTLD() and emptied by resetDomains().
static HashTableX s_table;

// True once s_table has been populated. Kept at file scope so that
// resetDomains() can clear it together with the table.
static bool s_isInitialized = false;

// . treat "host" as a dotted numeric ip and return its "domain"
// . the domain is everything before the last period, e.g. "1.2.3" for
//   "1.2.3.4"
// . returns "host" and sets *dlen to the length of that prefix
// . returns NULL and sets *dlen to 0 if host is NULL/empty, contains no
//   period, or if its only period is the very first character
char *getDomainOfIp ( char *host , long hostLen , long *dlen ) {
	// nothing to scan; avoids starting the scan before the buffer
	if ( ! host || hostLen <= 0 ) { *dlen = 0; return NULL; }
	// walk backwards from the last char to the last period
	char *lastDot = host + hostLen - 1;
	while ( lastDot > host && *lastDot != '.' ) lastDot--;
	// hit the start of the host: no usable period
	if ( lastDot == host ) { *dlen = 0; return NULL; }
	// the domain is the text preceding the last period
	*dlen = lastDot - host;
	return host;
}

// . return the domain of "host": the label right before "tld" plus the
//   tld itself, through the end of the host
// . "tld" is expected to point into "host" (it is compared against and
//   subtracted from "host"), typically the result of getTLD()
// . sets *dlen to the domain length
// . returns NULL with *dlen set to 0 if tld is NULL, if the host is
//   nothing but the tld, or if the tld is not preceded by a period
char *getDomain ( char *host , long hostLen , char *tld , long *dlen ) {
	// assume no domain
	*dlen = 0;
	// no valid tld means no domain
	if ( ! tld ) return NULL;
	// the host is just a tld
	if ( tld == host ) return NULL;
	// there MUST be a period right before the tld
	char *start = tld - 1;
	if ( *start != '.' ) return NULL;
	// . move "start" left until it sits on the first char of the label
	//   preceding that period, i.e. until it reaches "host" or the char
	//   before it is another period
	// . this never forms a pointer before "host", and it also covers
	//   hosts with a leading period like ".xyz.com" (domain "xyz.com")
	while ( start > host && start[-1] != '.' ) start--;
	// the domain runs from "start" to the end of the host
	*dlen = hostLen - ( start - host );
	return start;
}

// . return a pointer to the longest trailing run of labels in "host"
//   that isTLD() accepts, trying 1, 2 and then 3 labels
// . a longer match overrides a shorter one
// . returns NULL if host is NULL/empty or no candidate is accepted
char *getTLD ( char *host , long hostLen ) {
	// an empty host has no tld; also keeps the scan inside the buffer
	if ( ! host || hostLen <= 0 ) return NULL;
	char *hostEnd = host + hostLen;
	// best tld found so far
	char *tld = NULL;
	// scan cursor, starts one past the last char
	char *s = hostEnd;
	for ( long pass = 0 ; pass < MAX_TLD_LABELS ; pass++ ) {
		// . first pass: step onto the last char
		// . later passes: back up over the period we stopped on
		s--;
		// go back until we hit "host" or another period
		while ( s > host && *s != '.' ) s--;
		// point to the candidate tld, skipping its leading period
		char *t = s;
		if ( *t == '.' ) t++;
		// is t a valid tld? if so, it beats any shorter one
		if ( isTLD ( t , hostEnd - t ) ) tld = t;
		// consumed the whole host, nothing more to try
		if ( s == host ) break;
	}
	// we must have gotten the tld by this point, if there was a valid one
	return tld;
}

// . return true if the tldLen bytes at "tld" form an acceptable tld
// . every char other than '.' must pass is_alpha_a()
// . a candidate with no period is always accepted
// . a candidate with one or two periods is accepted only if its
//   lowercased 64-bit hash is in the table built from s_tlds below
// . a candidate with more than two periods is rejected
// . on the first call that reaches the table, the table is built; if that
//   fails the value returned by log() is returned
bool isTLD ( char *tld , long tldLen ) {
	long periods = 0;
	for ( long i = 0 ; i < tldLen ; i++ ) {
		// period count
		if ( tld[i] == '.' ) { periods++; continue; }
		if ( ! is_alpha_a ( tld[i] ) ) return false;
	}
	// single label: accepted without consulting the table
	if ( periods == 0 ) return true;
	// . the table holds entries with up to two periods ("LKD.CO.IM",
	//   "PLC.CO.IM"), so only candidates with more than two periods can
	//   be rejected without a lookup
	// . rejecting at two periods made those entries unreachable
	if ( periods > 2 ) return false;

	// . known tlds, only consulted for multi-label candidates
	// . NOTE(review): entries containing digits ("G12.BR", "K12.EC",
	//   "K12.IL", "K12.TR") presumably can never match because the loop
	//   above applies is_alpha_a() to every char; confirm against the
	//   definition of is_alpha_a()
	static char *s_tlds[] = {
		"AB.CA", "AC", "AC.AE", "AC.AT", "AC.CN", "AC.CR", "AC.CY",
		"AC.FJ", "AC.GG", "AC.ID", "AC.IL", "AC.IM", "AC.IN", "AC.JE",
		"AC.JP", "AC.KR", "AC.NZ", "AC.PA", "AC.TH", "AC.UG", "AC.UK",
		"AC.YU", "AC.ZA", "AD", "AD.JP", "AE", "AERO", "AH.CN", "AI",
		"ALDERNEY.GG", "ALT.ZA",
		"AM", // 10.am
		"ART.BR", "ART.DO", "ARTS.CO", "ARTS.VE", "ASN.AU", "ASN.LV",
		"AG", "AS", "AT", "AU", "AW", "AZ", "BA", "BB", "BBS.TR",
		"BC.CA", "BD", "BE", "BF", "BG", "BH", "BI", "BIB.VE", "BIZ",
		"BJ", "BJ.CN", "BM", "BN", "BO", "BR", "BS", "BT", "BV", "BW",
		"BY", "BZ", "CA", "CC",
		"CD", // mdw
		"CF", "CG", "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CO.AT",
		"CO.AO", "CO.CK", "CO.CR", "CO.GG", "CO.HU", "CO.ID", "CO.IL",
		"CO.IM", "CO.IN", "CO.JE", "CO.JP", "CO.KR", "COM", "COM.AR",
		"COM.AU", "COM.AZ", "COM.BB", "COM.BM", "COM.BR", "COM.BS",
		"COM.CN", "COM.CO", "COM.CU", "COM.CY", "COM.DO", "COM.EC",
		"COM.EG", "COM.FJ", "COM.GE", "COM.GU", "COM.HK", "COM.JO",
		"COM.KH", "COM.LA", "COM.LB", "COM.LC", "COM.LV", "COM.LY",
		"COM.MM", "COM.MO", "COM.MT", "COM.MX", "COM.MY", "COM.NA",
		"COM.NC", "COM.NI", "COM.NP", "COM.PA", "COM.PE", "COM.PH",
		"COM.PL", "COM.PY", "COM.RU", "COM.SG", "COM.SH", "COM.SY",
		"COM.TN", "COM.TR", "COM.TW", "COM.UA", "COM.UY", "COM.VE",
		"CONF.AU", "CONF.LV", "CO.NZ", "COOP", "CO.AE", "CO.SV",
		"CO.TH", "CO.UG", "CO.UK", "CO.VE", "CO.VI", "CO.YU", "CO.ZA",
		"CQ.CN", "CR", "CSIRO.AU", "CU", "CV", "CX", "CY", "CZ", "DE",
		"DJ", "DK", "DM", "DO", "DZ", "EC", "ED.CR", "EDU", "EDU.BM",
		"EDU.AR", "EDU.CN", "EDU.CO", "EDU.DO", "EDU.EC", "EDU.EG",
		"EDU.GE", "EDU.GU", "EDU.JO", "EDU.LC", "EDU.LV", "EDU.MM",
		"EDU.MO", "EDU.MY", "EDUNET.TN", "EDU.PA", "EDU.PY", "EDU.SG",
		"EDU.SH", "EDU.TR", "EDU.TW", "EDU.UY", "EDU.VE", "EDU.YU",
		"EDU.ZA", "EE", "EG", "EH", "ENS.TN", "ER", "ERNET.IN", "ES",
		"ESP.BR", "ET", "ETC.BR", "EU", "EUN.EG", "FI", "FI.CR",
		"FIN.EC", "FIN.TN", "FIRM.CO", "FIRM.VE", "FJ", "FK", "FM",
		"FO", "FR", "FX", "G12.BR", "GA",
		"GB", "GD", "GD.CN", "GE", "GEN.NZ", "GF", "GG", "GH", "GI",
		"GL", "GM", "GN", "GOB.PA", "GO.CR", "GO.ID", "GO.KR", "GO.TH",
		"GO.UG", "GOV", "GOV.AE", "GOV.AR", "GOV.AU", "GOV.BM",
		"GOV.BR", "GOV.CN", "GOV.CO", "GOV.CY", "GOV.DO", "GOV.EC",
		"GOV.EG", "GOVE.TW", "GOV.FJ", "GOV.GE", "GOV.GG", "GOV.GU",
		"GOV.IL", "GOV.IM", "GOV.IN", "GOV.JE", "GOV.JO", "GOV.JP",
		"GOV.LB", "GOV.LC", "GOV.LV", "GOV.MM", "GOV.MO", "GOV.MY",
		"GOV.SG", "GOV.SH", "GOV.TN", "GOVT.NZ", "GOV.TR", "GOV.UA",
		"GOV.UK", "GOV.VE", "GOV.ZA", "GP", "GQ", "GR", "GS", "GS.CN",
		"GT", "GU", "GUERNSEY.GG", "GW", "GX.CN", "GY", "GZ.CN",
		"HB.CN", "HE.CN", "HI.CN", "HK", "HK.CN", "HL.CN", "HM", "HN",
		"HN.CN", "HR", "HT", "HU", "ID", "ID.AU", "ID.FJ", "ID.LV",
		"IE", "IL", "IM", "IN", "IND.BR", "IND.GG", "IND.JE", "IND.TN",
		"INF.BR", "INFO", "INFO.AU", "INFO.CO", "INFO.HU", "INFO.TN",
		"INFO.VE", "INT", "INT.CO", "INTL.TN", "INT.VE", "IO", "IQ",
		"IR", "IS", "IT", "JE", "JERSEY.JE", "JL.CN", "JM", "JO", "JP",
		"JS.CN", "K12.EC", "K12.IL", "K12.TR", "KE", "KG", "KH", "KI",
		"KIDS", "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA", "LB",
		"LC", "LI", "LK", "LKD.CO.IM", "LN.CN", "LR", "LS", "LT",
		"LTD.GG", "LTD.JE", "LTD.UK", "LU", "LV", "LY", "MA", "MB.CA",
		"MC", "MD", "ME", "MED.EC", "MG", "MH", "MIL", "MIL.BR",
		"MIL.CO", "MIL.DO", "MIL.EC", "MIL.GE", "MIL.GU", "MIL.ID",
		"MIL.LB", "MIL.LV", "MIL.PH", "MIL.SH", "MIL.TR", "MIL.VE",
		"MIL.ZA", "MK", "ML", "MM", "MN", "MO", "MO.CN", "MOD.UK",
		"MP", "MQ", "MR", "MS", "MT", "MU", "MUNI.IL", "MUSEUM", "MV",
		"MW", "MX", "MY", "MZ", "NA", "NAME", "NAT.TN", "NB.CA", "NC",
		"NE", "NET", "NET.AR", "NET.AU", "NET.AZ", "NET.BB", "NET.BM",
		"NET.BR", "NET.BS", "NET.CN", "NET.CU", "NET.CY", "NET.DO",
		"NET.EC", "NET.EG", "NET.GE", "NET.GG", "NET.GU", "NET.HK",
		"NET.ID", "NET.IL", "NET.IM", "NET.IN", "NET.JE", "NET.JO",
		"NET.JP", "NET.KH", "NET.LA", "NET.LB", "NET.LC", "NET.LV",
		"NET.LY", "NET.MM", "NET.MO", "NET.MT", "NET.MX", "NET.MY",
		"NET.NA", "NET.NC", "NET.NP", "NET.NZ", "NET.PA",
		"NET.PE", "NET.PH", "NET.PL", "NET.PY", "NET.RU", "NET.SG",
		"NET.SH", "NET.SY", "NET.TH", "NET.TN", "NET.TR", "NET.TW",
		"NET.UA", "NET.UK", "NET.UY", "NET.VE", "NET.VI", "NET.ZA",
		"NF", "NF.CA", "NG", "NGO.PH", "NGO.ZA", "NHS.UK", "NI",
		"NIC.IM", "NIC.IN", "NL", "NM.CN", "NM.KR", "NO", "NOM.CO",
		"NOM.VE", "NOM.ZA", "NP", "NR", "NS.CA", "NSK.SU", "NT.CA",
		"NU", "NUI.HU", "NX.CN", "NZ", "OM", "ON.CA", "OR.CR", "ORG",
		"ORG.AE", "ORG.AR", "ORG.AU", "ORG.AZ", "ORG.BB", "ORG.BM",
		"ORG.BR", "ORG.BS", "ORG.CN", "ORG.CO", "ORG.CU", "ORG.CY",
		"ORG.DO", "ORG.EC", "ORG.EG", "ORG.FJ", "ORG.GE", "ORG.GG",
		"ORG.GU", "ORG.HK", "ORG.HU", "ORG.IL", "ORG.IM", "ORG.JE",
		"ORG.JP", "ORG.KH", "ORG.LA", "ORG.LB", "ORG.LC", "ORG.LV",
		"ORG.LY", "ORG.MM", "ORG.MO", "ORG.MT", "ORG.MX", "ORG.MY",
		"ORG.NA", "ORG.NC", "ORG.NZ", "ORG.PA", "ORG.PE", "ORG.PH",
		"ORG.PL", "ORG.PY", "ORG.RU", "ORG.SG", "ORG.SH", "ORG.SY",
		"ORG.TN", "ORG.TR", "ORG.TW", "ORG.UK", "ORG.UY", "ORG.VE",
		"ORG.VI", "ORG.YU", "ORG.ZA", "OR.ID", "OR.KR", "OR.TH",
		"ORT.NP", "OR.UG", "OZ.AU", "PA", "PE", "PE.CA", "PF", "PG",
		"PH", "PK", "PL", "PLC.CO.IM", "PLC.UK", "PM", "PN",
		"POLICE.UK", "PR", "PRIV.HU", "PRO", "PSI.BR", "PT", "PVT.GE",
		"PW", "PY", "QA", "QC.CA", "QH.CN", "RE", "REC.BR", "REC.CO",
		"REC.VE", "RE.KR", "RES.IN", "RNRT.TN", "RNS.TN", "RNU.TN",
		"RO", "RU", "RW", "SA", "SA.CR", "SARK.GG", "SB", "SC",
		"SC.CN", "SCH.GG", "SCH.JE", "SCHOOL.FJ", "SCHOOL.ZA",
		"SCH.UK", "SCI.EG", "SD", "SE", "SG", "SH", "SH.CN", "SI",
		"SJ", "SK", "SK.CA", "SL", "SLD.PA", "SM", "SN", "SN.CN", "SO",
		"SR", "ST", "STORE.CO", "STORE.VE", "SU", "SV", "SX.CN", "SY",
		"SZ", "TC", "TD", "TEC.VE", "TELEMEMO.AU", "TF", "TG", "TH",
		"TJ", "TJ.CN", "TK", "TM", "TM.HU", "TMP.BR", "TM.ZA", "TN",
		"TO", "TOURISM.TN", "TP", "TR", "TRAVEL", "TT", "TV", "TW",
		"TW.CN", "TZ", "UA", "UG", "UK", "UM", "US", "UY", "UZ", "VA",
		"VC", "VE", "VG", "VI", "VN", "VU", "WEB.CO", "WEB.DO",
		"WEB.VE", "WEB.ZA", "WF", "WS", "XJ.CN", "XZ.CN", "YE",
		"YK.CA", "YN.CN", "YT", "YU",
		"ZA", "ZJ.CN", "ZM", "ZR", "ZW"
	};

	if ( ! s_isInitialized ) {
		// set up the hash table: 8-byte keys, no data
		if ( ! s_table.set ( 8 , 0, sizeof(s_tlds)*2,NULL,0,false,0,
				     "tldtbl") )
			return log("build: Could not init table of TLDs.");
		// now add the hash of every known tld
		long n = (long)sizeof(s_tlds)/ sizeof(char *);
		for ( long i = 0 ; i < n ; i++ ) {
			char      *d    = s_tlds[i];
			long       dlen = gbstrlen ( d );
			long long  dh   = hash64Lower_a ( d , dlen );
			if ( ! s_table.addKey (&dh,NULL) )
				return log("build: dom table failed");
		}
		s_isInitialized = true;
	}
	// look the candidate up by its lowercased hash
	long long h = hash64Lower_a ( tld , tldLen );
	return s_table.isInTable ( &h );
}

// . empty the tld table
// . also clear the initialized flag so that the next isTLD() call which
//   needs the table rebuilds it instead of consulting an empty one
void resetDomains ( ) {
	s_table.reset();
	s_isInitialized = false;
}