#include "gb-include.h"

#include "Entities.h"
#include "Unicode.h"
#include "HashTableX.h"

// Internal helpers -- don't call these from outside this file.
static bool     initEntityTable      ( );
static uint32_t getTextEntity        ( char *s , int32_t len );
static uint32_t getDecimalEntity     ( char *s , int32_t len );
static uint32_t getHexadecimalEntity ( char *s , int32_t len );

// . parse the HTML entity (character reference) that starts at "s"
// . s[maxLen] should be the NUL terminator
// . returns the full length of the entity at "s" (including the trailing
//   ';' if present) if there is a valid one, 0 otherwise
// . sets *c to the code point the entity represents, or to 0 if "s" has
//   the shape of an entity but does not resolve to one
// . *c is left untouched when we bail out before attempting a lookup
int32_t getEntity_a ( char *s , int32_t maxLen , uint32_t *c ) {
	// ensure there's an & as first char
	if ( s[0] != '&' ) return 0;
	// compute maximum length of entity, if it's indeed an entity
	int32_t len = 1;
	if ( s[len] == '#' ) len++;
	// cut it off after 9 chars to save time
	while ( len < maxLen && len < 9 && is_alnum_a(s[len]) ) len++;
	// include the ending ; if any
	if ( len < maxLen && s[len] == ';' ) len++;
	// 9 scanned chars plus the ';' is the most we ever accept
	if ( len > 10 ) return 0;
	// all entities are 3 or more chars (the shortest have a two letter
	// name, like "&gt")
	if ( len < 3 ) return 0;
	// . if it's a numeric entity (starts with "&#") use these routines
	// . pass in the whole she-bang, "&#" prefix and optional ';' included
	if ( s[1] == '#' ) {
		if ( s[2] == 'x' ) *c = getHexadecimalEntity ( s , len );
		else               *c = getDecimalEntity     ( s , len );
	}
	// otherwise, it's text
	else *c = getTextEntity ( s , len );
	// return 0 if not an entity, length of entity if it is an entity
	if ( *c ) return len;
	return 0;
}

// Moved this out of function to be shared by ascii and unicode versions.
// Maps the 64-bit hash of an entity name to (index into s_entities) + 1.
static HashTableX s_table;
static bool       s_isInitialized = false;

struct Entity {
	// unicode code point the entity stands for
	int32_t        unicode;
	// entity name including the leading '&' but not the trailing ';'
	char          *entity;
	// always 0 in the table below; not read anywhere in this file
	unsigned char  c;
	// number of bytes used in utf8[], filled in by initEntityTable()
	int32_t        utf8Len;
	// utf8 encoding of "unicode", filled in by initEntityTable()
	unsigned char  utf8[4];
};

// parse these out of
// http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
// http://www.w3.org/TR/html4/sgml/entities.html
// wget that and awk the crap out:
//grep ENTITY poo | awk '{print $2" "$4}' | awk -F" \"&#" '{print $1" "$2}' | awk -F";" '{print $1}' | awk '{print "\t{ "$2", \"&"$1"\", 0,0,{0,0,0,0}},"}' >> Entities.cpp
//
// NOTE(review): every name must be spelled out in ASCII ("&nbsp", not the
// character it decodes to) because getTextEntity() hashes the raw bytes
// found in the document. Names must also be unique or initEntityTable()
// will deliberately crash.
static struct Entity s_entities[] = {
	{  160, "&nbsp"    , 0,0,{0,0,0,0}},
	{  161, "&iexcl"   , 0,0,{0,0,0,0}},
	{  162, "&cent"    , 0,0,{0,0,0,0}},
	{  163, "&pound"   , 0,0,{0,0,0,0}},
	{  164, "&curren"  , 0,0,{0,0,0,0}},
	{  165, "&yen"     , 0,0,{0,0,0,0}},
	{  166, "&brvbar"  , 0,0,{0,0,0,0}},
	{  167, "&sect"    , 0,0,{0,0,0,0}},
	{  168, "&uml"     , 0,0,{0,0,0,0}},
	{  169, "&copy"    , 0,0,{0,0,0,0}},
	{  170, "&ordf"    , 0,0,{0,0,0,0}},
	{  171, "&laquo"   , 0,0,{0,0,0,0}},
	{  172, "&not"     , 0,0,{0,0,0,0}},
	{  173, "&shy"     , 0,0,{0,0,0,0}},
	{  174, "&reg"     , 0,0,{0,0,0,0}},
	{  175, "&macr"    , 0,0,{0,0,0,0}},
	{  176, "&deg"     , 0,0,{0,0,0,0}},
	{  177, "&plusmn"  , 0,0,{0,0,0,0}},
	{  178, "&sup2"    , 0,0,{0,0,0,0}},
	{  179, "&sup3"    , 0,0,{0,0,0,0}},
	{  180, "&acute"   , 0,0,{0,0,0,0}},
	{  181, "&micro"   , 0,0,{0,0,0,0}},
	{  182, "&para"    , 0,0,{0,0,0,0}},
	{  183, "&middot"  , 0,0,{0,0,0,0}},
	{  184, "&cedil"   , 0,0,{0,0,0,0}},
	{  185, "&sup1"    , 0,0,{0,0,0,0}},
	{  186, "&ordm"    , 0,0,{0,0,0,0}},
	{  187, "&raquo"   , 0,0,{0,0,0,0}},
	{  188, "&frac14"  , 0,0,{0,0,0,0}},
	{  189, "&frac12"  , 0,0,{0,0,0,0}},
	{  190, "&frac34"  , 0,0,{0,0,0,0}},
	{  191, "&iquest"  , 0,0,{0,0,0,0}},
	{  192, "&Agrave"  , 0,0,{0,0,0,0}},
	{  193, "&Aacute"  , 0,0,{0,0,0,0}},
	{  194, "&Acirc"   , 0,0,{0,0,0,0}},
	{  195, "&Atilde"  , 0,0,{0,0,0,0}},
	{  196, "&Auml"    , 0,0,{0,0,0,0}},
	{  197, "&Aring"   , 0,0,{0,0,0,0}},
	{  198, "&AElig"   , 0,0,{0,0,0,0}},
	{  199, "&Ccedil"  , 0,0,{0,0,0,0}},
	{  200, "&Egrave"  , 0,0,{0,0,0,0}},
	{  201, "&Eacute"  , 0,0,{0,0,0,0}},
	{  202, "&Ecirc"   , 0,0,{0,0,0,0}},
	{  203, "&Euml"    , 0,0,{0,0,0,0}},
	{  204, "&Igrave"  , 0,0,{0,0,0,0}},
	{  205, "&Iacute"  , 0,0,{0,0,0,0}},
	{  206, "&Icirc"   , 0,0,{0,0,0,0}},
	{  207, "&Iuml"    , 0,0,{0,0,0,0}},
	{  208, "&ETH"     , 0,0,{0,0,0,0}},
	{  209, "&Ntilde"  , 0,0,{0,0,0,0}},
	{  210, "&Ograve"  , 0,0,{0,0,0,0}},
	{  211, "&Oacute"  , 0,0,{0,0,0,0}},
	{  212, "&Ocirc"   , 0,0,{0,0,0,0}},
	{  213, "&Otilde"  , 0,0,{0,0,0,0}},
	{  214, "&Ouml"    , 0,0,{0,0,0,0}},
	{  215, "&times"   , 0,0,{0,0,0,0}},
	{  216, "&Oslash"  , 0,0,{0,0,0,0}},
	{  217, "&Ugrave"  , 0,0,{0,0,0,0}},
	{  218, "&Uacute"  , 0,0,{0,0,0,0}},
	{  219, "&Ucirc"   , 0,0,{0,0,0,0}},
	{  220, "&Uuml"    , 0,0,{0,0,0,0}},
	{  221, "&Yacute"  , 0,0,{0,0,0,0}},
	{  222, "&THORN"   , 0,0,{0,0,0,0}},
	{  223, "&szlig"   , 0,0,{0,0,0,0}},
	{  224, "&agrave"  , 0,0,{0,0,0,0}},
	{  225, "&aacute"  , 0,0,{0,0,0,0}},
	{  226, "&acirc"   , 0,0,{0,0,0,0}},
	{  227, "&atilde"  , 0,0,{0,0,0,0}},
	{  228, "&auml"    , 0,0,{0,0,0,0}},
	{  229, "&aring"   , 0,0,{0,0,0,0}},
	{  230, "&aelig"   , 0,0,{0,0,0,0}},
	{  231, "&ccedil"  , 0,0,{0,0,0,0}},
	{  232, "&egrave"  , 0,0,{0,0,0,0}},
	{  233, "&eacute"  , 0,0,{0,0,0,0}},
	{  234, "&ecirc"   , 0,0,{0,0,0,0}},
	{  235, "&euml"    , 0,0,{0,0,0,0}},
	{  236, "&igrave"  , 0,0,{0,0,0,0}},
	{  237, "&iacute"  , 0,0,{0,0,0,0}},
	{  238, "&icirc"   , 0,0,{0,0,0,0}},
	{  239, "&iuml"    , 0,0,{0,0,0,0}},
	{  240, "&eth"     , 0,0,{0,0,0,0}},
	{  241, "&ntilde"  , 0,0,{0,0,0,0}},
	{  242, "&ograve"  , 0,0,{0,0,0,0}},
	{  243, "&oacute"  , 0,0,{0,0,0,0}},
	{  244, "&ocirc"   , 0,0,{0,0,0,0}},
	{  245, "&otilde"  , 0,0,{0,0,0,0}},
	{  246, "&ouml"    , 0,0,{0,0,0,0}},
	{  247, "&divide"  , 0,0,{0,0,0,0}},
	{  248, "&oslash"  , 0,0,{0,0,0,0}},
	{  249, "&ugrave"  , 0,0,{0,0,0,0}},
	{  250, "&uacute"  , 0,0,{0,0,0,0}},
	{  251, "&ucirc"   , 0,0,{0,0,0,0}},
	{  252, "&uuml"    , 0,0,{0,0,0,0}},
	{  253, "&yacute"  , 0,0,{0,0,0,0}},
	{  254, "&thorn"   , 0,0,{0,0,0,0}},
	{  255, "&yuml"    , 0,0,{0,0,0,0}},
	{  402, "&fnof"    , 0,0,{0,0,0,0}},
	{  913, "&Alpha"   , 0,0,{0,0,0,0}},
	{  914, "&Beta"    , 0,0,{0,0,0,0}},
	{  915, "&Gamma"   , 0,0,{0,0,0,0}},
	{  916, "&Delta"   , 0,0,{0,0,0,0}},
	{  917, "&Epsilon" , 0,0,{0,0,0,0}},
	{  918, "&Zeta"    , 0,0,{0,0,0,0}},
	{  919, "&Eta"     , 0,0,{0,0,0,0}},
	{  920, "&Theta"   , 0,0,{0,0,0,0}},
	{  921, "&Iota"    , 0,0,{0,0,0,0}},
	{  922, "&Kappa"   , 0,0,{0,0,0,0}},
	{  923, "&Lambda"  , 0,0,{0,0,0,0}},
	{  924, "&Mu"      , 0,0,{0,0,0,0}},
	{  925, "&Nu"      , 0,0,{0,0,0,0}},
	{  926, "&Xi"      , 0,0,{0,0,0,0}},
	{  927, "&Omicron" , 0,0,{0,0,0,0}},
	{  928, "&Pi"      , 0,0,{0,0,0,0}},
	{  929, "&Rho"     , 0,0,{0,0,0,0}},
	{  931, "&Sigma"   , 0,0,{0,0,0,0}},
	{  932, "&Tau"     , 0,0,{0,0,0,0}},
	{  933, "&Upsilon" , 0,0,{0,0,0,0}},
	{  934, "&Phi"     , 0,0,{0,0,0,0}},
	{  935, "&Chi"     , 0,0,{0,0,0,0}},
	{  936, "&Psi"     , 0,0,{0,0,0,0}},
	{  937, "&Omega"   , 0,0,{0,0,0,0}},
	{  945, "&alpha"   , 0,0,{0,0,0,0}},
	{  946, "&beta"    , 0,0,{0,0,0,0}},
	{  947, "&gamma"   , 0,0,{0,0,0,0}},
	{  948, "&delta"   , 0,0,{0,0,0,0}},
	{  949, "&epsilon" , 0,0,{0,0,0,0}},
	{  950, "&zeta"    , 0,0,{0,0,0,0}},
	{  951, "&eta"     , 0,0,{0,0,0,0}},
	{  952, "&theta"   , 0,0,{0,0,0,0}},
	{  953, "&iota"    , 0,0,{0,0,0,0}},
	{  954, "&kappa"   , 0,0,{0,0,0,0}},
	{  955, "&lambda"  , 0,0,{0,0,0,0}},
	{  956, "&mu"      , 0,0,{0,0,0,0}},
	{  957, "&nu"      , 0,0,{0,0,0,0}},
	{  958, "&xi"      , 0,0,{0,0,0,0}},
	{  959, "&omicron" , 0,0,{0,0,0,0}},
	{  960, "&pi"      , 0,0,{0,0,0,0}},
	{  961, "&rho"     , 0,0,{0,0,0,0}},
	{  962, "&sigmaf"  , 0,0,{0,0,0,0}},
	{  963, "&sigma"   , 0,0,{0,0,0,0}},
	{  964, "&tau"     , 0,0,{0,0,0,0}},
	{  965, "&upsilon" , 0,0,{0,0,0,0}},
	{  966, "&phi"     , 0,0,{0,0,0,0}},
	{  967, "&chi"     , 0,0,{0,0,0,0}},
	{  968, "&psi"     , 0,0,{0,0,0,0}},
	{  969, "&omega"   , 0,0,{0,0,0,0}},
	{  977, "&thetasym", 0,0,{0,0,0,0}},
	{  978, "&upsih"   , 0,0,{0,0,0,0}},
	{  982, "&piv"     , 0,0,{0,0,0,0}},
	{ 8226, "&bull"    , 0,0,{0,0,0,0}},
	{ 8230, "&hellip"  , 0,0,{0,0,0,0}},
	{ 8242, "&prime"   , 0,0,{0,0,0,0}},
	{ 8243, "&Prime"   , 0,0,{0,0,0,0}},
	{ 8254, "&oline"   , 0,0,{0,0,0,0}},
	{ 8260, "&frasl"   , 0,0,{0,0,0,0}},
	{ 8472, "&weierp"  , 0,0,{0,0,0,0}},
	{ 8465, "&image"   , 0,0,{0,0,0,0}},
	{ 8476, "&real"    , 0,0,{0,0,0,0}},
	{ 8482, "&trade"   , 0,0,{0,0,0,0}},
	{ 8501, "&alefsym" , 0,0,{0,0,0,0}},
	{ 8592, "&larr"    , 0,0,{0,0,0,0}},
	{ 8593, "&uarr"    , 0,0,{0,0,0,0}},
	{ 8594, "&rarr"    , 0,0,{0,0,0,0}},
	{ 8595, "&darr"    , 0,0,{0,0,0,0}},
	{ 8596, "&harr"    , 0,0,{0,0,0,0}},
	{ 8629, "&crarr"   , 0,0,{0,0,0,0}},
	{ 8656, "&lArr"    , 0,0,{0,0,0,0}},
	{ 8657, "&uArr"    , 0,0,{0,0,0,0}},
	{ 8658, "&rArr"    , 0,0,{0,0,0,0}},
	{ 8659, "&dArr"    , 0,0,{0,0,0,0}},
	{ 8660, "&hArr"    , 0,0,{0,0,0,0}},
	{ 8704, "&forall"  , 0,0,{0,0,0,0}},
	{ 8706, "&part"    , 0,0,{0,0,0,0}},
	{ 8707, "&exist"   , 0,0,{0,0,0,0}},
	{ 8709, "&empty"   , 0,0,{0,0,0,0}},
	{ 8711, "&nabla"   , 0,0,{0,0,0,0}},
	{ 8712, "&isin"    , 0,0,{0,0,0,0}},
	{ 8713, "&notin"   , 0,0,{0,0,0,0}},
	{ 8715, "&ni"      , 0,0,{0,0,0,0}},
	{ 8719, "&prod"    , 0,0,{0,0,0,0}},
	{ 8721, "&sum"     , 0,0,{0,0,0,0}},
	{ 8722, "&minus"   , 0,0,{0,0,0,0}},
	{ 8727, "&lowast"  , 0,0,{0,0,0,0}},
	{ 8730, "&radic"   , 0,0,{0,0,0,0}},
	{ 8733, "&prop"    , 0,0,{0,0,0,0}},
	{ 8734, "&infin"   , 0,0,{0,0,0,0}},
	{ 8736, "&ang"     , 0,0,{0,0,0,0}},
	{ 8743, "&and"     , 0,0,{0,0,0,0}},
	{ 8744, "&or"      , 0,0,{0,0,0,0}},
	{ 8745, "&cap"     , 0,0,{0,0,0,0}},
	{ 8746, "&cup"     , 0,0,{0,0,0,0}},
	{ 8747, "&int"     , 0,0,{0,0,0,0}},
	{ 8756, "&there4"  , 0,0,{0,0,0,0}},
	{ 8764, "&sim"     , 0,0,{0,0,0,0}},
	{ 8773, "&cong"    , 0,0,{0,0,0,0}},
	{ 8776, "&asymp"   , 0,0,{0,0,0,0}},
	{ 8800, "&ne"      , 0,0,{0,0,0,0}},
	{ 8801, "&equiv"   , 0,0,{0,0,0,0}},
	{ 8804, "&le"      , 0,0,{0,0,0,0}},
	{ 8805, "&ge"      , 0,0,{0,0,0,0}},
	{ 8834, "&sub"     , 0,0,{0,0,0,0}},
	{ 8835, "&sup"     , 0,0,{0,0,0,0}},
	{ 8836, "&nsub"    , 0,0,{0,0,0,0}},
	{ 8838, "&sube"    , 0,0,{0,0,0,0}},
	{ 8839, "&supe"    , 0,0,{0,0,0,0}},
	{ 8853, "&oplus"   , 0,0,{0,0,0,0}},
	{ 8855, "&otimes"  , 0,0,{0,0,0,0}},
	{ 8869, "&perp"    , 0,0,{0,0,0,0}},
	{ 8901, "&sdot"    , 0,0,{0,0,0,0}},
	{ 8968, "&lceil"   , 0,0,{0,0,0,0}},
	{ 8969, "&rceil"   , 0,0,{0,0,0,0}},
	{ 8970, "&lfloor"  , 0,0,{0,0,0,0}},
	{ 8971, "&rfloor"  , 0,0,{0,0,0,0}},
	{ 9001, "&lang"    , 0,0,{0,0,0,0}},
	{ 9002, "&rang"    , 0,0,{0,0,0,0}},
	{ 9674, "&loz"     , 0,0,{0,0,0,0}},
	{ 9824, "&spades"  , 0,0,{0,0,0,0}},
	{ 9827, "&clubs"   , 0,0,{0,0,0,0}},
	{ 9829, "&hearts"  , 0,0,{0,0,0,0}},
	{ 9830, "&diams"   , 0,0,{0,0,0,0}},
	{   34, "&quot"    , 0,0,{0,0,0,0}},
	{   38, "&amp"     , 0,0,{0,0,0,0}},
	// a hack fix
	// NOTE(review): presumably the upper-case spelling; it has to differ
	// from the row above or the duplicate check in initEntityTable()
	// fires -- confirm against version control
	{   38, "&AMP"     , 0,0,{0,0,0,0}},
	{   60, "&lt"      , 0,0,{0,0,0,0}},
	{   62, "&gt"      , 0,0,{0,0,0,0}},
	{  338, "&OElig"   , 0,0,{0,0,0,0}},
	{  339, "&oelig"   , 0,0,{0,0,0,0}},
	{  352, "&Scaron"  , 0,0,{0,0,0,0}},
	{  353, "&scaron"  , 0,0,{0,0,0,0}},
	{  376, "&Yuml"    , 0,0,{0,0,0,0}},
	{  710, "&circ"    , 0,0,{0,0,0,0}},
	{  732, "&tilde"   , 0,0,{0,0,0,0}},
	{ 8194, "&ensp"    , 0,0,{0,0,0,0}},
	{ 8195, "&emsp"    , 0,0,{0,0,0,0}},
	{ 8201, "&thinsp"  , 0,0,{0,0,0,0}},
	{ 8204, "&zwnj"    , 0,0,{0,0,0,0}},
	{ 8205, "&zwj"     , 0,0,{0,0,0,0}},
	{ 8206, "&lrm"     , 0,0,{0,0,0,0}},
	{ 8207, "&rlm"     , 0,0,{0,0,0,0}},
	{ 8211, "&ndash"   , 0,0,{0,0,0,0}},
	{ 8212, "&mdash"   , 0,0,{0,0,0,0}},
	{ 8216, "&lsquo"   , 0,0,{0,0,0,0}},
	{ 8217, "&rsquo"   , 0,0,{0,0,0,0}},
	{ 8218, "&sbquo"   , 0,0,{0,0,0,0}},
	{ 8220, "&ldquo"   , 0,0,{0,0,0,0}},
	{ 8221, "&rdquo"   , 0,0,{0,0,0,0}},
	{ 8222, "&bdquo"   , 0,0,{0,0,0,0}},
	{ 8224, "&dagger"  , 0,0,{0,0,0,0}},
	{ 8225, "&Dagger"  , 0,0,{0,0,0,0}},
	{ 8240, "&permil"  , 0,0,{0,0,0,0}},
	{ 8249, "&lsaquo"  , 0,0,{0,0,0,0}},
	{ 8250, "&rsaquo"  , 0,0,{0,0,0,0}},
	{ 8364, "&euro"    , 0,0,{0,0,0,0}}
};

/*
	// yeah right... here is a ton ton more!
	// http://www.blackwellpublishing.com/xml/dtds/4-0/help/bpg4-0entities.mod
	// it is like there is a text entity for every char!

	// JAB: from http://rabbit.eng.miami.edu/info/htmlchars.html
	// non-Latin1 that are missing from this version...
	// &Etilde
	// &Ering
	// &etilde
	// &ering
	// &Itilde
	// &Iring
	// &itilde
	// &iring
	// &OElig
	// &Oring
	// &oelig
	// &oring
	// &Utilde
	// &Uring
	// &utilde
	// &uring
	// &Ygrave
	// &Ycirc
	// &Ytilde
	// &Yuml
	// &Yring
	// &ygrave
	// &ycirc
	// &ytilde
	// &yring
};
*/

// . release the lookup table
// . also clears the "initialized" flag so the next text-entity lookup
//   rebuilds the table instead of probing an empty one
void resetEntities ( ) {
	s_table.reset();
	s_isInitialized = false;
}

// . build s_table (hash of entity name -> index+1 into s_entities) and
//   fill in the utf8/utf8Len fields of every s_entities row
// . does nothing if the table was already built
// . returns false if the table could not be set up or a row could not be
//   added
// . deliberately crashes on a corrupt table: a zero code point, a zero
//   length encoding or a duplicate entity name
static bool initEntityTable ( ) {
	if ( s_isInitialized ) return true;
	// set up the hash table
	if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
		return log("build: Could not init table of "
			   "HTML entities.");
	// now add in all the entities
	int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
	for ( int32_t i = 0 ; i < n ; i++ ) {
		int64_t h = hash64b ( s_entities[i].entity );
		// grab the unicode code point
		UChar32 up = s_entities[i].unicode;
		// every row must carry a code point
		if ( ! up ) { char *xx=NULL;*xx=0; }
		// point to it
		char *buf = (char *)s_entities[i].utf8;
		// set the utf8 from the code point
		int32_t len = utf8Encode(up,buf);
		//
		// make my own mods to make parsing easier
		//
		if ( up == 160 ) { // nbsp becomes a plain space
			buf[0] = ' ';
			len = 1;
		}
		// make all quotes equal '\"' (34 decimal)
		// double and single curling quotes
		// http://www.dwheeler.com/essays/quotes-test-utf-8.html
		// 201c, 201d, 2018, 2019 (unicode values, not utf8)
		// &ldquo, &rdquo, &lsquo, &rsquo
		/*
		if ( up == 171 ||
		     up == 187 ||
		     up == 8216 ||
		     up == 8217 ||
		     up == 8218 ||
		     up == 8220 ||
		     up == 8221 ||
		     up == 8222 ||
		     up == 8249 ||
		     up == 8250 ) {
			buf[0] = '\"';
			len = 1;
		}
		// and normalize all dashes (mdash,ndash)
		if ( up == 8211 || up == 8212 ) {
			buf[0] = '-';
			len = 1;
		}
		*/
		//
		// end custom mods
		//
		// set length
		s_entities[i].utf8Len = len;
		// check it
		if ( len == 0 ) { char *xx=NULL;*xx=0; }
		// must not exist!
		if ( s_table.isInTable(&h) ) { char *xx=NULL;*xx=0; }
		// store the entity index in the hash table as score
		// (offset by one so that a score of 0 can mean "not found")
		if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
	}
	s_isInitialized = true;
	return true;
}

// . is "s" a named HTML entity?
// . "s"/"len" is the whole thing: leading '&', name and optional ';'
// . returns the 32-bit unicode char it represents, as decoded from the
//   utf8 bytes stored in the table (so "&nbsp" yields a plain space)
// . returns 0 if none, or if the table could not be initialized
static uint32_t getTextEntity ( char *s , int32_t len ) {
	if ( ! initEntityTable() ) return 0;
	// take the ; off, if any
	if ( s[len-1] == ';' ) len--;
	// compute the hash of the entity including &, but not ;
	int64_t h = hash64 ( s , len );
	// get the entity index from table (stored in the score field)
	int32_t i = (int32_t) s_table.getScore ( &h );
	// return 0 if no match
	if ( i == 0 ) return 0;
	// point to the utf8 bytes prepared by initEntityTable()
	char *p = (char *)s_entities[i-1].utf8;
	// decode back into a unicode code point and return that
	return utf8Decode ( p );
}

// . get a decimal encoded entity
// . "s"/"len" is the whole thing: "&#", the digits and optional ';'
// . returns 0 if it is not a well formed decimal reference (wrong prefix,
//   wrong length, or any non-digit after "&#")
// . returns a space for values below 32 or above 0x10ffff
// . otherwise returns the code point
static uint32_t getDecimalEntity ( char *s , int32_t len ) {
	// take the ; off, if any
	if ( s[len-1] == ';' ) len--;
	// . "&#1" is the smallest it can be
	// . "&#1114111" is the biggest
	// NOTE(review): lower bound mirrors the shortest case the digit
	// handling supports (one digit) -- confirm
	if ( len < 3 || len > 9 ) return 0;
	// must start with &#
	if ( s[0] != '&' || s[1] != '#' ) return 0;
	// accumulate the digits; at most 7 of them so this cannot overflow
	uint32_t v = 0;
	for ( int32_t i = 2 ; i < len ; i++ ) {
		// every char after "&#" has to be a digit, not just the first
		if ( ! is_digit(s[i]) ) return 0;
		v = v * 10 + (uint32_t)(s[i] - '0');
	}
	// use space for control chars and for anything past the last
	// unicode code point
	// NOTE(review): the "< 32" bound is reconstructed -- confirm
	if ( v < 32 || v > 0x10ffff ) return (uint32_t)' ';
	return v;
}

// . get a hexadecimal encoded entity
// . "s"/"len" is the whole thing: "&#x", the hex digits and optional ';'
// . returns 0 if it is not a well formed hexadecimal reference (wrong
//   prefix, wrong length, or any non-hex char after "&#x")
// . returns a space for values below 32 or above 0x10ffff
// . otherwise returns the code point (a UChar32)
static uint32_t getHexadecimalEntity ( char *s , int32_t len ) {
	// take the ; off, if any
	if ( s[len-1] == ';' ) len--;
	// . "&#x1" is the smallest it can be
	// . "&#x10ffff" is the biggest
	if ( len < 4 || len > 9 ) return 0;
	// must start with &#x
	if ( s[0] != '&' || s[1] != '#' || s[2] != 'x' ) return 0;
	// accumulate the nibbles; at most 6 of them so this cannot overflow
	uint32_t v = 0;
	for ( int32_t i = 3 ; i < len ; i++ ) {
		// every char after "&#x" has to be hex, not just the first
		if ( ! is_hex ( s[i] ) ) return 0;
		v = (v << 4) + (uint32_t)htob(s[i]);
	}
	// use space for control chars and for anything past the last
	// unicode code point
	// NOTE(review): the "< 32" bound is reconstructed -- confirm
	if ( v < 32 || v > 0x10ffff ) return (uint32_t)' ';
	return v;
}