#!/usr/bin/perl
#
# parse_iana_charsets.pl
#
# Reads the official IANA charset list on stdin
#     http://www.iana.org/assignments/character-sets
# and generates iana_charset.h and iana_charset.cpp.
#
# A "supported" flag is set on the charsets we recognise and that iconv
# will convert for us.  The list of supported charsets is read from
# supported_charsets.txt (one "<MIBenum> <name>" pair per line), which must
# be present in the current directory.
#
# NOTE(review): the copy of this script that was reviewed had lost several
# "<...>" spans (the <SUPPORTED> readline and the beginning of both
# here-documents).  Everything that had to be reconstructed is marked with
# NOTE(review) below and should be compared with the version-controlled
# original before the generated files are used.

use strict;
use warnings;

# ---------------------------------------------------------------------------
# 1. Load the MIB enum values of the charsets flagged as supported.
# ---------------------------------------------------------------------------
my %supportedCharsets;

open(my $supported_fh, '<', 'supported_charsets.txt')
    or die "Couldn't open supported_charsets.txt: $!";
while (my $line = <$supported_fh>) {
    $line =~ s/[\r\n]+\z//;    # tolerate both LF and CRLF line endings
    next unless $line =~ /^\s*(\d+)\s+([\w-]+)/;
    my ($csEnum, $name) = ($1, $2);
    print "Supported charset: $name ($csEnum)\n";
    # "+ 0" normalises the key so it compares equal to the parsed MIBenum.
    $supportedCharsets{ $csEnum + 0 } = 1;
}
close($supported_fh);

# ---------------------------------------------------------------------------
# 2. Parse the IANA registry from stdin (or the files named in @ARGV).
#
# %charsets maps the registered "Name:" to a record with these keys:
#     ref       - text inside the [...] reference on the Name: line, if any
#     names     - array of the name followed by every alias, in file order
#     preferred - index into names of the "(preferred MIME name)" alias
#     enum_name - index into names of the last alias starting with "cs"
#     enum_val  - the MIBenum value (absent until a MIBenum: line is seen)
# ---------------------------------------------------------------------------
my $curname;         # name of the charset record currently being read
my $csCount = 0;     # number of Name: lines seen (not read by visible code)
my %charsets;

while (my $line = <>) {
    $line =~ s/[\r\n]+\z//;

    if ($line =~ /^Name:\s+([^\s]+)[^\[]*(\[([^\]]*)\])?/) {
        # A Name: line starts a new charset record.
        $csCount++;
        $curname = $1;
        $charsets{$curname} = {
            ref       => $3,
            names     => [$curname],
            preferred => 0,
            enum_name => 0,
        };
        next;
    }

    next unless defined $curname;

    if ($line =~ /^\s*$/) {
        # A blank line ends the current record.
        undef $curname;
        next;
    }

    if ($line =~ /MIBenum:\s*(\d+)/) {
        $charsets{$curname}{enum_val} = $1 + 0;
        next;
    }

    if ($line =~ /Alias:\s+([^\s]+)(\s+\(preferred MIME name\))?/) {
        # Copy the captures right away; later matches would clobber them.
        my ($alias, $preferred_tag) = ($1, $2);
        next if $alias eq 'None';

        my $names = $charsets{$curname}{names};
        push @{$names}, $alias;

        # $#{$names} is the index of the alias just appended.  (The former
        # spelling $#{@{...}} is a fatal error on perl 5.22 and later.)
        if (defined $preferred_tag && length $preferred_tag) {
            $charsets{$curname}{preferred} = $#{$names};
        }
        if ($alias =~ /^cs/) {
            $charsets{$curname}{enum_name} = $#{$names};
        }
    }
}

# Additional aliases that are not in the IANA list.  Only add them when the
# canonical charset was actually parsed, so that a missing entry does not
# autovivify a half-initialised record.
my @extra_aliases = (
    [ 'TIS-620',   'windows-874' ],
    [ 'Shift_JIS', 'x-sjis' ],
);
for my $pair (@extra_aliases) {
    my ($canonical, $alias) = @{$pair};
    if (exists $charsets{$canonical}) {
        push @{ $charsets{$canonical}{names} }, $alias;
    }
    else {
        warn "Charset $canonical not found in input; alias $alias skipped\n";
    }
}

# Records that never got a MIBenum cannot be emitted; drop them before
# sorting so the numeric comparison never sees an undefined value.
my @registered = sort { $a->{enum_val} <=> $b->{enum_val} }
                 grep { defined $_->{enum_val} } values %charsets;

my $stamp = gmtime();    # scalar context: human-readable UTC timestamp

# ---------------------------------------------------------------------------
# 3. Write iana_charset.h: the eIANACharset enum and the prototypes.
# ---------------------------------------------------------------------------
open(my $hdr, '>', 'iana_charset.h')
    or die "Can't open iana_charset.h for writing: $!";

print {$hdr} "// iana_charset.h\n";
print {$hdr} "// Generated automatically by parse_iana_charsets.pl " . $stamp . "\n";
print {$hdr} "// DO NOT EDIT!!!\n\n";
print {$hdr} "#ifndef IANA_CHARSET_H__\n";
print {$hdr} "#define IANA_CHARSET_H__\n";
print {$hdr} "enum eIANACharset{\n";
print {$hdr} "\tcsOther = 1, // unregistered character set\n";
print {$hdr} "\tcsUnknown = 2, // used as a default value\n";

for my $cs (@registered) {
    # Enumerator name: the chosen alias with '-', '_' and ':' removed and a
    # "cs" prefix added when it does not already have one.
    my $enum_name = $cs->{names}[ $cs->{enum_name} ];
    $enum_name =~ s/[\-\_\:]+//sg;
    $enum_name = "cs" . $enum_name if $enum_name !~ /^cs/;
    print {$hdr} "\t$enum_name = $cs->{enum_val},\n";
}

print {$hdr} "\tcsReserved = 3000\n};\n\n";
print {$hdr} "short get_iana_charset(char *cs, int len); \n";
print {$hdr} "char *get_charset_str(short cs); \n";
print {$hdr} "bool supportedCharset(short cs); \n";
print {$hdr} "void setSupportedCharsets(short *cs, int numCharsets);\n";
print {$hdr} "#endif\n";
close($hdr) or die "Error writing iana_charset.h: $!";

# ---------------------------------------------------------------------------
# 4. Write iana_charset.cpp: the lookup table and the lookup functions.
# ---------------------------------------------------------------------------
open(my $cpp, '>', 'iana_charset.cpp')
    or die "Can't open iana_charset.cpp for writing: $!";

# The banner previously named iana_charset.h here; this is the .cpp file.
print {$cpp} "// iana_charset.cpp\n";
print {$cpp} "// Generated automatically by parse_iana_charsets.pl " . $stamp . "\n";
print {$cpp} "// DO NOT EDIT!!!\n\n";
print {$cpp} "#include \"gb-include.h\"\n";
print {$cpp} "#include \"iana_charset.h\"\n";
print {$cpp} "#include \"HashTableX.h\"\n";
print {$cpp} "#include \"Conf.h\"\n";
print {$cpp} "#include \"hash.h\"\n";

# NOTE(review): the text of this here-document was missing from the reviewed
# copy.  The struct layout is reconstructed from how the surviving code uses
# it: rows are written as {name, mime, enum, supported} and the C++ below
# reads .mib_enum, .mime and .supported from an array called s_charsets of
# type IANACharset.  The first field's name and the exact field types are
# assumptions -- confirm against the original.
print {$cpp} <<'EOL';

typedef struct {
	char *name;
	char *mime;
	short mib_enum;
	char supported;
} IANACharset;

static IANACharset s_charsets[] = {
EOL

# One table row per (charset, name) pair, ordered by MIB enum so the
# generated code can binary-search the table.
my @rows;
for my $cs (@registered) {
    my $mime_name = $cs->{names}[ $cs->{preferred} ];

    # The registered name KS_C_5601-1987 is reported as x-windows-949
    # instead (the original comment attributes this to Microsoft usage).
    if ($mime_name eq 'KS_C_5601-1987') {
        $mime_name = 'x-windows-949';
    }

    my $supported = $supportedCharsets{ $cs->{enum_val} } ? "1" : "0";

    for my $name (@{ $cs->{names} }) {
        push @rows, "\t{\"$name\", \"$mime_name\", $cs->{enum_val}, $supported}";
    }
}

# Special cases not listed by IANA but found "in the wild" (windows-874,
# x-sjis) are handled by the extra-alias step above, not by extra rows here,
# which keeps the table sorted.
print {$cpp} join(",\n", @rows);
print {$cpp} "\n};\n\n";

# NOTE(review): in the reviewed copy everything from the opener of this
# here-document up to the second range check of get_charset_str() was
# missing.
#   * setSupportedCharsets() and get_iana_charset() are reconstructed from
#     the prototypes written to iana_charset.h only.  The original may have
#     used the HashTableX/hash.h facilities included above; this version is
#     a plain linear, ASCII-case-insensitive scan.  Verify before use.
#   * The head of get_charset_str() mirrors supportedCharset(), whose full
#     text survived.
# Independently of that, the upper search bound was sizeof/sizeof - 2, which
# made the last table row unreachable; it is now the last valid index.
print {$cpp} <<'EOL';
// replace the supported flags with the given list of MIB enums
void setSupportedCharsets(short *cs, int numCharsets)
{
	int n = sizeof(s_charsets)/sizeof(IANACharset);
	for (int i = 0; i < n; i++){
		s_charsets[i].supported = 0;
		for (int j = 0; j < numCharsets; j++){
			if (s_charsets[i].mib_enum != cs[j]) continue;
			s_charsets[i].supported = 1;
			break;
		}
	}
}

static char lowerAscii(char c)
{
	return (c >= 'A' && c <= 'Z') ? (char)(c - 'A' + 'a') : c;
}

// map a charset name (not necessarily NUL-terminated) to its MIB enum
short get_iana_charset(char *cs, int len)
{
	int n = sizeof(s_charsets)/sizeof(IANACharset);
	for (int i = 0; i < n; i++){
		const char *name = s_charsets[i].name;
		int j = 0;
		while (j < len && name[j] &&
		       lowerAscii(name[j]) == lowerAscii(cs[j])) j++;
		if (j == len && !name[j]) return s_charsets[i].mib_enum;
	}
	return csUnknown;
}

// get the preferred MIME name for a MIB enum
char *get_charset_str(short cs)
{
	int s=0;
	int e=sizeof(s_charsets)/sizeof(IANACharset)-1;
	int i;
	if (cs < s_charsets[s].mib_enum) return NULL;
	if (cs > s_charsets[e].mib_enum) return NULL;
	// Binary search
	while (1){
		// Check endpoints
		if (cs == s_charsets[s].mib_enum) return s_charsets[s].mime;
		if (cs ==s_charsets[e].mib_enum) return s_charsets[e].mime;
		// check midpoint
		i = (s+e)/2;
		if (cs ==s_charsets[i].mib_enum) return s_charsets[i].mime;
		// end of search
		if ((e-s)<3) return NULL;
		// reset either endpoint
		if (cs < s_charsets[i].mib_enum){e = i-1;continue;}
		if (cs > s_charsets[i].mib_enum){s = i+1;continue;}
	}
}

// is this charset supported?
bool supportedCharset(short cs)
{
	int s=0;
	int e=sizeof(s_charsets)/sizeof(IANACharset)-1;
	int i;
	if (cs < s_charsets[s].mib_enum) return false;
	if (cs > s_charsets[e].mib_enum) return false;
	// Binary search
	while (1){
		// Check endpoints
		if (cs == s_charsets[s].mib_enum) return s_charsets[s].supported;
		if (cs ==s_charsets[e].mib_enum) return s_charsets[e].supported;
		// check midpoint
		i = (s+e)/2;
		if (cs ==s_charsets[i].mib_enum) return s_charsets[i].supported;
		// end of search
		if ((e-s)<3) return false;
		// reset either endpoint
		if (cs < s_charsets[i].mib_enum){e = i-1;continue;}
		if (cs > s_charsets[i].mib_enum){s = i+1;continue;}
	}
}
EOL

close($cpp) or die "Error writing iana_charset.cpp: $!";