open-source-search-engine/parse_iana_charsets.pl

268 lines
7.3 KiB
Perl
Raw Normal View History

2013-08-03 00:12:24 +04:00
#!/usr/bin/perl
# reads the official IANA charset list on stdin
# http://www.iana.org/assignments/character-sets
# generates iana_charset.h and iana_charset.cpp
# sets a flag on "supported" charsets
# ...the ones we recognise and that iconv will convert for us
# need supported_charsets.txt for this
my $curname; # current charset name
my $csCount = 0;
my %charsets;
open(SUPPORTED, "supported_charsets.txt")
or die "Couldn't open supported_charsets.txt";
my %supportedCharsets;
while (<SUPPORTED>) {
my $line = $_;
chomp $line;
chomp $line;
if ($line =~ /^\s*(\d+)\s+([\w-]+)/){
my $csEnum = $1;
my $name = $2;
print "Supported charset: $2 ($1)\n";
$supportedCharsets{$csEnum} = 1;
}
}
while (<>){
my $line = $_;
chomp $line;
chomp $line;
if ($line =~ /^Name:\s+([^\s]+)[^\[]*(\[([^\]]*)\])?/){
#new charset
$csCount++;
#print "Charset: $1\n";
#print "Ref: $3\n";
$curname=$1;
$charsets{$curname} = {};
$charsets{$curname}->{ref} = $3;
$charsets{$curname}->{names} = [];
push @{$charsets{$curname}->{names}}, $curname;
$charsets{$curname}->{preferred} = 0;
$charsets{$curname}->{enum_name} = 0;
next;
}
next unless defined($curname);
if ($line =~ /^\s*$/){
# end of charset
undef $curname;
next;
}
if ($line =~ /MIBenum:\s*(\d+)/){
$charsets{$curname}->{enum_val} = $1;
next
}
if ($line =~ /Alias:\s+([^\s]+)(\s+\(preferred MIME name\))?/){
next if ($1 eq 'None');
my $name = $1;
push @{$charsets{$curname}->{names}}, $name;
if (length($2)){
$charsets{$curname}->{preferred} = $#{@{$charsets{$curname}->{names}}};
}
if ($name =~/^cs/){
$charsets{$curname}->{enum_name} = $#{@{$charsets{$curname}->{names}}};
}
}
}
#additional aliases
push @{$charsets{"TIS-620"}->{names}}, "windows-874";
push @{$charsets{"Shift_JIS"}->{names}}, "x-sjis";
open CFILE, ">iana_charset.h" or die "Can't open iana_charset.h for writing";
print CFILE "// iana_charset.h\n";
print CFILE "// Generated automatically by parse_iana_charsets.pl ".gmtime()."\n";
print CFILE "// DO NOT EDIT!!!\n\n";
print CFILE "#ifndef IANA_CHARSET_H__\n";
print CFILE "#define IANA_CHARSET_H__\n";
print CFILE "enum eIANACharset{\n";
print CFILE "\tcsOther = 1, // unregistered character set\n";
print CFILE "\tcsUnknown = 2, // used as a default value\n";
foreach my $cs (sort {$a->{enum_val} <=> $b->{enum_val}} values %charsets){
next if !defined($cs->{enum_val});
my $enum_name = $cs->{names}[$cs->{enum_name}];
$enum_name =~ s/[\-\_\:]+//sg;
if ($enum_name !~ /^cs/){
$enum_name = "cs".$enum_name;
#print ">>>$enum_name: $cs->{enum_val}\n";
}
print CFILE "\t$enum_name = $cs->{enum_val},\n";
}
print CFILE "\tcsReserved = 3000\n};\n\n";
print CFILE "short get_iana_charset(char *cs, int len); \n";
print CFILE "char *get_charset_str(short cs); \n";
print CFILE "bool supportedCharset(short cs); \n";
print CFILE "void setSupportedCharsets(short *cs, int numCharsets);\n";
print CFILE "#endif\n";
close CFILE;
open CFILE, ">iana_charset.cpp" or die "Can't open iana_charset.cpp for writing";
print CFILE "// iana_charset.h\n";
print CFILE "// Generated automatically by parse_iana_charsets.pl ".gmtime()."\n";
print CFILE "// DO NOT EDIT!!!\n\n";
print CFILE "#include \"gb-include.h\"\n";
print CFILE "#include \"iana_charset.h\"\n";
print CFILE "#include \"HashTableX.h\"\n";
print CFILE "#include \"Conf.h\"\n";
2013-08-03 00:12:24 +04:00
print CFILE "#include \"hash.h\"\n";
print CFILE<<EOL;
typedef struct {
char *name;
char *mime;
short mib_enum;
char supported;
} IANACharset;
EOL
my $str = "static IANACharset s_charsets[] = {\n";
foreach my $cs (sort {$a->{enum_val} <=> $b->{enum_val}} values %charsets){
next if !defined($cs->{enum_val});
my $enum_name = $cs->{names}[$cs->{enum_name}];
my $mime_name = $cs->{names}[$cs->{preferred}];
# Microsoft bastards
if ($mime_name eq 'KS_C_5601-1987'){
$mime_name = 'x-windows-949';
}
if ($enum_name =~ /^cs/){
#print "$enum_name: $cs->{enum_val}\n";
}
else{
$enum_name =~ s/[\-\_\:]+//g;
$enum_name = "cs".$enum_name;
#print ">>>$enum_name: $cs->{enum_val}\n";
}
foreach my $name (@{$cs->{names}}){
my $supported = $supportedCharsets{$cs->{enum_val}}?"1":"0";
#print "supportedCharsets: ",%supportedCharsets,"\n";
#print "$name $cs->{enum_val}: $supportedCharsets{$cs->{enum_val}}\n";
$str .= "\t{\"$name\", \"$mime_name\", $cs->{enum_val}, $supported},\n";
#print CFILE ",\n" if $name ne $cs->{names}[$#{@{$cs->{names}}}];
}
}
# special case...not listed in IANA charsets, but found "in the wild"
#$str .= "\t{\"windows-874\", \"TIS-620\", 2259, 0},\n";
#$str .= "\t{\"x-sjis\", \"Shift_JIS\", 17, 1},\n";
chop $str;chop $str;
print CFILE $str;
print CFILE "\n};\n\n";
print CFILE <<EOL;
static HashTableX s_table;
static bool s_isInitialized = false;
void reset_iana_charset ( ) {
s_table.reset();
}
// Slightly modified from getTextEntity
short get_iana_charset(char *cs, int len)
{
if (!s_isInitialized){
// set up the hash table
if ( ! s_table.set ( 8,4,4096,NULL,0,false,0,"ianatbl") )
return log("build: Could not init table of "
"IANA Charsets.");
// now add in all the charset entries
long n = (long)sizeof(s_charsets) / (long)sizeof(IANACharset);
// turn off quickpolling
char saved = g_conf.m_useQuickpoll;
g_conf.m_useQuickpoll = false;
for ( long i = 0 ; i < n ; i++ ) {
long long h = hash64Lower_a ( s_charsets[i].name, strlen(s_charsets[i].name) );
// store the charset index in the hash table as score
if ( ! s_table.addTerm(&h, i+1) )
return log("build: add term failed");
}
g_conf.m_useQuickpoll = saved;
s_isInitialized = true;
}
long long h = hash64Lower_a ( cs , len );
// get the entity index from table (stored in the score field)
long i = (long) s_table.getScore ( &h );
// return 0 if no match
if ( i == 0 ) return csUnknown;
// return the iso character
return (short)s_charsets[i-1].mib_enum;
}
char *get_charset_str(short cs)
{
int s=0;
int e=sizeof(s_charsets)/sizeof(IANACharset)-2;
int i;
if (cs < s_charsets[s].mib_enum) return NULL;
if (cs > s_charsets[e].mib_enum) return NULL;
// Binary search
while (1){
// Check endpoints
if (cs == s_charsets[s].mib_enum) return s_charsets[s].mime;
if (cs ==s_charsets[e].mib_enum) return s_charsets[e].mime;
// check midpoint
i = (s+e)/2;
if (cs ==s_charsets[i].mib_enum) return s_charsets[i].mime;
// end of search
if ((e-s)<3) return NULL;
// reset either endpoint
if (cs < s_charsets[i].mib_enum){e = i-1;continue;}
if (cs > s_charsets[i].mib_enum){s = i+1;continue;}
}
}
// is this charset supported?
bool supportedCharset(short cs) {
int s=0;
int e=sizeof(s_charsets)/sizeof(IANACharset)-2;
int i;
if (cs < s_charsets[s].mib_enum) return false;
if (cs > s_charsets[e].mib_enum) return false;
// Binary search
while (1){
// Check endpoints
if (cs == s_charsets[s].mib_enum) return s_charsets[s].supported;
if (cs ==s_charsets[e].mib_enum) return s_charsets[e].supported;
// check midpoint
i = (s+e)/2;
if (cs ==s_charsets[i].mib_enum) return s_charsets[i].supported;
// end of search
if ((e-s)<3) return false;
// reset either endpoint
if (cs < s_charsets[i].mib_enum){e = i-1;continue;}
if (cs > s_charsets[i].mib_enum){s = i+1;continue;}
}
}
EOL
close CFILE;