Add real unicode uppercasing.

Place UnicodeData.txt in %/lib/unicode-data/txt. The +capitalize generator
should then capitalize any unicode tape, including characters outside lower
ASCII.
This commit is contained in:
Elliot Glaysher 2017-10-06 20:52:33 -07:00
parent 7d46bccdfc
commit 25accb9eba
3 changed files with 361 additions and 0 deletions

52
gen/capitalize.hoon Normal file
View File

@ -0,0 +1,52 @@
::
:: part 1: parse the file into {uppers}
::
/- unicode-data
::
:: while this works, it would be better to build range-based data structures
:: like golang does: golang uses flat tables that it binary searches over.
:: open question: should the ranges be stored as a binary tree instead?
::
::  uppers: map from a codepoint to its simple uppercase mapping.
::
::  built at compile time from the parsed unicode data file; lines whose
::  :up field is empty contribute nothing to the map.
::
/=  uppers
  /;  |=  a/(list line:unicode-data)
      ^-  (map @c @c)
      ::  keep only [code upper] pairs for lines with an uppercase mapping,
      ::  then insert them into an empty map in file order.
      %-  ~(gas by *(map @c @c))
      %+  murn  a
      |=  b/line:unicode-data
      ^-  (unit {@c @c})
      ?~  up.b  ~
      `[code.b u.up.b]
  /:  /===/lib/unicode-data  /&unicode-data&/txt/
::
:: part 2: utility core
::
|%
++  transform
  ::  apply :fun to every codepoint of the tape :a.
  ::
  ::  the tape is decoded into a list of codepoints with ++tuba, each
  ::  codepoint is passed through :fun, and the result is re-encoded as a
  ::  tape with ++tufa.
  ::
  |=  {a/tape fun/$-(@c @c)}
  (tufa (turn (tuba a) fun))
::
++  to-upper
  ::  produce the uppercase form of codepoint :a, or :a itself when it has
  ::  no uppercase mapping.
  ::
  |=  a/@c
  ^-  @c
  ::  outside ascii: consult the table built from the unicode data file.
  ?.  (lte a max-ascii)
    (fall (~(get by uppers) a) a)
  ::  inside ascii we never touch the map: only 'a'..'z' change, and their
  ::  uppercase forms sit exactly 32 codepoints lower.
  ?.  &((gte a 'a') (lte a 'z'))
    a
  (sub a 32)
::
:: largest codepoint (0x7f) that ++to-upper handles arithmetically instead of
:: looking it up in the uppercase map.
++ max-ascii `@c`0x7f
--
::
:: part 3: generator
::
::  %say generator: takes exactly one tape argument :n and no named
::  arguments, and produces :n with every codepoint run through ++to-upper.
::
:-  %say
|=  $:  {now/@da eny/@uvJ bec/beak}
        {n/tape $~}
        $~
    ==
[%noun (transform n to-upper)]

179
mar/unicode-data.hoon Normal file
View File

@ -0,0 +1,179 @@
/- unicode-data
=, eyre
=, format
:: status: reading the data in (++grab) currently works, but there is not yet
:: a way to write it back out. implementing ++txt:grow is probably the next
:: logical step here.
|_ all/(list line:unicode-data)
++ grow
:> converts from unicode-data to mark.
|%
:: NOTE(review): no conversion arms are implemented yet. the sketch of ++txt
:: below is incomplete (it only starts to emit ";", the name, and ";" for
:: each line) and is intentionally left commented out.
::
:: ++ txt
:: ^- wain
:: %+ turn all
:: |= line:unicode-data
:: ;: weld
:: ";"
:: name
:: ";"
--
::
++ grab
:> converts from mark to unicode-data.
|%
++  mime                                                ::  XX mark translation
  ::  ignore the mime type, split the data of the octs into lines with
  ::  ++to-wain, and parse those lines with ++txt.
  |=  {* a/octs}
  (txt (to-wain q.a))
++  txt
  ::  parse every line of the file independently with the ++line rule
  ::  defined in this core. ++rash requires the whole cord to be consumed,
  ::  so a single malformed line crashes the conversion.
  |^  |=  a/wain
      ^+  all
      (turn a |=(b/cord ^-(line:unicode-data (rash b line))))
::
:> parses a single line of the unicode data file.
++ line
:: fifteen fields separated by ';', in the same order as the faces of
:: ++line:unicode-data. each trailing comment names the face the field fills.
;~ (glue sem)
hex  ::  code
name-string  ::  name
general-category  ::  gen
(bass 10 (plus sid:ab))  ::  can: one or more decimal digits
bidi-category  ::  bi
decomposition-mapping  ::  de
string-number  ::  decimal (kept as text)
string-number  ::  digit (kept as text)
string-number  ::  numeric (kept as text)
yes-or-no  ::  mirrored
name-string  ::  old-name
name-string  ::  iso
optional-hex  ::  up
optional-hex  ::  low
optional-hex  ::  title
==
::
:> parses a single name or comment string.
++  name-string
  ::  accepts any run (possibly empty) of printable characters other than
  ::  the ';' field separator, and produces it as a tape.
  ::
  ::  earlier, stricter draft kept for reference:
  ::    (star ;~(pose hig low nud hep ace gal gar pel per))
  ::
  (cook |=(a/tape a) (star ;~(less sem prn)))
::
:> parses a unicode general category abbreviation to symbol
++ general-category
:: each alternative matches one two-letter abbreviation exactly as written
:: here (case-sensitive) and produces the corresponding lowercased symbol of
:: ++general in the unicode-data structure file. an abbreviation that is not
:: listed makes the parse fail.
;~ pose
(cold %lu (jest 'Lu'))
(cold %ll (jest 'Ll'))
(cold %lt (jest 'Lt'))
(cold %mn (jest 'Mn'))
(cold %mc (jest 'Mc'))
(cold %me (jest 'Me'))
(cold %nd (jest 'Nd'))
(cold %nl (jest 'Nl'))
(cold %no (jest 'No'))
(cold %zs (jest 'Zs'))
(cold %zl (jest 'Zl'))
(cold %zp (jest 'Zp'))
(cold %cc (jest 'Cc'))
(cold %cf (jest 'Cf'))
(cold %cs (jest 'Cs'))
(cold %co (jest 'Co'))
(cold %cn (jest 'Cn'))
(cold %lm (jest 'Lm'))
(cold %lo (jest 'Lo'))
(cold %pc (jest 'Pc'))
(cold %pd (jest 'Pd'))
(cold %ps (jest 'Ps'))
(cold %pe (jest 'Pe'))
(cold %pi (jest 'Pi'))
(cold %pf (jest 'Pf'))
(cold %po (jest 'Po'))
(cold %sm (jest 'Sm'))
(cold %sc (jest 'Sc'))
(cold %sk (jest 'Sk'))
(cold %so (jest 'So'))
==
::
:> parses a bidirectional category abbreviation to symbol.
++ bidi-category
:: the order of alternatives matters: ++pose takes the first alternative that
:: matches, so the three-letter abbreviations come first, then the two-letter
:: ones, then the single letters. otherwise 'L' would match the start of
:: 'LRE', 'B' the start of 'BN', and so on, leaving unparsed input behind.
;~ pose
(cold %fsi (jest 'FSI'))
(cold %lre (jest 'LRE'))
(cold %lri (jest 'LRI'))
(cold %lro (jest 'LRO'))
(cold %nsm (jest 'NSM'))
(cold %pdf (jest 'PDF'))
(cold %pdi (jest 'PDI'))
(cold %rle (jest 'RLE'))
(cold %rli (jest 'RLI'))
(cold %rlo (jest 'RLO'))
(cold %al (jest 'AL'))
(cold %an (jest 'AN'))
(cold %bn (jest 'BN'))
(cold %cs (jest 'CS'))
(cold %en (jest 'EN'))
(cold %es (jest 'ES'))
(cold %et (jest 'ET'))
(cold %on (jest 'ON'))
(cold %ws (jest 'WS'))
(cold %b (jest 'B'))
(cold %l (jest 'L'))
(cold %r (jest 'R'))
(cold %s (jest 'S'))
==
::
:: TODO: This seems to be where the nest-fail is. There's an extra @ here?
++ decomposition-mapping
:: parses the decomposition field into a ++decomp:unicode-data.
::
:: accepted form: an optional "<tag> " prefix (tag name between '<' and '>',
:: followed by one space), then one or more hex codepoints separated by
:: single spaces. when that form does not match (e.g. the field is empty)
:: the final (easy ~) succeeds without consuming input and produces ~.
;~ pose
:: a tag and a list of characters to decompose to
%+ stag ~
;~ plug
;~ pose
(stag ~ (ifix [gal ;~(plug gar ace)] decomp-tag))
(easy ~)
==
(cook |=(a/(list @c) a) (most ace hex))
==
:: no decomposition information
(easy ~)
==
::
++ decomp-tag
:: parses the name of a decomposition tag (the text between '<' and '>', which
:: ++decomposition-mapping strips) to the matching symbol of
:: ++decomp-tag:unicode-data. names are matched case-sensitively; note that
:: the camel-cased 'noBreak' becomes %no-break.
;~ pose
(cold %font (jest 'font'))
(cold %no-break (jest 'noBreak'))
(cold %initial (jest 'initial'))
(cold %medial (jest 'medial'))
(cold %final (jest 'final'))
(cold %isolated (jest 'isolated'))
(cold %circle (jest 'circle'))
(cold %super (jest 'super'))
(cold %sub (jest 'sub'))
(cold %vertical (jest 'vertical'))
(cold %wide (jest 'wide'))
(cold %narrow (jest 'narrow'))
(cold %small (jest 'small'))
(cold %square (jest 'square'))
(cold %fraction (jest 'fraction'))
(cold %compat (jest 'compat'))
==
::
++  string-number
  ::  parses a numeric field without interpreting it: any run (possibly
  ::  empty) of decimal digits, '/' and '-' is produced unchanged as a tape.
  ::
  (cook |=(a/tape a) (star ;~(pose nud fas hep)))
::
++  yes-or-no
  ::  parses the single character 'Y' to %.y or 'N' to %.n; anything else
  ::  fails.
  ::
  ;~(pose (cold %.y (just 'Y')) (cold %.n (just 'N')))
::
++  optional-hex
  ::  parses a hexadecimal number into a unit: [~ value] when hex digits are
  ::  present, ~ (consuming nothing) when they are not.
  ::
  ;~(pose (stag ~ hex) (easy ~))
--
--
:: revision control (diff/merge) for this mark is delegated to the %txt mark.
:: NOTE(review): ++grow in this file has no ++txt arm yet, so converting back
:: to %txt presumably does not work — confirm before relying on this.
++ grad %txt
--

130
sur/unicode-data.hoon Normal file
View File

@ -0,0 +1,130 @@
|%
++ line
:> an individual codepoint definition
:>
:> code: the codepoint (written in hexadecimal in the data file)
:> name: the character name
:> gen: the type of character this is
:> can: the canonical combining class for ordering algorithms
:> bi: the bidirectional category of this character
:> de: the character decomposition mapping
:> decimal: the decimal digit value, as unparsed text (or ~)
:> digit: the digit value, covering non decimal radix forms, as unparsed text
:> numeric: the numeric value, including fractions, as unparsed text
:> mirrored: whether char is mirrored in bidirectional text
:> old-name: unicode 1.0 compatibility name
:> iso: iso 10646 comment field
:> up: uppercase mapping codepoint
:> low: lowercase mapping codepoint
:> title: titlecase mapping codepoint
:>
$: code/@c
name/tape
gen/general
can/@ud
bi/bidi
de/decomp
:: todo: decimal/digit/numeric need to be parsed.
decimal/tape
digit/tape
numeric/tape
mirrored/?
old-name/tape
iso/tape
up/(unit @c)
low/(unit @c)
title/(unit @c)
==
::
++ general
:> one of the normative or informative unicode general categories
:>
:> these abbreviations are as found in the unicode standard, except
:> lowercased so that they are valid symbols.
$? $lu :< letter, uppercase
$ll :< letter, lowercase
$lt :< letter, titlecase
$mn :< mark, non-spacing
$mc :< mark, spacing combining
$me :< mark, enclosing
$nd :< number, decimal digit
$nl :< number, letter
$no :< number, other
$zs :< separator, space
$zl :< separator, line
$zp :< separator, paragraph
$cc :< other, control
$cf :< other, format
$cs :< other, surrogate
$co :< other, private use
$cn :< other, not assigned
::
$lm :< letter, modifier
$lo :< letter, other
$pc :< punctuation, connector
$pd :< punctuation, dash
$ps :< punctuation, open
$pe :< punctuation, close
$pi :< punctuation, initial quote
$pf :< punctuation, final quote
$po :< punctuation, other
$sm :< symbol, math
$sc :< symbol, currency
$sk :< symbol, modifier
$so :< symbol, other
==
::
++ bidi
:> bidirectional category of a unicode character
:>
:> each symbol is the lowercased form of the uppercase abbreviation used in
:> the data file (e.g. 'LRE' is stored as $lre).
$? $l :< left-to-right
$lre :< left-to-right embedding
$lri :< left-to-right isolate
$lro :< left-to-right override
$fsi :< first strong isolate
$r :< right-to-left
$al :< right-to-left arabic
$rle :< right-to-left embedding
$rli :< right-to-left isolate
$rlo :< right-to-left override
$pdf :< pop directional format
$pdi :< pop directional isolate
$en :< european number
$es :< european number separator
$et :< european number terminator
$an :< arabic number
$cs :< common number separator
$nsm :< non-spacing mark
$bn :< boundary neutral
$b :< paragraph separator
$s :< segment separator
$ws :< whitespace
$on :< other neutrals
==
::
++ decomp
:> character decomposition mapping.
:>
:> the whole value is ~ when the character has no decomposition.
:>
:> tag: type of decomposition, or ~ when the mapping carries no tag.
:> c: a list of codepoints this decomposes into.
(unit {tag/(unit decomp-tag) c/(list @c)})
::
++ decomp-tag
:> tag that describes the type of a character decomposition.
:>
:> symbols match the tag names used in the data file, except that the
:> camel-cased 'noBreak' is spelled $no-break.
$? $font :< a font variant
$no-break :< a no-break version of a space or hyphen
$initial :< an initial presentation form (arabic)
$medial :< a medial presentation form (arabic)
$final :< a final presentation form (arabic)
$isolated :< an isolated presentation form (arabic)
$circle :< an encircled form
$super :< a superscript form
$sub :< a subscript form
$vertical :< a vertical layout presentation form
$wide :< a wide (or zenkaku) compatibility character
$narrow :< a narrow (or hankaku) compatibility character
$small :< a small variant form (cns compatibility)
$square :< a cjk squared font variant
$fraction :< a vulgar fraction form
$compat :< otherwise unspecified compatibility character
==
--