mirror of
https://github.com/urbit/shrub.git
synced 2024-12-14 11:08:45 +03:00
Add real unicode uppercasing.
Place UnicodeData.txt in %/lib/unicode-data/txt. The +capitalize generator should then capitalize any unicode tape, including characters outside lower ASCII.
This commit is contained in:
parent
7d46bccdfc
commit
25accb9eba
52
gen/capitalize.hoon
Normal file
52
gen/capitalize.hoon
Normal file
@ -0,0 +1,52 @@
|
||||
::
|
||||
:: part 1: parse the file into {uppers}
|
||||
::
|
||||
/- unicode-data
|
||||
::
|
||||
:: while this works, it would be better to build range-based data structures,
|
||||
:: as golang does: golang uses flat tables that it binary searches over.
|
||||
:: should ours be stored as a binary tree instead?
|
||||
::
|
||||
::  uppers: map from a codepoint to its uppercase mapping.
::
::    loaded from %/lib/unicode-data/txt through the unicode-data mark.
::    lines whose {up} field is empty contribute no entry.
::
/=  uppers
  /;  |=  rows/(list line:unicode-data)
      =|  acc/(map @c @c)
      |-  ^-  (map @c @c)
      ?~  rows  acc
      ::  always advance; only extend the map when an uppercase form exists
      %=  $
        rows  t.rows
        acc   ?~(up.i.rows acc (~(put by acc) code.i.rows u.up.i.rows))
      ==
  /:  /===/lib/unicode-data  /&unicode-data&/txt/
|
||||
::
|
||||
:: part 2: utility core
|
||||
::
|
||||
|%
::  transform: apply {fun} to every codepoint of the tape {a}.
::
::    the tape is decoded with ++tuba, {fun} is mapped over the
::    resulting codepoints, and the result is re-encoded with ++tufa.
::
++  transform
  |=  {a/tape fun/$-(@c @c)}
  (tufa (turn (tuba a) fun))
::
::  to-upper: uppercase a single codepoint.
::
::    codepoints with no uppercase mapping are returned unchanged.
::
++  to-upper
  |=  a/@c
  ^-  @c
  ?:  (gth a max-ascii)
    ::  outside ascii: look the codepoint up in {uppers}, defaulting to {a}
    (fall (~(get by uppers) a) a)
  ::  special case ascii to not perform map lookup: 'a'..'z' sit 32
  ::  above 'A'..'Z'.
  ?.  &((gte a 'a') (lte a 'z'))
    a
  (sub a 32)
::
::  max-ascii: highest codepoint handled by the ascii special case.
::
++  max-ascii  `@c`0x7f
--
|
||||
::
|
||||
:: part 3: generator
|
||||
::
|
||||
::  %say generator: takes a single tape argument {n} and produces it,
::  as a %noun, with every codepoint passed through ++to-upper.
::
:-  %say
|=  {{now/@da eny/@uvJ bec/beak} {n/tape $~} $~}
[%noun (transform n to-upper)]
|
179
mar/unicode-data.hoon
Normal file
179
mar/unicode-data.hoon
Normal file
@ -0,0 +1,179 @@
|
||||
/- unicode-data
|
||||
=, eyre
|
||||
=, format
|
||||
|
||||
:: ok, so we can currently slurp the data in. we're having problems getting it
|
||||
:: back out. ++txt:grow is probably the next logical step here.
|
||||
|
||||
::  mark door for the parsed contents of a UnicodeData.txt file.
::
::    {all} holds one line:unicode-data per line of the file.
::
|_  all/(list line:unicode-data)
++  grow
  :>  converts from unicode-data to mark.
  |%
  ::  no arms yet; a ++txt arm was started but is still disabled:
  ::
  ::  ++  txt
  ::    ^-  wain
  ::    %+  turn  all
  ::    |=  line:unicode-data
  ::    ;:  weld
  ::      ";"
  ::      name
  ::      ";"
  --
::
++  grab
  :>  converts from mark to unicode-data.
  |%
  ::  mime: split the payload into lines and hand them to ++txt.
  ::
  ++  mime                                              ::  XX mark translation
    |=  {* a/octs}
    (txt (to-wain q.a))
  ::
  ::  txt: parse every line with ++line.  ++rash crashes on the first
  ::  line that does not parse completely.
  ::
  ++  txt
    |^  |=  a/wain
        ^+  all
        %+  turn  a
        |=(b/cord ^-(line:unicode-data (rash b line)))
    ::
    :>  parses a single line of the unicode data file.
    ++  line
      ::  fifteen semicolon-separated fields, in file order
      ;~  (glue sem)
        hex
        name-string
        general-category
        (bass 10 (plus sid:ab))
        bidi-category
        decomposition-mapping
        string-number
        string-number
        string-number
        yes-or-no
        name-string
        name-string
        optional-hex
        optional-hex
        optional-hex
      ==
    ::
    :>  parses a single name or comment string.
    ++  name-string
      ::  any run (possibly empty) of printable characters other than ';'
      (cook |=(a/tape a) (star ;~(less sem prn)))
      ::  (star ;~(pose hig low nud hep ace gal gar pel per))
    ::
    :>  parses a unicode general category abbreviation to symbol
    ++  general-category
      ;~  pose
        (cold %lu (jest 'Lu'))
        (cold %ll (jest 'Ll'))
        (cold %lt (jest 'Lt'))
        (cold %mn (jest 'Mn'))
        (cold %mc (jest 'Mc'))
        (cold %me (jest 'Me'))
        (cold %nd (jest 'Nd'))
        (cold %nl (jest 'Nl'))
        (cold %no (jest 'No'))
        (cold %zs (jest 'Zs'))
        (cold %zl (jest 'Zl'))
        (cold %zp (jest 'Zp'))
        (cold %cc (jest 'Cc'))
        (cold %cf (jest 'Cf'))
        (cold %cs (jest 'Cs'))
        (cold %co (jest 'Co'))
        (cold %cn (jest 'Cn'))
        (cold %lm (jest 'Lm'))
        (cold %lo (jest 'Lo'))
        (cold %pc (jest 'Pc'))
        (cold %pd (jest 'Pd'))
        (cold %ps (jest 'Ps'))
        (cold %pe (jest 'Pe'))
        (cold %pi (jest 'Pi'))
        (cold %pf (jest 'Pf'))
        (cold %po (jest 'Po'))
        (cold %sm (jest 'Sm'))
        (cold %sc (jest 'Sc'))
        (cold %sk (jest 'Sk'))
        (cold %so (jest 'So'))
      ==
    ::
    :>  parses a bidirectional category abbreviation to symbol.
    ++  bidi-category
      ::  order matters: longer abbreviations are tried before the
      ::  one-letter ones that are their prefixes ('LRE' before 'L').
      ;~  pose
        (cold %fsi (jest 'FSI'))
        (cold %lre (jest 'LRE'))
        (cold %lri (jest 'LRI'))
        (cold %lro (jest 'LRO'))
        (cold %nsm (jest 'NSM'))
        (cold %pdf (jest 'PDF'))
        (cold %pdi (jest 'PDI'))
        (cold %rle (jest 'RLE'))
        (cold %rli (jest 'RLI'))
        (cold %rlo (jest 'RLO'))
        (cold %al (jest 'AL'))
        (cold %an (jest 'AN'))
        (cold %bn (jest 'BN'))
        (cold %cs (jest 'CS'))
        (cold %en (jest 'EN'))
        (cold %es (jest 'ES'))
        (cold %et (jest 'ET'))
        (cold %on (jest 'ON'))
        (cold %ws (jest 'WS'))
        (cold %b (jest 'B'))
        (cold %l (jest 'L'))
        (cold %r (jest 'R'))
        (cold %s (jest 'S'))
      ==
    ::
    ::  TODO: This seems to be where the nest-fail is. There's an extra @ here?
    ++  decomposition-mapping
      ;~  pose
        ::  a tag and a list of characters to decompose to
        %+  stag  ~
          ;~  plug
            ;~  pose
              (stag ~ (ifix [gal ;~(plug gar ace)] decomp-tag))
              (easy ~)
            ==
            (cook |=(a/(list @c) a) (most ace hex))
          ==
        ::  no decomposition information
        (easy ~)
      ==
    ::
    ::  decomp-tag: parses the tag name found between '<' and '>'.
    ++  decomp-tag
      ;~  pose
        (cold %font (jest 'font'))
        (cold %no-break (jest 'noBreak'))
        (cold %initial (jest 'initial'))
        (cold %medial (jest 'medial'))
        (cold %final (jest 'final'))
        (cold %isolated (jest 'isolated'))
        (cold %circle (jest 'circle'))
        (cold %super (jest 'super'))
        (cold %sub (jest 'sub'))
        (cold %vertical (jest 'vertical'))
        (cold %wide (jest 'wide'))
        (cold %narrow (jest 'narrow'))
        (cold %small (jest 'small'))
        (cold %square (jest 'square'))
        (cold %fraction (jest 'fraction'))
        (cold %compat (jest 'compat'))
      ==
    ::
    ::  string-number: keeps a numeric field as raw text; accepts any
    ::  run (possibly empty) of digits, '/' and '-'.
    ++  string-number
      (cook |=(a/tape a) (star ;~(pose nud fas hep)))
    ::
    ::  yes-or-no: 'Y' is %.y, 'N' is %.n.
    ++  yes-or-no
      ;~(pose (cold %.y (jest 'Y')) (cold %.n (jest 'N')))
    ::
    ::  optional-hex: a hex number as a unit, or ~ when the field is empty.
    ++  optional-hex
      ;~(pose (stag ~ hex) (easy ~))
    --
  --
++  grad  %txt
--
|
130
sur/unicode-data.hoon
Normal file
130
sur/unicode-data.hoon
Normal file
@ -0,0 +1,130 @@
|
||||
|%
++  line
  :>    an individual codepoint definition
  :>
  :>  code: the codepoint
  :>  name: the character name
  :>  gen: the type of character this is
  :>  can: the canonical combining class for ordering algorithms
  :>  bi: the bidirectional category of this character
  :>  de: the character decomposition mapping
  :>  decimal: the decimal digit value, kept as unparsed text
  :>  digit: the digit value, covering non decimal radix forms (unparsed)
  :>  numeric: the numeric value, including fractions (unparsed)
  :>  mirrored: whether char is mirrored in bidirectional text
  :>  old-name: unicode 1.0 compatibility name
  :>  iso: iso 10646 comment field
  :>  up: uppercase mapping codepoint
  :>  low: lowercase mapping codepoint
  :>  title: titlecase mapping codepoint
  :>
  $:  code/@c
      name/tape
      gen/general
      can/@ud
      bi/bidi
      de/decomp
      ::  todo: decimal/digit/numeric need to be parsed.
      decimal/tape
      digit/tape
      numeric/tape
      mirrored/?
      old-name/tape
      iso/tape
      up/(unit @c)
      low/(unit @c)
      title/(unit @c)
  ==
::
++  general
  :>    one of the normative or informative unicode general categories
  :>
  :>  these abbreviations are as found in the unicode standard, except
  :>  lowercased as to be valid symbols.
  $?  $lu  :<  letter, uppercase
      $ll  :<  letter, lowercase
      $lt  :<  letter, titlecase
      $mn  :<  mark, non-spacing
      $mc  :<  mark, spacing combining
      $me  :<  mark, enclosing
      $nd  :<  number, decimal digit
      $nl  :<  number, letter
      $no  :<  number, other
      $zs  :<  separator, space
      $zl  :<  separator, line
      $zp  :<  separator, paragraph
      $cc  :<  other, control
      $cf  :<  other, format
      $cs  :<  other, surrogate
      $co  :<  other, private use
      $cn  :<  other, not assigned
      ::
      $lm  :<  letter, modifier
      $lo  :<  letter, other
      $pc  :<  punctuation, connector
      $pd  :<  punctuation, dash
      $ps  :<  punctuation, open
      $pe  :<  punctuation, close
      $pi  :<  punctuation, initial quote
      $pf  :<  punctuation, final quote
      $po  :<  punctuation, other
      $sm  :<  symbol, math
      $sc  :<  symbol, currency
      $sk  :<  symbol, modifier
      $so  :<  symbol, other
  ==
::
++  bidi
  :>  bidirectional category of a unicode character
  $?  $l    :<  left-to-right
      $lre  :<  left-to-right embedding
      $lri  :<  left-to-right isolate
      $lro  :<  left-to-right override
      $fsi  :<  first strong isolate
      $r    :<  right-to-left
      $al   :<  right-to-left arabic
      $rle  :<  right-to-left embedding
      $rli  :<  right-to-left isolate
      $rlo  :<  right-to-left override
      $pdf  :<  pop directional format
      $pdi  :<  pop directional isolate
      $en   :<  european number
      $es   :<  european number separator
      $et   :<  european number terminator
      $an   :<  arabic number
      $cs   :<  common number separator
      $nsm  :<  non-spacing mark
      $bn   :<  boundary neutral
      $b    :<  paragraph separator
      $s    :<  segment separator
      $ws   :<  whitespace
      $on   :<  other neutrals
  ==
::
++  decomp
  :>    character decomposition mapping.
  :>
  :>  tag: type of decomposition.
  :>  c: a list of codepoints this decomposes into.
  (unit {tag/(unit decomp-tag) c/(list @c)})
::
++  decomp-tag
  :>  tag that describes the type of a character decomposition.
  $?  $font      :<  a font variant
      $no-break  :<  a no-break version of a space or hyphen
      $initial   :<  an initial presentation form (arabic)
      $medial    :<  a medial presentation form (arabic)
      $final     :<  a final presentation form (arabic)
      $isolated  :<  an isolated presentation form (arabic)
      $circle    :<  an encircled form
      $super     :<  a superscript form
      $sub       :<  a subscript form
      $vertical  :<  a vertical layout presentation form
      $wide      :<  a wide (or zenkaku) compatibility character
      $narrow    :<  a narrow (or hankaku) compatibility character
      $small     :<  a small variant form (cns compatibility)
      $square    :<  a cjk squared font variant
      $fraction  :<  a vulgar fraction form
      $compat    :<  otherwise unspecified compatibility character
  ==
--
|
Loading…
Reference in New Issue
Block a user