Add real unicode uppercasing.

Place UnicodeData.txt in %/lib/unicode-data/txt. The +capitalize generator should then capitalize any unicode tape, including characters outside lower ASCII.
2024-12-14 11:08:45 +03:00 · 2017-10-06 20:52:33 -07:00 · 2017-10-06 20:52:33 -07:00 · 25accb9eba
commit 25accb9eba
parent 7d46bccdfc
3 changed files with 361 additions and 0 deletions
--- a/gen/capitalize.hoon
+++ b/gen/capitalize.hoon
@ -0,0 +1,52 @@
+::
+::  part 1: parse the file into {uppers}
+::
+/-  unicode-data
+::
+::  while this works, it'd be better to build range based data structures like
+::  golang does. golang uses flat tables that it binary searches over. storage
+::  as a binary tree?
+::
+::  {uppers}: map from a codepoint to its uppercase-mapping codepoint.
+::
+::    built by folding over every parsed line of the unicode data file, in
+::    order; lines whose {up} field is ~ contribute nothing.
+::
+/=  uppers
+  /;  |=  a/(list line:unicode-data)
+      ^-  (map @c @c)
+      %+  roll  a
+      |=  {b/line:unicode-data acc/(map @c @c)}
+      ?~  up.b
+        acc
+      (~(put by acc) code.b u.up.b)
+  /:  /===/lib/unicode-data  /&unicode-data&/txt/
+::
+::  part 2: utility core
+::
+|%
+::  ++transform: map {fun} over the codepoints of tape {a}.
+::
+::    the tape is turned into a list of codepoints with ++tuba, {fun} is
+::    applied to each one, and ++tufa turns the result back into a tape.
+::
++  transform
+  |=  {a/tape fun/$-(@c @c)}
+  (tufa (turn (tuba a) fun))
+::
+::  ++to-upper: uppercase a single codepoint.
+::
+::    ascii is handled arithmetically so it never touches the map; any
+::    other codepoint is looked up in {uppers} and returned unchanged when
+::    it has no entry.
+::
++  to-upper
+  |=  a/@c
+  ^-  @c
+  ?.  (lte a max-ascii)
+    (fall (~(get by uppers) a) a)
+  ?.  &((gte a 'a') (lte a 'z'))
+    a
+  (sub a 32)
+::
+::  ++max-ascii: highest codepoint ++to-upper treats as ascii.
+::
++  max-ascii  `@c`0x7f
+--
+::
+::  part 3: generator
+::
+::  %say generator: takes a single tape argument {n} and produces, as a
+::  %noun, that tape with every codepoint passed through ++to-upper.
+::
+:-  %say
+|=  {{now/@da eny/@uvJ bec/beak} {n/tape $~} $~}
+[%noun (transform n to-upper)]
--- a/mar/unicode-data.hoon
+++ b/mar/unicode-data.hoon
@ -0,0 +1,179 @@
+/-  unicode-data
+=,  eyre
+=,  format
+
+::  ok, so we can currently slurp the data in. we're having problems getting it
+::  back out. ++txt:grow is probably the next logical step here.
+
+|_  all/(list line:unicode-data)
++  grow
+  :>  converts from unicode-data to mark.
+  |%
+  ::  no conversion out of unicode-data is implemented yet. what follows is
+  ::  an unfinished sketch of a ++txt arm, left commented out.
+  ::
+  ::  ++  txt
+  ::    ^-  wain
+  ::    %+  turn  all
+  ::    |=  line:unicode-data
+  ::    ;:  weld
+  ::      ";"
+  ::      name
+  ::      ";"
+  --
+::
++  grab
+  :>  converts from mark to unicode-data.
+  |%
+  ::  the octs' data {q.a} is split into lines with ++to-wain and handed
+  ::  to ++txt; the head of the mime sample is ignored.
+  ::
+  ++  mime                                          ::  XX mark translation
+    |=  {* a/octs}
+    (txt (to-wain q.a))
+  ::  parses every cord of the wain with the ++line rule below. ++rash is
+  ::  used, so a cord that does not parse completely crashes the conversion.
+  ::
+  ++  txt
+    |^  |=  a/wain
+        ^+  all
+        (turn a |=(b/cord ^-(line:unicode-data (rash b line))))
+    ::
+    ::  the fifteen semicolon-separated fields, in file order: codepoint,
+    ::  name, general category, canonical combining class, bidi category,
+    ::  decomposition, the decimal/digit/numeric values (kept as raw text),
+    ::  mirrored flag, old name, iso comment, and the optional upper-,
+    ::  lower- and titlecase mappings.
+    ::
+    :>  parses a single line of the unicode data file.
+    ++  line
+      ;~  (glue sem)
+        hex
+        name-string
+        general-category
+        (bass 10 (plus sid:ab))
+        bidi-category
+        decomposition-mapping
+        string-number
+        string-number
+        string-number
+        yes-or-no
+        name-string
+        name-string
+        optional-hex
+        optional-hex
+        optional-hex
+      ==
+    ::
+    :>  parses a single name or comment string.
+    ++  name-string
+      ::  any run, possibly empty, of printable characters other than ';'.
+      ::  the cook only pins the product type to a tape.
+      ::
+      ::  an earlier, narrower character set that was tried here:
+      ::    (star ;~(pose hig low nud hep ace gal gar pel per))
+      ::
+      (cook |=(a/tape a) (star ;~(less sem prn)))
+    ::
+    ::  each alternative matches one two-letter abbreviation exactly as it
+    ::  is spelled in the data file and yields the lowercased symbol.
+    ::
+    :>  parses a unicode general category abbreviation to symbol
+    ++  general-category
+      ;~  pose
+        (cold %lu (jest 'Lu'))
+        (cold %ll (jest 'Ll'))
+        (cold %lt (jest 'Lt'))
+        (cold %mn (jest 'Mn'))
+        (cold %mc (jest 'Mc'))
+        (cold %me (jest 'Me'))
+        (cold %nd (jest 'Nd'))
+        (cold %nl (jest 'Nl'))
+        (cold %no (jest 'No'))
+        (cold %zs (jest 'Zs'))
+        (cold %zl (jest 'Zl'))
+        (cold %zp (jest 'Zp'))
+        (cold %cc (jest 'Cc'))
+        (cold %cf (jest 'Cf'))
+        (cold %cs (jest 'Cs'))
+        (cold %co (jest 'Co'))
+        (cold %cn (jest 'Cn'))
+        (cold %lm (jest 'Lm'))
+        (cold %lo (jest 'Lo'))
+        (cold %pc (jest 'Pc'))
+        (cold %pd (jest 'Pd'))
+        (cold %ps (jest 'Ps'))
+        (cold %pe (jest 'Pe'))
+        (cold %pi (jest 'Pi'))
+        (cold %pf (jest 'Pf'))
+        (cold %po (jest 'Po'))
+        (cold %sm (jest 'Sm'))
+        (cold %sc (jest 'Sc'))
+        (cold %sk (jest 'Sk'))
+        (cold %so (jest 'So'))
+      ==
+    ::
+    ::  order matters here: ++pose takes the first alternative that matches
+    ::  and ++jest matches a literal prefix, so a longer abbreviation must
+    ::  be listed before any shorter one that is a prefix of it ('LRE'
+    ::  before 'L', 'BN' before 'B', ...). hence the three-letter tags come
+    ::  first, then the two-letter ones, then the one-letter ones.
+    ::
+    :>  parses a bidirectional category abbreviation to symbol.
+    ++  bidi-category
+      ;~  pose
+        (cold %fsi (jest 'FSI'))
+        (cold %lre (jest 'LRE'))
+        (cold %lri (jest 'LRI'))
+        (cold %lro (jest 'LRO'))
+        (cold %nsm (jest 'NSM'))
+        (cold %pdf (jest 'PDF'))
+        (cold %pdi (jest 'PDI'))
+        (cold %rle (jest 'RLE'))
+        (cold %rli (jest 'RLI'))
+        (cold %rlo (jest 'RLO'))
+        (cold %al (jest 'AL'))
+        (cold %an (jest 'AN'))
+        (cold %bn (jest 'BN'))
+        (cold %cs (jest 'CS'))
+        (cold %en (jest 'EN'))
+        (cold %es (jest 'ES'))
+        (cold %et (jest 'ET'))
+        (cold %on (jest 'ON'))
+        (cold %ws (jest 'WS'))
+        (cold %b (jest 'B'))
+        (cold %l (jest 'L'))
+        (cold %r (jest 'R'))
+        (cold %s (jest 'S'))
+      ==
+    ::
+    ::  parses the decomposition field. produces ~ when the field holds no
+    ::  mapping; otherwise [~ tag codepoints], where {tag} is the unit of an
+    ::  optional leading "<tag> " and {codepoints} is one or more hex values
+    ::  separated by single spaces.
+    ::
+    ::  TODO: This seems to be where the nest-fail is. There's an extra @ here?
+    ++  decomposition-mapping
+      ;~  pose
+        :: a tag and a list of characters to decompose to
+        %+  stag  ~
+          ;~  plug
+            ;~  pose
+              (stag ~ (ifix [gal ;~(plug gar ace)] decomp-tag))
+              (easy ~)
+            ==
+            (cook |=(a/(list @c) a) (most ace hex))
+          ==
+        ::  no decomposition information
+        (easy ~)
+      ==
+    ::
+    ::  parses the name found inside a "<...>" decomposition tag to its
+    ::  symbol. names are matched as spelled in the data file; 'noBreak'
+    ::  becomes %no-break.
+    ::
+    ++  decomp-tag
+      ;~  pose
+        (cold %font (jest 'font'))
+        (cold %no-break (jest 'noBreak'))
+        (cold %initial (jest 'initial'))
+        (cold %medial (jest 'medial'))
+        (cold %final (jest 'final'))
+        (cold %isolated (jest 'isolated'))
+        (cold %circle (jest 'circle'))
+        (cold %super (jest 'super'))
+        (cold %sub (jest 'sub'))
+        (cold %vertical (jest 'vertical'))
+        (cold %wide (jest 'wide'))
+        (cold %narrow (jest 'narrow'))
+        (cold %small (jest 'small'))
+        (cold %square (jest 'square'))
+        (cold %fraction (jest 'fraction'))
+        (cold %compat (jest 'compat'))
+      ==
+    ::
+    ::  a numeric field kept as raw text: any run, possibly empty, of
+    ::  digits, '/' and '-'. the cook only pins the product type to a tape.
+    ::
+    ++  string-number
+      (cook |=(a/tape a) (star ;~(pose nud fas hep)))
+    ::
+    ::  parses the one-letter flag: 'Y' is %.y, 'N' is %.n.
+    ::
+    ++  yes-or-no
+      ;~(pose (cold %.y (jest 'Y')) (cold %.n (jest 'N')))
+    ::
+    ::  a hex value wrapped in a unit, or ~ when the field is empty.
+    ::
+    ++  optional-hex
+      (punt hex)
+    --
+  --
+::  revision control (diffing and merging) is delegated to the %txt mark.
++  grad  %txt
+--
--- a/sur/unicode-data.hoon
+++ b/sur/unicode-data.hoon
@ -0,0 +1,130 @@
+|%
++  line
+  :>    an individual codepoint definition
+  :>
+  :>  code: the codepoint (written in hexadecimal in the data file)
+  :>  name: the character name
+  :>  gen: the general category of this character
+  :>  can: the canonical combining class for ordering algorithms
+  :>  bi: the bidirectional category of this character
+  :>  de: the character decomposition mapping
+  :>  decimal: the decimal digit value, as unparsed text (empty if none)
+  :>  digit: the digit value, covering non decimal radix forms (unparsed)
+  :>  numeric: the numeric value, including fractions (unparsed)
+  :>  mirrored: whether char is mirrored in bidirectional text
+  :>  old-name: unicode 1.0 compatibility name
+  :>  iso: iso 10646 comment field
+  :>  up: uppercase mapping codepoint
+  :>  low: lowercase mapping codepoint
+  :>  title: titlecase mapping codepoint
+  :>
+  $:  code/@c
+      name/tape
+      gen/general
+      can/@ud
+      bi/bidi
+      de/decomp
+      ::  todo: decimal/digit/numeric need to be parsed.
+      decimal/tape
+      digit/tape
+      numeric/tape
+      mirrored/?
+      old-name/tape
+      iso/tape
+      up/(unit @c)
+      low/(unit @c)
+      title/(unit @c)
+  ==
+::
++  general
+  :>    one of the normative or informative unicode general categories
+  :>
+  :>  these abbreviations are as found in the unicode standard, except
+  :>  lowercased as to be valid symbols.
+  $?  $lu  :<  letter, uppercase
+      $ll  :<  letter, lowercase
+      $lt  :<  letter, titlecase
+      $mn  :<  mark, non-spacing
+      $mc  :<  mark, spacing combining
+      $me  :<  mark, enclosing
+      $nd  :<  number, decimal digit
+      $nl  :<  number, letter
+      $no  :<  number, other
+      $zs  :<  separator, space
+      $zl  :<  separator, line
+      $zp  :<  separator, paragraph
+      $cc  :<  other, control
+      $cf  :<  other, format
+      $cs  :<  other, surrogate
+      $co  :<  other, private use
+      $cn  :<  other, not assigned
+      ::  NOTE(review): this break presumably separates the normative
+      ::  categories (above) from the informative ones (below) -- confirm.
+      $lm  :<  letter, modifier
+      $lo  :<  letter, other
+      $pc  :<  punctuation, connector
+      $pd  :<  punctuation, dash
+      $ps  :<  punctuation, open
+      $pe  :<  punctuation, close
+      $pi  :<  punctuation, initial quote
+      $pf  :<  punctuation, final quote
+      $po  :<  punctuation, other
+      $sm  :<  symbol, math
+      $sc  :<  symbol, currency
+      $sk  :<  symbol, modifier
+      $so  :<  symbol, other
+  ==
+::
+::  the symbols are the data file's abbreviations lowercased (for example
+::  'NSM' is %nsm), which is what the unicode-data mark's parser produces.
+::
++  bidi
+  :>  bidirectional category of a unicode character
+  $?  $l    :<  left-to-right
+      $lre  :<  left-to-right embedding
+      $lri  :<  left-to-right isolate
+      $lro  :<  left-to-right override
+      $fsi  :<  first strong isolate
+      $r    :<  right-to-left
+      $al   :<  right-to-left arabic
+      $rle  :<  right-to-left embedding
+      $rli  :<  right-to-left isolate
+      $rlo  :<  right-to-left override
+      $pdf  :<  pop directional format
+      $pdi  :<  pop directional isolate
+      $en   :<  european number
+      $es   :<  european number separator
+      $et   :<  european number terminator
+      $an   :<  arabic number
+      $cs   :<  common number separator
+      $nsm  :<  non-spacing mark
+      $bn   :<  boundary neutral
+      $b    :<  paragraph separator
+      $s    :<  segment separator
+      $ws   :<  whitespace
+      $on   :<  other neutrals
+  ==
+::
+::  the outer unit is ~ when the character has no decomposition mapping.
+::  within a mapping, {tag} is ~ when the data file gives no "<tag>" in
+::  front of the codepoints.
+::
++  decomp
+  :>  character decomposition mapping.
+  :>
+  :>  tag: type of decomposition.
+  :>  c: a list of codepoints this decomposes into.
+  (unit {tag/(unit decomp-tag) c/(list @c)})
+::
+::  the symbols follow the tag names used in the data file; the file's
+::  'noBreak' is spelled %no-break here.
+::
++  decomp-tag
+  :>  tag that describes the type of a character decomposition.
+  $?  $font      :<  a font variant
+      $no-break  :<  a no-break version of a space or hyphen
+      $initial   :<  an initial presentation form (arabic)
+      $medial    :<  a medial presentation form (arabic)
+      $final     :<  a final presentation form (arabic)
+      $isolated  :<  an isolated presentation form (arabic)
+      $circle    :<  an encircled form
+      $super     :<  a superscript form
+      $sub       :<  a subscript form
+      $vertical  :<  a vertical layout presentation form
+      $wide      :<  a wide (or zenkaku) compatibility character
+      $narrow    :<  a narrow (or hankaku) compatibility character
+      $small     :<  a small variant form (cns compatibility)
+      $square    :<  a cjk squared font variant
+      $fraction  :<  a vulgar fraction form
+      $compat    :<  otherwise unspecified compatibility character
+  ==
+--