Complete prototype for a unicode aware ++to-{upper,lower}.

This uses a mark to parse UnicodeData.txt, and some ford runes to change this into a binary search tree data structure for quick lookups, along with the optimizations found in golang's unicode table to record ranges instead of individual characters.
2024-12-14 20:02:51 +03:00 · 2017-10-12 22:55:45 -07:00 · 2017-10-12 22:55:45 -07:00 · ccfb11bda9
commit ccfb11bda9
parent 25accb9eba
3 changed files with 284 additions and 36 deletions
--- a/gen/capitalize.hoon
+++ b/gen/capitalize.hoon
@ -2,21 +2,203 @@
 ::  part 1: parse the file into {uppers}
 ::
 /-  unicode-data
-::
-::  while this works, it'd be better to build range based data structures like
-::  golang does. golang uses flat tables that it binary searches over. storage
-::  as a binary tree?
-::
-/=  uppers
+/+  new-hoon
+/=  case-table
  /;  |=  a/(list line:unicode-data)
-      =|  ret/(map @c @c)
-      |-
-      ^-  (map @c @c)
+      =,  new-hoon
+      |^  %-  build-tree
+          %-  flop
+          (build-case-nodes a)
+      ::
+      :>  #
+      :>  #  %case-nodes
+      :>  #
+      :>    transforms raw unicode data into sequential case nodes.
+      +|
+      ++  build-case-nodes
+        :>  raw list of unicode data lines to a compact list of chardata
+        |=  a/(list line:unicode-data)
+        ^-  (list case-node:unicode-data)
+        =<  out
+        ::
+        ::  todo: we don't have the final case range in the output of this
+        ::  gate. this is because this algorithm doesn't work when the last
+        ::  char is part of a range. this doesn't happen with the real one,
+        ::  only the excerpts i was using for testing.
+        ::
+        %^  foldl:ls  a  *case-fold
+        |=  {c/case-fold l/line:unicode-data}
+        ^+  c
+        =+  state=(line-to-case-state l)
+        ?:  (is-adjacent state prev.c)
+          c(prev state)
+        =.  c  (add-range c)
+        %=  c
+          start
+            ?:  &(!=(case.state %missing) !=(case.state %none))
+              `state
+            ~
+          prev  state
+        ==
+      ::
+      ++  line-to-case-state
+        :>  creates an easy to merge form.
+        |=  line:unicode-data
+        ^-  case-state
+        =/  out/case-state
+          [code %none [%none ~] [%none ~] [%none ~]]
+        ?:  =(code `@c`0)
+          =.  case.out  %missing
+          out
+        =.  case.out
+          ?+  gen  %none
+            $lu  %upper
+            $ll  %lower
+            $lt  %title
+          ==
+        ::
+        ::  several characters aren't described as $lu or $ll but have lower or
+        ::  upper state, such as u+2161. detect this and fix it up.
+        ::
+        =?  case.out  &(=(case.out %none) !=(low ~))  %upper
+        =?  case.out  &(=(case.out %none) !=(up ~))  %lower
+        ::
+        ::  calculate offsets
+        ::
+        =?  upper.out  !=(up ~)     (calculate-offset (need up) code)
+        =?  lower.out  !=(low ~)
+           (calculate-offset (need low) code)
+        =?  title.out  !=(title ~)  (calculate-offset (need title) code)
+        out
+      ::
+      ++  calculate-offset
+        |=  {src/@c dst/@c}
+        ^-  case-offset:unicode-data
+        ?:  =(src dst)
+          [%none ~]
+        ?:  (gth src dst)
+          [%add (sub src dst)]
+        [%sub (sub dst src)]
+      ::
+      ++  is-adjacent
+        :>  is {rhs} a continuation of {lhs}?
+        |=  {lhs/case-state rhs/case-state}
+        ^-  ?
+        ?:  (lth point.rhs point.lhs)
+          $(lhs rhs, rhs lhs)
+        ?:  !=(point.rhs +(point.lhs))
+          %.n
+        ?:  !=(case.rhs case.lhs)
+          (upper-lower-adjacent lhs rhs)
+        ?:  =(case.lhs %none)
+          %.n
+        ?:  =(case.lhs %missing)
+          %.n
+        ?:  !=(upper.lhs upper.rhs)
+          %.n
+        ?:  !=(lower.lhs lower.rhs)
+          %.n
+        ?:  !=(title.lhs title.rhs)
+          %.n
+        %.y
+      ::
+      ++  upper-lower-adjacent
+        :>    detects %upper-lower spans.
+        :>
+        :>  is {lhs} the same as {rhs}, but with opposite case?
+        |=  {lhs/case-state rhs/case-state}
+        ?:  &(=(case.lhs %upper) !=(case.rhs %lower))
+          %.n
+        ?:  &(=(case.lhs %lower) !=(case.rhs %upper))
+          %.n
+        ::
+        ::  to simplify detection, if things are in the opposite order, redo
+        ::  things flipped.
+        ::
+        ?:  =(case.lhs %lower)
+          $(lhs rhs, rhs lhs)
+        ?&  (is-upper-lower lhs)
+            (is-lower-upper rhs)
+        ==
+      ::
+      ++  is-upper-lower
+        |=  i/case-state
+        =(+.+.i [[%none ~] [%add 1] [%none ~]])
+      ::
+      ++  is-lower-upper
+        |=  i/case-state
+        =(+.+.i [[%sub 1] [%none ~] [%sub 1]])
+      ::
+      ++  is-none
+        |=  i/case-state
+        =(+.+.i [[%none ~] [%none ~] [%none ~]])
+      ::
+      ++  add-range
+        |=  c/case-fold
+        ^+  c
+        ?~  start.c
+          c
+        ?:  (is-none u.start.c)
+          c
+        ?:  ?&  (gth point.prev.c point.u.start.c)
+                (is-upper-lower u.start.c)
+            ==
+          =/  node/case-node:unicode-data
+            [`@ux`point.u.start.c `@ux`point.prev.c [%uplo ~] [%uplo ~] [%uplo ~]]
+          c(out [node out.c])
+        =/  node/case-node:unicode-data
+          [`@ux`point.u.start.c `@ux`point.prev.c +.+.u.start.c]
+        c(out [node out.c])
+      ::
+      ++  case-fold
+        :>  state that's part of the fold which generates the list of case-nodes
+        $:  :>  resulting data to pass to treeify.
+            out/(list case-node:unicode-data)
+            :>  the start of a run of characters; ~ for not active.
+            start/(unit case-state)
+            :>  previous character state
+            prev/case-state
+        ==
+      ::
+      ++  case-state
+        :>  a temporary model which we compress later in a second pass.
+        $:  point/@c
+            case/case-class
+            upper/case-offset:unicode-data
+            lower/case-offset:unicode-data
+            title/case-offset:unicode-data
+        ==
+      ::
+      ++  case-class
+        :>  classification of an individual character.
+        $?  $upper
+            $lower
+            $title
+            $none
+            $missing
+        ==
+      ::
+      :>  #
+      :>  #  %tree-building
+      :>  #
+      :>    builds a binary search tree out of the list
+      +|
+      ++  build-tree
+        |=  a/(list case-node:unicode-data)
+        ^-  case-tree:unicode-data
+        ::  there's probably a bottom up approach that doesn't require walking
+        ::  a list over and over again.
        ?~  a
-        ret
-      ?~  up.i.a
-        $(a t.a)
-      $(a t.a, ret (~(put by ret) code.i.a u.up.i.a))
+          ~
+        =+  len=(lent a)
+        =+  [lhs rhs]=(split-at:ls (div len 2) a)
+        ?~  rhs
+          ?~  lhs
+            ~
+          [i.lhs ~ ~]
+        =+  x=[i.rhs $(a lhs) $(a t.rhs)]
+        x
+      --
  /:  /===/lib/unicode-data  /&unicode-data&/txt/
 ::
 ::  part 2: utility core
@ -28,6 +210,7 @@
  (turn (tuba a) fun)
 ::
 ++  to-upper
+  :>  returns the uppercase of unicode codepoint {a}
  |=  a/@c
  ^-  @c
  ::  special case ascii to not perform map lookup.
@ -35,8 +218,56 @@
    ?:  &((gte a 'a') (lte a 'z'))
      (sub a 32)
    a
-  =+  x=(~(get by uppers) a)
-  (fall x a)
+  (apply-table a case-table %upper)
+::
++  to-lower
+  :>  returns the lowercase of unicode codepoint {a}
+  |=  a/@c
+  ^-  @c
+  ?:  (lte a max-ascii)
+    ?:  &((get a 'A') (lte a 'Z'))
+      (add 32 a)
+    a
+  (apply-table a case-table %lower)
+::
++  apply-table
+  :>    searches {table} and apples applies {type} to {a}.
+  :>
+  :>  this recursively walks the case tree {table}. if it finds an entry which
+  :>  matches on {a}, it will apply the offset. otherwise, returns {a}.
+  |=  {a/@c table/case-tree:unicode-data type/?($upper $lower $title)}
+  ^-  @c
+  ?~  table
+    a
+  ?:  (lth a start.n.table)
+    $(table l.table)
+  ?:  (gth a end.n.table)
+    $(table r.table)
+  ?.  &((lte start.n.table a) (lte a end.n.table))
+    a
+  %^  apply-offset  a  type
+  ?-  type
+    $upper  upper.n.table
+    $lower  lower.n.table
+    $title  title.n.table
+  ==
+::
++  apply-offset
+  :>  applies an character offset to {a}.
+  |=  {a/@c type/?($upper $lower $title) offset/case-offset:unicode-data}
+  ^-  @c
+  ?-  offset
+    {$add *}   (add a a.offset)
+    {$sub *}   (sub a s.offset)
+    {$none *}  a
+  ::
+      {$uplo *}
+    ?-  type
+      $upper  (sub a 1)
+      $lower  (add a 1)
+      $title  (sub a 1)
+    ==
+  ==
 ::
 ++  max-ascii  `@c`0x7f
 --
@ -48,5 +279,4 @@
        {n/tape $~}
        $~
    ==
-:-  %noun
-(transform n to-upper)
+:-  %tape  (transform n to-upper)
--- a/mar/unicode-data.hoon
+++ b/mar/unicode-data.hoon
@ -1,24 +1,8 @@
 /-  unicode-data
 =,  eyre
 =,  format
-
-::  ok, so we can currently slurp the data in. we're having problems getting it
-::  back out. ++txt:grow is probably the next logical step here.
-
-|_  all/(list line:unicode-data)
-++  grow
-  :>  converts from unicode-data to mark.
-  |%
-  ::  ++  txt
-  ::    ^-  wain
-  ::    %+  turn  all
-  ::    |=  line:unicode-data
-  ::    ;:  weld
-  ::      ";"
-  ::      name
-  ::      ";"
-  --
 ::
+|_  all/(list line:unicode-data)
 ++  grab
  :>  converts from mark to unicode-data.
  |%
@ -56,7 +40,6 @@
      %+  cook
        |=(a/tape a)
      (star ;~(less sem prn))
-::      (star ;~(pose hig low nud hep ace gal gar pel per))
    ::
    :>  parses a unicode general category abbreviation to symbol
    ++  general-category
--- a/sur/unicode-data.hoon
+++ b/sur/unicode-data.hoon
@ -1,4 +1,7 @@
 |%
+:>  #  %unicode-data
+:>    types to represent UnicdoeData.txt.
+|
 ++  line
  :>    an individual codepoint definition
  :>
@ -127,4 +130,36 @@
      $fraction  :<  a vulgar fraction form
      $compat    :<  otherwise unspecified compatibility character
  ==
+::
+:>  #
+:>  #  %case-map
+:>  #
+:>    types to represent fast lookups of case data
+|
++  case-offset
+  :>  case offsets can be in either direction
+  $%  :>  add {a} to get the new character
+      {$add a/@u}
+      :>  subtract {a} to get the new character
+      {$sub s/@u}
+      :>  take no action; return self
+      {$none $~}
+      :>  represents series of alternating uppercase/lowercase characters
+      {$uplo $~}
+  ==
+::
++  case-node
+  :>    a node in a case-tree.
+  :>
+  :>  represents a range of
+  $:  start/@ux
+      end/@ux
+      upper/case-offset
+      lower/case-offset
+      title/case-offset
+  ==
+::
++  case-tree
+  :>  a binary search tree of ++case-node items, sorted on span.
+  (tree case-node)
 --