Merge pull request #586 from eglaysher/unicode-string-gates

Unicode string gates
2025-01-05 19:46:50 +03:00 · 2018-01-29 11:09:08 -08:00 · 2018-01-29 11:09:08 -08:00 · 1e8e3fdd84
commit 1e8e3fdd84
parent cadc61929f 58900d30f7
3 changed files with 514 additions and 0 deletions
--- a/gen/capitalize.hoon
+++ b/gen/capitalize.hoon
@ -0,0 +1,285 @@
+::  to use, download UnicdoeData.txt and place it in `%/lib/unicode-data/txt`.
+::
+::::
+::
+::  part 1: parse the file into {uppers}
+::
+/-  unicode-data
+/+  new-hoon
+/=  case-table
+  /;  |=  a=(list line:unicode-data)
+      =,  new-hoon
+      |^  %-  build-tree
+          %-  flop
+          (build-case-nodes a)
+      ::
+      :>  #
+      :>  #  %case-nodes
+      :>  #
+      :>    transforms raw unicode data into sequential case nodes.
+      +|
+      ++  build-case-nodes
+        :>  raw list of unicode data lines to a compact list of chardata
+        |=  a=(list line:unicode-data)
+        ^-  (list case-node:unicode-data)
+        =<  out
+        ::
+        ::  todo: we don't have the final case range in the output of this
+        ::  gate. this is because this algorithm doesn't work when the last
+        ::  char is part of a range. this doesn't happen with the real one,
+        ::  only the excerpts i was using for testing.
+        ::
+        %^  foldl:ls  a  *case-fold
+        |=  [c=case-fold l=line:unicode-data]
+        ^+  c
+        =+  state=(line-to-case-state l)
+        ?:  (is-adjacent state prev.c)
+          c(prev state)
+        =.  c  (add-range c)
+        %=  c
+          start
+            ?:  &(!=(case.state %missing) !=(case.state %none))
+              `state
+            ~
+          prev  state
+        ==
+      ::
+      ++  line-to-case-state
+        :>  creates an easy to merge form.
+        |=  line:unicode-data
+        ^-  case-state
+        =/  out=case-state
+          [code %none [%none ~] [%none ~] [%none ~]]
+        ?:  =(code `@c`0)
+          =.  case.out  %missing
+          out
+        =.  case.out
+          ?+  gen  %none
+            $lu  %upper
+            $ll  %lower
+            $lt  %title
+          ==
+        ::
+        ::  several characters aren't described as $lu or $ll but have lower or
+        ::  upper state, such as u+2161. detect this and fix it up.
+        ::
+        =?  case.out  &(=(case.out %none) !=(low ~))  %upper
+        =?  case.out  &(=(case.out %none) !=(up ~))  %lower
+        ::
+        ::  calculate offsets
+        ::
+        =?  upper.out  !=(up ~)     (calculate-offset (need up) code)
+        =?  lower.out  !=(low ~)
+           (calculate-offset (need low) code)
+        =?  title.out  !=(title ~)  (calculate-offset (need title) code)
+        out
+      ::
+      ++  calculate-offset
+        |=  [src=@c dst=@c]
+        ^-  case-offset:unicode-data
+        ?:  =(src dst)
+          [%none ~]
+        ?:  (gth src dst)
+          [%add (sub src dst)]
+        [%sub (sub dst src)]
+      ::
+      ++  is-adjacent
+        :>  is {rhs} a continuation of {lhs}?
+        |=  [lhs=case-state rhs=case-state]
+        ^-  ?
+        ?:  (lth point.rhs point.lhs)
+          $(lhs rhs, rhs lhs)
+        ?:  !=(point.rhs +(point.lhs))
+          %.n
+        ?:  !=(case.rhs case.lhs)
+          (upper-lower-adjacent lhs rhs)
+        ?:  =(case.lhs %none)
+          %.n
+        ?:  =(case.lhs %missing)
+          %.n
+        ?:  !=(upper.lhs upper.rhs)
+          %.n
+        ?:  !=(lower.lhs lower.rhs)
+          %.n
+        ?:  !=(title.lhs title.rhs)
+          %.n
+        %.y
+      ::
+      ++  upper-lower-adjacent
+        :>    detects %upper-lower spans.
+        :>
+        :>  is {lhs} the same as {rhs}, but with opposite case?
+        |=  [lhs=case-state rhs=case-state]
+        ?:  &(=(case.lhs %upper) !=(case.rhs %lower))
+          %.n
+        ?:  &(=(case.lhs %lower) !=(case.rhs %upper))
+          %.n
+        ::
+        ::  to simplify detection, if things are in the opposite order, redo
+        ::  things flipped.
+        ::
+        ?:  =(case.lhs %lower)
+          $(lhs rhs, rhs lhs)
+        ?&  (is-upper-lower lhs)
+            (is-lower-upper rhs)
+        ==
+      ::
+      ++  is-upper-lower
+        |=  i=case-state
+        =(+.+.i [[%none ~] [%add 1] [%none ~]])
+      ::
+      ++  is-lower-upper
+        |=  i=case-state
+        =(+.+.i [[%sub 1] [%none ~] [%sub 1]])
+      ::
+      ++  is-none
+        |=  i=case-state
+        =(+.+.i [[%none ~] [%none ~] [%none ~]])
+      ::
+      ++  add-range
+        |=  c=case-fold
+        ^+  c
+        ?~  start.c
+          c
+        ?:  (is-none u.start.c)
+          c
+        ?:  ?&  (gth point.prev.c point.u.start.c)
+                (is-upper-lower u.start.c)
+            ==
+          =/  node=case-node:unicode-data
+            [`@ux`point.u.start.c `@ux`point.prev.c [%uplo ~] [%uplo ~] [%uplo ~]]
+          c(out [node out.c])
+        =/  node=case-node:unicode-data
+          [`@ux`point.u.start.c `@ux`point.prev.c +.+.u.start.c]
+        c(out [node out.c])
+      ::
+      ++  case-fold
+        :>  state that's part of the fold which generates the list of case-nodes
+        $:  :>  resulting data to pass to treeify.
+            out=(list case-node:unicode-data)
+            :>  the start of a run of characters; ~ for not active.
+            start=(unit case-state)
+            :>  previous character state
+            prev=case-state
+        ==
+      ::
+      ++  case-state
+        :>  a temporary model which we compress later in a second pass.
+        $:  point=@c
+            case=case-class
+            upper=case-offset:unicode-data
+            lower=case-offset:unicode-data
+            title=case-offset:unicode-data
+        ==
+      ::
+      ++  case-class
+        :>  classification of an individual character.
+        $?  $upper
+            $lower
+            $title
+            $none
+            $missing
+        ==
+      ::
+      :>  #
+      :>  #  %tree-building
+      :>  #
+      :>    builds a binary search tree out of the list
+      +|
+      ++  build-tree
+        |=  a=(list case-node:unicode-data)
+        ^-  case-tree:unicode-data
+        ::  there's probably a bottom up approach that doesn't require walking
+        ::  a list over and over again.
+        ?~  a
+          ~
+        =+  len=(lent a)
+        =+  [lhs rhs]=(split-at:ls (div len 2) a)
+        ?~  rhs
+          ?~  lhs
+            ~
+          [i.lhs ~ ~]
+        =+  x=[i.rhs $(a lhs) $(a t.rhs)]
+        x
+      --
+  /:  /===/lib/unicode-data  /&unicode-data&/txt/
+::
+::  part 2: utility core
+::
+|%
++  transform
+  |=  [a=tape fun=$-(@c @c)]
+  %-  tufa
+  (turn (tuba a) fun)
+::
++  to-upper
+  :>  returns the uppercase of unicode codepoint {a}
+  |=  a=@c
+  ^-  @c
+  ::  special case ascii to not perform map lookup.
+  ?:  (lte a max-ascii)
+    ?:  &((gte a 'a') (lte a 'z'))
+      (sub a 32)
+    a
+  (apply-table a case-table %upper)
+::
++  to-lower
+  :>  returns the lowercase of unicode codepoint {a}
+  |=  a=@c
+  ^-  @c
+  ?:  (lte a max-ascii)
+    ?:  &((gte a 'A') (lte a 'Z'))
+      (add 32 a)
+    a
+  (apply-table a case-table %lower)
+::
++  apply-table
+  :>    searches {table} and apples applies {type} to {a}.
+  :>
+  :>  this recursively walks the case tree {table}. if it finds an entry which
+  :>  matches on {a}, it will apply the offset. otherwise, returns {a}.
+  |=  [a=@c table=case-tree:unicode-data type=?($upper $lower $title)]
+  ^-  @c
+  ?~  table
+    a
+  ?:  (lth a start.n.table)
+    $(table l.table)
+  ?:  (gth a end.n.table)
+    $(table r.table)
+  ?.  &((lte start.n.table a) (lte a end.n.table))
+    a
+  %^  apply-offset  a  type
+  ?-  type
+    $upper  upper.n.table
+    $lower  lower.n.table
+    $title  title.n.table
+  ==
+::
++  apply-offset
+  :>  applies an character offset to {a}.
+  |=  [a=@c type=?($upper $lower $title) offset=case-offset:unicode-data]
+  ^-  @c
+  ?-  offset
+    {$add *}   (add a a.offset)
+    {$sub *}   (sub a s.offset)
+    {$none *}  a
+  ::
+      {$uplo *}
+    ?-  type
+      $upper  (sub a 1)
+      $lower  (add a 1)
+      $title  (sub a 1)
+    ==
+  ==
+::
++  max-ascii  `@c`0x7f
+--
+::
+::  part 3: generator
+::
+:-  %say
+|=  $:  [now=@da eny=@uvJ bec=beak]
+        [n=tape $~]
+        $~
+    ==
+:-  %tape  (transform n to-upper)
--- a/mar/unicode-data.hoon
+++ b/mar/unicode-data.hoon
@ -0,0 +1,79 @@
+/-  unicode-data
+=,  eyre
+=,  format
+::
+|_  all/(list line:unicode-data)
++  grab
+  :>  converts from mark to unicode-data.
+  |%
+  ++  mime  |=([* a=octs] (txt (to-wain q.a)))     ::  XX mark translation
+  ++  txt
+    |^  |=  a=wain
+        ^+  all
+        %+  murn  a
+        |=  b=cord
+        ^-  (unit line:unicode-data)
+        ?~  b  ~
+        `(rash b line)
+    ::
+    :>  parses a single character information line of the unicode data file.
+    ++  line
+      ;~  (glue sem)
+        hex                       :: code/@c        codepoint in hex format
+        name-string               :: name/tape      character name
+        general-category          :: gen/general    type of character
+        (bass 10 (plus dit))      :: can/@ud        canonical combining class
+        bidi-category             :: bi/bidi        bidirectional category
+        decomposition-mapping     :: de/decomp      decomposition mapping
+      ::
+      :: todo: decimal/digit/numeric need to be parsed.
+      ::
+        string-number    :: decimal/tape      decimal digit value (or ~)
+        string-number    :: digit/tape        digit value, even if non-decimal
+        string-number    :: numeric/tape      numeric value, including fractions
+      ::
+        (flag 'Y' 'N')   :: mirrored/?        is char mirrored in bidi text?
+        name-string      :: old-name/tape     unicode 1.0 compatibility name
+        name-string      :: iso/tape          iso 10646 comment field
+        (punt hex)       :: up/(unit @c)      uppercase mapping codepoint
+        (punt hex)       :: low/(unit @c)     lowercase mapping codepoint
+        (punt hex)       :: title/(unit @c)   titlecase mapping codepoint
+      ==
+    ::
+    :>  parses a single name or comment string.
+    ++  name-string
+      %+  cook
+        |=(a=tape a)
+      (star ;~(less sem prn))
+    ::
+    :>  parses a unicode general category abbreviation to symbol
+    ++  general-category
+      %+  sear  (soft general:unicode-data)
+      :(cook crip cass ;~(plug hig low (easy ~)))
+    ::
+    :>  parses a bidirectional category abbreviation to symbol.
+    ++  bidi-category
+      %+  sear  (soft bidi:unicode-data)
+      :(cook crip cass (star hig))
+    ::
+    ++  decomposition-mapping
+      %-  punt  :: optional
+      :: a tag and a list of characters to decompose to
+      ;~  plug
+        (punt (ifix [gal ;~(plug gar ace)] decomp-tag))
+        (cook |=(a=(list @c) a) (most ace hex))
+      ==
+    ::
+    ++  decomp-tag
+      %+  sear  (soft decomp-tag:unicode-data)
+      :(cook crip cass (star alf))
+    ::
+    ++  string-number
+      %+  cook
+        |=(a=tape a)
+      (star ;~(pose nud fas hep))
+    ::
+    --
+  --
++  grad  %txt
+--
--- a/sur/unicode-data.hoon
+++ b/sur/unicode-data.hoon
@ -0,0 +1,150 @@
+|%
+:>  #  %unicode-data
+:>    types to represent UnicdoeData.txt.
+|
++  line
+  :>    an individual codepoint definition
+  :>
+  $:  code=@c               :<  codepoint in hexadecimal format
+      name=tape             :<  character name
+      gen=general           :<  type of character this is
+      :>  canonical combining class for ordering algorithms
+      can=@ud
+      bi=bidi               :<  bidirectional category of this character
+      de=decomp             :<  character decomposition mapping
+      ::  todo: decimal/digit/numeric need to be parsed.
+      decimal=tape          :<  decimal digit value (or ~)
+      digit=tape            :<  digit value, covering non decimal radix forms
+      numeric=tape          :<  numeric value, including fractions
+      mirrored=?            :<  whether char is mirrored in bidirectional text
+      old-name=tape         :<  unicode 1.0 compatibility name
+      iso=tape              :<  iso 10646 comment field
+      up=(unit @c)          :<  uppercase mapping codepoint
+      low=(unit @c)         :<  lowercase mapping codepoint
+      title=(unit @c)       :<  titlecase mapping codepoint
+  ==
+::
++  general
+  :>    one of the normative or informative unicode general categories
+  :>
+  :>  these abbreviations are as found in the unicode standard, except
+  :>  lowercased as to be valid symbols.
+  $?  $lu  :<  letter, uppercase
+      $ll  :<  letter, lowercase
+      $lt  :<  letter, titlecase
+      $mn  :<  mark, non-spacing
+      $mc  :<  mark, spacing combining
+      $me  :<  mark, enclosing
+      $nd  :<  number, decimal digit
+      $nl  :<  number, letter
+      $no  :<  number, other
+      $zs  :<  separator, space
+      $zl  :<  separator, line
+      $zp  :<  separator, paragraph
+      $cc  :<  other, control
+      $cf  :<  other, format
+      $cs  :<  other, surrogate
+      $co  :<  other, private use
+      $cn  :<  other, not assigned
+      ::
+      $lm  :<  letter, modifier
+      $lo  :<  letter, other
+      $pc  :<  punctuation, connector
+      $pd  :<  punctuation, dash
+      $ps  :<  punctuation, open
+      $pe  :<  punctuation, close
+      $pi  :<  punctuation, initial quote
+      $pf  :<  punctuation, final quote
+      $po  :<  punctuation, other
+      $sm  :<  symbol, math
+      $sc  :<  symbol, currency
+      $sk  :<  symbol, modifier
+      $so  :<  symbol, other
+  ==
+::
++  bidi
+  :>  bidirectional category of a unicode character
+  $?  $l    :<  left-to-right
+      $lre  :<  left-to-right embedding
+      $lri  :<  left-to-right isolate
+      $lro  :<  left-to-right override
+      $fsi  :<  first strong isolate
+      $r    :<  right-to-left
+      $al   :<  right-to-left arabic
+      $rle  :<  right-to-left embedding
+      $rli  :<  right-to-left isolate
+      $rlo  :<  right-to-left override
+      $pdf  :<  pop directional format
+      $pdi  :<  pop directional isolate
+      $en   :<  european number
+      $es   :<  european number separator
+      $et   :<  european number terminator
+      $an   :<  arabic number
+      $cs   :<  common number separator
+      $nsm  :<  non-spacing mark
+      $bn   :<  boundary neutral
+      $b    :<  paragraph separator
+      $s    :<  segment separator
+      $ws   :<  whitespace
+      $on   :<  other neutrals
+  ==
+::
++  decomp
+  :>  character decomposition mapping.
+  :>
+  :>  tag: type of decomposition.
+  :>  c: a list of codepoints this decomposes into.
+  (unit {tag/(unit decomp-tag) c/(list @c)})
+::
++  decomp-tag
+  :>  tag that describes the type of a character decomposition.
+  $?  $font      :<  a font variant
+      $nobreak   :<  a no-break version of a space or hyphen
+      $initial   :<  an initial presentation form (arabic)
+      $medial    :<  a medial presentation form (arabic)
+      $final     :<  a final presentation form (arabic)
+      $isolated  :<  an isolated presentation form (arabic)
+      $circle    :<  an encircled form
+      $super     :<  a superscript form
+      $sub       :<  a subscript form
+      $vertical  :<  a vertical layout presentation form
+      $wide      :<  a wide (or zenkaku) compatibility character
+      $narrow    :<  a narrow (or hankaku) compatibility character
+      $small     :<  a small variant form (cns compatibility)
+      $square    :<  a cjk squared font variant
+      $fraction  :<  a vulgar fraction form
+      $compat    :<  otherwise unspecified compatibility character
+  ==
+::
+:>  #
+:>  #  %case-map
+:>  #
+:>    types to represent fast lookups of case data
+|
++  case-offset
+  :>  case offsets can be in either direction
+  $%  :>  add {a} to get the new character
+      [%add a=@u]
+      :>  subtract {a} to get the new character
+      [%sub s=@u]
+      :>  take no action; return self
+      [%none $~]
+      :>  represents series of alternating uppercase/lowercase characters
+      [%uplo $~]
+  ==
+::
++  case-node
+  :>    a node in a case-tree.
+  :>
+  :>  represents a range of
+  $:  start=@ux
+      end=@ux
+      upper=case-offset
+      lower=case-offset
+      title=case-offset
+  ==
+::
++  case-tree
+  :>  a binary search tree of ++case-node items, sorted on span.
+  (tree case-node)
+--