Remove capitalize

2025-01-05 19:46:50 +03:00 · 2018-12-10 15:51:18 -08:00 · 2018-12-10 15:51:18 -08:00 · b00f78be28
commit b00f78be28
parent 2c4e537b72
3 changed files with 0 additions and 445 deletions
--- a/app/test.hoon
+++ b/app/test.hoon
@ -171,8 +171,6 @@
 ++  failing
  ^~  ^-  (map path tape)
  %-  my  :~ ::TODO don't hardcode
-  ::
-    :-  /gen/capitalize        "wants unicode-data/txt"
  ::
    :-  /gen/al                "compiler types out-of-date"
    :-  /gen/musk              "compiler types out-of-date"
--- a/gen/capitalize.hoon
+++ b/gen/capitalize.hoon
@ -1,293 +0,0 @@
-::  to use, download UnicdoeData.txt and place it in `%/lib/unicode-data/txt`.
-::
-::::
-::
-::  part 1: parse the file into {uppers}
-::
-/-  unicode-data
-/=  case-table
-  /;  !:
-      =>
-      |%
-      +$  case-fold
-        ::  state that's part of the fold which generates the list of case-nodes
-        $:  ::  resulting data to pass to treeify.
-            out=(list case-node:unicode-data)
-            ::  the start of a run of characters; ~ for not active.
-            start=(unit case-state)
-            ::  previous character state
-            prev=case-state
-        ==
-      ::
-      +$  case-state
-        ::  a temporary model which we compress later in a second pass.
-        $:  point=@c
-            case=case-class
-            upper=case-offset:unicode-data
-            lower=case-offset:unicode-data
-            title=case-offset:unicode-data
-        ==
-      ::
-      +$  case-class
-        ::  classification of an individual character.
-        $?  $upper
-            $lower
-            $title
-            $none
-            $missing
-        ==
-      --
-      |=  a=(list line:unicode-data)
-      ::
-      |^  %-  build-tree
-          %-  flop
-          (build-case-nodes a)
-      ::
-      ::  #
-      ::  #  %case-nodes
-      ::  #
-      ::    transforms raw unicode data into sequential case nodes.
-      +|  %case-nodes
-      ++  build-case-nodes
-        ::  raw list of unicode data lines to a compact list of chardata
-        |=  lines=(list line:unicode-data)
-        ^-  (list case-node:unicode-data)
-        ::
-        ::  todo: we don't have the final case range in the output of this
-        ::  gate. this is because this algorithm doesn't work when the last
-        ::  char is part of a range. this doesn't happen with the real one,
-        ::  only the excerpts i was using for testing.
-        ::
-        =<  out
-        =|  =case-fold
-        |-  ^+  case-fold
-        ?~  lines  case-fold
-        ::
-        =/  state=case-state  (line-to-case-state i.lines)
-        ::
-        ?:  (is-adjacent state prev.case-fold)
-          case-fold(prev state)
-        ::
-        =.  case-fold  (add-range case-fold)
-        ::
-        %_    case-fold
-          prev   state
-          start  ?.(?=(?(%missing %none) case.state) ~ `state)
-        ==
-      ::
-      ++  line-to-case-state
-        ::  creates an easy to merge form.
-        |=  line:unicode-data
-        ^-  case-state
-        =/  out=case-state
-          [code %none [%none ~] [%none ~] [%none ~]]
-        ?:  =(code `@c`0)
-          =.  case.out  %missing
-          out
-        =.  case.out
-          ?+  gen  %none
-            $lu  %upper
-            $ll  %lower
-            $lt  %title
-          ==
-        ::
-        ::  several characters aren't described as $lu or $ll but have lower or
-        ::  upper state, such as u+2161. detect this and fix it up.
-        ::
-        =?  case.out  &(=(case.out %none) !=(low ~))  %upper
-        =?  case.out  &(=(case.out %none) !=(up ~))  %lower
-        ::
-        ::  calculate offsets
-        ::
-        =?  upper.out  !=(up ~)     (calculate-offset (need up) code)
-        =?  lower.out  !=(low ~)
-           (calculate-offset (need low) code)
-        =?  title.out  !=(title ~)  (calculate-offset (need title) code)
-        out
-      ::
-      ++  calculate-offset
-        |=  [src=@c dst=@c]
-        ^-  case-offset:unicode-data
-        ?:  =(src dst)
-          [%none ~]
-        ?:  (gth src dst)
-          [%add (sub src dst)]
-        [%sub (sub dst src)]
-      ::
-      ++  is-adjacent
-        ::  is {rhs} a continuation of {lhs}?
-        |=  [lhs=case-state rhs=case-state]
-        ^-  ?
-        ?:  (lth point.rhs point.lhs)
-          $(lhs rhs, rhs lhs)
-        ?:  !=(point.rhs +(point.lhs))
-          %.n
-        ?:  !=(case.rhs case.lhs)
-          (upper-lower-adjacent lhs rhs)
-        ?:  =(case.lhs %none)
-          %.n
-        ?:  =(case.lhs %missing)
-          %.n
-        ?:  !=(upper.lhs upper.rhs)
-          %.n
-        ?:  !=(lower.lhs lower.rhs)
-          %.n
-        ?:  !=(title.lhs title.rhs)
-          %.n
-        %.y
-      ::
-      ++  upper-lower-adjacent
-        ::    detects %upper-lower spans.
-        ::
-        ::  is {lhs} the same as {rhs}, but with opposite case?
-        |=  [lhs=case-state rhs=case-state]
-        ?:  &(=(case.lhs %upper) !=(case.rhs %lower))
-          %.n
-        ?:  &(=(case.lhs %lower) !=(case.rhs %upper))
-          %.n
-        ::
-        ::  to simplify detection, if things are in the opposite order, redo
-        ::  things flipped.
-        ::
-        ?:  =(case.lhs %lower)
-          $(lhs rhs, rhs lhs)
-        ?&  (is-upper-lower lhs)
-            (is-lower-upper rhs)
-        ==
-      ::
-      ++  is-upper-lower
-        |=  i=case-state
-        =(+.+.i [[%none ~] [%add 1] [%none ~]])
-      ::
-      ++  is-lower-upper
-        |=  i=case-state
-        =(+.+.i [[%sub 1] [%none ~] [%sub 1]])
-      ::
-      ++  is-none
-        |=  i=case-state
-        =(+.+.i [[%none ~] [%none ~] [%none ~]])
-      ::
-      ++  add-range
-        |=  c=case-fold
-        ^+  c
-        ?~  start.c
-          c
-        ?:  (is-none u.start.c)
-          c
-        ?:  ?&  (gth point.prev.c point.u.start.c)
-                (is-upper-lower u.start.c)
-            ==
-          =/  node=case-node:unicode-data
-            [`@ux`point.u.start.c `@ux`point.prev.c [%uplo ~] [%uplo ~] [%uplo ~]]
-          c(out [node out.c])
-        =/  node=case-node:unicode-data
-          [`@ux`point.u.start.c `@ux`point.prev.c +.+.u.start.c]
-        c(out [node out.c])
-      ::
-      ::  #
-      ::  #  %tree-building
-      ::  #
-      ::    builds a binary search tree out of the list
-      +|  %tree-building
-      ++  build-tree
-        |=  a=(list case-node:unicode-data)
-        ^-  case-tree:unicode-data
-        ::  there's probably a bottom up approach that doesn't require walking
-        ::  a list over and over again.
-        ::
-        ::  use ?: instead of ?~ to prevent the TMI problem.
-        ::
-        ?:  =(~ a)
-          ~
-        =+  len=(lent a)
-        =/  split-at=@  (div len 2)
-        =/  lhs  (scag split-at a)
-        =/  rhs  (slag split-at a)
-        ?~  rhs
-          ?~  lhs
-            ~
-          [i.lhs ~ ~]
-        =+  x=[i.rhs $(a lhs) $(a t.rhs)]
-        x
-      --
-  /:  /===/lib/unicode-data  /&unicode-data&/txt/
-::
-::  part 2: utility core
-::
-|%
-++  transform
-  |=  [a=tape fun=$-(@c @c)]
-  %-  tufa
-  (turn (tuba a) fun)
-::
-++  to-upper
-  ::  returns the uppercase of unicode codepoint {a}
-  |=  a=@c
-  ^-  @c
-  ::  special case ascii to not perform map lookup.
-  ?:  (lte a max-ascii)
-    ?:  &((gte a 'a') (lte a 'z'))
-      (sub a 32)
-    a
-  (apply-table a case-table %upper)
-::
-++  to-lower
-  ::  returns the lowercase of unicode codepoint {a}
-  |=  a=@c
-  ^-  @c
-  ?:  (lte a max-ascii)
-    ?:  &((gte a 'A') (lte a 'Z'))
-      (add 32 a)
-    a
-  (apply-table a case-table %lower)
-::
-++  apply-table
-  ::    searches {table} and apples applies {type} to {a}.
-  ::
-  ::  this recursively walks the case tree {table}. if it finds an entry which
-  ::  matches on {a}, it will apply the offset. otherwise, returns {a}.
-  |=  [a=@c table=case-tree:unicode-data type=?($upper $lower $title)]
-  ^-  @c
-  ?~  table
-    a
-  ?:  (lth a start.n.table)
-    $(table l.table)
-  ?:  (gth a end.n.table)
-    $(table r.table)
-  ?.  &((lte start.n.table a) (lte a end.n.table))
-    a
-  %^  apply-offset  a  type
-  ?-  type
-    $upper  upper.n.table
-    $lower  lower.n.table
-    $title  title.n.table
-  ==
-::
-++  apply-offset
-  ::  applies an character offset to {a}.
-  |=  [a=@c type=?($upper $lower $title) offset=case-offset:unicode-data]
-  ^-  @c
-  ?-  offset
-    {$add *}   (add a a.offset)
-    {$sub *}   (sub a s.offset)
-    {$none *}  a
-  ::
-      {$uplo *}
-    ?-  type
-      $upper  (sub a 1)
-      $lower  (add a 1)
-      $title  (sub a 1)
-    ==
-  ==
-::
-++  max-ascii  `@c`0x7f
--
-::
-::  part 3: generator
-::
-:-  %say
-|=  $:  [now=@da eny=@uvJ bec=beak]
-        [n=tape ~]
-        ~
-    ==
-:-  %tape  (transform n to-upper)
--- a/sur/unicode-data.hoon
+++ b/sur/unicode-data.hoon
@ -1,150 +0,0 @@
-|%
-::  #  %unicode-data
-::    types to represent UnicdoeData.txt.
-+|  %unicode-data
-++  line
-  ::    an individual codepoint definition
-  ::
-  $:  code=@c               ::  codepoint in hexadecimal format
-      name=tape             ::  character name
-      gen=general           ::  type of character this is
-      ::  canonical combining class for ordering algorithms
-      can=@ud
-      bi=bidi               ::  bidirectional category of this character
-      de=decomp             ::  character decomposition mapping
-      ::  todo: decimal/digit/numeric need to be parsed.
-      decimal=tape          ::  decimal digit value (or ~)
-      digit=tape            ::  digit value, covering non decimal radix forms
-      numeric=tape          ::  numeric value, including fractions
-      mirrored=?            ::  whether char is mirrored in bidirectional text
-      old-name=tape         ::  unicode 1.0 compatibility name
-      iso=tape              ::  iso 10646 comment field
-      up=(unit @c)          ::  uppercase mapping codepoint
-      low=(unit @c)         ::  lowercase mapping codepoint
-      title=(unit @c)       ::  titlecase mapping codepoint
-  ==
-::
-++  general
-  ::    one of the normative or informative unicode general categories
-  ::
-  ::  these abbreviations are as found in the unicode standard, except
-  ::  lowercased as to be valid symbols.
-  $?  $lu  ::  letter, uppercase
-      $ll  ::  letter, lowercase
-      $lt  ::  letter, titlecase
-      $mn  ::  mark, non-spacing
-      $mc  ::  mark, spacing combining
-      $me  ::  mark, enclosing
-      $nd  ::  number, decimal digit
-      $nl  ::  number, letter
-      $no  ::  number, other
-      $zs  ::  separator, space
-      $zl  ::  separator, line
-      $zp  ::  separator, paragraph
-      $cc  ::  other, control
-      $cf  ::  other, format
-      $cs  ::  other, surrogate
-      $co  ::  other, private use
-      $cn  ::  other, not assigned
-      ::
-      $lm  ::  letter, modifier
-      $lo  ::  letter, other
-      $pc  ::  punctuation, connector
-      $pd  ::  punctuation, dash
-      $ps  ::  punctuation, open
-      $pe  ::  punctuation, close
-      $pi  ::  punctuation, initial quote
-      $pf  ::  punctuation, final quote
-      $po  ::  punctuation, other
-      $sm  ::  symbol, math
-      $sc  ::  symbol, currency
-      $sk  ::  symbol, modifier
-      $so  ::  symbol, other
-  ==
-::
-++  bidi
-  ::  bidirectional category of a unicode character
-  $?  $l    ::  left-to-right
-      $lre  ::  left-to-right embedding
-      $lri  ::  left-to-right isolate
-      $lro  ::  left-to-right override
-      $fsi  ::  first strong isolate
-      $r    ::  right-to-left
-      $al   ::  right-to-left arabic
-      $rle  ::  right-to-left embedding
-      $rli  ::  right-to-left isolate
-      $rlo  ::  right-to-left override
-      $pdf  ::  pop directional format
-      $pdi  ::  pop directional isolate
-      $en   ::  european number
-      $es   ::  european number separator
-      $et   ::  european number terminator
-      $an   ::  arabic number
-      $cs   ::  common number separator
-      $nsm  ::  non-spacing mark
-      $bn   ::  boundary neutral
-      $b    ::  paragraph separator
-      $s    ::  segment separator
-      $ws   ::  whitespace
-      $on   ::  other neutrals
-  ==
-::
-++  decomp
-  ::  character decomposition mapping.
-  ::
-  ::  tag: type of decomposition.
-  ::  c: a list of codepoints this decomposes into.
-  (unit {tag/(unit decomp-tag) c/(list @c)})
-::
-++  decomp-tag
-  ::  tag that describes the type of a character decomposition.
-  $?  $font      ::  a font variant
-      $nobreak   ::  a no-break version of a space or hyphen
-      $initial   ::  an initial presentation form (arabic)
-      $medial    ::  a medial presentation form (arabic)
-      $final     ::  a final presentation form (arabic)
-      $isolated  ::  an isolated presentation form (arabic)
-      $circle    ::  an encircled form
-      $super     ::  a superscript form
-      $sub       ::  a subscript form
-      $vertical  ::  a vertical layout presentation form
-      $wide      ::  a wide (or zenkaku) compatibility character
-      $narrow    ::  a narrow (or hankaku) compatibility character
-      $small     ::  a small variant form (cns compatibility)
-      $square    ::  a cjk squared font variant
-      $fraction  ::  a vulgar fraction form
-      $compat    ::  otherwise unspecified compatibility character
-  ==
-::
-::  #
-::  #  %case-map
-::  #
-::    types to represent fast lookups of case data
-+|  %case-map
-++  case-offset
-  ::  case offsets can be in either direction
-  $%  ::  add {a} to get the new character
-      [%add a=@u]
-      ::  subtract {a} to get the new character
-      [%sub s=@u]
-      ::  take no action; return self
-      [%none ~]
-      ::  represents series of alternating uppercase/lowercase characters
-      [%uplo ~]
-  ==
-::
-++  case-node
-  ::    a node in a case-tree.
-  ::
-  ::  represents a range of
-  $:  start=@ux
-      end=@ux
-      upper=case-offset
-      lower=case-offset
-      title=case-offset
-  ==
-::
-++  case-tree
-  ::  a binary search tree of ++case-node items, sorted on span.
-  (tree case-node)
--