shrub/gen/capitalize.hoon

294 lines
7.8 KiB
Plaintext
Raw Normal View History

:: to use, download UnicdoeData.txt and place it in `%/lib/unicode-data/txt`.
::
::::
::
:: part 1: parse the file into {uppers}
::
/- unicode-data
/= case-table
2018-08-24 01:03:55 +03:00
/; !:
=>
|%
+$ case-fold
:: state that's part of the fold which generates the list of case-nodes
$: :: resulting data to pass to treeify.
out=(list case-node:unicode-data)
:: the start of a run of characters; ~ for not active.
start=(unit case-state)
:: previous character state
prev=case-state
==
::
+$ case-state
:: a temporary model which we compress later in a second pass.
$: point=@c
case=case-class
upper=case-offset:unicode-data
lower=case-offset:unicode-data
title=case-offset:unicode-data
==
::
+$ case-class
:: classification of an individual character.
$? $upper
$lower
$title
$none
$missing
==
--
|= a=(list line:unicode-data)
::
|^ %- build-tree
%- flop
(build-case-nodes a)
::
2018-05-25 01:39:56 +03:00
:: #
:: # %case-nodes
:: #
:: transforms raw unicode data into sequential case nodes.
2018-05-29 09:42:16 +03:00
+| %case-nodes
++ build-case-nodes
2018-05-25 01:39:56 +03:00
:: raw list of unicode data lines to a compact list of chardata
2018-08-24 01:03:55 +03:00
|= lines=(list line:unicode-data)
^- (list case-node:unicode-data)
::
:: todo: we don't have the final case range in the output of this
:: gate. this is because this algorithm doesn't work when the last
:: char is part of a range. this doesn't happen with the real one,
:: only the excerpts i was using for testing.
::
2018-08-24 01:03:55 +03:00
=< out
=| =case-fold
|- ^+ case-fold
?~ lines case-fold
::
=/ state=case-state (line-to-case-state i.lines)
::
?: (is-adjacent state prev.case-fold)
case-fold(prev state)
::
=. case-fold (add-range case-fold)
::
%_ case-fold
prev state
start ?.(?=(?(%missing %none) case.state) ~ `state)
==
::
++ line-to-case-state
2018-05-25 01:39:56 +03:00
:: creates an easy to merge form.
|= line:unicode-data
^- case-state
2018-01-19 09:19:48 +03:00
=/ out=case-state
[code %none [%none ~] [%none ~] [%none ~]]
?: =(code `@c`0)
=. case.out %missing
out
=. case.out
?+ gen %none
$lu %upper
$ll %lower
$lt %title
==
::
:: several characters aren't described as $lu or $ll but have lower or
:: upper state, such as u+2161. detect this and fix it up.
::
=? case.out &(=(case.out %none) !=(low ~)) %upper
=? case.out &(=(case.out %none) !=(up ~)) %lower
::
:: calculate offsets
::
=? upper.out !=(up ~) (calculate-offset (need up) code)
=? lower.out !=(low ~)
(calculate-offset (need low) code)
=? title.out !=(title ~) (calculate-offset (need title) code)
out
::
++ calculate-offset
2018-01-19 09:19:48 +03:00
|= [src=@c dst=@c]
^- case-offset:unicode-data
?: =(src dst)
[%none ~]
?: (gth src dst)
[%add (sub src dst)]
[%sub (sub dst src)]
::
++ is-adjacent
2018-05-25 01:39:56 +03:00
:: is {rhs} a continuation of {lhs}?
2018-01-19 09:19:48 +03:00
|= [lhs=case-state rhs=case-state]
^- ?
?: (lth point.rhs point.lhs)
$(lhs rhs, rhs lhs)
?: !=(point.rhs +(point.lhs))
%.n
?: !=(case.rhs case.lhs)
(upper-lower-adjacent lhs rhs)
?: =(case.lhs %none)
%.n
?: =(case.lhs %missing)
%.n
?: !=(upper.lhs upper.rhs)
%.n
?: !=(lower.lhs lower.rhs)
%.n
?: !=(title.lhs title.rhs)
%.n
%.y
::
++ upper-lower-adjacent
2018-05-25 01:39:56 +03:00
:: detects %upper-lower spans.
::
:: is {lhs} the same as {rhs}, but with opposite case?
2018-01-19 09:19:48 +03:00
|= [lhs=case-state rhs=case-state]
?: &(=(case.lhs %upper) !=(case.rhs %lower))
%.n
?: &(=(case.lhs %lower) !=(case.rhs %upper))
%.n
::
:: to simplify detection, if things are in the opposite order, redo
:: things flipped.
::
?: =(case.lhs %lower)
$(lhs rhs, rhs lhs)
?& (is-upper-lower lhs)
(is-lower-upper rhs)
==
::
++ is-upper-lower
2018-01-19 09:19:48 +03:00
|= i=case-state
=(+.+.i [[%none ~] [%add 1] [%none ~]])
::
++ is-lower-upper
2018-01-19 09:19:48 +03:00
|= i=case-state
=(+.+.i [[%sub 1] [%none ~] [%sub 1]])
::
++ is-none
2018-01-19 09:19:48 +03:00
|= i=case-state
=(+.+.i [[%none ~] [%none ~] [%none ~]])
::
++ add-range
2018-01-19 09:19:48 +03:00
|= c=case-fold
^+ c
?~ start.c
c
?: (is-none u.start.c)
c
?: ?& (gth point.prev.c point.u.start.c)
(is-upper-lower u.start.c)
==
2018-01-19 09:19:48 +03:00
=/ node=case-node:unicode-data
[`@ux`point.u.start.c `@ux`point.prev.c [%uplo ~] [%uplo ~] [%uplo ~]]
c(out [node out.c])
2018-01-19 09:19:48 +03:00
=/ node=case-node:unicode-data
[`@ux`point.u.start.c `@ux`point.prev.c +.+.u.start.c]
c(out [node out.c])
::
2018-05-25 01:39:56 +03:00
:: #
:: # %tree-building
:: #
:: builds a binary search tree out of the list
2018-05-29 09:42:16 +03:00
+| %tree-building
++ build-tree
2018-01-19 09:19:48 +03:00
|= a=(list case-node:unicode-data)
^- case-tree:unicode-data
:: there's probably a bottom up approach that doesn't require walking
:: a list over and over again.
2018-08-24 01:03:55 +03:00
::
:: use ?: instead of ?~ to prevent the TMI problem.
::
?: =(~ a)
~
=+ len=(lent a)
2018-08-24 01:03:55 +03:00
=/ split-at=@ (div len 2)
=/ lhs (scag split-at a)
=/ rhs (slag split-at a)
?~ rhs
?~ lhs
~
[i.lhs ~ ~]
=+ x=[i.rhs $(a lhs) $(a t.rhs)]
x
--
/: /===/lib/unicode-data /&unicode-data&/txt/
::
:: part 2: utility core
::
|%
++ transform
2018-01-19 09:19:48 +03:00
|= [a=tape fun=$-(@c @c)]
%- tufa
(turn (tuba a) fun)
::
++ to-upper
2018-05-25 01:39:56 +03:00
:: returns the uppercase of unicode codepoint {a}
2018-01-19 09:19:48 +03:00
|= a=@c
^- @c
:: special case ascii to not perform map lookup.
?: (lte a max-ascii)
?: &((gte a 'a') (lte a 'z'))
(sub a 32)
a
(apply-table a case-table %upper)
::
++ to-lower
2018-05-25 01:39:56 +03:00
:: returns the lowercase of unicode codepoint {a}
2018-01-19 09:19:48 +03:00
|= a=@c
^- @c
?: (lte a max-ascii)
2018-01-19 09:19:48 +03:00
?: &((gte a 'A') (lte a 'Z'))
(add 32 a)
a
(apply-table a case-table %lower)
::
++ apply-table
2018-05-25 01:39:56 +03:00
:: searches {table} and apples applies {type} to {a}.
::
:: this recursively walks the case tree {table}. if it finds an entry which
:: matches on {a}, it will apply the offset. otherwise, returns {a}.
2018-01-19 09:19:48 +03:00
|= [a=@c table=case-tree:unicode-data type=?($upper $lower $title)]
^- @c
?~ table
a
?: (lth a start.n.table)
$(table l.table)
?: (gth a end.n.table)
$(table r.table)
?. &((lte start.n.table a) (lte a end.n.table))
a
%^ apply-offset a type
?- type
$upper upper.n.table
$lower lower.n.table
$title title.n.table
==
::
++ apply-offset
2018-05-25 01:39:56 +03:00
:: applies an character offset to {a}.
2018-01-19 09:19:48 +03:00
|= [a=@c type=?($upper $lower $title) offset=case-offset:unicode-data]
^- @c
?- offset
{$add *} (add a a.offset)
{$sub *} (sub a s.offset)
{$none *} a
::
{$uplo *}
?- type
$upper (sub a 1)
$lower (add a 1)
$title (sub a 1)
==
==
::
++ max-ascii `@c`0x7f
--
::
:: part 3: generator
::
:- %say
2018-01-19 09:19:48 +03:00
|= $: [now=@da eny=@uvJ bec=beak]
2018-03-19 07:18:20 +03:00
[n=tape ~]
~
==
:- %tape (transform n to-upper)