mirror of
https://github.com/urbit/shrub.git
synced 2025-01-05 19:46:50 +03:00
Remove capitalize
This commit is contained in:
parent
2c4e537b72
commit
b00f78be28
@ -171,8 +171,6 @@
|
||||
++ failing
|
||||
^~ ^- (map path tape)
|
||||
%- my :~ ::TODO don't hardcode
|
||||
::
|
||||
:- /gen/capitalize "wants unicode-data/txt"
|
||||
::
|
||||
:- /gen/al "compiler types out-of-date"
|
||||
:- /gen/musk "compiler types out-of-date"
|
||||
|
@ -1,293 +0,0 @@
|
||||
:: to use, download UnicdoeData.txt and place it in `%/lib/unicode-data/txt`.
|
||||
::
|
||||
::::
|
||||
::
|
||||
:: part 1: parse the file into {uppers}
|
||||
::
|
||||
/- unicode-data
|
||||
/= case-table
|
||||
/; !:
|
||||
=>
|
||||
|%
|
||||
+$ case-fold
|
||||
:: state that's part of the fold which generates the list of case-nodes
|
||||
$: :: resulting data to pass to treeify.
|
||||
out=(list case-node:unicode-data)
|
||||
:: the start of a run of characters; ~ for not active.
|
||||
start=(unit case-state)
|
||||
:: previous character state
|
||||
prev=case-state
|
||||
==
|
||||
::
|
||||
+$ case-state
|
||||
:: a temporary model which we compress later in a second pass.
|
||||
$: point=@c
|
||||
case=case-class
|
||||
upper=case-offset:unicode-data
|
||||
lower=case-offset:unicode-data
|
||||
title=case-offset:unicode-data
|
||||
==
|
||||
::
|
||||
+$ case-class
|
||||
:: classification of an individual character.
|
||||
$? $upper
|
||||
$lower
|
||||
$title
|
||||
$none
|
||||
$missing
|
||||
==
|
||||
--
|
||||
|= a=(list line:unicode-data)
|
||||
::
|
||||
|^ %- build-tree
|
||||
%- flop
|
||||
(build-case-nodes a)
|
||||
::
|
||||
:: #
|
||||
:: # %case-nodes
|
||||
:: #
|
||||
:: transforms raw unicode data into sequential case nodes.
|
||||
+| %case-nodes
|
||||
++ build-case-nodes
|
||||
:: raw list of unicode data lines to a compact list of chardata
|
||||
|= lines=(list line:unicode-data)
|
||||
^- (list case-node:unicode-data)
|
||||
::
|
||||
:: todo: we don't have the final case range in the output of this
|
||||
:: gate. this is because this algorithm doesn't work when the last
|
||||
:: char is part of a range. this doesn't happen with the real one,
|
||||
:: only the excerpts i was using for testing.
|
||||
::
|
||||
=< out
|
||||
=| =case-fold
|
||||
|- ^+ case-fold
|
||||
?~ lines case-fold
|
||||
::
|
||||
=/ state=case-state (line-to-case-state i.lines)
|
||||
::
|
||||
?: (is-adjacent state prev.case-fold)
|
||||
case-fold(prev state)
|
||||
::
|
||||
=. case-fold (add-range case-fold)
|
||||
::
|
||||
%_ case-fold
|
||||
prev state
|
||||
start ?.(?=(?(%missing %none) case.state) ~ `state)
|
||||
==
|
||||
::
|
||||
++ line-to-case-state
|
||||
:: creates an easy to merge form.
|
||||
|= line:unicode-data
|
||||
^- case-state
|
||||
=/ out=case-state
|
||||
[code %none [%none ~] [%none ~] [%none ~]]
|
||||
?: =(code `@c`0)
|
||||
=. case.out %missing
|
||||
out
|
||||
=. case.out
|
||||
?+ gen %none
|
||||
$lu %upper
|
||||
$ll %lower
|
||||
$lt %title
|
||||
==
|
||||
::
|
||||
:: several characters aren't described as $lu or $ll but have lower or
|
||||
:: upper state, such as u+2161. detect this and fix it up.
|
||||
::
|
||||
=? case.out &(=(case.out %none) !=(low ~)) %upper
|
||||
=? case.out &(=(case.out %none) !=(up ~)) %lower
|
||||
::
|
||||
:: calculate offsets
|
||||
::
|
||||
=? upper.out !=(up ~) (calculate-offset (need up) code)
|
||||
=? lower.out !=(low ~)
|
||||
(calculate-offset (need low) code)
|
||||
=? title.out !=(title ~) (calculate-offset (need title) code)
|
||||
out
|
||||
::
|
||||
++ calculate-offset
|
||||
|= [src=@c dst=@c]
|
||||
^- case-offset:unicode-data
|
||||
?: =(src dst)
|
||||
[%none ~]
|
||||
?: (gth src dst)
|
||||
[%add (sub src dst)]
|
||||
[%sub (sub dst src)]
|
||||
::
|
||||
++ is-adjacent
|
||||
:: is {rhs} a continuation of {lhs}?
|
||||
|= [lhs=case-state rhs=case-state]
|
||||
^- ?
|
||||
?: (lth point.rhs point.lhs)
|
||||
$(lhs rhs, rhs lhs)
|
||||
?: !=(point.rhs +(point.lhs))
|
||||
%.n
|
||||
?: !=(case.rhs case.lhs)
|
||||
(upper-lower-adjacent lhs rhs)
|
||||
?: =(case.lhs %none)
|
||||
%.n
|
||||
?: =(case.lhs %missing)
|
||||
%.n
|
||||
?: !=(upper.lhs upper.rhs)
|
||||
%.n
|
||||
?: !=(lower.lhs lower.rhs)
|
||||
%.n
|
||||
?: !=(title.lhs title.rhs)
|
||||
%.n
|
||||
%.y
|
||||
::
|
||||
++ upper-lower-adjacent
|
||||
:: detects %upper-lower spans.
|
||||
::
|
||||
:: is {lhs} the same as {rhs}, but with opposite case?
|
||||
|= [lhs=case-state rhs=case-state]
|
||||
?: &(=(case.lhs %upper) !=(case.rhs %lower))
|
||||
%.n
|
||||
?: &(=(case.lhs %lower) !=(case.rhs %upper))
|
||||
%.n
|
||||
::
|
||||
:: to simplify detection, if things are in the opposite order, redo
|
||||
:: things flipped.
|
||||
::
|
||||
?: =(case.lhs %lower)
|
||||
$(lhs rhs, rhs lhs)
|
||||
?& (is-upper-lower lhs)
|
||||
(is-lower-upper rhs)
|
||||
==
|
||||
::
|
||||
++ is-upper-lower
|
||||
|= i=case-state
|
||||
=(+.+.i [[%none ~] [%add 1] [%none ~]])
|
||||
::
|
||||
++ is-lower-upper
|
||||
|= i=case-state
|
||||
=(+.+.i [[%sub 1] [%none ~] [%sub 1]])
|
||||
::
|
||||
++ is-none
|
||||
|= i=case-state
|
||||
=(+.+.i [[%none ~] [%none ~] [%none ~]])
|
||||
::
|
||||
++ add-range
|
||||
|= c=case-fold
|
||||
^+ c
|
||||
?~ start.c
|
||||
c
|
||||
?: (is-none u.start.c)
|
||||
c
|
||||
?: ?& (gth point.prev.c point.u.start.c)
|
||||
(is-upper-lower u.start.c)
|
||||
==
|
||||
=/ node=case-node:unicode-data
|
||||
[`@ux`point.u.start.c `@ux`point.prev.c [%uplo ~] [%uplo ~] [%uplo ~]]
|
||||
c(out [node out.c])
|
||||
=/ node=case-node:unicode-data
|
||||
[`@ux`point.u.start.c `@ux`point.prev.c +.+.u.start.c]
|
||||
c(out [node out.c])
|
||||
::
|
||||
:: #
|
||||
:: # %tree-building
|
||||
:: #
|
||||
:: builds a binary search tree out of the list
|
||||
+| %tree-building
|
||||
++ build-tree
|
||||
|= a=(list case-node:unicode-data)
|
||||
^- case-tree:unicode-data
|
||||
:: there's probably a bottom up approach that doesn't require walking
|
||||
:: a list over and over again.
|
||||
::
|
||||
:: use ?: instead of ?~ to prevent the TMI problem.
|
||||
::
|
||||
?: =(~ a)
|
||||
~
|
||||
=+ len=(lent a)
|
||||
=/ split-at=@ (div len 2)
|
||||
=/ lhs (scag split-at a)
|
||||
=/ rhs (slag split-at a)
|
||||
?~ rhs
|
||||
?~ lhs
|
||||
~
|
||||
[i.lhs ~ ~]
|
||||
=+ x=[i.rhs $(a lhs) $(a t.rhs)]
|
||||
x
|
||||
--
|
||||
/: /===/lib/unicode-data /&unicode-data&/txt/
|
||||
::
|
||||
:: part 2: utility core
|
||||
::
|
||||
|%
|
||||
++ transform
|
||||
|= [a=tape fun=$-(@c @c)]
|
||||
%- tufa
|
||||
(turn (tuba a) fun)
|
||||
::
|
||||
++ to-upper
|
||||
:: returns the uppercase of unicode codepoint {a}
|
||||
|= a=@c
|
||||
^- @c
|
||||
:: special case ascii to not perform map lookup.
|
||||
?: (lte a max-ascii)
|
||||
?: &((gte a 'a') (lte a 'z'))
|
||||
(sub a 32)
|
||||
a
|
||||
(apply-table a case-table %upper)
|
||||
::
|
||||
++ to-lower
|
||||
:: returns the lowercase of unicode codepoint {a}
|
||||
|= a=@c
|
||||
^- @c
|
||||
?: (lte a max-ascii)
|
||||
?: &((gte a 'A') (lte a 'Z'))
|
||||
(add 32 a)
|
||||
a
|
||||
(apply-table a case-table %lower)
|
||||
::
|
||||
++ apply-table
|
||||
:: searches {table} and apples applies {type} to {a}.
|
||||
::
|
||||
:: this recursively walks the case tree {table}. if it finds an entry which
|
||||
:: matches on {a}, it will apply the offset. otherwise, returns {a}.
|
||||
|= [a=@c table=case-tree:unicode-data type=?($upper $lower $title)]
|
||||
^- @c
|
||||
?~ table
|
||||
a
|
||||
?: (lth a start.n.table)
|
||||
$(table l.table)
|
||||
?: (gth a end.n.table)
|
||||
$(table r.table)
|
||||
?. &((lte start.n.table a) (lte a end.n.table))
|
||||
a
|
||||
%^ apply-offset a type
|
||||
?- type
|
||||
$upper upper.n.table
|
||||
$lower lower.n.table
|
||||
$title title.n.table
|
||||
==
|
||||
::
|
||||
++ apply-offset
|
||||
:: applies an character offset to {a}.
|
||||
|= [a=@c type=?($upper $lower $title) offset=case-offset:unicode-data]
|
||||
^- @c
|
||||
?- offset
|
||||
{$add *} (add a a.offset)
|
||||
{$sub *} (sub a s.offset)
|
||||
{$none *} a
|
||||
::
|
||||
{$uplo *}
|
||||
?- type
|
||||
$upper (sub a 1)
|
||||
$lower (add a 1)
|
||||
$title (sub a 1)
|
||||
==
|
||||
==
|
||||
::
|
||||
++ max-ascii `@c`0x7f
|
||||
--
|
||||
::
|
||||
:: part 3: generator
|
||||
::
|
||||
:- %say
|
||||
|= $: [now=@da eny=@uvJ bec=beak]
|
||||
[n=tape ~]
|
||||
~
|
||||
==
|
||||
:- %tape (transform n to-upper)
|
@ -1,150 +0,0 @@
|
||||
|%
|
||||
:: # %unicode-data
|
||||
:: types to represent UnicdoeData.txt.
|
||||
+| %unicode-data
|
||||
++ line
|
||||
:: an individual codepoint definition
|
||||
::
|
||||
$: code=@c :: codepoint in hexadecimal format
|
||||
name=tape :: character name
|
||||
gen=general :: type of character this is
|
||||
:: canonical combining class for ordering algorithms
|
||||
can=@ud
|
||||
bi=bidi :: bidirectional category of this character
|
||||
de=decomp :: character decomposition mapping
|
||||
:: todo: decimal/digit/numeric need to be parsed.
|
||||
decimal=tape :: decimal digit value (or ~)
|
||||
digit=tape :: digit value, covering non decimal radix forms
|
||||
numeric=tape :: numeric value, including fractions
|
||||
mirrored=? :: whether char is mirrored in bidirectional text
|
||||
old-name=tape :: unicode 1.0 compatibility name
|
||||
iso=tape :: iso 10646 comment field
|
||||
up=(unit @c) :: uppercase mapping codepoint
|
||||
low=(unit @c) :: lowercase mapping codepoint
|
||||
title=(unit @c) :: titlecase mapping codepoint
|
||||
==
|
||||
::
|
||||
++ general
|
||||
:: one of the normative or informative unicode general categories
|
||||
::
|
||||
:: these abbreviations are as found in the unicode standard, except
|
||||
:: lowercased as to be valid symbols.
|
||||
$? $lu :: letter, uppercase
|
||||
$ll :: letter, lowercase
|
||||
$lt :: letter, titlecase
|
||||
$mn :: mark, non-spacing
|
||||
$mc :: mark, spacing combining
|
||||
$me :: mark, enclosing
|
||||
$nd :: number, decimal digit
|
||||
$nl :: number, letter
|
||||
$no :: number, other
|
||||
$zs :: separator, space
|
||||
$zl :: separator, line
|
||||
$zp :: separator, paragraph
|
||||
$cc :: other, control
|
||||
$cf :: other, format
|
||||
$cs :: other, surrogate
|
||||
$co :: other, private use
|
||||
$cn :: other, not assigned
|
||||
::
|
||||
$lm :: letter, modifier
|
||||
$lo :: letter, other
|
||||
$pc :: punctuation, connector
|
||||
$pd :: punctuation, dash
|
||||
$ps :: punctuation, open
|
||||
$pe :: punctuation, close
|
||||
$pi :: punctuation, initial quote
|
||||
$pf :: punctuation, final quote
|
||||
$po :: punctuation, other
|
||||
$sm :: symbol, math
|
||||
$sc :: symbol, currency
|
||||
$sk :: symbol, modifier
|
||||
$so :: symbol, other
|
||||
==
|
||||
::
|
||||
++ bidi
|
||||
:: bidirectional category of a unicode character
|
||||
$? $l :: left-to-right
|
||||
$lre :: left-to-right embedding
|
||||
$lri :: left-to-right isolate
|
||||
$lro :: left-to-right override
|
||||
$fsi :: first strong isolate
|
||||
$r :: right-to-left
|
||||
$al :: right-to-left arabic
|
||||
$rle :: right-to-left embedding
|
||||
$rli :: right-to-left isolate
|
||||
$rlo :: right-to-left override
|
||||
$pdf :: pop directional format
|
||||
$pdi :: pop directional isolate
|
||||
$en :: european number
|
||||
$es :: european number separator
|
||||
$et :: european number terminator
|
||||
$an :: arabic number
|
||||
$cs :: common number separator
|
||||
$nsm :: non-spacing mark
|
||||
$bn :: boundary neutral
|
||||
$b :: paragraph separator
|
||||
$s :: segment separator
|
||||
$ws :: whitespace
|
||||
$on :: other neutrals
|
||||
==
|
||||
::
|
||||
++ decomp
|
||||
:: character decomposition mapping.
|
||||
::
|
||||
:: tag: type of decomposition.
|
||||
:: c: a list of codepoints this decomposes into.
|
||||
(unit {tag/(unit decomp-tag) c/(list @c)})
|
||||
::
|
||||
++ decomp-tag
|
||||
:: tag that describes the type of a character decomposition.
|
||||
$? $font :: a font variant
|
||||
$nobreak :: a no-break version of a space or hyphen
|
||||
$initial :: an initial presentation form (arabic)
|
||||
$medial :: a medial presentation form (arabic)
|
||||
$final :: a final presentation form (arabic)
|
||||
$isolated :: an isolated presentation form (arabic)
|
||||
$circle :: an encircled form
|
||||
$super :: a superscript form
|
||||
$sub :: a subscript form
|
||||
$vertical :: a vertical layout presentation form
|
||||
$wide :: a wide (or zenkaku) compatibility character
|
||||
$narrow :: a narrow (or hankaku) compatibility character
|
||||
$small :: a small variant form (cns compatibility)
|
||||
$square :: a cjk squared font variant
|
||||
$fraction :: a vulgar fraction form
|
||||
$compat :: otherwise unspecified compatibility character
|
||||
==
|
||||
::
|
||||
:: #
|
||||
:: # %case-map
|
||||
:: #
|
||||
:: types to represent fast lookups of case data
|
||||
+| %case-map
|
||||
++ case-offset
|
||||
:: case offsets can be in either direction
|
||||
$% :: add {a} to get the new character
|
||||
[%add a=@u]
|
||||
:: subtract {a} to get the new character
|
||||
[%sub s=@u]
|
||||
:: take no action; return self
|
||||
[%none ~]
|
||||
:: represents series of alternating uppercase/lowercase characters
|
||||
[%uplo ~]
|
||||
==
|
||||
::
|
||||
++ case-node
|
||||
:: a node in a case-tree.
|
||||
::
|
||||
:: represents a range of
|
||||
$: start=@ux
|
||||
end=@ux
|
||||
upper=case-offset
|
||||
lower=case-offset
|
||||
title=case-offset
|
||||
==
|
||||
::
|
||||
++ case-tree
|
||||
:: a binary search tree of ++case-node items, sorted on span.
|
||||
(tree case-node)
|
||||
--
|
Loading…
Reference in New Issue
Block a user