mirror of
https://github.com/ilyakooo0/urbit.git
synced 2025-01-07 07:30:23 +03:00
Complete prototype for a unicode aware ++to-{upper,lower}.
This uses a mark to parse UnicodeData.txt, and some ford runes to change this into a binary search tree data structure for quick lookups, along with the optimizations found in golang's unicode table to record ranges instead of individual characters.
This commit is contained in:
parent
25accb9eba
commit
ccfb11bda9
@ -2,21 +2,203 @@
|
|||||||
:: part 1: parse the file into {uppers}
|
:: part 1: parse the file into {uppers}
|
||||||
::
|
::
|
||||||
/- unicode-data
|
/- unicode-data
|
||||||
::
|
/+ new-hoon
|
||||||
:: while this works, it'd be better to build range based data structures like
|
/= case-table
|
||||||
:: golang does. golang uses flat tables that it binary searches over. storage
|
|
||||||
:: as a binary tree?
|
|
||||||
::
|
|
||||||
/= uppers
|
|
||||||
/; |= a/(list line:unicode-data)
|
/; |= a/(list line:unicode-data)
|
||||||
=| ret/(map @c @c)
|
=, new-hoon
|
||||||
|-
|
|^ %- build-tree
|
||||||
^- (map @c @c)
|
%- flop
|
||||||
?~ a
|
(build-case-nodes a)
|
||||||
ret
|
::
|
||||||
?~ up.i.a
|
:> #
|
||||||
$(a t.a)
|
:> # %case-nodes
|
||||||
$(a t.a, ret (~(put by ret) code.i.a u.up.i.a))
|
:> #
|
||||||
|
:> transforms raw unicode data into sequential case nodes.
|
||||||
|
+|
|
||||||
|
++ build-case-nodes
|
||||||
|
:> raw list of unicode data lines to a compact list of chardata
|
||||||
|
|= a/(list line:unicode-data)
|
||||||
|
^- (list case-node:unicode-data)
|
||||||
|
=< out
|
||||||
|
::
|
||||||
|
:: todo: we don't have the final case range in the output of this
|
||||||
|
:: gate. this is because this algorithm doesn't work when the last
|
||||||
|
:: char is part of a range. this doesn't happen with the real one,
|
||||||
|
:: only the excerpts i was using for testing.
|
||||||
|
::
|
||||||
|
%^ foldl:ls a *case-fold
|
||||||
|
|= {c/case-fold l/line:unicode-data}
|
||||||
|
^+ c
|
||||||
|
=+ state=(line-to-case-state l)
|
||||||
|
?: (is-adjacent state prev.c)
|
||||||
|
c(prev state)
|
||||||
|
=. c (add-range c)
|
||||||
|
%= c
|
||||||
|
start
|
||||||
|
?: &(!=(case.state %missing) !=(case.state %none))
|
||||||
|
`state
|
||||||
|
~
|
||||||
|
prev state
|
||||||
|
==
|
||||||
|
::
|
||||||
|
++ line-to-case-state
|
||||||
|
:> creates an easy to merge form.
|
||||||
|
|= line:unicode-data
|
||||||
|
^- case-state
|
||||||
|
=/ out/case-state
|
||||||
|
[code %none [%none ~] [%none ~] [%none ~]]
|
||||||
|
?: =(code `@c`0)
|
||||||
|
=. case.out %missing
|
||||||
|
out
|
||||||
|
=. case.out
|
||||||
|
?+ gen %none
|
||||||
|
$lu %upper
|
||||||
|
$ll %lower
|
||||||
|
$lt %title
|
||||||
|
==
|
||||||
|
::
|
||||||
|
:: several characters aren't described as $lu or $ll but have lower or
|
||||||
|
:: upper state, such as u+2161. detect this and fix it up.
|
||||||
|
::
|
||||||
|
=? case.out &(=(case.out %none) !=(low ~)) %upper
|
||||||
|
=? case.out &(=(case.out %none) !=(up ~)) %lower
|
||||||
|
::
|
||||||
|
:: calculate offsets
|
||||||
|
::
|
||||||
|
=? upper.out !=(up ~) (calculate-offset (need up) code)
|
||||||
|
=? lower.out !=(low ~)
|
||||||
|
(calculate-offset (need low) code)
|
||||||
|
=? title.out !=(title ~) (calculate-offset (need title) code)
|
||||||
|
out
|
||||||
|
::
|
||||||
|
++ calculate-offset
|
||||||
|
|= {src/@c dst/@c}
|
||||||
|
^- case-offset:unicode-data
|
||||||
|
?: =(src dst)
|
||||||
|
[%none ~]
|
||||||
|
?: (gth src dst)
|
||||||
|
[%add (sub src dst)]
|
||||||
|
[%sub (sub dst src)]
|
||||||
|
::
|
||||||
|
++ is-adjacent
|
||||||
|
:> is {rhs} a continuation of {lhs}?
|
||||||
|
|= {lhs/case-state rhs/case-state}
|
||||||
|
^- ?
|
||||||
|
?: (lth point.rhs point.lhs)
|
||||||
|
$(lhs rhs, rhs lhs)
|
||||||
|
?: !=(point.rhs +(point.lhs))
|
||||||
|
%.n
|
||||||
|
?: !=(case.rhs case.lhs)
|
||||||
|
(upper-lower-adjacent lhs rhs)
|
||||||
|
?: =(case.lhs %none)
|
||||||
|
%.n
|
||||||
|
?: =(case.lhs %missing)
|
||||||
|
%.n
|
||||||
|
?: !=(upper.lhs upper.rhs)
|
||||||
|
%.n
|
||||||
|
?: !=(lower.lhs lower.rhs)
|
||||||
|
%.n
|
||||||
|
?: !=(title.lhs title.rhs)
|
||||||
|
%.n
|
||||||
|
%.y
|
||||||
|
::
|
||||||
|
++ upper-lower-adjacent
|
||||||
|
:> detects %upper-lower spans.
|
||||||
|
:>
|
||||||
|
:> is {lhs} the same as {rhs}, but with opposite case?
|
||||||
|
|= {lhs/case-state rhs/case-state}
|
||||||
|
?: &(=(case.lhs %upper) !=(case.rhs %lower))
|
||||||
|
%.n
|
||||||
|
?: &(=(case.lhs %lower) !=(case.rhs %upper))
|
||||||
|
%.n
|
||||||
|
::
|
||||||
|
:: to simplify detection, if things are in the opposite order, redo
|
||||||
|
:: things flipped.
|
||||||
|
::
|
||||||
|
?: =(case.lhs %lower)
|
||||||
|
$(lhs rhs, rhs lhs)
|
||||||
|
?& (is-upper-lower lhs)
|
||||||
|
(is-lower-upper rhs)
|
||||||
|
==
|
||||||
|
::
|
||||||
|
++ is-upper-lower
|
||||||
|
|= i/case-state
|
||||||
|
=(+.+.i [[%none ~] [%add 1] [%none ~]])
|
||||||
|
::
|
||||||
|
++ is-lower-upper
|
||||||
|
|= i/case-state
|
||||||
|
=(+.+.i [[%sub 1] [%none ~] [%sub 1]])
|
||||||
|
::
|
||||||
|
++ is-none
|
||||||
|
|= i/case-state
|
||||||
|
=(+.+.i [[%none ~] [%none ~] [%none ~]])
|
||||||
|
::
|
||||||
|
++ add-range
|
||||||
|
|= c/case-fold
|
||||||
|
^+ c
|
||||||
|
?~ start.c
|
||||||
|
c
|
||||||
|
?: (is-none u.start.c)
|
||||||
|
c
|
||||||
|
?: ?& (gth point.prev.c point.u.start.c)
|
||||||
|
(is-upper-lower u.start.c)
|
||||||
|
==
|
||||||
|
=/ node/case-node:unicode-data
|
||||||
|
[`@ux`point.u.start.c `@ux`point.prev.c [%uplo ~] [%uplo ~] [%uplo ~]]
|
||||||
|
c(out [node out.c])
|
||||||
|
=/ node/case-node:unicode-data
|
||||||
|
[`@ux`point.u.start.c `@ux`point.prev.c +.+.u.start.c]
|
||||||
|
c(out [node out.c])
|
||||||
|
::
|
||||||
|
++ case-fold
|
||||||
|
:> state that's part of the fold which generates the list of case-nodes
|
||||||
|
$: :> resulting data to pass to treeify.
|
||||||
|
out/(list case-node:unicode-data)
|
||||||
|
:> the start of a run of characters; ~ for not active.
|
||||||
|
start/(unit case-state)
|
||||||
|
:> previous character state
|
||||||
|
prev/case-state
|
||||||
|
==
|
||||||
|
::
|
||||||
|
++ case-state
|
||||||
|
:> a temporary model which we compress later in a second pass.
|
||||||
|
$: point/@c
|
||||||
|
case/case-class
|
||||||
|
upper/case-offset:unicode-data
|
||||||
|
lower/case-offset:unicode-data
|
||||||
|
title/case-offset:unicode-data
|
||||||
|
==
|
||||||
|
::
|
||||||
|
++ case-class
|
||||||
|
:> classification of an individual character.
|
||||||
|
$? $upper
|
||||||
|
$lower
|
||||||
|
$title
|
||||||
|
$none
|
||||||
|
$missing
|
||||||
|
==
|
||||||
|
::
|
||||||
|
:> #
|
||||||
|
:> # %tree-building
|
||||||
|
:> #
|
||||||
|
:> builds a binary search tree out of the list
|
||||||
|
+|
|
||||||
|
++ build-tree
|
||||||
|
|= a/(list case-node:unicode-data)
|
||||||
|
^- case-tree:unicode-data
|
||||||
|
:: there's probably a bottom up approach that doesn't require walking
|
||||||
|
:: a list over and over again.
|
||||||
|
?~ a
|
||||||
|
~
|
||||||
|
=+ len=(lent a)
|
||||||
|
=+ [lhs rhs]=(split-at:ls (div len 2) a)
|
||||||
|
?~ rhs
|
||||||
|
?~ lhs
|
||||||
|
~
|
||||||
|
[i.lhs ~ ~]
|
||||||
|
=+ x=[i.rhs $(a lhs) $(a t.rhs)]
|
||||||
|
x
|
||||||
|
--
|
||||||
/: /===/lib/unicode-data /&unicode-data&/txt/
|
/: /===/lib/unicode-data /&unicode-data&/txt/
|
||||||
::
|
::
|
||||||
:: part 2: utility core
|
:: part 2: utility core
|
||||||
@ -28,6 +210,7 @@
|
|||||||
(turn (tuba a) fun)
|
(turn (tuba a) fun)
|
||||||
::
|
::
|
||||||
++ to-upper
|
++ to-upper
|
||||||
|
:> returns the uppercase of unicode codepoint {a}
|
||||||
|= a/@c
|
|= a/@c
|
||||||
^- @c
|
^- @c
|
||||||
:: special case ascii to not perform map lookup.
|
:: special case ascii to not perform map lookup.
|
||||||
@ -35,8 +218,56 @@
|
|||||||
?: &((gte a 'a') (lte a 'z'))
|
?: &((gte a 'a') (lte a 'z'))
|
||||||
(sub a 32)
|
(sub a 32)
|
||||||
a
|
a
|
||||||
=+ x=(~(get by uppers) a)
|
(apply-table a case-table %upper)
|
||||||
(fall x a)
|
::
|
||||||
|
++ to-lower
|
||||||
|
:> returns the lowercase of unicode codepoint {a}
|
||||||
|
|= a/@c
|
||||||
|
^- @c
|
||||||
|
?: (lte a max-ascii)
|
||||||
|
?: &((get a 'A') (lte a 'Z'))
|
||||||
|
(add 32 a)
|
||||||
|
a
|
||||||
|
(apply-table a case-table %lower)
|
||||||
|
::
|
||||||
|
++ apply-table
|
||||||
|
:> searches {table} and apples applies {type} to {a}.
|
||||||
|
:>
|
||||||
|
:> this recursively walks the case tree {table}. if it finds an entry which
|
||||||
|
:> matches on {a}, it will apply the offset. otherwise, returns {a}.
|
||||||
|
|= {a/@c table/case-tree:unicode-data type/?($upper $lower $title)}
|
||||||
|
^- @c
|
||||||
|
?~ table
|
||||||
|
a
|
||||||
|
?: (lth a start.n.table)
|
||||||
|
$(table l.table)
|
||||||
|
?: (gth a end.n.table)
|
||||||
|
$(table r.table)
|
||||||
|
?. &((lte start.n.table a) (lte a end.n.table))
|
||||||
|
a
|
||||||
|
%^ apply-offset a type
|
||||||
|
?- type
|
||||||
|
$upper upper.n.table
|
||||||
|
$lower lower.n.table
|
||||||
|
$title title.n.table
|
||||||
|
==
|
||||||
|
::
|
||||||
|
++ apply-offset
|
||||||
|
:> applies an character offset to {a}.
|
||||||
|
|= {a/@c type/?($upper $lower $title) offset/case-offset:unicode-data}
|
||||||
|
^- @c
|
||||||
|
?- offset
|
||||||
|
{$add *} (add a a.offset)
|
||||||
|
{$sub *} (sub a s.offset)
|
||||||
|
{$none *} a
|
||||||
|
::
|
||||||
|
{$uplo *}
|
||||||
|
?- type
|
||||||
|
$upper (sub a 1)
|
||||||
|
$lower (add a 1)
|
||||||
|
$title (sub a 1)
|
||||||
|
==
|
||||||
|
==
|
||||||
::
|
::
|
||||||
++ max-ascii `@c`0x7f
|
++ max-ascii `@c`0x7f
|
||||||
--
|
--
|
||||||
@ -48,5 +279,4 @@
|
|||||||
{n/tape $~}
|
{n/tape $~}
|
||||||
$~
|
$~
|
||||||
==
|
==
|
||||||
:- %noun
|
:- %tape (transform n to-upper)
|
||||||
(transform n to-upper)
|
|
||||||
|
@ -1,24 +1,8 @@
|
|||||||
/- unicode-data
|
/- unicode-data
|
||||||
=, eyre
|
=, eyre
|
||||||
=, format
|
=, format
|
||||||
|
|
||||||
:: ok, so we can currently slurp the data in. we're having problems getting it
|
|
||||||
:: back out. ++txt:grow is probably the next logical step here.
|
|
||||||
|
|
||||||
|_ all/(list line:unicode-data)
|
|
||||||
++ grow
|
|
||||||
:> converts from unicode-data to mark.
|
|
||||||
|%
|
|
||||||
:: ++ txt
|
|
||||||
:: ^- wain
|
|
||||||
:: %+ turn all
|
|
||||||
:: |= line:unicode-data
|
|
||||||
:: ;: weld
|
|
||||||
:: ";"
|
|
||||||
:: name
|
|
||||||
:: ";"
|
|
||||||
--
|
|
||||||
::
|
::
|
||||||
|
|_ all/(list line:unicode-data)
|
||||||
++ grab
|
++ grab
|
||||||
:> converts from mark to unicode-data.
|
:> converts from mark to unicode-data.
|
||||||
|%
|
|%
|
||||||
@ -56,7 +40,6 @@
|
|||||||
%+ cook
|
%+ cook
|
||||||
|=(a/tape a)
|
|=(a/tape a)
|
||||||
(star ;~(less sem prn))
|
(star ;~(less sem prn))
|
||||||
:: (star ;~(pose hig low nud hep ace gal gar pel per))
|
|
||||||
::
|
::
|
||||||
:> parses a unicode general category abbreviation to symbol
|
:> parses a unicode general category abbreviation to symbol
|
||||||
++ general-category
|
++ general-category
|
||||||
|
@ -1,4 +1,7 @@
|
|||||||
|%
|
|%
|
||||||
|
:> # %unicode-data
|
||||||
|
:> types to represent UnicdoeData.txt.
|
||||||
|
+|
|
||||||
++ line
|
++ line
|
||||||
:> an individual codepoint definition
|
:> an individual codepoint definition
|
||||||
:>
|
:>
|
||||||
@ -127,4 +130,36 @@
|
|||||||
$fraction :< a vulgar fraction form
|
$fraction :< a vulgar fraction form
|
||||||
$compat :< otherwise unspecified compatibility character
|
$compat :< otherwise unspecified compatibility character
|
||||||
==
|
==
|
||||||
|
::
|
||||||
|
:> #
|
||||||
|
:> # %case-map
|
||||||
|
:> #
|
||||||
|
:> types to represent fast lookups of case data
|
||||||
|
+|
|
||||||
|
++ case-offset
|
||||||
|
:> case offsets can be in either direction
|
||||||
|
$% :> add {a} to get the new character
|
||||||
|
{$add a/@u}
|
||||||
|
:> subtract {a} to get the new character
|
||||||
|
{$sub s/@u}
|
||||||
|
:> take no action; return self
|
||||||
|
{$none $~}
|
||||||
|
:> represents series of alternating uppercase/lowercase characters
|
||||||
|
{$uplo $~}
|
||||||
|
==
|
||||||
|
::
|
||||||
|
++ case-node
|
||||||
|
:> a node in a case-tree.
|
||||||
|
:>
|
||||||
|
:> represents a range of
|
||||||
|
$: start/@ux
|
||||||
|
end/@ux
|
||||||
|
upper/case-offset
|
||||||
|
lower/case-offset
|
||||||
|
title/case-offset
|
||||||
|
==
|
||||||
|
::
|
||||||
|
++ case-tree
|
||||||
|
:> a binary search tree of ++case-node items, sorted on span.
|
||||||
|
(tree case-node)
|
||||||
--
|
--
|
||||||
|
Loading…
Reference in New Issue
Block a user