From ccfb11bda9030601f6679d474d0f7fde87b28bd9 Mon Sep 17 00:00:00 2001 From: Elliot Glaysher Date: Thu, 12 Oct 2017 22:55:45 -0700 Subject: [PATCH] Complete prototype for a unicode aware ++to-{upper,lower}. This uses a mark to parse UnicodeData.txt, and some ford runes to change this into a binary search tree data structure for quick lookups, along with the optimizations found in golang's unicode table to record ranges instead of individual characters. --- gen/capitalize.hoon | 266 +++++++++++++++++++++++++++++++++++++++--- mar/unicode-data.hoon | 19 +-- sur/unicode-data.hoon | 35 ++++++ 3 files changed, 284 insertions(+), 36 deletions(-) diff --git a/gen/capitalize.hoon b/gen/capitalize.hoon index 3b755d633..49303d2dc 100644 --- a/gen/capitalize.hoon +++ b/gen/capitalize.hoon @@ -2,21 +2,203 @@ :: part 1: parse the file into {uppers} :: /- unicode-data -:: -:: while this works, it'd be better to build range based data structures like -:: golang does. golang uses flat tables that it binary searches over. storage -:: as a binary tree? -:: -/= uppers +/+ new-hoon +/= case-table /; |= a/(list line:unicode-data) - =| ret/(map @c @c) - |- - ^- (map @c @c) - ?~ a - ret - ?~ up.i.a - $(a t.a) - $(a t.a, ret (~(put by ret) code.i.a u.up.i.a)) + =, new-hoon + |^ %- build-tree + %- flop + (build-case-nodes a) + :: + :> # + :> # %case-nodes + :> # + :> transforms raw unicode data into sequential case nodes. + +| + ++ build-case-nodes + :> raw list of unicode data lines to a compact list of chardata + |= a/(list line:unicode-data) + ^- (list case-node:unicode-data) + =< out + :: + :: todo: we don't have the final case range in the output of this + :: gate. this is because this algorithm doesn't work when the last + :: char is part of a range. this doesn't happen with the real one, + :: only the excerpts i was using for testing. + :: + %^ foldl:ls a *case-fold + |= {c/case-fold l/line:unicode-data} + ^+ c + =+ state=(line-to-case-state l) + ?: (is-adjacent state prev.c) + c(prev state) + =. c (add-range c) + %= c + start + ?: &(!=(case.state %missing) !=(case.state %none)) + `state + ~ + prev state + == + :: + ++ line-to-case-state + :> creates an easy to merge form. + |= line:unicode-data + ^- case-state + =/ out/case-state + [code %none [%none ~] [%none ~] [%none ~]] + ?: =(code `@c`0) + =. case.out %missing + out + =. case.out + ?+ gen %none + $lu %upper + $ll %lower + $lt %title + == + :: + :: several characters aren't described as $lu or $ll but have lower or + :: upper state, such as u+2161. detect this and fix it up. + :: + =? case.out &(=(case.out %none) !=(low ~)) %upper + =? case.out &(=(case.out %none) !=(up ~)) %lower + :: + :: calculate offsets + :: + =? upper.out !=(up ~) (calculate-offset (need up) code) + =? lower.out !=(low ~) + (calculate-offset (need low) code) + =? title.out !=(title ~) (calculate-offset (need title) code) + out + :: + ++ calculate-offset + |= {src/@c dst/@c} + ^- case-offset:unicode-data + ?: =(src dst) + [%none ~] + ?: (gth src dst) + [%add (sub src dst)] + [%sub (sub dst src)] + :: + ++ is-adjacent + :> is {rhs} a continuation of {lhs}? + |= {lhs/case-state rhs/case-state} + ^- ? + ?: (lth point.rhs point.lhs) + $(lhs rhs, rhs lhs) + ?: !=(point.rhs +(point.lhs)) + %.n + ?: !=(case.rhs case.lhs) + (upper-lower-adjacent lhs rhs) + ?: =(case.lhs %none) + %.n + ?: =(case.lhs %missing) + %.n + ?: !=(upper.lhs upper.rhs) + %.n + ?: !=(lower.lhs lower.rhs) + %.n + ?: !=(title.lhs title.rhs) + %.n + %.y + :: + ++ upper-lower-adjacent + :> detects %upper-lower spans. + :> + :> is {lhs} the same as {rhs}, but with opposite case? + |= {lhs/case-state rhs/case-state} + ?: &(=(case.lhs %upper) !=(case.rhs %lower)) + %.n + ?: &(=(case.lhs %lower) !=(case.rhs %upper)) + %.n + :: + :: to simplify detection, if things are in the opposite order, redo + :: things flipped. + :: + ?: =(case.lhs %lower) + $(lhs rhs, rhs lhs) + ?& (is-upper-lower lhs) + (is-lower-upper rhs) + == + :: + ++ is-upper-lower + |= i/case-state + =(+.+.i [[%none ~] [%add 1] [%none ~]]) + :: + ++ is-lower-upper + |= i/case-state + =(+.+.i [[%sub 1] [%none ~] [%sub 1]]) + :: + ++ is-none + |= i/case-state + =(+.+.i [[%none ~] [%none ~] [%none ~]]) + :: + ++ add-range + |= c/case-fold + ^+ c + ?~ start.c + c + ?: (is-none u.start.c) + c + ?: ?& (gth point.prev.c point.u.start.c) + (is-upper-lower u.start.c) + == + =/ node/case-node:unicode-data + [`@ux`point.u.start.c `@ux`point.prev.c [%uplo ~] [%uplo ~] [%uplo ~]] + c(out [node out.c]) + =/ node/case-node:unicode-data + [`@ux`point.u.start.c `@ux`point.prev.c +.+.u.start.c] + c(out [node out.c]) + :: + ++ case-fold + :> state that's part of the fold which generates the list of case-nodes + $: :> resulting data to pass to treeify. + out/(list case-node:unicode-data) + :> the start of a run of characters; ~ for not active. + start/(unit case-state) + :> previous character state + prev/case-state + == + :: + ++ case-state + :> a temporary model which we compress later in a second pass. + $: point/@c + case/case-class + upper/case-offset:unicode-data + lower/case-offset:unicode-data + title/case-offset:unicode-data + == + :: + ++ case-class + :> classification of an individual character. + $? $upper + $lower + $title + $none + $missing + == + :: + :> # + :> # %tree-building + :> # + :> builds a binary search tree out of the list + +| + ++ build-tree + |= a/(list case-node:unicode-data) + ^- case-tree:unicode-data + :: there's probably a bottom up approach that doesn't require walking + :: a list over and over again. + ?~ a + ~ + =+ len=(lent a) + =+ [lhs rhs]=(split-at:ls (div len 2) a) + ?~ rhs + ?~ lhs + ~ + [i.lhs ~ ~] + =+ x=[i.rhs $(a lhs) $(a t.rhs)] + x + -- /: /===/lib/unicode-data /&unicode-data&/txt/ :: :: part 2: utility core @@ -28,6 +210,7 @@ (turn (tuba a) fun) :: ++ to-upper + :> returns the uppercase of unicode codepoint {a} |= a/@c ^- @c :: special case ascii to not perform map lookup. @@ -35,8 +218,56 @@ ?: &((gte a 'a') (lte a 'z')) (sub a 32) a - =+ x=(~(get by uppers) a) - (fall x a) + (apply-table a case-table %upper) +:: +++ to-lower + :> returns the lowercase of unicode codepoint {a} + |= a/@c + ^- @c + ?: (lte a max-ascii) + ?: &((get a 'A') (lte a 'Z')) + (add 32 a) + a + (apply-table a case-table %lower) +:: +++ apply-table + :> searches {table} and apples applies {type} to {a}. + :> + :> this recursively walks the case tree {table}. if it finds an entry which + :> matches on {a}, it will apply the offset. otherwise, returns {a}. + |= {a/@c table/case-tree:unicode-data type/?($upper $lower $title)} + ^- @c + ?~ table + a + ?: (lth a start.n.table) + $(table l.table) + ?: (gth a end.n.table) + $(table r.table) + ?. &((lte start.n.table a) (lte a end.n.table)) + a + %^ apply-offset a type + ?- type + $upper upper.n.table + $lower lower.n.table + $title title.n.table + == +:: +++ apply-offset + :> applies an character offset to {a}. + |= {a/@c type/?($upper $lower $title) offset/case-offset:unicode-data} + ^- @c + ?- offset + {$add *} (add a a.offset) + {$sub *} (sub a s.offset) + {$none *} a + :: + {$uplo *} + ?- type + $upper (sub a 1) + $lower (add a 1) + $title (sub a 1) + == + == :: ++ max-ascii `@c`0x7f -- @@ -48,5 +279,4 @@ {n/tape $~} $~ == -:- %noun -(transform n to-upper) +:- %tape (transform n to-upper) diff --git a/mar/unicode-data.hoon b/mar/unicode-data.hoon index 3367f2e0b..e3b8e3ba3 100644 --- a/mar/unicode-data.hoon +++ b/mar/unicode-data.hoon @@ -1,24 +1,8 @@ /- unicode-data =, eyre =, format - -:: ok, so we can currently slurp the data in. we're having problems getting it -:: back out. ++txt:grow is probably the next logical step here. - -|_ all/(list line:unicode-data) -++ grow - :> converts from unicode-data to mark. - |% - :: ++ txt - :: ^- wain - :: %+ turn all - :: |= line:unicode-data - :: ;: weld - :: ";" - :: name - :: ";" - -- :: +|_ all/(list line:unicode-data) ++ grab :> converts from mark to unicode-data. |% @@ -56,7 +40,6 @@ %+ cook |=(a/tape a) (star ;~(less sem prn)) -:: (star ;~(pose hig low nud hep ace gal gar pel per)) :: :> parses a unicode general category abbreviation to symbol ++ general-category diff --git a/sur/unicode-data.hoon b/sur/unicode-data.hoon index 88e2e51dc..ac3ea2cd5 100644 --- a/sur/unicode-data.hoon +++ b/sur/unicode-data.hoon @@ -1,4 +1,7 @@ |% +:> # %unicode-data +:> types to represent UnicdoeData.txt. ++| ++ line :> an individual codepoint definition :> @@ -127,4 +130,36 @@ $fraction :< a vulgar fraction form $compat :< otherwise unspecified compatibility character == +:: +:> # +:> # %case-map +:> # +:> types to represent fast lookups of case data ++| +++ case-offset + :> case offsets can be in either direction + $% :> add {a} to get the new character + {$add a/@u} + :> subtract {a} to get the new character + {$sub s/@u} + :> take no action; return self + {$none $~} + :> represents series of alternating uppercase/lowercase characters + {$uplo $~} + == +:: +++ case-node + :> a node in a case-tree. + :> + :> represents a range of + $: start/@ux + end/@ux + upper/case-offset + lower/case-offset + title/case-offset + == +:: +++ case-tree + :> a binary search tree of ++case-node items, sorted on span. + (tree case-node) --