Merge pull request #586 from eglaysher/unicode-string-gates

Unicode string gates
This commit is contained in:
cgyarvin 2018-01-29 11:09:08 -08:00 committed by GitHub
commit 1e8e3fdd84
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 514 additions and 0 deletions

285
gen/capitalize.hoon Normal file
View File

@ -0,0 +1,285 @@
:: to use, download UnicdoeData.txt and place it in `%/lib/unicode-data/txt`.
::
::::
::
:: part 1: parse the file into {uppers}
::
/- unicode-data
/+ new-hoon
/= case-table
/; |= a=(list line:unicode-data)
=, new-hoon
|^ %- build-tree
%- flop
(build-case-nodes a)
::
:> #
:> # %case-nodes
:> #
:> transforms raw unicode data into sequential case nodes.
+|
++ build-case-nodes
:> raw list of unicode data lines to a compact list of chardata
|= a=(list line:unicode-data)
^- (list case-node:unicode-data)
=< out
::
:: todo: we don't have the final case range in the output of this
:: gate. this is because this algorithm doesn't work when the last
:: char is part of a range. this doesn't happen with the real one,
:: only the excerpts i was using for testing.
::
%^ foldl:ls a *case-fold
|= [c=case-fold l=line:unicode-data]
^+ c
=+ state=(line-to-case-state l)
?: (is-adjacent state prev.c)
c(prev state)
=. c (add-range c)
%= c
start
?: &(!=(case.state %missing) !=(case.state %none))
`state
~
prev state
==
::
++ line-to-case-state
:> creates an easy to merge form.
|= line:unicode-data
^- case-state
=/ out=case-state
[code %none [%none ~] [%none ~] [%none ~]]
?: =(code `@c`0)
=. case.out %missing
out
=. case.out
?+ gen %none
$lu %upper
$ll %lower
$lt %title
==
::
:: several characters aren't described as $lu or $ll but have lower or
:: upper state, such as u+2161. detect this and fix it up.
::
=? case.out &(=(case.out %none) !=(low ~)) %upper
=? case.out &(=(case.out %none) !=(up ~)) %lower
::
:: calculate offsets
::
=? upper.out !=(up ~) (calculate-offset (need up) code)
=? lower.out !=(low ~)
(calculate-offset (need low) code)
=? title.out !=(title ~) (calculate-offset (need title) code)
out
::
++ calculate-offset
|= [src=@c dst=@c]
^- case-offset:unicode-data
?: =(src dst)
[%none ~]
?: (gth src dst)
[%add (sub src dst)]
[%sub (sub dst src)]
::
++ is-adjacent
:> is {rhs} a continuation of {lhs}?
|= [lhs=case-state rhs=case-state]
^- ?
?: (lth point.rhs point.lhs)
$(lhs rhs, rhs lhs)
?: !=(point.rhs +(point.lhs))
%.n
?: !=(case.rhs case.lhs)
(upper-lower-adjacent lhs rhs)
?: =(case.lhs %none)
%.n
?: =(case.lhs %missing)
%.n
?: !=(upper.lhs upper.rhs)
%.n
?: !=(lower.lhs lower.rhs)
%.n
?: !=(title.lhs title.rhs)
%.n
%.y
::
++ upper-lower-adjacent
:> detects %upper-lower spans.
:>
:> is {lhs} the same as {rhs}, but with opposite case?
|= [lhs=case-state rhs=case-state]
?: &(=(case.lhs %upper) !=(case.rhs %lower))
%.n
?: &(=(case.lhs %lower) !=(case.rhs %upper))
%.n
::
:: to simplify detection, if things are in the opposite order, redo
:: things flipped.
::
?: =(case.lhs %lower)
$(lhs rhs, rhs lhs)
?& (is-upper-lower lhs)
(is-lower-upper rhs)
==
::
++ is-upper-lower
|= i=case-state
=(+.+.i [[%none ~] [%add 1] [%none ~]])
::
++ is-lower-upper
|= i=case-state
=(+.+.i [[%sub 1] [%none ~] [%sub 1]])
::
++ is-none
|= i=case-state
=(+.+.i [[%none ~] [%none ~] [%none ~]])
::
++ add-range
|= c=case-fold
^+ c
?~ start.c
c
?: (is-none u.start.c)
c
?: ?& (gth point.prev.c point.u.start.c)
(is-upper-lower u.start.c)
==
=/ node=case-node:unicode-data
[`@ux`point.u.start.c `@ux`point.prev.c [%uplo ~] [%uplo ~] [%uplo ~]]
c(out [node out.c])
=/ node=case-node:unicode-data
[`@ux`point.u.start.c `@ux`point.prev.c +.+.u.start.c]
c(out [node out.c])
::
++ case-fold
:> state that's part of the fold which generates the list of case-nodes
$: :> resulting data to pass to treeify.
out=(list case-node:unicode-data)
:> the start of a run of characters; ~ for not active.
start=(unit case-state)
:> previous character state
prev=case-state
==
::
++ case-state
:> a temporary model which we compress later in a second pass.
$: point=@c
case=case-class
upper=case-offset:unicode-data
lower=case-offset:unicode-data
title=case-offset:unicode-data
==
::
++ case-class
:> classification of an individual character.
$? $upper
$lower
$title
$none
$missing
==
::
:> #
:> # %tree-building
:> #
:> builds a binary search tree out of the list
+|
++ build-tree
|= a=(list case-node:unicode-data)
^- case-tree:unicode-data
:: there's probably a bottom up approach that doesn't require walking
:: a list over and over again.
?~ a
~
=+ len=(lent a)
=+ [lhs rhs]=(split-at:ls (div len 2) a)
?~ rhs
?~ lhs
~
[i.lhs ~ ~]
=+ x=[i.rhs $(a lhs) $(a t.rhs)]
x
--
/: /===/lib/unicode-data /&unicode-data&/txt/
::
:: part 2: utility core
::
|%
++ transform
|= [a=tape fun=$-(@c @c)]
%- tufa
(turn (tuba a) fun)
::
++ to-upper
:> returns the uppercase of unicode codepoint {a}
|= a=@c
^- @c
:: special case ascii to not perform map lookup.
?: (lte a max-ascii)
?: &((gte a 'a') (lte a 'z'))
(sub a 32)
a
(apply-table a case-table %upper)
::
++ to-lower
:> returns the lowercase of unicode codepoint {a}
|= a=@c
^- @c
?: (lte a max-ascii)
?: &((gte a 'A') (lte a 'Z'))
(add 32 a)
a
(apply-table a case-table %lower)
::
++ apply-table
:> searches {table} and apples applies {type} to {a}.
:>
:> this recursively walks the case tree {table}. if it finds an entry which
:> matches on {a}, it will apply the offset. otherwise, returns {a}.
|= [a=@c table=case-tree:unicode-data type=?($upper $lower $title)]
^- @c
?~ table
a
?: (lth a start.n.table)
$(table l.table)
?: (gth a end.n.table)
$(table r.table)
?. &((lte start.n.table a) (lte a end.n.table))
a
%^ apply-offset a type
?- type
$upper upper.n.table
$lower lower.n.table
$title title.n.table
==
::
++ apply-offset
:> applies an character offset to {a}.
|= [a=@c type=?($upper $lower $title) offset=case-offset:unicode-data]
^- @c
?- offset
{$add *} (add a a.offset)
{$sub *} (sub a s.offset)
{$none *} a
::
{$uplo *}
?- type
$upper (sub a 1)
$lower (add a 1)
$title (sub a 1)
==
==
::
++ max-ascii `@c`0x7f
--
::
:: part 3: generator
::
:- %say
|= $: [now=@da eny=@uvJ bec=beak]
[n=tape $~]
$~
==
:- %tape (transform n to-upper)

79
mar/unicode-data.hoon Normal file
View File

@ -0,0 +1,79 @@
/- unicode-data
=, eyre
=, format
::
|_ all/(list line:unicode-data)
++ grab
:> converts from mark to unicode-data.
|%
++ mime |=([* a=octs] (txt (to-wain q.a))) :: XX mark translation
++ txt
|^ |= a=wain
^+ all
%+ murn a
|= b=cord
^- (unit line:unicode-data)
?~ b ~
`(rash b line)
::
:> parses a single character information line of the unicode data file.
++ line
;~ (glue sem)
hex :: code/@c codepoint in hex format
name-string :: name/tape character name
general-category :: gen/general type of character
(bass 10 (plus dit)) :: can/@ud canonical combining class
bidi-category :: bi/bidi bidirectional category
decomposition-mapping :: de/decomp decomposition mapping
::
:: todo: decimal/digit/numeric need to be parsed.
::
string-number :: decimal/tape decimal digit value (or ~)
string-number :: digit/tape digit value, even if non-decimal
string-number :: numeric/tape numeric value, including fractions
::
(flag 'Y' 'N') :: mirrored/? is char mirrored in bidi text?
name-string :: old-name/tape unicode 1.0 compatibility name
name-string :: iso/tape iso 10646 comment field
(punt hex) :: up/(unit @c) uppercase mapping codepoint
(punt hex) :: low/(unit @c) lowercase mapping codepoint
(punt hex) :: title/(unit @c) titlecase mapping codepoint
==
::
:> parses a single name or comment string.
++ name-string
%+ cook
|=(a=tape a)
(star ;~(less sem prn))
::
:> parses a unicode general category abbreviation to symbol
++ general-category
%+ sear (soft general:unicode-data)
:(cook crip cass ;~(plug hig low (easy ~)))
::
:> parses a bidirectional category abbreviation to symbol.
++ bidi-category
%+ sear (soft bidi:unicode-data)
:(cook crip cass (star hig))
::
++ decomposition-mapping
%- punt :: optional
:: a tag and a list of characters to decompose to
;~ plug
(punt (ifix [gal ;~(plug gar ace)] decomp-tag))
(cook |=(a=(list @c) a) (most ace hex))
==
::
++ decomp-tag
%+ sear (soft decomp-tag:unicode-data)
:(cook crip cass (star alf))
::
++ string-number
%+ cook
|=(a=tape a)
(star ;~(pose nud fas hep))
::
--
--
++ grad %txt
--

150
sur/unicode-data.hoon Normal file
View File

@ -0,0 +1,150 @@
|%
:> # %unicode-data
:> types to represent UnicdoeData.txt.
+|
++ line
:> an individual codepoint definition
:>
$: code=@c :< codepoint in hexadecimal format
name=tape :< character name
gen=general :< type of character this is
:> canonical combining class for ordering algorithms
can=@ud
bi=bidi :< bidirectional category of this character
de=decomp :< character decomposition mapping
:: todo: decimal/digit/numeric need to be parsed.
decimal=tape :< decimal digit value (or ~)
digit=tape :< digit value, covering non decimal radix forms
numeric=tape :< numeric value, including fractions
mirrored=? :< whether char is mirrored in bidirectional text
old-name=tape :< unicode 1.0 compatibility name
iso=tape :< iso 10646 comment field
up=(unit @c) :< uppercase mapping codepoint
low=(unit @c) :< lowercase mapping codepoint
title=(unit @c) :< titlecase mapping codepoint
==
::
++ general
:> one of the normative or informative unicode general categories
:>
:> these abbreviations are as found in the unicode standard, except
:> lowercased as to be valid symbols.
$? $lu :< letter, uppercase
$ll :< letter, lowercase
$lt :< letter, titlecase
$mn :< mark, non-spacing
$mc :< mark, spacing combining
$me :< mark, enclosing
$nd :< number, decimal digit
$nl :< number, letter
$no :< number, other
$zs :< separator, space
$zl :< separator, line
$zp :< separator, paragraph
$cc :< other, control
$cf :< other, format
$cs :< other, surrogate
$co :< other, private use
$cn :< other, not assigned
::
$lm :< letter, modifier
$lo :< letter, other
$pc :< punctuation, connector
$pd :< punctuation, dash
$ps :< punctuation, open
$pe :< punctuation, close
$pi :< punctuation, initial quote
$pf :< punctuation, final quote
$po :< punctuation, other
$sm :< symbol, math
$sc :< symbol, currency
$sk :< symbol, modifier
$so :< symbol, other
==
::
++ bidi
:> bidirectional category of a unicode character
$? $l :< left-to-right
$lre :< left-to-right embedding
$lri :< left-to-right isolate
$lro :< left-to-right override
$fsi :< first strong isolate
$r :< right-to-left
$al :< right-to-left arabic
$rle :< right-to-left embedding
$rli :< right-to-left isolate
$rlo :< right-to-left override
$pdf :< pop directional format
$pdi :< pop directional isolate
$en :< european number
$es :< european number separator
$et :< european number terminator
$an :< arabic number
$cs :< common number separator
$nsm :< non-spacing mark
$bn :< boundary neutral
$b :< paragraph separator
$s :< segment separator
$ws :< whitespace
$on :< other neutrals
==
::
++ decomp
:> character decomposition mapping.
:>
:> tag: type of decomposition.
:> c: a list of codepoints this decomposes into.
(unit {tag/(unit decomp-tag) c/(list @c)})
::
++ decomp-tag
:> tag that describes the type of a character decomposition.
$? $font :< a font variant
$nobreak :< a no-break version of a space or hyphen
$initial :< an initial presentation form (arabic)
$medial :< a medial presentation form (arabic)
$final :< a final presentation form (arabic)
$isolated :< an isolated presentation form (arabic)
$circle :< an encircled form
$super :< a superscript form
$sub :< a subscript form
$vertical :< a vertical layout presentation form
$wide :< a wide (or zenkaku) compatibility character
$narrow :< a narrow (or hankaku) compatibility character
$small :< a small variant form (cns compatibility)
$square :< a cjk squared font variant
$fraction :< a vulgar fraction form
$compat :< otherwise unspecified compatibility character
==
::
:> #
:> # %case-map
:> #
:> types to represent fast lookups of case data
+|
++ case-offset
:> case offsets can be in either direction
$% :> add {a} to get the new character
[%add a=@u]
:> subtract {a} to get the new character
[%sub s=@u]
:> take no action; return self
[%none $~]
:> represents series of alternating uppercase/lowercase characters
[%uplo $~]
==
::
++ case-node
:> a node in a case-tree.
:>
:> represents a range of
$: start=@ux
end=@ux
upper=case-offset
lower=case-offset
title=case-offset
==
::
++ case-tree
:> a binary search tree of ++case-node items, sorted on span.
(tree case-node)
--