WIP simplify parsing constants to ++soft, comment placement

This commit is contained in:
Anton Dyudin 2017-10-13 09:47:38 -07:00
parent ccfb11bda9
commit 40050a490e
2 changed files with 50 additions and 146 deletions

View File

@ -15,24 +15,28 @@
^- line:unicode-data
(rash b line)
::
:: NOTE(review): this span shows two versions of the doc line and of the
:: field list back to back with no +/- markers. the first list uses the
:: helper arms string-number, yes-or-no and optional-hex; the second uses
:: (flag 'Y' 'N') and (punt hex) inline and carries a face/type note on
:: each field. presumably the second list is the replacement -- confirm
:: against the real diff before editing either.
::
:: the field parsers are combined with (glue sem), so consecutive fields
:: are expected to be separated by sem.
:> parses a single line of the unicode data file.
:> parses a single character information line of the unicode data file.
++ line
;~ (glue sem)
hex
name-string
general-category
(bass 10 (plus sid:ab))
bidi-category
decomposition-mapping
string-number
string-number
string-number
yes-or-no
name-string
name-string
optional-hex
optional-hex
optional-hex
hex :: code/@c codepoint in hex format
name-string :: name/tape character name
general-category :: gen/general type of character
(bass 10 (plus dit)) :: can/@ud canonical combining class
bidi-category :: bi/bidi bidirectional category
decomposition-mapping :: de/decomp decomposition mapping
::
:: todo: decimal/digit/numeric need to be parsed.
::
string-number :: decimal/tape decimal digit value (or ~)
string-number :: digit/tape digit value, even if non-decimal
string-number :: numeric/tape numeric value, including fractions
::
(flag 'Y' 'N') :: mirrored/? is char mirrored in bidi text?
name-string :: old-name/tape unicode 1.0 compatibility name
name-string :: iso/tape iso 10646 comment field
(punt hex) :: up/(unit @c) uppercase mapping codepoint
(punt hex) :: low/(unit @c) lowercase mapping codepoint
(punt hex) :: title/(unit @c) titlecase mapping codepoint
==
::
:> parses a single name or comment string.
@ -40,122 +44,37 @@
%+ cook
|=(a/tape a)
(star ;~(less sem prn))
:: (star ;~(pose alp (mask " <>()")))
::
:: NOTE(review): two bodies appear under this arm with no +/- markers:
:: an explicit pose table of literals, then a sear/soft form. presumably
:: the table is the removed side -- confirm against the real diff.
::
:: table form: each two-letter literal maps to the symbol that is its
:: lowercase spelling ('Lu' -> %lu, and so on).
:: sear form: reads one hig then one low character, lowercases the pair
:: with cass, packs it with crip, and accepts the result only if
:: (soft general:unicode-data) does.
:> parses a unicode general category abbreviation to symbol
++ general-category
;~ pose
(cold %lu (jest 'Lu'))
(cold %ll (jest 'Ll'))
(cold %lt (jest 'Lt'))
(cold %mn (jest 'Mn'))
(cold %mc (jest 'Mc'))
(cold %me (jest 'Me'))
(cold %nd (jest 'Nd'))
(cold %nl (jest 'Nl'))
(cold %no (jest 'No'))
(cold %zs (jest 'Zs'))
(cold %zl (jest 'Zl'))
(cold %zp (jest 'Zp'))
(cold %cc (jest 'Cc'))
(cold %cf (jest 'Cf'))
(cold %cs (jest 'Cs'))
(cold %co (jest 'Co'))
(cold %cn (jest 'Cn'))
(cold %lm (jest 'Lm'))
(cold %lo (jest 'Lo'))
(cold %pc (jest 'Pc'))
(cold %pd (jest 'Pd'))
(cold %ps (jest 'Ps'))
(cold %pe (jest 'Pe'))
(cold %pi (jest 'Pi'))
(cold %pf (jest 'Pf'))
(cold %po (jest 'Po'))
(cold %sm (jest 'Sm'))
(cold %sc (jest 'Sc'))
(cold %sk (jest 'Sk'))
(cold %so (jest 'So'))
==
%+ sear (soft general:unicode-data)
:(cook crip cass ;~(plug hig low (easy ~)))
::
:: NOTE(review): two bodies appear under this arm with no +/- markers:
:: an explicit pose table of literals, then a sear/soft form. presumably
:: the table is the removed side -- confirm against the real diff.
::
:: table form: each literal maps to the symbol that is its lowercase
:: spelling. three-letter literals are listed before two-letter ones and
:: single letters come last; pose takes the first alternative that
:: matches, so 'L' placed ahead of 'LRE' would shadow it -- keep this
:: longest-first order if the table is ever edited.
:: sear form: reads a run of hig characters, lowercases it with cass,
:: packs it with crip, and accepts the result only if
:: (soft bidi:unicode-data) does.
:> parses a bidirectional category abbreviation to symbol.
++ bidi-category
;~ pose
(cold %fsi (jest 'FSI'))
(cold %lre (jest 'LRE'))
(cold %lri (jest 'LRI'))
(cold %lro (jest 'LRO'))
(cold %nsm (jest 'NSM'))
(cold %pdf (jest 'PDF'))
(cold %pdi (jest 'PDI'))
(cold %rle (jest 'RLE'))
(cold %rli (jest 'RLI'))
(cold %rlo (jest 'RLO'))
(cold %al (jest 'AL'))
(cold %an (jest 'AN'))
(cold %bn (jest 'BN'))
(cold %cs (jest 'CS'))
(cold %en (jest 'EN'))
(cold %es (jest 'ES'))
(cold %et (jest 'ET'))
(cold %on (jest 'ON'))
(cold %ws (jest 'WS'))
(cold %b (jest 'B'))
(cold %l (jest 'L'))
(cold %r (jest 'R'))
(cold %s (jest 'S'))
==
%+ sear (soft bidi:unicode-data)
:(cook crip cass (star hig))
::
:: parses the decomposition mapping field: an optional tag wrapped by gal
:: and gar and followed by ace, then one or more hex codepoints separated
:: by ace. an empty field yields ~.
::
:: NOTE(review): two bodies appear under this arm with no +/- markers.
:: the first builds the optional result by hand with pose, (stag ~ ...)
:: and (easy ~); the second expresses the same optionality with punt.
:: presumably the punt form is the replacement -- confirm against the
:: real diff.
::
:: TODO: This seems to be where the nest-fail is. There's an extra @ here?
++ decomposition-mapping
;~ pose
:: a tag and a list of characters to decompose to
%+ stag ~
;~ plug
;~ pose
(stag ~ (ifix [gal ;~(plug gar ace)] decomp-tag))
(easy ~)
==
(cook |=(a/(list @c) a) (most ace hex))
==
:: no decomposition information
(easy ~)
%- punt :: optional
:: a tag and a list of characters to decompose to
;~ plug
(punt (ifix [gal ;~(plug gar ace)] decomp-tag))
(cook |=(a/(list @c) a) (most ace hex))
==
::
:: parses the tag name of a decomposition mapping to a symbol.
::
:: NOTE(review): two bodies appear under this arm with no +/- markers:
:: an explicit pose table of literals, then a sear/soft form. presumably
:: the table is the removed side -- confirm against the real diff.
::
:: table form: every literal maps to its own lowercase spelling except
:: 'noBreak', which maps to %no-break.
:: sear form: reads a run of alf characters, lowercases it with cass and
:: packs it with crip. lowercasing turns 'noBreak' into nobreak, so the
:: extra cook rewrites that one value to %no-break before
:: (soft decomp-tag:unicode-data) validates the result.
++ decomp-tag
;~ pose
(cold %font (jest 'font'))
(cold %no-break (jest 'noBreak'))
(cold %initial (jest 'initial'))
(cold %medial (jest 'medial'))
(cold %final (jest 'final'))
(cold %isolated (jest 'isolated'))
(cold %circle (jest 'circle'))
(cold %super (jest 'super'))
(cold %sub (jest 'sub'))
(cold %vertical (jest 'vertical'))
(cold %wide (jest 'wide'))
(cold %narrow (jest 'narrow'))
(cold %small (jest 'small'))
(cold %square (jest 'square'))
(cold %fraction (jest 'fraction'))
(cold %compat (jest 'compat'))
==
%+ sear (soft decomp-tag:unicode-data)
%+ cook |=(a/term ?+(a a $nobreak %no-break)) ::REVIEW or just change the type
:(cook crip cass (star alf))
::
:: parses a numeric field and keeps it as raw text (a tape): zero or more
:: characters, each matched by nud, fas or hep. the value is not
:: interpreted here, and an empty field yields an empty tape.
++ string-number
%+ cook
|=(a/tape a)
(star ;~(pose nud fas hep))
::
:: parses a one-letter flag: 'Y' produces %.y and 'N' produces %.n.
:: any other input fails to parse.
++ yes-or-no
;~ pose
(cold %.y (jest 'Y'))
(cold %.n (jest 'N'))
==
::
:: parses an optional hex value: if hex matches, its result is tagged
:: with ~ by stag; otherwise (easy ~) produces ~.
++ optional-hex
;~ pose
(stag ~ hex)
(easy ~)
==
--
--
:: NOTE(review): presumably delegates this mark's diff/merge behaviour to
:: the %txt mark -- the enclosing core is not visible here, confirm.
++ grad %txt

View File

@ -5,38 +5,23 @@
:: NOTE(review): this span shows two versions of the same tuple with no
:: +/- markers. the first documents every field in a :> list above an
:: uncommented $: body; the second opens $: again and documents each
:: field in place with :< (or a preceding :> for can). the two $:
:: openers share the single closing ==. presumably the second version is
:: the replacement -- confirm against the real diff before editing.
++ line
:> an individual codepoint definition
:>
:> code: the codepoint in hexadecimal format
:> name: the character name
:> gen: the type of character this is
:> can: the canonical combining class for ordering algorithms
:> bidi: the bidirectional category of this character
:> de: the character decomposition mapping
:> decimal: the decimal digit value (or ~)
:> digit: the digit value, covering non decimal radix forms
:> numeric: the numeric value, including fractions
:> mirrored: whether char is mirrored in bidirectional text
:> old-name: unicode 1.0 compatibility name
:> iso: iso 10646 comment field
:> up: uppercase mapping codepoint
:> low: lowercase mapping codepoint
:> title: titlecase mapping codepoint
:>
$: code/@c
name/tape
gen/general
can/@ud
bi/bidi
de/decomp
:: todo: decimal/digit/numeric need to be parsed.
decimal/tape
digit/tape
numeric/tape
mirrored/?
old-name/tape
iso/tape
up/(unit @c)
low/(unit @c)
title/(unit @c)
$: code/@c :< codepoint in hexadecimal format
name/tape :< character name
gen/general :< type of character this is
:> canonical combining class for ordering algorithms
can/@ud
bi/bidi :< bidirectional category of this character
de/decomp :< character decomposition mapping
:: todo: decimal/digit/numeric need to be parsed.
decimal/tape :< decimal digit value (or ~)
digit/tape :< digit value, covering non decimal radix forms
numeric/tape :< numeric value, including fractions
mirrored/? :< whether char is mirrored in bidirectional text
old-name/tape :< unicode 1.0 compatibility name
iso/tape :< iso 10646 comment field
up/(unit @c) :< uppercase mapping codepoint
low/(unit @c) :< lowercase mapping codepoint
title/(unit @c) :< titlecase mapping codepoint
==
::
++ general