urbit/pub/docs/dev/hoon/library/2em.md
Galen Wolfe-Pauly bb495844be doc -> docs
2015-10-20 10:51:45 -07:00

18 KiB

section 2eM, regular-expressions

++pars

::  pars: parse the regular expression held in tape `a`.
::
::  runs the top-level rule apex:rags, wrapped in `full`, over the
::  tape starting at line 1, column 1.  when the parse produces no
::  result (q.foo is null) the arm yields ~; otherwise it yields the
::  parsed rege inside a unit.
++  pars
  |=  [a=tape]                                          ::  parse tape to rege
  ^-  (unit rege)
  =+  foo=((full apex:rags) [[1 1] a])
  ?~  q.foo
    ~
  [~ p.u.q.foo]
::

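Parses the regular expression in the tape `a` into a `rege` syntax tree. Produces `~` if `a` is not a valid regular expression.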

~zod/try=> (pars "samo")
[ ~
  [ %pair
    p=[%lite p=~~s]
    q=[%pair p=[%lite p=~~a] q=[%pair p=[%lite p=~~m] q=[%lite p=~~o]]]
  ]
]
~zod/try=> (pars "so[,.0-9]")
[ ~
  [ %pair
    p=[%lite p=~~s]
    q=[%pair p=[%lite p=~~o] q=[%brac p=288.036.862.105.223.168]]
  ]
]
~zod/try=> `@ub`288.036.862.105.223.168
0b11.1111.1111.0101.0000.0000.0000.0000.0000.0000.0000.0000.0000.0000.0000
~zod/try=> `@ub`(lsh 0 `@`'9' 1)
0b10.0000.0000.0000.0000.0000.0000.0000.0000.0000.0000.0000.0000.0000.0000
~zod/try=> `@ub`(roll (turn ",.0123456789" |=(a=@ (lsh 0 a 1))) con)
0b11.1111.1111.0101.0000.0000.0000.0000.0000.0000.0000.0000.0000.0000.0000
~zod/try=> (pars "sop.*")
[ ~
  [ %pair
    p=[%lite p=~~s] 
    q=[%pair p=[%lite p=~~o] q=[%pair p=[%lite p=~~p] q=[%mant p=%dote]]]
  ]
]
~zod/try=> (pars "(hel)?")
[ ~
  [ %eith
      p
    [ %capt
      p=[%pair p=[%lite p=~~h] q=[%pair p=[%lite p=~~e] q=[%lite p=~~l]]]
      q=0
    ]
    q=%empt
  ]
]
~zod/try=> (pars "(hel)??")
[ ~ 
  [ %eith
    p=%empt
      q
    [ %capt
      p=[%pair p=[%lite p=~~h] q=[%pair p=[%lite p=~~e] q=[%lite p=~~l]]]
      q=0
    ]
  ]
]
~zod/try=> (pars "a\{1,20}")
[~ [%betw p=[%lite p=~~a] q=1 r=20]]

++rags

::  rags: core of regular-expression parsers.  the `=>  |%` opens an
::  inner core of single-character rules, which the arms that follow
::  it are then built against.
++  rags                                                ::  rege parsers
  =>  |%

Regex parser arms

++nor

      ::  nor: any character in the range 1-127, except the regex
      ::  metacharacters ^ $ ( ) | * ? + . [ and backslash
      ++  nor  ;~(less (mask "^$()|*?+.[\\") (shim 1 127)) :: non-control char

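Parses any character in the range 1 through 127 except the regex metacharacters `^ $ ( ) | * ? + . [` and backslash.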

++les

      ::  les: any printable ascii character (asp) except backslash
      ++  les  ;~(less bas asp)                         ::  not backslash

Parses any printable ASCII character except backslash.

++lep

      ::  lep: any printable ascii character (asp) except ^ [ ] and
      ::  backslash
      ++  lep  ;~(less (mask "^[]\\") asp)              ::  charset non-control

Parses any printable ASCII character except `^`, `[`, `]` and backslash.

++asp

      ::  asp: any character with a code from 32 through 126
      ++  asp  (shim 32 126)                            ::  printable ascii

Parses any printable ASCII character (codes 32 through 126).

++alb

      ::  alb: any printable ascii character (asp) except `]`
      ++  alb  ;~(less ser asp)                         ::  charset literal char

Parses any printable ASCII character except `]`.

++mis

      ::  mis: any printable ascii character (asp) that is not
      ::  alphanumeric (aln).  the `--` below closes the core of
      ::  single-character rules and the `|%` opens the core holding
      ::  the regex parser arms.
      ++  mis  ;~(less aln asp)                         ::  non alphanumeric
      --
  |%

Parses any printable ASCII character that is not alphanumeric.

++apex

  ::  apex: top-level rule; handles alternation with `|`.
  ::
  ::  alternatives, tried in order:
  ::    - a sequence (mall), optionally followed by `|` and another
  ::      apex; when the suffix is present the two are joined as
  ::      [%eith left right]
  ::    - `|` followed by an apex, giving [%eith %empt right]
  ::    - nothing at all, giving %empt
  ::  knee with a |. trap lets the rule refer to itself.
  ++  apex                                              ::  top level
    %+  knee  *rege  |.  ~+
    ;~  pose
      ;~((bend |=(a=[rege rege] (some [%eith a]))) mall ;~(pfix bar apex))
      (stag %eith ;~(plug (easy %empt) ;~(pfix bar apex)))
      (easy %empt)
    ==
  ::

XX document

++mall

  ::  mall: a sequence of one or more quantified atoms.  parses one
  ::  bets, optionally followed by a further mall; when the suffix is
  ::  present the two are joined as [%pair first rest].
  ++  mall
    %+  knee  *rege  |.  ~+
    ;~((bend |=(a=[rege rege] (some [%pair a]))) bets mall)
  ::

XX document

++bets

  ::  bets: one atom (chun) with an optional repetition suffix.
  ::
  ::  parses a chun; if that fails the failing edge is returned as
  ::  is.  otherwise the parsed rege `a` is combined with the first
  ::  suffix that matches the remaining input (q.u.q.vex):
  ::    ??      [%eith %empt a]        *?      [%manl a]
  ::    +?      [%plll a]              ?       [%eith a %empt]
  ::    *       [%mant a]              +       [%plls a]
  ::    {m,n}?  [%betl a m n]          {m,n}   [%betw a m n]
  ::    {n,}?   [%binl a n]            {n}?    [%bant a n]
  ::    {n}     [%bant a n]            {n,}    [%bint a n]
  ::  with no suffix, `a` is produced unchanged.  the two-character
  ::  suffixes come before their one-character prefixes so that the
  ::  longer form is tried first.
  ++  bets
    %+  knee  *rege  |.  ~+
    |=  tub=nail
    =+  vex=(chun tub)
    ?~  q.vex
      vex
    =+  a=p.u.q.vex
    %-  ;~  pose
          (cold [%eith %empt a] (jest '??'))
          (cold [%manl a] (jest '*?'))
          (cold [%plll a] (jest '+?'))
          (cold [%eith a %empt] wut)
          (cold [%mant a] tar)
          (cold [%plls a] lus)
          (stag %betl ;~(plug (easy a) ;~(sfix rang wut)))
          (stag %betw ;~(plug (easy a) rang))
          (stag %binl ;~(plug (easy a) (ifix [kel (jest ',}?')] dim:ag)))
          (stag %bant ;~(plug (easy a) (ifix [kel (jest '}?')] dim:ag)))
          (stag %bant ;~(plug (easy a) (ifix [kel ker] dim:ag)))
          (stag %bint ;~(plug (easy a) (ifix [kel (jest ',}')] dim:ag)))
          (easy a)
        ==
    q.u.q.vex
  ::

XX document

++ranc

  ::  ranc: bitmask with bits `a` through `b` (inclusive) set.
  ::  recurses upward from `a`, or-ing in (bex a) at each step, and
  ::  produces 0 once `a` exceeds `b`.
  ++  ranc
    |=  [a=@ b=@]
    ^-  @
    ?:((gth a b) 0 (con (bex a) $(a +(a))))
  ::

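Produces an atom with bits `a` through `b` (inclusive) set, or `0` if `a` is greater than `b`. Character sets are represented as such bitmasks, with one bit per character code.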

++flap

  ::  flap: xor `a` with 2^256 - 1, i.e. flip each of its low 256
  ::  bits.  used to negate a character-set bitmask.
  ++  flap  |=(a=@ (mix a (dec (bex 256))))
  ::

Complements a character-set bitmask by XORing it with `2^256 - 1`, flipping each of its low 256 bits.

++rang

  ::  rang: parse a `{a,b}` repetition range of two decimal numbers.
  ::  the sear gate rejects the parse unless a is at most b; on
  ::  success the product is the cell [a b].
  ++  rang
    %+  sear  |=([a=@ b=@] ?:((lte a b) (some [a b]) ~))
      (ifix [kel ker] ;~(plug dim:ag ;~(pfix com dim:ag)))
  ::

Parses a `{a,b}` repetition range of two decimal numbers, failing if `a` is greater than `b`.

++chun

  ::  chun: a single regex atom.  alternatives, tried in order:
  ::    $              %ende
  ::    ^              %sart
  ::    .              %dote
  ::    \Q...          the characters produced by cape, folded from
  ::                   the right into [%pair [%lite char] rest]
  ::                   NOTE(review): the fold's starting value is the
  ::                   default of the gate's `q` - confirm which rege
  ::                   that is
  ::    a character    parsed by chad, but only where the input does
  ::                   not begin a {n}, {n,} or {n,m} repetition; if
  ::                   it does, this alternative fails
  ::    (...)          [%capt inner 0]
  ::    (?P<name>...)  [%capt inner 0]; the name is parsed and dropped
  ::    (?:...)        the inner rege itself, with no %capt wrapper
  ::    [...]          [%brac mask], the mask coming from seac
  ++  chun
    %+  knee  *rege  |.  ~+
    ;~  pose
      (cold %ende buc)
      (cold %sart ket)
      (cold %dote dot)
      %+  cook  |=(a=(list char) (reel a |=([p=char q=rege] [%pair [%lite p] q])))
        ;~(pfix (jest '\\Q') cape)
      |=  tub=nail
      =+  foo=;~(plug kel dim:ag ;~(pose ker (jest ',}') ;~(plug com dim:ag ker)))
      =+  bar=(foo tub)
      ?~(q.bar (chad tub) (fail tub))
      (cook |=([a=rege] [%capt a 0]) (ifix [pel per] apex))
      %+  cook  |=([a=rege] [%capt a 0])
        (ifix [;~(plug (jest '(?P<') (plus aln) gar) per] apex)
      (ifix [(jest '(?:') per] apex)
      (stag %brac ;~(pfix sel seac))
    ==
  ::

XX document

++seac

  ::  seac: body of a bracket expression, after the opening `[`.
  ::  fails on empty input.  if the next character is `^`, it is
  ::  consumed and the mask parsed by sead is complemented with flap;
  ::  otherwise sead is run directly.
  ++  seac
    |=  tub=nail
    ?~  q.tub
      (fail tub)
    ?:  =(i.q.tub '^')
      (;~(pfix ket (cook flap sead)) tub)
    (sead tub)
  ::

XX document

++sead

  ::  sead: first element of a bracket expression, then the rest.
  ::  produces the character-set bitmask.  three alternatives:
  ::
  ::    1. a leading `]`, taken as a literal: its bit is or-ed into
  ::       the mask parsed by sade.  fails if `]` is the last
  ::       character, or if it is followed by `-` and then by
  ::       anything other than `]` (including end of input).
  ::    2. a leading `-`, taken as a literal in the same way: fails
  ::       if `-` is the last character, or if it is followed by a
  ::       second `-` and then by anything other than `]`.
  ::    3. otherwise one element (seap) or-ed with the rest (sade).
  ++  sead
    %+  knee  *@  |.  ~+
    ;~  pose
      |=  tub=nail
      ?~  q.tub
        (fail tub)
      ?.  =(i.q.tub ']')
        (fail tub)
      ?~  t.q.tub
        (fail tub)
      ?:  =(i.t.q.tub '-')
        ?~  t.t.q.tub
          (fail tub)
        ?:  =(i.t.t.q.tub ']')
          (;~(pfix ser (cook |=(a=@ (con (bex ']') a)) sade)) tub)
        (fail tub)
      (;~(pfix ser (cook |=(a=@ (con (bex ']') a)) sade)) tub)
      |=  tub=nail
      ?~  q.tub
        (fail tub)
      ?.  =(i.q.tub '-')
        (fail tub)
      ?~  t.q.tub
        (fail tub)
      ?:  =(i.t.q.tub '-')
        ?~  t.t.q.tub
          (fail tub)
        ?:  =(i.t.t.q.tub ']')
          (;~(pfix hep (cook |=(a=@ (con (bex '-') a)) sade)) tub)
        (fail tub)
      (;~(pfix hep (cook |=(a=@ (con (bex '-') a)) sade)) tub)
      (cook |=(a=[@ @] (con a)) ;~(plug seap sade))
    ==
  ::

XX document

++sade

  ::  sade: remainder of a bracket expression, up to and including
  ::  the closing `]`.  produces a bitmask:
  ::    -]   ends the set with the bit for `-` set
  ::    ]    ends the set, contributing 0
  ::    else one element (seap) or-ed with the rest (sade)
  ++  sade
    %+  knee  *@  |.  ~+
    ;~  pose
      (cold (bex '-') (jest '-]'))
      (cold 0 ser)
      (cook |=([p=@ q=@] `@`(con p q)) ;~(plug seap sade))
    ==
  ::

XX document

++seap

  ::  seap: one element of a bracket expression, as a bitmask.
  ::  alternatives, tried in order:
  ::    - a shorthand class from unid (\d \D \s \S \w \W)
  ::    - a named class [:name:] from chas, or [:^name:] for its
  ::      complement (via flap)
  ::    - a range x-y: asp, `-`, alb; rejected when x is greater
  ::      than y, otherwise (ranc x y)
  ::    - a single character accepted by les, as (bex char); this
  ::      alternative fails on empty input, and also when the
  ::      character is followed by `-` and then by something other
  ::      than `]`
  ::    - a backslash escape, decoded by escd
  ++  seap
    %+  knee  *@  |.  ~+
    ;~  pose
      unid
      %+  ifix  (jest '[:')^(jest ':]')
      ;~(pose ;~(pfix ket (cook flap chas)) chas)
      %+  sear  |=([a=@ b=@] ?:((gth a b) ~ (some (ranc a b))))
        ;~(plug asp ;~(pfix hep alb))
      |=  tub=nail
      ?~  q.tub
        (fail tub)
      ?~  t.q.tub
        ((cook bex les) tub)
      ?:  =(i.t.q.tub '-')
        ?~  t.t.q.tub
          ((cook bex les) tub)
        ?:  =(i.t.t.q.tub ']')
          ((cook bex les) tub)
        (fail tub)
      ((cook bex les) tub)
      ;~(pfix bas escd)
    ==
  ::

XX document

++cape

  ::  cape: the quoted text that follows \Q (chun consumes the \Q).
  ::  alternatives, tried in order:
  ::    - \E ends the quote, producing the empty tape
  ::    - any one character (next) followed by more cape
  ::    - any one character alone, as a one-character tape
  ::    - nothing, provided the input is exhausted (full)
  ++  cape
    %+  knee  *tape  |.  ~+
    ;~  pose
      (cold ~ (jest '\\E'))
      ;~(plug next cape)
      (cook |=(a=char (tape [a ~])) next)
      (full (easy ~))
    ==

XX document

++chas

  ::  chas: parse the name of a character class and produce its
  ::  bitmask.  the list below is turned into a map from class name
  ::  to mask; a sym is parsed and looked up in that map, and sear
  ::  rejects the parse when the name is not a key.  note that the
  ::  name `word` maps to the arm wordc.
  ++  chas                                              ::  ascii character set
    =-  (sear ~(get by -) sym)
    %-  mo  ^-  (list ,[@tas @I])
    :~  alnum/alnum  alpha/alpha  ascii/ascii  blank/blank  cntrl/cntrl
        digit/digit  graph/graph  lower/lower  print/print  punct/punct
        space/space  upper/upper  word/wordc   xdigit/xdigit
    ==
  ::  Character sets

::  alnum: bitmask of ascii letters and digits, the union of lower,
::  upper and digit
++  alnum  :(con lower upper digit)

Character set of the ASCII letters and decimal digits.

++alpha

::  alpha: bitmask of ascii letters, the union of lower and upper
++  alpha  :(con lower upper)

Character set of the ASCII letters.

++ascii

::  ascii: bitmask of every character code from 0 through 127
++  ascii  (ranc 0 127)

  ::  blank: bitmask with the bits for codes 32 and 9 set
  ++  blank  (con (bex 32) (bex 9))

Character set containing space (code 32) and tab (code 9).

++cntrl

  ::  cntrl: bitmask of codes 0 through 31, plus code 127
  ++  cntrl  :(con (ranc 0 31) (bex 127))

Character set of the control characters: codes 0 through 31, and 127.

++digit

  ::  digit: bitmask of the characters '0' through '9'
  ++  digit  (ranc '0' '9')

Character set of the decimal digits `0` through `9`.

++graph

  ::  graph: bitmask of codes 33 through 126
  ++  graph  (ranc 33 126)

Character set of codes 33 through 126: the printable ASCII characters other than space.

++lower

  ::  lower: bitmask of the characters 'a' through 'z'
  ++  lower  (ranc 'a' 'z')

Character set of the lowercase letters `a` through `z`.

++print

  ::  print: bitmask of codes 32 through 126
  ++  print  (ranc 32 126)

Character set of codes 32 through 126: the printable ASCII characters, including space.

++punct

  ::  punct: bitmask of the four character ranges '!' to '/',
  ::  ':' to '@', '[' to '`' and '{' to '~', or-ed together
  ++  punct  ;:  con
               (ranc '!' '/')
               (ranc ':' '@')
               (ranc '[' '`')
               (ranc '{' '~')
             ==

Character set of ASCII punctuation: the ranges `!` to `/`, `:` to `@`, `[` to `` ` `` and `{` to `~`.

++space

  ::  space: bitmask of codes 9 through 13, plus the space character
  ++  space  :(con (ranc 9 13) (bex ' '))

Character set of whitespace: codes 9 through 13, and the space character.

++upper

  ::  upper: bitmask of the characters 'A' through 'Z'
  ++  upper  (ranc 'A' 'Z')

Character set of the uppercase letters `A` through `Z`.

++white

  ::  white: bitmask of the space character and codes 9, 10, 12 and
  ::  13.  unlike `space`, code 11 is not included.
  ++  white  :(con (bex ' ') (ranc 9 10) (ranc 12 13))

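Character set used for `\s`: the space character and codes 9, 10, 12 and 13. Unlike `space`, it does not include code 11.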

++wordc

  ::  wordc: bitmask of digits, lowercase and uppercase letters, and
  ::  the underscore
  ++  wordc  :(con digit lower upper (bex '_'))

Character set of word characters: digits, lowercase and uppercase letters, and `_`.

++xdigit

  ::  xdigit: bitmask of 'a' through 'f', 'A' through 'F', and the
  ::  decimal digits
  ++  xdigit  :(con (ranc 'a' 'f') (ranc 'A' 'F') digit)
  ::

Character set of the hexadecimal digits: `a` through `f`, `A` through `F`, and `0` through `9`.

++chad

  ::  chad: a single-character atom.  alternatives, tried in order:
  ::    - an ordinary character (nor), as [%lite char]
  ::    - a shorthand class (unid), as [%brac mask]
  ::    - a backslash followed by an escape decoded by escp
  ++  chad
    %+  knee  *rege  |.  ~+
    ;~(pose (stag %lite nor) (stag %brac unid) ;~(pfix bas escp))
  ::

XX document

++escd

  ::  escd: decode the character after a backslash inside a bracket
  ::  expression (seap consumes the backslash).  produces a bitmask
  ::  with a single bit set:
  ::    a t n v f r   codes 7, 9, 10, 11, 12, 13
  ::    0             code 0
  ::    2-3 octal digits, accepted only when the value is below 256
  ::    xHH or x{HH}  exactly two hex digits
  ::    any other printable non-alphanumeric character (mis), itself
  ::  NOTE(review): the '0' alternative comes before the octal rule,
  ::  so an escape that begins with 0 looks like it is consumed there
  ::  as code 0 - confirm this is intended.
  ++  escd
    %+  knee  *@  |.  ~+
    ;~  pose
      (cold (bex 7) (just 'a'))
      (cold (bex 9) (just 't'))
      (cold (bex 10) (just 'n'))
      (cold (bex 11) (just 'v'))
      (cold (bex 12) (just 'f'))
      (cold (bex 13) (just 'r'))
      (cold (bex 0) (just '0'))
      (sear |=(a=@ ?:((lth a 256) (some (bex a)) ~)) (bass 8 (stun [2 3] cit)))
      (cook bex ;~(pfix (just 'x') (bass 16 (stun [2 2] hit))))
      (cook bex (ifix [(jest 'x{') ker] (bass 16 (stun [2 2] hit))))
      (cook bex mis)
    ==
  ::

XX document

++escp

  ::  escp: decode the character after a backslash outside a bracket
  ::  expression (chad consumes the backslash).  produces a rege:
  ::    Q             %empt
  ::    0 a t n v f r [%lite c], c being code 0, 7, 9, 10, 11, 12, 13
  ::    2-3 octal digits, accepted only when the value is below 256,
  ::                  as [%lite value]
  ::    xHH or x{HH}  exactly two hex digits, as [%lite value]
  ::    C             %dote
  ::    A             %sart
  ::    z             %ende
  ::    b             %boun
  ::    B             %bout
  ::    w             [%brac wordc]
  ::    W             [%brac (flap wordc)]
  ::    any other printable non-alphanumeric character (mis), as
  ::                  [%lite char]
  ::  NOTE(review): the '0' alternative comes before the octal rule,
  ::  so an escape that begins with 0 looks like it is consumed there
  ::  as code 0 - confirm this is intended.
  ++  escp
    %+  knee  *rege  |.  ~+
    ;~  pose
      (cold %empt (just 'Q'))
      (cold [%lite `@tD`0] (just '0'))
      (cold [%lite `@tD`7] (just 'a'))
      (cold [%lite `@tD`9] (just 't'))
      (cold [%lite `@tD`10] (just 'n'))
      (cold [%lite `@tD`11] (just 'v'))
      (cold [%lite `@tD`12] (just 'f'))
      (cold [%lite `@tD`13] (just 'r'))
      (sear |=(a=@ ?:((lth a 256) (some [%lite a]) ~)) (bass 8 (stun [2 3] cit)))
      (stag %lite ;~(pfix (just 'x') (bass 16 (stun [2 2] hit))))
      (stag %lite (ifix [(jest 'x{') ker] (bass 16 (stun [2 2] hit))))
      (cold %dote (just 'C'))
      (cold %sart (just 'A'))
      (cold %ende (just 'z'))
      (cold %boun (just 'b'))
      (cold %bout (just 'B'))
      (stag %brac (cold wordc (just 'w')))
      (stag %brac (cold (flap wordc) (just 'W')))
      (stag %lite mis)
    ==
  ::

XX document

++unid

  ::  unid: shorthand character classes, as bitmasks:
  ::    \d  digit     \D  complement of digit
  ::    \s  white     \S  complement of white
  ::    \w  wordc     \W  complement of wordc
  ::  the `--` below closes the core of regex parser arms.
  ++  unid
    %+  knee  *@  |.  ~+
    ;~  pose
      (cold digit (jest '\\d'))
      (cold (flap digit) (jest '\\D'))
      (cold white (jest '\\s'))
      (cold (flap white) (jest '\\S'))
      (cold wordc (jest '\\w'))
      (cold (flap wordc) (jest '\\W'))
    ==
  --
::

XX document

++ra

::  ra: door over a parsed regular expression `a`; the arms that
::  follow (proc, matc, deep and helpers) operate on that sample.
++  ra                                                  ::  regex engine
  |_  a=rege

XX document

++proc

  ::  proc: number the capture groups in `a`, starting from `b`.
  ::
  ::  the ?- walks the rege and yields [p a]: `a` is the rege with
  ::  its %capt indices filled in, and `p` is the next unused index.
  ::  a %capt node takes index `b` and numbers its contents from
  ::  +(b); %eith and %pair number p.a first and carry the resulting
  ::  index into q.a; repetition nodes recurse into p.a and keep
  ::  their counts; any other node is returned unchanged with `b`.
  ::
  ::  the leading =- then replaces the tail of that cell with this
  ::  door, its sample set to the renumbered rege, so callers can
  ::  read p.result and call arms such as matc on the result.
  ++  proc                                              ::  capture numbering
    |=  b=@
    =-  -(+ +>.$(a a))
    ^-  [p=@ a=rege]
    ?-  a
      [%capt *]  =+  foo=$(a p.a, b +(b))
                 [p.foo [%capt a.foo b]]
      [%eith *]  =+  foo=$(a p.a)
                 =+  bar=$(a q.a, b p.foo)
                 [p.bar [%eith a.foo a.bar]]
      [%pair *]  =+  foo=$(a p.a)
                 =+  bar=$(a q.a, b p.foo)
                 [p.bar [%pair a.foo a.bar]]
      [%manl *]  =+  foo=$(a p.a)
                 [p.foo [%manl a.foo]]
      [%plll *]  =+  foo=$(a p.a)
                 [p.foo [%plll a.foo]]
      [%binl *]  =+  foo=$(a p.a)
                 [p.foo [%binl a.foo q.a]]
      [%betl *]  =+  foo=$(a p.a)
                 [p.foo [%betl a.foo q.a r.a]]
      [%mant *]  =+  foo=$(a p.a)
                 [p.foo [%mant a.foo]]
      [%plls *]  =+  foo=$(a p.a)
                 [p.foo [%plls a.foo]]
      [%bant *]  =+  foo=$(a p.a)
                 [p.foo [%bant a.foo q.a]]
      [%bint *]  =+  foo=$(a p.a)
                 [p.foo [%bint a.foo q.a]]
      [%betw *]  =+  foo=$(a p.a)
                 [p.foo [%betw a.foo q.a r.a]]
      *  [b a]
    ==
  ::

XX document

++cont

  ::  cont: merge two capture maps.  the entries of `a` followed by
  ::  the entries of `b` are inserted, in that order, into an empty
  ::  map.
  ++  cont
    |=  [a=(map ,@u tape) b=(map ,@u tape)]
    (~(gas by _(map ,@u tape)) (weld (~(tap by a)) (~(tap by b))))
  ::

Merges two capture maps into one by inserting the entries of `a`, then the entries of `b`, into an empty map.

++abor

  ::  abor: prepend character `a` to the matched text of result `b`.
  ::  a null `b` is returned as is; otherwise `a` is consed onto the
  ::  head tape and the capture map is kept unchanged.
  ++  abor
    |=  [a=char b=(unit ,[tape (map ,@u tape)])]
    ^-  (unit ,[tape (map ,@u tape)])
    ?~  b
      b
    [~ [[a -.u.b] +.u.b]]
  ::

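Prepends the character `a` to the matched text of a match result `b`, leaving the capture map unchanged. A null `b` (no match) is returned as is.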

++matc

  ::  matc: match the door's rege at the front of tape `b`, with `c`
  ::  passed to deep as the whole subject.  deep is run with %empt as
  ::  the continuation; on success the matched text is stored in the
  ::  capture map under index 0 and the map is produced.  produces ~
  ::  when deep fails.
  ++  matc
    |=  [b=tape c=tape]
    ^-  (unit (map ,@u tape))
    =+  foo=`(unit ,[tape (map ,@u tape)])`(deep b %empt c)
    (bind foo |*(a=^ (~(put by +.a) 0 -.a)))
  ::

XX document

++chet

  ::  chet: check that a candidate match `b` lets the rest of the
  ::  pattern match.  `c` is the tape the candidate was matched
  ::  against and `d` is the whole subject.
  ::
  ::  a failed candidate, or one whose matched text is empty, is
  ::  returned untouched.  otherwise the door's rege is matched, via
  ::  deep, against `c` with the candidate's text dropped from the
  ::  front; if that fails the failure is produced, else `b`.
  ::  deep calls this as chet(a c), i.e. with the door's rege set to
  ::  the continuation.
  ++  chet
    |=  [b=(unit ,[tape (map ,@u tape)]) c=tape d=tape]
    ^-  (unit ,[tape (map ,@u tape)])
    ?~  b
      b
    ?~  -.u.b
      b
    =+  bar=(deep (slag (lent -.u.b) c) %empt d)
    ?~  bar
      bar
    b

XX document

++blak

  ::  blak: a successful result with empty matched text and an empty
  ::  capture map
  ++  blak  (some ["" _(map ,@u tape)])

A successful match of the empty string with no captures.

++word

  ::  word: yields & when `a` is NOT in wordc:rags (its bit is clear
  ::  in the mask) and | when it is - the opposite of what the name
  ::  may suggest.  deep uses it for %boun and %bout.
  ++  word  |=(a=char =((dis wordc:rags (bex a)) 0))

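Produces `&` if the character `a` is not a word character (its bit is clear in `wordc:rags`) and `|` if it is. Used by `++deep` to evaluate the word-boundary nodes `%boun` and `%bout`.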

++deep

  ::  deep: match the door's rege `a` against the front of tape `b`.
  ::
  ::  `c` is the rege that must match after `a` (%empt when there is
  ::  none); `d` is the whole subject tape, compared with `b` to
  ::  detect the start of input and to find the preceding character.
  ::  produces ~ on failure, otherwise the matched prefix of `b`
  ::  paired with a map from capture index to captured text.
  ::
  ::    %dote   any one character
  ::    %ende   succeeds, consuming nothing, only when `b` is empty
  ::    %sart   succeeds, consuming nothing, only when `b` equals `d`
  ::    %empt   always succeeds, consuming nothing
  ::    %boun   consumes nothing; succeeds when the `word` tests on
  ::            the characters before and after the position differ
  ::            (start and end of input count as & for that test)
  ::    %bout   as %boun, but succeeds when the two tests agree
  ::    %capt   match p.a, require `c` to match what follows, then
  ::            record the matched text under index q.a
  ::    %lite   one character equal to p.a
  ::    %brac   one character whose bit is set in the mask p.a
  ::    %eith   try p.a and q.a, each vetted against `c` by chet; if
  ::            both survive, the longer match wins, ties to p.a
  ::    %pair   match p.a with q.a pushed onto the continuation, then
  ::            q.a on the remainder; texts are welded and capture
  ::            maps merged with cont
  ::    %manl %mant %plls %plll %binl %bant %bint %betw %betl
  ::            repetitions; each is rewritten into %eith / %pair
  ::            forms with a decremented count and matched again.
  ::            %manl and %mant first probe p.a and yield blak when
  ::            it fails or matches the empty tape.
  ::  the `--` below closes the ra door.
  ++  deep
    |=  [b=tape c=rege d=tape]
    ^-  (unit ,[tape (map ,@u tape)])
    ?-  a
      %dote  ?~(b ~ (some [[i.b ~] _(map ,@u tape)]))
      %ende  ?~(b blak ~)
      %sart  ?:(=(b d) blak ~)
      %empt  blak
      %boun  =+  ^=  luc
                 ?:  =(b d)
                   &
                 =+  foo=(slag (dec (sub (lent d) (lent b))) d)
                 (word -.foo)
             =+  cuc=?~(b & (word -.b))
             ?:(!=(luc cuc) blak ~)
      %bout  =+  ^=  luc
                 ?:  =(b d)
                   &
                 =+  foo=(slag (dec (sub (lent d) (lent b))) d)
                 (word -.foo)
             =+  cuc=?~(b & (word -.b))
             ?:(=(luc cuc) blak ~)
      [%capt *]  =+  foo=$(a p.a)
                 ?~  foo
                   foo
                 =+  ft=u.foo
                 =+  bar=$(a c, b (slag (lent -.ft) b), c %empt)
                 ?~  bar
                   bar
                 [~ [-.ft (~(put by +.ft) q.a -.ft)]]
      [%lite *]  ?~(b ~ ?:(=(i.b p.a) (some [[i.b ~] _(map ,@u tape)]) ~))
      [%brac *]  ?~  b
                   ~
                 ?.  =((dis (bex `@`i.b) p.a) 0)
                   (some [[i.b ~] _(map ,@u tape)])
                 ~
      [%eith *]  =+  foo=(chet(a c) $(a p.a) b d)
                 =+  bar=(chet(a c) $(a q.a) b d)
                 ?~  foo
                   bar
                 ?~  bar
                   foo
                 =+  ft=u.foo
                 =+  bt=u.bar
                 ?:  (gte (lent -.ft) (lent -.bt))
                   foo
                 bar
      [%pair *]  =+  foo=$(a p.a, c [%pair q.a c])
                 ?~  foo
                   foo
                 =+  ft=u.foo
                 =+  bar=$(a q.a, b (slag (lent -.ft) b))
                 ?~  bar
                   bar
                 =+  bt=u.bar
                 [~ [(weld -.ft -.bt) (cont +.ft +.bt)]]
      [%manl *]  =+  foo=$(a p.a)
                 ?~  foo
                   blak
                 ?~  -.u.foo
                   blak
                 $(a [%eith %empt [%pair p.a [%eith %empt a]]])
      [%mant *]  =+  foo=$(a p.a)
                 ?~  foo
                   blak
                 =+  ft=u.foo
                 ?~  -.ft
                   blak
                 $(a [%eith [%pair p.a [%eith a %empt]] %empt])
      [%plls *]  $(a [%pair p.a [%mant p.a]])
      [%plll *]  $(a [%pair p.a [%manl p.a]])
      [%binl *]  =+  min=?:(=(q.a 0) 0 (dec q.a))
                 ?:  =(q.a 0)
                   $(a [%manl p.a])
                 $(a [%pair p.a [%binl p.a min]])
      [%bant *]  ?:  =(0 q.a)
                   blak
                 $(a [%pair p.a [%bant p.a (dec q.a)]])
      [%bint *]  =+  min=?:(=(q.a 0) 0 (dec q.a))
                 ?:  =(q.a 0)
                   $(a [%mant p.a])
                 $(a [%pair p.a [%bint p.a min]])
      [%betw *]  ?:  =(0 r.a)
                   blak
                 ?:  =(q.a 0)
                   $(a [%eith [%pair p.a [%betw p.a 0 (dec r.a)]] %empt])
                 $(a [%pair p.a [%betw p.a (dec q.a) (dec r.a)]])
      [%betl *]  ?:  =(0 r.a)
                   blak
                 ?:  =(q.a 0)
                   $(a [%eith %empt [%pair p.a [%betl p.a 0 (dec r.a)]]])
                 $(a [%pair p.a [%betl p.a (dec q.a) (dec r.a)]])
    ==
  --
::

XX document

++rexp

::  rexp: search tape `b` for the first match of the regex in `a`.
::
::  produces ~ when `a` does not parse, [~ ~] when it parses but
::  matches at no position of `b`, and otherwise [~ ~ map], where
::  map takes a capture index to its captured text (matc stores the
::  whole match under index 0).
::
::  `bar` is a helper gate: counting its first argument down to 0,
::  it adds an empty tape for every index that is missing from the
::  map.  captures are numbered from 1 by proc, so (dec p.poc) is
::  the highest index in use.  the |- loop tries the match at `c`,
::  and on failure drops one character from `c` and tries again,
::  giving up once `c` is empty.
++  rexp                                                :: Regex match
  ~/  %rexp
  |=  [a=tape b=tape]
  ^-  (unit (unit (map ,@u tape)))
  =+  ^=  bar
      |=  [a=@ b=(map ,@u tape)]
      ?:  =(a 0)
        b
      =+  c=(~(get by b) a)
      ?~  c
        $(a (dec a), b (~(put by b) a ""))
      $(a (dec a))
  =+  par=(pars a)
  ?~  par  ~
  =+  poc=(~(proc ra u.par) 1)
  =+  c=b
  |-
  =+  foo=(matc:poc c b)
  ?~  foo
    ?~  c
      [~ ~]
    $(c t.c)
  [~ [~ (bar (dec p.poc) u.foo)]]
::

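Searches the tape `b` for the first match of the regular expression in the tape `a`. Produces `~` if `a` is not a valid regular expression, `[~ ~]` if it is valid but matches nowhere in `b`, and otherwise a map from capture-group index to captured text, in which index `0` holds the whole match and groups that did not take part map to the empty tape.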

++repg

::  repg: replace every match of the regex `a` in tape `b` with `c`.
::
::  produces ~ when `a` does not parse, otherwise the rewritten tape
::  in a unit.  `d` is the part of `b` not yet scanned; the whole of
::  `b` is still handed to matc as the subject.
::
::  NOTE(review): the ~/ hint names this arm for a jet; if one
::  exists it must agree with the empty-match handling below - TODO
::  confirm.
++  repg                                                :: Global regex replace
  ~/  %repg
  |=  [a=tape b=tape c=tape]
  ^-  (unit tape)
  =+  par=(pars a)
  ?~  par  ~
  =+  poc=(~(proc ra u.par) 1)
  =+  d=b
  :-  ~
  |-
  ^-  tape
  =+  foo=(matc:poc d b)
  ?~  foo
    ::  no match at this position: copy one character and move on
    ?~  d
      ~
    [i.d $(d t.d)]
  =+  ft=(need (~(get by u.foo) 0))
  ?~  d
    c
  ::  an empty match consumes nothing, so recursing on an unchanged
  ::  `d` would never terminate.  emit the replacement, copy one
  ::  character through, and resume after it.
  ?~  ft
    (weld c `tape`[i.d $(d t.d)])
  (weld c $(d `tape`(slag (lent ft) `tape`d)))
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::