regex holes refactor

This commit is contained in:
Rijnard van Tonder 2021-04-15 23:23:26 -07:00
parent b1e29ce339
commit 9d839d1a1b
8 changed files with 571 additions and 42 deletions

View File

@ -3,4 +3,4 @@
(public_name comby-kernel.matchers)
(instrumentation (backend bisect_ppx))
(preprocess (pps ppx_here ppx_sexp_conv ppx_sexp_message ppx_deriving_yojson))
(libraries comby-kernel.parsers comby-kernel.match angstrom core_kernel mparser mparser-pcre yojson ppx_deriving_yojson))
(libraries comby-kernel.parsers comby-kernel.match angstrom core_kernel mparser mparser-pcre re yojson ppx_deriving_yojson))

View File

@ -27,6 +27,9 @@ let current_environment_ref : Match.Environment.t ref = ref (Match.Environment.c
let (|>>) p f =
p >>= fun x -> return (f x)
let ignore p =
p *> return ()
let debug =
match Sys.getenv "DEBUG_COMBY" with
| exception Not_found -> false
@ -73,7 +76,7 @@ let substitute template env =
let record_match_context pos_before pos_after =
let open Match.Location in
if debug then Format.printf "match context start pos: %d@." pos_before;
if debug then Format.printf "match context end pos %d@." pos_after;
if debug then Format.printf "match context end pos (this is wrong) %d@." pos_after;
let extract_matched_text source { offset = match_start; _ } { offset = match_end; _ } =
String.slice source match_start match_end
in
@ -351,7 +354,55 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct
| Ok (Hole { sort; identifier; dimension; _ }, user_state) ->
begin
match sort with
| Regex -> failwith "Not supported (seq chain)"
| Regex ->
let identifier, pattern = String.lsplit2_exn identifier ~on:'~' in
let identifier = if String.(identifier = "") then "_" else identifier in
if debug then Format.printf "Regex: Id: %s Pat: %s@." identifier pattern;
let compiled_regexp = Regexp.PCRE.make_regexp pattern in
let regexp_parser = Regexp.PCRE.regexp compiled_regexp in
let base_parser = [ regexp_parser; end_of_input >>= fun () -> return "" ] in (* the eof matters here for that one tricky test case *)
let base_parser =
(* adds begin line parser if the pattern has ^ anchor *)
if String.is_prefix pattern ~prefix:"^" then
let p =
Regexp.PCRE.make_regexp (String.drop_prefix pattern 1) |> Regexp.PCRE.regexp
in
(char '\n' >>= fun _ -> p)::base_parser
else
base_parser
in
let base_parser =
if String.is_suffix pattern ~suffix:"$" then
let p = Regexp.PCRE.make_regexp (String.drop_prefix pattern 1) |> Regexp.PCRE.regexp in
(p <* (ignore @@ char '\n' <|> end_of_input))::base_parser
else
base_parser
in
pos >>= fun offset ->
if debug then Format.printf "(X)@.";
choice base_parser
>>= fun value ->
if debug then Format.printf "Regex match @@ %d value %s@." offset value;
let offset =
if String.length value = 0 then
offset (*offset + 1 this may not matter, if we correct for the whole match conext *)
else
offset
in
(if String.length value = 0 then
(*advance 1*)
advance 0
else
advance @@ String.length value) >>= fun () ->
if debug then Format.printf "(Y)@.";
acc >>= fun _ ->
let m =
{ offset
; identifier
; text = value
}
in
r user_state (Match m)
| Alphanum ->
pos >>= fun offset ->
many1 (generate_single_hole_parser ())
@ -515,10 +566,10 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct
it in the template. So it should always match to
end_of_input (not empty string) *)
if !i = 0 then
(if debug then Format.printf "hole until: match to the end of this level@.";
(if debug then Format.printf "hole everything until: match to the end of this level@.";
end_of_input)
else
(if debug then Format.printf "hole until: append suffix@.";
(if debug then Format.printf "hole everything until: append suffix@.";
skip_unit acc)
in
let first_pos = ref (-1) in
@ -567,6 +618,7 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct
; text
}
in
if debug then Format.printf "Recording!@.";
r user_state (Match m)
end
| Ok (_, _user_state) -> failwith "unreachable: _signal_hole parsed but not handled by Hole variant"
@ -618,6 +670,23 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct
*> identifier_parser ()
<* string "]"
let regex_expression () =
fix (fun expr ->
choice
[ lift (fun x -> Format.sprintf "[%s]" @@ String.concat x) (char '[' *> many1 expr <* char ']')
; lift (fun c -> Format.sprintf {|\%c|} c) (char '\\' *> any_char)
; lift Char.to_string (not_char ']')
])
let regex_body () =
lift2
(fun v e -> Format.sprintf "%s~%s" v (String.concat e))
(identifier_parser ())
(char '~' *> many1 (regex_expression ()))
let regex_hole_parser () =
string ":[" *> regex_body () <* string "]"
let hole_parser sort dimension : (production * 'a) t t =
let open Hole in
let hole_parser =
@ -628,7 +697,7 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct
| Line -> line_hole_parser ()
| Non_space -> non_space_hole_parser ()
| Expression -> expression_hole_parser ()
| Regex -> single_hole_parser ()
| Regex -> regex_hole_parser ()
in
let skip_signal hole = skip_unit (string "_signal_hole") |>> fun () -> (Hole hole, acc) in
hole_parser |>> fun identifier -> skip_signal { sort; identifier; dimension; optional = false; at_depth = None }
@ -672,7 +741,9 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct
let general_parser_generator : (production * 'a) t t =
let spaces : (production * 'a) t t =
many1 (comment_parser <|> spaces1) |>> fun result -> generate_spaces_parser (String.concat result)
lift
(fun result -> generate_spaces_parser (String.concat result))
(many1 (comment_parser <|> spaces1))
in
let other =
(many1 (Parser.Deprecate.any_char_except ~reserved:Deprecate.reserved) |>> String.of_char_list)
@ -740,38 +811,31 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct
; any_char |>> Char.to_string
]
in
(* many1 may be appropriate *)
let matches =
many @@
many_till (prefix >>= fun s -> r acc (String s))
begin
at_end_of_input >>= fun at_end ->
if debug then Format.printf "We are at the end? %b.@." at_end;
if at_end then fail "end"
else
(* We may have found a match *)
pos >>= fun start_pos ->
let matched =
matcher >>= fun production ->
if debug then Format.printf "Full match context result@.";
pos >>= fun end_pos ->
record_match_context start_pos end_pos;
current_environment_ref := Match.Environment.create ();
return production
in
let no_match =
(* Reset any partial binds of holes in environment. *)
if debug then Format.printf "Failed to match and not at end.@.";
current_environment_ref := Match.Environment.create ();
(* cannot return: we must try some other parser or else we'll
infini loop! We can't advance because we haven't
successfully parsed the character at the current position.
So: fail and try another parser in the choice. *)
fail "no match, try something else"
in
choice [ matched; no_match ]
end
let match_one =
pos >>= fun start_pos ->
current_environment_ref := Match.Environment.create ();
consumed matcher >>= fun value ->
if debug then Format.printf "Full match context result: %s@." value;
pos >>= fun end_pos ->
let start_pos =
if String.length value = 0 then
start_pos (*offset + 1 this may not matter, if we correct for the whole match conext *)
else
start_pos
in
(if String.length value = 0 then
advance 1
else
return ()) >>= fun () ->
if debug then Format.printf "Calculated end_pos %d@." end_pos;
record_match_context start_pos (end_pos);
current_environment_ref := Match.Environment.create ();
return (Unit, "")
in
(* many1 may be appropriate *)
let prefix = (prefix >>= fun s -> r acc (String s)) in
let first_match_attempt = choice [match_one; prefix] in (* consumes a character in prefix if no match *)
let matches = many_till first_match_attempt end_of_input in
matches >>= fun _result ->
r acc Unit
@ -908,7 +972,6 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct
(* Don't reverse the list for non-nested matches--it matters for rewriting. *)
aux_all ?configuration ~nested ~template ~source:original_source ()
let first ?configuration ?shift:_ template source : Match.t Or_error.t =
configuration_ref := Option.value configuration ~default:!configuration_ref;
matches_ref := [];

View File

@ -0,0 +1,131 @@
open Angstrom
let debug =
match Sys.getenv "DEBUG_COMBY" with
| exception Not_found -> false
| _ -> true
module type Regexp_engine_intf = sig
type t
type substrings
val make: string -> t
val get_substring: substrings -> int -> string option
val get_all_substrings: substrings -> string array
val exec: rex:t -> pos:int -> Bytes.t -> substrings option
end
type t =
{ buffer_pos : int
; buffer : bytes
}
(* I think I should just implement the analog of string_ for regex with some bounded buffer size. *)
module Make (Regexp: Regexp_engine_intf) = struct
(* https://sourcegraph.com/github.com/comby-tools/mparser/-/blob/src/mParser_Char_Stream.ml#L231:8 *)
let match_regexp s pos rex =
Regexp.exec ~rex ~pos:(pos - s.buffer_pos) s.buffer
let make_regexp pat =
Regexp.make pat
(* TODO: tests and blit thing below *)
(* FIXME: size. about advance => want to use internal unsafe_apply_opt
actually. cf. string_ in angstrom.ml. instead, trying "do peek, then
advance/commit." *)
let regexp rex : string Angstrom.t =
(* Why do Unsafe if I can just do peek_string? => So I don't allocate on copy of buffer. *)
(* But it looks like we can't avoid allocation in converting bigstringaf to bytes *)
Unsafe.peek 1 (fun buffer ~off ~len:_ -> Bigstringaf.length buffer - off) >>= fun n ->
Unsafe.peek n (fun buffer ~off ~len ->
(* This still does a copy :( *)
let bytes = Bytes.create len in
Bigstringaf.unsafe_blit_to_bytes buffer ~src_off:off bytes ~dst_off:0 ~len;
if debug then Format.printf "Matching regex against string: %S@." @@ Bytes.to_string bytes;
match Regexp.exec ~rex ~pos:0 bytes with
| None ->
if debug then Format.printf "None (1)@.";
None
| Some substrings ->
match Regexp.get_substring substrings 0 with
| None ->
if debug then Format.printf "None (2)@.";
None
| Some result ->
if debug then Format.printf "Matchy Matchy (3)@.";
Some (result, String.length result))
>>= function
| Some (result, _n) ->
(* if empty string matches, this hole like for optionals (x?), advance 1. *)
(* we want to advance one so parsing can continue, but if we advance 1 here we will think
that the match context is at least length 1 and not 0 if this hole is the only thing
defining the match context *)
(* let n = if n > 0 then n else 1 in
advance n >>= fun () -> *)
if debug then Format.printf "Result indeed: %s len %d@." result _n;
return result
| None ->
fail "No match"
end
module PCRE = struct
module Engine : Regexp_engine_intf = struct
type t = Pcre.regexp
type substrings = Pcre.substrings
let compile_flags =
Pcre.cflags [ `ANCHORED ]
let make pattern =
Pcre.regexp ~iflags:compile_flags pattern
let get_substring s idx =
match Pcre.get_substring s idx with
| result -> Some result
| exception Not_found
| exception Invalid_argument _ -> None
let get_all_substrings s =
Pcre.get_substrings s
let exec ~rex ~pos b =
match Pcre.exec ~pos ~rex (Bytes.unsafe_to_string b) with
| result -> Some result
| exception Not_found -> None
end
include Make(Engine)
end
module RE = struct
module Engine : Regexp_engine_intf = struct
type t = Re.re
type substrings = Re.substrings
let compile_flags =
[ `Anchored ]
let make pattern =
Re.Perl.(compile (re ~opts:compile_flags pattern))
let get_substring s idx =
match Re.get s idx with
| result -> Some result
| exception Not_found -> None
let get_all_substrings s =
Re.get_all s
let exec ~rex ~pos b =
match Re.exec ~pos rex (Bytes.unsafe_to_string b) with
| result -> Some result
| exception Not_found -> None
end
include Make(Engine)
end

View File

@ -0,0 +1,37 @@
module type Regexp_engine_intf = sig
type t
type substrings
val make: string -> t
val get_substring: substrings -> int -> string option
val get_all_substrings: substrings -> string array
val exec: rex:t -> pos:int -> Bytes.t -> substrings option
end
(** Represents character stream right now.
Compare char stream interface on t and match_regexp descriptions
in https://sourcegraph.com/github.com/comby-tools/mparser/-/blob/src/mParser_Char_Stream.mli#L102:8
*)
type t
module Make (Regexp : Regexp_engine_intf): sig
(* do not use this, use regexp. *)
val match_regexp: t -> int -> Regexp.t -> Regexp.substrings option
val make_regexp : string -> Regexp.t
val regexp : Regexp.t -> string Angstrom.t
end
module PCRE : sig
module Engine : Regexp_engine_intf
include module type of Make (Engine)
end
module RE : sig
module Engine : Regexp_engine_intf
include module type of Make (Engine)
end

View File

@ -5,8 +5,7 @@
test_custom_metasyntax
test_optional_holes
test_special_matcher_cases
test_substring_disabled
test_regex_holes)
test_substring_disabled)
(inline_tests)
(preprocess (pps ppx_expect ppx_sexp_message ppx_deriving_yojson))
(libraries

View File

@ -2,6 +2,7 @@
(name common_test_integration)
(package comby)
(modules
; test_server ; disabled, factor out.
test_helpers
test_cli_helper
test_script
@ -10,7 +11,6 @@
test_omega
test_cli
test_cli_list
; test_server
test_statistics
test_offset_conversion
test_parse_rule
@ -65,6 +65,9 @@
test_nested_matches_alpha
test_nested_matches_omega
test_regex_holes_alpha
test_regex_holes_omega
)
(inline_tests)
(preprocess (pps ppx_expect ppx_sexp_message ppx_deriving_yojson))

View File

@ -0,0 +1,296 @@
open Core
open Rewriter
open Matchers.Omega
let configuration = Matchers.Configuration.create ~match_kind:Fuzzy ()
let run ?(configuration = configuration) (module M : Matchers.Matcher.S) source match_template ?rule rewrite_template =
let rule =
match rule with
| Some rule -> Language.Rule.create rule |> Or_error.ok_exn
| None -> Language.Rule.create "where true" |> Or_error.ok_exn
in
M.all ~configuration ~template:match_template ~source ()
|> List.filter ~f:(fun { Match.environment; _ } -> Language.Rule.Alpha.(sat @@ apply rule environment))
|> function
| [] -> print_string "No matches."
| results ->
Option.value_exn (Rewrite.all ~source ~rewrite_template results)
|> (fun { rewritten_source; _ } -> rewritten_source)
|> print_string
let%expect_test "regex_holes_simple" =
let source = {|foo|} in
let match_template = {|:[x~\w+]|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|(foo)|}]
let%expect_test "regex_holes_simple_posix" =
let source = {|foo|} in
let match_template = {|:[x~[[:alpha:]]]|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|(f)(o)(o)|}]
let%expect_test "regex_holes_substring" =
let source = {|foo()|} in
let match_template = {|:[x~o\w]()|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|f(oo)|}]
let%expect_test "regex_holes_empty_string_terminates" =
let source = {|foo()|} in
let match_template = {|:[x~|]|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|()f()o()o()(())|}]
let%expect_test "regex_holes_repetition_takes_precedence" =
let source = {|foobar()|} in
(* this will _not_ match because bar is consumed by \w before we look ahead *)
let match_template = {|:[x~\w+]bar()|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|No matches.|}]
let%expect_test "regex_holes_negated_match" =
let source = {|(literally_anyting_except_close_paren?!@#$%^&*[])|} in
let match_template = {|(:[x~[^)]+])|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|(literally_anyting_except_close_paren?!@#$%^&*[])|}];
let source = {|(arg1, arg2, arg3)|} in
let match_template = {|:[x~[^,() ]+]|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|((arg1), (arg2), (arg3))|}]
let%expect_test "regex_holes_dot_star_ok_and_this_is_for_newline" =
let source = "foo()\nbar()" in
let match_template = {|:[x~.*]|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|(foo())()
(bar())|}]
let%expect_test "regex_holes_optional" =
let source = "nonovember no november no vember" in
let match_template = {|:[x~no(vember)?]|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|(no)(november) (no) (november) (no) vember|}]
let%expect_test "regex_holes_optional_spaces" =
let source = "nonovember no november no vember" in
let match_template = {|no :[x~(vember)?]|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|nonovember ()november (vember)|}]
let%expect_test "regex_holes_optional_doesnt_work_outside_regex" =
let source = "no" in
let match_template = {|no:[x~(vember)?]|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|()|}];
let source = "foo bar foobar" in
let match_template = {|:[x~\s*?]|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|()f()o()o() ()b()a()r() ()f()o()o()b()a()r|}];
let source = "foo bar foobar" in
let match_template = {|:[x~\s*]|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|()f()o()o( )()b()a()r( )()f()o()o()b()a()r|}]
let%expect_test "regex_holes_optional_strip_no_from_november_outside_regex" =
let source = "nonovember no november no vember" in
let match_template = {|no:[x~(vember)?]|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|()(vember) () (vember) () vember|}]
let%expect_test "regex_holes_optional_strip_no_from_november_inside_regex" =
let source = "nonovember no november no vember" in
let match_template = {|:[x~no(vember)?]|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|(no)(november) (no) (november) (no) vember|}]
let%expect_test "leading_spaces_beginning_line_anchor" =
let source = {|
a
b
c
|}
in
let match_template = {|:[x~^(\s+)]|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|( )a( )b( )c(
)|}]
let%expect_test "spaces_star" =
let source = {|
a
b
c
d
|}
in
let match_template = {|:[x~\s*]|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
(* The <spaces><empty space>chars is how this behaves on https://regexr.com/59ft0 as well, see replace *)
[%expect_exact {|(
)()a(
)()b(
)()c(
)()d(
)|}]
(*
let%expect_test "end_line_anchor" =
let source = {|
aaa bbb
aaa bbb ccc
ccc ddd
|}
in
let match_template = {|:[x~\w+ bbb$]|} in
let rewrite_template = {|(:[x])|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|
(aaa bbb)aaa bbb ccc
ccc ddd
|}]
*)
let%expect_test "word_boundaries" =
let source = {|
foo(bar, baz(),
qux.derp)
|}
in
let match_template = {|:[x~\b\w+\b]|} in
let rewrite_template = {|(>:[x]<)|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|
(>foo<)((>bar<), (>baz<)(),
(>qux<).(>derp<))
|}]
(* I previously assumed [^ ] would not match newlines, but it does, and is the
same as regexr https://regexr.com/59fst. To not match newlines, see the next
test with [^\s] *)
let%expect_test "do_not_match_space" =
let source = {|
foo(bar, baz(),
qux.derp)
|}
in
let match_template = {|:[x~[^, ]+]|} in
let rewrite_template = {|(>:[x]<)|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|(>
foo(bar<), (>baz()<),(>
<) (>qux.derp)
<)|}]
let%expect_test "do_not_match_whitespace" =
let source = {|
foo(bar, baz(),
qux.derp)
|}
in
let match_template = {|:[x~[^,\s]+]|} in
let rewrite_template = {|(>:[x]<)|} in
run (module Generic) source match_template rewrite_template;
[%expect_exact {|
(>foo(bar<), (>baz()<),
(>qux.derp)<)
|}]
let%expect_test "eof_anchor" =
let source = {|
setScore(5)
setScore(6)
setScore(6.5)
setScore("")
setScore("hi")
setScore("hi" + "there")
setScore('ho')
setScore(x)
setScore(null)
setScore(4/3.0)
setScore(4.0/3.0)
setScore(4/3)
|}
in
let match_template = {|setScore(:[1])|} in
let rule = {|
where match :[1] {
| ":[~^\\d+$]" -> false
| ":[_]" -> true
}
|}
in
let rewrite_template = "setScore( /*CHECK ME*/ :[1])" in
run (module Generic) source match_template ~rule rewrite_template;
[%expect_exact {|
setScore(5)
setScore(6)
setScore( /*CHECK ME*/ 6.5)
setScore( /*CHECK ME*/ "")
setScore( /*CHECK ME*/ "hi")
setScore( /*CHECK ME*/ "hi" + "there")
setScore( /*CHECK ME*/ 'ho')
setScore( /*CHECK ME*/ x)
setScore( /*CHECK ME*/ null)
setScore( /*CHECK ME*/ 4/3.0)
setScore( /*CHECK ME*/ 4.0/3.0)
setScore( /*CHECK ME*/ 4/3)
|}]