diff --git a/lib/kernel/matchers/dune b/lib/kernel/matchers/dune index 64cbc59..f5672fc 100644 --- a/lib/kernel/matchers/dune +++ b/lib/kernel/matchers/dune @@ -3,4 +3,4 @@ (public_name comby-kernel.matchers) (instrumentation (backend bisect_ppx)) (preprocess (pps ppx_here ppx_sexp_conv ppx_sexp_message ppx_deriving_yojson)) - (libraries comby-kernel.parsers comby-kernel.match angstrom core_kernel mparser mparser-pcre yojson ppx_deriving_yojson)) + (libraries comby-kernel.parsers comby-kernel.match angstrom core_kernel mparser mparser-pcre re yojson ppx_deriving_yojson)) diff --git a/lib/kernel/matchers/omega.ml b/lib/kernel/matchers/omega.ml index af1777c..b53f658 100644 --- a/lib/kernel/matchers/omega.ml +++ b/lib/kernel/matchers/omega.ml @@ -27,6 +27,9 @@ let current_environment_ref : Match.Environment.t ref = ref (Match.Environment.c let (|>>) p f = p >>= fun x -> return (f x) +let ignore p = + p *> return () + let debug = match Sys.getenv "DEBUG_COMBY" with | exception Not_found -> false @@ -73,7 +76,7 @@ let substitute template env = let record_match_context pos_before pos_after = let open Match.Location in if debug then Format.printf "match context start pos: %d@." pos_before; - if debug then Format.printf "match context end pos %d@." pos_after; + if debug then Format.printf "match context end pos (this is wrong) %d@." pos_after; let extract_matched_text source { offset = match_start; _ } { offset = match_end; _ } = String.slice source match_start match_end in @@ -351,7 +354,55 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct | Ok (Hole { sort; identifier; dimension; _ }, user_state) -> begin match sort with - | Regex -> failwith "Not supported (seq chain)" + | Regex -> + let identifier, pattern = String.lsplit2_exn identifier ~on:'~' in + let identifier = if String.(identifier = "") then "_" else identifier in + if debug then Format.printf "Regex: Id: %s Pat: %s@." identifier pattern; + let compiled_regexp = Regexp.PCRE.make_regexp pattern in + let regexp_parser = Regexp.PCRE.regexp compiled_regexp in + let base_parser = [ regexp_parser; end_of_input >>= fun () -> return "" ] in (* the eof matters here for that one tricky test case *) + let base_parser = + (* adds begin line parser if the pattern has ^ anchor *) + if String.is_prefix pattern ~prefix:"^" then + let p = + Regexp.PCRE.make_regexp (String.drop_prefix pattern 1) |> Regexp.PCRE.regexp + in + (char '\n' >>= fun _ -> p)::base_parser + else + base_parser + in + let base_parser = + if String.is_suffix pattern ~suffix:"$" then + let p = Regexp.PCRE.make_regexp (String.drop_prefix pattern 1) |> Regexp.PCRE.regexp in + (p <* (ignore @@ char '\n' <|> end_of_input))::base_parser + else + base_parser + in + pos >>= fun offset -> + if debug then Format.printf "(X)@."; + choice base_parser + >>= fun value -> + if debug then Format.printf "Regex match @@ %d value %s@." offset value; + let offset = + if String.length value = 0 then + offset (*offset + 1 this may not matter, if we correct for the whole match conext *) + else + offset + in + (if String.length value = 0 then + (*advance 1*) + advance 0 + else + advance @@ String.length value) >>= fun () -> + if debug then Format.printf "(Y)@."; + acc >>= fun _ -> + let m = + { offset + ; identifier + ; text = value + } + in + r user_state (Match m) | Alphanum -> pos >>= fun offset -> many1 (generate_single_hole_parser ()) @@ -515,10 +566,10 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct it in the template. So it should always match to end_of_input (not empty string) *) if !i = 0 then - (if debug then Format.printf "hole until: match to the end of this level@."; + (if debug then Format.printf "hole everything until: match to the end of this level@."; end_of_input) else - (if debug then Format.printf "hole until: append suffix@."; + (if debug then Format.printf "hole everything until: append suffix@."; skip_unit acc) in let first_pos = ref (-1) in @@ -567,6 +618,7 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct ; text } in + if debug then Format.printf "Recording!@."; r user_state (Match m) end | Ok (_, _user_state) -> failwith "unreachable: _signal_hole parsed but not handled by Hole variant" @@ -618,6 +670,23 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct *> identifier_parser () <* string "]" + let regex_expression () = + fix (fun expr -> + choice + [ lift (fun x -> Format.sprintf "[%s]" @@ String.concat x) (char '[' *> many1 expr <* char ']') + ; lift (fun c -> Format.sprintf {|\%c|} c) (char '\\' *> any_char) + ; lift Char.to_string (not_char ']') + ]) + + let regex_body () = + lift2 + (fun v e -> Format.sprintf "%s~%s" v (String.concat e)) + (identifier_parser ()) + (char '~' *> many1 (regex_expression ())) + + let regex_hole_parser () = + string ":[" *> regex_body () <* string "]" + let hole_parser sort dimension : (production * 'a) t t = let open Hole in let hole_parser = @@ -628,7 +697,7 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct | Line -> line_hole_parser () | Non_space -> non_space_hole_parser () | Expression -> expression_hole_parser () - | Regex -> single_hole_parser () + | Regex -> regex_hole_parser () in let skip_signal hole = skip_unit (string "_signal_hole") |>> fun () -> (Hole hole, acc) in hole_parser |>> fun identifier -> skip_signal { sort; identifier; dimension; optional = false; at_depth = None } @@ -672,7 +741,9 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct let general_parser_generator : (production * 'a) t t = let spaces : (production * 'a) t t = - many1 (comment_parser <|> spaces1) |>> fun result -> generate_spaces_parser (String.concat result) + lift + (fun result -> generate_spaces_parser (String.concat result)) + (many1 (comment_parser <|> spaces1)) in let other = (many1 (Parser.Deprecate.any_char_except ~reserved:Deprecate.reserved) |>> String.of_char_list) @@ -740,38 +811,31 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct ; any_char |>> Char.to_string ] in - (* many1 may be appropriate *) - let matches = - many @@ - many_till (prefix >>= fun s -> r acc (String s)) - begin - at_end_of_input >>= fun at_end -> - if debug then Format.printf "We are at the end? %b.@." at_end; - if at_end then fail "end" - else - (* We may have found a match *) - pos >>= fun start_pos -> - let matched = - matcher >>= fun production -> - if debug then Format.printf "Full match context result@."; - pos >>= fun end_pos -> - record_match_context start_pos end_pos; - current_environment_ref := Match.Environment.create (); - return production - in - let no_match = - (* Reset any partial binds of holes in environment. *) - if debug then Format.printf "Failed to match and not at end.@."; - current_environment_ref := Match.Environment.create (); - (* cannot return: we must try some other parser or else we'll - infini loop! We can't advance because we haven't - successfully parsed the character at the current position. - So: fail and try another parser in the choice. *) - fail "no match, try something else" - in - choice [ matched; no_match ] - end + let match_one = + pos >>= fun start_pos -> + current_environment_ref := Match.Environment.create (); + consumed matcher >>= fun value -> + if debug then Format.printf "Full match context result: %s@." value; + pos >>= fun end_pos -> + let start_pos = + if String.length value = 0 then + start_pos (*offset + 1 this may not matter, if we correct for the whole match conext *) + else + start_pos + in + (if String.length value = 0 then + advance 1 + else + return ()) >>= fun () -> + if debug then Format.printf "Calculated end_pos %d@." end_pos; + record_match_context start_pos (end_pos); + current_environment_ref := Match.Environment.create (); + return (Unit, "") in + (* many1 may be appropriate *) + let prefix = (prefix >>= fun s -> r acc (String s)) in + let first_match_attempt = choice [match_one; prefix] in (* consumes a character in prefix if no match *) + let matches = many_till first_match_attempt end_of_input in matches >>= fun _result -> r acc Unit @@ -908,7 +972,6 @@ module Make (Language : Language.S) (Unimplemented : Metasyntax.S) = struct (* Don't reverse the list for non-nested matches--it matters for rewriting. *) aux_all ?configuration ~nested ~template ~source:original_source () - let first ?configuration ?shift:_ template source : Match.t Or_error.t = configuration_ref := Option.value configuration ~default:!configuration_ref; matches_ref := []; diff --git a/lib/kernel/matchers/regexp.ml b/lib/kernel/matchers/regexp.ml new file mode 100644 index 0000000..f6a2538 --- /dev/null +++ b/lib/kernel/matchers/regexp.ml @@ -0,0 +1,131 @@ +open Angstrom + +let debug = + match Sys.getenv "DEBUG_COMBY" with + | exception Not_found -> false + | _ -> true + +module type Regexp_engine_intf = sig + type t + type substrings + + val make: string -> t + + val get_substring: substrings -> int -> string option + + val get_all_substrings: substrings -> string array + + val exec: rex:t -> pos:int -> Bytes.t -> substrings option +end + +type t = + { buffer_pos : int + ; buffer : bytes + } + +(* I think I should just implement the analog of string_ for regex with some bounded buffer size. *) + +module Make (Regexp: Regexp_engine_intf) = struct + (* https://sourcegraph.com/github.com/comby-tools/mparser/-/blob/src/mParser_Char_Stream.ml#L231:8 *) + let match_regexp s pos rex = + Regexp.exec ~rex ~pos:(pos - s.buffer_pos) s.buffer + + let make_regexp pat = + Regexp.make pat + + (* TODO: tests and blit thing below *) + + (* FIXME: size. about advance => want to use internal unsafe_apply_opt + actually. cf. string_ in angstrom.ml. instead, trying "do peek, then + advance/commit." *) + let regexp rex : string Angstrom.t = + (* Why do Unsafe if I can just do peek_string? => So I don't allocate on copy of buffer. *) + (* But it looks like we can't avoid allocation in converting bigstringaf to bytes *) + Unsafe.peek 1 (fun buffer ~off ~len:_ -> Bigstringaf.length buffer - off) >>= fun n -> + Unsafe.peek n (fun buffer ~off ~len -> + (* This still does a copy :( *) + let bytes = Bytes.create len in + Bigstringaf.unsafe_blit_to_bytes buffer ~src_off:off bytes ~dst_off:0 ~len; + if debug then Format.printf "Matching regex against string: %S@." @@ Bytes.to_string bytes; + match Regexp.exec ~rex ~pos:0 bytes with + | None -> + if debug then Format.printf "None (1)@."; + None + | Some substrings -> + match Regexp.get_substring substrings 0 with + | None -> + if debug then Format.printf "None (2)@."; + None + | Some result -> + if debug then Format.printf "Matchy Matchy (3)@."; + Some (result, String.length result)) + >>= function + | Some (result, _n) -> + (* if empty string matches, this hole like for optionals (x?), advance 1. *) + (* we want to advance one so parsing can continue, but if we advance 1 here we will think + that the match context is at least length 1 and not 0 if this hole is the only thing + defining the match context *) + (* let n = if n > 0 then n else 1 in + advance n >>= fun () -> *) + if debug then Format.printf "Result indeed: %s len %d@." result _n; + return result + | None -> + fail "No match" +end + +module PCRE = struct + module Engine : Regexp_engine_intf = struct + type t = Pcre.regexp + type substrings = Pcre.substrings + + let compile_flags = + Pcre.cflags [ `ANCHORED ] + + let make pattern = + Pcre.regexp ~iflags:compile_flags pattern + + let get_substring s idx = + match Pcre.get_substring s idx with + | result -> Some result + | exception Not_found + | exception Invalid_argument _ -> None + + let get_all_substrings s = + Pcre.get_substrings s + + let exec ~rex ~pos b = + match Pcre.exec ~pos ~rex (Bytes.unsafe_to_string b) with + | result -> Some result + | exception Not_found -> None + end + + include Make(Engine) +end + +module RE = struct + module Engine : Regexp_engine_intf = struct + type t = Re.re + type substrings = Re.substrings + + let compile_flags = + [ `Anchored ] + + let make pattern = + Re.Perl.(compile (re ~opts:compile_flags pattern)) + + let get_substring s idx = + match Re.get s idx with + | result -> Some result + | exception Not_found -> None + + let get_all_substrings s = + Re.get_all s + + let exec ~rex ~pos b = + match Re.exec ~pos rex (Bytes.unsafe_to_string b) with + | result -> Some result + | exception Not_found -> None + end + + include Make(Engine) +end diff --git a/lib/kernel/matchers/regexp.mli b/lib/kernel/matchers/regexp.mli new file mode 100644 index 0000000..e73aaf2 --- /dev/null +++ b/lib/kernel/matchers/regexp.mli @@ -0,0 +1,37 @@ +module type Regexp_engine_intf = sig + type t + type substrings + + val make: string -> t + + val get_substring: substrings -> int -> string option + + val get_all_substrings: substrings -> string array + + val exec: rex:t -> pos:int -> Bytes.t -> substrings option +end + +(** Represents character stream right now. + Compare char stream interface on t and match_regexp descriptions + in https://sourcegraph.com/github.com/comby-tools/mparser/-/blob/src/mParser_Char_Stream.mli#L102:8 +*) +type t + +module Make (Regexp : Regexp_engine_intf): sig + (* do not use this, use regexp. *) + val match_regexp: t -> int -> Regexp.t -> Regexp.substrings option + + val make_regexp : string -> Regexp.t + + val regexp : Regexp.t -> string Angstrom.t +end + +module PCRE : sig + module Engine : Regexp_engine_intf + include module type of Make (Engine) +end + +module RE : sig + module Engine : Regexp_engine_intf + include module type of Make (Engine) +end diff --git a/test/alpha/dune b/test/alpha/dune index b9f9ec5..4c6ed6d 100644 --- a/test/alpha/dune +++ b/test/alpha/dune @@ -5,8 +5,7 @@ test_custom_metasyntax test_optional_holes test_special_matcher_cases - test_substring_disabled - test_regex_holes) + test_substring_disabled) (inline_tests) (preprocess (pps ppx_expect ppx_sexp_message ppx_deriving_yojson)) (libraries diff --git a/test/common/dune b/test/common/dune index b77a4f8..be680d8 100644 --- a/test/common/dune +++ b/test/common/dune @@ -2,6 +2,7 @@ (name common_test_integration) (package comby) (modules + ; test_server ; disabled, factor out. test_helpers test_cli_helper test_script @@ -10,7 +11,6 @@ test_omega test_cli test_cli_list - ; test_server test_statistics test_offset_conversion test_parse_rule @@ -65,6 +65,9 @@ test_nested_matches_alpha test_nested_matches_omega + + test_regex_holes_alpha + test_regex_holes_omega ) (inline_tests) (preprocess (pps ppx_expect ppx_sexp_message ppx_deriving_yojson)) diff --git a/test/alpha/test_regex_holes.ml b/test/common/test_regex_holes_alpha.ml similarity index 100% rename from test/alpha/test_regex_holes.ml rename to test/common/test_regex_holes_alpha.ml diff --git a/test/common/test_regex_holes_omega.ml b/test/common/test_regex_holes_omega.ml new file mode 100644 index 0000000..988284a --- /dev/null +++ b/test/common/test_regex_holes_omega.ml @@ -0,0 +1,296 @@ +open Core + +open Rewriter + +open Matchers.Omega + +let configuration = Matchers.Configuration.create ~match_kind:Fuzzy () + +let run ?(configuration = configuration) (module M : Matchers.Matcher.S) source match_template ?rule rewrite_template = + let rule = + match rule with + | Some rule -> Language.Rule.create rule |> Or_error.ok_exn + | None -> Language.Rule.create "where true" |> Or_error.ok_exn + in + M.all ~configuration ~template:match_template ~source () + |> List.filter ~f:(fun { Match.environment; _ } -> Language.Rule.Alpha.(sat @@ apply rule environment)) + |> function + | [] -> print_string "No matches." + | results -> + Option.value_exn (Rewrite.all ~source ~rewrite_template results) + |> (fun { rewritten_source; _ } -> rewritten_source) + |> print_string + +let%expect_test "regex_holes_simple" = + let source = {|foo|} in + let match_template = {|:[x~\w+]|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|(foo)|}] + +let%expect_test "regex_holes_simple_posix" = + let source = {|foo|} in + let match_template = {|:[x~[[:alpha:]]]|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|(f)(o)(o)|}] + +let%expect_test "regex_holes_substring" = + let source = {|foo()|} in + let match_template = {|:[x~o\w]()|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|f(oo)|}] + +let%expect_test "regex_holes_empty_string_terminates" = + let source = {|foo()|} in + let match_template = {|:[x~|]|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|()f()o()o()(())|}] + +let%expect_test "regex_holes_repetition_takes_precedence" = + let source = {|foobar()|} in + (* this will _not_ match because bar is consumed by \w before we look ahead *) + let match_template = {|:[x~\w+]bar()|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|No matches.|}] + +let%expect_test "regex_holes_negated_match" = + let source = {|(literally_anyting_except_close_paren?!@#$%^&*[])|} in + let match_template = {|(:[x~[^)]+])|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|(literally_anyting_except_close_paren?!@#$%^&*[])|}]; + + let source = {|(arg1, arg2, arg3)|} in + let match_template = {|:[x~[^,() ]+]|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|((arg1), (arg2), (arg3))|}] + +let%expect_test "regex_holes_dot_star_ok_and_this_is_for_newline" = + let source = "foo()\nbar()" in + let match_template = {|:[x~.*]|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|(foo())() +(bar())|}] + +let%expect_test "regex_holes_optional" = + let source = "nonovember no november no vember" in + let match_template = {|:[x~no(vember)?]|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|(no)(november) (no) (november) (no) vember|}] + +let%expect_test "regex_holes_optional_spaces" = + let source = "nonovember no november no vember" in + let match_template = {|no :[x~(vember)?]|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|nonovember ()november (vember)|}] + +let%expect_test "regex_holes_optional_doesnt_work_outside_regex" = + let source = "no" in + let match_template = {|no:[x~(vember)?]|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|()|}]; + + let source = "foo bar foobar" in + let match_template = {|:[x~\s*?]|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|()f()o()o() ()b()a()r() ()f()o()o()b()a()r|}]; + + let source = "foo bar foobar" in + let match_template = {|:[x~\s*]|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|()f()o()o( )()b()a()r( )()f()o()o()b()a()r|}] + + +let%expect_test "regex_holes_optional_strip_no_from_november_outside_regex" = + let source = "nonovember no november no vember" in + let match_template = {|no:[x~(vember)?]|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|()(vember) () (vember) () vember|}] + +let%expect_test "regex_holes_optional_strip_no_from_november_inside_regex" = + let source = "nonovember no november no vember" in + let match_template = {|:[x~no(vember)?]|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|(no)(november) (no) (november) (no) vember|}] + +let%expect_test "leading_spaces_beginning_line_anchor" = + let source = {| + a + b + c +|} + in + let match_template = {|:[x~^(\s+)]|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|( )a( )b( )c( +)|}] + +let%expect_test "spaces_star" = + let source = {| + a + b + c + d +|} + in + let match_template = {|:[x~\s*]|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + (* The chars is how this behaves on https://regexr.com/59ft0 as well, see replace *) + [%expect_exact {|( + )()a( + )()b( + )()c( + )()d( +)|}] + +(* +let%expect_test "end_line_anchor" = + let source = {| +aaa bbb +aaa bbb ccc +ccc ddd +|} + in + let match_template = {|:[x~\w+ bbb$]|} in + let rewrite_template = {|(:[x])|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {| +(aaa bbb)aaa bbb ccc +ccc ddd +|}] +*) + +let%expect_test "word_boundaries" = + let source = {| +foo(bar, baz(), + + + qux.derp) +|} + in + let match_template = {|:[x~\b\w+\b]|} in + let rewrite_template = {|(>:[x]<)|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {| +(>foo<)((>bar<), (>baz<)(), + + + (>qux<).(>derp<)) +|}] + +(* I previously assumed [^ ] would not match newlines, but it does, and is the + same as regexr https://regexr.com/59fst. To not match newlines, see the next + test with [^\s] *) +let%expect_test "do_not_match_space" = + let source = {| +foo(bar, baz(), + + + qux.derp) +|} + in + let match_template = {|:[x~[^, ]+]|} in + let rewrite_template = {|(>:[x]<)|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {|(> +foo(bar<), (>baz()<),(> + + +<) (>qux.derp) +<)|}] + +let%expect_test "do_not_match_whitespace" = + let source = {| +foo(bar, baz(), + + + qux.derp) +|} + in + let match_template = {|:[x~[^,\s]+]|} in + let rewrite_template = {|(>:[x]<)|} in + + run (module Generic) source match_template rewrite_template; + [%expect_exact {| +(>foo(bar<), (>baz()<), + + + (>qux.derp)<) +|}] + +let%expect_test "eof_anchor" = + let source = {| +setScore(5) +setScore(6) +setScore(6.5) +setScore("") +setScore("hi") +setScore("hi" + "there") +setScore('ho') +setScore(x) +setScore(null) +setScore(4/3.0) +setScore(4.0/3.0) +setScore(4/3) +|} + in + let match_template = {|setScore(:[1])|} in + let rule = {| + where match :[1] { + | ":[~^\\d+$]" -> false + | ":[_]" -> true + } + |} + in + let rewrite_template = "setScore( /*CHECK ME*/ :[1])" in + + run (module Generic) source match_template ~rule rewrite_template; + [%expect_exact {| +setScore(5) +setScore(6) +setScore( /*CHECK ME*/ 6.5) +setScore( /*CHECK ME*/ "") +setScore( /*CHECK ME*/ "hi") +setScore( /*CHECK ME*/ "hi" + "there") +setScore( /*CHECK ME*/ 'ho') +setScore( /*CHECK ME*/ x) +setScore( /*CHECK ME*/ null) +setScore( /*CHECK ME*/ 4/3.0) +setScore( /*CHECK ME*/ 4.0/3.0) +setScore( /*CHECK ME*/ 4/3) +|}]