From 70cdd4a413756f860f4ddd847a7922da3c5e6541 Mon Sep 17 00:00:00 2001 From: Wez Furlong Date: Fri, 6 Jul 2018 19:51:44 -0700 Subject: [PATCH] make hg grep aware of big grep Summary: This isn't 100% finished and is missing handling for some error cases, but I wanted to put it out there to get some knee-jerk reactions and suggestions on how to handle some things. This diff adds a `grep.usebiggrep` config option to `hg grep`. The idea is that we'll default this to on when the repo requires `eden` (or when we pull in the eden-specific site configuration). When run in big-grep mode, we'll first ask big grep for the results and then compute the local differences and run only those through the local grep process, and avoid materializing files locally. I'm not 100% sure if the current `repo.status` call will yield the correct results for the case where the current rev is behind the big grep corpus revision. Reviewed By: quark-zju Differential Revision: D8416360 fbshipit-source-id: 952badb7a7ec74096b5c77cd79aa25e2327a7659 --- hgext/tweakdefaults.py | 192 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 178 insertions(+), 14 deletions(-) diff --git a/hgext/tweakdefaults.py b/hgext/tweakdefaults.py index 0b361e2f3f..3cdd19b76b 100644 --- a/hgext/tweakdefaults.py +++ b/hgext/tweakdefaults.py @@ -59,6 +59,10 @@ Config:: # output new hashes when nodes get updated showupdated = False + + [grep] + # Use external grep index + usebiggrep = False """ from __future__ import absolute_import @@ -91,7 +95,7 @@ from mercurial import ( util, ) from mercurial.i18n import _ -from mercurial.node import short +from mercurial.node import bin, short from . 
import rebase @@ -669,6 +673,34 @@ def histgrep(ui, repo, pattern, *pats, **opts): return commands.grep(ui, repo, pattern, *pats, **opts) +ansiregex = re.compile( + ( + r"\x1b(" + r"(\[\??\d+[hl])|" + r"([=<>a-kzNM78])|" + r"([\(\)][a-b0-2])|" + r"(\[\d{0,2}[ma-dgkjqi])|" + r"(\[\d+;\d+[hfy]?)|" + r"(\[;?[hf])|" + r"(#[3-68])|" + r"([01356]n)|" + r"(O[mlnp-z]?)|" + r"(/Z)|" + r"(\d+)|" + r"(\[\?\d;\d0c)|" + r"(\d;\dR))" + ), + flags=re.IGNORECASE, +) + + +def stripansiescapes(s): + """Removes ANSI escape sequences from a string. + Borrowed from https://stackoverflow.com/a/45448194/149111 + """ + return ansiregex.sub("", s) + + del commands.table["grep"] @@ -729,36 +761,60 @@ def grep(ui, repo, pattern, *pats, **opts): ] ) + # If true, we'll use the `bgr` tool to perform the grep against some + # externally maintained index. We don't provide an implementation + # of that tool with this repo, just the optional client interface. + biggrep = ui.configbool("grep", "usebiggrep") + + # Ask big grep to strip out the corpus dir (stripdir) and to include + # the corpus revision on the first line. 
+ biggrepcmd = ["bgr", "--stripdir", "-r", "--expression", pattern] + + args = [] + if opts.get("after_context"): - cmd.append("-A") - cmd.append(opts.get("after_context")) + args.append("-A") + args.append(opts.get("after_context")) if opts.get("before_context"): - cmd.append("-B") - cmd.append(opts.get("before_context")) + args.append("-B") + args.append(opts.get("before_context")) if opts.get("context"): - cmd.append("-C") - cmd.append(opts.get("context")) + args.append("-C") + args.append(opts.get("context")) if opts.get("ignore_case"): - cmd.append("-i") + args.append("-i") if opts.get("files_with_matches"): - cmd.append("-l") + args.append("-l") if opts.get("line_number"): cmd.append("-n") if opts.get("invert_match"): + if biggrep: + raise error.Abort("Cannot use invert_match option with big grep") cmd.append("-v") if opts.get("word_regexp"): cmd.append("-w") + biggrepcmd[4] = "\\b%s\\b" % pattern if opts.get("extended_regexp"): cmd.append("-E") + # re2 is already mostly compatible by default, so there are no options + # to apply for this. if opts.get("fixed_strings"): cmd.append("-F") + # using bgs rather than bgr switches the engine to fixed string matches + biggrepcmd[0] = "bgs" if opts.get("perl_regexp"): cmd.append("-P") + # re2 is already mostly pcre compatible, so there are no options + # to apply for this. + + biggrepcmd += args + cmd += args # color support, using the color extension colormode = getattr(ui, "_colormode", "") if colormode == "ansi": cmd.append("--color=always") + biggrepcmd.append("--color=on") # Copy match specific options match_opts = {} @@ -778,21 +834,129 @@ def grep(ui, repo, pattern, *pats, **opts): # (passed in by xargs) as filenames. 
cmd.append("--") ui.pager("grep") - p = subprocess.Popen( - cmd, bufsize=-1, close_fds=util.closefds, stdin=subprocess.PIPE - ) - write = p.stdin.write + if biggrep: + reporoot = os.path.dirname(repo.path) + p = subprocess.Popen( + biggrepcmd, + bufsize=-1, + close_fds=util.closefds, + stdout=subprocess.PIPE, + cwd=reporoot, + ) + out, err = p.communicate() + lines = out.rstrip().split("\n") + # the first line has the revision for the corpus; parse it out + # the format is "#HASH:timestamp" + corpusrev = lines[0][1:41] + lines = lines[1:] + + resultsbyfile = {} + includelineno = opts.get("line_number") + + for line in lines: + filename, lineno, colno, context = line.split(":", 3) + unescapedfilename = stripansiescapes(filename) + + # filter to just the files that match the list supplied + # by the caller + if m(unescapedfilename): + # relativize the path to the CWD. Note that `filename` will + # often have escape sequences, so we do a substring replacement + filename = filename.replace(unescapedfilename, m.rel(unescapedfilename)) + + if unescapedfilename not in resultsbyfile: + resultsbyfile[unescapedfilename] = [] + + # re-assemble the output, but omit the column number + if includelineno: + resultsbyfile[unescapedfilename].append( + "%s:%s:%s\n" % (filename, lineno, context) + ) + else: + resultsbyfile[unescapedfilename].append( + "%s:%s\n" % (filename, context) + ) + + # Now check to see what has changed since the corpusrev + # we're going to need to grep those and stitch the results together + try: + changes = repo.status(bin(corpusrev), None, m) + except error.RepoLookupError: + # TODO: can we trigger a commit cloud fetch for this case? + + # print the results we've gathered so far. We're not sure + # how things differ, so we'll follow up with a warning. 
+ for lines in resultsbyfile.values(): + for line in lines: + ui.write(line) + + ui.warn( + _( + "The results above are based on revision %s\n" + "which is not available locally and thus may be inaccurate.\n" + "To get accurate results, run `hg pull` and re-run " + "your grep.\n" + ) + % corpusrev + ) + return + + # which files we're going to search locally + filestogrep = set() + + # files that have been changed or added need to be searched again + for f in changes.modified: + resultsbyfile.pop(f, None) + filestogrep.add(f) + for f in changes.added: + resultsbyfile.pop(f, None) + filestogrep.add(f) + + # files that have been removed since the corpus rev cannot match + for f in changes.removed: + resultsbyfile.pop(f, None) + for f in changes.deleted: + resultsbyfile.pop(f, None) + + # Having filtered out the changed files from the big grep results, + # we can now print those that remain. + for lines in resultsbyfile.values(): + for line in lines: + ui.write(line) + + # pass on any changed files to the local grep + if len(filestogrep) > 0: + # Ensure that the biggrep results are flushed before we + # start to intermingle with the local grep process output + ui.flush() + return _rungrep(cmd, filestogrep, m) + + return 0 + ds = repo.dirstate getkind = stat.S_IFMT lnkkind = stat.S_IFLNK results = ds.walk(m, subrepos=[], unknown=False, ignored=False) + + files = [] for f in sorted(results.keys()): st = results[f] # skip symlinks and removed files if st is None or getkind(st.st_mode) == lnkkind: continue - write(m.rel(f) + "\0") + files.append(f) + + return _rungrep(cmd, files, m) + + +def _rungrep(cmd, files, match): + p = subprocess.Popen( + cmd, bufsize=-1, close_fds=util.closefds, stdin=subprocess.PIPE + ) + write = p.stdin.write + for f in files: + write(match.rel(f) + "\0") p.stdin.close() return p.wait()