From 70cdd4a413756f860f4ddd847a7922da3c5e6541 Mon Sep 17 00:00:00 2001
From: Wez Furlong <wez@fb.com>
Date: Fri, 6 Jul 2018 19:51:44 -0700
Subject: [PATCH] make hg grep aware of big grep

Summary:
This isn't 100% finished and is missing handling some
error cases, but I wanted to put it out there to get some knee-jerk
reactions and suggestions on how to handle some things.

This diff adds a `grep.usebiggrep` config option to `hg grep`.  The idea is
that we'll default this to on when the repo requires `eden` (or when we pull in
the eden specific site configuration).  When run in big-grep mode, we'll first
ask big grep for the results and then compute the local differences and run
only those through the local grep process, and avoid materializing files
locally.

I'm not 100% sure if the current `repo.status` call will yield the correct
results for the case where the current rev is behind the biggrep
corpus revision.

Reviewed By: quark-zju

Differential Revision: D8416360

fbshipit-source-id: 952badb7a7ec74096b5c77cd79aa25e2327a7659
---
 hgext/tweakdefaults.py | 192 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 178 insertions(+), 14 deletions(-)

diff --git a/hgext/tweakdefaults.py b/hgext/tweakdefaults.py
index 0b361e2f3f..3cdd19b76b 100644
--- a/hgext/tweakdefaults.py
+++ b/hgext/tweakdefaults.py
@@ -59,6 +59,10 @@ Config::
 
     # output new hashes when nodes get updated
     showupdated = False
+
+    [grep]
+    # Use external grep index
+    usebiggrep = False
 """
 from __future__ import absolute_import
 
@@ -91,7 +95,7 @@ from mercurial import (
     util,
 )
 from mercurial.i18n import _
-from mercurial.node import short
+from mercurial.node import bin, short
 
 from . import rebase
 
@@ -669,6 +673,34 @@ def histgrep(ui, repo, pattern, *pats, **opts):
     return commands.grep(ui, repo, pattern, *pats, **opts)
 
 
+ansiregex = re.compile(
+    (
+        r"\x1b("
+        r"(\[\??\d+[hl])|"
+        r"([=<>a-kzNM78])|"
+        r"([\(\)][a-b0-2])|"
+        r"(\[\d{0,2}[ma-dgkjqi])|"
+        r"(\[\d+;\d+[hfy]?)|"
+        r"(\[;?[hf])|"
+        r"(#[3-68])|"
+        r"([01356]n)|"
+        r"(O[mlnp-z]?)|"
+        r"(/Z)|"
+        r"(\d+)|"
+        r"(\[\?\d;\d0c)|"
+        r"(\d;\dR))"
+    ),
+    flags=re.IGNORECASE,
+)
+
+
+def stripansiescapes(s):
+    """Removes ANSI escape sequences from a string.
+    Borrowed from https://stackoverflow.com/a/45448194/149111
+    """
+    return ansiregex.sub("", s)
+
+
 del commands.table["grep"]
 
 
@@ -729,36 +761,60 @@ def grep(ui, repo, pattern, *pats, **opts):
         ]
     )
 
+    # If true, we'll use the `bgr` tool to perform the grep against some
+    # externally maintained index.  We don't provide an implementation
+    # of that tool with this repo, just the optional client interface.
+    biggrep = ui.configbool("grep", "usebiggrep")
+
+    # Ask big grep to strip out the corpus dir (stripdir) and to include
+    # the corpus revision on the first line.
+    biggrepcmd = ["bgr", "--stripdir", "-r", "--expression", pattern]
+
+    args = []
+
     if opts.get("after_context"):
-        cmd.append("-A")
-        cmd.append(opts.get("after_context"))
+        args.append("-A")
+        args.append(opts.get("after_context"))
     if opts.get("before_context"):
-        cmd.append("-B")
-        cmd.append(opts.get("before_context"))
+        args.append("-B")
+        args.append(opts.get("before_context"))
     if opts.get("context"):
-        cmd.append("-C")
-        cmd.append(opts.get("context"))
+        args.append("-C")
+        args.append(opts.get("context"))
     if opts.get("ignore_case"):
-        cmd.append("-i")
+        args.append("-i")
     if opts.get("files_with_matches"):
-        cmd.append("-l")
+        args.append("-l")
     if opts.get("line_number"):
         cmd.append("-n")
     if opts.get("invert_match"):
+        if biggrep:
+            raise error.Abort("Cannot use invert_match option with big grep")
         cmd.append("-v")
     if opts.get("word_regexp"):
         cmd.append("-w")
+        biggrepcmd[4] = "\\b%s\\b" % pattern
     if opts.get("extended_regexp"):
         cmd.append("-E")
+        # re2 is already mostly compatible by default, so there are no options
+        # to apply for this.
     if opts.get("fixed_strings"):
         cmd.append("-F")
+        # using bgs rather than bgr switches the engine to fixed string matches
+        biggrepcmd[0] = "bgs"
     if opts.get("perl_regexp"):
         cmd.append("-P")
+        # re2 is already mostly pcre compatible, so there are no options
+        # to apply for this.
+
+    biggrepcmd += args
+    cmd += args
 
     # color support, using the color extension
     colormode = getattr(ui, "_colormode", "")
     if colormode == "ansi":
         cmd.append("--color=always")
+        biggrepcmd.append("--color=on")
 
     # Copy match specific options
     match_opts = {}
@@ -778,21 +834,129 @@ def grep(ui, repo, pattern, *pats, **opts):
     # (passed in by xargs) as filenames.
     cmd.append("--")
     ui.pager("grep")
-    p = subprocess.Popen(
-        cmd, bufsize=-1, close_fds=util.closefds, stdin=subprocess.PIPE
-    )
 
-    write = p.stdin.write
+    if biggrep:
+        reporoot = os.path.dirname(repo.path)
+        p = subprocess.Popen(
+            biggrepcmd,
+            bufsize=-1,
+            close_fds=util.closefds,
+            stdout=subprocess.PIPE,
+            cwd=reporoot,
+        )
+        out, err = p.communicate()
+        lines = out.rstrip().split("\n")
+        # the first line has the revision for the corpus; parse it out
+        # the format is "#HASH:timestamp"
+        corpusrev = lines[0][1:41]
+        lines = lines[1:]
+
+        resultsbyfile = {}
+        includelineno = opts.get("line_number")
+
+        for line in lines:
+            filename, lineno, colno, context = line.split(":", 3)
+            unescapedfilename = stripansiescapes(filename)
+
+            # filter to just the files that match the list supplied
+            # by the caller
+            if m(unescapedfilename):
+                # relativize the path to the CWD.  Note that `filename` will
+                # often have escape sequences, so we do a substring replacement
+                filename = filename.replace(unescapedfilename, m.rel(unescapedfilename))
+
+                if unescapedfilename not in resultsbyfile:
+                    resultsbyfile[unescapedfilename] = []
+
+                # re-assemble the output, but omit the column number
+                if includelineno:
+                    resultsbyfile[unescapedfilename].append(
+                        "%s:%s:%s\n" % (filename, lineno, context)
+                    )
+                else:
+                    resultsbyfile[unescapedfilename].append(
+                        "%s:%s\n" % (filename, context)
+                    )
+
+        # Now check to see what has changed since the corpusrev
+        # we're going to need to grep those and stitch the results together
+        try:
+            changes = repo.status(bin(corpusrev), None, m)
+        except error.RepoLookupError:
+            # TODO: can we trigger a commit cloud fetch for this case?
+
+            # print the results we've gathered so far.  We're not sure
+            # how things differ, so we'll follow up with a warning.
+            for lines in resultsbyfile.values():
+                for line in lines:
+                    ui.write(line)
+
+            ui.warn(
+                _(
+                    "The results above are based on revision %s\n"
+                    "which is not available locally and thus may be inaccurate.\n"
+                    "To get accurate results, run `hg pull` and re-run "
+                    "your grep.\n"
+                )
+                % corpusrev
+            )
+            return
+
+        # which files we're going to search locally
+        filestogrep = set()
+
+        # files that have been changed or added need to be searched again
+        for f in changes.modified:
+            resultsbyfile.pop(f, None)
+            filestogrep.add(f)
+        for f in changes.added:
+            resultsbyfile.pop(f, None)
+            filestogrep.add(f)
+
+        # files that have been removed since the corpus rev cannot match
+        for f in changes.removed:
+            resultsbyfile.pop(f, None)
+        for f in changes.deleted:
+            resultsbyfile.pop(f, None)
+
+        # Having filtered out the changed files from the big grep results,
+        # we can now print those that remain.
+        for lines in resultsbyfile.values():
+            for line in lines:
+                ui.write(line)
+
+        # pass on any changed files to the local grep
+        if len(filestogrep) > 0:
+            # Ensure that the biggrep results are flushed before we
+            # start to intermingle with the local grep process output
+            ui.flush()
+            return _rungrep(cmd, filestogrep, m)
+
+        return 0
+
     ds = repo.dirstate
     getkind = stat.S_IFMT
     lnkkind = stat.S_IFLNK
     results = ds.walk(m, subrepos=[], unknown=False, ignored=False)
+
+    files = []
     for f in sorted(results.keys()):
         st = results[f]
         # skip symlinks and removed files
         if st is None or getkind(st.st_mode) == lnkkind:
             continue
-        write(m.rel(f) + "\0")
+        files.append(f)
+
+    return _rungrep(cmd, files, m)
+
+
+def _rungrep(cmd, files, match):
+    p = subprocess.Popen(
+        cmd, bufsize=-1, close_fds=util.closefds, stdin=subprocess.PIPE
+    )
+    write = p.stdin.write
+    for f in files:
+        write(match.rel(f) + "\0")
 
     p.stdin.close()
     return p.wait()