# sapling/hgext/grepdiff.py

# grepdiff.py
#
# Copyright 2016 Facebook, Inc.
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

import re

from mercurial import pathutil, registrar, revset, util
from mercurial.i18n import _

# decorator used below to register the revset predicate provided by this file
revsetpredicate = registrar.revsetpredicate()

# prefix that is assumed when the pattern does not name a known prefix
touchprefix = "touch"

# maps a pattern prefix to a predicate over (adds, removes), i.e. the number
# of added and removed change blocks in which the pattern was found; the
# predicate decides whether a changeset is selected
prefixtoprocessors = {
    # the pattern shows up in at least one added block
    "add": lambda adds, removes: adds > 0,
    # the pattern shows up in at least one removed block
    "remove": lambda adds, removes: removes > 0,
    # the number of added matches differs from the number of removed ones
    "delta": lambda adds, removes: adds != removes,
    # the pattern shows up in an added or in a removed block
    touchprefix: lambda adds, removes: adds > 0 or removes > 0,
    # more added matches than removed ones
    "inc": lambda adds, removes: adds > removes,
    # fewer added matches than removed ones
    "dec": lambda adds, removes: adds < removes,
}

def getpatternandprocessor(repo, args):
    """Parse prefix and pattern from the provided arguments

    'repo' is only used to report a warning through repo.ui
    'args' is the list of parsed revset arguments; the pattern string is
    read from args[0][1]

    Example argument could be args[0][1] == 'add:hello world'

    The text before the first ':' is treated as a prefix:
      - a prefix present in prefixtoprocessors selects the matching
        processor and is stripped from the pattern
      - an empty prefix (':foo') selects the default 'touch' processor and
        the leading ':' is stripped, which makes it possible to search for
        text that itself starts with a known prefix
      - any other prefix produces a warning and the whole string, prefix
        included, is used as the pattern with the 'touch' processor
    A string without ':' is used as the pattern with the 'touch' processor.

    Returns a (compiled pattern, processor) tuple where processor is a
    callable taking (adds, removes) and returning a boolean."""
    pattern = args[0][1]
    prefix, colon, remainder = pattern.partition(':')
    if not colon:
        # no prefix was given at all
        prefix = touchprefix
    elif not prefix:
        # nothing before the colon: there is no processor registered for
        # '', so fall back to the default one instead of failing the lookup
        prefix, pattern = touchprefix, remainder
    elif prefix in prefixtoprocessors:
        pattern = remainder
    else:
        # ui.warn does not append a newline on its own
        repo.ui.warn(_('treating %s as a part of pattern\n') % (prefix + ':'))
        prefix = touchprefix
    processor = prefixtoprocessors[prefix]
    # currently this regex always has re.M and re.I flags, we might
    # want to make it configurable in future
    pattern = util.re.compile(pattern, re.M | re.I)
    return pattern, processor

@revsetpredicate('grepdiff(pattern, [file], ...)')
def grepdiffpredicate(repo, subset, x):
    """grepdiff: a revset for code archeology

    Sample usages are:
      $ hg log --rev "grepdiff('add:command')" mercurial/commands.py
          will only match changesets that add 'command' somewhere in the diff
      $ hg log --rev "grepdiff('remove:command')" mercurial/commands.py
          will match changesets which remove 'command' somewhere in the diff
      $ hg log --rev "grepdiff('delta:command')" mercurial/commands.py
          will match changesets where the number of 'command' adds is different
          from the number of 'command' removes in the diff
      $ hg log --rev "grepdiff('touch:command')"
          will only match changesets which either add or remove 'command' at
          least once in the diff
      $ hg log --rev "grepdiff('inc:command')" folder/file1.py folder/file2.py
          will match changesets which increase the number of occurrences
          of 'command' in the specified files
      $ hg log --rev "grepdiff('dec:command')"
          will match changesets which decrease the number of occurrences
          of 'command'
    """
    err = _("wrong set of arguments passed to grepdiff revset")
    # at least one argument (the pattern), no upper bound on their number
    args = revset.getargs(x, 1, -1, err)

    # every argument after the pattern is a file name; they are converted
    # to canonical repo-relative paths. None means "do not filter by file"
    files = None
    fileargs = args[1:]
    if fileargs:
        files = set()
        for arg in fileargs:
            files.add(pathutil.canonpath(repo.root, repo.getcwd(), arg[1]))

    pattern, processor = getpatternandprocessor(repo, args)

    def matcher(rev):
        # let the prefix-specific processor decide based on the match counts
        adds, removes = ctxaddsremoves(repo[rev], files, pattern)
        return processor(adds, removes)

    return subset.filter(matcher)

def _diffheaderfilenames(header):
    """Return the file names mentioned in the header chunk of a diff

    Only the '--- ' and '+++ ' lines are looked at and they are found by
    their prefix rather than by position, because git-style headers may
    carry extra lines ('new file mode', 'rename from', 'index', ...) between
    the 'diff' line and the file name lines. Everything after a tab (dates
    in non-git diffs) and everything up to the first '/' (the 'a/' or 'b/'
    prefix) is dropped. Lines without a '/' are ignored."""
    filenames = []
    for line in header.split('\n'):
        if not line.startswith(('--- ', '+++ ')):
            continue
        path = line[4:].split('\t', 1)[0]
        if '/' in path:
            filenames.append(path.split('/', 1)[1])
    return filenames

def _changeblocks(hunktext):
    """Group the change lines of a diff chunk into same-sign blocks

    A changeblock is a set of consecutive change lines which share the
    same sign (+/-). Those lines are joined with '\\n' (sign stripped) so
    that multi-line regex matches are possible.

    Returns a list of (sign, text) tuples in the order of appearance."""
    changeblocks, currentblock, currentsign = [], [], ''
    # the extra '@' item is necessary to save the last block
    for line in hunktext.split('\n') + ['@']:
        if not line:
            continue
        sign = line[0]
        if sign == currentsign:
            # current block continues
            currentblock.append(line[1:])
            continue

        if currentsign:
            # we know that current block is over so we should save it
            changeblocks.append((currentsign, '\n'.join(currentblock)))

        if sign == '+' or sign == '-':
            # new block starts here
            currentsign, currentblock = sign, [line[1:]]
        else:
            # other lines include the ones that start with @@ and
            # contain context line numbers or unchanged context lines
            # from source file.
            currentsign, currentblock = '', []
    return changeblocks

def ctxaddsremoves(ctx, files, regexp):
    """Count the added and removed change blocks that match a pattern

    'ctx' is a context to check, only its diff() method is used
    'files' is a set of repo-based filenames we're interested in (None or
    an empty set indicates all files)
    'regexp' is a compiled regular expression against which to match

    Returns an (addcount, removecount) tuple: the number of '+' blocks and
    the number of '-' blocks of the diff in which 'regexp' was found."""
    addcount = 0
    removecount = 0
    filenames = []
    for diffitem in ctx.diff():
        # ctx.diff() is a generator that returns a list of strings that are
        # supposed to be printed and some of them are concatenations of
        # multiple '\n'-separated lines. Here's an example of such a list:
        # ["diff --git a/setup.py b/setup.py\n" +\
        #  "--- a/setup.py\n" +\
        #  "+++ b/setup.py\n",
        #  "@@ -1,7 +1,7 @@\n" +\
        #  " from distutils.core import setup, Extension\n" +\
        #  " \n" +\
        #  " setup(\n" +\
        #  "-    name='fbhgextensions',\n" +\
        #  "+    name='fbhgext',\n" +\
        #  "     version='0.1.0',\n" +\
        #  "     author='Durham Goode',\n" +\
        #  "     maintainer='Durham Goode',\n"]
        # Please note that this list in fact contains just two elements, the
        # second string is manually separated into individual lines as they
        # would've been printed.
        # It can be seen that the first element of the list starts with 'diff'
        # and contains the filenames for the upcoming chunks.
        # The second element however has the changes that happened to the
        # file separated by '\n', so we want to parse that, find which ones
        # start with '+' or '-', group them into blocks and match the regex
        # against those blocks.
        if diffitem.startswith('diff'):
            # title chunk that starts the diff for some file, does not
            # contain the diff itself. the next iteration of this loop will
            # hit the actual diff chunk
            filenames = _diffheaderfilenames(diffitem)
            continue

        if files and not any(fn in files for fn in filenames):
            # this part of diff does not touch any of the files we're
            # interested in, so there is no point in parsing it
            continue

        for mod, change in _changeblocks(diffitem):
            if not regexp.search(change):
                continue
            if mod == '+':
                addcount += 1
            else:
                removecount += 1
    return addcount, removecount