sapling/mercurial/match.py

# match.py - filename matching
#
#  Copyright 2008, 2009 Matt Mackall <mpm@selenic.com> and others
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

import re
import util, pathutil
from i18n import _

def _rematcher(regex):
    '''compile the regexp with the best available regexp engine and return a
    matcher function'''
    m = util.re.compile(regex)
    try:
        # slightly faster, provided by facebook's re2 bindings
        return m.test_match
    except AttributeError:
        return m.match

def _expandsets(kindpats, ctx):
    '''Returns the kindpats list with the 'set' patterns expanded.'''
    fset = set()
    other = []

    for kind, pat in kindpats:
        if kind == 'set':
            if not ctx:
                raise util.Abort("fileset expression with no context")
            s = ctx.getfileset(pat)
            fset.update(s)
            continue
        other.append((kind, pat))
    return fset, other

class match(object):
    def __init__(self, root, cwd, patterns, include=[], exclude=[],
                 default='glob', exact=False, auditor=None, ctx=None):
        """build an object to match a set of file patterns

        arguments:
        root - the canonical root of the tree you're matching against
        cwd - the current working directory, if relevant
        patterns - patterns to find
        include - patterns to include (unless they are excluded)
        exclude - patterns to exclude (even if they are included)
        default - if a pattern in patterns has no explicit type, assume this one
        exact - patterns are actually filenames (include/exclude still apply)

        a pattern is one of:
        'glob:<glob>' - a glob relative to cwd
        're:<regexp>' - a regular expression
        'path:<path>' - a path relative to repository root
        'relglob:<glob>' - an unrooted glob (*.c matches C files in all dirs)
        'relpath:<path>' - a path relative to cwd
        'relre:<regexp>' - a regexp that needn't match the start of a name
        'set:<fileset>' - a fileset expression
        '<something>' - a pattern of the specified default type
        """

        self._root = root
        self._cwd = cwd
        self._files = [] # exact files and roots of patterns
        self._anypats = bool(include or exclude)
        self._ctx = ctx
        self._always = False

        if include:
            kindpats = _normalize(include, 'glob', root, cwd, auditor)
            self.includepat, im = _buildmatch(ctx, kindpats, '(?:/|$)')
        if exclude:
            kindpats = _normalize(exclude, 'glob', root, cwd, auditor)
            self.excludepat, em = _buildmatch(ctx, kindpats, '(?:/|$)')
        if exact:
            if isinstance(patterns, list):
                self._files = patterns
            else:
                self._files = list(patterns)
            pm = self.exact
        elif patterns:
            kindpats = _normalize(patterns, default, root, cwd, auditor)
            self._files = _roots(kindpats)
            self._anypats = self._anypats or _anypats(kindpats)
            self.patternspat, pm = _buildmatch(ctx, kindpats, '$')

        if patterns or exact:
            if include:
                if exclude:
                    m = lambda f: im(f) and not em(f) and pm(f)
                else:
                    m = lambda f: im(f) and pm(f)
            else:
                if exclude:
                    m = lambda f: not em(f) and pm(f)
                else:
                    m = pm
        else:
            if include:
                if exclude:
                    m = lambda f: im(f) and not em(f)
                else:
                    m = im
            else:
                if exclude:
                    m = lambda f: not em(f)
                else:
                    m = lambda f: True
                    self._always = True

        self.matchfn = m
        self._fmap = set(self._files)

    def __call__(self, fn):
        return self.matchfn(fn)
    def __iter__(self):
        for f in self._files:
            yield f

    # Callbacks related to how the matcher is used by dirstate.walk.
    # Subscribers to these events must monkeypatch the matcher object.
    def bad(self, f, msg):
        '''Callback from dirstate.walk for each explicit file that can't be
        found/accessed, with an error message.'''
        pass

    # If an explicitdir is set, it will be called when an explicitly listed
    # directory is visited.
    explicitdir = None

    # If an traversedir is set, it will be called when a directory discovered
    # by recursive traversal is visited.
    traversedir = None

    def rel(self, f):
        '''Convert repo path back to path that is relative to cwd of matcher.'''
        return util.pathto(self._root, self._cwd, f)

    def files(self):
        '''Explicitly listed files or patterns or roots:
        if no patterns or .always(): empty list,
        if exact: list exact files,
        if not .anypats(): list all files and dirs,
        else: optimal roots'''
        return self._files

    def exact(self, f):
        '''Returns True if f is in .files().'''
        return f in self._fmap

    def anypats(self):
        '''Matcher uses patterns or include/exclude.'''
        return self._anypats

    def always(self):
        '''Matcher will match everything and .files() will be empty
        - optimization might be possible and necessary.'''
        return self._always

class exact(match):
    def __init__(self, root, cwd, files):
        match.__init__(self, root, cwd, files, exact=True)

class always(match):
    def __init__(self, root, cwd):
        match.__init__(self, root, cwd, [])
        self._always = True

class narrowmatcher(match):
    """Adapt a matcher to work on a subdirectory only.

    The paths are remapped to remove/insert the path as needed:

    >>> m1 = match('root', '', ['a.txt', 'sub/b.txt'])
    >>> m2 = narrowmatcher('sub', m1)
    >>> bool(m2('a.txt'))
    False
    >>> bool(m2('b.txt'))
    True
    >>> bool(m2.matchfn('a.txt'))
    False
    >>> bool(m2.matchfn('b.txt'))
    True
    >>> m2.files()
    ['b.txt']
    >>> m2.exact('b.txt')
    True
    >>> m2.rel('b.txt')
    'b.txt'
    >>> def bad(f, msg):
    ...     print "%s: %s" % (f, msg)
    >>> m1.bad = bad
    >>> m2.bad('x.txt', 'No such file')
    sub/x.txt: No such file
    """

    def __init__(self, path, matcher):
        self._root = matcher._root
        self._cwd = matcher._cwd
        self._path = path
        self._matcher = matcher
        self._always = matcher._always

        self._files = [f[len(path) + 1:] for f in matcher._files
                       if f.startswith(path + "/")]
        self._anypats = matcher._anypats
        self.matchfn = lambda fn: matcher.matchfn(self._path + "/" + fn)
        self._fmap = set(self._files)

    def bad(self, f, msg):
        self._matcher.bad(self._path + "/" + f, msg)

def patkind(pattern, default=None):
    '''If pattern is 'kind:pat' with a known kind, return kind.'''
    return _patsplit(pattern, default)[0]

def _patsplit(pattern, default):
    """Split a string into the optional pattern kind prefix and the actual
    pattern."""
    if ':' in pattern:
        kind, pat = pattern.split(':', 1)
        if kind in ('re', 'glob', 'path', 'relglob', 'relpath', 'relre',
                    'listfile', 'listfile0', 'set'):
            return kind, pat
    return default, pattern

def _globre(pat):
    r'''Convert an extended glob string to a regexp string.

    >>> print _globre(r'?')
    .
    >>> print _globre(r'*')
    [^/]*
    >>> print _globre(r'**')
    .*
    >>> print _globre(r'**/a')
    (?:.*/)?a
    >>> print _globre(r'a/**/b')
    a\/(?:.*/)?b
    >>> print _globre(r'[a*?!^][^b][!c]')
    [a*?!^][\^b][^c]
    >>> print _globre(r'{a,b}')
    (?:a|b)
    >>> print _globre(r'.\*\?')
    \.\*\?
    '''
    i, n = 0, len(pat)
    res = ''
    group = 0
    escape = util.re.escape
    def peek():
        return i < n and pat[i]
    while i < n:
        c = pat[i]
        i += 1
        if c not in '*?[{},\\':
            res += escape(c)
        elif c == '*':
            if peek() == '*':
                i += 1
                if peek() == '/':
                    i += 1
                    res += '(?:.*/)?'
                else:
                    res += '.*'
            else:
                res += '[^/]*'
        elif c == '?':
            res += '.'
        elif c == '[':
            j = i
            if j < n and pat[j] in '!]':
                j += 1
            while j < n and pat[j] != ']':
                j += 1
            if j >= n:
                res += '\\['
            else:
                stuff = pat[i:j].replace('\\','\\\\')
                i = j + 1
                if stuff[0] == '!':
                    stuff = '^' + stuff[1:]
                elif stuff[0] == '^':
                    stuff = '\\' + stuff
                res = '%s[%s]' % (res, stuff)
        elif c == '{':
            group += 1
            res += '(?:'
        elif c == '}' and group:
            res += ')'
            group -= 1
        elif c == ',' and group:
            res += '|'
        elif c == '\\':
            p = peek()
            if p:
                i += 1
                res += escape(p)
            else:
                res += escape(c)
        else:
            res += escape(c)
    return res

def _regex(kind, pat, globsuffix):
    '''Convert a (normalized) pattern of any kind into a regular expression.
    globsuffix is appended to the regexp of globs.'''
    if not pat:
        return ''
    if kind == 're':
        return pat
    if kind == 'path':
        return '^' + util.re.escape(pat) + '(?:/|$)'
    if kind == 'relglob':
        return '(?:|.*/)' + _globre(pat) + globsuffix
    if kind == 'relpath':
        return util.re.escape(pat) + '(?:/|$)'
    if kind == 'relre':
        if pat.startswith('^'):
            return pat
        return '.*' + pat
    return _globre(pat) + globsuffix

def _buildmatch(ctx, kindpats, globsuffix):
    '''Return regexp string and a matcher function for kindpats.
    globsuffix is appended to the regexp of globs.'''
    fset, kindpats = _expandsets(kindpats, ctx)
    if not kindpats:
        return "", fset.__contains__

    regex, mf = _buildregexmatch(kindpats, globsuffix)
    if fset:
        return regex, lambda f: f in fset or mf(f)
    return regex, mf

def _buildregexmatch(kindpats, globsuffix):
    """Build a match function from a list of kinds and kindpats,
    return regexp string and a matcher function."""
    try:
        regex = '(?:%s)' % '|'.join([_regex(k, p, globsuffix)
                                     for (k, p) in kindpats])
        if len(regex) > 20000:
            raise OverflowError
        return regex, _rematcher(regex)
    except OverflowError:
        # We're using a Python with a tiny regex engine and we
        # made it explode, so we'll divide the pattern list in two
        # until it works
        l = len(kindpats)
        if l < 2:
            raise
        regexa, a = _buildregexmatch(kindpats[:l//2], globsuffix)
        regexb, b = _buildregexmatch(kindpats[l//2:], globsuffix)
        return regex, lambda s: a(s) or b(s)
    except re.error:
        for k, p in kindpats:
            try:
                _rematcher('(?:%s)' % _regex(k, p, globsuffix))
            except re.error:
                raise util.Abort(_("invalid pattern (%s): %s") % (k, p))
        raise util.Abort(_("invalid pattern"))

def _normalize(patterns, default, root, cwd, auditor):
    '''Convert 'kind:pat' from the patterns list to tuples with kind and
    normalized and rooted patterns and with listfiles expanded.'''
    kindpats = []
    for kind, pat in [_patsplit(p, default) for p in patterns]:
        if kind in ('glob', 'relpath'):
            pat = pathutil.canonpath(root, cwd, pat, auditor)
        elif kind in ('relglob', 'path'):
            pat = util.normpath(pat)
        elif kind in ('listfile', 'listfile0'):
            try:
                files = util.readfile(pat)
                if kind == 'listfile0':
                    files = files.split('\0')
                else:
                    files = files.splitlines()
                files = [f for f in files if f]
            except EnvironmentError:
                raise util.Abort(_("unable to read file list (%s)") % pat)
            kindpats += _normalize(files, default, root, cwd, auditor)
            continue
        # else: re or relre - which cannot be normalized
        kindpats.append((kind, pat))
    return kindpats

def _roots(kindpats):
    '''return roots and exact explicitly listed files from patterns

    >>> _roots([('glob', 'g/*'), ('glob', 'g'), ('glob', 'g*')])
    ['g', 'g', '.']
    >>> _roots([('relpath', 'r'), ('path', 'p/p'), ('path', '')])
    ['r', 'p/p', '.']
    >>> _roots([('relglob', 'rg*'), ('re', 're/'), ('relre', 'rr')])
    ['.', '.', '.']
    '''
    r = []
    for kind, pat in kindpats:
        if kind == 'glob': # find the non-glob prefix
            root = []
            for p in pat.split('/'):
                if '[' in p or '{' in p or '*' in p or '?' in p:
                    break
                root.append(p)
            r.append('/'.join(root) or '.')
        elif kind in ('relpath', 'path'):
            r.append(pat or '.')
        else: # relglob, re, relre
            r.append('.')
    return r

def _anypats(kindpats):
    for kind, pat in kindpats:
        if kind in ('glob', 're', 'relglob', 'relre', 'set'):
            return True