sapling/mercurial/match.py
2009-05-24 02:56:14 -05:00

253 lines
8.0 KiB
Python

# match.py - file name matching
#
# Copyright 2008, 2009 Matt Mackall <mpm@selenic.com> and others
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2, incorporated herein by reference.
import util, re
class _match(object):
def __init__(self, root, cwd, files, mf, ap):
self._root = root
self._cwd = cwd
self._files = files
self._fmap = set(files)
self.matchfn = mf
self._anypats = ap
def __call__(self, fn):
return self.matchfn(fn)
def __iter__(self):
for f in self._files:
yield f
def bad(self, f, msg):
return True
def dir(self, f):
pass
def missing(self, f):
pass
def exact(self, f):
return f in self._fmap
def rel(self, f):
return util.pathto(self._root, self._cwd, f)
def files(self):
return self._files
def anypats(self):
return self._anypats
class always(_match):
def __init__(self, root, cwd):
_match.__init__(self, root, cwd, [], lambda f: True, False)
class never(_match):
def __init__(self, root, cwd):
_match.__init__(self, root, cwd, [], lambda f: False, False)
class exact(_match):
def __init__(self, root, cwd, files):
_match.__init__(self, root, cwd, files, self.exact, False)
class match(_match):
def __init__(self, root, cwd, patterns, include=[], exclude=[],
default='glob'):
f, mf, ap = _matcher(root, cwd, patterns, include, exclude, default)
_match.__init__(self, root, cwd, f, mf, ap)
def patkind(pat):
return _patsplit(pat, None)[0]
def _patsplit(pat, default):
"""Split a string into an optional pattern kind prefix and the
actual pattern."""
for prefix in 're', 'glob', 'path', 'relglob', 'relpath', 'relre':
if pat.startswith(prefix + ':'): return pat.split(':', 1)
return default, pat
_globchars = set('[{*?')
def _globre(pat, head='^', tail='$'):
"convert a glob pattern into a regexp"
i, n = 0, len(pat)
res = ''
group = 0
def peek(): return i < n and pat[i]
while i < n:
c = pat[i]
i = i+1
if c == '*':
if peek() == '*':
i += 1
res += '.*'
else:
res += '[^/]*'
elif c == '?':
res += '.'
elif c == '[':
j = i
if j < n and pat[j] in '!]':
j += 1
while j < n and pat[j] != ']':
j += 1
if j >= n:
res += '\\['
else:
stuff = pat[i:j].replace('\\','\\\\')
i = j + 1
if stuff[0] == '!':
stuff = '^' + stuff[1:]
elif stuff[0] == '^':
stuff = '\\' + stuff
res = '%s[%s]' % (res, stuff)
elif c == '{':
group += 1
res += '(?:'
elif c == '}' and group:
res += ')'
group -= 1
elif c == ',' and group:
res += '|'
elif c == '\\':
p = peek()
if p:
i += 1
res += re.escape(p)
else:
res += re.escape(c)
else:
res += re.escape(c)
return head + res + tail
def _matcher(canonroot, cwd='', names=[], inc=[], exc=[], dflt_pat='glob'):
"""build a function to match a set of file patterns
arguments:
canonroot - the canonical root of the tree you're matching against
cwd - the current working directory, if relevant
names - patterns to find
inc - patterns to include
exc - patterns to exclude
dflt_pat - if a pattern in names has no explicit type, assume this one
a pattern is one of:
'glob:<glob>' - a glob relative to cwd
're:<regexp>' - a regular expression
'path:<path>' - a path relative to canonroot
'relglob:<glob>' - an unrooted glob (*.c matches C files in all dirs)
'relpath:<path>' - a path relative to cwd
'relre:<regexp>' - a regexp that doesn't have to match the start of a name
'<something>' - one of the cases above, selected by the dflt_pat argument
returns:
a 3-tuple containing
- list of roots (places where one should start a recursive walk of the fs);
this often matches the explicit non-pattern names passed in, but also
includes the initial part of glob: patterns that has no glob characters
- a bool match(filename) function
- a bool indicating if any patterns were passed in
"""
# a common case: no patterns at all
if not names and not inc and not exc:
return [], util.always, False
def contains_glob(name):
for c in name:
if c in _globchars: return True
return False
def regex(kind, name, tail):
'''convert a pattern into a regular expression'''
if not name:
return ''
if kind == 're':
return name
elif kind == 'path':
return '^' + re.escape(name) + '(?:/|$)'
elif kind == 'relglob':
return _globre(name, '(?:|.*/)', tail)
elif kind == 'relpath':
return re.escape(name) + '(?:/|$)'
elif kind == 'relre':
if name.startswith('^'):
return name
return '.*' + name
return _globre(name, '', tail)
def matchfn(pats, tail):
"""build a matching function from a set of patterns"""
if not pats:
return
try:
pat = '(?:%s)' % '|'.join([regex(k, p, tail) for (k, p) in pats])
if len(pat) > 20000:
raise OverflowError()
return re.compile(pat).match
except OverflowError:
# We're using a Python with a tiny regex engine and we
# made it explode, so we'll divide the pattern list in two
# until it works
l = len(pats)
if l < 2:
raise
a, b = matchfn(pats[:l//2], tail), matchfn(pats[l//2:], tail)
return lambda s: a(s) or b(s)
except re.error:
for k, p in pats:
try:
re.compile('(?:%s)' % regex(k, p, tail))
except re.error:
raise util.Abort("invalid pattern (%s): %s" % (k, p))
raise util.Abort("invalid pattern")
def globprefix(pat):
'''return the non-glob prefix of a path, e.g. foo/* -> foo'''
root = []
for p in pat.split('/'):
if contains_glob(p): break
root.append(p)
return '/'.join(root) or '.'
def normalizepats(names, default):
pats = []
roots = []
anypats = False
for kind, name in [_patsplit(p, default) for p in names]:
if kind in ('glob', 'relpath'):
name = util.canonpath(canonroot, cwd, name)
elif kind in ('relglob', 'path'):
name = util.normpath(name)
pats.append((kind, name))
if kind in ('glob', 're', 'relglob', 'relre'):
anypats = True
if kind == 'glob':
root = globprefix(name)
roots.append(root)
elif kind in ('relpath', 'path'):
roots.append(name or '.')
elif kind == 'relglob':
roots.append('.')
return roots, pats, anypats
roots, pats, anypats = normalizepats(names, dflt_pat)
patmatch = matchfn(pats, '$') or util.always
incmatch = util.always
if inc:
dummy, inckinds, dummy = normalizepats(inc, 'glob')
incmatch = matchfn(inckinds, '(?:/|$)')
excmatch = util.never
if exc:
dummy, exckinds, dummy = normalizepats(exc, 'glob')
excmatch = matchfn(exckinds, '(?:/|$)')
if not names and inc and not exc:
# common case: hgignore patterns
matcher = incmatch
else:
matcher = lambda fn: incmatch(fn) and not excmatch(fn) and patmatch(fn)
return (roots, matcher, (inc or exc or anypats) and True)