mirror of
https://github.com/facebook/sapling.git
synced 2024-10-04 22:07:44 +03:00
0e86e4ccf0
Summary: See why on previous diff Now with this diff GlobTree is independent of inode Reviewed By: kmancini Differential Revision: D49933175 fbshipit-source-id: 1551a2b7e054df5df88ac37fbf0bf45f91e34548
906 lines
31 KiB
C++
906 lines
31 KiB
C++
/*
|
|
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
*
|
|
* This software may be used and distributed according to the terms of the
|
|
* GNU General Public License version 2.
|
|
*/
|
|
|
|
#include "eden/fs/utils/GlobMatcher.h"
|
|
|
|
#include <fmt/core.h>
|
|
#include <folly/logging/xlog.h>
|
|
#include <algorithm>
|
|
#include <limits>
|
|
|
|
using folly::Expected;
|
|
using std::string;
|
|
using std::vector;
|
|
|
|
namespace {
|
|
/*
 * Opcode bytes used in the compiled pattern buffer.
 */
enum : uint8_t {
  // Literal string data.
  // Layout: opcode, one length byte, then that many bytes of literal text.
  // A literal run longer than 255 bytes is split into several consecutive
  // literal opcodes, each carrying at most 255 bytes.
  GLOB_LITERAL = 'S',

  // Matches zero or more characters, none of which may be '/'.
  // Layout: opcode, then a bool byte. When the bool is true, the matched text
  // is allowed to begin with a '.'.
  GLOB_STAR = '*',

  // Matches everything that remains in the text.
  // Layout: opcode, then a bool byte. When the bool is true, a path component
  // inside the match is allowed to begin with a '.'.
  // When this opcode is present it is always the last one in the buffer.
  GLOB_STAR_STAR_END = '>',

  // Matches one of:
  // - nothing at all
  // - one or more characters followed by a slash
  // Layout: opcode, then a bool byte. When the bool is true, a path component
  // inside the match is allowed to begin with a '.'.
  GLOB_STAR_STAR_SLASH = 'X',

  // Matches one character against a character class.
  // The opcode is followed by the class members, encoded like this:
  // - '\x00' terminates the class
  // - '\x01' introduces a range and is followed by two bytes: the inclusive
  //   low bound and the inclusive high bound
  // - every other byte stands for exactly that character
  // A literal '\x00' or '\x01' member is written as a range whose low and
  // high bounds are both that byte, e.g. '\x00' becomes '\x01\x00\x00'.
  GLOB_CHAR_CLASS = '[',

  // Same encoding as GLOB_CHAR_CLASS, but the match succeeds only when the
  // character is NOT a member of the class.
  // TODO: Do not let a negated character class pattern match a "." at the
  // start of a file name, as specified in the POSIX docs.
  GLOB_CHAR_CLASS_NEGATED = ']',

  // Terminator and range marker used inside character class data.
  GLOB_CHAR_CLASS_END = '\x00',
  GLOB_CHAR_CLASS_RANGE = '\x01',

  // Matches any one character other than '/'.
  GLOB_QMARK = '?',

  // Matches a literal that must sit at the very end of the text.
  // A trailing GLOB_STAR + GLOB_LITERAL pair is rewritten into this opcode,
  // so its payload is the bool byte taken from GLOB_STAR followed by the
  // length byte and data taken from GLOB_LITERAL.
  GLOB_ENDS_WITH = '$',

  // Encodings of the bool byte that follows some opcodes.
  GLOB_TRUE = 'T',
  GLOB_FALSE = 'F',
};
|
|
|
|
/// A set of character intervals. This is used during parsing to deduplicate
|
|
/// ranges within a character class.
|
|
class CharIntervalSet {
|
|
public:
|
|
/// A closed interval (inclusive on both sides)
|
|
using Interval = std::pair</*lo*/ uint8_t, /*hi*/ uint8_t>;
|
|
|
|
/// Insert a non-empty interval into the set. \param lo and \param hi are both
|
|
/// inclusive.
|
|
void insert(uint8_t lo, uint8_t hi) {
|
|
XDCHECK_GE(hi, lo);
|
|
bounds_.push_back({lo, /*isEnd=*/false});
|
|
bounds_.push_back({hi, /*isEnd=*/true});
|
|
}
|
|
|
|
/// Returns an optimized version of the interval set; that is, a list of
|
|
/// non-overlapping intervals that are included in the set.
|
|
std::vector<Interval> optimize() {
|
|
std::sort(bounds_.begin(), bounds_.end(), [](const auto& a, const auto& b) {
|
|
// Sort the bounds in ascending order, and ensure start bounds precede end
|
|
// bounds.
|
|
return (a.value < b.value) ||
|
|
((a.value == b.value) && !a.isEnd && b.isEnd);
|
|
});
|
|
XDCHECK(bounds_.empty() || bounds_.back().isEnd);
|
|
std::vector<Interval> intervals;
|
|
int depth = 0;
|
|
for (const auto& bound : bounds_) {
|
|
if (!bound.isEnd) {
|
|
++depth;
|
|
if (depth == 1) {
|
|
// Start a new interval before this character. Its end will be set
|
|
// later.
|
|
intervals.emplace_back(
|
|
bound.value, std::numeric_limits<uint8_t>::max());
|
|
}
|
|
} else {
|
|
--depth;
|
|
if (depth == 0) {
|
|
XDCHECK(!intervals.empty());
|
|
// End the current interval after this character
|
|
// @lint-ignore CLANGTIDY facebook-hte-LocalUncheckedArrayBounds
|
|
intervals.back().second = bound.value;
|
|
}
|
|
}
|
|
}
|
|
XDCHECK_EQ(depth, 0);
|
|
XDCHECK(bounds_.empty() || !intervals.empty());
|
|
return intervals;
|
|
}
|
|
|
|
private:
|
|
struct Bound {
|
|
uint8_t value;
|
|
|
|
/// If true, an interval ends after this character.
|
|
/// If false, an interval starts before this character.
|
|
bool isEnd;
|
|
};
|
|
std::vector<Bound> bounds_;
|
|
};
|
|
} // namespace
|
|
|
|
namespace facebook::eden {
|
|
|
|
namespace {
|
|
|
|
bool isStringPieceEqual(
|
|
std::string_view left,
|
|
std::string_view right,
|
|
CaseSensitivity caseSensitive) {
|
|
if (caseSensitive == CaseSensitivity::Sensitive) {
|
|
return left == right;
|
|
} else {
|
|
return std::equal(
|
|
left.begin(),
|
|
left.end(),
|
|
right.begin(),
|
|
right.end(),
|
|
folly::AsciiCaseInsensitive{});
|
|
}
|
|
}
|
|
|
|
// Maps 'A'..'Z' to 'a'..'z'; every other character is returned unchanged.
char toLower(char c) {
  const bool isUpperAlpha = c >= 'A' && c <= 'Z';
  return isUpperAlpha ? static_cast<char>(c + ('a' - 'A')) : c;
}
|
|
|
|
// Maps 'a'..'z' to 'A'..'Z'; every other character is returned unchanged.
char toUpper(char c) {
  const bool isLowerAlpha = c >= 'a' && c <= 'z';
  return isLowerAlpha ? static_cast<char>(c - ('a' - 'A')) : c;
}
|
|
} // namespace
|
|
|
|
// Bitwise union of two option sets.
GlobOptions operator|(GlobOptions a, GlobOptions b) {
  const auto lhsBits = static_cast<uint32_t>(a);
  const auto rhsBits = static_cast<uint32_t>(b);
  return static_cast<GlobOptions>(lhsBits | rhsBits);
}
|
|
|
|
// Adds the bits of `b` to `a` in place and returns `a`.
GlobOptions& operator|=(GlobOptions& a, GlobOptions b) {
  return a = a | b;
}
|
|
|
|
// True when the two option sets have at least one bit in common.
bool operator&(GlobOptions a, GlobOptions b) {
  const uint32_t sharedBits =
      static_cast<uint32_t>(a) & static_cast<uint32_t>(b);
  return sharedBits != 0;
}
|
|
|
|
GlobMatcher::GlobMatcher(vector<uint8_t> pattern, CaseSensitivity caseSensitive)
|
|
: pattern_(std::move(pattern)), caseSensitive_(caseSensitive) {}
|
|
|
|
GlobMatcher::GlobMatcher() = default;
|
|
|
|
GlobMatcher::~GlobMatcher() = default;
|
|
|
|
/*
|
|
* A glob pattern consists of a few types of data:
|
|
* - literal string pieces
|
|
* - *
|
|
* - **
|
|
* - ?
|
|
* - bracket expressions ([])
|
|
*
|
|
* We parse this in create(), and encode it as a string of opcodes.
|
|
* The opcode semantics are documented above where they are defined.
|
|
*
|
|
* Glancing through our existing ignore rules:
|
|
* - About 60% are simple fixed strings, with no wildcards
|
|
* - About 27% are simple "ends with" patterns (e.g., "*.txt")
|
|
*/
|
|
Expected<GlobMatcher, string> GlobMatcher::create(
|
|
std::string_view glob,
|
|
GlobOptions options) {
|
|
CaseSensitivity caseSensitive = options & GlobOptions::CASE_INSENSITIVE
|
|
? CaseSensitivity::Insensitive
|
|
: CaseSensitivity::Sensitive;
|
|
vector<uint8_t> result;
|
|
// Make a guess at how big the pattern buffer will be.
|
|
// We require 2 extra bytes for each literal chunk. We save a byte for "**"
|
|
// expressions, and we usually save a byte or two on bracket expressions.
|
|
result.reserve(glob.size() + 6);
|
|
|
|
ssize_t prevOpcodeIdx = -1;
|
|
ssize_t curOpcodeIdx = -1;
|
|
auto addOpcode = [&](uint8_t opcode) {
|
|
prevOpcodeIdx = curOpcodeIdx;
|
|
curOpcodeIdx = result.size();
|
|
result.push_back(opcode);
|
|
};
|
|
|
|
auto appendLiteralChar = [&](char c) {
|
|
if (curOpcodeIdx >= 0 && result[curOpcodeIdx] == GLOB_LITERAL &&
|
|
result[curOpcodeIdx + 1] < 0xff) {
|
|
// Just append this byte to the end of the current literal section.
|
|
++result[curOpcodeIdx + 1];
|
|
result.push_back(c);
|
|
} else {
|
|
// We aren't currently in a literal section (or we have already put 255
|
|
// bytes in the current section and can't fit any more). Start a new
|
|
// literal section.
|
|
addOpcode(GLOB_LITERAL);
|
|
result.push_back(1);
|
|
result.push_back(c);
|
|
}
|
|
};
|
|
|
|
auto appendBool = [&](bool b) {
|
|
result.push_back(b ? GLOB_TRUE : GLOB_FALSE);
|
|
};
|
|
|
|
// Note: watchman's wildcard matching code treats '/' slightly specially:
|
|
// it can match 1 or more '/' characters. For example, "foo/bar" would match
|
|
// "foo///bar".
|
|
//
|
|
// We don't bother doing this here since the paths given to our code should
|
|
// already have been normalized, so we should never have repeated slashes in
|
|
// the text being matched.
|
|
|
|
auto includeDotfiles = !(options & GlobOptions::IGNORE_DOTFILES);
|
|
for (size_t idx = 0; idx < glob.size(); ++idx) {
|
|
char c = glob[idx];
|
|
if (c == '\\') {
|
|
// Backslash escaped characters are treated literally
|
|
++idx;
|
|
if (idx >= glob.size()) {
|
|
// A trailing backslash is invalid. This glob should be ignored.
|
|
return folly::makeUnexpected<string>(
|
|
"glob pattern ends with trailing backslash");
|
|
}
|
|
appendLiteralChar(glob[idx]);
|
|
continue;
|
|
} else if (c == '?') {
|
|
// Match any single character except for a slash
|
|
addOpcode(GLOB_QMARK);
|
|
} else if (c == '*') {
|
|
if (idx + 1 < glob.size() && glob[idx + 1] == '*') {
|
|
// This is "**".
|
|
// According to the gitignore man pages, "**" is only valid in three
|
|
// cases:
|
|
// - "**/" at the start of the pattern
|
|
// - "/**" at the end of the pattern
|
|
// - "/**/" in the middle of the pattern
|
|
++idx;
|
|
if (idx + 1 >= glob.size()) {
|
|
// Make sure that the character before this was '/'.
|
|
// We still treat it as part of the previous literal opcode, but we
|
|
// want to reject the glob if this ** wasn't preceded by '/'.
|
|
if (idx < 2 || glob[idx - 2] != '/') {
|
|
return folly::makeUnexpected<string>(
|
|
"invalid \"**\" sequence at end of pattern without slash");
|
|
}
|
|
addOpcode(GLOB_STAR_STAR_END);
|
|
appendBool(includeDotfiles);
|
|
} else if (glob[idx + 1] == '/') {
|
|
if (idx >= 2 && glob[idx - 2] != '/') {
|
|
return folly::makeUnexpected<string>(
|
|
"\"**/\" must follow a slash or appear at the start of a pattern");
|
|
}
|
|
|
|
++idx;
|
|
addOpcode(GLOB_STAR_STAR_SLASH);
|
|
appendBool(includeDotfiles);
|
|
} else {
|
|
// Reject the pattern if "**" isn't followed by the end of the
|
|
// pattern or a "/"
|
|
return folly::makeUnexpected<string>("invalid \"**\" sequence");
|
|
}
|
|
} else {
|
|
addOpcode(GLOB_STAR);
|
|
// If includeDotfiles is false, then "*.cpp" should not match
|
|
// ".bak.cpp", but "My*.cpp" should match "My.foo.cpp", so we must check
|
|
// the preceding character.
|
|
appendBool(includeDotfiles || (idx != 0 && glob[idx - 1] != '/'));
|
|
}
|
|
} else if (c == '[') {
|
|
// Translate a bracket expression
|
|
prevOpcodeIdx = curOpcodeIdx;
|
|
curOpcodeIdx = result.size();
|
|
auto newIdx = parseBracketExpr(glob, idx, caseSensitive, &result);
|
|
if (!newIdx.hasValue()) {
|
|
return folly::makeUnexpected<string>(std::move(newIdx).error());
|
|
}
|
|
idx = newIdx.value();
|
|
} else {
|
|
appendLiteralChar(c);
|
|
}
|
|
}
|
|
|
|
// We perform one additional optimization here:
|
|
// if the final two opcodes were GLOB_STAR followed by GLOB_LITERAL, we
|
|
// translate this into GLOB_ENDS_WITH.
|
|
if (prevOpcodeIdx >= 0 && result[prevOpcodeIdx] == GLOB_STAR &&
|
|
result[curOpcodeIdx] == GLOB_LITERAL) {
|
|
// Currently, the end of the result vector contains:
|
|
//
|
|
// [prevOpcodeIdx] GLOB_STAR
|
|
// GLOB_STAR matchCanStartWithDot bool
|
|
// [curOpcodeIdx] GLOB_LITERAL
|
|
// GLOB_LITERAL data
|
|
//
|
|
// We modify it so it becomes:
|
|
//
|
|
// [prevOpcodeIdx] GLOB_ENDS_WITH
|
|
// GLOB_STAR matchCanStartWithDot bool
|
|
// [curOpcodeIdx] GLOB_LITERAL data
|
|
result.erase(result.begin() + curOpcodeIdx);
|
|
result[prevOpcodeIdx] = GLOB_ENDS_WITH;
|
|
}
|
|
|
|
return GlobMatcher(std::move(result), caseSensitive);
|
|
}
|
|
|
|
/*
 * Parse one bracket expression and append its encoding to *pattern.
 *
 * glob          - the complete glob text
 * idx           - index of the opening '[' inside glob
 * caseSensitive - when Insensitive, alphabetic members also get their
 *                 opposite-case counterpart added to the class
 * pattern       - opcode buffer that receives the class opcode, its members,
 *                 and the terminating GLOB_CHAR_CLASS_END
 *
 * Returns the index of the ']' that closes the expression, or an error string
 * when the expression is unterminated or names an unknown character class.
 *
 * Every byte taken from the glob is widened through uint8_t before it is
 * stored in prevChar. Plain char may be signed, in which case bytes >= 0x80
 * would otherwise become negative, defeat the "low <= high" range check and
 * the 'A'..'Z' / 'a'..'z' intersection tests below.
 */
Expected<size_t, string> GlobMatcher::parseBracketExpr(
    std::string_view glob,
    size_t idx,
    CaseSensitivity caseSensitive,
    vector<uint8_t>* pattern) {
  XDCHECK_LT(idx, glob.size());
  XDCHECK_EQ(glob[idx], '[');

  // Check for a leading '!' or '^'
  if (idx + 1 >= glob.size()) {
    return folly::makeUnexpected<string>("unterminated bracket sequence");
  }
  if (glob[idx + 1] == '!' || glob[idx + 1] == '^') {
    pattern->push_back(GLOB_CHAR_CLASS_NEGATED);
    // Step onto the negation marker. This cannot run off the end because
    // idx + 1 < glob.size() was verified just above.
    ++idx;
  } else {
    pattern->push_back(GLOB_CHAR_CLASS);
  }

  CharIntervalSet charIntervals;

  // Sentinel meaning "no pending character". Pending characters are always
  // stored as values in [0, 255], so 0xffff can never collide with one.
  constexpr int32_t NO_PREV_CHAR = 0xffff;
  int32_t prevChar = NO_PREV_CHAR;

  // Widen a glob byte to its unsigned value, independent of char signedness.
  const auto byteValue = [](char ch) -> int32_t {
    return static_cast<uint8_t>(ch);
  };

  // Flush the pending character (if any) into the class.
  auto addPrevChar = [&]() {
    if (prevChar == NO_PREV_CHAR) {
      return;
    }
    const auto prevByte = static_cast<uint8_t>(prevChar);
    if (prevChar == GLOB_CHAR_CLASS_END || prevChar == GLOB_CHAR_CLASS_RANGE) {
      // These two bytes have a special meaning inside class data, so they are
      // escaped by encoding them as a one-element range.
      charIntervals.insert(prevByte, prevByte);
      return;
    }
    pattern->push_back(prevByte);
    if (caseSensitive == CaseSensitivity::Insensitive) {
      // For case-insensitive matching of alpha characters, add the
      // opposite-case version of the character to the class.
      const char asChar = static_cast<char>(prevByte);
      const char asLower = toLower(asChar);
      const char asUpper = toUpper(asChar);
      if (asLower != asChar) {
        pattern->push_back(asLower);
      } else if (asUpper != asChar) {
        pattern->push_back(asUpper);
      }
    }
  };

  auto startIdx = idx;
  while (true) {
    ++idx;
    if (idx >= glob.size()) {
      return folly::makeUnexpected<string>("unterminated bracket sequence");
    }

    const char c = glob[idx];
    if (c == '\\') {
      // A backslash escapes the following character
      ++idx;
      if (idx >= glob.size()) {
        // Unterminated escape sequence
        return folly::makeUnexpected<string>(
            "unterminated backslash in bracket sequence");
      }
      addPrevChar();
      prevChar = byteValue(glob[idx]);
    } else if (c == ']') {
      // ']' normally signifies the end of the character class,
      // unless it is the very first character after the opening '[' or '[^'
      if (idx == startIdx + 1) {
        XDCHECK_EQ(NO_PREV_CHAR, prevChar);
        prevChar = byteValue(c);
      } else {
        // End of the character class.
        break;
      }
    } else if (c == '-') {
      if (prevChar == NO_PREV_CHAR) {
        // Nothing precedes the '-', so it is a literal member.
        prevChar = byteValue(c);
      } else {
        // This is a range
        if (idx + 1 >= glob.size()) {
          // Unterminated escape sequence
          return folly::makeUnexpected<string>("unterminated bracket range");
        } else if (glob[idx + 1] == ']') {
          // '-' followed by the terminating ']' is just a literal '-',
          // not a range.
          addPrevChar();
          prevChar = byteValue(c);
        } else {
          // This is a range
          ++idx;
          uint8_t highBound = glob[idx];
          if (highBound == '\\') {
            ++idx;
            if (idx >= glob.size()) {
              return folly::makeUnexpected<string>(
                  "unterminated escape in bracket range");
            }
            highBound = glob[idx];
          }
          const auto lowBound = static_cast<uint8_t>(prevChar);
          // Don't even bother adding the range if the low bound is greater
          // than the high bound. (We don't treat the whole glob as invalid
          // though. We just ignore this one range, since it can never match
          // anything.)
          if (lowBound <= highBound) {
            charIntervals.insert(lowBound, highBound);

            if (caseSensitive == CaseSensitivity::Insensitive) {
              // If the range intersects with ['A', 'Z'], add the lowercase
              // counterpart of the intersection.
              if (highBound >= 'A' && lowBound <= 'Z') {
                charIntervals.insert(
                    toLower(std::clamp<uint8_t>(lowBound, 'A', 'Z')),
                    toLower(std::clamp<uint8_t>(highBound, 'A', 'Z')));
              }
              // If the range intersects with ['a', 'z'], add the uppercase
              // counterpart of the intersection.
              if (highBound >= 'a' && lowBound <= 'z') {
                charIntervals.insert(
                    toUpper(std::clamp<uint8_t>(lowBound, 'a', 'z')),
                    toUpper(std::clamp<uint8_t>(highBound, 'a', 'z')));
              }
            }
          }
          // The low bound was consumed by the range.
          prevChar = NO_PREV_CHAR;
        }
      }
    } else if (c == '[') {
      // Look for a character class like [:alpha:]
      bool isClass = false;
      if (idx + 3 < glob.size() && glob[idx + 1] == ':') {
        auto classStart = idx + 2;
        for (auto end = classStart; end + 1 < glob.size(); ++end) {
          if (glob[end] == ':' && glob[end + 1] == ']') {
            std::string_view charClass{
                glob.data() + classStart, end - classStart};
            if (!addCharClass(charClass, caseSensitive, pattern)) {
              return folly::makeUnexpected<string>(
                  fmt::format("unknown character class \"{}\"", charClass));
            }
            // Resume after the ":]" that closes the class name.
            idx = end + 1;
            isClass = true;
            break;
          }
        }
      }
      // This wasn't a character class.
      // Just treat this just as a literal '[' character.
      if (!isClass) {
        addPrevChar();
        prevChar = byteValue(c);
      }
    } else {
      addPrevChar();
      prevChar = byteValue(c);
    }
  }

  addPrevChar();

  // Add any user-specified ranges we collected along the way, with no
  // duplicates
  for (auto& interval : charIntervals.optimize()) {
    addCharClassRange(interval.first, interval.second, pattern);
  }
  pattern->push_back(GLOB_CHAR_CLASS_END);
  return idx;
}
|
|
|
|
void GlobMatcher::addCharClassRange(
|
|
uint8_t low,
|
|
uint8_t high,
|
|
std::vector<uint8_t>* pattern) {
|
|
XDCHECK_LE(low, high);
|
|
pattern->push_back(GLOB_CHAR_CLASS_RANGE);
|
|
pattern->push_back(low);
|
|
pattern->push_back(high);
|
|
}
|
|
|
|
bool GlobMatcher::addCharClass(
|
|
std::string_view charClass,
|
|
CaseSensitivity caseSensitive,
|
|
vector<uint8_t>* pattern) {
|
|
// Character class definitions.
|
|
// These match the POSIX Standard Locale as defined in ISO/IEC 9945-2:1993
|
|
if (charClass == "alnum") {
|
|
addCharClassRange('a', 'z', pattern);
|
|
addCharClassRange('A', 'Z', pattern);
|
|
addCharClassRange('0', '9', pattern);
|
|
return true;
|
|
} else if (
|
|
charClass == "alpha" ||
|
|
// "upper" and "lower" with case-insensitive matching are equivalent to
|
|
// "alpha".
|
|
(caseSensitive == CaseSensitivity::Insensitive &&
|
|
(charClass == "lower" || charClass == "upper"))) {
|
|
addCharClassRange('a', 'z', pattern);
|
|
addCharClassRange('A', 'Z', pattern);
|
|
return true;
|
|
} else if (charClass == "blank") {
|
|
pattern->push_back(' ');
|
|
pattern->push_back('\t');
|
|
return true;
|
|
} else if (charClass == "cntrl") {
|
|
// POSIX locale cntrl definitions:
|
|
// 0x00-0x1f,0x7f
|
|
addCharClassRange(0x00, 0x1f, pattern);
|
|
pattern->push_back(0x7f);
|
|
return true;
|
|
} else if (charClass == "digit") {
|
|
addCharClassRange('0', '9', pattern);
|
|
return true;
|
|
} else if (charClass == "graph") {
|
|
// POSIX locale graph definition: alnum + punct
|
|
// This is everything from 0x21 - 0x7e
|
|
addCharClassRange(0x21, 0x7e, pattern);
|
|
return true;
|
|
} else if (charClass == "lower") {
|
|
addCharClassRange('a', 'z', pattern);
|
|
return true;
|
|
} else if (charClass == "print") {
|
|
// POSIX locale print definition: alnum + punct + ' '
|
|
// This is everything from 0x20 - 0x7e
|
|
addCharClassRange(0x20, 0x7e, pattern);
|
|
return true;
|
|
} else if (charClass == "punct") {
|
|
// POSIX locale punct definitions:
|
|
// 0x21-0x2f, 0x3a-0x40, 0x5b-0x60, 0x7b-0x7e
|
|
addCharClassRange(0x21, 0x2f, pattern);
|
|
addCharClassRange(0x3a, 0x40, pattern);
|
|
addCharClassRange(0x5b, 0x60, pattern);
|
|
addCharClassRange(0x7b, 0x7e, pattern);
|
|
return true;
|
|
} else if (charClass == "space") {
|
|
pattern->push_back(' ');
|
|
pattern->push_back('\f');
|
|
pattern->push_back('\n');
|
|
pattern->push_back('\r');
|
|
pattern->push_back('\t');
|
|
pattern->push_back('\v');
|
|
return true;
|
|
} else if (charClass == "upper") {
|
|
addCharClassRange('A', 'Z', pattern);
|
|
return true;
|
|
} else if (charClass == "xdigit") {
|
|
addCharClassRange('0', '9', pattern);
|
|
addCharClassRange('a', 'f', pattern);
|
|
addCharClassRange('A', 'F', pattern);
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool GlobMatcher::match(std::string_view text) const {
|
|
return tryMatchAt(text, 0, 0);
|
|
}
|
|
|
|
bool GlobMatcher::tryMatchAt(
|
|
std::string_view text,
|
|
size_t textIdx,
|
|
size_t patternIdx) const {
|
|
// Loop through all opcodes in the pattern buffer.
|
|
// It's kind of unfortunate how big and complicated this while loop is.
|
|
//
|
|
// It would improve readability to break this down into one function per
|
|
// opcode, but then it would require additional conditional checks after each
|
|
// function to see if we should break out or keep going. Having everything
|
|
// inlined in this single while loop makes it very easy to break out early
|
|
// without additional checks.
|
|
//
|
|
// I have tried breaking this out into separate functions (and also using an
|
|
// array lookup to find the correct opcode handler, rather than just serial
|
|
// if checks). Unfortunately this did result in a performance hit.
|
|
while (patternIdx < pattern_.size()) {
|
|
if (pattern_[patternIdx] == GLOB_LITERAL) {
|
|
// A literal string section
|
|
uint8_t length = pattern_[patternIdx + 1];
|
|
const uint8_t* literal = pattern_.data() + patternIdx + 2;
|
|
patternIdx += 2 + length;
|
|
if (patternIdx >= pattern_.size()) {
|
|
// This is the last section of the pattern.
|
|
// We can exit out early if the lengths don't match.
|
|
if (text.size() - textIdx != length) {
|
|
return false;
|
|
}
|
|
return isStringPieceEqual(
|
|
text.substr(textIdx, length),
|
|
std::string_view{reinterpret_cast<const char*>(literal), length},
|
|
caseSensitive_);
|
|
}
|
|
// Not the final piece of the pattern. We have to do the string compare
|
|
// (unless the text remaining is too short).
|
|
if (text.size() - textIdx < length) {
|
|
return false;
|
|
}
|
|
if (!isStringPieceEqual(
|
|
text.substr(textIdx, length),
|
|
std::string_view{reinterpret_cast<const char*>(literal), length},
|
|
caseSensitive_)) {
|
|
return false;
|
|
}
|
|
// Matched so far, keep going.
|
|
textIdx += length;
|
|
} else if (pattern_[patternIdx] == GLOB_STAR) {
|
|
// '*' matches 0 or more characters, excluding '/'
|
|
++patternIdx;
|
|
auto matchCanStartWithDot = pattern_[patternIdx] == GLOB_TRUE;
|
|
++patternIdx;
|
|
|
|
// If the glob cannot match text starting with a dot, but the text
|
|
// has a dot here, then it cannot match.
|
|
if (!matchCanStartWithDot && textIdx < text.size() &&
|
|
text[textIdx] == '.') {
|
|
return false;
|
|
}
|
|
|
|
if (patternIdx >= pattern_.size()) {
|
|
// This '*' is at the end of the pattern.
|
|
// We match as long as there are no more '/' characters
|
|
return memchr(text.data() + textIdx, '/', text.size() - textIdx) ==
|
|
nullptr;
|
|
} else if (pattern_[patternIdx] == GLOB_LITERAL) {
|
|
// This '*' is followed by a string literal.
|
|
// Jump ahead to the next place where we find this literal. Make sure
|
|
// we don't cross a '/'
|
|
auto literalLength = pattern_[patternIdx + 1];
|
|
std::string_view literalPattern{
|
|
reinterpret_cast<const char*>(pattern_.data()) + patternIdx + 2,
|
|
literalLength};
|
|
patternIdx += 2 + literalLength;
|
|
auto nextSlash = text.find('/', textIdx);
|
|
while (true) {
|
|
auto textPiece = text.substr(textIdx);
|
|
auto literalIdx = caseSensitive_ == CaseSensitivity::Sensitive
|
|
? qfind(
|
|
folly::StringPiece{textPiece},
|
|
folly::StringPiece{literalPattern},
|
|
folly::AsciiCaseSensitive{})
|
|
: qfind(
|
|
folly::StringPiece{textPiece},
|
|
folly::StringPiece{literalPattern},
|
|
folly::AsciiCaseInsensitive{});
|
|
if (literalIdx == std::string_view::npos) {
|
|
// No match.
|
|
return false;
|
|
}
|
|
literalIdx += textIdx;
|
|
if (nextSlash < literalIdx) {
|
|
return false;
|
|
}
|
|
if (tryMatchAt(text, literalIdx + literalLength, patternIdx)) {
|
|
return true;
|
|
}
|
|
// No match here. Move forwards and try again.
|
|
textIdx = literalIdx + 1;
|
|
}
|
|
} else {
|
|
// '*' followed by another glob special, such as ? or a character
|
|
// class. We inefficiently try matching forwards one character at a
|
|
// time.
|
|
//
|
|
// In practice this type of pattern is rare.
|
|
while (textIdx < text.size()) {
|
|
if (tryMatchAt(text, textIdx, patternIdx)) {
|
|
return true;
|
|
}
|
|
if (text[textIdx] == '/') {
|
|
return false;
|
|
}
|
|
++textIdx;
|
|
}
|
|
return false;
|
|
}
|
|
} else if (pattern_[patternIdx] == GLOB_ENDS_WITH) {
|
|
// Advance patternIdx to read the bool from the original GLOB_STAR.
|
|
++patternIdx;
|
|
auto matchCanStartWithDot = pattern_[patternIdx] == GLOB_TRUE;
|
|
|
|
// If the glob match is not allowed to start with a dot then we also
|
|
// reject cases where it matches the empty string followed by a dot.
|
|
// We intentionally do not allow `*.cpp` to match `.cpp`
|
|
// This matches the behavior of the POSIX fnmatch() function.
|
|
// Because any match of '*' will start from the current textIdx, we
|
|
// can return right away if we know any match would start with an
|
|
// illegal dot.
|
|
if (!matchCanStartWithDot && textIdx < text.size() &&
|
|
text[textIdx] == '.') {
|
|
return false;
|
|
}
|
|
|
|
// An "ends-with" section
|
|
uint8_t length = pattern_[patternIdx + 1];
|
|
const uint8_t* literal = pattern_.data() + patternIdx + 2;
|
|
if (text.size() - textIdx < length) {
|
|
return false;
|
|
}
|
|
if (!isStringPieceEqual(
|
|
text.substr(text.size() - length),
|
|
std::string_view{reinterpret_cast<const char*>(literal), length},
|
|
caseSensitive_)) {
|
|
return false;
|
|
}
|
|
// The end of the text matched the desired literal.
|
|
// Now we just have to verify that there were no '/' characters in the
|
|
// preceding portion (that matches "*").
|
|
return memchr(
|
|
text.data() + textIdx,
|
|
'/',
|
|
text.size() - (textIdx + length)) == nullptr;
|
|
} else if (pattern_[patternIdx] == GLOB_STAR_STAR_END) {
|
|
// This is '**' at the end of a pattern. It matches everything else in
|
|
// the text. However, if this matcher was created with
|
|
// GlobOptions::IGNORE_DOTFILES, then we must ensure that none of the path
|
|
// components in the remaining text start with a '.'.
|
|
++patternIdx;
|
|
auto pathComponentInMatchCanStartWithDot =
|
|
pattern_[patternIdx] == GLOB_TRUE;
|
|
if (pathComponentInMatchCanStartWithDot) {
|
|
return true;
|
|
}
|
|
|
|
// By construction, we know that GLOB_STAR_STAR_END is preceded by a
|
|
// slash, so we can start from the previous character and scan the
|
|
// remaining text for "/." If we find one, then this is not a match.
|
|
auto searchIndex = textIdx == 0 ? 0 : textIdx - 1;
|
|
return text.find("/.", searchIndex) == std::string_view::npos;
|
|
} else if (pattern_[patternIdx] == GLOB_STAR_STAR_SLASH) {
|
|
++patternIdx;
|
|
auto pathComponentInMatchCannotStartWithDot =
|
|
pattern_[patternIdx] == GLOB_FALSE;
|
|
|
|
// This is "**/"
|
|
// It may match nothing at all, or it may match some arbitrary number of
|
|
// characters followed by a slash.
|
|
++patternIdx;
|
|
while (true) {
|
|
if (tryMatchAt(text, textIdx, patternIdx)) {
|
|
return true;
|
|
}
|
|
|
|
auto prevTextIdx = textIdx;
|
|
textIdx = text.find('/', prevTextIdx + 1);
|
|
if (textIdx == std::string_view::npos) {
|
|
// No match.
|
|
return false;
|
|
} else if (
|
|
pathComponentInMatchCannotStartWithDot &&
|
|
text[prevTextIdx] == '.') {
|
|
// Verify the path component does not start with an illegal dot
|
|
// before proceeding.
|
|
return false;
|
|
}
|
|
|
|
++textIdx;
|
|
}
|
|
} else {
|
|
// The other glob special patterns all match exactly one character.
|
|
// Get this character now.
|
|
if (textIdx >= text.size()) {
|
|
return false;
|
|
}
|
|
uint8_t ch = text[textIdx];
|
|
++textIdx;
|
|
|
|
// Git does not allow '/' to match any of these cases.
|
|
if (ch == '/') {
|
|
return false;
|
|
}
|
|
|
|
if (pattern_[patternIdx] == GLOB_CHAR_CLASS) {
|
|
// An inclusive character class
|
|
if (!charClassMatch(ch, &patternIdx)) {
|
|
return false;
|
|
}
|
|
} else if (pattern_[patternIdx] == GLOB_CHAR_CLASS_NEGATED) {
|
|
// An exclusive character class
|
|
if (charClassMatch(ch, &patternIdx)) {
|
|
return false;
|
|
}
|
|
} else if (pattern_[patternIdx] == GLOB_QMARK) {
|
|
// '?' matches any character except '/'
|
|
// (which we already excluded above)
|
|
++patternIdx;
|
|
} else {
|
|
// Unknown opcode. This should never happen.
|
|
XLOGF(
|
|
FATAL,
|
|
"unknown opcode {} in glob pattern buffer at index {}",
|
|
pattern_[patternIdx],
|
|
patternIdx);
|
|
}
|
|
}
|
|
}
|
|
|
|
return textIdx == text.size();
|
|
}
|
|
|
|
bool GlobMatcher::charClassMatch(uint8_t ch, size_t* patternIdx) const {
|
|
size_t idx = *patternIdx + 1;
|
|
while (true) {
|
|
XDCHECK_LT(idx, pattern_.size());
|
|
if (pattern_[idx] == GLOB_CHAR_CLASS_END) {
|
|
// Reached the end of the character class with no match.
|
|
*patternIdx = idx + 1;
|
|
return false;
|
|
} else if (pattern_[idx] == GLOB_CHAR_CLASS_RANGE) {
|
|
XDCHECK_LT(idx + 2, pattern_.size());
|
|
uint8_t lowBound = pattern_[idx + 1];
|
|
uint8_t highBound = pattern_[idx + 2];
|
|
idx += 3;
|
|
if (lowBound <= ch && ch <= highBound) {
|
|
// Found a match
|
|
break;
|
|
}
|
|
} else {
|
|
if (ch == pattern_[idx]) {
|
|
// Found a match
|
|
++idx;
|
|
break;
|
|
}
|
|
++idx;
|
|
}
|
|
}
|
|
|
|
// If we broke out of the loop then we found a match.
|
|
// Advance patternIdx to the end of the character class.
|
|
//
|
|
// We just keep scanning through the data until we find GLOB_CHAR_CLASS_END.
|
|
//
|
|
// In theory we could put a length byte after the GLOB_CHAR_CLASS opcode,
|
|
// similar to what we do for GLOB_LITERAL, so we could avoid scanning here.
|
|
// However this would introduce some complications: we would potentially have
|
|
// to re-arrange the data so it fits in 255 bytes. (Any character class can
|
|
// be represented in 255 bytes, but our naive literal encoding currently
|
|
// might end up using more than 255 bytes.) In practice character class data
|
|
// is normally very short, so the cost of a scan doesn't really matter here.
|
|
while (true) {
|
|
if (pattern_[idx] == GLOB_CHAR_CLASS_END) {
|
|
*patternIdx = idx + 1;
|
|
return true;
|
|
} else if (pattern_[idx] == GLOB_CHAR_CLASS_RANGE) {
|
|
idx += 3;
|
|
} else {
|
|
++idx;
|
|
}
|
|
}
|
|
}
|
|
|
|
} // namespace facebook::eden
|