sapling/eden/fs/utils/GlobMatcher.cpp
Xinyi Wang 0e86e4ccf0 create GlobTree class that doesn't depend on inode
Summary:
See why on previous diff

Now with this diff GlobTree is independent of inode

Reviewed By: kmancini

Differential Revision: D49933175

fbshipit-source-id: 1551a2b7e054df5df88ac37fbf0bf45f91e34548
2023-10-10 10:06:24 -07:00

906 lines
31 KiB
C++

/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This software may be used and distributed according to the terms of the
* GNU General Public License version 2.
*/
#include "eden/fs/utils/GlobMatcher.h"
#include <fmt/core.h>
#include <folly/logging/xlog.h>
#include <algorithm>
#include <limits>
using folly::Expected;
using std::string;
using std::vector;
namespace {
/*
* Opcode characters for our pattern buffer.
*
* A compiled glob is a flat byte buffer made of these opcodes, each followed
* by the operand bytes described on the individual enumerators below.
*/
enum : uint8_t {
// A chunk of literal string data.
// This is followed by a length byte, then the literal data.
// Literal runs of more than 255 bytes in a row are broken up into separate
// literal opcodes with a max length of 255 bytes each.
GLOB_LITERAL = 'S',
// GLOB_STAR matches 0 or more characters.
// This is followed by a bool byte. If true, the pattern can match text
// that starts with a '.'.
// Any character except '/' can be matched.
GLOB_STAR = '*',
// GLOB_STAR_STAR_END matches all remaining text.
// This is followed by a bool byte. If true, a path component in the pattern
// can start with a '.'.
// If GLOB_STAR_STAR_END appears it is always the very last opcode in the
// pattern buffer.
GLOB_STAR_STAR_END = '>',
// GLOB_STAR_STAR_SLASH matches either:
// - 0 characters
// - 1 or more characters followed by a slash
// This is followed by a bool byte. If true, a path component in the pattern
// can start with a '.'.
GLOB_STAR_STAR_SLASH = 'X',
// GLOB_CHAR_CLASS matches a character class.
// This is followed by a list of characters to match.
// The matching characters are encoded as follows:
// - '\x00' indicates the end of the character class
// - '\x01' indicates a range. It is followed by 2 bytes, the low and high
// bounds of the range (inclusive).
// - any other character matches only that character.
// A literal '\x00' or '\x01' is encoded as a range with itself as both the
// lower and upper bound. e.g. '\x00' gets encoded as '\x01\x00\x00'.
GLOB_CHAR_CLASS = '[',
// GLOB_CHAR_CLASS_NEGATED is like GLOB_CHAR_CLASS, but matches
// only if the character does not match the character class.
// TODO: Do not let a negated character class pattern match a "." at the start
// of a file name, as specified in the POSIX docs.
GLOB_CHAR_CLASS_NEGATED = ']',
// Markers that only appear inside the data of a character class (see
// GLOB_CHAR_CLASS above):
// - GLOB_CHAR_CLASS_END terminates the character class data.
// - GLOB_CHAR_CLASS_RANGE introduces a 2-byte inclusive [low, high] range.
GLOB_CHAR_CLASS_END = '\x00',
GLOB_CHAR_CLASS_RANGE = '\x01',
// GLOB_QMARK matches any single character except for '/'
GLOB_QMARK = '?',
// GLOB_ENDS_WITH matches a literal section at the end of the string.
// We optimize GLOB_STAR+GLOB_LITERAL at the end of the pattern into
// GLOB_ENDS_WITH, so it is composed of the bool byte from GLOB_STAR followed
// by the data from GLOB_LITERAL.
GLOB_ENDS_WITH = '$',
// Used to represent boolean values associated with an opcode.
// These are the "bool byte" operands mentioned above.
GLOB_TRUE = 'T',
GLOB_FALSE = 'F',
};
/// Collects closed byte intervals and collapses overlapping ones. The bracket
/// expression parser uses this to avoid emitting duplicate ranges for a
/// character class.
class CharIntervalSet {
 public:
  /// A closed interval: both `first` (lo) and `second` (hi) are included.
  using Interval = std::pair</*lo*/ uint8_t, /*hi*/ uint8_t>;

  /// Record the non-empty interval [\param lo, \param hi] (both inclusive).
  void insert(uint8_t lo, uint8_t hi) {
    XDCHECK_GE(hi, lo);
    bounds_.push_back(Bound{lo, /*isEnd=*/false});
    bounds_.push_back(Bound{hi, /*isEnd=*/true});
  }

  /// Produce the list of non-overlapping intervals covered by everything that
  /// was inserted. Sorts the recorded bounds in place as a side effect.
  std::vector<Interval> optimize() {
    // Ascending by value; when two bounds share a value, the interval start
    // is ordered ahead of the interval end.
    const auto startsBeforeEnds = [](const Bound& lhs, const Bound& rhs) {
      if (lhs.value != rhs.value) {
        return lhs.value < rhs.value;
      }
      return !lhs.isEnd && rhs.isEnd;
    };
    std::sort(bounds_.begin(), bounds_.end(), startsBeforeEnds);
    XDCHECK(bounds_.empty() || bounds_.back().isEnd);

    // Sweep over the sorted bounds while counting how many intervals are
    // currently open. An output interval begins when the count goes 0 -> 1
    // and finishes when it drops back to 0.
    std::vector<Interval> merged;
    int openCount = 0;
    for (const Bound& bound : bounds_) {
      if (bound.isEnd) {
        if (--openCount == 0) {
          XDCHECK(!merged.empty());
          // Close the interval that is currently being built.
          // @lint-ignore CLANGTIDY facebook-hte-LocalUncheckedArrayBounds
          merged.back().second = bound.value;
        }
        continue;
      }
      if (++openCount == 1) {
        // Open a new interval; the placeholder upper bound is overwritten
        // when the matching end bound is reached.
        merged.emplace_back(bound.value, std::numeric_limits<uint8_t>::max());
      }
    }
    XDCHECK_EQ(openCount, 0);
    XDCHECK(bounds_.empty() || !merged.empty());
    return merged;
  }

 private:
  /// One endpoint of an inserted interval.
  struct Bound {
    uint8_t value;
    /// If true, an interval ends after this character.
    /// If false, an interval starts before this character.
    bool isEnd;
  };

  std::vector<Bound> bounds_;
};
} // namespace
namespace facebook::eden {
namespace {
bool isStringPieceEqual(
std::string_view left,
std::string_view right,
CaseSensitivity caseSensitive) {
if (caseSensitive == CaseSensitivity::Sensitive) {
return left == right;
} else {
return std::equal(
left.begin(),
left.end(),
right.begin(),
right.end(),
folly::AsciiCaseInsensitive{});
}
}
/// Map an ASCII uppercase letter to its lowercase form; every other character
/// is returned unchanged.
char toLower(char c) {
  const bool isAsciiUpper = ('A' <= c) && (c <= 'Z');
  return isAsciiUpper ? static_cast<char>(c + ('a' - 'A')) : c;
}
/// Map an ASCII lowercase letter to its uppercase form; every other character
/// is returned unchanged.
char toUpper(char c) {
  const bool isAsciiLower = ('a' <= c) && (c <= 'z');
  return isAsciiLower ? static_cast<char>(c - ('a' - 'A')) : c;
}
} // namespace
/// Combine two option sets: the result has every bit set in either operand.
GlobOptions operator|(GlobOptions a, GlobOptions b) {
  const auto lhsBits = static_cast<uint32_t>(a);
  const auto rhsBits = static_cast<uint32_t>(b);
  return static_cast<GlobOptions>(lhsBits | rhsBits);
}
/// Add the bits of \param b to \param a in place and return a reference to
/// the updated \param a.
GlobOptions& operator|=(GlobOptions& a, GlobOptions b) {
  const GlobOptions combined = a | b;
  a = combined;
  return a;
}
/// Test whether two option sets share at least one bit. Note that this
/// returns a bool, not a GlobOptions value.
bool operator&(GlobOptions a, GlobOptions b) {
  const uint32_t sharedBits =
      static_cast<uint32_t>(a) & static_cast<uint32_t>(b);
  return sharedBits != 0;
}
// Special members are defaulted here in the .cpp file rather than in the
// class declaration.
GlobMatcher::GlobMatcher() = default;

GlobMatcher::~GlobMatcher() = default;

// Build a matcher from an already-compiled opcode buffer. The buffer is taken
// by value and moved into the matcher.
GlobMatcher::GlobMatcher(vector<uint8_t> pattern, CaseSensitivity caseSensitive)
    : pattern_(std::move(pattern)), caseSensitive_(caseSensitive) {}
/*
* A glob pattern consists of a few types of data:
* - literal string pieces
* - *
* - **
* - ?
* - bracket expressions ([])
*
* We parse this in create(), and encode it as a string of opcodes.
* The opcode semantics are documented above where they are defined.
*
* Glancing through our existing ignore rules:
* - About 60% are simple fixed strings, with no wildcards
* - About 27% are simple "ends with" patterns (e.g., "*.txt")
*
* create() parameters:
* - glob: the glob pattern text to compile.
* - options: GlobOptions flags. CASE_INSENSITIVE selects case-insensitive
*   matching; IGNORE_DOTFILES stops wildcards from matching a leading '.'.
*
* Returns the compiled GlobMatcher on success. On failure returns an error
* string: a trailing backslash, a misplaced "**", or whatever error
* parseBracketExpr() reports for a bracket expression.
*/
Expected<GlobMatcher, string> GlobMatcher::create(
std::string_view glob,
GlobOptions options) {
CaseSensitivity caseSensitive = options & GlobOptions::CASE_INSENSITIVE
? CaseSensitivity::Insensitive
: CaseSensitivity::Sensitive;
vector<uint8_t> result;
// Make a guess at how big the pattern buffer will be.
// We require 2 extra bytes for each literal chunk. We save a byte for "**"
// expressions, and we usually save a byte or two on bracket expressions.
result.reserve(glob.size() + 6);
// Offsets into `result` of the two most recently emitted opcodes, or -1 if
// there is no such opcode yet. These drive literal coalescing below and the
// GLOB_ENDS_WITH rewrite at the end of the function.
ssize_t prevOpcodeIdx = -1;
ssize_t curOpcodeIdx = -1;
// Append an opcode byte and shift the opcode-offset bookkeeping.
auto addOpcode = [&](uint8_t opcode) {
prevOpcodeIdx = curOpcodeIdx;
curOpcodeIdx = result.size();
result.push_back(opcode);
};
// Append one literal character, extending the current GLOB_LITERAL section
// when there is one with room left, otherwise starting a new section.
auto appendLiteralChar = [&](char c) {
if (curOpcodeIdx >= 0 && result[curOpcodeIdx] == GLOB_LITERAL &&
result[curOpcodeIdx + 1] < 0xff) {
// Just append this byte to the end of the current literal section.
++result[curOpcodeIdx + 1];
result.push_back(c);
} else {
// We aren't currently in a literal section (or we have already put 255
// bytes in the current section and can't fit any more). Start a new
// literal section.
addOpcode(GLOB_LITERAL);
result.push_back(1);
result.push_back(c);
}
};
// Append the bool operand byte (GLOB_TRUE / GLOB_FALSE) for an opcode.
auto appendBool = [&](bool b) {
result.push_back(b ? GLOB_TRUE : GLOB_FALSE);
};
// Note: watchman's wildcard matching code treats '/' slightly specially:
// it can match 1 or more '/' characters. For example, "foo/bar" would match
// "foo///bar".
//
// We don't bother doing this here since the paths given to our code should
// already have been normalized, so we should never have repeated slashes in
// the text being matched.
auto includeDotfiles = !(options & GlobOptions::IGNORE_DOTFILES);
for (size_t idx = 0; idx < glob.size(); ++idx) {
char c = glob[idx];
if (c == '\\') {
// Backslash escaped characters are treated literally
++idx;
if (idx >= glob.size()) {
// A trailing backslash is invalid. This glob should be ignored.
return folly::makeUnexpected<string>(
"glob pattern ends with trailing backslash");
}
appendLiteralChar(glob[idx]);
continue;
} else if (c == '?') {
// Match any single character except for a slash
addOpcode(GLOB_QMARK);
} else if (c == '*') {
if (idx + 1 < glob.size() && glob[idx + 1] == '*') {
// This is "**".
// According to the gitignore man pages, "**" is only valid in three
// cases:
// - "**/" at the start of the pattern
// - "/**" at the end of the pattern
// - "/**/" in the middle of the pattern
++idx;
// idx now refers to the second '*', so glob[idx - 2] is the character
// immediately before the "**".
if (idx + 1 >= glob.size()) {
// Make sure that the character before this was '/'.
// We still treat it as part of the previous literal opcode, but we
// want to reject the glob if this ** wasn't preceded by '/'.
//
// NOTE(review): because of the `idx < 2` test, a glob that consists
// solely of "**" is rejected here, whereas the "**/" branch below
// accepts "**" at the very start of the pattern. Confirm that this
// asymmetry is intended.
if (idx < 2 || glob[idx - 2] != '/') {
return folly::makeUnexpected<string>(
"invalid \"**\" sequence at end of pattern without slash");
}
addOpcode(GLOB_STAR_STAR_END);
appendBool(includeDotfiles);
} else if (glob[idx + 1] == '/') {
if (idx >= 2 && glob[idx - 2] != '/') {
return folly::makeUnexpected<string>(
"\"**/\" must follow a slash or appear at the start of a pattern");
}
// Consume the '/' as part of the "**/" opcode.
++idx;
addOpcode(GLOB_STAR_STAR_SLASH);
appendBool(includeDotfiles);
} else {
// Reject the pattern if "**" isn't followed by the end of the
// pattern or a "/"
return folly::makeUnexpected<string>("invalid \"**\" sequence");
}
} else {
addOpcode(GLOB_STAR);
// If includeDotfiles is false, then "*.cpp" should not match
// ".bak.cpp", but "My*.cpp" should match "My.foo.cpp", so we must check
// the preceding character.
appendBool(includeDotfiles || (idx != 0 && glob[idx - 1] != '/'));
}
} else if (c == '[') {
// Translate a bracket expression
// parseBracketExpr() pushes the character class opcode itself, so the
// opcode bookkeeping is updated by hand here instead of via addOpcode().
prevOpcodeIdx = curOpcodeIdx;
curOpcodeIdx = result.size();
auto newIdx = parseBracketExpr(glob, idx, caseSensitive, &result);
if (!newIdx.hasValue()) {
return folly::makeUnexpected<string>(std::move(newIdx).error());
}
// Resume from the index returned by parseBracketExpr(); the loop's ++idx
// then moves past it.
idx = newIdx.value();
} else {
appendLiteralChar(c);
}
}
// We perform one additional optimization here:
// if the final two opcodes were GLOB_STAR followed by GLOB_LITERAL, we
// translate this into GLOB_ENDS_WITH.
if (prevOpcodeIdx >= 0 && result[prevOpcodeIdx] == GLOB_STAR &&
result[curOpcodeIdx] == GLOB_LITERAL) {
// Currently, the end of the result vector contains:
//
// [prevOpcodeIdx] GLOB_STAR
// GLOB_STAR matchCanStartWithDot bool
// [curOpcodeIdx] GLOB_LITERAL
// GLOB_LITERAL data
//
// We modify it so it becomes:
//
// [prevOpcodeIdx] GLOB_ENDS_WITH
// GLOB_STAR matchCanStartWithDot bool
// [curOpcodeIdx] GLOB_LITERAL data
//
// Only the GLOB_LITERAL opcode byte is erased; the length byte and the
// literal bytes that followed it stay in place.
result.erase(result.begin() + curOpcodeIdx);
result[prevOpcodeIdx] = GLOB_ENDS_WITH;
}
return GlobMatcher(std::move(result), caseSensitive);
}
/*
 * Parse one bracket expression ("[...]") and append its encoding to the
 * pattern buffer.
 *
 * Parameters:
 * - glob: the complete glob string being compiled.
 * - idx: index of the opening '[' within glob.
 * - caseSensitive: when Insensitive, the opposite-case counterpart of every
 *   ASCII letter and letter range is added to the class as well.
 * - pattern: opcode buffer to append to. The GLOB_CHAR_CLASS or
 *   GLOB_CHAR_CLASS_NEGATED opcode is pushed before the body is validated, so
 *   the buffer may already have been modified when an error is returned.
 *
 * Returns the index of the ']' that terminates the bracket expression, or an
 * error message if the expression is unterminated or names an unknown
 * "[:class:]".
 *
 * All characters are handled as unsigned byte values (0-255). Plain `char`
 * may be signed, in which case bytes >= 0x80 would otherwise be stored as
 * negative numbers and compare below every other bound: inverted ranges
 * would be inserted into the interval set, and case-insensitive ranges made
 * of high bytes would wrongly pick up 'z'/'Z'.
 */
Expected<size_t, string> GlobMatcher::parseBracketExpr(
    std::string_view glob,
    size_t idx,
    CaseSensitivity caseSensitive,
    vector<uint8_t>* pattern) {
  XDCHECK_LT(idx, glob.size());
  XDCHECK_EQ(glob[idx], '[');

  // Check for a leading '!' or '^'
  if (idx + 1 >= glob.size()) {
    return folly::makeUnexpected<string>("unterminated bracket sequence");
  }
  if (glob[idx + 1] == '!' || glob[idx + 1] == '^') {
    pattern->push_back(GLOB_CHAR_CLASS_NEGATED);
    // Skip the negation marker. idx + 1 < glob.size() was verified above, so
    // idx is still in bounds after this increment.
    ++idx;
  } else {
    pattern->push_back(GLOB_CHAR_CLASS);
  }

  CharIntervalSet charIntervals;

  // prevChar holds the most recently seen character as an unsigned byte value
  // (0-255), or NO_PREV_CHAR when there is none. It is kept pending rather
  // than emitted right away because a following '-' may turn it into the low
  // bound of a range. NO_PREV_CHAR lies outside the byte range so it can
  // never be confused with a real character.
  constexpr int32_t NO_PREV_CHAR = 0xffff;
  int32_t prevChar = NO_PREV_CHAR;

  // Convert a (possibly signed) char to its unsigned byte value.
  const auto toByte = [](char ch) -> int32_t {
    return static_cast<uint8_t>(ch);
  };

  // Emit the pending character (if any) into the pattern buffer.
  auto addPrevChar = [&]() {
    if (prevChar == NO_PREV_CHAR) {
      return;
    }
    const auto prevByte = static_cast<uint8_t>(prevChar);
    if (prevByte == GLOB_CHAR_CLASS_END || prevByte == GLOB_CHAR_CLASS_RANGE) {
      // Escape these characters by turning them into ranges.
      charIntervals.insert(prevByte, prevByte);
      return;
    }
    pattern->push_back(prevByte);
    if (caseSensitive == CaseSensitivity::Insensitive) {
      // For case-insensitive matching of alpha characters, add the
      // opposite-case version of the character to the class.
      const auto asChar = static_cast<char>(prevByte);
      const char asLower = toLower(asChar);
      const char asUpper = toUpper(asChar);
      if (asLower != asChar) {
        pattern->push_back(static_cast<uint8_t>(asLower));
      } else if (asUpper != asChar) {
        pattern->push_back(static_cast<uint8_t>(asUpper));
      }
    }
  };

  auto startIdx = idx;
  while (true) {
    ++idx;
    if (idx >= glob.size()) {
      return folly::makeUnexpected<string>("unterminated bracket sequence");
    }
    auto c = glob[idx];
    if (c == '\\') {
      // A backslash escapes the following character
      ++idx;
      if (idx >= glob.size()) {
        // Unterminated escape sequence
        return folly::makeUnexpected<string>(
            "unterminated backslash in bracket sequence");
      }
      addPrevChar();
      prevChar = toByte(glob[idx]);
    } else if (c == ']') {
      // ']' normally signifies the end of the character class,
      // unless it is the very first character after the opening '[' or '[^'
      if (idx == startIdx + 1) {
        XDCHECK_EQ(NO_PREV_CHAR, prevChar);
        prevChar = toByte(c);
      } else {
        // End of the character class.
        break;
      }
    } else if (c == '-') {
      if (prevChar == NO_PREV_CHAR) {
        // Nothing to form a range with: '-' is a literal character.
        prevChar = toByte(c);
      } else {
        // This is a range
        if (idx + 1 >= glob.size()) {
          // The glob ends right after the '-'.
          return folly::makeUnexpected<string>("unterminated bracket range");
        } else if (glob[idx + 1] == ']') {
          // '-' followed by the terminating ']' is just a literal '-',
          // not a range.
          addPrevChar();
          prevChar = toByte(c);
        } else {
          // This is a range
          ++idx;
          auto highBound = static_cast<uint8_t>(glob[idx]);
          if (highBound == '\\') {
            ++idx;
            if (idx >= glob.size()) {
              return folly::makeUnexpected<string>(
                  "unterminated escape in bracket range");
            }
            highBound = static_cast<uint8_t>(glob[idx]);
          }
          const auto lowBound = static_cast<uint8_t>(prevChar);
          // Don't even bother adding the range if the low bound is greater
          // than the high bound. (We don't treat the whole glob as invalid
          // though. We just ignore this one range, since it can never match
          // anything.)
          if (lowBound <= highBound) {
            charIntervals.insert(lowBound, highBound);
            if (caseSensitive == CaseSensitivity::Insensitive) {
              // If the range intersects with ['A', 'Z'], add the lowercase
              // counterpart of the intersection.
              if (highBound >= 'A' && lowBound <= 'Z') {
                charIntervals.insert(
                    toLower(std::clamp<uint8_t>(lowBound, 'A', 'Z')),
                    toLower(std::clamp<uint8_t>(highBound, 'A', 'Z')));
              }
              // If the range intersects with ['a', 'z'], add the uppercase
              // counterpart of the intersection.
              if (highBound >= 'a' && lowBound <= 'z') {
                charIntervals.insert(
                    toUpper(std::clamp<uint8_t>(lowBound, 'a', 'z')),
                    toUpper(std::clamp<uint8_t>(highBound, 'a', 'z')));
              }
            }
          }
          // The low bound has been consumed by the range (or dropped along
          // with an empty range).
          prevChar = NO_PREV_CHAR;
        }
      }
    } else if (c == '[') {
      // Look for a character class like [:alpha:]
      bool isClass = false;
      if (idx + 3 < glob.size() && glob[idx + 1] == ':') {
        auto classStart = idx + 2;
        for (auto end = classStart; end + 1 < glob.size(); ++end) {
          if (glob[end] == ':' && glob[end + 1] == ']') {
            std::string_view charClass{
                glob.data() + classStart, end - classStart};
            if (!addCharClass(charClass, caseSensitive, pattern)) {
              return folly::makeUnexpected<string>(
                  fmt::format("unknown character class \"{}\"", charClass));
            }
            // Leave idx on the ']' of ":]"; the loop's ++idx steps past it.
            idx = end + 1;
            isClass = true;
            break;
          }
        }
      }
      // This wasn't a character class.
      // Just treat this just as a literal '[' character.
      if (!isClass) {
        addPrevChar();
        prevChar = toByte(c);
      }
    } else {
      addPrevChar();
      prevChar = toByte(c);
    }
  }

  addPrevChar();
  // Add any user-specified ranges we collected along the way, with no
  // duplicates
  for (auto& interval : charIntervals.optimize()) {
    addCharClassRange(interval.first, interval.second, pattern);
  }
  pattern->push_back(GLOB_CHAR_CLASS_END);
  return idx;
}
// Append the encoding of the inclusive range [low, high] to the pattern
// buffer: the GLOB_CHAR_CLASS_RANGE marker followed by the two bound bytes.
void GlobMatcher::addCharClassRange(
    uint8_t low,
    uint8_t high,
    std::vector<uint8_t>* pattern) {
  XDCHECK_LE(low, high);
  const uint8_t encodedRange[] = {GLOB_CHAR_CLASS_RANGE, low, high};
  for (const uint8_t byte : encodedRange) {
    pattern->push_back(byte);
  }
}
// Append the members of a named character class (the NAME in "[:NAME:]") to
// the pattern buffer.
//
// Returns true if charClass is a recognized class name, or false (leaving the
// buffer untouched) if it is not.
bool GlobMatcher::addCharClass(
    std::string_view charClass,
    CaseSensitivity caseSensitive,
    vector<uint8_t>* pattern) {
  // Character class definitions.
  // These match the POSIX Standard Locale as defined in ISO/IEC 9945-2:1993
  const bool ignoreCase = caseSensitive == CaseSensitivity::Insensitive;

  if (charClass == "alnum") {
    addCharClassRange('a', 'z', pattern);
    addCharClassRange('A', 'Z', pattern);
    addCharClassRange('0', '9', pattern);
    return true;
  }

  // With case-insensitive matching, "upper" and "lower" cover the same
  // characters as "alpha". This check must stay ahead of the plain "lower" /
  // "upper" handling further down.
  const bool isLowerOrUpper = charClass == "lower" || charClass == "upper";
  if (charClass == "alpha" || (ignoreCase && isLowerOrUpper)) {
    addCharClassRange('a', 'z', pattern);
    addCharClassRange('A', 'Z', pattern);
    return true;
  }

  if (charClass == "blank") {
    for (const char ch : {' ', '\t'}) {
      pattern->push_back(ch);
    }
    return true;
  }

  if (charClass == "cntrl") {
    // POSIX locale cntrl definitions:
    // 0x00-0x1f,0x7f
    addCharClassRange(0x00, 0x1f, pattern);
    pattern->push_back(0x7f);
    return true;
  }

  if (charClass == "digit") {
    addCharClassRange('0', '9', pattern);
    return true;
  }

  if (charClass == "graph") {
    // POSIX locale graph definition: alnum + punct
    // This is everything from 0x21 - 0x7e
    addCharClassRange(0x21, 0x7e, pattern);
    return true;
  }

  if (charClass == "lower") {
    addCharClassRange('a', 'z', pattern);
    return true;
  }

  if (charClass == "print") {
    // POSIX locale print definition: alnum + punct + ' '
    // This is everything from 0x20 - 0x7e
    addCharClassRange(0x20, 0x7e, pattern);
    return true;
  }

  if (charClass == "punct") {
    // POSIX locale punct definitions:
    // 0x21-0x2f, 0x3a-0x40, 0x5b-0x60, 0x7b-0x7e
    addCharClassRange(0x21, 0x2f, pattern);
    addCharClassRange(0x3a, 0x40, pattern);
    addCharClassRange(0x5b, 0x60, pattern);
    addCharClassRange(0x7b, 0x7e, pattern);
    return true;
  }

  if (charClass == "space") {
    for (const char ch : {' ', '\f', '\n', '\r', '\t', '\v'}) {
      pattern->push_back(ch);
    }
    return true;
  }

  if (charClass == "upper") {
    addCharClassRange('A', 'Z', pattern);
    return true;
  }

  if (charClass == "xdigit") {
    addCharClassRange('0', '9', pattern);
    addCharClassRange('a', 'f', pattern);
    addCharClassRange('A', 'F', pattern);
    return true;
  }

  // Not a class name we know about.
  return false;
}
// Returns true if the compiled pattern matches the whole of `text`. Matching
// begins at the first byte of the text and the first opcode of the pattern.
bool GlobMatcher::match(std::string_view text) const {
  constexpr size_t kTextStart = 0;
  constexpr size_t kPatternStart = 0;
  return tryMatchAt(text, kTextStart, kPatternStart);
}
// Try to match the text starting at offset textIdx against the compiled
// pattern starting at offset patternIdx (which must refer to an opcode byte).
//
// Returns true if the remaining opcodes match exactly the remaining text.
// The wildcard opcodes (GLOB_STAR, GLOB_STAR_STAR_SLASH) call tryMatchAt()
// recursively to try each candidate position for the rest of the pattern.
bool GlobMatcher::tryMatchAt(
std::string_view text,
size_t textIdx,
size_t patternIdx) const {
// Loop through all opcodes in the pattern buffer.
// It's kind of unfortunate how big and complicated this while loop is.
//
// It would improve readability to break this down into one function per
// opcode, but then it would require additional conditional checks after each
// function to see if we should break out or keep going. Having everything
// inlined in this single while loop makes it very easy to break out early
// without additional checks.
//
// I have tried breaking this out into separate functions (and also using an
// array lookup to find the correct opcode handler, rather than just serial
// if checks). Unfortunately this did result in a performance hit.
while (patternIdx < pattern_.size()) {
if (pattern_[patternIdx] == GLOB_LITERAL) {
// A literal string section
// Layout: [GLOB_LITERAL][length byte][length bytes of literal data]
uint8_t length = pattern_[patternIdx + 1];
const uint8_t* literal = pattern_.data() + patternIdx + 2;
patternIdx += 2 + length;
if (patternIdx >= pattern_.size()) {
// This is the last section of the pattern.
// We can exit out early if the lengths don't match.
if (text.size() - textIdx != length) {
return false;
}
return isStringPieceEqual(
text.substr(textIdx, length),
std::string_view{reinterpret_cast<const char*>(literal), length},
caseSensitive_);
}
// Not the final piece of the pattern. We have to do the string compare
// (unless the text remaining is too short).
if (text.size() - textIdx < length) {
return false;
}
if (!isStringPieceEqual(
text.substr(textIdx, length),
std::string_view{reinterpret_cast<const char*>(literal), length},
caseSensitive_)) {
return false;
}
// Matched so far, keep going.
textIdx += length;
} else if (pattern_[patternIdx] == GLOB_STAR) {
// '*' matches 0 or more characters, excluding '/'
++patternIdx;
auto matchCanStartWithDot = pattern_[patternIdx] == GLOB_TRUE;
++patternIdx;
// If the glob cannot match text starting with a dot, but the text
// has a dot here, then it cannot match.
if (!matchCanStartWithDot && textIdx < text.size() &&
text[textIdx] == '.') {
return false;
}
if (patternIdx >= pattern_.size()) {
// This '*' is at the end of the pattern.
// We match as long as there are no more '/' characters
//
// NOTE(review): for an empty, default-constructed string_view
// text.data() may be null, and passing a null pointer to memchr is
// formally undefined even with a zero length. Confirm callers never
// pass such a view.
return memchr(text.data() + textIdx, '/', text.size() - textIdx) ==
nullptr;
} else if (pattern_[patternIdx] == GLOB_LITERAL) {
// This '*' is followed by a string literal.
// Jump ahead to the next place where we find this literal. Make sure
// we don't cross a '/'
auto literalLength = pattern_[patternIdx + 1];
std::string_view literalPattern{
reinterpret_cast<const char*>(pattern_.data()) + patternIdx + 2,
literalLength};
patternIdx += 2 + literalLength;
// The first '/' at or after the starting position: the '*' may not
// consume anything beyond it.
auto nextSlash = text.find('/', textIdx);
while (true) {
auto textPiece = text.substr(textIdx);
auto literalIdx = caseSensitive_ == CaseSensitivity::Sensitive
? qfind(
folly::StringPiece{textPiece},
folly::StringPiece{literalPattern},
folly::AsciiCaseSensitive{})
: qfind(
folly::StringPiece{textPiece},
folly::StringPiece{literalPattern},
folly::AsciiCaseInsensitive{});
if (literalIdx == std::string_view::npos) {
// No match.
return false;
}
// qfind() searched textPiece, so convert the result back into an index
// relative to the start of text.
literalIdx += textIdx;
if (nextSlash < literalIdx) {
return false;
}
if (tryMatchAt(text, literalIdx + literalLength, patternIdx)) {
return true;
}
// No match here. Move forwards and try again.
textIdx = literalIdx + 1;
}
} else {
// '*' followed by another glob special, such as ? or a character
// class. We inefficiently try matching forwards one character at a
// time.
//
// In practice this type of pattern is rare.
while (textIdx < text.size()) {
if (tryMatchAt(text, textIdx, patternIdx)) {
return true;
}
if (text[textIdx] == '/') {
return false;
}
++textIdx;
}
return false;
}
} else if (pattern_[patternIdx] == GLOB_ENDS_WITH) {
// Layout: [GLOB_ENDS_WITH][bool byte][length byte][literal data]
// Advance patternIdx to read the bool from the original GLOB_STAR.
++patternIdx;
auto matchCanStartWithDot = pattern_[patternIdx] == GLOB_TRUE;
// If the glob match is not allowed to start with a dot then we also
// reject cases where it matches the empty string followed by a dot.
// We intentionally do not allow `*.cpp` to match `.cpp`
// This matches the behavior of the POSIX fnmatch() function.
// Because any match of '*' will start from the current textIdx, we
// can return right away if we know any match would start with an
// illegal dot.
if (!matchCanStartWithDot && textIdx < text.size() &&
text[textIdx] == '.') {
return false;
}
// An "ends-with" section
// patternIdx still refers to the bool byte, so the length byte is at +1
// and the literal data starts at +2.
uint8_t length = pattern_[patternIdx + 1];
const uint8_t* literal = pattern_.data() + patternIdx + 2;
if (text.size() - textIdx < length) {
return false;
}
if (!isStringPieceEqual(
text.substr(text.size() - length),
std::string_view{reinterpret_cast<const char*>(literal), length},
caseSensitive_)) {
return false;
}
// The end of the text matched the desired literal.
// Now we just have to verify that there were no '/' characters in the
// preceding portion (that matches "*").
return memchr(
text.data() + textIdx,
'/',
text.size() - (textIdx + length)) == nullptr;
} else if (pattern_[patternIdx] == GLOB_STAR_STAR_END) {
// This is '**' at the end of a pattern. It matches everything else in
// the text. However, if this matcher was created with
// GlobOptions::IGNORE_DOTFILES, then we must ensure that none of the path
// components in the remaining text start with a '.'.
++patternIdx;
auto pathComponentInMatchCanStartWithDot =
pattern_[patternIdx] == GLOB_TRUE;
if (pathComponentInMatchCanStartWithDot) {
return true;
}
// By construction, we know that GLOB_STAR_STAR_END is preceded by a
// slash, so we can start from the previous character and scan the
// remaining text for "/." If we find one, then this is not a match.
//
// NOTE(review): when textIdx == 0 the scan starts at index 0 and only
// looks for "/.", so a '.' at text[0] is not detected by this check.
// Confirm whether that case can occur and whether it is intended.
auto searchIndex = textIdx == 0 ? 0 : textIdx - 1;
return text.find("/.", searchIndex) == std::string_view::npos;
} else if (pattern_[patternIdx] == GLOB_STAR_STAR_SLASH) {
++patternIdx;
auto pathComponentInMatchCannotStartWithDot =
pattern_[patternIdx] == GLOB_FALSE;
// This is "**/"
// It may match nothing at all, or it may match some arbitrary number of
// characters followed by a slash.
++patternIdx;
while (true) {
if (tryMatchAt(text, textIdx, patternIdx)) {
return true;
}
// The rest of the pattern did not match at textIdx: let "**/" swallow
// text up to and including the next '/' and try again from there. The
// search starts one character past prevTextIdx.
auto prevTextIdx = textIdx;
textIdx = text.find('/', prevTextIdx + 1);
if (textIdx == std::string_view::npos) {
// No match.
return false;
} else if (
pathComponentInMatchCannotStartWithDot &&
text[prevTextIdx] == '.') {
// Verify the path component does not start with an illegal dot
// before proceeding.
return false;
}
++textIdx;
}
} else {
// The other glob special patterns all match exactly one character.
// Get this character now.
if (textIdx >= text.size()) {
return false;
}
uint8_t ch = text[textIdx];
++textIdx;
// Git does not allow '/' to match any of these cases.
if (ch == '/') {
return false;
}
if (pattern_[patternIdx] == GLOB_CHAR_CLASS) {
// An inclusive character class
// charClassMatch() also advances patternIdx past the class data.
if (!charClassMatch(ch, &patternIdx)) {
return false;
}
} else if (pattern_[patternIdx] == GLOB_CHAR_CLASS_NEGATED) {
// An exclusive character class
if (charClassMatch(ch, &patternIdx)) {
return false;
}
} else if (pattern_[patternIdx] == GLOB_QMARK) {
// '?' matches any character except '/'
// (which we already excluded above)
++patternIdx;
} else {
// Unknown opcode. This should never happen.
XLOGF(
FATAL,
"unknown opcode {} in glob pattern buffer at index {}",
pattern_[patternIdx],
patternIdx);
}
}
}
// The whole pattern has been consumed; it is a match only if the whole text
// has been consumed as well.
return textIdx == text.size();
}
// Test whether `ch` is a member of the character class whose opcode byte is
// at *patternIdx.
//
// Returns true if some entry of the class matches `ch`. In both outcomes
// *patternIdx is advanced to the byte just past the class's
// GLOB_CHAR_CLASS_END terminator. Negation is not handled here; the return
// value only reports membership.
bool GlobMatcher::charClassMatch(uint8_t ch, size_t* patternIdx) const {
  // Step over the class opcode itself to reach the first entry.
  size_t cursor = *patternIdx + 1;

  // Phase 1: examine entries one at a time until one of them matches.
  bool found = false;
  while (!found) {
    XDCHECK_LT(cursor, pattern_.size());
    const uint8_t entry = pattern_[cursor];
    if (entry == GLOB_CHAR_CLASS_END) {
      // Reached the end of the character class with no match.
      *patternIdx = cursor + 1;
      return false;
    }
    if (entry == GLOB_CHAR_CLASS_RANGE) {
      XDCHECK_LT(cursor + 2, pattern_.size());
      const uint8_t lowBound = pattern_[cursor + 1];
      const uint8_t highBound = pattern_[cursor + 2];
      cursor += 3;
      found = (lowBound <= ch) && (ch <= highBound);
    } else {
      // A single literal character entry.
      found = (ch == entry);
      ++cursor;
    }
  }

  // Phase 2: we found a match, so skip over the remaining entries to find
  // GLOB_CHAR_CLASS_END.
  //
  // In theory we could put a length byte after the GLOB_CHAR_CLASS opcode,
  // similar to what we do for GLOB_LITERAL, so we could avoid scanning here.
  // However this would introduce some complications: we would potentially have
  // to re-arrange the data so it fits in 255 bytes. (Any character class can
  // be represented in 255 bytes, but our naive literal encoding currently
  // might end up using more than 255 bytes.) In practice character class data
  // is normally very short, so the cost of a scan doesn't really matter here.
  while (pattern_[cursor] != GLOB_CHAR_CLASS_END) {
    // A range entry occupies three bytes; any other entry occupies one.
    cursor += (pattern_[cursor] == GLOB_CHAR_CLASS_RANGE) ? 3 : 1;
  }
  *patternIdx = cursor + 1;
  return true;
}
} // namespace facebook::eden