1
1
mirror of https://github.com/rui314/mold.git synced 2024-09-20 01:18:53 +03:00
mold/multi-glob.cc
2022-07-04 13:52:34 +08:00

169 lines
3.9 KiB
C++

// This file implements the Aho-Corasick algorithm to match multiple
// glob patterns to symbol strings as quickly as possible.
//
// Here are some examples of glob patterns:
//
// qt_private_api_tag*
// *16QAccessibleCache*
// *32QAbstractFileIconProviderPrivate*
// *17QPixmapIconEngine*
//
// `*` is a wildcard that matches any substring. We sometimes have
// hundreds of glob patterns and have to match them against millions
// of symbol strings.
//
// Aho-Corasick cannot handle complex patterns such as `*foo*bar*`.
// We handle such patterns with the Glob class. Glob is relatively
// slow, but complex patterns are rare in practice, so it should be
// OK.
#include "mold.h"
#include <queue>
#include <regex>
namespace mold {
std::optional<u32> MultiGlob::find(std::string_view str) {
std::call_once(once, [&] { compile(); });
u32 idx = UINT32_MAX;
if (root) {
// Match against simple glob patterns
TrieNode *node = root.get();
auto walk = [&](u8 c) {
for (;;) {
if (node->children[c]) {
node = node->children[c].get();
idx = std::min(idx, node->value);
return;
}
if (!node->suffix_link)
return;
node = node->suffix_link;
}
};
walk('\0');
for (u8 c : str)
walk(c);
walk('\0');
}
// Match against complex glob patterns
for (std::pair<Glob, u32> &glob : globs)
if (glob.first.match(str))
idx = std::min(idx, glob.second);
if (idx == UINT32_MAX)
return {};
return values[idx];
}
// Returns true if `pat` is a "simple" glob: an optional leading `*`,
// followed by one or more characters none of which is `*`, `[` or `?`,
// followed by an optional trailing `*`.
//
// This is a hand-written equivalent of matching the whole pattern
// against the regex `\*?[^*[?]+\*?`. A linear scan avoids constructing
// and running a std::regex, whose matcher is slow and may use stack
// space proportional to the input length.
static bool is_simple_pattern(std::string_view pat) {
  // Strip at most one wildcard from each end.
  if (!pat.empty() && pat.front() == '*')
    pat.remove_prefix(1);
  if (!pat.empty() && pat.back() == '*')
    pat.remove_suffix(1);

  // What remains must be a non-empty run of literal characters.
  return !pat.empty() && pat.find_first_of("*[?") == std::string_view::npos;
}
// Rewrites a simple glob pattern into the byte string that is inserted
// into the trie. Aho-Corasick can only do substring matching, so \0 is
// used as a beginning/end-of-string marker wherever the pattern is not
// open-ended:
//
//   "foo"   -> "\0foo\0"
//   "*foo"  -> "foo\0"
//   "foo*"  -> "\0foo"
//   "*foo*" -> "foo"
static std::string handle_stars(std::string_view pat) {
  const bool open_front = !pat.empty() && pat.front() == '*';
  const bool open_back = !pat.empty() && pat.back() == '*';

  // Drop the wildcards to get the literal part. The emptiness check
  // covers a lone "*", which is both the leading and trailing star.
  std::string_view literal = pat;
  if (open_front)
    literal.remove_prefix(1);
  if (open_back && !literal.empty())
    literal.remove_suffix(1);

  std::string out;
  out.reserve(literal.size() + 2);
  if (!open_front)
    out.push_back('\0');
  out.append(literal.data(), literal.size());
  if (!open_back)
    out.push_back('\0');
  return out;
}
bool MultiGlob::add(std::string_view pat, u32 val) {
assert(!is_compiled);
assert(!pat.empty());
u32 idx = strings.size();
strings.push_back(std::string(pat));
values.push_back(val);
// Complex glob pattern
if (!is_simple_pattern(pat)) {
if (std::optional<Glob> glob = Glob::compile(pat)) {
globs.push_back({std::move(*glob), idx});
return true;
}
return false;
}
// Simple glob pattern
if (!root)
root.reset(new TrieNode);
TrieNode *node = root.get();
for (u8 c : handle_stars(pat)) {
if (!node->children[c])
node->children[c].reset(new TrieNode);
node = node->children[c].get();
}
node->value = std::min(node->value, idx);
return true;
}
// Finalizes the matcher: marks it as compiled and, if any simple
// pattern created a trie, computes the suffix links and then
// propagates values along them.
void MultiGlob::compile() {
  is_compiled = true;

  // No simple pattern was added, so there is no trie to finish.
  if (!root)
    return;

  fix_suffix_links(*root);
  fix_values();
}
// Computes the suffix link of every node below `node`.
//
// The suffix link of a child reached from its parent via byte `i` is
// found by following the parent's suffix-link chain until some node has
// an outgoing edge for `i`; if the chain runs out, the link is the
// root.
//
// The traversal must be breadth-first. Following the chain reads the
// suffix links of nodes that are strictly shallower than the child, and
// those nodes may live in any subtree. A depth-first walk would reach
// deep nodes of an early subtree while shallow nodes of later subtrees
// still have a null suffix link, and the chain would stop early and
// wrongly fall back to the root. Processing the trie level by level
// guarantees every link on the chain is already final. As a side
// benefit, this does not recurse, so stack use does not grow with the
// pattern length.
void MultiGlob::fix_suffix_links(TrieNode &node) {
  std::queue<TrieNode *> queue;
  queue.push(&node);

  while (!queue.empty()) {
    TrieNode *parent = queue.front();
    queue.pop();

    for (i64 i = 0; i < 256; i++) {
      if (!parent->children[i])
        continue;

      TrieNode &child = *parent->children[i];

      // Find the nearest node on the parent's suffix chain that can
      // also be extended by byte `i`.
      TrieNode *cur = parent->suffix_link;
      while (cur && !cur->children[i])
        cur = cur->suffix_link;

      child.suffix_link = cur ? cur->children[i].get() : root.get();
      queue.push(&child);
    }
  }
}
void MultiGlob::fix_values() {
std::queue<TrieNode *> queue;
queue.push(root.get());
do {
TrieNode *node = queue.front();
queue.pop();
for (std::unique_ptr<TrieNode> &child : node->children) {
if (!child)
continue;
child->value = std::min(child->value, child->suffix_link->value);
queue.push(child.get());
}
} while (!queue.empty());
}
} // namespace mold