mirror of
https://github.com/fletcher/MultiMarkdown-6.git
synced 2024-10-04 10:37:08 +03:00
588 lines
12 KiB
C
588 lines
12 KiB
C
/**
|
|
|
|
C-Template -- Boilerplate c project with cmake support, CuTest unit testing, and more.
|
|
|
|
@file aho-corasick.c
|
|
|
|
@brief C implementation of the Aho-Corasick algorithm for searching text
|
|
for multiple strings simultaneously in a single pass without backtracking.
|
|
|
|
<https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm>
|
|
|
|
|
|
@author Fletcher T. Penney
|
|
@bug
|
|
|
|
**/
|
|
|
|
/*
|
|
|
|
Copyright © 2015-2017 Fletcher T. Penney.
|
|
|
|
|
|
The `c-template` project is released under the MIT License.
|
|
|
|
GLibFacade.c and GLibFacade.h are from the MultiMarkdown v4 project:
|
|
|
|
https://github.com/fletcher/MultiMarkdown-4/
|
|
|
|
MMD 4 is released under both the MIT License and GPL.
|
|
|
|
|
|
CuTest is released under the zlib/libpng license. See CuTest.c for the text
|
|
of the license.
|
|
|
|
|
|
## The MIT License ##
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <stdbool.h>
|
|
|
|
#include "aho-corasick.h"
|
|
|
|
#define kTrieStartingSize 256
|
|
|
|
void trie_to_graphviz(trie * a);
|
|
|
|
|
|
trie * trie_new(size_t startingSize) {
|
|
trie * a = malloc(sizeof(trie));
|
|
|
|
if (a) {
|
|
if (startingSize <= 1)
|
|
startingSize = kTrieStartingSize;
|
|
|
|
a->node = malloc(sizeof(trie_node) * startingSize);
|
|
|
|
if (!a->node) {
|
|
free(a);
|
|
return NULL;
|
|
}
|
|
|
|
// Clear memory
|
|
memset(a->node, 0, sizeof(trie_node) * startingSize);
|
|
|
|
// All tries have a root node
|
|
a->size = 1;
|
|
a->capacity = startingSize;
|
|
}
|
|
|
|
return a;
|
|
}
|
|
|
|
|
|
void trie_free(trie * a) {
|
|
free(a->node);
|
|
free(a);
|
|
}
|
|
|
|
|
|
bool trie_node_insert(trie * a, size_t s, const unsigned char * key, unsigned short match_type, unsigned short depth) {
|
|
// Get node for state s
|
|
trie_node * n = &a->node[s];
|
|
|
|
size_t i;
|
|
|
|
if (key[0] == '\0') {
|
|
// We've hit end of key
|
|
n->match_type = match_type;
|
|
n->len = depth;
|
|
return true; // Success
|
|
}
|
|
|
|
if (n->child[key[0]] != 0) {
|
|
// First character already in trie, advance forward
|
|
return trie_node_insert(a, n->child[key[0]], key + 1, match_type, ++depth);
|
|
} else {
|
|
// Create new node
|
|
|
|
// Ensure capacity
|
|
if (a->size == a->capacity) {
|
|
a->capacity *= 2;
|
|
a->node = realloc(a->node, a->capacity * sizeof(trie_node));
|
|
|
|
// Set n to new location
|
|
n = &(a->node[s]);
|
|
}
|
|
|
|
// Current node points to next node
|
|
i = a->size;
|
|
n->child[key[0]] = i;
|
|
|
|
// Initialize new node to 0
|
|
n = &a->node[i];
|
|
memset(n, 0, sizeof(trie_node));
|
|
|
|
// Set char for new node
|
|
n->c = key[0];
|
|
|
|
// Incremement size
|
|
a->size++;
|
|
|
|
// Advance forward
|
|
return trie_node_insert(a, i, key + 1, match_type, ++depth);
|
|
}
|
|
}
|
|
|
|
|
|
bool trie_insert(trie * a, const char * key, unsigned short match_type) {
|
|
if (a && key && (key[0] != '\0')) {
|
|
return trie_node_insert(a, 0, (const unsigned char *)key, match_type, 0);
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
#ifdef TEST
|
|
void Test_trie_insert(CuTest* tc) {
|
|
trie * a = trie_new(0);
|
|
|
|
CuAssertIntEquals(tc, kTrieStartingSize, a->capacity);
|
|
CuAssertIntEquals(tc, 1, a->size);
|
|
|
|
trie_insert(a, "foo", 42);
|
|
|
|
trie_node * n = &a->node[0];
|
|
CuAssertIntEquals(tc, 0, n->match_type);
|
|
CuAssertIntEquals(tc, 1, n->child['f']);
|
|
CuAssertIntEquals(tc, '\0', n->c);
|
|
|
|
n = &a->node[1];
|
|
CuAssertIntEquals(tc, 0, n->match_type);
|
|
CuAssertIntEquals(tc, 2, n->child['o']);
|
|
CuAssertIntEquals(tc, 'f', n->c);
|
|
|
|
n = &a->node[2];
|
|
CuAssertIntEquals(tc, 0, n->match_type);
|
|
CuAssertIntEquals(tc, 3, n->child['o']);
|
|
CuAssertIntEquals(tc, 'o', n->c);
|
|
|
|
n = &a->node[3];
|
|
CuAssertIntEquals(tc, 42, n->match_type);
|
|
CuAssertIntEquals(tc, 3, n->len);
|
|
CuAssertIntEquals(tc, 'o', n->c);
|
|
|
|
trie_free(a);
|
|
}
|
|
#endif
|
|
|
|
|
|
size_t trie_node_search(trie * a, size_t s, const char * query) {
|
|
if (query[0] == '\0') {
|
|
// Found matching state
|
|
return s;
|
|
}
|
|
|
|
if (a->node[s].child[(unsigned char)query[0]] == 0) {
|
|
// Failed to match
|
|
return -1;
|
|
}
|
|
|
|
// Partial match, keep going
|
|
return trie_node_search(a, a->node[s].child[(unsigned char)query[0]], query + 1);
|
|
}
|
|
|
|
|
|
size_t trie_search(trie * a, const char * query) {
|
|
if (a && query) {
|
|
return trie_node_search(a, 0, query);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
unsigned short trie_search_match_type(trie * a, const char * query) {
|
|
size_t s = trie_search(a, query);
|
|
|
|
if (s == -1)
|
|
return -1;
|
|
|
|
return a->node[s].match_type;
|
|
}
|
|
|
|
|
|
#ifdef TEST
|
|
void Test_trie_search(CuTest* tc) {
|
|
trie * a = trie_new(0);
|
|
|
|
trie_insert(a, "foo", 42);
|
|
trie_insert(a, "bar", 41);
|
|
trie_insert(a, "food", 40);
|
|
|
|
CuAssertIntEquals(tc, 3, trie_search(a, "foo"));
|
|
CuAssertIntEquals(tc, 42, trie_search_match_type(a, "foo"));
|
|
|
|
CuAssertIntEquals(tc, 6, trie_search(a, "bar"));
|
|
CuAssertIntEquals(tc, 41, trie_search_match_type(a, "bar"));
|
|
|
|
CuAssertIntEquals(tc, 7, trie_search(a, "food"));
|
|
CuAssertIntEquals(tc, 40, trie_search_match_type(a, "food"));
|
|
|
|
CuAssertIntEquals(tc, -1, trie_search(a, "foot"));
|
|
CuAssertIntEquals(tc, (unsigned short) -1, trie_search_match_type(a, "foot"));
|
|
|
|
trie_free(a);
|
|
}
|
|
#endif
|
|
|
|
|
|
void ac_trie_node_prepare(trie * a, size_t s, char * buffer, unsigned short depth, size_t last_match_state) {
|
|
|
|
buffer[depth] = '\0';
|
|
buffer[depth + 1] = '\0';
|
|
|
|
// Current node
|
|
trie_node * n = &(a->node[s]);
|
|
|
|
char * suffix = buffer;
|
|
|
|
// Longest match seen so far??
|
|
suffix += 1;
|
|
|
|
// Find valid suffixes for failure path
|
|
while ((suffix[0] != '\0') && (n->ac_fail == 0)) {
|
|
n->ac_fail = trie_search(a, suffix);
|
|
|
|
if (n->ac_fail == -1)
|
|
n->ac_fail = 0;
|
|
|
|
if (n->ac_fail == s) {
|
|
// Something went wrong
|
|
fprintf(stderr, "Recursive trie fallback detected at state %lu('%c') - suffix:'%s'!\n", s, n->c, suffix);
|
|
n->ac_fail = 0;
|
|
}
|
|
|
|
suffix++;
|
|
}
|
|
|
|
|
|
// Prepare children
|
|
for (int i = 0; i < 256; ++i)
|
|
{
|
|
if ((n->child[i] != 0) &&
|
|
(n->child[i] != s)) {
|
|
buffer[depth] = i;
|
|
|
|
ac_trie_node_prepare(a, n->child[i], buffer, depth + 1, last_match_state);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Prepare trie for Aho-Corasick search algorithm by mapping failure connections
|
|
void ac_trie_prepare(trie * a) {
|
|
// Clear old pointers
|
|
for (size_t i = 0; i < a->size; ++i)
|
|
{
|
|
a->node[i].ac_fail = 0;
|
|
}
|
|
|
|
// Create a buffer to use
|
|
char buffer[a->capacity];
|
|
|
|
ac_trie_node_prepare(a, 0, buffer, 0, 0);
|
|
}
|
|
|
|
|
|
|
|
#ifdef TEST
|
|
void Test_trie_prepare(CuTest* tc) {
|
|
trie * a = trie_new(0);
|
|
|
|
trie_insert(a, "a", 1);
|
|
trie_insert(a, "aa", 2);
|
|
trie_insert(a, "aaa", 3);
|
|
trie_insert(a, "aaaa", 4);
|
|
|
|
ac_trie_prepare(a);
|
|
|
|
trie_free(a);
|
|
}
|
|
#endif
|
|
|
|
|
|
match * match_new(size_t start, size_t len, unsigned short match_type) {
|
|
match * m = malloc(sizeof(match));
|
|
|
|
if (m) {
|
|
m->start = start;
|
|
m->len = len;
|
|
m->match_type = match_type;
|
|
m->next = NULL;
|
|
m->prev = NULL;
|
|
}
|
|
|
|
return m;
|
|
}
|
|
|
|
|
|
void match_free(match * m) {
|
|
if (m) {
|
|
if (m->next) {
|
|
match_free(m->next);
|
|
}
|
|
|
|
free(m);
|
|
}
|
|
}
|
|
|
|
|
|
match * match_add(match * last, size_t start, size_t len, unsigned short match_type) {
|
|
if (last) {
|
|
last->next = match_new(start, len, match_type);
|
|
last->next->prev = last;
|
|
return last->next;
|
|
} else {
|
|
return match_new(start, len, match_type);
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
|
|
match * ac_trie_search(trie * a, const char * source, size_t start, size_t len) {
|
|
|
|
// Store results in a linked list
|
|
// match * result = match_new(0, 0, 0);
|
|
match * result = NULL;
|
|
match * m = result;
|
|
|
|
// Keep track of our state
|
|
size_t state = 0;
|
|
size_t temp_state;
|
|
|
|
// Character being compared
|
|
unsigned char test_value;
|
|
size_t counter = start;
|
|
size_t stop = start + len;
|
|
|
|
while ((counter < stop) && (source[counter] != '\0')) {
|
|
// Read next character
|
|
test_value = (unsigned char)source[counter++];
|
|
|
|
// Check for path that allows us to match next character
|
|
while (state != 0 && a->node[state].child[test_value] == 0) {
|
|
state = a->node[state].ac_fail;
|
|
}
|
|
|
|
// Advance state for the next character
|
|
state = a->node[state].child[test_value];
|
|
|
|
// Check for partial matches
|
|
temp_state = state;
|
|
|
|
while (temp_state != 0) {
|
|
if (a->node[temp_state].match_type) {
|
|
// This is a match
|
|
if (!m) {
|
|
result = match_new(0, 0, 0);
|
|
m = result;
|
|
}
|
|
m = match_add(m, counter - a->node[temp_state].len,
|
|
a->node[temp_state].len, a->node[temp_state].match_type);
|
|
}
|
|
|
|
// Iterate to find shorter matches
|
|
temp_state = a->node[temp_state].ac_fail;
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
void match_excise(match * m) {
|
|
if (m->prev) {
|
|
m->prev->next = m->next;
|
|
}
|
|
|
|
if (m->next) {
|
|
m->next->prev = m->prev;
|
|
}
|
|
|
|
free(m);
|
|
}
|
|
|
|
int match_count(match * m) {
|
|
int result = 0;
|
|
m = m->next; // Skip header
|
|
|
|
while (m) {
|
|
result++;
|
|
m = m->next;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
void match_describe(match * m, const char * source) {
|
|
fprintf(stderr, "'%.*s'(%d) at %lu:%lu\n", (int)m->len, &source[m->start],
|
|
m->match_type, m->start, m->start + m->len);
|
|
}
|
|
|
|
|
|
void match_set_describe(match * m, const char * source) {
|
|
m = m->next; // Skip header
|
|
while (m) {
|
|
match_describe(m, source);
|
|
m = m->next;
|
|
}
|
|
}
|
|
|
|
|
|
void match_set_filter_leftmost_longest(match * header) {
|
|
// Filter results to include only leftmost/longest results
|
|
match * m = header->next; // Skip header
|
|
match * n;
|
|
|
|
while (m) {
|
|
if (m->next) {
|
|
if (m->start == m->next->start) {
|
|
// The next match is longer than this one
|
|
n = m;
|
|
m = m->next;
|
|
match_excise(n);
|
|
continue;
|
|
}
|
|
|
|
while (m->next &&
|
|
m->next->start > m->start &&
|
|
m->next->start < m->start + m->len) {
|
|
// This match is "lefter" than next
|
|
match_excise(m->next);
|
|
}
|
|
|
|
while (m->next &&
|
|
m->next->start < m->start) {
|
|
// Next match is "lefter" than us
|
|
n = m;
|
|
m = m->prev;
|
|
match_excise(n);
|
|
}
|
|
}
|
|
|
|
while (m->prev &&
|
|
m->prev->len &&
|
|
m->prev->start >= m->start) {
|
|
// We are "lefter" than previous
|
|
n = m->prev;
|
|
match_excise(n);
|
|
}
|
|
|
|
m = m->next;
|
|
}
|
|
}
|
|
|
|
|
|
match * ac_trie_leftmost_longest_search(trie * a, const char * source, size_t start, size_t len) {
|
|
match * result = ac_trie_search(a, source, start, len);
|
|
|
|
if (result)
|
|
match_set_filter_leftmost_longest(result);
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
#ifdef TEST
|
|
void Test_aho_trie_search(CuTest* tc) {
|
|
trie * a = trie_new(0);
|
|
|
|
trie_insert(a, "foo", 42);
|
|
trie_insert(a, "bar", 41);
|
|
trie_insert(a, "food", 40);
|
|
|
|
ac_trie_prepare(a);
|
|
|
|
match * m = ac_trie_search(a, "this is a bar that serves food.", 0, 31);
|
|
|
|
match_free(m);
|
|
trie_free(a);
|
|
|
|
|
|
a = trie_new(0);
|
|
|
|
trie_insert(a, "A", 1);
|
|
trie_insert(a, "AB", 2);
|
|
trie_insert(a, "ABC", 3);
|
|
trie_insert(a, "BC", 4);
|
|
trie_insert(a, "BCD", 5);
|
|
trie_insert(a, "E", 6);
|
|
trie_insert(a, "EFGHIJ", 7);
|
|
trie_insert(a, "F", 8);
|
|
trie_insert(a, "ZABCABCZ", 9);
|
|
trie_insert(a, "ZAB", 10);
|
|
|
|
ac_trie_prepare(a);
|
|
|
|
m = ac_trie_search(a, "ABCDEFGGGAZABCABCDZABCABCZ", 0, 26);
|
|
fprintf(stderr, "Finish with %d matches\n", match_count(m));
|
|
match_set_describe(m, "ABCDEFGGGAZABCABCDZABCABCZ");
|
|
match_free(m);
|
|
|
|
m = ac_trie_leftmost_longest_search(a, "ABCDEFGGGAZABCABCDZABCABCZ", 0, 26);
|
|
fprintf(stderr, "Finish with %d matches\n", match_count(m));
|
|
match_set_describe(m, "ABCDEFGGGAZABCABCDZABCABCZ");
|
|
match_free(m);
|
|
|
|
// trie_to_graphviz(a);
|
|
|
|
trie_free(a);
|
|
}
|
|
#endif
|
|
|
|
|
|
void trie_node_to_graphviz(trie * a, size_t s) {
|
|
trie_node * n = &a->node[s];
|
|
|
|
if (n->match_type)
|
|
fprintf(stderr, "\"%lu\" [shape=doublecircle]\n", s);
|
|
|
|
for (int i = 0; i < 256; ++i)
|
|
{
|
|
if (n->child[i]) {
|
|
switch (i) {
|
|
default:
|
|
fprintf(stderr, "\"%lu\" -> \"%lu\" [label=\"%c\"]\n", s, n->child[i], (char)i);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (n->ac_fail)
|
|
fprintf(stderr, "\"%lu\" -> \"%lu\" [label=\"fail\"]\n", s, n->ac_fail);
|
|
}
|
|
|
|
|
|
void trie_to_graphviz(trie * a) {
|
|
fprintf(stderr, "digraph dfa {\n");
|
|
for (int i = 0; i < a->size; ++i)
|
|
{
|
|
trie_node_to_graphviz(a, i);
|
|
}
|
|
fprintf(stderr, "}\n");
|
|
}
|
|
|