reorganize Pattern library (#1257)

* feat!: implemented Pattern.match which returns start and end indices of the first match

* fix: Pattern.match now returns correct end index

* feat: moved Pattern.match-str and Pattern.find from C to Carp code

* chore: fix build after merges

* chore: fix build after merges

* feat: moved Pattern.find-all from C to Carp code

* feat: Pattern.global-match-str no longer relies on Pattern.global-match

* docs: updated for Pattern.global-match

* fix: moved str/prn functions into sub module

* fix: removed unused functions from carp_pattern.h (using cflow)

* feat!: renamed (Pattern.global-match) to (Pattern.match-all-groups)

* fix: unit test

* fix: some functions renamed to match Carp style

Co-authored-by: guberatsie <gunnar.bernhardt@siemens.com>
This commit is contained in:
guberathome 2021-06-23 22:07:56 +02:00 committed by GitHub
parent c592485783
commit e412559380
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 78 additions and 125 deletions

View File

@ -1,14 +1,62 @@
(system-include "carp_pattern.h") (system-include "carp_pattern.h")
(defmodule Pattern (defmodule Pattern
(register-type MatchResult "PatternMatchResult" [start Int, end Int])
; Printing support for Pattern.MatchResult, both by reference and by value.
(defmodule MatchResult
; Renders a referenced match result as "(MatchResult start=<start> end=<end>)".
(defn ref-str [ref-matchres]
(fmt "(MatchResult start=%d end=%d)"
(MatchResult.start ref-matchres)
(MatchResult.end ref-matchres) ))
; ref-str is registered as both the `str` and the `prn` implementation.
(implements str Pattern.MatchResult.ref-str)
(implements prn Pattern.MatchResult.ref-str)
; By-value variant: borrows its argument and delegates to ref-str.
(defn str [matchres]
(Pattern.MatchResult.ref-str &matchres) )
(implements str Pattern.MatchResult.str)
(implements prn Pattern.MatchResult.str)
)
; True when the referenced match result describes "no match", i.e. when
; either of its two indices is negative.
(defn non-match? [match-res]
  (let [from @(MatchResult.start match-res)
        upto @(MatchResult.end match-res)]
    (or (Int.< from 0) (Int.< upto 0))))
; Cuts the matched text out of `data`.
; Yields (Maybe.Just <slice from start to end>) for a real match and
; (Maybe.Nothing) when `match-res` is a non-match.
(defn extract [match-res data]
  (if (not (non-match? match-res))
    (Maybe.Just (String.slice data
                              @(MatchResult.start match-res)
                              @(MatchResult.end match-res)))
    (Maybe.Nothing)))
(doc match-from "returns start and end indizes of the first match after start-pos. Note that the end index points to the 1st character _after_ the match (like all slice functions).")
; Bound to an external implementation via `register`; takes a pattern
; reference, a string reference and the start position.
(register match-from (Fn [&Pattern &String Int] Pattern.MatchResult))
(doc match "returns start and end indizes of the first match after the start of the string. Note that the end index points to the 1st character _after_ the match (like all slice functions).")
; Convenience wrapper: match-from with a start position of 0.
(defn match [pattern data]
(match-from pattern data 0) )
(doc find "finds the index of a pattern in a string. (doc find "finds the index of a pattern in a string.
Returns `-1` if it doesnt find a matching pattern.") Returns `-1` if it doesnt find a matching pattern.")
(register find (Fn [&Pattern &String] Int)) (defn find [pattern data]
(doc find-all "finds all indices of a pattern in a string. @(Pattern.MatchResult.start &(Pattern.match pattern data)) )
(doc find-all "finds all indices of a pattern in a string. The patterns may _not_ overlap.
Returns `[]` if it doesnt find a matching pattern.") Returns `[]` if it doesnt find a matching pattern.")
(register find-all (Fn [&Pattern &String] (Array Int))) (defn find-all-matches [pattern data]
(let-do [result []
stop (String.length data)
found (match-from pattern data 0)
start @(MatchResult.end &found) ]
(while-do (and (Int.<= start stop)
(not (non-match? &found)) )
(set! result (Array.push-back result found))
(set! found (match-from pattern data start))
(set! start @(MatchResult.end &found)) )
result ))
; Start index of every match reported by find-all-matches, in the same order.
(defn find-all [pattern data]
  (let [matches (find-all-matches pattern data)]
    (Array.copy-map
      &(fn [hit] @(MatchResult.start hit))
      &matches)))
(doc match-groups "finds the match groups of the first match of a pattern in (doc match-groups "finds the match groups of the first match of a pattern in
a string. a string.
@ -16,13 +64,15 @@ Returns `[]` if it doesnt find a matching pattern.")
(register match-groups (Fn [&Pattern &String] (Array String))) (register match-groups (Fn [&Pattern &String] (Array String)))
(doc match-str "finds the first match of a pattern in a string. (doc match-str "finds the first match of a pattern in a string.
Returns `[]` if it doesnt find a matching pattern.") Returns an empty string if it doesnt find a matching pattern.")
(register match-str (Fn [&Pattern &String] String)) (defn match-str [pattern data]
(doc global-match "finds all matches of a pattern in a string as a nested (Maybe.from (Pattern.extract &(Pattern.match pattern data) data) @"") )
(doc match-all-groups "finds all match groups of a pattern in a string as a nested
array. array.
Returns `[]` if it doesnt find a matching pattern.") Returns `[]` if it doesnt find a matching pattern.")
(register global-match (Fn [&Pattern &String] (Array (Array String)))) (register match-all-groups (Fn [&Pattern &String] (Array (Array String))))
(doc substitute "finds all matches of a pattern in a string and replaces it (doc substitute "finds all matches of a pattern in a string and replaces it
by another pattern `n` times. by another pattern `n` times.
@ -52,8 +102,11 @@ list of those characters.")
(defn from-chars [chars] (defn from-chars [chars]
(Pattern.init &(str* @"[" (String.from-chars chars) @"]"))) (Pattern.init &(str* @"[" (String.from-chars chars) @"]")))
(defn global-match-str [p s] (defn global-match-str [pattern data]
(Array.copy-map &(fn [x] @(Array.unsafe-first x)) &(global-match p s))) (Array.copy-map
&(fn [m] (Maybe.unsafe-from (extract m data)))
&(find-all-matches pattern data)))
(doc split "splits a string by a pattern.") (doc split "splits a string by a pattern.")
(defn split [p s] (defn split [p s]

View File

@ -390,24 +390,6 @@ init: /* using goto's to optimize tail recursion */
return s; return s;
} }
/*
 * Length-bounded substring search: looks for the l2 bytes at s2 inside the
 * l1 bytes at s1.
 * Returns a pointer to the first occurrence, s1 itself when l2 is 0, or NULL
 * when s2 does not occur (including when it is longer than s1).
 */
String Pattern_internal_lmemfind(String s1, size_t l1, String s2, size_t l2) {
    if (l2 == 0) return s1;   /* the empty needle matches right away */
    if (l2 > l1) return NULL; /* needle longer than haystack; also keeps the window non-negative */

    const size_t tail = l2 - 1; /* bytes left to compare once memchr found the 1st one */
    size_t window = l1 - tail;  /* a match cannot start beyond this many bytes */
    String hit;

    while (window > 0) {
        hit = (String)memchr(s1, *s2, window);
        if (hit == NULL) break;
        if (memcmp(hit + 1, s2 + 1, tail) == 0) return hit;
        /* false candidate: resume right after it with a smaller window */
        window -= (size_t)((hit + 1) - s1);
        s1 = hit + 1;
    }
    return NULL; /* not found */
}
String String_copy_len(String s, int len) { String String_copy_len(String s, int len) {
String ptr = CARP_MALLOC(len + 1); String ptr = CARP_MALLOC(len + 1);
memcpy(ptr, s, len); memcpy(ptr, s, len);
@ -452,17 +434,6 @@ Array Pattern_internal_push_captures(PatternMatchState *ms, String s,
return res; return res;
} }
/*
 * Reports whether the pattern is free of special characters.
 * Returns 1 when none of the characters in SPECIALS occurs in the first l
 * bytes of p, 0 otherwise. The scan proceeds NUL-terminated segment by
 * segment, so text following an embedded '\0' is inspected as well.
 */
int Pattern_internal_nospecials(String p, size_t l) {
    size_t offset = 0;
    for (;;) {
        String segment = p + offset;
        if (strpbrk(segment, SPECIALS) != NULL) return 0; /* special character found */
        offset += strlen(segment) + 1; /* hop over this segment and its terminator */
        if (offset > l) return 1;      /* whole pattern scanned, nothing special */
    }
}
void Pattern_internal_prepstate(PatternMatchState *ms, String s, size_t ls, void Pattern_internal_prepstate(PatternMatchState *ms, String s, size_t ls,
String p, size_t lp) { String p, size_t lp) {
ms->matchdepth = MAXCCALLS; ms->matchdepth = MAXCCALLS;
@ -476,79 +447,6 @@ void Pattern_internal_reprepstate(PatternMatchState *ms) {
assert(ms->matchdepth == MAXCCALLS); assert(ms->matchdepth == MAXCCALLS);
} }
/*
 * Returns the offset of the first position in *s at which pattern *p matches,
 * or -1 when there is none.
 * Patterns without special characters are located with a plain substring
 * search; everything else goes through the pattern matcher, trying each start
 * position in turn (only the first one when the pattern begins with '^').
 */
int Pattern_find(Pattern *p, String *s) {
    String str = *s;
    Pattern pat = *p;
    int lstr = strlen(str);
    int lpat = strlen(pat);

    if (Pattern_internal_nospecials(pat, lpat)) {
        /* plain search */
        String hit = Pattern_internal_lmemfind(str, lstr, pat, lpat);
        return hit ? (int)(hit - str) : -1;
    }

    const int anchored = (*pat == '^');
    if (anchored) {
        /* skip the anchor character */
        ++pat;
        --lpat;
    }

    PatternMatchState ms;
    Pattern_internal_prepstate(&ms, str, lstr, pat, lpat);

    String cursor = str;
    do {
        Pattern_internal_reprepstate(&ms);
        if (Pattern_internal_match(&ms, cursor, pat) != NULL) return cursor - str;
    } while (cursor++ < ms.src_end && !anchored);

    return -1;
}
/* TODO: this is duplicated behavior, almost equivalent to Array_push_back */
void Pattern_internal_update_int_array(Array *a, int value) {
a->len++;
if (a->len > a->capacity) {
a->capacity = a->len * 2;
a->data = CARP_REALLOC(a->data, sizeof(int) * a->capacity);
}
((int *)a->data)[a->len - 1] = value;
}
/*
 * Collects the offset (relative to the start of *s) of every position at
 * which pattern *p matches and returns them as an Array of int, in ascending
 * order. The result is empty (len 0, data NULL) when nothing matches.
 *
 * Patterns without special characters are located with a plain substring
 * search; all other patterns go through the pattern matcher, which is tried
 * at each start position (only the first one when the pattern begins with
 * '^'). Both paths test every start position, so overlapping occurrences are
 * each reported.
 */
Array Pattern_find_MINUS_all(Pattern *p, String *s) {
    String str = *s;
    Pattern pat = *p;
    int lstr = strlen(str);
    int lpat = strlen(pat);
    Array res;
    res.len = 0;
    res.capacity = 0;
    res.data = NULL;
    /* explicit request or no special characters? */
    if (Pattern_internal_nospecials(pat, lpat)) {
        String from = str;        /* where the next plain search starts */
        String stop = str + lstr; /* the terminating '\0' of the subject */
        while (1) {
            /* do a plain search in the part that has not been scanned yet */
            String s2 = Pattern_internal_lmemfind(from, stop - from, pat, lpat);
            if (!s2) return res;
            Pattern_internal_update_int_array(&res, s2 - str);
            /* Only an empty pattern can hit at the very end; nothing is left
               to scan afterwards. */
            if (s2 == stop) return res;
            /* Resume one byte after the hit. Restarting from the beginning
               would report the same offset forever. */
            from = s2 + 1;
        }
    }
    PatternMatchState ms;
    String s1 = str;
    int anchor = (*pat == '^');
    if (anchor) {
        pat++;
        lpat--; /* skip anchor character */
    }
    Pattern_internal_prepstate(&ms, str, lstr, pat, lpat);
    do {
        Pattern_internal_reprepstate(&ms);
        if (Pattern_internal_match(&ms, s1, pat)) {
            Pattern_internal_update_int_array(&res, s1 - str);
        }
    } while (s1++ < ms.src_end && !anchor);
    return res;
}
Array Pattern_match_MINUS_groups(Pattern *p, String *s) { Array Pattern_match_MINUS_groups(Pattern *p, String *s) {
String str = *s; String str = *s;
Pattern pat = *p; Pattern pat = *p;
@ -576,8 +474,14 @@ Array Pattern_match_MINUS_groups(Pattern *p, String *s) {
return a; return a;
} }
String Pattern_match_MINUS_str(Pattern *p, String *s) { typedef struct PatternMatchResult {
String str = *s; int start; // negative start or end indicates a non-match
int end;
} PatternMatchResult;
PatternMatchResult Pattern_match_MINUS_from(Pattern *p, String *s, int startpos) {
PatternMatchResult result = { .start=-1, .end=-1 };
String str = *s + startpos;
Pattern pat = *p; Pattern pat = *p;
int lstr = strlen(str); int lstr = strlen(str);
int lpat = strlen(pat); int lpat = strlen(pat);
@ -590,19 +494,15 @@ String Pattern_match_MINUS_str(Pattern *p, String *s) {
} }
Pattern_internal_prepstate(&ms, str, lstr, pat, lpat); Pattern_internal_prepstate(&ms, str, lstr, pat, lpat);
do { do {
String res; String res;
Pattern_internal_reprepstate(&ms); Pattern_internal_reprepstate(&ms);
if ((res = Pattern_internal_match(&ms, s1, pat))) { if ((res = Pattern_internal_match(&ms, s1, pat))) {
int start = (s1 - str) + 1; result.start = startpos + (s1 - str);
int end = res - str + 1; result.end = startpos + res - str;
int len = end - start; break;
res = CARP_MALLOC(len + 1);
memcpy(res, s1, len);
res[len] = '\0';
return res;
} }
} while (s1++ < ms.src_end && !anchor); } while (s1++ < ms.src_end && !anchor);
return String_empty(); return result;
} }
/* state for 'gmatch' */ /* state for 'gmatch' */
@ -642,7 +542,7 @@ Array Array_push_back(Array res, Array tmp) {
return res; return res;
} }
Array Pattern_global_MINUS_match(Pattern *p, String *s) { Array Pattern_match_MINUS_all_MINUS_groups(Pattern *p, String *s) {
String str = *s; String str = *s;
Pattern pat = *p; Pattern pat = *p;
int lstr = strlen(str); int lstr = strlen(str);

View File

@ -69,8 +69,8 @@
"matches? works as exptected on tabs special case") "matches? works as exptected on tabs special case")
(assert-equal test (assert-equal test
&[@"3" @"4"] &[@"3" @"4"]
(Array.unsafe-nth &(global-match #"(\d)-(\d)" "1-2 2-3 3-4 4-5") 2) (Array.unsafe-nth &(match-all-groups #"(\d)-(\d)" "1-2 2-3 3-4 4-5") 2)
"global-match works as expected") "match-all-groups works as expected")
(assert-equal test (assert-equal test
"1-2" "1-2"
&(match-str #"(\d)-(\d)" "1-2 2-3 3-4 4-5") &(match-str #"(\d)-(\d)" "1-2 2-3 3-4 4-5")