reorganize Pattern library (#1257)

* feat!: implemented Pattern.match which returns start and end indices of the first match

* fix: Pattern.match now returns correct end index

* feat: moved Pattern.match-str and Pattern.find from C to Carp code

* chore: fix build after merges

* chore: fix build after merges

* feat: moved Pattern.find-all from C to Carp code

* feat: Pattern.global-match-str no longer relies on Pattern.global-match

* docs: updated for Pattern.global-match

* fix: moved str/prn functions into sub module

* fix: removed unused functions from carp_pattern.h (using cflow)

* feat!: renamed (Pattern.global-match) to (Pattern.match-all-groups)

* fix: unit test

* fix: some functions renamed to match Carp style

Co-authored-by: guberatsie <gunnar.bernhardt@siemens.com>
This commit is contained in:
guberathome 2021-06-23 22:07:56 +02:00 committed by GitHub
parent c592485783
commit e412559380
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 78 additions and 125 deletions

View File

@ -1,14 +1,62 @@
(system-include "carp_pattern.h")
(defmodule Pattern
(register-type MatchResult "PatternMatchResult" [start Int, end Int])
; String conversions for Pattern.MatchResult, registered for both the
; `str` and `prn` interfaces, once for references and once for values.
(defmodule MatchResult
  ; Formats a borrowed match result as "(MatchResult start=<start> end=<end>)".
  (defn ref-str [ref-matchres]
    (fmt "(MatchResult start=%d end=%d)"
         (MatchResult.start ref-matchres)
         (MatchResult.end ref-matchres) ))
  (implements str Pattern.MatchResult.ref-str)
  (implements prn Pattern.MatchResult.ref-str)
  ; By-value variant: borrows its argument and delegates to ref-str.
  (defn str [matchres]
    (Pattern.MatchResult.ref-str &matchres) )
  (implements str Pattern.MatchResult.str)
  (implements prn Pattern.MatchResult.str)
)
; True when the given match result (passed by reference) denotes "no match",
; i.e. when its start or its end index is negative.
(defn non-match? [match-res]
  (let [first-idx @(MatchResult.start match-res)
        last-idx @(MatchResult.end match-res)]
    (or (Int.> 0 first-idx)
        (Int.> 0 last-idx))))
; Cuts the matched text out of `data`.
; Yields (Maybe.Just <slice from start to end>) for a real match and
; (Maybe.Nothing) when `match-res` is a non-match.
(defn extract [match-res data]
  (if (not (non-match? match-res))
    (let [from @(MatchResult.start match-res)
          upto @(MatchResult.end match-res)]
      (Maybe.Just (String.slice data from upto)))
    (Maybe.Nothing)))
; Registered from the C header (see Pattern_match_MINUS_from); a non-match is
; reported with negative start/end indices.
(doc match-from "returns start and end indices of the first match after start-pos. Note that the end index points to the 1st character _after_ the match (like all slice functions).")
(register match-from (Fn [&Pattern &String Int] Pattern.MatchResult))
(doc match "returns start and end indices of the first match after the start of the string. Note that the end index points to the 1st character _after_ the match (like all slice functions).")
; Convenience wrapper: same as match-from with a start position of 0.
(defn match [pattern data]
  (match-from pattern data 0) )
(doc find "finds the index of a pattern in a string.
Returns `-1` if it doesnt find a matching pattern.")
(register find (Fn [&Pattern &String] Int))
(doc find-all "finds all indices of a pattern in a string.
; Start index of the first match of `pattern` in `data`; the start index of a
; non-match is negative.
(defn find [pattern data]
  (let [first-hit (Pattern.match pattern data)]
    @(Pattern.MatchResult.start &first-hit)))
(doc find-all "finds all indices of a pattern in a string. The patterns may _not_ overlap.
Returns `[]` if it doesnt find a matching pattern.")
(register find-all (Fn [&Pattern &String] (Array Int)))
; Collects every match of `pattern` in `data`, scanning left to right and
; resuming each search where the previous match ended (so matches never
; overlap). Returns an array of Pattern.MatchResult; empty when nothing matches.
(defn find-all-matches [pattern data]
  (let-do [result []
           stop (String.length data)
           start 0
           searching true]
    (while-do searching
      (let-do [found (match-from pattern data start)]
        (if (non-match? &found)
          (set! searching false)
          ; Read both indices before `found` is handed over to the array.
          (let-do [match-start @(MatchResult.start &found)
                   match-end @(MatchResult.end &found)]
            (set! result (Array.push-back result found))
            ; A zero-length match ends where it began; resuming at that very
            ; position would yield the same match forever, so step one
            ; character ahead to guarantee progress.
            (set! start (if (Int.= match-start match-end)
                          (Int.+ match-end 1)
                          match-end))
            ; match-from must never be asked to start beyond the end of data.
            (set! searching (Int.<= start stop)) ))))
    result ))
; Start indices of all matches reported by find-all-matches, in order.
(defn find-all [pattern data]
  (let [hits (find-all-matches pattern data)]
    (Array.copy-map
      &(fn [hit] @(MatchResult.start hit))
      &hits)))
(doc match-groups "finds the match groups of the first match of a pattern in
a string.
@ -16,13 +64,15 @@ Returns `[]` if it doesnt find a matching pattern.")
(register match-groups (Fn [&Pattern &String] (Array String)))
(doc match-str "finds the first match of a pattern in a string.
Returns `[]` if it doesnt find a matching pattern.")
(register match-str (Fn [&Pattern &String] String))
(doc global-match "finds all matches of a pattern in a string as a nested
Returns an empty string if it doesnt find a matching pattern.")
; Text of the first match of `pattern` in `data`, or an empty string when
; there is no match.
(defn match-str [pattern data]
  (let [first-hit (Pattern.match pattern data)
        maybe-text (Pattern.extract &first-hit data)]
    (Maybe.from maybe-text @"")))
(doc match-all-groups "finds all match groups of a pattern in a string as a nested
array.
Returns `[]` if it doesnt find a matching pattern.")
(register global-match (Fn [&Pattern &String] (Array (Array String))))
(register match-all-groups (Fn [&Pattern &String] (Array (Array String))))
(doc substitute "finds all matches of a pattern in a string and replaces it
by another pattern `n` times.
@ -52,8 +102,11 @@ list of those characters.")
; Builds a pattern from `chars` by wrapping their string form in "[" and "]".
(defn from-chars [chars]
  (let [bracketed (str* @"[" (String.from-chars chars) @"]")]
    (Pattern.init &bracketed)))
; Maps every group array returned by (global-match p s) to a copy of its
; first element, giving one string per match.
(defn global-match-str [p s]
  (Array.copy-map &(fn [x] @(Array.unsafe-first x)) &(global-match p s)))
; Matched text of every match found by find-all-matches, in order.
; Each entry is a real match, so unwrapping the extracted Maybe is expected
; to succeed.
(defn global-match-str [pattern data]
  (let [hits (find-all-matches pattern data)]
    (Array.copy-map
      &(fn [hit] (Maybe.unsafe-from (extract hit data)))
      &hits)))
(doc split "splits a string by a pattern.")
(defn split [p s]

View File

@ -390,24 +390,6 @@ init: /* using goto's to optimize tail recursion */
return s;
}
/* Searches the l1-byte buffer s1 for the first occurrence of the l2-byte
 * sequence s2 (a length-based substring search built on memchr/memcmp).
 * Returns a pointer to the start of the occurrence inside s1, s1 itself when
 * s2 is empty, or NULL when s2 does not occur. */
String Pattern_internal_lmemfind(String s1, size_t l1, String s2, size_t l2) {
    if (l2 == 0) return s1;   /* empty Strings are everywhere */
    if (l2 > l1) return NULL; /* avoids a negative 'l1' */
    String init;              /* to search for a '*s2' inside 's1' */
    l2--;                     /* 1st char will be checked by 'memchr' */
    l1 = l1 - l2;             /* 's2' cannot be found after that */
    while (l1 > 0 && (init = (String)memchr(s1, *s2, l1))) {
        init++; /* 1st char is already checked */
        if (!memcmp(init, s2 + 1, l2)) {
            return init - 1;
        } else { /* correct 'l1' and 's1' to try again */
            l1 -= init - s1;
            s1 = init;
        }
    }
    return NULL; /* not found */
}
String String_copy_len(String s, int len) {
String ptr = CARP_MALLOC(len + 1);
memcpy(ptr, s, len);
@ -452,17 +434,6 @@ Array Pattern_internal_push_captures(PatternMatchState *ms, String s,
return res;
}
/* check whether Pattern has no special characters */
/* Scans the l bytes of p for any character contained in SPECIALS.
 * Returns 1 when none is found and 0 as soon as one is found. The scan works
 * on NUL-terminated segments and restarts after each '\0' until offset l has
 * been passed. */
int Pattern_internal_nospecials(String p, size_t l) {
    size_t upto = 0;
    do {
        if (strpbrk(p + upto, SPECIALS))
            return 0;                 /* Pattern has a special character */
        upto += strlen(p + upto) + 1; /* may have more after \0 */
    } while (upto <= l);
    return 1; /* no special chars found */
}
void Pattern_internal_prepstate(PatternMatchState *ms, String s, size_t ls,
String p, size_t lp) {
ms->matchdepth = MAXCCALLS;
@ -476,79 +447,6 @@ void Pattern_internal_reprepstate(PatternMatchState *ms) {
assert(ms->matchdepth == MAXCCALLS);
}
/* Returns the byte offset of the first match of pattern *p in string *s, or
 * -1 when there is no match.
 * A pattern without special characters is located with a plain substring
 * search. Otherwise the matcher is tried at successive start positions; a
 * leading '^' is stripped from the pattern and limits the attempt to the
 * first position only. */
int Pattern_find(Pattern *p, String *s) {
    String str = *s;
    Pattern pat = *p;
    int lstr = strlen(str);
    int lpat = strlen(pat);
    /* explicit request or no special characters? */
    if (Pattern_internal_nospecials(pat, lpat)) {
        /* do a plain search */
        String s2 = Pattern_internal_lmemfind(str, lstr, pat, lpat);
        if (!s2) return -1;
        return s2 - str;
    }
    PatternMatchState ms;
    String s1 = str;
    int anchor = (*pat == '^');
    if (anchor) {
        pat++;
        lpat--; /* skip anchor character */
    }
    Pattern_internal_prepstate(&ms, str, lstr, pat, lpat);
    do {
        String res;
        /* reset the match state before every attempt */
        Pattern_internal_reprepstate(&ms);
        /* only the start offset is reported; the match end in 'res' is unused */
        if ((res = Pattern_internal_match(&ms, s1, pat))) return s1 - str;
    } while (s1++ < ms.src_end && !anchor);
    return -1;
}
/* TODO: this is duplicated behavior, almost equivalent to Array_push_back */
/* Appends 'value' to the int array 'a'. When the new length exceeds the
 * capacity, the storage is reallocated to twice the new length first. */
void Pattern_internal_update_int_array(Array *a, int value) {
    a->len += 1;
    if (a->capacity < a->len) {
        a->capacity = 2 * a->len;
        a->data = CARP_REALLOC(a->data, a->capacity * sizeof(int));
    }
    int *slots = (int *)a->data;
    slots[a->len - 1] = value;
}
/* Returns an int Array holding the byte offset of every position in *s at
 * which pattern *p matches; the array is empty (len 0, data NULL) when there
 * is no match.
 * A pattern without special characters is located by repeated plain substring
 * searches. Otherwise the matcher is tried at successive start positions; a
 * leading '^' is stripped from the pattern and limits the attempt to the
 * first position only. */
Array Pattern_find_MINUS_all(Pattern *p, String *s) {
    String str = *s;
    Pattern pat = *p;
    int lstr = strlen(str);
    int lpat = strlen(pat);
    Array res;
    res.len = 0;
    res.capacity = 0;
    res.data = NULL;
    /* explicit request or no special characters? */
    if (Pattern_internal_nospecials(pat, lpat)) {
        /* Plain search. The search window must move past each hit; searching
         * the unchanged buffer again would return the same hit forever.
         * Stepping one byte at a time mirrors the matcher loop below, which
         * also tries every start position. */
        String cursor = str;
        String limit = str + lstr; /* one past the last byte of the input */
        while (cursor <= limit) {
            String hit = Pattern_internal_lmemfind(cursor, limit - cursor,
                                                   pat, lpat);
            if (!hit) break;
            Pattern_internal_update_int_array(&res, hit - str);
            cursor = hit + 1;
        }
        return res;
    }
    PatternMatchState ms;
    String s1 = str;
    int anchor = (*pat == '^');
    if (anchor) {
        pat++;
        lpat--; /* skip anchor character */
    }
    Pattern_internal_prepstate(&ms, str, lstr, pat, lpat);
    do {
        /* reset the match state before every attempt */
        Pattern_internal_reprepstate(&ms);
        if (Pattern_internal_match(&ms, s1, pat)) {
            Pattern_internal_update_int_array(&res, s1 - str);
        }
    } while (s1++ < ms.src_end && !anchor);
    return res;
}
Array Pattern_match_MINUS_groups(Pattern *p, String *s) {
String str = *s;
Pattern pat = *p;
@ -576,8 +474,14 @@ Array Pattern_match_MINUS_groups(Pattern *p, String *s) {
return a;
}
String Pattern_match_MINUS_str(Pattern *p, String *s) {
String str = *s;
/* Position of a single pattern match inside a string, as returned by
 * Pattern_match_MINUS_from. */
typedef struct PatternMatchResult {
    int start; // negative start or end indicates a non-match
    int end;
} PatternMatchResult;
PatternMatchResult Pattern_match_MINUS_from(Pattern *p, String *s, int startpos) {
PatternMatchResult result = { .start=-1, .end=-1 };
String str = *s + startpos;
Pattern pat = *p;
int lstr = strlen(str);
int lpat = strlen(pat);
@ -590,19 +494,15 @@ String Pattern_match_MINUS_str(Pattern *p, String *s) {
}
Pattern_internal_prepstate(&ms, str, lstr, pat, lpat);
do {
String res;
String res;
Pattern_internal_reprepstate(&ms);
if ((res = Pattern_internal_match(&ms, s1, pat))) {
int start = (s1 - str) + 1;
int end = res - str + 1;
int len = end - start;
res = CARP_MALLOC(len + 1);
memcpy(res, s1, len);
res[len] = '\0';
return res;
result.start = startpos + (s1 - str);
result.end = startpos + res - str;
break;
}
} while (s1++ < ms.src_end && !anchor);
return String_empty();
return result;
}
/* state for 'gmatch' */
@ -642,7 +542,7 @@ Array Array_push_back(Array res, Array tmp) {
return res;
}
Array Pattern_global_MINUS_match(Pattern *p, String *s) {
Array Pattern_match_MINUS_all_MINUS_groups(Pattern *p, String *s) {
String str = *s;
Pattern pat = *p;
int lstr = strlen(str);

View File

@ -69,8 +69,8 @@
"matches? works as exptected on tabs special case")
(assert-equal test
&[@"3" @"4"]
(Array.unsafe-nth &(global-match #"(\d)-(\d)" "1-2 2-3 3-4 4-5") 2)
"global-match works as expected")
(Array.unsafe-nth &(match-all-groups #"(\d)-(\d)" "1-2 2-3 3-4 4-5") 2)
"match-all-groups works as expected")
(assert-equal test
"1-2"
&(match-str #"(\d)-(\d)" "1-2 2-3 3-4 4-5")