mirror of
https://github.com/carp-lang/Carp.git
synced 2024-09-11 05:25:28 +03:00
reorganize Pattern library (#1257)
* feat!: implemented Pattern.match which returns start and end indices of the first match * fix: Pattern.match now returns correct end index * feat: moved Pattern.match-str and Pattern.find from C to Carp code * chore: fix build after merges * chore: fix build after merges * feat: moved Pattern.find-all from C to Carp code * feat: Pattern.global-match-str no longer relies on Pattern.global-match * docs: updated for Pattern.global-match * fix: moved str/prn functions into sub module * fix: removed unused functions from carp_pattern.h (using cflow) * feat!: renamed (Pattern.global-match) to (Pattern.match-all-groups) * fix: unit test * fix: some functions renamed to match Carp style Co-authored-by: guberatsie <gunnar.bernhardt@siemens.com>
This commit is contained in:
parent
c592485783
commit
e412559380
@ -1,14 +1,62 @@
|
|||||||
(system-include "carp_pattern.h")
|
(system-include "carp_pattern.h")
|
||||||
|
|
||||||
(defmodule Pattern
|
(defmodule Pattern
|
||||||
|
|
||||||
|
(register-type MatchResult "PatternMatchResult" [start Int, end Int])
|
||||||
|
(defmodule MatchResult
  ;; Renders a *reference* to a MatchResult as
  ;; "(MatchResult start=<start> end=<end>)".
  (defn ref-str [ref-matchres]
    (let [from (MatchResult.start ref-matchres)
          to (MatchResult.end ref-matchres)]
      (fmt "(MatchResult start=%d end=%d)" from to)))
  ;; The reference variant backs both printing interfaces.
  (implements str Pattern.MatchResult.ref-str)
  (implements prn Pattern.MatchResult.ref-str)

  ;; Renders a MatchResult passed by value by delegating to `ref-str`.
  (defn str [matchres]
    (Pattern.MatchResult.ref-str &matchres))
  ;; The value variant backs both printing interfaces as well.
  (implements str Pattern.MatchResult.str)
  (implements prn Pattern.MatchResult.str)
)
|
||||||
|
|
||||||
|
;; True when `match-res` does not describe a match, i.e. when its start or
;; its end index is negative.
(defn non-match? [match-res]
  (let [from @(MatchResult.start match-res)
        to @(MatchResult.end match-res)]
    (if (Int.< from 0)
      true
      (Int.< to 0))))
|
||||||
|
;; Cuts the text described by `match-res` out of `data`.
;; Yields `(Maybe.Just <slice>)` for a match and `(Maybe.Nothing)` when
;; `match-res` is a non-match.
(defn extract [match-res data]
  (if (not (non-match? match-res))
    (let [from @(MatchResult.start match-res)
          to @(MatchResult.end match-res)]
      (Maybe.Just (String.slice data from to)))
    (Maybe.Nothing)))
|
||||||
|
|
||||||
|
(doc match-from "returns start and end indices of the first match after start-pos. Note that the end index points to the 1st character _after_ the match (like all slice functions). If there is no match, the returned indices are negative.")
(register match-from (Fn [&Pattern &String Int] Pattern.MatchResult))

(doc match "returns start and end indices of the first match after the start of the string. Note that the end index points to the 1st character _after_ the match (like all slice functions). If there is no match, the returned indices are negative.")
(defn match [pattern data]
  ;; Same as `match-from`, searching from the very first character.
  (match-from pattern data 0))
|
||||||
|
|
||||||
(doc find "finds the index of a pattern in a string.
|
(doc find "finds the index of a pattern in a string.
|
||||||
|
|
||||||
Returns `-1` if it doesn’t find a matching pattern.")
|
Returns `-1` if it doesn’t find a matching pattern.")
|
||||||
(register find (Fn [&Pattern &String] Int))
|
(defn find [pattern data]
|
||||||
(doc find-all "finds all indices of a pattern in a string.
|
@(Pattern.MatchResult.start &(Pattern.match pattern data)) )
|
||||||
|
|
||||||
|
(doc find-all "finds all indices of a pattern in a string. The patterns may _not_ overlap.
|
||||||
|
|
||||||
Returns `[]` if it doesn’t find a matching pattern.")
|
Returns `[]` if it doesn’t find a matching pattern.")
|
||||||
(register find-all (Fn [&Pattern &String] (Array Int)))
|
;; Collects the MatchResults of all non-overlapping matches of `pattern` in
;; `data`, scanning from left to right. Returns `[]` when nothing matches.
(defn find-all-matches [pattern data]
  (let-do [result []
           stop (String.length data)
           found (match-from pattern data 0)
           start 0
           searching (not (non-match? &found))]
    (while-do searching
      ;; Resume after the current match. An empty match ends where it began,
      ;; so step one character further; otherwise the very same match would
      ;; be found again and the loop would never terminate.
      (set! start (if (Int.= @(MatchResult.start &found) @(MatchResult.end &found))
                    (Int.inc @(MatchResult.end &found))
                    @(MatchResult.end &found)))
      (set! result (Array.push-back result found))
      ;; Never ask match-from to start beyond the end of the string.
      (if (Int.<= start stop)
        (do
          (set! found (match-from pattern data start))
          (set! searching (not (non-match? &found))))
        (set! searching false)))
    result))
|
||||||
|
;; Start indices of every match reported by `find-all-matches`.
(defn find-all [pattern data]
  (let [matches (find-all-matches pattern data)]
    (Array.copy-map &(fn [hit] @(MatchResult.start hit)) &matches)))
|
||||||
|
|
||||||
|
|
||||||
(doc match-groups "finds the match groups of the first match of a pattern in
|
(doc match-groups "finds the match groups of the first match of a pattern in
|
||||||
a string.
|
a string.
|
||||||
|
|
||||||
@ -16,13 +64,15 @@ Returns `[]` if it doesn’t find a matching pattern.")
|
|||||||
(register match-groups (Fn [&Pattern &String] (Array String)))
|
(register match-groups (Fn [&Pattern &String] (Array String)))
|
||||||
(doc match-str "finds the first match of a pattern in a string.
|
(doc match-str "finds the first match of a pattern in a string.
|
||||||
|
|
||||||
Returns `[]` if it doesn’t find a matching pattern.")
|
Returns an empty string if it doesn’t find a matching pattern.")
|
||||||
(register match-str (Fn [&Pattern &String] String))
|
(defn match-str [pattern data]
|
||||||
(doc global-match "finds all matches of a pattern in a string as a nested
|
(Maybe.from (Pattern.extract &(Pattern.match pattern data) data) @"") )
|
||||||
|
|
||||||
|
(doc match-all-groups "finds all match groups of a pattern in a string as a nested
|
||||||
array.
|
array.
|
||||||
|
|
||||||
Returns `[]` if it doesn’t find a matching pattern.")
|
Returns `[]` if it doesn’t find a matching pattern.")
|
||||||
(register global-match (Fn [&Pattern &String] (Array (Array String))))
|
(register match-all-groups (Fn [&Pattern &String] (Array (Array String))))
|
||||||
(doc substitute "finds all matches of a pattern in a string and replaces it
|
(doc substitute "finds all matches of a pattern in a string and replaces it
|
||||||
by another pattern `n` times.
|
by another pattern `n` times.
|
||||||
|
|
||||||
@ -52,8 +102,11 @@ list of those characters.")
|
|||||||
(defn from-chars [chars]
|
(defn from-chars [chars]
|
||||||
(Pattern.init &(str* @"[" (String.from-chars chars) @"]")))
|
(Pattern.init &(str* @"[" (String.from-chars chars) @"]")))
|
||||||
|
|
||||||
(defn global-match-str [p s]
|
(defn global-match-str [pattern data]
|
||||||
(Array.copy-map &(fn [x] @(Array.unsafe-first x)) &(global-match p s)))
|
(Array.copy-map
|
||||||
|
&(fn [m] (Maybe.unsafe-from (extract m data)))
|
||||||
|
&(find-all-matches pattern data)))
|
||||||
|
|
||||||
|
|
||||||
(doc split "splits a string by a pattern.")
|
(doc split "splits a string by a pattern.")
|
||||||
(defn split [p s]
|
(defn split [p s]
|
||||||
|
@ -390,24 +390,6 @@ init: /* using goto's to optimize tail recursion */
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
 * Plain (non-pattern) search for the l2 bytes at s2 inside the l1 bytes at s1.
 * Returns a pointer to the first occurrence, s1 itself when l2 is 0, or NULL
 * when there is no occurrence.
 */
String Pattern_internal_lmemfind(String s1, size_t l1, String s2, size_t l2) {
    if (l2 == 0) return s1;   /* the empty needle is found immediately */
    if (l2 > l1) return NULL; /* needle longer than haystack */

    /* memchr locates the first byte; memcmp then checks the remaining ones */
    size_t tail = l2 - 1;
    /* a hit cannot start within the last 'tail' bytes of the haystack */
    size_t remaining = l1 - tail;
    String hay = s1;

    while (remaining > 0) {
        String hit = (String)memchr(hay, *s2, remaining);
        if (!hit) break;
        String after = hit + 1; /* first byte already verified by memchr */
        if (memcmp(after, s2 + 1, tail) == 0) {
            return hit;
        }
        /* no luck: continue right behind the candidate */
        remaining -= after - hay;
        hay = after;
    }
    return NULL; /* not found */
}
|
|
||||||
|
|
||||||
String String_copy_len(String s, int len) {
|
String String_copy_len(String s, int len) {
|
||||||
String ptr = CARP_MALLOC(len + 1);
|
String ptr = CARP_MALLOC(len + 1);
|
||||||
memcpy(ptr, s, len);
|
memcpy(ptr, s, len);
|
||||||
@ -452,17 +434,6 @@ Array Pattern_internal_push_captures(PatternMatchState *ms, String s,
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* check whether Pattern has no special characters */
|
|
||||||
int Pattern_internal_nospecials(String p, size_t l) {
|
|
||||||
size_t upto = 0;
|
|
||||||
do {
|
|
||||||
if (strpbrk(p + upto, SPECIALS))
|
|
||||||
return 0; /* Pattern has a special character */
|
|
||||||
upto += strlen(p + upto) + 1; /* may have more after \0 */
|
|
||||||
} while (upto <= l);
|
|
||||||
return 1; /* no special chars found */
|
|
||||||
}
|
|
||||||
|
|
||||||
void Pattern_internal_prepstate(PatternMatchState *ms, String s, size_t ls,
|
void Pattern_internal_prepstate(PatternMatchState *ms, String s, size_t ls,
|
||||||
String p, size_t lp) {
|
String p, size_t lp) {
|
||||||
ms->matchdepth = MAXCCALLS;
|
ms->matchdepth = MAXCCALLS;
|
||||||
@ -476,79 +447,6 @@ void Pattern_internal_reprepstate(PatternMatchState *ms) {
|
|||||||
assert(ms->matchdepth == MAXCCALLS);
|
assert(ms->matchdepth == MAXCCALLS);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
 * Returns the offset of the first match of *p in *s, or -1 when there is
 * none. Patterns without special characters are located with a plain
 * substring search; all others go through the pattern matcher.
 */
int Pattern_find(Pattern *p, String *s) {
    String subject = *s;
    Pattern pattern = *p;
    int subject_len = strlen(subject);
    int pattern_len = strlen(pattern);

    /* no special characters: a plain search is enough */
    if (Pattern_internal_nospecials(pattern, pattern_len)) {
        String hit = Pattern_internal_lmemfind(subject, subject_len, pattern,
                                               pattern_len);
        if (hit) return hit - subject;
        return -1;
    }

    /* a leading '^' restricts the search to a single attempt */
    int anchored = (*pattern == '^');
    if (anchored) {
        pattern++;
        pattern_len--; /* skip anchor character */
    }

    PatternMatchState ms;
    Pattern_internal_prepstate(&ms, subject, subject_len, pattern,
                               pattern_len);

    String cursor = subject;
    for (;;) {
        Pattern_internal_reprepstate(&ms);
        if (Pattern_internal_match(&ms, cursor, pattern)) {
            return cursor - subject;
        }
        /* try the next start position unless exhausted or anchored */
        if (!(cursor++ < ms.src_end) || anchored) break;
    }
    return -1;
}
|
|
||||||
|
|
||||||
/* TODO: this is duplicated behavior, almost equivalent to Array_push_back */
|
|
||||||
void Pattern_internal_update_int_array(Array *a, int value) {
|
|
||||||
a->len++;
|
|
||||||
if (a->len > a->capacity) {
|
|
||||||
a->capacity = a->len * 2;
|
|
||||||
a->data = CARP_REALLOC(a->data, sizeof(int) * a->capacity);
|
|
||||||
}
|
|
||||||
((int *)a->data)[a->len - 1] = value;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Collects every offset in *s at which *p matches and returns them as an
 * Array of int (empty, with NULL data, when nothing matches).
 *
 * Patterns without special characters are located with a plain substring
 * search; all others go through the pattern matcher. Both branches try every
 * start position, so overlapping occurrences are all reported.
 */
Array Pattern_find_MINUS_all(Pattern *p, String *s) {
    String str = *s;
    Pattern pat = *p;
    int lstr = strlen(str);
    int lpat = strlen(pat);
    Array res;
    res.len = 0;
    res.capacity = 0;
    res.data = NULL;
    /* no special characters? */
    if (Pattern_internal_nospecials(pat, lpat)) {
        /*
         * Do a plain search, restarting one character past each hit. The
         * search window must advance: searching from 'str' every time finds
         * the same occurrence again and again and never terminates.
         */
        String from = str;
        String end = str + lstr;
        for (;;) {
            String s2 = Pattern_internal_lmemfind(from, end - from, pat, lpat);
            if (!s2) return res;
            Pattern_internal_update_int_array(&res, s2 - str);
            /* an empty pattern also "matches" at the very end; stop there */
            if (s2 == end) return res;
            from = s2 + 1;
        }
    }
    PatternMatchState ms;
    String s1 = str;
    int anchor = (*pat == '^');
    if (anchor) {
        pat++;
        lpat--; /* skip anchor character */
    }
    Pattern_internal_prepstate(&ms, str, lstr, pat, lpat);
    do {
        Pattern_internal_reprepstate(&ms);
        if (Pattern_internal_match(&ms, s1, pat)) {
            Pattern_internal_update_int_array(&res, s1 - str);
        }
        /* an anchored pattern is only tried at the first position */
    } while (s1++ < ms.src_end && !anchor);
    return res;
}
|
|
||||||
|
|
||||||
Array Pattern_match_MINUS_groups(Pattern *p, String *s) {
|
Array Pattern_match_MINUS_groups(Pattern *p, String *s) {
|
||||||
String str = *s;
|
String str = *s;
|
||||||
Pattern pat = *p;
|
Pattern pat = *p;
|
||||||
@ -576,8 +474,14 @@ Array Pattern_match_MINUS_groups(Pattern *p, String *s) {
|
|||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
||||||
String Pattern_match_MINUS_str(Pattern *p, String *s) {
|
typedef struct PatternMatchResult {
|
||||||
String str = *s;
|
int start; // negative start or end indicates a non-match
|
||||||
|
int end;
|
||||||
|
} PatternMatchResult;
|
||||||
|
|
||||||
|
PatternMatchResult Pattern_match_MINUS_from(Pattern *p, String *s, int startpos) {
|
||||||
|
PatternMatchResult result = { .start=-1, .end=-1 };
|
||||||
|
String str = *s + startpos;
|
||||||
Pattern pat = *p;
|
Pattern pat = *p;
|
||||||
int lstr = strlen(str);
|
int lstr = strlen(str);
|
||||||
int lpat = strlen(pat);
|
int lpat = strlen(pat);
|
||||||
@ -590,19 +494,15 @@ String Pattern_match_MINUS_str(Pattern *p, String *s) {
|
|||||||
}
|
}
|
||||||
Pattern_internal_prepstate(&ms, str, lstr, pat, lpat);
|
Pattern_internal_prepstate(&ms, str, lstr, pat, lpat);
|
||||||
do {
|
do {
|
||||||
String res;
|
String res;
|
||||||
Pattern_internal_reprepstate(&ms);
|
Pattern_internal_reprepstate(&ms);
|
||||||
if ((res = Pattern_internal_match(&ms, s1, pat))) {
|
if ((res = Pattern_internal_match(&ms, s1, pat))) {
|
||||||
int start = (s1 - str) + 1;
|
result.start = startpos + (s1 - str);
|
||||||
int end = res - str + 1;
|
result.end = startpos + res - str;
|
||||||
int len = end - start;
|
break;
|
||||||
res = CARP_MALLOC(len + 1);
|
|
||||||
memcpy(res, s1, len);
|
|
||||||
res[len] = '\0';
|
|
||||||
return res;
|
|
||||||
}
|
}
|
||||||
} while (s1++ < ms.src_end && !anchor);
|
} while (s1++ < ms.src_end && !anchor);
|
||||||
return String_empty();
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* state for 'gmatch' */
|
/* state for 'gmatch' */
|
||||||
@ -642,7 +542,7 @@ Array Array_push_back(Array res, Array tmp) {
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
Array Pattern_global_MINUS_match(Pattern *p, String *s) {
|
Array Pattern_match_MINUS_all_MINUS_groups(Pattern *p, String *s) {
|
||||||
String str = *s;
|
String str = *s;
|
||||||
Pattern pat = *p;
|
Pattern pat = *p;
|
||||||
int lstr = strlen(str);
|
int lstr = strlen(str);
|
||||||
|
@ -69,8 +69,8 @@
|
|||||||
"matches? works as exptected on tabs special case")
|
"matches? works as exptected on tabs special case")
|
||||||
(assert-equal test
|
(assert-equal test
|
||||||
&[@"3" @"4"]
|
&[@"3" @"4"]
|
||||||
(Array.unsafe-nth &(global-match #"(\d)-(\d)" "1-2 2-3 3-4 4-5") 2)
|
(Array.unsafe-nth &(match-all-groups #"(\d)-(\d)" "1-2 2-3 3-4 4-5") 2)
|
||||||
"global-match works as expected")
|
"match-all-groups works as expected")
|
||||||
(assert-equal test
|
(assert-equal test
|
||||||
"1-2"
|
"1-2"
|
||||||
&(match-str #"(\d)-(\d)" "1-2 2-3 3-4 4-5")
|
&(match-str #"(\d)-(\d)" "1-2 2-3 3-4 4-5")
|
||||||
|
Loading…
Reference in New Issue
Block a user