mirror of
https://github.com/carp-lang/Carp.git
synced 2024-10-26 05:45:37 +03:00
reorganize Pattern library (#1257)
* feat!: implemented Pattern.match which returns start and end indices of the first match * fix: Pattern.match now returns correct end index * feat: moved Pattern.match-str and Pattern.find from C to Carp code * chore: fix build after merges * chore: fix build after merges * feat: moved Pattern.find-all from C to Carp code * feat: Pattern.global-match-str no longer relies on Pattern.global-match * docs: updated for Pattern.global-match * fix: moved str/prn functions into sub module * fix: removed unused functions from carp_pattern.h (using cflow) * feat!: renamed (Pattern.global-match) to (Pattern.match-all-groups) * fix: unit test * fix: some functions renamed to match Carp style Co-authored-by: guberatsie <gunnar.bernhardt@siemens.com>
This commit is contained in:
parent
c592485783
commit
e412559380
@ -1,14 +1,62 @@
|
||||
(system-include "carp_pattern.h")
|
||||
|
||||
(defmodule Pattern
|
||||
|
||||
(register-type MatchResult "PatternMatchResult" [start Int, end Int])
|
||||
(defmodule MatchResult
|
||||
; Builds the textual form of a MatchResult passed by reference:
; formats its start and end fields into "(MatchResult start=… end=…)".
(defn ref-str [ref-matchres]
|
||||
(fmt "(MatchResult start=%d end=%d)"
|
||||
(MatchResult.start ref-matchres)
|
||||
(MatchResult.end ref-matchres) ))
|
||||
(implements str Pattern.MatchResult.ref-str)
|
||||
(implements prn Pattern.MatchResult.ref-str)
|
||||
; By-value counterpart of ref-str: takes a reference to `matchres`
; and delegates to Pattern.MatchResult.ref-str.
(defn str [matchres]
|
||||
(Pattern.MatchResult.ref-str &matchres) )
|
||||
(implements str Pattern.MatchResult.str)
|
||||
(implements prn Pattern.MatchResult.str)
|
||||
)
|
||||
|
||||
; True when `match-res` denotes "no match", i.e. when either its start
; or its end index is negative.
(defn non-match? [match-res]
|
||||
(or (Int.< @(MatchResult.start match-res) 0)
|
||||
(Int.< @(MatchResult.end match-res) 0) ))
|
||||
; Returns the matched text as a Maybe: Nothing when `match-res` is a
; non-match, otherwise Just the slice of `data` from the match's start
; index to its end index.
(defn extract [match-res data]
|
||||
(if (non-match? match-res)
|
||||
(Maybe.Nothing)
|
||||
(Maybe.Just (String.slice data @(MatchResult.start match-res)
|
||||
@(MatchResult.end match-res) ))))
|
||||
|
||||
(doc match-from "returns start and end indices of the first match after start-pos. Note that the end index points to the 1st character _after_ the match (like all slice functions).")
|
||||
(register match-from (Fn [&Pattern &String Int] Pattern.MatchResult))
|
||||
(doc match "returns start and end indices of the first match after the start of the string. Note that the end index points to the 1st character _after_ the match (like all slice functions).")
|
||||
(defn match [pattern data]
|
||||
(match-from pattern data 0) )
|
||||
|
||||
(doc find "finds the index of a pattern in a string.
|
||||
|
||||
Returns `-1` if it doesn’t find a matching pattern.")
|
||||
(register find (Fn [&Pattern &String] Int))
|
||||
(doc find-all "finds all indices of a pattern in a string.
|
||||
; Returns a copy of the start index of the result of
; (Pattern.match pattern data).
(defn find [pattern data]
|
||||
@(Pattern.MatchResult.start &(Pattern.match pattern data)) )
|
||||
|
||||
(doc find-all "finds all indices of a pattern in a string. The patterns may _not_ overlap.
|
||||
|
||||
Returns `[]` if it doesn’t find a matching pattern.")
|
||||
(register find-all (Fn [&Pattern &String] (Array Int)))
|
||||
; Collects the matches of `pattern` in `data` into an array of MatchResults.
; The first search starts at index 0; every following search starts at the
; end index of the previous match. The loop stops as soon as match-from
; reports a non-match or the next start position exceeds the string length.
; NOTE(review): if match-from can return a zero-width match (start == end),
; the next search restarts at the same position and this loop would never
; terminate — confirm against the behaviour of match-from.
(defn find-all-matches [pattern data]
|
||||
(let-do [result []
|
||||
stop (String.length data)
|
||||
found (match-from pattern data 0)
|
||||
start @(MatchResult.end &found) ]
|
||||
(while-do (and (Int.<= start stop)
|
||||
(not (non-match? &found)) )
|
||||
(set! result (Array.push-back result found))
|
||||
(set! found (match-from pattern data start))
|
||||
(set! start @(MatchResult.end &found)) )
|
||||
result ))
|
||||
; Maps the results of find-all-matches to their start indices.
(defn find-all [pattern data]
|
||||
(Array.copy-map
|
||||
&(fn [m] @(MatchResult.start m))
|
||||
&(find-all-matches pattern data) ))
|
||||
|
||||
|
||||
(doc match-groups "finds the match groups of the first match of a pattern in
|
||||
a string.
|
||||
|
||||
@ -16,13 +64,15 @@ Returns `[]` if it doesn’t find a matching pattern.")
|
||||
(register match-groups (Fn [&Pattern &String] (Array String)))
|
||||
(doc match-str "finds the first match of a pattern in a string.
|
||||
|
||||
Returns `[]` if it doesn’t find a matching pattern.")
|
||||
(register match-str (Fn [&Pattern &String] String))
|
||||
(doc global-match "finds all matches of a pattern in a string as a nested
|
||||
Returns an empty string if it doesn’t find a matching pattern.")
|
||||
; Extracts the text of the first match of `pattern` in `data`;
; falls back to a copy of the empty string when extract yields Nothing.
(defn match-str [pattern data]
|
||||
(Maybe.from (Pattern.extract &(Pattern.match pattern data) data) @"") )
|
||||
|
||||
(doc match-all-groups "finds all match groups of a pattern in a string as a nested
|
||||
array.
|
||||
|
||||
Returns `[]` if it doesn’t find a matching pattern.")
|
||||
(register global-match (Fn [&Pattern &String] (Array (Array String))))
|
||||
(register match-all-groups (Fn [&Pattern &String] (Array (Array String))))
|
||||
(doc substitute "finds all matches of a pattern in a string and replaces it
|
||||
by another pattern `n` times.
|
||||
|
||||
@ -52,8 +102,11 @@ list of those characters.")
|
||||
(defn from-chars [chars]
|
||||
(Pattern.init &(str* @"[" (String.from-chars chars) @"]")))
|
||||
|
||||
(defn global-match-str [p s]
|
||||
(Array.copy-map &(fn [x] @(Array.unsafe-first x)) &(global-match p s)))
|
||||
; Maps each result of find-all-matches to the text it extracts from `data`.
; Maybe.unsafe-from is applied to every extract result.
; NOTE(review): presumably safe because find-all-matches only stores
; results that passed the non-match? check — confirm.
(defn global-match-str [pattern data]
|
||||
(Array.copy-map
|
||||
&(fn [m] (Maybe.unsafe-from (extract m data)))
|
||||
&(find-all-matches pattern data)))
|
||||
|
||||
|
||||
(doc split "splits a string by a pattern.")
|
||||
(defn split [p s]
|
||||
|
@ -390,24 +390,6 @@ init: /* using goto's to optimize tail recursion */
|
||||
return s;
|
||||
}
|
||||
|
||||
String Pattern_internal_lmemfind(String s1, size_t l1, String s2, size_t l2) {
|
||||
if (l2 == 0) return s1; /* empty Strings are everywhere */
|
||||
if (l2 > l1) return NULL; /* avoids a negative 'l1' */
|
||||
String init; /* to search for a '*s2' inside 's1' */
|
||||
l2--; /* 1st char will be checked by 'memchr' */
|
||||
l1 = l1 - l2; /* 's2' cannot be found after that */
|
||||
while (l1 > 0 && (init = (String)memchr(s1, *s2, l1))) {
|
||||
init++; /* 1st char is already checked */
|
||||
if (!memcmp(init, s2 + 1, l2)) {
|
||||
return init - 1;
|
||||
} else { /* correct 'l1' and 's1' to try again */
|
||||
l1 -= init - s1;
|
||||
s1 = init;
|
||||
}
|
||||
}
|
||||
return NULL; /* not found */
|
||||
}
|
||||
|
||||
String String_copy_len(String s, int len) {
|
||||
String ptr = CARP_MALLOC(len + 1);
|
||||
memcpy(ptr, s, len);
|
||||
@ -452,17 +434,6 @@ Array Pattern_internal_push_captures(PatternMatchState *ms, String s,
|
||||
return res;
|
||||
}
|
||||
|
||||
/* check whether Pattern has no special characters */
|
||||
int Pattern_internal_nospecials(String p, size_t l) {
|
||||
size_t upto = 0;
|
||||
do {
|
||||
if (strpbrk(p + upto, SPECIALS))
|
||||
return 0; /* Pattern has a special character */
|
||||
upto += strlen(p + upto) + 1; /* may have more after \0 */
|
||||
} while (upto <= l);
|
||||
return 1; /* no special chars found */
|
||||
}
|
||||
|
||||
void Pattern_internal_prepstate(PatternMatchState *ms, String s, size_t ls,
|
||||
String p, size_t lp) {
|
||||
ms->matchdepth = MAXCCALLS;
|
||||
@ -476,79 +447,6 @@ void Pattern_internal_reprepstate(PatternMatchState *ms) {
|
||||
assert(ms->matchdepth == MAXCCALLS);
|
||||
}
|
||||
|
||||
int Pattern_find(Pattern *p, String *s) {
|
||||
String str = *s;
|
||||
Pattern pat = *p;
|
||||
int lstr = strlen(str);
|
||||
int lpat = strlen(pat);
|
||||
/* explicit request or no special characters? */
|
||||
if (Pattern_internal_nospecials(pat, lpat)) {
|
||||
/* do a plain search */
|
||||
String s2 = Pattern_internal_lmemfind(str, lstr, pat, lpat);
|
||||
if (!s2) return -1;
|
||||
return s2 - str;
|
||||
}
|
||||
PatternMatchState ms;
|
||||
String s1 = str;
|
||||
int anchor = (*pat == '^');
|
||||
if (anchor) {
|
||||
pat++;
|
||||
lpat--; /* skip anchor character */
|
||||
}
|
||||
Pattern_internal_prepstate(&ms, str, lstr, pat, lpat);
|
||||
do {
|
||||
String res;
|
||||
Pattern_internal_reprepstate(&ms);
|
||||
if ((res = Pattern_internal_match(&ms, s1, pat))) return s1 - str;
|
||||
} while (s1++ < ms.src_end && !anchor);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* TODO: this is duplicated behavior, almost equivalent to Array_push_back */
|
||||
void Pattern_internal_update_int_array(Array *a, int value) {
|
||||
a->len++;
|
||||
if (a->len > a->capacity) {
|
||||
a->capacity = a->len * 2;
|
||||
a->data = CARP_REALLOC(a->data, sizeof(int) * a->capacity);
|
||||
}
|
||||
((int *)a->data)[a->len - 1] = value;
|
||||
}
|
||||
|
||||
Array Pattern_find_MINUS_all(Pattern *p, String *s) {
|
||||
String str = *s;
|
||||
Pattern pat = *p;
|
||||
int lstr = strlen(str);
|
||||
int lpat = strlen(pat);
|
||||
Array res;
|
||||
res.len = 0;
|
||||
res.capacity = 0;
|
||||
res.data = NULL;
|
||||
/* explicit request or no special characters? */
|
||||
if (Pattern_internal_nospecials(pat, lpat)) {
|
||||
while (1) {
|
||||
/* do a plain search */
|
||||
String s2 = Pattern_internal_lmemfind(str, lstr, pat, lpat);
|
||||
if (!s2) return res;
|
||||
Pattern_internal_update_int_array(&res, s2 - str);
|
||||
}
|
||||
}
|
||||
PatternMatchState ms;
|
||||
String s1 = str;
|
||||
int anchor = (*pat == '^');
|
||||
if (anchor) {
|
||||
pat++;
|
||||
lpat--; /* skip anchor character */
|
||||
}
|
||||
Pattern_internal_prepstate(&ms, str, lstr, pat, lpat);
|
||||
do {
|
||||
Pattern_internal_reprepstate(&ms);
|
||||
if (Pattern_internal_match(&ms, s1, pat)) {
|
||||
Pattern_internal_update_int_array(&res, s1 - str);
|
||||
}
|
||||
} while (s1++ < ms.src_end && !anchor);
|
||||
return res;
|
||||
}
|
||||
|
||||
Array Pattern_match_MINUS_groups(Pattern *p, String *s) {
|
||||
String str = *s;
|
||||
Pattern pat = *p;
|
||||
@ -576,8 +474,14 @@ Array Pattern_match_MINUS_groups(Pattern *p, String *s) {
|
||||
return a;
|
||||
}
|
||||
|
||||
String Pattern_match_MINUS_str(Pattern *p, String *s) {
|
||||
String str = *s;
|
||||
/* Pair of indices describing one pattern match; this is the value
   returned by Pattern_match_MINUS_from. */
typedef struct PatternMatchResult {
|
||||
int start; // negative start or end indicates a non-match
|
||||
int end;
|
||||
} PatternMatchResult;
|
||||
|
||||
PatternMatchResult Pattern_match_MINUS_from(Pattern *p, String *s, int startpos) {
|
||||
PatternMatchResult result = { .start=-1, .end=-1 };
|
||||
String str = *s + startpos;
|
||||
Pattern pat = *p;
|
||||
int lstr = strlen(str);
|
||||
int lpat = strlen(pat);
|
||||
@ -593,16 +497,12 @@ String Pattern_match_MINUS_str(Pattern *p, String *s) {
|
||||
String res;
|
||||
Pattern_internal_reprepstate(&ms);
|
||||
if ((res = Pattern_internal_match(&ms, s1, pat))) {
|
||||
int start = (s1 - str) + 1;
|
||||
int end = res - str + 1;
|
||||
int len = end - start;
|
||||
res = CARP_MALLOC(len + 1);
|
||||
memcpy(res, s1, len);
|
||||
res[len] = '\0';
|
||||
return res;
|
||||
result.start = startpos + (s1 - str);
|
||||
result.end = startpos + res - str;
|
||||
break;
|
||||
}
|
||||
} while (s1++ < ms.src_end && !anchor);
|
||||
return String_empty();
|
||||
return result;
|
||||
}
|
||||
|
||||
/* state for 'gmatch' */
|
||||
@ -642,7 +542,7 @@ Array Array_push_back(Array res, Array tmp) {
|
||||
return res;
|
||||
}
|
||||
|
||||
Array Pattern_global_MINUS_match(Pattern *p, String *s) {
|
||||
Array Pattern_match_MINUS_all_MINUS_groups(Pattern *p, String *s) {
|
||||
String str = *s;
|
||||
Pattern pat = *p;
|
||||
int lstr = strlen(str);
|
||||
|
@ -69,8 +69,8 @@
|
||||
"matches? works as exptected on tabs special case")
|
||||
(assert-equal test
|
||||
&[@"3" @"4"]
|
||||
(Array.unsafe-nth &(global-match #"(\d)-(\d)" "1-2 2-3 3-4 4-5") 2)
|
||||
"global-match works as expected")
|
||||
(Array.unsafe-nth &(match-all-groups #"(\d)-(\d)" "1-2 2-3 3-4 4-5") 2)
|
||||
"match-all-groups works as expected")
|
||||
(assert-equal test
|
||||
"1-2"
|
||||
&(match-str #"(\d)-(\d)" "1-2 2-3 3-4 4-5")
|
||||
|
Loading…
Reference in New Issue
Block a user