diff --git a/core/Pattern.carp b/core/Pattern.carp index fe6d0cb4..92d9fab2 100644 --- a/core/Pattern.carp +++ b/core/Pattern.carp @@ -1,14 +1,62 @@ (system-include "carp_pattern.h") (defmodule Pattern + + (register-type MatchResult "PatternMatchResult" [start Int, end Int]) + (defmodule MatchResult + (defn ref-str [ref-matchres] + (fmt "(MatchResult start=%d end=%d)" + (MatchResult.start ref-matchres) + (MatchResult.end ref-matchres) )) + (implements str Pattern.MatchResult.ref-str) + (implements prn Pattern.MatchResult.ref-str) + (defn str [matchres] + (Pattern.MatchResult.ref-str &matchres) ) + (implements str Pattern.MatchResult.str) + (implements prn Pattern.MatchResult.str) + ) + + (defn non-match? [match-res] + (or (Int.< @(MatchResult.start match-res) 0) + (Int.< @(MatchResult.end match-res) 0) )) + (defn extract [match-res data] + (if (non-match? match-res) + (Maybe.Nothing) + (Maybe.Just (String.slice data @(MatchResult.start match-res) + @(MatchResult.end match-res) )))) + + (doc match-from "returns start and end indices of the first match after start-pos. Note that the end index points to the 1st character _after_ the match (like all slice functions).") + (register match-from (Fn [&Pattern &String Int] Pattern.MatchResult)) + (doc match "returns start and end indices of the first match after the start of the string. Note that the end index points to the 1st character _after_ the match (like all slice functions).") + (defn match [pattern data] + (match-from pattern data 0) ) + (doc find "finds the index of a pattern in a string. Returns `-1` if it doesn’t find a matching pattern.") - (register find (Fn [&Pattern &String] Int)) - (doc find-all "finds all indices of a pattern in a string. + (defn find [pattern data] + @(Pattern.MatchResult.start &(Pattern.match pattern data)) ) + + (doc find-all "finds all indices of a pattern in a string. The patterns may _not_ overlap. 
Returns `[]` if it doesn’t find a matching pattern.") - (register find-all (Fn [&Pattern &String] (Array Int))) + (defn find-all-matches [pattern data] + (let-do [result [] + stop (String.length data) + found (match-from pattern data 0) + start @(MatchResult.end &found) ] + (while-do (and (Int.<= start stop) + (not (non-match? &found)) ) + (set! result (Array.push-back result found)) + (set! found (match-from pattern data start)) + (set! start @(MatchResult.end &found)) ) + result )) + (defn find-all [pattern data] + (Array.copy-map + &(fn [m] @(MatchResult.start m)) + &(find-all-matches pattern data) )) + + (doc match-groups "finds the match groups of the first match of a pattern in a string. @@ -16,13 +64,15 @@ Returns `[]` if it doesn’t find a matching pattern.") (register match-groups (Fn [&Pattern &String] (Array String))) (doc match-str "finds the first match of a pattern in a string. -Returns `[]` if it doesn’t find a matching pattern.") - (register match-str (Fn [&Pattern &String] String)) - (doc global-match "finds all matches of a pattern in a string as a nested +Returns an empty string if it doesn’t find a matching pattern.") + (defn match-str [pattern data] + (Maybe.from (Pattern.extract &(Pattern.match pattern data) data) @"") ) + + (doc match-all-groups "finds all match groups of a pattern in a string as a nested array. Returns `[]` if it doesn’t find a matching pattern.") - (register global-match (Fn [&Pattern &String] (Array (Array String)))) + (register match-all-groups (Fn [&Pattern &String] (Array (Array String)))) (doc substitute "finds all matches of a pattern in a string and replaces it by another pattern `n` times. 
@@ -52,8 +102,11 @@ list of those characters.") (defn from-chars [chars] (Pattern.init &(str* @"[" (String.from-chars chars) @"]"))) - (defn global-match-str [p s] - (Array.copy-map &(fn [x] @(Array.unsafe-first x)) &(global-match p s))) + (defn global-match-str [pattern data] + (Array.copy-map + &(fn [m] (Maybe.unsafe-from (extract m data))) + &(find-all-matches pattern data))) + (doc split "splits a string by a pattern.") (defn split [p s] diff --git a/core/carp_pattern.h b/core/carp_pattern.h index 274abcea..f23b191f 100644 --- a/core/carp_pattern.h +++ b/core/carp_pattern.h @@ -390,24 +390,6 @@ init: /* using goto's to optimize tail recursion */ return s; } -String Pattern_internal_lmemfind(String s1, size_t l1, String s2, size_t l2) { - if (l2 == 0) return s1; /* empty Strings are everywhere */ - if (l2 > l1) return NULL; /* avoids a negative 'l1' */ - String init; /* to search for a '*s2' inside 's1' */ - l2--; /* 1st char will be checked by 'memchr' */ - l1 = l1 - l2; /* 's2' cannot be found after that */ - while (l1 > 0 && (init = (String)memchr(s1, *s2, l1))) { - init++; /* 1st char is already checked */ - if (!memcmp(init, s2 + 1, l2)) { - return init - 1; - } else { /* correct 'l1' and 's1' to try again */ - l1 -= init - s1; - s1 = init; - } - } - return NULL; /* not found */ -} - String String_copy_len(String s, int len) { String ptr = CARP_MALLOC(len + 1); memcpy(ptr, s, len); @@ -452,17 +434,6 @@ Array Pattern_internal_push_captures(PatternMatchState *ms, String s, return res; } -/* check whether Pattern has no special characters */ -int Pattern_internal_nospecials(String p, size_t l) { - size_t upto = 0; - do { - if (strpbrk(p + upto, SPECIALS)) - return 0; /* Pattern has a special character */ - upto += strlen(p + upto) + 1; /* may have more after \0 */ - } while (upto <= l); - return 1; /* no special chars found */ -} - void Pattern_internal_prepstate(PatternMatchState *ms, String s, size_t ls, String p, size_t lp) { ms->matchdepth = MAXCCALLS; @@ 
-476,79 +447,6 @@ void Pattern_internal_reprepstate(PatternMatchState *ms) { assert(ms->matchdepth == MAXCCALLS); } -int Pattern_find(Pattern *p, String *s) { - String str = *s; - Pattern pat = *p; - int lstr = strlen(str); - int lpat = strlen(pat); - /* explicit request or no special characters? */ - if (Pattern_internal_nospecials(pat, lpat)) { - /* do a plain search */ - String s2 = Pattern_internal_lmemfind(str, lstr, pat, lpat); - if (!s2) return -1; - return s2 - str; - } - PatternMatchState ms; - String s1 = str; - int anchor = (*pat == '^'); - if (anchor) { - pat++; - lpat--; /* skip anchor character */ - } - Pattern_internal_prepstate(&ms, str, lstr, pat, lpat); - do { - String res; - Pattern_internal_reprepstate(&ms); - if ((res = Pattern_internal_match(&ms, s1, pat))) return s1 - str; - } while (s1++ < ms.src_end && !anchor); - return -1; -} - -/* TODO: this is duplicated behavior, almost equivalent to Array_push_back */ -void Pattern_internal_update_int_array(Array *a, int value) { - a->len++; - if (a->len > a->capacity) { - a->capacity = a->len * 2; - a->data = CARP_REALLOC(a->data, sizeof(int) * a->capacity); - } - ((int *)a->data)[a->len - 1] = value; -} - -Array Pattern_find_MINUS_all(Pattern *p, String *s) { - String str = *s; - Pattern pat = *p; - int lstr = strlen(str); - int lpat = strlen(pat); - Array res; - res.len = 0; - res.capacity = 0; - res.data = NULL; - /* explicit request or no special characters? 
*/ - if (Pattern_internal_nospecials(pat, lpat)) { - while (1) { - /* do a plain search */ - String s2 = Pattern_internal_lmemfind(str, lstr, pat, lpat); - if (!s2) return res; - Pattern_internal_update_int_array(&res, s2 - str); - } - } - PatternMatchState ms; - String s1 = str; - int anchor = (*pat == '^'); - if (anchor) { - pat++; - lpat--; /* skip anchor character */ - } - Pattern_internal_prepstate(&ms, str, lstr, pat, lpat); - do { - Pattern_internal_reprepstate(&ms); - if (Pattern_internal_match(&ms, s1, pat)) { - Pattern_internal_update_int_array(&res, s1 - str); - } - } while (s1++ < ms.src_end && !anchor); - return res; -} - Array Pattern_match_MINUS_groups(Pattern *p, String *s) { String str = *s; Pattern pat = *p; @@ -576,8 +474,14 @@ Array Pattern_match_MINUS_groups(Pattern *p, String *s) { return a; } -String Pattern_match_MINUS_str(Pattern *p, String *s) { - String str = *s; +typedef struct PatternMatchResult { + int start; // negative start or end indicates a non-match + int end; +} PatternMatchResult; + +PatternMatchResult Pattern_match_MINUS_from(Pattern *p, String *s, int startpos) { + PatternMatchResult result = { .start=-1, .end=-1 }; + String str = *s + startpos; Pattern pat = *p; int lstr = strlen(str); int lpat = strlen(pat); @@ -590,19 +494,15 @@ String Pattern_match_MINUS_str(Pattern *p, String *s) { } Pattern_internal_prepstate(&ms, str, lstr, pat, lpat); do { - String res; + String res; Pattern_internal_reprepstate(&ms); if ((res = Pattern_internal_match(&ms, s1, pat))) { - int start = (s1 - str) + 1; - int end = res - str + 1; - int len = end - start; - res = CARP_MALLOC(len + 1); - memcpy(res, s1, len); - res[len] = '\0'; - return res; + result.start = startpos + (s1 - str); + result.end = startpos + res - str; + break; } } while (s1++ < ms.src_end && !anchor); - return String_empty(); + return result; } /* state for 'gmatch' */ @@ -642,7 +542,7 @@ Array Array_push_back(Array res, Array tmp) { return res; } -Array 
Pattern_global_MINUS_match(Pattern *p, String *s) { +Array Pattern_match_MINUS_all_MINUS_groups(Pattern *p, String *s) { String str = *s; Pattern pat = *p; int lstr = strlen(str); diff --git a/test/pattern.carp b/test/pattern.carp index f7193881..f1f0f57c 100644 --- a/test/pattern.carp +++ b/test/pattern.carp @@ -69,8 +69,8 @@ "matches? works as exptected on tabs special case") (assert-equal test &[@"3" @"4"] - (Array.unsafe-nth &(global-match #"(\d)-(\d)" "1-2 2-3 3-4 4-5") 2) - "global-match works as expected") + (Array.unsafe-nth &(match-all-groups #"(\d)-(\d)" "1-2 2-3 3-4 4-5") 2) + "match-all-groups works as expected") (assert-equal test "1-2" &(match-str #"(\d)-(\d)" "1-2 2-3 3-4 4-5")