Carp/core/Pattern.carp
guberathome e412559380
reorganize Pattern library (#1257)
* feat!: implemented Pattern.match which returns start and end indizes of the first match

* fix: Pattern.match now returns correct end index

* feat: moved Pattern.match-str and Pattern.find from C to Carp code

* chore: fix build after merges

* chore: fix build after merges

* feat: moved Pattern.find-all from C to Carp code

* feat: Pattern.global-match-str no longer relies on Pattern.global-match

* docs: updated for Pattern.global-match

* fix: moved str/prn functions into sub module

* fix: removed unused functions from carp_pattern.h (using cflow)

* feat!: renamed (Pattern.global-match) to (Pattern.match-all-groups)

* fix: unit test

* fix: some functions renamed to match Carp style

Co-authored-by: guberatsie <gunnar.bernhardt@siemens.com>
2021-06-23 22:07:56 +02:00

206 lines
7.6 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

(system-include "carp_pattern.h")
(defmodule Pattern
(register-type MatchResult "PatternMatchResult" [start Int, end Int])
(defmodule MatchResult
(defn ref-str [ref-matchres]
(fmt "(MatchResult start=%d end=%d)"
(MatchResult.start ref-matchres)
(MatchResult.end ref-matchres) ))
(implements str Pattern.MatchResult.ref-str)
(implements prn Pattern.MatchResult.ref-str)
(defn str [matchres]
(Pattern.MatchResult.ref-str &matchres) )
(implements str Pattern.MatchResult.str)
(implements prn Pattern.MatchResult.str)
)
(defn non-match? [match-res]
(or (Int.< @(MatchResult.start match-res) 0)
(Int.< @(MatchResult.end match-res) 0) ))
(defn extract [match-res data]
(if (non-match? match-res)
(Maybe.Nothing)
(Maybe.Just (String.slice data @(MatchResult.start match-res)
@(MatchResult.end match-res) ))))
(doc match-from "returns start and end indizes of the first match after start-pos. Note that the end index points to the 1st character _after_ the match (like all slice functions).")
(register match-from (Fn [&Pattern &String Int] Pattern.MatchResult))
(doc match "returns start and end indizes of the first match after the start of the string. Note that the end index points to the 1st character _after_ the match (like all slice functions).")
(defn match [pattern data]
(match-from pattern data 0) )
(doc find "finds the index of a pattern in a string.
Returns `-1` if it doesnt find a matching pattern.")
(defn find [pattern data]
@(Pattern.MatchResult.start &(Pattern.match pattern data)) )
(doc find-all "finds all indices of a pattern in a string. The patterns may _not_ overlap.
Returns `[]` if it doesnt find a matching pattern.")
(defn find-all-matches [pattern data]
(let-do [result []
stop (String.length data)
found (match-from pattern data 0)
start @(MatchResult.end &found) ]
(while-do (and (Int.<= start stop)
(not (non-match? &found)) )
(set! result (Array.push-back result found))
(set! found (match-from pattern data start))
(set! start @(MatchResult.end &found)) )
result ))
(defn find-all [pattern data]
(Array.copy-map
&(fn [m] @(MatchResult.start m))
&(find-all-matches pattern data) ))
(doc match-groups "finds the match groups of the first match of a pattern in
a string.
Returns `[]` if it doesnt find a matching pattern.")
(register match-groups (Fn [&Pattern &String] (Array String)))
(doc match-str "finds the first match of a pattern in a string.
Returns an empty string if it doesnt find a matching pattern.")
(defn match-str [pattern data]
(Maybe.from (Pattern.extract &(Pattern.match pattern data) data) @"") )
(doc match-all-groups "finds all match groups of a pattern in a string as a nested
array.
Returns `[]` if it doesnt find a matching pattern.")
(register match-all-groups (Fn [&Pattern &String] (Array (Array String))))
(doc substitute "finds all matches of a pattern in a string and replaces it
by another pattern `n` times.
The substitute pattern can reference the original pattern by match group
indices, such as `\1`. This means that backslashes need to be double escaped.
If you want to replace all occurrences of the pattern, use `-1`.")
(register substitute (Fn [&Pattern &String &String Int] String))
(doc matches? "checks whether a pattern matches a string.")
(defn matches? [pat s] (/= (find pat s) -1))
(register str (Fn [&Pattern] String))
(implements str Pattern.str)
(register prn (Fn [&Pattern] String))
(implements prn Pattern.prn)
(register init (Fn [&String] Pattern))
(register = (Fn [&Pattern &Pattern] Bool))
(implements = Pattern.=)
(register delete (Fn [Pattern] ()))
(implements delete Pattern.delete)
(register copy (Fn [&Pattern] Pattern))
(implements copy Pattern.copy)
(doc from-chars "creates a pattern that matches a group of characters from a
list of those characters.")
(defn from-chars [chars]
(Pattern.init &(str* @"[" (String.from-chars chars) @"]")))
(defn global-match-str [pattern data]
(Array.copy-map
&(fn [m] (Maybe.unsafe-from (extract m data)))
&(find-all-matches pattern data)))
(doc split "splits a string by a pattern.")
(defn split [p s]
(let-do [idx (find-all p s)
strs (global-match-str p s)
lidx (Array.length &idx)
result (Array.allocate (Int.inc lidx))]
(Array.aset-uninitialized! &result 0
(slice s 0 (if (> lidx 0) @(Array.unsafe-nth &idx 0) (length s))))
(for [i 0 (Int.dec (Array.length &idx))]
(let [plen (length (Array.unsafe-nth &strs i))]
(Array.aset-uninitialized! &result (Int.inc i)
(slice s (+ @(Array.unsafe-nth &idx i) plen)
@(Array.unsafe-nth &idx (Int.inc i))))))
(when (> lidx 0)
(let [plen (length (Array.unsafe-nth &strs (Int.dec lidx)))]
(Array.aset-uninitialized! &result lidx
(suffix s (+ @(Array.unsafe-nth &idx (Int.dec lidx))
plen)))))
result))
)
(defmodule String
(doc in? "checks whether a string contains another string.")
(defn in? [s sub]
(Pattern.matches? &(Pattern.init sub) s))
(doc upper? "checks whether a string is all uppercase.")
(defn upper? [s]
(Pattern.matches? #"^[\u\s\p]*$" s))
(doc lower? "checks whether a string is all lowercase.")
(defn lower? [s]
(Pattern.matches? #"^[\l\s\p]*$" s))
(doc num? "checks whether a string is numerical.")
(defn num? [s]
(Pattern.matches? #"^[0-9]*$" s))
(doc alpha? "checks whether a string contains only alphabetical characters (a-Z).")
(defn alpha? [s]
(Pattern.matches? #"^[\u\l]*$" s))
(doc alphanum? "checks whether a string is alphanumerical.")
(defn alphanum? [s]
(Pattern.matches? #"^[\w]*$" s))
(doc hex? "checks whether a string is hexadecimal.")
(defn hex? [s]
(Pattern.matches? #"^[\x]*$" s))
(doc trim-left "trims whitespace from the left of a string.")
(defn trim-left [s]
(Pattern.substitute #"^\s+" s "" 1))
(doc trim-right "trims whitespace from the right of a string.")
(defn trim-right [s]
(Pattern.substitute #"\s+$" s "" 1))
(doc trim "trims whitespace from both sides of a string.")
(defn trim [s]
(trim-left &(trim-right s)))
(doc chomp "trims a newline from the end of a string.")
(defn chomp [s]
(Pattern.substitute #"\r$" &(Pattern.substitute #"\n$" s "" 1) "" 1))
(doc collapse-whitespace "collapses groups of whitespace into single spaces.")
(defn collapse-whitespace [s]
(Pattern.substitute #"\s+" s " " -1))
(doc split-by "splits a string by separators.")
(defn split-by [s separators]
(let-do [pat (Pattern.from-chars separators)
idx (Pattern.find-all &pat s)
lidx (Array.length &idx)
result (Array.allocate (Int.inc lidx))]
(Array.aset-uninitialized! &result 0
(slice s 0 (if (> lidx 0) @(Array.unsafe-nth &idx 0) (length s))))
(for [i 0 (Int.dec (Array.length &idx))]
(Array.aset-uninitialized! &result (Int.inc i)
(slice s (Int.inc @(Array.unsafe-nth &idx i)) @(Array.unsafe-nth &idx (Int.inc i)))))
(when (> lidx 0)
(Array.aset-uninitialized! &result lidx
(suffix s (Int.inc @(Array.unsafe-nth &idx (Int.dec lidx))))))
result))
(doc words "splits a string into words.")
(defn words [s]
(Array.endo-filter &(fn [s] (not (empty? s))) (split-by s &[\tab \space \newline])))
(doc lines "splits a string into lines.")
(defn lines [s]
(split-by s &[\newline]))
)