Carp/core/Pattern.carp

208 lines
7.8 KiB
Plaintext
Raw Normal View History

(system-include "carp_pattern.h")
2021-07-05 15:48:35 +03:00
(doc Pattern "is a data type for pattern matching, similar to, but not the same
as, Regular Expressions. [See the docs for more
information](../LanguageGuide.html#patterns).")
(defmodule Pattern
(register-type MatchResult [start Int, end Int])
(defmodule MatchResult
(defn ref-str [ref-matchres]
(fmt "(MatchResult start=%d end=%d)"
(MatchResult.start ref-matchres)
(MatchResult.end ref-matchres) ))
(implements str Pattern.MatchResult.ref-str)
(implements prn Pattern.MatchResult.ref-str)
(defn str [matchres]
(Pattern.MatchResult.ref-str &matchres) )
(implements str Pattern.MatchResult.str)
(implements prn Pattern.MatchResult.str)
)
(defn non-match? [match-res]
(or (Int.< @(MatchResult.start match-res) 0)
(Int.< @(MatchResult.end match-res) 0) ))
(defn extract [match-res data]
(if (non-match? match-res)
(Maybe.Nothing)
(Maybe.Just (String.slice data @(MatchResult.start match-res)
@(MatchResult.end match-res) ))))
(doc match-from "returns start and end indizes of the first match after start-pos. Note that the end index points to the 1st character _after_ the match (like all slice functions).")
(register match-from (Fn [&Pattern &String Int] Pattern.MatchResult))
(doc match "returns start and end indizes of the first match after the start of the string. Note that the end index points to the 1st character _after_ the match (like all slice functions).")
(defn match [pattern data]
(match-from pattern data 0) )
2019-05-04 10:41:33 +03:00
(doc find "finds the index of a pattern in a string.
Returns `-1` if it doesnt find a matching pattern.")
(defn find [pattern data]
@(Pattern.MatchResult.start &(Pattern.match pattern data)) )
(doc find-all "finds all indices of a pattern in a string. The patterns may _not_ overlap.
2019-05-04 10:41:33 +03:00
Returns `[]` if it doesnt find a matching pattern.")
(defn find-all-matches [pattern data]
(let-do [result []
stop (String.length data)
found (match-from pattern data 0)
start @(MatchResult.end &found) ]
(while-do (and (Int.<= start stop)
(not (non-match? &found)) )
(set! result (Array.push-back result found))
(set! found (match-from pattern data start))
(set! start @(MatchResult.end &found)) )
result ))
(defn find-all [pattern data]
(Array.copy-map
&(fn [m] @(MatchResult.start m))
&(find-all-matches pattern data) ))
(doc match-groups "finds the match groups of the first match of a pattern in
a string.
2019-05-04 10:41:33 +03:00
Returns `[]` if it doesnt find a matching pattern.")
2018-11-14 23:21:56 +03:00
(register match-groups (Fn [&Pattern &String] (Array String)))
2019-05-04 10:41:33 +03:00
(doc match-str "finds the first match of a pattern in a string.
Returns an empty string if it doesnt find a matching pattern.")
(defn match-str [pattern data]
(Maybe.from (Pattern.extract &(Pattern.match pattern data) data) @"") )
(doc match-all-groups "finds all match groups of a pattern in a string as a nested
array.
2019-05-04 10:41:33 +03:00
Returns `[]` if it doesnt find a matching pattern.")
(register match-all-groups (Fn [&Pattern &String] (Array (Array String))))
(doc substitute "finds all matches of a pattern in a string and replaces it
by another pattern `n` times.
The substitute pattern can reference the original pattern by match group
indices, such as `\1`. This means that backslashes need to be double escaped.
2019-05-04 10:41:33 +03:00
If you want to replace all occurrences of the pattern, use `-1`.")
(register substitute (Fn [&Pattern &String &String Int] String))
2019-05-04 10:41:33 +03:00
(doc matches? "checks whether a pattern matches a string.")
(defn matches? [pat s] (/= (find pat s) -1))
2018-03-03 16:59:12 +03:00
(register str (Fn [&Pattern] String))
(implements str Pattern.str)
2018-03-03 16:59:12 +03:00
(register prn (Fn [&Pattern] String))
(implements prn Pattern.prn)
2018-03-03 16:59:12 +03:00
(register init (Fn [&String] Pattern))
(register = (Fn [&Pattern &Pattern] Bool))
(implements = Pattern.=)
2018-03-03 16:59:12 +03:00
(register delete (Fn [Pattern] ()))
(implements delete Pattern.delete)
2018-03-03 16:59:12 +03:00
(register copy (Fn [&Pattern] Pattern))
(implements copy Pattern.copy)
(doc from-chars "creates a pattern that matches a group of characters from a
list of those characters.")
(defn from-chars [chars]
(Pattern.init &(str* @"[" (String.from-chars chars) @"]")))
(defn global-match-str [pattern data]
(Array.copy-map
&(fn [m] (Maybe.unsafe-from (extract m data)))
&(find-all-matches pattern data)))
(doc split "splits a string by a pattern.")
(defn split [p s]
(let-do [idx (find-all p s)
strs (global-match-str p s)
lidx (Array.length &idx)
result (Array.allocate (Int.inc lidx))]
(Array.aset-uninitialized! &result 0
2020-02-11 11:09:30 +03:00
(slice s 0 (if (> lidx 0) @(Array.unsafe-nth &idx 0) (length s))))
(for [i 0 (Int.dec (Array.length &idx))]
(let [plen (length (Array.unsafe-nth &strs i))]
(Array.aset-uninitialized! &result (Int.inc i)
2020-02-11 11:09:30 +03:00
(slice s (+ @(Array.unsafe-nth &idx i) plen)
@(Array.unsafe-nth &idx (Int.inc i))))))
(when (> lidx 0)
(let [plen (length (Array.unsafe-nth &strs (Int.dec lidx)))]
(Array.aset-uninitialized! &result lidx
2020-02-11 11:09:30 +03:00
(suffix s (+ @(Array.unsafe-nth &idx (Int.dec lidx))
plen)))))
result))
)
2018-03-13 13:59:48 +03:00
(defmodule String
2019-05-04 10:41:33 +03:00
(doc in? "checks whether a string contains another string.")
2018-03-13 13:59:48 +03:00
(defn in? [s sub]
(Pattern.matches? &(Pattern.init sub) s))
2019-05-04 10:41:33 +03:00
(doc upper? "checks whether a string is all uppercase.")
2018-03-13 13:59:48 +03:00
(defn upper? [s]
(Pattern.matches? #"^[\u\s\p]*$" s))
2019-05-04 10:41:33 +03:00
(doc lower? "checks whether a string is all lowercase.")
2018-03-13 13:59:48 +03:00
(defn lower? [s]
(Pattern.matches? #"^[\l\s\p]*$" s))
2019-05-04 10:41:33 +03:00
(doc num? "checks whether a string is numerical.")
2018-03-13 13:59:48 +03:00
(defn num? [s]
(Pattern.matches? #"^[0-9]*$" s))
2019-05-04 10:41:33 +03:00
(doc alpha? "checks whether a string contains only alphabetical characters (a-Z).")
2018-03-13 13:59:48 +03:00
(defn alpha? [s]
(Pattern.matches? #"^[\u\l]*$" s))
2019-05-04 10:41:33 +03:00
(doc alphanum? "checks whether a string is alphanumerical.")
2018-03-13 13:59:48 +03:00
(defn alphanum? [s]
(Pattern.matches? #"^[\w]*$" s))
2019-05-04 10:41:33 +03:00
(doc hex? "checks whether a string is hexadecimal.")
2018-03-13 13:59:48 +03:00
(defn hex? [s]
(Pattern.matches? #"^[\x]*$" s))
2019-05-04 10:41:33 +03:00
(doc trim-left "trims whitespace from the left of a string.")
2018-03-13 13:59:48 +03:00
(defn trim-left [s]
(Pattern.substitute #"^\s+" s "" 1))
2019-05-04 10:41:33 +03:00
(doc trim-right "trims whitespace from the right of a string.")
2018-03-13 13:59:48 +03:00
(defn trim-right [s]
(Pattern.substitute #"\s+$" s "" 1))
2019-05-04 10:41:33 +03:00
(doc trim "trims whitespace from both sides of a string.")
2018-03-13 13:59:48 +03:00
(defn trim [s]
(trim-left &(trim-right s)))
2019-05-04 10:41:33 +03:00
(doc chomp "trims a newline from the end of a string.")
2018-03-13 13:59:48 +03:00
(defn chomp [s]
(Pattern.substitute #"\r$" &(Pattern.substitute #"\n$" s "" 1) "" 1))
2018-03-13 13:59:48 +03:00
2019-05-04 10:41:33 +03:00
(doc collapse-whitespace "collapses groups of whitespace into single spaces.")
2018-03-13 13:59:48 +03:00
(defn collapse-whitespace [s]
(Pattern.substitute #"\s+" s " " -1))
2019-05-04 10:41:33 +03:00
(doc split-by "splits a string by separators.")
(defn split-by [s separators]
(let-do [pat (Pattern.from-chars separators)
idx (Pattern.find-all &pat s)
lidx (Array.length &idx)
result (Array.allocate (Int.inc lidx))]
(Array.aset-uninitialized! &result 0
2020-02-11 11:09:30 +03:00
(slice s 0 (if (> lidx 0) @(Array.unsafe-nth &idx 0) (length s))))
(for [i 0 (Int.dec (Array.length &idx))]
(Array.aset-uninitialized! &result (Int.inc i)
2020-02-11 11:09:30 +03:00
(slice s (Int.inc @(Array.unsafe-nth &idx i)) @(Array.unsafe-nth &idx (Int.inc i)))))
(when (> lidx 0)
(Array.aset-uninitialized! &result lidx
2020-02-11 11:09:30 +03:00
(suffix s (Int.inc @(Array.unsafe-nth &idx (Int.dec lidx))))))
result))
2019-05-04 10:41:33 +03:00
(doc words "splits a string into words.")
(defn words [s]
(Array.endo-filter &(fn [s] (not (empty? s))) (split-by s &[\tab \space \newline])))
2019-05-04 10:41:33 +03:00
(doc lines "splits a string into lines.")
(defn lines [s]
(split-by s &[\newline]))
2018-03-13 13:59:48 +03:00
)