diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d875b80a8f..715210590af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -374,6 +374,7 @@ - [Added `Text.tokenize`][6150] - [Added support for Date/Time columns in the Postgres backend and added `year`/`month`/`day` operations to Table columns.][6153] +- [`Text.split` can now take a vector of delimiters.][6156] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -566,6 +567,7 @@ [6116]: https://github.com/enso-org/enso/pull/6116 [6150]: https://github.com/enso-org/enso/pull/6150 [6153]: https://github.com/enso-org/enso/pull/6153 +[6156]: https://github.com/enso-org/enso/pull/6156 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso index 4eedeca766f..1cd5c610f24 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso @@ -40,6 +40,7 @@ polyglot java import com.ibm.icu.text.BreakIterator polyglot java import java.lang.StringBuilder polyglot java import org.enso.base.Text_Utils polyglot java import org.enso.base.Encoding_Utils +polyglot java import org.enso.base.Regex_Utils ## Returns a new `Text` object with the characters in the reverse order of the input. @@ -301,7 +302,6 @@ Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = - delimiter: The pattern used to split the text. - case_sensitivity: Specifies if the text values should be compared case sensitively. The values are compared case sensitively by default. - - only_first: If true, only replace the first match. - use_regex: If true, the term is used as a regular expression. > Example @@ -319,25 +319,41 @@ Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = Split the text on any whitespace. 
'abc def\tghi'.split '\\s+' use_regex=True == ["abc", "def", "ghi"] -Text.split : Text -> Case_Sensitivity -> Boolean -> Boolean -> Vector Text | Illegal_Argument -Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False = if delimiter.is_empty then Error.throw (Illegal_Argument.Error "The delimiter cannot be empty.") else - case use_regex of - False -> - delimiters = Vector.from_polyglot_array <| case case_sensitivity of - Case_Sensitivity.Sensitive -> - Text_Utils.span_of_all self delimiter - Case_Sensitivity.Insensitive locale -> - Text_Utils.span_of_all_case_insensitive self delimiter locale.java_locale - Vector.new delimiters.length+1 i-> - start = if i == 0 then 0 else - delimiters.at i-1 . codeunit_end - end = if i == delimiters.length then (Text_Utils.char_length self) else - delimiters.at i . codeunit_start - Text_Utils.substring self start end - True -> - case_insensitive = case_sensitivity.is_case_insensitive_in_memory - compiled_pattern = Regex_2.compile delimiter case_insensitive=case_insensitive - compiled_pattern.split self only_first + + > Example + Split with a vector of strings. + + 'azbzczdzezfzg'.split ['b', 'zez'] == ['az', 'zczd', 'fzg'] +Text.split : Text | Vector Text -> Case_Sensitivity -> Boolean -> Vector Text | Illegal_Argument +Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive use_regex=False = + delimiter_is_empty = case delimiter of + _ : Text -> delimiter.is_empty + _ : Vector -> delimiter.is_empty || delimiter.any (.is_empty) + _ -> Error.throw (Illegal_Argument.Error "The delimiter must be a Text or Vector of Texts.") + if delimiter_is_empty then Error.throw (Illegal_Argument.Error "The delimiter cannot be empty.") else + delimiter_is_singleton_vector = case delimiter of + _ : Vector -> delimiter.length == 1 + _ -> False + # If it's a vector of one element, just call it on that one element. 
+ if delimiter_is_singleton_vector then self.split delimiter=(delimiter.first) case_sensitivity=case_sensitivity use_regex=use_regex else + case use_regex of + False -> + delimiters = split_find_delimiters self delimiter case_sensitivity + Vector.new delimiters.length+1 i-> + start = if i == 0 then 0 else + delimiters.at i-1 . codeunit_end + end = if i == delimiters.length then (Text_Utils.char_length self) else + delimiters.at i . codeunit_start + Text_Utils.substring self start end + True -> case delimiter of + _ : Text -> + case_insensitive = case_sensitivity.is_case_insensitive_in_memory + compiled_pattern = Regex_2.compile delimiter case_insensitive=case_insensitive + compiled_pattern.split self + _ : Vector -> + parenthesize s = "(?:" + s + ")" + combined_delimiter = parenthesize (delimiter.map parenthesize . join '|') + self.split combined_delimiter case_sensitivity=case_sensitivity use_regex=True ## ADVANCED Takes an input string and and a pattern and returns all the matches as a @@ -1359,3 +1375,20 @@ slice_text text char_ranges = char_ranges.map char_range-> sb.append text char_range.start char_range.end sb.toString + +## PRIVATE + + Find occurrences of delimiters in a string. 
+split_find_delimiters : Text -> Text | Vector Text -> Case_Sensitivity -> Vector Text | Illegal_Argument +split_find_delimiters input delimiter case_sensitivity = + Vector.from_polyglot_array <| case delimiter of + _ : Text -> case case_sensitivity of + Case_Sensitivity.Sensitive -> + Text_Utils.span_of_all input delimiter + Case_Sensitivity.Insensitive locale -> + Text_Utils.span_of_all_case_insensitive input delimiter locale.java_locale + _ : Vector -> case case_sensitivity of + Case_Sensitivity.Sensitive -> + Text_Utils.span_of_all_multiple input delimiter + Case_Sensitivity.Insensitive locale -> + Text_Utils.span_of_all_case_insensitive_multiple input delimiter locale.java_locale diff --git a/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java b/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java index 8edde276714..6e73b534495 100644 --- a/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java +++ b/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java @@ -108,4 +108,11 @@ public class Regex_Utils { return result.toString(); } + + /** + * Backslash-escapes the regex metacharacters \ . * + ? ^ $ { } ( ) | [ ] in {@code pattern} so that each of them is matched literally; the backslash itself must be escaped too, otherwise an input such as "\d" would still act as a character-class escape. + */ + public static String regexQuote(String pattern) { + return pattern.replaceAll("[\\\\.*+?^${}()|\\[\\]]", "\\\\$0"); + } } diff --git a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java index 2da16c249ba..277b0a06814 100644 --- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java +++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java @@ -9,6 +9,8 @@ import com.ibm.icu.text.StringSearch; import java.util.ArrayList; import java.util.List; import java.util.Locale; +import java.util.stream.Collectors; +import java.util.stream.IntStream; import org.enso.base.text.CaseFoldedString; import org.enso.base.text.CaseFoldedString.Grapheme; import org.enso.base.text.GraphemeSpan; @@ -343,7 +345,7 @@ public class Text_Utils { public static List span_of_all(String haystack, String needle) { if (needle.isEmpty()) throw new IllegalArgumentException( - 
"The operation `index_of_all` does not support searching for an empty term."); + "The operation `span_of_all` does not support searching for an empty term."); if (haystack.isEmpty()) return List.of(); StringSearch search = new StringSearch(needle, haystack); @@ -355,6 +357,48 @@ public class Text_Utils { return occurrences; } + /** + * Find spans of all non-overlapping occurrences of any of the needles within the haystack, scanning left to right; if several needles match at the same position, the one listed first is used. + * + * @param haystack the string to search + * @param needles the substrings that are searched for + * @return a list of UTF-16 code unit spans at which any of the needles occurs in the haystack + */ + public static List span_of_all_multiple(String haystack, List needles) { + if (needles.isEmpty() || needles.stream().anyMatch(String::isEmpty)) + throw new IllegalArgumentException( + "The operation `span_of_all_multiple` does not support searching for an empty term."); + if (haystack.isEmpty()) return List.of(); + + StringSearch stringSearches[] = IntStream.range(0, needles.size()) + .mapToObj(i -> new StringSearch(needles.get(i), haystack)) + .toArray(StringSearch[]::new); + List occurrences = new ArrayList<>(); + + int ix = 0; + while (ix != StringSearch.DONE) { + int earliestIndex = -1; + int earliestStart = -1; + for (int i = 0; i < stringSearches.length; ++i) { + StringSearch stringSearch = stringSearches[i]; + int start = stringSearch.following(ix); + if (start != StringSearch.DONE && (earliestStart == -1 || start < earliestStart)) { + earliestIndex = i; + earliestStart = start; + } + } + if (earliestIndex == -1) { + // No needle matches from ix onwards. + break; + } + int matchLength = stringSearches[earliestIndex].getMatchLength(); + occurrences.add(new Utf16Span(earliestStart, earliestStart + matchLength)); + ix = earliestStart + matchLength; + } + + return occurrences; + } + /** * Converts a UTF-16 code unit index to index of the grapheme that this code unit belongs to. 
* @@ -449,18 +493,18 @@ public class Text_Utils { } /** - * Find all occurrences of needle in the haystack + * Find all occurrences of needle in the haystack, case-insensitively. * * @param haystack the string to search - * @param needle the substring that is searched for + * @param needle the substring that is searched for * @param locale the locale used for case-insensitive comparisons * @return a list of extended-grapheme-cluster spans at which the needle occurs in the haystack */ public static List span_of_all_case_insensitive( - String haystack, String needle, Locale locale) { + String haystack, String needle, Locale locale) { if (needle.isEmpty()) throw new IllegalArgumentException( - "The operation `span_of_all_case_insensitive` does not support searching for an empty term."); + "The operation `span_of_all_case_insensitive` does not support searching for an empty term."); if (haystack.isEmpty()) return List.of(); CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale); @@ -477,6 +521,29 @@ public class Text_Utils { return result; } + /** + * Find spans of all occurrences of a set of needles within the haystack, + * case-insensitively. 
+ * + * @param haystack the string to search + * @param needles the substrings that are searched for + * @param locale the locale used for case-insensitive comparisons + * @return a list of extended-grapheme-cluster spans at which any of the needles occurs in the haystack + */ + public static List span_of_all_case_insensitive_multiple( + String haystack, List needles, Locale locale) { + CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale); + List foldedNeedles = IntStream.range(0, needles.size()) + .mapToObj(i -> CaseFoldedString.simpleFold(needles.get(i), locale)) + .collect(Collectors.toList()); + var foldedSpans = span_of_all_multiple(foldedHaystack.getFoldedString(), foldedNeedles); + List occurrences = + foldedSpans.stream() + .map(span -> findExtendedSpan(foldedHaystack, span.codeunit_start, span.codeunit_end-span.codeunit_start)) + .collect(Collectors.toList()); + return occurrences; + } + /** * Finds the grapheme span corresponding to the found match indexed with code units. * diff --git a/test/Tests/src/Data/Text_Spec.enso b/test/Tests/src/Data/Text_Spec.enso index a9e57197c9b..376dd791c59 100644 --- a/test/Tests/src/Data/Text_Spec.enso +++ b/test/Tests/src/Data/Text_Spec.enso @@ -274,7 +274,61 @@ spec = utf_8_whitespace.split "\s+" use_regex=True . should_equal utf_8_whitespace_split 'abc def\tghi'.split '\\s+' use_regex=True . should_equal ["abc", "def", "ghi"] - Test.specify "exmples should be correct" <| + Test.specify 'should be able to split with a vector of strings' <| + 'azbzczdzezfzg'.split ['b', 'zez'] . should_equal ['az', 'zczd', 'fzg'] + 'a1b2c3d4e5f6g7h8'.split ['c', '5'] . should_equal ['a1b2', '3d4e', 'f6g7h8'] + + Test.specify 'should handle overlapping delimiters correctly' <| + 'blah x 123'.split [' ', ' x ' , 'x'] . should_equal ['blah', '', '', '123'] + 'abcdef'.split ['bc', 'cd'] . should_equal ['a', 'def'] + 'abcdef'.split ['cd', 'bc'] . should_equal ['a', 'def'] + 'abcdef'.split ['bc', 'bcd'] . 
should_equal ['a', 'def'] + 'abcdef'.split ['bcd', 'bc'] . should_equal ['a', 'ef'] + + Test.specify 'should be able to split with a vector of strings, case insensitively' <| + 'azBZczDZEZFzg'.split ['B', 'zez'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['az', 'ZczD', 'Fzg'] + 'blah X 123'.split [' ', ' x ' , 'x'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['blah', '', '', '123'] + 'A1B2C3D4E5F6G7H8'.split ['c', '5'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['A1B2', '3D4E', 'F6G7H8'] + + Test.specify 'should be able to split with a vector of strings, using regexes' <| + 'a1b2c3d4e5f6g7h8'.split ['[cde]', '[456]'] use_regex=True . should_equal ['a1b2', '3', '', '', '', 'f', 'g7h8'] + 'abcde1fghij2klmnop'.split ["\d", '[hm]'] use_regex=True . should_equal ['abcde', 'fg', 'ij', 'kl', 'nop'] + + Test.specify "should handle unicode normalization the same for single and multiple delimiters" <| + 'aśbs\u0301c'.split 'ś' . should_equal ['a', 'b', 'c'] + 'aśbs\u0301c'.split ['ś'] . should_equal ['a', 'b', 'c'] + 'aśbs\u0301c'.split 's\u0301' . should_equal ['a', 'b', 'c'] + 'aśbs\u0301c'.split ['s\u0301'] . should_equal ['a', 'b', 'c'] + 'aśbs\u0301cdef'.split ['ś', 'de'] . should_equal ['a', 'b', 'c', 'f'] + + Test.specify "should handle unicode normalization the same for single and multiple delimiters, case-insensitively" <| + 'aśbS\u0301c'.split 'ś' case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c'] + 'aśbS\u0301c'.split ['ś'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c'] + 'aŚbS\u0301c'.split 's\u0301' case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c'] + 'aśbS\u0301c'.split ['s\u0301'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c'] + 'aŚbS\u0301cdef'.split ['ś', 'de'] case_sensitivity=Case_Sensitivity.Insensitive . 
should_equal ['a', 'b', 'c', 'f'] + + Test.specify "should handle splitting the same for the special case of a 1-element vector" <| + 'abcdefgh'.split 'c' . should_equal ['ab', 'defgh'] + 'abcdefgh'.split ['c'] . should_equal ['ab', 'defgh'] + 'abcdefgh'.split ['c', 'q'] . should_equal ['ab', 'defgh'] + + Test.specify "should split on the leftmost delimiter in the case of a tie" <| + 'abcdefgh'.split ['c', 'cd'] . should_equal ['ab', 'defgh'] + 'abcdefgh'.split ['cd', 'c'] . should_equal ['ab', 'efgh'] + + Test.specify "should throw Illegal_Argument for a bad or empty delimiter" <| + 'abc'.split '' . should_fail_with Illegal_Argument + 'abc'.split [] . should_fail_with Illegal_Argument + 'abc'.split ['a', ''] . should_fail_with Illegal_Argument + 'abc'.split 3 . should_fail_with Illegal_Argument + + 'abc'.split '' case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument + 'abc'.split [] case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument + 'abc'.split ['a', ''] case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument + 'abc'.split 3 case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument + + Test.specify "examples should be correct" <| "Namespace::package::package::Type".split "::" . should_equal ["Namespace", "package", "package", "Type"] "abc--def==>ghi".split "[-=>]+" use_regex=True . should_equal ["abc", "def", "ghi"] 'abc def\tghi'.split '\\s+' use_regex=True . should_equal ["abc", "def", "ghi"]