Update Text.split to take a Vector Text parameter (#6156)
Allows you to pass a vector of delimiters to `split`.
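For example, with the new overload (mirroring the doc and test examples added below):

    'azbzczdzezfzg'.split ['b', 'zez'] == ['az', 'zczd', 'fzg']
    'a1b2c3d4e5f6g7h8'.split ['c', '5'] == ['a1b2', '3d4e', 'f6g7h8']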
parent b3e54aeb54
commit fb77f42fd5
@@ -374,6 +374,7 @@
- [Added `Text.tokenize`][6150]
- [Added support for Date/Time columns in the Postgres backend and added
  `year`/`month`/`day` operations to Table columns.][6153]
- [`Text.split` can now take a vector of delimiters.][6156]

[debug-shortcuts]:
  https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -566,6 +567,7 @@
[6116]: https://github.com/enso-org/enso/pull/6116
[6150]: https://github.com/enso-org/enso/pull/6150
[6153]: https://github.com/enso-org/enso/pull/6153
[6156]: https://github.com/enso-org/enso/pull/6156

#### Enso Compiler
@@ -40,6 +40,7 @@ polyglot java import com.ibm.icu.text.BreakIterator
polyglot java import java.lang.StringBuilder
polyglot java import org.enso.base.Text_Utils
polyglot java import org.enso.base.Encoding_Utils
polyglot java import org.enso.base.Regex_Utils

## Returns a new `Text` object with the characters in the reverse order of the input.
@@ -301,7 +302,6 @@ Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
   - delimiter: The pattern used to split the text.
   - case_sensitivity: Specifies if the text values should be compared case
     sensitively. The values are compared case sensitively by default.
   - only_first: If true, only replace the first match.
   - use_regex: If true, the term is used as a regular expression.

   > Example
@@ -319,25 +319,41 @@ Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
     Split the text on any whitespace.

         'abc def\tghi'.split '\\s+' use_regex=True == ["abc", "def", "ghi"]
Text.split : Text -> Case_Sensitivity -> Boolean -> Boolean -> Vector Text | Illegal_Argument
Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False = if delimiter.is_empty then Error.throw (Illegal_Argument.Error "The delimiter cannot be empty.") else
    case use_regex of
        False ->
            delimiters = Vector.from_polyglot_array <| case case_sensitivity of
                Case_Sensitivity.Sensitive ->
                    Text_Utils.span_of_all self delimiter
                Case_Sensitivity.Insensitive locale ->
                    Text_Utils.span_of_all_case_insensitive self delimiter locale.java_locale
            Vector.new delimiters.length+1 i->
                start = if i == 0 then 0 else
                    delimiters.at i-1 . codeunit_end
                end = if i == delimiters.length then (Text_Utils.char_length self) else
                    delimiters.at i . codeunit_start
                Text_Utils.substring self start end
        True ->
            case_insensitive = case_sensitivity.is_case_insensitive_in_memory
            compiled_pattern = Regex_2.compile delimiter case_insensitive=case_insensitive
            compiled_pattern.split self only_first
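# Worked example of the span-based reassembly in the `False` branch above: for
# 'a,b,c'.split ',' the delimiter spans are [1, 2) and [3, 4) in UTF-16 code
# units, so the generated segments are [0, 1), [2, 3) and [4, 5), i.e.
# ['a', 'b', 'c'].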

   > Example
     Split with a vector of strings.

         'azbzczdzezfzg'.split ['b', 'zez'] == ['az', 'zczd', 'fzg']
Text.split : Text | Vector Text -> Case_Sensitivity -> Boolean -> Vector Text | Illegal_Argument
Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive use_regex=False =
    delimiter_is_empty = case delimiter of
        _ : Text -> delimiter.is_empty
        _ : Vector -> delimiter.is_empty || delimiter.any (.is_empty)
        _ -> Error.throw (Illegal_Argument.Error "The delimiter must be a Text or Vector of Texts.")
    if delimiter_is_empty then Error.throw (Illegal_Argument.Error "The delimiter cannot be empty.") else
        delimiter_is_singleton_vector = case delimiter of
            _ : Vector -> delimiter.length == 1
            _ -> False
        # If it's a vector of one element, just call it on that one element.
        if delimiter_is_singleton_vector then self.split delimiter=(delimiter.first) case_sensitivity=case_sensitivity use_regex=use_regex else
            case use_regex of
                False ->
                    delimiters = split_find_delimiters self delimiter case_sensitivity
                    Vector.new delimiters.length+1 i->
                        start = if i == 0 then 0 else
                            delimiters.at i-1 . codeunit_end
                        end = if i == delimiters.length then (Text_Utils.char_length self) else
                            delimiters.at i . codeunit_start
                        Text_Utils.substring self start end
                True -> case delimiter of
                    _ : Text ->
                        case_insensitive = case_sensitivity.is_case_insensitive_in_memory
                        compiled_pattern = Regex_2.compile delimiter case_insensitive=case_insensitive
                        compiled_pattern.split self
                    _ : Vector ->
                        parenthesize s = "(?:" + s + ")"
                        combined_delimiter = parenthesize (delimiter.map parenthesize . join '|')
                        self.split combined_delimiter case_sensitivity=case_sensitivity use_regex=True
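# For example, with delimiter = ['b', 'zez'] the Vector branch above builds
# combined_delimiter == '(?:(?:b)|(?:zez))': each delimiter is wrapped in a
# non-capturing group and the groups are joined with '|', so the whole call
# reduces to a single regex split.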

## ADVANCED
   Takes an input string and a pattern and returns all the matches as a
@@ -1359,3 +1375,20 @@ slice_text text char_ranges =
    char_ranges.map char_range->
        sb.append text char_range.start char_range.end
    sb.toString

## PRIVATE

   Find occurrences of delimiters in a string.
split_find_delimiters : Text -> Text | Vector Text -> Case_Sensitivity -> Vector Text | Illegal_Argument
split_find_delimiters input delimiter case_sensitivity =
    Vector.from_polyglot_array <| case delimiter of
        _ : Text -> case case_sensitivity of
            Case_Sensitivity.Sensitive ->
                Text_Utils.span_of_all input delimiter
            Case_Sensitivity.Insensitive locale ->
                Text_Utils.span_of_all_case_insensitive input delimiter locale.java_locale
        _ : Vector -> case case_sensitivity of
            Case_Sensitivity.Sensitive ->
                Text_Utils.span_of_all_multiple input delimiter
            Case_Sensitivity.Insensitive locale ->
                Text_Utils.span_of_all_case_insensitive_multiple input delimiter locale.java_locale
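# For example (see the new tests below), the Vector + Insensitive branch is what
# makes 'azBZczDZEZFzg'.split ['B', 'zez'] case_sensitivity=Case_Sensitivity.Insensitive
# return ['az', 'ZczD', 'Fzg'].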

@@ -108,4 +108,8 @@ public class Regex_Utils {

    return result.toString();
  }

  public static String regexQuote(String pattern) {
    return pattern.replaceAll("[.*+?^${}()|\\[\\]]", "\\\\$0");
  }
}
@@ -9,6 +9,8 @@ import com.ibm.icu.text.StringSearch;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.enso.base.text.CaseFoldedString;
import org.enso.base.text.CaseFoldedString.Grapheme;
import org.enso.base.text.GraphemeSpan;
@@ -343,7 +345,7 @@ public class Text_Utils {
  public static List<Utf16Span> span_of_all(String haystack, String needle) {
    if (needle.isEmpty())
      throw new IllegalArgumentException(
-         "The operation `index_of_all` does not support searching for an empty term.");
+         "The operation `span_of_all` does not support searching for an empty term.");
    if (haystack.isEmpty()) return List.of();

    StringSearch search = new StringSearch(needle, haystack);
@@ -355,6 +357,48 @@ public class Text_Utils {
    return occurrences;
  }

  /**
   * Find spans of all occurrences of a set of needles within the haystack.
   *
   * @param haystack the string to search
   * @param needles the substrings that are searched for
   * @return a list of UTF-16 code unit spans at which the needle occurs in the haystack
   */
  public static List<Utf16Span> span_of_all_multiple(String haystack, List<String> needles) {
    if (needles.isEmpty() || needles.stream().anyMatch(String::isEmpty))
      throw new IllegalArgumentException(
          "The operation `span_of_all_multiple` does not support searching for an empty term.");
    if (haystack.isEmpty()) return List.of();

    StringSearch stringSearches[] = IntStream.range(0, needles.size())
        .mapToObj(i -> new StringSearch(needles.get(i), haystack))
        .toArray(StringSearch[]::new);
    List<Utf16Span> occurrences = new ArrayList<>();

    int ix = 0;
    while (ix != StringSearch.DONE) {
      int earliestIndex = -1;
      int earliestStart = -1;
      for (int i = 0; i < stringSearches.length; ++i) {
        StringSearch stringSearch = stringSearches[i];
        int start = stringSearch.following(ix);
        if (start != StringSearch.DONE && (earliestStart == -1 || start < earliestStart)) {
          earliestIndex = i;
          earliestStart = start;
        }
      }
      if (earliestIndex == -1) {
        // No more matches.
        break;
      }
      int matchLength = stringSearches[earliestIndex].getMatchLength();
      occurrences.add(new Utf16Span(earliestStart, earliestStart + matchLength));
      ix = earliestStart + matchLength;
    }

    return occurrences;
  }
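  // Worked example: for haystack "azbzczdzezfzg" and needles ["b", "zez"] the loop
  // above finds the spans [2, 3) and [7, 10); Text.split then keeps the text between
  // the spans, giving ["az", "zczd", "fzg"] as in the new doc example.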

  /**
   * Converts a UTF-16 code unit index to index of the grapheme that this code unit belongs to.
   *
@@ -449,18 +493,18 @@
  }

  /**
-   * Find all occurrences of needle in the haystack
+   * Find all occurrences of needle in the haystack, case-insensitively.
   *
   * @param haystack the string to search
-   * @param needle the substring that is searched for
+   * @param needles the substrings that are searched for
   * @param locale the locale used for case-insensitive comparisons
   * @return a list of extended-grapheme-cluster spans at which the needle occurs in the haystack
   */
  public static List<GraphemeSpan> span_of_all_case_insensitive(
      String haystack, String needle, Locale locale) {
    if (needle.isEmpty())
      throw new IllegalArgumentException(
          "The operation `span_of_all_case_insensitive` does not support searching for an empty term.");
    if (haystack.isEmpty()) return List.of();

    CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
@@ -477,6 +521,29 @@
    return result;
  }

  /**
   * Find spans of all occurrences of a set of needles within the haystack,
   * case-insensitively.
   *
   * @param haystack the string to search
   * @param needles the substrings that are searched for
   * @param locale the locale used for case-insensitive comparisons
   * @return a list of extended-grapheme-cluster spans at which the needles occur in the haystack
   */
  public static List<GraphemeSpan> span_of_all_case_insensitive_multiple(
      String haystack, List<String> needles, Locale locale) {
    CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
    List<String> foldedNeedles = IntStream.range(0, needles.size())
        .mapToObj(i -> CaseFoldedString.simpleFold(needles.get(i), locale))
        .collect(Collectors.toList());
    var foldedSpans = span_of_all_multiple(foldedHaystack.getFoldedString(), foldedNeedles);
    List<GraphemeSpan> occurrences =
        foldedSpans.stream()
            .map(span -> findExtendedSpan(foldedHaystack, span.codeunit_start, span.codeunit_end - span.codeunit_start))
            .collect(Collectors.toList());
    return occurrences;
  }
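  // The case-insensitive variant works entirely in case-folded space: the haystack
  // and the needles are folded with the given locale, the folded haystack is searched
  // with span_of_all_multiple, and each folded span is mapped back to a grapheme span
  // in the original string. This is what lets, for example,
  // 'aśbS\u0301c'.split ['s\u0301'] case_sensitivity=Case_Sensitivity.Insensitive
  // return ['a', 'b', 'c'] in the new tests.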

  /**
   * Finds the grapheme span corresponding to the found match indexed with code units.
   *

@@ -274,7 +274,61 @@ spec =
            utf_8_whitespace.split "\s+" use_regex=True . should_equal utf_8_whitespace_split
            'abc def\tghi'.split '\\s+' use_regex=True . should_equal ["abc", "def", "ghi"]

Test.specify "exmples should be correct" <|
|
||||
        Test.specify 'should be able to split with a vector of strings' <|
            'azbzczdzezfzg'.split ['b', 'zez'] . should_equal ['az', 'zczd', 'fzg']
            'a1b2c3d4e5f6g7h8'.split ['c', '5'] . should_equal ['a1b2', '3d4e', 'f6g7h8']

        Test.specify 'should handle overlapping delimiters correctly' <|
            'blah x 123'.split [' ', ' x ', 'x'] . should_equal ['blah', '', '', '123']
            'abcdef'.split ['bc', 'cd'] . should_equal ['a', 'def']
            'abcdef'.split ['cd', 'bc'] . should_equal ['a', 'def']
            'abcdef'.split ['bc', 'bcd'] . should_equal ['a', 'def']
            'abcdef'.split ['bcd', 'bc'] . should_equal ['a', 'ef']

        Test.specify 'should be able to split with a vector of strings, case insensitively' <|
            'azBZczDZEZFzg'.split ['B', 'zez'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['az', 'ZczD', 'Fzg']
            'blah X 123'.split [' ', ' x ', 'x'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['blah', '', '', '123']
            'A1B2C3D4E5F6G7H8'.split ['c', '5'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['A1B2', '3D4E', 'F6G7H8']

        Test.specify 'should be able to split with a vector of strings, using regexes' <|
            'a1b2c3d4e5f6g7h8'.split ['[cde]', '[456]'] use_regex=True . should_equal ['a1b2', '3', '', '', '', 'f', 'g7h8']
            'abcde1fghij2klmnop'.split ["\d", '[hm]'] use_regex=True . should_equal ['abcde', 'fg', 'ij', 'kl', 'nop']

        Test.specify "should handle unicode normalization the same for single and multiple delimiters" <|
            'aśbs\u0301c'.split 'ś' . should_equal ['a', 'b', 'c']
            'aśbs\u0301c'.split ['ś'] . should_equal ['a', 'b', 'c']
            'aśbs\u0301c'.split 's\u0301' . should_equal ['a', 'b', 'c']
            'aśbs\u0301c'.split ['s\u0301'] . should_equal ['a', 'b', 'c']
            'aśbs\u0301cdef'.split ['ś', 'de'] . should_equal ['a', 'b', 'c', 'f']

        Test.specify "should handle unicode normalization the same for single and multiple delimiters, case-insensitively" <|
            'aśbS\u0301c'.split 'ś' case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c']
            'aśbS\u0301c'.split ['ś'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c']
            'aŚbS\u0301c'.split 's\u0301' case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c']
            'aśbS\u0301c'.split ['s\u0301'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c']
            'aŚbS\u0301cdef'.split ['ś', 'de'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c', 'f']

        Test.specify "should handle splitting the same for the special case of a 1-element vector" <|
            'abcdefgh'.split 'c' . should_equal ['ab', 'defgh']
            'abcdefgh'.split ['c'] . should_equal ['ab', 'defgh']
            'abcdefgh'.split ['c', 'q'] . should_equal ['ab', 'defgh']

        Test.specify "should split on the leftmost delimiter in the case of a tie" <|
            'abcdefgh'.split ['c', 'cd'] . should_equal ['ab', 'defgh']
            'abcdefgh'.split ['cd', 'c'] . should_equal ['ab', 'efgh']

        Test.specify "should throw Illegal_Argument for a bad or empty delimiter" <|
            'abc'.split '' . should_fail_with Illegal_Argument
            'abc'.split [] . should_fail_with Illegal_Argument
            'abc'.split ['a', ''] . should_fail_with Illegal_Argument
            'abc'.split 3 . should_fail_with Illegal_Argument

            'abc'.split '' case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument
            'abc'.split [] case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument
            'abc'.split ['a', ''] case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument
            'abc'.split 3 case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument

        Test.specify "examples should be correct" <|
            "Namespace::package::package::Type".split "::" . should_equal ["Namespace", "package", "package", "Type"]
            "abc--def==>ghi".split "[-=>]+" use_regex=True . should_equal ["abc", "def", "ghi"]
            'abc def\tghi'.split '\\s+' use_regex=True . should_equal ["abc", "def", "ghi"]