Update Text.split to take a Vector Text parameter (#6156)

Allows you to pass a vector of delimiters to `split`.
This commit is contained in:
GregoryTravis 2023-04-04 10:44:47 -04:00 committed by GitHub
parent b3e54aeb54
commit fb77f42fd5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 186 additions and 26 deletions

View File

@ -374,6 +374,7 @@
- [Added `Text.tokenize`][6150]
- [Added support for Date/Time columns in the Postgres backend and added
`year`/`month`/`day` operations to Table columns.][6153]
- [`Text.split` can now take a vector of delimiters.][6156]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -566,6 +567,7 @@
[6116]: https://github.com/enso-org/enso/pull/6116
[6150]: https://github.com/enso-org/enso/pull/6150
[6153]: https://github.com/enso-org/enso/pull/6153
[6156]: https://github.com/enso-org/enso/pull/6156
#### Enso Compiler

View File

@ -40,6 +40,7 @@ polyglot java import com.ibm.icu.text.BreakIterator
polyglot java import java.lang.StringBuilder
polyglot java import org.enso.base.Text_Utils
polyglot java import org.enso.base.Encoding_Utils
polyglot java import org.enso.base.Regex_Utils
## Returns a new `Text` object with the characters in the reverse order of the input.
@ -301,7 +302,6 @@ Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
- delimiter: The pattern used to split the text.
- case_sensitivity: Specifies if the text values should be compared case
sensitively. The values are compared case sensitively by default.
- only_first: If true, only replace the first match.
- use_regex: If true, the term is used as a regular expression.
> Example
@ -319,25 +319,41 @@ Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
Split the text on any whitespace.
'abc def\tghi'.split '\\s+' use_regex=True == ["abc", "def", "ghi"]
Text.split : Text -> Case_Sensitivity -> Boolean -> Boolean -> Vector Text | Illegal_Argument
Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False = if delimiter.is_empty then Error.throw (Illegal_Argument.Error "The delimiter cannot be empty.") else
case use_regex of
False ->
delimiters = Vector.from_polyglot_array <| case case_sensitivity of
Case_Sensitivity.Sensitive ->
Text_Utils.span_of_all self delimiter
Case_Sensitivity.Insensitive locale ->
Text_Utils.span_of_all_case_insensitive self delimiter locale.java_locale
Vector.new delimiters.length+1 i->
start = if i == 0 then 0 else
delimiters.at i-1 . codeunit_end
end = if i == delimiters.length then (Text_Utils.char_length self) else
delimiters.at i . codeunit_start
Text_Utils.substring self start end
True ->
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
compiled_pattern = Regex_2.compile delimiter case_insensitive=case_insensitive
compiled_pattern.split self only_first
> Example
Split with a vector of strings.
'azbzczdzezfzg'.split ['b', 'zez'] == ['az', 'zczd', 'fzg']
Text.split : Text | Vector Text -> Case_Sensitivity -> Boolean -> Vector Text | Illegal_Argument
Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive use_regex=False =
    # An empty delimiter cannot be split on: reject an empty text, an empty
    # vector, or a vector containing an empty text. Any other delimiter type is
    # rejected with an `Illegal_Argument` error.
    delimiter_is_empty = case delimiter of
        _ : Text -> delimiter.is_empty
        _ : Vector -> delimiter.is_empty || delimiter.any (.is_empty)
        _ -> Error.throw (Illegal_Argument.Error "The delimiter must be a Text or Vector of Texts.")
    if delimiter_is_empty then Error.throw (Illegal_Argument.Error "The delimiter cannot be empty.") else
        delimiter_is_singleton_vector = case delimiter of
            _ : Vector -> delimiter.length == 1
            _ -> False
        # If it's a vector of one element, just call it on that one element.
        if delimiter_is_singleton_vector then self.split delimiter=(delimiter.first) case_sensitivity=case_sensitivity use_regex=use_regex else
            case use_regex of
                False ->
                    # Locate every delimiter occurrence, then cut the text into
                    # the pieces between consecutive occurrences (plus the piece
                    # before the first occurrence and after the last one).
                    delimiters = split_find_delimiters self delimiter case_sensitivity
                    Vector.new delimiters.length+1 i->
                        start = if i == 0 then 0 else
                            delimiters.at i-1 . codeunit_end
                        end = if i == delimiters.length then (Text_Utils.char_length self) else
                            delimiters.at i . codeunit_start
                        Text_Utils.substring self start end
                True -> case delimiter of
                    _ : Text ->
                        case_insensitive = case_sensitivity.is_case_insensitive_in_memory
                        compiled_pattern = Regex_2.compile delimiter case_insensitive=case_insensitive
                        compiled_pattern.split self
                    _ : Vector ->
                        # Combine the regexes into a single alternation; each
                        # part is wrapped in a non-capturing group so the
                        # alternation binds correctly, then delegate to the
                        # single-delimiter regex case.
                        parenthesize s = "(?:" + s + ")"
                        combined_delimiter = parenthesize (delimiter.map parenthesize . join '|')
                        self.split combined_delimiter case_sensitivity=case_sensitivity use_regex=True
## ADVANCED
Takes an input string and a pattern and returns all the matches as a
@ -1359,3 +1375,20 @@ slice_text text char_ranges =
char_ranges.map char_range->
sb.append text char_range.start char_range.end
sb.toString
## PRIVATE
   Find all occurrences of the delimiter (or of any of the delimiters, when a
   vector is given) in the input, in order of appearance.

   Arguments:
   - input: The text to search in.
   - delimiter: A single delimiter, or a vector of delimiters.
   - case_sensitivity: Specifies whether the search is case sensitive.

   Returns a vector of span objects exposing `codeunit_start` and
   `codeunit_end` fields (not a vector of `Text`), as produced by the
   `Text_Utils` search helpers.
split_find_delimiters : Text -> Text | Vector Text -> Case_Sensitivity -> Vector | Illegal_Argument
split_find_delimiters input delimiter case_sensitivity =
    Vector.from_polyglot_array <| case delimiter of
        _ : Text -> case case_sensitivity of
            Case_Sensitivity.Sensitive ->
                Text_Utils.span_of_all input delimiter
            Case_Sensitivity.Insensitive locale ->
                Text_Utils.span_of_all_case_insensitive input delimiter locale.java_locale
        _ : Vector -> case case_sensitivity of
            Case_Sensitivity.Sensitive ->
                Text_Utils.span_of_all_multiple input delimiter
            Case_Sensitivity.Insensitive locale ->
                Text_Utils.span_of_all_case_insensitive_multiple input delimiter locale.java_locale

View File

@ -108,4 +108,8 @@ public class Regex_Utils {
return result.toString();
}
public static String regexQuote(String pattern) {
return pattern.replaceAll("[.*+?^${}()|\\[\\]]", "\\\\$0");
}
}

View File

@ -9,6 +9,8 @@ import com.ibm.icu.text.StringSearch;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.enso.base.text.CaseFoldedString;
import org.enso.base.text.CaseFoldedString.Grapheme;
import org.enso.base.text.GraphemeSpan;
@ -343,7 +345,7 @@ public class Text_Utils {
public static List<Utf16Span> span_of_all(String haystack, String needle) {
if (needle.isEmpty())
throw new IllegalArgumentException(
"The operation `index_of_all` does not support searching for an empty term.");
"The operation `span_of_all` does not support searching for an empty term.");
if (haystack.isEmpty()) return List.of();
StringSearch search = new StringSearch(needle, haystack);
@ -355,6 +357,48 @@ public class Text_Utils {
return occurrences;
}
/**
* Find spans of all occurrences of a set of needles within the haystack.
*
* @param haystack the string to search
* @param needles the substrings that are searched for
* @return a list of UTF-16 code unit spans at which the needle occurs in the haystack
*/
public static List<Utf16Span> span_of_all_multiple(String haystack, List<String> needles) {
if (needles.isEmpty() || needles.stream().anyMatch(String::isEmpty))
throw new IllegalArgumentException(
"The operation `span_of_all_multiple` does not support searching for an empty term.");
if (haystack.isEmpty()) return List.of();
StringSearch stringSearches[] = IntStream.range(0, needles.size())
.mapToObj(i -> new StringSearch(needles.get(i), haystack))
.toArray(StringSearch[]::new);
List<Utf16Span> occurrences = new ArrayList<>();
int ix = 0;
while (ix != StringSearch.DONE) {
int earliestIndex = -1;
int earliestStart = -1;
for (int i = 0; i < stringSearches.length; ++i) {
StringSearch stringSearch = stringSearches[i];
int start = stringSearch.following(ix);
if (start != StringSearch.DONE && (earliestStart == -1 || start < earliestStart)) {
earliestIndex = i;
earliestStart = start;
}
}
if (earliestIndex == -1) {
// No more matches.
break;
}
int matchLength = stringSearches[earliestIndex].getMatchLength();
occurrences.add(new Utf16Span(earliestStart, earliestStart + matchLength));
ix = earliestStart + matchLength;
}
return occurrences;
}
/**
* Converts a UTF-16 code unit index to index of the grapheme that this code unit belongs to.
*
@ -449,18 +493,18 @@ public class Text_Utils {
}
/**
* Find all occurrences of needle in the haystack
* Find all occurrences of needle in the haystack, case-insensitively.
*
* @param haystack the string to search
* @param needle the substring that is searched for
* @param needles the substrings that are searched for
* @param locale the locale used for case-insensitive comparisons
* @return a list of extended-grapheme-cluster spans at which the needle occurs in the haystack
*/
public static List<GraphemeSpan> span_of_all_case_insensitive(
String haystack, String needle, Locale locale) {
String haystack, String needle, Locale locale) {
if (needle.isEmpty())
throw new IllegalArgumentException(
"The operation `span_of_all_case_insensitive` does not support searching for an empty term.");
"The operation `span_of_all_case_insensitive` does not support searching for an empty term.");
if (haystack.isEmpty()) return List.of();
CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
@ -477,6 +521,29 @@ public class Text_Utils {
return result;
}
  /**
   * Find spans of all occurrences of a set of needles within the haystack,
   * case-insensitively.
   *
   * @param haystack the string to search
   * @param needles the substrings that are searched for
   * @param locale the locale used for case-insensitive comparisons
   * @return a list of extended-grapheme-cluster spans at which any needle occurs in the haystack
   */
  public static List<GraphemeSpan> span_of_all_case_insensitive_multiple(
      String haystack, List<String> needles, Locale locale) {
    // Case-fold both the haystack and the needles, run the case-sensitive
    // multi-needle search on the folded text, then map each folded code-unit
    // span back to a grapheme span in the original haystack.
    CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
    List<String> foldedNeedles = IntStream.range(0, needles.size())
        .mapToObj(i -> CaseFoldedString.simpleFold(needles.get(i), locale))
        .collect(Collectors.toList());
    var foldedSpans = span_of_all_multiple(foldedHaystack.getFoldedString(), foldedNeedles);
    List<GraphemeSpan> occurrences =
        foldedSpans.stream()
            .map(span -> findExtendedSpan(foldedHaystack, span.codeunit_start, span.codeunit_end-span.codeunit_start))
            .collect(Collectors.toList());
    return occurrences;
  }
/**
* Finds the grapheme span corresponding to the found match indexed with code units.
*

View File

@ -274,7 +274,61 @@ spec =
utf_8_whitespace.split "\s+" use_regex=True . should_equal utf_8_whitespace_split
'abc def\tghi'.split '\\s+' use_regex=True . should_equal ["abc", "def", "ghi"]
Test.specify "examples should be correct" <|
Test.specify 'should be able to split with a vector of strings' <|
'azbzczdzezfzg'.split ['b', 'zez'] . should_equal ['az', 'zczd', 'fzg']
'a1b2c3d4e5f6g7h8'.split ['c', '5'] . should_equal ['a1b2', '3d4e', 'f6g7h8']
Test.specify 'should handle overlapping delimiters correctly' <|
'blah x 123'.split [' ', ' x ' , 'x'] . should_equal ['blah', '', '', '123']
'abcdef'.split ['bc', 'cd'] . should_equal ['a', 'def']
'abcdef'.split ['cd', 'bc'] . should_equal ['a', 'def']
'abcdef'.split ['bc', 'bcd'] . should_equal ['a', 'def']
'abcdef'.split ['bcd', 'bc'] . should_equal ['a', 'ef']
Test.specify 'should be able to split with a vector of strings, case insensitively' <|
'azBZczDZEZFzg'.split ['B', 'zez'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['az', 'ZczD', 'Fzg']
'blah X 123'.split [' ', ' x ' , 'x'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['blah', '', '', '123']
'A1B2C3D4E5F6G7H8'.split ['c', '5'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['A1B2', '3D4E', 'F6G7H8']
Test.specify 'should be able to split with a vector of strings, using regexes' <|
'a1b2c3d4e5f6g7h8'.split ['[cde]', '[456]'] use_regex=True . should_equal ['a1b2', '3', '', '', '', 'f', 'g7h8']
'abcde1fghij2klmnop'.split ["\d", '[hm]'] use_regex=True . should_equal ['abcde', 'fg', 'ij', 'kl', 'nop']
Test.specify "should handle unicode normalization the same for single and multiple delimiters" <|
'aśbs\u0301c'.split 'ś' . should_equal ['a', 'b', 'c']
'aśbs\u0301c'.split ['ś'] . should_equal ['a', 'b', 'c']
'aśbs\u0301c'.split 's\u0301' . should_equal ['a', 'b', 'c']
'aśbs\u0301c'.split ['s\u0301'] . should_equal ['a', 'b', 'c']
'aśbs\u0301cdef'.split ['ś', 'de'] . should_equal ['a', 'b', 'c', 'f']
Test.specify "should handle unicode normalization the same for single and multiple delimiters, case-insensitively" <|
'aśbS\u0301c'.split 'ś' case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c']
'aśbS\u0301c'.split ['ś'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c']
'aŚbS\u0301c'.split 's\u0301' case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c']
'aśbS\u0301c'.split ['s\u0301'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c']
'aŚbS\u0301cdef'.split ['ś', 'de'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c', 'f']
Test.specify "should handle splitting the same for the special case of a 1-element vector" <|
'abcdefgh'.split 'c' . should_equal ['ab', 'defgh']
'abcdefgh'.split ['c'] . should_equal ['ab', 'defgh']
'abcdefgh'.split ['c', 'q'] . should_equal ['ab', 'defgh']
Test.specify "should split on the leftmost delimiter in the case of a tie" <|
'abcdefgh'.split ['c', 'cd'] . should_equal ['ab', 'defgh']
'abcdefgh'.split ['cd', 'c'] . should_equal ['ab', 'efgh']
Test.specify "should throw Illegal_Argument for a bad or empty delimiter" <|
'abc'.split '' . should_fail_with Illegal_Argument
'abc'.split [] . should_fail_with Illegal_Argument
'abc'.split ['a', ''] . should_fail_with Illegal_Argument
'abc'.split 3 . should_fail_with Illegal_Argument
'abc'.split '' case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument
'abc'.split [] case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument
'abc'.split ['a', ''] case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument
'abc'.split 3 case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument
Test.specify "examples should be correct" <|
"Namespace::package::package::Type".split "::" . should_equal ["Namespace", "package", "package", "Type"]
"abc--def==>ghi".split "[-=>]+" use_regex=True . should_equal ["abc", "def", "ghi"]
'abc def\tghi'.split '\\s+' use_regex=True . should_equal ["abc", "def", "ghi"]