Update Text.split to take a Vector Text parameter (#6156)
Allows you to pass a vector of delimiters to `split`.
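For example, with the new overload (mirroring the doc and test examples added below):

    'azbzczdzezfzg'.split ['b', 'zez'] == ['az', 'zczd', 'fzg']
    'a1b2c3d4e5f6g7h8'.split ['c', '5'] == ['a1b2', '3d4e', 'f6g7h8']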
parent b3e54aeb54
commit fb77f42fd5
@@ -374,6 +374,7 @@
- [Added `Text.tokenize`][6150]
- [Added support for Date/Time columns in the Postgres backend and added
  `year`/`month`/`day` operations to Table columns.][6153]
- [`Text.split` can now take a vector of delimiters.][6156]

[debug-shortcuts]:
  https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -566,6 +567,7 @@
[6116]: https://github.com/enso-org/enso/pull/6116
[6150]: https://github.com/enso-org/enso/pull/6150
[6153]: https://github.com/enso-org/enso/pull/6153
[6156]: https://github.com/enso-org/enso/pull/6156

#### Enso Compiler
@@ -40,6 +40,7 @@ polyglot java import com.ibm.icu.text.BreakIterator
polyglot java import java.lang.StringBuilder
polyglot java import org.enso.base.Text_Utils
polyglot java import org.enso.base.Encoding_Utils
polyglot java import org.enso.base.Regex_Utils

## Returns a new `Text` object with the characters in the reverse order of the input.
@@ -301,7 +302,6 @@ Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
   - delimiter: The pattern used to split the text.
   - case_sensitivity: Specifies if the text values should be compared case
     sensitively. The values are compared case sensitively by default.
   - only_first: If true, only replace the first match.
   - use_regex: If true, the term is used as a regular expression.

   > Example
@@ -319,25 +319,41 @@ Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
     Split the text on any whitespace.

         'abc def\tghi'.split '\\s+' use_regex=True == ["abc", "def", "ghi"]
Text.split : Text -> Case_Sensitivity -> Boolean -> Boolean -> Vector Text | Illegal_Argument
Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False = if delimiter.is_empty then Error.throw (Illegal_Argument.Error "The delimiter cannot be empty.") else
    case use_regex of
        False ->
            delimiters = Vector.from_polyglot_array <| case case_sensitivity of
                Case_Sensitivity.Sensitive ->
                    Text_Utils.span_of_all self delimiter
                Case_Sensitivity.Insensitive locale ->
                    Text_Utils.span_of_all_case_insensitive self delimiter locale.java_locale
            Vector.new delimiters.length+1 i->
                start = if i == 0 then 0 else
                    delimiters.at i-1 . codeunit_end
                end = if i == delimiters.length then (Text_Utils.char_length self) else
                    delimiters.at i . codeunit_start
                Text_Utils.substring self start end
        True ->
            case_insensitive = case_sensitivity.is_case_insensitive_in_memory
            compiled_pattern = Regex_2.compile delimiter case_insensitive=case_insensitive
            compiled_pattern.split self only_first
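# Worked example of the span-based reassembly in the `False` branch above: for
# 'a,b,c'.split ',' the delimiter spans are [1, 2) and [3, 4) in UTF-16 code
# units, so the generated segments are [0, 1), [2, 3) and [4, 5), i.e.
# ['a', 'b', 'c'].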

   > Example
     Split with a vector of strings.

         'azbzczdzezfzg'.split ['b', 'zez'] == ['az', 'zczd', 'fzg']
Text.split : Text | Vector Text -> Case_Sensitivity -> Boolean -> Vector Text | Illegal_Argument
Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive use_regex=False =
    delimiter_is_empty = case delimiter of
        _ : Text -> delimiter.is_empty
        _ : Vector -> delimiter.is_empty || delimiter.any (.is_empty)
        _ -> Error.throw (Illegal_Argument.Error "The delimiter must be a Text or Vector of Texts.")
    if delimiter_is_empty then Error.throw (Illegal_Argument.Error "The delimiter cannot be empty.") else
        delimiter_is_singleton_vector = case delimiter of
            _ : Vector -> delimiter.length == 1
            _ -> False
        # If it's a vector of one element, just call it on that one element.
        if delimiter_is_singleton_vector then self.split delimiter=(delimiter.first) case_sensitivity=case_sensitivity use_regex=use_regex else
            case use_regex of
                False ->
                    delimiters = split_find_delimiters self delimiter case_sensitivity
                    Vector.new delimiters.length+1 i->
                        start = if i == 0 then 0 else
                            delimiters.at i-1 . codeunit_end
                        end = if i == delimiters.length then (Text_Utils.char_length self) else
                            delimiters.at i . codeunit_start
                        Text_Utils.substring self start end
                True -> case delimiter of
                    _ : Text ->
                        case_insensitive = case_sensitivity.is_case_insensitive_in_memory
                        compiled_pattern = Regex_2.compile delimiter case_insensitive=case_insensitive
                        compiled_pattern.split self
                    _ : Vector ->
                        parenthesize s = "(?:" + s + ")"
                        combined_delimiter = parenthesize (delimiter.map parenthesize . join '|')
                        self.split combined_delimiter case_sensitivity=case_sensitivity use_regex=True
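# For example, with delimiter = ['b', 'zez'] the Vector branch above builds
# combined_delimiter == '(?:(?:b)|(?:zez))': each delimiter is wrapped in a
# non-capturing group and the groups are joined with '|', so the whole call
# reduces to a single regex split.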

## ADVANCED
   Takes an input string and a pattern and returns all the matches as a
@@ -1359,3 +1375,20 @@ slice_text text char_ranges =
    char_ranges.map char_range->
        sb.append text char_range.start char_range.end
    sb.toString

## PRIVATE

   Find occurrences of delimiters in a string.
split_find_delimiters : Text -> Text | Vector Text -> Case_Sensitivity -> Vector Text | Illegal_Argument
split_find_delimiters input delimiter case_sensitivity =
    Vector.from_polyglot_array <| case delimiter of
        _ : Text -> case case_sensitivity of
            Case_Sensitivity.Sensitive ->
                Text_Utils.span_of_all input delimiter
            Case_Sensitivity.Insensitive locale ->
                Text_Utils.span_of_all_case_insensitive input delimiter locale.java_locale
        _ : Vector -> case case_sensitivity of
            Case_Sensitivity.Sensitive ->
                Text_Utils.span_of_all_multiple input delimiter
            Case_Sensitivity.Insensitive locale ->
                Text_Utils.span_of_all_case_insensitive_multiple input delimiter locale.java_locale
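# For example (see the new tests below), the Vector + Insensitive branch is what
# makes 'azBZczDZEZFzg'.split ['B', 'zez'] case_sensitivity=Case_Sensitivity.Insensitive
# return ['az', 'ZczD', 'Fzg'].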

@@ -108,4 +108,8 @@ public class Regex_Utils {

    return result.toString();
  }

  public static String regexQuote(String pattern) {
    return pattern.replaceAll("[.*+?^${}()|\\[\\]]", "\\\\$0");
  }
}
@@ -9,6 +9,8 @@ import com.ibm.icu.text.StringSearch;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.enso.base.text.CaseFoldedString;
import org.enso.base.text.CaseFoldedString.Grapheme;
import org.enso.base.text.GraphemeSpan;
@@ -343,7 +345,7 @@ public class Text_Utils {
  public static List<Utf16Span> span_of_all(String haystack, String needle) {
    if (needle.isEmpty())
      throw new IllegalArgumentException(
-         "The operation `index_of_all` does not support searching for an empty term.");
+         "The operation `span_of_all` does not support searching for an empty term.");
    if (haystack.isEmpty()) return List.of();

    StringSearch search = new StringSearch(needle, haystack);
@@ -355,6 +357,48 @@ public class Text_Utils {
    return occurrences;
  }

  /**
   * Find spans of all occurrences of a set of needles within the haystack.
   *
   * @param haystack the string to search
   * @param needles the substrings that are searched for
   * @return a list of UTF-16 code unit spans at which the needle occurs in the haystack
   */
  public static List<Utf16Span> span_of_all_multiple(String haystack, List<String> needles) {
    if (needles.isEmpty() || needles.stream().anyMatch(String::isEmpty))
      throw new IllegalArgumentException(
          "The operation `span_of_all_multiple` does not support searching for an empty term.");
    if (haystack.isEmpty()) return List.of();

    StringSearch stringSearches[] = IntStream.range(0, needles.size())
        .mapToObj(i -> new StringSearch(needles.get(i), haystack))
        .toArray(StringSearch[]::new);
    List<Utf16Span> occurrences = new ArrayList<>();

    int ix = 0;
    while (ix != StringSearch.DONE) {
      int earliestIndex = -1;
      int earliestStart = -1;
      for (int i = 0; i < stringSearches.length; ++i) {
        StringSearch stringSearch = stringSearches[i];
        int start = stringSearch.following(ix);
        if (start != StringSearch.DONE && (earliestStart == -1 || start < earliestStart)) {
          earliestIndex = i;
          earliestStart = start;
        }
      }
      if (earliestIndex == -1) {
        // No more matches.
        break;
      }
      int matchLength = stringSearches[earliestIndex].getMatchLength();
      occurrences.add(new Utf16Span(earliestStart, earliestStart + matchLength));
      ix = earliestStart + matchLength;
    }

    return occurrences;
  }
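  // Worked example: for haystack "azbzczdzezfzg" and needles ["b", "zez"] the loop
  // above finds the spans [2, 3) and [7, 10); Text.split then keeps the text between
  // the spans, giving ["az", "zczd", "fzg"] as in the new doc example.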

  /**
   * Converts a UTF-16 code unit index to index of the grapheme that this code unit belongs to.
   *
@@ -449,18 +493,18 @@
  }

  /**
-   * Find all occurrences of needle in the haystack
+   * Find all occurrences of needle in the haystack, case-insensitively.
   *
   * @param haystack the string to search
-   * @param needle the substring that is searched for
+   * @param needles the substrings that are searched for
   * @param locale the locale used for case-insensitive comparisons
   * @return a list of extended-grapheme-cluster spans at which the needle occurs in the haystack
   */
  public static List<GraphemeSpan> span_of_all_case_insensitive(
      String haystack, String needle, Locale locale) {
    if (needle.isEmpty())
      throw new IllegalArgumentException(
          "The operation `span_of_all_case_insensitive` does not support searching for an empty term.");
    if (haystack.isEmpty()) return List.of();

    CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
@@ -477,6 +521,29 @@
    return result;
  }

  /**
   * Find spans of all occurrences of a set of needles within the haystack,
   * case-insensitively.
   *
   * @param haystack the string to search
   * @param needles the substrings that are searched for
   * @param locale the locale used for case-insensitive comparisons
   * @return a list of extended-grapheme-cluster spans at which the needles occur in the haystack
   */
  public static List<GraphemeSpan> span_of_all_case_insensitive_multiple(
      String haystack, List<String> needles, Locale locale) {
    CaseFoldedString foldedHaystack = CaseFoldedString.fold(haystack, locale);
    List<String> foldedNeedles = IntStream.range(0, needles.size())
        .mapToObj(i -> CaseFoldedString.simpleFold(needles.get(i), locale))
        .collect(Collectors.toList());
    var foldedSpans = span_of_all_multiple(foldedHaystack.getFoldedString(), foldedNeedles);
    List<GraphemeSpan> occurrences =
        foldedSpans.stream()
            .map(span -> findExtendedSpan(foldedHaystack, span.codeunit_start, span.codeunit_end - span.codeunit_start))
            .collect(Collectors.toList());
    return occurrences;
  }
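  // The case-insensitive variant works entirely in case-folded space: the haystack
  // and the needles are folded with the given locale, the folded haystack is searched
  // with span_of_all_multiple, and each folded span is mapped back to a grapheme span
  // in the original string. This is what lets, for example,
  // 'aśbS\u0301c'.split ['s\u0301'] case_sensitivity=Case_Sensitivity.Insensitive
  // return ['a', 'b', 'c'] in the new tests.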

  /**
   * Finds the grapheme span corresponding to the found match indexed with code units.
   *

@@ -274,7 +274,61 @@ spec =
            utf_8_whitespace.split "\s+" use_regex=True . should_equal utf_8_whitespace_split
            'abc def\tghi'.split '\\s+' use_regex=True . should_equal ["abc", "def", "ghi"]

Test.specify "exmples should be correct" <|
|
||||
        Test.specify 'should be able to split with a vector of strings' <|
            'azbzczdzezfzg'.split ['b', 'zez'] . should_equal ['az', 'zczd', 'fzg']
            'a1b2c3d4e5f6g7h8'.split ['c', '5'] . should_equal ['a1b2', '3d4e', 'f6g7h8']

        Test.specify 'should handle overlapping delimiters correctly' <|
            'blah x 123'.split [' ', ' x ', 'x'] . should_equal ['blah', '', '', '123']
            'abcdef'.split ['bc', 'cd'] . should_equal ['a', 'def']
            'abcdef'.split ['cd', 'bc'] . should_equal ['a', 'def']
            'abcdef'.split ['bc', 'bcd'] . should_equal ['a', 'def']
            'abcdef'.split ['bcd', 'bc'] . should_equal ['a', 'ef']

        Test.specify 'should be able to split with a vector of strings, case insensitively' <|
            'azBZczDZEZFzg'.split ['B', 'zez'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['az', 'ZczD', 'Fzg']
            'blah X 123'.split [' ', ' x ', 'x'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['blah', '', '', '123']
            'A1B2C3D4E5F6G7H8'.split ['c', '5'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['A1B2', '3D4E', 'F6G7H8']

        Test.specify 'should be able to split with a vector of strings, using regexes' <|
            'a1b2c3d4e5f6g7h8'.split ['[cde]', '[456]'] use_regex=True . should_equal ['a1b2', '3', '', '', '', 'f', 'g7h8']
            'abcde1fghij2klmnop'.split ["\d", '[hm]'] use_regex=True . should_equal ['abcde', 'fg', 'ij', 'kl', 'nop']

        Test.specify "should handle unicode normalization the same for single and multiple delimiters" <|
            'aśbs\u0301c'.split 'ś' . should_equal ['a', 'b', 'c']
            'aśbs\u0301c'.split ['ś'] . should_equal ['a', 'b', 'c']
            'aśbs\u0301c'.split 's\u0301' . should_equal ['a', 'b', 'c']
            'aśbs\u0301c'.split ['s\u0301'] . should_equal ['a', 'b', 'c']
            'aśbs\u0301cdef'.split ['ś', 'de'] . should_equal ['a', 'b', 'c', 'f']

        Test.specify "should handle unicode normalization the same for single and multiple delimiters, case-insensitively" <|
            'aśbS\u0301c'.split 'ś' case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c']
            'aśbS\u0301c'.split ['ś'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c']
            'aŚbS\u0301c'.split 's\u0301' case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c']
            'aśbS\u0301c'.split ['s\u0301'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c']
            'aŚbS\u0301cdef'.split ['ś', 'de'] case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c', 'f']

        Test.specify "should handle splitting the same for the special case of a 1-element vector" <|
            'abcdefgh'.split 'c' . should_equal ['ab', 'defgh']
            'abcdefgh'.split ['c'] . should_equal ['ab', 'defgh']
            'abcdefgh'.split ['c', 'q'] . should_equal ['ab', 'defgh']

        Test.specify "should split on the leftmost delimiter in the case of a tie" <|
            'abcdefgh'.split ['c', 'cd'] . should_equal ['ab', 'defgh']
            'abcdefgh'.split ['cd', 'c'] . should_equal ['ab', 'efgh']

        Test.specify "should throw Illegal_Argument for a bad or empty delimiter" <|
            'abc'.split '' . should_fail_with Illegal_Argument
            'abc'.split [] . should_fail_with Illegal_Argument
            'abc'.split ['a', ''] . should_fail_with Illegal_Argument
            'abc'.split 3 . should_fail_with Illegal_Argument

            'abc'.split '' case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument
            'abc'.split [] case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument
            'abc'.split ['a', ''] case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument
            'abc'.split 3 case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument

        Test.specify "examples should be correct" <|
            "Namespace::package::package::Type".split "::" . should_equal ["Namespace", "package", "package", "Type"]
            "abc--def==>ghi".split "[-=>]+" use_regex=True . should_equal ["abc", "def", "ghi"]
            'abc def\tghi'.split '\\s+' use_regex=True . should_equal ["abc", "def", "ghi"]