diff --git a/CHANGELOG.md b/CHANGELOG.md index 6db21559e4a..dd7575a3bc3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -113,6 +113,7 @@ encoding to `File.read_text`. New `File.read` API.][3390] - [Improved the `Range` type. Added a `down_to` counterpart to `up_to` and `with_step` allowing to change the range step.][3408] +- [Aligned `Text.split` API with other methods and added `Text.lines`.][3415] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -173,6 +174,7 @@ [3393]: https://github.com/enso-org/enso/pull/3393 [3390]: https://github.com/enso-org/enso/pull/3390 [3408]: https://github.com/enso-org/enso/pull/3408 +[3415]: https://github.com/enso-org/enso/pull/3415 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso index 9231780010b..b2e2e5ff4aa 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso @@ -10,7 +10,6 @@ import Standard.Base.Data.Text.Case import Standard.Base.Data.Text.Location import Standard.Base.Data.Text.Line_Ending_Style from Standard.Base.Data.Text.Span as Span_Module import Span -import Standard.Base.Data.Text.Split_Kind import Standard.Base.Data.Text.Text_Sub_Range from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Report_Warning @@ -22,7 +21,6 @@ from Standard.Builtins export Text export Standard.Base.Data.Text.Matching_Mode export Standard.Base.Data.Text.Case export Standard.Base.Data.Text.Location -export Standard.Base.Data.Text.Split_Kind export Standard.Base.Data.Text.Line_Ending_Style polyglot java import com.ibm.icu.lang.UCharacter @@ -348,82 +346,49 @@ Text.find pattern mode=Mode.All match_ascii=Nothing 
case_insensitive=Nothing dot ## ALIAS Split Text - Takes a separator and returns the vector that results from splitting `this` - on the configured number of occurrences of `separator`. + Takes a delimiter and returns the vector that results from splitting `this` + on each of its occurrences. Arguments: - - separator: The pattern used to split the text. - - mode: This argument specifies how many matches the engine will try and - find. When mode is set to either `Mode.First` or `Mode.Full`, this method - will return either a single `Text` or `Nothing`. If set to an `Integer` or - `Mode.All`, this method will return either a `Vector Text` or `Nothing`. - - match_ascii: Enables or disables pure-ASCII matching for the regex. If you - know your data only contains ASCII then you can enable this for a - performance boost on some regex engines. - - case_insensitive: Enables or disables case-insensitive matching. Case - insensitive matching behaves as if it normalises the case of all input - text before matching on it. - - dot_matches_newline: Enables or disables the dot matches newline option. - This specifies that the `.` special character should match everything - _including_ newline characters. Without this flag, it will match all - characters _except_ newlines. - - multiline: Enables or disables the multiline option. Multiline specifies - that the `^` and `$` pattern characters match the start and end of lines, - as well as the start and end of the input respectively. - - verbose: Enables or disables the verbose mode for the regular expression. - In verbose mode, the following changes apply: - - Whitespace within the pattern is ignored, except when within a - character class or when preceeded by an unescaped backslash, or within - grouping constructs (e.g. `(?...)`). - - When a line contains a `#`, that is not in a character class and is not - preceeded by an unescaped backslash, all characters from the leftmost - such `#` to the end of the line are ignored. 
That is to say, they act - as _comments_ in the regex. - - extra_opts: Specifies additional options in a vector. This allows options - to be supplied and computed without having to break them out into arguments - to the function. Where these overlap with one of the flags (`match_ascii`, - `case_insensitive`, `dot_matches_newline`, `multiline` and `verbose`), the - flags take precedence. - - ! Boolean Flags and Extra Options - This function contains a number of arguments that are boolean flags that - enable or disable common options for the regex. At the same time, it also - provides the ability to specify options in the `extra_opts` argument. - - Where one of the flags is _set_ (has the value `True` or `False`), the - value of the flag takes precedence over the value in `extra_opts` when - merging the options to the engine. The flags are _unset_ (have value - `Nothing`) by default. - - > Example - Split the comma-separated text into a vector of items. - - "ham,eggs,cheese,tomatoes".split "," - - > Example - Split the text on whitespace into a vector of items. - - "ham eggs cheese tomatoes".split Split_Kind.Whitespace + - delimiter: The pattern used to split the text. + - matcher: If a `Text_Matcher`, the text is compared using case-sensitivity + rules specified in the matcher. If a `Regex_Matcher`, the delimiter is used as a + regular expression and matched using the associated options. > Example Split the text on any occurrence of the separator `"::"`. 
example_split = text = "Namespace::package::package::Type" - text.split ":::" -Text.split : Split_Kind -> Mode.Mode -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Vector.Vector Option.Option -> Vector.Vector Text -Text.split separator=Split_Kind.Whitespace mode=Mode.All match_ascii=Nothing case_insensitive=Nothing dot_matches_newline=Nothing multiline=Nothing comments=Nothing extra_opts=[] = - case separator of - Split_Kind.Words -> Vector.Vector this.words - Split_Kind.Whitespace -> - pattern = Regex.compile "\s+" match_ascii=match_ascii case_insensitive=case_insensitive dot_matches_newline=dot_matches_newline multiline=multiline comments=comments extra_opts=extra_opts - pattern.split this mode=mode - Split_Kind.Lines -> - pattern = Regex.compile "\v+" match_ascii=match_ascii case_insensitive=case_insensitive dot_matches_newline=dot_matches_newline multiline=multiline comments=comments extra_opts=extra_opts - pattern.split this mode=mode - Text -> - pattern = Regex.compile separator match_ascii=match_ascii case_insensitive=case_insensitive dot_matches_newline=dot_matches_newline multiline=multiline comments=comments extra_opts=extra_opts - pattern.split this mode=mode + text.split "::" == ["Namespace", "package", "package", "Type"] + + > Example + Split the text on a regex pattern. + + "abc--def==>ghi".split "[-=>]+" Regex_Matcher == ["abc", "def", "ghi"] + + > Example + Split the text on any whitespace. 
+ + 'abc def\tghi'.split '\\s+' Regex_Matcher == ["abc", "def", "ghi"] +Text.split : Text -> (Text_Matcher | Regex_Matcher) -> Vector.Vector Text +Text.split delimiter="," matcher=Text_Matcher = if delimiter.is_empty then Error.throw (Illegal_Argument_Error "The delimiter cannot be empty.") else + case matcher of + Text_Matcher case_sensitivity -> + delimiters = Vector.Vector <| case case_sensitivity of + True -> + Text_Utils.span_of_all this delimiter + Case_Insensitive locale -> + Text_Utils.span_of_all_case_insensitive this delimiter locale.java_locale + Vector.new delimiters.length+1 i-> + start = if i == 0 then 0 else + delimiters.at i-1 . codeunit_end + end = if i == delimiters.length then (Text_Utils.char_length this) else + delimiters.at i . codeunit_start + Text_Utils.substring this start end + Regex_Matcher _ _ _ _ _ -> + compiled_pattern = matcher.compile delimiter + compiled_pattern.split this mode=Mode.All ## ALIAS Replace Text Replaces the first, last, or all occurrences of term with new_text in the @@ -547,7 +512,12 @@ Text.replace term="" new_text="" mode=Mode.All matcher=Text_Matcher = if term.is > Example Getting the words in the sentence "I have not one, but two cats." 
- "I have not one, but two cats.".words + "I have not one, but two cats.".words == ['I', 'have', 'not', 'one', ',', 'but', 'two', 'cats', '.'] + + > Example + Getting the words in the Thai sentence "แมวมีสี่ขา" + + "แมวมีสี่ขา".words == ['แมว', 'มี', 'สี่', 'ขา'] Text.words : Boolean -> Vector.Vector Text Text.words keep_whitespace=False = iterator = BreakIterator.getWordInstance @@ -559,9 +529,7 @@ Text.words keep_whitespace=False = build prev nxt = if nxt == -1 then Nothing else word = Text_Utils.substring this prev nxt word_not_whitespace = (Text_Utils.is_all_whitespace word).not - if word_not_whitespace then bldr.append word else - if keep_whitespace then - bldr.append word + if word_not_whitespace || keep_whitespace then bldr.append word next_nxt = iterator.next @Tail_Call build nxt next_nxt @@ -570,6 +538,33 @@ Text.words keep_whitespace=False = bldr.to_vector +## ALIAS Get Lines + + Splits the text into lines, based on '\n', '\r' or '\r\n' line endings. + + Empty lines are added for leading newlines. Multiple consecutive + newlines will also yield additional empty lines. A line ending at the end of + the text is not required, but if it is present it will not cause an empty + line to be added at the end. + + > Example + Split the text 'a\nb\nc' into lines. + + 'a\nb\nc'.lines == ['a', 'b', 'c'] + + > Example + Split the text '\na\n\nb\n\n\n' into lines. + + '\na\n\nb\n\n\n'.lines == ['', 'a', '', 'b', '', ''] + + > Example + Split the text '\na\nb\n' into lines, keeping the line endings. + + '\na\nb\n'.lines keep_endings=True == ['\n', 'a\n', 'b\n'] +Text.lines : Boolean -> Vector.Vector Text +Text.lines keep_endings=False = + Vector.Vector (Text_Utils.split_on_lines this keep_endings) + ## Checks whether `this` is equal to `that`. 
Arguments: diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex.enso index 0fa951ac211..00ef98632ed 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex.enso @@ -12,7 +12,6 @@ import Standard.Base.Data.Text.Regex.Engine import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine import Standard.Base.Data.Text.Regex.Mode import Standard.Base.Data.Text.Regex.Option -import Standard.Base.Data.Text.Split_Kind import Standard.Base.Data.Map import Standard.Base.Error.Extensions as Errors diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso index ef96da5c33e..8d1c548e930 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso @@ -476,7 +476,7 @@ type Pattern Mode_Error "Cannot match a negative number of times." mode + 1 - Mode.All -> 0 + Mode.All -> -1 Mode.Full -> Panic.throw <| Mode_Error "Splitting on a full match yields an empty text." Mode.Bounded _ _ _ -> Panic.throw <| diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Split_Kind.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Split_Kind.enso deleted file mode 100644 index 38210a9a08c..00000000000 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Split_Kind.enso +++ /dev/null @@ -1,17 +0,0 @@ -from Standard.Base import all - -## The type of split for splitting text. -type Split_Kind - - ## Split on unicode whitespace. - type Whitespace - - ## Split into lines. - type Lines - - ## Split into words. - type Words - - ## Split on a literal text value. 
- Text - diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso index 20300da37da..e0b17543a84 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso @@ -58,7 +58,7 @@ from project.Data.Range export Range Relevant issues: https://www.pivotaltracker.com/story/show/181403340 https://www.pivotaltracker.com/story/show/181309938 -from project.Data.Text.Extensions export Text, Split_Kind, Line_Ending_Style, Case, Location +from project.Data.Text.Extensions export Text, Line_Ending_Style, Case, Location from project.Data.Text.Matching export Case_Insensitive, Text_Matcher, Regex_Matcher from project.Error.Common export all from project.Error.Extensions export all diff --git a/distribution/lib/Standard/Searcher/0.0.0-dev/src/Data_Science/Text.enso b/distribution/lib/Standard/Searcher/0.0.0-dev/src/Data_Science/Text.enso index 0b8553c1f7c..25ad37b6bdd 100644 --- a/distribution/lib/Standard/Searcher/0.0.0-dev/src/Data_Science/Text.enso +++ b/distribution/lib/Standard/Searcher/0.0.0-dev/src/Data_Science/Text.enso @@ -8,7 +8,7 @@ > Example Split the text on whitespace into a vector of items. - "ham eggs cheese tomatoes".split Split_Kind.Whitespace + "ham eggs cheese tomatoes".split "\s+" Regex_Matcher > Example Getting the words in the sentence "I have not one, but two cats." diff --git a/project/DistributionPackage.scala b/project/DistributionPackage.scala index 4c54b9f4473..3796068f972 100644 --- a/project/DistributionPackage.scala +++ b/project/DistributionPackage.scala @@ -82,17 +82,6 @@ object DistributionPackage { } } - def downloadFileToLocation( - address: String, - location: File - ): File = { - val exitCode = (url(address) #> location).! 
- if (exitCode != 0) { - throw new RuntimeException(s"Downloading the file at $address failed.") - } - location - } - def executableName(baseName: String): String = if (Platform.isWindows) baseName + ".exe" else baseName @@ -154,7 +143,6 @@ object DistributionPackage { cacheFactory = cacheFactory.sub("engine-libraries"), log = log ) - getStdlibDataFiles(distributionRoot, targetStdlibVersion) copyDirectoryIncremental( file("distribution/bin"), @@ -238,19 +226,6 @@ object DistributionPackage { } - private def getStdlibDataFiles( - distributionRoot: File, - stdlibVersion: String - ): Unit = { - val exampleImageUrl = - "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e9/" + - "Hue_alpha_falloff.png/320px-Hue_alpha_falloff.png" - downloadFileToLocation( - exampleImageUrl, - distributionRoot / s"lib/Standard/Examples/$stdlibVersion/data/image.png" - ) - } - private def buildEngineManifest( template: File, destination: File, diff --git a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java index 862d2e769e2..a153f8d75f2 100644 --- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java +++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java @@ -6,7 +6,6 @@ import com.ibm.icu.text.CaseMap.Fold; import com.ibm.icu.text.Normalizer; import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.StringSearch; - import java.nio.Buffer; import java.nio.ByteBuffer; import java.nio.CharBuffer; @@ -30,10 +29,6 @@ import org.enso.base.text.Utf16Span; /** Utils for standard library operations on Text. 
*/ public class Text_Utils { - private static final Pattern whitespace = - Pattern.compile("\\s+", Pattern.UNICODE_CHARACTER_CLASS); - private static final Pattern vertical_space = - Pattern.compile("\\v+", Pattern.UNICODE_CHARACTER_CLASS); private static final String INVALID_CHARACTER = "\uFFFD"; /** @@ -62,7 +57,7 @@ public class Text_Utils { private static T resize(T old, IntFunction allocate, BiConsumer put) { int n = old.capacity(); - int new_n = 2*n + 1; + int new_n = 2 * n + 1; T o = allocate.apply(new_n); old.flip(); put.accept(o, old); @@ -81,14 +76,15 @@ public class Text_Utils { return new ResultWithWarnings<>(new byte[0]); } - CharsetEncoder encoder = charset.newEncoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT) - .reset(); + CharsetEncoder encoder = + charset + .newEncoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT) + .reset(); CharBuffer in = CharBuffer.wrap(str.toCharArray()); - ByteBuffer out = ByteBuffer.allocate( - (int)(in.remaining() * encoder.averageBytesPerChar())); + ByteBuffer out = ByteBuffer.allocate((int) (in.remaining() * encoder.averageBytesPerChar())); StringBuilder warnings = null; while (in.hasRemaining()) { @@ -153,38 +149,43 @@ public class Text_Utils { return str.codePoints().toArray(); } - /** - * Splits the string on each occurrence of {@code sep}, returning the resulting substrings in an - * array. - * - * @param str the string to split - * @param sep the separator string - * @return array of substrings of {@code str} contained between occurences of {@code sep} - */ - public static String[] split_by_literal(String str, String sep) { - return str.split(Pattern.quote(sep)); - } - - /** - * Splits the string on each occurrence of UTF-8 whitespace, returning the resulting substrings in - * an array. 
- * - * @param str the string to split - * @return the array of substrings of {@code str} - */ - public static String[] split_on_whitespace(String str) { - return whitespace.split(str); - } - /** - * Splits the string on each occurrence of UTF-8 vertical whitespace, returning the resulting - * substrings in an array. + * Splits the string into lines on each {@code \n}, {@code \r} or {@code \r\n} line ending, + * returning the resulting lines in a list. * * @param str the string to split + * @param keep_endings whether to keep line endings in returned lines - * @return the array of substrings of {@code str} + * @return the list of lines of {@code str} */ - public static String[] split_on_lines(String str) { - return vertical_space.split(str); + public static List split_on_lines(String str, boolean keep_endings) { + ArrayList acc = new ArrayList<>(); + int length = str.length(); + int currentStart = 0; + int currentPos = 0; + while (currentPos < length) { + if (str.charAt(currentPos) == '\n') { + acc.add(str.substring(currentStart, keep_endings ? currentPos + 1 : currentPos)); + currentStart = currentPos + 1; + currentPos = currentStart; + } else if (str.charAt(currentPos) == '\r') { + // Handle the '\r\n' digraph. + int offset = 1; + if (currentPos + 1 < length && str.charAt(currentPos + 1) == '\n') { + offset = 2; + } + acc.add(str.substring(currentStart, keep_endings ? 
currentPos + offset : currentPos)); + currentStart = currentPos + offset; + currentPos = currentStart; + } else { + currentPos += 1; + } + } + + if (currentStart < length) { + acc.add(str.substring(currentStart)); + } + + return acc; } /** @@ -241,14 +242,15 @@ public class Text_Utils { return new ResultWithWarnings<>(""); } - CharsetDecoder decoder = charset.newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT) - .reset(); + CharsetDecoder decoder = + charset + .newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT) + .reset(); ByteBuffer in = ByteBuffer.wrap(bytes); - CharBuffer out = CharBuffer.allocate( - (int)(bytes.length * decoder.averageCharsPerByte())); + CharBuffer out = CharBuffer.allocate((int) (bytes.length * decoder.averageCharsPerByte())); StringBuilder warnings = null; while (in.hasRemaining()) { diff --git a/test/Table_Tests/src/Csv_Spec.enso b/test/Table_Tests/src/Csv_Spec.enso index d1f4403a26b..f19a90dc832 100644 --- a/test/Table_Tests/src/Csv_Spec.enso +++ b/test/Table_Tests/src/Csv_Spec.enso @@ -77,7 +77,7 @@ spec = "This;Name;;Is""""Strange";20 Marcin,,;"hello;world" - expected = expected_wrong_newline.split Split_Kind.Lines . join '\r\n' + expected = expected_wrong_newline.lines . 
join '\r\n' res = t.to_csv separator=';' line_ending=Line_Ending_Style.Windows res.should_equal expected+'\r\n' diff --git a/test/Tests/src/Data/Text_Spec.enso b/test/Tests/src/Data/Text_Spec.enso index c7782468878..7f60056d2ee 100644 --- a/test/Tests/src/Data/Text_Spec.enso +++ b/test/Tests/src/Data/Text_Spec.enso @@ -3,7 +3,6 @@ from Standard.Base import all from Standard.Base.Data.Text.Extensions import Index_Out_Of_Bounds_Error import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine import Standard.Base.Data.Locale -import Standard.Base.Data.Text.Split_Kind from Standard.Base.Data.Text.Span as Span_Module import Span from Standard.Base.Data.Text.Text_Sub_Range import all import Standard.Base.Data.Text.Regex.Mode @@ -60,8 +59,6 @@ spec = accent_2 = '\u0065\u{301}' utf_8_whitespace = 'foo\n bar baz \u202F quux' utf_8_whitespace_split = ["foo", "bar", "baz", "quux"] - utf_8_vertical = 'foo\n bar \v baz \r quux' - utf_8_vertical_split = ["foo", " bar ", " baz ", " quux"] sentences = ''' I have a very long block of text, here. It goes on and on, containing things like decimal points (1.0314e3) and other language scripts as well @@ -155,17 +152,76 @@ spec = str.at 4 . should_fail_with Index_Out_Of_Bounds_Error Test.specify "should be able to split the text into words" <| + "I have not one, but two cats.".words . should_equal ['I', 'have', 'not', 'one', ',', 'but', 'two', 'cats', '.'] + "แมวมีสี่ขา".words . should_equal ['แมว', 'มี', 'สี่', 'ขา'] sentences.words . should_equal sentence_words + "I ❤️ Unicode! 🙂🙂".words . should_equal ['I', '❤️', 'Unicode', '!', '🙂', '🙂'] + '"แมวมีสี่ขา" means that a cat has four legs.'.words . should_equal ['"', 'แมว', 'มี', 'สี่', 'ขา', '"', 'means', 'that', 'a', 'cat', 'has', 'four', 'legs', '.'] - Test.specify "should be able to split the text on UTF-8 whitespace" <| - utf_8_whitespace.split . 
should_equal utf_8_whitespace_split + Test.specify "should be able to split the text into lines" <| + utf_8_vertical = 'foo\n bar \r\n baz \r quux' + utf_8_vertical_split = ["foo", " bar ", " baz ", " quux"] + utf_8_vertical.lines . should_equal utf_8_vertical_split - Test.specify "should be able to split the text on UTF-8 newlines" <| - utf_8_vertical.split Split_Kind.Lines . should_equal utf_8_vertical_split + 'a\nb\nc'.lines . should_equal ['a', 'b', 'c'] + '\na\n\nb\n\n\n'.lines . should_equal ['', 'a', '', 'b', '', ''] + '\na\nb\n'.lines keep_endings=True . should_equal ['\n', 'a\n', 'b\n'] + + '\n\n\n'.lines . should_equal ['', '', ''] + '\r\r\r'.lines . should_equal ['', '', ''] + '\r\n\r\n\r\n'.lines . should_equal ['', '', ''] + '\n\n\n'.lines keep_endings=True . should_equal ['\n', '\n', '\n'] + 'a\r\nb\n\rc'.lines keep_endings=True . should_equal ['a\r\n', 'b\n', '\r', 'c'] + 'a\r\nb\n\rc'.lines . should_equal ['a', 'b', '', 'c'] + 'abc'.lines . should_equal ['abc'] + 'abc\n'.lines . should_equal ['abc'] + 'abc\n'.lines keep_endings=True . should_equal ['abc\n'] + '\na'.lines . should_equal ['', 'a'] + + multiline = """ + Hello + world + multiline.lines . should_equal ['Hello', 'world'] + '🚀🚧\n\u{301}a\u{301}\r건반'.lines . should_equal ['🚀🚧', '\u{301}a\u{301}', '건반'] Test.specify "should be able to split the text on arbitrary text sequence" <| "foo, bar, baz" . split ", " . should_equal ["foo", "bar", "baz"] + text = "Namespace::package::package::Type" + text.split "::" . should_equal ["Namespace", "package", "package", "Type"] + "..a.b.c.d" . split "." . should_equal ["", "", "a", "b", "c", "d"] + "abc".split "." . should_equal ["abc"] + "aaa".split "a" . should_equal ["", "", "", ""] + ".a.".split "." . should_equal ["", "a", ""] + "".split "." . should_equal [""] + "abc[a-z]def".split "[a-z]" . should_equal ["abc", "def"] + 'aśbs\u{301}c'.split 'ś' . should_equal ['a', 'b', 'c'] + 'abc'.split '' . 
should_fail_with Illegal_Argument_Error + Test.specify "should be able to split the text on arbitrary text sequence, case-insensitively" <| + matcher = Text_Matcher Case_Insensitive + "AbCdABCDabDCba" . split "ab" matcher . should_equal ["", "Cd", "CD", "DCba"] + "abc".split "d" matcher . should_equal ["abc"] + "AAA".split "a" matcher . should_equal ["", "", "", ""] + "baB".split "b" matcher . should_equal ["", "a", ""] + "".split "a" matcher . should_equal [""] + 'aŚbS\u{301}c'.split 'ś' matcher . should_equal ['a', 'b', 'c'] + 'abc'.split '' matcher . should_fail_with Illegal_Argument_Error + + Test.specify "should be able to split the text on Regex patterns" <| + "cababdabe" . split "ab" Regex_Matcher . should_equal ["c", "", "d", "e"] + "cababdabe" . split "(ab)+" Regex_Matcher . should_equal ["c", "d", "e"] + "abc" . split "[a-z]" Regex_Matcher . should_equal ["", "", "", ""] + "abc--def==>ghi".split "[-=>]+" Regex_Matcher . should_equal ["abc", "def", "ghi"] + "abc".split "." Regex_Matcher . should_equal ["", "", "", ""] + "abc".split "d" Regex_Matcher . should_equal ["abc"] + ".a.".split "\." Regex_Matcher . should_equal ["", "a", ""] + "".split "a" Regex_Matcher . should_equal [""] + 'aśbs\u{301}c'.split 'ś' Regex_Matcher . should_equal ['a', 'b', 'c'] + 'abc'.split '' Regex_Matcher . should_fail_with Illegal_Argument_Error + + Test.specify "should be able to split the text on UTF-8 whitespace" <| + utf_8_whitespace.split "\s+" Regex_Matcher . should_equal utf_8_whitespace_split + 'abc def\tghi'.split '\\s+' Regex_Matcher . should_equal ["abc", "def", "ghi"] Test.specify "should convert any type to text automatically and using provided methods" <| t = Auto (Manual 123) . to_text @@ -1130,34 +1186,34 @@ spec = Test.group "Regex splitting" <| Test.specify "should be possible on text" <| - splits = "abcde".split "[bd]" + splits = "abcde".split "[bd]" Regex_Matcher splits.length . should_equal 3 splits.at 0 . should_equal "a" splits.at 1 . should_equal "c" splits.at 2 . 
should_equal "e" Test.specify "should be possible on unicode text" <| - match = "Korean: 건반 (hangul)".split " " + match = "Korean: 건반 (hangul)".split " " Regex_Matcher match.length . should_equal 3 match.at 0 . should_equal "Korean:" match.at 1 . should_equal "건반" match.at 2 . should_equal "(hangul)" Test.specify "should be possible in ascii mode" <| - splits = "İiİ".split "\w" match_ascii=True + splits = "İiİ".split "\w" (Regex_Matcher match_ascii=True) splits.length . should_equal 2 splits.at 0 . should_equal "İ" splits.at 1 . should_equal "İ" Test.specify "should be possible in case-insensitive mode" <| - splits = "abaBa".split "b" case_insensitive=True + splits = "abaBa".split "b" (Regex_Matcher case_sensitive=Case_Insensitive) splits.length . should_equal 3 splits.at 0 . should_equal "a" splits.at 1 . should_equal "a" splits.at 2 . should_equal "a" Test.specify "should be possible in dot_matches_newline mode" <| - splits = 'ab\nabcd'.split "b." dot_matches_newline=True + splits = 'ab\nabcd'.split "b." (Regex_Matcher dot_matches_newline=True) splits.length . should_equal 3 splits.at 0 . should_equal "a" splits.at 1 . should_equal "a" @@ -1167,11 +1223,11 @@ spec = text = """ Foo bar - match = text.split "$" multiline=True - match.length . should_equal 2 + match = text.split "$" (Regex_Matcher multiline=True) + match.length . should_equal 3 Test.specify "should be possible in comments mode" <| - splits = "abcde".split "[bd] # Split on the letters `b` and `d`" comments=True + splits = "abcde".split "[bd] # Split on the letters `b` and `d`" (Regex_Matcher comments=True) splits.length . should_equal 3 splits.at 0 . should_equal "a" splits.at 1 . should_equal "c"