Text.pad and Text.trim (#3309)

Implements https://www.pivotaltracker.com/story/show/181265516
This commit is contained in:
Radosław Waśko 2022-03-02 18:19:39 +01:00 committed by GitHub
parent 738a691662
commit 40c851bf8b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 216 additions and 22 deletions

View File

@ -58,6 +58,7 @@
- [Implemented `Text.to_case`, replacing `Text.to_lower_case` and
`Text.to_upper_case`][3302]
- [Implemented initial `Table.group_by` function on Standard.Table][3305]
- [Implemented `Text.pad` and `Text.trim`][3309]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -90,6 +91,7 @@
[3292]: https://github.com/enso-org/enso/pull/3292
[3302]: https://github.com/enso-org/enso/pull/3302
[3305]: https://github.com/enso-org/enso/pull/3305
[3309]: https://github.com/enso-org/enso/pull/3309
#### Enso Compiler

View File

@ -6,6 +6,7 @@ from Standard.Builtins import Text, Prim_Text_Helpers
import Standard.Base.Data.Text.Regex
import Standard.Base.Data.Text.Regex.Mode
import Standard.Base.Data.Text.Case
import Standard.Base.Data.Text.Location
import Standard.Base.Data.Text.Line_Ending_Style
import Standard.Base.Data.Text.Split_Kind
import Standard.Base.Data.Text.Text_Sub_Range
@ -15,6 +16,7 @@ import Standard.Base.Meta
from Standard.Builtins export Text
export Standard.Base.Data.Text.Case
export Standard.Base.Data.Text.Location
export Standard.Base.Data.Text.Split_Kind
export Standard.Base.Data.Text.Line_Ending_Style
@ -487,7 +489,7 @@ Text.words keep_whitespace=False =
build prev nxt = if nxt == -1 then Nothing else
word = Text_Utils.substring this prev nxt
word_not_whitespace = (Text_Utils.is_whitespace word).not
word_not_whitespace = (Text_Utils.is_all_whitespace word).not
if word_not_whitespace then bldr.append word else
if keep_whitespace then
bldr.append word
@ -620,12 +622,22 @@ Text.not_empty = this.is_empty.not
"A0".is_digit == False
"A0".is_digit 1 == True
"건반(Korean)".is_digit 1 == False
Text.is_digit : Integer -> Text ! Index_Out_Of_Bounds_Error
Text.is_digit : Integer -> Boolean ! Index_Out_Of_Bounds_Error
Text.is_digit (index=0) =
grapheme = this.at index
if grapheme.is_error then grapheme else
char = (Text_Utils.get_chars grapheme).at 0
char>=48 && char<=57
char = (Text_Utils.get_chars grapheme).at 0
char>=48 && char<=57
## Checks if the text consists only of whitespace characters.

   The check is delegated to the Java helper `Text_Utils.is_all_whitespace`,
   which is applied to the whole text.
   NOTE(review): an empty text presumably yields `True`, since the helper is
   expected to test "all characters are whitespace" - confirm with a test.

   > Example
     Check if a text is whitespace only.

         ' \t'.is_whitespace == True
         "0 ".is_whitespace == False
Text.is_whitespace : Boolean
Text.is_whitespace =
    Text_Utils.is_all_whitespace this
## Returns a vector containing bytes representing the UTF-8 encoding of the
input text.
@ -940,8 +952,7 @@ Text.take range =
char_range = case range of
Range _ _ -> here.range_to_char_indices this range
_ -> range.to_char_range this
if char_range.is_error then char_range else
Text_Utils.substring this char_range.start char_range.end
Text_Utils.substring this char_range.start char_range.end
## ALIAS skip, remove
Creates a new Text by removing the specified range of the input.
@ -1022,3 +1033,95 @@ Text.to_case case_option=Case.Lower locale=Locale.Default = case case_option of
Case.Lower -> UCharacter.toLowerCase locale.java_locale this
Case.Upper -> UCharacter.toUpperCase locale.java_locale this
Case.Title -> UCharacter.toTitleCase locale.java_locale this Nothing
## Returns the input padded to the specified `length`, using the `with_pad`
   string repeated at the start or the end.

   Arguments:
   - length: The target length of the output. If the input is already at
     least this long, it is returned unchanged.
   - with_pad: The string to use to pad the input. If the last repetition
     exceeds the target length, it is truncated to the required size. If padding
     at the `End`, the beginning of the padding string is used and if padding at
     `Start`, the end of the string is used. Defaults to a single space.
   - at: The location of where to pad the input, either `Location.Start` or
     `Location.End` (the default).

   Fails with an `Illegal_Argument_Error` if `with_pad` is empty. This is
   checked first, so it is reported even when no padding would be needed.

   Lengths are measured with `Text.length`, both for the input and for
   `with_pad`.

   > Example
     Padding a text with whitespace at the end.

         "Hello World!".pad 15 == "Hello World!   "

   > Example
     Behavior of padding if the `with_pad` string has to be truncated.

         "HELLO".pad 9 "AB" == "HELLOABAB"
         "HELLO".pad 8 "AB" == "HELLOABA"
         "HELLO".pad 8 "AB" Location.Start == "BABHELLO"
Text.pad : Integer -> Text -> (Location.Start | Location.End) -> Text
Text.pad length=0 with_pad=' ' at=Location.End =
    with_pad_length = with_pad.length
    # An empty pad could never fill the gap (and would divide by zero below).
    if with_pad_length == 0 then Error.throw (Illegal_Argument_Error "`with_pad` must not be an empty string.") else
        # How much is still missing to reach the requested length.
        pad_size = length - this.length
        if pad_size <= 0 then this else
            # The padding is made of whole copies of `with_pad` plus a
            # truncated copy covering the remainder.
            full_repetitions = pad_size.div with_pad_length
            remainder = pad_size % with_pad_length
            case at of
                Location.Start ->
                    # The truncated copy goes first and keeps the *end* of
                    # `with_pad`, so the padding reads continuously up to the
                    # original text.
                    with_pad.take (Text_Sub_Range.Last remainder) + with_pad.repeat full_repetitions + this
                Location.End ->
                    # The truncated copy goes last and keeps the *beginning*
                    # of `with_pad`.
                    this + with_pad.repeat full_repetitions + with_pad.take (Text_Sub_Range.First remainder)
## Removes the characters selected by `what`, by default any whitespace, from
   the start, the end, or both ends of the input.

   Arguments:
   - where: The location of where to trim the input: `Location.Start`,
     `Location.End` or `Location.Both`. By default, this function trims both
     ends of the input.
   - what: A Text containing characters that should be removed or a predicate
     taking single character strings and specifying if they should be removed.
     When a Text is given, a character is removed if that Text contains it; the
     Text is not interpreted as a pattern. By default, the predicate
     `_.is_whitespace` is used, which removes Unicode whitespace characters,
     including line terminators.

   Trimming proceeds one character boundary at a time (as reported by a
   `BreakIterator` character instance) and stops at the first character for
   which the predicate does not hold.

   > Example
     Trimming whitespace from a string.

         " Hello! ".trim == "Hello!"
         " Hello! ".trim Location.Start == "Hello! "
         " Hello! ".trim Location.End == " Hello!"

   > Example
     Trimming a specific set of letters from a string.

         "ABC123".trim Location.Start "ABC" == "123"
         "ABBA123".trim Location.Start "ABC" == "123"
Text.trim : (Location.Start | Location.End | Location.Both) -> (Text | (Text -> Boolean)) -> Text
Text.trim where=Location.Both what=_.is_whitespace =
    # Normalize `what` into a predicate: a Text becomes "is this character
    # contained in `what`", anything else is used as the predicate directly.
    predicate = case what of
        Text -> what.contains _
        _ -> what
    break_iterator = BreakIterator.getCharacterInstance
    break_iterator.setText this
    # Index of the first character that is kept.
    start_index = case where of
        Location.End -> 0
        _ ->
            # Walk forward over boundaries; `current`..`next` delimits the
            # character under test. A negative `next` means the iterator has
            # run out of boundaries.
            loop current next =
                if next < 0 then current else
                    case predicate (Text_Utils.substring this current next) of
                        True ->
                            @Tail_Call loop next break_iterator.next
                        False -> current
            loop 0 break_iterator.next
    # Index just past the last character that is kept.
    end_index = case where of
        Location.Start -> Text_Utils.char_length this
        _ ->
            # Mirror image of the forward scan: `prev`..`current` delimits the
            # character under test, walking backward from the end.
            loop current prev =
                if prev < 0 then current else
                    case predicate (Text_Utils.substring this prev current) of
                        True ->
                            @Tail_Call loop prev break_iterator.previous
                        False -> current
            # Reposition the shared iterator at the end of the text; the
            # forward scan above may have left it anywhere.
            current = break_iterator.last
            loop current break_iterator.previous
    # The two scans are independent, so when every character matches they
    # cross over (start past end); the result is then empty.
    if start_index >= end_index then "" else
        Text_Utils.substring this start_index end_index

View File

@ -0,0 +1,8 @@
## Indicates the beginning of a text.

   Accepted by `Text.pad` (as `at`) and `Text.trim` (as `where`).
type Start

## Indicates the end of a text.

   Accepted by `Text.pad` (as `at`) and `Text.trim` (as `where`).
type End

## Indicates both the beginning and end of a text.

   Accepted by `Text.trim` (as `where`), where it is the default. The
   signature of `Text.pad` lists only `Start` and `End`.
type Both

View File

@ -72,10 +72,10 @@ type Text_Sub_Range
Range 0 (if start_index == -1 then (Text_Utils.char_length text) else start_index)
Last count ->
if count <= 0 then (Range 0 0) else
first_count = text.length - count
iterator = BreakIterator.getCharacterInstance
iterator.setText text
start_index = iterator.next first_count
iterator.last
start_index = iterator.next -count
Range (if start_index == -1 then 0 else start_index) (Text_Utils.char_length text)
Before delimiter ->
if delimiter.is_empty then (Range 0 0) else

View File

@ -54,7 +54,7 @@ from project.Data.Range export Range
Relevant issues:
https://www.pivotaltracker.com/story/show/181403340
https://www.pivotaltracker.com/story/show/181309938
from project.Data.Text.Extensions export Text, Split_Kind, Line_Ending_Style, Case
from project.Data.Text.Extensions export Text, Split_Kind, Line_Ending_Style, Case, Location
from project.Data.Text.Matching export Case_Insensitive, Text_Matcher, Regex_Matcher
from project.Error.Common export all
from project.Error.Extensions export all

View File

@ -238,7 +238,7 @@ Any.should_equal that frames_to_skip=0 = case this == that of
import Standard.Test
example_should_equal = Examples.add_1_to 1 . should_equal 2
Error.should_equal : Any -> Assertion.
Error.should_equal : Any -> Assertion
Error.should_equal _ = Panic.throw (Matched_On_Error this)
## Asserts that `this` is within `epsilon` from `that`.

View File

@ -1,5 +1,6 @@
package org.enso.base;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.StringSearch;
@ -101,17 +102,6 @@ public class Text_Utils {
return vertical_space.split(str);
}
/**
* Checks if the provided string consists only of whitespace characters.
*
* @param str the string to check
* @return {@code true} if {@code str} is only whitespace, otherwise {@code false}
*/
public static boolean is_whitespace(String str) {
var matcher = whitespace.matcher(str);
return matcher.matches();
}
/**
* Checks whether two strings are equal up to Unicode canonicalization.
*
@ -252,4 +242,14 @@ public class Text_Utils {
public static String normalize(String str) {
return Normalizer2.getNFDInstance().normalize(str);
}
/**
 * Checks if the given string consists only of whitespace characters.
 *
 * <p>Every Unicode code point of the string is tested with {@link UCharacter#isUWhiteSpace(int)}.
 * An empty string yields {@code true}, because {@code allMatch} over an empty stream is
 * vacuously true.
 *
 * @param str the string to check
 * @return {@code true} if {@code str} is only whitespace, otherwise {@code false}
 */
public static boolean is_all_whitespace(String str) {
  return str.codePoints().allMatch(UCharacter::isUWhiteSpace);
}
}

View File

@ -168,6 +168,7 @@ spec =
"Hello World!".take (First 0) . should_equal ""
"Hello World!".take Last.new . should_equal "!"
"Hello World!".take (Last 6) . should_equal "World!"
"Hello World!".take (Last 0) . should_equal ""
"Hello World!".take (Last 100) . should_equal "Hello World!"
"Hello World!".take (Before " ") . should_equal "Hello"
"Hello World!".take (Before "z") . should_equal "Hello World!"
@ -219,6 +220,7 @@ spec =
'✨🚀🚧😍😃😎😙😉☺'.take First.new . should_equal '✨'
'✨🚀🚧😍😃😎😙😉☺'.take (First 2) . should_equal '✨🚀'
'✨🚀🚧😍😃😎😙😉☺'.take Last.new . should_equal '☺'
'✨🚀🚧😍😃😎😙😉☺'.take (Last 0) . should_equal ''
'✨🚀🚧😍😃😎😙😉☺'.take (Last 3) . should_equal '😙😉☺'
'✨🚀🚧😍😃😍😎😙😉☺'.take (Before '😍') . should_equal '✨🚀🚧'
'✨🚀🚧😍😃😍😎😙😉☺'.take (Before_Last '😍') . should_equal '✨🚀🚧😍😃'
@ -344,6 +346,7 @@ spec =
str.is_digit 2 . should_be_true
str.is_digit 3 . should_be_true
str.is_digit 4 . should_be_false
str.is_digit 5 . should_fail_with Index_Out_Of_Bounds_Error
Test.specify "should be able to check by negative index if is a digit" <|
str = kshi + "A12" + accent_2
@ -352,6 +355,16 @@ spec =
str.is_digit -3 . should_be_true
str.is_digit -4 . should_be_false
str.is_digit -5 . should_be_false
str.is_digit -100 . should_fail_with Index_Out_Of_Bounds_Error
Test.specify "should be able to check if a text consists only of whitespace" <|
' \t\n'.is_whitespace . should_be_true
'AB'.is_whitespace . should_be_false
' A '.is_whitespace . should_be_false
'\v\f\u{200a}\u{202f}\u{205F}\u{3000}\u{feff}'.is_whitespace . should_be_true
# The Unicode Zero Width Space is not considered whitespace
'\u{200b}'.is_whitespace . should_be_false
Test.specify "should return a dataflow error when checking is digit for out of bounds" <|
str = kshi + "A12" + accent_2
@ -627,6 +640,74 @@ spec =
"foobar" . ends_with "" Regex_Matcher.new . should_be_true
"" . ends_with "" Regex_Matcher.new . should_be_true
Test.specify "should allow to pad a text" <|
"Hello World!".pad 15 . should_equal "Hello World! "
"HELLO".pad 9 "AB" . should_equal "HELLOABAB"
"HELLO".pad 8 "AB" . should_equal "HELLOABA"
"HELLO".pad 8 "AB" Location.Start . should_equal "BABHELLO"
"".pad 4 . should_equal " "
"A".pad 3 "" . should_fail_with Illegal_Argument_Error
"ABCDE".pad 3 "" . should_fail_with Illegal_Argument_Error
"".pad 0 "" . should_fail_with Illegal_Argument_Error
"".pad 0 . should_equal ""
"ABC".pad 3 . should_equal "ABC"
"AB".pad -1 . should_equal "AB"
"ABC".pad -100 . should_equal "ABC"
'a\u{301}'.pad 2 . should_equal 'a\u{301} '
"".pad 2 'a\u{302}' . should_equal 'a\u{302}a\u{302}'
'XX'.pad 5 'yy\u{301}' . should_equal 'XXyy\u{301}y'
'XX'.pad 5 'y\u{301}y' . should_equal 'XXy\u{301}yy\u{301}'
'XX'.pad 4 'yy\u{301}Z' . should_equal 'XXyy\u{301}'
'🚀'.pad 3 'B' Location.End . should_equal '🚀BB'
'🚀'.pad 3 'B' Location.Start . should_equal 'BB🚀'
## It is technically possible to use a combining diacritical mark as
the padding, then the actual length of the text will not increase
because all padding will still constitute a single grapheme
cluster.
'e'.pad 7 '\u{301}' . length . should_equal 1
Test.specify "should allow to trim a text" <|
" Hello! ".trim . should_equal "Hello!"
" Hello! ".trim Location.Start . should_equal "Hello! "
" Hello! ".trim Location.End . should_equal " Hello!"
"ABC123".trim Location.Start "ABC" . should_equal "123"
"ABBA123".trim Location.Start "ABC" . should_equal "123"
"ABCZ-]".trim Location.Both "[A-Z]" . should_equal "BC"
" ".trim . should_equal ""
" Hello World! ".trim . should_equal "Hello World!"
" Hello World! ".trim Location.Start . should_equal "Hello World! "
" Hello World! ".trim Location.End . should_equal " Hello World!"
"ABCD".trim Location.Start "ABCDEF" . should_equal ""
"ABCD".trim Location.End "ABCDEF" . should_equal ""
"ABCD".trim Location.Both "ABCDEF" . should_equal ""
"".trim . should_equal ""
"A".trim . should_equal "A"
" A ".trim . should_equal "A"
' A\u{301} \n '.trim . should_equal 'A\u{301}'
"🚧".trim . should_equal "🚧"
" 🚧 🚧 ".trim . should_equal "🚧 🚧"
" 🚧 🚧 ".trim Location.End . should_equal " 🚧 🚧"
"ABCD".trim Location.Start (_ -> True) . should_equal ""
"ABCD".trim Location.Both (_ -> True) . should_equal ""
"ABCD".trim Location.Both (_ -> False) . should_equal "ABCD"
"123AB98".trim Location.Both _.is_digit . should_equal "AB"
' \t\n\r'.trim . should_equal ''
'\t\t Test\nFoo\r\n'.trim . should_equal 'Test\nFoo'
# Check various kinds of Unicode whitespace
'\v\f\u{200a}\u{202f}\u{205F}\u{3000}\u{feff}'.trim . should_equal ''
# A whitespace with an accent is not treated as whitespace anymore
' \u{301} '.trim . should_equal ' \u{301}'
' \u{301}'.trim . should_equal ' \u{301}'
Test.group "Regex matching" <|
Test.specify "should be possible on text" <|
match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Regex_Mode.First