Text.pad and Text.trim (#3309)

Implements https://www.pivotaltracker.com/story/show/181265516
2024-09-11 13:15:52 +03:00 · 2022-03-02 18:19:39 +01:00 · 2022-03-02 18:19:39 +01:00 · 40c851bf8b
commit 40c851bf8b
parent 738a691662
8 changed files with 216 additions and 22 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -58,6 +58,7 @@
 - [Implemented `Text.to_case`, replacing `Text.to_lower_case` and
  `Text.to_upper_case`][3302]
 - [Implemented initial `Table.group_by` function on Standard.Table][3305]
+- [Implemented `Text.pad` and `Text.trim`][3309]

 [debug-shortcuts]:
  https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -90,6 +91,7 @@
 [3292]: https://github.com/enso-org/enso/pull/3292
 [3302]: https://github.com/enso-org/enso/pull/3302
 [3305]: https://github.com/enso-org/enso/pull/3305
+[3309]: https://github.com/enso-org/enso/pull/3309

 #### Enso Compiler

--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
@ -6,6 +6,7 @@ from Standard.Builtins import Text, Prim_Text_Helpers
 import Standard.Base.Data.Text.Regex
 import Standard.Base.Data.Text.Regex.Mode
 import Standard.Base.Data.Text.Case
+import Standard.Base.Data.Text.Location
 import Standard.Base.Data.Text.Line_Ending_Style
 import Standard.Base.Data.Text.Split_Kind
 import Standard.Base.Data.Text.Text_Sub_Range
@ -15,6 +16,7 @@ import Standard.Base.Meta
 from Standard.Builtins export Text

 export Standard.Base.Data.Text.Case
+export Standard.Base.Data.Text.Location
 export Standard.Base.Data.Text.Split_Kind
 export Standard.Base.Data.Text.Line_Ending_Style

@ -487,7 +489,7 @@ Text.words keep_whitespace=False =

    build prev nxt = if nxt == -1 then Nothing else
        word = Text_Utils.substring this prev nxt
-        word_not_whitespace = (Text_Utils.is_whitespace word).not
+        word_not_whitespace = (Text_Utils.is_all_whitespace word).not
        if word_not_whitespace then bldr.append word else
            if keep_whitespace then
                bldr.append word
@ -620,12 +622,22 @@ Text.not_empty = this.is_empty.not
         "A0".is_digit == False
         "A0".is_digit 1 == True
         "건반(Korean)".is_digit 1 == False
-Text.is_digit : Integer -> Text ! Index_Out_Of_Bounds_Error
+Text.is_digit : Integer -> Boolean ! Index_Out_Of_Bounds_Error
 Text.is_digit (index=0) =
    grapheme = this.at index
-    if grapheme.is_error then grapheme else
-        char = (Text_Utils.get_chars grapheme).at 0
-        char>=48 && char<=57
+    char = (Text_Utils.get_chars grapheme).at 0
+    char>=48 && char<=57
+
+## Checks if the text consists only of whitespace characters.
+
+   > Example
+     Check if a text is whitespace only.
+
+         ' \t'.is_whitespace == True
+         "0 ".is_whitespace == False
+Text.is_whitespace : Boolean
+Text.is_whitespace =
+    Text_Utils.is_all_whitespace this

 ## Returns a vector containing bytes representing the UTF-8 encoding of the
   input text.
@ -940,8 +952,7 @@ Text.take range =
    char_range = case range of
        Range _ _ -> here.range_to_char_indices this range
        _ -> range.to_char_range this
-    if char_range.is_error then char_range else
-        Text_Utils.substring this char_range.start char_range.end
+    Text_Utils.substring this char_range.start char_range.end

 ## ALIAS skip, remove
   Creates a new Text by removing the specified range of the input.
@ -1022,3 +1033,95 @@ Text.to_case case_option=Case.Lower locale=Locale.Default = case case_option of
    Case.Lower -> UCharacter.toLowerCase locale.java_locale this
    Case.Upper -> UCharacter.toUpperCase locale.java_locale this
    Case.Title -> UCharacter.toTitleCase locale.java_locale this Nothing
+
+## Returns the input padded to the specified `length`, using the `with_pad`
+   string repeated at the start or the end.
+
+   Arguments:
+   - length: The new length for the output. The input is returned unchanged if
+     its length is already greater than or equal to `length`.
+   - with_pad: The non-empty string to use to pad the input (an empty one
+     yields an `Illegal_Argument_Error`). If the last repetition exceeds the
+     target length, it is truncated: padding at `End` keeps the beginning of
+     the padding string and padding at `Start` keeps its end.
+   - at: Where to pad the input: `Location.Start` or `Location.End` (default).
+
+   > Example
+     Padding a text with whitespace at the end.
+
+         "Hello World!".pad 15 == "Hello World!   "
+
+   > Example
+     Behavior of padding if the `with_pad` string has to be truncated.
+
+         "HELLO".pad 9 "AB" == "HELLOABAB"
+         "HELLO".pad 8 "AB" == "HELLOABA"
+         "HELLO".pad 8 "AB" Location.Start == "BABHELLO"
+
+Text.pad : Integer -> Text -> (Location.Start | Location.End) -> Text
+Text.pad length=0 with_pad=' ' at=Location.End =
+    with_pad_length = with_pad.length
+    if with_pad_length == 0 then Error.throw (Illegal_Argument_Error "`with_pad` must not be an empty string.") else
+        pad_size = length - this.length
+        if pad_size <= 0 then this else
+            full_repetitions = pad_size.div with_pad_length
+            remainder = pad_size % with_pad_length
+            case at of
+                Location.Start ->
+                    with_pad.take (Text_Sub_Range.Last remainder) + with_pad.repeat full_repetitions + this
+                Location.End ->
+                    this + with_pad.repeat full_repetitions + with_pad.take (Text_Sub_Range.First remainder)
+
+## This function removes the characters specified by `what`, by default any
+   whitespace, from the start, the end, or both ends of the input.
+
+   Arguments:
+   - where: The location of where to trim the input. By default, this function
+     trims both ends of the input.
+   - what: A Text containing characters that should be removed or a predicate
+     taking single character strings and specifying if they should be removed.
+     By default, any Unicode whitespace characters and all line terminator
+     characters are removed.
+
+   > Example
+     Trimming whitespace from a string.
+
+         " Hello! ".trim == "Hello!"
+         " Hello! ".trim Location.Start == "Hello! "
+         " Hello! ".trim Location.End == " Hello!"
+
+   > Example
+     Trimming a specific set of letters from a string.
+
+         "ABC123".trim Location.Start "ABC" == "123"
+         "ABBA123".trim Location.Start "ABC" == "123"
+Text.trim : (Location.Start | Location.End | Location.Both) -> (Text | (Text -> Boolean)) -> Text
+Text.trim where=Location.Both what=_.is_whitespace =
+    predicate = case what of
+        Text -> what.contains _
+        _ -> what
+    break_iterator = BreakIterator.getCharacterInstance
+    break_iterator.setText this
+    start_index = case where of
+        Location.End -> 0
+        _ ->
+            loop current next =
+                if next < 0 then current else
+                    case predicate (Text_Utils.substring this current next) of
+                        True ->
+                            @Tail_Call loop next break_iterator.next
+                        False -> current
+            loop 0 break_iterator.next
+    end_index = case where of
+        Location.Start -> Text_Utils.char_length this
+        _ ->
+            loop current prev =
+                if prev < 0 then current else
+                    case predicate (Text_Utils.substring this prev current) of
+                        True ->
+                            @Tail_Call loop prev break_iterator.previous
+                        False -> current
+            current = break_iterator.last
+            loop current break_iterator.previous
+    if start_index >= end_index then "" else
+        Text_Utils.substring this start_index end_index
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Location.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Location.enso
@ -0,0 +1,8 @@
+## Indicates the beginning of a text.
+type Start
+
+## Indicates the end of a text.
+type End
+
+## Indicates both the beginning and end of a text.
+type Both
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Text_Sub_Range.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Text_Sub_Range.enso
@ -72,10 +72,10 @@ type Text_Sub_Range
                    Range 0 (if start_index == -1 then (Text_Utils.char_length text) else start_index)
            Last count ->
                if count <= 0 then (Range 0 0) else
-                    first_count = text.length - count
                    iterator = BreakIterator.getCharacterInstance
                    iterator.setText text
-                    start_index = iterator.next first_count
+                    iterator.last
+                    start_index = iterator.next -count
                    Range (if start_index == -1 then 0 else start_index) (Text_Utils.char_length text)
            Before delimiter ->
                if delimiter.is_empty then (Range 0 0) else
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso
@ -54,7 +54,7 @@ from project.Data.Range export Range
   Relevant issues:
   https://www.pivotaltracker.com/story/show/181403340
   https://www.pivotaltracker.com/story/show/181309938
-from project.Data.Text.Extensions export Text, Split_Kind, Line_Ending_Style, Case
+from project.Data.Text.Extensions export Text, Split_Kind, Line_Ending_Style, Case, Location
 from project.Data.Text.Matching export Case_Insensitive, Text_Matcher, Regex_Matcher
 from project.Error.Common export all
 from project.Error.Extensions export all
--- a/distribution/lib/Standard/Test/0.0.0-dev/src/Main.enso
+++ b/distribution/lib/Standard/Test/0.0.0-dev/src/Main.enso
@ -238,7 +238,7 @@ Any.should_equal that frames_to_skip=0 = case this == that of
         import Standard.Test

         example_should_equal = Examples.add_1_to 1 . should_equal 2
-Error.should_equal : Any -> Assertion.
+Error.should_equal : Any -> Assertion
 Error.should_equal _ = Panic.throw (Matched_On_Error this)

 ## Asserts that `this` is within `epsilon` from `that`.
--- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java
+++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java
@ -1,5 +1,6 @@
 package org.enso.base;

+import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.text.Normalizer;
 import com.ibm.icu.text.Normalizer2;
 import com.ibm.icu.text.StringSearch;
@ -101,17 +102,6 @@ public class Text_Utils {
    return vertical_space.split(str);
  }

-  /**
-   * Checks if the provided string consists only of whitespace characters.
-   *
-   * @param str the string to check
-   * @return {@code true} if {@code str} is only whitespace, otherwise {@code false}
-   */
-  public static boolean is_whitespace(String str) {
-    var matcher = whitespace.matcher(str);
-    return matcher.matches();
-  }
-
  /**
   * Checks whether two strings are equal up to Unicode canonicalization.
   *
@ -252,4 +242,14 @@ public class Text_Utils {
  public static String normalize(String str) {
    return Normalizer2.getNFDInstance().normalize(str);
  }
+
+  /**
+   * Checks if the given string consists only of Unicode whitespace code points.
+   *
+   * @param text the string to check
+   * @return {@code true} if {@code text} is only whitespace (or empty), otherwise {@code false}
+   */
+  public static boolean is_all_whitespace(String text) {
+    return text.codePoints().allMatch(UCharacter::isUWhiteSpace);
+  }
 }
--- a/test/Tests/src/Data/Text_Spec.enso
+++ b/test/Tests/src/Data/Text_Spec.enso
@ -168,6 +168,7 @@ spec =
            "Hello World!".take (First 0) . should_equal ""
            "Hello World!".take Last.new . should_equal "!"
            "Hello World!".take (Last 6) . should_equal "World!"
+            "Hello World!".take (Last 0) . should_equal ""
            "Hello World!".take (Last 100) . should_equal "Hello World!"
            "Hello World!".take (Before " ") . should_equal "Hello"
            "Hello World!".take (Before "z") . should_equal "Hello World!"
@ -219,6 +220,7 @@ spec =
            '✨🚀🚧😍😃😎😙😉☺'.take First.new . should_equal '✨'
            '✨🚀🚧😍😃😎😙😉☺'.take (First 2) . should_equal '✨🚀'
            '✨🚀🚧😍😃😎😙😉☺'.take Last.new . should_equal '☺'
+            '✨🚀🚧😍😃😎😙😉☺'.take (Last 0) . should_equal ''
            '✨🚀🚧😍😃😎😙😉☺'.take (Last 3) . should_equal '😙😉☺'
            '✨🚀🚧😍😃😍😎😙😉☺'.take (Before '😍') . should_equal '✨🚀🚧'
            '✨🚀🚧😍😃😍😎😙😉☺'.take (Before_Last '😍') . should_equal '✨🚀🚧😍😃'
@ -344,6 +346,7 @@ spec =
            str.is_digit 2 . should_be_true
            str.is_digit 3 . should_be_true
            str.is_digit 4 . should_be_false
+            str.is_digit 5 . should_fail_with Index_Out_Of_Bounds_Error

        Test.specify "should be able to check by negative index if is a digit" <|
            str = kshi + "A12" + accent_2
@ -352,6 +355,16 @@ spec =
            str.is_digit -3 . should_be_true
            str.is_digit -4 . should_be_false
            str.is_digit -5 . should_be_false
+            str.is_digit -100 . should_fail_with Index_Out_Of_Bounds_Error
+
+        Test.specify "should be able to check if a text consists only of whitespace" <|
+            '  \t\n'.is_whitespace . should_be_true
+            'AB'.is_whitespace . should_be_false
+            '  A   '.is_whitespace . should_be_false
+
+            '\v\f\u{200a}\u{202f}\u{205F}\u{3000}\u{feff}'.is_whitespace . should_be_true
+            # The Unicode Zero Width Space is not considered whitespace
+            '\u{200b}'.is_whitespace . should_be_false

        Test.specify "should return a dataflow error when checking is digit for out of bounds" <|
            str = kshi + "A12" + accent_2
@ -627,6 +640,74 @@ spec =
            "foobar" . ends_with "" Regex_Matcher.new . should_be_true
            "" . ends_with "" Regex_Matcher.new . should_be_true

+        Test.specify "should allow to pad a text" <|
+            "Hello World!".pad 15 . should_equal "Hello World!   "
+            "HELLO".pad 9 "AB" . should_equal "HELLOABAB"
+            "HELLO".pad 8 "AB" . should_equal "HELLOABA"
+            "HELLO".pad 8 "AB" Location.Start . should_equal "BABHELLO"
+            "".pad 4 . should_equal "    "
+            "A".pad 3 "" . should_fail_with Illegal_Argument_Error
+            "ABCDE".pad 3 "" . should_fail_with Illegal_Argument_Error
+            "".pad 0 "" . should_fail_with Illegal_Argument_Error
+
+            "".pad 0 . should_equal ""
+            "ABC".pad 3 . should_equal "ABC"
+            "AB".pad -1 . should_equal "AB"
+            "ABC".pad -100 . should_equal "ABC"
+
+            'a\u{301}'.pad 2 . should_equal 'a\u{301} '
+            "".pad 2 'a\u{302}' . should_equal 'a\u{302}a\u{302}'
+            'XX'.pad 5 'yy\u{301}' . should_equal 'XXyy\u{301}y'
+            'XX'.pad 5 'y\u{301}y' . should_equal 'XXy\u{301}yy\u{301}'
+            'XX'.pad 4 'yy\u{301}Z' . should_equal 'XXyy\u{301}'
+
+            '🚀'.pad 3 'B' Location.End . should_equal '🚀BB'
+            '🚀'.pad 3 'B' Location.Start . should_equal 'BB🚀'
+
+            ## It is technically possible to use a combining diacritical mark as
+               the padding, then the actual length of the text will not increase
+               because all padding will still constitute a single grapheme
+               cluster.
+            'e'.pad 7 '\u{301}' . length . should_equal 1
+
+        Test.specify "should allow to trim a text" <|
+            " Hello! ".trim . should_equal  "Hello!"
+            " Hello! ".trim Location.Start . should_equal  "Hello! "
+            " Hello! ".trim Location.End . should_equal  " Hello!"
+            "ABC123".trim Location.Start "ABC" . should_equal  "123"
+            "ABBA123".trim Location.Start "ABC" . should_equal  "123"
+            "ABCZ-]".trim Location.Both "[A-Z]" . should_equal "BC"
+
+            "   ".trim . should_equal ""
+            "  Hello World!   ".trim . should_equal  "Hello World!"
+            "  Hello World!   ".trim Location.Start . should_equal  "Hello World!   "
+            "  Hello World!   ".trim Location.End . should_equal  "  Hello World!"
+            "ABCD".trim Location.Start "ABCDEF" . should_equal ""
+            "ABCD".trim Location.End "ABCDEF" . should_equal ""
+            "ABCD".trim Location.Both "ABCDEF" . should_equal ""
+
+            "".trim . should_equal ""
+            "A".trim . should_equal "A"
+            " A ".trim . should_equal "A"
+            '   A\u{301} \n   '.trim . should_equal 'A\u{301}'
+            "🚧".trim . should_equal "🚧"
+            "  🚧  🚧  ".trim . should_equal "🚧  🚧"
+            "  🚧  🚧  ".trim Location.End . should_equal "  🚧  🚧"
+
+            "ABCD".trim Location.Start (_ -> True) . should_equal ""
+            "ABCD".trim Location.Both (_ -> True) . should_equal ""
+            "ABCD".trim Location.Both (_ -> False) . should_equal "ABCD"
+            "123AB98".trim Location.Both _.is_digit . should_equal "AB"
+
+            ' \t\n\r'.trim . should_equal ''
+            '\t\t  Test\nFoo\r\n'.trim . should_equal 'Test\nFoo'
+            # Check various kinds of Unicode whitespace
+            '\v\f\u{200a}\u{202f}\u{205F}\u{3000}\u{feff}'.trim . should_equal ''
+
+            # A whitespace with an accent is not treated as whitespace anymore
+            '      \u{301}   '.trim . should_equal ' \u{301}'
+            ' \u{301}'.trim . should_equal ' \u{301}'
+
    Test.group "Regex matching" <|
        Test.specify "should be possible on text" <|
            match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Regex_Mode.First