Add newlines option to text cleanse/replace (#10761)

* Auto-commit work in progress before clean build on 2024-08-06 11:32:46

* Fixed Regex and additional test

* changelog

* .

* Make non-capturing
This commit is contained in:
AdRiley 2024-08-06 16:59:54 +01:00 committed by GitHub
parent f0de43a970
commit 0f688d0a25
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 28 additions and 2 deletions

View File

@ -6,9 +6,11 @@
comparisons.][10614]
- [Relative paths are now resolved relative to the project location, also in the
Cloud.][10660]
- [Added Newlines option to Text_Cleanse/Text_Replace.][10761]
[10614]: https://github.com/enso-org/enso/pull/10614
[10660]: https://github.com/enso-org/enso/pull/10660
[10761]: https://github.com/enso-org/enso/pull/10761
# Enso 2023.3

View File

@ -553,6 +553,7 @@ Text.replace self term:(Text | Regex) replacement:Text (case_sensitivity:Case_Se
- ..Trailing_Whitespace: Removes all whitespace from the end of the string.
- ..Duplicate_Whitespace: Removes all duplicate whitespace from the string replacing it with the first whitespace character of the duplicated block.
- ..All_Whitespace: Removes all whitespace from the string.
- ..Newlines: Removes all newline characters from the string. Line Feed and Carriage Return characters are considered newlines.
- ..Leading_Numbers: Removes all numbers from the start of the string.
- ..Trailing_Numbers: Removes all numbers from the end of the string.
- ..Non_ASCII: Removes all non-ascii characters from the string.

View File

@ -14,6 +14,9 @@ type Named_Pattern
## Matches one or more whitespace characters anywhere in a string.
All_Whitespace
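## Matches a single newline sequence: a Carriage Return, a Line Feed, or a Carriage Return immediately followed by a Line Feed (treated as one match).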
Newlines
## Matches one or more digits at the beginning of a string.
Leading_Numbers
@ -44,6 +47,7 @@ type Named_Pattern
Named_Pattern.Trailing_Whitespace -> "\s+$"
Named_Pattern.Duplicate_Whitespace -> "(?<=\s)\s+"
Named_Pattern.All_Whitespace -> "\s+"
Named_Pattern.Newlines -> "(?:\r\n?|\n)"
Named_Pattern.Leading_Numbers -> "^\d+"
Named_Pattern.Trailing_Numbers -> "\d+$"
Named_Pattern.Non_ASCII -> "[^\x00-\x7F]"

View File

@ -127,7 +127,7 @@ make_format_chooser include_number:Boolean=True include_date:Boolean=True includ
Creates a Multiple_Choice Widget for selecting data cleanse operations.
make_data_cleanse_vector_selector : Display -> Widget
make_data_cleanse_vector_selector display:Display=Display.Always =
patterns = ['Leading_Whitespace', 'Trailing_Whitespace', 'Duplicate_Whitespace', 'All_Whitespace', 'Leading_Numbers', 'Trailing_Numbers', 'Non_ASCII', 'Tabs', 'Letters', 'Numbers', 'Punctuation', 'Symbols']
patterns = ['Leading_Whitespace', 'Trailing_Whitespace', 'Duplicate_Whitespace', 'All_Whitespace', "Newlines", 'Leading_Numbers', 'Trailing_Numbers', 'Non_ASCII', 'Tabs', 'Letters', 'Numbers', 'Punctuation', 'Symbols']
options = patterns.map f-> Option f (".." + f)
Widget.Multiple_Choice values=options display=display
@ -139,7 +139,7 @@ make_any_selector display:Display=..Always add_text:Boolean=False add_regex:Bool
if add_text then builder.append (Option "<Text Value>" '""')
if add_regex then builder.append (Option "<Regular Expression>" '(regex "^$")')
if add_named_pattern then
patterns = ["Leading_Whitespace", "Trailing_Whitespace", "All_Whitespace", "Leading_Numbers", "Trailing_Numbers", "Non_ASCII", "Tabs", "Letters", "Numbers", "Punctuation", "Symbols"]
patterns = ["Leading_Whitespace", "Trailing_Whitespace", "All_Whitespace", "Newlines", "Leading_Numbers", "Trailing_Numbers", "Non_ASCII", "Tabs", "Letters", "Numbers", "Punctuation", "Symbols"]
patterns.each p-> builder.append (Option "<"+p+">" "Named_Pattern."+p)
if add_number then builder.append (Option "<Number Value>" "0")
if add_boolean then builder.append (Option "<True/False>" "True")

View File

@ -1484,6 +1484,7 @@ type DB_Column
- ..Trailing_Whitespace: Removes all whitespace from the end of the string.
- ..Duplicate_Whitespace: Removes all duplicate whitespace from the string replacing it with the first whitespace character of the duplicated block.
- ..All_Whitespace: Removes all whitespace from the string.
- ..Newlines: Removes all newline characters from the string. Line Feed and Carriage Return characters are considered newlines.
- ..Leading_Numbers: Removes all numbers from the start of the string.
- ..Trailing_Numbers: Removes all numbers from the end of the string.
- ..Non_ASCII: Removes all non-ascii characters from the string.

View File

@ -2951,6 +2951,7 @@ type DB_Table
- ..Trailing_Whitespace: Removes all whitespace from the end of the string.
- ..Duplicate_Whitespace: Removes all duplicate whitespace from the string replacing it with the first whitespace character of the duplicated block.
- ..All_Whitespace: Removes all whitespace from the string.
- ..Newlines: Removes all newline characters from the string. Line Feed and Carriage Return characters are considered newlines.
- ..Leading_Numbers: Removes all numbers from the start of the string.
- ..Trailing_Numbers: Removes all numbers from the end of the string.
- ..Non_ASCII: Removes all non-ascii characters from the string.

View File

@ -1483,6 +1483,7 @@ type Column
- ..Trailing_Whitespace: Removes all whitespace from the end of the string.
- ..Duplicate_Whitespace: Removes all duplicate whitespace from the string replacing it with the first whitespace character of the duplicated block.
- ..All_Whitespace: Removes all whitespace from the string.
- ..Newlines: Removes all newline characters from the string. Line Feed and Carriage Return characters are considered newlines.
- ..Leading_Numbers: Removes all numbers from the start of the string.
- ..Trailing_Numbers: Removes all numbers from the end of the string.
- ..Non_ASCII: Removes all non-ascii characters from the string.

View File

@ -3002,6 +3002,7 @@ type Table
- ..Trailing_Whitespace: Removes all whitespace from the end of the string.
- ..Duplicate_Whitespace: Removes all duplicate whitespace from the string replacing it with the first whitespace character of the duplicated block.
- ..All_Whitespace: Removes all whitespace from the string.
- ..Newlines: Removes all newline characters from the string. Line Feed and Carriage Return characters are considered newlines.
- ..Leading_Numbers: Removes all numbers from the start of the string.
- ..Trailing_Numbers: Removes all numbers from the end of the string.
- ..Non_ASCII: Removes all non-ascii characters from the string.

View File

@ -1658,6 +1658,11 @@ add_specs suite_builder =
expected = "XItXwasXtheXbestXofXtimesXitXwasXtheXworstXofXtimesX"
res = input.replace Named_Pattern.All_Whitespace "X"
res.should_equal expected
input_with_newlines = 'It was\r the best of times\n it was the worst\r\n of times'
group_builder.specify "should replace newlines" <|
expected = 'It wasX the best of timesX it was the worstX of times'
res = input_with_newlines.replace Named_Pattern.Newlines "X"
res.should_equal expected
input2 = "1922 It was the best of times it was the worst of times 1804"
group_builder.specify "should replace leading numbers" <|
expected = "X It was the best of times it was the worst of times 1804"
@ -1727,6 +1732,11 @@ add_specs suite_builder =
expected = "Itwasthebestoftimesitwastheworstoftimes"
res = input.cleanse [Named_Pattern.All_Whitespace]
res.should_equal expected
input_with_newlines = 'It was\r the best of times\n it was the worst\r\n of times'
group_builder.specify "should remove newlines" <|
expected = 'It was the best of times it was the worst of times'
res = input_with_newlines.cleanse [Named_Pattern.Newlines]
res.should_equal expected
input2 = "1922 It was the best of times it was the worst of times 1804"
group_builder.specify "should remove leading numbers" <|
expected = " It was the best of times it was the worst of times 1804"

View File

@ -90,6 +90,11 @@ add_specs suite_builder setup =
expected_col = Column.from_vector "Test" ["Itwas", "thebest", "oftimes", "itwastheworstoftimes"]
res = test_col.text_cleanse [..All_Whitespace]
res.should_equal expected_col
test_col_with_newlines = Column.from_vector "Test" ['\nIt was', 'the best\r', '\rof times\n', '\r\nit was \r\n the worst of times\n\r']
# Checks that cleansing with ..Newlines strips CR, LF and CRLF sequences and
# nothing else: ordinary spaces must survive. The last element therefore keeps
# both spaces that surrounded the removed '\r\n' ("it was" + 2 spaces + "the").
group_builder.specify "should remove newlines" <|
    expected_col = Column.from_vector "Test" ["It was", "the best", "of times", "it was  the worst of times"]
    res = test_col_with_newlines.text_cleanse [..Newlines]
    res.should_equal expected_col
test_col_with_nums = Column.from_vector "Test" ["1It was", "the best2", "3of times4", " 1984 it was the worst of times 72"]
group_builder.specify "should remove leading numbers" <|
expected_col = Column.from_vector "Test" ["It was", "the best2", "of times4", " 1984 it was the worst of times 72"]