Add replace_text method to In-Memory Table (#3793)

Implements https://www.pivotaltracker.com/n/projects/2539304/stories/183415329
2024-11-22 22:10:15 +03:00 · 2022-10-14 17:42:29 +02:00 · 2022-10-14 17:42:29 +02:00 · ce6267f098
commit ce6267f098
parent 5873af88c5
6 changed files with 147 additions and 9 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -209,6 +209,7 @@
 - [Added `Filter_Condition` to `Vector`, `Range` and `List`.][3770]
 - [Extended `Filter_Condition` with `Is_Empty`, `Not_Empty`, `Like` and
  `Not_Like`.][3775]
+- [Implemented `Table.replace_text` for in-memory table.][3793]

 [debug-shortcuts]:
  https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -335,6 +336,7 @@
 [3750]: https://github.com/enso-org/enso/pull/3750
 [3770]: https://github.com/enso-org/enso/pull/3770
 [3775]: https://github.com/enso-org/enso/pull/3775
+[3793]: https://github.com/enso-org/enso/pull/3793

 #### Enso Compiler

--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
@ -385,9 +385,9 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter
   - new_text: The new text to replace occurrences of `term` with.
     If `matcher` is a `Regex_Matcher`, `new_text` can include replacement
     patterns (such as `$<n>`) for a marked group.
-   - mode: Specifies which instances of term the engine tries to find. When the
-     mode is `First` or `Last`, this method replaces the first or last instance
-     of term in the input. If set to `All`, it replaces all instances of term in
+   - mode: Specifies which occurrences of term the engine tries to find. When the
+     mode is `First` or `Last`, this method replaces the first or last occurrence
+     of term in the input. If set to `All`, it replaces all occurrences of term in
     the input.
   - matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
     rules specified in the matcher. If a `Regex_Matcher`, the term is used as a
--- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso
@ -12,6 +12,8 @@ import project.Data.Column_Selector.Column_Selector
 import project.Data.Data_Formatter.Data_Formatter
 import project.Data.Match_Columns.Match_Columns
 import project.Data.Position.Position
+import project.Data.Storage.Storage
+import project.Data.Value_Type.Value_Type
 import project.Data.Sort_Column_Selector.Sort_Column_Selector
 import project.Data.Sort_Column.Sort_Column
 import project.Data.Aggregate_Column.Aggregate_Column
@ -21,10 +23,11 @@ import project.Internal.Parse_Values_Helper
 import project.Internal.Problem_Builder.Problem_Builder
 import project.IO.Auto_Detect.Auto_Detect

+from project.Data.Column import get_item_string
 from project.Data.Column_Type_Selection import Column_Type_Selection, Auto
 from project.Delimited.Delimited_Format import Delimited
 from project.Internal.Filter_Condition_Helpers import make_filter_column
-from project.Errors import Missing_Input_Columns, Column_Indexes_Out_Of_Range, Duplicate_Type_Selector, No_Index_Set_Error, No_Such_Column_Error, No_Such_Column_Error_Data, No_Input_Columns_Selected, No_Output_Columns
+from project.Errors import Missing_Input_Columns, Column_Indexes_Out_Of_Range, Duplicate_Type_Selector, No_Index_Set_Error, No_Such_Column_Error, No_Such_Column_Error_Data, No_Input_Columns_Selected, No_Output_Columns, Invalid_Value_Type

 import Standard.Visualization

@ -112,7 +115,7 @@ type Table
        display_rows = Math.min num_rows show_rows
        rows = Vector.new display_rows row_num->
            cols = col_vals.map col->
-                if col.isNa row_num then "Nothing" else Column.get_item_string col row_num
+                if col.isNa row_num then "Nothing" else get_item_string col row_num
            [index.ilocString row_num] + cols
        table = print_table col_names rows 1 format_terminal
        if num_rows - display_rows <= 0 then table else
@ -726,6 +729,87 @@ type Table
        result = Table.new new_columns
        on_problems.attach_problems_after result problem_builder.to_vector

+    ## Replaces the first, last, or all occurrences of `term` with
+       `new_text` in each text row of selected columns.
+       If `term` is empty, the function returns the table unchanged.
+
+       This method follows the exact replacement semantics of the
+       `Text.replace` method.
+
+       Arguments:
+       - columns: Column selection criteria or a column name or index.
+       - term: The term to find.
+       - new_text: The new text to replace occurrences of `term` with.
+         If `matcher` is a `Regex_Matcher`, `new_text` can include replacement
+         patterns (such as `$<n>`) for a marked group.
+       - mode: Specifies which occurrences of term the engine tries to find. When
+         the mode is `First` or `Last`, this method replaces the first or last
+         occurrence of term in each individual table cell. If set to `All`, it
+         replaces all occurrences of term.
+       - matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
+         rules specified in the matcher. If a `Regex_Matcher`, the term is used as a
+         regular expression and matched using the associated options.
+       - on_problems: Specifies how to handle problems if they occur, reporting
+         them as warnings by default.
+
+         The following problems can occur:
+         - If a column in columns is not in the input table, a `Missing_Input_Columns`.
+         - If duplicate columns, names or indices are provided, a
+           `Duplicate_Column_Selectors`.
+         - If a column index is out of range, a `Column_Indexes_Out_Of_Range`.
+         - If two distinct indices refer to the same column, an
+           `Input_Indices_Already_Matched`, with the column included the first
+           time it is matched.
+         - If a column in `columns` has a storage type other than `Text` or
+           `Any` (so it cannot contain any text values), an
+           `Invalid_Value_Type`; such a column is left unchanged.
+
+       > Example
+         Replace dashes with underscores in a column named "variable_names".
+
+             table.replace_text "variable_names" "-" "_"
+
+       > Example
+         Remove leading and trailing spaces from cells in multiple columns.
+
+             table.replace_text (Column_Selector.By_Name ["foo", "bar"]) "^\s*(.*?)\s*$" "$1" matcher=Regex_Matcher.Regex_Matcher_Data
+
+       > Example
+         Replace texts in quotes with parentheses in column at index 1.
+
+             table.replace_text 1 '"(.*?)"' '($1)' matcher=Regex_Matcher.Regex_Matcher_Data
+    replace_text : (Text | Integer | Column_Selector) -> Text -> Text -> Matching_Mode | Regex_Mode -> (Text_Matcher | Regex_Matcher) -> Problem_Behavior -> Table
+    replace_text self columns=(Column_Selector.By_Index [0]) term="" new_text="" mode=Regex_Mode.All matcher=Text_Matcher.Case_Sensitive on_problems=Problem_Behavior.Report_Warning = if term.is_empty then self else
+        problem_builder = Problem_Builder.new
+        
+        selector = case columns of
+            _ : Column_Selector -> columns
+            name : Text -> Column_Selector.By_Name [name]
+            index : Integer -> Column_Selector.By_Index [index]
+        selection = Table_Helpers.select_columns_helper self.columns selector reorder=False problem_builder
+        selected_names = Map.from_vector (selection.map column-> [column.name, True])
+
+        map_preserve_name column f = column.map f . rename column.name
+        do_replace = _.replace term new_text mode matcher
+        do_replace_only_text = case _ of
+                item : Text -> do_replace item
+                item -> item
+
+        transform column = case column.storage_type of
+            Storage.Text -> map_preserve_name column do_replace
+            Storage.Any -> map_preserve_name column do_replace_only_text
+            _ ->
+                problem = Invalid_Value_Type.Invalid_Value_Type_Data Value_Type.Char column.value_type
+                problem_builder.report_other_warning problem
+                column
+
+        new_columns = self.columns.map column->
+            is_selected = selected_names.get_or_else column.name False
+            if is_selected then transform column else column 
+            
+        result = Table.new new_columns
+        problem_builder.attach_problems_after on_problems result
+      
    ## ALIAS Filter Rows

       Selects only the rows of this table that correspond to `True` values of
--- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Table_Helpers.enso
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Table_Helpers.enso
@ -363,7 +363,7 @@ type Column_Transform_Element
    Value column associated_selector

 ## PRIVATE
-prepare_order_by : Vector -> Problem_Builder -> Vector Column_Transform_Element
+prepare_order_by : Vector -> Vector Text | Sort_Column_Selector -> Problem_Builder -> Vector Column_Transform_Element
 prepare_order_by internal_columns column_selectors problem_builder =
    selected_elements = case column_selectors of
        _ : Vector.Vector ->
--- a/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/DocParserDef.scala
+++ b/lib/scala/syntax/definition/src/main/scala/org/enso/syntax/text/spec/DocParserDef.scala
@ -252,7 +252,7 @@ case class DocParserDef() extends Parser[Doc] {
      logger.trace {
        var listOfFormattedAST: List[Elem] = Nil
        while (
-          result.stack.head != Elem.Formatter(typ) && result.stack.nonEmpty
+          result.stack.nonEmpty && result.stack.head != Elem.Formatter(typ)
        ) {
          result.pop()
          result.current match {
--- a/test/Table_Tests/src/Table_Spec.enso
+++ b/test/Table_Tests/src/Table_Spec.enso
@ -4,14 +4,14 @@ from Standard.Base.Error.Problem_Behavior import Report_Error
 from Standard.Table import Table, Column, Sort_Column, Column_Selector, Sort_Column_Selector, Aggregate_Column
 from Standard.Table.Data.Aggregate_Column.Aggregate_Column import all hiding First, Last
 from Standard.Table.Data.Table import Empty_Error
-from Standard.Table.Errors import Invalid_Output_Column_Names_Data, Duplicate_Output_Column_Names_Data, No_Input_Columns_Selected, Missing_Input_Columns_Data, No_Such_Column_Error_Data
 from Standard.Table.Data.Storage import Storage
-from Standard.Table.Errors import Floating_Point_Grouping_Data
+from Standard.Table.Errors import Invalid_Output_Column_Names_Data, Duplicate_Output_Column_Names_Data, No_Input_Columns_Selected, Missing_Input_Columns_Data, No_Such_Column_Error_Data, Floating_Point_Grouping_Data, Invalid_Value_Type

 import Standard.Visualization

 import Standard.Test
 import Standard.Test.Problems
+import Standard.Table.Data.Value_Type.Value_Type

 import project.Common_Table_Spec
 from project.Util import all
@ -715,6 +715,58 @@ spec =
            problems = [Duplicate_Output_Column_Names_Data ["A", "A", "A"]]
            Problems.test_problem_handling action problems tester

+    Test.group "Table.replace_text" <|
+        Test.specify "should replace text in full-text table columns" <|
+            bools = ["bools", [False, False, True, True, False]]
+            texts = ["texts", ["foo", "foo", "bar", "baz", "spam"]]
+            table = Table.new [bools, texts]
+            actual = table.replace_text "texts" "a" "o"
+            actual.at "bools" . to_vector . should_equal [False, False, True, True, False]
+            actual.at "texts" . to_vector . should_equal ["foo", "foo", "bor", "boz", "spom"]
+            Problems.assume_no_problems actual
+
+        Test.specify "should replace text in mixed columns" <|
+            bools = ["bools", [False, False, True, True, False]]
+            mixed = ["mixed", ["foo", 5, "bar", False, "spam"]]
+            table = Table.new [bools, mixed]
+            actual = table.replace_text "mixed" "a" "o"
+            actual.at "bools" . to_vector . should_equal [False, False, True, True, False]
+            actual.at "mixed" . to_vector . should_equal ["foo", 5, "bor", False, "spom"]
+            Problems.assume_no_problems actual
+
+        Test.specify "should support operating on multiple columns at once" <|
+            bools = ["bools", [False, False, True]]
+            texts1 = ["texts1", ["foo", "bar", "baz"]]
+            texts2 = ["texts2", ["baz", "quux", "spam"]]
+            table = Table.new [bools, texts1, texts2]
+            actual = table.replace_text (Column_Selector.By_Name ["texts1", "texts2"]) "a" "o"
+            actual.at "bools"  . to_vector . should_equal [False, False, True]
+            actual.at "texts1" . to_vector . should_equal ["foo", "bor", "boz"]
+            actual.at "texts2" . to_vector . should_equal ["boz", "quux", "spom"]
+            Problems.assume_no_problems actual
+
+        Test.specify "should support regex replacement" <|
+            bools = ["bools", [False, False, True, True]]
+            texts = ["texts", ["foo", "bar", "baz", "spam"]]
+            table = Table.new [bools, texts]
+            actual = table.replace_text "texts" "(a|o)" "$1e" matcher=Regex_Matcher.Regex_Matcher_Data
+            actual.at "texts" . to_vector . should_equal ["foeoe", "baer", "baez", "spaem"]
+            Problems.assume_no_problems actual
+
+        Test.specify 'should return warnings and errors when passed a non-existent column' <|
+            table = Table.new [["bools", [False, True]], ["texts", ["foo", "bar"]]]
+            action = table.replace_text "invalid_name" "a" "b" on_problems=_
+            tester = _.should_equal table
+            problems = [Missing_Input_Columns_Data ['invalid_name']]
+            Problems.test_problem_handling action problems tester
+
+        Test.specify "should return warnings and errors when selected non-text column" <|
+            table = Table.new [["bools", [False, True]], ["texts", ["foo", "bar"]]]
+            action = table.replace_text "bools" "a" "b" on_problems=_
+            tester = _.should_equal table
+            problems = [Invalid_Value_Type.Invalid_Value_Type_Data Value_Type.Char Value_Type.Boolean]
+            Problems.test_problem_handling action problems tester
+
    Test.group "[In-Memory] Table.aggregate" <|
        Test.specify "should return columns with correct types" <|
            dates = ["dates", [Date.new 1999, Date.new 2000, Date.new 2000, Date.new 2000]]