Add replace_text method to In-Memory Table (#3793)

Implements https://www.pivotaltracker.com/n/projects/2539304/stories/183415329
This commit is contained in:
Paweł Grabarz 2022-10-14 17:42:29 +02:00 committed by GitHub
parent 5873af88c5
commit ce6267f098
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 147 additions and 9 deletions

View File

@ -209,6 +209,7 @@
- [Added `Filter_Condition` to `Vector`, `Range` and `List`.][3770]
- [Extended `Filter_Condition` with `Is_Empty`, `Not_Empty`, `Like` and
`Not_Like`.][3775]
- [Implemented `Table.replace_text` for in-memory table.][3793]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -335,6 +336,7 @@
[3750]: https://github.com/enso-org/enso/pull/3750
[3770]: https://github.com/enso-org/enso/pull/3770
[3775]: https://github.com/enso-org/enso/pull/3775
[3793]: https://github.com/enso-org/enso/pull/3793
#### Enso Compiler

View File

@ -385,9 +385,9 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter
- new_text: The new text to replace occurrences of `term` with.
If `matcher` is a `Regex_Matcher`, `new_text` can include replacement
patterns (such as `$<n>`) for a marked group.
- mode: Specifies which instances of term the engine tries to find. When the
mode is `First` or `Last`, this method replaces the first or last instance
of term in the input. If set to `All`, it replaces all instances of term in
- mode: Specifies which occurrences of term the engine tries to find. When the
mode is `First` or `Last`, this method replaces the first or last occurrence
of term in the input. If set to `All`, it replaces all occurrences of term in
the input.
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
rules specified in the matcher. If a `Regex_Matcher`, the term is used as a

View File

@ -12,6 +12,8 @@ import project.Data.Column_Selector.Column_Selector
import project.Data.Data_Formatter.Data_Formatter
import project.Data.Match_Columns.Match_Columns
import project.Data.Position.Position
import project.Data.Storage.Storage
import project.Data.Value_Type.Value_Type
import project.Data.Sort_Column_Selector.Sort_Column_Selector
import project.Data.Sort_Column.Sort_Column
import project.Data.Aggregate_Column.Aggregate_Column
@ -21,10 +23,11 @@ import project.Internal.Parse_Values_Helper
import project.Internal.Problem_Builder.Problem_Builder
import project.IO.Auto_Detect.Auto_Detect
from project.Data.Column import get_item_string
from project.Data.Column_Type_Selection import Column_Type_Selection, Auto
from project.Delimited.Delimited_Format import Delimited
from project.Internal.Filter_Condition_Helpers import make_filter_column
from project.Errors import Missing_Input_Columns, Column_Indexes_Out_Of_Range, Duplicate_Type_Selector, No_Index_Set_Error, No_Such_Column_Error, No_Such_Column_Error_Data, No_Input_Columns_Selected, No_Output_Columns
from project.Errors import Missing_Input_Columns, Column_Indexes_Out_Of_Range, Duplicate_Type_Selector, No_Index_Set_Error, No_Such_Column_Error, No_Such_Column_Error_Data, No_Input_Columns_Selected, No_Output_Columns, Invalid_Value_Type
import Standard.Visualization
@ -112,7 +115,7 @@ type Table
display_rows = Math.min num_rows show_rows
rows = Vector.new display_rows row_num->
cols = col_vals.map col->
if col.isNa row_num then "Nothing" else Column.get_item_string col row_num
if col.isNa row_num then "Nothing" else get_item_string col row_num
[index.ilocString row_num] + cols
table = print_table col_names rows 1 format_terminal
if num_rows - display_rows <= 0 then table else
@ -726,6 +729,87 @@ type Table
result = Table.new new_columns
on_problems.attach_problems_after result problem_builder.to_vector
## Replaces the first, last, or all occurrences of `term` with
`new_text` in each text row of selected columns.
If `term` is empty, the function returns the table unchanged.
This method follows the exact replacement semantics of the
`Text.replace` method.
Arguments:
- columns: Column selection criteria or a column name or index.
- term: The term to find.
- new_text: The new text to replace occurrences of `term` with.
If `matcher` is a `Regex_Matcher`, `new_text` can include replacement
patterns (such as `$<n>`) for a marked group.
- mode: Specifies which occurrences of term the engine tries to find. When the
mode is `First` or `Last`, this method replaces the first or last occurrence
of term in each individual table cell. If set to `All`, it replaces all
occurrences of term.
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
rules specified in the matcher. If a `Regex_Matcher`, the term is used as a
regular expression and matched using the associated options.
- on_problems: Specifies how to handle if a problem occurs, raising as a
warning by default.
The following problems can occur:
- If a column in columns is not in the input table, a `Missing_Input_Columns`.
- If duplicate columns, names or indices are provided, a
`Duplicate_Column_Selectors`.
- If a column index is out of range, a `Column_Indexes_Out_Of_Range`.
- If two distinct indices refer to the same column, an
`Input_Indices_Already_Matched`, with the column included the first
time it is matched.
- If a column in columns does not have a storage type of `Text` or `Any`
(and is therefore guaranteed not to contain any text values), an
`Invalid_Value_Type`.
> Example
Replace dashes with underscores in a column named "variable_names".
table.replace_text "variable_names" "-" "_"
> Example
Remove leading and trailing spaces from cells in multiple columns.
table.replace_text (Column_Selector.By_Name ["foo", "bar"]) "^\s*(.*?)\s*$" "$1" matcher=Regex_Matcher.Regex_Matcher_Data
> Example
Replace texts in quotes with parentheses in column at index 1.
table.replace_text 1 '"(.*?)"' '($1)' matcher=Regex_Matcher.Regex_Matcher_Data
replace_text : (Text | Integer | Column_Selector) -> Text -> Text -> Matching_Mode | Regex_Mode -> (Text_Matcher | Regex_Matcher) -> Problem_Behavior -> Table
replace_text self columns=(Column_Selector.By_Index [0]) term="" new_text="" mode=Regex_Mode.All matcher=Text_Matcher.Case_Sensitive on_problems=Problem_Behavior.Report_Warning = if term.is_empty then self else
# Everything below runs only for a non-empty `term`; an empty `term` returns
# `self` above, before any column selection or problem reporting happens.
# Accumulates problems found during selection and transformation; they are
# attached to the result on the last line.
problem_builder = Problem_Builder.new
# Normalize the three accepted shapes of `columns` into a `Column_Selector`:
# a selector is used as-is, a single name or index is wrapped in a
# one-element `By_Name` / `By_Index` selector.
selector = case columns of
_ : Column_Selector -> columns
name : Text -> Column_Selector.By_Name [name]
index : Integer -> Column_Selector.By_Index [index]
# NOTE(review): presumably the helper records selector problems (missing
# columns, out-of-range indices, ...) in `problem_builder` - confirm against
# `Table_Helpers.select_columns_helper`.
selection = Table_Helpers.select_columns_helper self.columns selector reorder=False problem_builder
# Name -> True lookup of the selected columns, used below to decide which
# columns of the table get transformed.
selected_names = Map.from_vector (selection.map column-> [column.name, True])
# Maps `f` over the column and renames the result to the original column's
# name. NOTE(review): presumably needed because `map` does not keep the
# name - confirm.
map_preserve_name column f = column.map f . rename column.name
# The per-cell replacement: calls `replace` on the cell with the arguments
# passed to this method.
do_replace = _.replace term new_text mode matcher
# Variant for mixed columns: only `Text` cells are replaced, every other
# value is returned unchanged.
do_replace_only_text = case _ of
item : Text -> do_replace item
item -> item
# Dispatch on the column's storage type:
# - `Text` columns have the replacement applied to every cell,
# - `Any` columns have it applied to their `Text` cells only,
# - any other column is returned unchanged and an `Invalid_Value_Type_Data`
#   (built from `Value_Type.Char` and the column's `value_type`) is
#   reported to `problem_builder`.
transform column = case column.storage_type of
Storage.Text -> map_preserve_name column do_replace
Storage.Any -> map_preserve_name column do_replace_only_text
_ ->
problem = Invalid_Value_Type.Invalid_Value_Type_Data Value_Type.Char column.value_type
problem_builder.report_other_warning problem
column
# Walk over all columns of the table (not just the selection): columns whose
# name is in `selected_names` are transformed, the rest are passed through
# as-is.
new_columns = self.columns.map column->
is_selected = selected_names.get_or_else column.name False
if is_selected then transform column else column
result = Table.new new_columns
# Attach the collected problems to the new table, handled according to
# `on_problems`.
problem_builder.attach_problems_after on_problems result
## ALIAS Filter Rows
Selects only the rows of this table that correspond to `True` values of

View File

@ -363,7 +363,7 @@ type Column_Transform_Element
Value column associated_selector
## PRIVATE
prepare_order_by : Vector -> Problem_Builder -> Vector Column_Transform_Element
prepare_order_by : Vector -> Vector Text | Sort_Column_Selector -> Problem_Builder -> Vector Column_Transform_Element
prepare_order_by internal_columns column_selectors problem_builder =
selected_elements = case column_selectors of
_ : Vector.Vector ->

View File

@ -252,7 +252,7 @@ case class DocParserDef() extends Parser[Doc] {
logger.trace {
var listOfFormattedAST: List[Elem] = Nil
while (
result.stack.head != Elem.Formatter(typ) && result.stack.nonEmpty
result.stack.nonEmpty && result.stack.head != Elem.Formatter(typ)
) {
result.pop()
result.current match {

View File

@ -4,14 +4,14 @@ from Standard.Base.Error.Problem_Behavior import Report_Error
from Standard.Table import Table, Column, Sort_Column, Column_Selector, Sort_Column_Selector, Aggregate_Column
from Standard.Table.Data.Aggregate_Column.Aggregate_Column import all hiding First, Last
from Standard.Table.Data.Table import Empty_Error
from Standard.Table.Errors import Invalid_Output_Column_Names_Data, Duplicate_Output_Column_Names_Data, No_Input_Columns_Selected, Missing_Input_Columns_Data, No_Such_Column_Error_Data
from Standard.Table.Data.Storage import Storage
from Standard.Table.Errors import Floating_Point_Grouping_Data
from Standard.Table.Errors import Invalid_Output_Column_Names_Data, Duplicate_Output_Column_Names_Data, No_Input_Columns_Selected, Missing_Input_Columns_Data, No_Such_Column_Error_Data, Floating_Point_Grouping_Data, Invalid_Value_Type
import Standard.Visualization
import Standard.Test
import Standard.Test.Problems
import Standard.Table.Data.Value_Type.Value_Type
import project.Common_Table_Spec
from project.Util import all
@ -715,6 +715,58 @@ spec =
problems = [Duplicate_Output_Column_Names_Data ["A", "A", "A"]]
Problems.test_problem_handling action problems tester
Test.group "Table.replace_text" <|
Test.specify "should replace text in full-text table columns" <|
bools = ["bools", [False, False, True, True, False]]
texts = ["texts", ["foo", "foo", "bar", "baz", "spam"]]
table = Table.new [bools, texts]
actual = table.replace_text "texts" "a" "o"
actual.at "bools" . to_vector . should_equal [False, False, True, True, False]
actual.at "texts" . to_vector . should_equal ["foo", "foo", "bor", "boz", "spom"]
Problems.assume_no_problems actual
Test.specify "should replace text in mixed columns" <|
bools = ["bools", [False, False, True, True, False]]
mixed = ["mixed", ["foo", 5, "bar", False, "spam"]]
table = Table.new [bools, mixed]
actual = table.replace_text "mixed" "a" "o"
actual.at "bools" . to_vector . should_equal [False, False, True, True, False]
actual.at "mixed" . to_vector . should_equal ["foo", 5, "bor", False, "spom"]
Problems.assume_no_problems actual
Test.specify "should support operating on multiple columns at once" <|
bools = ["bools", [False, False, True]]
texts1 = ["texts1", ["foo", "bar", "baz"]]
texts2 = ["texts2", ["baz", "quux", "spam"]]
table = Table.new [bools, texts1, texts2]
actual = table.replace_text (Column_Selector.By_Name ["texts1", "texts2"]) "a" "o"
actual.at "bools" . to_vector . should_equal [False, False, True]
actual.at "texts1" . to_vector . should_equal ["foo", "bor", "boz"]
actual.at "texts2" . to_vector . should_equal ["boz", "quux", "spom"]
Problems.assume_no_problems actual
Test.specify "should support regex replacement" <|
bools = ["bools", [False, False, True, True, False]]
texts = ["texts", ["foo", "bar", "baz", "spam"]]
table = Table.new [bools, texts]
actual = table.replace_text "texts" "(a|o)" "$1e" matcher=Regex_Matcher.Regex_Matcher_Data
actual.at "texts" . to_vector . should_equal ["foeoe", "baer", "baez", "spaem"]
Problems.assume_no_problems actual
Test.specify 'should return warnings and errors when passed a non-existent column' <|
table = Table.new [["bools", [False, True]], ["texts", ["foo", "bar"]]]
action = table.replace_text "invalid_name" "a" "b" on_problems=_
tester = _.should_equal table
problems = [Missing_Input_Columns_Data ['invalid_name']]
Problems.test_problem_handling action problems tester
Test.specify "should return warnings and errors when selected non-text column" <|
table = Table.new [["bools", [False, True]], ["texts", ["foo", "bar"]]]
action = table.replace_text "bools" "a" "b" on_problems=_
tester = _.should_equal table
problems = [Invalid_Value_Type.Invalid_Value_Type_Data Value_Type.Char Value_Type.Boolean]
Problems.test_problem_handling action problems tester
Test.group "[In-Memory] Table.aggregate" <|
Test.specify "should return columns with correct types" <|
dates = ["dates", [Date.new 1999, Date.new 2000, Date.new 2000, Date.new 2000]]