Add split and tokenize to the Table. (#6233)

Implement split and tokenize for tables.
This commit is contained in:
GregoryTravis 2023-04-14 12:03:02 -04:00 committed by GitHub
parent 92ce47016a
commit 4dcf5faddd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 622 additions and 0 deletions

View File

@ -381,6 +381,7 @@
methods.][6176]
- [Implemented `Table.union` for the Database backend.][6204]
- [Array & Vector have the same methods & behavior][6218]
- [Implemented `Table.split` and `Table.tokenize` for in-memory tables.][6233]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -578,6 +579,7 @@
[6204]: https://github.com/enso-org/enso/pull/6204
[6077]: https://github.com/enso-org/enso/pull/6077
[6218]: https://github.com/enso-org/enso/pull/6218
[6233]: https://github.com/enso-org/enso/pull/6233
#### Enso Compiler

View File

@ -1392,6 +1392,80 @@ type Table
msg = "Parsing values is not supported in database tables, the table has to be materialized first with `read`."
Error.throw (Unsupported_Database_Operation.Error msg)
## Splits a column of text into a set of new columns.

   The original column will be removed from the table. The new columns
   will be named with the name of the input column with an incrementing
   number after.

   Arguments:
   - column: The name or index of the column to split the text of.
   - delimiter: The term or terms used to split the text.
   - column_count: The number of columns to split to.
     If `Nothing` then columns will be added to fit all data.
   - on_problems: Specifies the behavior when a problem occurs.

   ! Error Conditions
     If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
     be reported according to the `on_problems` behavior.
split_to_columns : Text | Integer -> Text -> Integer | Nothing -> Problem_Behavior -> Table
split_to_columns self column delimiter="," column_count=Nothing on_problems=Report_Error =
    # Keep the arguments "used" so no unused-argument warnings are raised.
    _ = [column delimiter column_count on_problems]
    msg = "Table.split_to_columns is not implemented yet for the Database backends."
    Error.throw (Unsupported_Database_Operation.Error msg)
## Splits a column of text into a set of new rows.

   The values of other columns are repeated for the new rows.

   Arguments:
   - column: The name or index of the column to split the text of.
   - delimiter: The term or terms used to split the text.
split_to_rows : Text | Integer -> Text -> Table
split_to_rows self column delimiter="," =
    # Keep the arguments "used" so no unused-argument warnings are raised.
    _ = [column delimiter]
    msg = "Table.split_to_rows is not implemented yet for the Database backends."
    Error.throw (Unsupported_Database_Operation.Error msg)
## Tokenizes a column of text into a set of new columns using a regular
   expression.

   If the pattern contains marked groups, the values are concatenated
   together; otherwise the whole match is returned. The original column
   will be removed from the table. The new columns will be named with the
   name of the input column with an incrementing number after.

   Arguments:
   - column: The name or index of the column to tokenize the text of.
   - pattern: The pattern used to find within the text.
   - case_sensitivity: Specifies if the text values should be compared case
     sensitively.
   - column_count: The number of columns to split to.
     If `Nothing` then columns will be added to fit all data.
   - on_problems: Specifies the behavior when a problem occurs.

   ! Error Conditions
     If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
     be reported according to the `on_problems` behavior.
tokenize_to_columns : Text | Integer -> Text -> Case_Sensitivity -> Integer | Nothing -> Problem_Behavior -> Table
tokenize_to_columns self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive column_count=Nothing on_problems=Report_Error =
    # Keep the arguments "used" so no unused-argument warnings are raised.
    _ = [column pattern case_sensitivity column_count on_problems]
    msg = "Table.tokenize_to_columns is not implemented yet for the Database backends."
    Error.throw (Unsupported_Database_Operation.Error msg)
## Tokenizes a column of text into a set of new rows using a regular
   expression.

   If the pattern contains marked groups, the values are concatenated
   together; otherwise the whole match is returned. The values of other
   columns are repeated for the new rows.

   Arguments:
   - column: The name or index of the column to tokenize the text of.
   - pattern: The pattern used to find within the text.
   - case_sensitivity: Specifies if the text values should be compared case
     sensitively.
tokenize_to_rows : Text | Integer -> Text -> Case_Sensitivity -> Table
tokenize_to_rows self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
    # Keep the arguments "used" so no unused-argument warnings are raised.
    _ = [column pattern case_sensitivity]
    msg = "Table.tokenize_to_rows is not implemented yet for the Database backends."
    Error.throw (Unsupported_Database_Operation.Error msg)
## PRIVATE
UNSTABLE
Cast the selected columns to a specific type.

View File

@ -29,6 +29,7 @@ import project.Internal.Join_Helpers
import project.Internal.Naming_Helpers.Naming_Helpers
import project.Internal.Parse_Values_Helper
import project.Internal.Problem_Builder.Problem_Builder
import project.Internal.Split_Tokenize
import project.Internal.Table_Helpers
import project.Internal.Table_Helpers.Table_Column_Helper
import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy
@ -918,6 +919,76 @@ type Table
result = Table.new new_columns
problem_builder.attach_problems_after on_problems result
## Splits a column of text into a set of new columns.
The original column will be removed from the table.
The new columns will be named with the name of the input column with an
incrementing number after.
Arguments:
- column: The name or index of the column to split the text of.
- delimiter: The term or terms used to split the text.
- column_count: The number of columns to split to.
If `Nothing` then columns will be added to fit all data.
- on_problems: Specifies the behavior when a problem occurs.
! Error Conditions
If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
be reported according to the `on_problems` behavior.
split_to_columns : Text | Integer -> Text -> Integer | Nothing -> Problem_Behavior -> Table
split_to_columns self column delimiter="," column_count=Nothing on_problems=Report_Error =
Split_Tokenize.split_to_columns self column delimiter column_count on_problems
## Splits a column of text into a set of new rows.
The values of other columns are repeated for the new rows.
Arguments:
- column: The name or index of the column to split the text of.
- delimiter: The term or terms used to split the text.
split_to_rows : Text | Integer -> Text -> Table
split_to_rows self column delimiter="," =
Split_Tokenize.split_to_rows self column delimiter
## Tokenizes a column of text into a set of new columns using a regular
expression.
If the pattern contains marked groups, the values are concatenated
together; otherwise the whole match is returned.
The original column will be removed from the table.
The new columns will be named with the name of the input column with an
incrementing number after.
Arguments:
- column: The name or index of the column to tokenize the text of.
- pattern: The pattern used to find within the text.
- case_sensitivity: Specifies if the text values should be compared case
sensitively.
- column_count: The number of columns to split to.
If `Nothing` then columns will be added to fit all data.
- on_problems: Specifies the behavior when a problem occurs.
! Error Conditions
If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
be reported according to the `on_problems` behavior.
tokenize_to_columns : Text | Integer -> Text -> Case_Sensitivity -> Integer | Nothing -> Problem_Behavior -> Table
tokenize_to_columns self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive column_count=Nothing on_problems=Report_Error =
Split_Tokenize.tokenize_to_columns self column pattern case_sensitivity column_count on_problems
## Tokenizes a column of text into a set of new rows using a regular
expression.
If the pattern contains marked groups, the values are concatenated
together; otherwise the whole match is returned.
The values of other columns are repeated for the new rows.
Arguments:
- column: The name or index of the column to tokenize the text of.
- pattern: The pattern used to find within the text.
- case_sensitivity: Specifies if the text values should be compared case
sensitively.
tokenize_to_rows : Text | Integer -> Text -> Case_Sensitivity -> Table
tokenize_to_rows self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
Split_Tokenize.tokenize_to_rows self column pattern case_sensitivity
## ALIAS Filter Rows
Selects only the rows of this table that correspond to `True` values of

View File

@ -552,3 +552,16 @@ type Invalid_Value_For_Type
to_display_text : Text
to_display_text self =
"The value ["+self.value.to_text+"] is not valid for the column type ["+self.value_type.to_text+"]."
## Indicates that an operation generating new columns produced more columns
   than fit within the configured limit.
type Column_Count_Exceeded
    ## PRIVATE
       Indicates that an operation generating new columns produced more columns
       than allowed by the limit.

       Arguments:
       - limit: The maximum number of columns that was configured.
       - column_count: The number of columns the operation actually produced.
    Error (limit : Integer) (column_count : Integer)

    ## PRIVATE
       Create a human-readable version of the error.
       The user-facing option controlling the limit is `column_count` (see
       `Table.split_to_columns` / `Table.tokenize_to_columns`), so the message
       refers to it by that name.
    to_display_text : Text
    to_display_text self =
        "The operation produced more columns than the specified limit. The limit is "+self.limit.to_text+" and the number of new columns was "+self.column_count.to_text+". The limit may be turned off by setting the `column_count` option to `Nothing`."

View File

@ -0,0 +1,236 @@
from Standard.Base import all
import project.Data.Column.Column
import project.Data.Table.Table
import project.Data.Type.Value_Type.Value_Type
import project.Internal.Java_Exports
import project.Internal.Problem_Builder.Problem_Builder
import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy
from project import Value_Type
from project.Errors import Column_Count_Exceeded, Duplicate_Output_Column_Names, Invalid_Value_Type, Missing_Input_Columns
from project.Internal.Java_Exports import make_string_builder
polyglot java import org.enso.table.data.mask.OrderMask
## PRIVATE
   Splits a column of text into a set of new columns.
   See `Table.split_to_columns`.
split_to_columns : Table -> Text | Integer -> Text -> Integer | Nothing -> Problem_Behavior -> Table
split_to_columns table input_column_id delimiter="," column_count=Nothing on_problems=Report_Error =
    # `Nothing` cells yield no output values; non-`Nothing` cells get split.
    splitter = handle_nothing (_.split delimiter)
    input_column = table.at input_column_id
    Value_Type.expect_text input_column.value_type related_column=input_column <|
        fan_out_to_columns table input_column_id splitter column_count on_problems
## PRIVATE
   Splits a column of text into a set of new rows.
   See `Table.split_to_rows`.
split_to_rows : Table -> Text | Integer -> Text -> Table
split_to_rows table input_column_id delimiter="," =
    # `Nothing` cells yield no output values; non-`Nothing` cells get split.
    splitter = handle_nothing (_.split delimiter)
    input_column = table.at input_column_id
    Value_Type.expect_text input_column.value_type related_column=input_column <|
        fan_out_to_rows table input_column_id splitter
## PRIVATE
   Tokenizes a column of text into a set of new columns using a regular
   expression.
   See `Table.tokenize_to_columns`.
tokenize_to_columns : Table -> Text | Integer -> Text -> Case_Sensitivity -> Integer | Nothing -> Problem_Behavior -> Table
tokenize_to_columns table input_column_id pattern="." case_sensitivity=Case_Sensitivity.Sensitive column_count=Nothing on_problems=Report_Error =
    # Defaults added to mirror `split_to_columns` above and the
    # `Table.tokenize_to_columns` API method, which both declare them.
    column = table.at input_column_id
    Value_Type.expect_text (column.value_type) related_column=column <|
        fan_out_to_columns table input_column_id (handle_nothing (_.tokenize pattern case_sensitivity)) column_count on_problems
## PRIVATE
   Tokenizes a column of text into a set of new rows using a regular
   expression.
   See `Table.tokenize_to_rows`.
tokenize_to_rows : Table -> Text | Integer -> Text -> Case_Sensitivity -> Table
tokenize_to_rows table input_column_id pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
    # `Nothing` cells yield no output values; non-`Nothing` cells get tokenized.
    tokenizer = handle_nothing (_.tokenize pattern case_sensitivity)
    input_column = table.at input_column_id
    Value_Type.expect_text input_column.value_type related_column=input_column <|
        fan_out_to_rows table input_column_id tokenizer
## PRIVATE
   Transform a table by transforming a column into a set of columns. Takes a
   function that maps a single element of the input column to a vector of output
   values. The original column is replaced by the new columns.

   Arguments:
   - table: The table to transform.
   - input_column_id: The name or index of the column to transform.
   - function: A function that transforms a single element of the input column
     to multiple values.
   - column_count: The number of columns to produce, or `Nothing` to fit the
     data.
   - on_problems: Specifies the behavior when a problem occurs.
fan_out_to_columns : Table -> Text | Integer -> (Any -> Vector Any) -> Integer | Nothing -> Problem_Behavior -> Table | Nothing
fan_out_to_columns table input_column_id function column_count=Nothing on_problems=Report_Error =
    # Use `at` (not `get`) so that a missing column surfaces as a proper
    # dataflow error, consistent with `fan_out_to_rows`, instead of
    # propagating `Nothing` into the helpers below.
    input_column = table.at input_column_id
    problem_builder = Problem_Builder.new
    new_columns_unrenamed = map_columns_to_multiple input_column function column_count problem_builder
    new_columns = rename_new_columns table new_columns_unrenamed problem_builder
    new_table = replace_column_with_columns table input_column new_columns
    problem_builder.attach_problems_after on_problems new_table
## PRIVATE
Transform a column by applying the given function to the values in the
column. The function produces multiple outputs, so each row is duplicated,
with each row getting a distinct output value in place of the original
input value. The other column values are just duplicated.
Rows for which `function` returns an empty vector produce no output rows.
Arguments:
- table: The table to transform.
- input_column_id: The name or index of the column to transform.
- function: A function that transforms a single element of the input column
to multiple values.
fan_out_to_rows : Table -> Text | Integer -> (Any -> Vector Any) -> Table
fan_out_to_rows table input_column_id function =
input_column = table.at input_column_id
input_storage = input_column.java_column.getStorage
num_input_rows = input_storage.size
# Guess that most of the time, we'll get at least one value for each input.
initial_size = input_column.length
# Accumulates the output of the output column values.
output_column_builder = make_string_builder initial_size
# Accumulates repeated position indices for the order mask.
order_mask_positions = Vector.new_builder initial_size
0.up_to num_input_rows . each i->
input_value = input_storage.getItemBoxed i
output_values = function input_value
# Append each value.
output_values.each v-> output_column_builder.append v
# Append n copies of the input row position, n = # of output values.
repeat_each output_values.length <| order_mask_positions.append i
# Build the output column
output_storage = output_column_builder.seal
output_column = Column.from_storage input_column_id output_storage
# Build the order mask.
order_mask = OrderMask.new (order_mask_positions.to_vector)
# Build the other columns, and include the output_column while doing it.
new_columns = table.columns.map column->
case column.name == input_column_id of
True ->
# Replace the input column with the output column.
output_column
False ->
# Build a new column from the old one with the mask, duplicating each
# original value once per corresponding output value.
old_storage = column.java_column.getStorage
new_storage = old_storage.applyMask order_mask
Column.from_storage column.name new_storage
Table.new new_columns
## PRIVATE
Map a multi-valued function over a column and return the results as set of
output columns.
Returns a Vector of Columns; any problems encountered (such as
`Column_Count_Exceeded` or duplicate names) are reported to the passed
`problem_builder` rather than returned.
Arguments:
- input_column: The column to transform.
- function: A function that transforms a single element of `input_column`
to multiple values.
- column_count: The number of columns to split to.
If `Nothing` then columns will be added to fit all data.
If the data exceeds the `column_count`, a `Column_Count_Exceeded` error
will follow the `on_problems` behavior.
- problem_builder: Collects problems for later reporting.
map_columns_to_multiple : Column -> (Any -> Vector Any) -> Integer | Nothing -> Problem_Builder -> Vector Column
map_columns_to_multiple input_column function column_count problem_builder =
num_rows = input_column.length
input_storage = input_column.java_column.getStorage
builders = case column_count of
Nothing ->
# Unlimited: grow the set of builders as wider outputs are seen.
builders = Vector.new_builder
0.up_to num_rows . each i->
input_value = input_storage.getItemBoxed i
output_values = function input_value
# Add more builders if necessary to accommodate `output_values`.
if output_values.length > builders.length then
num_builders_needed = output_values.length - builders.length
repeat_each num_builders_needed <|
builder = make_string_builder num_rows
# Pad the new builder with nulls
num_nulls_needed = i
builder.appendNulls num_nulls_needed
builders.append builder
## Add `output_values` to builders; if there are more builders
than `output_values`, pad with null.
0.up_to builders.length . each i->
builders.at i . appendNoGrow (output_values.get i Nothing)
builders.to_vector
_ : Integer ->
# Fixed width: excess output values are dropped and a problem reported.
builders = Vector.new column_count (_-> make_string_builder num_rows)
output_lengths = 0.up_to num_rows . map i->
input_value = input_storage.getItemBoxed i
output_values = function input_value
## Add `output_values` to builders; if there are more builders
than `output_values`, pad with null.
0.up_to builders.length . each i->
builders.at i . appendNoGrow (output_values.get i Nothing)
output_values.length
max_output_length = maximum output_lengths
if max_output_length > column_count then
problem = Column_Count_Exceeded.Error column_count max_output_length
problem_builder.report_other_warning problem
builders
# Build Columns, named "<input name>_<index>". Uniquification against the
# rest of the table happens later, in `rename_new_columns`.
builders.map .seal . map_with_index i-> storage->
name = input_column.name + "_" + i.to_text
Column.from_storage name storage
## PRIVATE
   Rename a vector of columns to be unique when added to a table.
   Any renames performed are reported to `problem_builder`.
rename_new_columns : Table -> Vector Column -> Problem_Builder -> Vector Column
rename_new_columns table columns problem_builder =
    strategy = Unique_Name_Strategy.new
    # Existing table columns reserve their names first.
    strategy.mark_used (table.columns.map .name)
    renamed = columns.map column->
        column.rename (strategy.make_unique column.name)
    problem_builder.report_unique_name_strategy strategy
    renamed
## PRIVATE
   Replace a single column in a table with new columns.
   Does not ensure names are unique; that must be done before calling this.
replace_column_with_columns : Table -> Column -> Vector Column -> Table
replace_column_with_columns table old_column new_columns =
    # Substitute the matching column with the whole replacement vector,
    # then flatten so the new columns sit in the original's position.
    substituted = table.columns.map c->
        if c.name == old_column.name then new_columns else [c]
    Table.new substituted.flatten
## PRIVATE
   Return the maximum value of the vector, or `Nothing` if the vector is
   empty.
maximum : Vector Any -> Any | Nothing
maximum vec = if vec.is_empty then Nothing else
    vec.reduce (a-> b-> a.max b)
## PRIVATE
   Wrap a function so that it returns `[]` if passed `Nothing`.
handle_nothing : (Any -> Any) -> (Any -> Any)
handle_nothing function = x->
    if x.is_nothing then [] else function x
## PRIVATE
Repeat a computation n times.
The `~` suspends `action`, so it is re-evaluated on every iteration rather
than once at the call site.
repeat_each n ~action = 0.up_to n . each _-> action

View File

@ -7,6 +7,7 @@ import project.In_Memory.Builders_Spec
import project.In_Memory.Column_Spec
import project.In_Memory.Common_Spec
import project.In_Memory.Join_Performance_Spec
import project.In_Memory.Split_Tokenize_Spec
import project.In_Memory.Table_Spec
import project.In_Memory.Table_Date_Spec
import project.In_Memory.Table_Date_Time_Spec
@ -22,5 +23,6 @@ spec =
Aggregate_Column_Spec.spec
Builders_Spec.spec
Join_Performance_Spec.spec
Split_Tokenize_Spec.spec
main = Test_Suite.run_main spec

View File

@ -0,0 +1,214 @@
from Standard.Base import all
import Standard.Base.Data.Text.Case_Sensitivity.Case_Sensitivity
import Standard.Test.Extensions
from Standard.Table import Table
from Standard.Table.Errors import Invalid_Value_Type, Column_Count_Exceeded, Duplicate_Output_Column_Names, No_Such_Column
from Standard.Test import Test, Test_Suite, Problems
from project.Util import all
# Tests for `Table.split_to_columns/rows` and `Table.tokenize_to_columns/rows`
# on in-memory tables. Uses `should_equal_verbose` from project.Util to
# compare full table contents (headers and column vectors).
spec =
# Basic splitting by a delimiter, including Nothing cells (which produce
# all-Nothing cells / no rows).
Test.group "Table.split" <|
Test.specify "can do split_to_columns" <|
cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
t = Table.new cols
expected_rows = [[0, "a", "c", Nothing], [1, "c", "d", "ef"], [2, "gh", "ij", "u"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
t2 = t.split_to_columns "bar" "b"
t2.should_equal_verbose expected
Test.specify "can do split_to_rows" <|
cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
t = Table.new cols
expected_rows = [[0, "a"], [0, "c"], [1, "c"], [1, "d"], [1, "ef"], [2, "gh"], [2, "ij"], [2, "u"]]
expected = Table.from_rows ["foo", "bar"] expected_rows
t2 = t.split_to_rows "bar" "b"
t2.should_equal_verbose expected
Test.specify "can do split_to_columns with some Nothings" <|
cols = [["foo", [0, 1, 2, 3]], ["bar", ["abc", "cbdbef", Nothing, "ghbijbu"]]]
t = Table.new cols
expected_rows = [[0, "a", "c", Nothing], [1, "c", "d", "ef"], [2, Nothing, Nothing, Nothing], [3, "gh", "ij", "u"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
t2 = t.split_to_columns "bar" "b"
t2.should_equal_verbose expected
Test.specify "can do split_to_rows with some Nothings" <|
cols = [["foo", [0, 1, 2, 3]], ["bar", ["abc", "cbdbef", Nothing, "ghbijbu"]]]
t = Table.new cols
expected_rows = [[0, "a"], [0, "c"], [1, "c"], [1, "d"], [1, "ef"], [3, "gh"], [3, "ij"], [3, "u"]]
expected = Table.from_rows ["foo", "bar"] expected_rows
t2 = t.split_to_rows "bar" "b"
t2.should_equal_verbose expected
# Regex tokenization, including marked groups (concatenated) and
# case-insensitive matching.
Test.group "Table.tokenize" <|
Test.specify "can do tokenize_to_columns" <|
cols = [["foo", [0, 1, 2]], ["bar", ["a12b34r5", "23", "2r4r55"]]]
t = Table.new cols
expected_rows = [[0, "12", "34", "5"], [1, "23", Nothing, Nothing], [2, "2", "4", "55"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
t2 = t.tokenize_to_columns "bar" "\d+"
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_rows" <|
cols = [["foo", [0, 1, 2]], ["bar", ["a12b34r5", "23", "2r4r55"]]]
t = Table.new cols
expected_rows = [[0, "12"], [0, "34"], [0, "5"], [1, "23"], [2, "2"], [2, "4"], [2, "55"]]
expected = Table.from_rows ["foo", "bar"] expected_rows
t2 = t.tokenize_to_rows "bar" "\d+"
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_columns with some nothings" <|
cols = [["foo", [0, 1, 2, 3]], ["bar", ["a12b34r5", Nothing, "23", "2r4r55"]]]
t = Table.new cols
expected_rows = [[0, "12", "34", "5"], [1, Nothing, Nothing, Nothing], [2, "23", Nothing, Nothing], [3, "2", "4", "55"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
t2 = t.tokenize_to_columns "bar" "\d+"
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_rows with some Nothings" <|
cols = [["foo", [0, 1, 2, 3]], ["bar", ["a12b34r5", Nothing, "23", "2r4r55"]]]
t = Table.new cols
expected_rows = [[0, "12"], [0, "34"], [0, "5"], [2, "23"], [3, "2"], [3, "4"], [3, "55"]]
expected = Table.from_rows ["foo", "bar"] expected_rows
t2 = t.tokenize_to_rows "bar" "\d+"
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_rows with some rows that have no matches" <|
cols = [["foo", [0, 1, 2, 3]], ["bar", ["a12b34r5", "23", "q", "2r4r55"]]]
t = Table.new cols
expected_rows = [[0, "12"], [0, "34"], [0, "5"], [1, "23"], [3, "2"], [3, "4"], [3, "55"]]
expected = Table.from_rows ["foo", "bar"] expected_rows
t2 = t.tokenize_to_rows "bar" "\d+"
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_columns with groups" <|
cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
t = Table.new cols
expected_rows = [[0, "a1", "b12", "d50"], [1, "b10", "c20", Nothing]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
t2 = t.tokenize_to_columns "bar" "([a-z]).(\d+)"
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_rows with groups" <|
cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
t = Table.new cols
expected_rows = [[0, "a1"], [0, "b12"], [0, "d50"], [1, "b10"], [1, "c20"]]
expected = Table.from_rows ["foo", "bar"] expected_rows
t2 = t.tokenize_to_rows "bar" "([a-z]).(\d+)"
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_columns case-insensitively" <|
cols = [["foo", [0, 1, 2]], ["bar", ["aBqcE", "qcBr", "cCb"]]]
t = Table.new cols
expected_rows = [[0, "B", "c", Nothing], [1, "c", "B", Nothing], [2, "c", "C", "b"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
t2 = t.tokenize_to_columns "bar" "[bc]" case_sensitivity=Case_Sensitivity.Insensitive
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_rows case-insensitively" <|
cols = [["foo", [0, 1, 2]], ["bar", ["aBqcE", "qcBr", "cCb"]]]
t = Table.new cols
expected_rows = [[0, "B"], [0, "c"], [1, "c"], [1, "B"], [2, "c"], [2, "C"], [2, "b"]]
expected = Table.from_rows ["foo", "bar"] expected_rows
t2 = t.tokenize_to_rows "bar" "[bc]" case_sensitivity=Case_Sensitivity.Insensitive
t2.should_equal_verbose expected
# `column_count` behavior: padding with empty text columns, and truncation
# with a `Column_Count_Exceeded` problem.
Test.group "Table.split/tokenize column count" <|
Test.specify "should generate extra empty columns if column_count is set" <|
cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
t = Table.new cols
expected_rows = [[0, "a", "c", Nothing, Nothing], [1, "c", "d", "ef", Nothing], [2, "gh", "ij", "u", Nothing]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2", "bar_3"] expected_rows
t2 = t.split_to_columns "bar" "b" column_count=4
t2.should_equal_verbose expected
t2.at "bar_3" . value_type . is_text . should_be_true
Test.specify "split should limit columns and return problems when exceeding the column limit" <|
cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
t = Table.new cols
expected_rows = [[0, "a", "c"], [1, "c", "d"], [2, "gh", "ij"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1"] expected_rows
action = t.split_to_columns "bar" "b" column_count=2 on_problems=_
tester = t-> t.should_equal_verbose expected
problems = [Column_Count_Exceeded.Error 2 3]
Problems.test_problem_handling action problems tester
Test.specify "tokenize should limit columns and return problems when exceeding the column limit" <|
cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
t = Table.new cols
expected_rows = [[0, "a1", "b12", "d50"], [1, "b10", "c20", Nothing]]
expected = Table.from_rows ["foo", "bar_0", "bar_1"] expected_rows
action = t.tokenize_to_columns "bar" "([a-z]).(\d+)" column_count=2 on_problems=_
tester = t-> t.should_equal_verbose expected
problems = [Column_Count_Exceeded.Error 2 3]
Problems.test_problem_handling action problems tester
Test.specify "should generate extra empty columns if column_count is set (with rows in a different order)" <|
cols = [["foo", [0, 1, 2]], ["bar", ["ghbijbu", "cbdbef", "abc"]]]
t = Table.new cols
expected_rows = [[0, "gh", "ij", "u", Nothing], [1, "c", "d", "ef", Nothing], [2, "a", "c", Nothing, Nothing]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2", "bar_3"] expected_rows
t2 = t.split_to_columns "bar" "b" column_count=4
t2.should_equal_verbose expected
t2.at "bar_3" . value_type . is_text . should_be_true
# Error conditions: non-text input columns and missing columns.
Test.group "Table.split/tokenize errors" <|
Test.specify "won't work on a non-text column" <|
cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
t = Table.new cols
t.split_to_columns "foo" "x" . should_fail_with Invalid_Value_Type
t.split_to_rows "foo" "x" . should_fail_with Invalid_Value_Type
t.tokenize_to_columns "foo" "x" . should_fail_with Invalid_Value_Type
t.tokenize_to_rows "foo" "x" . should_fail_with Invalid_Value_Type
Test.specify "won't work on a mixed column" <|
cols = [["foo", [0, 1]], ["bar", [500, "ab-10:bc-20c"]]]
t = Table.new cols
t.split_to_columns "bar" "x" . should_fail_with Invalid_Value_Type
t.split_to_rows "bar" "x" . should_fail_with Invalid_Value_Type
t.tokenize_to_columns "bar" "x" . should_fail_with Invalid_Value_Type
t.tokenize_to_rows "bar" "x" . should_fail_with Invalid_Value_Type
Test.specify "*_to_columns handles missing input column" <|
cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
t = Table.new cols
t.tokenize_to_columns "invalid_name" "([a-z]).(\d+)" . should_fail_with No_Such_Column
Test.specify "*_to_rows handles missing input column" <|
cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
t = Table.new cols
t.tokenize_to_rows "invalid_name" "([a-z]).(\d+)" . should_fail_with No_Such_Column
# New column names are uniquified against existing columns, with a
# `Duplicate_Output_Column_Names` problem reported.
Test.group "Table.split/tokenize name conflicts" <|
Test.specify "split will make column names unique" <|
cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]], ["bar_1", ["a", "b", "c"]]]
t = Table.new cols
expected_rows = [[0, "a", "c", Nothing, "a"], [1, "c", "d", "ef", "b"], [2, "gh", "ij", "u", "c"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1_1", "bar_2", "bar_1"] expected_rows
action = t.split_to_columns "bar" "b" on_problems=_
tester = t-> t.should_equal_verbose expected
problems = [Duplicate_Output_Column_Names.Error ["bar_1"]]
Problems.test_problem_handling action problems tester
Test.specify "tokenize will make column names unique" <|
cols = [["foo", [0, 1, 2]], ["bar", ["a12b34r5", "23", "2r4r55"]], ["bar_1", ["a", "b", "c"]]]
t = Table.new cols
expected_rows = [[0, "12", "34", "5", "a"], [1, "23", Nothing, Nothing, "b"], [2, "2", "4", "55", "c"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1_1", "bar_2", "bar_1"] expected_rows
action = t.tokenize_to_columns "bar" "\d+" on_problems=_
tester = t-> t.should_equal_verbose expected
problems = [Duplicate_Output_Column_Names.Error ["bar_1"]]
Problems.test_problem_handling action problems tester
# The new columns replace the input column in its original position.
Test.group "Table.split/tokenize column order" <|
Test.specify "preserves column order" <|
cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]], ["baz", [1, 2, 3]]]
t = Table.new cols
expected_rows = [[0, "a", "c", Nothing, 1], [1, "c", "d", "ef", 2], [2, "gh", "ij", "u", 3]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2", "baz"] expected_rows
t2 = t.split_to_columns "bar" "b"
t2.should_equal_verbose expected
# Entry point so this spec file can be run standalone.
main = Test_Suite.run_main spec

View File

@ -13,6 +13,16 @@ Table.should_equal self expected =
self_cols.map .name . should_equal (that_cols.map .name) frames_to_skip=1
self_cols.map .to_vector . should_equal (that_cols.map .to_vector) frames_to_skip=1
## Compares two tables and fails with a message showing both tables'
   full contents when they differ (headers or column data).
Table.should_equal_verbose self expected =
    names_of t = t.columns.map .name
    cells_of t = t.columns.map .to_vector
    matches = (names_of self == names_of expected) && (cells_of self == cells_of expected)
    if matches.not then
        msg = 'Tables differ.\nActual:\n' + self.display + '\nExpected:\n' + expected.display
        Test.fail msg
Column.should_equal self expected =
if self.name != expected.name then
Test.fail "Expected column name "+expected.name+", but got "+self.name+"."