mirror of
https://github.com/enso-org/enso.git
synced 2024-11-22 22:10:15 +03:00
Add split and tokenize to the Table. (#6233)
Implement split and tokenize for tables.
This commit is contained in:
parent
92ce47016a
commit
4dcf5faddd
@ -381,6 +381,7 @@
|
||||
methods.][6176]
|
||||
- [Implemented `Table.union` for the Database backend.][6204]
|
||||
- [Array & Vector have the same methods & behavior][6218]
|
||||
- [Implemented `Table.split` and `Table.tokenize` for in-memory tables.][6233]
|
||||
|
||||
[debug-shortcuts]:
|
||||
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
|
||||
@ -578,6 +579,7 @@
|
||||
[6204]: https://github.com/enso-org/enso/pull/6204
|
||||
[6077]: https://github.com/enso-org/enso/pull/6077
|
||||
[6218]: https://github.com/enso-org/enso/pull/6218
|
||||
[6233]: https://github.com/enso-org/enso/pull/6233
|
||||
|
||||
#### Enso Compiler
|
||||
|
||||
|
@ -1392,6 +1392,80 @@ type Table
|
||||
msg = "Parsing values is not supported in database tables, the table has to be materialized first with `read`."
|
||||
Error.throw (Unsupported_Database_Operation.Error msg)
|
||||
|
||||
## Splits a column of text into a set of new columns.
   The original column will be removed from the table.
   The new columns will be named with the name of the input column with an
   incrementing number after.

   Arguments:
   - column: The name or index of the column to split the text of.
   - delimiter: The term or terms used to split the text.
   - column_count: The number of columns to split to.
     If `Nothing` then columns will be added to fit all data.
   - on_problems: Specifies the behavior when a problem occurs.

   ! Error Conditions
     If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
     be reported according to the `on_problems` behavior.

   ! Database Backend
     This operation is not implemented for database tables yet; every call
     fails with an `Unsupported_Database_Operation` error.
split_to_columns : Text | Integer -> Text -> Integer | Nothing -> Problem_Behavior -> Table
split_to_columns self column delimiter="," column_count=Nothing on_problems=Report_Error =
    # The arguments are intentionally unused. The commas matter: without them
    # the bracket holds a single application of `column` to the other values.
    _ = [column, delimiter, column_count, on_problems]
    Error.throw (Unsupported_Database_Operation.Error "Table.split_to_columns is not implemented yet for the Database backends.")
|
||||
|
||||
## Splits a column of text into a set of new rows.
   The values of other columns are repeated for the new rows.

   Arguments:
   - column: The name or index of the column to split the text of.
   - delimiter: The term or terms used to split the text.

   ! Database Backend
     This operation is not implemented for database tables yet; every call
     fails with an `Unsupported_Database_Operation` error.
split_to_rows : Text | Integer -> Text -> Table
split_to_rows self column delimiter="," =
    # The arguments are intentionally unused. The commas matter: without them
    # the bracket holds a single application of `column` to `delimiter`.
    _ = [column, delimiter]
    Error.throw (Unsupported_Database_Operation.Error "Table.split_to_rows is not implemented yet for the Database backends.")
|
||||
|
||||
## Tokenizes a column of text into a set of new columns using a regular
   expression.
   If the pattern contains marked groups, the values are concatenated
   together; otherwise the whole match is returned.
   The original column will be removed from the table.
   The new columns will be named with the name of the input column with an
   incrementing number after.

   Arguments:
   - column: The name or index of the column to tokenize the text of.
   - pattern: The pattern used to find within the text.
   - case_sensitivity: Specifies if the text values should be compared case
     sensitively.
   - column_count: The number of columns to split to.
     If `Nothing` then columns will be added to fit all data.
   - on_problems: Specifies the behavior when a problem occurs.

   ! Error Conditions
     If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
     be reported according to the `on_problems` behavior.

   ! Database Backend
     This operation is not implemented for database tables yet; every call
     fails with an `Unsupported_Database_Operation` error.
tokenize_to_columns : Text | Integer -> Text -> Case_Sensitivity -> Integer | Nothing -> Problem_Behavior -> Table
tokenize_to_columns self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive column_count=Nothing on_problems=Report_Error =
    # The arguments are intentionally unused. The commas matter: without them
    # the bracket holds a single application of `column` to the other values.
    _ = [column, pattern, case_sensitivity, column_count, on_problems]
    Error.throw (Unsupported_Database_Operation.Error "Table.tokenize_to_columns is not implemented yet for the Database backends.")
|
||||
|
||||
## Tokenizes a column of text into a set of new rows using a regular
   expression.
   If the pattern contains marked groups, the values are concatenated
   together; otherwise the whole match is returned.
   The values of other columns are repeated for the new rows.

   Arguments:
   - column: The name or index of the column to tokenize the text of.
   - pattern: The pattern used to find within the text.
   - case_sensitivity: Specifies if the text values should be compared case
     sensitively.

   ! Database Backend
     This operation is not implemented for database tables yet; every call
     fails with an `Unsupported_Database_Operation` error.
tokenize_to_rows : Text | Integer -> Text -> Case_Sensitivity -> Table
tokenize_to_rows self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
    # The arguments are intentionally unused. The commas matter: without them
    # the bracket holds a single application of `column` to the other values.
    _ = [column, pattern, case_sensitivity]
    Error.throw (Unsupported_Database_Operation.Error "Table.tokenize_to_rows is not implemented yet for the Database backends.")
|
||||
|
||||
## PRIVATE
|
||||
UNSTABLE
|
||||
Cast the selected columns to a specific type.
|
||||
|
@ -29,6 +29,7 @@ import project.Internal.Join_Helpers
|
||||
import project.Internal.Naming_Helpers.Naming_Helpers
|
||||
import project.Internal.Parse_Values_Helper
|
||||
import project.Internal.Problem_Builder.Problem_Builder
|
||||
import project.Internal.Split_Tokenize
|
||||
import project.Internal.Table_Helpers
|
||||
import project.Internal.Table_Helpers.Table_Column_Helper
|
||||
import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy
|
||||
@ -918,6 +919,76 @@ type Table
|
||||
result = Table.new new_columns
|
||||
problem_builder.attach_problems_after on_problems result
|
||||
|
||||
## Breaks the text of one column apart at each delimiter and spreads the
   pieces across a set of new columns.
   The source column is dropped from the result. Each new column is named
   after the source column, followed by an incrementing number.

   Arguments:
   - column: The name or index of the column whose text is split.
   - delimiter: The term or terms used to split the text.
   - column_count: The number of columns to split to.
     If `Nothing` then columns will be added to fit all data.
   - on_problems: Specifies the behavior when a problem occurs.

   ! Error Conditions
     If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
     be reported according to the `on_problems` behavior.
split_to_columns : Text | Integer -> Text -> Integer | Nothing -> Problem_Behavior -> Table
split_to_columns self column delimiter="," column_count=Nothing on_problems=Report_Error =
    # All of the work is done by the shared helper module.
    Split_Tokenize.split_to_columns self column delimiter column_count=column_count on_problems=on_problems
|
||||
|
||||
## Breaks the text of one column apart at each delimiter and emits one row
   per piece.
   The values of the other columns are repeated for every new row.

   Arguments:
   - column: The name or index of the column whose text is split.
   - delimiter: The term or terms used to split the text.
split_to_rows : Text | Integer -> Text -> Table
split_to_rows self column delimiter="," =
    # All of the work is done by the shared helper module.
    Split_Tokenize.split_to_rows self column delimiter=delimiter
|
||||
|
||||
## Extracts every regular-expression match from the text of one column and
   spreads the matches across a set of new columns.
   If the pattern contains marked groups, the values are concatenated
   together; otherwise the whole match is returned.
   The source column is dropped from the result. Each new column is named
   after the source column, followed by an incrementing number.

   Arguments:
   - column: The name or index of the column to tokenize the text of.
   - pattern: The pattern used to find within the text.
   - case_sensitivity: Specifies if the text values should be compared case
     sensitively.
   - column_count: The number of columns to split to.
     If `Nothing` then columns will be added to fit all data.
   - on_problems: Specifies the behavior when a problem occurs.

   ! Error Conditions
     If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
     be reported according to the `on_problems` behavior.
tokenize_to_columns : Text | Integer -> Text -> Case_Sensitivity -> Integer | Nothing -> Problem_Behavior -> Table
tokenize_to_columns self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive column_count=Nothing on_problems=Report_Error =
    # All of the work is done by the shared helper module.
    Split_Tokenize.tokenize_to_columns self column pattern case_sensitivity column_count=column_count on_problems=on_problems
|
||||
|
||||
## Extracts every regular-expression match from the text of one column and
   emits one row per match.
   If the pattern contains marked groups, the values are concatenated
   together; otherwise the whole match is returned.
   The values of the other columns are repeated for every new row.

   Arguments:
   - column: The name or index of the column to tokenize the text of.
   - pattern: The pattern used to find within the text.
   - case_sensitivity: Specifies if the text values should be compared case
     sensitively.
tokenize_to_rows : Text | Integer -> Text -> Case_Sensitivity -> Table
tokenize_to_rows self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
    # All of the work is done by the shared helper module.
    Split_Tokenize.tokenize_to_rows self column pattern=pattern case_sensitivity=case_sensitivity
|
||||
|
||||
## ALIAS Filter Rows
|
||||
|
||||
Selects only the rows of this table that correspond to `True` values of
|
||||
|
@ -552,3 +552,16 @@ type Invalid_Value_For_Type
|
||||
to_display_text : Text
|
||||
to_display_text self =
|
||||
"The value ["+self.value.to_text+"] is not valid for the column type ["+self.value_type.to_text+"]."
|
||||
|
||||
type Column_Count_Exceeded
    ## PRIVATE
       Indicates that an operation generating new columns produced more columns
       than allowed by the limit.

       Arguments:
       - limit: The maximum number of new columns that was requested.
       - column_count: The number of new columns the data actually needed.
    Error (limit : Integer) (column_count : Integer)

    ## PRIVATE

       Create a human-readable version of the error.

       The hint at the end names the `column_count` option, because that is
       the parameter through which the operations set the limit.
    to_display_text : Text
    to_display_text self =
        "The operation produced more columns than the specified limit. The limit is "+self.limit.to_text+" and the number of new columns was "+self.column_count.to_text+". The limit may be turned off by setting the `column_count` option to `Nothing`."
|
||||
|
@ -0,0 +1,236 @@
|
||||
from Standard.Base import all
|
||||
|
||||
import project.Data.Column.Column
|
||||
import project.Data.Table.Table
|
||||
import project.Data.Type.Value_Type.Value_Type
|
||||
import project.Internal.Java_Exports
|
||||
import project.Internal.Problem_Builder.Problem_Builder
|
||||
import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy
|
||||
|
||||
from project import Value_Type
|
||||
from project.Errors import Column_Count_Exceeded, Duplicate_Output_Column_Names, Invalid_Value_Type, Missing_Input_Columns
|
||||
from project.Internal.Java_Exports import make_string_builder
|
||||
|
||||
polyglot java import org.enso.table.data.mask.OrderMask
|
||||
|
||||
## PRIVATE
   Implementation of `Table.split_to_columns`: splits the text of one column
   on a delimiter and fans the pieces out into new columns.
   The column must be of a text type; the check is done by
   `Value_Type.expect_text`.
split_to_columns : Table -> Text | Integer -> Text -> Integer | Nothing -> Problem_Behavior -> Table
split_to_columns table input_column_id delimiter="," column_count=Nothing on_problems=Report_Error =
    source = table.at input_column_id
    # `Nothing` entries yield no pieces instead of failing on `.split`.
    splitter = handle_nothing (text-> text.split delimiter)
    Value_Type.expect_text source.value_type related_column=source <|
        fan_out_to_columns table input_column_id splitter column_count on_problems
|
||||
|
||||
## PRIVATE
   Implementation of `Table.split_to_rows`: splits the text of one column on
   a delimiter and fans the pieces out into new rows.
   The column must be of a text type; the check is done by
   `Value_Type.expect_text`.
split_to_rows : Table -> Text | Integer -> Text -> Table
split_to_rows table input_column_id delimiter="," =
    source = table.at input_column_id
    # `Nothing` entries yield no pieces instead of failing on `.split`.
    splitter = handle_nothing (text-> text.split delimiter)
    Value_Type.expect_text source.value_type related_column=source <|
        fan_out_to_rows table input_column_id splitter
|
||||
|
||||
## PRIVATE
   Implementation of `Table.tokenize_to_columns`: collects the regular
   expression matches found in one column and fans them out into new columns.
   The column must be of a text type; the check is done by
   `Value_Type.expect_text`.
tokenize_to_columns : Table -> Text | Integer -> Text -> Case_Sensitivity -> Integer | Nothing -> Problem_Behavior -> Table
tokenize_to_columns table input_column_id pattern case_sensitivity column_count on_problems =
    source = table.at input_column_id
    # `Nothing` entries yield no tokens instead of failing on `.tokenize`.
    tokenizer = handle_nothing (text-> text.tokenize pattern case_sensitivity)
    Value_Type.expect_text source.value_type related_column=source <|
        fan_out_to_columns table input_column_id tokenizer column_count on_problems
|
||||
|
||||
## PRIVATE
   Implementation of `Table.tokenize_to_rows`: collects the regular
   expression matches found in one column and fans them out into new rows.
   The column must be of a text type; the check is done by
   `Value_Type.expect_text`.
tokenize_to_rows : Table -> Text | Integer -> Text -> Case_Sensitivity -> Table
tokenize_to_rows table input_column_id pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
    source = table.at input_column_id
    # `Nothing` entries yield no tokens instead of failing on `.tokenize`.
    tokenizer = handle_nothing (text-> text.tokenize pattern case_sensitivity)
    Value_Type.expect_text source.value_type related_column=source <|
        fan_out_to_rows table input_column_id tokenizer
|
||||
|
||||
## PRIVATE
   Transform a table by transforming a column into a set of columns. Takes a
   function that maps a single element of the input column to a vector of output
   values. The original column is replaced by the new columns.

   Arguments:
   - table: The table to transform.
   - input_column_id: The name or index of the column to transform.
   - function: A function that transforms a single element of the input
     column to multiple values.
   - column_count: The number of columns to split to.
     If `Nothing` then columns will be added to fit all data.
   - on_problems: Specifies the behavior when a problem occurs. Problems
     collected while building and renaming the new columns are attached to
     the resulting table according to this setting.
fan_out_to_columns : Table -> Text | Integer -> (Any -> Vector Any) -> Integer | Nothing -> Problem_Behavior -> Table | Nothing
fan_out_to_columns table input_column_id function column_count=Nothing on_problems=Report_Error =
    # Use `at`, like `fan_out_to_rows` does, so that a missing column is
    # reported as an error instead of silently becoming `Nothing`.
    input_column = table.at input_column_id
    problem_builder = Problem_Builder.new
    new_columns_unrenamed = map_columns_to_multiple input_column function column_count problem_builder
    new_columns = rename_new_columns table new_columns_unrenamed problem_builder
    new_table = replace_column_with_columns table input_column new_columns
    problem_builder.attach_problems_after on_problems new_table
|
||||
|
||||
## PRIVATE
   Transform a column by applying the given function to the values in the
   column. The function produces multiple outputs, so each row is duplicated,
   with each row getting a distinct output value in place of the original
   input value. The other column values are just duplicated.
   An input value for which the function returns no outputs contributes no
   rows to the result.

   Arguments:
   - table: The table to transform.
   - input_column_id: The name or index of the column to transform.
   - function: A function that transforms a single element of the input
     column to multiple values.
fan_out_to_rows : Table -> Text | Integer -> (Any -> Vector Any) -> Table
fan_out_to_rows table input_column_id function =
    input_column = table.at input_column_id
    # Resolve the selector to the actual column name once. The selector may be
    # an index, in which case comparing it with column names would never match.
    input_column_name = input_column.name
    input_storage = input_column.java_column.getStorage
    num_input_rows = input_storage.size

    # Guess that most of the time, we'll get at least one value for each input.
    initial_size = input_column.length
    # Accumulates the values of the output column.
    output_column_builder = make_string_builder initial_size
    # Accumulates repeated position indices for the order mask.
    order_mask_positions = Vector.new_builder initial_size

    0.up_to num_input_rows . each i->
        input_value = input_storage.getItemBoxed i
        output_values = function input_value
        # Append each value.
        output_values.each v-> output_column_builder.append v
        # Append n copies of the input row position, n = # of output values.
        repeat_each output_values.length <| order_mask_positions.append i

    # Build the output column; it keeps the name of the input column.
    output_storage = output_column_builder.seal
    output_column = Column.from_storage input_column_name output_storage

    # Build the order mask.
    order_mask = OrderMask.new (order_mask_positions.to_vector)

    # Build the other columns, and include the output_column while doing it.
    new_columns = table.columns.map column->
        case column.name == input_column_name of
            True ->
                # Replace the input column with the output column.
                output_column
            False ->
                # Build a new column from the old one with the mask
                old_storage = column.java_column.getStorage
                new_storage = old_storage.applyMask order_mask
                Column.from_storage column.name new_storage

    Table.new new_columns
|
||||
|
||||
## PRIVATE

   Map a multi-valued function over a column and return the results as set of
   output columns.

   Returns a Vector of Columns named `<input name>_<index>`. Problems are not
   returned; they are reported to the given `problem_builder`.

   Arguments:
   - input_column: The column to transform.
   - function: A function that transforms a single element of `input_column`
     to multiple values.
   - column_count: The number of columns to split to.
     If `Nothing` then columns will be added to fit all data.
     If the data exceeds the `column_count`, the extra values are dropped and
     a `Column_Count_Exceeded` warning is reported to the `problem_builder`.
   - problem_builder: The builder that collects the problems found.
map_columns_to_multiple : Column -> (Any -> Vector Any) -> Integer | Nothing -> Problem_Builder -> Vector Column
map_columns_to_multiple input_column function column_count problem_builder =
    num_rows = input_column.length
    input_storage = input_column.java_column.getStorage

    builders = case column_count of
        Nothing ->
            builders = Vector.new_builder

            0.up_to num_rows . each i->
                input_value = input_storage.getItemBoxed i
                output_values = function input_value

                # Add more builders if necessary to accommodate `output_values`.
                if output_values.length > builders.length then
                    num_builders_needed = output_values.length - builders.length
                    repeat_each num_builders_needed <|
                        builder = make_string_builder num_rows

                        # Pad the new builder with nulls, one for every row
                        # that was processed before this builder existed.
                        num_nulls_needed = i
                        builder.appendNulls num_nulls_needed

                        builders.append builder

                # Add `output_values` to builders; if there are more builders
                # than `output_values`, pad with null.
                0.up_to builders.length . each j->
                    builders.at j . appendNoGrow (output_values.get j Nothing)

            builders.to_vector

        _ : Integer ->
            builders = Vector.new column_count (_-> make_string_builder num_rows)

            output_lengths = 0.up_to num_rows . map i->
                input_value = input_storage.getItemBoxed i
                output_values = function input_value

                # Add `output_values` to builders; if there are more builders
                # than `output_values`, pad with null. Values beyond the
                # last builder are dropped.
                0.up_to builders.length . each j->
                    builders.at j . appendNoGrow (output_values.get j Nothing)

                output_values.length

            # Fold from 0 so that a column with no rows gives 0 rather than
            # `Nothing`, which could not be compared with `column_count`.
            max_output_length = output_lengths.fold 0 (longest-> len-> longest.max len)

            if max_output_length > column_count then
                problem = Column_Count_Exceeded.Error column_count max_output_length
                problem_builder.report_other_warning problem

            builders

    # Build Columns.
    builders.map .seal . map_with_index i-> storage->
        name = input_column.name + "_" + i.to_text
        Column.from_storage name storage
|
||||
|
||||
## PRIVATE
   Give each of the new columns a name that clashes neither with the columns
   already in the table nor with the other new columns.
   The naming strategy is reported to the `problem_builder` afterwards.
rename_new_columns : Table -> Vector Column -> Problem_Builder -> Vector Column
rename_new_columns table columns problem_builder =
    name_strategy = Unique_Name_Strategy.new
    existing_names = table.columns.map .name
    name_strategy.mark_used existing_names
    renamed = columns.map col-> col.rename (name_strategy.make_unique col.name)
    problem_builder.report_unique_name_strategy name_strategy
    renamed
|
||||
|
||||
## PRIVATE
   Build a table in which one column is swapped for a vector of new columns,
   placed at the position of the old column.
   Names are not made unique here; the caller has to do that beforehand.
replace_column_with_columns : Table -> Column -> Vector Column -> Table
replace_column_with_columns table old_column new_columns =
    target_name = old_column.name
    # Every column becomes a vector of columns, so the result can be flattened.
    expanded = table.columns.map existing->
        case existing.name == target_name of
            True -> new_columns
            False -> [existing]
    Table.new expanded.flatten
|
||||
|
||||
## PRIVATE
   Return the maximum value of the vector.
   Returns `Nothing` if the vector is empty; no error is thrown.

   Arguments:
   - vec: The vector to search. Its elements are compared with `max`.
maximum : Vector Any -> Any
maximum vec = if vec.is_empty then Nothing else
    vec.reduce (a-> b-> a.max b)
|
||||
|
||||
## PRIVATE
   Wrap a function so that a `Nothing` input gives an empty vector, while any
   other input is passed to the wrapped function.
handle_nothing : (Any -> Any) -> (Any -> Any)
handle_nothing function = value->
    if value.is_nothing then [] else function value
|
||||
|
||||
## PRIVATE
   Run the suspended `action` once for every index from 0 up to `n`,
   that is `n` times.
repeat_each n ~action =
    rounds = 0.up_to n
    rounds.each (_-> action)
|
@ -7,6 +7,7 @@ import project.In_Memory.Builders_Spec
|
||||
import project.In_Memory.Column_Spec
|
||||
import project.In_Memory.Common_Spec
|
||||
import project.In_Memory.Join_Performance_Spec
|
||||
import project.In_Memory.Split_Tokenize_Spec
|
||||
import project.In_Memory.Table_Spec
|
||||
import project.In_Memory.Table_Date_Spec
|
||||
import project.In_Memory.Table_Date_Time_Spec
|
||||
@ -22,5 +23,6 @@ spec =
|
||||
Aggregate_Column_Spec.spec
|
||||
Builders_Spec.spec
|
||||
Join_Performance_Spec.spec
|
||||
Split_Tokenize_Spec.spec
|
||||
|
||||
# Entry point: passes `spec` to `Test_Suite.run_main`.
main = Test_Suite.run_main spec
|
||||
|
214
test/Table_Tests/src/In_Memory/Split_Tokenize_Spec.enso
Normal file
214
test/Table_Tests/src/In_Memory/Split_Tokenize_Spec.enso
Normal file
@ -0,0 +1,214 @@
|
||||
from Standard.Base import all
|
||||
|
||||
import Standard.Base.Data.Text.Case_Sensitivity.Case_Sensitivity
|
||||
import Standard.Test.Extensions
|
||||
|
||||
from Standard.Table import Table
|
||||
from Standard.Table.Errors import Invalid_Value_Type, Column_Count_Exceeded, Duplicate_Output_Column_Names, No_Such_Column
|
||||
from Standard.Test import Test, Test_Suite, Problems
|
||||
from project.Util import all
|
||||
|
||||
# Tests for `Table.split_to_columns`, `Table.split_to_rows`,
# `Table.tokenize_to_columns` and `Table.tokenize_to_rows` on in-memory tables.
spec =
    Test.group "Table.split" <|
        Test.specify "can do split_to_columns" <|
            cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
            t = Table.new cols
            expected_rows = [[0, "a", "c", Nothing], [1, "c", "d", "ef"], [2, "gh", "ij", "u"]]
            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
            t2 = t.split_to_columns "bar" "b"
            t2.should_equal_verbose expected

        Test.specify "can do split_to_rows" <|
            cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
            t = Table.new cols
            expected_rows = [[0, "a"], [0, "c"], [1, "c"], [1, "d"], [1, "ef"], [2, "gh"], [2, "ij"], [2, "u"]]
            expected = Table.from_rows ["foo", "bar"] expected_rows
            t2 = t.split_to_rows "bar" "b"
            t2.should_equal_verbose expected

        Test.specify "can do split_to_columns with some Nothings" <|
            cols = [["foo", [0, 1, 2, 3]], ["bar", ["abc", "cbdbef", Nothing, "ghbijbu"]]]
            t = Table.new cols
            expected_rows = [[0, "a", "c", Nothing], [1, "c", "d", "ef"], [2, Nothing, Nothing, Nothing], [3, "gh", "ij", "u"]]
            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
            t2 = t.split_to_columns "bar" "b"
            t2.should_equal_verbose expected

        Test.specify "can do split_to_rows with some Nothings" <|
            cols = [["foo", [0, 1, 2, 3]], ["bar", ["abc", "cbdbef", Nothing, "ghbijbu"]]]
            t = Table.new cols
            expected_rows = [[0, "a"], [0, "c"], [1, "c"], [1, "d"], [1, "ef"], [3, "gh"], [3, "ij"], [3, "u"]]
            expected = Table.from_rows ["foo", "bar"] expected_rows
            t2 = t.split_to_rows "bar" "b"
            t2.should_equal_verbose expected

    Test.group "Table.tokenize" <|
        Test.specify "can do tokenize_to_columns" <|
            cols = [["foo", [0, 1, 2]], ["bar", ["a12b34r5", "23", "2r4r55"]]]
            t = Table.new cols
            expected_rows = [[0, "12", "34", "5"], [1, "23", Nothing, Nothing], [2, "2", "4", "55"]]
            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
            t2 = t.tokenize_to_columns "bar" "\d+"
            t2.should_equal_verbose expected

        Test.specify "can do tokenize_to_rows" <|
            cols = [["foo", [0, 1, 2]], ["bar", ["a12b34r5", "23", "2r4r55"]]]
            t = Table.new cols
            expected_rows = [[0, "12"], [0, "34"], [0, "5"], [1, "23"], [2, "2"], [2, "4"], [2, "55"]]
            expected = Table.from_rows ["foo", "bar"] expected_rows
            t2 = t.tokenize_to_rows "bar" "\d+"
            t2.should_equal_verbose expected

        Test.specify "can do tokenize_to_columns with some nothings" <|
            cols = [["foo", [0, 1, 2, 3]], ["bar", ["a12b34r5", Nothing, "23", "2r4r55"]]]
            t = Table.new cols
            expected_rows = [[0, "12", "34", "5"], [1, Nothing, Nothing, Nothing], [2, "23", Nothing, Nothing], [3, "2", "4", "55"]]
            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
            t2 = t.tokenize_to_columns "bar" "\d+"
            t2.should_equal_verbose expected

        Test.specify "can do tokenize_to_rows with some Nothings" <|
            cols = [["foo", [0, 1, 2, 3]], ["bar", ["a12b34r5", Nothing, "23", "2r4r55"]]]
            t = Table.new cols
            expected_rows = [[0, "12"], [0, "34"], [0, "5"], [2, "23"], [3, "2"], [3, "4"], [3, "55"]]
            expected = Table.from_rows ["foo", "bar"] expected_rows
            t2 = t.tokenize_to_rows "bar" "\d+"
            t2.should_equal_verbose expected

        Test.specify "can do tokenize_to_rows with some rows that have no matches" <|
            cols = [["foo", [0, 1, 2, 3]], ["bar", ["a12b34r5", "23", "q", "2r4r55"]]]
            t = Table.new cols
            expected_rows = [[0, "12"], [0, "34"], [0, "5"], [1, "23"], [3, "2"], [3, "4"], [3, "55"]]
            expected = Table.from_rows ["foo", "bar"] expected_rows
            t2 = t.tokenize_to_rows "bar" "\d+"
            t2.should_equal_verbose expected

        Test.specify "can do tokenize_to_columns with groups" <|
            cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
            t = Table.new cols
            expected_rows = [[0, "a1", "b12", "d50"], [1, "b10", "c20", Nothing]]
            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
            t2 = t.tokenize_to_columns "bar" "([a-z]).(\d+)"
            t2.should_equal_verbose expected

        Test.specify "can do tokenize_to_rows with groups" <|
            cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
            t = Table.new cols
            expected_rows = [[0, "a1"], [0, "b12"], [0, "d50"], [1, "b10"], [1, "c20"]]
            expected = Table.from_rows ["foo", "bar"] expected_rows
            t2 = t.tokenize_to_rows "bar" "([a-z]).(\d+)"
            t2.should_equal_verbose expected

        Test.specify "can do tokenize_to_columns case-insensitively" <|
            cols = [["foo", [0, 1, 2]], ["bar", ["aBqcE", "qcBr", "cCb"]]]
            t = Table.new cols
            expected_rows = [[0, "B", "c", Nothing], [1, "c", "B", Nothing], [2, "c", "C", "b"]]
            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
            t2 = t.tokenize_to_columns "bar" "[bc]" case_sensitivity=Case_Sensitivity.Insensitive
            t2.should_equal_verbose expected

        Test.specify "can do tokenize_to_rows case-insensitively" <|
            cols = [["foo", [0, 1, 2]], ["bar", ["aBqcE", "qcBr", "cCb"]]]
            t = Table.new cols
            expected_rows = [[0, "B"], [0, "c"], [1, "c"], [1, "B"], [2, "c"], [2, "C"], [2, "b"]]
            expected = Table.from_rows ["foo", "bar"] expected_rows
            t2 = t.tokenize_to_rows "bar" "[bc]" case_sensitivity=Case_Sensitivity.Insensitive
            t2.should_equal_verbose expected

    Test.group "Table.split/tokenize column count" <|
        Test.specify "should generate extra empty columns if column_count is set" <|
            cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
            t = Table.new cols
            expected_rows = [[0, "a", "c", Nothing, Nothing], [1, "c", "d", "ef", Nothing], [2, "gh", "ij", "u", Nothing]]
            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2", "bar_3"] expected_rows
            t2 = t.split_to_columns "bar" "b" column_count=4
            t2.should_equal_verbose expected
            t2.at "bar_3" . value_type . is_text . should_be_true

        Test.specify "split should limit columns and return problems when exceeding the column limit" <|
            cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
            t = Table.new cols
            expected_rows = [[0, "a", "c"], [1, "c", "d"], [2, "gh", "ij"]]
            expected = Table.from_rows ["foo", "bar_0", "bar_1"] expected_rows
            action = t.split_to_columns "bar" "b" column_count=2 on_problems=_
            tester = t-> t.should_equal_verbose expected
            problems = [Column_Count_Exceeded.Error 2 3]
            Problems.test_problem_handling action problems tester

        Test.specify "tokenize should limit columns and return problems when exceeding the column limit" <|
            cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
            t = Table.new cols
            # One value per header: only the first two tokens fit the limit.
            expected_rows = [[0, "a1", "b12"], [1, "b10", "c20"]]
            expected = Table.from_rows ["foo", "bar_0", "bar_1"] expected_rows
            action = t.tokenize_to_columns "bar" "([a-z]).(\d+)" column_count=2 on_problems=_
            tester = t-> t.should_equal_verbose expected
            problems = [Column_Count_Exceeded.Error 2 3]
            Problems.test_problem_handling action problems tester

        Test.specify "should generate extra empty columns if column_count is set (with rows in a different order)" <|
            cols = [["foo", [0, 1, 2]], ["bar", ["ghbijbu", "cbdbef", "abc"]]]
            t = Table.new cols
            expected_rows = [[0, "gh", "ij", "u", Nothing], [1, "c", "d", "ef", Nothing], [2, "a", "c", Nothing, Nothing]]
            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2", "bar_3"] expected_rows
            t2 = t.split_to_columns "bar" "b" column_count=4
            t2.should_equal_verbose expected
            t2.at "bar_3" . value_type . is_text . should_be_true

    Test.group "Table.split/tokenize errors" <|
        Test.specify "won't work on a non-text column" <|
            cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
            t = Table.new cols
            t.split_to_columns "foo" "x" . should_fail_with Invalid_Value_Type
            t.split_to_rows "foo" "x" . should_fail_with Invalid_Value_Type
            t.tokenize_to_columns "foo" "x" . should_fail_with Invalid_Value_Type
            t.tokenize_to_rows "foo" "x" . should_fail_with Invalid_Value_Type

        Test.specify "won't work on a mixed column" <|
            cols = [["foo", [0, 1]], ["bar", [500, "ab-10:bc-20c"]]]
            t = Table.new cols
            t.split_to_columns "bar" "x" . should_fail_with Invalid_Value_Type
            t.split_to_rows "bar" "x" . should_fail_with Invalid_Value_Type
            t.tokenize_to_columns "bar" "x" . should_fail_with Invalid_Value_Type
            t.tokenize_to_rows "bar" "x" . should_fail_with Invalid_Value_Type

        Test.specify "*_to_columns handles missing input column" <|
            cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
            t = Table.new cols
            t.tokenize_to_columns "invalid_name" "([a-z]).(\d+)" . should_fail_with No_Such_Column

        Test.specify "*_to_rows handles missing input column" <|
            cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
            t = Table.new cols
            t.tokenize_to_rows "invalid_name" "([a-z]).(\d+)" . should_fail_with No_Such_Column

    Test.group "Table.split/tokenize name conflicts" <|
        Test.specify "split will make column names unique" <|
            cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]], ["bar_1", ["a", "b", "c"]]]
            t = Table.new cols
            expected_rows = [[0, "a", "c", Nothing, "a"], [1, "c", "d", "ef", "b"], [2, "gh", "ij", "u", "c"]]
            expected = Table.from_rows ["foo", "bar_0", "bar_1_1", "bar_2", "bar_1"] expected_rows
            action = t.split_to_columns "bar" "b" on_problems=_
            tester = t-> t.should_equal_verbose expected
            problems = [Duplicate_Output_Column_Names.Error ["bar_1"]]
            Problems.test_problem_handling action problems tester

        Test.specify "tokenize will make column names unique" <|
            cols = [["foo", [0, 1, 2]], ["bar", ["a12b34r5", "23", "2r4r55"]], ["bar_1", ["a", "b", "c"]]]
            t = Table.new cols
            expected_rows = [[0, "12", "34", "5", "a"], [1, "23", Nothing, Nothing, "b"], [2, "2", "4", "55", "c"]]
            expected = Table.from_rows ["foo", "bar_0", "bar_1_1", "bar_2", "bar_1"] expected_rows
            action = t.tokenize_to_columns "bar" "\d+" on_problems=_
            tester = t-> t.should_equal_verbose expected
            problems = [Duplicate_Output_Column_Names.Error ["bar_1"]]
            Problems.test_problem_handling action problems tester

    Test.group "Table.split/tokenize column order" <|
        Test.specify "preserves column order" <|
            cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]], ["baz", [1, 2, 3]]]
            t = Table.new cols
            expected_rows = [[0, "a", "c", Nothing, 1], [1, "c", "d", "ef", 2], [2, "gh", "ij", "u", 3]]
            expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2", "baz"] expected_rows
            t2 = t.split_to_columns "bar" "b"
            t2.should_equal_verbose expected
||||
# Entry point: passes `spec` to `Test_Suite.run_main`.
main = Test_Suite.run_main spec
|
@ -13,6 +13,16 @@ Table.should_equal self expected =
|
||||
self_cols.map .name . should_equal (that_cols.map .name) frames_to_skip=1
|
||||
self_cols.map .to_vector . should_equal (that_cols.map .to_vector) frames_to_skip=1
|
||||
|
||||
# Checks that this table has the same column names and the same column
# contents as `expected`. On a mismatch the test fails with a message that
# displays both tables.
Table.should_equal_verbose self expected =
    actual_names = self.columns.map .name
    expected_names = expected.columns.map .name
    actual_values = self.columns.map .to_vector
    expected_values = expected.columns.map .to_vector
    names_match = actual_names == expected_names
    values_match = actual_values == expected_values
    if names_match && values_match then Nothing else
        msg = 'Tables differ.\nActual:\n' + self.display + '\nExpected:\n' + expected.display
        Test.fail msg
|
||||
|
||||
Column.should_equal self expected =
|
||||
if self.name != expected.name then
|
||||
Test.fail "Expected column name "+expected.name+", but got "+self.name+"."
|
||||
|
Loading…
Reference in New Issue
Block a user