Add split and tokenize to the Table. (#6233)

Implement split and tokenize for tables.
This commit is contained in:
GregoryTravis 2023-04-14 12:03:02 -04:00 committed by GitHub
parent 92ce47016a
commit 4dcf5faddd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 622 additions and 0 deletions

View File

@ -381,6 +381,7 @@
methods.][6176]
- [Implemented `Table.union` for the Database backend.][6204]
- [Array & Vector have the same methods & behavior][6218]
- [Implemented `Table.split` and `Table.tokenize` for in-memory tables.][6233]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -578,6 +579,7 @@
[6204]: https://github.com/enso-org/enso/pull/6204
[6077]: https://github.com/enso-org/enso/pull/6077
[6218]: https://github.com/enso-org/enso/pull/6218
[6233]: https://github.com/enso-org/enso/pull/6233
#### Enso Compiler

View File

@ -1392,6 +1392,80 @@ type Table
msg = "Parsing values is not supported in database tables, the table has to be materialized first with `read`."
Error.throw (Unsupported_Database_Operation.Error msg)
## Splits a column of text into a set of new columns.

   The original column will be removed from the table. The new columns
   will be named with the name of the input column with an incrementing
   number after.

   Arguments:
   - column: The name or index of the column to split the text of.
   - delimiter: The term or terms used to split the text.
   - column_count: The number of columns to split to.
     If `Nothing` then columns will be added to fit all data.
   - on_problems: Specifies the behavior when a problem occurs.

   ! Error Conditions
     If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
     be reported according to the `on_problems` behavior.
split_to_columns : Text | Integer -> Text -> Integer | Nothing -> Problem_Behavior -> Table
split_to_columns self column delimiter="," column_count=Nothing on_problems=Report_Error =
    # Keep the arguments "used" so no unused-argument warnings are raised.
    _ = [column delimiter column_count on_problems]
    msg = "Table.split_to_columns is not implemented yet for the Database backends."
    Error.throw (Unsupported_Database_Operation.Error msg)
## Splits a column of text into a set of new rows.

   The values of other columns are repeated for the new rows.

   Arguments:
   - column: The name or index of the column to split the text of.
   - delimiter: The term or terms used to split the text.
split_to_rows : Text | Integer -> Text -> Table
split_to_rows self column delimiter="," =
    # Keep the arguments "used" so no unused-argument warnings are raised.
    _ = [column delimiter]
    msg = "Table.split_to_rows is not implemented yet for the Database backends."
    Error.throw (Unsupported_Database_Operation.Error msg)
## Tokenizes a column of text into a set of new columns using a regular
   expression.

   If the pattern contains marked groups, the values are concatenated
   together; otherwise the whole match is returned. The original column
   will be removed from the table. The new columns will be named with the
   name of the input column with an incrementing number after.

   Arguments:
   - column: The name or index of the column to tokenize the text of.
   - pattern: The pattern used to find within the text.
   - case_sensitivity: Specifies if the text values should be compared case
     sensitively.
   - column_count: The number of columns to split to.
     If `Nothing` then columns will be added to fit all data.
   - on_problems: Specifies the behavior when a problem occurs.

   ! Error Conditions
     If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
     be reported according to the `on_problems` behavior.
tokenize_to_columns : Text | Integer -> Text -> Case_Sensitivity -> Integer | Nothing -> Problem_Behavior -> Table
tokenize_to_columns self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive column_count=Nothing on_problems=Report_Error =
    # Keep the arguments "used" so no unused-argument warnings are raised.
    _ = [column pattern case_sensitivity column_count on_problems]
    msg = "Table.tokenize_to_columns is not implemented yet for the Database backends."
    Error.throw (Unsupported_Database_Operation.Error msg)
## Tokenizes a column of text into a set of new rows using a regular
   expression.

   If the pattern contains marked groups, the values are concatenated
   together; otherwise the whole match is returned. The values of other
   columns are repeated for the new rows.

   Arguments:
   - column: The name or index of the column to tokenize the text of.
   - pattern: The pattern used to find within the text.
   - case_sensitivity: Specifies if the text values should be compared case
     sensitively.
tokenize_to_rows : Text | Integer -> Text -> Case_Sensitivity -> Table
tokenize_to_rows self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
    # Keep the arguments "used" so no unused-argument warnings are raised.
    _ = [column pattern case_sensitivity]
    msg = "Table.tokenize_to_rows is not implemented yet for the Database backends."
    Error.throw (Unsupported_Database_Operation.Error msg)
## PRIVATE
UNSTABLE
Cast the selected columns to a specific type.

View File

@ -29,6 +29,7 @@ import project.Internal.Join_Helpers
import project.Internal.Naming_Helpers.Naming_Helpers
import project.Internal.Parse_Values_Helper
import project.Internal.Problem_Builder.Problem_Builder
import project.Internal.Split_Tokenize
import project.Internal.Table_Helpers
import project.Internal.Table_Helpers.Table_Column_Helper
import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy
@ -918,6 +919,76 @@ type Table
result = Table.new new_columns
problem_builder.attach_problems_after on_problems result
## Splits a column of text into a set of new columns.
The original column will be removed from the table.
The new columns will be named with the name of the input column with an
incrementing number after.
Arguments:
- column: The name or index of the column to split the text of.
- delimiter: The term or terms used to split the text.
- column_count: The number of columns to split to.
If `Nothing` then columns will be added to fit all data.
- on_problems: Specifies the behavior when a problem occurs.
! Error Conditions
If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
be reported according to the `on_problems` behavior.
split_to_columns : Text | Integer -> Text -> Integer | Nothing -> Problem_Behavior -> Table
split_to_columns self column delimiter="," column_count=Nothing on_problems=Report_Error =
Split_Tokenize.split_to_columns self column delimiter column_count on_problems
## Splits a column of text into a set of new rows.
The values of other columns are repeated for the new rows.
Arguments:
- column: The name or index of the column to split the text of.
- delimiter: The term or terms used to split the text.
split_to_rows : Text | Integer -> Text -> Table
split_to_rows self column delimiter="," =
Split_Tokenize.split_to_rows self column delimiter
## Tokenizes a column of text into a set of new columns using a regular
expression.
If the pattern contains marked groups, the values are concatenated
together; otherwise the whole match is returned.
The original column will be removed from the table.
The new columns will be named with the name of the input column with an
incrementing number after.
Arguments:
- column: The name or index of the column to tokenize the text of.
- pattern: The pattern used to find within the text.
- case_sensitivity: Specifies if the text values should be compared case
sensitively.
- column_count: The number of columns to split to.
If `Nothing` then columns will be added to fit all data.
- on_problems: Specifies the behavior when a problem occurs.
! Error Conditions
If the data exceeds the `column_count`, a `Column_Count_Exceeded` will
be reported according to the `on_problems` behavior.
tokenize_to_columns : Text | Integer -> Text -> Case_Sensitivity -> Integer | Nothing -> Problem_Behavior -> Table
tokenize_to_columns self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive column_count=Nothing on_problems=Report_Error =
Split_Tokenize.tokenize_to_columns self column pattern case_sensitivity column_count on_problems
## Tokenizes a column of text into a set of new rows using a regular
expression.
If the pattern contains marked groups, the values are concatenated
together; otherwise the whole match is returned.
The values of other columns are repeated for the new rows.
Arguments:
- column: The name or index of the column to tokenize the text of.
- pattern: The pattern used to find within the text.
- case_sensitivity: Specifies if the text values should be compared case
sensitively.
tokenize_to_rows : Text | Integer -> Text -> Case_Sensitivity -> Table
tokenize_to_rows self column pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
Split_Tokenize.tokenize_to_rows self column pattern case_sensitivity
## ALIAS Filter Rows
Selects only the rows of this table that correspond to `True` values of

View File

@ -552,3 +552,16 @@ type Invalid_Value_For_Type
to_display_text : Text
to_display_text self =
"The value ["+self.value.to_text+"] is not valid for the column type ["+self.value_type.to_text+"]."
## Indicates that an operation generating new columns produced more columns
   than fit within the configured limit.
type Column_Count_Exceeded
    ## PRIVATE
       Indicates that an operation generating new columns produced more columns
       than allowed by the limit.

       Arguments:
       - limit: The maximum number of columns that was configured.
       - column_count: The number of columns the operation actually produced.
    Error (limit : Integer) (column_count : Integer)

    ## PRIVATE
       Create a human-readable version of the error.
       The user-facing option controlling the limit is `column_count` (see
       `Table.split_to_columns` / `Table.tokenize_to_columns`), so the message
       refers to it by that name.
    to_display_text : Text
    to_display_text self =
        "The operation produced more columns than the specified limit. The limit is "+self.limit.to_text+" and the number of new columns was "+self.column_count.to_text+". The limit may be turned off by setting the `column_count` option to `Nothing`."

View File

@ -0,0 +1,236 @@
from Standard.Base import all
import project.Data.Column.Column
import project.Data.Table.Table
import project.Data.Type.Value_Type.Value_Type
import project.Internal.Java_Exports
import project.Internal.Problem_Builder.Problem_Builder
import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy
from project import Value_Type
from project.Errors import Column_Count_Exceeded, Duplicate_Output_Column_Names, Invalid_Value_Type, Missing_Input_Columns
from project.Internal.Java_Exports import make_string_builder
polyglot java import org.enso.table.data.mask.OrderMask
## PRIVATE
   Splits a column of text into a set of new columns.
   See `Table.split_to_columns`.
split_to_columns : Table -> Text | Integer -> Text -> Integer | Nothing -> Problem_Behavior -> Table
split_to_columns table input_column_id delimiter="," column_count=Nothing on_problems=Report_Error =
    # `Nothing` cells yield no output values; non-`Nothing` cells get split.
    splitter = handle_nothing (_.split delimiter)
    input_column = table.at input_column_id
    Value_Type.expect_text input_column.value_type related_column=input_column <|
        fan_out_to_columns table input_column_id splitter column_count on_problems
## PRIVATE
   Splits a column of text into a set of new rows.
   See `Table.split_to_rows`.
split_to_rows : Table -> Text | Integer -> Text -> Table
split_to_rows table input_column_id delimiter="," =
    # `Nothing` cells yield no output values; non-`Nothing` cells get split.
    splitter = handle_nothing (_.split delimiter)
    input_column = table.at input_column_id
    Value_Type.expect_text input_column.value_type related_column=input_column <|
        fan_out_to_rows table input_column_id splitter
## PRIVATE
   Tokenizes a column of text into a set of new columns using a regular
   expression.
   See `Table.tokenize_to_columns`.
tokenize_to_columns : Table -> Text | Integer -> Text -> Case_Sensitivity -> Integer | Nothing -> Problem_Behavior -> Table
tokenize_to_columns table input_column_id pattern="." case_sensitivity=Case_Sensitivity.Sensitive column_count=Nothing on_problems=Report_Error =
    # Defaults added to mirror `split_to_columns` above and the
    # `Table.tokenize_to_columns` API method, which both declare them.
    column = table.at input_column_id
    Value_Type.expect_text (column.value_type) related_column=column <|
        fan_out_to_columns table input_column_id (handle_nothing (_.tokenize pattern case_sensitivity)) column_count on_problems
## PRIVATE
   Tokenizes a column of text into a set of new rows using a regular
   expression.
   See `Table.tokenize_to_rows`.
tokenize_to_rows : Table -> Text | Integer -> Text -> Case_Sensitivity -> Table
tokenize_to_rows table input_column_id pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
    # `Nothing` cells yield no output values; non-`Nothing` cells get tokenized.
    tokenizer = handle_nothing (_.tokenize pattern case_sensitivity)
    input_column = table.at input_column_id
    Value_Type.expect_text input_column.value_type related_column=input_column <|
        fan_out_to_rows table input_column_id tokenizer
## PRIVATE
   Transform a table by transforming a column into a set of columns. Takes a
   function that maps a single element of the input column to a vector of output
   values. The original column is replaced by the new columns.

   Arguments:
   - table: The table to transform.
   - input_column_id: The name or index of the column to transform.
   - function: A function that transforms a single element of the input column
     to multiple values.
   - column_count: The number of columns to produce, or `Nothing` to fit the
     data.
   - on_problems: Specifies the behavior when a problem occurs.
fan_out_to_columns : Table -> Text | Integer -> (Any -> Vector Any) -> Integer | Nothing -> Problem_Behavior -> Table | Nothing
fan_out_to_columns table input_column_id function column_count=Nothing on_problems=Report_Error =
    # Use `at` (not `get`) so that a missing column surfaces as a proper
    # dataflow error, consistent with `fan_out_to_rows`, instead of
    # propagating `Nothing` into the helpers below.
    input_column = table.at input_column_id
    problem_builder = Problem_Builder.new
    new_columns_unrenamed = map_columns_to_multiple input_column function column_count problem_builder
    new_columns = rename_new_columns table new_columns_unrenamed problem_builder
    new_table = replace_column_with_columns table input_column new_columns
    problem_builder.attach_problems_after on_problems new_table
## PRIVATE
Transform a column by applying the given function to the values in the
column. The function produces multiple outputs, so each row is duplicated,
with each row getting a distinct output value in place of the original
input value. The other column values are just duplicated.
Rows for which `function` returns an empty vector produce no output rows.
Arguments:
- table: The table to transform.
- input_column_id: The name or index of the column to transform.
- function: A function that transforms a single element of the input column
to multiple values.
fan_out_to_rows : Table -> Text | Integer -> (Any -> Vector Any) -> Table
fan_out_to_rows table input_column_id function =
input_column = table.at input_column_id
input_storage = input_column.java_column.getStorage
num_input_rows = input_storage.size
# Guess that most of the time, we'll get at least one value for each input.
initial_size = input_column.length
# Accumulates the output of the output column values.
output_column_builder = make_string_builder initial_size
# Accumulates repeated position indices for the order mask.
order_mask_positions = Vector.new_builder initial_size
0.up_to num_input_rows . each i->
input_value = input_storage.getItemBoxed i
output_values = function input_value
# Append each value.
output_values.each v-> output_column_builder.append v
# Append n copies of the input row position, n = # of output values.
repeat_each output_values.length <| order_mask_positions.append i
# Build the output column
output_storage = output_column_builder.seal
output_column = Column.from_storage input_column_id output_storage
# Build the order mask.
order_mask = OrderMask.new (order_mask_positions.to_vector)
# Build the other columns, and include the output_column while doing it.
new_columns = table.columns.map column->
case column.name == input_column_id of
True ->
# Replace the input column with the output column.
output_column
False ->
# Build a new column from the old one with the mask, duplicating each
# original value once per corresponding output value.
old_storage = column.java_column.getStorage
new_storage = old_storage.applyMask order_mask
Column.from_storage column.name new_storage
Table.new new_columns
## PRIVATE
Map a multi-valued function over a column and return the results as set of
output columns.
Returns a Vector of Columns; any problems encountered (such as
`Column_Count_Exceeded` or duplicate names) are reported to the passed
`problem_builder` rather than returned.
Arguments:
- input_column: The column to transform.
- function: A function that transforms a single element of `input_column`
to multiple values.
- column_count: The number of columns to split to.
If `Nothing` then columns will be added to fit all data.
If the data exceeds the `column_count`, a `Column_Count_Exceeded` error
will follow the `on_problems` behavior.
- problem_builder: Collects problems for later reporting.
map_columns_to_multiple : Column -> (Any -> Vector Any) -> Integer | Nothing -> Problem_Builder -> Vector Column
map_columns_to_multiple input_column function column_count problem_builder =
num_rows = input_column.length
input_storage = input_column.java_column.getStorage
builders = case column_count of
Nothing ->
# Unlimited: grow the set of builders as wider outputs are seen.
builders = Vector.new_builder
0.up_to num_rows . each i->
input_value = input_storage.getItemBoxed i
output_values = function input_value
# Add more builders if necessary to accommodate `output_values`.
if output_values.length > builders.length then
num_builders_needed = output_values.length - builders.length
repeat_each num_builders_needed <|
builder = make_string_builder num_rows
# Pad the new builder with nulls
num_nulls_needed = i
builder.appendNulls num_nulls_needed
builders.append builder
## Add `output_values` to builders; if there are more builders
than `output_values`, pad with null.
0.up_to builders.length . each i->
builders.at i . appendNoGrow (output_values.get i Nothing)
builders.to_vector
_ : Integer ->
# Fixed width: excess output values are dropped and a problem reported.
builders = Vector.new column_count (_-> make_string_builder num_rows)
output_lengths = 0.up_to num_rows . map i->
input_value = input_storage.getItemBoxed i
output_values = function input_value
## Add `output_values` to builders; if there are more builders
than `output_values`, pad with null.
0.up_to builders.length . each i->
builders.at i . appendNoGrow (output_values.get i Nothing)
output_values.length
max_output_length = maximum output_lengths
if max_output_length > column_count then
problem = Column_Count_Exceeded.Error column_count max_output_length
problem_builder.report_other_warning problem
builders
# Build Columns, named "<input name>_<index>". Uniquification against the
# rest of the table happens later, in `rename_new_columns`.
builders.map .seal . map_with_index i-> storage->
name = input_column.name + "_" + i.to_text
Column.from_storage name storage
## PRIVATE
   Rename a vector of columns to be unique when added to a table.
   Any renames performed are reported to `problem_builder`.
rename_new_columns : Table -> Vector Column -> Problem_Builder -> Vector Column
rename_new_columns table columns problem_builder =
    strategy = Unique_Name_Strategy.new
    # Existing table columns reserve their names first.
    strategy.mark_used (table.columns.map .name)
    renamed = columns.map column->
        column.rename (strategy.make_unique column.name)
    problem_builder.report_unique_name_strategy strategy
    renamed
## PRIVATE
   Replace a single column in a table with new columns.
   Does not ensure names are unique; that must be done before calling this.
replace_column_with_columns : Table -> Column -> Vector Column -> Table
replace_column_with_columns table old_column new_columns =
    # Substitute the matching column with the whole replacement vector,
    # then flatten so the new columns sit in the original's position.
    substituted = table.columns.map c->
        if c.name == old_column.name then new_columns else [c]
    Table.new substituted.flatten
## PRIVATE
   Return the maximum value of the vector, or `Nothing` if the vector is
   empty.
maximum : Vector Any -> Any | Nothing
maximum vec = if vec.is_empty then Nothing else
    vec.reduce (a-> b-> a.max b)
## PRIVATE
   Wrap a function so that it returns `[]` if passed `Nothing`.
handle_nothing : (Any -> Any) -> (Any -> Any)
handle_nothing function = x->
    if x.is_nothing then [] else function x
## PRIVATE
Repeat a computation n times.
The `~` suspends `action`, so it is re-evaluated on every iteration rather
than once at the call site.
repeat_each n ~action = 0.up_to n . each _-> action

View File

@ -7,6 +7,7 @@ import project.In_Memory.Builders_Spec
import project.In_Memory.Column_Spec
import project.In_Memory.Common_Spec
import project.In_Memory.Join_Performance_Spec
import project.In_Memory.Split_Tokenize_Spec
import project.In_Memory.Table_Spec
import project.In_Memory.Table_Date_Spec
import project.In_Memory.Table_Date_Time_Spec
@ -22,5 +23,6 @@ spec =
Aggregate_Column_Spec.spec
Builders_Spec.spec
Join_Performance_Spec.spec
Split_Tokenize_Spec.spec
main = Test_Suite.run_main spec

View File

@ -0,0 +1,214 @@
from Standard.Base import all
import Standard.Base.Data.Text.Case_Sensitivity.Case_Sensitivity
import Standard.Test.Extensions
from Standard.Table import Table
from Standard.Table.Errors import Invalid_Value_Type, Column_Count_Exceeded, Duplicate_Output_Column_Names, No_Such_Column
from Standard.Test import Test, Test_Suite, Problems
from project.Util import all
# Tests for `Table.split_to_columns/rows` and `Table.tokenize_to_columns/rows`
# on in-memory tables. Uses `should_equal_verbose` from project.Util to
# compare full table contents (headers and column vectors).
spec =
# Basic splitting by a delimiter, including Nothing cells (which produce
# all-Nothing cells / no rows).
Test.group "Table.split" <|
Test.specify "can do split_to_columns" <|
cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
t = Table.new cols
expected_rows = [[0, "a", "c", Nothing], [1, "c", "d", "ef"], [2, "gh", "ij", "u"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
t2 = t.split_to_columns "bar" "b"
t2.should_equal_verbose expected
Test.specify "can do split_to_rows" <|
cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
t = Table.new cols
expected_rows = [[0, "a"], [0, "c"], [1, "c"], [1, "d"], [1, "ef"], [2, "gh"], [2, "ij"], [2, "u"]]
expected = Table.from_rows ["foo", "bar"] expected_rows
t2 = t.split_to_rows "bar" "b"
t2.should_equal_verbose expected
Test.specify "can do split_to_columns with some Nothings" <|
cols = [["foo", [0, 1, 2, 3]], ["bar", ["abc", "cbdbef", Nothing, "ghbijbu"]]]
t = Table.new cols
expected_rows = [[0, "a", "c", Nothing], [1, "c", "d", "ef"], [2, Nothing, Nothing, Nothing], [3, "gh", "ij", "u"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
t2 = t.split_to_columns "bar" "b"
t2.should_equal_verbose expected
Test.specify "can do split_to_rows with some Nothings" <|
cols = [["foo", [0, 1, 2, 3]], ["bar", ["abc", "cbdbef", Nothing, "ghbijbu"]]]
t = Table.new cols
expected_rows = [[0, "a"], [0, "c"], [1, "c"], [1, "d"], [1, "ef"], [3, "gh"], [3, "ij"], [3, "u"]]
expected = Table.from_rows ["foo", "bar"] expected_rows
t2 = t.split_to_rows "bar" "b"
t2.should_equal_verbose expected
# Regex tokenization, including marked groups (concatenated) and
# case-insensitive matching.
Test.group "Table.tokenize" <|
Test.specify "can do tokenize_to_columns" <|
cols = [["foo", [0, 1, 2]], ["bar", ["a12b34r5", "23", "2r4r55"]]]
t = Table.new cols
expected_rows = [[0, "12", "34", "5"], [1, "23", Nothing, Nothing], [2, "2", "4", "55"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
t2 = t.tokenize_to_columns "bar" "\d+"
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_rows" <|
cols = [["foo", [0, 1, 2]], ["bar", ["a12b34r5", "23", "2r4r55"]]]
t = Table.new cols
expected_rows = [[0, "12"], [0, "34"], [0, "5"], [1, "23"], [2, "2"], [2, "4"], [2, "55"]]
expected = Table.from_rows ["foo", "bar"] expected_rows
t2 = t.tokenize_to_rows "bar" "\d+"
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_columns with some nothings" <|
cols = [["foo", [0, 1, 2, 3]], ["bar", ["a12b34r5", Nothing, "23", "2r4r55"]]]
t = Table.new cols
expected_rows = [[0, "12", "34", "5"], [1, Nothing, Nothing, Nothing], [2, "23", Nothing, Nothing], [3, "2", "4", "55"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
t2 = t.tokenize_to_columns "bar" "\d+"
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_rows with some Nothings" <|
cols = [["foo", [0, 1, 2, 3]], ["bar", ["a12b34r5", Nothing, "23", "2r4r55"]]]
t = Table.new cols
expected_rows = [[0, "12"], [0, "34"], [0, "5"], [2, "23"], [3, "2"], [3, "4"], [3, "55"]]
expected = Table.from_rows ["foo", "bar"] expected_rows
t2 = t.tokenize_to_rows "bar" "\d+"
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_rows with some rows that have no matches" <|
cols = [["foo", [0, 1, 2, 3]], ["bar", ["a12b34r5", "23", "q", "2r4r55"]]]
t = Table.new cols
expected_rows = [[0, "12"], [0, "34"], [0, "5"], [1, "23"], [3, "2"], [3, "4"], [3, "55"]]
expected = Table.from_rows ["foo", "bar"] expected_rows
t2 = t.tokenize_to_rows "bar" "\d+"
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_columns with groups" <|
cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
t = Table.new cols
expected_rows = [[0, "a1", "b12", "d50"], [1, "b10", "c20", Nothing]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
t2 = t.tokenize_to_columns "bar" "([a-z]).(\d+)"
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_rows with groups" <|
cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
t = Table.new cols
expected_rows = [[0, "a1"], [0, "b12"], [0, "d50"], [1, "b10"], [1, "c20"]]
expected = Table.from_rows ["foo", "bar"] expected_rows
t2 = t.tokenize_to_rows "bar" "([a-z]).(\d+)"
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_columns case-insensitively" <|
cols = [["foo", [0, 1, 2]], ["bar", ["aBqcE", "qcBr", "cCb"]]]
t = Table.new cols
expected_rows = [[0, "B", "c", Nothing], [1, "c", "B", Nothing], [2, "c", "C", "b"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2"] expected_rows
t2 = t.tokenize_to_columns "bar" "[bc]" case_sensitivity=Case_Sensitivity.Insensitive
t2.should_equal_verbose expected
Test.specify "can do tokenize_to_rows case-insensitively" <|
cols = [["foo", [0, 1, 2]], ["bar", ["aBqcE", "qcBr", "cCb"]]]
t = Table.new cols
expected_rows = [[0, "B"], [0, "c"], [1, "c"], [1, "B"], [2, "c"], [2, "C"], [2, "b"]]
expected = Table.from_rows ["foo", "bar"] expected_rows
t2 = t.tokenize_to_rows "bar" "[bc]" case_sensitivity=Case_Sensitivity.Insensitive
t2.should_equal_verbose expected
# `column_count` behavior: padding with empty text columns, and truncation
# with a `Column_Count_Exceeded` problem.
Test.group "Table.split/tokenize column count" <|
Test.specify "should generate extra empty columns if column_count is set" <|
cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
t = Table.new cols
expected_rows = [[0, "a", "c", Nothing, Nothing], [1, "c", "d", "ef", Nothing], [2, "gh", "ij", "u", Nothing]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2", "bar_3"] expected_rows
t2 = t.split_to_columns "bar" "b" column_count=4
t2.should_equal_verbose expected
t2.at "bar_3" . value_type . is_text . should_be_true
Test.specify "split should limit columns and return problems when exceeding the column limit" <|
cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
t = Table.new cols
expected_rows = [[0, "a", "c"], [1, "c", "d"], [2, "gh", "ij"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1"] expected_rows
action = t.split_to_columns "bar" "b" column_count=2 on_problems=_
tester = t-> t.should_equal_verbose expected
problems = [Column_Count_Exceeded.Error 2 3]
Problems.test_problem_handling action problems tester
Test.specify "tokenize should limit columns and return problems when exceeding the column limit" <|
cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
t = Table.new cols
expected_rows = [[0, "a1", "b12", "d50"], [1, "b10", "c20", Nothing]]
expected = Table.from_rows ["foo", "bar_0", "bar_1"] expected_rows
action = t.tokenize_to_columns "bar" "([a-z]).(\d+)" column_count=2 on_problems=_
tester = t-> t.should_equal_verbose expected
problems = [Column_Count_Exceeded.Error 2 3]
Problems.test_problem_handling action problems tester
Test.specify "should generate extra empty columns if column_count is set (with rows in a different order)" <|
cols = [["foo", [0, 1, 2]], ["bar", ["ghbijbu", "cbdbef", "abc"]]]
t = Table.new cols
expected_rows = [[0, "gh", "ij", "u", Nothing], [1, "c", "d", "ef", Nothing], [2, "a", "c", Nothing, Nothing]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2", "bar_3"] expected_rows
t2 = t.split_to_columns "bar" "b" column_count=4
t2.should_equal_verbose expected
t2.at "bar_3" . value_type . is_text . should_be_true
# Error conditions: non-text input columns and missing columns.
Test.group "Table.split/tokenize errors" <|
Test.specify "won't work on a non-text column" <|
cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
t = Table.new cols
t.split_to_columns "foo" "x" . should_fail_with Invalid_Value_Type
t.split_to_rows "foo" "x" . should_fail_with Invalid_Value_Type
t.tokenize_to_columns "foo" "x" . should_fail_with Invalid_Value_Type
t.tokenize_to_rows "foo" "x" . should_fail_with Invalid_Value_Type
Test.specify "won't work on a mixed column" <|
cols = [["foo", [0, 1]], ["bar", [500, "ab-10:bc-20c"]]]
t = Table.new cols
t.split_to_columns "bar" "x" . should_fail_with Invalid_Value_Type
t.split_to_rows "bar" "x" . should_fail_with Invalid_Value_Type
t.tokenize_to_columns "bar" "x" . should_fail_with Invalid_Value_Type
t.tokenize_to_rows "bar" "x" . should_fail_with Invalid_Value_Type
Test.specify "*_to_columns handles missing input column" <|
cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
t = Table.new cols
t.tokenize_to_columns "invalid_name" "([a-z]).(\d+)" . should_fail_with No_Such_Column
Test.specify "*_to_rows handles missing input column" <|
cols = [["foo", [0, 1]], ["bar", ["r a-1, b-12,qd-50", "ab-10:bc-20c"]]]
t = Table.new cols
t.tokenize_to_rows "invalid_name" "([a-z]).(\d+)" . should_fail_with No_Such_Column
# New column names are uniquified against existing columns, with a
# `Duplicate_Output_Column_Names` problem reported.
Test.group "Table.split/tokenize name conflicts" <|
Test.specify "split will make column names unique" <|
cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]], ["bar_1", ["a", "b", "c"]]]
t = Table.new cols
expected_rows = [[0, "a", "c", Nothing, "a"], [1, "c", "d", "ef", "b"], [2, "gh", "ij", "u", "c"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1_1", "bar_2", "bar_1"] expected_rows
action = t.split_to_columns "bar" "b" on_problems=_
tester = t-> t.should_equal_verbose expected
problems = [Duplicate_Output_Column_Names.Error ["bar_1"]]
Problems.test_problem_handling action problems tester
Test.specify "tokenize will make column names unique" <|
cols = [["foo", [0, 1, 2]], ["bar", ["a12b34r5", "23", "2r4r55"]], ["bar_1", ["a", "b", "c"]]]
t = Table.new cols
expected_rows = [[0, "12", "34", "5", "a"], [1, "23", Nothing, Nothing, "b"], [2, "2", "4", "55", "c"]]
expected = Table.from_rows ["foo", "bar_0", "bar_1_1", "bar_2", "bar_1"] expected_rows
action = t.tokenize_to_columns "bar" "\d+" on_problems=_
tester = t-> t.should_equal_verbose expected
problems = [Duplicate_Output_Column_Names.Error ["bar_1"]]
Problems.test_problem_handling action problems tester
# The new columns replace the input column in its original position.
Test.group "Table.split/tokenize column order" <|
Test.specify "preserves column order" <|
cols = [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]], ["baz", [1, 2, 3]]]
t = Table.new cols
expected_rows = [[0, "a", "c", Nothing, 1], [1, "c", "d", "ef", 2], [2, "gh", "ij", "u", 3]]
expected = Table.from_rows ["foo", "bar_0", "bar_1", "bar_2", "baz"] expected_rows
t2 = t.split_to_columns "bar" "b"
t2.should_equal_verbose expected
# Entry point so this spec file can be run standalone.
main = Test_Suite.run_main spec

View File

@ -13,6 +13,16 @@ Table.should_equal self expected =
self_cols.map .name . should_equal (that_cols.map .name) frames_to_skip=1
self_cols.map .to_vector . should_equal (that_cols.map .to_vector) frames_to_skip=1
## Compares two tables and fails with a message showing both tables'
   full contents when they differ (headers or column data).
Table.should_equal_verbose self expected =
    names_of t = t.columns.map .name
    cells_of t = t.columns.map .to_vector
    matches = (names_of self == names_of expected) && (cells_of self == cells_of expected)
    if matches.not then
        msg = 'Tables differ.\nActual:\n' + self.display + '\nExpected:\n' + expected.display
        Test.fail msg
Column.should_equal self expected =
if self.name != expected.name then
Test.fail "Expected column name "+expected.name+", but got "+self.name+"."