Add Table.expand_to_rows to allow flattening vector and array values in table (#8042)

# Important Notes
Also includes a fix for a reallocation bug in `InferredBuilder`.
This commit is contained in:
GregoryTravis 2023-10-13 16:54:06 -04:00 committed by GitHub
parent b7d7910a88
commit f18d1323e1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 538 additions and 284 deletions

View File

@ -585,6 +585,7 @@
- [Implemented `Table.lookup_and_replace` for the in-memory backend.][7979] - [Implemented `Table.lookup_and_replace` for the in-memory backend.][7979]
- [Added `Column_Operation` to `Table.set` allowing for more streamlined flow of - [Added `Column_Operation` to `Table.set` allowing for more streamlined flow of
deriving column values in the GUI.][8005] deriving column values in the GUI.][8005]
- [Implemented `Table.expand_to_rows` for the in-memory backend.][8029]
[debug-shortcuts]: [debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -833,6 +834,7 @@
[7947]: https://github.com/enso-org/enso/pull/7947 [7947]: https://github.com/enso-org/enso/pull/7947
[7979]: https://github.com/enso-org/enso/pull/7979 [7979]: https://github.com/enso-org/enso/pull/7979
[8005]: https://github.com/enso-org/enso/pull/8005 [8005]: https://github.com/enso-org/enso/pull/8005
[8029]: https://github.com/enso-org/enso/pull/8029
#### Enso Compiler #### Enso Compiler

View File

@ -1999,6 +1999,43 @@ type Table
_ = [column, fields, prefix] _ = [column, fields, prefix]
Error.throw (Unsupported_Database_Operation.Error "Table.expand_column is currently not implemented for the Database backend. You may download the table to memory using `.read` to use this feature.") Error.throw (Unsupported_Database_Operation.Error "Table.expand_column is currently not implemented for the Database backend. You may download the table to memory using `.read` to use this feature.")
## GROUP Standard.Base.Conversions
   Expand aggregate values in a column to separate rows.

   For each value in the specified column, if it is an aggregate (`Vector`,
   `Range`, etc.), expand it to multiple rows, duplicating the values in the
   other columns.

   This operation is not yet implemented for the Database backend: calling it
   always raises `Unsupported_Database_Operation`. Download the table to
   memory using `.read` to use this feature.

   Arguments:
   - column: The column to expand.
   - at_least_one_row: for an empty aggregate value, if `at_least_one_row` is
     true, a single row is output with `Nothing` for the aggregates column; if
     false, no row is output at all.

   The following aggregate values are supported:
   - `Array`
   - `Vector`
   - `List`
   - `Range`
   - `Date_Range`
   - `Pair`

   Any other values are treated as non-aggregate values, and their rows are kept
   unchanged.

   In in-memory tables, it is permitted to mix values of different types.

   > Example
     Expand a column of integer `Vectors` to a column of `Integer`

         table = Table.new [["aaa", [1, 2]], ["bbb", [[30, 31], [40, 41]]]]
         table.expand_to_rows "bbb"
         # => Table.new [["aaa", [1, 1, 2, 2]], ["bbb", [30, 31, 40, 41]]]
@column Widget_Helpers.make_column_name_selector
expand_to_rows : Text | Integer -> Boolean -> Table ! Type_Error | No_Such_Column | Index_Out_Of_Bounds
expand_to_rows self column at_least_one_row=False =
    # The arguments are referenced only so that they do not count as unused.
    _ = [column, at_least_one_row]
    Error.throw (Unsupported_Database_Operation.Error "Table.expand_to_rows is currently not implemented for the Database backend. You may download the table to memory using `.read` to use this feature.")
## GROUP Standard.Base.Conversions ## GROUP Standard.Base.Conversions
Cast the selected columns to a specific type. Cast the selected columns to a specific type.

View File

@ -16,12 +16,20 @@ type Convertible_To_Rows
- getter: Get the value for a specified row. - getter: Get the value for a specified row.
Value length:Integer (getter : Integer->Any) Value length:Integer (getter : Integer->Any)
## PRIVATE
   Collect every row value into a `Vector` by calling `getter` for each
   index from 0 (inclusive) up to `length` (exclusive).
to_vector : Vector Any
to_vector self =
    row_indices = 0.up_to self.length
    row_indices.map self.getter
## PRIVATE ## PRIVATE
Convertible_To_Rows.from that:Vector = Convertible_To_Rows.Value that.length that.get Convertible_To_Rows.from that:Vector = Convertible_To_Rows.Value that.length that.get
## PRIVATE ## PRIVATE
Convertible_To_Rows.from that:Array = Convertible_To_Rows.Value that.length that.get Convertible_To_Rows.from that:Array = Convertible_To_Rows.Value that.length that.get
## PRIVATE
   A `List` is first converted to a `Vector`, which is then converted in turn.
Convertible_To_Rows.from that:List =
    as_vector = that.to_vector
    Convertible_To_Rows.from as_vector
## PRIVATE ## PRIVATE
Convertible_To_Rows.from that:Range = Convertible_To_Rows.Value that.length that.get Convertible_To_Rows.from that:Range = Convertible_To_Rows.Value that.length that.get

View File

@ -1189,6 +1189,42 @@ type Table
expand_column self (column : Text | Integer) (fields : Vector | Nothing = Nothing) (prefix : Text | Nothing = Nothing) = expand_column self (column : Text | Integer) (fields : Vector | Nothing = Nothing) (prefix : Text | Nothing = Nothing) =
Expand_Objects_Helpers.expand_column self column fields prefix Expand_Objects_Helpers.expand_column self column fields prefix
## GROUP Standard.Base.Conversions
   Expand aggregate values in a column to separate rows.

   For each value in the specified column, if it is an aggregate (`Vector`,
   `Range`, etc.), expand it to multiple rows, duplicating the values in the
   other columns.

   Arguments:
   - column: The column to expand.
   - at_least_one_row: for an empty aggregate value, if `at_least_one_row` is
     true, a single row is output with `Nothing` for the aggregates column; if
     false, no row is output at all.

   The following aggregate values are supported:
   - `Array`
   - `Vector`
   - `List`
   - `Range`
   - `Date_Range`
   - `Pair`

   Any other values are treated as non-aggregate values, and their rows are kept
   unchanged.

   In in-memory tables, it is permitted to mix values of different types.

   > Example
     Expand a column of integer `Vectors` to a column of `Integer`

         table = Table.new [["aaa", [1, 2]], ["bbb", [[30, 31], [40, 41]]]]
         table.expand_to_rows "bbb"
         # => Table.new [["aaa", [1, 1, 2, 2]], ["bbb", [30, 31, 40, 41]]]
@column Widget_Helpers.make_column_name_selector
expand_to_rows : Text | Integer -> Boolean -> Table ! Type_Error | No_Such_Column | Index_Out_Of_Bounds
expand_to_rows self column at_least_one_row=False =
    # The work is done by the shared helper in `Expand_Objects_Helpers`.
    Expand_Objects_Helpers.expand_to_rows self column at_least_one_row
## ALIAS filter rows ## ALIAS filter rows
GROUP Standard.Base.Selections GROUP Standard.Base.Selections

View File

@ -1,6 +1,5 @@
from Standard.Base import all from Standard.Base import all
import Standard.Base.Data.Text.Regex.Regex_Syntax_Error import Standard.Base.Data.Text.Regex.Regex_Syntax_Error
import Standard.Base.Errors.Common.Index_Out_Of_Bounds
import Standard.Base.Errors.Common.Type_Error import Standard.Base.Errors.Common.Type_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import Standard.Base.Errors.Unimplemented.Unimplemented import Standard.Base.Errors.Unimplemented.Unimplemented
@ -9,7 +8,6 @@ from Standard.Base.Metadata import make_single_choice
import project.Data.Match_Columns.Match_Columns import project.Data.Match_Columns.Match_Columns
import project.Data.Table.Table import project.Data.Table.Table
import project.Errors.Invalid_JSON_Format import project.Errors.Invalid_JSON_Format
import project.Errors.No_Such_Column
import project.Internal.Expand_Objects_Helpers import project.Internal.Expand_Objects_Helpers
import project.Internal.Parse_To_Table import project.Internal.Parse_To_Table
import project.Internal.Widget_Helpers import project.Internal.Widget_Helpers

View File

@ -1,11 +1,17 @@
from Standard.Base import all from Standard.Base import all
import Standard.Base.Errors.Common.Index_Out_Of_Bounds
import Standard.Base.Errors.Common.Type_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import project.Data.Table.Table import project.Data.Table.Table
import project.Data.Column.Column import project.Data.Column.Column
import project.Data.Conversions.Convertible_To_Columns.Convertible_To_Columns import project.Data.Conversions.Convertible_To_Columns.Convertible_To_Columns
import project.Data.Conversions.Convertible_To_Rows.Convertible_To_Rows import project.Data.Conversions.Convertible_To_Rows.Convertible_To_Rows
import project.Errors.No_Such_Column
import project.Internal.Fan_Out
import project.Internal.Java_Exports import project.Internal.Java_Exports
from project.Internal.Java_Exports import make_inferred_builder
## PRIVATE ## PRIVATE
expand_column : Table -> (Text | Integer) -> ((Vector Text) | Nothing) -> (Text | Nothing) -> Table expand_column : Table -> (Text | Integer) -> ((Vector Text) | Nothing) -> (Text | Nothing) -> Table
@ -29,6 +35,45 @@ expand_column table column fields prefix =
Table.new output_builder.to_vector Table.new output_builder.to_vector
## GROUP Standard.Base.Conversions
   Expand aggregate values in a column to separate rows.

   For each value in the specified column, if it is an aggregate (`Vector`,
   `Range`, etc.), expand it to multiple rows, duplicating the values in the
   other columns.

   Arguments:
   - table: The table whose column is expanded.
   - column: The column to expand, given by name or by index. An index is
     resolved to the column's name before the expansion runs.
   - at_least_one_row: for an empty aggregate value, if `at_least_one_row` is
     true, a single row is output with `Nothing` for the aggregates column; if
     false, no row is output at all.

   The following aggregate values are supported:
   - `Array`
   - `Vector`
   - `List`
   - `Range`
   - `Date_Range`
   - `Pair`

   Any other values are treated as non-aggregate values, and their rows are kept
   unchanged.

   In in-memory tables, it is permitted to mix values of different types.

   > Example
     Expand a column of integer `Vectors` to a column of `Integer`

         table = Table.new [["aaa", [1, 2]], ["bbb", [[30, 31], [40, 41]]]]
         table.expand_to_rows "bbb"
         # => Table.new [["aaa", [1, 1, 2, 2]], ["bbb", [30, 31, 40, 41]]]
@column Widget_Helpers.make_column_name_selector
expand_to_rows : Table -> Text | Integer -> Boolean -> Table ! Type_Error | No_Such_Column | Index_Out_Of_Bounds
expand_to_rows table column at_least_one_row=False = case column of
    _ : Integer ->
        # The fan-out helper uses the selector as the output column name, so
        # an index has to be turned into the column's name first. If the
        # index is invalid, the error raised by `table.at` is what is returned.
        expand_to_rows table (table.at column).name at_least_one_row
    _ ->
        row_expander : Any -> Vector
        row_expander value:Convertible_To_Rows = value.to_vector

        Fan_Out.fan_out_to_rows table column row_expander at_least_one_row column_builder=make_inferred_builder
## PRIVATE ## PRIVATE
create_table_from_objects : Any -> ((Vector Text) | Nothing) -> Table create_table_from_objects : Any -> ((Vector Text) | Nothing) -> Table
create_table_from_objects (value : Convertible_To_Rows) (fields : Vector | Nothing) = if fields.is_nothing.not && fields.is_empty then Error.throw (Illegal_Argument.Error "The fields parameter cannot be empty.") else create_table_from_objects (value : Convertible_To_Rows) (fields : Vector | Nothing) = if fields.is_nothing.not && fields.is_empty then Error.throw (Illegal_Argument.Error "The fields parameter cannot be empty.") else

View File

@ -0,0 +1,281 @@
from Standard.Base import all
import project.Data.Column.Column
import project.Data.Table.Table
import project.Data.Type.Value_Type.Value_Type
import project.Internal.Problem_Builder.Problem_Builder
from project.Errors import Column_Count_Exceeded, Column_Count_Mismatch
from project.Internal.Java_Exports import make_string_builder
polyglot java import org.enso.table.data.mask.OrderMask
## PRIVATE
   Replace one column of a table with a set of columns. Every element of the
   input column is passed to `function`, which returns a vector of output
   values; those values are spread across the new columns, which take the
   place of the original column.

   Arguments:
   - table: The table to transform.
   - input_column_id: The name or index of the column to transform.
   - function: A function that transforms a single element of the input
     column to multiple values.
   - column_count: The number of columns to produce, or `Nothing` to add as
     many columns as the data needs.
   - column_builder: A function that, given a size, creates the builder used
     to accumulate each output column.
   - on_problems: Specifies the behavior when a problem occurs.
fan_out_to_columns : Table -> Text | Integer -> (Any -> Vector Any) -> Integer | Nothing -> (Integer -> Any) -> Problem_Behavior -> Table | Nothing
fan_out_to_columns table input_column_id function column_count=Nothing column_builder=make_string_builder on_problems=Report_Error =
    source_column = table.get input_column_id
    problems = Problem_Builder.new

    # Build the raw output columns, then make their names unique in the table.
    raw_columns = map_columns_to_multiple source_column function column_count column_builder=column_builder problem_builder=problems
    unique_columns = rename_new_columns table source_column.name raw_columns problems

    result = replace_column_with_columns table source_column unique_columns
    problems.attach_problems_after on_problems result
## PRIVATE
   Transform a column by applying the given function to the values in the
   column. The function produces multiple outputs, so each row is duplicated,
   with each row getting a distinct output value in place of the original
   input value. The other column values are just duplicated.

   Arguments:
   - table: The table to transform.
   - input_column_id: The name or index of the column to transform. An index
     is resolved to the column's name before the transformation runs.
   - function: A function that transforms a single element of the input
     column to multiple values.
   - at_least_one_row: When true, if the function returns an empty list, a
     single row is output with `Nothing` for the transformed column. If false,
     the row is not output at all.
   - column_builder: A function that, given an initial size, creates the
     builder used to accumulate the output column.
   - on_problems: Specifies the behavior when a problem occurs.
fan_out_to_rows : Table -> Text | Integer -> (Any -> Vector Any) -> Boolean -> (Integer -> Any) -> Problem_Behavior -> Table
fan_out_to_rows table input_column_id function at_least_one_row=False column_builder=make_string_builder on_problems=Report_Error = case input_column_id of
    _ : Integer ->
        # The selector doubles as the name of the output column below, so an
        # index must be turned into the column's name first. If the index is
        # invalid, the error raised by `table.at` is what is returned.
        fan_out_to_rows table (table.at input_column_id).name function at_least_one_row column_builder on_problems
    _ ->
        # Treat this as a special case of fan_out_to_rows_and_columns, with
        # one column. Wrap the provided function to convert each value to a
        # singleton `Vector`.
        wrapped_function x = function x . map y-> [y]
        column_names = [input_column_id]
        fan_out_to_rows_and_columns table input_column_id wrapped_function column_names at_least_one_row=at_least_one_row column_builder=column_builder on_problems=on_problems
## PRIVATE
   Transform a column by applying the given function to the values in the
   column. The function returns a `Vector` of `Vectors`. Each inner vector turns
   into multiple new columns in a single row. Each inner vector within the outer
   vector produces an output row, so each row is duplicated, with each row
   getting a distinct set of output values in place of the original input value.
   The other column values are just duplicated.

   ! Error Conditions

     The inner vectors should all have the same number of values, which should
     match the provided `column_names`. If a value is too short, it will be
     padded with Nothing, and if it is too long, it will be truncated. In either
     case, Column_Count_Mismatch will be added as a warning. (It is expected
     that the caller of this private method will ensure that the provided
     function will produce inner vectors of the correct length, but we check for
     it anyway.)

   > Example
     f("12 34 56") -> [[1, 2], [3, 4], [5, 6]]

     foo | bar       | baz
     ----+-----------+----
     x   | 12 34 56  | y    ===>
     ... | ...       | ...

     foo | bar 1 | bar 2 | baz
     ----+-------+-------+----
     x   | 1     | 2     | y
     x   | 3     | 4     | y
     x   | 5     | 6     | y
     ... | ...   | ...   | ...

   Arguments:
   - table: The table to transform.
   - input_column_id: The name or index of the column to transform.
   - function: A function that transforms a single element of the input
     column to a `Vector` of `Vector` of values.
   - column_names: The names for the generated columns.
   - at_least_one_row: When true, if the function returns an empty vector, a
     single row is output with `Nothing` in every generated column. If false,
     the row is not output at all.
   - column_builder: A function that, given an initial size, creates the
     builder used to accumulate each generated column.
   - on_problems: Specifies the behavior when a problem occurs.
fan_out_to_rows_and_columns : Table -> Text | Integer -> (Any -> Vector (Vector Any)) -> Vector Text -> Boolean -> (Integer -> Any) -> Problem_Behavior -> Table
fan_out_to_rows_and_columns table input_column_id function column_names at_least_one_row=False column_builder=make_string_builder on_problems=Report_Error =
    problem_builder = Problem_Builder.new
    unique = table.column_naming_helper.create_unique_name_strategy

    input_column = table.at input_column_id
    input_storage = input_column.java_column.getStorage
    num_input_rows = input_storage.size

    num_output_columns = column_names.length

    # Guess that most of the time, we'll get at least one value for each input.
    initial_size = input_column.length
    # Accumulates the outputs of the function.
    output_column_builders = Vector.new num_output_columns _-> column_builder initial_size
    # Accumulates repeated position indices for the order mask.
    order_mask_positions = Vector.new_builder initial_size

    maybe_add_empty_row vecs =
        should_add_empty_row = vecs.is_empty && at_least_one_row
        if should_add_empty_row.not then vecs else
            empty_row = Vector.fill num_output_columns Nothing
            [empty_row]

    0.up_to num_input_rows . each i->
        input_value = input_storage.getItemBoxed i
        output_values = function input_value |> maybe_add_empty_row
        # Append each group of values to the builder.
        output_values.each row_unchecked->
            row = uniform_length num_output_columns row_unchecked problem_builder
            row.each_with_index column_index-> v-> output_column_builders.at column_index . append v
        # Append n copies of the input row position, n = # of output values.
        repeat_each output_values.length <| order_mask_positions.append i

    # Reserve the non-input column names that will not be changing.
    non_input_columns = table.columns.filter c-> c.name != input_column.name
    unique.mark_used <| non_input_columns.map .name

    # Build the output columns.
    output_storages = output_column_builders.map .seal
    output_columns = output_storages.map_with_index i-> output_storage->
        column_name = unique.make_unique <| column_names.at i
        Column.from_storage column_name output_storage

    # Build the order mask.
    order_mask = OrderMask.new (order_mask_positions.to_vector)

    # Build the new table, replacing the input column with the new output
    # columns. The comparison uses the resolved column's name rather than
    # `input_column_id`, because the selector may be an index, which never
    # equals a column name.
    new_columns_unflattened = table.columns.map column->
        case column.name == input_column.name of
            True ->
                # Replace the input column with the output columns.
                output_columns
            False ->
                # Build a new column from the old one with the mask.
                old_storage = column.java_column.getStorage
                new_storage = old_storage.applyMask order_mask
                [Column.from_storage column.name new_storage]
    new_columns = new_columns_unflattened.flatten

    new_table = Table.new new_columns
    problem_builder.attach_problems_after on_problems new_table
## PRIVATE
   Map a multi-valued function over a column and return the results as a set
   of output columns.

   Returns a `Vector` of `Column`s. Problems are not returned; they are
   reported to the supplied `problem_builder`.

   Arguments:
   - input_column: The column to transform.
   - function: A function that transforms a single element of `input_column`
     to multiple values.
   - column_count: The number of columns to split to.
     If `Nothing` then columns will be added to fit all data.
     If the data exceeds the `column_count`, a `Column_Count_Exceeded` warning
     is reported to `problem_builder`.
   - column_builder: A function that, given a size, creates the builder used
     to accumulate each output column.
   - problem_builder: The `Problem_Builder` that collects reported problems.
map_columns_to_multiple : Column -> (Any -> Vector Any) -> Integer | Nothing -> (Integer -> Any) -> Problem_Builder -> Vector Column
map_columns_to_multiple input_column function column_count column_builder=make_string_builder problem_builder =
    num_rows = input_column.length
    input_storage = input_column.java_column.getStorage

    builders = case column_count of
        Nothing ->
            builders = Vector.new_builder

            0.up_to num_rows . each i->
                input_value = input_storage.getItemBoxed i
                output_values = function input_value

                # Add more builders if necessary to accommodate `output_values`.
                if output_values.length > builders.length then
                    num_builders_needed = output_values.length - builders.length
                    repeat_each num_builders_needed <|
                        builder = column_builder num_rows

                        # Pad the new builder with one null per row already
                        # processed, so it lines up with the older builders.
                        num_nulls_needed = i
                        builder.appendNulls num_nulls_needed

                        builders.append builder

                # Add `output_values` to builders; if there are more builders
                # than `output_values`, pad with null.
                0.up_to builders.length . each j->
                    builders.at j . appendNoGrow (output_values.get j Nothing)

            builders.to_vector

        _ : Integer ->
            builders = Vector.new column_count (_-> column_builder num_rows)

            output_lengths = 0.up_to num_rows . map i->
                input_value = input_storage.getItemBoxed i
                output_values = function input_value

                # Add `output_values` to builders; if there are more builders
                # than `output_values`, pad with null.
                0.up_to builders.length . each j->
                    builders.at j . appendNoGrow (output_values.get j Nothing)

                output_values.length

            # `maximum` yields `Nothing` for a column without rows; in that
            # case nothing can have exceeded `column_count`.
            max_output_length = maximum output_lengths

            if max_output_length.is_nothing.not && (max_output_length > column_count) then
                problem = Column_Count_Exceeded.Error column_count max_output_length
                problem_builder.report_other_warning problem

            builders

    # Name columns. If there's only one, use the original column name.
    new_column_names = case builders.length of
        1 -> [input_column.name]
        _ -> 0.up_to builders.length . map i-> input_column.name + " " + (i+1).to_text

    # Build Columns.
    storages = builders.map .seal
    new_column_names.zip storages Column.from_storage
## PRIVATE
   Give each of the new columns a name that does not clash with the columns
   that stay in the table (every column except the one being removed).
   The naming strategy is reported to `problem_builder` afterwards.
rename_new_columns : Table -> Text -> Vector Column -> Problem_Builder -> Vector Column
rename_new_columns table removed_column_name columns problem_builder =
    naming = table.column_naming_helper.create_unique_name_strategy

    # Names of the surviving columns are reserved before any new name is made.
    kept_columns = table.columns.filter existing-> existing.name != removed_column_name
    naming.mark_used (kept_columns.map .name)

    renamed = columns.map col-> col.rename (naming.make_unique col.name)
    problem_builder.report_unique_name_strategy naming
    renamed
## PRIVATE
   Build a new table in which `old_column` (matched by name) is replaced, in
   place, by `new_columns`.
   Names are not made unique here; the caller must do that beforehand.
replace_column_with_columns : Table -> Column -> Vector Column -> Table
replace_column_with_columns table old_column new_columns =
    # Each existing column becomes a group: either itself or its replacements.
    column_groups = table.columns.map existing->
        if existing.name == old_column.name then new_columns else [existing]
    Table.new column_groups.flatten
## PRIVATE
   Return the maximum value of the vector.
   Returns `Nothing` if the vector is empty.
maximum : Vector Any -> Any | Nothing
maximum vec = if vec.is_empty then Nothing else
    vec.reduce (a-> b-> a.max b)
## PRIVATE
   Evaluate the suspended `action` once for every integer from 0 (inclusive)
   up to `n` (exclusive).
repeat_each : Integer -> Any -> Any
repeat_each n ~action =
    iterations = 0.up_to n
    iterations.each _-> action
## PRIVATE
   Bring a vector to exactly `target_length` elements. A shorter vector is
   padded with `Nothing`, a longer one is truncated; either adjustment is
   reported to `problem_builder` as a `Column_Count_Mismatch` warning. A
   vector that already has the right length is returned as is.
uniform_length : Integer -> Vector Any -> Problem_Builder -> Vector Any
uniform_length target_length v problem_builder =
    actual_length = v.length
    case actual_length == target_length of
        True -> v
        False ->
            problem_builder.report_other_warning (Column_Count_Mismatch.Error target_length actual_length)
            if actual_length < target_length then v.pad target_length Nothing else v.take target_length

View File

@ -1,12 +1,8 @@
from Standard.Base import all from Standard.Base import all
import project.Data.Column.Column
import project.Data.Table.Table import project.Data.Table.Table
import project.Data.Type.Value_Type.Value_Type import project.Data.Type.Value_Type.Value_Type
import project.Internal.Problem_Builder.Problem_Builder from project.Internal.Fan_Out import all
from project.Errors import Column_Count_Exceeded, Column_Count_Mismatch, Duplicate_Output_Column_Names, Invalid_Value_Type, Missing_Input_Columns
from project.Internal.Java_Exports import make_string_builder
polyglot java import org.enso.table.data.mask.OrderMask polyglot java import org.enso.table.data.mask.OrderMask
@ -17,7 +13,7 @@ split_to_columns : Table -> Text | Integer -> Text -> Integer | Nothing -> Probl
split_to_columns table input_column_id delimiter="," column_count=Nothing on_problems=Report_Error = split_to_columns table input_column_id delimiter="," column_count=Nothing on_problems=Report_Error =
column = table.at input_column_id column = table.at input_column_id
Value_Type.expect_text column <| Value_Type.expect_text column <|
fan_out_to_columns table input_column_id (handle_nothing (_.split delimiter)) column_count on_problems fan_out_to_columns table input_column_id (handle_nothing (_.split delimiter)) column_count on_problems=on_problems
## PRIVATE ## PRIVATE
Splits a column of text into a set of new rows. Splits a column of text into a set of new rows.
@ -36,7 +32,7 @@ tokenize_to_columns : Table -> Text | Integer -> Text -> Case_Sensitivity -> Int
tokenize_to_columns table input_column_id pattern case_sensitivity column_count on_problems = tokenize_to_columns table input_column_id pattern case_sensitivity column_count on_problems =
column = table.at input_column_id column = table.at input_column_id
Value_Type.expect_text column Value_Type.expect_text column
fan_out_to_columns table input_column_id (handle_nothing (_.tokenize pattern case_sensitivity)) column_count on_problems fan_out_to_columns table input_column_id (handle_nothing (_.tokenize pattern case_sensitivity)) column_count on_problems=on_problems
## PRIVATE ## PRIVATE
Tokenizes a column of text into a set of new rows using a regular Tokenizes a column of text into a set of new rows using a regular
@ -102,285 +98,13 @@ regex_to_column_names pattern original_column_name =
case group_nums_to_names.get (i+1) of case group_nums_to_names.get (i+1) of
Nothing -> Nothing ->
suffix = group_number_to_column_name_suffix.at (i+1) suffix = group_number_to_column_name_suffix.at (i+1)
default_column_namer original_column_name suffix original_column_name + " " + (suffix+1).to_text
name : Text -> name : Text ->
name name
## PRIVATE
Transform a table by transforming a column into a set of columns. Takes a
function that maps a single element of the input column to a vector of output
values. The original column is replaced by the new columns.
Arguments:
- table: The table to transform.
- input_column: The column to transform.
- function: A function that transforms a single element of `input_column`
to multiple values.
fan_out_to_columns : Table -> Text | Integer -> (Any -> Vector Any) -> Integer | Nothing -> Problem_Behavior -> Table | Nothing
fan_out_to_columns table input_column_id function column_count=Nothing on_problems=Report_Error =
input_column = table.get input_column_id
problem_builder = Problem_Builder.new
new_columns_unrenamed = map_columns_to_multiple input_column function column_count problem_builder
new_columns = rename_new_columns table input_column.name new_columns_unrenamed problem_builder
new_table = replace_column_with_columns table input_column new_columns
problem_builder.attach_problems_after on_problems new_table
## PRIVATE
Transform a column by applying the given function to the values in the
column. The function produces multiple outputs, so each row is duplicated,
with each row getting a distinct output value in place of the original
input value. The other column values are just duplicated.
Arguments:
- table: The table to transform.
- input_column: The column to transform.
- function: A function that transforms a single element of `input_column`
to multiple values.
fan_out_to_rows : Table -> Text | Integer -> (Any -> Vector Any) -> Boolean -> Problem_Behavior -> Table
fan_out_to_rows table input_column_id function at_least_one_row=False on_problems=Report_Error =
## Treat this as a special case of fan_out_to_rows_and_columns, with one
column. Wrap the provided function to convert each value to a singleton
`Vector`.
wrapped_function x = function x . map y-> [y]
column_names = [input_column_id]
fan_out_to_rows_and_columns table input_column_id wrapped_function column_names at_least_one_row=at_least_one_row on_problems=on_problems
## PRIVATE
   Transform a column by applying the given function to the values in the
   column. The function returns a `Vector` of `Vectors`. Each inner vector turns
   into multiple new columns in a single row. Each inner vector within the outer
   vector produces an output row, so each row is duplicated, with each row
   getting a distinct set of output values in place of the original input value.
   The other column values are just duplicated.

   ! Error Conditions

     The inner vectors should all have the same number of values, which should
     match the provided `column_names`. If a value is too short, it will be
     padded with Nothing, and if it is too long, it will be truncated. In either
     case, Column_Count_Mismatch will be added as a warning. (It is expected
     that the caller of this private method will ensure that the provided
     function will produce inner vectors of the correct length, but we check for
     it anyway.)

   > Example

     f("12 34 56") -> [[1, 2], [3, 4], [5, 6]]

     foo | bar      | baz
     ----+----------+----
     x   | 12 34 56 | y     ===>
     ... | ...      | ...

     foo | bar 1 | bar 2 | baz
     ----+-------+-------+----
     x   | 1     | 2     | y
     x   | 3     | 4     | y
     x   | 5     | 6     | y
     ... | ...   | ...   | ...

   Arguments:
   - table: The table to transform.
   - input_column_id: The name or index of the column to transform.
   - function: A function that transforms a single element of the input
     column to a `Vector` of `Vector` of values.
   - column_names: The names for the generated columns.
   - at_least_one_row: If True, an input value for which `function` returns
     no rows still yields one output row, with `Nothing` in every generated
     column. If False, such input rows are absent from the result.
   - on_problems: Specifies the behavior when a problem occurs.
fan_out_to_rows_and_columns : Table -> Text | Integer -> (Any -> Vector (Vector Any)) -> Vector Text -> Boolean -> Problem_Behavior -> Table
fan_out_to_rows_and_columns table input_column_id function column_names at_least_one_row=False on_problems=Report_Error =
    problem_builder = Problem_Builder.new
    unique = table.column_naming_helper.create_unique_name_strategy

    input_column = table.at input_column_id
    input_storage = input_column.java_column.getStorage
    num_input_rows = input_storage.size
    num_output_columns = column_names.length

    # Guess that most of the time, we'll get at least one value for each input.
    initial_size = input_column.length
    # Accumulates the outputs of the function.
    output_column_builders = Vector.new num_output_columns _-> make_string_builder initial_size
    # Accumulates repeated position indices for the order mask.
    order_mask_positions = Vector.new_builder initial_size

    # Substitutes a single all-`Nothing` row for an empty result, if requested.
    maybe_add_empty_row vecs =
        should_add_empty_row = vecs.is_empty && at_least_one_row
        if should_add_empty_row.not then vecs else
            empty_row = Vector.fill num_output_columns Nothing
            [empty_row]

    0.up_to num_input_rows . each row_index->
        input_value = input_storage.getItemBoxed row_index
        output_values = function input_value |> maybe_add_empty_row
        # Append each group of values to the builder.
        output_values.each row_unchecked->
            row = uniform_length num_output_columns row_unchecked problem_builder
            row.each_with_index column_index-> v-> output_column_builders.at column_index . append v
        # Append n copies of the input row position, n = # of output values.
        repeat_each output_values.length <| order_mask_positions.append row_index

    # Reserve the non-input column names that will not be changing.
    non_input_columns = table.columns.filter c-> c.name != input_column.name
    unique.mark_used <| non_input_columns.map .name

    # Build the output column
    output_storages = output_column_builders.map .seal
    output_columns = output_storages.map_with_index i-> output_storage->
        column_name = unique.make_unique <| column_names.at i
        Column.from_storage column_name output_storage

    # Build the order mask.
    order_mask = OrderMask.new (order_mask_positions.to_vector)

    ## Build the new table, replacing the input column with the new output
       columns.
    new_columns_unflattened = table.columns.map column->
        ## Compare against the resolved column's name, not the raw
           `input_column_id`: the id may be an Integer index, which would
           never be equal to a column name.
        case column.name == input_column.name of
            True ->
                # Replace the input column with the output columns.
                output_columns
            False ->
                # Build a new column from the old one with the mask
                old_storage = column.java_column.getStorage
                new_storage = old_storage.applyMask order_mask
                [Column.from_storage column.name new_storage]
    new_columns = new_columns_unflattened.flatten

    new_table = Table.new new_columns
    problem_builder.attach_problems_after on_problems new_table
## PRIVATE
   Map a multi-valued function over a column and return the results as set of
   output columns.

   Returns a Vector of Columns. Problems are not returned; they are reported
   to the supplied `problem_builder`.

   Arguments:
   - input_column: The column to transform.
   - function: A function that transforms a single element of `input_column`
     to multiple values.
   - column_count: The number of columns to split to.
     If `Nothing` then columns will be added to fit all data.
     If the data exceeds the `column_count`, the extra values are dropped and
     a `Column_Count_Exceeded` warning is reported to `problem_builder`.
   - problem_builder: Receives any warnings produced while mapping.
map_columns_to_multiple : Column -> (Any -> Vector Any) -> Integer | Nothing -> Problem_Builder -> Vector Column
map_columns_to_multiple input_column function column_count problem_builder =
    num_rows = input_column.length
    input_storage = input_column.java_column.getStorage

    builders = case column_count of
        Nothing ->
            builders = Vector.new_builder

            0.up_to num_rows . each i->
                input_value = input_storage.getItemBoxed i
                output_values = function input_value

                # Add more builders if necessary to accommodate `output_values`.
                if output_values.length > builders.length then
                    num_builders_needed = output_values.length - builders.length
                    repeat_each num_builders_needed <|
                        builder = make_string_builder num_rows

                        # Pad the new builder with nulls, one per earlier row.
                        num_nulls_needed = i
                        builder.appendNulls num_nulls_needed

                        builders.append builder

                ## Add `output_values` to builders; if there are more builders
                   than `output_values`, pad with null.
                0.up_to builders.length . each j->
                    builders.at j . appendNoGrow (output_values.get j Nothing)

            builders.to_vector

        _ : Integer ->
            builders = Vector.new column_count (_-> make_string_builder num_rows)

            output_lengths = 0.up_to num_rows . map i->
                input_value = input_storage.getItemBoxed i
                output_values = function input_value

                ## Add `output_values` to builders; if there are more builders
                   than `output_values`, pad with null.
                0.up_to builders.length . each j->
                    builders.at j . appendNoGrow (output_values.get j Nothing)

                output_values.length

            ## `maximum` yields `Nothing` for an empty column; in that case
               there is nothing to compare, so the check is skipped.
            max_output_length = maximum output_lengths
            case max_output_length of
                Nothing -> Nothing
                _ -> if max_output_length > column_count then
                    problem = Column_Count_Exceeded.Error column_count max_output_length
                    problem_builder.report_other_warning problem

            builders

    # Name columns. If there's only one, use the original column name.
    new_column_names = case builders.length of
        1 -> [input_column.name]
        _ -> 0.up_to builders.length . map i-> default_column_namer input_column.name i

    # Build Columns.
    storages = builders.map .seal
    new_column_names.zip storages Column.from_storage
## PRIVATE
   Give each of the supplied columns a name that does not clash with the
   columns remaining in `table` once `removed_column_name` has been taken out.
   The state of the naming strategy is reported to `problem_builder`.
rename_new_columns : Table -> Text -> Vector Column -> Problem_Builder -> Vector Column
rename_new_columns table removed_column_name columns problem_builder =
    name_strategy = table.column_naming_helper.create_unique_name_strategy

    # Reserve the names of the columns that stay in the table.
    kept_columns = table.columns.filter c-> c.name != removed_column_name
    name_strategy.mark_used (kept_columns.map .name)

    renamed = columns.map column-> column.rename (name_strategy.make_unique column.name)
    problem_builder.report_unique_name_strategy name_strategy
    renamed
## PRIVATE
   Build a new table in which the column named like `old_column` is replaced,
   in place, by `new_columns`; every other column is kept as it is.
   Name uniqueness is not checked here; the caller must ensure it beforehand.
replace_column_with_columns : Table -> Column -> Vector Column -> Table
replace_column_with_columns table old_column new_columns =
    column_groups = table.columns.map c-> case c.name == old_column.name of
        True -> new_columns
        False -> [c]
    Table.new column_groups.flatten
## PRIVATE
   Return the maximum value of the vector.
   Returns `Nothing` if the vector is empty (no error is thrown), so callers
   must handle that case before comparing the result.
maximum : Vector Any -> Any | Nothing
maximum vec = if vec.is_empty then Nothing else
    vec.reduce (a-> b-> a.max b)
## PRIVATE
   Wrap a function so that it returns [] if passed Nothing.
   Any other argument is passed to `function` unchanged.
handle_nothing : (Any -> Any) -> (Any -> Any)
handle_nothing function = x-> case x of
    _ : Nothing -> []
    _ -> function x
## PRIVATE
   Run the suspended `action` once for every integer in `0.up_to n`.
repeat_each : Integer -> Any -> Any
repeat_each n ~action =
    iterations = 0.up_to n
    iterations.each (_-> action)
## PRIVATE
   Build a column name from a base name and a zero-based index: the result is
   the base name, a space, and the index plus one.
default_column_namer : Text -> Integer -> Text
default_column_namer base_name i =
    suffix = (i + 1).to_text
    base_name + " " + suffix
## PRIVATE
   Force a vector to have exactly `target_length` elements. A vector of the
   right length is returned as it is. Otherwise a `Column_Count_Mismatch`
   warning is reported to `problem_builder`, and the vector is either padded
   with `Nothing` or cut down to the target length.
uniform_length : Integer -> Vector Any -> Problem_Builder -> Vector Any
uniform_length target_length v problem_builder =
    actual_length = v.length
    case actual_length == target_length of
        True -> v
        False ->
            problem_builder.report_other_warning (Column_Count_Mismatch.Error target_length actual_length)
            # Too short: pad. Too long: truncate.
            if actual_length < target_length then v.pad target_length Nothing else v.take target_length

View File

@ -155,7 +155,13 @@ public class InferredBuilder extends Builder {
} }
private void retypeToMixed() { private void retypeToMixed() {
ObjectBuilder objectBuilder = new MixedBuilder(initialSize); // The new internal builder must be at least `currentSize` so it can store
// all the current values. It must also be at least 'initialSize' since the
// caller might be using appendNoGrow and is expecting to write at least
// that many values.
int capacity = Math.max(initialSize, currentSize);
ObjectBuilder objectBuilder = new MixedBuilder(capacity);
currentBuilder.retypeToMixed(objectBuilder.getData()); currentBuilder.retypeToMixed(objectBuilder.getData());
objectBuilder.setCurrentSize(currentBuilder.getCurrentSize()); objectBuilder.setCurrentSize(currentBuilder.getCurrentSize());
objectBuilder.setPreExistingProblems(currentBuilder.getProblems()); objectBuilder.setPreExistingProblems(currentBuilder.getProblems());

View File

@ -553,6 +553,12 @@ spec setup =
table = table_builder [["aaa", [1, 2]], ["bbb", [3, 4]], ["ccc", [5, 6]]] table = table_builder [["aaa", [1, 2]], ["bbb", [3, 4]], ["ccc", [5, 6]]]
table.expand_column "bbb" . should_fail_with Unsupported_Database_Operation table.expand_column "bbb" . should_fail_with Unsupported_Database_Operation
# The in-memory functionality of `expand_to_rows` is tested in test/Table_Tests/src/In_Memory/Table_Conversion_Spec.enso
if setup.is_database then Test.group prefix+"Table.expand_to_rows" <|
Test.specify "should report unsupported" <|
table = table_builder [["aaa", [1, 2]], ["bbb", [3, 4]], ["ccc", [5, 6]]]
table.expand_to_rows "bbb" . should_fail_with Unsupported_Database_Operation
if setup.is_database.not then Test.group prefix+"Table/Column auto value type" <| if setup.is_database.not then Test.group prefix+"Table/Column auto value type" <|
Test.specify "should allow to narrow down types of a Mixed column" <| Test.specify "should allow to narrow down types of a Mixed column" <|
[True, False].each shrink_types-> [True, False].each shrink_types->

View File

@ -30,3 +30,11 @@ spec = Test.group "[In-Memory] Storage Builders" <|
storage = builder.seal storage = builder.seal
column = Column.from_storage "X" storage column = Column.from_storage "X" storage
column.to_vector . should_equal vector column.to_vector . should_equal vector
Test.specify "Inferred Builder should correctly resize when retyping to a mixed column, with an underestimated initial size" <|
mixed_values = [10, 11, 22, 23, 24, 25, '2020-02-28']
builder = make_inferred_builder 3
mixed_values.map v-> builder.append v
storage = builder.seal
column = Column.from_storage "X" storage
column.to_vector . should_equal mixed_values

View File

@ -0,0 +1,29 @@
from Standard.Base import all
from project.Util import all
import Standard.Table.Internal.Fan_Out
from Standard.Table import Table
import Standard.Test.Extensions
from Standard.Test import Test, Test_Suite, Problems
## Specs for the `Fan_Out` helpers. Both cases split the text values of the
   "bar" column on the letter "b".
spec =
    Test.group "Fan_Out" <|
        Test.specify "can do fan_out_to_columns " <|
            input = Table.new [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
            # One output column per split part; short rows are padded with Nothing.
            want = Table.from_rows ["foo", "bar 1", "bar 2", "bar 3"] [[0, "a", "c", Nothing], [1, "c", "d", "ef"], [2, "gh", "ij", "u"]]
            got = Fan_Out.fan_out_to_columns input "bar" (_.split "b")
            got.should_equal want

        Test.specify "can do fan_out_to_rows" <|
            input = Table.new [["foo", [0, 1, 2]], ["bar", ["abc", "cbdbef", "ghbijbu"]]]
            # One output row per split part; "foo" is repeated for each part.
            want = Table.from_rows ["foo", "bar"] [[0, "a"], [0, "c"], [1, "c"], [1, "d"], [1, "ef"], [2, "gh"], [2, "ij"], [2, "u"]]
            got = Fan_Out.fan_out_to_rows input "bar" (_.split "b")
            got.should_equal want
# Entry point: passes `spec` to `Test_Suite.run_main`.
main = Test_Suite.run_main spec

View File

@ -2,8 +2,10 @@ from Standard.Base import all
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
from Standard.Table.Extensions.Table_Conversions import all import Standard.Table.Data.Type.Value_Type.Value_Type
from Standard.Table import Table, Column from Standard.Table import Table, Column
from Standard.Table.Errors import No_Such_Column
from Standard.Table.Extensions.Table_Conversions import all
from Standard.Test import Test, Test_Suite, Problems from Standard.Test import Test, Test_Suite, Problems
import Standard.Test.Extensions import Standard.Test.Extensions
@ -202,4 +204,76 @@ spec =
expected = Table.new [["aaa", [1, 2]], ["expanded last", ["Smith", Nothing]], ["expanded height", [Nothing, 1.9]], ["expanded foo", [Nothing, Nothing]], ["ccc", [5, 6]]] expected = Table.new [["aaa", [1, 2]], ["expanded last", ["Smith", Nothing]], ["expanded height", [Nothing, 1.9]], ["expanded foo", [Nothing, Nothing]], ["ccc", [5, 6]]]
table.expand_column "bbb" ["last", "height", "foo"] "expanded " . should_equal expected table.expand_column "bbb" ["last", "height", "foo"] "expanded " . should_equal expected
Test.group "expand_to_rows" <|
Test.specify "Can expand single values" <|
values_to_expand = [3, 4]
table = Table.new [["aaa", [1, 2]], ["bbb", values_to_expand], ["ccc", [5, 6]]]
expected = Table.new [["aaa", [1, 2]], ["bbb", [3, 4]], ["ccc", [5, 6]]]
table.expand_to_rows "bbb" . should_equal expected
Test.specify "Can expand Vectors" <|
values_to_expand = [[10, 11], [20, 21, 22], [30]]
table = Table.new [["aaa", [1, 2, 3]], ["bbb", values_to_expand], ["ccc", [5, 6, 7]]]
expected = Table.new [["aaa", [1, 1, 2, 2, 2, 3]], ["bbb", [10, 11, 20, 21, 22, 30]], ["ccc", [5, 5, 6, 6, 6, 7]]]
r = table.expand_to_rows "bbb"
r . should_equal expected
r.at "bbb" . value_type . should_equal Value_Type.Integer
Test.specify "Can expand Arrays" <|
values_to_expand = [[10, 11].to_array, [20, 21, 22].to_array, [30].to_array]
table = Table.new [["aaa", [1, 2, 3]], ["bbb", values_to_expand], ["ccc", [5, 6, 7]]]
expected = Table.new [["aaa", [1, 1, 2, 2, 2, 3]], ["bbb", [10, 11, 20, 21, 22, 30]], ["ccc", [5, 5, 6, 6, 6, 7]]]
table.expand_to_rows "bbb" . should_equal expected
Test.specify "Can expand Lists" <|
values_to_expand = [[10, 11].to_list, [20, 21, 22].to_list, [30].to_list]
table = Table.new [["aaa", [1, 2, 3]], ["bbb", values_to_expand], ["ccc", [5, 6, 7]]]
expected = Table.new [["aaa", [1, 1, 2, 2, 2, 3]], ["bbb", [10, 11, 20, 21, 22, 30]], ["ccc", [5, 5, 6, 6, 6, 7]]]
table.expand_to_rows "bbb" . should_equal expected
Test.specify "Can expand Pairs" <|
values_to_expand = [Pair.new 10 20, Pair.new "a" [30], Pair.new 40 50]
table = Table.new [["aaa", [1, 2, 3]], ["bbb", values_to_expand], ["ccc", [5, 6, 7]]]
expected = Table.new [["aaa", [1, 1, 2, 2, 3, 3]], ["bbb", [10, 20, "a", [30], 40, 50]], ["ccc", [5, 5, 6, 6, 7, 7]]]
table.expand_to_rows "bbb" . should_equal expected
Test.specify "Can expand Ranges" <|
values_to_expand = [Range.new 10 12, Range.new 20 27 step=3, Range.new 30 31]
table = Table.new [["aaa", [1, 2, 3]], ["bbb", values_to_expand], ["ccc", [5, 6, 7]]]
expected = Table.new [["aaa", [1, 1, 2, 2, 2, 3]], ["bbb", [10, 11, 20, 23, 26, 30]], ["ccc", [5, 5, 6, 6, 6, 7]]]
table.expand_to_rows "bbb" . should_equal expected
Test.specify "Can expand Date_Ranges" <|
range0 = (Date.new 2020 02 28).up_to (Date.new 2020 03 01)
range1 = (Date.new 2020 10 28).up_to (Date.new 2020 11 16) . with_step Date_Period.Week
range2 = (Date.new 2023 07 03).up_to (Date.new 2023 10 03) . with_step Date_Period.Month
values_to_expand = [range0, range1, range2]
values_expanded = [Date.new 2020 02 28, Date.new 2020 02 29] + [Date.new 2020 10 28, Date.new 2020 11 4, Date.new 2020 11 11, Date.new 2023 07 03, Date.new 2023 08 03] + [Date.new 2023 09 03]
table = Table.new [["aaa", [1, 2, 3]], ["bbb", values_to_expand], ["ccc", [5, 6, 7]]]
expected = Table.new [["aaa", [1, 1, 2, 2, 2, 3, 3, 3]], ["bbb", values_expanded], ["ccc", [5, 5, 6, 6, 6, 7, 7, 7]]]
table.expand_to_rows "bbb" . should_equal expected
Test.specify "Can expand mixed columns" <|
values_to_expand = [[10, 11], 22.up_to 26, (Date.new 2020 02 28).up_to (Date.new 2020 03 01)]
values_expanded = [10, 11, 22, 23, 24, 25, Date.new 2020 02 28, Date.new 2020 02 29]
table = Table.new [["aaa", [1, 2, 3]], ["bbb", values_to_expand], ["ccc", [5, 6, 7]]]
expected = Table.new [["aaa", [1, 1, 2, 2, 2, 2, 3, 3]], ["bbb", values_expanded], ["ccc", [5, 5, 6, 6, 6, 6, 7, 7]]]
table.expand_to_rows "bbb" . should_equal expected
Test.specify "Respects `at_least_one_row=True`" <|
values_to_expand = [[10, 11], [], [30]]
table = Table.new [["aaa", [1, 2, 3]], ["bbb", values_to_expand], ["ccc", [5, 6, 7]]]
expected = Table.new [["aaa", [1, 1, 2, 3]], ["bbb", [10, 11, Nothing, 30]], ["ccc", [5, 5, 6, 7]]]
table.expand_to_rows "bbb" at_least_one_row=True . should_equal expected
Test.specify "Respects `at_least_one_row=False`" <|
values_to_expand = [[10, 11], [], [30]]
table = Table.new [["aaa", [1, 2, 3]], ["bbb", values_to_expand], ["ccc", [5, 6, 7]]]
expected = Table.new [["aaa", [1, 1, 3]], ["bbb", [10, 11, 30]], ["ccc", [5, 5, 7]]]
table.expand_to_rows "bbb" . should_equal expected
Test.specify "Missing column" <|
table = Table.new [["aaa", [1, 2, 3]], ["notbbbb", [8, 8, 8]], ["ccc", [5, 6, 7]]]
table.expand_to_rows "bbb" . should_fail_with No_Such_Column
main = Test_Suite.run_main spec main = Test_Suite.run_main spec