From 1e0649fda1376a040549a24f29a080e0283a7a37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 22 May 2024 11:38:10 +0200 Subject: [PATCH] Improvements to `Table.union` (#9968) - Closes #9952 --- CHANGELOG.md | 2 + .../Database/0.0.0-dev/src/DB_Table.enso | 187 ++++----- .../Database/0.0.0-dev/src/Errors.enso | 11 +- .../Internal/Postgres/Postgres_Dialect.enso | 2 +- .../src/Internal/SQLite/SQLite_Dialect.enso | 2 +- .../Table/0.0.0-dev/src/Columns_To_Keep.enso | 35 ++ .../Standard/Table/0.0.0-dev/src/Errors.enso | 54 ++- .../0.0.0-dev/src/Excel/Excel_Workbook.enso | 7 +- .../0.0.0-dev/src/Internal/Table_Helpers.enso | 55 +-- .../src/Internal/Value_Type_Helpers.enso | 89 ++++- .../Table/0.0.0-dev/src/Match_Columns.enso | 121 ++++-- .../Standard/Table/0.0.0-dev/src/Table.enso | 281 +++++++------- .../data/column/builder/DateTimeBuilder.java | 41 ++ .../data/column/builder/TypedBuilder.java | 11 +- .../org/enso/table/data/table/Column.java | 11 +- .../Join/Union_Spec.enso | 358 +++++++++--------- .../Table_Tests/src/In_Memory/Table_Spec.enso | 20 +- 17 files changed, 772 insertions(+), 515 deletions(-) create mode 100644 distribution/lib/Standard/Table/0.0.0-dev/src/Columns_To_Keep.enso diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c22f8ff244..a0e42396919 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -666,6 +666,7 @@ - [Added `Text.cleanse` `Column.Text_Cleanse` and `Table.Text_Cleanse`][9879] - [Added ability to save an existing Postgres connection as a Data Link in Enso Cloud.][9957] +- [Improved `Table.union`.][9968] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -978,6 +979,7 @@ [9873]: https://github.com/enso-org/enso/pull/9873 [9879]: https://github.com/enso-org/enso/pull/9879 [9957]: https://github.com/enso-org/enso/pull/9957 +[9968]: https://github.com/enso-org/enso/pull/9968 #### Enso Compiler diff --git 
a/distribution/lib/Standard/Database/0.0.0-dev/src/DB_Table.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/DB_Table.enso index 93ce2e5f4d0..fc20c053644 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/DB_Table.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/DB_Table.enso @@ -18,6 +18,7 @@ from Standard.Base.Runtime import assert from Standard.Base.Widget_Helpers import make_data_cleanse_vector_selector, make_delimiter_selector, make_format_chooser import Standard.Table.Column_Operation.Column_Operation +import Standard.Table.Columns_To_Keep.Columns_To_Keep import Standard.Table.Expression.Expression import Standard.Table.Expression.Expression_Error import Standard.Table.Internal.Add_Row_Number @@ -30,6 +31,7 @@ import Standard.Table.Internal.Problem_Builder.Problem_Builder import Standard.Table.Internal.Replace_Helpers import Standard.Table.Internal.Table_Helpers import Standard.Table.Internal.Table_Helpers.Table_Column_Helper +import Standard.Table.Internal.Table_Helpers.Union_Result_Type import Standard.Table.Internal.Table_Ref.Table_Ref import Standard.Table.Internal.Unique_Name_Strategy.Unique_Name_Strategy import Standard.Table.Internal.Value_Type_Helpers @@ -1719,90 +1721,77 @@ type DB_Table - tables: A single table or a vector of tables to append to this one. The tables are concatenated in the order they are specified, with `self` being the first one. + - columns_to_keep: Specifies which columns to keep. Defaults to keeping + columns that are present in any of the tables, reporting a warning for + columns that are not present in all tables and adding `Nothing` values + for them. - match_columns: Specifies how to match the columns. - If `Match_Columns.By_Name` - the columns are matched by name across all provided tables. - If unmatched columns are to be dropped, the resulting table will keep - only the set of columns that appear in all provided tables, in the - relative order that they appeared in the `self` table. 
- If unmatched columns are kept, they are added in the order of - appearance - i.e. first all columns from `self` will be added in the - original order, then any columns from the second table that were not - matched will be added at the end (preserving their relative order), - and so on for all the remaining tables. - If `Match_Columns.By_Position` - the columns are mapped by position. - If unmatched columns are to be dropped, the resulting table will have - as many columns as the table that had the least columns and the - column names of the first table (`self`) will be used. - If unmatched columns are kept, the resulting table will have as many - columns as the table with the most columns. Since the first table may - not have all the necessary columns to provide column names for the - result, the result will have column names taken from the first table - that has the biggest number of columns. - - keep_unmatched_columns: If set to `True`, unmatched columns are kept - and are padded with `Nothing` for tables that did not have them. - If set to `False`, only the common subset of columns is kept - any - column that is not present in all tables is dropped. Defaults to - `Report_Unmatched`, which behaves like `True` - unmatched columns are - kept and padded with `Nothing`, but a problem is reported. - - allow_type_widening: Specifies if the resulting column type should be - adjusted to fit columns from all arguments. If `True`, a common type - will be chosen for each column (see "Unifying Column Types" below). - If `False`, the resulting column type will be the same as in the first - table containing the column. In this case, all columns that are - concatenated must have the same type as the first one (unless this - had a `Mixed` type - in which case it will accept any other types). + The name of each column comes from the first table in which the given + column appears. + The `In_List` option is not applicable when mapping columns by position. 
+ Column names are taken from the first table if `In_All` and from the + first table that has the maximum number of columns if `In_Any` - on_problems: Specifies how to handle problems if they occur, reporting them as warnings by default. - - If `keep_unmatched_columns` is set to `Report_Unmatched` (the - default): - - If matching by name and there are columns that are not present in - all tables, `Unmatched_Columns` is reported. - - If matching by position and column counts of the merged tables - differ, then a `Column_Count_Mismatch` is reported. The error will - contain the greatest column count as its `expected` value and the - smallest one as its `actual` value. - - If `keep_unmatched_columns` is set to `False` and matching by name, - it is possible that there are no columns that are common to all - provided tables, in that case `No_Output_Columns` is thrown as a - dataflow error regardless of the `on_problems` setting, because there - are no columns to include in the resulting table. - - If type widening is disabled and one of corresponding columns has a - type that is incompatible with the type coming from the first table, - a `Column_Type_Mismatch` is reported. The problematic column will be - dropped from the resulting table. With type widening disabled, the - subsequent tables must have the same types as the first one, unless - the type of the first one was `Mixed` which will accept any other - type. - - If a common type coercion for a set of matched columns from - concatenated tables cannot be found, a `No_Common_Type` is reported. - In warning or ignore mode, the problematic column will be dropped - from the resulting table. - ? Unifying Column Types - If `allow_type_widening` is set to `True`, then the following rules are - used to find a common type that will fit values from all merged tables. - - Numeric columns are unified by finding the most general type that can - fit all of the columns. 
The biggest integer type will be chosen and if + Numeric columns are unified by finding the smallest type that can fit + all of the columns. The biggest integer type will be chosen and if integers and decimals are mixed, the decimal type will be chosen. If boolean columns are mixed with numeric columns, they will be coerced to the numeric type (and converted to 0 and 1). - Text types will also be coerced according to the common rules - if - constant-length texts of different lengths are mixed, they will be - coerced to a varying-length type. + Text types are also unified by finding the smallest type that can + fit all the values. If constant-length texts of different lengths are + mixed, they will be coerced to a varying-length type. + + If date and date-time columns are unified, this yields a date-time + column. In-memory, the date is promoted by adding a time of 00:00 and + the system time-zone. In other backends that behaviour may differ. If one of the matched columns has `Mixed` type, that type will be used - regardless of types of other columns. Mixing any other types will - result in a `No_Common_Type` problem. If columns of incompatible types - are meant to be mixed, at least one of them should be explicitly - retyped to the `Mixed` type to indicate that intention. Note that the - `Mixed` type may not be supported by most Database backends. - union : (DB_Table | Vector DB_Table) -> Match_Columns -> Boolean | Report_Unmatched -> Boolean -> Problem_Behavior -> DB_Table - union self tables:(DB_Table | Vector) match_columns=Match_Columns.By_Name keep_unmatched_columns=Report_Unmatched allow_type_widening=True on_problems=Report_Warning = + regardless of types of other columns. Note that the `Mixed` type may + not be supported by most Database backends. + + Finally, if no common type is found using the rules above, everything + is converted to text. + + ? 
Problem Conditions + + - If no common type is found and the text conversion fallback is used, + the `No_Common_Type` problem is reported. + - The `Float` type may not be able to exactly represent larger + integers, thus if such large integers are mixed with floats, the + resulting conversion to `Float` may cause a loss of precision. + In that case, a `Loss_Of_Integer_Precision` problem is reported. + This warning is only reported in the in-memory backend. Currently, + the Database backend proceeds without a warning about precision loss. + - If a column of dates is unified with a column of date-times, since + the assumption of using the midnight time-of-day is arbitrary, + a `Mixing_Date_Time_Types` problem is reported. + - If an empty vector of tables is provided, an `Illegal_Argument` error + is raised. + - If `columns_to_keep` is set to `In_All` or `In_List` and an expected + column is missing in some of the tables, an `Unmatched_Columns` + problem is reported. If this causes the output to contain no columns, + a `No_Output_Columns` error is raised. + + ? Ordering of Columns in the result + + When matching columns by name, it is possible that the ordering of + columns may vary between input tables. The ordering is determined as + follows: columns that are kept from the first table are in the order + they appear in that table. If there are columns that do not appear in + the first table, they are appended to the end of the resulting table in + the order they appear in the input. 
+ @tables (Widget.Vector_Editor item_editor=Widget.Code_Input item_default='_' display=Display.Always) + @columns_to_keep Columns_To_Keep.default_widget + union : (DB_Table | Vector DB_Table) -> Match_Columns -> Columns_To_Keep -> Problem_Behavior -> DB_Table + union self tables:(DB_Table | Vector) (columns_to_keep : Columns_To_Keep = ..In_Any_Warn_On_Missing) (match_columns : Match_Columns = Match_Columns.By_Name) (on_problems : Problem_Behavior = Report_Warning) = all_tables = case tables of v : Vector -> [self] + (v.map t-> DB_Table.from t) single_table -> [self, single_table] @@ -1811,39 +1800,53 @@ type DB_Table we only want to add a cause coming from unification; matching reports problems that would not fit this error. problem_builder_for_matching = Problem_Builder.new problem_builder_for_unification = Problem_Builder.new - matched_column_sets = Match_Columns_Helpers.match_columns all_tables match_columns keep_unmatched_columns problem_builder_for_matching + matched_column_sets = Match_Columns_Helpers.match_columns all_tables match_columns columns_to_keep problem_builder_for_matching dialect = self.connection.dialect type_mapping = dialect.get_type_mapping merged_columns = matched_column_sets.map column_set-> - case Table_Helpers.unify_result_type_for_union column_set all_tables allow_type_widening problem_builder_for_unification of - Nothing -> Nothing - result_type : Value_Type -> - sql_type = type_mapping.value_type_to_sql result_type Problem_Behavior.Report_Error - sql_type.catch Inexact_Type_Coercion error-> - Panic.throw <| - Illegal_State.Error "Unexpected inexact type coercion in Union. The union logic should only operate in types supported by the given backend. This is a bug in the Database library. 
The coercion was: "+error.to_display_text cause=error - [column_set, sql_type, result_type] - good_columns = merged_columns.filter r-> r.is_nothing.not + sql_type_from_value_type value_type = + type_mapping.value_type_to_sql value_type Problem_Behavior.Report_Error . catch Inexact_Type_Coercion error-> + Panic.throw <| + Illegal_State.Error "Unexpected inexact type coercion in Union. The union logic should only operate in types supported by the given backend. This is a bug in the Database library. The coercion was: "+error.to_display_text cause=error + case Table_Helpers.unify_result_type_for_union column_set all_tables problem_builder_for_unification of + Union_Result_Type.Common_Type common_type -> + [column_set, sql_type_from_value_type common_type, common_type] + Union_Result_Type.Fallback_To_Text -> + [column_set, sql_type_from_value_type Value_Type.Char, Value_Type.Char] + Union_Result_Type.No_Types_To_Unify -> + ## If the column is all nulls, we still need to give it some type. + For DB `Mixed` is not available, so a portable type to use is `Char`. + [column_set, SQL_Type.null, Value_Type.Char] + problem_builder_for_matching.attach_problems_before on_problems <| problem_builder_for_unification.attach_problems_before on_problems <| - if good_columns.is_empty then problem_builder_for_unification.raise_no_output_columns_with_cause else + if merged_columns.is_empty then problem_builder_for_unification.raise_no_output_columns_with_cause else queries = all_tables.map_with_index i-> t-> - columns_to_select = good_columns.map description-> - column_set = description.first - sql_type = description.second + columns_to_select = merged_columns.map description-> + column_set = description.at 0 + sql_type = description.at 1 + result_type = description.at 2 column_name = column_set.name + ## We assume that the type for this expression will never be queried - it is + just used internally to build the Union operation and never exposed externally. 
+ infer_return_type _ = SQL_Type_Reference.null case column_set.column_indices.at i of + corresponding_column_index : Integer -> + column = t.at corresponding_column_index + internal_named_column = column.as_internal.rename column_name + ## We cast if the result type is different. + This is a bit on the safe side. In some cases the cast is not needed + (for example, most databases will allow union of int2 and int4 without casts; or SQLite does not need casts at all). + However, we do this for simplicity as determining the rules when the cast is needed or not is adding a lot of complication. + This is a possible future improvement to make queries lighter, but the benefit is unlikely to be worth it. + needs_cast = column.value_type != result_type + if needs_cast.not then internal_named_column else + dialect.make_cast internal_named_column sql_type infer_return_type Nothing -> typ = SQL_Type_Reference.from_constant SQL_Type.null expr = SQL_Expression.Literal "NULL" null_column = Internal_Column.Value column_name typ expr - ## We assume that the type for this - expression will never be queried - it is - just used internally to build the Union - operation and never exposed externally. - infer_return_type _ = SQL_Type_Reference.null - dialect.make_cast null_column sql_type infer_return_type - corresponding_column_index : Integer -> - t.at corresponding_column_index . as_internal . rename column_name + if sql_type == SQL_Type.null then null_column else + dialect.make_cast null_column sql_type infer_return_type pairs = columns_to_select.map c-> [c.name, c.expression] Query.Select pairs t.context @@ -1860,7 +1863,7 @@ type DB_Table See #6118. 
infer_return_type expression = SQL_Type_Reference.new self.connection new_ctx expression - new_columns = good_columns.map description-> + new_columns = merged_columns.map description-> column_set = description.first result_type = description.at 2 name = column_set.name diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Errors.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Errors.enso index f833454482d..76242a284f4 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Errors.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Errors.enso @@ -35,11 +35,18 @@ type SQL_Error Convert the SQL error to a textual representation. to_text : Text to_text self = - query = if self.related_query.is_nothing.not then " [Query was: " + self.related_query.to_display_text + "]" else "" + query = if self.related_query.is_nothing then "" else + query_text = self.related_query.to_text + ## Our generated queries tend to be very long, so to still be readable, + we don't shorten them too much. We impose an upper limit to avoid unbounded error message size. + max_length = 1000 + shortened_query_text = if query_text.length <= max_length then query_text else + query_text.take (Index_Sub_Range.First (max_length.div 2)) + " (...) " + query_text.take (Index_Sub_Range.Last (max_length.div 2)) + " [Query was: " + shortened_query_text + "]" message = self.java_exception.getMessage max_length = 300 short_message = if message.length < max_length then message else - message.take (Index_Sub_Range.First max_length/2) + " (...) " + message.take (Index_Sub_Range.Last max_length/2) + message.take (Index_Sub_Range.First (max_length.div 2)) + " (...) " + message.take (Index_Sub_Range.Last (max_length.div 2)) "There was an SQL error: " + short_message + "." 
+ query ## PRIVATE diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Postgres/Postgres_Dialect.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Postgres/Postgres_Dialect.enso index a95a86fec83..6f689b03233 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Postgres/Postgres_Dialect.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Postgres/Postgres_Dialect.enso @@ -140,7 +140,7 @@ type Postgres_Dialect ## PRIVATE make_cast : Internal_Column -> SQL_Type -> (SQL_Expression -> SQL_Type_Reference) -> Internal_Column - make_cast self column target_type infer_result_type_from_database_callback = + make_cast self (column : Internal_Column) (target_type : SQL_Type) (infer_result_type_from_database_callback : SQL_Expression -> SQL_Type_Reference) = mapping = self.get_type_mapping source_type = mapping.sql_type_to_value_type column.sql_type_reference.get target_value_type = mapping.sql_type_to_value_type target_type diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Dialect.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Dialect.enso index d63a7db2099..0b7038a045c 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Dialect.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Dialect.enso @@ -138,7 +138,7 @@ type SQLite_Dialect ## PRIVATE make_cast : Internal_Column -> SQL_Type -> (SQL_Expression -> SQL_Type_Reference) -> Internal_Column - make_cast self column target_type infer_result_type_from_database_callback = + make_cast self (column : Internal_Column) (target_type : SQL_Type) (infer_result_type_from_database_callback : SQL_Expression -> SQL_Type_Reference) = _ = [infer_result_type_from_database_callback] mapping = self.get_type_mapping target_value_type = mapping.sql_type_to_value_type target_type diff --git 
a/distribution/lib/Standard/Table/0.0.0-dev/src/Columns_To_Keep.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Columns_To_Keep.enso new file mode 100644 index 00000000000..dc52e6882a2 --- /dev/null +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Columns_To_Keep.enso @@ -0,0 +1,35 @@ +from Standard.Base import Vector, Text +from Standard.Base.Metadata import make_single_choice, Widget + +## Specifies which columns to keep in a union operation. +type Columns_To_Keep + ## All columns are kept. + + If a column is present only in some of the tables, it is padded with + `Nothing` for tables where it is missing. + In_Any + + ## Only columns that are present in all tables are kept. + + If there are columns that are only present in some of the tables, + a problem is reported. + In_All + + ## Specific list of column names to keep. + + If a table does not have a column that is specified in the list, it is + padded with `Nothing` and a problem is reported. + In_List (column_names : Vector Text) + + ## PRIVATE + Same as `In_Any`, but it will warn about columns that are not present in + all tables. + In_Any_Warn_On_Missing + + ## PRIVATE + The default widget for `Columns_To_Keep`. + It does not display the internal `In_Any_Warn_On_Missing` variant, since + that variant is only meant to be used as the default value. + default_widget -> Widget = + make_single_choice <| + ["In_Any", "In_All", "In_List"].map c-> [c, ".."+c] diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso index 583e1ab8353..96309c32e43 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Errors.enso @@ -494,7 +494,8 @@ type Column_Type_Mismatch type No_Common_Type ## PRIVATE - An error indicating that no common type could be found. + An error indicating that no common type could be found, and the operation + could not be performed. 
Arguments: - types: The types that were tried to be unified. @@ -502,6 +503,11 @@ type No_Common_Type unified, if applicable. Error (types : Vector Value_Type) (related_column_name : Nothing|Text) + ## PRIVATE + A warning indicating that no common type could be found, so the operation + had to fall back to converting all values to text. + Warning_Convert_To_Text (types : Vector Value_Type) (related_column_name:Text) + ## PRIVATE Create a human-readable version of the error. @@ -509,11 +515,21 @@ type No_Common_Type to_display_text self = types = self.types.map .to_display_text . join ", " prefix = "No common type was found for types: "+types - infix = case self.related_column_name of - column_name : Text -> " when unifying column ["+column_name+"]." - _ -> "." - suffix = " If you want to allow mixed types, please cast one of the columns to `Mixed` beforehand." - prefix + infix + suffix + location = case self.related_column_name of + column_name : Text -> " when unifying column ["+column_name+"]" + _ -> "" + suffix_type = case self of + No_Common_Type.Error _ _ -> "." + No_Common_Type.Warning_Convert_To_Text _ _ -> ", so the values were converted to text." + suffix_mixed = " If you want to have mixed types instead, please cast one of the columns to `Mixed` beforehand." + prefix + location + suffix_type + suffix_mixed + + ## PRIVATE + to_text self -> Text = + ctor = case self of + No_Common_Type.Error _ _ -> "Error" + No_Common_Type.Warning_Convert_To_Text _ _ -> "Warning_Convert_To_Text" + "No_Common_Type."+ctor+" "+self.types.to_text+" "+self.related_column_name.to_text type Unmatched_Columns ## PRIVATE @@ -637,9 +653,11 @@ type Conversion_Failure type Loss_Of_Integer_Precision ## PRIVATE - Indicates that an automatic conversion of an integer column to a decimal + Indicates that an automatic conversion of an Integer column to a Float column is losing precision because some of the large integers cannot be - exactly represented by the `double` type. 
+ exactly represented by the floating-point type. + + Currently, this error is only reported in-memory. Warning (affected_rows_count : Integer) (example_value : Integer) (example_value_converted : Float) ## PRIVATE @@ -834,3 +852,23 @@ type Nothing_Value_In_Filter_Condition to_display_text : Text to_display_text self = "Using `Nothing` as an argument to a `"+self.filter_condition.to_text+"` cannot match anything." + +## Indicates that different Date_Time (with or without timezone) or Date types + are mixed in the result, causing implicit coercions. + + This is a warning, because using the `00:00` time and default time-zone may + not always be the expected choice, so the user should be aware of this. +type Mixing_Date_Time_Types + ## PRIVATE + Date_To_Date_Time (related_column_name : Text | Nothing) + + ## PRIVATE + Implicit_Time_Zone (related_column_name : Text | Nothing) + + to_display_text self -> Text = + location = if self.related_column_name.is_nothing then "" else " (in column ["+self.related_column_name+"])" + case self of + Mixing_Date_Time_Types.Date_To_Date_Time _ -> + "Mixing Date and Date_Time values"+location+": the Date values have been automatically converted to Date_Time by adding a time of 00:00 in the default time-zone." + Mixing_Date_Time_Types.Implicit_Time_Zone _ -> + "Mixing Date_Time values with and without timezone"+location+". A default timezone has been assumed where it was missing." 
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Excel/Excel_Workbook.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Excel/Excel_Workbook.enso index 093be2c49b3..1eb4390a0bc 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Excel/Excel_Workbook.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Excel/Excel_Workbook.enso @@ -12,6 +12,7 @@ from Standard.Base.Data.Filter_Condition import sql_like_to_regex from Standard.Base.Metadata.Choice import Option from Standard.Base.Metadata.Widget import Multiple_Choice, Single_Choice +import project.Columns_To_Keep.Columns_To_Keep import project.Excel.Excel_Range.Excel_Range import project.Headers.Headers import project.Internal.Excel_Reader @@ -331,7 +332,7 @@ type Excel_Workbook tables = sheet_names.map on_problems=on_problems address-> self.read address headers on_problems=on_problems case return of Return_As.Table_Of_Tables -> Table.new [["Sheet Name", sheet_names], ["Table", tables]] - Return_As.Merged_Table match -> + Return_As.Merged_Table columns_to_keep match -> first_tbl = tables.find t-> t != Nothing if first_tbl == Nothing then Error.throw (Illegal_Argument.Error "No valid sheets found.") else unique = first_tbl.column_naming_helper.create_unique_name_strategy @@ -339,7 +340,7 @@ type Excel_Workbook new_column_name = unique.make_unique "Sheet Name" with_names = tables.zip sheet_names tbl->name-> if tbl == Nothing then Nothing else tbl.set name new_column_name . reorder_columns [new_column_name] - result = Table.from_union (with_names.filter Filter_Condition.Not_Nothing) match keep_unmatched_columns=True + result = Table.from_union (with_names.filter Filter_Condition.Not_Nothing) columns_to_keep=columns_to_keep match_columns=match problem_builder = Problem_Builder.new problem_builder.report_unique_name_strategy unique @@ -359,4 +360,4 @@ type Return_As Table_Of_Tables ## All sheets are merged into a single table. A union operation is performed. 
- Merged_Table match:Match_Columns=Match_Columns.By_Name + Merged_Table (columns_to_keep : Columns_To_Keep = Columns_To_Keep.In_Any) (match : Match_Columns = Match_Columns.By_Name) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Table_Helpers.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Table_Helpers.enso index 3b882fd6528..274abe6440d 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Table_Helpers.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Table_Helpers.enso @@ -10,6 +10,7 @@ import project.Expression.Expression import project.Internal.Column_Naming_Helper.Column_Naming_Helper import project.Internal.Problem_Builder.Problem_Builder import project.Internal.Value_Type_Helpers +import project.Match_Columns.Column_Set import project.Position.Position import project.Set_Mode.Set_Mode import project.Sort_Column.Sort_Column @@ -530,30 +531,38 @@ is_column obj = ## PRIVATE A helper method that resolves what should be the result type of a particular column set based on the union settings. -unify_result_type_for_union column_set all_tables allow_type_widening problem_builder = +unify_result_type_for_union (column_set : Column_Set) (all_tables : Vector) (problem_builder : Problem_Builder) -> Union_Result_Type = columns = column_set.resolve_columns all_tables - case allow_type_widening of - True -> - types = columns.filter Filter_Condition.Not_Nothing . 
map .value_type - common_type = Value_Type_Helpers.find_common_type types strict=True - if common_type.is_nothing then - problem_builder.report_other_warning (No_Common_Type.Error types related_column_name=column_set.name) - common_type - False -> - is_not_nothing c = case c of - Nothing -> False - _ -> True - first_column = columns.find is_not_nothing - first_type = first_column.value_type - if first_type == Value_Type.Mixed then Value_Type.Mixed else - first_wrong_column = columns.find if_missing=Nothing col-> - is_not_nothing col && col.value_type != first_type - case first_wrong_column of - Nothing -> first_type - _ -> - got_type = first_wrong_column.value_type - problem_builder.report_other_warning (Column_Type_Mismatch.Error column_set.name first_type got_type) - Nothing + . filter Filter_Condition.Not_Nothing + types = columns.map .value_type + + if types.is_empty then Union_Result_Type.No_Types_To_Unify else + ## First we check if we can find a generic common type. + This includes widening numeric column sizes, or converting Integer to Float. + common_type = Value_Type_Helpers.find_common_type types strict=True + if common_type.is_nothing.not then Union_Result_Type.Common_Type common_type else + ## Union has less strict requirements than other operations relying on `find_common_type`, + so if the common type was not found, we still check some fallbacks. + common_numeric_boolean = Value_Type_Helpers.find_common_numeric_boolean_type types + if common_numeric_boolean.is_nothing.not then Union_Result_Type.Common_Type common_numeric_boolean else + common_date_type = Value_Type_Helpers.find_common_date_types types column_set.name problem_builder + if common_date_type.is_nothing.not then Union_Result_Type.Common_Type common_date_type else + # Lastly, we fall back to text, reporting a warning. 
+ problem_builder.report_other_warning (No_Common_Type.Warning_Convert_To_Text types column_set.name) + Union_Result_Type.Fallback_To_Text + +## PRIVATE +type Union_Result_Type + ## PRIVATE + Common_Type (value_type : Value_Type) + + ## PRIVATE + Fallback_To_Text + + ## PRIVATE + This case is returned if the requested column was missing from _all_ tables, + so there were no types to unify. An all-null column should be created. + No_Types_To_Unify ## PRIVATE Replace a set of columns in the table with a new set of columns. The old diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Value_Type_Helpers.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Value_Type_Helpers.enso index 62e7b0d4621..34148b97639 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Value_Type_Helpers.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Value_Type_Helpers.enso @@ -1,13 +1,15 @@ from Standard.Base import all import Standard.Base.Data.Vector.No_Wrap import Standard.Base.Errors.Illegal_Argument.Illegal_Argument +from Standard.Base.Runtime import assert import project.Column.Column +import project.Internal.Problem_Builder.Problem_Builder import project.Internal.Storage import project.Value_Type.Auto import project.Value_Type.Bits import project.Value_Type.Value_Type -from project.Errors import Invalid_Value_Type, No_Common_Type +from project.Errors import Invalid_Value_Type, Mixing_Date_Time_Types, No_Common_Type from project.Internal.Table_Helpers import is_column polyglot java import org.enso.base.polyglot.NumericConverter @@ -60,11 +62,11 @@ reconcile_types current new = case current of Value_Type.Integer size -> case new of Value_Type.Integer new_size -> Value_Type.Integer (max_size size new_size) - Value_Type.Byte -> Value_Type.Integer size + Value_Type.Byte -> current # If we unify integers with floats, we select the default Float 64 regardless of the input sizes. 
- Value_Type.Float _ -> Value_Type.Float + Value_Type.Float _ -> Value_Type.Float Value_Type.Decimal _ _ -> new - _ -> Value_Type.Mixed + _ -> Value_Type.Mixed Value_Type.Float size -> case new of Value_Type.Float new_size -> Value_Type.Float (max_size size new_size) @@ -74,12 +76,11 @@ reconcile_types current new = case current of Value_Type.Decimal _ _ -> Value_Type.Float _ -> Value_Type.Mixed Value_Type.Byte -> case new of - Value_Type.Byte -> Value_Type.Byte - Value_Type.Integer size -> - Value_Type.Integer size - Value_Type.Float _ -> Value_Type.Float + Value_Type.Byte -> Value_Type.Byte + Value_Type.Integer _ -> new + Value_Type.Float _ -> Value_Type.Float Value_Type.Decimal _ _ -> new - _ -> Value_Type.Mixed + _ -> Value_Type.Mixed Value_Type.Decimal precision scale -> case new of Value_Type.Decimal new_precision new_scale -> if (precision == new_precision) && (scale == new_scale) then new else @@ -89,9 +90,6 @@ reconcile_types current new = case current of Value_Type.Byte -> Value_Type.Decimal precision scale Value_Type.Float _ -> Value_Type.Float _ -> Value_Type.Mixed - Value_Type.Boolean -> case new of - Value_Type.Boolean -> Value_Type.Boolean - _ -> Value_Type.Mixed Value_Type.Char current_size current_variable -> case new of Value_Type.Char new_size new_variable -> result_variable = current_variable || new_variable || current_size != new_size @@ -118,19 +116,70 @@ max_size a b = ## PRIVATE Finds the most specific value type that will fit all the provided types. - If `strict` is `True`, it is implemented as specified in the note - "Unifying Column Types" in `Table.union`. In that case, if no common type - is found, `Nothing` is returned. - - It assumes that the `types` vector is not empty. -find_common_type : Vector Value_Type -> Boolean -> Value_Type | Nothing -find_common_type types strict = + Arguments: + - types: a vector of types to unify. It must not be empty. + - strict: A flag determining how strict the unification is. 
+ If `False`, if no common type can be found, `Mixed` is used as a generic fallback. + If `True`, `Nothing` is returned if no common type can be found and `Mixed` + is only returned if any of the input types was already `Mixed`. +find_common_type (types : Vector Value_Type) (strict : Boolean) -> Value_Type | Nothing = + assert types.not_empty most_generic_type = (types.drop 1).fold types.first reconcile_types if strict.not || most_generic_type != Value_Type.Mixed then most_generic_type else - # Double check if Mixed was really allowed to come out. + ## We return the Mixed type only if the input contained Mixed. + Otherwise we report failure to find common type. if types.contains Value_Type.Mixed then Value_Type.Mixed else Nothing +## PRIVATE + An extra helper function that reconciles Date_Time types with varying timezone + setting, as well as Date type, reporting any warnings. + It can be used as a fallback after `find_common_type` does not find a simple common type. + If non-date types are provided, it will fail by returning `Nothing`. It will not report any warnings in that case. + It assumes that the list of `types` is not empty. +find_common_date_types (types : Vector Value_Type) (related_column_name : Text | Nothing) (problem_builder : Problem_Builder) -> Value_Type | Nothing = + assert types.not_empty + all_date = types.all typ-> case typ of + Value_Type.Date -> True + Value_Type.Date_Time _ -> True + _ -> False + if all_date.not then Nothing else + has_date = types.contains Value_Type.Date + has_date_time_with_tz = types.contains (Value_Type.Date_Time True) + has_date_time_without_tz = types.contains (Value_Type.Date_Time False) + + # The common type is the 'largest' one. 
+ common_type = if has_date_time_with_tz then Value_Type.Date_Time True else + if has_date_time_without_tz then Value_Type.Date_Time False else + Value_Type.Date + + if has_date && (common_type != Value_Type.Date) then + problem_builder.report_other_warning (Mixing_Date_Time_Types.Date_To_Date_Time related_column_name) + + if has_date_time_without_tz && (common_type != Value_Type.Date_Time False) then + problem_builder.report_other_warning (Mixing_Date_Time_Types.Implicit_Time_Zone related_column_name) + + common_type + +## PRIVATE + An extra helper function that reconciles numeric and boolean types. + Unifying Boolean and numeric types is not expected by all operations, but + some may want to opt-in to it. This method allows to do so. + + If non-numeric or non-boolean types are provided, it will fail by returning + `Nothing`. + + No warnings are reported, as coercing boolean to integer is harmless, it was + just chosen not to be done by default. +find_common_numeric_boolean_type (types : Vector Value_Type) -> Value_Type | Nothing = + assert types.not_empty + all_numeric_or_boolean = types.all typ-> typ.is_numeric || (typ == Value_Type.Boolean) + if all_numeric_or_boolean.not then Nothing else + ## We just find a common type again, ignoring the boolean types: the + boolean will fit any numeric type that we get out of this. + without_boolean = types.filter typ-> typ != Value_Type.Boolean + find_common_type without_boolean strict=True + ## PRIVATE Finds the type of an argument to a column operation. 
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Match_Columns.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Match_Columns.enso index 27477afee32..b240105bf79 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Match_Columns.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Match_Columns.enso @@ -1,6 +1,9 @@ from Standard.Base import all +import Standard.Base.Errors.Illegal_Argument.Illegal_Argument import Standard.Base.Runtime.State +from Standard.Base.Runtime import assert +import project.Columns_To_Keep.Columns_To_Keep import project.Constants.Report_Unmatched from project.Errors import Column_Count_Mismatch, No_Output_Columns, Unmatched_Columns @@ -31,41 +34,93 @@ type Match_Columns columns should appear in the resulting table. The method assumes at least one table is provided in its input. -match_columns tables matching_mode keep_unmatched_columns problem_builder = case matching_mode of - Match_Columns.By_Name -> case keep_unmatched_columns of - False -> +match_columns tables matching_mode columns_to_keep problem_builder = + assert tables.not_empty + case matching_mode of + Match_Columns.By_Name -> match_columns_by_name tables columns_to_keep problem_builder + Match_Columns.By_Position -> match_columns_by_position tables columns_to_keep problem_builder + +## PRIVATE +match_columns_by_name tables columns_to_keep problem_builder = case columns_to_keep of + Columns_To_Keep.In_List list -> if list.is_empty then Error.throw (Illegal_Argument.Error "The list of columns to keep cannot be empty.") else + output_column_names = list.distinct + column_counts = find_column_counts tables + all_tables_count = tables.length + unmatched_column_names = output_column_names.filter name-> + column_counts.get name 0 < all_tables_count + if unmatched_column_names.not_empty then + problem_builder.report_other_warning (Unmatched_Columns.Error unmatched_column_names) + build_column_set_by_name tables output_column_names + Columns_To_Keep.In_All -> + 
column_counts = find_column_counts tables + # This will only include columns that were present in all tables. + all_tables_count = tables.length + common_column_names = tables.first.column_names.filter name-> + column_counts.at name == all_tables_count + if common_column_names.is_empty then Error.throw (No_Output_Columns.Error "Unmatched columns are set to be dropped, but no common column names were found.") else + dropped_column_names = tables.map .column_names + . flatten + . filter (name-> column_counts.at name < all_tables_count) + . distinct + if dropped_column_names.not_empty then + problem_builder.report_other_warning (Unmatched_Columns.Error dropped_column_names) + build_column_set_by_name tables common_column_names + _ -> + output_column_names = distinct_columns_in_appearance_order tables + report_missing = case columns_to_keep of + Columns_To_Keep.In_Any -> False + Columns_To_Keep.In_Any_Warn_On_Missing -> True + if report_missing then column_counts = find_column_counts tables - # This will only include columns that were present in all tables. - common_column_names = tables.first.column_names.filter name-> - column_counts.at name == tables.length - if common_column_names.is_empty then Error.throw (No_Output_Columns.Error "Unmatched columns are set to be dropped, but no common column names were found.") else - common_column_names.map name-> - column_indices = tables.map table-> - table.column_names.index_of name - Column_Set.Value name column_indices - _ -> - output_column_names = distinct_columns_in_appearance_order tables - if keep_unmatched_columns == Report_Unmatched then - column_counts = find_column_counts tables - all_tables_count = tables.length - ## We iterate over output column names to get deterministic - order of unmatched columns. 
- unmatched_column_names = output_column_names.filter name-> - column_counts.get name 0 < all_tables_count - if unmatched_column_names.not_empty then - problem_builder.report_other_warning (Unmatched_Columns.Error unmatched_column_names) - output_column_names.map name-> - column_indices = tables.map table-> - table.columns.index_of col-> col.name==name - Column_Set.Value name column_indices - Match_Columns.By_Position -> + all_tables_count = tables.length + ## We iterate over output column names to get deterministic + order of unmatched columns. + unmatched_column_names = output_column_names.filter name-> + column_counts.get name 0 < all_tables_count + if unmatched_column_names.not_empty then + problem_builder.report_other_warning (Unmatched_Columns.Error unmatched_column_names) + build_column_set_by_name tables output_column_names + +## PRIVATE + Common logic for computing the final result of by-name matching. + Once the set of output column names is determined, we compute the + `Column_Set` by finding the corresponding column indices in each table (if found). 
+build_column_set_by_name tables output_column_names = + output_column_names.map name-> + column_indices = tables.map table-> + # TODO this gets O(N^2), we should optimize + table.column_names.index_of name + Column_Set.Value name column_indices + +## PRIVATE +match_columns_by_position tables columns_to_keep problem_builder = case columns_to_keep of + Columns_To_Keep.In_List _ -> + Error.throw (Illegal_Argument.Error "The In_List option for `columns_to_keep` cannot be used together with `By_Position` matching.") + _ -> column_counts = tables.map table-> table.columns.length minmax = column_counts.compute_bulk [Statistic.Minimum, Statistic.Maximum] - columns_to_take = if keep_unmatched_columns == False then minmax.first else minmax.second - if (minmax.first != minmax.second) && (keep_unmatched_columns == Report_Unmatched) then - problem_builder.report_other_warning (Column_Count_Mismatch.Error minmax.second minmax.first) - name_source = if keep_unmatched_columns == False then tables.first else - tables.find table-> table.columns.length == columns_to_take + min = minmax.first + max = minmax.second + columns_to_take = case columns_to_keep of + Columns_To_Keep.In_All -> min + Columns_To_Keep.In_Any -> max + Columns_To_Keep.In_Any_Warn_On_Missing -> max + has_unmatched_columns = min != max + if has_unmatched_columns then + should_report_unmatched = case columns_to_keep of + Columns_To_Keep.In_All -> True + Columns_To_Keep.In_Any -> False + Columns_To_Keep.In_Any_Warn_On_Missing -> True + # TODO should we rephrase the wording of the error? should it depend on In_Any_Warn_On_Missing vs In_All? + if should_report_unmatched then + problem_builder.report_other_warning (Column_Count_Mismatch.Error max min) + + name_source = case columns_to_keep of + Columns_To_Keep.In_All -> tables.first + _ -> + # We find the first table that has all the columns present. 
+ tables.find table-> table.columns.length == columns_to_take + column_sets = Vector.new columns_to_take i-> name = name_source.at i . name column_ids = tables.map table-> @@ -79,7 +134,7 @@ type Column_Set Value (name : Text) (column_indices : Vector Integer) ## PRIVATE - resolve_columns self all_tables = self.column_indices.zip all_tables i-> parent_table-> + resolve_columns self (all_tables : Vector) = self.column_indices.zip all_tables i-> parent_table-> case i of Nothing -> Nothing _ : Integer -> parent_table.at i diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Table.enso index 1db726c1ea9..d89f2c2bda7 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Table.enso @@ -23,6 +23,7 @@ import project.Aggregate_Column.Aggregate_Column import project.Blank_Selector.Blank_Selector import project.Column.Column import project.Column_Ref.Column_Ref +import project.Columns_To_Keep.Columns_To_Keep import project.Constants.Previous_Value import project.Constants.Report_Unmatched import project.Data_Formatter.Data_Formatter @@ -49,7 +50,9 @@ import project.Internal.Replace_Helpers import project.Internal.Split_Tokenize import project.Internal.Table_Helpers import project.Internal.Table_Helpers.Table_Column_Helper +import project.Internal.Table_Helpers.Union_Result_Type import project.Internal.Table_Ref.Table_Ref +import project.Internal.Value_Type_Helpers import project.Internal.Widget_Helpers import project.Join_Condition.Join_Condition import project.Join_Kind.Join_Kind @@ -2245,91 +2248,78 @@ type Table - tables: A single table or a vector of tables to append to this one. The tables are concatenated in the order they are specified, with `self` being the first one. + - columns_to_keep: Specifies which columns to keep. 
Defaults to keeping + columns that are present in any of the tables, reporting a warning for + columns that are not present in all tables and adding `Nothing` values + for them. - match_columns: Specifies how to match the columns. - If `Match_Columns.By_Name` - the columns are matched by name across all provided tables. - If unmatched columns are to be dropped, the resulting table will keep - only the set of columns that appear in all provided tables, in the - relative order that they appeared in the `self` table. - If unmatched columns are kept, they are added in the order of - appearance - i.e. first all columns from `self` will be added in the - original order, then any columns from the second table that were not - matched will be added at the end (preserving their relative order), - and so on for all the remaining tables. - If `Match_Columns.By_Position` - the columns are mapped by position. - If unmatched columns are to be dropped, the resulting table will have - as many columns as the table that had the least columns and the - column names of the first table (self) will be used. - If unmatched columns are kept, the resulting table will have as many - columns as the table with the most columns. Since the first table may - not have all the necessary columns to provide column names for the - result, the result will have column names taken from the first table - that has the biggest number of columns. - - keep_unmatched_columns: If set to `True`, unmatched columns are kept - and are padded with `Nothing` for tables that did not have them. - If set to `False`, only the common subset of columns is kept - any - column that is not present in all tables is dropped. Defaults to - `Report_Unmatched`, which behaves like `True` - unmatched columns are - kept and padded with `Nothing`, but a problem is reported. - - allow_type_widening: Specifies if the resulting column type should be - adjusted to fit columns from all arguments. 
If `True`, a common type - will be chosen for each column (see "Unifying Column Types" below). - If `False`, the resulting column type will be the same as in the first - table containing the column. In this case, all columns that are - concatenated must have the same type as the first one (unless this - had a `Mixed` type - in which case it will accept any other types). + The names of each column come from the first table in which the given + column appears. + The `In_List` option is not applicable when mapping columns by position. + Column names are taken from the first table if `In_All` and from the + first table that has the maximum number of columns if `In_Any`. - on_problems: Specifies how to handle problems if they occur, reporting them as warnings by default. - - If `keep_unmatched_columns` is set to `Report_Unmatched` (the - default): - - If matching by name and there are columns that are not present in - all tables, `Unmatched_Columns` is reported. - - If matching by position and column counts of the merged tables - differ, then a `Column_Count_Mismatch` is reported. The error will - contain the greatest column count as its `expected` value and the - smallest one as its `actual` value. - - If `keep_unmatched_columns` is set to `False` and matching by name, - it is possible that there are no columns that are common to all - provided tables, in that case `No_Output_Columns` is thrown as a - dataflow error regardless of the `on_problems` setting, because there - are no columns to include in the resulting table. - - If type widening is disabled and one of corresponding columns has a - type that is incompatible with the type coming from the first table, - a `Column_Type_Mismatch` is reported. The problematic column will be - dropped from the resulting table. With type widening disabled, the - subsequent tables must have the same types as the first one, unless - the type of the first one was `Mixed` which will accept any other - type.
- - If a common type coercion for a set of matched columns from - concatenated tables cannot be found, a `No_Common_Type` is reported. - In warning or ignore mode, the problematic column will be dropped - from the resulting table. - ? Unifying Column Types - If `allow_type_widening` is set to `True`, then the following rules are - used to find a common type that will fit values from all merged tables. - - Numeric columns are unified by finding the most general type that can - fit all of the columns. The biggest integer type will be chosen and if + Numeric columns are unified by finding the smallest type that can fit + all of the columns. The biggest integer type will be chosen and if integers and decimals are mixed, the decimal type will be chosen. If boolean columns are mixed with numeric columns, they will be coerced to the numeric type (and converted to 0 and 1). - Text types will also be coerced according to the common rules - if - constant-length texts of different lengths are mixed, they will be - coerced to a varying-length type. + Text types are also unified by finding the smallest type that can + fit all the values. If constant-length texts of different lengths are + mixed, they will be coerced to a varying-length type. + + If date and date-time columns are unified, this yields a date-time + column. In-memory, the date is promoted by adding a time of 00:00 and + the system time-zone. In other backends that behaviour may differ. If one of the matched columns has `Mixed` type, that type will be used - regardless of types of other columns. Mixing any other types will - result in a `No_Common_Type` problem. If columns of incompatible types - are meant to be mixed, at least one of them should be explicitly - retyped to the `Mixed` type to indicate that intention. Note that the - `Mixed` type may not be supported by most Database backends.
- union : (Table | Vector Table) -> Match_Columns -> Boolean | Report_Unmatched -> Boolean -> Problem_Behavior -> Table - union self tables:(Table | Vector) match_columns=Match_Columns.By_Name keep_unmatched_columns=Report_Unmatched allow_type_widening=True on_problems=Report_Warning = - Table.from_union ([self] + Vector.unify_vector_or_element tables) match_columns keep_unmatched_columns allow_type_widening on_problems + regardless of types of other columns. Note that the `Mixed` type may + not be supported by most Database backends. + + Finally, if no common type is found using the rules above, everything + is converted to text. + + ? Problem Conditions + + - If no common type is found and the text conversion fallback is used, + the `No_Common_Type` problem is reported. + - The `Float` type may not be able to exactly represent larger + integers, thus if such large integers are mixed with floats, the + resulting conversion to `Float` may cause a loss of precision. + In that case, a `Loss_Of_Integer_Precision` problem is reported. + This warning is only reported in the in-memory backend. Currently, + the Database backend proceeds without a warning about precision loss. + - If a column of dates is unified with a column of date-times, since + the assumption of using the midnight time-of-day is arbitrary, + an `Implicit_Date_As_Date_Time_Conversion` problem is reported. + - If an empty vector of tables is provided, an `Illegal_Argument` error + is raised. + - If `columns_to_keep` is set to `In_All` or `In_List` and an expected + column is missing in some of the tables, an `Unmatched_Columns` + problem is reported. If this causes the output to contain no columns, + a `No_Output_Columns` error is raised. + + ? Ordering of Columns in the result + + When matching columns by name, it is possible that the ordering of + columns may vary between input tables.
The ordering is determined as + follows: columns that are kept from the first table are in the order + they appear in that table. If there are columns that do not appear in + the first table, they are appended to the end of the resulting table in + the order they appear in the input. + @tables (Widget.Vector_Editor item_editor=Widget.Code_Input item_default='_' display=Display.Always) + @columns_to_keep Columns_To_Keep.default_widget + union : (Table | Vector Table) -> Columns_To_Keep -> Match_Columns -> Problem_Behavior -> Table + union self tables:(Table | Vector) (columns_to_keep : Columns_To_Keep = ..In_Any_Warn_On_Missing) (match_columns : Match_Columns = Match_Columns.By_Name) (on_problems : Problem_Behavior = Report_Warning) = + Table.from_union ([self] + Vector.unify_vector_or_element tables) columns_to_keep match_columns on_problems ## ALIAS drop_missing_rows, dropna GROUP Standard.Base.Selections @@ -2953,108 +2943,97 @@ type Table Arguments: - tables: A vector of tables to union together. The tables are concatenated in the order they are specified. + - columns_to_keep: Specifies which columns to keep. Defaults to keeping + columns that are present in any of the tables, reporting a warning for + columns that are not present in all tables and adding `Nothing` values + for them. - match_columns: Specifies how to match the columns. - If `Match_Columns.By_Name` - the columns are matched by name across all provided tables. - If unmatched columns are to be dropped, the resulting table will keep - only the set of columns that appear in all provided tables, in the - relative order that they appeared in the `self` table. - If unmatched columns are kept, they are added in the order of - appearance - i.e. first all columns from `self` will be added in the - original order, then any columns from the second table that were not - matched will be added at the end (preserving their relative order), - and so on for all the remaining tables.
- If `Match_Columns.By_Position` - the columns are mapped by position. - If unmatched columns are to be dropped, the resulting table will have - as many columns as the table that had the least columns and the - column names of the first table (self) will be used. - If unmatched columns are kept, the resulting table will have as many - columns as the table with the most columns. Since the first table may - not have all the necessary columns to provide column names for the - result, the result will have column names taken from the first table - that has the biggest number of columns. - - keep_unmatched_columns: If set to `True`, unmatched columns are kept - and are padded with `Nothing` for tables that did not have them. - If set to `False`, only the common subset of columns is kept - any - column that is not present in all tables is dropped. Defaults to - `Report_Unmatched`, which behaves like `True` - unmatched columns are - kept and padded with `Nothing`, but a problem is reported. - - allow_type_widening: Specifies if the resulting column type should be - adjusted to fit columns from all arguments. If `True`, a common type - will be chosen for each column (see "Unifying Column Types" below). - If `False`, the resulting column type will be the same as in the first - table containing the column. In this case, all columns that are - concatenated must have the same type as the first one (unless this - had a `Mixed` type - in which case it will accept any other types). + The names of each column come from the first table in which the given + column appears. + The `In_List` option is not applicable when mapping columns by position. + Column names are taken from the first table if `In_All` and from the + first table that has the maximum number of columns if `In_Any`. - on_problems: Specifies how to handle problems if they occur, reporting them as warnings by default.
- - If `keep_unmatched_columns` is set to `Report_Unmatched` (the - default): - - If matching by name and there are columns that are not present in - all tables, `Unmatched_Columns` is reported. - - If matching by position and column counts of the merged tables - differ, then a `Column_Count_Mismatch` is reported. The error will - contain the greatest column count as its `expected` value and the - smallest one as its `actual` value. - - If `keep_unmatched_columns` is set to `False` and matching by name, - it is possible that there are no columns that are common to all - provided tables, in that case `No_Output_Columns` is thrown as a - dataflow error regardless of the `on_problems` setting, because there - are no columns to include in the resulting table. - - If type widening is disabled and one of corresponding columns has a - type that is incompatible with the type coming from the first table, - a `Column_Type_Mismatch` is reported. The problematic column will be - dropped from the resulting table. With type widening disabled, the - subsequent tables must have the same types as the first one, unless - the type of the first one was `Mixed` which will accept any other - type. - - If a common type coercion for a set of matched columns from - concatenated tables cannot be found, a `No_Common_Type` is reported. - In warning or ignore mode, the problematic column will be dropped - from the resulting table. - ? Unifying Column Types - If `allow_type_widening` is set to `True`, then the following rules are - used to find a common type that will fit values from all merged tables. - - Numeric columns are unified by finding the most general type that can - fit all of the columns. The biggest integer type will be chosen and if + Numeric columns are unified by finding the smallest type that can fit + all of the columns. The biggest integer type will be chosen and if integers and decimals are mixed, the decimal type will be chosen. 
If boolean columns are mixed with numeric columns, they will be coerced to the numeric type (and converted to 0 and 1). - Text types will also be coerced according to the common rules - if - constant-length texts of different lengths are mixed, they will be - coerced to a varying-length type. + Text types are also unified by finding the smallest type that can + fit all the values. If constant-length texts of different lengths are + mixed, they will be coerced to a varying-length type. + + If date and date-time columns are unified, this yields a date-time + column. In-memory, the date is promoted by adding a time of 00:00 and + the system time-zone. In other backends that behaviour may differ. If one of the matched columns has `Mixed` type, that type will be used - regardless of types of other columns. Mixing any other types will - result in a `No_Common_Type` problem. If columns of incompatible types - are meant to be mixed, at least one of them should be explicitly - retyped to the `Mixed` type to indicate that intention. Note that the - `Mixed` type may not be supported by most Database backends. - from_union : (Vector Table) -> Match_Columns -> Boolean | Report_Unmatched -> Boolean -> Problem_Behavior -> Table - from_union tables:(Vector) match_columns=Match_Columns.By_Name keep_unmatched_columns=Report_Unmatched allow_type_widening=True on_problems=Report_Warning = + regardless of types of other columns. Note that the `Mixed` type may + not be supported by most Database backends. + + Finally, if no common type is found using the rules above, everything + is converted to text. + + ? Problem Conditions + + - If no common type is found and the text conversion fallback is used, + the `No_Common_Type` problem is reported. + - The `Float` type may not be able to exactly represent larger + integers, thus if such large integers are mixed with floats, the + resulting conversion to `Float` may cause a loss of precision.
+ In that case, a `Loss_Of_Integer_Precision` problem is reported. + This warning is only reported in the in-memory backend. Currently, + the Database backend proceeds without a warning about precision loss. + - If a column of dates is unified with a column of date-times, since + the assumption of using the midnight time-of-day is arbitrary, + an `Implicit_Date_As_Date_Time_Conversion` problem is reported. + - If an empty vector of tables is provided, an `Illegal_Argument` error + is raised. + - If `columns_to_keep` is set to `In_All` or `In_List` and an expected + column is missing in some of the tables, an `Unmatched_Columns` + problem is reported. If this causes the output to contain no columns, + a `No_Output_Columns` error is raised. + + ? Ordering of Columns in the result + + When matching columns by name, it is possible that the ordering of + columns may vary between input tables. The ordering is determined as + follows: columns that are kept from the first table are in the order + they appear in that table. If there are columns that do not appear in + the first table, they are appended to the end of the resulting table in + the order they appear in the input. + @tables (Widget.Vector_Editor item_editor=Widget.Code_Input item_default='_' display=Display.Always) + @columns_to_keep Columns_To_Keep.default_widget + from_union : (Vector Table) -> Columns_To_Keep -> Match_Columns -> Problem_Behavior -> Table !
No_Output_Columns | Illegal_Argument + from_union (tables : Vector) (columns_to_keep : Columns_To_Keep = ..In_Any_Warn_On_Missing) (match_columns : Match_Columns = Match_Columns.By_Name) (on_problems : Problem_Behavior = Report_Warning) = all_tables = (tables.map t-> Table.from t) - all_tables.if_not_error <| + if all_tables.is_empty then Error.throw (Illegal_Argument.Error "`Table.from_union` needs at least 1 input table.") else ## We keep separate problem builders, because if we are reporting `No_Output_Columns`, we only want to add a cause coming from unification; matching reports problems that would not fit this error. problem_builder_for_matching = Problem_Builder.new problem_builder_for_unification = Problem_Builder.new - matched_column_sets = Match_Columns_Helpers.match_columns all_tables match_columns keep_unmatched_columns problem_builder_for_matching + matched_column_sets = Match_Columns_Helpers.match_columns all_tables match_columns columns_to_keep problem_builder_for_matching result_row_count = all_tables.fold 0 c-> t-> c + t.row_count merged_columns = matched_column_sets.map column_set-> - case Table_Helpers.unify_result_type_for_union column_set all_tables allow_type_widening problem_builder_for_unification of - Nothing -> Nothing - result_type : Value_Type -> - concat_columns column_set all_tables result_type result_row_count on_problems - good_columns = merged_columns.filter Filter_Condition.Not_Nothing + case Table_Helpers.unify_result_type_for_union column_set all_tables problem_builder_for_unification of + Union_Result_Type.Common_Type common_type -> + concat_columns column_set all_tables common_type result_row_count needs_cast=False on_problems + Union_Result_Type.Fallback_To_Text -> + concat_columns column_set all_tables Value_Type.Char result_row_count needs_cast=True on_problems + Union_Result_Type.No_Types_To_Unify -> + Column.from_repeated_item column_set.name Nothing result_row_count problem_builder_for_matching.attach_problems_before 
on_problems <| problem_builder_for_unification.attach_problems_before on_problems <| - if good_columns.is_empty then problem_builder_for_unification.raise_no_output_columns_with_cause else - Table.new good_columns + if merged_columns.is_empty then problem_builder_for_unification.raise_no_output_columns_with_cause else + Table.new merged_columns ## PRIVATE A helper to create a new table consisting of slices of the original table. @@ -3073,7 +3052,7 @@ make_join_helpers left_table right_table = ## PRIVATE A helper that efficiently concatenates storages of in-memory columns. -concat_columns column_set all_tables result_type result_row_count on_problems = +concat_columns column_set all_tables result_type result_row_count needs_cast on_problems = Java_Problems.with_problem_aggregator on_problems java_problem_aggregator-> storage_builder = make_storage_builder_for_type result_type on_problems initial_size=result_row_count java_problem_aggregator column_set.column_indices.zip all_tables i-> parent_table-> @@ -3082,7 +3061,9 @@ concat_columns column_set all_tables result_type result_row_count on_problems = null_row_count = parent_table.row_count storage_builder.appendNulls null_row_count _ : Integer -> - storage = parent_table.at i . java_column . 
getStorage + column = parent_table.at i + converted = if needs_cast then column.cast result_type on_problems=Report_Error else column + storage = converted.java_column.getStorage storage_builder.appendBulkStorage storage sealed_storage = storage_builder.seal Column.from_storage column_set.name sealed_storage diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/DateTimeBuilder.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/DateTimeBuilder.java index aadc4167213..35fe0672aa0 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/DateTimeBuilder.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/DateTimeBuilder.java @@ -1,11 +1,16 @@ package org.enso.table.data.column.builder; +import java.time.LocalDate; +import java.time.ZoneId; import java.time.ZonedDateTime; import org.enso.table.data.column.storage.Storage; +import org.enso.table.data.column.storage.datetime.DateStorage; import org.enso.table.data.column.storage.datetime.DateTimeStorage; import org.enso.table.data.column.storage.type.DateTimeType; +import org.enso.table.data.column.storage.type.DateType; import org.enso.table.data.column.storage.type.StorageType; import org.enso.table.error.ValueTypeMismatchException; +import org.graalvm.polyglot.Context; /** A builder for string columns. */ public class DateTimeBuilder extends TypedBuilderImpl { @@ -23,6 +28,14 @@ public class DateTimeBuilder extends TypedBuilderImpl { return DateTimeType.INSTANCE; } + /** + * TODO DRY {@link org.enso.table.data.column.operation.cast.ToDateTimeStorageConverter} + * convertDate. 
+ */ + private ZonedDateTime convertDate(LocalDate date) { + return date.atStartOfDay().atZone(ZoneId.systemDefault()); + } + @Override public void appendNoGrow(Object o) { try { @@ -32,6 +45,34 @@ public class DateTimeBuilder extends TypedBuilderImpl { } } + @Override + public void appendBulkStorage(Storage storage) { + if (storage.getType() instanceof DateType) { + if (storage instanceof DateStorage dateStorage) { + Context context = Context.getCurrent(); + for (int i = 0; i < dateStorage.size(); ++i) { + LocalDate date = dateStorage.getItemBoxed(i); + if (date == null) { + data[currentSize++] = null; + } else { + data[currentSize++] = convertDate(date); + } + + context.safepoint(); + } + } else { + throw new IllegalStateException( + "Unexpected storage implementation for type " + + storage.getType() + + ": " + + storage + + ". This is a bug in the Table library."); + } + } else { + super.appendBulkStorage(storage); + } + } + @Override public boolean accepts(Object o) { return o instanceof ZonedDateTime; diff --git a/std-bits/table/src/main/java/org/enso/table/data/column/builder/TypedBuilder.java b/std-bits/table/src/main/java/org/enso/table/data/column/builder/TypedBuilder.java index c12a90bf379..b1000ddd875 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/column/builder/TypedBuilder.java +++ b/std-bits/table/src/main/java/org/enso/table/data/column/builder/TypedBuilder.java @@ -28,6 +28,15 @@ public abstract class TypedBuilder extends Builder { */ public abstract TypedBuilder retypeTo(StorageType type); - /** Specifies if the following object will be accepted by this builder's append* methods. */ + /** + * Specifies if the following object will be accepted by this builder's append* methods. + * + *

This is used to determine if a given value can be appended to the current builder, or if it + * needs to be retyped to a more general one. + * + *

Note that the {@code appendBulkStorage} method may still accept more types than {@code + * accept}. This is exploited by operations like Union where more flexibility in merging column + * types is allowed than in building new columns from scratch. + */ public abstract boolean accepts(Object o); } diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/Column.java b/std-bits/table/src/main/java/org/enso/table/data/table/Column.java index 2b599cc3d31..c5d868d1772 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/table/Column.java +++ b/std-bits/table/src/main/java/org/enso/table/data/table/Column.java @@ -171,16 +171,15 @@ public class Column { Object converted = Polyglot_Utils.convertPolyglotValue(item); - Builder builder; if (converted == null) { - builder = new MixedBuilder(repeat); - } else { - StorageType storageType = StorageType.forBoxedItem(converted); - builder = Builder.getForType(storageType, repeat, problemAggregator); + Builder builder = new MixedBuilder(repeat); + builder.appendNulls(repeat); + return new Column(name, builder.seal()); } + StorageType storageType = StorageType.forBoxedItem(converted); + Builder builder = Builder.getForType(storageType, repeat, problemAggregator); Context context = Context.getCurrent(); - for (int i = 0; i < repeat; i++) { builder.appendNoGrow(converted); context.safepoint(); diff --git a/test/Table_Tests/src/Common_Table_Operations/Join/Union_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Join/Union_Spec.enso index 1303bced126..4a15f52745a 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Join/Union_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Join/Union_Spec.enso @@ -9,7 +9,7 @@ from Standard.Database.Errors import Unsupported_Database_Operation, Integrity_E from Standard.Test import all -from project.Common_Table_Operations.Util import expect_column_names, run_default_backend, within_table +from project.Common_Table_Operations.Util import all import project.Util 
main filter=Nothing = run_default_backend add_specs filter @@ -42,6 +42,11 @@ add_specs suite_builder setup = suite_builder.group prefix+"Table.from_union" pending=db_pending group_builder-> run_union_tests group_builder setup call_static_union + group_builder.specify "should fail if no tables are provided" <| + r = Table.from_union [] + r.should_fail_with Illegal_Argument + r.catch.to_display_text . should_contain "at least 1" + run_union_tests group_builder setup call_union = create_connection_fn = setup.create_connection_func data = Data.setup create_connection_fn @@ -90,37 +95,70 @@ run_union_tests group_builder setup call_union = problems2 = [Unmatched_Columns.Error ["A", "D"]] Problems.test_problem_handling action2 problems2 tester2 - group_builder.specify "should drop unmatched columns if asked to" <| + group_builder.specify "should fill unmatched columns with nulls with no warning, if In_Any is explicitly chosen" <| + t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] + t2 = table_builder [["C", ["d", "e", "f"]], ["A", [4, 5, 6]]] + t3 = table_builder [["D", [Nothing, Nothing, 0]], ["C", ["g", "h", "i"]]] + + table = call_union [t1, t2, t3] columns_to_keep=..In_Any on_problems=..Report_Error + Problems.assume_no_problems table + expect_column_names ["A", "B", "C", "D"] table + table.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6, Nothing, Nothing, Nothing] + table.at "B" . to_vector . should_equal ["a", "b", "c", Nothing, Nothing, Nothing, Nothing, Nothing, Nothing] + table.at "C" . to_vector . should_equal [Nothing, Nothing, Nothing, "d", "e", "f", "g", "h", "i"] + table.at "D" . to_vector . 
should_equal [Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, 0] + + group_builder.specify "should drop unmatched columns and warn, if In_All is selected" <| t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] t2 = table_builder [["C", ["d", "e", "f"]], ["A", [4, 5, 6]]] t3 = table_builder [["A", [Nothing, Nothing, 0]], ["C", ["g", "h", "i"]]] - t4 = call_union[t1, t2, t3] keep_unmatched_columns=False on_problems=Problem_Behavior.Report_Error - Problems.assume_no_problems t4 + t4 = call_union [t1, t2, t3] columns_to_keep=..In_All + w = Problems.expect_only_warning Unmatched_Columns t4 + w.column_names.should_equal ["B", "C"] expect_column_names ["A"] t4 t4.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6, Nothing, Nothing, 0] - group_builder.specify "should keep unmatched columns without errors if asked to" <| - t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] - t2 = table_builder [["C", ["d", "e", "f"]], ["A", [4, 5, 6]]] - t3 = table_builder [["A", [Nothing, Nothing, 0]], ["C", ["g", "h", "i"]]] - - t4 = call_union [t1, t2, t3] keep_unmatched_columns=True on_problems=Problem_Behavior.Report_Error - Problems.assume_no_problems t4 - expect_column_names ["A", "B", "C"] t4 - t4.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6, Nothing, Nothing, 0] - t4.at "B" . to_vector . should_equal ["a", "b", "c", Nothing, Nothing, Nothing, Nothing, Nothing, Nothing] - t4.at "C" . to_vector . 
should_equal [Nothing, Nothing, Nothing, "d", "e", "f", "g", "h", "i"] - group_builder.specify "should fail if asked to drop unmatched columns but the set of common columns is empty" <| t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] t2 = table_builder [["C", ["d", "e", "f"]], ["A", [4, 5, 6]]] t3 = table_builder [["D", [Nothing, Nothing, 0]], ["C", ["g", "h", "i"]]] - t4 = call_union [t1, t2, t3] keep_unmatched_columns=False on_problems=Problem_Behavior.Ignore + t4 = call_union [t1, t2, t3] columns_to_keep=..In_All on_problems=..Ignore t4.should_fail_with No_Output_Columns t4.catch.to_display_text . should_equal "No columns in the result, because of another problem: Unmatched columns are set to be dropped, but no common column names were found." + group_builder.specify "should allow to select specified columns for union by In_List, using the ordering from the list" <| + t1 = table_builder [["A", [1]], ["X", [2]], ["B", ["a"]], ["Y", [3]]] + t2 = table_builder [["A", [4]], ["Z", [5]], ["B", ["b"]], ["X", [6]]] + + t3 = call_union [t1, t2] columns_to_keep=(..In_List ["B", "A"]) + expect_column_names ["B", "A"] t3 + t3.at "B" . to_vector . should_equal ["a", "b"] + t3.at "A" . to_vector . should_equal [1, 4] + + group_builder.specify "should add a Null column for unmatched columns from In_List" <| + t1 = table_builder [["A", [1]], ["X", [2]]] + t2 = table_builder [["Z", [4]], ["A", [5]]] + + t3 = call_union [t1, t2] columns_to_keep=(..In_List ["B", "A"]) + expect_column_names ["B", "A"] t3 + t3.at "B" . to_vector . should_equal [Nothing, Nothing] + t3.at "A" . to_vector . 
should_equal [1, 5] + + group_builder.specify "does not allow an empty list in In_List" <| + t1 = table_builder [["A", [1]], ["X", [2]]] + t2 = table_builder [["Z", [4]], ["A", [5]]] + r = call_union [t1, t2] columns_to_keep=(..In_List []) + r.should_fail_with Illegal_Argument + + group_builder.specify "does not error if duplicate entries appear in the In_List" <| + t1 = table_builder [["A", [1]], ["X", [2]], ["B", ["a"]], ["Y", [3]]] + t2 = table_builder [["A", [4]], ["Z", [5]], ["B", ["b"]], ["X", [6]]] + + t3 = call_union [t1, t2] columns_to_keep=(..In_List ["B", "B", "A", "A", "B"]) + expect_column_names ["B", "A"] t3 + group_builder.specify "should ignore column names when matching by position" <| t1 = table_builder [["A", [1, 2, 3]], ["Y", ["a", "b", "c"]]] t2 = table_builder [["X", [4, 5, 6]], ["A", ["d", "e", "f"]]] @@ -144,28 +182,36 @@ run_union_tests group_builder setup call_union = problems = [Column_Count_Mismatch.Error 3 1] Problems.test_problem_handling action problems tester - group_builder.specify "should keep the least number of columns with positional matching if asked to drop unmatched ones" <| + group_builder.specify "should keep the least number of columns with positional matching if In_All" <| t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] t2 = table_builder [["A1", [4, 5, 6]], ["B1", ["d", "e", "f"]], ["C", [7, 8, 9]]] t3 = table_builder [["A2", [10, 20, 30]]] - t4 = call_union [t1, t2, t3] keep_unmatched_columns=False match_columns=Match_Columns.By_Position on_problems=Problem_Behavior.Report_Error - Problems.assume_no_problems t4 + t4 = call_union [t1, t2, t3] columns_to_keep=..In_All match_columns=..By_Position expect_column_names ["A"] t4 t4.at "A" . to_vector . 
should_equal [1, 2, 3, 4, 5, 6, 10, 20, 30] + w = Problems.expect_only_warning Column_Count_Mismatch t4 + w.expected.should_equal 3 + w.actual.should_equal 1 - group_builder.specify "should keep the greatest number of columns with positional matching if asked to keep unmatched ones, filling missing values with null and reporting no problems" <| + group_builder.specify "should keep the greatest number of columns with positional matching if In_Any, reporting no problems" <| t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] t2 = table_builder [["A1", [4, 5, 6]], ["B1", ["d", "e", "f"]], ["C", [7, 8, 9]]] t3 = table_builder [["A2", [10, 20, 30]]] - t4 = call_union [t1, t2, t3] match_columns=Match_Columns.By_Position keep_unmatched_columns=True on_problems=Problem_Behavior.Ignore + t4 = call_union [t1, t2, t3] columns_to_keep=..In_Any match_columns=..By_Position on_problems=..Report_Error Problems.assume_no_problems t4 expect_column_names ["A1", "B1", "C"] t4 t4.at "A1" . to_vector . should_equal [1, 2, 3, 4, 5, 6, 10, 20, 30] t4.at "B1" . to_vector . should_equal ["a", "b", "c", "d", "e", "f", Nothing, Nothing, Nothing] t4.at "C" . to_vector . should_equal [Nothing, Nothing, Nothing, 7, 8, 9, Nothing, Nothing, Nothing] + group_builder.specify "does not allow In_List with positional matching" <| + t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] + t2 = table_builder [["A1", [4, 5, 6]], ["B1", ["d", "e", "f"]], ["C", [7, 8, 9]]] + r = call_union [t1, t2] columns_to_keep=(..In_List ["A", "B"]) match_columns=Match_Columns.By_Position + r.should_fail_with Illegal_Argument + group_builder.specify "should use column names from the first table that has enough columns in positional matching mode" <| t1 = table_builder [["A", [1, 2, 3]]] t2 = table_builder [["X", [4, 5, 6]], ["A", ["a", "b", "c"]]] @@ -180,10 +226,6 @@ run_union_tests group_builder setup call_union = check t3 Problems.get_attached_warnings t3 . 
should_equal [Column_Count_Mismatch.Error 2 1] - t4 = call_union [t1, t2] match_columns=Match_Columns.By_Position keep_unmatched_columns=True - within_table t4 <| - check t4 - t5 = table_builder [["Y", [7, 8, 9]], ["A", ["d", "e", "f"]], ["Z", [10, 11, 12]]] t6 = table_builder [["W", [0]]] t7 = table_builder [["X", [7, 8, 9]], ["Y", ["d", "e", "f"]], ["Z", [10, 11, 12]]] @@ -216,11 +258,13 @@ run_union_tests group_builder setup call_union = check_same <| call_union [t1] check_same <| call_union [t1] match_columns=Match_Columns.By_Position - check_same <| call_union [t1] keep_unmatched_columns=False - check_same <| call_union [t1] match_columns=Match_Columns.By_Position keep_unmatched_columns=False + check_same <| call_union [t1] columns_to_keep=..In_All + check_same <| call_union [t1] match_columns=Match_Columns.By_Position columns_to_keep=..In_All - check_same <| call_union [t1] keep_unmatched_columns=True - check_same <| call_union [t1] match_columns=Match_Columns.By_Position keep_unmatched_columns=True + check_same <| call_union [t1] columns_to_keep=..In_Any + check_same <| call_union [t1] match_columns=Match_Columns.By_Position columns_to_keep=..In_Any + + check_same <| call_union [t1] columns_to_keep=(..In_List ["A", "B"]) group_builder.specify "should correctly unify text columns of various lengths" pending=(if setup.test_selection.fixed_length_text_columns.not then "Fixed-length Char columns are not supported by this backend.") <| t1 = (table_builder [["A", ["a", "b", "c"]]]) . cast "A" (Value_Type.Char size=1 variable_length=False) @@ -231,12 +275,13 @@ run_union_tests group_builder setup call_union = t3 = call_union [t1, t2] expect_column_names ["A"] t3 + Problems.assume_no_problems t3 t3.at "A" . to_vector . should_equal ["a", "b", "c", "xyz", "abc", "def"] t3.at "A" . value_type . is_text . should_be_true Test.with_clue "t3[A].value_type="+(t3.at "A").value_type.to_display_text+": " <| t3.at "A" . value_type . variable_length . 
should_be_true - group_builder.specify "should find a common type that will fit the merged columns" <| + group_builder.specify "should find a common type that will fit the merged columns (Integer + Float)" <| t1 = table_builder [["A", [0, 1, 2]]] t2 = table_builder [["A", [1.0, 2.0, 2.5]]] @@ -245,29 +290,77 @@ run_union_tests group_builder setup call_union = t3 = call_union [t1, t2] expect_column_names ["A"] t3 + Problems.assume_no_problems t3 t3.at "A" . value_type . is_floating_point . should_be_true t3.at "A" . to_vector . should_equal [0, 1, 2, 1.0, 2.0, 2.5] - # Specific type tests that apply to in-memory. Database behaviour is up to implementation. - if setup.is_database.not then - t4 = table_builder [["A", [2^100, 2^10, 2]]] - t4.at "A" . value_type . should_be_a (Value_Type.Decimal ...) + group_builder.specify "should find a common type that will fit the merged columns (numeric + Boolean)" <| + t1 = table_builder [["A", [0, 1, 20]]] + t2 = table_builder [["A", [True, False, True]]] - t5 = call_union [t2, t4] - expect_column_names ["A"] t5 - t5.at "A" . value_type . is_floating_point . should_be_true - t5.at "A" . to_vector . should_equal [1.0, 2.0, 2.5, 2^100, 2^10, 2] + t1.at "A" . value_type . is_integer . should_be_true + t2.at "A" . value_type . should_equal Value_Type.Boolean - t6 = call_union [t1, t4] - expect_column_names ["A"] t6 - t6.at "A" . value_type . should_be_a (Value_Type.Decimal ...) - t6.at "A" . to_vector . should_equal [0, 1, 2, 2^100, 2^10, 2] + t3 = call_union [t1, t2] + expect_column_names ["A"] t3 + Problems.assume_no_problems t3 + t3.at "A" . value_type . is_integer . should_be_true + t3.at "A" . to_vector . should_equal [0, 1, 20, 1, 0, 1] + + t4 = table_builder [["A", [1.5, 0.0, 2.0]]] + t5 = call_union [t2, t4] + Problems.assume_no_problems t5 + t5.at "A" . value_type . is_floating_point . should_be_true + t5.at "A" . to_vector . 
should_equal [1.0, 0.0, 1.0, 1.5, 0.0, 2.0] + + group_builder.specify "should warn about loss of precision when converting large Integer to Float" pending=(if setup.is_database then "Loss_Of_Integer_Precision not yet supported in DB.") <| + # 2^70 is not exactly representable as a Float. + t1 = table_builder [["A", [2^70, 2^10, 2]]] + t2 = table_builder [["A", [1.5, 2.0, 2.5]]] + t1.at "A" . value_type . is_decimal . should_be_true + t2.at "A" . value_type . is_floating_point . should_be_true + + t3 = call_union [t1, t2] + expect_column_names ["A"] t3 + w = Problems.expect_only_warning Loss_Of_Integer_Precision t3 + # TODO should we try to include column name here for context? may be worth it... + w.affected_rows_count.should_equal 1 + t3.at "A" . value_type . is_floating_point . should_be_true + t3.at "A" . to_vector . should_equal [(2^70).to_float, 2^10, 2, 1.5, 2.0, 2.5] + + group_builder.specify "should find a common type (Integer and Char of different sizes)" <| + t1 = (table_builder [["X", [0, 1, 2]], ["Y", ['aa', 'bb', 'cc']]]) . cast "X" (Value_Type.Integer Bits.Bits_16) . cast "Y" (Value_Type.Char size=2 variable_length=False) + t2 = (table_builder [["X", [3, 4, 5]], ["Y", ['x', 'y', 'z']]]) . cast "X" (Value_Type.Integer Bits.Bits_32) . cast "Y" (Value_Type.Char size=1 variable_length=False) + supports_complex_types = (t1.is_error || t2.is_error || Problems.get_attached_warnings t1 . not_empty).not + case supports_complex_types of + False -> Nothing + True -> + t12 = call_union [t1, t2] + # No warnings are expected + Problems.assume_no_problems t12 + t12.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_32) + t12.at "Y" . value_type . should_equal (Value_Type.Char size=2 variable_length=True) + + t12.at "X" . to_vector . should_equal [0, 1, 2, 3, 4, 5] + t12.at "Y" . to_vector . should_equal ['aa', 'bb', 'cc', 'x', 'y', 'z'] + + date_time_pending = if setup.test_selection.date_time.not then "Date/Time operations are not supported." 
+ group_builder.specify "should warn when converting a Date to Date_Time" pending=date_time_pending <| + t1 = table_builder [["D", [Date_Time.new 2024 5 16 16 48 23]]] + t2 = table_builder [["D", [Date.new 2019 10 23, Date.new 2020]]] + + action = call_union [t1, t2] on_problems=_ + tester table = + expect_column_names ["D"] table + table.at "D" . value_type . should_equal Value_Type.Date_Time + table.at "D" . to_vector . should_equal_tz_agnostic [Date_Time.new 2024 5 16 16 48 23, Date_Time.new 2019 10 23 0 0 0, Date_Time.new 2020 1 1 0 0 0] + problems = [Mixing_Date_Time_Types.Date_To_Date_Time "D"] + problems.first.to_display_text . should_contain "[D]" + Problems.test_problem_handling action problems tester # Database backends are not required to support Mixed types. if setup.is_database.not then group_builder.specify "should resort to Mixed value type only if at least one column is already Mixed" <| - ## TODO currently no way to retype a column to Mixed, so we are - using a custom object t1 = table_builder [["A", [1, 2, 3]], ["mixed", ["a", My_Type.Value 1 2, Nothing]]] t2 = table_builder [["A", [4, 5, 6]], ["mixed", [1, 2, 3]]] t1.at "mixed" . value_type . should_equal Value_Type.Mixed @@ -291,145 +384,68 @@ run_union_tests group_builder setup call_union = t6.at "mixed" . to_vector . should_equal ["X", "y", "a", My_Type.Value 1 2, Nothing, 1, 2, 3, True, False] t6.at "mixed" . value_type . should_equal Value_Type.Mixed - group_builder.specify "if no common type can be found, should report error and drop the problematic column" <| + group_builder.specify "if no common type can be found, will fall back to converting all types to text and warn" <| t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]], ["C", [True, False, Nothing]]] t2 = table_builder [["C", ["x", "Y", "Z"]], ["A", [4, 5, 6]], ["B", [1, 2, 3]]] r1 = call_union [t1, t2] on_problems=Problem_Behavior.Report_Error r1.should_fail_with No_Common_Type + r1.catch.to_display_text . 
should_contain "converted to text" - r2 = call_union [t1, t2] on_problems=Problem_Behavior.Ignore - Problems.assume_no_problems r2 - - r3 = call_union [t1, t2] on_problems=Problem_Behavior.Report_Warning - w3 = Problems.get_attached_warnings r3 - w3.each w-> w.should_be_a No_Common_Type - w3.map w-> - ## We look just at names of the Value_Type constructors, as - different database backends may choose integers of different - sizes and have differing settings for text types. - types = w.types.map value_type-> - Meta.meta value_type . constructor . name - (types == ["Char", "Integer"]) || (types == ["Boolean", "Char"]) . should_be_true - - # A boolean column cannot be merged with integers. - t3 = t1.select_columns ["C", "A"] reorder=True - t4 = t2.select_columns ["B", "A"] reorder=True - r4 = call_union [t3, t4] match_columns=Match_Columns.By_Position on_problems=Problem_Behavior.Report_Error - r4.should_fail_with No_Common_Type - - group_builder.specify "if type widening is not allowed, should use the type from first table that contained the given column" <| - t1 = table_builder [["A", [1, 2, 3]]] - t2 = table_builder [["A", [4, 5, 6]], ["B", [1.2, 2.2, 3.1]]] - - t3 = call_union [t1, t2] allow_type_widening=False keep_unmatched_columns=True - within_table t3 <| - Problems.assume_no_problems t3 - expect_column_names ["A", "B"] t3 - t3.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6] - t3.at "B" . to_vector . should_equal [Nothing, Nothing, Nothing, 1.2, 2.2, 3.1] - t3.at "A" . value_type . is_integer . should_be_true - t2.at "B" . value_type . is_floating_point . should_be_true - t3.at "B" . value_type . is_floating_point . should_be_true - - group_builder.specify "if type widening is not allowed and types do not match, should report error and drop the problematic column" <| - t1 = table_builder [["A", [1, 2, 3]], ["B", [1, 2, 3]], ["E", [1.1, 2.5, 3.2]]] - t2 = table_builder [["A", [4, 5, 6]], ["B", [1.5, 2.5, 3.5]], ["E", [1, 2, 3]]] - - t1.at "B" . 
value_type . is_integer . should_be_true - t1.at "E" . value_type . is_floating_point . should_be_true - - t2.at "B" . value_type . is_floating_point . should_be_true - t2.at "E" . value_type . is_integer . should_be_true - - action = call_union [t1, t2] allow_type_widening=False on_problems=_ - tester table = - expect_column_names ["A"] table + action = call_union [t1, t2] on_problems=_ + result_checker table = + expect_column_names ["A", "B", "C"] table + # If type was matched - the columns are merged as is: table.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6] + table.at "A" . value_type . is_integer . should_be_true + # If mixed, they are converted to text representation: + table.at "B" . to_vector . should_equal ["a", "b", "c", "1", "2", "3"] + table.at "B" . value_type . is_text . should_be_true - problem_checker problem = - problem.should_be_a Column_Type_Mismatch - True - err_checker err = - problem_checker err.catch - warn_checker warnings = - warnings.all problem_checker - Problems.test_advanced_problem_handling action err_checker warn_checker tester + v = table.at "C" . to_vector + # The check needs to be case insensitive because various database backends may represent Booleans with lower or uppercase. + v.take 2 . map (t -> t.to_case Case.Lower) . should_equal ["true", "false"] + # Nothing is preserved, not converted to text because we want to preserve the meaning of 'missing value': + v.drop 2 . should_equal [Nothing, "x", "Y", "Z"] - # Database backends are not required to support Mixed types. - if setup.is_database.not then - group_builder.specify "even if type widening is not allowed, if the first column is mixed, it should accept any column to be concatenated to it" <| - t1 = table_builder [["X", ["a", 1, Nothing]]] - t2 = table_builder [["X", [1]]] - t3 = table_builder [["X", [1.2, 2.3, 3.4]]] - t4 = table_builder [["X", ["a", "b"]]] - t5 = table_builder [["X", [True, False]]] + table.at "C" . value_type . is_text . 
should_be_true - t1.at "X" . value_type . should_equal Value_Type.Mixed - t2.at "X" . value_type . should_equal Value_Type.Integer + error_checker result = result.should_fail_with No_Common_Type + warnings_checker warnings = + warnings.map w-> + w.should_be_a No_Common_Type + w.to_display_text . should_contain "converted to text" + ["B", "C"].should_contain w.related_column_name + ## We look just at names of the Value_Type constructors, as + different database backends may choose integers of different + sizes and have differing settings for text types. + types = w.types.map value_type-> + Meta.meta value_type . constructor . name + Test.with_clue "(should be one of...) " <| + [["Char", "Integer"], ["Boolean", "Char"]].should_contain types + Problems.test_advanced_problem_handling action error_checker warnings_checker result_checker - t6 = call_union [t1, t2, t3, t4, t5] allow_type_widening=False - Problems.assume_no_problems t6 - t6.at "X" . value_type . should_equal Value_Type.Mixed - t6.at "X" . to_vector . should_equal ["a", 1, Nothing, 1, 1.2, 2.3, 3.4, "a", "b", True, False] + group_builder.specify "if no common type can be found, will fall back to converting all types to text and warn (Date+Time)" pending=date_time_pending <| + t1 = table_builder [["D", [Time_Of_Day.new 12, Time_Of_Day.new 13, Time_Of_Day.new 14]]] + t2 = table_builder [["D", [Date.new 2019, Date.new 2020, Date.new 2021]]] - group_builder.specify "when finding a common type for numeric columns to be Float, any precision loss should be reported" <| - t1 = table_builder [["X", [1, (2^62)-1, 3]]] - t2 = table_builder [["X", [1.5, 2.5, 3.5]]] - t3 = table_builder [["X", [(2^100)+1, 2^10, 2]]] + action = call_union [t1, t2] on_problems=_ + tester table = + expect_column_names ["D"] table + table.at "D" . to_vector . should_equal ["12:00:00", "13:00:00", "14:00:00", "2019-01-01", "2020-01-01", "2021-01-01"] + table.at "D" . value_type . is_text . 
should_be_true + problems = [No_Common_Type.Warning_Convert_To_Text [Value_Type.Time, Value_Type.Date] "D"] + Problems.test_problem_handling action problems tester - t1.at "X" . value_type . should_equal Value_Type.Integer - t2.at "X" . value_type . should_equal Value_Type.Float - t3.at "X" . value_type . should_be_a (Value_Type.Decimal ...) - - t4 = call_union [t2, t1, t3] allow_type_widening=True - t4.at "X" . value_type . should_equal Value_Type.Float - t4.at "X" . to_vector . should_equal [1.5, 2.5, 3.5, 1, (2^62)-1, 3, (2^100)+1 . to_float, 2^10, 2] - - w = Problems.expect_only_warning Loss_Of_Integer_Precision t4 - # Losing precision on (2^62)-1 and 2^100+1. - w.affected_rows_count . should_equal 2 - - group_builder.specify "if type mismatches cause all columns to be dropped, fail with No_Output_Columns" <| - t1 = table_builder [["A", [1, 2, 3]]] - t2 = table_builder [["A", ['x']]] - - e3 = call_union [t1, t2] allow_type_widening=True on_problems=Problem_Behavior.Ignore - e3.should_fail_with No_Output_Columns - - t4 = table_builder [["A", [1.5]]] - e5 = call_union [t1, t4] allow_type_widening=False on_problems=Problem_Behavior.Ignore - e5.should_fail_with No_Output_Columns - - group_builder.specify "should find a common type (2)" <| - t1 = (table_builder [["X", [0, 1, 2]], ["Y", ['aa', 'bb', 'cc']]]) . cast "X" (Value_Type.Integer Bits.Bits_16) . cast "Y" (Value_Type.Char size=2 variable_length=False) - t2 = (table_builder [["X", [3, 4, 5]], ["Y", ['x', 'y', 'z']]]) . cast "X" (Value_Type.Integer Bits.Bits_32) . cast "Y" (Value_Type.Char size=1 variable_length=False) - supports_complex_types = (t1.is_error || t2.is_error || Problems.get_attached_warnings t1 . not_empty).not - case supports_complex_types of - False -> Nothing - True -> - t12 = call_union [t1, t2] - Problems.assume_no_problems t12 - t12.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_32) - t12.at "Y" . value_type . 
should_equal (Value_Type.Char size=2 variable_length=True) - - t12.at "X" . to_vector . should_equal [0, 1, 2, 3, 4, 5] - t12.at "Y" . to_vector . should_equal ['aa', 'bb', 'cc', 'x', 'y', 'z'] - - group_builder.specify "should fail to find a common type if widening is not allowed (2)" <| - t1 = (table_builder [["X", [0, 1, 2]], ["Y", ['aa', 'bb', 'cc']]]) . cast "X" (Value_Type.Integer Bits.Bits_16) . cast "Y" (Value_Type.Char size=2 variable_length=False) - t2 = (table_builder [["X", [3, 4, 5]], ["Y", ['x', 'y', 'z']]]) . cast "X" (Value_Type.Integer Bits.Bits_32) . cast "Y" (Value_Type.Char size=1 variable_length=False) - supports_complex_types = (t1.is_error || t2.is_error || Problems.get_attached_warnings t1 . not_empty).not - case supports_complex_types of - False -> Nothing - True -> - r1 = call_union [t1, t2] allow_type_widening=False - r1.should_fail_with No_Output_Columns - r1.catch.cause . should_be_a Column_Type_Mismatch - r1.catch.to_display_text . should_equal "No columns in the result, because of another problem: The column [X] expects type Integer (16 bits) but one of the provided tables had type Integer (32 bits) which is not compatible with it." - - # And this should report Column_Type_Mismatch as the more important error too: - call_union [t1, t2] allow_type_widening=False on_problems=Problem_Behavior.Report_Error . should_fail_with Column_Type_Mismatch + group_builder.specify "will use the _output_ column name in the warnings when matching by position (so input names may differ)" <| + t1 = table_builder [["A", [1]]] + t2 = table_builder [["B", ["a"]]] + r1 = call_union [t1, t2] match_columns=Match_Columns.By_Position + expect_column_names ["A"] r1 + r1.at "A" . value_type . is_text . should_be_true + r1.at "A" . to_vector . 
should_equal ["1", "a"] + w = Problems.expect_only_warning No_Common_Type r1 + w.related_column_name.should_equal "A" group_builder.specify "should gracefully handle tables from different backends" <| t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]] diff --git a/test/Table_Tests/src/In_Memory/Table_Spec.enso b/test/Table_Tests/src/In_Memory/Table_Spec.enso index dbb8fd8fa8f..62b09c035be 100644 --- a/test/Table_Tests/src/In_Memory/Table_Spec.enso +++ b/test/Table_Tests/src/In_Memory/Table_Spec.enso @@ -822,13 +822,21 @@ add_specs suite_builder = int = ["int", [1, 2, 3, 3]] int2 = ["int2", [1, Nothing, Nothing, 1]] dbl = ["dbl", [0.0, 0.0, Nothing, Nothing]] - dates = ["dates", [Date.new 2000, Date.new 1999 1 1, Date.new 1999 1 1, Date_Time.new 2022 8 20]] - dts = ["dts", [Date_Time.new 2022 8 27 11 22 25, Nothing, Date_Time.new 2030, Date.new 2000]] - tod = ["tod", [Time_Of_Day.new 18 00, Time_Of_Day.new 18 19, Date_Time.new 2000 1 1, Time_Of_Day.new 18 19]] + dates = ["dates", [Date.new 2000, Date.new 1999 1 1, Date.new 1999 1 1, Nothing]] + dts = ["dts", [Date_Time.new 2022 8 27 11 22 25, Nothing, Nothing, Date_Time.new 2030]] + dts_mixed = ["dts_mixed", [Date_Time.new 2022 8 27 11 22 25, Nothing, Date_Time.new 2030, Date.new 2000]] + tod = ["tod", [Time_Of_Day.new 18 00, Time_Of_Day.new 18 19, Nothing, Time_Of_Day.new 18 19]] + tod_mixed = ["tod_mixed", [Time_Of_Day.new 18 00, Time_Of_Day.new 18 19, Date_Time.new 2000 1 1, Time_Of_Day.new 18 19]] mix = ["mix", [42, Date_Time.new 2022 8 27, 1, 1]] nulls = ["nulls", [Nothing, Nothing, Nothing, 0]] custom = ["custom", [2, My.Data 2 1, Nothing, Nothing]] - [str, int, int2, dbl, dates, dts, tod, mix, nulls, custom] + [str, int, int2, dbl, dates, dts, dts_mixed, tod, tod_mixed, mix, nulls, custom] + ins.at "dates" . value_type . should_equal Value_Type.Date + ins.at "dts" . value_type . is_date_time . should_be_true + ins.at "dts_mixed" . value_type . should_equal Value_Type.Mixed + ins.at "tod" . 
value_type . should_equal Value_Type.Time + ins.at "tod_mixed" . value_type . should_equal Value_Type.Mixed + data.varied_type_table.filter "strs" (Filter_Condition.Is_In (ins.at "str")) . at "strs" . to_vector . should_equal ["b", "c"] data.varied_type_table.filter "strs" (Filter_Condition.Is_In (ins.at "str" . to_vector)) . at "strs" . to_vector . should_equal ["b", "c"] data.varied_type_table.filter "ints" (Filter_Condition.Is_In (ins.at "int")) . at "ints" . to_vector . should_equal [1, 2] @@ -841,7 +849,11 @@ add_specs suite_builder = data.varied_type_table.filter "dates" (Filter_Condition.Is_In (ins.at "dates" . to_vector)) . at "dates" . to_vector . should_equal [Date.new 2000, Date.new 1999 1 1] data.varied_type_table.filter "datetimes" (Filter_Condition.Is_In (ins.at "dts")) . at "datetimes" . to_vector . should_equal [Date_Time.new 2022 8 27 11 22 25] data.varied_type_table.filter "datetimes" (Filter_Condition.Is_In (ins.at "dts" . to_vector)) . at "datetimes" . to_vector . should_equal [Date_Time.new 2022 8 27 11 22 25] + # The Date_Time.new 2000 should not match with Date.new 2000 because the types are different: + data.varied_type_table.filter "datetimes" (Filter_Condition.Is_In (ins.at "dts_mixed")) . at "datetimes" . to_vector . should_equal [Date_Time.new 2022 8 27 11 22 25] + data.varied_type_table.filter "dates" (Filter_Condition.Is_In (ins.at "dts_mixed")) . at "dates" . to_vector . should_equal [Date.new 2000] data.varied_type_table.filter "times" (Filter_Condition.Is_In (ins.at "tod")) . at "times" . to_vector . should_equal [Time_Of_Day.new 18 00] + data.varied_type_table.filter "times" (Filter_Condition.Is_In (ins.at "tod_mixed")) . at "times" . to_vector . should_equal [Time_Of_Day.new 18 00] data.varied_type_table.filter "times" (Filter_Condition.Is_In (ins.at "tod" . to_vector)) . at "times" . to_vector . 
should_equal [Time_Of_Day.new 18 00] data.varied_type_table.filter "mixed" (Filter_Condition.Is_In [42, "a", 1, Nothing, Date.new 2022 8 27, Date_Time.new 2022 8 27]) . at "mixed" . to_vector . should_equal [1, "a", Date.new 2022 8 27] data.varied_type_table.filter "mixed" (Filter_Condition.Is_In (ins.at "mix")) . at "mixed" . to_vector . should_equal [1]