mirror of
https://github.com/enso-org/enso.git
synced 2024-12-22 16:41:45 +03:00
parent
517299bb09
commit
1e0649fda1
@ -666,6 +666,7 @@
|
||||
- [Added `Text.cleanse` `Column.Text_Cleanse` and `Table.Text_Cleanse`][9879]
|
||||
- [Added ability to save an existing Postgres connection as a Data Link in Enso
|
||||
Cloud.][9957]
|
||||
- [Improved `Table.union`.][9968]
|
||||
|
||||
[debug-shortcuts]:
|
||||
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
|
||||
@ -978,6 +979,7 @@
|
||||
[9873]: https://github.com/enso-org/enso/pull/9873
|
||||
[9879]: https://github.com/enso-org/enso/pull/9879
|
||||
[9957]: https://github.com/enso-org/enso/pull/9957
|
||||
[9968]: https://github.com/enso-org/enso/pull/9968
|
||||
|
||||
#### Enso Compiler
|
||||
|
||||
|
@ -18,6 +18,7 @@ from Standard.Base.Runtime import assert
|
||||
from Standard.Base.Widget_Helpers import make_data_cleanse_vector_selector, make_delimiter_selector, make_format_chooser
|
||||
|
||||
import Standard.Table.Column_Operation.Column_Operation
|
||||
import Standard.Table.Columns_To_Keep.Columns_To_Keep
|
||||
import Standard.Table.Expression.Expression
|
||||
import Standard.Table.Expression.Expression_Error
|
||||
import Standard.Table.Internal.Add_Row_Number
|
||||
@ -30,6 +31,7 @@ import Standard.Table.Internal.Problem_Builder.Problem_Builder
|
||||
import Standard.Table.Internal.Replace_Helpers
|
||||
import Standard.Table.Internal.Table_Helpers
|
||||
import Standard.Table.Internal.Table_Helpers.Table_Column_Helper
|
||||
import Standard.Table.Internal.Table_Helpers.Union_Result_Type
|
||||
import Standard.Table.Internal.Table_Ref.Table_Ref
|
||||
import Standard.Table.Internal.Unique_Name_Strategy.Unique_Name_Strategy
|
||||
import Standard.Table.Internal.Value_Type_Helpers
|
||||
@ -1719,90 +1721,77 @@ type DB_Table
|
||||
- tables: A single table or a vector of tables to append to this one. The
|
||||
tables are concatenated in the order they are specified, with `self`
|
||||
being the first one.
|
||||
- columns_to_keep: Specifies which columns to keep. Defaults to keeping
|
||||
columns that are present in any of the tables, reporting a warning for
|
||||
columns that are not present in all tables and adding `Nothing` values
|
||||
for them.
|
||||
- match_columns: Specifies how to match the columns.
|
||||
- If `Match_Columns.By_Name` - the columns are matched by name across
|
||||
all provided tables.
|
||||
If unmatched columns are to be dropped, the resulting table will keep
|
||||
only the set of columns that appear in all provided tables, in the
|
||||
relative order that they appeared in the `self` table.
|
||||
If unmatched columns are kept, they are added in the order of
|
||||
appearance - i.e. first all columns from `self` will be added in the
|
||||
original order, then any columns from the second table that were not
|
||||
matched will be added at the end (preserving their relative order),
|
||||
and so on for all the remaining tables.
|
||||
- If `Match_Columns.By_Position` - the columns are mapped by position.
|
||||
If unmatched columns are to be dropped, the resulting table will have
|
||||
as many columns as the table that had the least columns and the
|
||||
column names of the first table (`self`) will be used.
|
||||
If unmatched columns are kept, the resulting table will have as many
|
||||
columns as the table with the most columns. Since the first table may
|
||||
not have all the necessary columns to provide column names for the
|
||||
result, the result will have column names taken from the first table
|
||||
that has the biggest number of columns.
|
||||
- keep_unmatched_columns: If set to `True`, unmatched columns are kept
|
||||
and are padded with `Nothing` for tables that did not have them.
|
||||
If set to `False`, only the common subset of columns is kept - any
|
||||
column that is not present in all tables is dropped. Defaults to
|
||||
`Report_Unmatched`, which behaves like `True` - unmatched columns are
|
||||
kept and padded with `Nothing`, but a problem is reported.
|
||||
- allow_type_widening: Specifies if the resulting column type should be
|
||||
adjusted to fit columns from all arguments. If `True`, a common type
|
||||
will be chosen for each column (see "Unifying Column Types" below).
|
||||
If `False`, the resulting column type will be the same as in the first
|
||||
table containing the column. In this case, all columns that are
|
||||
concatenated must have the same type as the first one (unless this
|
||||
had a `Mixed` type - in which case it will accept any other types).
|
||||
The names of each column come from the first table in which the given
|
||||
column appears.
|
||||
The `In_List` option is not applicable when mapping columns by position.
|
||||
Column names are taken from the first table if `In_All` and from the
|
||||
first table that has the maximum number of columns if `In_Any`.
|
||||
- on_problems: Specifies how to handle problems if they occur, reporting
|
||||
them as warnings by default.
|
||||
|
||||
- If `keep_unmatched_columns` is set to `Report_Unmatched` (the
|
||||
default):
|
||||
- If matching by name and there are columns that are not present in
|
||||
all tables, `Unmatched_Columns` is reported.
|
||||
- If matching by position and column counts of the merged tables
|
||||
differ, then a `Column_Count_Mismatch` is reported. The error will
|
||||
contain the greatest column count as its `expected` value and the
|
||||
smallest one as its `actual` value.
|
||||
- If `keep_unmatched_columns` is set to `False` and matching by name,
|
||||
it is possible that there are no columns that are common to all
|
||||
provided tables, in that case `No_Output_Columns` is thrown as a
|
||||
dataflow error regardless of the `on_problems` setting, because there
|
||||
are no columns to include in the resulting table.
|
||||
- If type widening is disabled and one of corresponding columns has a
|
||||
type that is incompatible with the type coming from the first table,
|
||||
a `Column_Type_Mismatch` is reported. The problematic column will be
|
||||
dropped from the resulting table. With type widening disabled, the
|
||||
subsequent tables must have the same types as the first one, unless
|
||||
the type of the first one was `Mixed` which will accept any other
|
||||
type.
|
||||
- If a common type coercion for a set of matched columns from
|
||||
concatenated tables cannot be found, a `No_Common_Type` is reported.
|
||||
In warning or ignore mode, the problematic column will be dropped
|
||||
from the resulting table.
|
||||
|
||||
? Unifying Column Types
|
||||
|
||||
If `allow_type_widening` is set to `True`, then the following rules are
|
||||
used to find a common type that will fit values from all merged tables.
|
||||
|
||||
Numeric columns are unified by finding the most general type that can
|
||||
fit all of the columns. The biggest integer type will be chosen and if
|
||||
Numeric columns are unified by finding the smallest type that can fit
|
||||
all of the columns. The biggest integer type will be chosen and if
|
||||
integers and decimals are mixed, the decimal type will be chosen.
|
||||
If boolean columns are mixed with numeric columns, they will be coerced
|
||||
to the numeric type (and converted to 0 and 1).
|
||||
|
||||
Text types will also be coerced according to the common rules - if
|
||||
constant-length texts of different lengths are mixed, they will be
|
||||
coerced to a varying-length type.
|
||||
Text types are also unified by finding the smallest type that can
|
||||
fit all the values. If constant-length texts of different lengths are
|
||||
mixed, they will be coerced to a varying-length type.
|
||||
|
||||
If date and date-time columns are unified, this yields a date-time
|
||||
column. In-memory, the date is promoted by adding a time of 00:00 and
|
||||
the system time-zone. In other backends that behaviour may differ.
|
||||
|
||||
If one of the matched columns has `Mixed` type, that type will be used
|
||||
regardless of types of other columns. Mixing any other types will
|
||||
result in a `No_Common_Type` problem. If columns of incompatible types
|
||||
are meant to be mixed, at least one of them should be explicitly
|
||||
retyped to the `Mixed` type to indicate that intention. Note that the
|
||||
`Mixed` type may not be supported by most Database backends.
|
||||
union : (DB_Table | Vector DB_Table) -> Match_Columns -> Boolean | Report_Unmatched -> Boolean -> Problem_Behavior -> DB_Table
|
||||
union self tables:(DB_Table | Vector) match_columns=Match_Columns.By_Name keep_unmatched_columns=Report_Unmatched allow_type_widening=True on_problems=Report_Warning =
|
||||
regardless of types of other columns. Note that the `Mixed` type may
|
||||
not be supported by most Database backends.
|
||||
|
||||
Finally, if no common type is found using the rules above, everything
|
||||
is converted to text.
|
||||
|
||||
? Problem Conditions
|
||||
|
||||
- If no common type is found and the text conversion fallback is used,
|
||||
the `No_Common_Type` problem is reported.
|
||||
- The `Float` type may not be able to exactly represent larger
|
||||
integers, thus if such large integers are mixed with floats, the
|
||||
resulting conversion to `Float` may cause a loss of precision.
|
||||
In that case, a `Loss_Of_Integer_Precision` problem is reported.
|
||||
This warning is only reported in the in-memory backend. Currently,
|
||||
the Database backend proceeds without a warning about precision loss.
|
||||
- If a column of dates is unified with a column of date-times, since
|
||||
the assumption of using the midnight time-of-day is arbitrary,
|
||||
a `Mixing_Date_Time_Types` problem is reported.
|
||||
- If an empty vector of tables is provided, an `Illegal_Argument` error
|
||||
is raised.
|
||||
- If `columns_to_keep` is set to `In_All` or `In_List` and an expected
|
||||
column is missing in some of the tables, an `Unmatched_Columns`
|
||||
problem is reported. If this causes the output to contain no columns,
|
||||
a `No_Output_Columns` error is raised.
|
||||
|
||||
? Ordering of Columns in the result
|
||||
|
||||
When matching columns by name, it is possible that the ordering of
|
||||
columns may vary between input tables. The ordering is determined as
|
||||
follows: columns that are kept from the first table are in the order
|
||||
they appear in that table. If there are columns that do not appear in
|
||||
the first table, they are appended to the end of the resulting table in
|
||||
the order they appear in the input.
|
||||
@tables (Widget.Vector_Editor item_editor=Widget.Code_Input item_default='_' display=Display.Always)
|
||||
@columns_to_keep Columns_To_Keep.default_widget
|
||||
union : (DB_Table | Vector DB_Table) -> Match_Columns -> Columns_To_Keep -> Problem_Behavior -> DB_Table
|
||||
union self tables:(DB_Table | Vector) (columns_to_keep : Columns_To_Keep = ..In_Any_Warn_On_Missing) (match_columns : Match_Columns = Match_Columns.By_Name) (on_problems : Problem_Behavior = Report_Warning) =
|
||||
all_tables = case tables of
|
||||
v : Vector -> [self] + (v.map t-> DB_Table.from t)
|
||||
single_table -> [self, single_table]
|
||||
@ -1811,39 +1800,53 @@ type DB_Table
|
||||
we only want to add a cause coming from unification; matching reports problems that would not fit this error.
|
||||
problem_builder_for_matching = Problem_Builder.new
|
||||
problem_builder_for_unification = Problem_Builder.new
|
||||
matched_column_sets = Match_Columns_Helpers.match_columns all_tables match_columns keep_unmatched_columns problem_builder_for_matching
|
||||
matched_column_sets = Match_Columns_Helpers.match_columns all_tables match_columns columns_to_keep problem_builder_for_matching
|
||||
dialect = self.connection.dialect
|
||||
type_mapping = dialect.get_type_mapping
|
||||
merged_columns = matched_column_sets.map column_set->
|
||||
case Table_Helpers.unify_result_type_for_union column_set all_tables allow_type_widening problem_builder_for_unification of
|
||||
Nothing -> Nothing
|
||||
result_type : Value_Type ->
|
||||
sql_type = type_mapping.value_type_to_sql result_type Problem_Behavior.Report_Error
|
||||
sql_type.catch Inexact_Type_Coercion error->
|
||||
Panic.throw <|
|
||||
Illegal_State.Error "Unexpected inexact type coercion in Union. The union logic should only operate in types supported by the given backend. This is a bug in the Database library. The coercion was: "+error.to_display_text cause=error
|
||||
[column_set, sql_type, result_type]
|
||||
good_columns = merged_columns.filter r-> r.is_nothing.not
|
||||
sql_type_from_value_type value_type =
|
||||
type_mapping.value_type_to_sql value_type Problem_Behavior.Report_Error . catch Inexact_Type_Coercion error->
|
||||
Panic.throw <|
|
||||
Illegal_State.Error "Unexpected inexact type coercion in Union. The union logic should only operate in types supported by the given backend. This is a bug in the Database library. The coercion was: "+error.to_display_text cause=error
|
||||
case Table_Helpers.unify_result_type_for_union column_set all_tables problem_builder_for_unification of
|
||||
Union_Result_Type.Common_Type common_type ->
|
||||
[column_set, sql_type_from_value_type common_type, common_type]
|
||||
Union_Result_Type.Fallback_To_Text ->
|
||||
[column_set, sql_type_from_value_type Value_Type.Char, Value_Type.Char]
|
||||
Union_Result_Type.No_Types_To_Unify ->
|
||||
## If the column is all nulls, we still need to give it some type.
|
||||
For DB `Mixed` is not available, so a portable type to use is `Char`.
|
||||
[column_set, SQL_Type.null, Value_Type.Char]
|
||||
|
||||
problem_builder_for_matching.attach_problems_before on_problems <| problem_builder_for_unification.attach_problems_before on_problems <|
|
||||
if good_columns.is_empty then problem_builder_for_unification.raise_no_output_columns_with_cause else
|
||||
if merged_columns.is_empty then problem_builder_for_unification.raise_no_output_columns_with_cause else
|
||||
queries = all_tables.map_with_index i-> t->
|
||||
columns_to_select = good_columns.map description->
|
||||
column_set = description.first
|
||||
sql_type = description.second
|
||||
columns_to_select = merged_columns.map description->
|
||||
column_set = description.at 0
|
||||
sql_type = description.at 1
|
||||
result_type = description.at 2
|
||||
column_name = column_set.name
|
||||
## We assume that the type for this expression will never be queried - it is
|
||||
just used internally to build the Union operation and never exposed externally.
|
||||
infer_return_type _ = SQL_Type_Reference.null
|
||||
case column_set.column_indices.at i of
|
||||
corresponding_column_index : Integer ->
|
||||
column = t.at corresponding_column_index
|
||||
internal_named_column = column.as_internal.rename column_name
|
||||
## We cast if the result type is different.
|
||||
This is a bit on the safe side. In some cases the cast is not needed
|
||||
(for example, most databases will allow union of int2 and int4 without casts; or SQLite does not need casts at all).
|
||||
However, we do this for simplicity as determining the rules when the cast is needed or not is adding a lot of complication.
|
||||
This is a possible future improvement to make queries lighter, but the benefit is unlikely to be worth it.
|
||||
needs_cast = column.value_type != result_type
|
||||
if needs_cast.not then internal_named_column else
|
||||
dialect.make_cast internal_named_column sql_type infer_return_type
|
||||
Nothing ->
|
||||
typ = SQL_Type_Reference.from_constant SQL_Type.null
|
||||
expr = SQL_Expression.Literal "NULL"
|
||||
null_column = Internal_Column.Value column_name typ expr
|
||||
## We assume that the type for this
|
||||
expression will never be queried - it is
|
||||
just used internally to build the Union
|
||||
operation and never exposed externally.
|
||||
infer_return_type _ = SQL_Type_Reference.null
|
||||
dialect.make_cast null_column sql_type infer_return_type
|
||||
corresponding_column_index : Integer ->
|
||||
t.at corresponding_column_index . as_internal . rename column_name
|
||||
if sql_type == SQL_Type.null then null_column else
|
||||
dialect.make_cast null_column sql_type infer_return_type
|
||||
pairs = columns_to_select.map c->
|
||||
[c.name, c.expression]
|
||||
Query.Select pairs t.context
|
||||
@ -1860,7 +1863,7 @@ type DB_Table
|
||||
See #6118.
|
||||
infer_return_type expression =
|
||||
SQL_Type_Reference.new self.connection new_ctx expression
|
||||
new_columns = good_columns.map description->
|
||||
new_columns = merged_columns.map description->
|
||||
column_set = description.first
|
||||
result_type = description.at 2
|
||||
name = column_set.name
|
||||
|
@ -35,11 +35,18 @@ type SQL_Error
|
||||
Convert the SQL error to a textual representation.
|
||||
to_text : Text
|
||||
to_text self =
|
||||
query = if self.related_query.is_nothing.not then " [Query was: " + self.related_query.to_display_text + "]" else ""
|
||||
query = if self.related_query.is_nothing then "" else
|
||||
query_text = self.related_query.to_text
|
||||
## Our generated queries tend to be very long, so to still be readable,
|
||||
we don't shorten them too much. We impose an upper limit to avoid unbounded error message size.
|
||||
max_length = 1000
|
||||
shortened_query_text = if query_text.length <= max_length then query_text else
|
||||
query_text.take (Index_Sub_Range.First (max_length.div 2)) + " (...) " + query_text.take (Index_Sub_Range.Last (max_length.div 2))
|
||||
" [Query was: " + shortened_query_text + "]"
|
||||
message = self.java_exception.getMessage
|
||||
max_length = 300
|
||||
short_message = if message.length < max_length then message else
|
||||
message.take (Index_Sub_Range.First max_length/2) + " (...) " + message.take (Index_Sub_Range.Last max_length/2)
|
||||
message.take (Index_Sub_Range.First (max_length.div 2)) + " (...) " + message.take (Index_Sub_Range.Last (max_length.div 2))
|
||||
"There was an SQL error: " + short_message + "." + query
|
||||
|
||||
## PRIVATE
|
||||
|
@ -140,7 +140,7 @@ type Postgres_Dialect
|
||||
|
||||
## PRIVATE
|
||||
make_cast : Internal_Column -> SQL_Type -> (SQL_Expression -> SQL_Type_Reference) -> Internal_Column
|
||||
make_cast self column target_type infer_result_type_from_database_callback =
|
||||
make_cast self (column : Internal_Column) (target_type : SQL_Type) (infer_result_type_from_database_callback : SQL_Expression -> SQL_Type_Reference) =
|
||||
mapping = self.get_type_mapping
|
||||
source_type = mapping.sql_type_to_value_type column.sql_type_reference.get
|
||||
target_value_type = mapping.sql_type_to_value_type target_type
|
||||
|
@ -138,7 +138,7 @@ type SQLite_Dialect
|
||||
|
||||
## PRIVATE
|
||||
make_cast : Internal_Column -> SQL_Type -> (SQL_Expression -> SQL_Type_Reference) -> Internal_Column
|
||||
make_cast self column target_type infer_result_type_from_database_callback =
|
||||
make_cast self (column : Internal_Column) (target_type : SQL_Type) (infer_result_type_from_database_callback : SQL_Expression -> SQL_Type_Reference) =
|
||||
_ = [infer_result_type_from_database_callback]
|
||||
mapping = self.get_type_mapping
|
||||
target_value_type = mapping.sql_type_to_value_type target_type
|
||||
|
@ -0,0 +1,35 @@
|
||||
from Standard.Base import Vector, Text
|
||||
from Standard.Base.Metadata import make_single_choice, Widget
|
||||
|
||||
## Specifies which columns to keep in a union operation.
|
||||
type Columns_To_Keep
|
||||
## All columns are kept.
|
||||
|
||||
If a column is present only in some of the tables, it is padded with
|
||||
`Nothing` for tables where it is missing.
|
||||
In_Any
|
||||
|
||||
## Only columns that are present in all tables are kept.
|
||||
|
||||
If there are columns that are only present in some of the tables,
|
||||
a problem is reported.
|
||||
In_All
|
||||
|
||||
## Specific list of column names to keep.
|
||||
|
||||
If a table does not have a column that is specified in the list, it is
|
||||
padded with `Nothing` and a problem is reported.
|
||||
In_List (column_names : Vector Text)
|
||||
|
||||
## PRIVATE
|
||||
Same as `In_Any`, but it will warn about columns that are not present in
|
||||
all tables.
|
||||
In_Any_Warn_On_Missing
|
||||
|
||||
## PRIVATE
|
||||
The default widget for `Columns_To_Keep`.
|
||||
It does not display the internal `In_Any_Warn_On_Missing` variant, since
|
||||
that variant is only meant to be used as the default value.
|
||||
default_widget -> Widget =
|
||||
make_single_choice <|
|
||||
["In_Any", "In_All", "In_List"].map c-> [c, ".."+c]
|
@ -494,7 +494,8 @@ type Column_Type_Mismatch
|
||||
|
||||
type No_Common_Type
|
||||
## PRIVATE
|
||||
An error indicating that no common type could be found.
|
||||
An error indicating that no common type could be found, and the operation
|
||||
could not be performed.
|
||||
|
||||
Arguments:
|
||||
- types: The types for which unification was attempted.
|
||||
@ -502,6 +503,11 @@ type No_Common_Type
|
||||
unified, if applicable.
|
||||
Error (types : Vector Value_Type) (related_column_name : Nothing|Text)
|
||||
|
||||
## PRIVATE
|
||||
A warning indicating that no common type could be found, so the operation
|
||||
had to fall back to converting all values to text.
|
||||
Warning_Convert_To_Text (types : Vector Value_Type) (related_column_name:Text)
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Create a human-readable version of the error.
|
||||
@ -509,11 +515,21 @@ type No_Common_Type
|
||||
to_display_text self =
|
||||
types = self.types.map .to_display_text . join ", "
|
||||
prefix = "No common type was found for types: "+types
|
||||
infix = case self.related_column_name of
|
||||
column_name : Text -> " when unifying column ["+column_name+"]."
|
||||
_ -> "."
|
||||
suffix = " If you want to allow mixed types, please cast one of the columns to `Mixed` beforehand."
|
||||
prefix + infix + suffix
|
||||
location = case self.related_column_name of
|
||||
column_name : Text -> " when unifying column ["+column_name+"]"
|
||||
_ -> ""
|
||||
suffix_type = case self of
|
||||
No_Common_Type.Error _ _ -> "."
|
||||
No_Common_Type.Warning_Convert_To_Text _ _ -> ", so the values were converted to text."
|
||||
suffix_mixed = " If you want to have mixed types instead, please cast one of the columns to `Mixed` beforehand."
|
||||
prefix + location + suffix_type + suffix_mixed
|
||||
|
||||
## PRIVATE
|
||||
to_text self -> Text =
|
||||
ctor = case self of
|
||||
No_Common_Type.Error _ _ -> "Error"
|
||||
No_Common_Type.Warning_Convert_To_Text _ _ -> "Warning_Convert_To_Text"
|
||||
"No_Common_Type."+ctor+" "+self.types.to_text+" "+self.related_column_name.to_text
|
||||
|
||||
type Unmatched_Columns
|
||||
## PRIVATE
|
||||
@ -637,9 +653,11 @@ type Conversion_Failure
|
||||
|
||||
type Loss_Of_Integer_Precision
|
||||
## PRIVATE
|
||||
Indicates that an automatic conversion of an integer column to a decimal
|
||||
Indicates that an automatic conversion of an Integer column to a Float
|
||||
column is losing precision because some of the large integers cannot be
|
||||
exactly represented by the `double` type.
|
||||
exactly represented by the floating-point type.
|
||||
|
||||
Currently, this error is only reported in-memory.
|
||||
Warning (affected_rows_count : Integer) (example_value : Integer) (example_value_converted : Float)
|
||||
|
||||
## PRIVATE
|
||||
@ -834,3 +852,23 @@ type Nothing_Value_In_Filter_Condition
|
||||
to_display_text : Text
|
||||
to_display_text self =
|
||||
"Using `Nothing` as an argument to a `"+self.filter_condition.to_text+"` cannot match anything."
|
||||
|
||||
## Indicates that different Date_Time (with or without timezone) or Date types
|
||||
are mixed in the result, causing implicit coercions.
|
||||
|
||||
This is a warning, because using the `00:00` time and default time-zone may
|
||||
not always be the expected choice, so the user should be aware of this.
|
||||
type Mixing_Date_Time_Types
|
||||
## PRIVATE
|
||||
Date_To_Date_Time (related_column_name : Text | Nothing)
|
||||
|
||||
## PRIVATE
|
||||
Implicit_Time_Zone (related_column_name : Text | Nothing)
|
||||
|
||||
to_display_text self -> Text =
|
||||
location = if self.related_column_name.is_nothing then "" else " (in column ["+self.related_column_name+"])"
|
||||
case self of
|
||||
Mixing_Date_Time_Types.Date_To_Date_Time _ ->
|
||||
"Mixing Date and Date_Time values"+location+": the Date values have been automatically converted to Date_Time by adding a time of 00:00 in the default time-zone."
|
||||
Mixing_Date_Time_Types.Implicit_Time_Zone _ ->
|
||||
"Mixing Date_Time values with and without timezone"+location+". A default timezone has been assumed where it was missing."
|
||||
|
@ -12,6 +12,7 @@ from Standard.Base.Data.Filter_Condition import sql_like_to_regex
|
||||
from Standard.Base.Metadata.Choice import Option
|
||||
from Standard.Base.Metadata.Widget import Multiple_Choice, Single_Choice
|
||||
|
||||
import project.Columns_To_Keep.Columns_To_Keep
|
||||
import project.Excel.Excel_Range.Excel_Range
|
||||
import project.Headers.Headers
|
||||
import project.Internal.Excel_Reader
|
||||
@ -331,7 +332,7 @@ type Excel_Workbook
|
||||
tables = sheet_names.map on_problems=on_problems address-> self.read address headers on_problems=on_problems
|
||||
case return of
|
||||
Return_As.Table_Of_Tables -> Table.new [["Sheet Name", sheet_names], ["Table", tables]]
|
||||
Return_As.Merged_Table match ->
|
||||
Return_As.Merged_Table columns_to_keep match ->
|
||||
first_tbl = tables.find t-> t != Nothing
|
||||
if first_tbl == Nothing then Error.throw (Illegal_Argument.Error "No valid sheets found.") else
|
||||
unique = first_tbl.column_naming_helper.create_unique_name_strategy
|
||||
@ -339,7 +340,7 @@ type Excel_Workbook
|
||||
new_column_name = unique.make_unique "Sheet Name"
|
||||
|
||||
with_names = tables.zip sheet_names tbl->name-> if tbl == Nothing then Nothing else tbl.set name new_column_name . reorder_columns [new_column_name]
|
||||
result = Table.from_union (with_names.filter Filter_Condition.Not_Nothing) match keep_unmatched_columns=True
|
||||
result = Table.from_union (with_names.filter Filter_Condition.Not_Nothing) columns_to_keep=columns_to_keep match_columns=match
|
||||
|
||||
problem_builder = Problem_Builder.new
|
||||
problem_builder.report_unique_name_strategy unique
|
||||
@ -359,4 +360,4 @@ type Return_As
|
||||
Table_Of_Tables
|
||||
|
||||
## All sheets are merged into a single table. A union operation is performed.
|
||||
Merged_Table match:Match_Columns=Match_Columns.By_Name
|
||||
Merged_Table (columns_to_keep : Columns_To_Keep = Columns_To_Keep.In_Any) (match : Match_Columns = Match_Columns.By_Name)
|
||||
|
@ -10,6 +10,7 @@ import project.Expression.Expression
|
||||
import project.Internal.Column_Naming_Helper.Column_Naming_Helper
|
||||
import project.Internal.Problem_Builder.Problem_Builder
|
||||
import project.Internal.Value_Type_Helpers
|
||||
import project.Match_Columns.Column_Set
|
||||
import project.Position.Position
|
||||
import project.Set_Mode.Set_Mode
|
||||
import project.Sort_Column.Sort_Column
|
||||
@ -530,30 +531,38 @@ is_column obj =
|
||||
## PRIVATE
|
||||
A helper method that resolves what should be the result type of a particular
|
||||
column set based on the union settings.
|
||||
unify_result_type_for_union column_set all_tables allow_type_widening problem_builder =
|
||||
unify_result_type_for_union (column_set : Column_Set) (all_tables : Vector) (problem_builder : Problem_Builder) -> Union_Result_Type =
|
||||
columns = column_set.resolve_columns all_tables
|
||||
case allow_type_widening of
|
||||
True ->
|
||||
types = columns.filter Filter_Condition.Not_Nothing . map .value_type
|
||||
common_type = Value_Type_Helpers.find_common_type types strict=True
|
||||
if common_type.is_nothing then
|
||||
problem_builder.report_other_warning (No_Common_Type.Error types related_column_name=column_set.name)
|
||||
common_type
|
||||
False ->
|
||||
is_not_nothing c = case c of
|
||||
Nothing -> False
|
||||
_ -> True
|
||||
first_column = columns.find is_not_nothing
|
||||
first_type = first_column.value_type
|
||||
if first_type == Value_Type.Mixed then Value_Type.Mixed else
|
||||
first_wrong_column = columns.find if_missing=Nothing col->
|
||||
is_not_nothing col && col.value_type != first_type
|
||||
case first_wrong_column of
|
||||
Nothing -> first_type
|
||||
_ ->
|
||||
got_type = first_wrong_column.value_type
|
||||
problem_builder.report_other_warning (Column_Type_Mismatch.Error column_set.name first_type got_type)
|
||||
Nothing
|
||||
. filter Filter_Condition.Not_Nothing
|
||||
types = columns.map .value_type
|
||||
|
||||
if types.is_empty then Union_Result_Type.No_Types_To_Unify else
|
||||
## First we check if we can find a generic common type.
|
||||
This includes widening numeric column sizes, or converting Integer to Float.
|
||||
common_type = Value_Type_Helpers.find_common_type types strict=True
|
||||
if common_type.is_nothing.not then Union_Result_Type.Common_Type common_type else
|
||||
## Union has less strict requirements than other operations relying on `find_common_type`,
|
||||
so if the common type was not found, we still check some fallbacks.
|
||||
common_numeric_boolean = Value_Type_Helpers.find_common_numeric_boolean_type types
|
||||
if common_numeric_boolean.is_nothing.not then Union_Result_Type.Common_Type common_numeric_boolean else
|
||||
common_date_type = Value_Type_Helpers.find_common_date_types types column_set.name problem_builder
|
||||
if common_date_type.is_nothing.not then Union_Result_Type.Common_Type common_date_type else
|
||||
# Lastly, we fall back to text, reporting a warning.
|
||||
problem_builder.report_other_warning (No_Common_Type.Warning_Convert_To_Text types column_set.name)
|
||||
Union_Result_Type.Fallback_To_Text
|
||||
|
||||
## PRIVATE
|
||||
type Union_Result_Type
|
||||
## PRIVATE
|
||||
Common_Type (value_type : Value_Type)
|
||||
|
||||
## PRIVATE
|
||||
Fallback_To_Text
|
||||
|
||||
## PRIVATE
|
||||
This case is returned if the requested column was missing from _all_ tables,
|
||||
so there were no types to unify. An all-null column should be created.
|
||||
No_Types_To_Unify
|
||||
|
||||
## PRIVATE
|
||||
Replace a set of columns in the table with a new set of columns. The old
|
||||
|
@ -1,13 +1,15 @@
|
||||
from Standard.Base import all
|
||||
import Standard.Base.Data.Vector.No_Wrap
|
||||
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
||||
from Standard.Base.Runtime import assert
|
||||
|
||||
import project.Column.Column
|
||||
import project.Internal.Problem_Builder.Problem_Builder
|
||||
import project.Internal.Storage
|
||||
import project.Value_Type.Auto
|
||||
import project.Value_Type.Bits
|
||||
import project.Value_Type.Value_Type
|
||||
from project.Errors import Invalid_Value_Type, No_Common_Type
|
||||
from project.Errors import Invalid_Value_Type, Mixing_Date_Time_Types, No_Common_Type
|
||||
from project.Internal.Table_Helpers import is_column
|
||||
|
||||
polyglot java import org.enso.base.polyglot.NumericConverter
|
||||
@ -60,11 +62,11 @@ reconcile_types current new = case current of
|
||||
Value_Type.Integer size -> case new of
|
||||
Value_Type.Integer new_size ->
|
||||
Value_Type.Integer (max_size size new_size)
|
||||
Value_Type.Byte -> Value_Type.Integer size
|
||||
Value_Type.Byte -> current
|
||||
# If we unify integers with floats, we select the default Float 64 regardless of the input sizes.
|
||||
Value_Type.Float _ -> Value_Type.Float
|
||||
Value_Type.Float _ -> Value_Type.Float
|
||||
Value_Type.Decimal _ _ -> new
|
||||
_ -> Value_Type.Mixed
|
||||
_ -> Value_Type.Mixed
|
||||
Value_Type.Float size -> case new of
|
||||
Value_Type.Float new_size ->
|
||||
Value_Type.Float (max_size size new_size)
|
||||
@ -74,12 +76,11 @@ reconcile_types current new = case current of
|
||||
Value_Type.Decimal _ _ -> Value_Type.Float
|
||||
_ -> Value_Type.Mixed
|
||||
Value_Type.Byte -> case new of
|
||||
Value_Type.Byte -> Value_Type.Byte
|
||||
Value_Type.Integer size ->
|
||||
Value_Type.Integer size
|
||||
Value_Type.Float _ -> Value_Type.Float
|
||||
Value_Type.Byte -> Value_Type.Byte
|
||||
Value_Type.Integer _ -> new
|
||||
Value_Type.Float _ -> Value_Type.Float
|
||||
Value_Type.Decimal _ _ -> new
|
||||
_ -> Value_Type.Mixed
|
||||
_ -> Value_Type.Mixed
|
||||
Value_Type.Decimal precision scale -> case new of
|
||||
Value_Type.Decimal new_precision new_scale ->
|
||||
if (precision == new_precision) && (scale == new_scale) then new else
|
||||
@ -89,9 +90,6 @@ reconcile_types current new = case current of
|
||||
Value_Type.Byte -> Value_Type.Decimal precision scale
|
||||
Value_Type.Float _ -> Value_Type.Float
|
||||
_ -> Value_Type.Mixed
|
||||
Value_Type.Boolean -> case new of
|
||||
Value_Type.Boolean -> Value_Type.Boolean
|
||||
_ -> Value_Type.Mixed
|
||||
Value_Type.Char current_size current_variable -> case new of
|
||||
Value_Type.Char new_size new_variable ->
|
||||
result_variable = current_variable || new_variable || current_size != new_size
|
||||
@ -118,19 +116,70 @@ max_size a b =
|
||||
## PRIVATE
|
||||
Finds the most specific value type that will fit all the provided types.
|
||||
|
||||
If `strict` is `True`, it is implemented as specified in the note
|
||||
"Unifying Column Types" in `Table.union`. In that case, if no common type
|
||||
is found, `Nothing` is returned.
|
||||
|
||||
It assumes that the `types` vector is not empty.
|
||||
find_common_type : Vector Value_Type -> Boolean -> Value_Type | Nothing
|
||||
find_common_type types strict =
|
||||
Arguments:
|
||||
- types: a vector of types to unify. It must not be empty.
|
||||
- strict: A flag determining how strict the unification is.
|
||||
If `False`, if no common type can be found, `Mixed` is used as a generic fallback.
|
||||
If `True`, `Nothing` is returned if no common type can be found and `Mixed`
|
||||
is only returned if any of the input types was already `Mixed`.
|
||||
find_common_type (types : Vector Value_Type) (strict : Boolean) -> Value_Type | Nothing =
|
||||
assert types.not_empty
|
||||
most_generic_type = (types.drop 1).fold types.first reconcile_types
|
||||
if strict.not || most_generic_type != Value_Type.Mixed then most_generic_type else
|
||||
# Double check if Mixed was really allowed to come out.
|
||||
## We return the Mixed type only if the input contained Mixed.
|
||||
Otherwise we report failure to find common type.
|
||||
if types.contains Value_Type.Mixed then Value_Type.Mixed else
|
||||
Nothing
|
||||
|
||||
## PRIVATE
|
||||
An extra helper function that reconciles Date_Time types with varying timezone
|
||||
setting, as well as Date type, reporting any warnings.
|
||||
It can be used as a fallback after `find_common_type` does not find a simple common type.
|
||||
If non-date types are provided, it will fail by returning `Nothing`. It will not report any warnings in that case.
|
||||
It assumes that the list of `types` is not empty.
|
||||
find_common_date_types (types : Vector Value_Type) (related_column_name : Text | Nothing) (problem_builder : Problem_Builder) -> Value_Type | Nothing =
|
||||
assert types.not_empty
|
||||
all_date = types.all typ-> case typ of
|
||||
Value_Type.Date -> True
|
||||
Value_Type.Date_Time _ -> True
|
||||
_ -> False
|
||||
if all_date.not then Nothing else
|
||||
has_date = types.contains Value_Type.Date
|
||||
has_date_time_with_tz = types.contains (Value_Type.Date_Time True)
|
||||
has_date_time_without_tz = types.contains (Value_Type.Date_Time False)
|
||||
|
||||
# The common type is the 'largest' one.
|
||||
common_type = if has_date_time_with_tz then Value_Type.Date_Time True else
|
||||
if has_date_time_without_tz then Value_Type.Date_Time False else
|
||||
Value_Type.Date
|
||||
|
||||
if has_date && (common_type != Value_Type.Date) then
|
||||
problem_builder.report_other_warning (Mixing_Date_Time_Types.Date_To_Date_Time related_column_name)
|
||||
|
||||
if has_date_time_without_tz && (common_type != Value_Type.Date_Time False) then
|
||||
problem_builder.report_other_warning (Mixing_Date_Time_Types.Implicit_Time_Zone related_column_name)
|
||||
|
||||
common_type
|
||||
|
||||
## PRIVATE
|
||||
An extra helper function that reconciles numeric and boolean types.
|
||||
Unifying Boolean and numeric types is not expected by all operations, but
|
||||
some may want to opt in to it. This method allows them to do so.
|
||||
|
||||
If non-numeric or non-boolean types are provided, it will fail by returning
|
||||
`Nothing`.
|
||||
|
||||
No warnings are reported, as coercing boolean to integer is harmless, it was
|
||||
just chosen not to be done by default.
|
||||
find_common_numeric_boolean_type (types : Vector Value_Type) -> Value_Type | Nothing =
|
||||
assert types.not_empty
|
||||
all_numeric_or_boolean = types.all typ-> typ.is_numeric || (typ == Value_Type.Boolean)
|
||||
if all_numeric_or_boolean.not then Nothing else
|
||||
## We just find a common type again, ignoring the boolean types: the
|
||||
boolean will fit any numeric type that we get out of this.
|
||||
without_boolean = types.filter typ-> typ != Value_Type.Boolean
|
||||
find_common_type without_boolean strict=True
|
||||
|
||||
## PRIVATE
|
||||
Finds the type of an argument to a column operation.
|
||||
|
||||
|
@ -1,6 +1,9 @@
|
||||
from Standard.Base import all
|
||||
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
||||
import Standard.Base.Runtime.State
|
||||
from Standard.Base.Runtime import assert
|
||||
|
||||
import project.Columns_To_Keep.Columns_To_Keep
|
||||
import project.Constants.Report_Unmatched
|
||||
from project.Errors import Column_Count_Mismatch, No_Output_Columns, Unmatched_Columns
|
||||
|
||||
@ -31,41 +34,93 @@ type Match_Columns
|
||||
columns should appear in the resulting table.
|
||||
|
||||
The method assumes at least one table is provided in its input.
|
||||
match_columns tables matching_mode keep_unmatched_columns problem_builder = case matching_mode of
|
||||
Match_Columns.By_Name -> case keep_unmatched_columns of
|
||||
False ->
|
||||
match_columns tables matching_mode columns_to_keep problem_builder =
|
||||
assert tables.not_empty
|
||||
case matching_mode of
|
||||
Match_Columns.By_Name -> match_columns_by_name tables columns_to_keep problem_builder
|
||||
Match_Columns.By_Position -> match_columns_by_position tables columns_to_keep problem_builder
|
||||
|
||||
## PRIVATE
|
||||
match_columns_by_name tables columns_to_keep problem_builder = case columns_to_keep of
|
||||
Columns_To_Keep.In_List list -> if list.is_empty then Error.throw (Illegal_Argument.Error "The list of columns to keep cannot be empty.") else
|
||||
output_column_names = list.distinct
|
||||
column_counts = find_column_counts tables
|
||||
all_tables_count = tables.length
|
||||
unmatched_column_names = output_column_names.filter name->
|
||||
column_counts.get name 0 < all_tables_count
|
||||
if unmatched_column_names.not_empty then
|
||||
problem_builder.report_other_warning (Unmatched_Columns.Error unmatched_column_names)
|
||||
build_column_set_by_name tables output_column_names
|
||||
Columns_To_Keep.In_All ->
|
||||
column_counts = find_column_counts tables
|
||||
# This will only include columns that were present in all tables.
|
||||
all_tables_count = tables.length
|
||||
common_column_names = tables.first.column_names.filter name->
|
||||
column_counts.at name == all_tables_count
|
||||
if common_column_names.is_empty then Error.throw (No_Output_Columns.Error "Unmatched columns are set to be dropped, but no common column names were found.") else
|
||||
dropped_column_names = tables.map .column_names
|
||||
. flatten
|
||||
. filter (name-> column_counts.at name < all_tables_count)
|
||||
. distinct
|
||||
if dropped_column_names.not_empty then
|
||||
problem_builder.report_other_warning (Unmatched_Columns.Error dropped_column_names)
|
||||
build_column_set_by_name tables common_column_names
|
||||
_ ->
|
||||
output_column_names = distinct_columns_in_appearance_order tables
|
||||
report_missing = case columns_to_keep of
|
||||
Columns_To_Keep.In_Any -> False
|
||||
Columns_To_Keep.In_Any_Warn_On_Missing -> True
|
||||
if report_missing then
|
||||
column_counts = find_column_counts tables
|
||||
# This will only include columns that were present in all tables.
|
||||
common_column_names = tables.first.column_names.filter name->
|
||||
column_counts.at name == tables.length
|
||||
if common_column_names.is_empty then Error.throw (No_Output_Columns.Error "Unmatched columns are set to be dropped, but no common column names were found.") else
|
||||
common_column_names.map name->
|
||||
column_indices = tables.map table->
|
||||
table.column_names.index_of name
|
||||
Column_Set.Value name column_indices
|
||||
_ ->
|
||||
output_column_names = distinct_columns_in_appearance_order tables
|
||||
if keep_unmatched_columns == Report_Unmatched then
|
||||
column_counts = find_column_counts tables
|
||||
all_tables_count = tables.length
|
||||
## We iterate over output column names to get deterministic
|
||||
order of unmatched columns.
|
||||
unmatched_column_names = output_column_names.filter name->
|
||||
column_counts.get name 0 < all_tables_count
|
||||
if unmatched_column_names.not_empty then
|
||||
problem_builder.report_other_warning (Unmatched_Columns.Error unmatched_column_names)
|
||||
output_column_names.map name->
|
||||
column_indices = tables.map table->
|
||||
table.columns.index_of col-> col.name==name
|
||||
Column_Set.Value name column_indices
|
||||
Match_Columns.By_Position ->
|
||||
all_tables_count = tables.length
|
||||
## We iterate over output column names to get deterministic
|
||||
order of unmatched columns.
|
||||
unmatched_column_names = output_column_names.filter name->
|
||||
column_counts.get name 0 < all_tables_count
|
||||
if unmatched_column_names.not_empty then
|
||||
problem_builder.report_other_warning (Unmatched_Columns.Error unmatched_column_names)
|
||||
build_column_set_by_name tables output_column_names
|
||||
|
||||
## PRIVATE
|
||||
Common logic for computing the final result of by-name matching.
|
||||
Once the set of output column names is determined, we compute the
|
||||
`Column_Set` by finding the corresponding column indices in each table (if found).
|
||||
build_column_set_by_name tables output_column_names =
|
||||
output_column_names.map name->
|
||||
column_indices = tables.map table->
|
||||
# TODO this gets O(N^2), we should optimize
|
||||
table.column_names.index_of name
|
||||
Column_Set.Value name column_indices
|
||||
|
||||
## PRIVATE
|
||||
match_columns_by_position tables columns_to_keep problem_builder = case columns_to_keep of
|
||||
Columns_To_Keep.In_List _ ->
|
||||
Error.throw (Illegal_Argument.Error "The In_List option for `columns_to_keep` cannot be used together with `By_Position` matching.")
|
||||
_ ->
|
||||
column_counts = tables.map table-> table.columns.length
|
||||
minmax = column_counts.compute_bulk [Statistic.Minimum, Statistic.Maximum]
|
||||
columns_to_take = if keep_unmatched_columns == False then minmax.first else minmax.second
|
||||
if (minmax.first != minmax.second) && (keep_unmatched_columns == Report_Unmatched) then
|
||||
problem_builder.report_other_warning (Column_Count_Mismatch.Error minmax.second minmax.first)
|
||||
name_source = if keep_unmatched_columns == False then tables.first else
|
||||
tables.find table-> table.columns.length == columns_to_take
|
||||
min = minmax.first
|
||||
max = minmax.second
|
||||
columns_to_take = case columns_to_keep of
|
||||
Columns_To_Keep.In_All -> min
|
||||
Columns_To_Keep.In_Any -> max
|
||||
Columns_To_Keep.In_Any_Warn_On_Missing -> max
|
||||
has_unmatched_columns = min != max
|
||||
if has_unmatched_columns then
|
||||
should_report_unmatched = case columns_to_keep of
|
||||
Columns_To_Keep.In_All -> True
|
||||
Columns_To_Keep.In_Any -> False
|
||||
Columns_To_Keep.In_Any_Warn_On_Missing -> True
|
||||
# TODO should we rephrase the wording of the error? should it depend on In_Any_Warn_On_Missing vs In_All?
|
||||
if should_report_unmatched then
|
||||
problem_builder.report_other_warning (Column_Count_Mismatch.Error max min)
|
||||
|
||||
name_source = case columns_to_keep of
|
||||
Columns_To_Keep.In_All -> tables.first
|
||||
_ ->
|
||||
# We find the first table that has all the columns present.
|
||||
tables.find table-> table.columns.length == columns_to_take
|
||||
|
||||
column_sets = Vector.new columns_to_take i->
|
||||
name = name_source.at i . name
|
||||
column_ids = tables.map table->
|
||||
@ -79,7 +134,7 @@ type Column_Set
|
||||
Value (name : Text) (column_indices : Vector Integer)
|
||||
|
||||
## PRIVATE
|
||||
resolve_columns self all_tables = self.column_indices.zip all_tables i-> parent_table->
|
||||
resolve_columns self (all_tables : Vector) = self.column_indices.zip all_tables i-> parent_table->
|
||||
case i of
|
||||
Nothing -> Nothing
|
||||
_ : Integer -> parent_table.at i
|
||||
|
@ -23,6 +23,7 @@ import project.Aggregate_Column.Aggregate_Column
|
||||
import project.Blank_Selector.Blank_Selector
|
||||
import project.Column.Column
|
||||
import project.Column_Ref.Column_Ref
|
||||
import project.Columns_To_Keep.Columns_To_Keep
|
||||
import project.Constants.Previous_Value
|
||||
import project.Constants.Report_Unmatched
|
||||
import project.Data_Formatter.Data_Formatter
|
||||
@ -49,7 +50,9 @@ import project.Internal.Replace_Helpers
|
||||
import project.Internal.Split_Tokenize
|
||||
import project.Internal.Table_Helpers
|
||||
import project.Internal.Table_Helpers.Table_Column_Helper
|
||||
import project.Internal.Table_Helpers.Union_Result_Type
|
||||
import project.Internal.Table_Ref.Table_Ref
|
||||
import project.Internal.Value_Type_Helpers
|
||||
import project.Internal.Widget_Helpers
|
||||
import project.Join_Condition.Join_Condition
|
||||
import project.Join_Kind.Join_Kind
|
||||
@ -2245,91 +2248,78 @@ type Table
|
||||
- tables: A single table or a vector of tables to append to this one. The
|
||||
tables are concatenated in the order they are specified, with `self`
|
||||
being the first one.
|
||||
- columns_to_keep: Specifies which columns to keep. Defaults to keeping
|
||||
columns that are present in any of the tables, reporting a warning for
|
||||
columns that are not present in all tables and adding `Nothing` values
|
||||
for them.
|
||||
- match_columns: Specifies how to match the columns.
|
||||
- If `Match_Columns.By_Name` - the columns are matched by name across
|
||||
all provided tables.
|
||||
If unmatched columns are to be dropped, the resulting table will keep
|
||||
only the set of columns that appear in all provided tables, in the
|
||||
relative order that they appeared in the `self` table.
|
||||
If unmatched columns are kept, they are added in the order of
|
||||
appearance - i.e. first all columns from `self` will be added in the
|
||||
original order, then any columns from the second table that were not
|
||||
matched will be added at the end (preserving their relative order),
|
||||
and so on for all the remaining tables.
|
||||
- If `Match_Columns.By_Position` - the columns are mapped by position.
|
||||
If unmatched columns are to be dropped, the resulting table will have
|
||||
as many columns as the table that had the least columns and the
|
||||
column names of the first table (self) will be used.
|
||||
If unmatched columns are kept, the resulting table will have as many
|
||||
columns as the table with the most columns. Since the first table may
|
||||
not have all the necessary columns to provide column names for the
|
||||
result, the result will have column names taken from the first table
|
||||
that has the biggest number of columns.
|
||||
- keep_unmatched_columns: If set to `True`, unmatched columns are kept
|
||||
and are padded with `Nothing` for tables that did not have them.
|
||||
If set to `False`, only the common subset of columns is kept - any
|
||||
column that is not present in all tables is dropped. Defaults to
|
||||
`Report_Unmatched`, which behaves like `True` - unmatched columns are
|
||||
kept and padded with `Nothing`, but a problem is reported.
|
||||
- allow_type_widening: Specifies if the resulting column type should be
|
||||
adjusted to fit columns from all arguments. If `True`, a common type
|
||||
will be chosen for each column (see "Unifying Column Types" below).
|
||||
If `False`, the resulting column type will be the same as in the first
|
||||
table containing the column. In this case, all columns that are
|
||||
concatenated must have the same type as the first one (unless this
|
||||
had a `Mixed` type - in which case it will accept any other types).
|
||||
The names of each column come from the first table in which the given
|
||||
column appears.
|
||||
The `In_List` option is not applicable when mapping columns by position.
|
||||
Column names are taken from the first table if `In_All` and from the
|
||||
first table that has the maximum number of columns if `In_Any`.
|
||||
- on_problems: Specifies how to handle problems if they occur, reporting
|
||||
them as warnings by default.
|
||||
|
||||
- If `keep_unmatched_columns` is set to `Report_Unmatched` (the
|
||||
default):
|
||||
- If matching by name and there are columns that are not present in
|
||||
all tables, `Unmatched_Columns` is reported.
|
||||
- If matching by position and column counts of the merged tables
|
||||
differ, then a `Column_Count_Mismatch` is reported. The error will
|
||||
contain the greatest column count as its `expected` value and the
|
||||
smallest one as its `actual` value.
|
||||
- If `keep_unmatched_columns` is set to `False` and matching by name,
|
||||
it is possible that there are no columns that are common to all
|
||||
provided tables, in that case `No_Output_Columns` is thrown as a
|
||||
dataflow error regardless of the `on_problems` setting, because there
|
||||
are no columns to include in the resulting table.
|
||||
- If type widening is disabled and one of corresponding columns has a
|
||||
type that is incompatible with the type coming from the first table,
|
||||
a `Column_Type_Mismatch` is reported. The problematic column will be
|
||||
dropped from the resulting table. With type widening disabled, the
|
||||
subsequent tables must have the same types as the first one, unless
|
||||
the type of the first one was `Mixed` which will accept any other
|
||||
type.
|
||||
- If a common type coercion for a set of matched columns from
|
||||
concatenated tables cannot be found, a `No_Common_Type` is reported.
|
||||
In warning or ignore mode, the problematic column will be dropped
|
||||
from the resulting table.
|
||||
|
||||
? Unifying Column Types
|
||||
|
||||
If `allow_type_widening` is set to `True`, then the following rules are
|
||||
used to find a common type that will fit values from all merged tables.
|
||||
|
||||
Numeric columns are unified by finding the most general type that can
|
||||
fit all of the columns. The biggest integer type will be chosen and if
|
||||
Numeric columns are unified by finding the smallest type that can fit
|
||||
all of the columns. The biggest integer type will be chosen and if
|
||||
integers and decimals are mixed, the decimal type will be chosen.
|
||||
If boolean columns are mixed with numeric columns, they will be coerced
|
||||
to the numeric type (and converted to 0 and 1).
|
||||
|
||||
Text types will also be coerced according to the common rules - if
|
||||
constant-length texts of different lengths are mixed, they will be
|
||||
coerced to a varying-length type.
|
||||
Text types are also unified by finding the smallest type that can
|
||||
fit all the values. If constant-length texts of different lengths are
|
||||
mixed, they will be coerced to a varying-length type.
|
||||
|
||||
If date and date-time columns are unified, this yields a date-time
|
||||
column. In-memory, the date is promoted by adding a time of 00:00 and
|
||||
the system time-zone. In other backends that behaviour may differ.
|
||||
|
||||
If one of the matched columns has `Mixed` type, that type will be used
|
||||
regardless of types of other columns. Mixing any other types will
|
||||
result in a `No_Common_Type` problem. If columns of incompatible types
|
||||
are meant to be mixed, at least one of them should be explicitly
|
||||
retyped to the `Mixed` type to indicate that intention. Note that the
|
||||
`Mixed` type may not be supported by most Database backends.
|
||||
union : (Table | Vector Table) -> Match_Columns -> Boolean | Report_Unmatched -> Boolean -> Problem_Behavior -> Table
|
||||
union self tables:(Table | Vector) match_columns=Match_Columns.By_Name keep_unmatched_columns=Report_Unmatched allow_type_widening=True on_problems=Report_Warning =
|
||||
Table.from_union ([self] + Vector.unify_vector_or_element tables) match_columns keep_unmatched_columns allow_type_widening on_problems
|
||||
regardless of types of other columns. Note that the `Mixed` type may
|
||||
not be supported by most Database backends.
|
||||
|
||||
Finally, if no common type is found using the rules above, everything
|
||||
is converted to text.
|
||||
|
||||
? Problem Conditions
|
||||
|
||||
- If no common type is found and the text conversion fallback is used,
|
||||
the `No_Common_Type` problem is reported.
|
||||
- The `Float` type may not be able to exactly represent larger
|
||||
integers, thus if such large integers are mixed with floats, the
|
||||
resulting conversion to `Float` may cause a loss of precision.
|
||||
In that case, a `Loss_Of_Integer_Precision` problem is reported.
|
||||
This warning is only reported in the in-memory backend. Currently,
|
||||
the Database backend proceeds without a warning about precision loss.
|
||||
- If a column of dates is unified with a column of date-times, since
|
||||
the assumption of using the midnight time-of-day is arbitrary,
|
||||
an `Implicit_Date_As_Date_Time_Conversion` problem is reported.
|
||||
- If an empty vector of tables is provided, an `Illegal_Argument` error
|
||||
is raised.
|
||||
- If `columns_to_keep` is set to `In_All` or `In_List` and an expected
|
||||
column is missing in some of the tables, an `Unmatched_Columns`
|
||||
problem is reported. If this causes the output to contain no columns,
|
||||
a `No_Output_Columns` error is raised.
|
||||
|
||||
? Ordering of Columns in the result
|
||||
|
||||
When matching columns by name, it is possible that the ordering of
|
||||
columns may vary between input tables. The ordering is determined as
|
||||
follows: columns that are kept from the first table are in the order
|
||||
they appear in that table. If there are columns that do not appear in
|
||||
the first table, they are appended to the end of the resulting table in
|
||||
the order they appear in the input.
|
||||
@tables (Widget.Vector_Editor item_editor=Widget.Code_Input item_default='_' display=Display.Always)
|
||||
@columns_to_keep Columns_To_Keep.default_widget
|
||||
union : (Table | Vector Table) -> Columns_To_Keep -> Match_Columns -> Problem_Behavior -> Table
|
||||
union self tables:(Table | Vector) (columns_to_keep : Columns_To_Keep = ..In_Any_Warn_On_Missing) (match_columns : Match_Columns = Match_Columns.By_Name) (on_problems : Problem_Behavior = Report_Warning) =
|
||||
Table.from_union ([self] + Vector.unify_vector_or_element tables) columns_to_keep match_columns on_problems
|
||||
|
||||
## ALIAS drop_missing_rows, dropna
|
||||
GROUP Standard.Base.Selections
|
||||
@ -2953,108 +2943,97 @@ type Table
|
||||
Arguments:
|
||||
- tables: A vector of tables to union together. The
|
||||
tables are concatenated in the order they are specified.
|
||||
- columns_to_keep: Specifies which columns to keep. Defaults to keeping
|
||||
columns that are present in any of the tables, reporting a warning for
|
||||
columns that are not present in all tables and adding `Nothing` values
|
||||
for them.
|
||||
- match_columns: Specifies how to match the columns.
|
||||
- If `Match_Columns.By_Name` - the columns are matched by name across
|
||||
all provided tables.
|
||||
If unmatched columns are to be dropped, the resulting table will keep
|
||||
only the set of columns that appear in all provided tables, in the
|
||||
relative order that they appeared in the `self` table.
|
||||
If unmatched columns are kept, they are added in the order of
|
||||
appearance - i.e. first all columns from `self` will be added in the
|
||||
original order, then any columns from the second table that were not
|
||||
matched will be added at the end (preserving their relative order),
|
||||
and so on for all the remaining tables.
|
||||
- If `Match_Columns.By_Position` - the columns are mapped by position.
|
||||
If unmatched columns are to be dropped, the resulting table will have
|
||||
as many columns as the table that had the least columns and the
|
||||
column names of the first table (self) will be used.
|
||||
If unmatched columns are kept, the resulting table will have as many
|
||||
columns as the table with the most columns. Since the first table may
|
||||
not have all the necessary columns to provide column names for the
|
||||
result, the result will have column names taken from the first table
|
||||
that has the biggest number of columns.
|
||||
- keep_unmatched_columns: If set to `True`, unmatched columns are kept
|
||||
and are padded with `Nothing` for tables that did not have them.
|
||||
If set to `False`, only the common subset of columns is kept - any
|
||||
column that is not present in all tables is dropped. Defaults to
|
||||
`Report_Unmatched`, which behaves like `True` - unmatched columns are
|
||||
kept and padded with `Nothing`, but a problem is reported.
|
||||
- allow_type_widening: Specifies if the resulting column type should be
|
||||
adjusted to fit columns from all arguments. If `True`, a common type
|
||||
will be chosen for each column (see "Unifying Column Types" below).
|
||||
If `False`, the resulting column type will be the same as in the first
|
||||
table containing the column. In this case, all columns that are
|
||||
concatenated must have the same type as the first one (unless this
|
||||
had a `Mixed` type - in which case it will accept any other types).
|
||||
The names of each column come from the first table in which the given
|
||||
column appears.
|
||||
The `In_List` option is not applicable when mapping columns by position.
|
||||
Column names are taken from the first table if `In_All` and from the
|
||||
first table that has the maximum number of columns if `In_Any`.
|
||||
- on_problems: Specifies how to handle problems if they occur, reporting
|
||||
them as warnings by default.
|
||||
|
||||
- If `keep_unmatched_columns` is set to `Report_Unmatched` (the
|
||||
default):
|
||||
- If matching by name and there are columns that are not present in
|
||||
all tables, `Unmatched_Columns` is reported.
|
||||
- If matching by position and column counts of the merged tables
|
||||
differ, then a `Column_Count_Mismatch` is reported. The error will
|
||||
contain the greatest column count as its `expected` value and the
|
||||
smallest one as its `actual` value.
|
||||
- If `keep_unmatched_columns` is set to `False` and matching by name,
|
||||
it is possible that there are no columns that are common to all
|
||||
provided tables, in that case `No_Output_Columns` is thrown as a
|
||||
dataflow error regardless of the `on_problems` setting, because there
|
||||
are no columns to include in the resulting table.
|
||||
- If type widening is disabled and one of corresponding columns has a
|
||||
type that is incompatible with the type coming from the first table,
|
||||
a `Column_Type_Mismatch` is reported. The problematic column will be
|
||||
dropped from the resulting table. With type widening disabled, the
|
||||
subsequent tables must have the same types as the first one, unless
|
||||
the type of the first one was `Mixed` which will accept any other
|
||||
type.
|
||||
- If a common type coercion for a set of matched columns from
|
||||
concatenated tables cannot be found, a `No_Common_Type` is reported.
|
||||
In warning or ignore mode, the problematic column will be dropped
|
||||
from the resulting table.
|
||||
|
||||
? Unifying Column Types
|
||||
|
||||
If `allow_type_widening` is set to `True`, then the following rules are
|
||||
used to find a common type that will fit values from all merged tables.
|
||||
|
||||
Numeric columns are unified by finding the most general type that can
|
||||
fit all of the columns. The biggest integer type will be chosen and if
|
||||
Numeric columns are unified by finding the smallest type that can fit
|
||||
all of the columns. The biggest integer type will be chosen and if
|
||||
integers and decimals are mixed, the decimal type will be chosen.
|
||||
If boolean columns are mixed with numeric columns, they will be coerced
|
||||
to the numeric type (and converted to 0 and 1).
|
||||
|
||||
Text types will also be coerced according to the common rules - if
|
||||
constant-length texts of different lengths are mixed, they will be
|
||||
coerced to a varying-length type.
|
||||
Text types are also unified by finding the smallest type that can
|
||||
fit all the values. If constant-length texts of different lengths are
|
||||
mixed, they will be coerced to a varying-length type.
|
||||
|
||||
If date and date-time columns are unified, this yields a date-time
|
||||
column. In-memory, the date is promoted by adding a time of 00:00 and
|
||||
the system time-zone. In other backends that behaviour may differ.
|
||||
|
||||
If one of the matched columns has `Mixed` type, that type will be used
|
||||
regardless of types of other columns. Mixing any other types will
|
||||
result in a `No_Common_Type` problem. If columns of incompatible types
|
||||
are meant to be mixed, at least one of them should be explicitly
|
||||
retyped to the `Mixed` type to indicate that intention. Note that the
|
||||
`Mixed` type may not be supported by most Database backends.
|
||||
from_union : (Vector Table) -> Match_Columns -> Boolean | Report_Unmatched -> Boolean -> Problem_Behavior -> Table
|
||||
from_union tables:(Vector) match_columns=Match_Columns.By_Name keep_unmatched_columns=Report_Unmatched allow_type_widening=True on_problems=Report_Warning =
|
||||
regardless of types of other columns. Note that the `Mixed` type may
|
||||
not be supported by most Database backends.
|
||||
|
||||
Finally, if no common type is found using the rules above, everything
|
||||
is converted to text.
|
||||
|
||||
? Problem Conditions
|
||||
|
||||
- If no common type is found and the text conversion fallback is used,
|
||||
the `No_Common_Type` problem is reported.
|
||||
- The `Float` type may not be able to exactly represent larger
|
||||
integers, thus if such large integers are mixed with floats, the
|
||||
resulting conversion to `Float` may cause a loss of precision.
|
||||
In that case, a `Loss_Of_Integer_Precision` problem is reported.
|
||||
This warning is only reported in the in-memory backend. Currently,
|
||||
the Database backend proceeds without a warning about precision loss.
|
||||
- If a column of dates is unified with a column of date-times, since
|
||||
the assumption of using the midnight time-of-day is arbitrary,
|
||||
an `Implicit_Date_As_Date_Time_Conversion` problem is reported.
|
||||
- If an empty vector of tables is provided, an `Illegal_Argument` error
|
||||
is raised.
|
||||
- If `columns_to_keep` is set to `In_All` or `In_List` and an expected
|
||||
column is missing in some of the tables, an `Unmatched_Columns`
|
||||
problem is reported. If this causes the output to contain no columns,
|
||||
a `No_Output_Columns` error is raised.
|
||||
|
||||
? Ordering of Columns in the result
|
||||
|
||||
When matching columns by name, it is possible that the ordering of
|
||||
columns may vary between input tables. The ordering is determined as
|
||||
follows: columns that are kept from the first table are in the order
|
||||
they appear in that table. If there are columns that do not appear in
|
||||
the first table, they are appended to the end of the resulting table in
|
||||
the order they appear in the input.
|
||||
@tables (Widget.Vector_Editor item_editor=Widget.Code_Input item_default='_' display=Display.Always)
|
||||
@columns_to_keep Columns_To_Keep.default_widget
|
||||
from_union : (Vector Table) -> Columns_To_Keep -> Match_Columns -> Problem_Behavior -> Table ! No_Output_Columns | Illegal_Argument
|
||||
from_union (tables : Vector) (columns_to_keep : Columns_To_Keep = ..In_Any_Warn_On_Missing) (match_columns : Match_Columns = Match_Columns.By_Name) (on_problems : Problem_Behavior = Report_Warning) =
|
||||
all_tables = (tables.map t-> Table.from t)
|
||||
all_tables.if_not_error <|
|
||||
if all_tables.is_empty then Error.throw (Illegal_Argument.Error "`Table.from_union` needs at least 1 input table.") else
|
||||
## We keep separate problem builders, because if we are reporting `No_Output_Columns`,
|
||||
we only want to add a cause coming from unification; matching reports problems that would not fit this error.
|
||||
problem_builder_for_matching = Problem_Builder.new
|
||||
problem_builder_for_unification = Problem_Builder.new
|
||||
matched_column_sets = Match_Columns_Helpers.match_columns all_tables match_columns keep_unmatched_columns problem_builder_for_matching
|
||||
matched_column_sets = Match_Columns_Helpers.match_columns all_tables match_columns columns_to_keep problem_builder_for_matching
|
||||
result_row_count = all_tables.fold 0 c-> t-> c + t.row_count
|
||||
merged_columns = matched_column_sets.map column_set->
|
||||
case Table_Helpers.unify_result_type_for_union column_set all_tables allow_type_widening problem_builder_for_unification of
|
||||
Nothing -> Nothing
|
||||
result_type : Value_Type ->
|
||||
concat_columns column_set all_tables result_type result_row_count on_problems
|
||||
good_columns = merged_columns.filter Filter_Condition.Not_Nothing
|
||||
case Table_Helpers.unify_result_type_for_union column_set all_tables problem_builder_for_unification of
|
||||
Union_Result_Type.Common_Type common_type ->
|
||||
concat_columns column_set all_tables common_type result_row_count needs_cast=False on_problems
|
||||
Union_Result_Type.Fallback_To_Text ->
|
||||
concat_columns column_set all_tables Value_Type.Char result_row_count needs_cast=True on_problems
|
||||
Union_Result_Type.No_Types_To_Unify ->
|
||||
Column.from_repeated_item column_set.name Nothing result_row_count
|
||||
problem_builder_for_matching.attach_problems_before on_problems <|
|
||||
problem_builder_for_unification.attach_problems_before on_problems <|
|
||||
if good_columns.is_empty then problem_builder_for_unification.raise_no_output_columns_with_cause else
|
||||
Table.new good_columns
|
||||
if merged_columns.is_empty then problem_builder_for_unification.raise_no_output_columns_with_cause else
|
||||
Table.new merged_columns
|
||||
|
||||
## PRIVATE
|
||||
A helper to create a new table consisting of slices of the original table.
|
||||
@ -3073,7 +3052,7 @@ make_join_helpers left_table right_table =
|
||||
|
||||
## PRIVATE
|
||||
A helper that efficiently concatenates storages of in-memory columns.
|
||||
concat_columns column_set all_tables result_type result_row_count on_problems =
|
||||
concat_columns column_set all_tables result_type result_row_count needs_cast on_problems =
|
||||
Java_Problems.with_problem_aggregator on_problems java_problem_aggregator->
|
||||
storage_builder = make_storage_builder_for_type result_type on_problems initial_size=result_row_count java_problem_aggregator
|
||||
column_set.column_indices.zip all_tables i-> parent_table->
|
||||
@ -3082,7 +3061,9 @@ concat_columns column_set all_tables result_type result_row_count on_problems =
|
||||
null_row_count = parent_table.row_count
|
||||
storage_builder.appendNulls null_row_count
|
||||
_ : Integer ->
|
||||
storage = parent_table.at i . java_column . getStorage
|
||||
column = parent_table.at i
|
||||
converted = if needs_cast then column.cast result_type on_problems=Report_Error else column
|
||||
storage = converted.java_column.getStorage
|
||||
storage_builder.appendBulkStorage storage
|
||||
sealed_storage = storage_builder.seal
|
||||
Column.from_storage column_set.name sealed_storage
|
||||
|
@ -1,11 +1,16 @@
|
||||
package org.enso.table.data.column.builder;
|
||||
|
||||
import java.time.LocalDate;
|
||||
import java.time.ZoneId;
|
||||
import java.time.ZonedDateTime;
|
||||
import org.enso.table.data.column.storage.Storage;
|
||||
import org.enso.table.data.column.storage.datetime.DateStorage;
|
||||
import org.enso.table.data.column.storage.datetime.DateTimeStorage;
|
||||
import org.enso.table.data.column.storage.type.DateTimeType;
|
||||
import org.enso.table.data.column.storage.type.DateType;
|
||||
import org.enso.table.data.column.storage.type.StorageType;
|
||||
import org.enso.table.error.ValueTypeMismatchException;
|
||||
import org.graalvm.polyglot.Context;
|
||||
|
||||
/** A builder for date-time columns. */
|
||||
public class DateTimeBuilder extends TypedBuilderImpl<ZonedDateTime> {
|
||||
@ -23,6 +28,14 @@ public class DateTimeBuilder extends TypedBuilderImpl<ZonedDateTime> {
|
||||
return DateTimeType.INSTANCE;
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO DRY {@link org.enso.table.data.column.operation.cast.ToDateTimeStorageConverter}
|
||||
* convertDate.
|
||||
*/
|
||||
private ZonedDateTime convertDate(LocalDate date) {
|
||||
return date.atStartOfDay().atZone(ZoneId.systemDefault());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void appendNoGrow(Object o) {
|
||||
try {
|
||||
@ -32,6 +45,34 @@ public class DateTimeBuilder extends TypedBuilderImpl<ZonedDateTime> {
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void appendBulkStorage(Storage<?> storage) {
|
||||
if (storage.getType() instanceof DateType) {
|
||||
if (storage instanceof DateStorage dateStorage) {
|
||||
Context context = Context.getCurrent();
|
||||
for (int i = 0; i < dateStorage.size(); ++i) {
|
||||
LocalDate date = dateStorage.getItemBoxed(i);
|
||||
if (date == null) {
|
||||
data[currentSize++] = null;
|
||||
} else {
|
||||
data[currentSize++] = convertDate(date);
|
||||
}
|
||||
|
||||
context.safepoint();
|
||||
}
|
||||
} else {
|
||||
throw new IllegalStateException(
|
||||
"Unexpected storage implementation for type "
|
||||
+ storage.getType()
|
||||
+ ": "
|
||||
+ storage
|
||||
+ ". This is a bug in the Table library.");
|
||||
}
|
||||
} else {
|
||||
super.appendBulkStorage(storage);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accepts(Object o) {
|
||||
return o instanceof ZonedDateTime;
|
||||
|
@ -28,6 +28,15 @@ public abstract class TypedBuilder extends Builder {
|
||||
*/
|
||||
public abstract TypedBuilder retypeTo(StorageType type);
|
||||
|
||||
/** Specifies if the following object will be accepted by this builder's append* methods. */
|
||||
/**
|
||||
* Specifies if the following object will be accepted by this builder's append* methods.
|
||||
*
|
||||
* <p>This is used to determine if a given value can be appended to the current builder, or if it
|
||||
* needs to be retyped to a more general one.
|
||||
*
|
||||
* <p>Note that the {@code appendBulkStorage} method may still accept more types than {@code
|
||||
* accepts}. This is exploited by operations like Union where more flexibility in merging column
|
||||
* types is allowed than in building new columns from scratch.
|
||||
*/
|
||||
public abstract boolean accepts(Object o);
|
||||
}
|
||||
|
@ -171,16 +171,15 @@ public class Column {
|
||||
|
||||
Object converted = Polyglot_Utils.convertPolyglotValue(item);
|
||||
|
||||
Builder builder;
|
||||
if (converted == null) {
|
||||
builder = new MixedBuilder(repeat);
|
||||
} else {
|
||||
StorageType storageType = StorageType.forBoxedItem(converted);
|
||||
builder = Builder.getForType(storageType, repeat, problemAggregator);
|
||||
Builder builder = new MixedBuilder(repeat);
|
||||
builder.appendNulls(repeat);
|
||||
return new Column(name, builder.seal());
|
||||
}
|
||||
|
||||
StorageType storageType = StorageType.forBoxedItem(converted);
|
||||
Builder builder = Builder.getForType(storageType, repeat, problemAggregator);
|
||||
Context context = Context.getCurrent();
|
||||
|
||||
for (int i = 0; i < repeat; i++) {
|
||||
builder.appendNoGrow(converted);
|
||||
context.safepoint();
|
||||
|
@ -9,7 +9,7 @@ from Standard.Database.Errors import Unsupported_Database_Operation, Integrity_E
|
||||
|
||||
from Standard.Test import all
|
||||
|
||||
from project.Common_Table_Operations.Util import expect_column_names, run_default_backend, within_table
|
||||
from project.Common_Table_Operations.Util import all
|
||||
import project.Util
|
||||
|
||||
main filter=Nothing = run_default_backend add_specs filter
|
||||
@ -42,6 +42,11 @@ add_specs suite_builder setup =
|
||||
suite_builder.group prefix+"Table.from_union" pending=db_pending group_builder->
|
||||
run_union_tests group_builder setup call_static_union
|
||||
|
||||
group_builder.specify "should fail if no tables are provided" <|
|
||||
r = Table.from_union []
|
||||
r.should_fail_with Illegal_Argument
|
||||
r.catch.to_display_text . should_contain "at least 1"
|
||||
|
||||
run_union_tests group_builder setup call_union =
|
||||
create_connection_fn = setup.create_connection_func
|
||||
data = Data.setup create_connection_fn
|
||||
@ -90,37 +95,70 @@ run_union_tests group_builder setup call_union =
|
||||
problems2 = [Unmatched_Columns.Error ["A", "D"]]
|
||||
Problems.test_problem_handling action2 problems2 tester2
|
||||
|
||||
group_builder.specify "should drop unmatched columns if asked to" <|
|
||||
group_builder.specify "should fill unmatched columns with nulls with no warning, if In_Any is explicitly chosen" <|
|
||||
t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]]
|
||||
t2 = table_builder [["C", ["d", "e", "f"]], ["A", [4, 5, 6]]]
|
||||
t3 = table_builder [["D", [Nothing, Nothing, 0]], ["C", ["g", "h", "i"]]]
|
||||
|
||||
table = call_union [t1, t2, t3] columns_to_keep=..In_Any on_problems=..Report_Error
|
||||
Problems.assume_no_problems table
|
||||
expect_column_names ["A", "B", "C", "D"] table
|
||||
table.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6, Nothing, Nothing, Nothing]
|
||||
table.at "B" . to_vector . should_equal ["a", "b", "c", Nothing, Nothing, Nothing, Nothing, Nothing, Nothing]
|
||||
table.at "C" . to_vector . should_equal [Nothing, Nothing, Nothing, "d", "e", "f", "g", "h", "i"]
|
||||
table.at "D" . to_vector . should_equal [Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, 0]
|
||||
|
||||
group_builder.specify "should drop unmatched columns and warn, if In_All is selected" <|
|
||||
t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]]
|
||||
t2 = table_builder [["C", ["d", "e", "f"]], ["A", [4, 5, 6]]]
|
||||
t3 = table_builder [["A", [Nothing, Nothing, 0]], ["C", ["g", "h", "i"]]]
|
||||
|
||||
t4 = call_union[t1, t2, t3] keep_unmatched_columns=False on_problems=Problem_Behavior.Report_Error
|
||||
Problems.assume_no_problems t4
|
||||
t4 = call_union [t1, t2, t3] columns_to_keep=..In_All
|
||||
w = Problems.expect_only_warning Unmatched_Columns t4
|
||||
w.column_names.should_equal ["B", "C"]
|
||||
expect_column_names ["A"] t4
|
||||
t4.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6, Nothing, Nothing, 0]
|
||||
|
||||
group_builder.specify "should keep unmatched columns without errors if asked to" <|
|
||||
t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]]
|
||||
t2 = table_builder [["C", ["d", "e", "f"]], ["A", [4, 5, 6]]]
|
||||
t3 = table_builder [["A", [Nothing, Nothing, 0]], ["C", ["g", "h", "i"]]]
|
||||
|
||||
t4 = call_union [t1, t2, t3] keep_unmatched_columns=True on_problems=Problem_Behavior.Report_Error
|
||||
Problems.assume_no_problems t4
|
||||
expect_column_names ["A", "B", "C"] t4
|
||||
t4.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6, Nothing, Nothing, 0]
|
||||
t4.at "B" . to_vector . should_equal ["a", "b", "c", Nothing, Nothing, Nothing, Nothing, Nothing, Nothing]
|
||||
t4.at "C" . to_vector . should_equal [Nothing, Nothing, Nothing, "d", "e", "f", "g", "h", "i"]
|
||||
|
||||
group_builder.specify "should fail if asked to drop unmatched columns but the set of common columns is empty" <|
|
||||
t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]]
|
||||
t2 = table_builder [["C", ["d", "e", "f"]], ["A", [4, 5, 6]]]
|
||||
t3 = table_builder [["D", [Nothing, Nothing, 0]], ["C", ["g", "h", "i"]]]
|
||||
|
||||
t4 = call_union [t1, t2, t3] keep_unmatched_columns=False on_problems=Problem_Behavior.Ignore
|
||||
t4 = call_union [t1, t2, t3] columns_to_keep=..In_All on_problems=..Ignore
|
||||
t4.should_fail_with No_Output_Columns
|
||||
t4.catch.to_display_text . should_equal "No columns in the result, because of another problem: Unmatched columns are set to be dropped, but no common column names were found."
|
||||
|
||||
group_builder.specify "should allow to select specified columns for union by In_List, using the ordering from the list" <|
|
||||
t1 = table_builder [["A", [1]], ["X", [2]], ["B", ["a"]], ["Y", [3]]]
|
||||
t2 = table_builder [["A", [4]], ["Z", [5]], ["B", ["b"]], ["X", [6]]]
|
||||
|
||||
t3 = call_union [t1, t2] columns_to_keep=(..In_List ["B", "A"])
|
||||
expect_column_names ["B", "A"] t3
|
||||
t3.at "B" . to_vector . should_equal ["a", "b"]
|
||||
t3.at "A" . to_vector . should_equal [1, 4]
|
||||
|
||||
group_builder.specify "should add a Null column for unmatched columns from In_List" <|
|
||||
t1 = table_builder [["A", [1]], ["X", [2]]]
|
||||
t2 = table_builder [["Z", [4]], ["A", [5]]]
|
||||
|
||||
t3 = call_union [t1, t2] columns_to_keep=(..In_List ["B", "A"])
|
||||
expect_column_names ["B", "A"] t3
|
||||
t3.at "B" . to_vector . should_equal [Nothing, Nothing]
|
||||
t3.at "A" . to_vector . should_equal [1, 5]
|
||||
|
||||
group_builder.specify "does not allow an empty list in In_List" <|
|
||||
t1 = table_builder [["A", [1]], ["X", [2]]]
|
||||
t2 = table_builder [["Z", [4]], ["A", [5]]]
|
||||
r = call_union [t1, t2] columns_to_keep=(..In_List [])
|
||||
r.should_fail_with Illegal_Argument
|
||||
|
||||
group_builder.specify "does not error if duplicate entries appear in the In_List" <|
|
||||
t1 = table_builder [["A", [1]], ["X", [2]], ["B", ["a"]], ["Y", [3]]]
|
||||
t2 = table_builder [["A", [4]], ["Z", [5]], ["B", ["b"]], ["X", [6]]]
|
||||
|
||||
t3 = call_union [t1, t2] columns_to_keep=(..In_List ["B", "B", "A", "A", "B"])
|
||||
expect_column_names ["B", "A"] t3
|
||||
|
||||
group_builder.specify "should ignore column names when matching by position" <|
|
||||
t1 = table_builder [["A", [1, 2, 3]], ["Y", ["a", "b", "c"]]]
|
||||
t2 = table_builder [["X", [4, 5, 6]], ["A", ["d", "e", "f"]]]
|
||||
@ -144,28 +182,36 @@ run_union_tests group_builder setup call_union =
|
||||
problems = [Column_Count_Mismatch.Error 3 1]
|
||||
Problems.test_problem_handling action problems tester
|
||||
|
||||
group_builder.specify "should keep the least number of columns with positional matching if asked to drop unmatched ones" <|
|
||||
group_builder.specify "should keep the least number of columns with positional matching if In_All" <|
|
||||
t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]]
|
||||
t2 = table_builder [["A1", [4, 5, 6]], ["B1", ["d", "e", "f"]], ["C", [7, 8, 9]]]
|
||||
t3 = table_builder [["A2", [10, 20, 30]]]
|
||||
|
||||
t4 = call_union [t1, t2, t3] keep_unmatched_columns=False match_columns=Match_Columns.By_Position on_problems=Problem_Behavior.Report_Error
|
||||
Problems.assume_no_problems t4
|
||||
t4 = call_union [t1, t2, t3] columns_to_keep=..In_All match_columns=..By_Position
|
||||
expect_column_names ["A"] t4
|
||||
t4.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6, 10, 20, 30]
|
||||
w = Problems.expect_only_warning Column_Count_Mismatch t4
|
||||
w.expected.should_equal 3
|
||||
w.actual.should_equal 1
|
||||
|
||||
group_builder.specify "should keep the greatest number of columns with positional matching if asked to keep unmatched ones, filling missing values with null and reporting no problems" <|
|
||||
group_builder.specify "should keep the greatest number of columns with positional matching if In_Any, reporting no problems" <|
|
||||
t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]]
|
||||
t2 = table_builder [["A1", [4, 5, 6]], ["B1", ["d", "e", "f"]], ["C", [7, 8, 9]]]
|
||||
t3 = table_builder [["A2", [10, 20, 30]]]
|
||||
|
||||
t4 = call_union [t1, t2, t3] match_columns=Match_Columns.By_Position keep_unmatched_columns=True on_problems=Problem_Behavior.Ignore
|
||||
t4 = call_union [t1, t2, t3] columns_to_keep=..In_Any match_columns=..By_Position on_problems=..Report_Error
|
||||
Problems.assume_no_problems t4
|
||||
expect_column_names ["A1", "B1", "C"] t4
|
||||
t4.at "A1" . to_vector . should_equal [1, 2, 3, 4, 5, 6, 10, 20, 30]
|
||||
t4.at "B1" . to_vector . should_equal ["a", "b", "c", "d", "e", "f", Nothing, Nothing, Nothing]
|
||||
t4.at "C" . to_vector . should_equal [Nothing, Nothing, Nothing, 7, 8, 9, Nothing, Nothing, Nothing]
|
||||
|
||||
group_builder.specify "does not allow In_List with positional matching" <|
|
||||
t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]]
|
||||
t2 = table_builder [["A1", [4, 5, 6]], ["B1", ["d", "e", "f"]], ["C", [7, 8, 9]]]
|
||||
r = call_union [t1, t2] columns_to_keep=(..In_List ["A", "B"]) match_columns=Match_Columns.By_Position
|
||||
r.should_fail_with Illegal_Argument
|
||||
|
||||
group_builder.specify "should use column names from the first table that has enough columns in positional matching mode" <|
|
||||
t1 = table_builder [["A", [1, 2, 3]]]
|
||||
t2 = table_builder [["X", [4, 5, 6]], ["A", ["a", "b", "c"]]]
|
||||
@ -180,10 +226,6 @@ run_union_tests group_builder setup call_union =
|
||||
check t3
|
||||
Problems.get_attached_warnings t3 . should_equal [Column_Count_Mismatch.Error 2 1]
|
||||
|
||||
t4 = call_union [t1, t2] match_columns=Match_Columns.By_Position keep_unmatched_columns=True
|
||||
within_table t4 <|
|
||||
check t4
|
||||
|
||||
t5 = table_builder [["Y", [7, 8, 9]], ["A", ["d", "e", "f"]], ["Z", [10, 11, 12]]]
|
||||
t6 = table_builder [["W", [0]]]
|
||||
t7 = table_builder [["X", [7, 8, 9]], ["Y", ["d", "e", "f"]], ["Z", [10, 11, 12]]]
|
||||
@ -216,11 +258,13 @@ run_union_tests group_builder setup call_union =
|
||||
check_same <| call_union [t1]
|
||||
check_same <| call_union [t1] match_columns=Match_Columns.By_Position
|
||||
|
||||
check_same <| call_union [t1] keep_unmatched_columns=False
|
||||
check_same <| call_union [t1] match_columns=Match_Columns.By_Position keep_unmatched_columns=False
|
||||
check_same <| call_union [t1] columns_to_keep=..In_All
|
||||
check_same <| call_union [t1] match_columns=Match_Columns.By_Position columns_to_keep=..In_All
|
||||
|
||||
check_same <| call_union [t1] keep_unmatched_columns=True
|
||||
check_same <| call_union [t1] match_columns=Match_Columns.By_Position keep_unmatched_columns=True
|
||||
check_same <| call_union [t1] columns_to_keep=..In_Any
|
||||
check_same <| call_union [t1] match_columns=Match_Columns.By_Position columns_to_keep=..In_Any
|
||||
|
||||
check_same <| call_union [t1] columns_to_keep=(..In_List ["A", "B"])
|
||||
|
||||
group_builder.specify "should correctly unify text columns of various lengths" pending=(if setup.test_selection.fixed_length_text_columns.not then "Fixed-length Char columns are not supported by this backend.") <|
|
||||
t1 = (table_builder [["A", ["a", "b", "c"]]]) . cast "A" (Value_Type.Char size=1 variable_length=False)
|
||||
@ -231,12 +275,13 @@ run_union_tests group_builder setup call_union =
|
||||
|
||||
t3 = call_union [t1, t2]
|
||||
expect_column_names ["A"] t3
|
||||
Problems.assume_no_problems t3
|
||||
t3.at "A" . to_vector . should_equal ["a", "b", "c", "xyz", "abc", "def"]
|
||||
t3.at "A" . value_type . is_text . should_be_true
|
||||
Test.with_clue "t3[A].value_type="+(t3.at "A").value_type.to_display_text+": " <|
|
||||
t3.at "A" . value_type . variable_length . should_be_true
|
||||
|
||||
group_builder.specify "should find a common type that will fit the merged columns" <|
|
||||
group_builder.specify "should find a common type that will fit the merged columns (Integer + Float)" <|
|
||||
t1 = table_builder [["A", [0, 1, 2]]]
|
||||
t2 = table_builder [["A", [1.0, 2.0, 2.5]]]
|
||||
|
||||
@ -245,29 +290,77 @@ run_union_tests group_builder setup call_union =
|
||||
|
||||
t3 = call_union [t1, t2]
|
||||
expect_column_names ["A"] t3
|
||||
Problems.assume_no_problems t3
|
||||
t3.at "A" . value_type . is_floating_point . should_be_true
|
||||
t3.at "A" . to_vector . should_equal [0, 1, 2, 1.0, 2.0, 2.5]
|
||||
|
||||
# Specific type tests that apply to in-memory. Database behaviour is up to implementation.
|
||||
if setup.is_database.not then
|
||||
t4 = table_builder [["A", [2^100, 2^10, 2]]]
|
||||
t4.at "A" . value_type . should_be_a (Value_Type.Decimal ...)
|
||||
group_builder.specify "should find a common type that will fit the merged columns (numeric + Boolean)" <|
|
||||
t1 = table_builder [["A", [0, 1, 20]]]
|
||||
t2 = table_builder [["A", [True, False, True]]]
|
||||
|
||||
t5 = call_union [t2, t4]
|
||||
expect_column_names ["A"] t5
|
||||
t5.at "A" . value_type . is_floating_point . should_be_true
|
||||
t5.at "A" . to_vector . should_equal [1.0, 2.0, 2.5, 2^100, 2^10, 2]
|
||||
t1.at "A" . value_type . is_integer . should_be_true
|
||||
t2.at "A" . value_type . should_equal Value_Type.Boolean
|
||||
|
||||
t6 = call_union [t1, t4]
|
||||
expect_column_names ["A"] t6
|
||||
t6.at "A" . value_type . should_be_a (Value_Type.Decimal ...)
|
||||
t6.at "A" . to_vector . should_equal [0, 1, 2, 2^100, 2^10, 2]
|
||||
t3 = call_union [t1, t2]
|
||||
expect_column_names ["A"] t3
|
||||
Problems.assume_no_problems t3
|
||||
t3.at "A" . value_type . is_integer . should_be_true
|
||||
t3.at "A" . to_vector . should_equal [0, 1, 20, 1, 0, 1]
|
||||
|
||||
t4 = table_builder [["A", [1.5, 0.0, 2.0]]]
|
||||
t5 = call_union [t2, t4]
|
||||
Problems.assume_no_problems t5
|
||||
t5.at "A" . value_type . is_floating_point . should_be_true
|
||||
t5.at "A" . to_vector . should_equal [1.0, 0.0, 1.0, 1.5, 0.0, 2.0]
|
||||
|
||||
group_builder.specify "should warn about loss of precision when converting large Integer to Float" pending=(if setup.is_database then "Loss_Of_Integer_Precision not yet supported in DB.") <|
|
||||
# 2^70 is not exactly representable as a Float.
|
||||
t1 = table_builder [["A", [2^70, 2^10, 2]]]
|
||||
t2 = table_builder [["A", [1.5, 2.0, 2.5]]]
|
||||
t1.at "A" . value_type . is_decimal . should_be_true
|
||||
t2.at "A" . value_type . is_floating_point . should_be_true
|
||||
|
||||
t3 = call_union [t1, t2]
|
||||
expect_column_names ["A"] t3
|
||||
w = Problems.expect_only_warning Loss_Of_Integer_Precision t3
|
||||
# TODO should we try to include column name here for context? may be worth it...
|
||||
w.affected_rows_count.should_equal 1
|
||||
t3.at "A" . value_type . is_floating_point . should_be_true
|
||||
t3.at "A" . to_vector . should_equal [(2^70).to_float, 2^10, 2, 1.5, 2.0, 2.5]
|
||||
|
||||
group_builder.specify "should find a common type (Integer and Char of different sizes)" <|
|
||||
t1 = (table_builder [["X", [0, 1, 2]], ["Y", ['aa', 'bb', 'cc']]]) . cast "X" (Value_Type.Integer Bits.Bits_16) . cast "Y" (Value_Type.Char size=2 variable_length=False)
|
||||
t2 = (table_builder [["X", [3, 4, 5]], ["Y", ['x', 'y', 'z']]]) . cast "X" (Value_Type.Integer Bits.Bits_32) . cast "Y" (Value_Type.Char size=1 variable_length=False)
|
||||
supports_complex_types = (t1.is_error || t2.is_error || Problems.get_attached_warnings t1 . not_empty).not
|
||||
case supports_complex_types of
|
||||
False -> Nothing
|
||||
True ->
|
||||
t12 = call_union [t1, t2]
|
||||
# No warnings are expected
|
||||
Problems.assume_no_problems t12
|
||||
t12.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_32)
|
||||
t12.at "Y" . value_type . should_equal (Value_Type.Char size=2 variable_length=True)
|
||||
|
||||
t12.at "X" . to_vector . should_equal [0, 1, 2, 3, 4, 5]
|
||||
t12.at "Y" . to_vector . should_equal ['aa', 'bb', 'cc', 'x', 'y', 'z']
|
||||
|
||||
date_time_pending = if setup.test_selection.date_time.not then "Date/Time operations are not supported."
|
||||
group_builder.specify "should warn when converting a Date to Date_Time" pending=date_time_pending <|
|
||||
t1 = table_builder [["D", [Date_Time.new 2024 5 16 16 48 23]]]
|
||||
t2 = table_builder [["D", [Date.new 2019 10 23, Date.new 2020]]]
|
||||
|
||||
action = call_union [t1, t2] on_problems=_
|
||||
tester table =
|
||||
expect_column_names ["D"] table
|
||||
table.at "D" . value_type . should_equal Value_Type.Date_Time
|
||||
table.at "D" . to_vector . should_equal_tz_agnostic [Date_Time.new 2024 5 16 16 48 23, Date_Time.new 2019 10 23 0 0 0, Date_Time.new 2020 1 1 0 0 0]
|
||||
problems = [Mixing_Date_Time_Types.Date_To_Date_Time "D"]
|
||||
problems.first.to_display_text . should_contain "[D]"
|
||||
Problems.test_problem_handling action problems tester
|
||||
|
||||
# Database backends are not required to support Mixed types.
|
||||
if setup.is_database.not then
|
||||
group_builder.specify "should resort to Mixed value type only if at least one column is already Mixed" <|
|
||||
## TODO currently no way to retype a column to Mixed, so we are
|
||||
using a custom object
|
||||
t1 = table_builder [["A", [1, 2, 3]], ["mixed", ["a", My_Type.Value 1 2, Nothing]]]
|
||||
t2 = table_builder [["A", [4, 5, 6]], ["mixed", [1, 2, 3]]]
|
||||
t1.at "mixed" . value_type . should_equal Value_Type.Mixed
|
||||
@ -291,145 +384,68 @@ run_union_tests group_builder setup call_union =
|
||||
t6.at "mixed" . to_vector . should_equal ["X", "y", "a", My_Type.Value 1 2, Nothing, 1, 2, 3, True, False]
|
||||
t6.at "mixed" . value_type . should_equal Value_Type.Mixed
|
||||
|
||||
group_builder.specify "if no common type can be found, should report error and drop the problematic column" <|
|
||||
group_builder.specify "if no common type can be found, will fall back to converting all types to text and warn" <|
|
||||
t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]], ["C", [True, False, Nothing]]]
|
||||
t2 = table_builder [["C", ["x", "Y", "Z"]], ["A", [4, 5, 6]], ["B", [1, 2, 3]]]
|
||||
|
||||
r1 = call_union [t1, t2] on_problems=Problem_Behavior.Report_Error
|
||||
r1.should_fail_with No_Common_Type
|
||||
r1.catch.to_display_text . should_contain "converted to text"
|
||||
|
||||
r2 = call_union [t1, t2] on_problems=Problem_Behavior.Ignore
|
||||
Problems.assume_no_problems r2
|
||||
|
||||
r3 = call_union [t1, t2] on_problems=Problem_Behavior.Report_Warning
|
||||
w3 = Problems.get_attached_warnings r3
|
||||
w3.each w-> w.should_be_a No_Common_Type
|
||||
w3.map w->
|
||||
## We look just at names of the Value_Type constructors, as
|
||||
different database backends may choose integers of different
|
||||
sizes and have differing settings for text types.
|
||||
types = w.types.map value_type->
|
||||
Meta.meta value_type . constructor . name
|
||||
(types == ["Char", "Integer"]) || (types == ["Boolean", "Char"]) . should_be_true
|
||||
|
||||
# A boolean column cannot be merged with integers.
|
||||
t3 = t1.select_columns ["C", "A"] reorder=True
|
||||
t4 = t2.select_columns ["B", "A"] reorder=True
|
||||
r4 = call_union [t3, t4] match_columns=Match_Columns.By_Position on_problems=Problem_Behavior.Report_Error
|
||||
r4.should_fail_with No_Common_Type
|
||||
|
||||
group_builder.specify "if type widening is not allowed, should use the type from first table that contained the given column" <|
|
||||
t1 = table_builder [["A", [1, 2, 3]]]
|
||||
t2 = table_builder [["A", [4, 5, 6]], ["B", [1.2, 2.2, 3.1]]]
|
||||
|
||||
t3 = call_union [t1, t2] allow_type_widening=False keep_unmatched_columns=True
|
||||
within_table t3 <|
|
||||
Problems.assume_no_problems t3
|
||||
expect_column_names ["A", "B"] t3
|
||||
t3.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6]
|
||||
t3.at "B" . to_vector . should_equal [Nothing, Nothing, Nothing, 1.2, 2.2, 3.1]
|
||||
t3.at "A" . value_type . is_integer . should_be_true
|
||||
t2.at "B" . value_type . is_floating_point . should_be_true
|
||||
t3.at "B" . value_type . is_floating_point . should_be_true
|
||||
|
||||
group_builder.specify "if type widening is not allowed and types do not match, should report error and drop the problematic column" <|
|
||||
t1 = table_builder [["A", [1, 2, 3]], ["B", [1, 2, 3]], ["E", [1.1, 2.5, 3.2]]]
|
||||
t2 = table_builder [["A", [4, 5, 6]], ["B", [1.5, 2.5, 3.5]], ["E", [1, 2, 3]]]
|
||||
|
||||
t1.at "B" . value_type . is_integer . should_be_true
|
||||
t1.at "E" . value_type . is_floating_point . should_be_true
|
||||
|
||||
t2.at "B" . value_type . is_floating_point . should_be_true
|
||||
t2.at "E" . value_type . is_integer . should_be_true
|
||||
|
||||
action = call_union [t1, t2] allow_type_widening=False on_problems=_
|
||||
tester table =
|
||||
expect_column_names ["A"] table
|
||||
action = call_union [t1, t2] on_problems=_
|
||||
result_checker table =
|
||||
expect_column_names ["A", "B", "C"] table
|
||||
# If type was matched - the columns are merged as is:
|
||||
table.at "A" . to_vector . should_equal [1, 2, 3, 4, 5, 6]
|
||||
table.at "A" . value_type . is_integer . should_be_true
|
||||
# If mixed, they are converted to text representation:
|
||||
table.at "B" . to_vector . should_equal ["a", "b", "c", "1", "2", "3"]
|
||||
table.at "B" . value_type . is_text . should_be_true
|
||||
|
||||
problem_checker problem =
|
||||
problem.should_be_a Column_Type_Mismatch
|
||||
True
|
||||
err_checker err =
|
||||
problem_checker err.catch
|
||||
warn_checker warnings =
|
||||
warnings.all problem_checker
|
||||
Problems.test_advanced_problem_handling action err_checker warn_checker tester
|
||||
v = table.at "C" . to_vector
|
||||
# The check needs to be case insensitive because various database backends may represent Booleans with lower or uppercase.
|
||||
v.take 2 . map (t -> t.to_case Case.Lower) . should_equal ["true", "false"]
|
||||
# Nothing is preserved, not converted to text because we want to preserve the meaning of 'missing value':
|
||||
v.drop 2 . should_equal [Nothing, "x", "Y", "Z"]
|
||||
|
||||
# Database backends are not required to support Mixed types.
|
||||
if setup.is_database.not then
|
||||
group_builder.specify "even if type widening is not allowed, if the first column is mixed, it should accept any column to be concatenated to it" <|
|
||||
t1 = table_builder [["X", ["a", 1, Nothing]]]
|
||||
t2 = table_builder [["X", [1]]]
|
||||
t3 = table_builder [["X", [1.2, 2.3, 3.4]]]
|
||||
t4 = table_builder [["X", ["a", "b"]]]
|
||||
t5 = table_builder [["X", [True, False]]]
|
||||
table.at "C" . value_type . is_text . should_be_true
|
||||
|
||||
t1.at "X" . value_type . should_equal Value_Type.Mixed
|
||||
t2.at "X" . value_type . should_equal Value_Type.Integer
|
||||
error_checker result = result.should_fail_with No_Common_Type
|
||||
warnings_checker warnings =
|
||||
warnings.map w->
|
||||
w.should_be_a No_Common_Type
|
||||
w.to_display_text . should_contain "converted to text"
|
||||
["B", "C"].should_contain w.related_column_name
|
||||
## We look just at names of the Value_Type constructors, as
|
||||
different database backends may choose integers of different
|
||||
sizes and have differing settings for text types.
|
||||
types = w.types.map value_type->
|
||||
Meta.meta value_type . constructor . name
|
||||
Test.with_clue "(should be one of...) " <|
|
||||
[["Char", "Integer"], ["Boolean", "Char"]].should_contain types
|
||||
Problems.test_advanced_problem_handling action error_checker warnings_checker result_checker
|
||||
|
||||
t6 = call_union [t1, t2, t3, t4, t5] allow_type_widening=False
|
||||
Problems.assume_no_problems t6
|
||||
t6.at "X" . value_type . should_equal Value_Type.Mixed
|
||||
t6.at "X" . to_vector . should_equal ["a", 1, Nothing, 1, 1.2, 2.3, 3.4, "a", "b", True, False]
|
||||
group_builder.specify "if no common type can be found, will fall back to converting all types to text and warn (Date+Time)" pending=date_time_pending <|
|
||||
t1 = table_builder [["D", [Time_Of_Day.new 12, Time_Of_Day.new 13, Time_Of_Day.new 14]]]
|
||||
t2 = table_builder [["D", [Date.new 2019, Date.new 2020, Date.new 2021]]]
|
||||
|
||||
group_builder.specify "when finding a common type for numeric columns to be Float, any precision loss should be reported" <|
|
||||
t1 = table_builder [["X", [1, (2^62)-1, 3]]]
|
||||
t2 = table_builder [["X", [1.5, 2.5, 3.5]]]
|
||||
t3 = table_builder [["X", [(2^100)+1, 2^10, 2]]]
|
||||
action = call_union [t1, t2] on_problems=_
|
||||
tester table =
|
||||
expect_column_names ["D"] table
|
||||
table.at "D" . to_vector . should_equal ["12:00:00", "13:00:00", "14:00:00", "2019-01-01", "2020-01-01", "2021-01-01"]
|
||||
table.at "D" . value_type . is_text . should_be_true
|
||||
problems = [No_Common_Type.Warning_Convert_To_Text [Value_Type.Time, Value_Type.Date] "D"]
|
||||
Problems.test_problem_handling action problems tester
|
||||
|
||||
t1.at "X" . value_type . should_equal Value_Type.Integer
|
||||
t2.at "X" . value_type . should_equal Value_Type.Float
|
||||
t3.at "X" . value_type . should_be_a (Value_Type.Decimal ...)
|
||||
|
||||
t4 = call_union [t2, t1, t3] allow_type_widening=True
|
||||
t4.at "X" . value_type . should_equal Value_Type.Float
|
||||
t4.at "X" . to_vector . should_equal [1.5, 2.5, 3.5, 1, (2^62)-1, 3, (2^100)+1 . to_float, 2^10, 2]
|
||||
|
||||
w = Problems.expect_only_warning Loss_Of_Integer_Precision t4
|
||||
# Losing precision on (2^62)-1 and 2^100+1.
|
||||
w.affected_rows_count . should_equal 2
|
||||
|
||||
group_builder.specify "if type mismatches cause all columns to be dropped, fail with No_Output_Columns" <|
|
||||
t1 = table_builder [["A", [1, 2, 3]]]
|
||||
t2 = table_builder [["A", ['x']]]
|
||||
|
||||
e3 = call_union [t1, t2] allow_type_widening=True on_problems=Problem_Behavior.Ignore
|
||||
e3.should_fail_with No_Output_Columns
|
||||
|
||||
t4 = table_builder [["A", [1.5]]]
|
||||
e5 = call_union [t1, t4] allow_type_widening=False on_problems=Problem_Behavior.Ignore
|
||||
e5.should_fail_with No_Output_Columns
|
||||
|
||||
group_builder.specify "should find a common type (2)" <|
|
||||
t1 = (table_builder [["X", [0, 1, 2]], ["Y", ['aa', 'bb', 'cc']]]) . cast "X" (Value_Type.Integer Bits.Bits_16) . cast "Y" (Value_Type.Char size=2 variable_length=False)
|
||||
t2 = (table_builder [["X", [3, 4, 5]], ["Y", ['x', 'y', 'z']]]) . cast "X" (Value_Type.Integer Bits.Bits_32) . cast "Y" (Value_Type.Char size=1 variable_length=False)
|
||||
supports_complex_types = (t1.is_error || t2.is_error || Problems.get_attached_warnings t1 . not_empty).not
|
||||
case supports_complex_types of
|
||||
False -> Nothing
|
||||
True ->
|
||||
t12 = call_union [t1, t2]
|
||||
Problems.assume_no_problems t12
|
||||
t12.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_32)
|
||||
t12.at "Y" . value_type . should_equal (Value_Type.Char size=2 variable_length=True)
|
||||
|
||||
t12.at "X" . to_vector . should_equal [0, 1, 2, 3, 4, 5]
|
||||
t12.at "Y" . to_vector . should_equal ['aa', 'bb', 'cc', 'x', 'y', 'z']
|
||||
|
||||
group_builder.specify "should fail to find a common type if widening is not allowed (2)" <|
|
||||
t1 = (table_builder [["X", [0, 1, 2]], ["Y", ['aa', 'bb', 'cc']]]) . cast "X" (Value_Type.Integer Bits.Bits_16) . cast "Y" (Value_Type.Char size=2 variable_length=False)
|
||||
t2 = (table_builder [["X", [3, 4, 5]], ["Y", ['x', 'y', 'z']]]) . cast "X" (Value_Type.Integer Bits.Bits_32) . cast "Y" (Value_Type.Char size=1 variable_length=False)
|
||||
supports_complex_types = (t1.is_error || t2.is_error || Problems.get_attached_warnings t1 . not_empty).not
|
||||
case supports_complex_types of
|
||||
False -> Nothing
|
||||
True ->
|
||||
r1 = call_union [t1, t2] allow_type_widening=False
|
||||
r1.should_fail_with No_Output_Columns
|
||||
r1.catch.cause . should_be_a Column_Type_Mismatch
|
||||
r1.catch.to_display_text . should_equal "No columns in the result, because of another problem: The column [X] expects type Integer (16 bits) but one of the provided tables had type Integer (32 bits) which is not compatible with it."
|
||||
|
||||
# And this should report Column_Type_Mismatch as the more important error too:
|
||||
call_union [t1, t2] allow_type_widening=False on_problems=Problem_Behavior.Report_Error . should_fail_with Column_Type_Mismatch
|
||||
group_builder.specify "will use the _output_ column name in the warnings when matching by position (so input names may differ)" <|
|
||||
t1 = table_builder [["A", [1]]]
|
||||
t2 = table_builder [["B", ["a"]]]
|
||||
r1 = call_union [t1, t2] match_columns=Match_Columns.By_Position
|
||||
expect_column_names ["A"] r1
|
||||
r1.at "A" . value_type . is_text . should_be_true
|
||||
r1.at "A" . to_vector . should_equal ["1", "a"]
|
||||
w = Problems.expect_only_warning No_Common_Type r1
|
||||
w.related_column_name.should_equal "A"
|
||||
|
||||
group_builder.specify "should gracefully handle tables from different backends" <|
|
||||
t1 = table_builder [["A", [1, 2, 3]], ["B", ["a", "b", "c"]]]
|
||||
|
@ -822,13 +822,21 @@ add_specs suite_builder =
|
||||
int = ["int", [1, 2, 3, 3]]
|
||||
int2 = ["int2", [1, Nothing, Nothing, 1]]
|
||||
dbl = ["dbl", [0.0, 0.0, Nothing, Nothing]]
|
||||
dates = ["dates", [Date.new 2000, Date.new 1999 1 1, Date.new 1999 1 1, Date_Time.new 2022 8 20]]
|
||||
dts = ["dts", [Date_Time.new 2022 8 27 11 22 25, Nothing, Date_Time.new 2030, Date.new 2000]]
|
||||
tod = ["tod", [Time_Of_Day.new 18 00, Time_Of_Day.new 18 19, Date_Time.new 2000 1 1, Time_Of_Day.new 18 19]]
|
||||
dates = ["dates", [Date.new 2000, Date.new 1999 1 1, Date.new 1999 1 1, Nothing]]
|
||||
dts = ["dts", [Date_Time.new 2022 8 27 11 22 25, Nothing, Nothing, Date_Time.new 2030]]
|
||||
dts_mixed = ["dts_mixed", [Date_Time.new 2022 8 27 11 22 25, Nothing, Date_Time.new 2030, Date.new 2000]]
|
||||
tod = ["tod", [Time_Of_Day.new 18 00, Time_Of_Day.new 18 19, Nothing, Time_Of_Day.new 18 19]]
|
||||
tod_mixed = ["tod_mixed", [Time_Of_Day.new 18 00, Time_Of_Day.new 18 19, Date_Time.new 2000 1 1, Time_Of_Day.new 18 19]]
|
||||
mix = ["mix", [42, Date_Time.new 2022 8 27, 1, 1]]
|
||||
nulls = ["nulls", [Nothing, Nothing, Nothing, 0]]
|
||||
custom = ["custom", [2, My.Data 2 1, Nothing, Nothing]]
|
||||
[str, int, int2, dbl, dates, dts, tod, mix, nulls, custom]
|
||||
[str, int, int2, dbl, dates, dts, dts_mixed, tod, tod_mixed, mix, nulls, custom]
|
||||
ins.at "dates" . value_type . should_equal Value_Type.Date
|
||||
ins.at "dts" . value_type . is_date_time . should_be_true
|
||||
ins.at "dts_mixed" . value_type . should_equal Value_Type.Mixed
|
||||
ins.at "tod" . value_type . should_equal Value_Type.Time
|
||||
ins.at "tod_mixed" . value_type . should_equal Value_Type.Mixed
|
||||
|
||||
data.varied_type_table.filter "strs" (Filter_Condition.Is_In (ins.at "str")) . at "strs" . to_vector . should_equal ["b", "c"]
|
||||
data.varied_type_table.filter "strs" (Filter_Condition.Is_In (ins.at "str" . to_vector)) . at "strs" . to_vector . should_equal ["b", "c"]
|
||||
data.varied_type_table.filter "ints" (Filter_Condition.Is_In (ins.at "int")) . at "ints" . to_vector . should_equal [1, 2]
|
||||
@ -841,7 +849,11 @@ add_specs suite_builder =
|
||||
data.varied_type_table.filter "dates" (Filter_Condition.Is_In (ins.at "dates" . to_vector)) . at "dates" . to_vector . should_equal [Date.new 2000, Date.new 1999 1 1]
|
||||
data.varied_type_table.filter "datetimes" (Filter_Condition.Is_In (ins.at "dts")) . at "datetimes" . to_vector . should_equal [Date_Time.new 2022 8 27 11 22 25]
|
||||
data.varied_type_table.filter "datetimes" (Filter_Condition.Is_In (ins.at "dts" . to_vector)) . at "datetimes" . to_vector . should_equal [Date_Time.new 2022 8 27 11 22 25]
|
||||
# The Date_Time.new 2000 should not match with Date.new 2000 because the types are different:
|
||||
data.varied_type_table.filter "datetimes" (Filter_Condition.Is_In (ins.at "dts_mixed")) . at "datetimes" . to_vector . should_equal [Date_Time.new 2022 8 27 11 22 25]
|
||||
data.varied_type_table.filter "dates" (Filter_Condition.Is_In (ins.at "dts_mixed")) . at "dates" . to_vector . should_equal [Date.new 2000]
|
||||
data.varied_type_table.filter "times" (Filter_Condition.Is_In (ins.at "tod")) . at "times" . to_vector . should_equal [Time_Of_Day.new 18 00]
|
||||
data.varied_type_table.filter "times" (Filter_Condition.Is_In (ins.at "tod_mixed")) . at "times" . to_vector . should_equal [Time_Of_Day.new 18 00]
|
||||
data.varied_type_table.filter "times" (Filter_Condition.Is_In (ins.at "tod" . to_vector)) . at "times" . to_vector . should_equal [Time_Of_Day.new 18 00]
|
||||
data.varied_type_table.filter "mixed" (Filter_Condition.Is_In [42, "a", 1, Nothing, Date.new 2022 8 27, Date_Time.new 2022 8 27]) . at "mixed" . to_vector . should_equal [1, "a", Date.new 2022 8 27]
|
||||
data.varied_type_table.filter "mixed" (Filter_Condition.Is_In (ins.at "mix")) . at "mixed" . to_vector . should_equal [1]
|
||||
|
Loading…
Reference in New Issue
Block a user