Changes from session with Ned (#10349)

- Removed `second_row` and `second_column` from the `Table` and `DB_Table`.
- Added `first_value` and `last_value` to the `Table` and `DB_Table`.
- Fixed bug where negative index access wasn't allowed on `Column`.
- Added error if negative index access used on `DB_Column`. Tells user they have to materialize.
- Fix argument order for `Table.text_cleanse` and a couple of typo corrections.
- Rename `auto_value_type` to `auto_cast` on table and columns.
This commit is contained in:
James Dunkerley 2024-06-24 13:47:14 +01:00 committed by GitHub
parent 5233390ec0
commit e6c8ec7ab5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 210 additions and 131 deletions

View File

@ -156,7 +156,8 @@ type DB_Column
example_at = Examples.integer_column.get 0 -1
get : Integer -> Any -> Any | Nothing
get self index:Integer=0 (~default=Nothing) =
self.read (..First index+1) . get index default
if index < 0 then Error.throw (Unsupported_Database_Operation.Error "Reading backwards from end is not supported in-database. Use `read` to materialize the column.") else
self.read (..First index+1) . get index default
## GROUP Standard.Base.Metadata
ICON metadata
@ -1806,14 +1807,15 @@ type DB_Column
## GROUP Standard.Base.Conversions
ICON convert
ALIAS auto_value_type
Change the value type of the column to a more specific one, based on its
contents.
This operation is currently not available in the Database backend.
auto_value_type : Boolean -> DB_Column
auto_value_type self shrink_types=False =
auto_cast : Boolean -> DB_Column
auto_cast self shrink_types=False =
_ = shrink_types
Error.throw <| Unsupported_Database_Operation.Error "`DB_Column.auto_value_type` is not supported in the Database backends."
Error.throw <| Unsupported_Database_Operation.Error "`DB_Column.auto_cast` is not supported in the Database backends."
## PRIVATE
Shares the core CAST logic between `cast` and `parse`.

View File

@ -172,6 +172,20 @@ type DB_Table
col = self.get selector if_missing=Nothing
if Nothing == col then if_missing else col.get index if_missing
## ALIAS first cell
GROUP Standard.Base.Selections
ICON local_scope4
Gets the top left value from the table.
first_value : Any ! Index_Out_Of_Bounds
first_value self = self.at 0 . at 0
## ALIAS last cell
GROUP Standard.Base.Selections
ICON local_scope4
Gets the bottom right value from the table.
last_value : Any ! Index_Out_Of_Bounds
last_value self = self.last_row . at -1
## ALIAS first field
GROUP Standard.Base.Selections
ICON select_column
@ -179,13 +193,6 @@ type DB_Table
first_column : DB_Column ! Index_Out_Of_Bounds
first_column self = self.at 0
## ALIAS second field
GROUP Standard.Base.Selections
ICON select_column
Gets the second column
second_column : DB_Column ! Index_Out_Of_Bounds
second_column self = self.at 1
## ALIAS last field
GROUP Standard.Base.Selections
ICON select_column
@ -1164,13 +1171,6 @@ type DB_Table
first_row self =
self.read (..First 1) . rows . first
## GROUP Standard.Base.Selections
ICON select_row
Returns the second row of the table.
second_row : Row ! Index_Out_Of_Bounds
second_row self =
self.read (..First 2) . rows . second
## GROUP Standard.Base.Selections
ICON select_row
Returns the last row of the table.
@ -2556,8 +2556,8 @@ type DB_Table
This operation is currently not available in the Database backend.
@columns (Widget_Helpers.make_column_name_multi_selector add_regex=True add_by_type=True)
auto_value_types : Vector (Text | Integer | Regex | By_Type) | Text | Integer | Regex | By_Type -> Boolean -> Boolean -> Problem_Behavior -> DB_Table
auto_value_types self columns:(Vector (Text | Integer | Regex | By_Type) | Text | Integer | Regex | By_Type)=self.column_names shrink_types:Boolean=False error_on_missing_columns:Boolean=True on_problems:Problem_Behavior=..Report_Warning =
auto_cast : Vector (Text | Integer | Regex | By_Type) | Text | Integer | Regex | By_Type -> Boolean -> Boolean -> Problem_Behavior -> DB_Table
auto_cast self columns:(Vector (Text | Integer | Regex | By_Type) | Text | Integer | Regex | By_Type)=self.column_names shrink_types:Boolean=False error_on_missing_columns:Boolean=True on_problems:Problem_Behavior=..Report_Warning =
    # The arguments mirror the in-memory `Table.auto_cast` signature but are
    # not used here; they are bound to `_` (presumably to avoid
    # unused-argument warnings).
    _ = [columns, shrink_types, error_on_missing_columns, on_problems]
    # Always fails: the operation is not available in the Database backend.
    # The message names the current method (`auto_cast`), not the old
    # `auto_value_types` name.
    Error.throw (Unsupported_Database_Operation.Error "DB_Table.auto_cast is not supported in the Database backends.")
@ -2938,13 +2938,14 @@ type DB_Table
Applies the specified cleansings to the text in each row of the specified columns.
Arguments:
- from: The column(s) to cleanse.
- remove: A vector of the text cleansings to remove from the text. The text cleansings are
applied in the order they are provided. The same text cleansing can be used multiple
times. The text cleansings are:
- ..Leading_Whitespace: Removes all whitspace from the start of the string.
- ..Trailing_Whitespace: Removes all whitspace from the end of the string.
- ..Duplicate_Whitespace: Removes all duplicate whitspace from the string replacing it with the first whitespace character of the duplicated block.
- ..All_Whitespace: Removes all whitspace from the string.
- ..Leading_Whitespace: Removes all whitespace from the start of the string.
- ..Trailing_Whitespace: Removes all whitespace from the end of the string.
- ..Duplicate_Whitespace: Removes all duplicate whitespace from the string replacing it with the first whitespace character of the duplicated block.
- ..All_Whitespace: Removes all whitespace from the string.
- ..Leading_Numbers: Removes all numbers from the start of the string.
- ..Trailing_Numbers: Removes all numbers from the end of the string.
- ..Non_ASCII: Removes all non-ascii characters from the string.
@ -2953,16 +2954,15 @@ type DB_Table
- ..Numbers: Removes all numeric characters from the string.
- ..Punctuation: Removes all characters in the set ,.!?():;'" from the string.
- ..Symbols: Removes anything that isn't letters, numbers or whitespace from the string.
- from: The column(s) to cleanse.
> Example
Remove leading and trailing spaces from cells.
table.text_cleanse [..Leading_Whitespace, ..Trailing_Whitespace]
@remove make_data_cleanse_vector_selector
table.text_cleanse ["Input"] [..Leading_Whitespace, ..Trailing_Whitespace]
@from (Widget_Helpers.make_column_name_multi_selector add_regex=True add_by_type=True)
text_cleanse : Vector Named_Pattern -> Vector (Integer | Text | Regex | By_Type) -> DB_Table
text_cleanse self remove from:(Vector (Integer | Text | Regex | By_Type)) =
@remove make_data_cleanse_vector_selector
text_cleanse : Vector (Integer | Text | Regex | By_Type) -> Vector Named_Pattern -> DB_Table
text_cleanse self from:(Vector (Integer | Text | Regex | By_Type)) remove =
transformer col = col.text_cleanse remove
Table_Helpers.replace_columns_with_transformed_columns self from transformer

View File

@ -6,6 +6,7 @@ import Standard.Base.Errors.Common.Arithmetic_Error
import Standard.Base.Errors.Common.Incomparable_Values
import Standard.Base.Errors.Common.Index_Out_Of_Bounds
import Standard.Base.Errors.Common.No_Such_Method
import Standard.Base.Errors.Deprecated.Deprecated
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import Standard.Base.Errors.Illegal_State.Illegal_State
import Standard.Base.Internal.Polyglot_Helpers
@ -1962,6 +1963,50 @@ type Column
## GROUP Standard.Base.Conversions
ICON convert
ALIAS auto_value_type
Change the value type of the column to a more specific one, based on its
contents.
Arguments:
- shrink_types: If set `True`, smaller types will be chosen if possible,
according to the rules below. Defaults to `False`.
? Auto Type Selection Rules
- If a `Mixed` column can be assigned a single type, like `Char` or
`Integer`, that will be used.
- Text columns are not parsed. To do that, use the `parse` method.
- If a `Float` column contains only integers, it will be converted to
an Integer column.
- If a `Decimal` column contains only integers that could fit in a
64-bit integer storage, it will be converted to an Integer column.
- If `shrink_types` is `False` (default), no other transformations are
applied.
- However, if `shrink_types` is set to `True`, then:
- Integer columns will be assigned the smallest size that can fit all
values (down to 16-bit integers; converting to the `Byte` type has
to be done manually through `cast`).
- If all elements in a text column have the same length, the type
will become fixed length.
- Otherwise, if a text column is variable length, but all text
elements are no longer than 255 characters, the column will get a
max length of 255. Otherwise, the column size limit will stay
unchanged.
auto_cast : Boolean -> Column
auto_cast self shrink_types=False =
new_value_type = case shrink_types of
False -> self.inferred_precise_value_type
True ->
Storage.to_value_type self.java_column.getStorage.inferPreciseTypeShrunk
# We run with Report_Error because we do not expect any problems.
self.cast new_value_type on_problems=Problem_Behavior.Report_Error
## PRIVATE
GROUP Standard.Base.Conversions
ICON convert
Deprecated: Use `auto_cast` instead.
Change the value type of the column to a more specific one, based on its
contents.
@ -1992,12 +2037,8 @@ type Column
unchanged.
auto_value_type : Boolean -> Column
auto_value_type self shrink_types=False =
new_value_type = case shrink_types of
False -> self.inferred_precise_value_type
True ->
Storage.to_value_type self.java_column.getStorage.inferPreciseTypeShrunk
# We run with Report_Error because we do not expect any problems.
self.cast new_value_type on_problems=Problem_Behavior.Report_Error
Warning.attach (Deprecated.Warning "Standard.Table.Column.Column" "auto_value_type" "Deprecated: `auto_value_type` has been replaced by `auto_cast`.") <|
self.auto_cast shrink_types
## ALIAS transform column
ICON column_add
@ -2191,11 +2232,12 @@ type Column
@index (self-> Numeric_Input minimum=0 maximum=self.length-1)
get : Integer -> Any -> Any | Nothing
get self index:Integer=0 (~default=Nothing) =
valid_index = (index >= 0) && (index < self.length)
if valid_index.not then default else
storage = self.java_column.getStorage
if storage.isNothing index then Nothing else
java_to_enso <| storage.getItem index
if index < 0 && index >= -self.length then self.get (self.length + index) default else
valid_index = (index >= 0) && (index < self.length)
if valid_index.not then default else
storage = self.java_column.getStorage
if storage.isNothing index then Nothing else
java_to_enso <| storage.getItem index
## ICON data_input
Returns a column containing rows of this column.

View File

@ -296,7 +296,7 @@ read_name_map_from_table column_map:Table = case column_map.column_count of
if col.value_type.is_text then col.to_vector else
Error.throw (Illegal_Argument.Error "Expected a table with one or two columns of text values.")
2 ->
if column_map.first_column.value_type.is_text && column_map.second_column.value_type.is_text then Map.from_vector column_map.rows else
if column_map.first_column.value_type.is_text && (column_map.at 1).value_type.is_text then Map.from_vector column_map.rows else
Error.throw (Illegal_Argument.Error "Expected a table with one or two columns of text values.")
_ -> Error.throw (Illegal_Argument.Error "Expected a table with one or two columns of text values.")

View File

@ -304,6 +304,20 @@ type Table
col = self.get selector if_missing=Nothing
if Nothing == col then if_missing else col.get index if_missing
## ALIAS first cell
GROUP Standard.Base.Selections
ICON local_scope4
Gets the top left value from the table.
first_value : Any ! Index_Out_Of_Bounds
first_value self = self.at 0 . at 0
## ALIAS last cell
GROUP Standard.Base.Selections
ICON local_scope4
Gets the bottom right value from the table.
last_value : Any ! Index_Out_Of_Bounds
last_value self = self.at -1 . at -1
## ALIAS first field
GROUP Standard.Base.Selections
ICON select_column
@ -311,13 +325,6 @@ type Table
first_column : Column ! Index_Out_Of_Bounds
first_column self = self.at 0
## ALIAS second field
GROUP Standard.Base.Selections
ICON select_column
Gets the second column
second_column : Column ! Index_Out_Of_Bounds
second_column self = self.at 1
## ALIAS last field
GROUP Standard.Base.Selections
ICON select_column
@ -1277,6 +1284,57 @@ type Table
## GROUP Standard.Base.Conversions
ICON convert
ALIAS auto_value_types
Change the value type of table columns to a more specific one, based on
their contents.
This is most useful for `Mixed` type columns, where it allows narrowing
down the type if all values in the column fit a more specific type.
Arguments:
- columns: The selection of columns to convert.
- shrink_types: If set `True`, smaller types will be chosen if possible,
according to the rules below. Defaults to `False`.
- error_on_missing_columns: Specifies if a missing input column should
result in an error regardless of the `on_problems` settings. Defaults
to `True`.
- on_problems: Specifies how to handle problems if they occur, reporting
them as warnings by default.
? Auto Type Selection Rules
- If a `Mixed` column can be assigned a single type, like `Char` or
`Integer`, that will be used.
- Text columns are not parsed. To do that, use the `parse` method.
- If a `Float` column contains only integers, it will be converted to
an Integer column.
- If a `Decimal` column contains only integers that could fit in a
64-bit integer storage, it will be converted to an Integer column.
- If `shrink_types` is `False` (default), no other transformations are
applied.
- However, if `shrink_types` is set to `True`, then:
- Integer columns will be assigned the smallest size that can fit all
values (down to 16-bit integers; converting to the `Byte` type has
to be done manually through `cast`).
- If all elements in a text column have the same length, the type
will become fixed length.
- Otherwise, if a text column is variable length, but all text
elements are no longer than 255 characters, the column will get a
max length of 255. Otherwise, the column size limit will stay
unchanged.
@columns (Widget_Helpers.make_column_name_multi_selector add_regex=True add_by_type=True)
auto_cast : Vector (Text | Integer | Regex | By_Type) | Text | Integer | Regex | By_Type -> Boolean -> Boolean -> Problem_Behavior -> Table
auto_cast self columns:(Vector (Text | Integer | Regex | By_Type) | Text | Integer | Regex | By_Type)=self.column_names shrink_types:Boolean=False error_on_missing_columns:Boolean=True on_problems:Problem_Behavior=..Report_Warning =
selected = self.columns_helper.select_columns columns Case_Sensitivity.Default reorder=False error_on_missing_columns=error_on_missing_columns on_problems=on_problems error_on_empty=False
selected.fold self table-> column_to_cast->
new_column = column_to_cast.auto_cast shrink_types
table.set new_column as=column_to_cast.name set_mode=Set_Mode.Update
## GROUP Standard.Base.Conversions
ICON convert
Deprecated: Use `auto_cast` instead.
Change the value type of table columns to a more specific one, based on
their contents.
@ -1317,10 +1375,8 @@ type Table
@columns (Widget_Helpers.make_column_name_multi_selector add_regex=True add_by_type=True)
auto_value_types : Vector (Text | Integer | Regex | By_Type) | Text | Integer | Regex | By_Type -> Boolean -> Boolean -> Problem_Behavior -> Table
auto_value_types self columns:(Vector (Text | Integer | Regex | By_Type) | Text | Integer | Regex | By_Type)=self.column_names shrink_types:Boolean=False error_on_missing_columns:Boolean=True on_problems:Problem_Behavior=..Report_Warning =
selected = self.columns_helper.select_columns columns Case_Sensitivity.Default reorder=False error_on_missing_columns=error_on_missing_columns on_problems=on_problems error_on_empty=False
selected.fold self table-> column_to_cast->
new_column = column_to_cast.auto_value_type shrink_types
table.set new_column as=column_to_cast.name set_mode=Set_Mode.Update
Warning.attach (Deprecated.Warning "Standard.Table.Table.Table" "auto_value_types" "Deprecated: `auto_value_types` has been replaced by `auto_cast`.") <|
self.auto_cast columns shrink_types error_on_missing_columns on_problems
## GROUP Standard.Base.Conversions
ICON split
@ -1913,16 +1969,6 @@ type Table
if self.row_count == 0 then Error.throw (Index_Out_Of_Bounds.Error 0 0) else
Row.Value self 0
## GROUP Standard.Base.Selections
ICON select_row
Returns the second row of the table.
In the database backend, it first materializes the table to in-memory.
second_row : Row ! Index_Out_Of_Bounds
second_row self =
if self.row_count < 2 then Error.throw (Index_Out_Of_Bounds.Error 1 self.row_count) else
Row.Value self 1
## GROUP Standard.Base.Selections
ICON select_row
Returns the last row of the table.
@ -2919,16 +2965,17 @@ type Table
## GROUP Standard.Base.Text
ICON column_add
Applies the specified cleansings to the text in each row of the specified columns.
Applies the specified cleansings to the text in each row of the specified columns.
Arguments:
- from: The column(s) to cleanse.
- remove: A vector of the text cleansings to remove from the text. The text cleansings are
applied in the order they are provided. The same text cleansing can be used multiple
times. The text cleansings are:
- ..Leading_Whitespace: Removes all whitspace from the start of the string.
- ..Trailing_Whitespace: Removes all whitspace from the end of the string.
- ..Duplicate_Whitespace: Removes all duplicate whitspace from the string replacing it with the first whitespace character of the duplicated block.
- ..All_Whitespace: Removes all whitspace from the string.
- ..Leading_Whitespace: Removes all whitespace from the start of the string.
- ..Trailing_Whitespace: Removes all whitespace from the end of the string.
- ..Duplicate_Whitespace: Removes all duplicate whitespace from the string replacing it with the first whitespace character of the duplicated block.
- ..All_Whitespace: Removes all whitespace from the string.
- ..Leading_Numbers: Removes all numbers from the start of the string.
- ..Trailing_Numbers: Removes all numbers from the end of the string.
- ..Non_ASCII: Removes all non-ascii characters from the string.
@ -2937,16 +2984,15 @@ type Table
- ..Numbers: Removes all numeric characters from the string.
- ..Punctuation: Removes all characters in the set ,.!?():;'" from the string.
- ..Symbols: Removes anything that isn't letters, numbers or whitespace from the string.
- from: The column(s) to cleanse.
> Example
Remove leading and trailing spaces from cells.
table.text_cleanse [..Leading_Whitespace, ..Trailing_Whitespace]
@remove make_data_cleanse_vector_selector
table.text_cleanse ["Input"] [..Leading_Whitespace, ..Trailing_Whitespace]
@from (Widget_Helpers.make_column_name_multi_selector add_regex=True add_by_type=True)
text_cleanse : Vector Named_Pattern -> Vector (Integer | Text | Regex | By_Type) -> Table
text_cleanse self remove from:(Vector (Integer | Text | Regex | By_Type)) =
@remove make_data_cleanse_vector_selector
text_cleanse : Vector (Integer | Text | Regex | By_Type) -> Vector Named_Pattern -> Table
text_cleanse self from:(Vector (Integer | Text | Regex | By_Type)) remove =
transformer col = col.text_cleanse remove
Table_Helpers.replace_columns_with_transformed_columns self from transformer

View File

@ -677,8 +677,8 @@ add_specs suite_builder setup =
group_builder.specify "should report unsupported" <|
t = table_builder [["X", [1, 2, 3]]]
t.auto_value_types . should_fail_with Unsupported_Database_Operation
t.at "X" . auto_value_type . should_fail_with Unsupported_Database_Operation
t.auto_cast . should_fail_with Unsupported_Database_Operation
t.at "X" . auto_cast . should_fail_with Unsupported_Database_Operation
# The in-memory functionality of `expand_column` is tested in test/Table_Tests/src/In_Memory/Table_Conversion_Spec.enso
if setup.is_database then suite_builder.group prefix+"Table.expand_column" group_builder->
@ -733,7 +733,7 @@ add_specs suite_builder setup =
t1.at "mixed_time" . value_type . should_equal Value_Type.Mixed
t1.at "bools" . value_type . should_equal Value_Type.Mixed
t2 = t1.auto_value_types shrink_types=shrink_types
t2 = t1.auto_cast shrink_types=shrink_types
# Depending on shrink_types value the size of the Char/Integer types may vary - exact details tested elsewhere.
t2.at "strs" . value_type . should_be_a (Value_Type.Char ...)
t2.at "ints" . value_type . should_be_a (Value_Type.Integer ...)
@ -750,18 +750,18 @@ add_specs suite_builder setup =
t0 = table_builder [["strs", [mixer, "a", "b"]], ["ints", [mixer, 2, 3]], ["floats", [mixer, 1.5, 2.5]]]
t1 = t0.drop 1
t2 = t1.auto_value_types []
t2 = t1.auto_cast []
t2.at "strs" . value_type . should_equal Value_Type.Mixed
t2.at "ints" . value_type . should_equal Value_Type.Mixed
t2.at "floats" . value_type . should_equal Value_Type.Mixed
t3 = t1.auto_value_types ["strs"]
t3 = t1.auto_cast ["strs"]
t3.at "strs" . value_type . should_equal Value_Type.Char
t3.at "ints" . value_type . should_equal Value_Type.Mixed
t3.at "floats" . value_type . should_equal Value_Type.Mixed
# should match ints and floats but not strs
t4 = t1.auto_value_types "[if].*".to_regex
t4 = t1.auto_cast "[if].*".to_regex
t4.at "strs" . value_type . should_equal Value_Type.Mixed
t4.at "ints" . value_type . should_equal Value_Type.Integer
t4.at "floats" . value_type . should_equal Value_Type.Float
@ -771,7 +771,7 @@ add_specs suite_builder setup =
t0 = table_builder [["X", [1.0, 2.0, 3.0]], ["Y", [mixer, 2.5, 3.0]]]
t1 = t0.drop 1
t2 = t1.auto_value_types [..By_Type ..Float]
t2 = t1.auto_cast [..By_Type ..Float]
t2.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
t2.at "Y" . value_type . should_equal Value_Type.Mixed
@ -781,7 +781,7 @@ add_specs suite_builder setup =
t1.at "Y" . value_type . should_equal Value_Type.Float
t1.at "Z" . value_type . should_equal Value_Type.Float
t2 = t1.auto_value_types shrink_types=False
t2 = t1.auto_cast shrink_types=False
t2.at "X" . to_vector . should_equal [1, 2, 3]
t2.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
t2.at "Y" . value_type . should_equal Value_Type.Float
@ -795,7 +795,7 @@ add_specs suite_builder setup =
group_builder.specify "will not parse text columns" <|
t1 = table_builder [["X", ["1", "2", "3"]]]
c2 = t1.at "X" . auto_value_type
c2 = t1.at "X" . auto_cast
c2.value_type . should_equal Value_Type.Char
group_builder.specify "will 'undo' a cast to Mixed" <|
@ -804,7 +804,7 @@ add_specs suite_builder setup =
t2.at "X" . value_type . should_equal Value_Type.Mixed
t2.at "Y" . value_type . should_equal Value_Type.Mixed
t3 = t2.auto_value_types
t3 = t2.auto_cast
t3.at "X" . value_type . should_equal Value_Type.Integer
t3.at "Y" . value_type . should_equal Value_Type.Char
@ -813,7 +813,7 @@ add_specs suite_builder setup =
c1 = c0.drop 1
c1.value_type . should_equal Value_Type.Mixed
c2 = c1.auto_value_type
c2 = c1.auto_cast
c2.value_type . should_be_a (Value_Type.Decimal ...)
c2.to_vector . should_equal [1, 2, (2^100)+1]
@ -831,13 +831,13 @@ add_specs suite_builder setup =
True -> t1.at "F" . value_type . should_equal Value_Type.Mixed
False -> t1.at "F" . value_type . should_equal Value_Type.Float
t2 = t1.auto_value_types shrink_types=False
t2 = t1.auto_cast shrink_types=False
t2.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
t2.at "Y" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
t2.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
t2.at "F" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
t3 = t1.auto_value_types shrink_types=True
t3 = t1.auto_cast shrink_types=True
# Even though X's values are small enough to fit in a Byte, we stick to 16-bit Integers.
t3.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_16)
t3.at "Y" . value_type . should_equal (Value_Type.Integer Bits.Bits_32)
@ -850,7 +850,7 @@ add_specs suite_builder setup =
c1.value_type . should_equal Value_Type.Byte
[True, False].each shrink_types->
c2 = c1.auto_value_type shrink_types=shrink_types
c2 = c1.auto_cast shrink_types=shrink_types
c2.value_type . should_equal Value_Type.Byte
group_builder.specify "Decimal (scale=0, i.e. integer) columns should also be shrinked if possible and shrink_types=True" <|
@ -861,7 +861,7 @@ add_specs suite_builder setup =
t1.at "Y" . value_type . should_equal (Value_Type.Decimal scale=0)
t1.at "Z" . value_type . should_equal (Value_Type.Decimal scale=0)
t2 = t1.auto_value_types shrink_types=False
t2 = t1.auto_cast shrink_types=False
# Without shrinking we get an integer type, but not the smallest one - just the default 64-bit.
t2.at "X" . to_vector . should_equal [1, 2, 3]
@ -869,7 +869,7 @@ add_specs suite_builder setup =
t2.at "Y" . value_type . should_equal (Value_Type.Decimal scale=0)
t2.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
t3 = t1.auto_value_types shrink_types=True
t3 = t1.auto_cast shrink_types=True
t3.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_16)
t3.at "Y" . value_type . should_equal (Value_Type.Decimal scale=0)
t3.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_16)
@ -885,20 +885,20 @@ add_specs suite_builder setup =
True -> c1.value_type . should_equal Value_Type.Mixed
False -> c1.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True)
c2 = c1.auto_value_type shrink_types=False
c2 = c1.auto_cast shrink_types=False
c2.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True)
c3 = c1.auto_value_type shrink_types=True
c3 = c1.auto_cast shrink_types=True
c3.value_type . should_equal (Value_Type.Char size=2 variable_length=False)
c4 = table_builder [["X", ["a", "x", "y"]]] . at "X" . cast (Value_Type.Char size=100 variable_length=True)
c4.to_vector . should_equal ["a", "x", "y"]
c4.value_type . should_equal (Value_Type.Char size=100 variable_length=True)
c5 = c4.auto_value_type shrink_types=False
c5 = c4.auto_cast shrink_types=False
c5.value_type . should_equal (Value_Type.Char size=100 variable_length=True)
c6 = c4.auto_value_type shrink_types=True
c6 = c4.auto_cast shrink_types=True
c6.value_type . should_equal (Value_Type.Char size=1 variable_length=False)
group_builder.specify "if all text values are empty string, the type will remain unchanged" <|
@ -909,10 +909,10 @@ add_specs suite_builder setup =
c2.value_type . should_equal (Value_Type.Char size=100 variable_length=True)
[True, False].each shrink_types->
c1_b = c1.auto_value_type shrink_types=shrink_types
c1_b = c1.auto_cast shrink_types=shrink_types
c1_b.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True)
c2_b = c2.auto_value_type shrink_types=shrink_types
c2_b = c2.auto_cast shrink_types=shrink_types
c2_b.value_type . should_equal (Value_Type.Char size=100 variable_length=True)
group_builder.specify "if all text values fit under 255 characters, will add a 255 length limit (if shrink_types=True)" <|
@ -921,7 +921,7 @@ add_specs suite_builder setup =
t2 = t1 . set (t1.at "short_unbounded" . cast (Value_Type.Char size=1000)) "short_1000" . set (t1.at "short_unbounded" . cast (Value_Type.Char size=10)) "short_10" . set (t1.at "long_unbounded" . cast (Value_Type.Char size=400)) "long_400" . set (t1.at "short_unbounded" . cast Value_Type.Mixed) "short_mixed"
t2.at "short_mixed" . value_type . should_equal Value_Type.Mixed
t3 = t2.auto_value_types shrink_types=False
t3 = t2.auto_cast shrink_types=False
t3.at "short_unbounded" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True)
t3.at "short_1000" . value_type . should_equal (Value_Type.Char size=1000 variable_length=True)
t3.at "short_10" . value_type . should_equal (Value_Type.Char size=10 variable_length=True)
@ -930,7 +930,7 @@ add_specs suite_builder setup =
t3.at "long_unbounded" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True)
t3.at "long_400" . value_type . should_equal (Value_Type.Char size=400 variable_length=True)
t4 = t2.auto_value_types shrink_types=True
t4 = t2.auto_cast shrink_types=True
# Short ones get shortened to 255 unless they were shorter already.
t4.at "short_unbounded" . value_type . should_equal (Value_Type.Char size=255 variable_length=True)
t4.at "short_1000" . value_type . should_equal (Value_Type.Char size=255 variable_length=True)
@ -950,7 +950,7 @@ add_specs suite_builder setup =
t1.at "str" . value_type . should_equal Value_Type.Char
t1.at "decimal" . value_type . should_equal (Value_Type.Decimal scale=0)
t2 = t1.auto_value_types shrink_types=False
t2 = t1.auto_cast shrink_types=False
t2.at "mix" . value_type . should_equal Value_Type.Mixed
t2.at "int" . value_type . should_equal Value_Type.Integer
## Technically, if there are no elements, "all of elements" are
@ -961,7 +961,7 @@ add_specs suite_builder setup =
t1.at "decimal" . value_type . should_equal (Value_Type.Decimal scale=0)
t2.at "str" . value_type . should_equal Value_Type.Char
t3 = t1.auto_value_types shrink_types=True
t3 = t1.auto_cast shrink_types=True
t3.at "mix" . value_type . should_equal Value_Type.Mixed
# Technically, if there are no elements, then they can be fit inside of the smallest types available:
t3.at "int" . value_type . should_equal (Value_Type.Integer Bits.Bits_16)

View File

@ -105,10 +105,6 @@ add_specs suite_builder setup =
column_4.name . should_equal "foo"
column_4.to_vector . should_equal [1, 2, 3]
column_5 = data.table.second_column
column_5.name . should_equal "bar"
column_5.to_vector . should_equal [4, 5, 6]
column_6 = data.table.last_column
column_6.name . should_equal "abcd123"
column_6.to_vector . should_equal [19, 20, 21]
@ -312,13 +308,6 @@ add_specs suite_builder setup =
first_row.at "Y" . should_equal 5
first_row.at "Z" . should_equal "A"
group_builder.specify "should let you get the second row" <|
second_row = data.table.second_row
second_row . length . should_equal 3
second_row.at "X" . should_equal 2
second_row.at "Y" . should_equal 6
second_row.at "Z" . should_equal "B"
group_builder.specify "should let you get the last row" <|
last_row = data.table.last_row
last_row . length . should_equal 3

View File

@ -46,7 +46,7 @@ add_specs suite_builder setup =
clean_flight = ["Flight", ["BA0123", "BA0123 ", "SG0456 ", "BA 0123", "S G 0 4 5 6 "]]
clean_passenger = ["Passenger", ["Albert Einstein", "Marie Curie ", "Isaac Newton ", "Stephen Hawking", "A d a Lovelace "]]
expected_table = Table.new [clean_flight, clean_passenger, ticket_price]
res = table.text_cleanse [Named_Pattern.Leading_Whitespace] ["Flight", "Passenger"]
res = table.text_cleanse ["Flight", "Passenger"] [..Leading_Whitespace]
case res.is_error && setup.is_database of
True ->
res.should_fail_with Unsupported_Database_Operation
@ -58,7 +58,7 @@ add_specs suite_builder setup =
clean_flight = ["Flight", ["BA0123", "BA0123 ", "SG0456 ", "BA 0123", "S G 0 4 5 6 "]]
clean_passenger = ["Passenger", ["Albert Einstein", "Marie Curie ", "Isaac Newton ", "Stephen Hawking", "A d a Lovelace "]]
expected_table = Table.new [clean_flight, clean_passenger, ticket_price]
res = table.text_cleanse [Named_Pattern.Leading_Whitespace] [(regex "Fl.*"), (regex "P.*")]
res = table.text_cleanse [(regex "Fl.*"), (regex "P.*")] [..Leading_Whitespace]
case res.is_error && setup.is_database of
True ->
res.should_fail_with Unsupported_Database_Operation
@ -70,7 +70,7 @@ add_specs suite_builder setup =
clean_flight = ["Flight", ["BA0123", "BA0123 ", "SG0456 ", "BA 0123", "S G 0 4 5 6 "]]
clean_passenger = ["Passenger", ["Albert Einstein", "Marie Curie ", "Isaac Newton ", "Stephen Hawking", "A d a Lovelace "]]
expected_table = Table.new [clean_flight, clean_passenger, ticket_price]
res = table.text_cleanse [Named_Pattern.Leading_Whitespace] [..By_Type ..Char]
res = table.text_cleanse [..By_Type ..Char] [..Leading_Whitespace]
case res.is_error && setup.is_database of
True ->
res.should_fail_with Unsupported_Database_Operation
@ -79,72 +79,72 @@ add_specs suite_builder setup =
r.length . should_equal 5
r.should_equal (expected_table . rows . map .to_vector)
group_builder.specify "should error if applied to non-text column" <|
table.text_cleanse [Named_Pattern.Leading_Whitespace] ["Ticket Price"] . should_fail_with Invalid_Value_Type
table.text_cleanse ["Ticket Price"] [..Leading_Whitespace] . should_fail_with Invalid_Value_Type
suite_builder.group "Column Text Cleanse" group_builder->
test_col = Column.from_vector "Test" [" It was", "the best ", "of times", " it was the worst of times "]
group_builder.specify "should remove leading whitespace" <|
expected_col = Column.from_vector "Test" ["It was", "the best ", "of times", "it was the worst of times "]
res = test_col.text_cleanse [Named_Pattern.Leading_Whitespace]
res = test_col.text_cleanse [..Leading_Whitespace]
res.should_equal expected_col
group_builder.specify "should remove trailing whitespace" <|
expected_col = Column.from_vector "Test" [" It was", "the best", "of times", " it was the worst of times"]
res = test_col.text_cleanse [Named_Pattern.Trailing_Whitespace]
res = test_col.text_cleanse [..Trailing_Whitespace]
res.should_equal expected_col
group_builder.specify "should remove duplicate whitespace" <|
expected_col = Column.from_vector "Test" [" It was", "the best ", "of times", " it was the worst of times "]
res = test_col.text_cleanse [Named_Pattern.Duplicate_Whitespace]
res = test_col.text_cleanse [..Duplicate_Whitespace]
res.should_equal expected_col
group_builder.specify "should remove leading and trailing whitespace" <|
expected_col = Column.from_vector "Test" ["It was", "the best", "of times", "it was the worst of times"]
res = test_col.text_cleanse [Named_Pattern.Leading_Whitespace, Named_Pattern.Trailing_Whitespace]
res = test_col.text_cleanse [..Leading_Whitespace, ..Trailing_Whitespace]
res.should_equal expected_col
group_builder.specify "should remove all whitespace" <|
expected_col = Column.from_vector "Test" ["Itwas", "thebest", "oftimes", "itwastheworstoftimes"]
res = test_col.text_cleanse [Named_Pattern.All_Whitespace]
res = test_col.text_cleanse [..All_Whitespace]
res.should_equal expected_col
test_col_with_nums = Column.from_vector "Test" ["1It was", "the best2", "3of times4", " 1984 it was the worst of times 72"]
group_builder.specify "should remove leading numbers" <|
expected_col = Column.from_vector "Test" ["It was", "the best2", "of times4", " 1984 it was the worst of times 72"]
res = test_col_with_nums.text_cleanse [Named_Pattern.Leading_Numbers]
res = test_col_with_nums.text_cleanse [..Leading_Numbers]
res.should_equal expected_col
group_builder.specify "should remove trailing numbers" <|
expected_col = Column.from_vector "Test" ["1It was", "the best", "3of times", " 1984 it was the worst of times "]
res = test_col_with_nums.text_cleanse [Named_Pattern.Trailing_Numbers]
res = test_col_with_nums.text_cleanse [..Trailing_Numbers]
res.should_equal expected_col
test_col_with_non_ascii_chars = Column.from_vector "Test" [" It was the 🥇", "of 🕒s", " it was the 𒀂 of 🕒s "]
group_builder.specify "should remove non-ascii characters" <|
expected_col = Column.from_vector "Test" [" It was the ", "of s", " it was the of s "]
res = test_col_with_non_ascii_chars.text_cleanse [Named_Pattern.Non_ASCII]
res = test_col_with_non_ascii_chars.text_cleanse [..Non_ASCII]
res.should_equal expected_col
group_builder.specify "should remove tabs" <|
test_col_with_tabs = Column.from_vector "Test" [' It was\t the best', 'of times it was the worst\t of times ']
expected_col = Column.from_vector "Test" [" It was the best", "of times it was the worst of times "]
res = test_col_with_tabs.text_cleanse [Named_Pattern.Tabs]
res = test_col_with_tabs.text_cleanse [..Tabs]
res.should_equal expected_col
group_builder.specify "should remove numbers and letters" <|
test_col_with_nums_and_letters = Column.from_vector "Test" ["1A2B3C4", "5D6E7F8", "9G0H1I2", "3J4K5L6"]
res1 = test_col_with_nums_and_letters.text_cleanse [Named_Pattern.Numbers]
res2 = test_col_with_nums_and_letters.text_cleanse [Named_Pattern.Letters]
res3 = test_col_with_nums_and_letters.text_cleanse [Named_Pattern.Letters, Named_Pattern.Numbers]
res1 = test_col_with_nums_and_letters.text_cleanse [..Numbers]
res2 = test_col_with_nums_and_letters.text_cleanse [..Letters]
res3 = test_col_with_nums_and_letters.text_cleanse [..Letters, ..Numbers]
res1.should_equal (Column.from_vector "Test" ["ABC", "DEF", "GHI", "JKL"])
res2.should_equal (Column.from_vector "Test" ["1234", "5678", "9012", "3456"])
res3.should_equal (Column.from_vector "Test" ["", "", "", ""])
group_builder.specify "should remove punctuation" <|
test_col_with_punctuation = Column.from_vector "Test" ['Hello, World!', 'How are you?', ',.!?():;\'"']
res = test_col_with_punctuation.text_cleanse [Named_Pattern.Punctuation]
res = test_col_with_punctuation.text_cleanse [..Punctuation]
res.should_equal (Column.from_vector "Test" ["Hello World", "How are you", ""])
group_builder.specify "should remove symbols" <|
test_col_with_symbols = Column.from_vector "Test" ['Hello, World123!', 'How_are_you?', ',.!?():;\'"', '🥇🕒🕒']
res = test_col_with_symbols.text_cleanse [Named_Pattern.Symbols]
res = test_col_with_symbols.text_cleanse [..Symbols]
res.should_equal (Column.from_vector "Test" ["Hello World123", "Howareyou", "", ""])
group_builder.specify "should error if applied to non-text column" <|
test_col_num = Column.from_vector "Test" [1, 2, 3, 4]
test_col_num.text_cleanse [Named_Pattern.Numbers] . should_fail_with Invalid_Value_Type
test_col_num.text_cleanse [..Numbers] . should_fail_with Invalid_Value_Type
group_builder.specify "should apply the operations in order" <|
test_col_with_mixed_chars = Column.from_vector "Test" [" 11String with Leading Spaces then Leading Numbers", "22 String with Leading Numbers then Leading Spaces"]
res1 = test_col_with_mixed_chars.text_cleanse [Named_Pattern.Leading_Whitespace, Named_Pattern.Leading_Numbers]
res2 = test_col_with_mixed_chars.text_cleanse [Named_Pattern.Leading_Numbers, Named_Pattern.Leading_Whitespace]
res3 = test_col_with_mixed_chars.text_cleanse [Named_Pattern.Leading_Numbers, Named_Pattern.Leading_Whitespace, Named_Pattern.Leading_Numbers]
res1 = test_col_with_mixed_chars.text_cleanse [..Leading_Whitespace, ..Leading_Numbers]
res2 = test_col_with_mixed_chars.text_cleanse [..Leading_Numbers, ..Leading_Whitespace]
res3 = test_col_with_mixed_chars.text_cleanse [..Leading_Numbers, ..Leading_Whitespace, ..Leading_Numbers]
expected_col1 = Column.from_vector "Test" ["String with Leading Spaces then Leading Numbers", " String with Leading Numbers then Leading Spaces"]
expected_col2 = Column.from_vector "Test" ["11String with Leading Spaces then Leading Numbers", "String with Leading Numbers then Leading Spaces"]
expected_col3 = Column.from_vector "Test" ["String with Leading Spaces then Leading Numbers", "String with Leading Numbers then Leading Spaces"]