Adjust {Table|Column}.parse to use Value_Type (#6213)

Closes #5660
This commit is contained in:
Radosław Waśko 2023-04-06 12:58:55 +02:00 committed by GitHub
parent 6a09f12f3c
commit f5db35af07
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 223 additions and 170 deletions

View File

@ -902,7 +902,7 @@ type Column
## Parsing values is not supported in database columns.
@type Widget_Helpers.parse_type_selector
parse : (Auto|Integer|Decimal|Date|Date_Time|Time_Of_Day|Boolean) -> Text | Data_Formatter -> Problem_Behavior -> Column
parse : Value_Type | Auto -> Text | Data_Formatter -> Problem_Behavior -> Column
parse self type=Auto format=Data_Formatter.Value on_problems=Report_Warning =
_ = [type, format, on_problems]
Error.throw <| Unsupported_Database_Operation.Error "`Column.parse` is not implemented yet for the Database backends."

View File

@ -1382,8 +1382,8 @@ type Table
## Parsing values is not supported in database tables, the table has to be
loaded into memory first with `read`.
parse_values : Text | Integer | Column_Selector | Vector (Text | Integer | Column_Selector) -> (Auto|Integer|Decimal|Date|Date_Time|Time_Of_Day|Boolean) -> Text | Data_Formatter -> Boolean -> Problem_Behavior -> Table
parse_values columns=(self.columns . filter (c-> c.value_type.is_text) . map .name) type=Auto format=Data_Formatter.Value error_on_missing_columns=True on_problems=Report_Warning =
parse : Text | Integer | Column_Selector | Vector (Text | Integer | Column_Selector) -> Value_Type | Auto -> Text | Data_Formatter -> Boolean -> Problem_Behavior -> Table
parse columns=(self.columns . filter (c-> c.value_type.is_text) . map .name) type=Auto format=Data_Formatter.Value error_on_missing_columns=True on_problems=Report_Warning =
## Avoid unused arguments warning. We cannot rename arguments to `_`,
because we need to keep the API consistent with the in-memory table.
_ = [columns, type, format, error_on_missing_columns, on_problems]

View File

@ -16,7 +16,6 @@ import project.Internal.Widget_Helpers
from project.Data.Table import print_table
from project.Data.Type.Value_Type import Value_Type, Auto
from project.Data.Type.Value_Type_Helpers import ensure_valid_parse_target
from project.Errors import No_Index_Set_Error, Floating_Point_Equality, Invalid_Value_Type
polyglot java import org.enso.table.data.column.operation.map.MapOperationProblemBuilder
@ -1029,15 +1028,16 @@ type Column
example_contains = Examples.text_column_1.parse Boolean 'Yes|No'
@type Widget_Helpers.parse_type_selector
parse : (Auto|Integer|Decimal|Date|Date_Time|Time_Of_Day|Boolean) -> Text | Data_Formatter -> Problem_Behavior -> Column
parse : Value_Type | Auto -> Text | Data_Formatter -> Problem_Behavior -> Column
parse self type=Auto format=Data_Formatter.Value on_problems=Report_Warning =
Value_Type.expect_text self.value_type related_column=self.name <| ensure_valid_parse_target type <|
Value_Type.expect_text self.value_type related_column=self.name <|
formatter = case format of
_ : Text ->
Data_Formatter.Value.with_format type format
_ -> format
_ : Data_Formatter -> format
_ -> Error.throw (Illegal_Argument.Error "Invalid format type. Expected Text or Data_Formatter.")
parser = if type == Auto then formatter.make_auto_parser else formatter.make_datatype_parser type
parser = formatter.make_value_type_parser type
storage = self.java_column.getStorage
new_storage_and_problems = parser.parseColumn self.name storage

View File

@ -2,7 +2,7 @@ from Standard.Base import all
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import project.Internal.Parse_Values_Helper
from project.Data.Type.Value_Type import Value_Type, Auto
from project.Data.Type.Value_Type import Value_Type, Auto, Bits
polyglot java import org.enso.table.parsing.IntegerParser
polyglot java import org.enso.table.parsing.DecimalParser
@ -67,16 +67,15 @@ type Data_Formatter
Arguments:
- text: Text value to parse.
- datatype: Text value to parse.
- datatype: The expected Enso type to parse the value into. If set to
`Auto`, the type will be inferred automatically.
- on_problems: Specifies the behavior when a problem occurs.
By default, a warning is issued, but the operation proceeds.
If set to `Report_Error`, the operation fails with a dataflow error.
If set to `Ignore`, the operation proceeds without errors or warnings.
parse : Text -> (Auto|Integer|Number|Date|Date_Time|Time_Of_Day|Boolean) -> Problem_Behavior -> Any
parse self text datatype=Auto on_problems=Problem_Behavior.Report_Warning =
parser = case datatype of
Auto -> self.make_auto_parser
_ -> self.make_datatype_parser datatype
parser = self.make_datatype_parser datatype
result = parser.parseIndependentValue text
problems = Vector.from_polyglot_array result.problems . map (Parse_Values_Helper.translate_parsing_problem datatype)
on_problems.attach_problems_after result.value problems
@ -145,23 +144,25 @@ type Data_Formatter
It is mostly a convenience function to easily specify a datatype format.
Arguments:
- type: The datatype for which to change the format. The format can be
changed only for Date_Time, Date, Time_Of_Day and Boolean types.
- type: The value type for which to change the format. The format can be
changed only for `Date_Time`, `Date`, `Time` and `Boolean` value types.
- format: The new format string to set. For dates, it is the usual date
format notation, and for booleans it should be two values that
represent true and false, separated by a `|`.
with_format : (Auto|Integer|Number|Date|Date_Time|Time_Of_Day|Boolean) -> Text -> Data_Formatter
with_format : Value_Type | Auto -> Text -> Data_Formatter
with_format self type format = case type of
Auto -> Error.throw (Illegal_Argument.Error "Cannot specify a `format` with type `Auto`.")
Integer -> Error.throw (Illegal_Argument.Error "Cannot specify a `format` with type `Integer`.")
Decimal -> Error.throw (Illegal_Argument.Error "Cannot specify a `format` with type `Decimal`.")
Date -> self.with_datetime_formats date_formats=[format]
Date_Time -> self.with_datetime_formats datetime_formats=[format]
Time_Of_Day -> self.with_datetime_formats time_formats=[format]
Boolean ->
Value_Type.Date -> self.with_datetime_formats date_formats=[format]
Value_Type.Time -> self.with_datetime_formats time_formats=[format]
Value_Type.Date_Time _ ->
self.with_datetime_formats datetime_formats=[format]
Value_Type.Boolean ->
formats = format.split "|"
if formats.length != 2 then Error.throw (Illegal_Argument.Error "The `format` for Booleans must be a string with two values separated by `|`, for example: 'Yes|No'.") else
self.with_boolean_values true_values=[formats.at 0] false_values=[formats.at 1]
Auto ->
Error.throw (Illegal_Argument.Error "Cannot specify a `format` with type `Auto`.")
_ : Value_Type ->
Error.throw (Illegal_Argument.Error "Cannot specify a `format` for type `"+type.to_text+"`.")
## PRIVATE
Clone the instance with some properties overridden.
@ -216,7 +217,26 @@ type Data_Formatter
Date -> self.make_date_parser
Date_Time -> self.make_date_time_parser
Time_Of_Day -> self.make_time_of_day_parser
_ -> Error.throw (Illegal_Argument.Error "Unsupported datatype: "+datatype.to_text)
Auto -> self.make_auto_parser
_ ->
type_name = case datatype.to_text of
text : Text -> text
_ -> Meta.meta datatype . to_text
Error.throw (Illegal_Argument.Error "Unsupported datatype: "+type_name)
## PRIVATE
make_value_type_parser self value_type = case value_type of
# Selects the parser of this formatter that corresponds to the requested
# `value_type`. Only the exact shapes listed in the branches below are
# matched; every other value reaches the final branch and results in an
# `Illegal_Argument` error.
# TODO once we implement #5159 we will need to add checks for bounds here and support 16/32-bit ints
Value_Type.Integer Bits.Bits_64 -> self.make_integer_parser
# TODO once we implement #6109 we can support 32-bit floats
Value_Type.Float Bits.Bits_64 -> self.make_decimal_parser
Value_Type.Boolean -> self.make_boolean_parser
Value_Type.Date -> self.make_date_parser
# Only `Date_Time` whose argument is `True` is matched here; a `Date_Time`
# with any other argument falls through to the unsupported branch.
# NOTE(review): presumably the argument is the time-zone flag - confirm
# against the `Value_Type` definition.
Value_Type.Date_Time True -> self.make_date_time_parser
Value_Type.Time -> self.make_time_of_day_parser
# `Auto` is not a `Value_Type` constructor; it is handled by the auto parser.
Auto -> self.make_auto_parser
_ ->
# Anything not matched above is rejected, using the display text of the
# value in the error message.
Error.throw (Illegal_Argument.Error "Unsupported value type: "+value_type.to_display_text)
## PRIVATE
get_specific_type_parsers self =

View File

@ -38,7 +38,6 @@ import project.Data.Expression.Expression_Error
import project.Delimited.Delimited_Format.Delimited_Format
from project.Data.Type.Value_Type import Value_Type, Auto
from project.Data.Type.Value_Type_Helpers import ensure_valid_parse_target
from project.Internal.Rows_View import Rows_View
from project.Errors import all
@ -796,31 +795,31 @@ type Table
> Example
Parse the first and last columns containing Yes/No values as booleans.
table.parse_values columns=[0, -1] type=Boolean format="Yes|No"
table.parse columns=[0, -1] type=Boolean format="Yes|No"
> Example
Parse dates in a column in the format `yyyy-MM-dd` (the default format).
table.parse_values "birthday" Date
table.parse "birthday" Date
> Example
Parse dates in a column in the format `dd/MM/yyyy`.
table.parse_values "birthday" Date 'dd/MM/yyyy'
table.parse "birthday" Date 'dd/MM/yyyy'
> Example
Parse all columns inferring their types, using `,` as the decimal point for numbers.
table.parse_values format=(Data_Formatter.Value.with_number_formatting decimal_point=',')
parse_values : Text | Integer | Column_Selector | Vector (Text | Integer | Column_Selector) -> (Auto|Integer|Decimal|Date|Date_Time|Time_Of_Day|Boolean) -> Text | Data_Formatter -> Boolean -> Problem_Behavior -> Table
parse_values self columns=(self.columns . filter (c-> c.value_type.is_text) . map .name) type=Auto format=Data_Formatter.Value error_on_missing_columns=True on_problems=Report_Warning = ensure_valid_parse_target type <|
table.parse format=(Data_Formatter.Value.with_number_formatting decimal_point=',')
parse : Text | Integer | Column_Selector | Vector (Text | Integer | Column_Selector) -> Value_Type | Auto -> Text | Data_Formatter -> Boolean -> Problem_Behavior -> Table
parse self columns=(self.columns . filter (c-> c.value_type.is_text) . map .name) type=Auto format=Data_Formatter.Value error_on_missing_columns=True on_problems=Report_Warning =
formatter = case format of
_ : Text ->
Data_Formatter.Value.with_format type format
_ -> format
_ : Data_Formatter -> format
_ -> Error.throw (Illegal_Argument.Error "Invalid format type. Expected Text or Data_Formatter.")
parser = if type == Auto then formatter.make_auto_parser else
formatter.make_datatype_parser type
parser = formatter.make_value_type_parser type
select_problem_builder = Problem_Builder.new error_on_missing_columns=error_on_missing_columns
selected_columns = self.columns_helper.select_columns_helper columns reorder=True select_problem_builder

View File

@ -75,11 +75,3 @@ find_common_type types strict =
# Double check if Mixed was really allowed to come out.
if types.contains Value_Type.Mixed then Value_Type.Mixed else
Nothing
## PRIVATE
Checks if the given type is a valid target type for parsing.
This will be replaced once we change parse to rely on `Value_Type` instead.
ensure_valid_parse_target type ~action =
expected_types = [Auto, Integer, Decimal, Date, Date_Time, Time_Of_Day, Boolean]
if expected_types.contains type . not then Error.throw (Illegal_Argument.Error "Unsupported target type "+type.to_text+".") else action

View File

@ -1,6 +1,7 @@
from Standard.Base import all
import Standard.Table.Data.Expression.Expression_Error
import Standard.Table.Data.Type.Value_Type.Value_Type
polyglot java import org.enso.table.error.ColumnCountMismatchException
polyglot java import org.enso.table.error.ColumnNameMismatchException
@ -243,18 +244,18 @@ type Invalid_Location
Arguments:
- column: the column in which the problematic cells appeared, if applicable.
It may be empty if the value is parsed outside of a context of a column.
- datatype: The expected datatype.
- value_type: The expected value type.
- cells: Contents of the cells that did not match the expected datatype
format.
type Invalid_Format
## PRIVATE
Error column:(Text|Nothing) (datatype:(Integer|Number|Date|Time|Time_Of_Day|Boolean)) (cells:[Text])
Error column:(Text|Nothing) (value_type:Value_Type|Integer|Number|Date|Time|Time_Of_Day|Boolean) (cells:[Text])
## PRIVATE
Pretty print the invalid format error.
to_display_text : Text
to_display_text self =
self.cells.length+" cells in column "+self.column+" had invalid format for datatype "+self.datatype.to_text+"."
self.cells.length+" cells in column "+self.column+" had invalid format for type "+self.value_type.to_text+"."
## Indicates that some values contained leading zeros even though these were not allowed.
@ -270,7 +271,7 @@ type Leading_Zeros
## PRIVATE
Pretty print the leading zeros error.
to_display_text : Text
to_display_text self = "Leading zeros in column "+self.column+" with datatype "+self.datatype.to_text+"."
to_display_text self = "Leading zeros in column "+self.column+" with datatype "+self.value_type.to_text+"."
## Indicates that an empty file was encountered, so no data could be loaded.
type Empty_File_Error

View File

@ -9,10 +9,10 @@ polyglot java import org.enso.table.parsing.problems.LeadingZeros
## PRIVATE
Translates a parse-related problem, additionally enriching it with the expected
value type information that is not originally present on the Java side.
translate_parsing_problem expected_datatype problem = case problem of
translate_parsing_problem expected_value_type problem = case problem of
java_problem : InvalidFormat ->
Invalid_Format.Error java_problem.column expected_datatype (Vector.from_polyglot_array java_problem.cells)
Invalid_Format.Error java_problem.column expected_value_type (Vector.from_polyglot_array java_problem.cells)
java_problem : LeadingZeros ->
Leading_Zeros.Error java_problem.column expected_datatype (Vector.from_polyglot_array java_problem.cells)
Leading_Zeros.Error java_problem.column expected_value_type (Vector.from_polyglot_array java_problem.cells)
_ ->
Panic.throw (Illegal_State.Error "Reported an unknown problem type: "+problem.to_text)

View File

@ -16,8 +16,10 @@ make_column_name_selector table display=Display.Always =
Selector for type argument on `Column.parse`.
parse_type_selector : Single_Choice
parse_type_selector =
choice = ['Auto', 'Integer', 'Decimal', 'Date', 'Date_Time', 'Time_Of_Day', 'Boolean']
Single_Choice display=Display.Always values=(choice.map n->(Option n))
choice = ['Auto', 'Value_Type.Integer', 'Value_Type.Float', 'Value_Type.Date', 'Value_Type.Date_Time', 'Value_Type.Time', 'Value_Type.Boolean']
names = ['Auto', 'Integer', 'Float', 'Date', 'Date_Time', 'Time', 'Boolean']
options = names.zip choice . map pair-> Option pair.first pair.second
Single_Choice display=Display.Always values=options
## PRIVATE
Selector for type argument on `Column.parse`.

View File

@ -97,11 +97,14 @@ expect_warning expected_warning result =
## UNSTABLE
Checks if the provided value has a specific warning attached and if there are
no other warnings.
As a utility, it also returns the found warning.
Arguments:
- expected_warning: The expected warning. It can either be a warning type or
a concrete value.
- result: The value to check.
expect_only_warning : Any -> Any -> Nothing
expect_only_warning : Any -> Any -> Any
expect_only_warning expected_warning result =
warnings = get_attached_warnings result
is_expected x =
@ -114,6 +117,7 @@ expect_only_warning expected_warning result =
if invalid.not_empty then
loc = Meta.get_source_location 3
Test.fail "Expected the result to contain only the warning: "+found.to_text+", but it also contained: "+invalid.to_text+' (at '+loc+').'
found
## UNSTABLE

View File

@ -40,10 +40,11 @@ public class TypeInferringParser extends DatatypeParser {
@Override
public WithProblems<Storage<?>> parseColumn(String columnName, Storage<String> sourceStorage) {
// If there are now rows, the Auto parser would guess some random type (the first one that is
// checked). Instead,
// we just return the empty column unchanged.
if (sourceStorage.size() == 0) {
// If there are no values, the Auto parser would guess some random type (the first one that is
// checked). Instead, we just return the empty column unchanged.
boolean hasNoValues =
(sourceStorage.size() == 0) || (sourceStorage.countMissing() == sourceStorage.size());
if (hasNoValues) {
return fallbackParser.parseColumn(columnName, sourceStorage);
}

View File

@ -127,6 +127,27 @@ spec =
And newlines toO!
formatter.parse complex_text . should_equal complex_text
Test.specify "should report Invalid_Format errors" <|
# Checks that parsing a value that does not fit the requested datatype
# yields `Nothing` together with an `Invalid_Format` warning.
formatter = Data_Formatter.Value
# Local helper: the parsed value must be `Nothing` and `Invalid_Format` must
# be the only warning attached. It evaluates to the result of
# `Problems.expect_only_warning`, so the caller can inspect it.
expect_warning r =
r.should_equal Nothing
Problems.expect_only_warning Invalid_Format r
r1 = formatter.parse "Text" datatype=Decimal
w1 = expect_warning r1
# The warning is expected to carry the requested target type and no column.
w1.value_type . should_equal Decimal
w1.column . should_equal Nothing
# The same expectation is checked for the remaining datatypes.
expect_warning <| formatter.parse "Text" datatype=Integer
expect_warning <| formatter.parse "Text" datatype=Boolean
expect_warning <| formatter.parse "Text" datatype=Date
expect_warning <| formatter.parse "Text" datatype=Date_Time
expect_warning <| formatter.parse "Text" datatype=Time_Of_Day
Test.specify "should not allow unexpected types" <|
formatter = Data_Formatter.Value
formatter.parse "Text" datatype=List . should_fail_with Illegal_Argument
Test.group "DataFormatter.format" <|
Test.specify "should handle Nothing" <|
Data_Formatter.Value.format Nothing . should_equal Nothing

View File

@ -11,24 +11,24 @@ import Standard.Test.Extensions
import project.Util
spec =
Test.group "Table.parse_values" <|
Test.group "Table.parse" <|
Test.specify "should correctly parse integers" <|
t1 = Table.new [["ints", ["0", "+0", "-0", "+1", "-1", "1", "000", "0010", "12345", Nothing]]]
t2 = t1.parse_values type=Integer
t2 = t1.parse type=Value_Type.Integer
t2.at "ints" . to_vector . should_equal [0, 0, 0, 1, -1, 1, Nothing, Nothing, 12345, Nothing]
Test.specify "should correctly parse decimals" <|
t1 = Table.new [["ints", ["0", "+0", "-0", "+1", "-1", "1", "12345", Nothing]]]
t2 = t1.parse_values type=Decimal
t2 = t1.parse type=Value_Type.Float
t2.at "ints" . to_vector . should_equal [0, 0, 0, 1, -1, 1, 12345, Nothing]
t2.at "ints" . to_vector . map .to_text . should_equal ["0.0", "0.0", "-0.0", "1.0", "-1.0", "1.0", "12345.0", "Nothing"]
t3 = Table.new [["floats", ["0.0", "+0.0", "-0.0", "+1.0", "-1.0", "1.0", "0.0000", "10.", "12345."]]]
t4 = t3.parse_values type=Decimal
t4 = t3.parse type=Value_Type.Float
t4.at "floats" . to_vector . should_equal [0, 0, 0, 1, -1, 1, 0, 10, 12345]
t5 = Table.new [["floats", [".0", "0.", "1.", ".1", ".123", "-.1", "+.1", "+0.0", "0.1234", Nothing, "11111111.111"]]]
t6 = t5.parse_values type=Decimal
t6 = t5.parse type=Value_Type.Float
t6.at "floats" . to_vector . should_equal [0.0, 0.0, 1.0, 0.1, 0.123, -0.1, 0.1, 0.0, 0.1234, Nothing, 11111111.111]
Test.specify "should warn on leading zeros in numbers, if asked" <|
@ -37,56 +37,56 @@ spec =
t1_parsed = [0, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, 12345, Nothing]
t1_zeros = ["+00", "-00", "+01", "-01", "01", "000", "0010"]
t3 = t1.parse_values type=Integer
t3 = t1.parse type=Value_Type.Integer
t3.at "ints" . to_vector . should_equal t1_parsed
Problems.get_attached_warnings t3 . should_equal [Leading_Zeros.Error "ints" Integer t1_zeros]
Problems.get_attached_warnings t3 . should_equal [Leading_Zeros.Error "ints" Value_Type.Integer t1_zeros]
t4 = t1.parse_values type=Decimal
t4 = t1.parse type=Value_Type.Float
t4.at "ints" . to_vector . should_equal t1_parsed
Problems.get_attached_warnings t4 . should_equal [Leading_Zeros.Error "ints" Decimal t1_zeros]
Problems.get_attached_warnings t4 . should_equal [Leading_Zeros.Error "ints" Value_Type.Float t1_zeros]
t5 = t2.parse_values type=Decimal
t5 = t2.parse type=Value_Type.Float
t5.at "floats" . to_vector . should_equal [0.0, 0.0, Nothing, Nothing, Nothing, 1.0]
Problems.get_attached_warnings t5 . should_equal [Leading_Zeros.Error "floats" Decimal ["00.", "01.0", '-0010.0000']]
Problems.get_attached_warnings t5 . should_equal [Leading_Zeros.Error "floats" Value_Type.Float ["00.", "01.0", '-0010.0000']]
opts = Data_Formatter.Value allow_leading_zeros=True
t1_parsed_zeros = [0, 0, 0, 1, -1, 1, 0, 10, 12345, Nothing]
t6 = t1.parse_values format=opts type=Integer
t6 = t1.parse format=opts type=Value_Type.Integer
t6.at "ints" . to_vector . should_equal t1_parsed_zeros
Problems.assume_no_problems t6
t7 = t1.parse_values format=opts type=Decimal
t7 = t1.parse format=opts type=Value_Type.Float
t7.at "ints" . to_vector . should_equal t1_parsed_zeros
Problems.assume_no_problems t7
t8 = t2.parse_values format=opts type=Decimal
t8 = t2.parse format=opts type=Value_Type.Float
t8.at "floats" . to_vector . should_equal [0.0, 0.0, 0.0, 1.0, -10.0, 1.0]
Problems.assume_no_problems t8
Test.specify "should correctly parse booleans" <|
t1 = Table.new [["bools", ["true", "false", "True", "TRUE", "FALSE", Nothing, "False"]]]
t2 = t1.parse_values type=Boolean
t2 = t1.parse type=Value_Type.Boolean
t2.at "bools" . to_vector . should_equal [True, False, True, True, False, Nothing, False]
t3 = Table.new [["bools", ["1", "0", "true", "yes", "oui", "no", "NO!"]]]
t4 = t3.parse_values type=Boolean format="yes|no"
t4 = t3.parse type=Value_Type.Boolean format="yes|no"
t4.at "bools" . to_vector . should_equal [Nothing, Nothing, Nothing, True, Nothing, False, Nothing]
Test.specify "should correctly parse date and time" <|
t1 = Table.new [["dates", ["2022-05-07", "2000-01-01", "2010-12-31"]]]
t2 = t1.parse_values type=Date
t2 = t1.parse type=Value_Type.Date
t2.at "dates" . to_vector . should_equal [Date.new 2022 5 7, Date.new 2000 1 1, Date.new 2010 12 31]
t3 = Table.new [["datetimes", ["2022-05-07 23:59:59", "2000-01-01 00:00:00", "2010-12-31 12:34:56"]]]
t4 = t3.parse_values type=Date_Time
t4 = t3.parse type=Value_Type.Date_Time
t4.at "datetimes" . to_vector . should_equal [Date_Time.new 2022 5 7 23 59 59, Date_Time.new 2000 1 1, Date_Time.new 2010 12 31 12 34 56]
t5 = Table.new [["times", ["23:59:59", "00:00:00", "12:34:56"]]]
t6 = t5.parse_values type=Time_Of_Day
t6 = t5.parse type=Value_Type.Time
t6.at "times" . to_vector . should_equal [Time_Of_Day.new 23 59 59, Time_Of_Day.new, Time_Of_Day.new 12 34 56]
t7 = Table.new [["dates", ["07/05/2022", "01/01/2001", "31/12/2010"]]]
t8 = t7.parse_values type=Date format="dd/MM/yyyy"
t8 = t7.parse type=Value_Type.Date format="dd/MM/yyyy"
t8.at "dates" . value_type . should_equal Value_Type.Date
t8.at "dates" . to_vector . should_equal [Date.new 2022 5 7, Date.new 2001 1 1, Date.new 2010 12 31]
@ -94,15 +94,15 @@ spec =
opts = Data_Formatter.Value date_formats=["d.M.y", "d MMM y[ G]", "E, d MMM y"] datetime_formats=["yyyy-MM-dd'T'HH:mm:ss", "dd/MM/yyyy HH:mm"] time_formats=["H:mm:ss.n", "h:mma"]
t1 = Table.new [["dates", ["1.2.476", "10 Jan 1900 AD", "Tue, 3 Jun 2008"]]]
t2 = t1.parse_values format=opts type=Date
t2 = t1.parse format=opts type=Value_Type.Date
t2.at "dates" . to_vector . should_equal [Date.new 476 2 1, Date.new 1900 1 10, Date.new 2008 6 3]
t3 = Table.new [["datetimes", ["2011-12-03T10:15:30", "31/12/2012 22:33"]]]
t4 = t3.parse_values format=opts type=Date_Time
t4 = t3.parse format=opts type=Value_Type.Date_Time
t4.at "datetimes" . to_vector . should_equal [Date_Time.new 2011 12 3 10 15 30, Date_Time.new 2012 12 31 22 33]
t5 = Table.new [["times", ["1:02:03.987654321", "1:30PM"]]]
t6 = t5.parse_values format=opts type=Time_Of_Day
t6 = t5.parse format=opts type=Value_Type.Time
t6.at "times" . to_vector . should_equal [Time_Of_Day.new 1 2 3 nanosecond=987654321, Time_Of_Day.new 13 30 0 0]
Test.specify "should warn when cells do not fit the expected format" <|
@ -113,51 +113,51 @@ spec =
times = ["2001-01-01", "2001-01-01 12:34:56", "10:00:10", "Tuesday", "foobar", "", "10:99:99", "1/2/2003", "2001-30-10"]
t = Table.new [ints, floats, bools, ["times", times]]
t0 = t.parse_values type=Boolean
t0 = t.parse type=Value_Type.Boolean
t0.at "bools" . to_vector . should_equal [True, False, Nothing, Nothing, Nothing, Nothing, Nothing, True, Nothing]
t0.at "ints" . to_vector . should_equal [Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing]
Problems.expect_warning (Invalid_Format.Error "bools" Boolean ["fAlSE", "foobar", "", "0", "1", "truefalse"]) t0
Problems.expect_warning (Invalid_Format.Error "ints" Boolean ["0", "1", "1.0", "foobar", "", "--1", "+-1", "10", "-+1"]) t0
Problems.expect_warning (Invalid_Format.Error "bools" Value_Type.Boolean ["fAlSE", "foobar", "", "0", "1", "truefalse"]) t0
Problems.expect_warning (Invalid_Format.Error "ints" Value_Type.Boolean ["0", "1", "1.0", "foobar", "", "--1", "+-1", "10", "-+1"]) t0
a1 = t.parse_values columns=["ints"] type=Integer on_problems=_
a1 = t.parse columns=["ints"] type=Value_Type.Integer on_problems=_
t1 t =
t.at "ints" . to_vector . should_equal [0, 1, Nothing, Nothing, Nothing, Nothing, Nothing, 10, Nothing]
p1 = [Invalid_Format.Error "ints" Integer ["1.0", "foobar", "", "--1", "+-1", "-+1"]]
p1 = [Invalid_Format.Error "ints" Value_Type.Integer ["1.0", "foobar", "", "--1", "+-1", "-+1"]]
Problems.test_problem_handling a1 p1 t1
a2 = t.parse_values columns=["floats"] type=Decimal on_problems=_
a2 = t.parse columns=["floats"] type=Value_Type.Float on_problems=_
t2 t =
t.at "floats" . to_vector . should_equal [0, 2, Nothing, Nothing, Nothing, Nothing, Nothing, 100, Nothing]
p2 = [Invalid_Format.Error "floats" Decimal ["1e6", "foobar", "", "--1", "+-1", "-+1"]]
p2 = [Invalid_Format.Error "floats" Value_Type.Float ["1e6", "foobar", "", "--1", "+-1", "-+1"]]
Problems.test_problem_handling a2 p2 t2
a3 = t.parse_values columns=["bools"] type=Boolean on_problems=_
a3 = t.parse columns=["bools"] type=Value_Type.Boolean on_problems=_
t3 t =
t.at "bools" . to_vector . should_equal [True, False, Nothing, Nothing, Nothing, Nothing, Nothing, True, Nothing]
p3 = [Invalid_Format.Error "bools" Boolean ["fAlSE", "foobar", "", "0", "1", "truefalse"]]
p3 = [Invalid_Format.Error "bools" Value_Type.Boolean ["fAlSE", "foobar", "", "0", "1", "truefalse"]]
Problems.test_problem_handling a3 p3 t3
a4 = t.parse_values columns=["times"] type=Date on_problems=_
a4 = t.parse columns=["times"] type=Value_Type.Date on_problems=_
t4 t =
t.at "times" . to_vector . should_equal [Date.new 2001 1 1, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing]
p4 = [Invalid_Format.Error "times" Date ["2001-01-01 12:34:56", "10:00:10", "Tuesday", "foobar", "", "10:99:99", "1/2/2003", "2001-30-10"]]
p4 = [Invalid_Format.Error "times" Value_Type.Date ["2001-01-01 12:34:56", "10:00:10", "Tuesday", "foobar", "", "10:99:99", "1/2/2003", "2001-30-10"]]
Problems.test_problem_handling a4 p4 t4
a5 = t.parse_values columns=["times"] type=Date_Time on_problems=_
a5 = t.parse columns=["times"] type=Value_Type.Date_Time on_problems=_
t5 t =
t.at "times" . to_vector . should_equal [Nothing, Date_Time.new 2001 1 1 12 34 56, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing]
p5 = [Invalid_Format.Error "times" Date_Time ["2001-01-01", "10:00:10", "Tuesday", "foobar", "", "10:99:99", "1/2/2003", "2001-30-10"]]
p5 = [Invalid_Format.Error "times" Value_Type.Date_Time ["2001-01-01", "10:00:10", "Tuesday", "foobar", "", "10:99:99", "1/2/2003", "2001-30-10"]]
Problems.test_problem_handling a5 p5 t5
a6 = t.parse_values columns=["times"] type=Time_Of_Day on_problems=_
a6 = t.parse columns=["times"] type=Value_Type.Time on_problems=_
t6 t =
t.at "times" . to_vector . should_equal [Nothing, Nothing, Time_Of_Day.new 10 0 10 0, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing]
p6 = [Invalid_Format.Error "times" Time_Of_Day ["2001-01-01", "2001-01-01 12:34:56", "Tuesday", "foobar", "", "10:99:99", "1/2/2003", "2001-30-10"]]
p6 = [Invalid_Format.Error "times" Value_Type.Time ["2001-01-01", "2001-01-01 12:34:56", "Tuesday", "foobar", "", "10:99:99", "1/2/2003", "2001-30-10"]]
Problems.test_problem_handling a6 p6 t6
Test.specify "should leave not selected columns unaffected" <|
t1 = Table.new [["A", ["1", "2"]], ["B", ["3", "4"]]]
t2 = t1.parse_values columns="B"
t2 = t1.parse columns="B"
t2.at "A" . to_vector . should_equal ["1", "2"]
t2.at "B" . to_vector . should_equal [3, 4]
@ -174,7 +174,7 @@ spec =
c10 = ["mixeddates", ["2022-10-01", "2000-01-01 01:02:03", "01:02:03", Nothing]]
c11 = ["text+ints", ["1", "2", " foobar", Nothing]]
t = Table.new [c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11]
t2 = t.parse_values
t2 = t.parse
Problems.assume_no_problems t2
t2.at "ints" . to_vector . should_equal [1, 2, -123, Nothing]
@ -191,19 +191,19 @@ spec =
t2.at "text+ints" . to_vector . should_equal ["1", "2", "foobar", Nothing]
# In Auto mode, integers take precedence over booleans.
t3 = Table.new [["bools", ["1", "0", "True"]], ["ints", ["1", "0", "0"]]] . parse_values format=(Data_Formatter.Value true_values=["1", "True"] false_values=["0", "False"])
t3 = Table.new [["bools", ["1", "0", "True"]], ["ints", ["1", "0", "0"]]] . parse format=(Data_Formatter.Value true_values=["1", "True"] false_values=["0", "False"])
t3.at "bools" . to_vector . should_equal [True, False, True]
t3.at "ints" . to_vector . should_equal [1, 0, 0]
t4 = Table.new [c2] . parse_values format=(Data_Formatter.Value allow_leading_zeros=True)
t4 = Table.new [c2] . parse format=(Data_Formatter.Value allow_leading_zeros=True)
t4 . at "ints0" . to_vector . should_equal [1, 2, Nothing, -1]
t5 = t.parse_values columns="ints" type=Decimal
t5 = t.parse columns="ints" type=Value_Type.Float
t5.at "ints" . to_vector . should_equal [1.0, 2.0, -123.0, Nothing]
# `ints` are requested to be parsed as decimals.
t5.at "ints" . to_vector . first . should_be_a Decimal
t6 = t.parse_values columns=["floats", "text+ints"] type=Auto
t6 = t.parse columns=["floats", "text+ints"] type=Auto
# `floats` are auto-detected as decimals.
t6.at "floats" . to_vector . should_equal [1.0, 2.2, Nothing, -1.0]
# `text+ints` is attempted to be parsed (hence whitespace is stripped), but it only fits the text type.
@ -214,27 +214,27 @@ spec =
Test.specify "should allow to specify a thousands separator and a custom decimal point" <|
opts = Data_Formatter.Value decimal_point=',' thousand_separator='_'
t1 = Table.new [["floats", ["0,0", "+0,0", "-0,0", "+1,5", "-1,2", "1,0", "0,0000", "10_000,", ",0"]]]
t2 = t1.parse_values format=opts
t2 = t1.parse format=opts
t2.at "floats" . to_vector . should_equal [0.0, 0.0, 0.0, 1.5, -1.2, 1.0, 0.0, 10000.0, 0.0]
t3 = Table.new [["xs", ["1,2", "1.3", "_0", "0_", "1_0_0"]]]
t4 = t3.parse_values format=opts type=Decimal
t4 = t3.parse format=opts type=Value_Type.Float
t4.at "xs" . to_vector . should_equal [1.2, Nothing, Nothing, Nothing, 100.0]
Problems.get_attached_warnings t4 . should_equal [Invalid_Format.Error "xs" Decimal ["1.3", "_0", "0_"]]
t5 = t3.parse_values format=opts type=Integer
Problems.get_attached_warnings t4 . should_equal [Invalid_Format.Error "xs" Value_Type.Float ["1.3", "_0", "0_"]]
t5 = t3.parse format=opts type=Value_Type.Integer
t5.at "xs" . to_vector . should_equal [Nothing, Nothing, Nothing, Nothing, 100]
Problems.get_attached_warnings t5 . should_equal [Invalid_Format.Error "xs" Integer ["1,2", "1.3", "_0", "0_"]]
Problems.get_attached_warnings t5 . should_equal [Invalid_Format.Error "xs" Value_Type.Integer ["1,2", "1.3", "_0", "0_"]]
Test.specify "should allow to specify custom values for booleans" <|
opts_1 = Data_Formatter.Value true_values=["1", "YES"] false_values=["0"]
t1 = Table.new [["bools", ["1", "0", "YES", "1", "0"]]]
t2 = t1.parse_values format=opts_1
t2 = t1.parse format=opts_1
t2.at "bools" . to_vector . should_equal [True, False, True, True, False]
t3 = Table.new [["bools", ["1", "NO", "False", "True", "YES", "no", "oui", "0"]]]
t4 = t3.parse_values format=opts_1 type=Boolean
t4 = t3.parse format=opts_1 type=Value_Type.Boolean
t4.at "bools" . to_vector . should_equal [True, Nothing, Nothing, Nothing, True, Nothing, Nothing, False]
Problems.get_attached_warnings t4 . should_equal [Invalid_Format.Error "bools" Boolean ["NO", "False", "True", "no", "oui"]]
Problems.get_attached_warnings t4 . should_equal [Invalid_Format.Error "bools" Value_Type.Boolean ["NO", "False", "True", "no", "oui"]]
whitespace_table =
ints = ["ints", ["0", "1 ", "0 1", " 2"]]
@ -246,62 +246,62 @@ spec =
Table.new [ints, floats, bools, dates, datetimes, times]
# Parses each column of `whitespace_table` with an explicit target type and no custom format.
# Leading/trailing whitespace does not prevent parsing (e.g. "1 " and " 2" become 1 and 2), while cells with
# whitespace inside the value ("0 1", "- 1", "t rue", ...) become Nothing and are reported through exactly
# one Invalid_Format warning per column (`expect_only_warning`).
# NOTE(review): this listing appears to be a rendered diff - each `parse_values` / bare-type line is
# presumably the pre-change twin of the line right after it.
Test.specify "should trim input values by default" <|
t1 = whitespace_table.parse_values columns="ints" type=Integer
t1 = whitespace_table.parse columns="ints" type=Value_Type.Integer
t1.at "ints" . to_vector . should_equal [0, 1, Nothing, 2]
Problems.expect_only_warning (Invalid_Format.Error "ints" Integer ["0 1"]) t1
Problems.expect_only_warning (Invalid_Format.Error "ints" Value_Type.Integer ["0 1"]) t1
t2 = whitespace_table.parse_values columns="floats" type=Decimal
t2 = whitespace_table.parse columns="floats" type=Value_Type.Float
t2.at "floats" . to_vector . should_equal [0.0, 2.0, Nothing, 10.0]
Problems.expect_only_warning (Invalid_Format.Error "floats" Decimal ["- 1"]) t2
Problems.expect_only_warning (Invalid_Format.Error "floats" Value_Type.Float ["- 1"]) t2
t3 = whitespace_table.parse_values columns="bools" type=Boolean
t3 = whitespace_table.parse columns="bools" type=Value_Type.Boolean
t3.at "bools" . to_vector . should_equal [True, False, Nothing, False]
Problems.expect_only_warning (Invalid_Format.Error "bools" Boolean ["t rue"]) t3
Problems.expect_only_warning (Invalid_Format.Error "bools" Value_Type.Boolean ["t rue"]) t3
t4 = whitespace_table.parse_values columns="dates" type=Date
t4 = whitespace_table.parse columns="dates" type=Value_Type.Date
t4.at "dates" . to_vector . should_equal [Date.new 2022 1 1, Date.new 2022 7 17, Nothing, Nothing]
Problems.expect_only_warning (Invalid_Format.Error "dates" Date ["2022 - 07 - 17", ""]) t4
Problems.expect_only_warning (Invalid_Format.Error "dates" Value_Type.Date ["2022 - 07 - 17", ""]) t4
t5 = whitespace_table.parse_values columns="datetimes" type=Date_Time
t5 = whitespace_table.parse columns="datetimes" type=Value_Type.Date_Time
t5.at "datetimes" . to_vector . should_equal [Date_Time.new 2022 1 1 11 59, Nothing, Nothing, Nothing]
Problems.expect_only_warning (Invalid_Format.Error "datetimes" Date_Time ["2022 - 07 - 17 1:2:3", "2022-01-01 11:59:00"]) t5
Problems.expect_only_warning (Invalid_Format.Error "datetimes" Value_Type.Date_Time ["2022 - 07 - 17 1:2:3", "2022-01-01 11:59:00"]) t5
t6 = whitespace_table.parse_values columns="times" type=Time_Of_Day
t6 = whitespace_table.parse columns="times" type=Value_Type.Time
t6.at "times" . to_vector . should_equal [Time_Of_Day.new 11 0 0, Time_Of_Day.new, Nothing, Nothing]
Problems.expect_only_warning (Invalid_Format.Error "times" Time_Of_Day ["00 : 00 : 00"]) t6
Problems.expect_only_warning (Invalid_Format.Error "times" Value_Type.Time ["00 : 00 : 00"]) t6
# Same columns as the default-trimming test, but parsed with `Data_Formatter.Value trim_values=False`.
# Cells carrying leading or trailing whitespace now fail as well: they become Nothing and are added to the
# column's single Invalid_Format warning (e.g. "ints" keeps only 0 and reports "1 ", "0 1", " 2").
# NOTE(review): this listing appears to be a rendered diff - each `parse_values` / bare-type line is
# presumably the pre-change twin of the line right after it.
Test.specify "should fail to parse if whitespace is present and trimming is turned off" <|
opts = Data_Formatter.Value trim_values=False
t1 = whitespace_table.parse_values format=opts columns="ints" type=Integer
t1 = whitespace_table.parse format=opts columns="ints" type=Value_Type.Integer
t1.at "ints" . to_vector . should_equal [0, Nothing, Nothing, Nothing]
Problems.expect_only_warning (Invalid_Format.Error "ints" Integer ["1 ", "0 1", " 2"]) t1
Problems.expect_only_warning (Invalid_Format.Error "ints" Value_Type.Integer ["1 ", "0 1", " 2"]) t1
t2 = whitespace_table.parse_values format=opts columns="floats" type=Decimal
t2 = whitespace_table.parse format=opts columns="floats" type=Value_Type.Float
t2.at "floats" . to_vector . should_equal [Nothing, Nothing, Nothing, 10.0]
Problems.expect_only_warning (Invalid_Format.Error "floats" Decimal ["0 ", " 2.0", "- 1"]) t2
Problems.expect_only_warning (Invalid_Format.Error "floats" Value_Type.Float ["0 ", " 2.0", "- 1"]) t2
t3 = whitespace_table.parse_values format=opts columns="bools" type=Boolean
t3 = whitespace_table.parse format=opts columns="bools" type=Value_Type.Boolean
t3.at "bools" . to_vector . should_equal [Nothing, Nothing, Nothing, False]
Problems.expect_only_warning (Invalid_Format.Error "bools" Boolean ["True ", " false", "t rue"]) t3
Problems.expect_only_warning (Invalid_Format.Error "bools" Value_Type.Boolean ["True ", " false", "t rue"]) t3
t4 = whitespace_table.parse_values format=opts columns="dates" type=Date
t4 = whitespace_table.parse format=opts columns="dates" type=Value_Type.Date
t4.at "dates" . to_vector . should_equal [Nothing, Nothing, Nothing, Nothing]
Problems.expect_only_warning (Invalid_Format.Error "dates" Date [" 2022-01-01", "2022-07-17 ", "2022 - 07 - 17", ""]) t4
Problems.expect_only_warning (Invalid_Format.Error "dates" Value_Type.Date [" 2022-01-01", "2022-07-17 ", "2022 - 07 - 17", ""]) t4
t5 = whitespace_table.parse_values format=opts columns="datetimes" type=Date_Time
t5 = whitespace_table.parse format=opts columns="datetimes" type=Value_Type.Date_Time
t5.at "datetimes" . to_vector . should_equal [Nothing, Nothing, Nothing, Nothing]
Problems.expect_only_warning (Invalid_Format.Error "datetimes" Date_Time [" 2022-01-01 11:59:00 ", "2022 - 07 - 17 1:2:3 ", "2022-01-01 11:59:00"]) t5
Problems.expect_only_warning (Invalid_Format.Error "datetimes" Value_Type.Date_Time [" 2022-01-01 11:59:00 ", "2022 - 07 - 17 1:2:3 ", "2022-01-01 11:59:00"]) t5
t6 = whitespace_table.parse_values format=opts columns="times" type=Time_Of_Day
t6 = whitespace_table.parse format=opts columns="times" type=Value_Type.Time
t6.at "times" . to_vector . should_equal [Nothing, Nothing, Nothing, Nothing]
Problems.expect_only_warning (Invalid_Format.Error "times" Time_Of_Day ["11:00:00 ", " 00:00:00", "00 : 00 : 00"]) t6
Problems.expect_only_warning (Invalid_Format.Error "times" Value_Type.Time ["11:00:00 ", " 00:00:00", "00 : 00 : 00"]) t6
Test.specify "should fallback to text if whitespace is present and trimming is turned off" <|
c1 = ["1", " +2", "-123", Nothing]
c2 = [" 1.0 ", "2.2", Nothing, "-1.0"]
c3 = ["true", " False", Nothing, "True"]
t = Table.new [["ints", c1], ["floats", c2], ["bools", c3]]
t2 = t.parse_values format=(Data_Formatter.Value trim_values=False)
t2 = t.parse format=(Data_Formatter.Value trim_values=False)
Warning.get_all t2 . should_equal []
t2.at "ints" . to_vector . should_equal c1
@ -310,7 +310,7 @@ spec =
Test.specify "should allow selecting columns by regex" <|
t1 = Table.new [["An", ["1", "2", "3"]], ["Am", ["4", "5", "6"]], ["C", ["7", "8", "9"]], ["D", ["10", "11", "12"]]]
r1 = t1.parse_values columns=[Column_Selector.By_Name "A.*" use_regex=True]
r1 = t1.parse columns=[Column_Selector.By_Name "A.*" use_regex=True]
r1.at "An" . to_vector . should_equal [1, 2, 3]
r1.at "Am" . to_vector . should_equal [4, 5, 6]
r1.at "C" . to_vector . should_equal ["7", "8", "9"]
@ -318,15 +318,15 @@ spec =
Test.specify "should correctly handle problems: missing input columns" <|
t1 = Table.new [["A", ["1", "2", "3"]]]
r1 = t1.parse_values columns=["A", "B", "C", "E"] on_problems=Problem_Behavior.Ignore
r1 = t1.parse columns=["A", "B", "C", "E"] on_problems=Problem_Behavior.Ignore
r1.should_fail_with Missing_Input_Columns
r1.catch.criteria . should_equal ["B", "C", "E"]
r2 = t1.parse_values columns=[Column_Selector.By_Name "A.+" use_regex=True]
r2 = t1.parse columns=[Column_Selector.By_Name "A.+" use_regex=True]
r2.should_fail_with Missing_Input_Columns
r2.catch.criteria . should_equal ["A.+"]
action = t1.parse_values columns=["A", "B", "C", "E"] error_on_missing_columns=False on_problems=_
action = t1.parse columns=["A", "B", "C", "E"] error_on_missing_columns=False on_problems=_
tester table =
table.at "A" . to_vector . should_equal [1, 2, 3]
problems = [Missing_Input_Columns.Error ["B", "C", "E"]]
@ -334,11 +334,11 @@ spec =
Test.specify "should correctly handle problems: out of bounds indices" <|
t1 = Table.new [["A", ["1", "2", "3"]]]
r1 = t1.parse_values columns=[0, -1, 42, -5]
r1 = t1.parse columns=[0, -1, 42, -5]
r1.should_fail_with Column_Indexes_Out_Of_Range
r1.catch.indexes . should_equal [42, -5]
action = t1.parse_values columns=[0, -1, 42, -5] error_on_missing_columns=False on_problems=_
action = t1.parse columns=[0, -1, 42, -5] error_on_missing_columns=False on_problems=_
tester table =
table.at "A" . to_vector . should_equal [1, 2, 3]
problems = [Column_Indexes_Out_Of_Range.Error [42, -5]]
@ -346,7 +346,7 @@ spec =
Test.specify "should allow mixed column selectors" <|
t1 = Table.new [["Am", ["1", "2", "3"]], ["B", ["4", "5", "6"]], ["C", ["7", "8", "9"]], ["D", ["10", "11", "12"]]]
r1 = t1.parse_values columns=[(Column_Selector.By_Name "A.*" use_regex=True), -2, "D"]
r1 = t1.parse columns=[(Column_Selector.By_Name "A.*" use_regex=True), -2, "D"]
r1.at "Am" . to_vector . should_equal [1, 2, 3]
r1.at "B" . to_vector . should_equal ["4", "5", "6"]
r1.at "C" . to_vector . should_equal [7, 8, 9]
@ -354,7 +354,7 @@ spec =
Test.specify "should handle edge-cases: overlapping selectors" <|
t1 = Table.new [["Am", ["1", "2", "3"]], ["B", ["4", "5", "6"]], ["C", ["7", "8", "9"]], ["D", ["10", "11", "12"]]]
r1 = t1.parse_values columns=[(Column_Selector.By_Name "A.*" use_regex=True), 0, "D", -1, -1, 0, 3]
r1 = t1.parse columns=[(Column_Selector.By_Name "A.*" use_regex=True), 0, "D", -1, -1, 0, 3]
r1.at "Am" . to_vector . should_equal [1, 2, 3]
r1.at "B" . to_vector . should_equal ["4", "5", "6"]
r1.at "C" . to_vector . should_equal ["7", "8", "9"]
@ -362,29 +362,29 @@ spec =
# Passing `Nothing` as the target `type` of a table parse must fail with Illegal_Argument.
# NOTE(review): rendered diff - the `parse_values` line is presumably the pre-change twin of the `parse` line.
Test.specify "should error if invalid target type is provided" <|
t1 = Table.new [["A", ["1", "2", "3"]]]
t1.parse_values type=Nothing . should_fail_with Illegal_Argument
t1.parse type=Nothing . should_fail_with Illegal_Argument
# Explicitly selecting non-text columns ("A" and "C" hold integers) must fail with Invalid_Value_Type.
# The error reports column "A" as `related_column`, and its `expected` type satisfies `is_text`.
# NOTE(review): rendered diff - the `parse_values` line is presumably the pre-change twin of the `parse` line.
Test.specify "should error if the input column is not text" <|
t1 = Table.new [["A", [1, 2, 3]], ["B", ["4", "5", "6"]], ["C", [7, 8, 9]], ["D", ["10", "11", "12"]]]
r1 = t1.parse_values columns=["A", "B", "C"]
r1 = t1.parse columns=["A", "B", "C"]
r1.should_fail_with Invalid_Value_Type
r1.catch.related_column . should_equal "A"
r1.catch.expected.is_text.should_be_true
Test.specify "should error if no input columns selected, unless error_on_missing_columns=False" <|
t1 = Table.new [["A", ["1", "2", "3"]]]
r1 = t1.parse_values columns=[]
r1 = t1.parse columns=[]
r1.should_fail_with No_Input_Columns_Selected
r2 = t1.parse_values columns=[] error_on_missing_columns=False
r2 = t1.parse columns=[] error_on_missing_columns=False
r2 . should_equal t1
Problems.expect_warning No_Input_Columns_Selected r2
r3 = t1.parse_values columns=[] error_on_missing_columns=False on_problems=Problem_Behavior.Ignore
r3 = t1.parse columns=[] error_on_missing_columns=False on_problems=Problem_Behavior.Ignore
r3 . should_equal t1
Problems.assume_no_problems r3
r4 = t1.parse_values columns=["nonexistent column :D", -42] error_on_missing_columns=False on_problems=Problem_Behavior.Report_Warning
r4 = t1.parse columns=["nonexistent column :D", -42] error_on_missing_columns=False on_problems=Problem_Behavior.Report_Warning
r4 . should_equal t1
Problems.expect_warning No_Input_Columns_Selected r4
Problems.expect_warning (Missing_Input_Columns.Error ["nonexistent column :D"]) r4
@ -393,87 +393,93 @@ spec =
Test.group "Column.parse" <|
# Parses a text column to Value_Type.Integer and checks that the column name is preserved.
# By default, values with leading zeros ("000", "0010") become Nothing and a Leading_Zeros warning is
# attached; with `allow_leading_zeros=True` they parse to 0 and 10 and no problems are reported.
# NOTE(review): rendered diff - each `c1.parse Integer ...` line is presumably the pre-change twin of the
# `type=Value_Type.Integer` line right after it.
Test.specify "should correctly parse integers" <|
c1 = Column.from_vector "ints" ["0", "+0", "-0", "+1", "-1", "1", "000", "0010", "12345", Nothing]
c2 = c1.parse Integer
c2 = c1.parse type=Value_Type.Integer
c2.name.should_equal c1.name
c2 . to_vector . should_equal [0, 0, 0, 1, -1, 1, Nothing, Nothing, 12345, Nothing]
c2.value_type.should_equal Value_Type.Integer
Problems.expect_warning Leading_Zeros c2
c3 = c1.parse Integer format=(Data_Formatter.Value.with_number_formatting allow_leading_zeros=True)
c3 = c1.parse type=Value_Type.Integer format=(Data_Formatter.Value.with_number_formatting allow_leading_zeros=True)
c3.to_vector . should_equal [0, 0, 0, 1, -1, 1, 0, 10, 12345, Nothing]
Problems.assume_no_problems c3
# Parses text columns to Value_Type.Float.
# - Integer-looking text is accepted; leading zeros ("000", "0010") yield Nothing plus a Leading_Zeros warning.
# - The `to_text` assertion pins the float rendering ("12345.0") and that "-0" keeps its sign ("-0.0").
# - Forms with a missing integer or fractional part (".0", "10.", "-.1", "+.1") parse without problems.
# NOTE(review): rendered diff - each `parse Decimal` line (and the spaced `c2 . to_vector` line) is
# presumably the pre-change twin of the line right after it.
Test.specify "should correctly parse decimals" <|
c1 = Column.from_vector "ints" ["0", "+0", "-0", "+1", "-1", "1", "000", "0010", "12345", Nothing]
c2 = c1.parse Decimal
c2 = c1.parse Value_Type.Float
c2.name.should_equal c1.name
c2 . to_vector . should_equal [0, 0, 0, 1, -1, 1, Nothing, Nothing, 12345, Nothing]
c2.to_vector . should_equal [0, 0, 0, 1, -1, 1, Nothing, Nothing, 12345, Nothing]
c2.value_type.should_equal Value_Type.Float
c2.to_vector . map .to_text . should_equal ["0.0", "0.0", "-0.0", "1.0", "-1.0", "1.0", "Nothing", "Nothing", "12345.0", "Nothing"]
Problems.expect_warning Leading_Zeros c2
c3 = Column.from_vector "floats" ["0.0", "+0.0", "-0.0", "+1.0", "-1.0", "1.0", "0.0000", "10.", "12345."]
c4 = c3.parse Decimal
c4 = c3.parse Value_Type.Float
c4.to_vector . should_equal [0, 0, 0, 1, -1, 1, 0, 10, 12345]
c4.value_type.is_floating_point.should_be_true
Problems.assume_no_problems c4
c5 = Column.from_vector "floats" [".0", "0.", "1.", ".1", ".123", "-.1", "+.1", "+0.0", "0.1234", Nothing, "11111111.111"]
c6 = c5.parse Decimal
c6 = c5.parse Value_Type.Float
c6.to_vector . should_equal [0.0, 0.0, 1.0, 0.1, 0.123, -0.1, 0.1, 0.0, 0.1234, Nothing, 11111111.111]
Problems.assume_no_problems c6
# Parses text columns to Value_Type.Boolean.
# - "true"/"True"/"TRUE" and "false"/"False"/"FALSE" are accepted without a format; Nothing stays Nothing.
# - `c1.parse` with no arguments yields the same booleans for this column.
# - A "yes|no" text format (second argument) makes "yes"/"no" the accepted literals.
# - Without that format, "yes" becomes Nothing and is reported in an Invalid_Format warning whose
#   `column` is "bools" and whose `cells` are ["yes"].
# NOTE(review): rendered diff - each `parse Boolean` line and the `w.datatype` assertion are presumably
# the pre-change twins of the lines right after them (`w.value_type` being the renamed field).
Test.specify "should correctly parse booleans" <|
c1 = Column.from_vector "bools" ["true", "false", "True", "TRUE", "FALSE", Nothing, "False"]
c2 = c1.parse Boolean
c2 = c1.parse type=Value_Type.Boolean
c2.name.should_equal c1.name
c2.to_vector . should_equal [True, False, True, True, False, Nothing, False]
c2.value_type.should_equal Value_Type.Boolean
c1.parse . to_vector . should_equal [True, False, True, True, False, Nothing, False]
c3 = Column.from_vector "bools" ["yes", "no", Nothing]
c4 = c3.parse Boolean "yes|no"
c4 = c3.parse type=Value_Type.Boolean "yes|no"
c4.to_vector . should_equal [True, False, Nothing]
c5 = Column.from_vector "bools" ["true", "yes", "false"]
c6 = c5.parse Boolean
c6 = c5.parse type=Value_Type.Boolean
c6.to_vector . should_equal [True, Nothing, False]
w = Problems.get_attached_warnings c6 . find w-> w.is_a Invalid_Format
w.column.should_equal "bools"
w.datatype . should_equal Boolean
w.value_type . should_equal Value_Type.Boolean
w.cells . should_equal ["yes"]
# Parses ISO-style text into Date, Date_Time and Time values and checks the resulting value types.
# Date_Time input is accepted with either a space or a "T" separator and with optional fractional seconds.
# Cells that do not match the target type ("42", a bare date for Date_Time) become Nothing and are
# reported in an Invalid_Format warning carrying the column name, target value type and offending cells.
# NOTE(review): this listing appears to be a rendered diff - each bare-type line (`parse Date`,
# `parse Date_Time`, `w.datatype`, ...) is presumably the pre-change twin of the line right after it and
# is kept verbatim as history.
Test.specify "should correctly parse date and time" <|
c1 = Column.from_vector "date" ["2022-05-07", "2000-01-01", "2010-12-31"]
c2 = c1.parse Date
c2 = c1.parse type=Value_Type.Date
c2.to_vector . should_equal [Date.new 2022 5 7, Date.new 2000 1 1, Date.new 2010 12 31]
c2.value_type.should_equal Value_Type.Date
c3 = Column.from_vector "datetimes" ["2022-05-07 23:59:59", "2000-01-01 00:00:00", "2010-12-31 12:34:56", "2010-12-31T12:34:56", "2010-12-31 12:34:56.123"]
c4 = c3.parse Date_Time
c4 = c3.parse type=Value_Type.Date_Time
c4.to_vector . should_equal [Date_Time.new 2022 5 7 23 59 59, Date_Time.new 2000 1 1, Date_Time.new 2010 12 31 12 34 56, Date_Time.new 2010 12 31 12 34 56, Date_Time.new 2010 12 31 12 34 56 123]
c4.value_type.should_equal Value_Type.Date_Time
c5 = Column.from_vector "times" ["23:59:59", "00:00:00", "12:34:56"]
c6 = c5.parse Time_Of_Day
c6 = c5.parse type=Value_Type.Time
c6.to_vector . should_equal [Time_Of_Day.new 23 59 59, Time_Of_Day.new, Time_Of_Day.new 12 34 56]
c6.value_type.should_equal Value_Type.Time
c7 = Column.from_vector "foo" ["2022-05-07 23:59:59", "42", "2010-12-31"]
c8 = c7.parse Date_Time . to_vector . should_equal [Date_Time.new 2022 5 7 23 59 59, Nothing, Nothing]
# Bind the parsed column itself before asserting on it: previously `c8` captured the result of the
# whole `... . to_vector . should_equal [...]` chain, so the warning lookup below was not inspecting
# the column that `parse` returned.
c8 = c7.parse type=Value_Type.Date_Time
c8.to_vector . should_equal [Date_Time.new 2022 5 7 23 59 59, Nothing, Nothing]
w = Problems.get_attached_warnings c8 . find w-> w.is_a Invalid_Format
w.column.should_equal "foo"
w.datatype . should_equal Date_Time
w.value_type . should_equal Value_Type.Date_Time
w.cells . should_equal ["42", "2010-12-31"]
# Checks that a text pattern passed as the format argument ("M/d/yyyy", "M/d/yyyy HH:mm:ss") is used to
# parse Date and Date_Time values written in month/day/year order.
# NOTE(review): rendered diff - each bare-type `parse Date ...` / `parse Date_Time ...` line is presumably
# the pre-change twin of the `type=Value_Type...` line right after it.
Test.specify "should correctly parse date and time with format" <|
c1 = Column.from_vector "date" ["5/7/2022", "1/1/2000", "12/31/2010"]
c2 = c1.parse Date "M/d/yyyy"
c2 = c1.parse type=Value_Type.Date "M/d/yyyy"
c2.to_vector . should_equal [Date.new 2022 5 7, Date.new 2000 1 1, Date.new 2010 12 31]
c3 = Column.from_vector "datetimes" ["5/7/2022 23:59:59", "1/1/2000 00:00:00", "12/31/2010 12:34:56"]
c4 = c3.parse Date_Time "M/d/yyyy HH:mm:ss"
c4 = c3.parse type=Value_Type.Date_Time "M/d/yyyy HH:mm:ss"
c4.to_vector . should_equal [Date_Time.new 2022 5 7 23 59 59, Date_Time.new 2000 1 1, Date_Time.new 2010 12 31 12 34 56]
# Each of the three format patterns below is expected to be rejected: parsing with it must fail with
# Illegal_Argument for Date, Time and Date_Time targets alike.
# NOTE(review): rendered diff - the three bare-type lines are presumably the pre-change twins of the
# three `type=Value_Type...` lines that follow them.
Test.specify "should handle invalid format strings gracefully" <|
c1 = Column.from_vector "date" ["5/7/2022", "1/1/2000", "12/31/2010"]
c1.parse Date "M/d/fqsrf" . should_fail_with Illegal_Argument
c1.parse Time_Of_Day "HH:mm:ss.fff" . should_fail_with Illegal_Argument
c1.parse Date_Time "M/d/fqsrf HH:mm:ss.fff" . should_fail_with Illegal_Argument
c1.parse type=Value_Type.Date "M/d/fqsrf" . should_fail_with Illegal_Argument
c1.parse type=Value_Type.Time "HH:mm:ss.fff" . should_fail_with Illegal_Argument
c1.parse type=Value_Type.Date_Time "M/d/fqsrf HH:mm:ss.fff" . should_fail_with Illegal_Argument
Test.specify "should correctly work in Auto mode" <|
c1 = Column.from_vector "A" ["1", "2", "3"]
@ -484,21 +490,26 @@ spec =
c6 = Column.from_vector "F" ["this is here to ensure the column has type text... can be replaced one we have retyping"]
c7 = Column.from_vector "G" ["true", "42"]
c8 = Column.from_vector "H" ["text-to-force-value-type-to-be-text", Nothing, Nothing, Nothing]
c8.value_type . should_equal Value_Type.Char
r1 = c1.parse
r1.to_vector . should_equal [1, 2, 3]
r1.value_type.should_equal Value_Type.Integer
Problems.assume_no_problems r1
r2 = c2.parse
r2.to_vector . should_equal [1.0, 2.5, 3.0]
r2.value_type.should_equal Value_Type.Float
Problems.assume_no_problems r2
r3 = c3.parse
r3.to_vector . should_equal [Date.new 2022 5 7, Date.new 2000 1 1, Date.new 2010 12 31]
r3.value_type.should_equal Value_Type.Date
Problems.assume_no_problems r3
r4 = c4.parse
r4.to_vector . should_equal [True, False, Nothing]
r4.value_type.should_equal Value_Type.Boolean
Problems.assume_no_problems r4
r5 = c5.parse
@ -508,25 +519,27 @@ spec =
c5.parse format="yes|no" . should_fail_with Illegal_Argument
r5_2 = c5.parse format=(Data_Formatter.Value.with_boolean_values ["yes"] ["no"])
r5_2.to_vector . should_equal [True, False]
r5_2.value_type . should_equal Value_Type.Boolean
Problems.assume_no_problems r5_2
r6 = (c6.drop 1).parse
r6.to_vector . should_equal []
Test.with_clue "r6.value_type == "+r6.value_type.to_text+"; " <|
r6.value_type.is_text . should_be_true
r6.value_type . should_equal Value_Type.Char
Problems.assume_no_problems r6
r7 = c7.parse
r7.to_vector . should_equal ["true", "42"]
r7.value_type . should_equal Value_Type.Char
Problems.assume_no_problems r7
r8 = c8.drop 1 . parse
r8.value_type . should_equal Value_Type.Char
r8.to_vector . should_equal [Nothing, Nothing, Nothing]
Problems.assume_no_problems r8
# Passing `Nothing` as the target type of `Column.parse` must fail with Illegal_Argument.
# NOTE(review): rendered diff - the positional `parse Nothing` line is presumably the pre-change twin of
# the `type=Nothing` line.
Test.specify "should error if invalid target type is provided" <|
c1 = Column.from_vector "A" ["1", "2", "3"]
c1.parse Nothing . should_fail_with Illegal_Argument
c1.parse type=Nothing . should_fail_with Illegal_Argument
Test.specify "should error if the input column is not text" <|
c1 = Column.from_vector "A" [1, 2, 3]