Implement auto_value_type operation (#7908)

Closes #6113
This commit is contained in:
Radosław Waśko 2023-09-27 17:45:34 +02:00 committed by GitHub
parent cf16d32894
commit c690559ec4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
29 changed files with 985 additions and 45 deletions

View File

@ -578,6 +578,7 @@
- [Renamed `Decimal` to `Float`.][7807] - [Renamed `Decimal` to `Float`.][7807]
- [Implemented `Date_Time_Formatter` for more user-friendly date/time format - [Implemented `Date_Time_Formatter` for more user-friendly date/time format
parsing.][7826] parsing.][7826]
- [Implemented `Table.auto_value_types` for in-memory tables.][7908]
[debug-shortcuts]: [debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -820,6 +821,7 @@
[7776]: https://github.com/enso-org/enso/pull/7776 [7776]: https://github.com/enso-org/enso/pull/7776
[7807]: https://github.com/enso-org/enso/pull/7807 [7807]: https://github.com/enso-org/enso/pull/7807
[7826]: https://github.com/enso-org/enso/pull/7826 [7826]: https://github.com/enso-org/enso/pull/7826
[7908]: https://github.com/enso-org/enso/pull/7908
#### Enso Compiler #### Enso Compiler

View File

@ -1,6 +1,4 @@
import project.Any.Any import project.Any.Any
import project.Data.Ordering.Comparable
import project.Data.Ordering.Ordering
import project.Nothing.Nothing import project.Nothing.Nothing
from project.Data.Boolean.Boolean import False, True from project.Data.Boolean.Boolean import False, True
@ -98,4 +96,3 @@ type Boolean
if (27 % 3) == 0 then IO.println "Fizz" if (27 % 3) == 0 then IO.println "Fizz"
if_then : Any -> Any | Nothing if_then : Any -> Any | Nothing
if_then self ~on_true = @Builtin_Method "Boolean.if_then" if_then self ~on_true = @Builtin_Method "Boolean.if_then"

View File

@ -1,4 +1,5 @@
import project.Any.Any import project.Any.Any
import project.Data.Ordering.Comparable
import project.Data.Locale.Locale import project.Data.Locale.Locale
import project.Data.Text.Text import project.Data.Text.Text
import project.Error.Error import project.Error.Error
@ -1169,3 +1170,39 @@ type Number_Parse_Error
to_display_text : Text to_display_text : Text
to_display_text self = to_display_text self =
"Could not parse " + self.text.to_text + " as a double." "Could not parse " + self.text.to_text + " as a double."
## A wrapper around `Integer` guaranteeing that the wrapped value is positive,
   so that a function accepting it may only receive positive integers.
type Positive_Integer
    ## PRIVATE
       Raw, non-validating constructor. It should not be used by user code,
       because it makes it possible to break the invariant. Instances should
       only be created through `new` or the conversions.
    Value (integer : Integer)

    ## PRIVATE
       ADVANCED
       Validating constructor: wraps the `Integer` in a `Positive_Integer` if
       it is greater than zero and throws `Illegal_Argument` otherwise. User
       code should prefer the `Positive_Integer.from` conversion.
    new (integer : Integer) = case integer > 0 of
        True -> Positive_Integer.Value integer
        False ->
            message = "Expected a positive integer, but got " + integer.to_display_text
            Error.throw (Illegal_Argument.Error message)
## Conversion that creates a `Positive_Integer` from an `Integer`, delegating
   to `Positive_Integer.new`.
   It will throw `Illegal_Argument` if the provided integer is not positive
   (i.e. it is zero or negative).
Positive_Integer.from (that : Integer) = Positive_Integer.new that
## PRIVATE
   Conversion that unwraps a `Positive_Integer` back into the `Integer` it
   holds.
Integer.from (that : Positive_Integer) = that.integer
## PRIVATE
   Comparator for `Positive_Integer` values. Both operations delegate to the
   comparator of the wrapped integers.
type Positive_Integer_Comparator
    ## PRIVATE
       Compares two `Positive_Integer` values by comparing the integers they
       wrap.
    compare x y =
        left = x.integer
        right = y.integer
        Comparable.from left . compare left right

    ## PRIVATE
       Hashes a `Positive_Integer` using the hash of the integer it wraps.
    hash x =
        wrapped = x.integer
        Comparable.from wrapped . hash wrapped
## PRIVATE
   Selects `Positive_Integer_Comparator` as the comparator for any
   `Positive_Integer` value.
Comparable.from (_:Positive_Integer) = Positive_Integer_Comparator

View File

@ -1579,6 +1579,15 @@ type Column
check_cast_compatibility self.value_type value_type <| check_cast_compatibility self.value_type value_type <|
self.internal_do_cast value_type on_problems self.internal_do_cast value_type on_problems
## Narrows the value type of the column to a more specific one, inferred from
   its contents.

   This operation is currently not available in the Database backend: it
   always fails with `Unsupported_Database_Operation`.
auto_value_type : Boolean -> Column
auto_value_type self shrink_types=False =
    # The argument is accepted but deliberately ignored, as the operation is unsupported here.
    _ = shrink_types
    message = "`Column.auto_value_type` is not supported in the Database backends."
    Error.throw (Unsupported_Database_Operation.Error message)
## PRIVATE ## PRIVATE
Shares the core CAST logic between `cast` and `parse`. Shares the core CAST logic between `cast` and `parse`.
internal_do_cast : Value_Type -> Problem_Behavior -> Column internal_do_cast : Value_Type -> Problem_Behavior -> Column

View File

@ -1979,6 +1979,15 @@ type Table
new_column = column_to_cast.cast value_type on_problems new_column = column_to_cast.cast value_type on_problems
table.set new_column new_name=column_to_cast.name set_mode=Set_Mode.Update table.set new_column new_name=column_to_cast.name set_mode=Set_Mode.Update
## Narrows the value types of table columns to more specific ones, inferred
   from their contents.

   This operation is currently not available in the Database backend: it
   always fails with `Unsupported_Database_Operation`.
auto_value_types : Vector (Text | Integer | Regex) | Text | Integer | Regex -> Boolean -> Boolean -> Problem_Behavior -> Table
auto_value_types self columns=self.column_names shrink_types=False error_on_missing_columns=True on_problems=Problem_Behavior.Report_Warning =
    # The arguments are accepted but deliberately ignored, as the operation is unsupported here.
    _ = [columns, shrink_types, error_on_missing_columns, on_problems]
    message = "Table.auto_value_types is not supported in the Database backends."
    Error.throw (Unsupported_Database_Operation.Error message)
## ALIAS drop_missing_rows, dropna ## ALIAS drop_missing_rows, dropna
GROUP Standard.Base.Selections GROUP Standard.Base.Selections
Remove rows which are all blank or containing blank values. Remove rows which are all blank or containing blank values.

View File

@ -1762,6 +1762,43 @@ type Column
on_problems.attach_problems_before problems <| on_problems.attach_problems_before problems <|
Column.from_storage self.name new_storage Column.from_storage self.name new_storage
## Change the value type of the column to a more specific one, based on its
contents.
Arguments:
- shrink_types: If set `True`, smaller types will be chosen if possible,
according to the rules below. Defaults to `False`.
? Auto Type Selection Rules
- If a `Mixed` column can be assigned a single type, like `Char` or
`Integer`, that will be used.
- Text columns are not parsed. To do that, use the `parse` method.
- If a `Float` column contains only integers, it will be converted to
an Integer column.
- If a `Decimal` column contains only integers that could fit in a
64-bit integer storage, it will be converted to an Integer column.
- If `shrink_types` is `False` (default), no other transformations are
applied.
- However, if `shrink_types` is set to `True`, then:
- Integer columns will be assigned the smallest size that can fit all
values (down to 16-bit integers; converting to the `Byte` type has
to be done manually through `cast`).
- If all elements in a text column have the same length, the type
will become fixed length.
- Otherwise, if a text column is variable length, but all text
elements are no longer than 255 characters, the column will get a
max length of 255. Otherwise, the column size limit will stay
unchanged.
auto_value_type : Boolean -> Column
auto_value_type self shrink_types=False =
    # Without shrinking the regular precise type inference is used; with
    # shrinking the storage is asked for the smallest type fitting its values.
    target_type = if shrink_types then Storage.to_value_type self.java_column.getStorage.inferPreciseTypeShrunk else
        self.inferred_precise_value_type

    # No problems are expected from this cast, so any that does occur is
    # raised as an error instead of being attached as a warning.
    self.cast target_type on_problems=Problem_Behavior.Report_Error
## ALIAS transform column ## ALIAS transform column
Applies `function` to each item in this column and returns the column Applies `function` to each item in this column and returns the column

View File

@ -88,10 +88,20 @@ type Table
Column.from_vector (v.at 0) (v.at 1) . java_column Column.from_vector (v.at 0) (v.at 1) . java_column
Column.Value java_col -> java_col Column.Value java_col -> java_col
_ -> invalid_input_shape _ -> invalid_input_shape
if cols.is_empty then Error.throw (Illegal_Argument.Error "Cannot create a table with no columns.") else Panic.recover Illegal_Argument <|
if (cols.all c-> c.getSize == cols.first.getSize).not then Error.throw (Illegal_Argument.Error "All columns must have the same row count.") else if cols.is_empty then
if cols.distinct .getName . length != cols.length then Error.throw (Illegal_Argument.Error "Column names must be distinct.") else Panic.throw (Illegal_Argument.Error "Cannot create a table with no columns.")
Table.Value (Java_Table.new cols)
if cols.distinct .getName . length != cols.length then
Panic.throw (Illegal_Argument.Error "Column names must be distinct.")
mismatched_size_column = cols.find if_missing=Nothing c->
c.getSize != cols.first.getSize
if mismatched_size_column.is_nothing.not then
msg = "All columns must have the same row count, but the column [" + mismatched_size_column.getName + "] has " + mismatched_size_column.getSize.to_text + " rows, while the column [" + cols.first.getName + "] has " + cols.first.getSize.to_text + " rows."
Panic.throw (Illegal_Argument.Error msg)
Table.Value (Java_Table.new cols)
## GROUP Standard.Base.Constants ## GROUP Standard.Base.Constants
Creates a new table from a vector of column names and a vector of vectors Creates a new table from a vector of column names and a vector of vectors
@ -946,6 +956,9 @@ type Table
Arguments: Arguments:
- columns: The selection of columns to cast. - columns: The selection of columns to cast.
- value_type: The `Value_Type` to cast the column to. - value_type: The `Value_Type` to cast the column to.
- error_on_missing_columns: Specifies if a missing input column should
result in an error regardless of the `on_problems` settings. Defaults
to `True`.
- on_problems: Specifies how to handle problems if they occur, reporting - on_problems: Specifies how to handle problems if they occur, reporting
them as warnings by default. them as warnings by default.
@ -996,6 +1009,50 @@ type Table
new_column = column_to_cast.cast value_type on_problems new_column = column_to_cast.cast value_type on_problems
table.set new_column new_name=column_to_cast.name set_mode=Set_Mode.Update table.set new_column new_name=column_to_cast.name set_mode=Set_Mode.Update
## Change the value type of table columns to a more specific one, based on
their contents.
This is most useful for `Mixed` type columns and will allow to narrow
down the type if all values in the column fit a more specific type.
Arguments:
- columns: The selection of columns to convert.
- shrink_types: If set `True`, smaller types will be chosen if possible,
according to the rules below. Defaults to `False`.
- error_on_missing_columns: Specifies if a missing input column should
result in an error regardless of the `on_problems` settings. Defaults
to `True`.
- on_problems: Specifies how to handle problems if they occur, reporting
them as warnings by default.
? Auto Type Selection Rules
- If a `Mixed` column can be assigned a single type, like `Char` or
`Integer`, that will be used.
- Text columns are not parsed. To do that, use the `parse` method.
- If a `Float` column contains only integers, it will be converted to
an Integer column.
- If a `Decimal` column contains only integers that could fit in a
64-bit integer storage, it will be converted to an Integer column.
- If `shrink_types` is `False` (default), no other transformations are
applied.
- However, if `shrink_types` is set to `True`, then:
- Integer columns will be assigned the smallest size that can fit all
values (down to 16-bit integers; converting to the `Byte` type has
to be done manually through `cast`).
- If all elements in a text column have the same length, the type
will become fixed length.
- Otherwise, if a text column is variable length, but all text
elements are no longer than 255 characters, the column will get a
max length of 255. Otherwise, the column size limit will stay
unchanged.
auto_value_types : Vector (Text | Integer | Regex) | Text | Integer | Regex -> Boolean -> Boolean -> Problem_Behavior -> Table
auto_value_types self columns=self.column_names shrink_types=False error_on_missing_columns=True on_problems=Problem_Behavior.Report_Warning =
    columns_to_convert = self.columns_helper.select_columns columns Case_Sensitivity.Default reorder=False error_on_missing_columns=error_on_missing_columns on_problems=on_problems error_on_empty=False
    # Each selected column is replaced in place (same name) by its re-typed
    # version; the table is threaded through the fold as the accumulator.
    columns_to_convert.fold self current_table-> column->
        retyped = column.auto_value_type shrink_types
        current_table.set retyped new_name=column.name set_mode=Set_Mode.Update
## GROUP Standard.Base.Conversions ## GROUP Standard.Base.Conversions
Splits a column of text into a set of new columns. Splits a column of text into a set of new columns.
The original column will be removed from the table. The original column will be removed from the table.

View File

@ -18,7 +18,7 @@ polyglot java import org.enso.table.data.column.storage.type.IntegerType
most_specific_value_type : Any -> Boolean -> Value_Type most_specific_value_type : Any -> Boolean -> Value_Type
most_specific_value_type value use_smallest=False = most_specific_value_type value use_smallest=False =
case value of case value of
_ : Float -> Value_Type.Float Bits.Bits_64 _ : Float -> Value_Type.Float Bits.Bits_64
_ : Boolean -> Value_Type.Boolean _ : Boolean -> Value_Type.Boolean
_ : Date -> Value_Type.Date _ : Date -> Value_Type.Date
_ : Time_Of_Day -> Value_Type.Time _ : Time_Of_Day -> Value_Type.Time
@ -33,9 +33,12 @@ most_specific_value_type value use_smallest=False =
# We do a small rewrite here - for integers we always return the Integer type, even if the value is small enough to fit in a Byte. # We do a small rewrite here - for integers we always return the Integer type, even if the value is small enough to fit in a Byte.
if value_type == Value_Type.Byte then Value_Type.Integer Bits.Bits_16 else value_type if value_type == Value_Type.Byte then Value_Type.Integer Bits.Bits_16 else value_type
True -> Value_Type.Decimal precision=Nothing scale=0 True -> Value_Type.Decimal precision=Nothing scale=0
text : Text -> case use_smallest of text : Text ->
False -> Value_Type.Char size=Nothing variable_length=True length = text.length
True -> Value_Type.Char size=text.length variable_length=False # Not using Char size=0 for empty strings, because that would be an invalid value.
case use_smallest && length > 0 of
True -> Value_Type.Char size=text.length variable_length=False
False -> Value_Type.Char size=Nothing variable_length=True
## TODO [RW] once we add Enso Native Object Type Value Type, we probably ## TODO [RW] once we add Enso Native Object Type Value Type, we probably
want to prefer it over Mixed want to prefer it over Mixed
_ -> Value_Type.Mixed _ -> Value_Type.Mixed

View File

@ -54,7 +54,7 @@ closest_storage_type value_type = case value_type of
Error.throw (Illegal_Argument.Error "Value_Type.Char with fixed length must have a non-nothing size") Error.throw (Illegal_Argument.Error "Value_Type.Char with fixed length must have a non-nothing size")
Value_Type.Char max_length variable_length -> Value_Type.Char max_length variable_length ->
fixed_length = variable_length.not fixed_length = variable_length.not
TextType.new max_length fixed_length TextType.new (max_length : Integer) fixed_length
Value_Type.Date -> DateType.INSTANCE Value_Type.Date -> DateType.INSTANCE
# We currently will not support storing dates without timezones in in-memory mode. # We currently will not support storing dates without timezones in in-memory mode.
Value_Type.Date_Time _ -> DateTimeType.INSTANCE Value_Type.Date_Time _ -> DateTimeType.INSTANCE

View File

@ -1,4 +1,5 @@
from Standard.Base import all from Standard.Base import all
import Standard.Base.Data.Numbers.Positive_Integer
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import project.Data.Type.Value_Type_Helpers import project.Data.Type.Value_Type_Helpers
@ -95,12 +96,22 @@ type Value_Type
ANSI SQL: CHAR, VARCHAR, TEXT, LONGVARCHAR, NCHAR, NVARCHAR, TEXT, CLOB, NCLOB ANSI SQL: CHAR, VARCHAR, TEXT, LONGVARCHAR, NCHAR, NVARCHAR, TEXT, CLOB, NCLOB
! Counting Characters
Note that different backends may count the text in different ways.
The in-memory backend treats a single grapheme cluster (e.g. 💡) as a
single character unit. In most database systems more complex grapheme
clusters may be counted as multiple characters. As a result, these limits
do not correspond 1-to-1 across backends, so strings that contain such
characters and are close to the limit may be truncated.
Arguments: Arguments:
- size: the maximum number of characters that can be stored in the - size: the maximum number of characters that can be stored in the
column. It can be nothing to indicate no limit. column. It can be nothing to indicate no limit. It cannot be 0.
- variable_length: whether the size is a maximum or a fixed length. - variable_length: whether the size is a maximum or a fixed length.
A fixed length string must have a non-nothing size. A fixed length string must have a non-nothing size.
Char size:(Integer|Nothing)=Nothing variable_length:Boolean=True Char (size : (Positive_Integer | Nothing) = Nothing) variable_length:Boolean=True
## Date ## Date
@ -383,15 +394,23 @@ type Value_Type
Value_Type.Integer size -> "Integer (" + size.to_text + ")" Value_Type.Integer size -> "Integer (" + size.to_text + ")"
Value_Type.Float size -> "Float (" + size.to_text + ")" Value_Type.Float size -> "Float (" + size.to_text + ")"
Value_Type.Decimal precision scale -> "Decimal (precision=" + precision.to_text + ", scale=" + scale.to_text + ")" Value_Type.Decimal precision scale -> "Decimal (precision=" + precision.to_text + ", scale=" + scale.to_text + ")"
Value_Type.Char size variable_length -> case variable_length of Value_Type.Char size variable_length ->
True -> "Char (variable length, max_size=" + size.to_text + ")" size_text = case size of
False -> "Char (fixed length, size=" + size.to_text + ")" Nothing -> "unlimited"
_ -> size.to Integer . to_text
case variable_length of
True -> "Char (variable length, max_size=" + size_text + ")"
False -> "Char (fixed length, size=" + size_text + ")"
Value_Type.Date -> "Date" Value_Type.Date -> "Date"
Value_Type.Date_Time with_timezone -> "Date_Time (with_timezone=" + with_timezone.to_text + ")" Value_Type.Date_Time with_timezone -> "Date_Time (with_timezone=" + with_timezone.to_text + ")"
Value_Type.Time -> "Time" Value_Type.Time -> "Time"
Value_Type.Binary size variable_length -> case variable_length of Value_Type.Binary size variable_length ->
True -> "Binary (variable length, max_size=" + size.to_text + " bytes)" size_text = case size of
False -> "Binary (fixed length, size=" + size.to_text + " bytes)" Nothing -> "unlimited"
_ -> size.to Integer . to_text + " bytes"
case variable_length of
True -> "Binary (variable length, max_size=" + size_text + ")"
False -> "Binary (fixed length, size=" + size_text + ")"
Value_Type.Unsupported_Data_Type type_name _ -> case type_name of Value_Type.Unsupported_Data_Type type_name _ -> case type_name of
Nothing -> "Unsupported_Data_Type" Nothing -> "Unsupported_Data_Type"
_ : Text -> "Unsupported_Data_Type (" + type_name + ")" _ : Text -> "Unsupported_Data_Type (" + type_name + ")"

View File

@ -10,6 +10,7 @@ import org.enso.table.data.column.storage.type.BigIntegerType;
import org.enso.table.data.column.storage.type.FloatType; import org.enso.table.data.column.storage.type.FloatType;
import org.enso.table.data.column.storage.type.StorageType; import org.enso.table.data.column.storage.type.StorageType;
import org.enso.table.error.ValueTypeMismatchException; import org.enso.table.error.ValueTypeMismatchException;
import org.graalvm.polyglot.Context;
// For now the BigInteger builder is just a stub, reusing the ObjectBuilder and adding a warning. // For now the BigInteger builder is just a stub, reusing the ObjectBuilder and adding a warning.
public class BigIntegerBuilder extends TypedBuilderImpl<BigInteger> { public class BigIntegerBuilder extends TypedBuilderImpl<BigInteger> {
@ -88,10 +89,12 @@ public class BigIntegerBuilder extends TypedBuilderImpl<BigInteger> {
} }
public static BigIntegerBuilder retypeFromLongBuilder(LongBuilder longBuilder) { public static BigIntegerBuilder retypeFromLongBuilder(LongBuilder longBuilder) {
BigIntegerBuilder res = new BigIntegerBuilder(longBuilder.data.length);
int n = longBuilder.currentSize; int n = longBuilder.currentSize;
BigIntegerBuilder res = new BigIntegerBuilder(n); Context context = Context.getCurrent();
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
res.appendNoGrow(BigInteger.valueOf(longBuilder.data[i])); res.appendNoGrow(BigInteger.valueOf(longBuilder.data[i]));
context.safepoint();
} }
return res; return res;
} }

View File

@ -26,10 +26,10 @@ public class LongBuilderChecked extends LongBuilder {
if (o == null) { if (o == null) {
isMissing.set(currentSize++); isMissing.set(currentSize++);
} else { } else {
try { Long x = NumericConverter.tryConvertingToLong(o);
long x = NumericConverter.coerceToLong(o); if (x != null) {
appendLongNoGrow(x); appendLongNoGrow(x);
} catch (UnsupportedOperationException e) { } else {
throw new ValueTypeMismatchException(type, o); throw new ValueTypeMismatchException(type, o);
} }
} }

View File

@ -19,10 +19,10 @@ public class LongBuilderUnchecked extends LongBuilder {
if (o == null) { if (o == null) {
isMissing.set(currentSize++); isMissing.set(currentSize++);
} else { } else {
try { Long x = NumericConverter.tryConvertingToLong(o);
long x = NumericConverter.coerceToLong(o); if (x != null) {
data[currentSize++] = x; appendLongNoGrow(x);
} catch (UnsupportedOperationException e) { } else {
throw new ValueTypeMismatchException(getType(), o); throw new ValueTypeMismatchException(getType(), o);
} }
} }

View File

@ -29,8 +29,8 @@ public class ToTextStorageConverter implements StorageConverter<String> {
public Storage<String> cast(Storage<?> storage, CastProblemBuilder problemBuilder) { public Storage<String> cast(Storage<?> storage, CastProblemBuilder problemBuilder) {
if (storage instanceof StringStorage stringStorage) { if (storage instanceof StringStorage stringStorage) {
if (stringStorage.getType().equals(targetType)) { if (canAvoidCopying(stringStorage)) {
return stringStorage; return retypeStringStorage(stringStorage);
} else { } else {
return adaptStringStorage(stringStorage, problemBuilder); return adaptStringStorage(stringStorage, problemBuilder);
} }
@ -150,7 +150,8 @@ public class ToTextStorageConverter implements StorageConverter<String> {
return builder.seal(); return builder.seal();
} }
private <T> Storage<String> castDateTimeStorage(Storage<T> storage, Function<T, String> converter, CastProblemBuilder problemBuilder) { private <T> Storage<String> castDateTimeStorage(Storage<T> storage, Function<T, String> converter,
CastProblemBuilder problemBuilder) {
Context context = Context.getCurrent(); Context context = Context.getCurrent();
StringBuilder builder = new StringBuilder(storage.size(), targetType); StringBuilder builder = new StringBuilder(storage.size(), targetType);
for (int i = 0; i < storage.size(); i++) { for (int i = 0; i < storage.size(); i++) {
@ -204,4 +205,43 @@ public class ToTextStorageConverter implements StorageConverter<String> {
problemBuilder.aggregateOtherProblems(builder.getProblems()); problemBuilder.aggregateOtherProblems(builder.getProblems());
return builder.seal(); return builder.seal();
} }
/**
 * Checks whether the given string storage can be retyped to {@code targetType} as-is, i.e.
 * without adapting any of its values.
 *
 * <p>Lengths are measured in grapheme clusters (via {@code Text_Utils.grapheme_length}), the
 * same unit that {@code StringStorage.inferPreciseTypeShrunk} uses when computing text sizes.
 * Measuring in UTF-16 code units ({@code String.length()}) would misjudge values containing
 * combining sequences or surrogate pairs, e.g. treating a 1-grapheme value made of 2 code units
 * as already fitting a fixed length of 2.
 *
 * @param stringStorage the storage whose values are inspected; missing values are ignored
 * @return {@code true} if the existing type already fits the target exactly, or if every
 *     non-missing value satisfies the target's length constraint
 */
private boolean canAvoidCopying(StringStorage stringStorage) {
  if (targetType.fitsExactly(stringStorage.getType())) {
    return true;
  }

  long maxLength = Long.MIN_VALUE;
  long minLength = Long.MAX_VALUE;
  Context context = Context.getCurrent();
  int n = stringStorage.size();
  for (int i = 0; i < n; i++) {
    String value = stringStorage.getItem(i);
    if (value != null) {
      // Fully qualified so that this method does not depend on an extra import.
      long length = org.enso.base.Text_Utils.grapheme_length(value);
      maxLength = Math.max(maxLength, length);
      minLength = Math.min(minLength, length);
    }

    // Same safepoint poll as the other per-row loops in this class.
    context.safepoint();
  }

  if (targetType.fixedLength()) {
    // If there were no non-missing values, min/max keep their sentinels and differ, so we
    // conservatively report that copying is needed.
    boolean effectivelyFixedLength = minLength == maxLength;
    return effectivelyFixedLength && targetType.maxLength() == maxLength;
  } else {
    // A maxLength of -1 is treated as "no limit" here.
    return targetType.maxLength() == -1 || maxLength <= targetType.maxLength();
  }
}
/**
 * Creates a new storage re-using the existing array.
 * <p>
 * This can only be done if the values do not need any adaptations, checked by {@code canAvoidCopying}.
 *
 * @param stringStorage the storage whose data array and size are passed on to the new storage
 * @return a new {@code StringStorage} built from the same data and size, typed as {@code targetType}
 */
private Storage<String> retypeStringStorage(StringStorage stringStorage) {
  return new StringStorage(stringStorage.getData(), stringStorage.size(), targetType);
}
} }

View File

@ -23,7 +23,7 @@ public abstract class StringStringOp extends BinaryMapOperation<String, Speciali
public Storage<?> runBinaryMap(SpecializedStorage<String> storage, Object arg, MapOperationProblemBuilder problemBuilder) { public Storage<?> runBinaryMap(SpecializedStorage<String> storage, Object arg, MapOperationProblemBuilder problemBuilder) {
int size = storage.size(); int size = storage.size();
if (arg == null) { if (arg == null) {
StringBuilder builder = new StringBuilder(size, TextType.variableLengthWithLimit(0)); StringBuilder builder = new StringBuilder(size, TextType.VARIABLE_LENGTH);
builder.appendNulls(size); builder.appendNulls(size);
return builder.seal(); return builder.seal();
} else if (arg instanceof String argString) { } else if (arg instanceof String argString) {

View File

@ -105,6 +105,20 @@ public final class MixedStorage extends ObjectStorage {
return inferredType; return inferredType;
} }
@Override
public StorageType inferPreciseTypeShrunk() {
  Storage<?> specializedStorage = getInferredStorage();
  if (specializedStorage != null) {
    // A more specialized storage exists for a more specific type, so its own shrinking logic
    // decides the result.
    return specializedStorage.inferPreciseTypeShrunk();
  }

  // No specialized storage is available, which is only expected when the inferred type is the
  // generic object type.
  assert inferredType instanceof AnyObjectType;
  return AnyObjectType.INSTANCE;
}
private Storage<?> getInferredStorage() { private Storage<?> getInferredStorage() {
if (!hasSpecializedStorageBeenInferred) { if (!hasSpecializedStorageBeenInferred) {
StorageType inferredType = inferPreciseType(); StorageType inferredType = inferPreciseType();

View File

@ -43,6 +43,11 @@ public class MixedStorageFacade extends Storage<Object> {
return underlyingStorage.inferPreciseType(); return underlyingStorage.inferPreciseType();
} }
/** Delegates the shrunk-type inference to the underlying storage. */
@Override
public StorageType inferPreciseTypeShrunk() {
  return underlyingStorage.inferPreciseTypeShrunk();
}
@Override @Override
public boolean isNa(long idx) { public boolean isNa(long idx) {
return underlyingStorage.isNa(idx); return underlyingStorage.isNa(idx);

View File

@ -39,6 +39,19 @@ public abstract class Storage<T> {
return getType(); return getType();
} }
/**
 * Returns the smallest type (according to {@code Column.auto_value_type} rules) that may still
 * fit all values in this column.
 *
 * <p>It is a sibling of {@code inferPreciseType} that allows some further shrinking. It is kept
 * separate, because {@code inferPreciseType} should be quick to compute (cached if needed) as it
 * is used in typechecking of lots of operations. This one however, is only used in a specific
 * {@code auto_value_type} use-case and rarely will need to be computed more than once.
 *
 * <p>This default implementation performs no shrinking and returns {@code getType()}.
 */
public StorageType inferPreciseTypeShrunk() {
  return getType();
}
/** /**
* Returns a more specialized storage, if available. * Returns a more specialized storage, if available.
* *

View File

@ -3,9 +3,9 @@ package org.enso.table.data.column.storage;
import org.enso.base.Text_Utils; import org.enso.base.Text_Utils;
import org.enso.table.data.column.builder.Builder; import org.enso.table.data.column.builder.Builder;
import org.enso.table.data.column.builder.StringBuilder; import org.enso.table.data.column.builder.StringBuilder;
import org.enso.table.data.column.operation.map.MapOperationStorage;
import org.enso.table.data.column.operation.map.BinaryMapOperation; import org.enso.table.data.column.operation.map.BinaryMapOperation;
import org.enso.table.data.column.operation.map.MapOperationProblemBuilder; import org.enso.table.data.column.operation.map.MapOperationProblemBuilder;
import org.enso.table.data.column.operation.map.MapOperationStorage;
import org.enso.table.data.column.operation.map.UnaryMapOperation; import org.enso.table.data.column.operation.map.UnaryMapOperation;
import org.enso.table.data.column.operation.map.text.LikeOp; import org.enso.table.data.column.operation.map.text.LikeOp;
import org.enso.table.data.column.operation.map.text.StringBooleanOp; import org.enso.table.data.column.operation.map.text.StringBooleanOp;
@ -19,10 +19,13 @@ import org.graalvm.polyglot.Value;
import java.util.BitSet; import java.util.BitSet;
/** A column storing strings. */ /**
* A column storing strings.
*/
public final class StringStorage extends SpecializedStorage<String> { public final class StringStorage extends SpecializedStorage<String> {
private final TextType type; private final TextType type;
/** /**
* @param data the underlying data * @param data the underlying data
* @param size the number of items stored * @param size the number of items stored
@ -111,7 +114,8 @@ public final class StringStorage extends SpecializedStorage<String> {
t.add( t.add(
new UnaryMapOperation<>(Maps.IS_EMPTY) { new UnaryMapOperation<>(Maps.IS_EMPTY) {
@Override @Override
protected BoolStorage runUnaryMap(SpecializedStorage<String> storage, MapOperationProblemBuilder problemBuilder) { protected BoolStorage runUnaryMap(SpecializedStorage<String> storage,
MapOperationProblemBuilder problemBuilder) {
BitSet r = new BitSet(); BitSet r = new BitSet();
Context context = Context.getCurrent(); Context context = Context.getCurrent();
for (int i = 0; i < storage.size; i++) { for (int i = 0; i < storage.size; i++) {
@ -162,4 +166,40 @@ public final class StringStorage extends SpecializedStorage<String> {
}); });
return t; return t;
} }
/**
 * Infers the smallest text type that still fits all values of this column.
 *
 * <p>Lengths are measured in grapheme clusters. A column that is already fixed-length is
 * returned unchanged. Otherwise: if all non-missing values have the same non-zero length, a
 * fixed-length type of that length is returned; if all values fit in 255 characters and the
 * current limit is absent or larger than 255, a variable-length type limited to 255 is
 * returned; in every other case the current type is kept.
 */
@Override
public StorageType inferPreciseTypeShrunk() {
  if (type.fixedLength()) {
    return type;
  }

  long minLength = Long.MAX_VALUE;
  long maxLength = Long.MIN_VALUE;
  Context context = Context.getCurrent();
  int n = size();
  for (int i = 0; i < n; i++) {
    String s = getItem(i);
    if (s != null) {
      long length = Text_Utils.grapheme_length(s);
      minLength = Math.min(minLength, length);
      maxLength = Math.max(maxLength, length);
    }

    // Poll the safepoint while scanning, as the sibling scan over integer storages does.
    context.safepoint();
  }

  // maxLength will be <0 if all values were null and will be ==0 if all values were empty strings.
  // In both of these cases, we avoid shrinking the type and return the original type instead.
  if (maxLength <= 0) {
    return getType();
  }

  final long SHORT_LENGTH_THRESHOLD = 255;

  if (minLength == maxLength) {
    return TextType.fixedLength(minLength);
  } else if (maxLength <= SHORT_LENGTH_THRESHOLD
      && (type.maxLength() < 0 || SHORT_LENGTH_THRESHOLD < type.maxLength())) {
    // If the string was unbounded or the bound was larger than 255, we shrink it to 255.
    return TextType.variableLengthWithLimit(SHORT_LENGTH_THRESHOLD);
  } else {
    // Otherwise, we return the original type (because it was either smaller than the proposed
    // 255 bound, or the existing elements do not fit into the 255 bound).
    return getType();
  }
}
} }

View File

@ -23,6 +23,8 @@ import org.enso.table.data.column.operation.map.numeric.isin.LongIsInOp;
import org.enso.table.data.column.storage.BoolStorage; import org.enso.table.data.column.storage.BoolStorage;
import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.type.IntegerType; import org.enso.table.data.column.storage.type.IntegerType;
import org.enso.table.data.column.storage.type.StorageType;
import org.graalvm.polyglot.Context;
public abstract class AbstractLongStorage extends NumericStorage<Long> { public abstract class AbstractLongStorage extends NumericStorage<Long> {
public abstract long getItem(int idx); public abstract long getItem(int idx);
@ -77,6 +79,46 @@ public abstract class AbstractLongStorage extends NumericStorage<Long> {
@Override @Override
public abstract IntegerType getType(); public abstract IntegerType getType();
/**
 * Infers the most precise type for this storage.
 *
 * @return the type reported by {@link #getType()}, unchanged
 */
@Override
public StorageType inferPreciseType() {
  final IntegerType declaredType = getType();
  return declaredType;
}
/**
 * Finds the narrowest integer type, out of 16, 32 and 64 bits, that can hold every non-missing
 * value of this storage.
 *
 * <p>Storages whose declared type is already 16 bits or narrower are returned as-is: 8-bit
 * columns are kept, but a 16-bit column is never shrunk to 8 bits even if its values would fit.
 *
 * @return the declared type if it is at most 16 bits wide, otherwise the narrowest of {@code
 *     INT_16}, {@code INT_32}, {@code INT_64} into which all non-missing values fit ({@code
 *     INT_16} when there are no non-missing values)
 */
@Override
public StorageType inferPreciseTypeShrunk() {
  if (getType().bits().toInteger() <= 16) {
    return getType();
  }

  // Candidates ordered from narrowest to widest; the index only ever moves forward.
  final IntegerType[] possibleTypes =
      new IntegerType[] {IntegerType.INT_16, IntegerType.INT_32, IntegerType.INT_64};
  final int widestIdx = possibleTypes.length - 1;
  int currentTypeIdx = 0;
  int n = size();
  Context context = Context.getCurrent();

  // Once the widest candidate has been reached there is nothing left to learn, so the scan stops.
  for (int i = 0; i < n && currentTypeIdx < widestIdx; i++) {
    if (!isNa(i)) {
      long item = getItem(i);
      // Bounded by widestIdx so the index can never run past the candidate array.
      while (currentTypeIdx < widestIdx && !possibleTypes[currentTypeIdx].fits(item)) {
        currentTypeIdx++;
      }
    }

    // Poll on every row, including missing ones, so that a long run of nulls stays interruptible.
    context.safepoint();
  }

  return possibleTypes[currentTypeIdx];
}
private static MapOperationStorage<Long, AbstractLongStorage> buildOps() { private static MapOperationStorage<Long, AbstractLongStorage> buildOps() {
MapOperationStorage<Long, AbstractLongStorage> ops = new MapOperationStorage<>(); MapOperationStorage<Long, AbstractLongStorage> ops = new MapOperationStorage<>();
ops.add(new AddOp<>()) ops.add(new AddOp<>())

View File

@ -20,6 +20,7 @@ import org.enso.table.data.column.operation.map.numeric.isin.BigIntegerIsInOp;
import org.enso.table.data.column.storage.ObjectStorage; import org.enso.table.data.column.storage.ObjectStorage;
import org.enso.table.data.column.storage.SpecializedStorage; import org.enso.table.data.column.storage.SpecializedStorage;
import org.enso.table.data.column.storage.type.BigIntegerType; import org.enso.table.data.column.storage.type.BigIntegerType;
import org.enso.table.data.column.storage.type.IntegerType;
import org.enso.table.data.column.storage.type.StorageType; import org.enso.table.data.column.storage.type.StorageType;
public class BigIntegerStorage extends SpecializedStorage<BigInteger> { public class BigIntegerStorage extends SpecializedStorage<BigInteger> {
@ -59,7 +60,7 @@ public class BigIntegerStorage extends SpecializedStorage<BigInteger> {
@Override @Override
protected BigInteger[] newUnderlyingArray(int size) { protected BigInteger[] newUnderlyingArray(int size) {
return new BigInteger[0]; return new BigInteger[size];
} }
@Override @Override
@ -96,4 +97,67 @@ public class BigIntegerStorage extends SpecializedStorage<BigInteger> {
return cachedMaxPrecisionStored; return cachedMaxPrecisionStored;
} }
private StorageType inferredType = null;
/**
 * Infers the most precise type for the stored values and caches the answer in {@code
 * inferredType}.
 *
 * @return {@code IntegerType.INT_64} when there is at least one non-null value and every
 *     non-null value fits in it; {@code BigIntegerType.INSTANCE} otherwise
 */
@Override
public StorageType inferPreciseType() {
  if (inferredType != null) {
    return inferredType;
  }

  int nonNullSeen = 0;
  boolean everyValueIsLong = true;
  // The scan ends at the first value that does not fit in 64 bits.
  for (int idx = 0; idx < size && everyValueIsLong; idx++) {
    BigInteger candidate = data[idx];
    if (candidate != null) {
      nonNullSeen++;
      everyValueIsLong = IntegerType.INT_64.fits(candidate);
    }
  }

  // A storage without any non-null value keeps the big-integer type.
  if (everyValueIsLong && nonNullSeen > 0) {
    inferredType = IntegerType.INT_64;
  } else {
    inferredType = BigIntegerType.INSTANCE;
  }
  return inferredType;
}
/**
 * Like {@link #inferPreciseType()}, but when the precise type is an integer type the result is
 * narrowed further to the smallest integer type that fits the values.
 *
 * @return the shrunk integer type, or the precise type itself when it is not an integer type
 */
@Override
public StorageType inferPreciseTypeShrunk() {
  final StorageType precise = inferPreciseType();
  return (precise instanceof IntegerType) ? findSmallestIntegerTypeThatFits() : precise;
}
/**
 * Determines the smallest integer type able to hold the stored values by viewing this storage
 * through a computed long storage and delegating to its shrinking logic.
 *
 * <p>Must only be called once {@code inferredType} has been established to be an integer type,
 * i.e. every non-null value fits in some integer type.
 */
private StorageType findSmallestIntegerTypeThatFits() {
  assert inferredType instanceof IntegerType;

  // A long-typed view over this storage; each element is converted on access.
  ComputedNullableLongStorage asLongs =
      new ComputedNullableLongStorage(size) {
        @Override
        protected Long computeItem(int idx) {
          BigInteger source = BigIntegerStorage.this.getItem(idx);
          if (source != null) {
            return source.longValueExact();
          }
          return null;
        }
      };

  return asLongs.inferPreciseTypeShrunk();
}
} }

View File

@ -0,0 +1,197 @@
package org.enso.table.data.column.storage.numeric;
import java.util.BitSet;
import java.util.List;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.type.IntegerType;
import org.enso.table.data.index.Index;
import org.enso.table.data.mask.OrderMask;
import org.enso.table.data.mask.SliceRange;
import org.graalvm.polyglot.Context;
/**
* Implements a storage that computes the ith stored value using some function.
*
* <p>This storage allows for missing values. Prefer {@link ComputedLongStorage} for non-nullable
* case.
*/
public abstract class ComputedNullableLongStorage extends AbstractLongStorage {
protected final int size;
protected abstract Long computeItem(int idx);
protected ComputedNullableLongStorage(int size) {
this.size = size;
}
@Override
public int size() {
return size;
}
@Override
public int countMissing() {
return 0;
}
@Override
public IntegerType getType() {
return IntegerType.INT_64;
}
@Override
public boolean isNa(long idx) {
if (idx < 0 || idx >= size) {
throw new IndexOutOfBoundsException(
"Index " + idx + " is out of bounds for range of length " + size + ".");
}
return computeItem((int) idx) == null;
}
@Override
public Long getItemBoxed(int idx) {
if (idx < 0 || idx >= size) {
throw new IndexOutOfBoundsException(
"Index " + idx + " is out of bounds for range of length " + size + ".");
}
return computeItem(idx);
}
public long getItem(int idx) {
return getItemBoxed(idx);
}
@Override
public BitSet getIsMissing() {
BitSet missing = new BitSet();
Context context = Context.getCurrent();
for (int i = 0; i < size; i++) {
if (computeItem(i) == null) {
missing.set(i);
}
context.safepoint();
}
return missing;
}
@Override
public Storage<Long> mask(BitSet mask, int cardinality) {
BitSet newMissing = new BitSet();
long[] newData = new long[cardinality];
int resIx = 0;
Context context = Context.getCurrent();
for (int i = 0; i < size; i++) {
if (mask.get(i)) {
Long item = computeItem(i);
if (item == null) {
newMissing.set(resIx++);
} else {
newData[resIx++] = item;
}
}
context.safepoint();
}
return new LongStorage(newData, cardinality, newMissing, getType());
}
@Override
public Storage<Long> applyMask(OrderMask mask) {
int[] positions = mask.getPositions();
long[] newData = new long[positions.length];
BitSet newMissing = new BitSet();
Context context = Context.getCurrent();
for (int i = 0; i < positions.length; i++) {
if (positions[i] == Index.NOT_FOUND) {
newMissing.set(i);
} else {
Long item = computeItem(positions[i]);
if (item == null) {
newMissing.set(i);
} else {
newData[i] = item;
}
}
context.safepoint();
}
return new LongStorage(newData, positions.length, newMissing, getType());
}
@Override
public Storage<Long> countMask(int[] counts, int total) {
long[] newData = new long[total];
BitSet newMissing = new BitSet();
int pos = 0;
Context context = Context.getCurrent();
for (int i = 0; i < counts.length; i++) {
Long item = computeItem(i);
if (item == null) {
newMissing.set(pos, pos + counts[i]);
pos += counts[i];
} else {
long nonNullItem = item;
for (int j = 0; j < counts[i]; j++) {
newData[pos++] = nonNullItem;
}
}
context.safepoint();
}
return new LongStorage(newData, total, newMissing, getType());
}
@Override
public Storage<Long> slice(int offset, int limit) {
int newSize = Math.min(size - offset, limit);
long[] newData = new long[newSize];
BitSet newMissing = new BitSet();
Context context = Context.getCurrent();
for (int i = 0; i < newSize; i++) {
Long item = computeItem(offset + i);
if (item == null) {
newMissing.set(i);
} else {
newData[i] = item;
}
context.safepoint();
}
return new LongStorage(newData, newSize, newMissing, getType());
}
@Override
public Storage<Long> slice(List<SliceRange> ranges) {
int newSize = SliceRange.totalLength(ranges);
long[] newData = new long[newSize];
BitSet newMissing = new BitSet(newSize);
int offset = 0;
Context context = Context.getCurrent();
for (SliceRange range : ranges) {
int rangeStart = range.start();
int length = range.end() - rangeStart;
for (int i = 0; i < length; i++) {
Long item = computeItem(rangeStart + i);
if (item == null) {
newMissing.set(offset + i);
} else {
newData[offset + i] = item;
}
context.safepoint();
}
offset += length;
}
return new LongStorage(newData, newSize, newMissing, getType());
}
@Override
public AbstractLongStorage widen(IntegerType widerType) {
// Currently the implementation only reports 64-bit type so there is no widening to do - we can
// just return self.
assert getType().equals(IntegerType.INT_64);
return this;
}
}

View File

@ -26,6 +26,7 @@ import org.enso.table.data.column.operation.map.numeric.isin.DoubleIsInOp;
import org.enso.table.data.column.storage.BoolStorage; import org.enso.table.data.column.storage.BoolStorage;
import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.type.FloatType; import org.enso.table.data.column.storage.type.FloatType;
import org.enso.table.data.column.storage.type.IntegerType;
import org.enso.table.data.column.storage.type.StorageType; import org.enso.table.data.column.storage.type.StorageType;
import org.enso.table.data.index.Index; import org.enso.table.data.index.Index;
import org.enso.table.data.mask.OrderMask; import org.enso.table.data.mask.OrderMask;
@ -388,4 +389,68 @@ public final class DoubleStorage extends NumericStorage<Double> implements Doubl
return new DoubleStorage(newData, newSize, newMissing); return new DoubleStorage(newData, newSize, newMissing);
} }
private StorageType inferredType = null;
/**
 * Infers the most precise type for the stored values and caches the answer in {@code
 * inferredType}.
 *
 * @return {@code IntegerType.INT_64} when there is at least one non-missing value and every
 *     non-missing value is a whole number that fits in it; {@link #getType()} otherwise
 */
@Override
public StorageType inferPreciseType() {
  if (inferredType != null) {
    return inferredType;
  }

  int presentCount = 0;
  boolean onlyWholeNumbers = true;
  // The scan ends at the first value that cannot be represented as a 64-bit integer.
  for (int row = 0; row < size && onlyWholeNumbers; row++) {
    if (isMissing.get(row)) {
      continue;
    }

    presentCount++;
    double current = Double.longBitsToDouble(data[row]);
    onlyWholeNumbers = current % 1.0 == 0.0 && IntegerType.INT_64.fits(current);
  }

  // Without any actual number the original type is kept.
  if (onlyWholeNumbers && presentCount > 0) {
    inferredType = IntegerType.INT_64;
  } else {
    inferredType = getType();
  }
  return inferredType;
}
/**
 * Like {@link #inferPreciseType()}, but when the precise type is an integer type the result is
 * narrowed further to the smallest integer type that fits the values.
 *
 * @return the shrunk integer type, or the precise type itself when it is not an integer type
 */
@Override
public StorageType inferPreciseTypeShrunk() {
  final StorageType precise = inferPreciseType();
  if (!(precise instanceof IntegerType)) {
    return precise;
  }
  return findSmallestIntegerTypeThatFits();
}
/**
 * Determines the smallest integer type able to hold the stored values by viewing this storage
 * through a computed long storage and delegating to its shrinking logic.
 *
 * <p>Must only be called once {@code inferredType} has been established to be an integer type.
 */
private StorageType findSmallestIntegerTypeThatFits() {
  assert inferredType instanceof IntegerType;

  // A long-typed view over this storage; each element is converted on access.
  ComputedNullableLongStorage asLongs =
      new ComputedNullableLongStorage(size) {
        @Override
        protected Long computeItem(int idx) {
          if (!DoubleStorage.this.isNa(idx)) {
            double value = DoubleStorage.this.getItem(idx);
            assert value % 1.0 == 0.0
                : "The value " + value + " should be a whole number (guaranteed by checks).";
            return (long) value;
          }
          return null;
        }
      };

  return asLongs.inferPreciseTypeShrunk();
}
} }

View File

@ -11,9 +11,11 @@ import java.time.ZonedDateTime;
/** /**
* Represents an underlying internal storage type that can be mapped to the Value Type that is exposed to users. * Represents an underlying internal storage type that can be mapped to the Value Type that is exposed to users.
*/ */
public sealed interface StorageType permits AnyObjectType, BigIntegerType, BooleanType, DateTimeType, DateType, FloatType, IntegerType, TextType, TimeOfDayType { public sealed interface StorageType permits AnyObjectType, BigIntegerType, BooleanType, DateTimeType, DateType,
FloatType, IntegerType, TextType, TimeOfDayType {
/** /**
* @return the StorageType that represents a given boxed item. * @return the StorageType that represents a given boxed item. This has special handling for floating-point values -
* if they represent a whole number, they will be treated as integers.
*/ */
static StorageType forBoxedItem(Object item) { static StorageType forBoxedItem(Object item) {
if (NumericConverter.isCoercibleToLong(item)) { if (NumericConverter.isCoercibleToLong(item)) {
@ -21,6 +23,11 @@ public sealed interface StorageType permits AnyObjectType, BigIntegerType, Boole
} }
if (NumericConverter.isFloatLike(item)) { if (NumericConverter.isFloatLike(item)) {
double value = NumericConverter.coerceToDouble(item);
if (value % 1.0 == 0.0 && IntegerType.INT_64.fits(value)) {
return IntegerType.INT_64;
}
return FloatType.FLOAT_64; return FloatType.FLOAT_64;
} }
@ -32,7 +39,7 @@ public sealed interface StorageType permits AnyObjectType, BigIntegerType, Boole
case LocalTime t -> TimeOfDayType.INSTANCE; case LocalTime t -> TimeOfDayType.INSTANCE;
case LocalDateTime d -> DateTimeType.INSTANCE; case LocalDateTime d -> DateTimeType.INSTANCE;
case ZonedDateTime d -> DateTimeType.INSTANCE; case ZonedDateTime d -> DateTimeType.INSTANCE;
default -> null; default -> AnyObjectType.INSTANCE;
}; };
} }
} }

View File

@ -3,6 +3,12 @@ package org.enso.table.data.column.storage.type;
import org.enso.base.Text_Utils; import org.enso.base.Text_Utils;
public record TextType(long maxLength, boolean fixedLength) implements StorageType { public record TextType(long maxLength, boolean fixedLength) implements StorageType {
public TextType {
if (maxLength == 0) {
throw new IllegalArgumentException("The maxLength of a text type must be positive or -1 to indicate unlimited length.");
}
}
public static final TextType VARIABLE_LENGTH = new TextType(-1, false); public static final TextType VARIABLE_LENGTH = new TextType(-1, false);
public static TextType fixedLength(long length) { public static TextType fixedLength(long length) {
@ -10,7 +16,7 @@ public record TextType(long maxLength, boolean fixedLength) implements StorageTy
} }
public static TextType variableLengthWithLimit(long maxLength) { public static TextType variableLengthWithLimit(long maxLength) {
assert maxLength >= 0; assert maxLength > 0;
return new TextType(maxLength, false); return new TextType(maxLength, false);
} }
@ -90,6 +96,10 @@ public record TextType(long maxLength, boolean fixedLength) implements StorageTy
boolean bothFixed = type1.fixedLength && type2.fixedLength; boolean bothFixed = type1.fixedLength && type2.fixedLength;
long lengthSum = type1.maxLength + type2.maxLength; long lengthSum = type1.maxLength + type2.maxLength;
if (lengthSum == 0) {
return VARIABLE_LENGTH;
}
return new TextType(lengthSum, bothFixed); return new TextType(lengthSum, bothFixed);
} }
} }

View File

@ -67,7 +67,6 @@ spec setup =
c.value_type.is_text . should_be_true c.value_type.is_text . should_be_true
c.to_vector . should_equal ["{{{MY Type [x=42] }}}", "{{{MY Type [x=X] }}}"] c.to_vector . should_equal ["{{{MY Type [x=42] }}}", "{{{MY Type [x=X] }}}"]
# TODO what to test here?
Test.specify "should allow to cast an integer column to a decimal type" <| Test.specify "should allow to cast an integer column to a decimal type" <|
t = table_builder [["X", [1, 2, 3]]] t = table_builder [["X", [1, 2, 3]]]
c = t.at "X" . cast Value_Type.Decimal c = t.at "X" . cast Value_Type.Decimal
@ -116,6 +115,15 @@ spec setup =
w2 = Problems.expect_warning Conversion_Failure c2 w2 = Problems.expect_warning Conversion_Failure c2
w2.affected_rows_count . should_equal 4 w2.affected_rows_count . should_equal 4
Test.specify "should not allow 0-length Char type" <|
c1 = table_builder [["X", ["a", "", "bcd"]]] . at "X"
r1 = c1.cast (Value_Type.Char size=0 variable_length=False)
r1.should_fail_with Illegal_Argument
r1.catch.to_display_text . should_contain "positive"
r2 = c1.cast (Value_Type.Char size=0 variable_length=True)
r2.should_fail_with Illegal_Argument
Test.group prefix+"Table/Column.cast - numeric" <| Test.group prefix+"Table/Column.cast - numeric" <|
Test.specify "should allow to cast a boolean column to integer" <| Test.specify "should allow to cast a boolean column to integer" <|
t = table_builder [["X", [True, False, True]]] t = table_builder [["X", [True, False, True]]]
@ -531,3 +539,254 @@ spec setup =
r3 = t.parse ["X", "Y"] Value_Type.Integer r3 = t.parse ["X", "Y"] Value_Type.Integer
r3.should_fail_with Missing_Input_Columns r3.should_fail_with Missing_Input_Columns
r3.catch.criteria . should_equal ["Y"] r3.catch.criteria . should_equal ["Y"]
if setup.is_database then Test.group prefix+"Table/Column auto value type" <|
Test.specify "should report unsupported" <|
t = table_builder [["X", [1, 2, 3]]]
t.auto_value_types . should_fail_with Unsupported_Database_Operation
t.at "X" . auto_value_type . should_fail_with Unsupported_Database_Operation
if setup.is_database.not then Test.group prefix+"Table/Column auto value type" <|
Test.specify "should allow to narrow down types of a Mixed column" <|
[True, False].each shrink_types->
mixer = My_Type.Value 1
t0 = table_builder [["strs", [mixer, "a", "b"]], ["ints", [mixer, 2, 3]], ["floats", [mixer, 1.5, 2.5]], ["mix", [1, mixer, "a"]], ["dates", [mixer, Date.new 2022, Date.new 2020]], ["datetimes", [mixer, Date_Time.new 2022 12 30 13 45, Date_Time.new 2020]], ["times", [mixer, Time_Of_Day.new 12 30, Time_Of_Day.new 13 45]], ["mixed_time", [Date.new 2022, Time_Of_Day.new 12 30, Date_Time.new 2019]], ["bools", [mixer, True, False]]]
t1 = t0.drop 1
t1.at "strs" . value_type . should_equal Value_Type.Mixed
t1.at "ints" . value_type . should_equal Value_Type.Mixed
t1.at "floats" . value_type . should_equal Value_Type.Mixed
t1.at "mix" . value_type . should_equal Value_Type.Mixed
t1.at "dates" . value_type . should_equal Value_Type.Mixed
t1.at "datetimes" . value_type . should_equal Value_Type.Mixed
t1.at "times" . value_type . should_equal Value_Type.Mixed
t1.at "mixed_time" . value_type . should_equal Value_Type.Mixed
t1.at "bools" . value_type . should_equal Value_Type.Mixed
t2 = t1.auto_value_types shrink_types=shrink_types
# Depending on shrink_types value the size of the Char/Integer types may vary - exact details tested elsewhere.
t2.at "strs" . value_type . should_be_a (Value_Type.Char ...)
t2.at "ints" . value_type . should_be_a (Value_Type.Integer ...)
t2.at "floats" . value_type . should_equal Value_Type.Float
t2.at "mix" . value_type . should_equal Value_Type.Mixed
t2.at "dates" . value_type . should_equal Value_Type.Date
t2.at "datetimes" . value_type . should_equal Value_Type.Date_Time
t2.at "times" . value_type . should_equal Value_Type.Time
t2.at "mixed_time" . value_type . should_equal Value_Type.Mixed
t2.at "bools" . value_type . should_equal Value_Type.Boolean
Test.specify "will only modify selected columns" <|
mixer = My_Type.Value 1
t0 = table_builder [["strs", [mixer, "a", "b"]], ["ints", [mixer, 2, 3]], ["floats", [mixer, 1.5, 2.5]]]
t1 = t0.drop 1
t2 = t1.auto_value_types []
t2.at "strs" . value_type . should_equal Value_Type.Mixed
t2.at "ints" . value_type . should_equal Value_Type.Mixed
t2.at "floats" . value_type . should_equal Value_Type.Mixed
t3 = t1.auto_value_types ["strs"]
t3.at "strs" . value_type . should_equal Value_Type.Char
t3.at "ints" . value_type . should_equal Value_Type.Mixed
t3.at "floats" . value_type . should_equal Value_Type.Mixed
# should match ints and floats but not strs
t4 = t1.auto_value_types "[if].*".to_regex
t4.at "strs" . value_type . should_equal Value_Type.Mixed
t4.at "ints" . value_type . should_equal Value_Type.Integer
t4.at "floats" . value_type . should_equal Value_Type.Float
Test.specify "will convert a Float column to Integer if all values can be represented as long" <|
t1 = table_builder [["X", [1.0, 2.0, 3.0]], ["Y", [1.0, 2.5, 3.0]], ["Z", [1.0, 2.0, (2.0^100)]]]
t1.at "X" . value_type . should_equal Value_Type.Float
t1.at "Y" . value_type . should_equal Value_Type.Float
t1.at "Z" . value_type . should_equal Value_Type.Float
t2 = t1.auto_value_types shrink_types=False
t2.at "X" . to_vector . should_equal [1, 2, 3]
t2.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
t2.at "Y" . value_type . should_equal Value_Type.Float
## Technically, Z could get converted to Decimal type. But IMO that
is not desirable - at this scale the Float is no longer a
precise type (as not even consecutive integers are exactly
representable). And Decimal is expected to be precise. So such a
conversion should only happen by explicit request, not
automatically.
t2.at "Z" . value_type . should_equal Value_Type.Float
Test.specify "will not parse text columns" <|
t1 = table_builder [["X", ["1", "2", "3"]]]
c2 = t1.at "X" . auto_value_type
c2.value_type . should_equal Value_Type.Char
Test.specify "will 'undo' a cast to Mixed" <|
t1 = table_builder [["X", [1, 2, 3]], ["Y", ["a", "b", "c"]]]
t2 = t1.cast ["X", "Y"] Value_Type.Mixed
t2.at "X" . value_type . should_equal Value_Type.Mixed
t2.at "Y" . value_type . should_equal Value_Type.Mixed
t3 = t2.auto_value_types
t3.at "X" . value_type . should_equal Value_Type.Integer
t3.at "Y" . value_type . should_equal Value_Type.Char
Test.specify "will choose Decimal type if all values are integers but cannot fit long" <|
c0 = table_builder [["X", [My_Type.Value 42, 1, 2, 2^100]]] . at "X"
c1 = c0.drop 1
c1.value_type . should_equal Value_Type.Mixed
c2 = c1.auto_value_type
c2.value_type . should_be_a (Value_Type.Decimal ...)
c2.to_vector . should_equal [1, 2, 2^100]
Test.specify "will try to find the smallest integer type to fit the value (if shrink_types=True)" <|
[False, True].each is_mixed->
prefix = if is_mixed then "mixed" else 0
t0 = table_builder [["X", [prefix, 1, 2, 3]], ["Y", [prefix, 2^20, 2, 3]], ["Z", [prefix, 2^50, 2, 3]], ["F", [prefix, 1.0, 2.0, 3.0]]]
t1 = t0.drop 1
case is_mixed of
True -> t1.at "Z" . value_type . should_equal Value_Type.Mixed
False -> t1.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
case is_mixed of
True -> t1.at "F" . value_type . should_equal Value_Type.Mixed
False -> t1.at "F" . value_type . should_equal Value_Type.Float
t2 = t1.auto_value_types shrink_types=False
t2.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
t2.at "Y" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
t2.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
t2.at "F" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
t3 = t1.auto_value_types shrink_types=True
# Even though X's values are small enough to fit in a Byte, we stick to 16-bit Integers.
t3.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_16)
t3.at "Y" . value_type . should_equal (Value_Type.Integer Bits.Bits_32)
t3.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
# Shrinking Floats also finds the smallest type that fits.
t3.at "F" . value_type . should_equal (Value_Type.Integer Bits.Bits_16)
Test.specify "will not return Byte columns by default, but should leave existing Byte columns intact" <|
c1 = table_builder [["X", [1, 2, 3]]] . at "X" . cast Value_Type.Byte
c1.value_type . should_equal Value_Type.Byte
[True, False].each shrink_types->
c2 = c1.auto_value_type shrink_types=shrink_types
c2.value_type . should_equal Value_Type.Byte
Test.specify "Decimal (scale=0, i.e. integer) columns should also be shrinked if possible and shrink_types=True" <|
t0 = table_builder [["X", [2^100, 1, 2, 3]], ["Y", [10, 20, 2^100, 30]], ["Z", [1, 2, 3, 4]]] . cast "Z" (Value_Type.Decimal scale=0)
t1 = t0.drop 1
t1.at "X" . value_type . should_equal (Value_Type.Decimal scale=0)
t1.at "Y" . value_type . should_equal (Value_Type.Decimal scale=0)
t1.at "Z" . value_type . should_equal (Value_Type.Decimal scale=0)
t2 = t1.auto_value_types shrink_types=False
# Without shrinking we get an integer type, but not the smallest one - just the default 64-bit.
t2.at "X" . to_vector . should_equal [1, 2, 3]
t2.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
t2.at "Y" . value_type . should_equal (Value_Type.Decimal scale=0)
t2.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
t3 = t1.auto_value_types shrink_types=True
t3.at "X" . value_type . should_equal (Value_Type.Integer Bits.Bits_16)
t3.at "Y" . value_type . should_equal (Value_Type.Decimal scale=0)
t3.at "Z" . value_type . should_equal (Value_Type.Integer Bits.Bits_16)
Test.specify "if all text values have the same length, will change the type to fixed-length string (if shrink_types=True)" <|
[False, True].each is_mixed->
prefix = if is_mixed then 42 else "FOOBARBAZ"
c0 = table_builder [["X", [prefix, "aa", "bb", "cc"]]] . at "X"
c1 = c0.drop 1
c1.to_vector . should_equal ["aa", "bb", "cc"]
case is_mixed of
True -> c1.value_type . should_equal Value_Type.Mixed
False -> c1.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True)
c2 = c1.auto_value_type shrink_types=False
c2.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True)
c3 = c1.auto_value_type shrink_types=True
c3.value_type . should_equal (Value_Type.Char size=2 variable_length=False)
c4 = table_builder [["X", ["a", "x", "y"]]] . at "X" . cast (Value_Type.Char size=100 variable_length=True)
c4.to_vector . should_equal ["a", "x", "y"]
c4.value_type . should_equal (Value_Type.Char size=100 variable_length=True)
c5 = c4.auto_value_type shrink_types=False
c5.value_type . should_equal (Value_Type.Char size=100 variable_length=True)
c6 = c4.auto_value_type shrink_types=True
c6.value_type . should_equal (Value_Type.Char size=1 variable_length=False)
Test.specify "if all text values are empty string, the type will remain unchanged" <|
c1 = table_builder [["X", ["", ""]]] . at "X"
c2 = c1.cast (Value_Type.Char size=100 variable_length=True)
c1.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True)
c2.value_type . should_equal (Value_Type.Char size=100 variable_length=True)
[True, False].each shrink_types->
c1_b = c1.auto_value_type shrink_types=shrink_types
c1_b.value_type . should_equal (Value_Type.Char size=Nothing variable_length=True)
c2_b = c2.auto_value_type shrink_types=shrink_types
c2_b.value_type . should_equal (Value_Type.Char size=100 variable_length=True)
Test.specify "if all text values fit under 255 characters, will add a 255 length limit (if shrink_types=True)" <|
t1 = table_builder [["short_unbounded", ["a", "bb", "ccc"]], ["long_unbounded", ["a"*100, "b"*200, "c"*300]]]
t2 = t1 . set (t1.at "short_unbounded" . cast (Value_Type.Char size=1000)) "short_1000" . set (t1.at "short_unbounded" . cast (Value_Type.Char size=10)) "short_10" . set (t1.at "long_unbounded" . cast (Value_Type.Char size=400)) "long_400" . set (t1.at "short_unbounded" . cast Value_Type.Mixed) "short_mixed"
t2.at "short_mixed" . value_type . should_equal Value_Type.Mixed
t3 = t2.auto_value_types shrink_types=False
t3.at "short_unbounded" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True)
t3.at "short_1000" . value_type . should_equal (Value_Type.Char size=1000 variable_length=True)
t3.at "short_10" . value_type . should_equal (Value_Type.Char size=10 variable_length=True)
# Mixed column gets to be text again.
t3.at "short_mixed" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True)
t3.at "long_unbounded" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True)
t3.at "long_400" . value_type . should_equal (Value_Type.Char size=400 variable_length=True)
t4 = t2.auto_value_types shrink_types=True
# Short ones get shortened to 255 unless they were shorter already.
t4.at "short_unbounded" . value_type . should_equal (Value_Type.Char size=255 variable_length=True)
t4.at "short_1000" . value_type . should_equal (Value_Type.Char size=255 variable_length=True)
t4.at "short_10" . value_type . should_equal (Value_Type.Char size=10 variable_length=True)
t4.at "short_mixed" . value_type . should_equal (Value_Type.Char size=255 variable_length=True)
# Long ones cannot fit in 255 so they are kept as-is.
t4.at "long_unbounded" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True)
t4.at "long_400" . value_type . should_equal (Value_Type.Char size=400 variable_length=True)
Test.specify "can deal with all-null columns" <|
t0 = table_builder [["mix", [My_Type.Value 1, Nothing, Nothing]], ["int", [42, Nothing, Nothing]], ["str", ["a", Nothing, Nothing]], ["float", [1.5, Nothing, Nothing]], ["decimal", [2^100, 2^10, 2]]]
t1 = t0.drop 1
t1.at "mix" . value_type . should_equal Value_Type.Mixed
t1.at "int" . value_type . should_equal Value_Type.Integer
t1.at "float" . value_type . should_equal Value_Type.Float
t1.at "str" . value_type . should_equal Value_Type.Char
t1.at "decimal" . value_type . should_equal (Value_Type.Decimal scale=0)
t2 = t1.auto_value_types shrink_types=False
t2.at "mix" . value_type . should_equal Value_Type.Mixed
t2.at "int" . value_type . should_equal Value_Type.Integer
## Technically, if there are no elements, "all of elements" are
whole integers (quantification over empty domain is trivially true).
However, that would be rather not useful, so instead we keep the
original type.
t2.at "float" . value_type . should_equal Value_Type.Float
t1.at "decimal" . value_type . should_equal (Value_Type.Decimal scale=0)
t2.at "str" . value_type . should_equal Value_Type.Char
t3 = t1.auto_value_types shrink_types=True
t3.at "mix" . value_type . should_equal Value_Type.Mixed
# Technically, if there are no elements, then they can be fit inside of the smallest types available:
t3.at "int" . value_type . should_equal (Value_Type.Integer Bits.Bits_16)
t3.at "float" . value_type . should_equal Value_Type.Float
t1.at "decimal" . value_type . should_equal (Value_Type.Decimal scale=0)
# But for Text we make an exception and keep the type unbounded: 0-length fixed length string simply would not make any sense.
t3.at "str" . value_type . should_equal (Value_Type.Char size=Nothing variable_length=True)

View File

@ -125,7 +125,7 @@ spec setup =
k x = if x == 2 then Time_Of_Day.new 13 05 else (x+1).to_text k x = if x == 2 then Time_Of_Day.new 13 05 else (x+1).to_text
r7 = c1.map k expected_value_type=Value_Type.Char r7 = c1.map k expected_value_type=Value_Type.Char
r7.should_fail_with Invalid_Value_Type r7.should_fail_with Invalid_Value_Type
r7.catch.to_display_text . should_contain "Expected type Char (variable length, max_size=Nothing), but got a value 13:05:00 of type Time" r7.catch.to_display_text . should_contain "Expected type Char (variable length, max_size=unlimited), but got a value 13:05:00 of type Time"
l x = if x == 2 then 42 else Date.new 2022 05 x l x = if x == 2 then 42 else Date.new 2022 05 x
r8 = c1.map l expected_value_type=Value_Type.Date r8 = c1.map l expected_value_type=Value_Type.Date

View File

@ -17,7 +17,7 @@ spec =
Value_Type.Float.to_display_text . should_equal "Float (64 bits)" Value_Type.Float.to_display_text . should_equal "Float (64 bits)"
Value_Type.Decimal.to_display_text . should_equal "Decimal (precision=Nothing, scale=Nothing)" Value_Type.Decimal.to_display_text . should_equal "Decimal (precision=Nothing, scale=Nothing)"
Value_Type.Char.to_display_text . should_equal "Char (variable length, max_size=Nothing)" Value_Type.Char.to_display_text . should_equal "Char (variable length, max_size=unlimited)"
(Value_Type.Binary 8 False).to_display_text . should_equal "Binary (fixed length, size=8 bytes)" (Value_Type.Binary 8 False).to_display_text . should_equal "Binary (fixed length, size=8 bytes)"
Value_Type.Date.to_display_text . should_equal "Date" Value_Type.Date.to_display_text . should_equal "Date"

View File

@ -166,10 +166,14 @@ spec =
c8.value_type . should_equal Value_Type.Mixed c8.value_type . should_equal Value_Type.Mixed
c8.to_vector . should_equal ["aaa", 42, Date.new 2022 08 22] c8.to_vector . should_equal ["aaa", 42, Date.new 2022 08 22]
c9 = Column.from_vector "X" [Time_Of_Day.new 10 11 12, Time_Of_Day.new 11 30] Value_Type.Time
c9.value_type . should_equal Value_Type.Time
c9.to_vector . should_equal [Time_Of_Day.new 10 11 12, Time_Of_Day.new 11 30]
Test.specify "will fail if unexpected values are encountered for the requested type" <| Test.specify "will fail if unexpected values are encountered for the requested type" <|
r1 = Column.from_vector "X" ["a", 2] Value_Type.Char r1 = Column.from_vector "X" ["a", 2] Value_Type.Char
r1.should_fail_with Invalid_Value_Type r1.should_fail_with Invalid_Value_Type
r1.catch.to_display_text.should_contain "Expected type Char (variable length, max_size=Nothing), but got a value 2 of type Integer (16 bits)" r1.catch.to_display_text.should_contain "Expected type Char (variable length, max_size=unlimited), but got a value 2 of type Integer (16 bits)"
r2 = Column.from_vector "X" ["aaa", "b"] (Value_Type.Char size=3 variable_length=False) r2 = Column.from_vector "X" ["aaa", "b"] (Value_Type.Char size=3 variable_length=False)
r2.should_fail_with Invalid_Value_Type r2.should_fail_with Invalid_Value_Type
@ -177,7 +181,7 @@ spec =
r3 = Column.from_vector "X" ["aaa", 42] Value_Type.Char r3 = Column.from_vector "X" ["aaa", 42] Value_Type.Char
r3.should_fail_with Invalid_Value_Type r3.should_fail_with Invalid_Value_Type
r3.catch.to_display_text.should_contain "Expected type Char (variable length, max_size=Nothing), but got a value 42 of type Integer (16 bits)" r3.catch.to_display_text.should_contain "Expected type Char (variable length, max_size=unlimited), but got a value 42 of type Integer (16 bits)"
r4 = Column.from_vector "X" [12, Time_Of_Day.new 10 11 12] Value_Type.Integer r4 = Column.from_vector "X" [12, Time_Of_Day.new 10 11 12] Value_Type.Integer
r4.should_fail_with Invalid_Value_Type r4.should_fail_with Invalid_Value_Type
@ -199,6 +203,13 @@ spec =
r8.should_fail_with Invalid_Value_Type r8.should_fail_with Invalid_Value_Type
r8.catch.to_display_text.should_contain "Expected type Byte, but got a value 1000000000 of type Integer (32 bits)" r8.catch.to_display_text.should_contain "Expected type Byte, but got a value 1000000000 of type Integer (32 bits)"
Test.specify "will not allow to construct a column with Char size=0" <|
r1 = Column.from_vector "X" [] (Value_Type.Char size=0 variable_length=False)
r1.should_fail_with Illegal_Argument
r2 = Column.from_vector "X" [] (Value_Type.Char size=0 variable_length=True)
r2.should_fail_with Illegal_Argument
Test.group "Rounding" <| Test.group "Rounding" <|
Test.specify "should be able to round a column of decimals" <| Test.specify "should be able to round a column of decimals" <|
Column.from_vector "foo" [1.2, 2.3, 2.5, 3.6] . round . should_equal (Column.from_vector "round([foo])" [1, 2, 3, 4]) Column.from_vector "foo" [1.2, 2.3, 2.5, 3.6] . round . should_equal (Column.from_vector "round([foo])" [1, 2, 3, 4])