Fix inconsistency when building a Mixed column, fixes to Union (#7919)

- Fixes #7352 by remembering original value types in type inference mode to be able to reconstruct them for Mixed.
   - Added more benchmarks for comparing performance of constructing columns.
- Fixes missing implementations that caused `Table.union` crashing on some type pairs.
- Ensures that `Loss_Of_Integer_Precision` warning is not swallowed when numeric columns are unioned to create a `Float` column.
- Adds test for all of the above cases.
- Allow to output benchmark results to a CSV by setting an environment variable - useful for quickly comparing benchmarks, e.g. in Enso.
This commit is contained in:
Radosław Waśko 2023-10-03 20:33:34 +02:00 committed by GitHub
parent 08cd449a99
commit 0cd446432f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 458 additions and 109 deletions

View File

@ -2390,7 +2390,9 @@ concat_columns column_set all_tables result_type result_row_count on_problems =
_ : Integer ->
storage = parent_table.at i . java_column . getStorage
storage_builder.appendBulkStorage storage
Column.from_storage column_set.name storage_builder.seal
sealed_storage = Java_Problems.unpack_value_with_aggregated_problems on_problems <|
storage_builder.sealWithProblems
Column.from_storage column_set.name sealed_storage
## PRIVATE
Conversion method to a Table from a Column.

View File

@ -157,7 +157,13 @@ type Bench
## Run the specified set of benchmarks.
run_main self =
count = self.total_specs
IO.println <| "Found " + count.to_text + " cases to execute"
IO.println <| "Found " + count.to_text + " cases to execute (ETA " + self.estimated_runtime.to_display_text + ")"
case Environment.get "ENSO_BENCHMARK_REPORT_PATH" of
Nothing -> Nothing
path ->
line = 'Label,Phase,"Invocations count","Average time (ms)"'
line.write path on_existing_file=Existing_File_Behavior.Backup
self.fold Nothing _-> g-> s->
c = g.configuration
@ -168,10 +174,10 @@ type Bench
## Measure the amount of time it takes to execute a given computation.
Arguments:
- act: The action to perform.
- label: A name for the benchmark.
- warmup_conf: Warmup phase configuration.
- measure_conf: Measurement phase configuration.
- act: The action to perform.
> Example
Measure a computation called "foo" with an iteration size of 2 and a number
@ -193,8 +199,8 @@ type Bench
IO.println <| "[DRY-RUN] Benchmark '" + label + "' finished in " + fmt_duration_ms + " ms"
False ->
measure_start = System.nano_time
Bench.run_phase "Warmup" warmup_conf act
Bench.run_phase "Measurement" measure_conf act
Bench.run_phase label "Warmup" warmup_conf act
Bench.run_phase label "Measurement" measure_conf act
measure_end = System.nano_time
measure_duration_ms = (measure_end - measure_start) / 1000000
fmt_duration_ms = measure_duration_ms.format "#.###"
@ -215,11 +221,12 @@ type Bench
so that it is the same as in JMH.
Arguments:
- label: A name for the benchmark.
- phase_name: The name of the phase.
- conf: The phase configuration.
- act: Method that should be measured - benchmark body.
run_phase : Text -> Phase_Conf -> (Any -> Any) -> Nothing
run_phase (phase_name:Text) (conf:Phase_Conf) ~act =
run_phase : Text -> Text -> Phase_Conf -> (Any -> Any) -> Nothing
run_phase (label:Text) (phase_name:Text) (conf:Phase_Conf) ~act =
duration_ns = conf.iterations * conf.seconds * 1000000000
phase_start = System.nano_time
stop_ns = phase_start + duration_ns
@ -230,14 +237,29 @@ type Bench
durations = go [] phase_start
sum = durations.reduce (_ + _)
run_iters = durations.length
avg = sum / run_iters
fmt = (avg / 1000000).format "#.###"
avg = (sum / run_iters) / 1000000
phase_end = System.nano_time
phase_duration_ms = (phase_end - phase_start) / 1000000
IO.println <| phase_name + " duration: " + phase_duration_ms.to_text + " ms"
IO.println <| phase_name + " invocations: " + run_iters.to_text
phase_duration = Duration.new nanoseconds=(phase_end - phase_start)
Bench.summarize_phase label phase_name run_iters avg phase_duration
## PRIVATE
This is a very simple implementation of summarizing the benchmark
results.
We may want to improve it later, but it gets the job done to give us
simple summary that can be analysed more easily than logs.
summarize_phase (label:Text) (phase_name:Text) (invocations:Integer) (average_time:Float) (phase_duration:Duration) =
fmt = average_time.format "#.###"
IO.println <| phase_name + " duration: " + (phase_duration.total_milliseconds.format "#.##") + " ms"
IO.println <| phase_name + " invocations: " + invocations.to_text
IO.println <| phase_name + " avg time: " + fmt + " ms"
case Environment.get "ENSO_BENCHMARK_REPORT_PATH" of
Nothing -> Nothing
path ->
line = '\n"'+label+'","'+phase_name+'",'+invocations.to_text+','+fmt
line.write path on_existing_file=Existing_File_Behavior.Append
## PRIVATE
Checks whether the given name is a valid benchmark name - either group name

View File

@ -4,10 +4,12 @@ import java.math.BigInteger;
import java.util.Arrays;
import org.enso.base.polyglot.NumericConverter;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.numeric.AbstractLongStorage;
import org.enso.table.data.column.storage.numeric.BigIntegerStorage;
import org.enso.table.data.column.storage.type.AnyObjectType;
import org.enso.table.data.column.storage.type.BigIntegerType;
import org.enso.table.data.column.storage.type.FloatType;
import org.enso.table.data.column.storage.type.IntegerType;
import org.enso.table.data.column.storage.type.StorageType;
import org.enso.table.error.ValueTypeMismatchException;
import org.graalvm.polyglot.Context;
@ -24,8 +26,8 @@ public class BigIntegerBuilder extends TypedBuilderImpl<BigInteger> {
}
@Override
public void writeTo(Object[] items) {
super.writeTo(items);
public void retypeToMixed(Object[] items) {
super.retypeToMixed(items);
}
@Override
@ -36,7 +38,7 @@ public class BigIntegerBuilder extends TypedBuilderImpl<BigInteger> {
@Override
public TypedBuilder retypeTo(StorageType type) {
if (type instanceof FloatType) {
DoubleBuilder res = NumericBuilder.createDoubleBuilder(currentSize);
DoubleBuilder res = NumericBuilder.createInferringDoubleBuilder(currentSize);
for (int i = 0; i < currentSize; i++) {
if (data[i] == null) {
res.appendNulls(1);
@ -98,4 +100,28 @@ public class BigIntegerBuilder extends TypedBuilderImpl<BigInteger> {
}
return res;
}
@Override
public void appendBulkStorage(Storage<?> storage) {
if (storage.getType() instanceof IntegerType) {
if (storage instanceof AbstractLongStorage longStorage) {
int n = longStorage.size();
for (int i = 0; i < n; i++) {
if (storage.isNa(i)) {
data[currentSize++] = null;
} else {
long item = longStorage.getItem(i);
data[currentSize++] = BigInteger.valueOf(item);
}
}
} else {
throw new IllegalStateException(
"Unexpected storage implementation for type INTEGER: "
+ storage
+ ". This is a bug in the Table library.");
}
} else {
super.appendBulkStorage(storage);
}
}
}

View File

@ -95,7 +95,7 @@ public class BoolBuilder extends TypedBuilder {
}
@Override
public void writeTo(Object[] items) {
public void retypeToMixed(Object[] items) {
for (int i = 0; i < size; i++) {
if (isNa.get(i)) {
items[i] = null;

View File

@ -5,7 +5,9 @@ import org.enso.table.data.column.operation.cast.ToFloatStorageConverter;
import org.enso.table.data.column.storage.BoolStorage;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.numeric.AbstractLongStorage;
import org.enso.table.data.column.storage.numeric.BigIntegerStorage;
import org.enso.table.data.column.storage.numeric.DoubleStorage;
import org.enso.table.data.column.storage.type.BigIntegerType;
import org.enso.table.data.column.storage.type.BooleanType;
import org.enso.table.data.column.storage.type.FloatType;
import org.enso.table.data.column.storage.type.IntegerType;
@ -28,14 +30,10 @@ public class DoubleBuilder extends NumericBuilder {
}
@Override
public void writeTo(Object[] items) {
for (int i = 0; i < currentSize; i++) {
if (isMissing.get(i)) {
items[i] = null;
} else {
items[i] = Double.longBitsToDouble(data[i]);
}
}
public void retypeToMixed(Object[] items) {
throw new IllegalStateException("The DoubleBuilder cannot be retyped to the Mixed type, because it would lose " +
"type information about integers that were converted to doubles. If recasting is needed, " +
"InferringDoubleBuilder should be used instead. This error leaking is a bug in the Table library.");
}
@Override
@ -53,33 +51,6 @@ public class DoubleBuilder extends NumericBuilder {
return FloatType.FLOAT_64;
}
/**
* Converts the provided LongBuilder to a DoubleBuilder.
*
* <p>The original LongBuilder becomes invalidated after this operation and should no longer be
* used.
*/
static DoubleBuilder retypeFromLongBuilder(LongBuilder longBuilder) {
int currentSize = longBuilder.currentSize;
DoubleBuilder newBuilder = new DoubleBuilder(longBuilder.isMissing, longBuilder.data, currentSize);
// Invalidate the old builder.
longBuilder.data = null;
longBuilder.isMissing = null;
longBuilder.currentSize = -1;
// Translate the data in-place to avoid unnecessary allocations.
for (int i = 0; i < currentSize; i++) {
if (!newBuilder.isMissing.get(i)) {
long currentIntegerValue = newBuilder.data[i];
double convertedFloatValue = newBuilder.convertIntegerToDouble(currentIntegerValue);
newBuilder.data[i] = Double.doubleToRawLongBits(convertedFloatValue);
}
}
return newBuilder;
}
@Override
public void appendNoGrow(Object o) {
if (o == null) {
@ -134,7 +105,25 @@ public class DoubleBuilder extends NumericBuilder {
+ storage
+ ". This is a bug in the Table library.");
}
} else if (Objects.equals(storage.getType(), BooleanType.INSTANCE)) {
} else if (storage.getType() instanceof BigIntegerType) {
if (storage instanceof BigIntegerStorage bigIntegerStorage) {
int n = bigIntegerStorage.size();
for (int i = 0; i < n; i++) {
BigInteger item = bigIntegerStorage.getItem(i);
if (item == null) {
isMissing.set(currentSize++);
} else {
double converted = convertBigIntegerToDouble(item);
data[currentSize++] = Double.doubleToRawLongBits(converted);
}
}
} else {
throw new IllegalStateException(
"Unexpected storage implementation for type BIG INTEGER: "
+ storage
+ ". This is a bug in the Table library.");
}
} else if (storage.getType() instanceof BooleanType) {
if (storage instanceof BoolStorage boolStorage) {
int n = boolStorage.size();
for (int i = 0; i < n; i++) {
@ -201,34 +190,34 @@ public class DoubleBuilder extends NumericBuilder {
* <p>
* It verifies if the integer can be exactly represented in a double, and if not, it reports a warning.
*/
private double convertIntegerToDouble(long integer) {
protected double convertIntegerToDouble(long integer) {
double floatingPointValue = (double) integer;
boolean isLosingPrecision = (long) floatingPointValue != integer;
if (isLosingPrecision) {
if (precisionLoss == null) {
precisionLoss = new LossOfIntegerPrecision(integer, floatingPointValue);
} else {
precisionLoss.incrementAffectedRows();
}
reportPrecisionLoss(integer, floatingPointValue);
}
return floatingPointValue;
}
private double convertBigIntegerToDouble(BigInteger bigInteger) {
protected double convertBigIntegerToDouble(BigInteger bigInteger) {
double floatingPointValue = bigInteger.doubleValue();
BigInteger reconstructed = BigDecimal.valueOf(floatingPointValue).toBigInteger();
boolean isLosingPrecision = !bigInteger.equals(reconstructed);
if (isLosingPrecision) {
if (precisionLoss == null) {
precisionLoss = new LossOfIntegerPrecision(bigInteger, floatingPointValue);
} else {
precisionLoss.incrementAffectedRows();
}
reportPrecisionLoss(bigInteger, floatingPointValue);
}
return floatingPointValue;
}
private LossOfIntegerPrecision precisionLoss = null;
protected final void reportPrecisionLoss(Number number, double approximation) {
if (precisionLoss == null) {
precisionLoss = new LossOfIntegerPrecision(number, approximation);
} else {
precisionLoss.incrementAffectedRows();
}
}
protected LossOfIntegerPrecision precisionLoss = null;
@Override
public AggregatedProblems getProblems() {

View File

@ -102,7 +102,7 @@ public class InferredBuilder extends Builder {
// In inferred builder, we always default to 64-bits.
currentBuilder = NumericBuilder.createLongBuilder(initialCapacity, IntegerType.INT_64);
} else if (NumericConverter.isFloatLike(o)) {
currentBuilder = NumericBuilder.createDoubleBuilder(initialCapacity);
currentBuilder = NumericBuilder.createInferringDoubleBuilder(initialCapacity);
} else if (o instanceof String) {
currentBuilder = new StringBuilder(initialCapacity, TextType.VARIABLE_LENGTH);
} else if (o instanceof BigInteger) {
@ -156,7 +156,7 @@ public class InferredBuilder extends Builder {
private void retypeToMixed() {
ObjectBuilder objectBuilder = new MixedBuilder(initialSize);
currentBuilder.writeTo(objectBuilder.getData());
currentBuilder.retypeToMixed(objectBuilder.getData());
objectBuilder.setCurrentSize(currentBuilder.getCurrentSize());
objectBuilder.setPreExistingProblems(currentBuilder.getProblems());
currentBuilder = objectBuilder;

View File

@ -0,0 +1,192 @@
package org.enso.table.data.column.builder;
import org.enso.base.polyglot.NumericConverter;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.error.ValueTypeMismatchException;
import java.math.BigInteger;
import java.util.BitSet;
/**
* A double builder variant that preserves types and can be retyped to Mixed.
*/
public class InferringDoubleBuilder extends DoubleBuilder {
/**
* Converts the provided LongBuilder to a DoubleBuilder.
*
* <p>The original LongBuilder becomes invalidated after this operation and should no longer be
* used.
*/
static InferringDoubleBuilder retypeFromLongBuilder(LongBuilder longBuilder) {
int currentSize = longBuilder.currentSize;
InferringDoubleBuilder newBuilder =
new InferringDoubleBuilder(longBuilder.isMissing, longBuilder.data, currentSize);
// Invalidate the old builder.
longBuilder.data = null;
longBuilder.isMissing = null;
longBuilder.currentSize = -1;
// Assume all longs will be compacted.
newBuilder.isLongCompactedAsDouble.set(0, currentSize, true);
// Translate the data in-place to avoid unnecessary allocations.
for (int i = 0; i < currentSize; i++) {
if (!newBuilder.isMissing.get(i)) {
long currentIntegerValue = newBuilder.data[i];
double convertedFloatValue = (double) currentIntegerValue;
boolean isLossy = currentIntegerValue != (long) convertedFloatValue;
if (isLossy) {
// Save it raw for recovery.
newBuilder.setRaw(i, currentIntegerValue);
newBuilder.reportPrecisionLoss(currentIntegerValue, convertedFloatValue);
// Unmark the long that did not fit:
newBuilder.isLongCompactedAsDouble.set(i, false);
}
newBuilder.data[i] = Double.doubleToRawLongBits(convertedFloatValue);
}
}
return newBuilder;
}
InferringDoubleBuilder(BitSet isMissing, long[] doubleData, int currentSize) {
super(isMissing, doubleData, currentSize);
rawData = null;
isLongCompactedAsDouble = new BitSet();
}
/**
* Stores the raw data as passed to append, in order to be able to reconstruct the original values when retyping to
* mixed.
*/
private Number[] rawData;
/**
* Specifies at which indices we encountered integers that can be reconstructed from double without loss of
* precision.
* <p>
* This is used for reconstructing the original values when retyping to mixed. Integers that are small enough can be
* reconstructed from their double values, so we do not need to store them as `rawData`, instead we only mark that
* they need to be converted back into integers when retyping. This allows us to completely avoid allocating the
* `rawData` array for most practical scenarios when the integers are not too large. The cost of allocating this
* BitSet should be significantly lower than allocating the `rawData` array.
*/
private final BitSet isLongCompactedAsDouble;
@Override
public void retypeToMixed(Object[] items) {
int rawN = rawData == null ? 0 : rawData.length;
for (int i = 0; i < currentSize; i++) {
if (isMissing.get(i)) {
items[i] = null;
} else {
if (isLongCompactedAsDouble.get(i)) {
double value = Double.longBitsToDouble(data[i]);
long reconstructed = (long) value;
items[i] = reconstructed;
} else if (i < rawN && rawData[i] != null) {
items[i] = rawData[i];
} else {
double value = Double.longBitsToDouble(data[i]);
items[i] = value;
}
}
}
// Since we are retyping to Mixed, the precision loss warnings should not be inherited - thus we reset them.
precisionLoss = null;
}
@Override
public void appendBulkStorage(Storage<?> storage) {
throw new UnsupportedOperationException("appendBulkStorage is not supported on InferringDoubleBuilder. " +
"A DoubleBuilder or MixedBuilder should be used instead. This is a bug in the Table library.");
}
@Override
public void appendDouble(double x) {
if (currentSize >= this.data.length) {
grow();
}
data[currentSize] = Double.doubleToRawLongBits(x);
currentSize++;
}
@Override
public void appendLong(long integer) {
if (currentSize >= this.data.length) {
grow();
}
appendLongNoGrow(integer);
}
private void appendLongNoGrow(long integer) {
double convertedFloatValue = (double) integer;
boolean isLossy = integer != (long) convertedFloatValue;
if (isLossy) {
setRaw(currentSize, integer);
reportPrecisionLoss(integer, convertedFloatValue);
} else {
isLongCompactedAsDouble.set(currentSize, true);
}
data[currentSize] = Double.doubleToRawLongBits(convertedFloatValue);
currentSize++;
}
@Override
public void appendBigInteger(BigInteger integer) {
if (currentSize >= this.data.length) {
grow();
}
setRaw(currentSize, integer);
double convertedFloatValue = convertBigIntegerToDouble(integer);
data[currentSize] = Double.doubleToRawLongBits(convertedFloatValue);
currentSize++;
}
@Override
public void appendNoGrow(Object o) {
if (o == null) {
isMissing.set(currentSize++);
} else if (NumericConverter.isFloatLike(o)) {
double value = NumericConverter.coerceToDouble(o);
data[currentSize++] = Double.doubleToRawLongBits(value);
} else if (NumericConverter.isCoercibleToLong(o)) {
long value = NumericConverter.coerceToLong(o);
appendLongNoGrow(value);
} else if (o instanceof BigInteger bigInteger) {
setRaw(currentSize, bigInteger);
double converted = convertBigIntegerToDouble(bigInteger);
data[currentSize++] = Double.doubleToRawLongBits(converted);
} else {
throw new ValueTypeMismatchException(getType(), o);
}
}
@Override
public void appendRawNoGrow(long rawData) {
throw new UnsupportedOperationException("appendRawNoGrow is not supported on InferringDoubleBuilder. " +
"A DoubleBuilder should be used instead. This is a bug in the Table library.");
}
private void setRaw(int ix, Number o) {
if (rawData == null) {
rawData = new Number[ix + 1];
}
if (rawData.length <= ix) {
int newLength = Math.max(rawData.length * 3 / 2 + 1, ix + 1);
Number[] newRawData = new Number[newLength];
System.arraycopy(rawData, 0, newRawData, 0, rawData.length);
rawData = newRawData;
}
rawData[ix] = o;
}
}

View File

@ -35,7 +35,7 @@ public abstract class LongBuilder extends NumericBuilder {
}
@Override
public void writeTo(Object[] items) {
public void retypeToMixed(Object[] items) {
for (int i = 0; i < currentSize; i++) {
if (isMissing.get(i)) {
items[i] = null;
@ -55,7 +55,7 @@ public abstract class LongBuilder extends NumericBuilder {
if (Objects.equals(type, BigIntegerType.INSTANCE)) {
return BigIntegerBuilder.retypeFromLongBuilder(this);
} else if (Objects.equals(type, FloatType.FLOAT_64)) {
return DoubleBuilder.retypeFromLongBuilder(this);
return InferringDoubleBuilder.retypeFromLongBuilder(this);
} else {
throw new UnsupportedOperationException();
}

View File

@ -16,10 +16,19 @@ public abstract class NumericBuilder extends TypedBuilder {
this.currentSize = currentSize;
}
/**
* Creates a {@link DoubleBuilder} that should be used to create columns of boolean type and are
* not expected to be retyped.
*/
public static DoubleBuilder createDoubleBuilder(int size) {
return new DoubleBuilder(new BitSet(), new long[size], 0);
}
/** Creates a {@link DoubleBuilder} that may be retyped to Mixed type. */
public static DoubleBuilder createInferringDoubleBuilder(int size) {
return new InferringDoubleBuilder(new BitSet(), new long[size], 0);
}
public static LongBuilder createLongBuilder(int size, IntegerType type) {
return LongBuilder.make(size, type);
}

View File

@ -23,7 +23,7 @@ public class ObjectBuilder extends TypedBuilder {
}
@Override
public void writeTo(Object[] items) {
public void retypeToMixed(Object[] items) {
throw new IllegalStateException("Broken invariant: rewriting the most general type.");
}

View File

@ -9,7 +9,7 @@ public abstract class TypedBuilder extends Builder {
*
* @param items the buffer to dump elements into
*/
public abstract void writeTo(Object[] items);
public abstract void retypeToMixed(Object[] items);
/**
* Checks if the builder can be efficiently retyped to the given storage type.

View File

@ -18,7 +18,7 @@ public abstract class TypedBuilderImpl<T> extends TypedBuilder {
}
@Override
public void writeTo(Object[] items) {
public void retypeToMixed(Object[] items) {
if (currentSize >= 0) System.arraycopy(data, 0, items, 0, currentSize);
}

View File

@ -8,15 +8,41 @@ from Standard.Test import Bench
options = Bench.options . set_warmup (Bench.phase_conf 2 3) . set_measure (Bench.phase_conf 2 3)
type Data
Value ~ints
Value ~ints ~floats ~floats_and_small_ints ~floats_and_large_ints
create num_rows =
# 0-argument block to make it lazy
ints =
Vector.new num_rows i->
i % 1000
Data.Value ints
floats =
Vector.new num_rows i->
(i % 1000) + 0.5
floats_and_small_ints =
Vector.new num_rows i->
case i % 2 of
0 -> (i % 1000) + 0.5
1 -> i % 1000
floats_and_large_ints =
Vector.new num_rows i->
case i % 3 of
0 -> (i % 1000) + 0.5
1 -> 2^60 + (i % 100)
2 -> 2^100 + (i % 100)
Data.Value ints floats floats_and_small_ints floats_and_large_ints
## A flag allowing to run some additional benchmarks.
They were used when analyzing performance of an optimization of Mixed
handler. They are disabled on the CI to avoid increasing the time of the
build, but they can be manually re-enabled in case this needs testing in the
future.
When set to `False`, only the core measurements checking the most common
cases are run.
extended_tests = False
collect_benches = Bench.build builder->
num_rows = 1000000
@ -32,4 +58,18 @@ collect_benches = Bench.build builder->
group_builder.specify "Integers_type_Auto" <|
Column.from_vector "Auto" data.ints value_type=Auto
group_builder.specify "Floats_type_Auto" <|
Column.from_vector "Auto" data.floats value_type=Auto
group_builder.specify "Floats_type_Float" <|
Column.from_vector "Floats" data.floats value_type=Value_Type.Float
if extended_tests then group_builder.specify "Numeric_Mix_Small_type_Auto" <|
Column.from_vector "Small_Mix" data.floats_and_small_ints value_type=Auto
if extended_tests then group_builder.specify "Numeric_Mix_Small_type_Float" <|
Column.from_vector "Small_Mix" data.floats_and_small_ints value_type=Value_Type.Float
if extended_tests then group_builder.specify "Numeric_Mix_Large_type_Auto" <|
Column.from_vector "Large_Mix" data.floats_and_large_ints value_type=Auto
if extended_tests then group_builder.specify "Numeric_Mix_Large_type_Float" <|
Column.from_vector "Large_Mix" data.floats_and_large_ints value_type=Value_Type.Float
main = collect_benches . run_main

View File

@ -426,15 +426,16 @@ spec setup =
c3 = t2.cast "super-mix" Value_Type.Char . at "super-mix"
c3.value_type . should_equal Value_Type.Char
date_time_str = Date_Time.new 2023 12 15 19 25 . to_text
c3.to_vector . should_equal ["1.0", "2.25", "3", "-45.25", "2.0", "X", "2020-01-01", date_time_str, "12:30:00", "{{{MY Type [x=42] }}}", "True", Nothing, "True"]
c3.to_vector . should_equal ["1", "2.25", "3", "-45.25", "2.0", "X", "2020-01-01", date_time_str, "12:30:00", "{{{MY Type [x=42] }}}", "True", Nothing, "True"]
Problems.assume_no_problems c3
if setup.test_selection.fixed_length_text_columns then
c3_2 = t2.cast "super-mix" (Value_Type.Char size=2) . at "super-mix"
c3_2.value_type . should_equal (Value_Type.Char size=2)
c3_2.to_vector . should_equal ["1.", "2.", "3", "-4", "2.", "X", "20", "20", "12", "{{", "Tr", Nothing, "Tr"]
c3_2.to_vector . should_equal ["1", "2.", "3", "-4", "2.", "X", "20", "20", "12", "{{", "Tr", Nothing, "Tr"]
w3_2 = Problems.expect_warning Conversion_Failure c3_2
w3_2.affected_rows_count . should_equal 10
w3_2.affected_rows_count . should_equal 9
w3_2.to_display_text . should_contain "have a text representation that does not fit the target type"
c4 = t2.cast "super-mix" Value_Type.Boolean . at "super-mix"
c4.value_type . should_equal Value_Type.Boolean
@ -630,13 +631,13 @@ spec setup =
t3.at "Y" . value_type . should_equal Value_Type.Char
Test.specify "will choose Decimal type if all values are integers but cannot fit long" <|
c0 = table_builder [["X", [My_Type.Value 42, 1, 2, 2^100]]] . at "X"
c0 = table_builder [["X", [My_Type.Value 42, 1, 2, (2^100)+1]]] . at "X"
c1 = c0.drop 1
c1.value_type . should_equal Value_Type.Mixed
c2 = c1.auto_value_type
c2.value_type . should_be_a (Value_Type.Decimal ...)
c2.to_vector . should_equal [1, 2, 2^100]
c2.to_vector . should_equal [1, 2, (2^100)+1]
Test.specify "will try to find the smallest integer type to fit the value (if shrink_types=True)" <|
[False, True].each is_mixed->

View File

@ -208,16 +208,31 @@ spec setup =
t3.at "A" . value_type . variable_length . should_be_true
Test.specify "should find a common type that will fit the merged columns" <|
t1 = table_builder [["int+float", [0, 1, 2]]]
t2 = table_builder [["int+float", [1.0, 2.0, 2.5]]]
t1 = table_builder [["A", [0, 1, 2]]]
t2 = table_builder [["A", [1.0, 2.0, 2.5]]]
t1.at "int+float" . value_type . is_integer . should_be_true
t2.at "int+float" . value_type . is_floating_point . should_be_true
t1.at "A" . value_type . is_integer . should_be_true
t2.at "A" . value_type . is_floating_point . should_be_true
t3 = t1.union t2
expect_column_names ["int+float"] t3
t3.at "int+float" . value_type . is_floating_point . should_be_true
t3.at "int+float" . to_vector . should_equal [0, 1, 2, 1.0, 2.0, 2.5]
expect_column_names ["A"] t3
t3.at "A" . value_type . is_floating_point . should_be_true
t3.at "A" . to_vector . should_equal [0, 1, 2, 1.0, 2.0, 2.5]
# Specific type tests that apply to in-memory. Database behaviour is up to implementation.
if setup.is_database.not then
t4 = table_builder [["A", [2^100, 2^10, 2]]]
t4.at "A" . value_type . should_be_a (Value_Type.Decimal ...)
t5 = t2.union t4
expect_column_names ["A"] t5
t5.at "A" . value_type . is_floating_point . should_be_true
t5.at "A" . to_vector . should_equal [1.0, 2.0, 2.5, 2^100, 2^10, 2]
t6 = t1.union t4
expect_column_names ["A"] t6
t6.at "A" . value_type . should_be_a (Value_Type.Decimal ...)
t6.at "A" . to_vector . should_equal [0, 1, 2, 2^100, 2^10, 2]
# Database backends are not required to support Mixed types.
if setup.is_database.not then
@ -329,6 +344,24 @@ spec setup =
t6.at "X" . value_type . should_equal Value_Type.Mixed
t6.at "X" . to_vector . should_equal ["a", 1, Nothing, 1, 1.2, 2.3, 3.4, "a", "b", True, False]
Test.specify "when finding a common type for numeric columns to be Float, any precision loss should be reported" <|
t1 = table_builder [["X", [1, (2^62)-1, 3]]]
t2 = table_builder [["X", [1.5, 2.5, 3.5]]]
t3 = table_builder [["X", [(2^100)+1, 2^10, 2]]]
t1.at "X" . value_type . should_equal Value_Type.Integer
t2.at "X" . value_type . should_equal Value_Type.Float
t3.at "X" . value_type . should_be_a (Value_Type.Decimal ...)
t4 = t2.union [t1, t3] allow_type_widening=True
t4.at "X" . value_type . should_equal Value_Type.Float
# Inexact float equality will make this pass:
t4.at "X" . to_vector . should_equal [1.5, 2.5, 3.5, 1, (2^62)-1, 3, (2^100)+1, 2^10, 2]
w = Problems.expect_only_warning Loss_Of_Integer_Precision t4
# Losing precision on (2^62)-1 and 2^100+1.
w.affected_rows_count . should_equal 2
Test.specify "if type mismatches cause all columns to be dropped, fail with No_Output_Columns" <|
t1 = table_builder [["A", [1, 2, 3]]]
t2 = table_builder [["A", ['x']]]

View File

@ -65,7 +65,7 @@ spec =
test_column.reverse.to_vector . should_equal expected_1.to_vector
empty_column.reverse.to_vector . should_equal empty_column.to_vector
Test.specify "should allow to count duplicate value occurences" <|
Test.specify "should allow to count duplicate value occurrences" <|
c_1 = Column.from_vector "c 1" [0, 1, 2, 2, 1, 0, 2]
c_1.duplicate_count.to_vector.should_equal [0, 0, 0, 1, 1, 1, 2]
@ -106,19 +106,48 @@ spec =
c1.at 1 . should_be_a Float
c1.at 0 . is_a Integer . should_be_false
Test.specify "will preserve the types if the column is Mixed" <|
c1 = Column.from_vector "X" ["a", 1, 2.0]
c1.value_type . should_equal Value_Type.Mixed
c1.at 0 . should_be_a Text
c1.at 1 . should_be_a Integer
c1.at 2 . should_be_a Float
Test.specify "will preserve the types if the column is Mixed, regardless of ordering" <|
run_test vector =
Test.with_clue vector.pretty+": " <|
c = Column.from_vector "X" vector
c.value_type . should_equal Value_Type.Mixed
c.to_vector . should_equal vector
Problems.assume_no_problems c
Test.specify "will preserve the types if the column is Mixed (2)" pending="ToDo: disabled due to issue #7352" <|
c2 = Column.from_vector "X" [1, 2.0, "a"]
c2.value_type . should_equal Value_Type.Mixed
c2.at 0 . should_be_a Integer
c2.at 1 . should_be_a Float
c2.at 2 . should_be_a Text
# Verify that types of the values are preserved.
c.to_vector.zip vector got-> expected->
Test.with_clue "(type of "+got.pretty+") " <|
(Meta.type_of got) . should_equal (Meta.type_of expected)
got.to_text . should_equal expected.to_text
## Testing various permutations to ensure that the behaviour is
invariant to the order of the values.
(It used to differ depending on the order.)
big = (2^100)+1
medium = (2^62)-1
run_test ["a", 1, 2.0]
run_test [1, 2.0, "a"]
run_test [2.0, 1, "a"]
run_test [2.5, 1, "a"]
run_test [10, 1.5, "a"]
run_test ["a", 1.5, 10]
run_test [big, 1.5, "a"]
run_test [1.5, big, "a"]
run_test [2, big, 1.5, "a"]
run_test [2, big, 1.5, "a"]
run_test [1.5, big, 2, "a"]
run_test [Nothing, 1.5, medium, 2, "a"]
run_test [medium, 1.5, "a"]
run_test [Nothing, medium, 1.5, "a"]
big_test_vector x y =
a = Vector.fill 40 [x, 1.5, 2, y, x+123, y+123, y+5, 2.5]
b = Vector.fill 100 [Nothing]
c = Vector.fill 500 [2, 1.5, x]
d = Vector.fill 5 [2, "a", 2, 2.5]
(a+b+c+d).flatten
run_test (big_test_vector medium big)
run_test (big_test_vector 123 456)
Test.specify "should allow to set a specific type at construction" <|
c1 = Column.from_vector "X" [1, 2] Value_Type.Float
@ -203,6 +232,10 @@ spec =
r8.should_fail_with Invalid_Value_Type
r8.catch.to_display_text.should_contain "Expected type Byte, but got a value 1000000000 of type Integer (32 bits)"
r9 = Column.from_vector "X" [100, 1.5] Value_Type.Integer
r9.should_fail_with Invalid_Value_Type
r9.catch.to_display_text.should_contain "Expected type Integer (64 bits), but got a value 1.5 of type Float"
Test.specify "will not allow to construct a column with Char size=0" <|
r1 = Column.from_vector "X" [] (Value_Type.Char size=0 variable_length=False)
r1.should_fail_with Illegal_Argument

View File

@ -70,17 +70,13 @@ spec =
w4 = Problems.expect_only_warning Loss_Of_Integer_Precision c4
w4.affected_rows_count . should_equal 2
# Until #7352 is fixed, these tests are also useful:
# If we start with mixed before encountering large values, no issues occur:
# No precision loss should happen if the column is mixed:
c5 = Column.from_vector "X" ["a", 1.0, x+1]
Problems.assume_no_problems c5
c5.to_vector.map .to_text . should_equal (["a", 1.0, x+1].map .to_text)
# But if we first have floats and ints and only later enter mixed, we again get precision loss:
c6 = Column.from_vector "X" [x+1, 1.0, x+2, "a", x+3]
w6 = Problems.expect_only_warning Loss_Of_Integer_Precision c6
# The 2 integers that got into DoubleBuilder are affected, the one after upcasting to Mixed is preserved OK.
w6.affected_rows_count . should_equal 2
c6.to_vector.map .to_text . should_equal ([x+0.0, 1.0, x+0.0, "a", x+3].map .to_text)
Problems.assume_no_problems c6
c6.to_vector.map .to_text . should_equal ([x+1, 1.0, x+2, "a", x+3].map .to_text)
# It should also work for from_vector_repeated:
c7 = Column.from_vector_repeated "X" [1.5, x+2, 100] 10

View File

@ -687,9 +687,15 @@ spec =
Test.specify "Should handle parent-less file paths" <|
transient = enso_project.data / "transient"
f1 = transient / "./test1.txt"
f2 = File.new ("parentlesstest" + Date_Time.now.to_unix_epoch_milliseconds.to_text + ".txt")
f2_filename = "parentlesstest" + Date_Time.now.to_unix_epoch_milliseconds.to_text + ".txt"
f2 = File.new f2_filename
Panic.with_finalizer f2.delete <|
do_cleanup =
f2.delete_if_exists
f2_bak = File.new (f2_filename + ".bak")
f2_bak.delete_if_exists
Panic.with_finalizer do_cleanup <|
txt = "MY CONTENT"
txt.write f1
txt.write f2