Fix inconsistency when building a Mixed column, fixes to Union (#7919)

- Fixes #7352 by remembering original value types in type inference mode to be able to reconstruct them for Mixed.
   - Added more benchmarks for comparing performance of constructing columns.
- Fixes missing implementations that caused `Table.union` crashing on some type pairs.
- Ensures that `Loss_Of_Integer_Precision` warning is not swallowed when numeric columns are unioned to create a `Float` column.
- Adds test for all of the above cases.
- Allow to output benchmark results to a CSV by setting an environment variable - useful for quickly comparing benchmarks, e.g. in Enso.
This commit is contained in:
Radosław Waśko 2023-10-03 20:33:34 +02:00 committed by GitHub
parent 08cd449a99
commit 0cd446432f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 458 additions and 109 deletions

View File

@ -2390,7 +2390,9 @@ concat_columns column_set all_tables result_type result_row_count on_problems =
_ : Integer -> _ : Integer ->
storage = parent_table.at i . java_column . getStorage storage = parent_table.at i . java_column . getStorage
storage_builder.appendBulkStorage storage storage_builder.appendBulkStorage storage
Column.from_storage column_set.name storage_builder.seal sealed_storage = Java_Problems.unpack_value_with_aggregated_problems on_problems <|
storage_builder.sealWithProblems
Column.from_storage column_set.name sealed_storage
## PRIVATE ## PRIVATE
Conversion method to a Table from a Column. Conversion method to a Table from a Column.

View File

@ -157,7 +157,13 @@ type Bench
## Run the specified set of benchmarks. ## Run the specified set of benchmarks.
run_main self = run_main self =
count = self.total_specs count = self.total_specs
IO.println <| "Found " + count.to_text + " cases to execute" IO.println <| "Found " + count.to_text + " cases to execute (ETA " + self.estimated_runtime.to_display_text + ")"
case Environment.get "ENSO_BENCHMARK_REPORT_PATH" of
Nothing -> Nothing
path ->
line = 'Label,Phase,"Invocations count","Average time (ms)"'
line.write path on_existing_file=Existing_File_Behavior.Backup
self.fold Nothing _-> g-> s-> self.fold Nothing _-> g-> s->
c = g.configuration c = g.configuration
@ -168,10 +174,10 @@ type Bench
## Measure the amount of time it takes to execute a given computation. ## Measure the amount of time it takes to execute a given computation.
Arguments: Arguments:
- act: The action to perform.
- label: A name for the benchmark. - label: A name for the benchmark.
- warmup_conf: Warmup phase configuration. - warmup_conf: Warmup phase configuration.
- measure_conf: Measurement phase configuration. - measure_conf: Measurement phase configuration.
- act: The action to perform.
> Example > Example
Measure a computation called "foo" with an iteration size of 2 and a number Measure a computation called "foo" with an iteration size of 2 and a number
@ -193,8 +199,8 @@ type Bench
IO.println <| "[DRY-RUN] Benchmark '" + label + "' finished in " + fmt_duration_ms + " ms" IO.println <| "[DRY-RUN] Benchmark '" + label + "' finished in " + fmt_duration_ms + " ms"
False -> False ->
measure_start = System.nano_time measure_start = System.nano_time
Bench.run_phase "Warmup" warmup_conf act Bench.run_phase label "Warmup" warmup_conf act
Bench.run_phase "Measurement" measure_conf act Bench.run_phase label "Measurement" measure_conf act
measure_end = System.nano_time measure_end = System.nano_time
measure_duration_ms = (measure_end - measure_start) / 1000000 measure_duration_ms = (measure_end - measure_start) / 1000000
fmt_duration_ms = measure_duration_ms.format "#.###" fmt_duration_ms = measure_duration_ms.format "#.###"
@ -215,11 +221,12 @@ type Bench
so that it is the same as in JMH. so that it is the same as in JMH.
Arguments: Arguments:
- label: A name for the benchmark.
- phase_name: The name of the phase. - phase_name: The name of the phase.
- conf: The phase configuration. - conf: The phase configuration.
- act: Method that should be measured - benchmark body. - act: Method that should be measured - benchmark body.
run_phase : Text -> Phase_Conf -> (Any -> Any) -> Nothing run_phase : Text -> Text -> Phase_Conf -> (Any -> Any) -> Nothing
run_phase (phase_name:Text) (conf:Phase_Conf) ~act = run_phase (label:Text) (phase_name:Text) (conf:Phase_Conf) ~act =
duration_ns = conf.iterations * conf.seconds * 1000000000 duration_ns = conf.iterations * conf.seconds * 1000000000
phase_start = System.nano_time phase_start = System.nano_time
stop_ns = phase_start + duration_ns stop_ns = phase_start + duration_ns
@ -230,14 +237,29 @@ type Bench
durations = go [] phase_start durations = go [] phase_start
sum = durations.reduce (_ + _) sum = durations.reduce (_ + _)
run_iters = durations.length run_iters = durations.length
avg = sum / run_iters avg = (sum / run_iters) / 1000000
fmt = (avg / 1000000).format "#.###"
phase_end = System.nano_time phase_end = System.nano_time
phase_duration_ms = (phase_end - phase_start) / 1000000 phase_duration = Duration.new nanoseconds=(phase_end - phase_start)
IO.println <| phase_name + " duration: " + phase_duration_ms.to_text + " ms" Bench.summarize_phase label phase_name run_iters avg phase_duration
IO.println <| phase_name + " invocations: " + run_iters.to_text
## PRIVATE
This is a very simple implementation of summarizing the benchmark
results.
We may want to improve it later, but it gets the job done to give us
simple summary that can be analysed more easily than logs.
summarize_phase (label:Text) (phase_name:Text) (invocations:Integer) (average_time:Float) (phase_duration:Duration) =
fmt = average_time.format "#.###"
IO.println <| phase_name + " duration: " + (phase_duration.total_milliseconds.format "#.##") + " ms"
IO.println <| phase_name + " invocations: " + invocations.to_text
IO.println <| phase_name + " avg time: " + fmt + " ms" IO.println <| phase_name + " avg time: " + fmt + " ms"
case Environment.get "ENSO_BENCHMARK_REPORT_PATH" of
Nothing -> Nothing
path ->
line = '\n"'+label+'","'+phase_name+'",'+invocations.to_text+','+fmt
line.write path on_existing_file=Existing_File_Behavior.Append
## PRIVATE ## PRIVATE
Checks whether the given name is a valid benchmark name - either group name Checks whether the given name is a valid benchmark name - either group name

View File

@ -4,10 +4,12 @@ import java.math.BigInteger;
import java.util.Arrays; import java.util.Arrays;
import org.enso.base.polyglot.NumericConverter; import org.enso.base.polyglot.NumericConverter;
import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.numeric.AbstractLongStorage;
import org.enso.table.data.column.storage.numeric.BigIntegerStorage; import org.enso.table.data.column.storage.numeric.BigIntegerStorage;
import org.enso.table.data.column.storage.type.AnyObjectType; import org.enso.table.data.column.storage.type.AnyObjectType;
import org.enso.table.data.column.storage.type.BigIntegerType; import org.enso.table.data.column.storage.type.BigIntegerType;
import org.enso.table.data.column.storage.type.FloatType; import org.enso.table.data.column.storage.type.FloatType;
import org.enso.table.data.column.storage.type.IntegerType;
import org.enso.table.data.column.storage.type.StorageType; import org.enso.table.data.column.storage.type.StorageType;
import org.enso.table.error.ValueTypeMismatchException; import org.enso.table.error.ValueTypeMismatchException;
import org.graalvm.polyglot.Context; import org.graalvm.polyglot.Context;
@ -24,8 +26,8 @@ public class BigIntegerBuilder extends TypedBuilderImpl<BigInteger> {
} }
@Override @Override
public void writeTo(Object[] items) { public void retypeToMixed(Object[] items) {
super.writeTo(items); super.retypeToMixed(items);
} }
@Override @Override
@ -36,7 +38,7 @@ public class BigIntegerBuilder extends TypedBuilderImpl<BigInteger> {
@Override @Override
public TypedBuilder retypeTo(StorageType type) { public TypedBuilder retypeTo(StorageType type) {
if (type instanceof FloatType) { if (type instanceof FloatType) {
DoubleBuilder res = NumericBuilder.createDoubleBuilder(currentSize); DoubleBuilder res = NumericBuilder.createInferringDoubleBuilder(currentSize);
for (int i = 0; i < currentSize; i++) { for (int i = 0; i < currentSize; i++) {
if (data[i] == null) { if (data[i] == null) {
res.appendNulls(1); res.appendNulls(1);
@ -98,4 +100,28 @@ public class BigIntegerBuilder extends TypedBuilderImpl<BigInteger> {
} }
return res; return res;
} }
@Override
public void appendBulkStorage(Storage<?> storage) {
if (storage.getType() instanceof IntegerType) {
if (storage instanceof AbstractLongStorage longStorage) {
int n = longStorage.size();
for (int i = 0; i < n; i++) {
if (storage.isNa(i)) {
data[currentSize++] = null;
} else {
long item = longStorage.getItem(i);
data[currentSize++] = BigInteger.valueOf(item);
}
}
} else {
throw new IllegalStateException(
"Unexpected storage implementation for type INTEGER: "
+ storage
+ ". This is a bug in the Table library.");
}
} else {
super.appendBulkStorage(storage);
}
}
} }

View File

@ -95,7 +95,7 @@ public class BoolBuilder extends TypedBuilder {
} }
@Override @Override
public void writeTo(Object[] items) { public void retypeToMixed(Object[] items) {
for (int i = 0; i < size; i++) { for (int i = 0; i < size; i++) {
if (isNa.get(i)) { if (isNa.get(i)) {
items[i] = null; items[i] = null;

View File

@ -5,7 +5,9 @@ import org.enso.table.data.column.operation.cast.ToFloatStorageConverter;
import org.enso.table.data.column.storage.BoolStorage; import org.enso.table.data.column.storage.BoolStorage;
import org.enso.table.data.column.storage.Storage; import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.numeric.AbstractLongStorage; import org.enso.table.data.column.storage.numeric.AbstractLongStorage;
import org.enso.table.data.column.storage.numeric.BigIntegerStorage;
import org.enso.table.data.column.storage.numeric.DoubleStorage; import org.enso.table.data.column.storage.numeric.DoubleStorage;
import org.enso.table.data.column.storage.type.BigIntegerType;
import org.enso.table.data.column.storage.type.BooleanType; import org.enso.table.data.column.storage.type.BooleanType;
import org.enso.table.data.column.storage.type.FloatType; import org.enso.table.data.column.storage.type.FloatType;
import org.enso.table.data.column.storage.type.IntegerType; import org.enso.table.data.column.storage.type.IntegerType;
@ -28,14 +30,10 @@ public class DoubleBuilder extends NumericBuilder {
} }
@Override @Override
public void writeTo(Object[] items) { public void retypeToMixed(Object[] items) {
for (int i = 0; i < currentSize; i++) { throw new IllegalStateException("The DoubleBuilder cannot be retyped to the Mixed type, because it would lose " +
if (isMissing.get(i)) { "type information about integers that were converted to doubles. If recasting is needed, " +
items[i] = null; "InferringDoubleBuilder should be used instead. This error leaking is a bug in the Table library.");
} else {
items[i] = Double.longBitsToDouble(data[i]);
}
}
} }
@Override @Override
@ -53,38 +51,11 @@ public class DoubleBuilder extends NumericBuilder {
return FloatType.FLOAT_64; return FloatType.FLOAT_64;
} }
/**
* Converts the provided LongBuilder to a DoubleBuilder.
*
* <p>The original LongBuilder becomes invalidated after this operation and should no longer be
* used.
*/
static DoubleBuilder retypeFromLongBuilder(LongBuilder longBuilder) {
int currentSize = longBuilder.currentSize;
DoubleBuilder newBuilder = new DoubleBuilder(longBuilder.isMissing, longBuilder.data, currentSize);
// Invalidate the old builder.
longBuilder.data = null;
longBuilder.isMissing = null;
longBuilder.currentSize = -1;
// Translate the data in-place to avoid unnecessary allocations.
for (int i = 0; i < currentSize; i++) {
if (!newBuilder.isMissing.get(i)) {
long currentIntegerValue = newBuilder.data[i];
double convertedFloatValue = newBuilder.convertIntegerToDouble(currentIntegerValue);
newBuilder.data[i] = Double.doubleToRawLongBits(convertedFloatValue);
}
}
return newBuilder;
}
@Override @Override
public void appendNoGrow(Object o) { public void appendNoGrow(Object o) {
if (o == null) { if (o == null) {
isMissing.set(currentSize++); isMissing.set(currentSize++);
} else if (NumericConverter.isFloatLike(o)){ } else if (NumericConverter.isFloatLike(o)) {
double value = NumericConverter.coerceToDouble(o); double value = NumericConverter.coerceToDouble(o);
data[currentSize++] = Double.doubleToRawLongBits(value); data[currentSize++] = Double.doubleToRawLongBits(value);
} else if (NumericConverter.isCoercibleToLong(o)) { } else if (NumericConverter.isCoercibleToLong(o)) {
@ -134,7 +105,25 @@ public class DoubleBuilder extends NumericBuilder {
+ storage + storage
+ ". This is a bug in the Table library."); + ". This is a bug in the Table library.");
} }
} else if (Objects.equals(storage.getType(), BooleanType.INSTANCE)) { } else if (storage.getType() instanceof BigIntegerType) {
if (storage instanceof BigIntegerStorage bigIntegerStorage) {
int n = bigIntegerStorage.size();
for (int i = 0; i < n; i++) {
BigInteger item = bigIntegerStorage.getItem(i);
if (item == null) {
isMissing.set(currentSize++);
} else {
double converted = convertBigIntegerToDouble(item);
data[currentSize++] = Double.doubleToRawLongBits(converted);
}
}
} else {
throw new IllegalStateException(
"Unexpected storage implementation for type BIG INTEGER: "
+ storage
+ ". This is a bug in the Table library.");
}
} else if (storage.getType() instanceof BooleanType) {
if (storage instanceof BoolStorage boolStorage) { if (storage instanceof BoolStorage boolStorage) {
int n = boolStorage.size(); int n = boolStorage.size();
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
@ -201,34 +190,34 @@ public class DoubleBuilder extends NumericBuilder {
* <p> * <p>
* It verifies if the integer can be exactly represented in a double, and if not, it reports a warning. * It verifies if the integer can be exactly represented in a double, and if not, it reports a warning.
*/ */
private double convertIntegerToDouble(long integer) { protected double convertIntegerToDouble(long integer) {
double floatingPointValue = (double) integer; double floatingPointValue = (double) integer;
boolean isLosingPrecision = (long) floatingPointValue != integer; boolean isLosingPrecision = (long) floatingPointValue != integer;
if (isLosingPrecision) { if (isLosingPrecision) {
if (precisionLoss == null) { reportPrecisionLoss(integer, floatingPointValue);
precisionLoss = new LossOfIntegerPrecision(integer, floatingPointValue);
} else {
precisionLoss.incrementAffectedRows();
}
} }
return floatingPointValue; return floatingPointValue;
} }
private double convertBigIntegerToDouble(BigInteger bigInteger) { protected double convertBigIntegerToDouble(BigInteger bigInteger) {
double floatingPointValue = bigInteger.doubleValue(); double floatingPointValue = bigInteger.doubleValue();
BigInteger reconstructed = BigDecimal.valueOf(floatingPointValue).toBigInteger(); BigInteger reconstructed = BigDecimal.valueOf(floatingPointValue).toBigInteger();
boolean isLosingPrecision = !bigInteger.equals(reconstructed); boolean isLosingPrecision = !bigInteger.equals(reconstructed);
if (isLosingPrecision) { if (isLosingPrecision) {
if (precisionLoss == null) { reportPrecisionLoss(bigInteger, floatingPointValue);
precisionLoss = new LossOfIntegerPrecision(bigInteger, floatingPointValue);
} else {
precisionLoss.incrementAffectedRows();
}
} }
return floatingPointValue; return floatingPointValue;
} }
private LossOfIntegerPrecision precisionLoss = null; protected final void reportPrecisionLoss(Number number, double approximation) {
if (precisionLoss == null) {
precisionLoss = new LossOfIntegerPrecision(number, approximation);
} else {
precisionLoss.incrementAffectedRows();
}
}
protected LossOfIntegerPrecision precisionLoss = null;
@Override @Override
public AggregatedProblems getProblems() { public AggregatedProblems getProblems() {

View File

@ -102,7 +102,7 @@ public class InferredBuilder extends Builder {
// In inferred builder, we always default to 64-bits. // In inferred builder, we always default to 64-bits.
currentBuilder = NumericBuilder.createLongBuilder(initialCapacity, IntegerType.INT_64); currentBuilder = NumericBuilder.createLongBuilder(initialCapacity, IntegerType.INT_64);
} else if (NumericConverter.isFloatLike(o)) { } else if (NumericConverter.isFloatLike(o)) {
currentBuilder = NumericBuilder.createDoubleBuilder(initialCapacity); currentBuilder = NumericBuilder.createInferringDoubleBuilder(initialCapacity);
} else if (o instanceof String) { } else if (o instanceof String) {
currentBuilder = new StringBuilder(initialCapacity, TextType.VARIABLE_LENGTH); currentBuilder = new StringBuilder(initialCapacity, TextType.VARIABLE_LENGTH);
} else if (o instanceof BigInteger) { } else if (o instanceof BigInteger) {
@ -156,7 +156,7 @@ public class InferredBuilder extends Builder {
private void retypeToMixed() { private void retypeToMixed() {
ObjectBuilder objectBuilder = new MixedBuilder(initialSize); ObjectBuilder objectBuilder = new MixedBuilder(initialSize);
currentBuilder.writeTo(objectBuilder.getData()); currentBuilder.retypeToMixed(objectBuilder.getData());
objectBuilder.setCurrentSize(currentBuilder.getCurrentSize()); objectBuilder.setCurrentSize(currentBuilder.getCurrentSize());
objectBuilder.setPreExistingProblems(currentBuilder.getProblems()); objectBuilder.setPreExistingProblems(currentBuilder.getProblems());
currentBuilder = objectBuilder; currentBuilder = objectBuilder;

View File

@ -0,0 +1,192 @@
package org.enso.table.data.column.builder;
import org.enso.base.polyglot.NumericConverter;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.error.ValueTypeMismatchException;
import java.math.BigInteger;
import java.util.BitSet;
/**
* A double builder variant that preserves types and can be retyped to Mixed.
*/
public class InferringDoubleBuilder extends DoubleBuilder {
/**
* Converts the provided LongBuilder to a DoubleBuilder.
*
* <p>The original LongBuilder becomes invalidated after this operation and should no longer be
* used.
*/
static InferringDoubleBuilder retypeFromLongBuilder(LongBuilder longBuilder) {
int currentSize = longBuilder.currentSize;
InferringDoubleBuilder newBuilder =
new InferringDoubleBuilder(longBuilder.isMissing, longBuilder.data, currentSize);
// Invalidate the old builder.
longBuilder.data = null;
longBuilder.isMissing = null;
longBuilder.currentSize = -1;
// Assume all longs will be compacted.
newBuilder.isLongCompactedAsDouble.set(0, currentSize, true);
// Translate the data in-place to avoid unnecessary allocations.
for (int i = 0; i < currentSize; i++) {
if (!newBuilder.isMissing.get(i)) {
long currentIntegerValue = newBuilder.data[i];
double convertedFloatValue = (double) currentIntegerValue;
boolean isLossy = currentIntegerValue != (long) convertedFloatValue;
if (isLossy) {
// Save it raw for recovery.
newBuilder.setRaw(i, currentIntegerValue);
newBuilder.reportPrecisionLoss(currentIntegerValue, convertedFloatValue);
// Unmark the long that did not fit:
newBuilder.isLongCompactedAsDouble.set(i, false);
}
newBuilder.data[i] = Double.doubleToRawLongBits(convertedFloatValue);
}
}
return newBuilder;
}
InferringDoubleBuilder(BitSet isMissing, long[] doubleData, int currentSize) {
super(isMissing, doubleData, currentSize);
rawData = null;
isLongCompactedAsDouble = new BitSet();
}
/**
* Stores the raw data as passed to append, in order to be able to reconstruct the original values when retyping to
* mixed.
*/
private Number[] rawData;
/**
* Specifies at which indices we encountered integers that can be reconstructed from double without loss of
* precision.
* <p>
* This is used for reconstructing the original values when retyping to mixed. Integers that are small enough can be
* reconstructed from their double values, so we do not need to store them as `rawData`, instead we only mark that
* they need to be converted back into integers when retyping. This allows us to completely avoid allocating the
* `rawData` array for most practical scenarios when the integers are not too large. The cost of allocating this
* BitSet should be significantly lower than allocating the `rawData` array.
*/
private final BitSet isLongCompactedAsDouble;
@Override
public void retypeToMixed(Object[] items) {
int rawN = rawData == null ? 0 : rawData.length;
for (int i = 0; i < currentSize; i++) {
if (isMissing.get(i)) {
items[i] = null;
} else {
if (isLongCompactedAsDouble.get(i)) {
double value = Double.longBitsToDouble(data[i]);
long reconstructed = (long) value;
items[i] = reconstructed;
} else if (i < rawN && rawData[i] != null) {
items[i] = rawData[i];
} else {
double value = Double.longBitsToDouble(data[i]);
items[i] = value;
}
}
}
// Since we are retyping to Mixed, the precision loss warnings should not be inherited - thus we reset them.
precisionLoss = null;
}
@Override
public void appendBulkStorage(Storage<?> storage) {
throw new UnsupportedOperationException("appendBulkStorage is not supported on InferringDoubleBuilder. " +
"A DoubleBuilder or MixedBuilder should be used instead. This is a bug in the Table library.");
}
@Override
public void appendDouble(double x) {
if (currentSize >= this.data.length) {
grow();
}
data[currentSize] = Double.doubleToRawLongBits(x);
currentSize++;
}
@Override
public void appendLong(long integer) {
if (currentSize >= this.data.length) {
grow();
}
appendLongNoGrow(integer);
}
private void appendLongNoGrow(long integer) {
double convertedFloatValue = (double) integer;
boolean isLossy = integer != (long) convertedFloatValue;
if (isLossy) {
setRaw(currentSize, integer);
reportPrecisionLoss(integer, convertedFloatValue);
} else {
isLongCompactedAsDouble.set(currentSize, true);
}
data[currentSize] = Double.doubleToRawLongBits(convertedFloatValue);
currentSize++;
}
@Override
public void appendBigInteger(BigInteger integer) {
if (currentSize >= this.data.length) {
grow();
}
setRaw(currentSize, integer);
double convertedFloatValue = convertBigIntegerToDouble(integer);
data[currentSize] = Double.doubleToRawLongBits(convertedFloatValue);
currentSize++;
}
@Override
public void appendNoGrow(Object o) {
if (o == null) {
isMissing.set(currentSize++);
} else if (NumericConverter.isFloatLike(o)) {
double value = NumericConverter.coerceToDouble(o);
data[currentSize++] = Double.doubleToRawLongBits(value);
} else if (NumericConverter.isCoercibleToLong(o)) {
long value = NumericConverter.coerceToLong(o);
appendLongNoGrow(value);
} else if (o instanceof BigInteger bigInteger) {
setRaw(currentSize, bigInteger);
double converted = convertBigIntegerToDouble(bigInteger);
data[currentSize++] = Double.doubleToRawLongBits(converted);
} else {
throw new ValueTypeMismatchException(getType(), o);
}
}
@Override
public void appendRawNoGrow(long rawData) {
throw new UnsupportedOperationException("appendRawNoGrow is not supported on InferringDoubleBuilder. " +
"A DoubleBuilder should be used instead. This is a bug in the Table library.");
}
private void setRaw(int ix, Number o) {
if (rawData == null) {
rawData = new Number[ix + 1];
}
if (rawData.length <= ix) {
int newLength = Math.max(rawData.length * 3 / 2 + 1, ix + 1);
Number[] newRawData = new Number[newLength];
System.arraycopy(rawData, 0, newRawData, 0, rawData.length);
rawData = newRawData;
}
rawData[ix] = o;
}
}

View File

@ -35,7 +35,7 @@ public abstract class LongBuilder extends NumericBuilder {
} }
@Override @Override
public void writeTo(Object[] items) { public void retypeToMixed(Object[] items) {
for (int i = 0; i < currentSize; i++) { for (int i = 0; i < currentSize; i++) {
if (isMissing.get(i)) { if (isMissing.get(i)) {
items[i] = null; items[i] = null;
@ -55,7 +55,7 @@ public abstract class LongBuilder extends NumericBuilder {
if (Objects.equals(type, BigIntegerType.INSTANCE)) { if (Objects.equals(type, BigIntegerType.INSTANCE)) {
return BigIntegerBuilder.retypeFromLongBuilder(this); return BigIntegerBuilder.retypeFromLongBuilder(this);
} else if (Objects.equals(type, FloatType.FLOAT_64)) { } else if (Objects.equals(type, FloatType.FLOAT_64)) {
return DoubleBuilder.retypeFromLongBuilder(this); return InferringDoubleBuilder.retypeFromLongBuilder(this);
} else { } else {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }

View File

@ -16,10 +16,19 @@ public abstract class NumericBuilder extends TypedBuilder {
this.currentSize = currentSize; this.currentSize = currentSize;
} }
/**
* Creates a {@link DoubleBuilder} that should be used to create columns of boolean type and are
* not expected to be retyped.
*/
public static DoubleBuilder createDoubleBuilder(int size) { public static DoubleBuilder createDoubleBuilder(int size) {
return new DoubleBuilder(new BitSet(), new long[size], 0); return new DoubleBuilder(new BitSet(), new long[size], 0);
} }
/** Creates a {@link DoubleBuilder} that may be retyped to Mixed type. */
public static DoubleBuilder createInferringDoubleBuilder(int size) {
return new InferringDoubleBuilder(new BitSet(), new long[size], 0);
}
public static LongBuilder createLongBuilder(int size, IntegerType type) { public static LongBuilder createLongBuilder(int size, IntegerType type) {
return LongBuilder.make(size, type); return LongBuilder.make(size, type);
} }

View File

@ -23,7 +23,7 @@ public class ObjectBuilder extends TypedBuilder {
} }
@Override @Override
public void writeTo(Object[] items) { public void retypeToMixed(Object[] items) {
throw new IllegalStateException("Broken invariant: rewriting the most general type."); throw new IllegalStateException("Broken invariant: rewriting the most general type.");
} }

View File

@ -9,7 +9,7 @@ public abstract class TypedBuilder extends Builder {
* *
* @param items the buffer to dump elements into * @param items the buffer to dump elements into
*/ */
public abstract void writeTo(Object[] items); public abstract void retypeToMixed(Object[] items);
/** /**
* Checks if the builder can be efficiently retyped to the given storage type. * Checks if the builder can be efficiently retyped to the given storage type.

View File

@ -18,7 +18,7 @@ public abstract class TypedBuilderImpl<T> extends TypedBuilder {
} }
@Override @Override
public void writeTo(Object[] items) { public void retypeToMixed(Object[] items) {
if (currentSize >= 0) System.arraycopy(data, 0, items, 0, currentSize); if (currentSize >= 0) System.arraycopy(data, 0, items, 0, currentSize);
} }

View File

@ -8,15 +8,41 @@ from Standard.Test import Bench
options = Bench.options . set_warmup (Bench.phase_conf 2 3) . set_measure (Bench.phase_conf 2 3) options = Bench.options . set_warmup (Bench.phase_conf 2 3) . set_measure (Bench.phase_conf 2 3)
type Data type Data
Value ~ints Value ~ints ~floats ~floats_and_small_ints ~floats_and_large_ints
create num_rows = create num_rows =
# 0-argument block to make it lazy # 0-argument block to make it lazy
ints = ints =
Vector.new num_rows i-> Vector.new num_rows i->
i % 1000 i % 1000
Data.Value ints floats =
Vector.new num_rows i->
(i % 1000) + 0.5
floats_and_small_ints =
Vector.new num_rows i->
case i % 2 of
0 -> (i % 1000) + 0.5
1 -> i % 1000
floats_and_large_ints =
Vector.new num_rows i->
case i % 3 of
0 -> (i % 1000) + 0.5
1 -> 2^60 + (i % 100)
2 -> 2^100 + (i % 100)
Data.Value ints floats floats_and_small_ints floats_and_large_ints
## A flag allowing to run some additional benchmarks.
They were used when analyzing performance of an optimization of Mixed
handler. They are disabled on the CI to avoid increasing the time of the
build, but they can be manually re-enabled in case this needs testing in the
future.
When set to `False`, only the core measurements checking the most common
cases are run.
extended_tests = False
collect_benches = Bench.build builder-> collect_benches = Bench.build builder->
num_rows = 1000000 num_rows = 1000000
@ -32,4 +58,18 @@ collect_benches = Bench.build builder->
group_builder.specify "Integers_type_Auto" <| group_builder.specify "Integers_type_Auto" <|
Column.from_vector "Auto" data.ints value_type=Auto Column.from_vector "Auto" data.ints value_type=Auto
group_builder.specify "Floats_type_Auto" <|
Column.from_vector "Auto" data.floats value_type=Auto
group_builder.specify "Floats_type_Float" <|
Column.from_vector "Floats" data.floats value_type=Value_Type.Float
if extended_tests then group_builder.specify "Numeric_Mix_Small_type_Auto" <|
Column.from_vector "Small_Mix" data.floats_and_small_ints value_type=Auto
if extended_tests then group_builder.specify "Numeric_Mix_Small_type_Float" <|
Column.from_vector "Small_Mix" data.floats_and_small_ints value_type=Value_Type.Float
if extended_tests then group_builder.specify "Numeric_Mix_Large_type_Auto" <|
Column.from_vector "Large_Mix" data.floats_and_large_ints value_type=Auto
if extended_tests then group_builder.specify "Numeric_Mix_Large_type_Float" <|
Column.from_vector "Large_Mix" data.floats_and_large_ints value_type=Value_Type.Float
main = collect_benches . run_main main = collect_benches . run_main

View File

@ -426,15 +426,16 @@ spec setup =
c3 = t2.cast "super-mix" Value_Type.Char . at "super-mix" c3 = t2.cast "super-mix" Value_Type.Char . at "super-mix"
c3.value_type . should_equal Value_Type.Char c3.value_type . should_equal Value_Type.Char
date_time_str = Date_Time.new 2023 12 15 19 25 . to_text date_time_str = Date_Time.new 2023 12 15 19 25 . to_text
c3.to_vector . should_equal ["1.0", "2.25", "3", "-45.25", "2.0", "X", "2020-01-01", date_time_str, "12:30:00", "{{{MY Type [x=42] }}}", "True", Nothing, "True"] c3.to_vector . should_equal ["1", "2.25", "3", "-45.25", "2.0", "X", "2020-01-01", date_time_str, "12:30:00", "{{{MY Type [x=42] }}}", "True", Nothing, "True"]
Problems.assume_no_problems c3 Problems.assume_no_problems c3
if setup.test_selection.fixed_length_text_columns then if setup.test_selection.fixed_length_text_columns then
c3_2 = t2.cast "super-mix" (Value_Type.Char size=2) . at "super-mix" c3_2 = t2.cast "super-mix" (Value_Type.Char size=2) . at "super-mix"
c3_2.value_type . should_equal (Value_Type.Char size=2) c3_2.value_type . should_equal (Value_Type.Char size=2)
c3_2.to_vector . should_equal ["1.", "2.", "3", "-4", "2.", "X", "20", "20", "12", "{{", "Tr", Nothing, "Tr"] c3_2.to_vector . should_equal ["1", "2.", "3", "-4", "2.", "X", "20", "20", "12", "{{", "Tr", Nothing, "Tr"]
w3_2 = Problems.expect_warning Conversion_Failure c3_2 w3_2 = Problems.expect_warning Conversion_Failure c3_2
w3_2.affected_rows_count . should_equal 10 w3_2.affected_rows_count . should_equal 9
w3_2.to_display_text . should_contain "have a text representation that does not fit the target type"
c4 = t2.cast "super-mix" Value_Type.Boolean . at "super-mix" c4 = t2.cast "super-mix" Value_Type.Boolean . at "super-mix"
c4.value_type . should_equal Value_Type.Boolean c4.value_type . should_equal Value_Type.Boolean
@ -630,13 +631,13 @@ spec setup =
t3.at "Y" . value_type . should_equal Value_Type.Char t3.at "Y" . value_type . should_equal Value_Type.Char
Test.specify "will choose Decimal type if all values are integers but cannot fit long" <| Test.specify "will choose Decimal type if all values are integers but cannot fit long" <|
c0 = table_builder [["X", [My_Type.Value 42, 1, 2, 2^100]]] . at "X" c0 = table_builder [["X", [My_Type.Value 42, 1, 2, (2^100)+1]]] . at "X"
c1 = c0.drop 1 c1 = c0.drop 1
c1.value_type . should_equal Value_Type.Mixed c1.value_type . should_equal Value_Type.Mixed
c2 = c1.auto_value_type c2 = c1.auto_value_type
c2.value_type . should_be_a (Value_Type.Decimal ...) c2.value_type . should_be_a (Value_Type.Decimal ...)
c2.to_vector . should_equal [1, 2, 2^100] c2.to_vector . should_equal [1, 2, (2^100)+1]
Test.specify "will try to find the smallest integer type to fit the value (if shrink_types=True)" <| Test.specify "will try to find the smallest integer type to fit the value (if shrink_types=True)" <|
[False, True].each is_mixed-> [False, True].each is_mixed->

View File

@ -208,16 +208,31 @@ spec setup =
t3.at "A" . value_type . variable_length . should_be_true t3.at "A" . value_type . variable_length . should_be_true
Test.specify "should find a common type that will fit the merged columns" <| Test.specify "should find a common type that will fit the merged columns" <|
t1 = table_builder [["int+float", [0, 1, 2]]] t1 = table_builder [["A", [0, 1, 2]]]
t2 = table_builder [["int+float", [1.0, 2.0, 2.5]]] t2 = table_builder [["A", [1.0, 2.0, 2.5]]]
t1.at "int+float" . value_type . is_integer . should_be_true t1.at "A" . value_type . is_integer . should_be_true
t2.at "int+float" . value_type . is_floating_point . should_be_true t2.at "A" . value_type . is_floating_point . should_be_true
t3 = t1.union t2 t3 = t1.union t2
expect_column_names ["int+float"] t3 expect_column_names ["A"] t3
t3.at "int+float" . value_type . is_floating_point . should_be_true t3.at "A" . value_type . is_floating_point . should_be_true
t3.at "int+float" . to_vector . should_equal [0, 1, 2, 1.0, 2.0, 2.5] t3.at "A" . to_vector . should_equal [0, 1, 2, 1.0, 2.0, 2.5]
# Specific type tests that apply to in-memory. Database behaviour is up to implementation.
if setup.is_database.not then
t4 = table_builder [["A", [2^100, 2^10, 2]]]
t4.at "A" . value_type . should_be_a (Value_Type.Decimal ...)
t5 = t2.union t4
expect_column_names ["A"] t5
t5.at "A" . value_type . is_floating_point . should_be_true
t5.at "A" . to_vector . should_equal [1.0, 2.0, 2.5, 2^100, 2^10, 2]
t6 = t1.union t4
expect_column_names ["A"] t6
t6.at "A" . value_type . should_be_a (Value_Type.Decimal ...)
t6.at "A" . to_vector . should_equal [0, 1, 2, 2^100, 2^10, 2]
# Database backends are not required to support Mixed types. # Database backends are not required to support Mixed types.
if setup.is_database.not then if setup.is_database.not then
@ -329,6 +344,24 @@ spec setup =
t6.at "X" . value_type . should_equal Value_Type.Mixed t6.at "X" . value_type . should_equal Value_Type.Mixed
t6.at "X" . to_vector . should_equal ["a", 1, Nothing, 1, 1.2, 2.3, 3.4, "a", "b", True, False] t6.at "X" . to_vector . should_equal ["a", 1, Nothing, 1, 1.2, 2.3, 3.4, "a", "b", True, False]
Test.specify "when finding a common type for numeric columns to be Float, any precision loss should be reported" <|
t1 = table_builder [["X", [1, (2^62)-1, 3]]]
t2 = table_builder [["X", [1.5, 2.5, 3.5]]]
t3 = table_builder [["X", [(2^100)+1, 2^10, 2]]]
t1.at "X" . value_type . should_equal Value_Type.Integer
t2.at "X" . value_type . should_equal Value_Type.Float
t3.at "X" . value_type . should_be_a (Value_Type.Decimal ...)
t4 = t2.union [t1, t3] allow_type_widening=True
t4.at "X" . value_type . should_equal Value_Type.Float
# Inexact float equality will make this pass:
t4.at "X" . to_vector . should_equal [1.5, 2.5, 3.5, 1, (2^62)-1, 3, (2^100)+1, 2^10, 2]
w = Problems.expect_only_warning Loss_Of_Integer_Precision t4
# Losing precision on (2^62)-1 and 2^100+1.
w.affected_rows_count . should_equal 2
Test.specify "if type mismatches cause all columns to be dropped, fail with No_Output_Columns" <| Test.specify "if type mismatches cause all columns to be dropped, fail with No_Output_Columns" <|
t1 = table_builder [["A", [1, 2, 3]]] t1 = table_builder [["A", [1, 2, 3]]]
t2 = table_builder [["A", ['x']]] t2 = table_builder [["A", ['x']]]

View File

@ -65,7 +65,7 @@ spec =
test_column.reverse.to_vector . should_equal expected_1.to_vector test_column.reverse.to_vector . should_equal expected_1.to_vector
empty_column.reverse.to_vector . should_equal empty_column.to_vector empty_column.reverse.to_vector . should_equal empty_column.to_vector
Test.specify "should allow to count duplicate value occurences" <| Test.specify "should allow to count duplicate value occurrences" <|
c_1 = Column.from_vector "c 1" [0, 1, 2, 2, 1, 0, 2] c_1 = Column.from_vector "c 1" [0, 1, 2, 2, 1, 0, 2]
c_1.duplicate_count.to_vector.should_equal [0, 0, 0, 1, 1, 1, 2] c_1.duplicate_count.to_vector.should_equal [0, 0, 0, 1, 1, 1, 2]
@ -106,19 +106,48 @@ spec =
c1.at 1 . should_be_a Float c1.at 1 . should_be_a Float
c1.at 0 . is_a Integer . should_be_false c1.at 0 . is_a Integer . should_be_false
Test.specify "will preserve the types if the column is Mixed" <| Test.specify "will preserve the types if the column is Mixed, regardless of ordering" <|
c1 = Column.from_vector "X" ["a", 1, 2.0] run_test vector =
c1.value_type . should_equal Value_Type.Mixed Test.with_clue vector.pretty+": " <|
c1.at 0 . should_be_a Text c = Column.from_vector "X" vector
c1.at 1 . should_be_a Integer c.value_type . should_equal Value_Type.Mixed
c1.at 2 . should_be_a Float c.to_vector . should_equal vector
Problems.assume_no_problems c
Test.specify "will preserve the types if the column is Mixed (2)" pending="ToDo: disabled due to issue #7352" <| # Verify that types of the values are preserved.
c2 = Column.from_vector "X" [1, 2.0, "a"] c.to_vector.zip vector got-> expected->
c2.value_type . should_equal Value_Type.Mixed Test.with_clue "(type of "+got.pretty+") " <|
c2.at 0 . should_be_a Integer (Meta.type_of got) . should_equal (Meta.type_of expected)
c2.at 1 . should_be_a Float got.to_text . should_equal expected.to_text
c2.at 2 . should_be_a Text
## Testing various permutations to ensure that the behaviour is
invariant to the order of the values.
(It used to differ depending on the order.)
big = (2^100)+1
medium = (2^62)-1
run_test ["a", 1, 2.0]
run_test [1, 2.0, "a"]
run_test [2.0, 1, "a"]
run_test [2.5, 1, "a"]
run_test [10, 1.5, "a"]
run_test ["a", 1.5, 10]
run_test [big, 1.5, "a"]
run_test [1.5, big, "a"]
run_test [2, big, 1.5, "a"]
run_test [2, big, 1.5, "a"]
run_test [1.5, big, 2, "a"]
run_test [Nothing, 1.5, medium, 2, "a"]
run_test [medium, 1.5, "a"]
run_test [Nothing, medium, 1.5, "a"]
big_test_vector x y =
a = Vector.fill 40 [x, 1.5, 2, y, x+123, y+123, y+5, 2.5]
b = Vector.fill 100 [Nothing]
c = Vector.fill 500 [2, 1.5, x]
d = Vector.fill 5 [2, "a", 2, 2.5]
(a+b+c+d).flatten
run_test (big_test_vector medium big)
run_test (big_test_vector 123 456)
Test.specify "should allow to set a specific type at construction" <| Test.specify "should allow to set a specific type at construction" <|
c1 = Column.from_vector "X" [1, 2] Value_Type.Float c1 = Column.from_vector "X" [1, 2] Value_Type.Float
@ -203,6 +232,10 @@ spec =
r8.should_fail_with Invalid_Value_Type r8.should_fail_with Invalid_Value_Type
r8.catch.to_display_text.should_contain "Expected type Byte, but got a value 1000000000 of type Integer (32 bits)" r8.catch.to_display_text.should_contain "Expected type Byte, but got a value 1000000000 of type Integer (32 bits)"
r9 = Column.from_vector "X" [100, 1.5] Value_Type.Integer
r9.should_fail_with Invalid_Value_Type
r9.catch.to_display_text.should_contain "Expected type Integer (64 bits), but got a value 1.5 of type Float"
Test.specify "will not allow to construct a column with Char size=0" <| Test.specify "will not allow to construct a column with Char size=0" <|
r1 = Column.from_vector "X" [] (Value_Type.Char size=0 variable_length=False) r1 = Column.from_vector "X" [] (Value_Type.Char size=0 variable_length=False)
r1.should_fail_with Illegal_Argument r1.should_fail_with Illegal_Argument

View File

@ -70,17 +70,13 @@ spec =
w4 = Problems.expect_only_warning Loss_Of_Integer_Precision c4 w4 = Problems.expect_only_warning Loss_Of_Integer_Precision c4
w4.affected_rows_count . should_equal 2 w4.affected_rows_count . should_equal 2
# Until #7352 is fixed, these tests are also useful: # No precision loss should happen if the column is mixed:
# If we start with mixed before encountering large values, no issues occur:
c5 = Column.from_vector "X" ["a", 1.0, x+1] c5 = Column.from_vector "X" ["a", 1.0, x+1]
Problems.assume_no_problems c5 Problems.assume_no_problems c5
c5.to_vector.map .to_text . should_equal (["a", 1.0, x+1].map .to_text) c5.to_vector.map .to_text . should_equal (["a", 1.0, x+1].map .to_text)
# But if we first have floats and ints and only later enter mixed, we again get precision loss:
c6 = Column.from_vector "X" [x+1, 1.0, x+2, "a", x+3] c6 = Column.from_vector "X" [x+1, 1.0, x+2, "a", x+3]
w6 = Problems.expect_only_warning Loss_Of_Integer_Precision c6 Problems.assume_no_problems c6
# The 2 integers that got into DoubleBuilder are affected, the one after upcasting to Mixed is preserved OK. c6.to_vector.map .to_text . should_equal ([x+1, 1.0, x+2, "a", x+3].map .to_text)
w6.affected_rows_count . should_equal 2
c6.to_vector.map .to_text . should_equal ([x+0.0, 1.0, x+0.0, "a", x+3].map .to_text)
# It should also work for from_vector_repeated: # It should also work for from_vector_repeated:
c7 = Column.from_vector_repeated "X" [1.5, x+2, 100] 10 c7 = Column.from_vector_repeated "X" [1.5, x+2, 100] 10

View File

@ -687,9 +687,15 @@ spec =
Test.specify "Should handle parent-less file paths" <| Test.specify "Should handle parent-less file paths" <|
transient = enso_project.data / "transient" transient = enso_project.data / "transient"
f1 = transient / "./test1.txt" f1 = transient / "./test1.txt"
f2 = File.new ("parentlesstest" + Date_Time.now.to_unix_epoch_milliseconds.to_text + ".txt") f2_filename = "parentlesstest" + Date_Time.now.to_unix_epoch_milliseconds.to_text + ".txt"
f2 = File.new f2_filename
Panic.with_finalizer f2.delete <| do_cleanup =
f2.delete_if_exists
f2_bak = File.new (f2_filename + ".bak")
f2_bak.delete_if_exists
Panic.with_finalizer do_cleanup <|
txt = "MY CONTENT" txt = "MY CONTENT"
txt.write f1 txt.write f1
txt.write f2 txt.write f2