Snowflake Dialect pt. 4 - reading a column of small integers as Integer type, other type mapping tests (#10518)

- Related to #9486
- Ensures that even though an integer column in Snowflake is represented by the `Decimal` type, values that are small enough are materialized as `Integer` (a sketch of the range check follows this list).
- If the values are larger, they are still read in as `Decimal`.
- Adds tests for some other `Decimal` edge cases (various precisions and scales), and for `Float`.
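
The promotion only applies while every value in the column fits in a 64-bit `long`; this mirrors the bounds check used by the new `SnowflakeIntegerColumnMaterializer` further below. A minimal Java sketch of that check (illustrative only, not additional code in this change):

import java.math.BigInteger;

final class IntegerPromotionSketch {
  private static final BigInteger LONG_MIN = BigInteger.valueOf(Long.MIN_VALUE);
  private static final BigInteger LONG_MAX = BigInteger.valueOf(Long.MAX_VALUE);

  // A Snowflake NUMBER(38, 0) value is kept in the lightweight Integer representation
  // only while it is within the 64-bit signed range; the first out-of-range value
  // switches the whole column to Decimal (BigInteger-backed) storage.
  static boolean fitsInLong(BigInteger value) {
    return value.compareTo(LONG_MIN) >= 0 && value.compareTo(LONG_MAX) <= 0;
  }
}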
Radosław Waśko 2024-07-11 22:14:46 +02:00 committed by GitHub
parent 4c0fbf0e19
commit 632355f85b
5 changed files with 217 additions and 9 deletions


@@ -88,14 +88,16 @@ long_fetcher bits =
## PRIVATE
big_integer_fetcher : Column_Fetcher
big_integer_fetcher =
    fetch_value rs i =
        big_decimal = rs.getBigDecimal i
        if rs.wasNull then Nothing else
            big_decimal.toBigIntegerExact
    make_builder initial_size java_problem_aggregator =
        java_builder = Java_Exports.make_biginteger_builder initial_size java_problem_aggregator
        make_builder_from_java_object_builder java_builder
    Column_Fetcher.Value fetch_value make_builder
    Column_Fetcher.Value fetch_big_integer make_builder

## PRIVATE
fetch_big_integer rs i =
    big_decimal = rs.getBigDecimal i
    if rs.wasNull then Nothing else
        big_decimal.toBigIntegerExact
## PRIVATE
big_decimal_fetcher : Column_Fetcher
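
For reference, the extracted `fetch_big_integer` helper follows the usual JDBC pattern for nullable numeric columns; roughly the following, expressed as a Java sketch (illustrative class name, not code from this change):

import java.math.BigDecimal;
import java.math.BigInteger;
import java.sql.ResultSet;
import java.sql.SQLException;

final class BigIntegerFetchSketch {
  // Read column `i` as a BigDecimal, map SQL NULL to null, and convert to an exact
  // BigInteger (toBigIntegerExact throws ArithmeticException if there is a fractional part).
  static BigInteger fetchBigInteger(ResultSet rs, int i) throws SQLException {
    BigDecimal bigDecimal = rs.getBigDecimal(i);
    return rs.wasNull() ? null : bigDecimal.toBigIntegerExact();
  }
}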


@@ -17,6 +17,7 @@ import Standard.Database.SQL_Type.SQL_Type
from Standard.Database.Errors import Unsupported_Database_Operation
polyglot java import java.sql.Types
polyglot java import org.enso.snowflake.SnowflakeIntegerColumnMaterializer
## PRIVATE
type Snowflake_Type_Mapping
@@ -33,8 +34,13 @@ type Snowflake_Type_Mapping
        Value_Type.Decimal precision scale -> case precision of
            # If precision is not set, scale is also lost because SQL is unable to express a scale without a precision.
            Nothing -> SQL_Type.Value Types.DECIMAL "NUMBER" Nothing Nothing
            # Scale can be set or not, if precision is given, so no check needed.
            _ -> SQL_Type.Value Types.DECIMAL "NUMBER" precision scale
            # Scale can be set or not, but if it is set, it must be in range 0-37.
            # If scale or precision is out of range, we fall back to Nothing.
            _ -> if (precision < 1) || (precision > 38) then SQL_Type.Value Types.DECIMAL "NUMBER" Nothing Nothing else
                if scale.is_nothing then SQL_Type.Value Types.DECIMAL "NUMBER" precision Nothing else
                    if (scale < 0) || (scale > 37) then SQL_Type.Value Types.DECIMAL "NUMBER" Nothing Nothing else
                        SQL_Type.Value Types.DECIMAL "NUMBER" precision scale
        Value_Type.Char size _ ->
            # Snowflake does not support fixed length strings, so we use VARCHAR.
            is_unbounded = case size of
@@ -118,6 +124,9 @@ type Snowflake_Type_Mapping
        case value_type of
            Value_Type.Time -> time_fetcher
            Value_Type.Date_Time _ -> date_time_fetcher
            # If we encounter a Decimal column with scale 0 we will try to fetch it as Integer if the values fit.
            Value_Type.Decimal _ 0 -> smart_integer_fetcher
            # Other Decimal columns get the default behaviour - fetched as BigInteger or BigDecimal.
            _ -> Column_Fetcher_Module.default_fetcher_for_value_type value_type
## PRIVATE
@@ -212,6 +221,20 @@ date_time_fetcher =
        Column_Fetcher_Module.make_builder_from_java_object_builder java_builder
    Column_Fetcher.Value fetch_value make_builder
## PRIVATE
   A fetcher for Snowflake Decimal integer columns.

   Integer columns in Snowflake are represented as `NUMBER(38, 0)`, meaning
   there is no separate Integer type.

   In Enso, using `Decimal` values incurs a significant overhead. Thus, when
   fetching such an integer column from Snowflake, we first try to fetch it as
   a lightweight `Integer` and only fall back to `Decimal` if needed.
smart_integer_fetcher =
    make_builder initial_size _ =
        java_builder = SnowflakeIntegerColumnMaterializer.new initial_size
        Column_Fetcher_Module.make_builder_from_java_object_builder java_builder
    Column_Fetcher.Value Column_Fetcher_Module.fetch_big_integer make_builder
## PRIVATE
   The actual SQL type that Snowflake uses for all integer types.
integer_type = SQL_Type.Value Types.DECIMAL "NUMERIC" 38 0
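
The precision/scale handling above boils down to: Snowflake's `NUMBER` accepts a precision of 1-38 and a scale of 0-37, and anything outside those ranges falls back to an unparameterized `NUMBER`. A condensed Java sketch of the same rule (names are illustrative, not part of the change):

final class NumberTypeMappingSketch {
  // Maps a requested Decimal precision/scale to the NUMBER spec sent to Snowflake,
  // falling back to plain NUMBER whenever the parameters cannot be represented.
  static String mapDecimal(Integer precision, Integer scale) {
    if (precision == null) return "NUMBER";                // no precision => the scale is dropped as well
    if (precision < 1 || precision > 38) return "NUMBER";  // precision out of range => fall back
    if (scale == null) return "NUMBER(" + precision + ")"; // precision given, scale unset
    if (scale < 0 || scale > 37) return "NUMBER";          // scale out of range => fall back
    return "NUMBER(" + precision + ", " + scale + ")";
  }
}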


@@ -0,0 +1,158 @@
package org.enso.snowflake;

import java.math.BigInteger;
import java.util.Arrays;
import java.util.BitSet;
import org.enso.table.data.column.builder.Builder;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.numeric.BigIntegerStorage;
import org.enso.table.data.column.storage.numeric.LongStorage;
import org.enso.table.data.column.storage.type.BigIntegerType;
import org.enso.table.data.column.storage.type.IntegerType;
import org.enso.table.data.column.storage.type.StorageType;
import org.enso.table.error.ValueTypeMismatchException;
import org.graalvm.polyglot.Context;

public class SnowflakeIntegerColumnMaterializer extends Builder {
  private static final BigInteger LONG_MIN = BigInteger.valueOf(Long.MIN_VALUE);
  private static final BigInteger LONG_MAX = BigInteger.valueOf(Long.MAX_VALUE);

  // We start in integer mode and will switch to BigInteger mode if we encounter a value that
  // exceeds the range
  private long[] ints;
  private BitSet intsMissing;
  private BigInteger[] bigInts;
  private int currentSize;
  private Mode mode;

  public SnowflakeIntegerColumnMaterializer(int initialCapacity) {
    ints = new long[initialCapacity];
    intsMissing = new BitSet();
    bigInts = null;
    currentSize = 0;
    mode = Mode.LONG;
  }

  private void retypeToBigIntegers() {
    assert mode == Mode.LONG;
    Context context = Context.getCurrent();
    bigInts = new BigInteger[ints.length];
    for (int i = 0; i < currentSize; i++) {
      if (intsMissing.get(i)) {
        bigInts[i] = null;
      } else {
        bigInts[i] = BigInteger.valueOf(ints[i]);
      }
      context.safepoint();
    }
    ints = null;
    intsMissing = null;
    mode = Mode.BIG_INTEGER;
  }

  private boolean fitsInLong(BigInteger bigInteger) {
    return bigInteger.compareTo(LONG_MIN) >= 0 && bigInteger.compareTo(LONG_MAX) <= 0;
  }

  @Override
  public void appendNoGrow(Object o) {
    switch (o) {
      case BigInteger bigInteger -> {
        switch (mode) {
          case BIG_INTEGER -> bigInts[currentSize++] = bigInteger;
          case LONG -> {
            if (fitsInLong(bigInteger)) {
              ints[currentSize++] = bigInteger.longValue();
            } else {
              retypeToBigIntegers();
              bigInts[currentSize++] = bigInteger;
            }
          }
        }
      }
      case null -> appendNulls(1);
      default -> throw new ValueTypeMismatchException(BigIntegerType.INSTANCE, o);
    }
  }

  @Override
  public void append(Object o) {
    if (currentSize >= capacity()) {
      grow();
    }
    appendNoGrow(o);
  }

  @Override
  public void appendNulls(int count) {
    if (mode == Mode.LONG) {
      intsMissing.set(currentSize, currentSize + count);
    }
    currentSize += count;
  }

  @Override
  public void appendBulkStorage(Storage<?> storage) {
    throw new IllegalStateException(
        "SnowflakeIntegerColumnMaterializer.appendBulkStorage: Not supported.");
  }

  @Override
  public int getCurrentSize() {
    return currentSize;
  }

  @Override
  public Storage<?> seal() {
    resize(currentSize);
    return switch (mode) {
      case LONG -> new LongStorage(ints, currentSize, intsMissing, IntegerType.INT_64);
      case BIG_INTEGER -> new BigIntegerStorage(bigInts, currentSize);
    };
  }

  @Override
  public StorageType getType() {
    // The type of the builder can change over time, so we do not report any stable type here.
    // Same as in InferredBuilder.
    return null;
  }

  private int capacity() {
    return mode == Mode.LONG ? ints.length : bigInts.length;
  }

  private void grow() {
    int desiredCapacity = 3;
    if (capacity() > 1) {
      desiredCapacity = (capacity() * 3 / 2);
    }

    // It is possible for the `currentSize` to grow arbitrarily larger than
    // the capacity, because when nulls are being added the array is not
    // resized, only the counter is incremented. Thus, we need to ensure
    // that we have allocated enough space for at least one element.
    if (currentSize >= desiredCapacity) {
      desiredCapacity = currentSize + 1;
    }

    resize(desiredCapacity);
  }

  private void resize(int desiredCapacity) {
    switch (mode) {
      case LONG -> ints = Arrays.copyOf(ints, desiredCapacity);
      case BIG_INTEGER -> bigInts = Arrays.copyOf(bigInts, desiredCapacity);
    }
  }

  private enum Mode {
    LONG,
    BIG_INTEGER
  }
}
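
A minimal usage sketch of the materializer (illustrative only, not part of the change): values that fit in a `long` keep the builder in `LONG` mode, and the first value outside that range triggers the one-time switch to `BigInteger` storage.

package org.enso.snowflake;

import java.math.BigInteger;
import org.enso.table.data.column.storage.Storage;

final class MaterializerUsageSketch {
  static Storage<?> example() {
    SnowflakeIntegerColumnMaterializer builder = new SnowflakeIntegerColumnMaterializer(4);
    builder.append(BigInteger.valueOf(1));   // fits in a long, stays in LONG mode
    builder.append(null);                    // nulls are tracked in a separate BitSet
    builder.append(BigInteger.TWO.pow(100)); // exceeds the long range -> retype to BigInteger
    return builder.seal();                   // yields a BigIntegerStorage in this case
  }
}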


@@ -16,7 +16,6 @@ import org.enso.table.error.ValueTypeMismatchException;
import org.enso.table.problems.ProblemAggregator;
import org.graalvm.polyglot.Context;
// For now the BigInteger builder is just a stub, reusing the ObjectBuilder and adding a warning.
public class BigIntegerBuilder extends TypedBuilderImpl<BigInteger> {
  // The problem aggregator is only used so that when we are retyping, we can pass it on.
  private final ProblemAggregator problemAggregator;


@@ -204,13 +204,39 @@ snowflake_specific_spec suite_builder default_connection db_name setup =
            t1.at "big_ints" . value_type . should_equal (Value_Type.Decimal 38 0)

            in_memory = t1.read
            # But when read back to in-memory, they are inferred as Integer type to avoid the BigInteger overhead
            in_memory.at "small_ints" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
            in_memory.at "big_ints" . value_type . should_equal (Value_Type.Decimal 38 0)
            # Unless the values are actually big, then the Decimal type is kept, but its precision is lost, as in-memory BigInteger does not store it.
            in_memory.at "big_ints" . value_type . should_equal (Value_Type.Decimal Nothing 0)

            # Check correctness of values
            in_memory.at "small_ints" . to_vector . should_equal [1, 2, 3]
            in_memory.at "big_ints" . to_vector . should_equal [2^100, 2^110, 1]

        group_builder.specify "correctly handles Decimal and Float types" <|
            table_name = Name_Generator.random_name "DecimalFloat"
            t1 = default_connection.get.create_table table_name [Column_Description.Value "d1" (Value_Type.Decimal 38 6), Column_Description.Value "d2" (Value_Type.Decimal 10 2), Column_Description.Value "d3" (Value_Type.Decimal 24 -3), Column_Description.Value "f" (Value_Type.Float)] primary_key=[]
            t1.at "d1" . value_type . should_equal (Value_Type.Decimal 38 6)
            t1.at "d2" . value_type . should_equal (Value_Type.Decimal 10 2)
            # Negative scale is not supported, so we fall back to defaults:
            t1.at "d3" . value_type . should_equal (Value_Type.Decimal 38 0)
            t1.at "f" . value_type . should_equal Value_Type.Float

            t1.update_rows (Table.new [["d1", [1.2345678910]], ["d2", [12.3456]], ["d3", [1234567.8910]], ["f", [1.5]]]) update_action=Update_Action.Insert . should_succeed

            m1 = t1.read
            # Currently in-memory does not support precision and scale in Decimals, so they are all changed to Nothing
            m1.at "d1" . value_type . should_equal (Value_Type.Decimal Nothing Nothing)
            m1.at "d2" . value_type . should_equal (Value_Type.Decimal Nothing Nothing)
            # The `d3` column got coerced to `Value_Type.Decimal 38 0`, so given that the value is relatively small, it is now fetched as Integer.
            m1.at "d3" . value_type . should_equal Value_Type.Integer
            m1.at "f" . value_type . should_equal Value_Type.Float

            m1.at "d1" . to_vector . should_equal [Decimal.new "1.234568"]
            m1.at "d2" . to_vector . should_equal [Decimal.new "12.35"]
            m1.at "d3" . to_vector . should_equal [1234568]
            m1.at "f" . to_vector . should_equal [1.5]

    suite_builder.group "[Snowflake] Dialect-specific codegen" group_builder->
        data = Snowflake_Info_Data.setup default_connection