Snowflake Dialect pt. 4 - reading a column of small integers as Integer type, other type mapping tests (#10518)

- Related to #9486
- Ensures that even though an integer column in Snowflake is represented by the `Decimal` type, values that are small enough are materialized as `Integer` (a sketch of the range check follows this list).
- If the values are larger, they are still read in as `Decimal`.
- Adds tests for some other `Decimal` edge cases (various precisions and scales), and for `Float`.
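
The promotion only applies while every value in the column fits in a 64-bit `long`; this mirrors the bounds check used by the new `SnowflakeIntegerColumnMaterializer` further below. A minimal Java sketch of that check (illustrative only, not additional code in this change):

import java.math.BigInteger;

final class IntegerPromotionSketch {
  private static final BigInteger LONG_MIN = BigInteger.valueOf(Long.MIN_VALUE);
  private static final BigInteger LONG_MAX = BigInteger.valueOf(Long.MAX_VALUE);

  // A Snowflake NUMBER(38, 0) value is kept in the lightweight Integer representation
  // only while it is within the 64-bit signed range; the first out-of-range value
  // switches the whole column to Decimal (BigInteger-backed) storage.
  static boolean fitsInLong(BigInteger value) {
    return value.compareTo(LONG_MIN) >= 0 && value.compareTo(LONG_MAX) <= 0;
  }
}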
Radosław Waśko 2024-07-11 22:14:46 +02:00 committed by GitHub
parent 4c0fbf0e19
commit 632355f85b
5 changed files with 217 additions and 9 deletions


@@ -88,14 +88,16 @@ long_fetcher bits =
## PRIVATE
big_integer_fetcher : Column_Fetcher
big_integer_fetcher =
    fetch_value rs i =
        big_decimal = rs.getBigDecimal i
        if rs.wasNull then Nothing else
            big_decimal.toBigIntegerExact
    make_builder initial_size java_problem_aggregator =
        java_builder = Java_Exports.make_biginteger_builder initial_size java_problem_aggregator
        make_builder_from_java_object_builder java_builder
    Column_Fetcher.Value fetch_value make_builder
    Column_Fetcher.Value fetch_big_integer make_builder

## PRIVATE
fetch_big_integer rs i =
    big_decimal = rs.getBigDecimal i
    if rs.wasNull then Nothing else
        big_decimal.toBigIntegerExact
## PRIVATE
big_decimal_fetcher : Column_Fetcher
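
For reference, the extracted `fetch_big_integer` helper follows the usual JDBC pattern for nullable numeric columns; roughly the following, expressed as a Java sketch (illustrative class name, not code from this change):

import java.math.BigDecimal;
import java.math.BigInteger;
import java.sql.ResultSet;
import java.sql.SQLException;

final class BigIntegerFetchSketch {
  // Read column `i` as a BigDecimal, map SQL NULL to null, and convert to an exact
  // BigInteger (toBigIntegerExact throws ArithmeticException if there is a fractional part).
  static BigInteger fetchBigInteger(ResultSet rs, int i) throws SQLException {
    BigDecimal bigDecimal = rs.getBigDecimal(i);
    return rs.wasNull() ? null : bigDecimal.toBigIntegerExact();
  }
}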


@@ -17,6 +17,7 @@ import Standard.Database.SQL_Type.SQL_Type
from Standard.Database.Errors import Unsupported_Database_Operation
polyglot java import java.sql.Types
polyglot java import org.enso.snowflake.SnowflakeIntegerColumnMaterializer
## PRIVATE
type Snowflake_Type_Mapping
@@ -33,8 +34,13 @@ type Snowflake_Type_Mapping
        Value_Type.Decimal precision scale -> case precision of
            # If precision is not set, scale is also lost because SQL is unable to express a scale without a precision.
            Nothing -> SQL_Type.Value Types.DECIMAL "NUMBER" Nothing Nothing
            # Scale can be set or not, if precision is given, so no check needed.
            _ -> SQL_Type.Value Types.DECIMAL "NUMBER" precision scale
            # Scale can be set or not, but if it is set, it must be in range 0-37.
            # If scale or precision is out of range, we fall back to Nothing.
            _ -> if (precision < 1) || (precision > 38) then SQL_Type.Value Types.DECIMAL "NUMBER" Nothing Nothing else
                if scale.is_nothing then SQL_Type.Value Types.DECIMAL "NUMBER" precision Nothing else
                    if (scale < 0) || (scale > 37) then SQL_Type.Value Types.DECIMAL "NUMBER" Nothing Nothing else
                        SQL_Type.Value Types.DECIMAL "NUMBER" precision scale
        Value_Type.Char size _ ->
            # Snowflake does not support fixed length strings, so we use VARCHAR.
            is_unbounded = case size of
@@ -118,6 +124,9 @@ type Snowflake_Type_Mapping
        case value_type of
            Value_Type.Time -> time_fetcher
            Value_Type.Date_Time _ -> date_time_fetcher
            # If we encounter a Decimal column with scale 0 we will try to fetch it as Integer if the values fit.
            Value_Type.Decimal _ 0 -> smart_integer_fetcher
            # Other Decimal columns get the default behaviour - fetched as BigInteger or BigDecimal.
            _ -> Column_Fetcher_Module.default_fetcher_for_value_type value_type
## PRIVATE
@@ -212,6 +221,20 @@ date_time_fetcher =
        Column_Fetcher_Module.make_builder_from_java_object_builder java_builder
    Column_Fetcher.Value fetch_value make_builder
## PRIVATE
   A fetcher for Snowflake Decimal integer columns.

   Integer columns in Snowflake are represented as `NUMBER(38, 0)`, meaning
   there is no separate Integer type.

   In Enso, using `Decimal` values incurs a significant overhead. Thus, when
   fetching such an integer column from Snowflake, we first try to fetch it as
   a lightweight `Integer` and only fall back to `Decimal` if needed.
smart_integer_fetcher =
    make_builder initial_size _ =
        java_builder = SnowflakeIntegerColumnMaterializer.new initial_size
        Column_Fetcher_Module.make_builder_from_java_object_builder java_builder
    Column_Fetcher.Value Column_Fetcher_Module.fetch_big_integer make_builder
## PRIVATE
   The actual SQL type that Snowflake uses for all integer types.
integer_type = SQL_Type.Value Types.DECIMAL "NUMERIC" 38 0
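
The precision/scale handling above boils down to: Snowflake's `NUMBER` accepts a precision of 1-38 and a scale of 0-37, and anything outside those ranges falls back to an unparameterized `NUMBER`. A condensed Java sketch of the same rule (names are illustrative, not part of the change):

final class NumberTypeMappingSketch {
  // Maps a requested Decimal precision/scale to the NUMBER spec sent to Snowflake,
  // falling back to plain NUMBER whenever the parameters cannot be represented.
  static String mapDecimal(Integer precision, Integer scale) {
    if (precision == null) return "NUMBER";                // no precision => the scale is dropped as well
    if (precision < 1 || precision > 38) return "NUMBER";  // precision out of range => fall back
    if (scale == null) return "NUMBER(" + precision + ")"; // precision given, scale unset
    if (scale < 0 || scale > 37) return "NUMBER";          // scale out of range => fall back
    return "NUMBER(" + precision + ", " + scale + ")";
  }
}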


@@ -0,0 +1,158 @@
package org.enso.snowflake;

import java.math.BigInteger;
import java.util.Arrays;
import java.util.BitSet;
import org.enso.table.data.column.builder.Builder;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.numeric.BigIntegerStorage;
import org.enso.table.data.column.storage.numeric.LongStorage;
import org.enso.table.data.column.storage.type.BigIntegerType;
import org.enso.table.data.column.storage.type.IntegerType;
import org.enso.table.data.column.storage.type.StorageType;
import org.enso.table.error.ValueTypeMismatchException;
import org.graalvm.polyglot.Context;

public class SnowflakeIntegerColumnMaterializer extends Builder {
  private static final BigInteger LONG_MIN = BigInteger.valueOf(Long.MIN_VALUE);
  private static final BigInteger LONG_MAX = BigInteger.valueOf(Long.MAX_VALUE);

  // We start in integer mode and will switch to BigInteger mode if we encounter a value that
  // exceeds the range
  private long[] ints;
  private BitSet intsMissing;
  private BigInteger[] bigInts;
  private int currentSize;
  private Mode mode;

  public SnowflakeIntegerColumnMaterializer(int initialCapacity) {
    ints = new long[initialCapacity];
    intsMissing = new BitSet();
    bigInts = null;
    currentSize = 0;
    mode = Mode.LONG;
  }

  private void retypeToBigIntegers() {
    assert mode == Mode.LONG;
    Context context = Context.getCurrent();
    bigInts = new BigInteger[ints.length];
    for (int i = 0; i < currentSize; i++) {
      if (intsMissing.get(i)) {
        bigInts[i] = null;
      } else {
        bigInts[i] = BigInteger.valueOf(ints[i]);
      }
      context.safepoint();
    }
    ints = null;
    intsMissing = null;
    mode = Mode.BIG_INTEGER;
  }

  private boolean fitsInLong(BigInteger bigInteger) {
    return bigInteger.compareTo(LONG_MIN) >= 0 && bigInteger.compareTo(LONG_MAX) <= 0;
  }

  @Override
  public void appendNoGrow(Object o) {
    switch (o) {
      case BigInteger bigInteger -> {
        switch (mode) {
          case BIG_INTEGER -> bigInts[currentSize++] = bigInteger;
          case LONG -> {
            if (fitsInLong(bigInteger)) {
              ints[currentSize++] = bigInteger.longValue();
            } else {
              retypeToBigIntegers();
              bigInts[currentSize++] = bigInteger;
            }
          }
        }
      }
      case null -> appendNulls(1);
      default -> throw new ValueTypeMismatchException(BigIntegerType.INSTANCE, o);
    }
  }

  @Override
  public void append(Object o) {
    if (currentSize >= capacity()) {
      grow();
    }
    appendNoGrow(o);
  }

  @Override
  public void appendNulls(int count) {
    if (mode == Mode.LONG) {
      intsMissing.set(currentSize, currentSize + count);
    }
    currentSize += count;
  }

  @Override
  public void appendBulkStorage(Storage<?> storage) {
    throw new IllegalStateException(
        "SnowflakeIntegerColumnMaterializer.appendBulkStorage: Not supported.");
  }

  @Override
  public int getCurrentSize() {
    return currentSize;
  }

  @Override
  public Storage<?> seal() {
    resize(currentSize);
    return switch (mode) {
      case LONG -> new LongStorage(ints, currentSize, intsMissing, IntegerType.INT_64);
      case BIG_INTEGER -> new BigIntegerStorage(bigInts, currentSize);
    };
  }

  @Override
  public StorageType getType() {
    // The type of the builder can change over time, so we do not report any stable type here.
    // Same as in InferredBuilder.
    return null;
  }

  private int capacity() {
    return mode == Mode.LONG ? ints.length : bigInts.length;
  }

  private void grow() {
    int desiredCapacity = 3;
    if (capacity() > 1) {
      desiredCapacity = (capacity() * 3 / 2);
    }

    // It is possible for the `currentSize` to grow arbitrarily larger than
    // the capacity, because when nulls are being added the array is not
    // resized, only the counter is incremented. Thus, we need to ensure
    // that we have allocated enough space for at least one element.
    if (currentSize >= desiredCapacity) {
      desiredCapacity = currentSize + 1;
    }

    resize(desiredCapacity);
  }

  private void resize(int desiredCapacity) {
    switch (mode) {
      case LONG -> ints = Arrays.copyOf(ints, desiredCapacity);
      case BIG_INTEGER -> bigInts = Arrays.copyOf(bigInts, desiredCapacity);
    }
  }

  private enum Mode {
    LONG,
    BIG_INTEGER
  }
}
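
A minimal usage sketch of the materializer (illustrative only, not part of the change): values that fit in a `long` keep the builder in `LONG` mode, and the first value outside that range triggers the one-time switch to `BigInteger` storage.

package org.enso.snowflake;

import java.math.BigInteger;
import org.enso.table.data.column.storage.Storage;

final class MaterializerUsageSketch {
  static Storage<?> example() {
    SnowflakeIntegerColumnMaterializer builder = new SnowflakeIntegerColumnMaterializer(4);
    builder.append(BigInteger.valueOf(1));   // fits in a long, stays in LONG mode
    builder.append(null);                    // nulls are tracked in a separate BitSet
    builder.append(BigInteger.TWO.pow(100)); // exceeds the long range -> retype to BigInteger
    return builder.seal();                   // yields a BigIntegerStorage in this case
  }
}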


@@ -16,7 +16,6 @@ import org.enso.table.error.ValueTypeMismatchException;
import org.enso.table.problems.ProblemAggregator;
import org.graalvm.polyglot.Context;
// For now the BigInteger builder is just a stub, reusing the ObjectBuilder and adding a warning.
public class BigIntegerBuilder extends TypedBuilderImpl<BigInteger> {
  // The problem aggregator is only used so that when we are retyping, we can pass it on.
  private final ProblemAggregator problemAggregator;


@@ -204,13 +204,39 @@ snowflake_specific_spec suite_builder default_connection db_name setup =
            t1.at "big_ints" . value_type . should_equal (Value_Type.Decimal 38 0)

            in_memory = t1.read
            # But when read back to in-memory, they are inferred as Integer type to avoid the BigInteger overhead
            in_memory.at "small_ints" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
            in_memory.at "big_ints" . value_type . should_equal (Value_Type.Decimal 38 0)
            # Unless the values are actually big, then the Decimal type is kept, but its precision is lost, as in-memory BigInteger does not store it.
            in_memory.at "big_ints" . value_type . should_equal (Value_Type.Decimal Nothing 0)

            # Check correctness of values
            in_memory.at "small_ints" . to_vector . should_equal [1, 2, 3]
            in_memory.at "big_ints" . to_vector . should_equal [2^100, 2^110, 1]

        group_builder.specify "correctly handles Decimal and Float types" <|
            table_name = Name_Generator.random_name "DecimalFloat"
            t1 = default_connection.get.create_table table_name [Column_Description.Value "d1" (Value_Type.Decimal 38 6), Column_Description.Value "d2" (Value_Type.Decimal 10 2), Column_Description.Value "d3" (Value_Type.Decimal 24 -3), Column_Description.Value "f" (Value_Type.Float)] primary_key=[]
            t1.at "d1" . value_type . should_equal (Value_Type.Decimal 38 6)
            t1.at "d2" . value_type . should_equal (Value_Type.Decimal 10 2)
            # Negative scale is not supported, so we fall back to defaults:
            t1.at "d3" . value_type . should_equal (Value_Type.Decimal 38 0)
            t1.at "f" . value_type . should_equal Value_Type.Float

            t1.update_rows (Table.new [["d1", [1.2345678910]], ["d2", [12.3456]], ["d3", [1234567.8910]], ["f", [1.5]]]) update_action=Update_Action.Insert . should_succeed

            m1 = t1.read
            # Currently in-memory does not support precision and scale in Decimals, so they are all changed to Nothing
            m1.at "d1" . value_type . should_equal (Value_Type.Decimal Nothing Nothing)
            m1.at "d2" . value_type . should_equal (Value_Type.Decimal Nothing Nothing)
            # The `d3` column got coerced to `Value_Type.Decimal 38 0`, so given that the value is relatively small, it is now fetched as Integer.
            m1.at "d3" . value_type . should_equal Value_Type.Integer
            m1.at "f" . value_type . should_equal Value_Type.Float

            m1.at "d1" . to_vector . should_equal [Decimal.new "1.234568"]
            m1.at "d2" . to_vector . should_equal [Decimal.new "12.35"]
            m1.at "d3" . to_vector . should_equal [1234568]
            m1.at "f" . to_vector . should_equal [1.5]

    suite_builder.group "[Snowflake] Dialect-specific codegen" group_builder->
        data = Snowflake_Info_Data.setup default_connection