Snowflake Dialect pt. 4 - reading a column of small integers as Integer type, other type mapping tests (#10518)

- Related to #9486
- Ensures that even though an integer column in Snowflake is represented by the `Decimal` type, its values are materialized as `Integer` when they are small enough.
- If the values are larger, they are still read in as `Decimal`.
- Adds tests for some other `Decimal` edge cases (various precisions and scales), and for `Float`.
Commit 632355f85b by Radosław Waśko, 2024-07-11 22:14:46 +02:00, committed via GitHub (parent 4c0fbf0e19).
5 changed files with 217 additions and 9 deletions.


@@ -88,14 +88,16 @@ long_fetcher bits =
 ## PRIVATE
 big_integer_fetcher : Column_Fetcher
 big_integer_fetcher =
-    fetch_value rs i =
-        big_decimal = rs.getBigDecimal i
-        if rs.wasNull then Nothing else
-            big_decimal.toBigIntegerExact
     make_builder initial_size java_problem_aggregator =
         java_builder = Java_Exports.make_biginteger_builder initial_size java_problem_aggregator
         make_builder_from_java_object_builder java_builder
-    Column_Fetcher.Value fetch_value make_builder
+    Column_Fetcher.Value fetch_big_integer make_builder
+
+## PRIVATE
+fetch_big_integer rs i =
+    big_decimal = rs.getBigDecimal i
+    if rs.wasNull then Nothing else
+        big_decimal.toBigIntegerExact

 ## PRIVATE
 big_decimal_fetcher : Column_Fetcher
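
For reference, the JDBC pattern that `fetch_big_integer` wraps looks like this in plain Java. The `java.sql` calls are real APIs; the `readBigInteger` helper itself is a hypothetical name used only for this illustration.

import java.math.BigDecimal;
import java.math.BigInteger;
import java.sql.ResultSet;
import java.sql.SQLException;

final class BigIntegerFetchSketch {
  static BigInteger readBigInteger(ResultSet rs, int columnIndex) throws SQLException {
    BigDecimal bigDecimal = rs.getBigDecimal(columnIndex);
    if (rs.wasNull()) {
      return null; // SQL NULL maps to Nothing in Enso.
    }
    // Lossless for scale-0 NUMBER values; throws ArithmeticException if the
    // value has a nonzero fractional part.
    return bigDecimal.toBigIntegerExact();
  }
}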


@@ -17,6 +17,7 @@ import Standard.Database.SQL_Type.SQL_Type
 from Standard.Database.Errors import Unsupported_Database_Operation

 polyglot java import java.sql.Types
+polyglot java import org.enso.snowflake.SnowflakeIntegerColumnMaterializer

 ## PRIVATE
 type Snowflake_Type_Mapping
@@ -33,8 +34,13 @@ type Snowflake_Type_Mapping
         Value_Type.Decimal precision scale -> case precision of
             # If precision is not set, scale is also lost because SQL is unable to express a scale without a precision.
             Nothing -> SQL_Type.Value Types.DECIMAL "NUMBER" Nothing Nothing
-            # Scale can be set or not, if precision is given, so no check needed.
-            _ -> SQL_Type.Value Types.DECIMAL "NUMBER" precision scale
+            # Scale can be set or not, but if it is set, it must be in range 0-37.
+            # If scale or precision is out of range, we fall back to Nothing.
+            _ -> if (precision < 1) || (precision > 38) then SQL_Type.Value Types.DECIMAL "NUMBER" Nothing Nothing else
+                if scale.is_nothing then SQL_Type.Value Types.DECIMAL "NUMBER" precision Nothing else
+                    if (scale < 0) || (scale > 37) then SQL_Type.Value Types.DECIMAL "NUMBER" Nothing Nothing else
+                        SQL_Type.Value Types.DECIMAL "NUMBER" precision scale
         Value_Type.Char size _ ->
             # Snowflake does not support fixed length strings, so we use VARCHAR.
             is_unbounded = case size of
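
Spelled out in plain Java for clarity, the new clamping rules read roughly as follows. `NumberType` and its `of` method are hypothetical names for this sketch only: Snowflake's NUMBER accepts precision 1..38 and scale 0..37, and anything out of range falls back to an unparametrized NUMBER.

import java.util.Optional;

record NumberType(Optional<Integer> precision, Optional<Integer> scale) {
  static NumberType of(Integer precision, Integer scale) {
    // Without a precision, SQL cannot express a scale either.
    if (precision == null) return new NumberType(Optional.empty(), Optional.empty());
    if (precision < 1 || precision > 38) return new NumberType(Optional.empty(), Optional.empty());
    if (scale == null) return new NumberType(Optional.of(precision), Optional.empty());
    if (scale < 0 || scale > 37) return new NumberType(Optional.empty(), Optional.empty());
    return new NumberType(Optional.of(precision), Optional.of(scale));
  }
}

For example, `NumberType.of(24, -3)` comes back empty on both fields, which is why the `d3` column in the test below is read back as Snowflake's default `NUMBER(38, 0)`.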
@@ -118,6 +124,9 @@ type Snowflake_Type_Mapping
         case value_type of
             Value_Type.Time -> time_fetcher
             Value_Type.Date_Time _ -> date_time_fetcher
+            # If we encounter a Decimal column with scale 0 we will try to fetch it as Integer if the values fit.
+            Value_Type.Decimal _ 0 -> smart_integer_fetcher
+            # Other Decimal columns get the default behaviour - fetched as BigInteger or BigDecimal.
            _ -> Column_Fetcher_Module.default_fetcher_for_value_type value_type

 ## PRIVATE
@@ -212,6 +221,20 @@ date_time_fetcher =
         Column_Fetcher_Module.make_builder_from_java_object_builder java_builder
     Column_Fetcher.Value fetch_value make_builder

+## PRIVATE
+   A fetcher for Snowflake Decimal integer columns.
+
+   Integer columns in Snowflake are represented as `NUMBER(38, 0)`, meaning
+   there is no separate Integer type.
+
+   In Enso, using `Decimal` values incurs a significant overhead. Thus, when
+   fetching such an integer column from Snowflake, we try to first fetch it as
+   a lightweight `Integer` and only fall back to `Decimal` if needed.
+smart_integer_fetcher =
+    make_builder initial_size _ =
+        java_builder = SnowflakeIntegerColumnMaterializer.new initial_size
+        Column_Fetcher_Module.make_builder_from_java_object_builder java_builder
+    Column_Fetcher.Value Column_Fetcher_Module.fetch_big_integer make_builder
+
 ## PRIVATE
    The actual SQL type that Snowflake uses for all integer types.
 integer_type = SQL_Type.Value Types.DECIMAL "NUMERIC" 38 0
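
At the JDBC metadata level, the dispatch condition boils down to the check below. This helper is hypothetical; the real dispatch above works on Enso's `Value_Type`, not on `ResultSetMetaData`.

import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Types;

final class FetcherChoiceSketch {
  static boolean useSmartIntegerFetcher(ResultSetMetaData meta, int columnIndex) throws SQLException {
    // NUMBER(p, 0) columns carry only integers, so they can take the fast path.
    return meta.getColumnType(columnIndex) == Types.DECIMAL && meta.getScale(columnIndex) == 0;
  }
}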


@@ -0,0 +1,158 @@
package org.enso.snowflake;

import java.math.BigInteger;
import java.util.Arrays;
import java.util.BitSet;
import org.enso.table.data.column.builder.Builder;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.numeric.BigIntegerStorage;
import org.enso.table.data.column.storage.numeric.LongStorage;
import org.enso.table.data.column.storage.type.BigIntegerType;
import org.enso.table.data.column.storage.type.IntegerType;
import org.enso.table.data.column.storage.type.StorageType;
import org.enso.table.error.ValueTypeMismatchException;
import org.graalvm.polyglot.Context;

public class SnowflakeIntegerColumnMaterializer extends Builder {
  private static final BigInteger LONG_MIN = BigInteger.valueOf(Long.MIN_VALUE);
  private static final BigInteger LONG_MAX = BigInteger.valueOf(Long.MAX_VALUE);

  // We start in integer mode and will switch to BigInteger mode if we encounter a value that
  // exceeds the range
  private long[] ints;
  private BitSet intsMissing;
  private BigInteger[] bigInts;
  private int currentSize;
  private Mode mode;

  public SnowflakeIntegerColumnMaterializer(int initialCapacity) {
    ints = new long[initialCapacity];
    intsMissing = new BitSet();
    bigInts = null;
    currentSize = 0;
    mode = Mode.LONG;
  }

  private void retypeToBigIntegers() {
    assert mode == Mode.LONG;
    Context context = Context.getCurrent();
    bigInts = new BigInteger[ints.length];
    for (int i = 0; i < currentSize; i++) {
      if (intsMissing.get(i)) {
        bigInts[i] = null;
      } else {
        bigInts[i] = BigInteger.valueOf(ints[i]);
      }
      context.safepoint();
    }

    ints = null;
    intsMissing = null;
    mode = Mode.BIG_INTEGER;
  }

  private boolean fitsInLong(BigInteger bigInteger) {
    return bigInteger.compareTo(LONG_MIN) >= 0 && bigInteger.compareTo(LONG_MAX) <= 0;
  }

  @Override
  public void appendNoGrow(Object o) {
    switch (o) {
      case BigInteger bigInteger -> {
        switch (mode) {
          case BIG_INTEGER -> bigInts[currentSize++] = bigInteger;
          case LONG -> {
            if (fitsInLong(bigInteger)) {
              ints[currentSize++] = bigInteger.longValue();
            } else {
              retypeToBigIntegers();
              bigInts[currentSize++] = bigInteger;
            }
          }
        }
      }
      case null -> appendNulls(1);
      default -> throw new ValueTypeMismatchException(BigIntegerType.INSTANCE, o);
    }
  }

  @Override
  public void append(Object o) {
    if (currentSize >= capacity()) {
      grow();
    }

    appendNoGrow(o);
  }

  @Override
  public void appendNulls(int count) {
    if (mode == Mode.LONG) {
      intsMissing.set(currentSize, currentSize + count);
    }
    currentSize += count;
  }

  @Override
  public void appendBulkStorage(Storage<?> storage) {
    throw new IllegalStateException(
        "SnowflakeIntegerColumnMaterializer.appendBulkStorage: Not supported.");
  }

  @Override
  public int getCurrentSize() {
    return currentSize;
  }

  @Override
  public Storage<?> seal() {
    resize(currentSize);
    return switch (mode) {
      case LONG -> new LongStorage(ints, currentSize, intsMissing, IntegerType.INT_64);
      case BIG_INTEGER -> new BigIntegerStorage(bigInts, currentSize);
    };
  }

  @Override
  public StorageType getType() {
    // The type of the builder can change over time, so we do not report any stable type here.
    // Same as in InferredBuilder.
    return null;
  }

  private int capacity() {
    return mode == Mode.LONG ? ints.length : bigInts.length;
  }

  private void grow() {
    int desiredCapacity = 3;
    if (capacity() > 1) {
      desiredCapacity = (capacity() * 3 / 2);
    }

    // It is possible for the `currentSize` to grow arbitrarily larger than
    // the capacity, because when nulls are being added the array is not
    // resized, only the counter is incremented. Thus, we need to ensure
    // that we have allocated enough space for at least one element.
    if (currentSize >= desiredCapacity) {
      desiredCapacity = currentSize + 1;
    }

    resize(desiredCapacity);
  }

  private void resize(int desiredCapacity) {
    switch (mode) {
      case LONG -> ints = Arrays.copyOf(ints, desiredCapacity);
      case BIG_INTEGER -> bigInts = Arrays.copyOf(bigInts, desiredCapacity);
    }
  }

  private enum Mode {
    LONG,
    BIG_INTEGER
  }
}
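
A minimal usage sketch of the materializer (hypothetical test code): it assumes the Enso table classes are on the classpath and that a GraalVM polyglot context is current, because the promotion path calls `Context.getCurrent()` for safepoints.

import java.math.BigInteger;
import org.enso.snowflake.SnowflakeIntegerColumnMaterializer;
import org.enso.table.data.column.storage.Storage;

final class MaterializerSketch {
  static Storage<?> demo() {
    var builder = new SnowflakeIntegerColumnMaterializer(3);
    builder.append(BigInteger.valueOf(1)); // fits in a long: stays in LONG mode
    builder.append(null); // nulls are tracked in the BitSet while in LONG mode
    builder.append(BigInteger.TWO.pow(100)); // exceeds the long range: promotes all values to BigInteger
    return builder.seal(); // here: a BigIntegerStorage
  }
}

If all appended values fit in 64 bits, `seal()` instead returns a plain `LongStorage`, which is what makes small Snowflake integers come back as `Integer` in Enso.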


@@ -16,7 +16,6 @@ import org.enso.table.error.ValueTypeMismatchException;
 import org.enso.table.problems.ProblemAggregator;
 import org.graalvm.polyglot.Context;

-// For now the BigInteger builder is just a stub, reusing the ObjectBuilder and adding a warning.
 public class BigIntegerBuilder extends TypedBuilderImpl<BigInteger> {
   // The problem aggregator is only used so that when we are retyping, we can pass it on.
   private final ProblemAggregator problemAggregator;


@@ -204,13 +204,39 @@ snowflake_specific_spec suite_builder default_connection db_name setup =
             t1.at "big_ints" . value_type . should_equal (Value_Type.Decimal 38 0)

             in_memory = t1.read
+            # But when read back to in-memory, they are inferred as Integer type to avoid the BigInteger overhead.
             in_memory.at "small_ints" . value_type . should_equal (Value_Type.Integer Bits.Bits_64)
-            in_memory.at "big_ints" . value_type . should_equal (Value_Type.Decimal 38 0)
+            # Unless the values are actually big - then the Decimal type is kept, but its precision is lost, as in-memory BigInteger does not store it.
+            in_memory.at "big_ints" . value_type . should_equal (Value_Type.Decimal Nothing 0)

             # Check correctness of values
             in_memory.at "small_ints" . to_vector . should_equal [1, 2, 3]
             in_memory.at "big_ints" . to_vector . should_equal [2^100, 2^110, 1]

+        group_builder.specify "correctly handles Decimal and Float types" <|
+            table_name = Name_Generator.random_name "DecimalFloat"
+            t1 = default_connection.get.create_table table_name [Column_Description.Value "d1" (Value_Type.Decimal 38 6), Column_Description.Value "d2" (Value_Type.Decimal 10 2), Column_Description.Value "d3" (Value_Type.Decimal 24 -3), Column_Description.Value "f" (Value_Type.Float)] primary_key=[]
+            t1.at "d1" . value_type . should_equal (Value_Type.Decimal 38 6)
+            t1.at "d2" . value_type . should_equal (Value_Type.Decimal 10 2)
+            # Negative scale is not supported, so we fall back to defaults:
+            t1.at "d3" . value_type . should_equal (Value_Type.Decimal 38 0)
+            t1.at "f" . value_type . should_equal Value_Type.Float
+
+            t1.update_rows (Table.new [["d1", [1.2345678910]], ["d2", [12.3456]], ["d3", [1234567.8910]], ["f", [1.5]]]) update_action=Update_Action.Insert . should_succeed
+
+            m1 = t1.read
+            # Currently in-memory does not support precision and scale in Decimals, so they are all changed to Nothing.
+            m1.at "d1" . value_type . should_equal (Value_Type.Decimal Nothing Nothing)
+            m1.at "d2" . value_type . should_equal (Value_Type.Decimal Nothing Nothing)
+            # The `d3` column got coerced to `Value_Type.Decimal 38 0`, so given that the value is relatively small, it is now fetched as Integer.
+            m1.at "d3" . value_type . should_equal Value_Type.Integer
+            m1.at "f" . value_type . should_equal Value_Type.Float
+
+            m1.at "d1" . to_vector . should_equal [Decimal.new "1.234568"]
+            m1.at "d2" . to_vector . should_equal [Decimal.new "12.35"]
+            m1.at "d3" . to_vector . should_equal [1234568]
+            m1.at "f" . to_vector . should_equal [1.5]
+
     suite_builder.group "[Snowflake] Dialect-specific codegen" group_builder->
         data = Snowflake_Info_Data.setup default_connection
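
The expected read-back values follow from the declared scales: Snowflake stores each inserted value at the column's scale, which behaves like `BigDecimal.setScale` with half-up rounding. The rounding mode is an assumption for illustration here; Snowflake's documentation is authoritative.

import java.math.BigDecimal;
import java.math.RoundingMode;

final class ScaleRoundingSketch {
  public static void main(String[] args) {
    // Mirrors the inserted test values and the expected read-back results.
    System.out.println(new BigDecimal("1.2345678910").setScale(6, RoundingMode.HALF_UP)); // 1.234568
    System.out.println(new BigDecimal("12.3456").setScale(2, RoundingMode.HALF_UP)); // 12.35
    System.out.println(new BigDecimal("1234567.8910").setScale(0, RoundingMode.HALF_UP)); // 1234568
  }
}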