Make In Memory Table Aggregator types more specific where possible (#3679)

Many aggregation types fell back to the general `Any` type even where they could have used the type of the input column. For example, `First` of a column of integers is guaranteed to fit the `Integer` storage type, so it does not have to fall back to `Any`. This PR fixes that and adds a test that checks the storage types of the aggregated columns.
This commit is contained in:
Radosław Waśko 2022-09-05 11:17:41 +02:00 committed by GitHub
parent 1e3b9a3624
commit eafba079d9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 51 additions and 19 deletions

View File

@ -27,7 +27,7 @@ public class First extends Aggregator {
Column[] orderByColumns,
Long[] orderByDirections,
Comparator<Object> objectComparator) {
super(name, Storage.Type.OBJECT);
super(name, column.getStorage().getType());
this.storage = column.getStorage();
this.orderByColumns =
orderByColumns == null

View File

@ -10,7 +10,7 @@ public class GroupBy extends Aggregator {
private final Storage storage;
public GroupBy(String name, Column column) {
super(name, Storage.Type.OBJECT);
super(name, column.getStorage().getType());
storage = column.getStorage();
}

View File

@ -26,7 +26,7 @@ public class Last extends Aggregator {
Column[] orderByColumns,
Long[] orderByDirections,
Comparator<Object> objectComparator) {
super(name, Storage.Type.OBJECT);
super(name, column.getStorage().getType());
this.storage = column.getStorage();
this.orderByColumns =
orderByColumns == null

View File

@ -23,7 +23,7 @@ public class MinOrMax extends Aggregator {
* @param minOrMax <0 for minimum, >0 for maximum
*/
public MinOrMax(String name, Column column, int minOrMax, Comparator<Object> objectComparator) {
super(name, Storage.Type.OBJECT);
super(name, column.getStorage().getType());
this.storage = column.getStorage();
this.minOrMax = Integer.signum(minOrMax);
this.objectComparator = objectComparator;

View File

@ -13,7 +13,7 @@ public class Mode extends Aggregator {
private final Storage storage;
public Mode(String name, Column column) {
super(name, Storage.Type.OBJECT);
super(name, column.getStorage().getType());
this.storage = column.getStorage();
}

View File

@ -39,7 +39,7 @@ public class BoolStorage extends Storage {
}
@Override
public long getType() {
public int getType() {
return Type.BOOL;
}

View File

@ -30,7 +30,7 @@ public class DateStorage extends SpecializedStorage<LocalDate> {
}
@Override
public long getType() {
public int getType() {
return Type.DATE;
}
}

View File

@ -30,7 +30,7 @@ public class DateTimeStorage extends SpecializedStorage<ZonedDateTime> {
}
@Override
public long getType() {
public int getType() {
return Type.DATE_TIME;
}
}

View File

@ -63,7 +63,7 @@ public class DoubleStorage extends NumericStorage {
/** @inheritDoc */
@Override
public long getType() {
public int getType() {
return Type.DOUBLE;
}

View File

@ -71,7 +71,7 @@ public class LongStorage extends NumericStorage {
/** @inheritDoc */
@Override
public long getType() {
public int getType() {
return Type.LONG;
}

View File

@ -26,7 +26,7 @@ public class ObjectStorage extends SpecializedStorage<Object> {
}
@Override
public long getType() {
public int getType() {
return Type.OBJECT;
}

View File

@ -15,7 +15,7 @@ public abstract class SpecializedStorage<T> extends Storage {
protected abstract T[] newUnderlyingArray(int size);
@Override
public abstract long getType();
public abstract int getType();
/**
* @param data the underlying data

View File

@ -24,7 +24,7 @@ public abstract class Storage {
public abstract int countMissing();
/** @return the type tag of this column's storage. Must be one of {@link Type} */
public abstract long getType();
public abstract int getType();
/**
* Checks whether the value at {@code idx} is missing.

View File

@ -29,7 +29,7 @@ public class StringStorage extends SpecializedStorage<String> {
}
@Override
public long getType() {
public int getType() {
return Type.STRING;
}

View File

@ -30,7 +30,7 @@ public class TimeOfDayStorage extends SpecializedStorage<LocalTime> {
}
@Override
public long getType() {
public int getType() {
return Type.TIME_OF_DAY;
}
}

View File

@ -102,11 +102,14 @@ public class MultiValueIndex {
private static Builder getBuilderForType(int type, int size) {
return switch (type) {
case Storage.Type.BOOL -> new BoolBuilder();
case Storage.Type.DOUBLE -> NumericBuilder.createDoubleBuilder(size);
case Storage.Type.LONG -> NumericBuilder.createLongBuilder(size);
case Storage.Type.STRING -> new StringBuilder(size);
case Storage.Type.OBJECT -> new ObjectBuilder(size);
case Storage.Type.LONG -> NumericBuilder.createLongBuilder(size);
case Storage.Type.DOUBLE -> NumericBuilder.createDoubleBuilder(size);
case Storage.Type.STRING -> new StringBuilder(size);
case Storage.Type.BOOL -> new BoolBuilder();
case Storage.Type.DATE -> new DateBuilder(size);
case Storage.Type.TIME_OF_DAY -> new TimeOfDayBuilder(size);
case Storage.Type.DATE_TIME -> new DateTimeBuilder(size);
default -> new InferredBuilder(size);
};
}

View File

@ -5,6 +5,8 @@ from Standard.Table import Column, Sort_Column, Sort_Column_Selector
from Standard.Table.Data.Table import Empty_Error
from Standard.Table.Errors as Table_Errors import Invalid_Output_Column_Names_Data, Duplicate_Output_Column_Names_Data, No_Input_Columns_Selected, Missing_Input_Columns_Data
import Standard.Table.Data.Storage
import Standard.Table.Data.Aggregate_Column
from Standard.Table.Data.Aggregate_Column import all hiding First, Last
import Standard.Visualization
@ -617,4 +619,31 @@ spec =
problems = [Duplicate_Output_Column_Names_Data ["A", "A", "A"]]
Problems.test_problem_handling action problems tester
Test.group "[In-Memory] Table.aggregate" <|
Test.specify "should return columns with correct types" <|
dates = ["dates", [Date.new 1999, Date.new 2000, Date.new 2000, Date.new 2000]]
texts = ["texts", ["a", "bb", "a", "bb"]]
mixed = ["mixed", [1, "a", "a", 1]]
ints = ["ints", [0, 1, 1, 0]]
floats = ["floats", [0.1, 1.0, 2.0, 1.5]]
objects = ["objects", [My_Data 0 1, My_Data 0 1, My_Data 2 2, My_Data 2 2]]
table = Table.new [dates, texts, mixed, ints, floats, objects]
t1 = table.aggregate [Group_By "dates", Shortest "texts", Aggregate_Column.First "texts", Aggregate_Column.First "objects", Aggregate_Column.First "ints", Aggregate_Column.Last "mixed"]
t1.info.at "Column" . to_vector . should_equal ["dates", "Shortest texts", "First texts", "First objects", "First ints", "Last mixed"]
t1.info.at "Storage Type" . to_vector . should_equal [Storage.Date, Storage.Text, Storage.Text, Storage.Any, Storage.Integer, Storage.Any]
t2 = table.aggregate [Mode "dates", Count_Distinct "objects", Count_Distinct "texts", Minimum "ints", Maximum "floats"]
t2.info.at "Column" . to_vector . should_equal ["Mode dates", "Count Distinct objects", "Count Distinct texts", "Minimum ints", "Maximum floats"]
t2.info.at "Storage Type" . to_vector . should_equal [Storage.Date, Storage.Integer, Storage.Integer, Storage.Integer, Storage.Decimal]
t3 = table.aggregate [Group_By "texts", Group_By "ints", Aggregate_Column.Last "floats"]
t3.info.at "Column" . to_vector . should_equal ["texts", "ints", "Last floats"]
t3.info.at "Storage Type" . to_vector . should_equal [Storage.Text, Storage.Integer, Storage.Decimal]
t4 = table.aggregate [Group_By "mixed", Sum "ints", Sum "floats"]
t4.info.at "Column" . to_vector . should_equal ["mixed", "Sum ints", "Sum floats"]
t4.info.at "Storage Type" . to_vector . should_equal [Storage.Any, Storage.Decimal, Storage.Decimal]
main = Test.Suite.run_main spec