Implement In-Memory Table order_by (#3515)

Implemented the `order_by` function with support for all modes of operation.
Added support for case insensitive natural order.

# Important Notes
- Improved MultiValueIndex/Key to not create loads of arrays.
- Adjusted HashCode for MultiValueKey to have a simple algorithm.
- Added Text_Utils.compare_normalized_ignoring_case to allow for case insensitive comparisons.
- Fixed issues with ObjectComparator and added some unit tests for it.
This commit is contained in:
James Dunkerley 2022-06-08 13:30:50 +01:00 committed by GitHub
parent c602404b1a
commit 8afba43add
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 392 additions and 163 deletions

View File

@ -134,6 +134,7 @@
- [Added rank data, correlation and covariance statistics for `Vector`][3484]
- [Implemented `Table.order_by` for the SQLite backend.][3502]
- [Implemented `Table.order_by` for the PostgreSQL backend.][3514]
- [Implemented `Table.order_by` for the in-memory table.][3515]
- [Renamed `File_Format.Text` to `Plain_Text`, updated `File_Format.Delimited`
API and added builders for customizing less common settings.][3516]
@ -212,6 +213,7 @@
[3484]: https://github.com/enso-org/enso/pull/3484
[3502]: https://github.com/enso-org/enso/pull/3502
[3514]: https://github.com/enso-org/enso/pull/3514
[3515]: https://github.com/enso-org/enso/pull/3515
[3516]: https://github.com/enso-org/enso/pull/3516
#### Enso Compiler

View File

@ -1,4 +1,6 @@
from Standard.Base import Any, Ordering, Nothing, Vector
from Standard.Base import all
import Standard.Base.Data.Ordering.Natural_Order
from Standard.Base.Data.Text.Text_Ordering as Text_Ordering_Module import Text_Ordering
polyglot java import org.enso.base.ObjectComparator
@ -9,10 +11,26 @@ polyglot java import org.enso.base.ObjectComparator
- custom_comparator:
If `Nothing` will get a singleton instance for `.compare_to`.
Otherwise can support a custom fallback comparator.
new : Nothing | (Any->Any->Ordering)
new : Nothing | (Any->Any->Ordering) -> ObjectComparator
new custom_comparator=Nothing =
comparator_to_java cmp x y = Vector.handle_incomparable_value (cmp x y . to_sign)
case custom_comparator of
Nothing -> ObjectComparator.getInstance (comparator_to_java .compare_to)
_ -> ObjectComparator.new (comparator_to_java custom_comparator)
## ADVANCED
Create a Java Comparator with the specified Text_Ordering
Arguments:
- text_ordering:
Specifies how to compare Text values within the Comparator.
for_text_ordering : Text_Ordering -> ObjectComparator
for_text_ordering text_ordering =
case text_ordering.sort_digits_as_numbers of
True ->
txt_cmp a b = Natural_Order.compare a b text_ordering.case_sensitive . to_sign
here.new.withCustomTextComparator txt_cmp
False -> case text_ordering.case_sensitive of
Case_Insensitive locale -> here.new.withCaseInsensitivity locale.java_locale
_ -> here.new

View File

@ -18,8 +18,12 @@ polyglot java import com.ibm.icu.text.BreakIterator
Sort a vector of texts according to the natural dictionary ordering.
["a2", "a1", "a100", "a001", "a0001"].sort by=Natural_Order.compare . should_equal ["a0001", "a001", "a1", "a2", "a100"]
compare : Text -> Text -> Ordering
compare text1 text2 =
compare : Text -> Text -> (True|Case_Insensitive) -> Ordering
compare text1 text2 case_sensitive=True =
compare_text = case case_sensitive of
Case_Insensitive locale -> a -> b -> a.compare_to_ignore_case b locale
_ -> _.compare_to _
iter1 = BreakIterator.getCharacterInstance
iter1.setText text1
@ -79,7 +83,7 @@ compare text1 text2 =
if (tmp.first.not && tmp.second) then Ordering.Greater else
case tmp.first.not of
True ->
text_comparison = substring1.compare_to substring2
text_comparison = compare_text substring1 substring2
if text_comparison != Ordering.Equal then text_comparison else
@Tail_Call order next1 iter1.next next2 iter2.next
False ->
@ -93,7 +97,7 @@ compare text1 text2 =
value_comparison = value1.compare_to value2
if value_comparison != Ordering.Equal then value_comparison else
text_comparison = num_text1.compare_to num_text2
text_comparison = compare_text num_text1 num_text2
if text_comparison != Ordering.Equal then text_comparison else
@Tail_Call order (parsed1.at 2) (parsed1.at 3) (parsed2.at 2) (parsed2.at 3)

View File

@ -614,6 +614,22 @@ Text.compare_to that =
if comparison_result < 0 then Ordering.Less else
Ordering.Greater
## Compare two texts to discover their ordering, ignoring case.
Arguments:
- that: The text to order `this` with respect to.
- locale: The locale used for case folding when comparing.
> Example
Checking how "a" orders in relation to "b".
"a".compare_to_ignore_case "b"
Text.compare_to_ignore_case : Text -> Locale -> Ordering
Text.compare_to_ignore_case that locale=Locale.default =
comparison_result = Text_Utils.compare_normalized_ignoring_case this that locale.java_locale
if comparison_result == 0 then Ordering.Equal else
if comparison_result < 0 then Ordering.Less else
Ordering.Greater
## ALIAS Check Emptiness
Check if `this` is empty.

View File

@ -156,7 +156,7 @@ make_first_aggregator reverse ignore_null args =
filter_clause = if ignore_null.not then Sql.code "" else
Sql.code " FILTER (WHERE " ++ result_expr.paren ++ Sql.code " IS NOT NULL)"
modified_order_exprs =
order_exprs.map expr-> expr ++ Sql.code " ASC NULLS LAST"
order_exprs.map expr-> expr ++ Sql.code " ASC NULLS FIRST"
order_clause =
Sql.code " ORDER BY " ++ Sql.join "," modified_order_exprs
index_expr = case reverse of

View File

@ -19,10 +19,14 @@ from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_For
from Standard.Base.Data.Text.Text_Ordering as Text_Ordering_Module import Text_Ordering
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Report_Warning
from Standard.Table.Error as Error_Module import Missing_Input_Columns, Column_Indexes_Out_Of_Range, Duplicate_Type_Selector
import Standard.Table.Data.Column_Mapping
import Standard.Table.Data.Position
import Standard.Table.Data.Sort_Column_Selector
import Standard.Table.Data.Sort_Column
import Standard.Table.Data.Aggregate_Column
import Standard.Base.Data.Ordering.Comparator
polyglot java import org.enso.table.data.table.Table as Java_Table
polyglot java import org.enso.table.data.table.Column as Java_Column
@ -524,7 +528,7 @@ type Table
on_problems.attach_problems_before validated.problems <|
java_key_columns = validated.key_columns.map .java_column
index = this.java_table.indexFromColumns java_key_columns.to_array
index = this.java_table.indexFromColumns java_key_columns.to_array Comparator.new
new_columns = validated.valid_columns.map c->(Aggregate_Column_Helper.java_aggregator c.first c.second)
@ -535,6 +539,50 @@ type Table
problems = java_table.getProblems
Aggregate_Column_Helper.parse_aggregated_problems problems
## Sorts the rows of the table according to the specified columns and order.
Arguments:
- columns: The columns and order to sort the table.
- text_ordering: The ordering method to use on text values.
- on_problems: Specifies how to handle if a problem occurs, raising as a
warning by default. The following problems can occur:
- If a column in `columns` is not present in the input table, a
`Missing_Input_Columns`.
- If duplicate columns, names or indices are provided, a
`Duplicate_Column_Selectors`.
- If a column index is out of range, a `Column_Indexes_Out_Of_Range`.
- If two distinct indices refer to the same column, an
`Input_Indices_Already_Matched`.
- If two name matchers match the same column, a
`Column_Matched_By_Multiple_Selectors`.
- If no valid columns are selected, a `No_Input_Columns_Selected`.
- If values do not implement an ordering, an
`Incomparable_Values_Error`.
> Example
Order the table by the column "alpha" in ascending order.
table.order_by (Sort_Column_Selector.By_Name [Sort_Column.Name "alpha"])
> Example
Order the table by the second column in ascending order. In case of any
ties, break them based on the 7th column from the end of the table in
descending order.
table.order_by (Sort_Column_Selector.By_Index [Sort_Column.Index 1, Sort_Column.Index -7 Sort_Direction.Descending])
order_by : Sort_Column_Selector -> Text_Ordering -> Problem_Behavior -> Table
order_by (columns = (Sort_Column_Selector.By_Name [(Sort_Column.Name (this.columns.at 0 . name))])) text_ordering=Text_Ordering on_problems=Report_Warning =
columns_for_ordering = Table_Helpers.prepare_order_by this.columns columns on_problems
selected_columns = columns_for_ordering.map c->c.column.java_column
ordering = columns_for_ordering.map c->
case c.associated_selector.direction of
Sort_Direction.Ascending -> 1
Sort_Direction.Descending -> -1
comparator = Comparator.for_text_ordering text_ordering
Table <|
this.java_table.orderBy selected_columns.to_array ordering.to_array comparator
## Parses columns within a Table to a specific value type.
By default, it looks at all `Text` columns and attempts to deduce the
type (columns with other types are not affected). If `column_types` are

View File

@ -120,7 +120,7 @@ resolve_aggregate table problem_builder aggregate_column =
resolve_selector_to_vector : Column_Selector -> [Column] ! Internal_Missing_Column_Error
resolve_selector_to_vector selector =
resolved = Table_Helpers.select_columns_helper table_columns selector reorder=False problem_builder
resolved = Table_Helpers.select_columns_helper table_columns selector reorder=True problem_builder
if resolved.is_empty then Error.throw Internal_Missing_Column_Error else resolved
resolve_selector_or_nothing selector = case selector of
@ -175,7 +175,7 @@ java_aggregator name column =
Count _ -> CountAggregator.new name
Count_Distinct columns _ ignore_nothing ->
resolved = columns.map .java_column
CountDistinctAggregator.new name resolved.to_array ignore_nothing
CountDistinctAggregator.new name resolved.to_array ignore_nothing Comparator.new
Count_Not_Nothing c _ -> CountNothingAggregator.new name c.java_column False
Count_Nothing c _ -> CountNothingAggregator.new name c.java_column True
Count_Not_Empty c _ -> CountEmptyAggregator.new name c.java_column False

View File

@ -4,27 +4,30 @@ import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;
import java.util.Comparator;
import java.util.Locale;
import java.util.function.BiFunction;
public class ObjectComparator implements Comparator<Object> {
private static ObjectComparator INSTANCE;
/**
* A singleton instance of an ObjectComparator
* A singleton instance of an ObjectComparator.
*
* @param fallbackComparator this MUST be the default .compare_to function for Enso. Needs to be
* passed to allow calling back from Java.
* @return Comparator object
* @return Comparator object.
*/
public static ObjectComparator getInstance(BiFunction<Object, Object, Long> fallbackComparator) {
if (INSTANCE == null) {
INSTANCE = new ObjectComparator((l, r) -> fallbackComparator.apply(l, r).intValue());
INSTANCE = new ObjectComparator(fallbackComparator);
}
return INSTANCE;
}
private final BiFunction<Object, Object, Integer> fallbackComparator;
private final BiFunction<Object, Object, Long> fallbackComparator;
private final BiFunction<String, String, Long> textComparator;
public ObjectComparator() {
this(
@ -33,8 +36,31 @@ public class ObjectComparator implements Comparator<Object> {
});
}
public ObjectComparator(BiFunction<Object, Object, Integer> fallbackComparator) {
public ObjectComparator(BiFunction<Object, Object, Long> fallbackComparator) {
this(fallbackComparator, (a, b) -> Long.valueOf(Text_Utils.compare_normalized(a, b)));
}
private ObjectComparator(BiFunction<Object, Object, Long> fallbackComparator, BiFunction<String, String, Long> textComparator) {
this.fallbackComparator = fallbackComparator;
this.textComparator = textComparator;
}
/**
* Create a copy of the ObjectComparator with case-insensitive text comparisons.
*
* <p>The fallback comparator of this instance is reused; only the comparison applied to a pair
* of String values is replaced.
*
* @param locale to use for case folding.
* @return Comparator object.
*/
public ObjectComparator withCaseInsensitivity(Locale locale) {
return new ObjectComparator(this.fallbackComparator, (a, b) -> Long.valueOf(Text_Utils.compare_normalized_ignoring_case(a, b, locale)));
}
/**
* Create a copy of the ObjectComparator that uses a custom comparator for text values.
*
* <p>The fallback comparator of this instance is reused; only the comparison applied to a pair
* of String values is replaced.
*
* @param textComparator custom comparator for Text.
* @return Comparator object.
*/
public ObjectComparator withCustomTextComparator(BiFunction<String, String, Long> textComparator) {
return new ObjectComparator(this.fallbackComparator, textComparator);
}
@Override
@ -42,18 +68,16 @@ public class ObjectComparator implements Comparator<Object> {
// NULLs
if (thisValue == null) {
if (thatValue != null) {
return 1;
return -1;
}
return 0;
}
if (thatValue == null) {
return -1;
return 1;
}
// Booleans
if (thisValue instanceof Boolean && thatValue instanceof Boolean) {
boolean thisBool = (Boolean) thisValue;
boolean thatBool = (Boolean) thatValue;
if (thisValue instanceof Boolean thisBool && thatValue instanceof Boolean thatBool) {
if (thisBool == thatBool) {
return 0;
}
@ -61,13 +85,11 @@ public class ObjectComparator implements Comparator<Object> {
}
// Long this
if (thisValue instanceof Long) {
Long thisLong = (Long) thisValue;
if (thatValue instanceof Long) {
return thisLong.compareTo((Long) thatValue);
if (thisValue instanceof Long thisLong) {
if (thatValue instanceof Long thatLong) {
return thisLong.compareTo(thatLong);
}
if (thatValue instanceof Double) {
Double thatDouble = (Double) thatValue;
if (thatValue instanceof Double thatDouble) {
if (thisLong > thatDouble) {
return 1;
}
@ -79,13 +101,11 @@ public class ObjectComparator implements Comparator<Object> {
}
// Double this
if (thisValue instanceof Double) {
Double thisDouble = (Double) thisValue;
if (thatValue instanceof Double) {
return thisDouble.compareTo((Double) thatValue);
if (thisValue instanceof Double thisDouble) {
if (thatValue instanceof Double thatDouble) {
return thisDouble.compareTo(thatDouble);
}
if (thatValue instanceof Long) {
Long thatLong = (Long) thatValue;
if (thatValue instanceof Long thatLong) {
if (thisDouble > thatLong) {
return 1;
}
@ -97,39 +117,36 @@ public class ObjectComparator implements Comparator<Object> {
}
// Text
if (thisValue instanceof String && thatValue instanceof String) {
return Text_Utils.compare_normalized((String) thisValue, (String) thatValue);
if (thisValue instanceof String thisString && thatValue instanceof String thatString) {
return textComparator.apply(thisString, thatString).intValue();
}
// DateTimes
if (thisValue instanceof LocalDate) {
LocalDate thisDate = (LocalDate) thisValue;
if (thatValue instanceof LocalDate) {
return thisDate.compareTo((LocalDate) thatValue);
if (thisValue instanceof LocalDate thisDate) {
if (thatValue instanceof LocalDate thatDate) {
return thisDate.compareTo(thatDate);
}
if (thatValue instanceof LocalDateTime) {
return thisDate.atStartOfDay().compareTo((LocalDateTime) thatValue);
if (thatValue instanceof LocalDateTime thatDateTime) {
return thisDate.atStartOfDay().compareTo(thatDateTime);
}
}
if (thisValue instanceof LocalDateTime) {
LocalDateTime thisDateTime = (LocalDateTime) thisValue;
if (thatValue instanceof LocalDate) {
return thisDateTime.compareTo(((LocalDate) thatValue).atStartOfDay());
if (thisValue instanceof LocalDateTime thisDateTime) {
if (thatValue instanceof LocalDate thatDate) {
return thisDateTime.compareTo(thatDate.atStartOfDay());
}
if (thatValue instanceof LocalDateTime) {
return thisDateTime.compareTo((LocalDateTime) thatValue);
if (thatValue instanceof LocalDateTime thatDateTime) {
return thisDateTime.compareTo(thatDateTime);
}
}
// TimeOfDay
if (thisValue instanceof LocalTime) {
LocalTime thisTime = (LocalTime) thisValue;
if (thatValue instanceof LocalTime) {
return thisTime.compareTo((LocalTime) thatValue);
if (thisValue instanceof LocalTime thisTime) {
if (thatValue instanceof LocalTime thatTime) {
return thisTime.compareTo(thatTime);
}
}
// Fallback to Enso
return fallbackComparator.apply(thisValue, thatValue);
return fallbackComparator.apply(thisValue, thatValue).intValue();
}
}

View File

@ -124,9 +124,8 @@ public class Text_Utils {
* @return the result of comparison
*/
public static boolean equals_ignore_case(String str1, Object str2, Locale locale) {
if (str2 instanceof String) {
Fold fold = CaseFoldedString.caseFoldAlgorithmForLocale(locale);
return compare_normalized(fold.apply(str1), fold.apply((String) str2)) == 0;
if (str2 instanceof String string2) {
return compare_normalized_ignoring_case(str1, string2, locale) == 0;
} else {
return false;
}
@ -165,6 +164,21 @@ public class Text_Utils {
return Normalizer.compare(a, b, Normalizer.FOLD_CASE_DEFAULT);
}
/**
* Compares {@code a} to {@code b} according to the lexicographical order, ignoring case and
* handling Unicode normalization.
*
* <p>Both operands are case-folded with the fold algorithm selected for {@code locale} before
* being compared.
*
* @param a the left operand
* @param b the right operand
* @param locale the locale to use for case folding
* @return a negative value if {@code a} is before {@code b}, 0 if both values are equal and a
* positive value if {@code a} is after {@code b}
*/
public static int compare_normalized_ignoring_case(String a, String b, Locale locale) {
Fold fold = CaseFoldedString.caseFoldAlgorithmForLocale(locale);
return Normalizer.compare(fold.apply(a), fold.apply(b), Normalizer.FOLD_CASE_DEFAULT);
}
/**
* Checks if {@code substring} is a substring of {@code string}.
*

View File

@ -5,10 +5,7 @@ import org.enso.table.data.index.MultiValueKey;
import org.enso.table.data.table.Column;
import org.enso.table.data.table.problems.FloatingPointGrouping;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;
/**
* Aggregate Column counting the number of distinct items in a group. If `ignoreAllNull` is true,
@ -16,6 +13,7 @@ import java.util.Set;
*/
public class CountDistinct extends Aggregator {
private final Storage[] storage;
private final Comparator<Object> objectComparator;
private final boolean ignoreAllNull;
/**
@ -25,18 +23,19 @@ public class CountDistinct extends Aggregator {
* @param columns input columns
* @param ignoreAllNull if true, ignore rows where all values are null
*/
public CountDistinct(String name, Column[] columns, boolean ignoreAllNull) {
public CountDistinct(
String name, Column[] columns, boolean ignoreAllNull, Comparator<Object> objectComparator) {
super(name, Storage.Type.LONG);
this.storage = Arrays.stream(columns).map(Column::getStorage).toArray(Storage[]::new);
this.ignoreAllNull = ignoreAllNull;
this.objectComparator = objectComparator;
}
@Override
public Object aggregate(List<Integer> indexes) {
Set<MultiValueKey> set = new HashSet<>();
for (int row : indexes) {
MultiValueKey key =
new MultiValueKey(Arrays.stream(storage).map(s -> s.getItemBoxed(row)).toArray());
MultiValueKey key = new MultiValueKey(storage, row, objectComparator);
if (key.hasFloatValues()) {
this.addProblem(new FloatingPointGrouping(this.getName(), row));
}

View File

@ -54,10 +54,7 @@ public class First extends Aggregator {
continue;
}
MultiValueKey newKey =
new MultiValueKey(
Arrays.stream(this.ordering).map(o -> o.getItemBoxed(row)).toArray(),
objectComparator);
MultiValueKey newKey = new MultiValueKey(this.ordering, row, objectComparator);
if (key == null || key.compareTo(newKey) > 0) {
key = newKey;
current = storage.getItemBoxed(row);

View File

@ -54,10 +54,7 @@ public class Last extends Aggregator {
continue;
}
MultiValueKey newKey =
new MultiValueKey(
Arrays.stream(this.ordering).map(o -> o.getItemBoxed(row)).toArray(),
objectComparator);
MultiValueKey newKey = new MultiValueKey(this.ordering, row, objectComparator);
if (key == null || key.compareTo(newKey) < 0) {
key = newKey;
current = storage.getItemBoxed(row);

View File

@ -9,11 +9,7 @@ import org.enso.table.data.table.Table;
import org.enso.table.data.table.problems.AggregatedProblems;
import org.enso.table.data.table.problems.FloatingPointGrouping;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@ -22,18 +18,20 @@ public class MultiValueIndex {
private final Map<MultiValueKey, List<Integer>> locs;
private final AggregatedProblems problems;
public MultiValueIndex(Column[] keyColumns, int tableSize) {
public MultiValueIndex(Column[] keyColumns, int tableSize, Comparator<Object> objectComparator) {
this(keyColumns, tableSize, null, objectComparator);
}
public MultiValueIndex(Column[] keyColumns, int tableSize, int[] ordering, Comparator<Object> objectComparator) {
this.keyColumnsLength = keyColumns.length;
this.locs = new HashMap<>();
this.locs = ordering == null ? new HashMap<>() : new TreeMap<>();
this.problems = new AggregatedProblems();
if (keyColumns.length != 0) {
int size = keyColumns[0].getSize();
Storage[] storage = Arrays.stream(keyColumns).map(Column::getStorage).toArray(Storage[]::new);
for (int i = 0; i < size; i++) {
int finalI = i;
MultiValueKey key =
new MultiValueKey(
Arrays.stream(keyColumns).map(c -> c.getStorage().getItemBoxed(finalI)).toArray());
MultiValueKey key = new MultiValueKey(storage, i, ordering, objectComparator);
if (key.hasFloatValues()) {
problems.add(new FloatingPointGrouping("GroupBy", i));
@ -43,9 +41,7 @@ public class MultiValueIndex {
ids.add(i);
}
} else {
this.locs.put(
new MultiValueKey(new Object[0]),
IntStream.range(0, tableSize).boxed().collect(Collectors.toList()));
this.locs.put(new MultiValueKey(new Storage[0], 0, objectComparator), IntStream.range(0, tableSize).boxed().collect(Collectors.toList()));
}
}
@ -87,19 +83,31 @@ public class MultiValueIndex {
merged);
}
private static Builder getBuilderForType(int type, int size) {
switch (type) {
case Storage.Type.BOOL:
return new BoolBuilder();
case Storage.Type.DOUBLE:
return NumericBuilder.createDoubleBuilder(size);
case Storage.Type.LONG:
return NumericBuilder.createLongBuilder(size);
case Storage.Type.STRING:
return new StringBuilder(size);
case Storage.Type.OBJECT:
return new ObjectBuilder(size);
public int[] makeOrderMap(int rowCount) {
if (this.locs.size() == 0) {
return new int[0];
}
return new InferredBuilder(size);
int[] output = new int[rowCount];
int idx = 0;
for (List<Integer> rowIndexes : this.locs.values()) {
for (Integer rowIndex : rowIndexes) {
output[idx++] = rowIndex;
}
}
return output;
}
private static Builder getBuilderForType(int type, int size) {
return switch (type) {
case Storage.Type.BOOL -> new BoolBuilder();
case Storage.Type.DOUBLE -> NumericBuilder.createDoubleBuilder(size);
case Storage.Type.LONG -> NumericBuilder.createLongBuilder(size);
case Storage.Type.STRING -> new StringBuilder(size);
case Storage.Type.OBJECT -> new ObjectBuilder(size);
default -> new InferredBuilder(size);
};
}
}

View File

@ -1,45 +1,63 @@
package org.enso.table.data.index;
import org.enso.table.data.column.storage.Storage;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Objects;
public class MultiValueKey implements Comparable<MultiValueKey> {
private final Object[] values;
private final Storage[] storage;
private final int[] directions;
private final int rowIndex;
private final Comparator<Object> objectComparator;
private final int hashCodeValue;
private final boolean allNull;
private final boolean floatValue;
public MultiValueKey(Object[] values) {
this(values, null);
public MultiValueKey(Storage[] storage, int rowIndex, Comparator<Object> objectComparator) {
this(storage, rowIndex, null, objectComparator);
}
public MultiValueKey(Object[] values, Comparator<Object> objectComparator) {
this.values = values;
public MultiValueKey(
Storage[] storage, int rowIndex, int[] directions, Comparator<Object> objectComparator) {
this.storage = storage;
this.rowIndex = rowIndex;
if (directions == null) {
directions = new int[storage.length];
Arrays.fill(directions, 1);
}
this.directions = directions;
this.objectComparator = objectComparator;
boolean allNull = true;
boolean floatValue = false;
// Precompute HashCode - simple polynomial hash (multiplier 31) over the folded values
int h = 0;
for (Object value : this.values) {
int h = 1;
for (int i = 0; i < storage.length; i++) {
h = 31 * h;
Object value = this.get(i);
if (value != null) {
Object folded = foldObject(value);
floatValue = floatValue || (folded instanceof Double);
h ^= folded.hashCode();
h += folded.hashCode();
allNull = false;
}
}
h += ~(h << 9);
h ^= h >>> 14;
h += h << 4;
this.hashCodeValue = h ^ (h >>> 10);
this.hashCodeValue = h;
this.allNull = allNull;
this.floatValue = floatValue;
}
public Object get(int column) {
return storage[column].getItemBoxed(rowIndex);
}
@Override
public int hashCode() {
return this.hashCodeValue;
@ -48,9 +66,16 @@ public class MultiValueKey implements Comparable<MultiValueKey> {
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
MultiValueKey that = (MultiValueKey) o;
return hashCodeValue == that.hashCodeValue && Arrays.equals(values, that.values);
if (!(o instanceof MultiValueKey that)) return false;
if (storage.length != that.storage.length) return false;
if (hashCodeValue != that.hashCodeValue) return false;
for (int i = 0; i < storage.length; i++) {
if (objectComparator.compare(get(i), that.get(i)) != 0) {
return false;
}
}
return true;
}
public boolean areAllNull() {
@ -87,14 +112,14 @@ public class MultiValueKey implements Comparable<MultiValueKey> {
throw new NullPointerException();
}
if (that.values.length != values.length) {
if (that.storage.length != storage.length) {
throw new ClassCastException("Incomparable keys.");
}
for (int i = 0; i < values.length; i++) {
int comparison = objectComparator.compare(values[i], that.values[i]);
for (int i = 0; i < storage.length; i++) {
int comparison = objectComparator.compare(get(i), that.get(i));
if (comparison != 0) {
return comparison;
return comparison * directions[i];
}
}

View File

@ -1,11 +1,6 @@
package org.enso.table.data.table;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.*;
import java.util.stream.Collectors;
import org.enso.table.data.column.builder.object.InferredBuilder;
import org.enso.table.data.column.storage.BoolStorage;
@ -115,11 +110,10 @@ public class Table {
* @return the result of masking this table with the provided column
*/
public Table mask(Column maskCol) {
if (!(maskCol.getStorage() instanceof BoolStorage)) {
if (!(maskCol.getStorage() instanceof BoolStorage storage)) {
throw new UnexpectedColumnTypeException("Boolean");
}
BoolStorage storage = (BoolStorage) maskCol.getStorage();
var mask = BoolStorage.toMask(storage);
var localStorageMask = new BitSet();
localStorageMask.set(0, rowCount());
@ -212,13 +206,28 @@ public class Table {
}
/**
* Creates an index fpr this table by using values from the specified columns.
* Creates an index for this table by using values from the specified columns.
*
* @param columns set of columns to use as an Index
* @param objectComparator Object comparator allowing calling back to `compare_to` when needed.
* @return a table indexed by the proper column
*/
public MultiValueIndex indexFromColumns(Column[] columns) {
return new MultiValueIndex(columns, this.rowCount());
public MultiValueIndex indexFromColumns(Column[] columns, Comparator<Object> objectComparator) {
return new MultiValueIndex(columns, this.rowCount(), objectComparator);
}
/**
* Creates a new table with the rows sorted by the values of the specified columns.
*
* @param columns the columns whose values form the sort key
* @param directions one entry per key column; the comparison result for that column is
* multiplied by its entry (the Enso caller passes 1 for ascending and -1 for descending)
* @param objectComparator Object comparator allowing calling back to `compare_to` when needed.
* @return the result of applying the computed order mask to this table
*/
public Table orderBy(Column[] columns, Long[] directions, Comparator<Object> objectComparator) {
// Narrow the boxed Long directions to the int[] expected by MultiValueIndex.
int[] directionInts = Arrays.stream(directions).mapToInt(Long::intValue).toArray();
MultiValueIndex index = new MultiValueIndex(columns, this.rowCount(), directionInts, objectComparator);
// The order map lists row indexes in the iteration order of the index's keys.
OrderMask mask = new OrderMask(index.makeOrderMap(this.rowCount()));
return this.applyMask(mask);
}
/**

View File

@ -350,11 +350,7 @@ public class Reader {
* @throws IOException when the input stream cannot be read.
*/
public static Table readSheetByName(
InputStream stream,
String sheetName,
int skip_rows,
Integer row_limit,
boolean xls_format)
InputStream stream, String sheetName, int skip_rows, Integer row_limit, boolean xls_format)
throws IOException, IllegalArgumentException {
Workbook workbook = getWorkbook(stream, xls_format);
@ -364,11 +360,7 @@ public class Reader {
}
return readSheetToTable(
workbook,
sheetIndex,
null,
skip_rows,
row_limit == null ? Integer.MAX_VALUE : row_limit);
workbook, sheetIndex, null, skip_rows, row_limit == null ? Integer.MAX_VALUE : row_limit);
}
/**
@ -394,11 +386,7 @@ public class Reader {
}
return readSheetToTable(
workbook,
index - 1,
null,
skip_rows,
row_limit == null ? Integer.MAX_VALUE : row_limit);
workbook, index - 1, null, skip_rows, row_limit == null ? Integer.MAX_VALUE : row_limit);
}
/**
@ -447,18 +435,13 @@ public class Reader {
return xls_format ? new HSSFWorkbook(stream) : new XSSFWorkbook(stream);
}
private static Table readRange(
Workbook workbook, Range range, int skip_rows, Integer row_limit) {
private static Table readRange(Workbook workbook, Range range, int skip_rows, Integer row_limit) {
int sheetIndex = getSheetIndex(workbook, range.getSheetName());
if (sheetIndex == -1) {
throw new IllegalArgumentException("Unknown sheet '" + range.getSheetName() + "'.");
}
return readSheetToTable(
workbook,
sheetIndex,
range,
skip_rows,
row_limit == null ? Integer.MAX_VALUE : row_limit);
workbook, sheetIndex, range, skip_rows, row_limit == null ? Integer.MAX_VALUE : row_limit);
}
}

View File

@ -147,7 +147,7 @@ aggregate_spec prefix table empty_table table_builder materialize is_database te
grouped.row_count . should_equal 1
materialized.columns.length . should_equal 2
materialized.columns.at 0 . name . should_equal "First Index"
materialized.columns.at 0 . at 0 . should_equal 9
materialized.columns.at 0 . at 0 . should_equal 5
materialized.columns.at 1 . name . should_equal "Last ValueWithNothing"
materialized.columns.at 1 . at 0 . should_equal -89.78 epsilon=0.000001
@ -517,17 +517,17 @@ aggregate_spec prefix table empty_table table_builder materialize is_database te
materialized.columns.at 6 . at idx . should_equal -18.802000 epsilon=0.000001
Test.specify "should be able to get first and last values" (pending = resolve_pending test_selection.first_last) <|
grouped = table.aggregate [Group_By "Index", First "TextWithNothing" (order_by = By_Name ["Hexadecimal", "Flag"]), Last "ValueWithNothing" (order_by = By_Name ["Value"])]
grouped = table.aggregate [Group_By "Index", First "TextWithNothing" (order_by = By_Name ["Value", "Flag"]), Last "ValueWithNothing" (order_by = By_Name ["Value"])]
materialized = materialize grouped
grouped.row_count . should_equal 10
materialized.columns.length . should_equal 3
materialized.columns.at 0 . name . should_equal "Index"
idx = find_row [6] materialized
idx = find_row [7] materialized
idx.is_nothing . should_be_false
materialized.columns.at 1 . name . should_equal "First TextWithNothing"
materialized.columns.at 1 . at idx . should_equal "v78nbv8fr1"
materialized.columns.at 1 . at idx . should_equal "8g6kidngic"
materialized.columns.at 2 . name . should_equal "Last ValueWithNothing"
materialized.columns.at 2 . at idx . should_equal 19.77 epsilon=0.000001
materialized.columns.at 2 . at idx . should_equal -89.78 epsilon=0.000001
Test.specify "should be able to get first and last values with default row order" (pending = resolve_pending test_selection.first_last_row_order) <|
grouped = table.aggregate [Group_By "Index", First "TextWithNothing", Last "Value"]
@ -701,18 +701,18 @@ aggregate_spec prefix table empty_table table_builder materialize is_database te
materialized.columns.at 7 . at idx . should_equal -17.174000 epsilon=0.000001
Test.specify "should be able to get first and last values" (pending = resolve_pending test_selection.first_last) <|
grouped = table.aggregate [Group_By "Flag", First "TextWithNothing" (order_by = By_Name ["Hexadecimal", "Flag"]), Last "ValueWithNothing" (order_by = By_Name ["Value"]), Group_By "Index"]
grouped = table.aggregate [Group_By "Flag", First "TextWithNothing" (order_by = By_Name ["Value", "Flag"]), Last "ValueWithNothing" (order_by = By_Name ["Value"]), Group_By "Index"]
materialized = materialize grouped
grouped.row_count . should_equal 20
materialized.columns.length . should_equal 4
materialized.columns.at 0 . name . should_equal "Flag"
materialized.columns.at 3 . name . should_equal "Index"
idx = find_row [False, 6] materialized [0, 3]
idx = find_row [False, 7] materialized [0, 3]
idx.is_nothing . should_be_false
materialized.columns.at 1 . name . should_equal "First TextWithNothing"
materialized.columns.at 1 . at idx . should_equal "v78nbv8fr1"
materialized.columns.at 1 . at idx . should_equal "8g6kidngic"
materialized.columns.at 2 . name . should_equal "Last ValueWithNothing"
materialized.columns.at 2 . at idx . should_equal 42.17 epsilon=0.000001
materialized.columns.at 2 . at idx . should_equal -89.78 epsilon=0.000001
Test.specify "should be able to get first and last values with default row order" (pending = resolve_pending test_selection.first_last_row_order) <|
grouped = table.aggregate [Group_By "Flag", First "TextWithNothing", Last "Value", Group_By "Index"]

View File

@ -122,6 +122,13 @@ spec prefix table_builder test_selection pending=Nothing =
problems = [Duplicate_Column_Selectors ["foo"]]
Problems.test_problem_handling action problems tester
Test.specify "should correctly handle problems: duplicate matches due to case insensitivity" pending="TODO needs fixing" <|
selector = By_Name.new ["FOO", "foo"] (Text_Matcher case_sensitive=Case_Insensitive)
action = table.select_columns selector on_problems=_
tester = expect_column_names ["foo"]
problems = [Duplicate_Column_Selectors ["foo"]]
Problems.test_problem_handling action problems tester
Test.specify "should correctly handle problems: unmatched names" <|
weird_name = '.*?-!@#!"'
selector = By_Name ["foo", "hmm", weird_name]
@ -238,6 +245,13 @@ spec prefix table_builder test_selection pending=Nothing =
problems = [Duplicate_Column_Selectors ["foo"]]
Problems.test_problem_handling action problems tester
Test.specify "should correctly handle problems: duplicate matches due to case insensitivity" pending="TODO needs fixing" <|
selector = By_Name.new ["FOO", "foo"] (Text_Matcher case_sensitive=Case_Insensitive)
action = table.remove_columns selector on_problems=_
tester = expect_column_names ["bar", "Baz", "foo_1", "foo_2", "ab.+123", "abcd123"]
problems = [Duplicate_Column_Selectors ["foo"]]
Problems.test_problem_handling action problems tester
Test.specify "should correctly handle problems: unmatched names" <|
weird_name = '.*?-!@#!"'
selector = By_Name ["foo", "hmm", weird_name]
@ -723,16 +737,16 @@ spec prefix table_builder test_selection pending=Nothing =
Test.specify "should support natural and case insensitive ordering at the same time" pending=(if (test_selection.natural_ordering.not || test_selection.case_insensitive_ordering.not) then "Natural ordering or case sensitive ordering is not supported.") <|
t1 = table.order_by (Sort_Column_Selector.By_Name [Sort_Column.Name "psi"]) text_ordering=(Text_Ordering sort_digits_as_numbers=True case_sensitive=Case_Insensitive)
t1.at "psi" . to_vector . should_equal ["c01", "C2", "c10", Nothing]
t1.at "psi" . to_vector . should_equal [Nothing, "c01", "C2", "c10"]
t2 = table.order_by (Sort_Column_Selector.By_Name [Sort_Column.Name "psi"]) text_ordering=(Text_Ordering sort_digits_as_numbers=True)
t2.at "psi" . to_vector . should_equal ["C2", "c01", "c10", Nothing]
t2.at "psi" . to_vector . should_equal [Nothing, "C2", "c01", "c10"]
t3 = table.order_by (Sort_Column_Selector.By_Name [Sort_Column.Name "psi"]) text_ordering=(Text_Ordering case_sensitive=Case_Insensitive)
t3.at "psi" . to_vector . should_equal ["c01", "c10", "C2", Nothing]
t3.at "psi" . to_vector . should_equal [Nothing, "c01", "c10", "C2"]
t4 = table.order_by (Sort_Column_Selector.By_Name [Sort_Column.Name "psi"])
t4.at "psi" . to_vector . should_equal ["C2", "c01", "c10", Nothing]
t4.at "psi" . to_vector . should_equal [Nothing, "C2", "c01", "c10"]
Test.specify "text ordering settings should not affect numeric columns" <|
ordering = Text_Ordering sort_digits_as_numbers=True case_sensitive=Case_Insensitive

View File

@ -635,7 +635,7 @@ spec =
t_3 = Table.new [c_3_1, c_3_2, c_3_3]
t_3.default_visualization.should_equal Visualization.Id.table
selection = Common_Table_Spec.Test_Selection supports_case_sensitive_columns=True order_by=False natural_ordering=True case_insensitive_ordering=True order_by_unicode_normalization_by_default=True
selection = Common_Table_Spec.Test_Selection supports_case_sensitive_columns=True order_by=True natural_ordering=True case_insensitive_ordering=True order_by_unicode_normalization_by_default=True
Common_Table_Spec.spec "[In-Memory] " table_builder=Table.new test_selection=selection
Test.group "Use First Row As Names" <|

View File

@ -0,0 +1,63 @@
from Standard.Base import all
import Standard.Base.Data.Ordering.Comparator
from Standard.Base.Data.Text.Text_Ordering as Text_Ordering_Module import Text_Ordering
import Standard.Test
polyglot java import java.lang.ClassCastException
# === Test Resources ===
# Atom wrapping a single `number` field, used as a value with a user-defined
# `compare_to`.
type Ord number

# Compares the *other* value's number against this one, i.e. the result is the
# reverse of comparing `this.number` with `that.number`. The "can compare
# custom types" spec expects `Ord 1` to come out as less than `Ord 0`.
Ord.compare_to : Ord -> Ordering
Ord.compare_to that =
    other_number = that.number
    other_number.compare_to this.number
# Atom wrapping a single `number` field. No `compare_to` is defined for it in
# this file; the "should fail gracefully for incomparable items" spec expects
# comparing two of these to fail with `Vector.Incomparable_Values_Error`.
type No_Ord number
# Tests
# Specs for the comparators obtained from `Comparator.new` and
# `Comparator.for_text_ordering`.
spec = Test.group "Object Comparator" <|
    # A Java `ClassCastException` panic raised while comparing is caught and
    # replaced by an `Incomparable_Values_Error` dataflow error.
    as_incomparable = Panic.catch ClassCastException handler=(Error.throw Vector.Incomparable_Values_Error)
    # Compares two values with the comparator returned by `Comparator.new`.
    compare_default x y = as_incomparable <| Comparator.new.compare x y
    # Compares two values with a comparator built for a case-insensitive
    # `Text_Ordering`.
    compare_ignoring_case x y = as_incomparable <| Comparator.for_text_ordering (Text_Ordering False Case_Insensitive) . compare x y

    Test.specify "can compare numbers" <|
        (compare_default 1 2 < 0).should_equal True
        (compare_default 1 1.2 < 0).should_equal True
        (compare_default 1 1 == 0).should_equal True

    Test.specify "can compare booleans" <|
        (compare_default True False > 0).should_equal True
        (compare_default True True == 0).should_equal True
        (compare_default False False == 0).should_equal True

    Test.specify "can compare Nothing and it ends up as lowest value" <|
        (compare_default 1 Nothing > 0).should_equal True
        (compare_default Nothing 1.235 < 0).should_equal True
        (compare_default True Nothing > 0).should_equal True
        (compare_default Nothing False < 0).should_equal True
        (compare_default "A" Nothing > 0).should_equal True
        (compare_default Nothing "ZSA" < 0).should_equal True
        (compare_default Nothing Nothing == 0).should_equal True

    Test.specify "can compare Text with Enso standard defaults" <|
        (compare_default "A" "a" < 0).should_equal True
        (compare_default "ABBA" "ABBA" == 0).should_equal True
        # Precomposed vs. decomposed spelling of the same accented letter.
        (compare_default '\u00E9' '\u0065\u{301}' == 0).should_equal True

    Test.specify "can compare Text with case-insensitive comparisons" <|
        (compare_ignoring_case "A" "a" == 0).should_equal True
        (compare_ignoring_case "ABBA" "abba" == 0).should_equal True
        (compare_ignoring_case '\u00E9' '\u0065\u{301}' == 0).should_equal True

    Test.specify "can compare custom types" <|
        # `Ord.compare_to` compares in reverse, so `Ord 1` is less than `Ord 0`.
        (compare_default (Ord 1) (Ord 0) < 0).should_equal True
        (compare_default (Ord 1) (Ord 1) == 0).should_equal True

    Test.specify "should fail gracefully for incomparable items" <|
        (compare_default 1 True).should_fail_with Vector.Incomparable_Values_Error
        (compare_default (No_Ord 1) (No_Ord 2)).should_fail_with Vector.Incomparable_Values_Error
# Entry point: passes this module's `spec` to `Test.Suite.run_main`.
main = Test.Suite.run_main here.spec

View File

@ -5,9 +5,13 @@ import Standard.Base.Data.Ordering.Natural_Order
import Standard.Test
spec = Test.group "Natural Order" <|
case_insensitive_compare a b = Natural_Order.compare a b Case_Insensitive
Test.specify "should behave as shown in examples" <|
Natural_Order.compare "a2" "a100" . should_equal Ordering.Less
["a2", "a1", "a100", "a001", "a0001"].sort by=Natural_Order.compare . should_equal ["a0001", "a001", "a1", "a2", "a100"]
["A2", "a1", "A100", "A001", "a0001"].sort by=Natural_Order.compare . should_equal ["A001", "A2", "A100", "a0001", "a1"]
["A2", "a1", "A100", "A001", "a0001"].sort by=case_insensitive_compare . should_equal ["a0001", "A001", "a1", "A2", "A100"]
Test.specify "should correctly compare values" <|
Natural_Order.compare "a1" "a2" . should_equal Ordering.Less

View File

@ -247,5 +247,4 @@ spec =
Statistics.covariance_matrix series . should_fail_with Illegal_Argument_Error
Statistics.pearson_correlation series . should_fail_with Illegal_Argument_Error
# Entry point: passes this module's `spec` to `Test.Suite.run_main`.
main = Test.Suite.run_main here.spec

View File

@ -93,8 +93,14 @@ spec =
'a\u0321\u0302'=='a\u0302\u0321' . should_be_true
'a\u0321\u0302'=='A\u0302\u0321' . should_be_false
accent_1+"a" . compare_to accent_2+"a" . should_equal Ordering.Equal
accent_1+"A" . compare_to accent_2+"a" . should_equal Ordering.Less
accent_1+"A" . compare_to_ignore_case accent_2+"a" . should_equal Ordering.Equal
accent_1+"a" . compare_to accent_2+"b" . should_equal Ordering.Less
accent_1+"a" . compare_to_ignore_case accent_2+"B" . should_equal Ordering.Less
accent_2+"a" . compare_to accent_1+"b" . should_equal Ordering.Less
accent_1+"a" . compare_to accent_2+"B" . should_equal Ordering.Greater
accent_1+"a" . compare_to_ignore_case accent_2+"B" . should_equal Ordering.Less
accent_1+"b" . compare_to accent_2+"a" . should_equal Ordering.Greater
accent_2+"b" . compare_to accent_1+"a" . should_equal Ordering.Greater

View File

@ -30,6 +30,9 @@ import project.Data.Noise.Generator_Spec as Noise_Generator_Spec
import project.Data.Noise_Spec
import project.Data.Numbers_Spec
import project.Data.Ordering_Spec
import project.Data.Ordering.Comparator_Spec
import project.Data.Ordering.Natural_Order_Spec
import project.Data.Ordering.Vector_Lexicographic_Order_Spec
import project.Data.Range_Spec
import project.Data.Ref_Spec
import project.Data.Text_Spec
@ -88,6 +91,9 @@ main = Test.Suite.run_main <|
Noise_Spec.spec
Numbers_Spec.spec
Ordering_Spec.spec
Comparator_Spec.spec
Natural_Order_Spec.spec
Vector_Lexicographic_Order_Spec.spec
Process_Spec.spec
Python_Interop_Spec.spec
R_Interop_Spec.spec