mirror of
https://github.com/enso-org/enso.git
synced 2025-01-09 03:37:19 +03:00
Improve performance of Join_Condition.Between
by sorting on one dimension (#8212)
- Closes #5303 - Refactors `JoinStrategy`, allowing us to 'stack' join strategies on top of each other (to some extent) - currently a `HashJoin` can be followed by another join strategy (currently `SortJoin`) - Adds benchmarks for join - Due to limitations of the sorting approach, this will still not be as fast as possible for cases where there is more than 1 `Between` condition in a single query - the benchmarks try to demonstrate that. - We can replace sorting by d-dimensional [RangeTrees](https://en.wikipedia.org/wiki/Range_tree) to get `O((n + m) log^d n + k)` performance (where `n` and `m` are the sizes of the joined tables, `d` is the number of `Between` conditions used in the query and `k` is the result set size). - Follow-up ticket for consideration later: #8216 - Closes #8215 - After all, it turned out that `TreeSet` was problematic (because of not enough flexibility with duplicate key handling), so the simplest solution was to immediately implement this sub-task. - Closes #8204 - Unrelated, but I ran into this here: adds type checks to other arguments of `set`. - Before, putting in a Column as `new_name` (i.e. mistakenly messing up the order of arguments) led to a hard-to-understand `Method `if_then_else` of type Column could not be found.`; now it instead fails with the type error `expected Text got Column`.
This commit is contained in:
parent
1388fe1cf9
commit
1b8b30a68d
@ -837,7 +837,7 @@ type Table
|
||||
table.set "2 * [total_stock]" new_name="total_stock_expr"
|
||||
@new_name Widget_Helpers.make_column_name_selector
|
||||
set : Column | Text | Array | Vector | Range | Date_Range | Constant_Column | Column_Operation -> Text -> Set_Mode -> Problem_Behavior -> Table ! Existing_Column | Missing_Column | No_Such_Column | Expression_Error
|
||||
set self column new_name="" set_mode=Set_Mode.Add_Or_Update on_problems=Report_Warning =
|
||||
set self column (new_name : Text = "") (set_mode : Set_Mode = Set_Mode.Add_Or_Update) (on_problems : Problem_Behavior = Report_Warning) =
|
||||
problem_builder = Problem_Builder.new
|
||||
unique = self.column_naming_helper.create_unique_name_strategy
|
||||
unique.mark_used self.column_names
|
||||
|
@ -68,10 +68,10 @@ polyglot java import org.enso.base.ObjectComparator
|
||||
polyglot java import org.enso.table.data.index.MultiValueIndex
|
||||
polyglot java import org.enso.table.data.mask.OrderMask
|
||||
polyglot java import org.enso.table.data.table.Column as Java_Column
|
||||
polyglot java import org.enso.table.data.table.join.Between as Java_Join_Between
|
||||
polyglot java import org.enso.table.data.table.join.Equals as Java_Join_Equals
|
||||
polyglot java import org.enso.table.data.table.join.EqualsIgnoreCase as Java_Join_Equals_Ignore_Case
|
||||
polyglot java import org.enso.table.data.table.join.LookupJoin
|
||||
polyglot java import org.enso.table.data.table.join.conditions.Between as Java_Join_Between
|
||||
polyglot java import org.enso.table.data.table.join.conditions.Equals as Java_Join_Equals
|
||||
polyglot java import org.enso.table.data.table.join.conditions.EqualsIgnoreCase as Java_Join_Equals_Ignore_Case
|
||||
polyglot java import org.enso.table.data.table.join.lookup.LookupJoin
|
||||
polyglot java import org.enso.table.data.table.Table as Java_Table
|
||||
polyglot java import org.enso.table.error.TooManyColumnsException
|
||||
polyglot java import org.enso.table.error.NullValuesInKeyColumns
|
||||
@ -1570,7 +1570,7 @@ type Table
|
||||
table.set "2 * [total_stock]" new_name="total_stock_expr"
|
||||
@column Column_Operation.default_widget
|
||||
set : Text | Column -> Text -> Set_Mode -> Problem_Behavior -> Table ! Existing_Column | Missing_Column | No_Such_Column | Expression_Error
|
||||
set self column:(Text | Column | Constant_Column | Column_Operation) new_name="" set_mode=Set_Mode.Add_Or_Update on_problems=Report_Warning =
|
||||
set self column:(Text | Column | Constant_Column | Column_Operation) (new_name : Text = "") (set_mode : Set_Mode = Set_Mode.Add_Or_Update) (on_problems : Problem_Behavior = Report_Warning) =
|
||||
problem_builder = Problem_Builder.new
|
||||
unique = self.column_naming_helper.create_unique_name_strategy
|
||||
unique.mark_used self.column_names
|
||||
|
@ -6,7 +6,7 @@ import project.Data.Type.Value_Type.Value_Type
|
||||
import project.Data.Type.Value_Type_Helpers
|
||||
from project.Errors import Missing_Input_Columns, Unexpected_Extra_Columns, Floating_Point_Equality, No_Common_Type, No_Output_Columns
|
||||
|
||||
polyglot java import org.enso.table.data.table.join.LookupColumnDescription
|
||||
polyglot java import org.enso.table.data.table.join.lookup.LookupColumnDescription
|
||||
|
||||
## PRIVATE
|
||||
type Lookup_Column
|
||||
|
@ -40,6 +40,10 @@ public abstract class MultiValueKeyBase {
|
||||
return rowIndex;
|
||||
}
|
||||
|
||||
/**
 * Returns the number of columns that make up this key, i.e. the length of the backing
 * {@code storages} array.
 */
public int getNumberOfColumns() {
|
||||
return storages.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public abstract boolean equals(Object o);
|
||||
|
||||
|
@ -78,4 +78,24 @@ public class OrderedMultiValueKey extends MultiValueKeyBase
|
||||
public String toString() {
|
||||
return "OrderedMultiValueKey{row="+rowIndex+"}";
|
||||
}
|
||||
|
||||
/**
|
||||
* A comparator that uses only one dimension of the key.
|
||||
*/
|
||||
public static class ProjectionComparator implements Comparator<OrderedMultiValueKey> {
|
||||
private final int ix;
|
||||
|
||||
public ProjectionComparator(int ix) {
|
||||
this.ix = ix;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compare(OrderedMultiValueKey o1, OrderedMultiValueKey o2) {
|
||||
if (o1.storages.length != o2.storages.length) {
|
||||
throw new ClassCastException("Incomparable keys.");
|
||||
}
|
||||
|
||||
return o1.objectComparator.compare(o1.get(ix), o2.get(ix));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -18,9 +18,9 @@ import org.enso.table.data.index.OrderedMultiValueKey;
|
||||
import org.enso.table.data.mask.OrderMask;
|
||||
import org.enso.table.data.mask.SliceRange;
|
||||
import org.enso.table.data.table.join.CrossJoin;
|
||||
import org.enso.table.data.table.join.IndexJoin;
|
||||
import org.enso.table.data.table.join.JoinCondition;
|
||||
import org.enso.table.data.table.join.conditions.JoinCondition;
|
||||
import org.enso.table.data.table.join.JoinResult;
|
||||
import org.enso.table.data.table.join.JoinStrategy;
|
||||
import org.enso.table.error.UnexpectedColumnTypeException;
|
||||
import org.enso.table.operations.Distinct;
|
||||
import org.enso.table.problems.ProblemAggregator;
|
||||
@ -279,8 +279,8 @@ public class Table {
|
||||
"be true.");
|
||||
}
|
||||
|
||||
var strategy = new IndexJoin();
|
||||
JoinResult joinResult = strategy.join(this, right, conditions, problemAggregator);
|
||||
JoinStrategy strategy = JoinStrategy.createStrategy(conditions);
|
||||
JoinResult joinResult = strategy.join(problemAggregator);
|
||||
|
||||
List<JoinResult> resultsToKeep = new ArrayList<>();
|
||||
|
||||
|
@ -1,5 +0,0 @@
|
||||
package org.enso.table.data.table.join;
|
||||
|
||||
import org.enso.table.data.table.Column;
|
||||
|
||||
/**
 * A join condition that pairs a {@code left} column with a {@code right} column.
 *
 * <p>NOTE(review): presumably rows match when the two column values are equal - confirm against
 * the code that interprets this condition.
 */
public record Equals(Column left, Column right) implements JoinCondition {}
|
@ -1,112 +0,0 @@
|
||||
package org.enso.table.data.table.join;
|
||||
|
||||
import org.enso.base.text.TextFoldingStrategy;
|
||||
import org.enso.table.data.column.storage.Storage;
|
||||
import org.enso.table.data.column.storage.type.AnyObjectType;
|
||||
import org.enso.table.data.index.MultiValueIndex;
|
||||
import org.enso.table.data.table.Column;
|
||||
import org.enso.table.data.table.Table;
|
||||
import org.enso.table.data.table.join.scan.Matcher;
|
||||
import org.enso.table.data.table.join.scan.MatcherFactory;
|
||||
import org.enso.table.problems.ColumnAggregatedProblemAggregator;
|
||||
import org.enso.table.problems.ProblemAggregator;
|
||||
import org.graalvm.polyglot.Context;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class IndexJoin implements JoinStrategy {
|
||||
private record HashEqualityCondition(
|
||||
Column left, Column right, TextFoldingStrategy textFoldingStrategy) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public JoinResult join(Table left, Table right, List<JoinCondition> conditions, ProblemAggregator problemAggregator) {
|
||||
Context context = Context.getCurrent();
|
||||
List<HashEqualityCondition> equalConditions =
|
||||
conditions.stream()
|
||||
.filter(IndexJoin::isSupported)
|
||||
.map(IndexJoin::makeHashEqualityCondition)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
var remainingConditions =
|
||||
conditions.stream().filter(c -> !isSupported(c)).collect(Collectors.toList());
|
||||
|
||||
var leftEquals =
|
||||
equalConditions.stream().map(HashEqualityCondition::left).toArray(Column[]::new);
|
||||
var rightEquals =
|
||||
equalConditions.stream().map(HashEqualityCondition::right).toArray(Column[]::new);
|
||||
var textFoldingStrategies =
|
||||
equalConditions.stream()
|
||||
.map(HashEqualityCondition::textFoldingStrategy)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
var leftIndex =
|
||||
MultiValueIndex.makeUnorderedIndex(leftEquals, left.rowCount(), textFoldingStrategies, problemAggregator);
|
||||
var rightIndex =
|
||||
MultiValueIndex.makeUnorderedIndex(rightEquals, right.rowCount(), textFoldingStrategies, problemAggregator);
|
||||
|
||||
MatcherFactory factory = new MatcherFactory();
|
||||
Matcher remainingMatcher = factory.create(
|
||||
remainingConditions, new ColumnAggregatedProblemAggregator(problemAggregator)
|
||||
);
|
||||
|
||||
JoinResult.Builder resultBuilder = new JoinResult.Builder();
|
||||
for (var leftKey : leftIndex.keys()) {
|
||||
if (rightIndex.contains(leftKey)) {
|
||||
for (var leftRow : leftIndex.get(leftKey)) {
|
||||
for (var rightRow : rightIndex.get(leftKey)) {
|
||||
if (remainingMatcher.matches(leftRow, rightRow)) {
|
||||
resultBuilder.addRow(leftRow, rightRow);
|
||||
}
|
||||
|
||||
context.safepoint();
|
||||
}
|
||||
|
||||
context.safepoint();
|
||||
}
|
||||
}
|
||||
|
||||
context.safepoint();
|
||||
}
|
||||
|
||||
return resultBuilder.build();
|
||||
}
|
||||
|
||||
private static boolean isSupported(JoinCondition condition) {
|
||||
switch (condition) {
|
||||
case Equals eq -> {
|
||||
return isBuiltinType(eq.left().getStorage()) && isBuiltinType(eq.right().getStorage());
|
||||
}
|
||||
case EqualsIgnoreCase ignored -> {
|
||||
return true;
|
||||
}
|
||||
default -> {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static HashEqualityCondition makeHashEqualityCondition(JoinCondition eq) {
|
||||
switch (eq) {
|
||||
case Equals e -> {
|
||||
return new HashEqualityCondition(
|
||||
e.left(), e.right(), TextFoldingStrategy.unicodeNormalizedFold);
|
||||
}
|
||||
case EqualsIgnoreCase e -> {
|
||||
return new HashEqualityCondition(
|
||||
e.left(), e.right(), TextFoldingStrategy.caseInsensitiveFold(e.locale()));
|
||||
}
|
||||
default -> throw new IllegalStateException(
|
||||
"Impossible: trying to convert condition "
|
||||
+ eq
|
||||
+ " to a HashEqualityCondition, but it should not be marked as supported. This is a"
|
||||
+ " bug in the Table library.");
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean isBuiltinType(Storage<?> storage) {
|
||||
// TODO: this should be removed when #5626 and #5259 are implemented
|
||||
return !storage.getType().equals(AnyObjectType.INSTANCE);
|
||||
}
|
||||
}
|
@ -1,3 +0,0 @@
|
||||
package org.enso.table.data.table.join;
|
||||
|
||||
/** Marker interface for join conditions; it declares no members of its own. */
public interface JoinCondition {}
|
@ -1,10 +1,75 @@
|
||||
package org.enso.table.data.table.join;
|
||||
|
||||
import java.util.List;
|
||||
import org.enso.table.data.table.Table;
|
||||
import org.enso.table.data.table.join.between.SortJoin;
|
||||
import org.enso.table.data.table.join.conditions.Between;
|
||||
import org.enso.table.data.table.join.conditions.Equals;
|
||||
import org.enso.table.data.table.join.conditions.EqualsIgnoreCase;
|
||||
import org.enso.table.data.table.join.conditions.HashableCondition;
|
||||
import org.enso.table.data.table.join.conditions.JoinCondition;
|
||||
import org.enso.table.data.table.join.hashing.HashJoin;
|
||||
import org.enso.table.problems.ProblemAggregator;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A strategy used for performing a join of two tables.
|
||||
*/
|
||||
public interface JoinStrategy {
|
||||
JoinResult join(
|
||||
Table left, Table right, List<JoinCondition> conditions, ProblemAggregator problemAggregator);
|
||||
JoinResult join(ProblemAggregator problemAggregator);
|
||||
|
||||
static JoinStrategy createStrategy(List<JoinCondition> conditions) {
|
||||
if (conditions.isEmpty()) {
|
||||
throw new IllegalArgumentException("At least one join condition must be provided.");
|
||||
}
|
||||
|
||||
List<HashableCondition> hashableConditions = conditions.stream()
|
||||
.filter(c -> c instanceof HashableCondition)
|
||||
.map(c -> (HashableCondition) c)
|
||||
.toList();
|
||||
List<Between> betweenConditions = conditions.stream()
|
||||
.filter(c -> c instanceof Between)
|
||||
.map(c -> (Between) c)
|
||||
.toList();
|
||||
|
||||
if (hashableConditions.size() + betweenConditions.size() != conditions.size()) {
|
||||
throw new IllegalArgumentException("Unsupported join condition.");
|
||||
}
|
||||
|
||||
if (hashableConditions.isEmpty()) {
|
||||
assert !betweenConditions.isEmpty();
|
||||
return new SortJoin(betweenConditions);
|
||||
} else if (betweenConditions.isEmpty()) {
|
||||
return new HashJoin(hashableConditions, new MatchAllStrategy());
|
||||
} else {
|
||||
return new HashJoin(hashableConditions, new SortJoin(betweenConditions));
|
||||
}
|
||||
}
|
||||
|
||||
class ConditionsHelper {
|
||||
private final List<? extends JoinCondition> conditions;
|
||||
|
||||
public ConditionsHelper(List<? extends JoinCondition> conditions) {
|
||||
if (conditions.isEmpty()) {
|
||||
throw new IllegalArgumentException("At least one join condition must be provided.");
|
||||
}
|
||||
|
||||
this.conditions = conditions;
|
||||
}
|
||||
|
||||
public int getLeftTableRowCount() {
|
||||
return switch (conditions.get(0)) {
|
||||
case Equals equals -> equals.left().getStorage().size();
|
||||
case EqualsIgnoreCase equalsIgnoreCase -> equalsIgnoreCase.left().getStorage().size();
|
||||
case Between between -> between.left().getStorage().size();
|
||||
};
|
||||
}
|
||||
|
||||
public int getRightTableRowCount() {
|
||||
return switch (conditions.get(0)) {
|
||||
case Equals equals -> equals.right().getStorage().size();
|
||||
case EqualsIgnoreCase equalsIgnoreCase -> equalsIgnoreCase.right().getStorage().size();
|
||||
case Between between -> between.rightLower().getStorage().size();
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,28 @@
|
||||
package org.enso.table.data.table.join;
|
||||
|
||||
import java.util.List;
|
||||
import org.enso.table.problems.ProblemAggregator;
|
||||
import org.graalvm.polyglot.Context;
|
||||
|
||||
/**
|
||||
* A pluggable strategy that can be used as the inner strategy for a join if there are no more join
|
||||
* conditions to process - so all rows are matched with each other within a given group.
|
||||
*/
|
||||
public class MatchAllStrategy implements PluggableJoinStrategy {
|
||||
@Override
|
||||
public void joinSubsets(
|
||||
List<Integer> leftGroup,
|
||||
List<Integer> rightGroup,
|
||||
JoinResult.Builder resultBuilder,
|
||||
ProblemAggregator problemAggregator) {
|
||||
Context context = Context.getCurrent();
|
||||
for (var leftRow : leftGroup) {
|
||||
for (var rightRow : rightGroup) {
|
||||
resultBuilder.addRow(leftRow, rightRow);
|
||||
context.safepoint();
|
||||
}
|
||||
|
||||
context.safepoint();
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,18 @@
|
||||
package org.enso.table.data.table.join;
|
||||
|
||||
import java.util.List;
|
||||
import org.enso.table.problems.ProblemAggregator;
|
||||
|
||||
/**
|
||||
* A helper join strategy that can be used within another join strategy to perform a join of
|
||||
* sub-sets of indices, stemming from already joining on other conditions.
|
||||
*/
|
||||
public interface PluggableJoinStrategy {
|
||||
|
||||
/**
 * Performs a join of two sub-sets of indices.
 *
 * @param leftGroup row indices of the left table that take part in this sub-join
 * @param rightGroup row indices of the right table that take part in this sub-join
 * @param resultBuilder builder to which every matching (left row, right row) pair is added
 * @param problemAggregator aggregator made available for reporting problems during the join
 */
|
||||
void joinSubsets(
|
||||
List<Integer> leftGroup,
|
||||
List<Integer> rightGroup,
|
||||
JoinResult.Builder resultBuilder,
|
||||
ProblemAggregator problemAggregator);
|
||||
}
|
@ -0,0 +1,162 @@
|
||||
package org.enso.table.data.table.join.between;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import org.enso.base.ObjectComparator;
|
||||
import org.enso.table.data.column.storage.Storage;
|
||||
import org.enso.table.data.index.OrderedMultiValueKey;
|
||||
import org.enso.table.data.table.join.JoinResult;
|
||||
import org.enso.table.data.table.join.JoinStrategy;
|
||||
import org.enso.table.data.table.join.PluggableJoinStrategy;
|
||||
import org.enso.table.data.table.join.conditions.Between;
|
||||
import org.enso.table.problems.ProblemAggregator;
|
||||
import org.graalvm.polyglot.Context;
|
||||
|
||||
public class SortJoin implements JoinStrategy, PluggableJoinStrategy {
|
||||
|
||||
public SortJoin(List<Between> conditions) {
|
||||
conditionsHelper = new JoinStrategy.ConditionsHelper(conditions);
|
||||
|
||||
Context context = Context.getCurrent();
|
||||
int nConditions = conditions.size();
|
||||
directions = new int[nConditions];
|
||||
leftStorages = new Storage<?>[nConditions];
|
||||
lowerStorages = new Storage<?>[nConditions];
|
||||
upperStorages = new Storage<?>[nConditions];
|
||||
for (int i = 0; i < nConditions; i++) {
|
||||
directions[i] = 1;
|
||||
leftStorages[i] = conditions.get(i).left().getStorage();
|
||||
lowerStorages[i] = conditions.get(i).rightLower().getStorage();
|
||||
upperStorages[i] = conditions.get(i).rightUpper().getStorage();
|
||||
context.safepoint();
|
||||
}
|
||||
}
|
||||
|
||||
private final JoinStrategy.ConditionsHelper conditionsHelper;
|
||||
|
||||
private final int[] directions;
|
||||
private final Storage<?>[] leftStorages;
|
||||
private final Storage<?>[] lowerStorages;
|
||||
private final Storage<?>[] upperStorages;
|
||||
|
||||
@Override
|
||||
public JoinResult join(ProblemAggregator problemAggregator) {
|
||||
Context context = Context.getCurrent();
|
||||
JoinResult.Builder resultBuilder = new JoinResult.Builder();
|
||||
|
||||
int leftRowCount = conditionsHelper.getLeftTableRowCount();
|
||||
int rightRowCount = conditionsHelper.getRightTableRowCount();
|
||||
if (leftRowCount == 0 || rightRowCount == 0) {
|
||||
// if one group is completely empty, there will be no matches to report
|
||||
return resultBuilder.build();
|
||||
}
|
||||
List<OrderedMultiValueKey> leftKeys = new ArrayList<>(leftRowCount);
|
||||
for (int i = 0; i < leftRowCount; i++) {
|
||||
leftKeys.add(new OrderedMultiValueKey(leftStorages, i, directions));
|
||||
context.safepoint();
|
||||
}
|
||||
|
||||
SortedListIndex<OrderedMultiValueKey> leftIndex = buildSortedLeftIndex(leftKeys);
|
||||
|
||||
for (int rightRowIx = 0; rightRowIx < rightRowCount; rightRowIx++) {
|
||||
addMatchingLeftRows(leftIndex, rightRowIx, resultBuilder);
|
||||
context.safepoint();
|
||||
}
|
||||
|
||||
return resultBuilder.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void joinSubsets(
|
||||
List<Integer> leftGroup,
|
||||
List<Integer> rightGroup,
|
||||
JoinResult.Builder resultBuilder,
|
||||
ProblemAggregator problemAggregator) {
|
||||
Context context = Context.getCurrent();
|
||||
|
||||
List<OrderedMultiValueKey> leftKeys =
|
||||
leftGroup.stream()
|
||||
.map(i -> new OrderedMultiValueKey(leftStorages, i, directions, objectComparator))
|
||||
.toList();
|
||||
if (leftKeys.isEmpty()) {
|
||||
// left group is completely empty - there will be no matches at all
|
||||
return;
|
||||
}
|
||||
|
||||
SortedListIndex<OrderedMultiValueKey> leftIndex = buildSortedLeftIndex(leftKeys);
|
||||
|
||||
for (int rightRowIx : rightGroup) {
|
||||
addMatchingLeftRows(leftIndex, rightRowIx, resultBuilder);
|
||||
context.safepoint();
|
||||
}
|
||||
}
|
||||
|
||||
private SortedListIndex<OrderedMultiValueKey> buildSortedLeftIndex(
|
||||
List<OrderedMultiValueKey> keys) {
|
||||
return SortedListIndex.build(keys, firstCoordinateComparator);
|
||||
}
|
||||
|
||||
private OrderedMultiValueKey buildLowerBound(int rightRowIx) {
|
||||
return new OrderedMultiValueKey(lowerStorages, rightRowIx, directions, objectComparator);
|
||||
}
|
||||
|
||||
private OrderedMultiValueKey buildUpperBound(int rightRowIx) {
|
||||
return new OrderedMultiValueKey(upperStorages, rightRowIx, directions, objectComparator);
|
||||
}
|
||||
|
||||
private void addMatchingLeftRows(
|
||||
SortedListIndex<OrderedMultiValueKey> sortedLeftIndex,
|
||||
int rightRowIx,
|
||||
JoinResult.Builder resultBuilder) {
|
||||
OrderedMultiValueKey lowerBound = buildLowerBound(rightRowIx);
|
||||
OrderedMultiValueKey upperBound = buildUpperBound(rightRowIx);
|
||||
|
||||
// If the match interval is invalid or empty, there is nothing to do.
|
||||
if (lowerBound.hasAnyNulls()
|
||||
|| upperBound.hasAnyNulls()
|
||||
|| lowerBound.compareTo(upperBound) > 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
List<OrderedMultiValueKey> firstCoordinateMatches =
|
||||
sortedLeftIndex.findSubRange(lowerBound, upperBound);
|
||||
Context context = Context.getCurrent();
|
||||
for (OrderedMultiValueKey key : firstCoordinateMatches) {
|
||||
if (isInRange(key, lowerBound, upperBound)) {
|
||||
resultBuilder.addRow(key.getRowIndex(), rightRowIx);
|
||||
}
|
||||
|
||||
context.safepoint();
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isInRange(
|
||||
OrderedMultiValueKey key, OrderedMultiValueKey lowerBound, OrderedMultiValueKey upperBound) {
|
||||
assert key.getNumberOfColumns() == lowerBound.getNumberOfColumns();
|
||||
assert key.getNumberOfColumns() == upperBound.getNumberOfColumns();
|
||||
|
||||
// Note: we cannot just use `compareTo`, because we are now not checking that the key is between
|
||||
// the bounds in lexicographic order.
|
||||
// Instead, we are checking if the key is between the bounds for all dimensions.
|
||||
|
||||
int n = key.getNumberOfColumns();
|
||||
for (int i = 0; i < n; i++) {
|
||||
var keyValue = key.get(i);
|
||||
var lowerBoundValue = lowerBound.get(i);
|
||||
var upperBoundValue = upperBound.get(i);
|
||||
boolean fitsInThisDimension =
|
||||
objectComparator.compare(keyValue, lowerBoundValue) >= 0
|
||||
&& objectComparator.compare(keyValue, upperBoundValue) <= 0;
|
||||
if (!fitsInThisDimension) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private final ObjectComparator objectComparator = ObjectComparator.DEFAULT;
|
||||
private final Comparator<OrderedMultiValueKey> firstCoordinateComparator =
|
||||
new OrderedMultiValueKey.ProjectionComparator(0);
|
||||
}
|
@ -0,0 +1,129 @@
|
||||
package org.enso.table.data.table.join.between;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * An index over a list of elements kept in the order defined by a comparator, which allows
 * finding all elements between two bounds using binary search.
 *
 * <p>Elements that compare as equal are all kept, so duplicate keys are supported.
 */
public class SortedListIndex<T> {
  /** Defines the <= ordering for the index. */
  private final Comparator<T> comparator;

  /* forall 0 <= i <= j < n, sortedList[i] <= sortedList[j] */
  private final List<T> sortedList;

  /**
   * Wraps a list that is already sorted according to the comparator.
   *
   * @param sortedList the elements, already sorted by {@code comparator}; it is not copied
   * @param comparator the ordering used for lookups
   */
  protected SortedListIndex(ArrayList<T> sortedList, Comparator<T> comparator) {
    this.comparator = comparator;
    this.sortedList = sortedList;
  }

  /**
   * Builds an index from an arbitrarily ordered list.
   *
   * @param list the elements to index; the list itself is left untouched, a sorted copy is made
   * @param comparator the ordering used for sorting and for lookups
   * @return the index over the sorted copy
   */
  public static <T> SortedListIndex<T> build(List<T> list, Comparator<T> comparator) {
    ArrayList<T> copy = new ArrayList<>(list);
    copy.sort(comparator);
    return new SortedListIndex<>(copy, comparator);
  }

  /**
   * Finds a sub-range of the index containing all elements between the lower and upper bounds
   * (both-ends inclusive).
   *
   * @return an empty list if no element fits between the bounds (in particular when the lower
   *     bound is greater than the upper bound); otherwise a {@code subList} view of the index
   */
  public List<T> findSubRange(T lowerBound, T upperBound) {
    int start = findLowerIndex(lowerBound);
    int end = findUpperIndex(upperBound) + 1;
    if (start >= end) {
      return Collections.emptyList();
    }

    return sortedList.subList(start, end);
  }

  /**
   * Finds the index of the first element that is greater than or equal to the argument.
   *
   * <p>If all elements are greater than the argument, returns 0. If all elements are less than the
   * argument, returns N.
   */
  private int findLowerIndex(T element) {
    int start = 0;
    int end = sortedList.size();

    /*
     * Loop invariants:
     * 1) start <= end
     * 2) forall 0 <= i < start: sortedList[i] < element
     * 3) forall end <= i < N: sortedList[i] >= element
     *
     * end - start is strictly decreasing, so the loop will always terminate.
     */
    while (start < end) {
      // start <= mid < end; computed from the difference, so it cannot overflow for any list size
      int mid = start + (end - start) / 2;
      int cmp = comparator.compare(sortedList.get(mid), element);
      if (cmp < 0) {
        start = mid + 1;
      } else {
        end = mid;
      }
    }

    /*
     * After the loop start == end, so from the invariants:
     * forall 0 <= i < start: sortedList[i] < element
     * forall start <= i < N: sortedList[i] >= element
     *
     * start is the first element that is >= element; if there is no such element, it is N.
     */
    return start;
  }

  /**
   * Finds the index of the last element that is less than or equal to the argument.
   *
   * <p>If all elements are greater than the argument, returns -1. If all elements are less than the
   * argument, returns N-1 (index of the last element).
   */
  private int findUpperIndex(T element) {
    int start = 0;
    int end = sortedList.size();

    /*
     * Loop invariants:
     * 1) start <= end
     * 2) forall 0 <= i < start: sortedList[i] <= element
     * 3) forall end <= i < N: sortedList[i] > element
     *
     * end - start is strictly decreasing, so the loop will always terminate.
     */
    while (start < end) {
      // start <= mid < end; computed from the difference, so it cannot overflow for any list size
      int mid = start + (end - start) / 2;
      int cmp = comparator.compare(sortedList.get(mid), element);
      if (cmp <= 0) {
        start = mid + 1;
      } else {
        end = mid;
      }
    }

    /*
     * After the loop start == end, so from the invariants:
     * forall 0 <= i < start: sortedList[i] <= element
     * forall start <= i < N: sortedList[i] > element
     *
     * So start-1 is the last element that is <= element; if there is no such element, it is -1.
     */
    return start - 1;
  }
}
|
@ -1,4 +1,4 @@
|
||||
package org.enso.table.data.table.join;
|
||||
package org.enso.table.data.table.join.conditions;
|
||||
|
||||
import org.enso.table.data.table.Column;
|
||||
|
@ -0,0 +1,5 @@
|
||||
package org.enso.table.data.table.join.conditions;
|
||||
|
||||
import org.enso.table.data.table.Column;
|
||||
|
||||
/**
 * A hashable join condition that pairs a {@code left} column with a {@code right} column.
 *
 * <p>NOTE(review): presumably rows match when the two column values are equal - confirm against
 * the strategy that interprets this condition.
 */
public record Equals(Column left, Column right) implements HashableCondition {}
|
@ -1,7 +1,7 @@
|
||||
package org.enso.table.data.table.join;
|
||||
package org.enso.table.data.table.join.conditions;
|
||||
|
||||
import org.enso.table.data.table.Column;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
public record EqualsIgnoreCase(Column left, Column right, Locale locale) implements JoinCondition {}
|
||||
public record EqualsIgnoreCase(Column left, Column right, Locale locale) implements HashableCondition {}
|
@ -0,0 +1,4 @@
|
||||
package org.enso.table.data.table.join.conditions;
|
||||
|
||||
/**
 * A join condition of one of the two permitted kinds: {@code Equals} or {@code EqualsIgnoreCase}.
 * It declares no members of its own.
 */
public sealed interface HashableCondition extends JoinCondition permits Equals, EqualsIgnoreCase {
|
||||
}
|
@ -0,0 +1,3 @@
|
||||
package org.enso.table.data.table.join.conditions;
|
||||
|
||||
/**
 * The closed set of join conditions: a condition is either a {@code HashableCondition} or a
 * {@code Between}. It declares no members of its own.
 */
public sealed interface JoinCondition permits HashableCondition, Between {}
|
@ -0,0 +1,84 @@
|
||||
package org.enso.table.data.table.join.hashing;
|
||||
|
||||
import org.enso.base.text.TextFoldingStrategy;
|
||||
import org.enso.table.data.index.MultiValueIndex;
|
||||
import org.enso.table.data.index.UnorderedMultiValueKey;
|
||||
import org.enso.table.data.table.Column;
|
||||
import org.enso.table.data.table.join.JoinResult;
|
||||
import org.enso.table.data.table.join.JoinStrategy;
|
||||
import org.enso.table.data.table.join.PluggableJoinStrategy;
|
||||
import org.enso.table.data.table.join.conditions.Equals;
|
||||
import org.enso.table.data.table.join.conditions.EqualsIgnoreCase;
|
||||
import org.enso.table.data.table.join.conditions.HashableCondition;
|
||||
import org.enso.table.problems.ProblemAggregator;
|
||||
import org.graalvm.polyglot.Context;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A strategy that uses a hash-map to perform join on the equality conditions.
|
||||
* <p>
|
||||
* It then delegates to {@code remainingMatcher} to perform the remaining conditions on the matching pairs of row
|
||||
* subsets.
|
||||
*/
|
||||
public class HashJoin implements JoinStrategy {
|
||||
public HashJoin(List<HashableCondition> conditions, PluggableJoinStrategy remainingMatcher) {
|
||||
conditionsHelper = new JoinStrategy.ConditionsHelper(conditions);
|
||||
this.remainingMatcher = remainingMatcher;
|
||||
|
||||
List<HashEqualityCondition> equalConditions =
|
||||
conditions.stream().map(HashJoin::makeHashEqualityCondition).toList();
|
||||
|
||||
if (equalConditions.isEmpty()) {
|
||||
throw new IllegalArgumentException("EqualityHashJoin is applicable if there is at least one equality condition.");
|
||||
}
|
||||
|
||||
leftEquals = equalConditions.stream().map(HashEqualityCondition::left).toArray(Column[]::new);
|
||||
rightEquals = equalConditions.stream().map(HashEqualityCondition::right).toArray(Column[]::new);
|
||||
textFoldingStrategies = equalConditions.stream().map(HashEqualityCondition::textFoldingStrategy).toList();
|
||||
}
|
||||
|
||||
private final JoinStrategy.ConditionsHelper conditionsHelper;
|
||||
private final Column[] leftEquals, rightEquals;
|
||||
private final List<TextFoldingStrategy> textFoldingStrategies;
|
||||
private final PluggableJoinStrategy remainingMatcher;
|
||||
|
||||
@Override
|
||||
public JoinResult join(ProblemAggregator problemAggregator) {
|
||||
Context context = Context.getCurrent();
|
||||
|
||||
var leftIndex = MultiValueIndex.makeUnorderedIndex(leftEquals, conditionsHelper.getLeftTableRowCount(),
|
||||
textFoldingStrategies, problemAggregator);
|
||||
var rightIndex = MultiValueIndex.makeUnorderedIndex(rightEquals, conditionsHelper.getRightTableRowCount(),
|
||||
textFoldingStrategies, problemAggregator);
|
||||
|
||||
JoinResult.Builder resultBuilder = new JoinResult.Builder();
|
||||
for (var leftEntry : leftIndex.mapping().entrySet()) {
|
||||
UnorderedMultiValueKey leftKey = leftEntry.getKey();
|
||||
List<Integer> leftRows = leftEntry.getValue();
|
||||
List<Integer> rightRows = rightIndex.get(leftKey);
|
||||
|
||||
if (rightRows != null) {
|
||||
remainingMatcher.joinSubsets(leftRows, rightRows, resultBuilder, problemAggregator);
|
||||
}
|
||||
|
||||
context.safepoint();
|
||||
}
|
||||
|
||||
return resultBuilder.build();
|
||||
}
|
||||
|
||||
private static HashEqualityCondition makeHashEqualityCondition(HashableCondition eq) {
|
||||
switch (eq) {
|
||||
case Equals e -> {
|
||||
return new HashEqualityCondition(e.left(), e.right(), TextFoldingStrategy.unicodeNormalizedFold);
|
||||
}
|
||||
case EqualsIgnoreCase e -> {
|
||||
return new HashEqualityCondition(e.left(), e.right(), TextFoldingStrategy.caseInsensitiveFold(e.locale()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private record HashEqualityCondition(Column left, Column right, TextFoldingStrategy textFoldingStrategy) {
|
||||
}
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
package org.enso.table.data.table.join;
|
||||
package org.enso.table.data.table.join.lookup;
|
||||
|
||||
import org.enso.table.data.column.storage.type.StorageType;
|
||||
import org.enso.table.data.table.Column;
|
@ -1,4 +1,4 @@
|
||||
package org.enso.table.data.table.join;
|
||||
package org.enso.table.data.table.join.lookup;
|
||||
|
||||
import org.enso.base.text.TextFoldingStrategy;
|
||||
import org.enso.table.data.column.builder.Builder;
|
||||
@ -9,6 +9,7 @@ import org.enso.table.data.index.UnorderedMultiValueKey;
|
||||
import org.enso.table.data.mask.OrderMask;
|
||||
import org.enso.table.data.table.Column;
|
||||
import org.enso.table.data.table.Table;
|
||||
import org.enso.table.data.table.join.conditions.Equals;
|
||||
import org.enso.table.error.NonUniqueLookupKey;
|
||||
import org.enso.table.error.NullValuesInKeyColumns;
|
||||
import org.enso.table.error.UnmatchedRow;
|
||||
@ -17,7 +18,6 @@ import org.enso.table.util.ConstantList;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
public class LookupJoin {
|
@ -1,5 +0,0 @@
|
||||
package org.enso.table.data.table.join.scan;
|
||||
|
||||
/** A predicate over a pair of rows, one taken from each side of a join. */
public interface Matcher {
  /**
   * Checks whether the given pair of rows satisfies the condition.
   *
   * @param left row index on the left side
   * @param right row index on the right side
   * @return {@code true} if the pair matches
   */
  boolean matches(int left, int right);
}
|
@ -1,152 +0,0 @@
|
||||
package org.enso.table.data.table.join.scan;
|
||||
|
||||
import org.enso.base.ObjectComparator;
|
||||
import org.enso.base.Text_Utils;
|
||||
import org.enso.base.polyglot.NumericConverter;
|
||||
import org.enso.table.data.column.storage.Storage;
|
||||
import org.enso.table.data.column.storage.StringStorage;
|
||||
import org.enso.table.data.table.join.Between;
|
||||
import org.enso.table.data.table.join.Equals;
|
||||
import org.enso.table.data.table.join.EqualsIgnoreCase;
|
||||
import org.enso.table.data.table.join.JoinCondition;
|
||||
import org.enso.table.data.table.problems.FloatingPointGrouping;
|
||||
import org.enso.table.problems.ColumnAggregatedProblemAggregator;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class MatcherFactory {
|
||||
public Matcher create(JoinCondition condition, ColumnAggregatedProblemAggregator problemAggregator) {
|
||||
return switch (condition) {
|
||||
case Equals eq -> new EqualsMatcher(eq, problemAggregator);
|
||||
case EqualsIgnoreCase eq -> new EqualsIgnoreCaseMatcher(eq);
|
||||
case Between between -> new BetweenMatcher(between);
|
||||
default -> throw new UnsupportedOperationException(
|
||||
"Unsupported join condition: " + condition);
|
||||
};
|
||||
}
|
||||
|
||||
public Matcher create(List<JoinCondition> condition, ColumnAggregatedProblemAggregator problemAggregator) {
|
||||
List<Matcher> matchers = condition.stream().map(m-> create(m, problemAggregator)).collect(Collectors.toList());
|
||||
return new CompoundMatcher(matchers);
|
||||
}
|
||||
|
||||
static final class CompoundMatcher implements Matcher {
|
||||
private final List<Matcher> matchers;
|
||||
|
||||
CompoundMatcher(List<Matcher> matchers) {
|
||||
this.matchers = matchers;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matches(int left, int right) {
|
||||
for (Matcher matcher : matchers) {
|
||||
if (!matcher.matches(left, right)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
static final class EqualsMatcher implements Matcher {
|
||||
private final Storage<?> leftStorage;
|
||||
private final Storage<?> rightStorage;
|
||||
private final String leftColumnName;
|
||||
private final String rightColumnName;
|
||||
private final ColumnAggregatedProblemAggregator problemAggregator;
|
||||
|
||||
public EqualsMatcher(Equals eq, ColumnAggregatedProblemAggregator problemAggregator) {
|
||||
leftStorage = eq.left().getStorage();
|
||||
rightStorage = eq.right().getStorage();
|
||||
leftColumnName = eq.left().getName();
|
||||
rightColumnName = eq.right().getName();
|
||||
this.problemAggregator = problemAggregator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matches(int left, int right) {
|
||||
Object leftValue = leftStorage.getItemBoxed(left);
|
||||
Object rightValue = rightStorage.getItemBoxed(right);
|
||||
|
||||
if (NumericConverter.isFloatLike(leftValue)) {
|
||||
problemAggregator.reportColumnAggregatedProblem(new FloatingPointGrouping(leftColumnName, left));
|
||||
}
|
||||
|
||||
if (NumericConverter.isFloatLike(rightValue)) {
|
||||
problemAggregator.reportColumnAggregatedProblem(new FloatingPointGrouping(rightColumnName, right));
|
||||
}
|
||||
|
||||
return ObjectComparator.areEqual(leftValue, rightValue);
|
||||
}
|
||||
}
|
||||
|
||||
static final class EqualsIgnoreCaseMatcher implements Matcher {
|
||||
private final StringStorage leftStorage;
|
||||
private final StringStorage rightStorage;
|
||||
|
||||
private final Locale locale;
|
||||
|
||||
public EqualsIgnoreCaseMatcher(EqualsIgnoreCase eq) {
|
||||
if (eq.left().getStorage() instanceof StringStorage leftStrings) {
|
||||
leftStorage = leftStrings;
|
||||
} else {
|
||||
throw new IllegalArgumentException("Expected left column to have type Text.");
|
||||
}
|
||||
|
||||
if (eq.right().getStorage() instanceof StringStorage rightStrings) {
|
||||
rightStorage = rightStrings;
|
||||
} else {
|
||||
throw new IllegalArgumentException("Expected right column to have type Text.");
|
||||
}
|
||||
|
||||
locale = eq.locale();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matches(int left, int right) {
|
||||
String leftValue = leftStorage.getItem(left);
|
||||
String rightValue = rightStorage.getItem(right);
|
||||
|
||||
if (leftValue == null && rightValue == null) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (leftValue == null || rightValue == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return Text_Utils.equals_ignore_case(leftValue, rightValue, locale);
|
||||
}
|
||||
}
|
||||
|
||||
static final class BetweenMatcher implements Matcher {
|
||||
private final Storage<?> leftStorage;
|
||||
private final Storage<?> rightLowerStorage;
|
||||
private final Storage<?> rightUpperStorage;
|
||||
|
||||
public BetweenMatcher(Between between) {
|
||||
leftStorage = between.left().getStorage();
|
||||
rightLowerStorage = between.rightLower().getStorage();
|
||||
rightUpperStorage = between.rightUpper().getStorage();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matches(int left, int right) {
|
||||
Object leftValue = leftStorage.getItemBoxed(left);
|
||||
Object rightLowerValue = rightLowerStorage.getItemBoxed(right);
|
||||
Object rightUpperValue = rightUpperStorage.getItemBoxed(right);
|
||||
|
||||
// If any value is missing, such a pair of rows is never correlated with Between as we assume
|
||||
// the ordering is not well-defined for missing values.
|
||||
if (leftValue == null || rightLowerValue == null || rightUpperValue == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return ObjectComparator.DEFAULT.compare(leftValue, rightLowerValue) >= 0
|
||||
&& ObjectComparator.DEFAULT.compare(leftValue, rightUpperValue) <= 0;
|
||||
}
|
||||
}
|
||||
}
|
@ -2,7 +2,6 @@ package org.enso.table.operations;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -18,7 +17,6 @@ import org.enso.table.data.table.Column;
|
||||
import org.enso.table.problems.ColumnAggregatedProblemAggregator;
|
||||
import org.enso.table.problems.ProblemAggregator;
|
||||
import org.enso.table.util.ConstantList;
|
||||
import org.graalvm.collections.Pair;
|
||||
|
||||
public class AddRowNumber {
|
||||
|
||||
@ -62,18 +60,17 @@ public class AddRowNumber {
|
||||
Storage<?>[] orderingStorages =
|
||||
Arrays.stream(orderingColumns).map(Column::getStorage).toArray(Storage[]::new);
|
||||
long[] numbers = new long[n];
|
||||
List<Pair<OrderedMultiValueKey, Integer>> keys =
|
||||
List<OrderedMultiValueKey> keys =
|
||||
new ArrayList<>(
|
||||
IntStream.range(0, n)
|
||||
.mapToObj(
|
||||
i -> Pair.create(new OrderedMultiValueKey(orderingStorages, i, directions), i))
|
||||
.mapToObj(i -> new OrderedMultiValueKey(orderingStorages, i, directions))
|
||||
.toList());
|
||||
|
||||
keys.sort(OrderedPairComparator.INSTANCE);
|
||||
keys.sort(null);
|
||||
|
||||
RangeIterator it = new RangeIterator(start, step);
|
||||
for (var key : keys) {
|
||||
numbers[key.getRight()] = it.next();
|
||||
numbers[key.getRowIndex()] = it.next();
|
||||
}
|
||||
return new LongStorage(numbers, IntegerType.INT_64);
|
||||
}
|
||||
@ -103,44 +100,21 @@ public class AddRowNumber {
|
||||
|
||||
for (var entry : groupIndex.mapping().entrySet()) {
|
||||
List<Integer> indices = entry.getValue();
|
||||
List<Pair<OrderedMultiValueKey, Integer>> orderingKeys =
|
||||
List<OrderedMultiValueKey> orderingKeys =
|
||||
new ArrayList<>(
|
||||
indices.stream()
|
||||
.map(
|
||||
i ->
|
||||
Pair.create(new OrderedMultiValueKey(orderingStorages, i, directions), i))
|
||||
.map(i -> new OrderedMultiValueKey(orderingStorages, i, directions))
|
||||
.toList());
|
||||
orderingKeys.sort(OrderedPairComparator.INSTANCE);
|
||||
orderingKeys.sort(null);
|
||||
RangeIterator it = new RangeIterator(start, step);
|
||||
for (var key : orderingKeys) {
|
||||
numbers[key.getRight()] = it.next();
|
||||
for (OrderedMultiValueKey key : orderingKeys) {
|
||||
numbers[key.getRowIndex()] = it.next();
|
||||
}
|
||||
}
|
||||
|
||||
return new LongStorage(numbers, IntegerType.INT_64);
|
||||
}
|
||||
|
||||
private static class OrderedPairComparator
|
||||
implements Comparator<Pair<OrderedMultiValueKey, Integer>> {
|
||||
@Override
|
||||
public int compare(
|
||||
Pair<OrderedMultiValueKey, Integer> o1, Pair<OrderedMultiValueKey, Integer> o2) {
|
||||
int p1 = o1.getLeft().compareTo(o2.getLeft());
|
||||
if (p1 != 0) {
|
||||
return p1;
|
||||
}
|
||||
|
||||
return o1.getRight().compareTo(o2.getRight());
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
return obj instanceof OrderedPairComparator;
|
||||
}
|
||||
|
||||
static OrderedPairComparator INSTANCE = new OrderedPairComparator();
|
||||
}
|
||||
|
||||
/**
|
||||
* A helper for computing consecutive numbers based on a start and step. It will throw an {@link
|
||||
* java.lang.ArithmeticException} if the next number overflows.
|
||||
|
@ -10,6 +10,7 @@ import project.Table.Aggregate
|
||||
import project.Table.Arithmetic
|
||||
import project.Table.Column_From_Vector
|
||||
import project.Table.Cross_Tab
|
||||
import project.Table.Join
|
||||
import project.Table.Sorting
|
||||
import project.Table.Internal.Multi_Value_Key
|
||||
import project.Text.Build
|
||||
@ -54,6 +55,7 @@ all_benchmarks =
|
||||
builder.append Add_Row_Number.collect_benches
|
||||
builder.append Column_From_Vector.collect_benches
|
||||
builder.append Cross_Tab.collect_benches
|
||||
builder.append Join.collect_benches
|
||||
builder.append Sorting.collect_benches
|
||||
builder.append Multi_Value_Key.collect_benches
|
||||
|
||||
|
48
test/Benchmarks/src/Table/Is_In.enso
Normal file
48
test/Benchmarks/src/Table/Is_In.enso
Normal file
@ -0,0 +1,48 @@
|
||||
from Standard.Base import all
|
||||
from Standard.Base.Runtime import assert
|
||||
|
||||
from Standard.Table import all
|
||||
|
||||
from Standard.Test import Bench
|
||||
|
||||
from project.Config import extended_tests
|
||||
|
||||
options = Bench.options . set_warmup (Bench.phase_conf 1 2) . set_measure (Bench.phase_conf 2 3)
|
||||
|
||||
## The input of the `Is_In` benchmarks: the table to filter and the vectors of
   values that the filters look for.
type Scenario
    ## `table` is the table being filtered; the three vectors are passed to
       `Filter_Condition.Is_In` by the individual benchmarks.
    Value table ints_vec dates_vec bool_vec
|
||||
|
||||
## Builds the table with the `X`, `dates` and `bools` columns together with the
   lookup vectors used by the `Is_In` filters.
create_scenario =
    epoch = Date_Time.new 2000 1 1
    # Converts a number of seconds into a date-time counted from `epoch`.
    seconds_to_date offset = epoch + (Duration.new seconds=offset)

    lookup_ints = (40000.up_to 130000).to_vector
    lookup_dates = lookup_ints.map seconds_to_date
    lookup_bools = Vector.fill 7000 True

    base = Table.new [["X", (200.up_to 10000).to_vector]]
    x_column = base.at "X"
    with_dates = base.set (x_column.map seconds_to_date) "dates"
    with_bools = with_dates.set (x_column % 2 == 0) "bools"
    Scenario.Value with_bools lookup_ints lookup_dates lookup_bools
|
||||
|
||||
## Holder of the benchmark scenario.
type Data
    ## The scenario is a lazy field (`~`), so it is only built when a benchmark
       first reads it.
    Value ~scenario

    ## Creates the holder without building the scenario yet.
    create = Data.Value create_scenario
|
||||
|
||||
## Registers the `Filter_Is_In` benchmarks: one filter per column type, each
   using `Filter_Condition.Is_In` with the matching lookup vector.
collect_benches = Bench.build builder->
    data = Data.create

    builder.group "Filter_Is_In" options group_builder->
        group_builder.specify "integers" <|
            input = data.scenario
            wanted = Filter_Condition.Is_In input.ints_vec
            input.table.filter "X" wanted

        group_builder.specify "dates" <|
            input = data.scenario
            wanted = Filter_Condition.Is_In input.dates_vec
            input.table.filter "dates" wanted

        group_builder.specify "bools" <|
            input = data.scenario
            wanted = Filter_Condition.Is_In input.bool_vec
            input.table.filter "bools" wanted
|
||||
|
||||
main = collect_benches . run_main
|
222
test/Benchmarks/src/Table/Join.enso
Normal file
222
test/Benchmarks/src/Table/Join.enso
Normal file
@ -0,0 +1,222 @@
|
||||
from Standard.Base import all
|
||||
from Standard.Base.Runtime import assert
|
||||
|
||||
from Standard.Table import all
|
||||
|
||||
from Standard.Test import Bench
|
||||
|
||||
from project.Config import extended_tests
|
||||
|
||||
options = Bench.options . set_warmup (Bench.phase_conf 2 5) . set_measure (Bench.phase_conf 2 5)
|
||||
|
||||
## A pair of tables that a join benchmark joins together.
type Scenario
    ## `table1` and `table2` are the two inputs of the join.
    Value table1 table2
|
||||
|
||||
## Returns a sample of `vec` that is as long as `vec` itself, taken with a
   fixed seed so that every run sees the same order.
shuffle vec =
    whole_sample = Index_Sub_Range.Sample vec.length seed=42
    vec.take whole_sample
|
||||
|
||||
## Two single-column tables holding the same `num_rows` distinct keys; the
   second table has them shuffled.
create_scenario_equals num_rows =
    keys = 0.up_to num_rows . to_vector
    ordered = Table.new [["key", keys]]
    shuffled = Table.new [["key", shuffle keys]]
    Scenario.Value ordered shuffled
|
||||
|
||||
## Two single-column tables in which every key is repeated 30 times; the keys
   of the second table are reversed and shifted by 2.
create_scenario_equals_medium_groups num_rows =
    grouped_keys = (0.up_to num_rows).map ix-> (ix/30).floor
    shifted_keys = grouped_keys.reverse.map key-> key+2
    left = Table.new [["key", grouped_keys]]
    right = Table.new [["key", shifted_keys]]
    Scenario.Value left right
|
||||
|
||||
## Two tables of text keys that differ only by the case of their first letter
   (`a<i>` against `A<i>`); the second table lists them in reverse order.
create_scenario_equals_ignore_case num_rows =
    lower_keys = (0.up_to num_rows).map ix-> "a"+ix.to_text
    upper_keys = (0.up_to num_rows).reverse.map ix-> "A"+ix.to_text
    left = Table.new [["key", lower_keys]]
    right = Table.new [["case_insensitive_key", upper_keys]]
    Scenario.Value left right
|
||||
|
||||
## A table of points spaced 100 apart (shuffled) and a table of ranges, where
   the i-th range spans from 10 below to 50 above the i-th point.
create_scenario_between num_rows =
    points = (0.up_to num_rows).map ix-> ix*100
    range_starts = points.map p-> p - 10
    range_ends = points.map p-> p + 50

    left = Table.new [["x", shuffle points]]
    right = Table.new [["lows", range_starts], ["highs", range_ends]]
    Scenario.Value left right
|
||||
|
||||
## The mixed scenario builds two tables whose rows are mapped 1-1, split into 3
   groups. Within a group only one kind of 'key' varies, while the keys of the
   other two kinds are constant.

   As a result the combined join has to be efficient for all conditions,
   whatever the distribution of keys is. Grouping naively by a subset of the
   keys and brute-forcing the rest does not help: splitting by any subset of
   the keys still leaves a big group, and only splitting by all 3 keys yields
   small (1-1) groups.
create_scenario_mixed num_rows =
    group_size = (num_rows/3).round
    eq_keys = (0.up_to group_size).to_vector
    lower_texts = (0.up_to group_size).map i-> "a"+i.to_text
    upper_texts = (0.up_to group_size).map i-> "A"+i.to_text
    points = (0.up_to group_size).map i-> 1000 + i*100

    same_eq = Vector.new group_size _-> 1
    same_text = Vector.new group_size _-> "_"
    same_point = Vector.new group_size _-> 0

    left =
        varying_eq = Table.new [["EQ", shuffle eq_keys], ["case_insensitive", same_text], ["x", same_point]]
        varying_text = Table.new [["EQ", same_eq], ["case_insensitive", shuffle lower_texts], ["x", same_point]]
        varying_point = Table.new [["EQ", same_eq], ["case_insensitive", same_text], ["x", shuffle points]]
        varying_eq.union [varying_text, varying_point]

    right =
        # Each range spans from 10 below to 30 above the corresponding point.
        range_starts = points.map p-> p - 10
        range_ends = points.map p-> p + 30

        varying_eq = Table.new [["EQ", shuffle eq_keys], ["case_insensitive", same_text], ["lows", same_point], ["highs", same_point]]
        varying_text = Table.new [["EQ", same_eq], ["case_insensitive", shuffle upper_texts], ["lows", same_point], ["highs", same_point]]
        varying_range = Table.new [["EQ", same_eq], ["case_insensitive", same_text], ["lows", range_starts], ["highs", range_ends]]
        varying_eq.union [varying_text, varying_range]

    Scenario.Value left right
|
||||
|
||||
## The 2d equality scenario joins rows on 2 keys at once: it matches the
   corresponding points of a 2d grid.

   It checks that joins on multiple keys are efficient as well.
create_scenario_equals_2d num_rows =
    side = num_rows.sqrt.ceil
    grid = (0.up_to side).to_vector.flat_map x->
        (0.up_to side).map y-> [x, y]

    shuffled_grid = shuffle grid

    left = Table.new [["x", shuffled_grid.map .first], ["y", shuffled_grid.map .second]]
    right = Table.new [["x", grid.map .first], ["y", grid.map .second]]
    Scenario.Value left right
|
||||
|
||||
## Like the 2d equality scenario, this builds a 2d grid of points, but the
   points are matched with the Between condition: every point of the right
   table is widened into a range reaching 0.1 in each direction.
create_scenario_between_2d num_rows =
    side = num_rows.sqrt.ceil
    grid = (0.up_to side).to_vector.flat_map x->
        (0.up_to side).map y-> [x, y]

    shuffled_grid = shuffle grid
    left = Table.new [["x", shuffled_grid.map .first], ["y", shuffled_grid.map .second]]

    x_lows = grid.map p-> p.first - 0.1
    y_lows = grid.map p-> p.second - 0.1
    x_highs = grid.map p-> p.first + 0.1
    y_highs = grid.map p-> p.second + 0.1

    right = Table.new [["x_lows", x_lows], ["y_lows", y_lows], ["x_highs", x_highs], ["y_highs", y_highs]]
    Scenario.Value left right
|
||||
|
||||
## In this scenario the left table is a 2d grid of points, while each row of
   the right table holds a pair of coordinate ranges describing a belt of size
   2 x n on that grid.

   Some belts are horizontal and some are vertical (marked by the
   `is_vertical` column), which shows how the order of the Between arguments
   affects performance.
create_scenario_between_2d_belts num_rows =
    side = num_rows.sqrt.ceil
    grid = (0.up_to side).to_vector.flat_map x->
        (0.up_to side).map y-> [x, y]

    shuffled_grid = shuffle grid
    left = Table.new [["x", shuffled_grid.map .first], ["y", shuffled_grid.map .second]]

    # Each belt row is: x_lows, x_highs, y_lows, y_highs, is_vertical.
    belts_narrow_in_x = Vector.new side x->
        [x, x+1, 0, side, False]
    belts_narrow_in_y = Vector.new side y->
        [0, side, y, y+1, True]

    belt_columns = ["x_lows", "x_highs", "y_lows", "y_highs", "is_vertical"]
    right = Table.from_rows belt_columns (belts_narrow_in_x + belts_narrow_in_y)
    Scenario.Value left right
|
||||
|
||||
## A scenario for finding the rows that have no match in another table.

   It is deliberately set up so that the intersection of the two tables is
   very large. It can only be fast if the anti-join avoids computing that
   intersection, which is not needed to answer the query.
create_scenario_antijoin num_rows =
    all_ones = Vector.new num_rows _-> 1

    ## The first 1000 rows are unmatched (and should be returned by the anti-join).
       Every other row matches _all_ rows of `all_ones`, creating a huge intersection.
    mostly_ones = Vector.new num_rows ix->
        if ix < 1000 then -ix else 1

    left = Table.new [["key", all_ones]]
    right = Table.new [["key", mostly_ones]]
    Scenario.Value left right
|
||||
|
||||
## Holder of all join benchmark scenarios.
type Data
    ## Every scenario is a lazy field (`~`), so it is only built when a
       benchmark first reads it.
    Value ~equals ~equals_medium_groups ~equals_ignore_case ~between ~mixed ~equals2d ~between2d ~between2d_belts ~antijoin

    ## Creates the holder for scenarios of the given size. The scenario
       builders are passed directly as constructor arguments so that they stay
       suspended.
    create num_rows =
        Data.Value (create_scenario_equals num_rows) (create_scenario_equals_medium_groups num_rows) (create_scenario_equals_ignore_case num_rows) (create_scenario_between num_rows) (create_scenario_mixed num_rows) (create_scenario_equals_2d num_rows) (create_scenario_between_2d num_rows) (create_scenario_between_2d_belts num_rows) (create_scenario_antijoin num_rows)
|
||||
|
||||
## Registers the join benchmarks. Most of them also assert the size of the
   result. The belts and anti-join benchmarks are only registered when
   `extended_tests` is set.
collect_benches = Bench.build builder->
    num_rows = 50000
    data = Data.create num_rows
    group_name = "Join_" + num_rows.to_text

    builder.group group_name options group_builder->
        group_builder.specify "Equals" <|
            input = data.equals
            joined = input.table1.join input.table2 on="key"
            assert (joined.row_count == num_rows)

        group_builder.specify "Equals_Medium_Groups" <|
            input = data.equals_medium_groups
            input.table1.join input.table2 on="key"

        group_builder.specify "Equals_Ignore_Case" <|
            input = data.equals_ignore_case
            condition = Join_Condition.Equals_Ignore_Case "key" "case_insensitive_key"
            joined = input.table1.join input.table2 on=condition
            assert (joined.row_count == num_rows)

        group_builder.specify "Between" <|
            input = data.between
            condition = Join_Condition.Between "x" "lows" "highs"
            joined = input.table1.join input.table2 on=condition
            assert (joined.row_count == num_rows)

        group_builder.specify "Mixed" <|
            input = data.mixed
            conditions = [Join_Condition.Equals "EQ", Join_Condition.Equals_Ignore_Case "case_insensitive", Join_Condition.Between "x" "lows" "highs"]
            joined = input.table1.join input.table2 on=conditions
            assert (joined.row_count == input.table1.row_count)

        group_builder.specify "Equals_2D" <|
            input = data.equals2d
            joined = input.table1.join input.table2 on=["x", "y"]
            assert (joined.row_count == input.table1.row_count)

        group_builder.specify "Between_2D" <|
            input = data.between2d
            conditions = [Join_Condition.Between "x" "x_lows" "x_highs", Join_Condition.Between "y" "y_lows" "y_highs"]
            joined = input.table1.join input.table2 on=conditions
            assert (joined.row_count == input.table1.row_count)

        # NOTE(review): the belts span `x..x+1` (or `y..y+1`), so a point may fall into more than
        # one belt and the joined row count may exceed `table1.row_count` - confirm the expected
        # counts asserted in the three belts benchmarks below.
        if extended_tests then group_builder.specify "Between_2D_Belts_All" <|
            input = data.between2d_belts
            conditions = [Join_Condition.Between "x" "x_lows" "x_highs", Join_Condition.Between "y" "y_lows" "y_highs"]
            joined = input.table1.join input.table2 on=conditions
            assert (joined.row_count == input.table1.row_count)

        if extended_tests then group_builder.specify "Between_2D_Belts_V" <|
            input = data.between2d_belts
            vertical_belts = input.table2.filter "is_vertical" Filter_Condition.Is_True
            conditions = [Join_Condition.Between "x" "x_lows" "x_highs", Join_Condition.Between "y" "y_lows" "y_highs"]
            joined = input.table1.join vertical_belts on=conditions
            assert (joined.row_count == input.table1.row_count)

        if extended_tests then group_builder.specify "Between_2D_Belts_H" <|
            input = data.between2d_belts
            horizontal_belts = input.table2.filter "is_vertical" Filter_Condition.Is_False
            conditions = [Join_Condition.Between "x" "x_lows" "x_highs", Join_Condition.Between "y" "y_lows" "y_highs"]
            joined = input.table1.join horizontal_belts on=conditions
            assert (joined.row_count == input.table1.row_count)

        # TODO this should be part of the main tests, but it was causing issues on CI; re-enable this with #8217
        if extended_tests then group_builder.specify "AntiJoin" <|
            input = data.antijoin
            unmatched = input.table2.join input.table1 on="key" join_kind=Join_Kind.Left_Exclusive
            assert (unmatched.row_count == 1000)
|
||||
|
||||
main = collect_benches . run_main
|
@ -103,6 +103,15 @@ spec setup =
|
||||
r = t3.join t4 join_kind=Join_Kind.Inner on=["X", "Y"] |> materialize |> _.order_by ["X", "Y", "Z", "Right Z"]
|
||||
check_xy_joined r
|
||||
|
||||
Test.specify "should correctly handle duplicated rows in Equals" <|
|
||||
t1 = table_builder [["X", [1, 2, 2, 3]]]
|
||||
t2 = table_builder [["X", [1, 2, 2, 4]]]
|
||||
r1 = t1.join t2 join_kind=Join_Kind.Full on="X" . order_by "X"
|
||||
within_table r1 <|
|
||||
# Each of the two 2's in t1 matches both 2's in t2, so in total we get 4 pairs of `2`:
|
||||
r1.at "X" . to_vector . should_equal [Nothing, 1, 2, 2, 2, 2, 3]
|
||||
r1.at "Right X" . to_vector . should_equal [4, 1, 2, 2, 2, 2, Nothing]
|
||||
|
||||
Test.specify "should allow to join on text equality ignoring case" <|
|
||||
t1 = table_builder [["X", ["a", "B"]], ["Y", [1, 2]]]
|
||||
t2 = table_builder [["X", ["A", "a", "b"]], ["Z", [1, 2, 3]]]
|
||||
@ -170,7 +179,7 @@ spec setup =
|
||||
t2 = table_builder [["lower", [1, 10, 8, 12]], ["upper", [1, 12, 30, 0]], ["Z", [1, 2, 3, 4]]]
|
||||
|
||||
r1 = t1.join join_kind=Join_Kind.Inner t2 on=(Join_Condition.Between "X" "lower" "upper") |> materialize |> _.order_by ["X", "Z"]
|
||||
expect_column_names ["X", "Y", "lower", "upper", "Z"] r1
|
||||
r1.column_names . should_equal ["X", "Y", "lower", "upper", "Z"]
|
||||
r1 . at "X" . to_vector . should_equal [1, 10, 10, 12, 12]
|
||||
r1 . at "Y" . to_vector . should_equal [1, 2, 2, 3, 3]
|
||||
r1 . at "lower" . to_vector . should_equal [1, 10, 8, 10, 8]
|
||||
@ -182,13 +191,71 @@ spec setup =
|
||||
t2 = table_builder [["lower", ["a", "b"]], ["upper", ["a", "ccc"]], ["Z", [10, 20]]]
|
||||
|
||||
r1 = t1.join t2 join_kind=Join_Kind.Inner on=(Join_Condition.Between "X" "lower" "upper") |> materialize |> _.order_by ["X", "Z"]
|
||||
expect_column_names ["X", "Y", "lower", "upper", "Z"] r1
|
||||
r1.column_names . should_equal ["X", "Y", "lower", "upper", "Z"]
|
||||
r1 . at "X" . to_vector . should_equal ["a", "b", "c"]
|
||||
r1 . at "Y" . to_vector . should_equal [1, 2, 3]
|
||||
r1 . at "lower" . to_vector . should_equal ["a", "b", "b"]
|
||||
r1 . at "upper" . to_vector . should_equal ["a", "ccc", "ccc"]
|
||||
r1 . at "Z" . to_vector . should_equal [10, 20, 20]
|
||||
|
||||
Test.specify "should correctly handle Between edge cases (1)" pending=(if prefix.contains "PostgreSQL" then "TODO: fix issue #8243") <|
|
||||
# 1. multiple rows with the same key value on the left side
|
||||
# 2. fully duplicated rows (1, 7) on the left side
|
||||
# 3. empty bounds (lower > upper: 10 > 0)
|
||||
# 4. equal bounds (10 = 10)
|
||||
# 5. unmatched rows on both sides - Full join
|
||||
t1 = table_builder [["X", [1, 10, 20, 1, 2, 1, 1]], ["id", [1, 2, 3, 4, 5, 7, 7]]]
|
||||
t2 = table_builder [["lower", [0, 10, 10]], ["upper", [3, 10, 0]], ["Z", ['a', 'b', 'c']]]
|
||||
r1 = t1.join t2 join_kind=Join_Kind.Full on=(Join_Condition.Between "X" "lower" "upper") |> materialize |> _.order_by ["Z", "id"]
|
||||
within_table r1 <|
|
||||
r1.column_names . should_equal ["X", "id", "lower", "upper", "Z"]
|
||||
rows = r1.rows.map .to_vector
|
||||
rows.length . should_equal 8
|
||||
|
||||
rows.at 0 . should_equal [20, 3, Nothing, Nothing, Nothing]
|
||||
rows.at 1 . should_equal [ 1, 1, 0, 3, 'a']
|
||||
rows.at 2 . should_equal [ 1, 4, 0, 3, 'a']
|
||||
rows.at 3 . should_equal [ 2, 5, 0, 3, 'a']
|
||||
rows.at 4 . should_equal [ 1, 7, 0, 3, 'a']
|
||||
rows.at 5 . should_equal [ 1, 7, 0, 3, 'a']
|
||||
rows.at 6 . should_equal [10, 2, 10, 10, 'b']
|
||||
rows.at 7 . should_equal [Nothing, Nothing, 10, 0, 'c']
|
||||
|
||||
Test.specify "should correctly handle Between edge cases (2)" <|
|
||||
# 6. multiple Between conditions
|
||||
xs = [0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4]
|
||||
ys = [1, 2, 3, 1, 9, 2, 3, 2, 4, 2, 1, 1, 1, 2]
|
||||
pts = xs.zip ys . take (Index_Sub_Range.Sample xs.length seed=42)
|
||||
t1 = table_builder [["X", pts.map .first], ["Y", pts.map .second]]
|
||||
|
||||
t2 = table_builder [["lx", [1]], ["ux", [3]], ["ly", [1]], ["uy", [2]]]
|
||||
r2 = t1.join t2 join_kind=Join_Kind.Inner on=[Join_Condition.Between "X" "lx" "ux", Join_Condition.Between "Y" "ly" "uy"] |> materialize |> _.order_by ["X", "Y"]
|
||||
within_table r2 <|
|
||||
r2.at "X" . to_vector . should_equal [1, 1, 2, 3, 3]
|
||||
r2.at "Y" . to_vector . should_equal [1, 2, 2, 1, 2]
|
||||
|
||||
t3 = table_builder [["lx", [1.9]], ["ux", [3]], ["ly", [1]], ["uy", [2]]]
|
||||
r3 = t1.join t3 join_kind=Join_Kind.Inner on=[Join_Condition.Between "X" "lx" "ux", Join_Condition.Between "Y" "ly" "uy"] |> materialize |> _.order_by ["X", "Y"]
|
||||
within_table r3 <|
|
||||
r3.at "X" . to_vector . should_equal [2, 3, 3]
|
||||
r3.at "Y" . to_vector . should_equal [2, 1, 2]
|
||||
|
||||
Test.specify "should correctly handle Between edge cases (3)" <|
|
||||
# 7. duplicated rows on both sides
|
||||
t1 = table_builder [["X", [10, 20, 20]]]
|
||||
t2 = table_builder [["low", [15, 15]], ["high", [30, 30]]]
|
||||
r1 = t1.join t2 join_kind=Join_Kind.Right_Outer on=(Join_Condition.Between "X" "low" "high")
|
||||
within_table r1 <|
|
||||
r1.at "X" . to_vector . should_equal [20, 20, 20, 20]
|
||||
r1.at "low" . to_vector . should_equal [15, 15, 15, 15]
|
||||
r1.at "high" . to_vector . should_equal [30, 30, 30, 30]
|
||||
|
||||
# 8. keep only unmatched rows
|
||||
r2 = t1.join t2 join_kind=Join_Kind.Left_Exclusive on=(Join_Condition.Between "X" "low" "high")
|
||||
within_table r2 <|
|
||||
r2.column_names . should_equal ["X"]
|
||||
r2.at "X" . to_vector . should_equal [10]
|
||||
|
||||
if setup.test_selection.supports_unicode_normalization then
|
||||
Test.specify "should allow range-based joins (using Between) for text with Unicode normalization" <|
|
||||
t1 = table_builder [["X", ['s\u0301', 's']], ["Y", [1, 2]]]
|
||||
@ -368,16 +435,15 @@ spec setup =
|
||||
if setup.supports_custom_objects then
|
||||
t1 = table_builder [["X", [My_Type.Value 1 2, 2.0, 2]], ["Y", [10, 20, 30]]]
|
||||
t2 = table_builder [["Z", [2.0, 1.5, 2.0]], ["W", [1, 2, 3]]]
|
||||
action3 = t1.join t2 join_kind=Join_Kind.Inner on=(Join_Condition.Equals "X" "Z") on_problems=_
|
||||
tester3 table =
|
||||
expect_column_names ["X", "Y", "Z", "W"] table
|
||||
t1 = table.order_by ["Y", "W"]
|
||||
t1.at "X" . to_vector . should_equal [2.0, 2.0, 2, 2]
|
||||
t1.at "Y" . to_vector . should_equal [20, 20, 30, 30]
|
||||
t1.at "Z" . to_vector . should_equal [2.0, 2.0, 2.0, 2.0]
|
||||
t1.at "W" . to_vector . should_equal [1, 3, 1, 3]
|
||||
problems3 = [Floating_Point_Equality.Error "Z", Floating_Point_Equality.Error "X"]
|
||||
Problems.test_problem_handling action3 problems3 tester3
|
||||
r3 = t1.join t2 join_kind=Join_Kind.Inner on=(Join_Condition.Equals "X" "Z") on_problems=Problem_Behavior.Report_Warning
|
||||
r3.column_names.should_equal ["X", "Y", "Z", "W"]
|
||||
r4 = r3.order_by ["Y", "W"]
|
||||
r4.at "X" . to_vector . should_equal [2.0, 2.0, 2, 2]
|
||||
r4.at "Y" . to_vector . should_equal [20, 20, 30, 30]
|
||||
r4.at "Z" . to_vector . should_equal [2.0, 2.0, 2.0, 2.0]
|
||||
r4.at "W" . to_vector . should_equal [1, 3, 1, 3]
|
||||
expected_problems = [Floating_Point_Equality.Error "Z", Floating_Point_Equality.Error "X"]
|
||||
Problems.get_attached_warnings r3 . should_contain_the_same_elements_as expected_problems
|
||||
|
||||
Test.specify "should correctly handle nulls in equality conditions" pending=db_todo <|
|
||||
t1 = table_builder [["X", ["A", Nothing, "a", Nothing, "ą"]], ["Y", [0, 1, 2, 3, 4]]]
|
||||
@ -650,6 +716,17 @@ spec setup =
|
||||
r3.at 3 . should_equal [2, 20, 2, Nothing, Nothing]
|
||||
r3.at 4 . should_equal [3, 30, 3, Nothing, Nothing]
|
||||
|
||||
t8 = table_builder [["X", [2, 99]], ["Y", [20, 99]], ["C", [5, 99]]]
|
||||
t9 = t4_2.join t8 join_kind=Join_Kind.Full on=["X", "Y", "C"]
|
||||
within_table t9 <|
|
||||
t9.column_names . should_equal ["X", "Y", "C", "Right X", "Right Y", "Right C"]
|
||||
r3 = materialize t9 . order_by ["X", "Right X"] . rows . map .to_vector
|
||||
r3.length . should_equal 4
|
||||
r3.at 0 . should_equal [Nothing, Nothing, Nothing, 99, 99, 99]
|
||||
r3.at 1 . should_equal [1, 10, 3, Nothing, Nothing, Nothing]
|
||||
r3.at 2 . should_equal [2, 20, 5, 2, 20, 5]
|
||||
r3.at 3 . should_equal [3, 30, 7, Nothing, Nothing, Nothing]
|
||||
|
||||
Test.specify "should gracefully handle tables from different backends" <|
|
||||
alternative_connection = Database.connect (SQLite In_Memory)
|
||||
t0 = (Table.new [["X", [1, 2, 4]], ["Z", [10, 20, 30]]]).select_into_database_table alternative_connection "T0" temporary=True
|
||||
|
@ -2,11 +2,13 @@ from Standard.Base import all
|
||||
|
||||
from Standard.Test import Test_Suite
|
||||
|
||||
import project.Helpers.Sorted_List_Index_Spec
|
||||
import project.Helpers.Unique_Naming_Strategy_Spec
|
||||
import project.Helpers.Value_Type_Spec
|
||||
|
||||
# Aggregated spec for the helper modules: runs each imported helper spec in the order listed.
spec =
|
||||
Unique_Naming_Strategy_Spec.spec
|
||||
Sorted_List_Index_Spec.spec
|
||||
Value_Type_Spec.spec
|
||||
|
||||
# Entry point: passes `spec` to `Test_Suite.run_main`.
main = Test_Suite.run_main spec
|
||||
|
64
test/Table_Tests/src/Helpers/Sorted_List_Index_Spec.enso
Normal file
64
test/Table_Tests/src/Helpers/Sorted_List_Index_Spec.enso
Normal file
@ -0,0 +1,64 @@
|
||||
from Standard.Base import all
|
||||
|
||||
# We need this import to ensure that we depend on `Standard.Table`, so that the polyglot Java import of `org.enso.table` below is valid.
|
||||
from Standard.Table import all
|
||||
|
||||
from Standard.Test import Test, Test_Suite
|
||||
import Standard.Test.Extensions
|
||||
|
||||
polyglot java import java.util.Comparator
|
||||
polyglot java import org.enso.table.data.table.join.between.SortedListIndex
|
||||
|
||||
|
||||
# Entry point: passes this module's `spec` to `Test_Suite.run_main`.
main = Test_Suite.run_main spec
|
||||
|
||||
## White-box tests for the SortedListIndex, ensuring correctness of the
|
||||
implementation - these are additional tests apart from
|
||||
the `Join_Condition.Between` test cases, to ensure no off-by-one errors
|
||||
or other bugs are present in the implementation.
|
||||
spec = Test.group "SortedListIndex (used for SortJoin)" <|
|
||||
# Builds a `SortedListIndex` over `vec`, comparing elements with Java's natural ordering.
make_index vec = SortedListIndex.build vec Comparator.naturalOrder
|
||||
|
||||
# Reference data, written in sorted order: it contains runs of duplicates (e.g. the long run of 7s) and gaps (e.g. no 9).
v1 = [0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 10, 10, 10, 10, 11, 14, 17, 19]
|
||||
# A sample of all `v1.length` elements - presumably a random permutation of `v1`, so the index is not fed pre-sorted input (TODO confirm `Sample` semantics).
v1_shuffled = v1.take (Index_Sub_Range.Sample v1.length)
|
||||
index1 = make_index v1_shuffled
|
||||
|
||||
# `findSubRange lower upper` is expected to return the indexed elements within the given bounds, in sorted order.
Test.specify "should correctly handle empty matches" <|
|
||||
# 9 falls in the gap between 8 and 10.
Vector.from_polyglot_array (index1.findSubRange 9 9) . should_equal []
|
||||
# The whole range lies below the smallest element.
Vector.from_polyglot_array (index1.findSubRange -10 -2) . should_equal []
|
||||
# The whole range lies above the largest element.
Vector.from_polyglot_array (index1.findSubRange 200 300) . should_equal []
|
||||
# Inverted bounds (lower > upper) yield no matches.
Vector.from_polyglot_array (index1.findSubRange 20 0) . should_equal []
|
||||
|
||||
Test.specify "should correctly handle single-element matches" <|
|
||||
# Both bounds are inclusive: the degenerate range [8, 8] matches the single 8.
Vector.from_polyglot_array (index1.findSubRange 8 8) . should_equal [8]
|
||||
# Neither bound is present in the data; only 14 lies between them.
Vector.from_polyglot_array (index1.findSubRange 12 16) . should_equal [14]
|
||||
# Matches touching the last (largest) element of the index.
Vector.from_polyglot_array (index1.findSubRange 18 100) . should_equal [19]
|
||||
Vector.from_polyglot_array (index1.findSubRange 19 100) . should_equal [19]
|
||||
Vector.from_polyglot_array (index1.findSubRange 19 19) . should_equal [19]
|
||||
|
||||
Test.specify "should correctly handle matches" <|
|
||||
Vector.from_polyglot_array (index1.findSubRange 4 6) . should_equal [4, 5, 6]
|
||||
# Duplicates of a bound value (here both 3s) are all included.
Vector.from_polyglot_array (index1.findSubRange 3 5) . should_equal [3, 3, 4, 5]
|
||||
|
||||
# Range starting at the first (smallest) element of the index.
Vector.from_polyglot_array (index1.findSubRange 0 3) . should_equal [0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3]
|
||||
Vector.from_polyglot_array (index1.findSubRange 2 4) . should_equal [2, 2, 2, 3, 3, 4]
|
||||
Vector.from_polyglot_array (index1.findSubRange 8 10) . should_equal [8, 10, 10, 10, 10]
|
||||
Vector.from_polyglot_array (index1.findSubRange 8 11) . should_equal [8, 10, 10, 10, 10, 11]
|
||||
# The upper bound 12 falls in a gap, so the result is the same as for upper bound 11.
Vector.from_polyglot_array (index1.findSubRange 8 12) . should_equal [8, 10, 10, 10, 10, 11]
|
||||
# The lower bound 9 falls in a gap, so the result starts at the next present value, 10.
Vector.from_polyglot_array (index1.findSubRange 9 12) . should_equal [10, 10, 10, 10, 11]
|
||||
|
||||
Test.specify "should correctly handle big all-equal ranges" <|
|
||||
# A single-value range must return every duplicate of that value.
Vector.from_polyglot_array (index1.findSubRange 1 1) . should_equal [1, 1, 1, 1]
|
||||
Vector.from_polyglot_array (index1.findSubRange 7 7) . should_equal [7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
|
||||
|
||||
Test.specify "other cases: empty index" <|
|
||||
# An index built from an empty vector never matches anything.
index2 = make_index []
|
||||
Vector.from_polyglot_array (index2.findSubRange 1 5) . should_equal []
|
||||
|
||||
Test.specify "other cases: single element index" <|
|
||||
index2 = make_index [5]
|
||||
# The only element is matched when it equals the upper bound or both bounds...
Vector.from_polyglot_array (index2.findSubRange 1 5) . should_equal [5]
|
||||
Vector.from_polyglot_array (index2.findSubRange 5 5) . should_equal [5]
|
||||
# ...and not matched by ranges below it, inverted ranges, or ranges above it.
Vector.from_polyglot_array (index2.findSubRange 1 2) . should_equal []
|
||||
Vector.from_polyglot_array (index2.findSubRange 2 1) . should_equal []
|
||||
Vector.from_polyglot_array (index2.findSubRange 10 10) . should_equal []
|
@ -1,118 +0,0 @@
|
||||
from Standard.Base import all
|
||||
|
||||
from Standard.Table import all
|
||||
|
||||
from Standard.Test import Test, Test_Suite
|
||||
import Standard.Test.Extensions
|
||||
from project.Util import all
|
||||
|
||||
# Timing-based tests for the in-memory `Table.join`. Each test times a join of
# two `n`-row tables and fails if it takes longer than 5x the time of a baseline
# join of the same `n`-row table against a 2- or 3-row table (plus 100ms of slack).
# As used below, `Duration.time_execution` yields a pair: `.first` is the elapsed
# duration and `.second` is the value computed by the timed action.
spec =
|
||||
Test.group "[In-Memory] Table.join performance" <|
|
||||
# Number of rows in the large tables used by every test in this group.
n = 10000
|
||||
Test.specify "should efficiently compute equality joins" <|
|
||||
vec = 0.up_to n . to_vector
|
||||
vec2 = 1.up_to n+1 . to_vector
|
||||
# `t1`: unique keys in "X", alternating 0/1 in "Y".
t1 = Table.new [["X", vec], ["Y", 0.up_to n . map (_ % 2)]]
|
||||
# `t2`: the tiny table used for the baseline join.
t2 = Table.new [["B", [0, 1]]]
|
||||
# `t3`: the same keys as `t1`, stored in reverse order.
t3 = Table.new [["X", vec.reverse], ["Z", vec2]]
|
||||
|
||||
# Baseline: join the n-row table against the 2-row table.
r1 = Duration.time_execution <|
|
||||
t1.join t2 on=(Join_Condition.Equals "Y" "B")
|
||||
|
||||
# Measured: join the two n-row tables on their unique keys.
r2 = Duration.time_execution <|
|
||||
t1.join t3 on="X"
|
||||
# Check that the measured join produced the expected rows before comparing timings.
t4 = r2.second . order_by ["X"]
|
||||
t4.at "X" . to_vector . should_equal <| vec
|
||||
t4.at "Z" . to_vector . should_equal <| vec2.reverse
|
||||
|
||||
base_ms = r1.first.total_milliseconds
|
||||
# Allow 5x the baseline time plus 100ms of slack.
expected_max_time_ms = base_ms * 5 + 100
|
||||
runtime_ms = r2.first.total_milliseconds
|
||||
if runtime_ms > expected_max_time_ms then
|
||||
Test.fail "Expected a join of "+n.to_text+"x"+n.to_text+" with linear result size to be efficient, but it took "+runtime_ms.to_text+"ms while a join of 2x"+n.to_text+" with the same result size took "+base_ms.to_text+"ms. The maximum time threshold for this operation to be deemed efficient has been estimated at "+expected_max_time_ms.to_text+"ms."
|
||||
|
||||
Test.specify "should efficiently compute equality joins mixed with other secondary conditions" <|
|
||||
vec = 0.up_to n . to_vector
|
||||
vec2 = 1.up_to n+1 . to_vector
|
||||
# The extra columns are constant: "A" differs between the tables only by letter case, and "B" = 9 lies within every [l, u] = [0, 20] range.
t1 = Table.new [["X", vec], ["Y", 0.up_to n . map (_ % 2)], ["A", Vector.fill n "a"], ["B", Vector.fill n 9]]
|
||||
t2 = Table.new [["B", [0, 1]], ["A", ["A", "A"]], ["l", [0, 0]], ["u", [20, 20]]]
|
||||
t3 = Table.new [["X", vec.reverse], ["Z", vec2], ["A", Vector.fill n "a"], ["l", Vector.fill n 0], ["u", Vector.fill n 20]]
|
||||
|
||||
# Conditions listed in addition to the equality condition in both joins.
secondary_conditions = [Join_Condition.Equals_Ignore_Case "A", Join_Condition.Between "B" "l" "u"]
|
||||
|
||||
# Baseline: join against the 2-row table; the equality condition is listed last.
r1 = Duration.time_execution <|
|
||||
t1.join t2 on=secondary_conditions+[Join_Condition.Equals "Y" "B"]
|
||||
|
||||
# Measured: join the two n-row tables; the equality condition is listed last.
r2 = Duration.time_execution <|
|
||||
t1.join t3 on=secondary_conditions+[Join_Condition.Equals "X" "X"]
|
||||
t4 = r2.second . order_by ["X"]
|
||||
t4.at "X" . to_vector . should_equal <| vec
|
||||
t4.at "Z" . to_vector . should_equal <| vec2.reverse
|
||||
|
||||
base_ms = r1.first.total_milliseconds
|
||||
# Allow 5x the baseline time plus 100ms of slack.
expected_max_time_ms = base_ms * 5 + 100
|
||||
runtime_ms = r2.first.total_milliseconds
|
||||
if runtime_ms > expected_max_time_ms then
|
||||
Test.fail "Expected a join of "+n.to_text+"x"+n.to_text+" with linear result size to be efficient, but it took "+runtime_ms.to_text+"ms while a join of 2x"+n.to_text+" with the same result size took "+base_ms.to_text+"ms. The maximum time threshold for this operation to be deemed efficient has been estimated at "+expected_max_time_ms.to_text+"ms."
|
||||
|
||||
Test.specify "should efficiently compute case-insensitive equality joins" <|
|
||||
# Builds a text for `i`: the prefix, the number, a dash and a letter derived from `i`.
unique_text_for_number prefix i =
|
||||
# A letter in 'a'..'t' (UTF-8 bytes 97..116), selected by `i` modulo 20.
suffix = Text.from_utf_8 [97 + i%20]
|
||||
prefix + i.to_text + "-" + suffix
|
||||
# `lowers` and `uppers` differ only in the case of the prefix.
lowers = 0.up_to n . map (unique_text_for_number "a")
|
||||
uppers = 0.up_to n . map (unique_text_for_number "A")
|
||||
t1 = Table.new [["X", lowers], ["Y", 0.up_to n . map i-> if i%2 == 0 then "a" else "b"], ["A", Vector.fill n 44], ["B", Vector.fill n 9], ["N", 0.up_to n . to_vector]]
|
||||
t2 = Table.new [["B", ["A", "B", "a"]], ["A", [44, 44, 44]], ["l", [0, 0, 0]], ["u", [20, 20, 20]]]
|
||||
t3 = Table.new [["X", uppers.reverse], ["Z", 1.up_to n+1 . to_vector], ["A", Vector.fill n 44], ["l", Vector.fill n 0], ["u", Vector.fill n 20]]
|
||||
|
||||
secondary_conditions = [Join_Condition.Equals "A", Join_Condition.Between "B" "l" "u"]
|
||||
|
||||
# Baseline: join against the 3-row table; here the case-insensitive condition is listed first.
r1 = Duration.time_execution <|
|
||||
t1.join t2 on=[Join_Condition.Equals_Ignore_Case "Y" "B"]+secondary_conditions
|
||||
# Ignoring case, each "a" row matches 2 rows of `t2` ("A" and "a") and each "b" row matches 1 ("B").
r1.second.row_count . should_equal (n + n/2)
|
||||
|
||||
# Measured: case-insensitive join of the two n-row tables.
r2 = Duration.time_execution <|
|
||||
t1.join t3 on=[Join_Condition.Equals_Ignore_Case "X" "X"]+secondary_conditions
|
||||
# Order by the row number "N" of `t1` to compare against the generated vectors.
t4 = r2.second . order_by "N"
|
||||
t4.row_count . should_equal n
|
||||
t4.at "X" . to_vector . should_equal lowers
|
||||
t4.at "Right X" . to_vector . should_equal uppers
|
||||
t4.at "Z" . to_vector . should_equal <| 1.up_to n+1 . to_vector . reverse
|
||||
|
||||
base_ms = r1.first.total_milliseconds
|
||||
# Allow 5x the baseline time plus 100ms of slack.
expected_max_time_ms = base_ms * 5 + 100
|
||||
runtime_ms = r2.first.total_milliseconds
|
||||
if runtime_ms > expected_max_time_ms then
|
||||
Test.fail "Expected a join of "+n.to_text+"x"+n.to_text+" with linear result size to be efficient, but it took "+runtime_ms.to_text+"ms while a join of 3x"+n.to_text+" with the same result size took "+base_ms.to_text+"ms. The maximum time threshold for this operation to be deemed efficient has been estimated at "+expected_max_time_ms.to_text+"ms."
|
||||
|
||||
# Marked as pending, with a reference to the task tracking the `Between` optimization.
Test.specify "should efficiently compute Between joins" pending="TODO in task https://www.pivotaltracker.com/story/show/183913337" <|
|
||||
# Values 0, 20, 40, ...; the i-th range is [20*i - 20, 20*i + 5].
xs = 0.up_to n . map x-> x * 20
|
||||
ls = 0.up_to n . map x-> x * 20 - 20
|
||||
us = 0.up_to n . map x-> x * 20 + 5
|
||||
t1 = Table.new [["X", xs], ["A", Vector.fill n "a"], ["B", Vector.fill n 44]]
|
||||
# We set up the ranges so that each entry of `t1` matches both of them, apart from the first entry, which matches only one.
t2 = Table.new [["l", [0, 10]], ["u", [20 * n, 20 * n + 100]], ["A", ["a", "A"]], ["B", [44, 44]]]
|
||||
# Here also, each range from `t3` matches 2 entries of `t1`, apart from the first range, which matches only one.
t3 = Table.new [["l", ls], ["u", us], ["A", Vector.fill n "A"], ["B", Vector.fill n 44]]
|
||||
|
||||
conditions = [Join_Condition.Equals_Ignore_Case "A", Join_Condition.Between "X" "l" "u", Join_Condition.Equals "B"]
|
||||
|
||||
# Baseline: join against the 2-row table of wide ranges.
r1 = Duration.time_execution <|
|
||||
t1.join t2 on=conditions
|
||||
r1.second.row_count . should_equal (2*n - 1)
|
||||
|
||||
# Measured: join against the n-row table of narrow ranges.
r2 = Duration.time_execution <|
|
||||
t1.join t3 on=conditions
|
||||
t4 = r2.second . order_by ["X", "l"]
|
||||
t4.row_count . should_equal (2*n - 1)
|
||||
|
||||
# Every value appears twice, except the last one, which appears only once.
t4.at "X" . to_vector . should_equal ((xs.flat_map x-> [x, x]) . drop (Last 1))
|
||||
t4.at "l" . to_vector . should_equal (ls.zip (ls.drop 1) . flatten)+[ls.last]
|
||||
|
||||
base_ms = r1.first.total_milliseconds
|
||||
# Allow 5x the baseline time plus 100ms of slack.
expected_max_time_ms = base_ms * 5 + 100
|
||||
runtime_ms = r2.first.total_milliseconds
|
||||
if runtime_ms > expected_max_time_ms then
|
||||
Test.fail "Expected a join of "+n.to_text+"x"+n.to_text+" with linear result size to be efficient, but it took "+runtime_ms.to_text+"ms while a join of 2x"+n.to_text+" with the same result size took "+base_ms.to_text+"ms. The maximum time threshold for this operation to be deemed efficient has been estimated at "+expected_max_time_ms.to_text+"ms."
|
||||
|
||||
# Entry point: passes the join performance `spec` to `Test_Suite.run_main`.
main = Test_Suite.run_main spec
|
@ -8,7 +8,6 @@ import project.In_Memory.Column_Spec
|
||||
import project.In_Memory.Column_Format_Spec
|
||||
import project.In_Memory.Common_Spec
|
||||
import project.In_Memory.Integer_Overflow_Spec
|
||||
import project.In_Memory.Join_Performance_Spec
|
||||
import project.In_Memory.Lossy_Conversions_Spec
|
||||
import project.In_Memory.Parse_To_Table_Spec
|
||||
import project.In_Memory.Split_Tokenize_Spec
|
||||
@ -29,7 +28,6 @@ spec =
|
||||
Table_Time_Of_Day_Spec.spec
|
||||
Aggregate_Column_Spec.spec
|
||||
Builders_Spec.spec
|
||||
Join_Performance_Spec.spec
|
||||
Split_Tokenize_Spec.spec
|
||||
Parse_To_Table_Spec.spec
|
||||
|
||||
|
@ -871,42 +871,6 @@ spec =
|
||||
t2.filter "Y" (Filter_Condition.Is_In in_vector) . at "Y" . to_vector . should_equal expected_neg_vector
|
||||
t2.filter "Y" (Filter_Condition.Is_In in_column) . at "Y" . to_vector . should_equal expected_neg_vector
|
||||
|
||||
# Timing-based test: each `Is_In` filter below must finish within twice the time
# it took to build the test data. As used below, `Duration.time_execution` yields
# a pair: `.first` is the elapsed duration and `.second` is the computed value.
Test.specify "should perform `Is_In` efficiently for builtin types" <|
|
||||
first_day = Date_Time.new 2000 1 1
|
||||
# Maps an integer to a date-time that many seconds after `first_day`.
make_date x = first_day + (Duration.new seconds=x)
|
||||
# Build all inputs and expected outputs inside the timed action; its duration is the baseline.
init = Duration.time_execution <|
|
||||
t = Table.new [["X", (200.up_to 10000 . to_vector)]]
|
||||
vec = 4000.up_to 13000 . to_vector
|
||||
# The values present both in column "X" and in `vec`.
expected_vector = 4000.up_to 10000 . to_vector
|
||||
# The even values of column "X".
expected_vector_2 = 200.up_to 10000 . with_step 2 . to_vector
|
||||
dates_vec = vec.map make_date
|
||||
bool_vec = Vector.fill 7000 True
|
||||
date_col = t.at "X" . map make_date
|
||||
# Return everything in one vector so it can be unpacked outside the timed action.
[t, vec, expected_vector, expected_vector_2, dates_vec, bool_vec, date_col]
|
||||
t = init.second . at 0
|
||||
vec = init.second . at 1
|
||||
expected_vector = init.second . at 2
|
||||
expected_vector_2 = init.second . at 3
|
||||
dates_vec = init.second . at 4
|
||||
bool_vec = init.second . at 5
|
||||
date_col = init.second . at 6
|
||||
|
||||
# The time limit for each check: twice the initialization time.
expected_max_time_ms = init.first.total_milliseconds * 2
|
||||
# Times `action` and fails the test if it exceeded the limit; `name` labels the checked data type in the failure message.
check_timing name ~action =
|
||||
res = Duration.time_execution action
|
||||
runtime_ms = res.first.total_milliseconds
|
||||
if runtime_ms > expected_max_time_ms then
|
||||
# The reported figure is the limit (2x initialization), so the message describes it as such.
Test.fail "Expected `Is_In` on "+name+" to be efficient, but it took "+runtime_ms.to_text+"ms, while the limit (twice the initialization time) was "+expected_max_time_ms.to_text+"ms."
|
||||
|
||||
check_timing "integers" <|
|
||||
t.filter "X" (Filter_Condition.Is_In vec) . at "X" . to_vector . should_equal expected_vector
|
||||
|
||||
check_timing "booleans" <|
|
||||
t.filter (t.at "X" % 2 == 0) (Filter_Condition.Is_In bool_vec) . at "X" . to_vector . should_equal expected_vector_2
|
||||
|
||||
check_timing "dates" <|
|
||||
t.filter date_col (Filter_Condition.Is_In dates_vec) . at "X" . to_vector . should_equal expected_vector
|
||||
|
||||
Test.group "[In-Memory-specific] Table.join" <|
|
||||
Test.specify "should correctly report unsupported cross-backend joins" <|
|
||||
t = Table.new [["X", [1, 2, 3]]]
|
||||
|
Loading…
Reference in New Issue
Block a user