From 1b8b30a68d85658feec1ea4879e14b6673bcaac3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 8 Nov 2023 13:59:55 +0100 Subject: [PATCH] Improve performance of `Join_Condition.Between` by sorting on one dimension (#8212) - Closes #5303 - Refactors `JoinStrategy` allowing us to 'stack' join strategies on top of each other (to some extent) - currently a `HashJoin` can be followed by another join strategy (currently `SortJoin`) - Adds benchmarks for join - Due to limitations of the sorting approach this will still not be as fast as possible for cases where there is more than 1 `Between` condition in a single query - trying to demonstrate that in benchmarks. - We can replace sorting by d-dimensional [RangeTrees](https://en.wikipedia.org/wiki/Range_tree) to get `O((n + m) log^d n + k)` performance (where `n` and `m` are sizes of joined tables, `d` is the number of `Between` conditions used in the query and `k` is the result set size). - Follow up ticket for consideration later: #8216 - Closes #8215 - After all, it turned out that `TreeSet` was problematic (because of not enough flexibility with duplicate key handling), so the simplest solution was to immediately implement this sub-task. - Closes #8204 - Unrelated, but I ran into this here: adds type checks to other arguments of `set`. - Before, putting in a Column as `new_name` (i.e. mistakenly messing up the order of arguments), led to a hard-to-understand error: `Method `if_then_else` of type Column could not be found.`; now it instead fails with the type error `expected Text got Column`. 
--- .../Database/0.0.0-dev/src/Data/Table.enso | 2 +- .../Table/0.0.0-dev/src/Data/Table.enso | 10 +- .../src/Internal/Lookup_Helpers.enso | 2 +- .../table/data/index/MultiValueKeyBase.java | 4 + .../data/index/OrderedMultiValueKey.java | 20 ++ .../java/org/enso/table/data/table/Table.java | 8 +- .../enso/table/data/table/join/Equals.java | 5 - .../enso/table/data/table/join/IndexJoin.java | 112 --------- .../table/data/table/join/JoinCondition.java | 3 - .../table/data/table/join/JoinStrategy.java | 73 +++++- .../data/table/join/MatchAllStrategy.java | 28 +++ .../table/join/PluggableJoinStrategy.java | 18 ++ .../data/table/join/between/SortJoin.java | 162 +++++++++++++ .../table/join/between/SortedListIndex.java | 129 ++++++++++ .../table/join/{ => conditions}/Between.java | 2 +- .../data/table/join/conditions/Equals.java | 5 + .../{ => conditions}/EqualsIgnoreCase.java | 4 +- .../join/conditions/HashableCondition.java | 4 + .../table/join/conditions/JoinCondition.java | 3 + .../data/table/join/hashing/HashJoin.java | 84 +++++++ .../{ => lookup}/LookupColumnDescription.java | 2 +- .../table/join/{ => lookup}/LookupJoin.java | 4 +- .../table/data/table/join/scan/Matcher.java | 5 - .../data/table/join/scan/MatcherFactory.java | 152 ------------ .../enso/table/operations/AddRowNumber.java | 44 +--- test/Benchmarks/src/Main.enso | 2 + test/Benchmarks/src/Table/Is_In.enso | 48 ++++ test/Benchmarks/src/Table/Join.enso | 222 ++++++++++++++++++ .../Join/Join_Spec.enso | 101 +++++++- test/Table_Tests/src/Helpers/Main.enso | 2 + .../src/Helpers/Sorted_List_Index_Spec.enso | 64 +++++ .../src/In_Memory/Join_Performance_Spec.enso | 118 ---------- test/Table_Tests/src/In_Memory/Main.enso | 2 - .../Table_Tests/src/In_Memory/Table_Spec.enso | 36 --- 34 files changed, 979 insertions(+), 501 deletions(-) delete mode 100644 std-bits/table/src/main/java/org/enso/table/data/table/join/Equals.java delete mode 100644 
std-bits/table/src/main/java/org/enso/table/data/table/join/IndexJoin.java delete mode 100644 std-bits/table/src/main/java/org/enso/table/data/table/join/JoinCondition.java create mode 100644 std-bits/table/src/main/java/org/enso/table/data/table/join/MatchAllStrategy.java create mode 100644 std-bits/table/src/main/java/org/enso/table/data/table/join/PluggableJoinStrategy.java create mode 100644 std-bits/table/src/main/java/org/enso/table/data/table/join/between/SortJoin.java create mode 100644 std-bits/table/src/main/java/org/enso/table/data/table/join/between/SortedListIndex.java rename std-bits/table/src/main/java/org/enso/table/data/table/join/{ => conditions}/Between.java (73%) create mode 100644 std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/Equals.java rename std-bits/table/src/main/java/org/enso/table/data/table/join/{ => conditions}/EqualsIgnoreCase.java (59%) create mode 100644 std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/HashableCondition.java create mode 100644 std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/JoinCondition.java create mode 100644 std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/HashJoin.java rename std-bits/table/src/main/java/org/enso/table/data/table/join/{ => lookup}/LookupColumnDescription.java (90%) rename std-bits/table/src/main/java/org/enso/table/data/table/join/{ => lookup}/LookupJoin.java (98%) delete mode 100644 std-bits/table/src/main/java/org/enso/table/data/table/join/scan/Matcher.java delete mode 100644 std-bits/table/src/main/java/org/enso/table/data/table/join/scan/MatcherFactory.java create mode 100644 test/Benchmarks/src/Table/Is_In.enso create mode 100644 test/Benchmarks/src/Table/Join.enso create mode 100644 test/Table_Tests/src/Helpers/Sorted_List_Index_Spec.enso delete mode 100644 test/Table_Tests/src/In_Memory/Join_Performance_Spec.enso diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso 
b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso index aad233acec5..9831c0596d5 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso @@ -837,7 +837,7 @@ type Table table.set "2 * [total_stock]" new_name="total_stock_expr" @new_name Widget_Helpers.make_column_name_selector set : Column | Text | Array | Vector | Range | Date_Range | Constant_Column | Column_Operation -> Text -> Set_Mode -> Problem_Behavior -> Table ! Existing_Column | Missing_Column | No_Such_Column | Expression_Error - set self column new_name="" set_mode=Set_Mode.Add_Or_Update on_problems=Report_Warning = + set self column (new_name : Text = "") (set_mode : Set_Mode = Set_Mode.Add_Or_Update) (on_problems : Problem_Behavior = Report_Warning) = problem_builder = Problem_Builder.new unique = self.column_naming_helper.create_unique_name_strategy unique.mark_used self.column_names diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index 695bc74e569..8cdab276ba7 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -68,10 +68,10 @@ polyglot java import org.enso.base.ObjectComparator polyglot java import org.enso.table.data.index.MultiValueIndex polyglot java import org.enso.table.data.mask.OrderMask polyglot java import org.enso.table.data.table.Column as Java_Column -polyglot java import org.enso.table.data.table.join.Between as Java_Join_Between -polyglot java import org.enso.table.data.table.join.Equals as Java_Join_Equals -polyglot java import org.enso.table.data.table.join.EqualsIgnoreCase as Java_Join_Equals_Ignore_Case -polyglot java import org.enso.table.data.table.join.LookupJoin +polyglot java import org.enso.table.data.table.join.conditions.Between as Java_Join_Between +polyglot java import 
org.enso.table.data.table.join.conditions.Equals as Java_Join_Equals +polyglot java import org.enso.table.data.table.join.conditions.EqualsIgnoreCase as Java_Join_Equals_Ignore_Case +polyglot java import org.enso.table.data.table.join.lookup.LookupJoin polyglot java import org.enso.table.data.table.Table as Java_Table polyglot java import org.enso.table.error.TooManyColumnsException polyglot java import org.enso.table.error.NullValuesInKeyColumns @@ -1570,7 +1570,7 @@ type Table table.set "2 * [total_stock]" new_name="total_stock_expr" @column Column_Operation.default_widget set : Text | Column -> Text -> Set_Mode -> Problem_Behavior -> Table ! Existing_Column | Missing_Column | No_Such_Column | Expression_Error - set self column:(Text | Column | Constant_Column | Column_Operation) new_name="" set_mode=Set_Mode.Add_Or_Update on_problems=Report_Warning = + set self column:(Text | Column | Constant_Column | Column_Operation) (new_name : Text = "") (set_mode : Set_Mode = Set_Mode.Add_Or_Update) (on_problems : Problem_Behavior = Report_Warning) = problem_builder = Problem_Builder.new unique = self.column_naming_helper.create_unique_name_strategy unique.mark_used self.column_names diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Lookup_Helpers.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Lookup_Helpers.enso index 0ecb2af58ef..e8a669bf60f 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Lookup_Helpers.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Lookup_Helpers.enso @@ -6,7 +6,7 @@ import project.Data.Type.Value_Type.Value_Type import project.Data.Type.Value_Type_Helpers from project.Errors import Missing_Input_Columns, Unexpected_Extra_Columns, Floating_Point_Equality, No_Common_Type, No_Output_Columns -polyglot java import org.enso.table.data.table.join.LookupColumnDescription +polyglot java import org.enso.table.data.table.join.lookup.LookupColumnDescription ## PRIVATE type Lookup_Column diff 
--git a/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueKeyBase.java b/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueKeyBase.java index e8e33c47258..e8ad683ac08 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueKeyBase.java +++ b/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueKeyBase.java @@ -40,6 +40,10 @@ public abstract class MultiValueKeyBase { return rowIndex; } + public int getNumberOfColumns() { + return storages.length; + } + @Override public abstract boolean equals(Object o); diff --git a/std-bits/table/src/main/java/org/enso/table/data/index/OrderedMultiValueKey.java b/std-bits/table/src/main/java/org/enso/table/data/index/OrderedMultiValueKey.java index d5e3c805cd8..6c235ee8e23 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/index/OrderedMultiValueKey.java +++ b/std-bits/table/src/main/java/org/enso/table/data/index/OrderedMultiValueKey.java @@ -78,4 +78,24 @@ public class OrderedMultiValueKey extends MultiValueKeyBase public String toString() { return "OrderedMultiValueKey{row="+rowIndex+"}"; } + + /** + * A comparator that uses only one dimension of the key. 
+ */ + public static class ProjectionComparator implements Comparator { + private final int ix; + + public ProjectionComparator(int ix) { + this.ix = ix; + } + + @Override + public int compare(OrderedMultiValueKey o1, OrderedMultiValueKey o2) { + if (o1.storages.length != o2.storages.length) { + throw new ClassCastException("Incomparable keys."); + } + + return o1.objectComparator.compare(o1.get(ix), o2.get(ix)); + } + } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/Table.java b/std-bits/table/src/main/java/org/enso/table/data/table/Table.java index 25fd46c2fd6..9ba670e52db 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/table/Table.java +++ b/std-bits/table/src/main/java/org/enso/table/data/table/Table.java @@ -18,9 +18,9 @@ import org.enso.table.data.index.OrderedMultiValueKey; import org.enso.table.data.mask.OrderMask; import org.enso.table.data.mask.SliceRange; import org.enso.table.data.table.join.CrossJoin; -import org.enso.table.data.table.join.IndexJoin; -import org.enso.table.data.table.join.JoinCondition; +import org.enso.table.data.table.join.conditions.JoinCondition; import org.enso.table.data.table.join.JoinResult; +import org.enso.table.data.table.join.JoinStrategy; import org.enso.table.error.UnexpectedColumnTypeException; import org.enso.table.operations.Distinct; import org.enso.table.problems.ProblemAggregator; @@ -279,8 +279,8 @@ public class Table { "be true."); } - var strategy = new IndexJoin(); - JoinResult joinResult = strategy.join(this, right, conditions, problemAggregator); + JoinStrategy strategy = JoinStrategy.createStrategy(conditions); + JoinResult joinResult = strategy.join(problemAggregator); List resultsToKeep = new ArrayList<>(); diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/Equals.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/Equals.java deleted file mode 100644 index f5e185a5088..00000000000 --- 
a/std-bits/table/src/main/java/org/enso/table/data/table/join/Equals.java +++ /dev/null @@ -1,5 +0,0 @@ -package org.enso.table.data.table.join; - -import org.enso.table.data.table.Column; - -public record Equals(Column left, Column right) implements JoinCondition {} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/IndexJoin.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/IndexJoin.java deleted file mode 100644 index 20b5f99c20b..00000000000 --- a/std-bits/table/src/main/java/org/enso/table/data/table/join/IndexJoin.java +++ /dev/null @@ -1,112 +0,0 @@ -package org.enso.table.data.table.join; - -import org.enso.base.text.TextFoldingStrategy; -import org.enso.table.data.column.storage.Storage; -import org.enso.table.data.column.storage.type.AnyObjectType; -import org.enso.table.data.index.MultiValueIndex; -import org.enso.table.data.table.Column; -import org.enso.table.data.table.Table; -import org.enso.table.data.table.join.scan.Matcher; -import org.enso.table.data.table.join.scan.MatcherFactory; -import org.enso.table.problems.ColumnAggregatedProblemAggregator; -import org.enso.table.problems.ProblemAggregator; -import org.graalvm.polyglot.Context; - -import java.util.List; -import java.util.stream.Collectors; - -public class IndexJoin implements JoinStrategy { - private record HashEqualityCondition( - Column left, Column right, TextFoldingStrategy textFoldingStrategy) { - } - - @Override - public JoinResult join(Table left, Table right, List conditions, ProblemAggregator problemAggregator) { - Context context = Context.getCurrent(); - List equalConditions = - conditions.stream() - .filter(IndexJoin::isSupported) - .map(IndexJoin::makeHashEqualityCondition) - .collect(Collectors.toList()); - - var remainingConditions = - conditions.stream().filter(c -> !isSupported(c)).collect(Collectors.toList()); - - var leftEquals = - equalConditions.stream().map(HashEqualityCondition::left).toArray(Column[]::new); - var rightEquals = 
- equalConditions.stream().map(HashEqualityCondition::right).toArray(Column[]::new); - var textFoldingStrategies = - equalConditions.stream() - .map(HashEqualityCondition::textFoldingStrategy) - .collect(Collectors.toList()); - - var leftIndex = - MultiValueIndex.makeUnorderedIndex(leftEquals, left.rowCount(), textFoldingStrategies, problemAggregator); - var rightIndex = - MultiValueIndex.makeUnorderedIndex(rightEquals, right.rowCount(), textFoldingStrategies, problemAggregator); - - MatcherFactory factory = new MatcherFactory(); - Matcher remainingMatcher = factory.create( - remainingConditions, new ColumnAggregatedProblemAggregator(problemAggregator) - ); - - JoinResult.Builder resultBuilder = new JoinResult.Builder(); - for (var leftKey : leftIndex.keys()) { - if (rightIndex.contains(leftKey)) { - for (var leftRow : leftIndex.get(leftKey)) { - for (var rightRow : rightIndex.get(leftKey)) { - if (remainingMatcher.matches(leftRow, rightRow)) { - resultBuilder.addRow(leftRow, rightRow); - } - - context.safepoint(); - } - - context.safepoint(); - } - } - - context.safepoint(); - } - - return resultBuilder.build(); - } - - private static boolean isSupported(JoinCondition condition) { - switch (condition) { - case Equals eq -> { - return isBuiltinType(eq.left().getStorage()) && isBuiltinType(eq.right().getStorage()); - } - case EqualsIgnoreCase ignored -> { - return true; - } - default -> { - return false; - } - } - } - - private static HashEqualityCondition makeHashEqualityCondition(JoinCondition eq) { - switch (eq) { - case Equals e -> { - return new HashEqualityCondition( - e.left(), e.right(), TextFoldingStrategy.unicodeNormalizedFold); - } - case EqualsIgnoreCase e -> { - return new HashEqualityCondition( - e.left(), e.right(), TextFoldingStrategy.caseInsensitiveFold(e.locale())); - } - default -> throw new IllegalStateException( - "Impossible: trying to convert condition " - + eq - + " to a HashEqualityCondition, but it should not be marked as supported. 
This is a" - + " bug in the Table library."); - } - } - - private static boolean isBuiltinType(Storage storage) { - // TODO: this should be removed when #5626 and #5259 are implemented - return !storage.getType().equals(AnyObjectType.INSTANCE); - } -} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/JoinCondition.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/JoinCondition.java deleted file mode 100644 index ba6fc17461b..00000000000 --- a/std-bits/table/src/main/java/org/enso/table/data/table/join/JoinCondition.java +++ /dev/null @@ -1,3 +0,0 @@ -package org.enso.table.data.table.join; - -public interface JoinCondition {} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/JoinStrategy.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/JoinStrategy.java index c7c564bc4b7..4371c238e30 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/table/join/JoinStrategy.java +++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/JoinStrategy.java @@ -1,10 +1,75 @@ package org.enso.table.data.table.join; -import java.util.List; -import org.enso.table.data.table.Table; +import org.enso.table.data.table.join.between.SortJoin; +import org.enso.table.data.table.join.conditions.Between; +import org.enso.table.data.table.join.conditions.Equals; +import org.enso.table.data.table.join.conditions.EqualsIgnoreCase; +import org.enso.table.data.table.join.conditions.HashableCondition; +import org.enso.table.data.table.join.conditions.JoinCondition; +import org.enso.table.data.table.join.hashing.HashJoin; import org.enso.table.problems.ProblemAggregator; +import java.util.List; + +/** + * A strategy used for performing a join of two tables. 
+ */ public interface JoinStrategy { - JoinResult join( - Table left, Table right, List conditions, ProblemAggregator problemAggregator); + JoinResult join(ProblemAggregator problemAggregator); + + static JoinStrategy createStrategy(List conditions) { + if (conditions.isEmpty()) { + throw new IllegalArgumentException("At least one join condition must be provided."); + } + + List hashableConditions = conditions.stream() + .filter(c -> c instanceof HashableCondition) + .map(c -> (HashableCondition) c) + .toList(); + List betweenConditions = conditions.stream() + .filter(c -> c instanceof Between) + .map(c -> (Between) c) + .toList(); + + if (hashableConditions.size() + betweenConditions.size() != conditions.size()) { + throw new IllegalArgumentException("Unsupported join condition."); + } + + if (hashableConditions.isEmpty()) { + assert !betweenConditions.isEmpty(); + return new SortJoin(betweenConditions); + } else if (betweenConditions.isEmpty()) { + return new HashJoin(hashableConditions, new MatchAllStrategy()); + } else { + return new HashJoin(hashableConditions, new SortJoin(betweenConditions)); + } + } + + class ConditionsHelper { + private final List conditions; + + public ConditionsHelper(List conditions) { + if (conditions.isEmpty()) { + throw new IllegalArgumentException("At least one join condition must be provided."); + } + + this.conditions = conditions; + } + + public int getLeftTableRowCount() { + return switch (conditions.get(0)) { + case Equals equals -> equals.left().getStorage().size(); + case EqualsIgnoreCase equalsIgnoreCase -> equalsIgnoreCase.left().getStorage().size(); + case Between between -> between.left().getStorage().size(); + }; + } + + public int getRightTableRowCount() { + return switch (conditions.get(0)) { + case Equals equals -> equals.right().getStorage().size(); + case EqualsIgnoreCase equalsIgnoreCase -> equalsIgnoreCase.right().getStorage().size(); + case Between between -> between.rightLower().getStorage().size(); + }; + } + } 
} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/MatchAllStrategy.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/MatchAllStrategy.java new file mode 100644 index 00000000000..a25882e65db --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/MatchAllStrategy.java @@ -0,0 +1,28 @@ +package org.enso.table.data.table.join; + +import java.util.List; +import org.enso.table.problems.ProblemAggregator; +import org.graalvm.polyglot.Context; + +/** + * A pluggable strategy that can be used as the inner strategy for a join if there are no more join + * conditions to process - so all rows are matched with each other within a given group. + */ +public class MatchAllStrategy implements PluggableJoinStrategy { + @Override + public void joinSubsets( + List leftGroup, + List rightGroup, + JoinResult.Builder resultBuilder, + ProblemAggregator problemAggregator) { + Context context = Context.getCurrent(); + for (var leftRow : leftGroup) { + for (var rightRow : rightGroup) { + resultBuilder.addRow(leftRow, rightRow); + context.safepoint(); + } + + context.safepoint(); + } + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/PluggableJoinStrategy.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/PluggableJoinStrategy.java new file mode 100644 index 00000000000..f9e3ea57f53 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/PluggableJoinStrategy.java @@ -0,0 +1,18 @@ +package org.enso.table.data.table.join; + +import java.util.List; +import org.enso.table.problems.ProblemAggregator; + +/** + * A helper join strategy that can be used within another join strategy to perform a join of + * sub-sets of indices, stemming from already joining on other conditions. + */ +public interface PluggableJoinStrategy { + + /** Performs a join of two sub-sets of indices. 
*/ + void joinSubsets( + List leftGroup, + List rightGroup, + JoinResult.Builder resultBuilder, + ProblemAggregator problemAggregator); +} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/between/SortJoin.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/between/SortJoin.java new file mode 100644 index 00000000000..50dedbbbacc --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/between/SortJoin.java @@ -0,0 +1,162 @@ +package org.enso.table.data.table.join.between; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import org.enso.base.ObjectComparator; +import org.enso.table.data.column.storage.Storage; +import org.enso.table.data.index.OrderedMultiValueKey; +import org.enso.table.data.table.join.JoinResult; +import org.enso.table.data.table.join.JoinStrategy; +import org.enso.table.data.table.join.PluggableJoinStrategy; +import org.enso.table.data.table.join.conditions.Between; +import org.enso.table.problems.ProblemAggregator; +import org.graalvm.polyglot.Context; + +public class SortJoin implements JoinStrategy, PluggableJoinStrategy { + + public SortJoin(List conditions) { + conditionsHelper = new JoinStrategy.ConditionsHelper(conditions); + + Context context = Context.getCurrent(); + int nConditions = conditions.size(); + directions = new int[nConditions]; + leftStorages = new Storage[nConditions]; + lowerStorages = new Storage[nConditions]; + upperStorages = new Storage[nConditions]; + for (int i = 0; i < nConditions; i++) { + directions[i] = 1; + leftStorages[i] = conditions.get(i).left().getStorage(); + lowerStorages[i] = conditions.get(i).rightLower().getStorage(); + upperStorages[i] = conditions.get(i).rightUpper().getStorage(); + context.safepoint(); + } + } + + private final JoinStrategy.ConditionsHelper conditionsHelper; + + private final int[] directions; + private final Storage[] leftStorages; + private final Storage[] lowerStorages; + private 
final Storage[] upperStorages; + + @Override + public JoinResult join(ProblemAggregator problemAggregator) { + Context context = Context.getCurrent(); + JoinResult.Builder resultBuilder = new JoinResult.Builder(); + + int leftRowCount = conditionsHelper.getLeftTableRowCount(); + int rightRowCount = conditionsHelper.getRightTableRowCount(); + if (leftRowCount == 0 || rightRowCount == 0) { + // if one group is completely empty, there will be no matches to report + return resultBuilder.build(); + } + List leftKeys = new ArrayList<>(leftRowCount); + for (int i = 0; i < leftRowCount; i++) { + leftKeys.add(new OrderedMultiValueKey(leftStorages, i, directions)); + context.safepoint(); + } + + SortedListIndex leftIndex = buildSortedLeftIndex(leftKeys); + + for (int rightRowIx = 0; rightRowIx < rightRowCount; rightRowIx++) { + addMatchingLeftRows(leftIndex, rightRowIx, resultBuilder); + context.safepoint(); + } + + return resultBuilder.build(); + } + + @Override + public void joinSubsets( + List leftGroup, + List rightGroup, + JoinResult.Builder resultBuilder, + ProblemAggregator problemAggregator) { + Context context = Context.getCurrent(); + + List leftKeys = + leftGroup.stream() + .map(i -> new OrderedMultiValueKey(leftStorages, i, directions, objectComparator)) + .toList(); + if (leftKeys.isEmpty()) { + // left group is completely empty - there will be no matches at all + return; + } + + SortedListIndex leftIndex = buildSortedLeftIndex(leftKeys); + + for (int rightRowIx : rightGroup) { + addMatchingLeftRows(leftIndex, rightRowIx, resultBuilder); + context.safepoint(); + } + } + + private SortedListIndex buildSortedLeftIndex( + List keys) { + return SortedListIndex.build(keys, firstCoordinateComparator); + } + + private OrderedMultiValueKey buildLowerBound(int rightRowIx) { + return new OrderedMultiValueKey(lowerStorages, rightRowIx, directions, objectComparator); + } + + private OrderedMultiValueKey buildUpperBound(int rightRowIx) { + return new 
OrderedMultiValueKey(upperStorages, rightRowIx, directions, objectComparator); + } + + private void addMatchingLeftRows( + SortedListIndex sortedLeftIndex, + int rightRowIx, + JoinResult.Builder resultBuilder) { + OrderedMultiValueKey lowerBound = buildLowerBound(rightRowIx); + OrderedMultiValueKey upperBound = buildUpperBound(rightRowIx); + + // If the match interval is invalid or empty, there is nothing to do. + if (lowerBound.hasAnyNulls() + || upperBound.hasAnyNulls() + || lowerBound.compareTo(upperBound) > 0) { + return; + } + + List firstCoordinateMatches = + sortedLeftIndex.findSubRange(lowerBound, upperBound); + Context context = Context.getCurrent(); + for (OrderedMultiValueKey key : firstCoordinateMatches) { + if (isInRange(key, lowerBound, upperBound)) { + resultBuilder.addRow(key.getRowIndex(), rightRowIx); + } + + context.safepoint(); + } + } + + private boolean isInRange( + OrderedMultiValueKey key, OrderedMultiValueKey lowerBound, OrderedMultiValueKey upperBound) { + assert key.getNumberOfColumns() == lowerBound.getNumberOfColumns(); + assert key.getNumberOfColumns() == upperBound.getNumberOfColumns(); + + // Note: we cannot just use `compareTo`, because we are now not checking that the key is between + // the bounds in lexicographic order. + // Instead, we are checking if the key is between the bounds for all dimensions. 
+ + int n = key.getNumberOfColumns(); + for (int i = 0; i < n; i++) { + var keyValue = key.get(i); + var lowerBoundValue = lowerBound.get(i); + var upperBoundValue = upperBound.get(i); + boolean fitsInThisDimension = + objectComparator.compare(keyValue, lowerBoundValue) >= 0 + && objectComparator.compare(keyValue, upperBoundValue) <= 0; + if (!fitsInThisDimension) { + return false; + } + } + + return true; + } + + private final ObjectComparator objectComparator = ObjectComparator.DEFAULT; + private final Comparator firstCoordinateComparator = + new OrderedMultiValueKey.ProjectionComparator(0); +} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/between/SortedListIndex.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/between/SortedListIndex.java new file mode 100644 index 00000000000..a98f2c8a392 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/between/SortedListIndex.java @@ -0,0 +1,129 @@ +package org.enso.table.data.table.join.between; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + +public class SortedListIndex { + /** Defines the <= ordering for the index. */ + private final Comparator comparator; + + /* forall 0 <= i <= j < n, sortedList[i] <= sortedList[j] */ + private final ArrayList sortedList; + + protected SortedListIndex(ArrayList sortedList, Comparator comparator) { + this.comparator = comparator; + this.sortedList = sortedList; + } + + public static SortedListIndex build(List list, Comparator comparator) { + ArrayList copy = new ArrayList<>(list); + copy.sort(comparator); + return new SortedListIndex<>(copy, comparator); + } + + /** + * Finds a sub-range of the index containing all elements between the lower and upper bounds + * (both-ends inclusive). 
+ */ + public List findSubRange(T lowerBound, T upperBound) { + int start = findLowerIndex(lowerBound); + int end = findUpperIndex(upperBound) + 1; + if (start >= end) { + return Collections.emptyList(); + } + + return sortedList.subList(start, end); + } + + /** + * Finds the index of the first element that is greater than or equal to the argument. + * + *

If all elements are greater than the argument, returns 0. If all elements are less than the + * argument, returns N. + */ + private int findLowerIndex(T element) { + int start = 0; + int end = sortedList.size(); + + /* + * Loop invariants: + * 1) start <= end + * 2) forall 0 <= i < start: sortedList[i] < element + * 3) forall end <= i < N: sortedList[i] >= element + * + * end - start is strictly decreasing, so the loop will always terminate. + */ + while (start < end) { + // start <= mid < mid + 1 <= end + int mid = Math.addExact(start, end) / 2; + T midElement = sortedList.get(mid); + int cmp = comparator.compare(midElement, element); + if (cmp < 0) { + start = mid + 1; + } else { + end = mid; + } + } + + /* + * After the loop, start >= end, but also start <= end, so start == end. + * + * Thus, from invariants: + * forall 0 <= i < start: sortedList[i] < element + * forall start <= i < N: sortedList[i] >= element + * + * start is the first element that is >= element; + * if there is no such element, it will be N. + */ + return start; + } + + /** + * Finds the index of the last element that is less than or equal to the argument. + * + *

If all elements are greater than the argument, returns -1. If all elements are less than the + * argument, returns N-1 (index of the last element). + */ + private int findUpperIndex(T element) { + int start = 0; + int end = sortedList.size(); + + /* + * Loop invariants: + * 1) start <= end + * 2) forall 0 <= i < start: sortedList[i] <= element + * 3) forall end <= i < N: sortedList[i] > element + * + * end - start is strictly decreasing. + */ + while (start < end) { + // start <= mid < end + int mid = Math.addExact(start, end) / 2; + T midElement = sortedList.get(mid); + int cmp = comparator.compare(midElement, element); + if (cmp <= 0) { + start = mid + 1; + } else { + end = mid; + } + } + + /* + * After the loop, start >= end, but also start <= end, so start == end. + * + * Thus, from invariants: + * forall 0 <= i < start: sortedList[i] <= element + * forall start <= i < N: sortedList[i] > element + * + * So start-1 is the last element that is <= element (if it exists); + * if there is no such element, it will be -1. 
+ */ + return start - 1; + } + + private boolean keysEqual(T k1, T k2) { + return comparator.compare(k1, k2) == 0; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/Between.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/Between.java similarity index 73% rename from std-bits/table/src/main/java/org/enso/table/data/table/join/Between.java rename to std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/Between.java index 8b87df274fd..348a73265c8 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/table/join/Between.java +++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/Between.java @@ -1,4 +1,4 @@ -package org.enso.table.data.table.join; +package org.enso.table.data.table.join.conditions; import org.enso.table.data.table.Column; diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/Equals.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/Equals.java new file mode 100644 index 00000000000..d52d99241c0 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/Equals.java @@ -0,0 +1,5 @@ +package org.enso.table.data.table.join.conditions; + +import org.enso.table.data.table.Column; + +public record Equals(Column left, Column right) implements HashableCondition {} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/EqualsIgnoreCase.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/EqualsIgnoreCase.java similarity index 59% rename from std-bits/table/src/main/java/org/enso/table/data/table/join/EqualsIgnoreCase.java rename to std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/EqualsIgnoreCase.java index de1f811bbb6..c870ddea9aa 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/table/join/EqualsIgnoreCase.java +++ 
b/std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/EqualsIgnoreCase.java @@ -1,7 +1,7 @@ -package org.enso.table.data.table.join; +package org.enso.table.data.table.join.conditions; import org.enso.table.data.table.Column; import java.util.Locale; -public record EqualsIgnoreCase(Column left, Column right, Locale locale) implements JoinCondition {} +public record EqualsIgnoreCase(Column left, Column right, Locale locale) implements HashableCondition {} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/HashableCondition.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/HashableCondition.java new file mode 100644 index 00000000000..39db68db350 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/HashableCondition.java @@ -0,0 +1,4 @@ +package org.enso.table.data.table.join.conditions; + +public sealed interface HashableCondition extends JoinCondition permits Equals, EqualsIgnoreCase { +} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/JoinCondition.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/JoinCondition.java new file mode 100644 index 00000000000..3018abe0c42 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/conditions/JoinCondition.java @@ -0,0 +1,3 @@ +package org.enso.table.data.table.join.conditions; + +public sealed interface JoinCondition permits HashableCondition, Between {} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/HashJoin.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/HashJoin.java new file mode 100644 index 00000000000..66b4f0c87b0 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/HashJoin.java @@ -0,0 +1,84 @@ +package org.enso.table.data.table.join.hashing; + +import org.enso.base.text.TextFoldingStrategy; +import 
org.enso.table.data.index.MultiValueIndex; +import org.enso.table.data.index.UnorderedMultiValueKey; +import org.enso.table.data.table.Column; +import org.enso.table.data.table.join.JoinResult; +import org.enso.table.data.table.join.JoinStrategy; +import org.enso.table.data.table.join.PluggableJoinStrategy; +import org.enso.table.data.table.join.conditions.Equals; +import org.enso.table.data.table.join.conditions.EqualsIgnoreCase; +import org.enso.table.data.table.join.conditions.HashableCondition; +import org.enso.table.problems.ProblemAggregator; +import org.graalvm.polyglot.Context; + +import java.util.List; + +/** + * A strategy that uses a hash-map to perform join on the equality conditions. + *
<p>
+ * It then delegates to {@code remainingMatcher} to perform the remaining conditions on the matching pairs of row + * subsets. + */ +public class HashJoin implements JoinStrategy { + public HashJoin(List conditions, PluggableJoinStrategy remainingMatcher) { + conditionsHelper = new JoinStrategy.ConditionsHelper(conditions); + this.remainingMatcher = remainingMatcher; + + List equalConditions = + conditions.stream().map(HashJoin::makeHashEqualityCondition).toList(); + + if (equalConditions.isEmpty()) { + throw new IllegalArgumentException("EqualityHashJoin is applicable if there is at least one equality condition."); + } + + leftEquals = equalConditions.stream().map(HashEqualityCondition::left).toArray(Column[]::new); + rightEquals = equalConditions.stream().map(HashEqualityCondition::right).toArray(Column[]::new); + textFoldingStrategies = equalConditions.stream().map(HashEqualityCondition::textFoldingStrategy).toList(); + } + + private final JoinStrategy.ConditionsHelper conditionsHelper; + private final Column[] leftEquals, rightEquals; + private final List textFoldingStrategies; + private final PluggableJoinStrategy remainingMatcher; + + @Override + public JoinResult join(ProblemAggregator problemAggregator) { + Context context = Context.getCurrent(); + + var leftIndex = MultiValueIndex.makeUnorderedIndex(leftEquals, conditionsHelper.getLeftTableRowCount(), + textFoldingStrategies, problemAggregator); + var rightIndex = MultiValueIndex.makeUnorderedIndex(rightEquals, conditionsHelper.getRightTableRowCount(), + textFoldingStrategies, problemAggregator); + + JoinResult.Builder resultBuilder = new JoinResult.Builder(); + for (var leftEntry : leftIndex.mapping().entrySet()) { + UnorderedMultiValueKey leftKey = leftEntry.getKey(); + List leftRows = leftEntry.getValue(); + List rightRows = rightIndex.get(leftKey); + + if (rightRows != null) { + remainingMatcher.joinSubsets(leftRows, rightRows, resultBuilder, problemAggregator); + } + + context.safepoint(); + } + + 
return resultBuilder.build(); + } + + private static HashEqualityCondition makeHashEqualityCondition(HashableCondition eq) { + switch (eq) { + case Equals e -> { + return new HashEqualityCondition(e.left(), e.right(), TextFoldingStrategy.unicodeNormalizedFold); + } + case EqualsIgnoreCase e -> { + return new HashEqualityCondition(e.left(), e.right(), TextFoldingStrategy.caseInsensitiveFold(e.locale())); + } + } + } + + private record HashEqualityCondition(Column left, Column right, TextFoldingStrategy textFoldingStrategy) { + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/LookupColumnDescription.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/lookup/LookupColumnDescription.java similarity index 90% rename from std-bits/table/src/main/java/org/enso/table/data/table/join/LookupColumnDescription.java rename to std-bits/table/src/main/java/org/enso/table/data/table/join/lookup/LookupColumnDescription.java index f816a94db26..fd6710e8eb8 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/table/join/LookupColumnDescription.java +++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/lookup/LookupColumnDescription.java @@ -1,4 +1,4 @@ -package org.enso.table.data.table.join; +package org.enso.table.data.table.join.lookup; import org.enso.table.data.column.storage.type.StorageType; import org.enso.table.data.table.Column; diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/LookupJoin.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/lookup/LookupJoin.java similarity index 98% rename from std-bits/table/src/main/java/org/enso/table/data/table/join/LookupJoin.java rename to std-bits/table/src/main/java/org/enso/table/data/table/join/lookup/LookupJoin.java index 77ca5cbdcad..3abef8ebee8 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/table/join/LookupJoin.java +++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/lookup/LookupJoin.java @@ -1,4 
+1,4 @@ -package org.enso.table.data.table.join; +package org.enso.table.data.table.join.lookup; import org.enso.base.text.TextFoldingStrategy; import org.enso.table.data.column.builder.Builder; @@ -9,6 +9,7 @@ import org.enso.table.data.index.UnorderedMultiValueKey; import org.enso.table.data.mask.OrderMask; import org.enso.table.data.table.Column; import org.enso.table.data.table.Table; +import org.enso.table.data.table.join.conditions.Equals; import org.enso.table.error.NonUniqueLookupKey; import org.enso.table.error.NullValuesInKeyColumns; import org.enso.table.error.UnmatchedRow; @@ -17,7 +18,6 @@ import org.enso.table.util.ConstantList; import java.util.Arrays; import java.util.List; -import java.util.Map; import java.util.stream.IntStream; public class LookupJoin { diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/scan/Matcher.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/scan/Matcher.java deleted file mode 100644 index 447a5a0362f..00000000000 --- a/std-bits/table/src/main/java/org/enso/table/data/table/join/scan/Matcher.java +++ /dev/null @@ -1,5 +0,0 @@ -package org.enso.table.data.table.join.scan; - -public interface Matcher { - boolean matches(int left, int right); -} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/scan/MatcherFactory.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/scan/MatcherFactory.java deleted file mode 100644 index 37ce0bc1ad8..00000000000 --- a/std-bits/table/src/main/java/org/enso/table/data/table/join/scan/MatcherFactory.java +++ /dev/null @@ -1,152 +0,0 @@ -package org.enso.table.data.table.join.scan; - -import org.enso.base.ObjectComparator; -import org.enso.base.Text_Utils; -import org.enso.base.polyglot.NumericConverter; -import org.enso.table.data.column.storage.Storage; -import org.enso.table.data.column.storage.StringStorage; -import org.enso.table.data.table.join.Between; -import org.enso.table.data.table.join.Equals; -import 
org.enso.table.data.table.join.EqualsIgnoreCase; -import org.enso.table.data.table.join.JoinCondition; -import org.enso.table.data.table.problems.FloatingPointGrouping; -import org.enso.table.problems.ColumnAggregatedProblemAggregator; - -import java.util.List; -import java.util.Locale; -import java.util.stream.Collectors; - -public class MatcherFactory { - public Matcher create(JoinCondition condition, ColumnAggregatedProblemAggregator problemAggregator) { - return switch (condition) { - case Equals eq -> new EqualsMatcher(eq, problemAggregator); - case EqualsIgnoreCase eq -> new EqualsIgnoreCaseMatcher(eq); - case Between between -> new BetweenMatcher(between); - default -> throw new UnsupportedOperationException( - "Unsupported join condition: " + condition); - }; - } - - public Matcher create(List condition, ColumnAggregatedProblemAggregator problemAggregator) { - List matchers = condition.stream().map(m-> create(m, problemAggregator)).collect(Collectors.toList()); - return new CompoundMatcher(matchers); - } - - static final class CompoundMatcher implements Matcher { - private final List matchers; - - CompoundMatcher(List matchers) { - this.matchers = matchers; - } - - @Override - public boolean matches(int left, int right) { - for (Matcher matcher : matchers) { - if (!matcher.matches(left, right)) { - return false; - } - } - - return true; - } - } - - static final class EqualsMatcher implements Matcher { - private final Storage leftStorage; - private final Storage rightStorage; - private final String leftColumnName; - private final String rightColumnName; - private final ColumnAggregatedProblemAggregator problemAggregator; - - public EqualsMatcher(Equals eq, ColumnAggregatedProblemAggregator problemAggregator) { - leftStorage = eq.left().getStorage(); - rightStorage = eq.right().getStorage(); - leftColumnName = eq.left().getName(); - rightColumnName = eq.right().getName(); - this.problemAggregator = problemAggregator; - } - - @Override - public boolean 
matches(int left, int right) { - Object leftValue = leftStorage.getItemBoxed(left); - Object rightValue = rightStorage.getItemBoxed(right); - - if (NumericConverter.isFloatLike(leftValue)) { - problemAggregator.reportColumnAggregatedProblem(new FloatingPointGrouping(leftColumnName, left)); - } - - if (NumericConverter.isFloatLike(rightValue)) { - problemAggregator.reportColumnAggregatedProblem(new FloatingPointGrouping(rightColumnName, right)); - } - - return ObjectComparator.areEqual(leftValue, rightValue); - } - } - - static final class EqualsIgnoreCaseMatcher implements Matcher { - private final StringStorage leftStorage; - private final StringStorage rightStorage; - - private final Locale locale; - - public EqualsIgnoreCaseMatcher(EqualsIgnoreCase eq) { - if (eq.left().getStorage() instanceof StringStorage leftStrings) { - leftStorage = leftStrings; - } else { - throw new IllegalArgumentException("Expected left column to have type Text."); - } - - if (eq.right().getStorage() instanceof StringStorage rightStrings) { - rightStorage = rightStrings; - } else { - throw new IllegalArgumentException("Expected right column to have type Text."); - } - - locale = eq.locale(); - } - - @Override - public boolean matches(int left, int right) { - String leftValue = leftStorage.getItem(left); - String rightValue = rightStorage.getItem(right); - - if (leftValue == null && rightValue == null) { - return true; - } - - if (leftValue == null || rightValue == null) { - return false; - } - - return Text_Utils.equals_ignore_case(leftValue, rightValue, locale); - } - } - - static final class BetweenMatcher implements Matcher { - private final Storage leftStorage; - private final Storage rightLowerStorage; - private final Storage rightUpperStorage; - - public BetweenMatcher(Between between) { - leftStorage = between.left().getStorage(); - rightLowerStorage = between.rightLower().getStorage(); - rightUpperStorage = between.rightUpper().getStorage(); - } - - @Override - public boolean 
matches(int left, int right) { - Object leftValue = leftStorage.getItemBoxed(left); - Object rightLowerValue = rightLowerStorage.getItemBoxed(right); - Object rightUpperValue = rightUpperStorage.getItemBoxed(right); - - // If any value is missing, such a pair of rows is never correlated with Between as we assume - // the ordering is not well-defined for missing values. - if (leftValue == null || rightLowerValue == null || rightUpperValue == null) { - return false; - } - - return ObjectComparator.DEFAULT.compare(leftValue, rightLowerValue) >= 0 - && ObjectComparator.DEFAULT.compare(leftValue, rightUpperValue) <= 0; - } - } -} diff --git a/std-bits/table/src/main/java/org/enso/table/operations/AddRowNumber.java b/std-bits/table/src/main/java/org/enso/table/operations/AddRowNumber.java index e96749ca46f..8237d8f138f 100644 --- a/std-bits/table/src/main/java/org/enso/table/operations/AddRowNumber.java +++ b/std-bits/table/src/main/java/org/enso/table/operations/AddRowNumber.java @@ -2,7 +2,6 @@ package org.enso.table.operations; import java.util.ArrayList; import java.util.Arrays; -import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -18,7 +17,6 @@ import org.enso.table.data.table.Column; import org.enso.table.problems.ColumnAggregatedProblemAggregator; import org.enso.table.problems.ProblemAggregator; import org.enso.table.util.ConstantList; -import org.graalvm.collections.Pair; public class AddRowNumber { @@ -62,18 +60,17 @@ public class AddRowNumber { Storage[] orderingStorages = Arrays.stream(orderingColumns).map(Column::getStorage).toArray(Storage[]::new); long[] numbers = new long[n]; - List> keys = + List keys = new ArrayList<>( IntStream.range(0, n) - .mapToObj( - i -> Pair.create(new OrderedMultiValueKey(orderingStorages, i, directions), i)) + .mapToObj(i -> new OrderedMultiValueKey(orderingStorages, i, directions)) .toList()); - keys.sort(OrderedPairComparator.INSTANCE); + keys.sort(null); RangeIterator it = 
new RangeIterator(start, step); for (var key : keys) { - numbers[key.getRight()] = it.next(); + numbers[key.getRowIndex()] = it.next(); } return new LongStorage(numbers, IntegerType.INT_64); } @@ -103,44 +100,21 @@ public class AddRowNumber { for (var entry : groupIndex.mapping().entrySet()) { List indices = entry.getValue(); - List> orderingKeys = + List orderingKeys = new ArrayList<>( indices.stream() - .map( - i -> - Pair.create(new OrderedMultiValueKey(orderingStorages, i, directions), i)) + .map(i -> new OrderedMultiValueKey(orderingStorages, i, directions)) .toList()); - orderingKeys.sort(OrderedPairComparator.INSTANCE); + orderingKeys.sort(null); RangeIterator it = new RangeIterator(start, step); - for (var key : orderingKeys) { - numbers[key.getRight()] = it.next(); + for (OrderedMultiValueKey key : orderingKeys) { + numbers[key.getRowIndex()] = it.next(); } } return new LongStorage(numbers, IntegerType.INT_64); } - private static class OrderedPairComparator - implements Comparator> { - @Override - public int compare( - Pair o1, Pair o2) { - int p1 = o1.getLeft().compareTo(o2.getLeft()); - if (p1 != 0) { - return p1; - } - - return o1.getRight().compareTo(o2.getRight()); - } - - @Override - public boolean equals(Object obj) { - return obj instanceof OrderedPairComparator; - } - - static OrderedPairComparator INSTANCE = new OrderedPairComparator(); - } - /** * A helper for computing consecutive numbers based on a start and step. It will throw an {@link * java.lang.ArithmeticException} if the next number overflows. 
diff --git a/test/Benchmarks/src/Main.enso b/test/Benchmarks/src/Main.enso index 21c0093f765..785bf797acf 100644 --- a/test/Benchmarks/src/Main.enso +++ b/test/Benchmarks/src/Main.enso @@ -10,6 +10,7 @@ import project.Table.Aggregate import project.Table.Arithmetic import project.Table.Column_From_Vector import project.Table.Cross_Tab +import project.Table.Join import project.Table.Sorting import project.Table.Internal.Multi_Value_Key import project.Text.Build @@ -54,6 +55,7 @@ all_benchmarks = builder.append Add_Row_Number.collect_benches builder.append Column_From_Vector.collect_benches builder.append Cross_Tab.collect_benches + builder.append Join.collect_benches builder.append Sorting.collect_benches builder.append Multi_Value_Key.collect_benches diff --git a/test/Benchmarks/src/Table/Is_In.enso b/test/Benchmarks/src/Table/Is_In.enso new file mode 100644 index 00000000000..257215f3530 --- /dev/null +++ b/test/Benchmarks/src/Table/Is_In.enso @@ -0,0 +1,48 @@ +from Standard.Base import all +from Standard.Base.Runtime import assert + +from Standard.Table import all + +from Standard.Test import Bench + +from project.Config import extended_tests + +options = Bench.options . set_warmup (Bench.phase_conf 1 2) . set_measure (Bench.phase_conf 2 3) + +type Scenario + Value table ints_vec dates_vec bool_vec + +create_scenario = + t = Table.new [["X", (200.up_to 10000 . to_vector)]] + ints_vec = 40000.up_to 130000 . to_vector + first_day = Date_Time.new 2000 1 1 + make_date x = first_day + (Duration.new seconds=x) + dates_vec = ints_vec.map make_date + bool_vec = Vector.fill 7000 True + + t2 = t.set (t.at "X" . 
map make_date) "dates" + t3 = t2.set (t.at "X" % 2 == 0) "bools" + Scenario.Value t3 ints_vec dates_vec bool_vec + +type Data + Value ~scenario + + create = Data.Value create_scenario + +collect_benches = Bench.build builder-> + data = Data.create + + builder.group ("Filter_Is_In") options group_builder-> + group_builder.specify "integers" <| + scenario = data.scenario + scenario.table.filter "X" (Filter_Condition.Is_In scenario.ints_vec) + + group_builder.specify "dates" <| + scenario = data.scenario + scenario.table.filter "dates" (Filter_Condition.Is_In scenario.dates_vec) + + group_builder.specify "bools" <| + scenario = data.scenario + scenario.table.filter "bools" (Filter_Condition.Is_In scenario.bool_vec) + +main = collect_benches . run_main diff --git a/test/Benchmarks/src/Table/Join.enso b/test/Benchmarks/src/Table/Join.enso new file mode 100644 index 00000000000..b1c2e1eced8 --- /dev/null +++ b/test/Benchmarks/src/Table/Join.enso @@ -0,0 +1,222 @@ +from Standard.Base import all +from Standard.Base.Runtime import assert + +from Standard.Table import all + +from Standard.Test import Bench + +from project.Config import extended_tests + +options = Bench.options . set_warmup (Bench.phase_conf 2 5) . 
set_measure (Bench.phase_conf 2 5) + +type Scenario + Value table1 table2 + +shuffle vec = + vec.take (Index_Sub_Range.Sample vec.length seed=42) + +create_scenario_equals num_rows = + xs = (0.up_to num_rows).to_vector + table1 = Table.new [["key", xs]] + table2 = Table.new [["key", shuffle xs]] + Scenario.Value table1 table2 + +create_scenario_equals_medium_groups num_rows = + xs = (0.up_to num_rows).map x-> (x/30).floor + ys = xs.reverse.map (+2) + table1 = Table.new [["key", xs]] + table2 = Table.new [["key", ys]] + Scenario.Value table1 table2 + +create_scenario_equals_ignore_case num_rows = + table1 = Table.new [["key", (0.up_to num_rows).map i-> "a"+i.to_text]] + table2 = Table.new [["case_insensitive_key", (0.up_to num_rows).reverse.map i-> "A"+i.to_text]] + Scenario.Value table1 table2 + +create_scenario_between num_rows = + xs = (0.up_to num_rows).map x-> x*100 + lows = xs.map x-> x-10 + highs = xs.map x-> x+50 + + table1 = Table.new [["x", shuffle xs]] + table2 = Table.new [["lows", lows], ["highs", highs]] + Scenario.Value table1 table2 + +## The mixed scenario creates a pair of tables where all rows are mapped 1-1, + but they are split into 3 groups. Each group differs by only one 'key' while + having equal keys of the other two types. + + This ensures that a combined scenario must be efficient for all conditions, + regardless of the distribution of keys - it cannot naively group by only a + subset of keys and brute force the remaining keys - because in this example, + splitting by any subset of keys will still yield a big group - only splitting + by all 3 keys gives us small groups (1-1). 
+create_scenario_mixed num_rows = + n = (num_rows/3).round + xs = (0.up_to n).to_vector + ys_1 = (0.up_to n).map i-> "a"+i.to_text + ys_2 = (0.up_to n).map i-> "A"+i.to_text + zs = (0.up_to n).map x-> 1000 + x*100 + + constant_x = Vector.new n _-> 1 + constant_y = Vector.new n _-> "_" + constant_z = Vector.new n _-> 0 + + table1 = + group1 = Table.new [["EQ", shuffle xs], ["case_insensitive", constant_y], ["x", constant_z]] + group2 = Table.new [["EQ", constant_x], ["case_insensitive", shuffle ys_1], ["x", constant_z]] + group3 = Table.new [["EQ", constant_x], ["case_insensitive", constant_y], ["x", shuffle zs]] + group1.union [group2, group3] + + table2 = + group1 = Table.new [["EQ", shuffle xs], ["case_insensitive", constant_y], ["lows", constant_z], ["highs", constant_z]] + group2 = Table.new [["EQ", constant_x], ["case_insensitive", shuffle ys_2], ["lows", constant_z], ["highs", constant_z]] + + lows = zs.map x-> x-10 + highs = zs.map x-> x+30 + group3 = Table.new [["EQ", constant_x], ["case_insensitive", constant_y], ["lows", lows], ["highs", highs]] + + group1.union [group2, group3] + + Scenario.Value table1 table2 + +## The 2d equality scenario matches rows based on 2 keys - + it matches corresponding points on a 2d grid. + + This is used to verify that multi-key joins are efficient too. +create_scenario_equals_2d num_rows = + n = num_rows.sqrt.ceil + pts = (0.up_to n).to_vector.flat_map x-> + (0.up_to n).map y-> [x, y] + + shuffled_pts = shuffle pts + + table1 = Table.new [["x", shuffled_pts.map .first], ["y", shuffled_pts.map .second]] + table2 = Table.new [["x", pts.map .first], ["y", pts.map .second]] + Scenario.Value table1 table2 + +## Similarly to the example with equality, this creates a 2d grid of points, but + they are matched using the Between condition. 
+create_scenario_between_2d num_rows = + n = num_rows.sqrt.ceil + pts = (0.up_to n).to_vector.flat_map x-> + (0.up_to n).map y-> [x, y] + + shuffled_pts = shuffle pts + table1 = Table.new [["x", shuffled_pts.map .first], ["y", shuffled_pts.map .second]] + + lows = pts.map p-> [p.first - 0.1, p.second - 0.1] + highs = pts.map p-> [p.first + 0.1, p.second + 0.1] + + table2 = Table.new [["x_lows", lows.map .first], ["y_lows", lows.map .second], ["x_highs", highs.map .first], ["y_highs", highs.map .second]] + Scenario.Value table1 table2 + +## This one creates a scenario with a 2d grid of points for the left table, but + the right table contains pairs of coordinates that denote belts of size 2 x n + on that grid. + + Some of them will be horizontal and some vertical, to see how the order of + Between arguments affects performance. +create_scenario_between_2d_belts num_rows = + n = num_rows.sqrt.ceil + pts = (0.up_to n).to_vector.flat_map x-> + (0.up_to n).map y-> [x, y] + + shuffled_pts = shuffle pts + table1 = Table.new [["x", shuffled_pts.map .first], ["y", shuffled_pts.map .second]] + + horizontal_belts = Vector.new n x-> + [x, x+1, 0, n, False] + vertical_belts = Vector.new n y-> + [0, n, y, y+1, True] + + table2 = Table.from_rows ["x_lows", "x_highs", "y_lows", "y_highs", "is_vertical"] (horizontal_belts + vertical_belts) + Scenario.Value table1 table2 + +## This is a scenario where we want to find rows unmatched in another table. + + The scenario is set-up on purpose in such a way that the intersection of the + two tables is very large. This will only be fast if the anti-join does not + compute the intersection which is not needed in this scenario. +create_scenario_antijoin num_rows = + xs = Vector.new num_rows _-> 1 + + ## The first 1000 rows will be unmatched (and should be returned in the anti-join). + All other rows will match with _all_ rows from `xs`, creating a huge intersection. 
+ ys = Vector.new num_rows ix-> + if ix < 1000 then -ix else 1 + + table1 = Table.new [["key", xs]] + table2 = Table.new [["key", ys]] + Scenario.Value table1 table2 + +type Data + Value ~equals ~equals_medium_groups ~equals_ignore_case ~between ~mixed ~equals2d ~between2d ~between2d_belts ~antijoin + + create num_rows = + Data.Value (create_scenario_equals num_rows) (create_scenario_equals_medium_groups num_rows) (create_scenario_equals_ignore_case num_rows) (create_scenario_between num_rows) (create_scenario_mixed num_rows) (create_scenario_equals_2d num_rows) (create_scenario_between_2d num_rows) (create_scenario_between_2d_belts num_rows) (create_scenario_antijoin num_rows) + +collect_benches = Bench.build builder-> + num_rows = 50000 + data = Data.create num_rows + + builder.group ("Join_" + num_rows.to_text) options group_builder-> + group_builder.specify "Equals" <| + scenario = data.equals + r = scenario.table1.join scenario.table2 on="key" + assert (r.row_count == num_rows) + + group_builder.specify "Equals_Medium_Groups" <| + scenario = data.equals_medium_groups + scenario.table1.join scenario.table2 on="key" + + group_builder.specify "Equals_Ignore_Case" <| + scenario = data.equals_ignore_case + r = scenario.table1.join scenario.table2 on=(Join_Condition.Equals_Ignore_Case "key" "case_insensitive_key") + assert (r.row_count == num_rows) + + group_builder.specify "Between" <| + scenario = data.between + r = scenario.table1.join scenario.table2 on=(Join_Condition.Between "x" "lows" "highs") + assert (r.row_count == num_rows) + + group_builder.specify "Mixed" <| + scenario = data.mixed + r = scenario.table1.join scenario.table2 on=[Join_Condition.Equals "EQ", Join_Condition.Equals_Ignore_Case "case_insensitive", Join_Condition.Between "x" "lows" "highs"] + expected_rows = data.mixed.table1.row_count + assert (r.row_count == expected_rows) + + group_builder.specify "Equals_2D" <| + scenario = data.equals2d + r = scenario.table1.join scenario.table2 on=["x", 
"y"] + assert (r.row_count == scenario.table1.row_count) + + group_builder.specify "Between_2D" <| + scenario = data.between2d + r = scenario.table1.join scenario.table2 on=[Join_Condition.Between "x" "x_lows" "x_highs", Join_Condition.Between "y" "y_lows" "y_highs"] + assert (r.row_count == scenario.table1.row_count) + + if extended_tests then group_builder.specify "Between_2D_Belts_All" <| + scenario = data.between2d_belts + r = scenario.table1.join scenario.table2 on=[Join_Condition.Between "x" "x_lows" "x_highs", Join_Condition.Between "y" "y_lows" "y_highs"] + assert (r.row_count == scenario.table1.row_count) + + if extended_tests then group_builder.specify "Between_2D_Belts_V" <| + scenario = data.between2d_belts + t2 = scenario.table2.filter "is_vertical" Filter_Condition.Is_True + r = scenario.table1.join t2 on=[Join_Condition.Between "x" "x_lows" "x_highs", Join_Condition.Between "y" "y_lows" "y_highs"] + assert (r.row_count == scenario.table1.row_count) + + if extended_tests then group_builder.specify "Between_2D_Belts_H" <| + scenario = data.between2d_belts + t2 = scenario.table2.filter "is_vertical" Filter_Condition.Is_False + r = scenario.table1.join t2 on=[Join_Condition.Between "x" "x_lows" "x_highs", Join_Condition.Between "y" "y_lows" "y_highs"] + assert (r.row_count == scenario.table1.row_count) + + # TODO this should be part of the main tests, but it was causing issues on CI; re-enable this with #8217 + if extended_tests then group_builder.specify "AntiJoin" <| + scenario = data.antijoin + r = scenario.table2.join scenario.table1 on="key" join_kind=Join_Kind.Left_Exclusive + assert (r.row_count == 1000) + +main = collect_benches . 
run_main diff --git a/test/Table_Tests/src/Common_Table_Operations/Join/Join_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Join/Join_Spec.enso index fcade1cf466..d24981ef2d1 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Join/Join_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Join/Join_Spec.enso @@ -103,6 +103,15 @@ spec setup = r = t3.join t4 join_kind=Join_Kind.Inner on=["X", "Y"] |> materialize |> _.order_by ["X", "Y", "Z", "Right Z"] check_xy_joined r + Test.specify "should correctly handle duplicated rows in Equals" <| + t1 = table_builder [["X", [1, 2, 2, 3]]] + t2 = table_builder [["X", [1, 2, 2, 4]]] + r1 = t1.join t2 join_kind=Join_Kind.Full on="X" . order_by "X" + within_table r1 <| + # Both 2's from t1 match with _both_ ones from t2 _each_, so in total we get 4 `2` pairs: + r1.at "X" . to_vector . should_equal [Nothing, 1, 2, 2, 2, 2, 3] + r1.at "Right X" . to_vector . should_equal [4, 1, 2, 2, 2, 2, Nothing] + Test.specify "should allow to join on text equality ignoring case" <| t1 = table_builder [["X", ["a", "B"]], ["Y", [1, 2]]] t2 = table_builder [["X", ["A", "a", "b"]], ["Z", [1, 2, 3]]] @@ -170,7 +179,7 @@ spec setup = t2 = table_builder [["lower", [1, 10, 8, 12]], ["upper", [1, 12, 30, 0]], ["Z", [1, 2, 3, 4]]] r1 = t1.join join_kind=Join_Kind.Inner t2 on=(Join_Condition.Between "X" "lower" "upper") |> materialize |> _.order_by ["X", "Z"] - expect_column_names ["X", "Y", "lower", "upper", "Z"] r1 + r1.column_names . should_equal ["X", "Y", "lower", "upper", "Z"] r1 . at "X" . to_vector . should_equal [1, 10, 10, 12, 12] r1 . at "Y" . to_vector . should_equal [1, 2, 2, 3, 3] r1 . at "lower" . to_vector . 
should_equal [1, 10, 8, 10, 8] @@ -182,13 +191,71 @@ spec setup = t2 = table_builder [["lower", ["a", "b"]], ["upper", ["a", "ccc"]], ["Z", [10, 20]]] r1 = t1.join t2 join_kind=Join_Kind.Inner on=(Join_Condition.Between "X" "lower" "upper") |> materialize |> _.order_by ["X", "Z"] - expect_column_names ["X", "Y", "lower", "upper", "Z"] r1 + r1.column_names . should_equal ["X", "Y", "lower", "upper", "Z"] r1 . at "X" . to_vector . should_equal ["a", "b", "c"] r1 . at "Y" . to_vector . should_equal [1, 2, 3] r1 . at "lower" . to_vector . should_equal ["a", "b", "b"] r1 . at "upper" . to_vector . should_equal ["a", "ccc", "ccc"] r1 . at "Z" . to_vector . should_equal [10, 20, 20] + Test.specify "should correctly handle Between edge cases (1)" pending=(if prefix.contains "PostgreSQL" then "TODO: fix issue #8243") <| + # 1. multiple rows with the same key value on the left side + # 2. fully duplicated rows (1, 7) on the left side + # 3. empty bounds (lower > upper: 10 > 0) + # 4. equal bounds (10 = 10) + # 5. unmatched rows on both sides - Full join + t1 = table_builder [["X", [1, 10, 20, 1, 2, 1, 1]], ["id", [1, 2, 3, 4, 5, 7, 7]]] + t2 = table_builder [["lower", [0, 10, 10]], ["upper", [3, 10, 0]], ["Z", ['a', 'b', 'c']]] + r1 = t1.join t2 join_kind=Join_Kind.Full on=(Join_Condition.Between "X" "lower" "upper") |> materialize |> _.order_by ["Z", "id"] + within_table r1 <| + r1.column_names . should_equal ["X", "id", "lower", "upper", "Z"] + rows = r1.rows.map .to_vector + rows.length . should_equal 8 + + rows.at 0 . should_equal [20, 3, Nothing, Nothing, Nothing] + rows.at 1 . should_equal [ 1, 1, 0, 3, 'a'] + rows.at 2 . should_equal [ 1, 4, 0, 3, 'a'] + rows.at 3 . should_equal [ 2, 5, 0, 3, 'a'] + rows.at 4 . should_equal [ 1, 7, 0, 3, 'a'] + rows.at 5 . should_equal [ 1, 7, 0, 3, 'a'] + rows.at 6 . should_equal [10, 2, 10, 10, 'b'] + rows.at 7 . should_equal [Nothing, Nothing, 10, 0, 'c'] + + Test.specify "should correctly handle Between edge cases (2)" <| + # 6. 
multiple Between conditions + xs = [0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4] + ys = [1, 2, 3, 1, 9, 2, 3, 2, 4, 2, 1, 1, 1, 2] + pts = xs.zip ys . take (Index_Sub_Range.Sample xs.length seed=42) + t1 = table_builder [["X", pts.map .first], ["Y", pts.map .second]] + + t2 = table_builder [["lx", [1]], ["ux", [3]], ["ly", [1]], ["uy", [2]]] + r2 = t1.join t2 join_kind=Join_Kind.Inner on=[Join_Condition.Between "X" "lx" "ux", Join_Condition.Between "Y" "ly" "uy"] |> materialize |> _.order_by ["X", "Y"] + within_table r2 <| + r2.at "X" . to_vector . should_equal [1, 1, 2, 3, 3] + r2.at "Y" . to_vector . should_equal [1, 2, 2, 1, 2] + + t3 = table_builder [["lx", [1.9]], ["ux", [3]], ["ly", [1]], ["uy", [2]]] + r3 = t1.join t3 join_kind=Join_Kind.Inner on=[Join_Condition.Between "X" "lx" "ux", Join_Condition.Between "Y" "ly" "uy"] |> materialize |> _.order_by ["X", "Y"] + within_table r3 <| + r3.at "X" . to_vector . should_equal [2, 3, 3] + r3.at "Y" . to_vector . should_equal [2, 1, 2] + + Test.specify "should correctly handle Between edge cases (3)" <| + # 7. duplicated rows on both sides + t1 = table_builder [["X", [10, 20, 20]]] + t2 = table_builder [["low", [15, 15]], ["high", [30, 30]]] + r1 = t1.join t2 join_kind=Join_Kind.Right_Outer on=(Join_Condition.Between "X" "low" "high") + within_table r1 <| + r1.at "X" . to_vector . should_equal [20, 20, 20, 20] + r1.at "low" . to_vector . should_equal [15, 15, 15, 15] + r1.at "high" . to_vector . should_equal [30, 30, 30, 30] + + # 8. keep only unmatched rows + r2 = t1.join t2 join_kind=Join_Kind.Left_Exclusive on=(Join_Condition.Between "X" "low" "high") + within_table r2 <| + r2.column_names . should_equal ["X"] + r2.at "X" . to_vector . 
should_equal [10] + if setup.test_selection.supports_unicode_normalization then Test.specify "should allow range-based joins (using Between) for text with Unicode normalization" <| t1 = table_builder [["X", ['s\u0301', 's']], ["Y", [1, 2]]] @@ -368,16 +435,15 @@ spec setup = if setup.supports_custom_objects then t1 = table_builder [["X", [My_Type.Value 1 2, 2.0, 2]], ["Y", [10, 20, 30]]] t2 = table_builder [["Z", [2.0, 1.5, 2.0]], ["W", [1, 2, 3]]] - action3 = t1.join t2 join_kind=Join_Kind.Inner on=(Join_Condition.Equals "X" "Z") on_problems=_ - tester3 table = - expect_column_names ["X", "Y", "Z", "W"] table - t1 = table.order_by ["Y", "W"] - t1.at "X" . to_vector . should_equal [2.0, 2.0, 2, 2] - t1.at "Y" . to_vector . should_equal [20, 20, 30, 30] - t1.at "Z" . to_vector . should_equal [2.0, 2.0, 2.0, 2.0] - t1.at "W" . to_vector . should_equal [1, 3, 1, 3] - problems3 = [Floating_Point_Equality.Error "Z", Floating_Point_Equality.Error "X"] - Problems.test_problem_handling action3 problems3 tester3 + r3 = t1.join t2 join_kind=Join_Kind.Inner on=(Join_Condition.Equals "X" "Z") on_problems=Problem_Behavior.Report_Warning + r3.column_names.should_equal ["X", "Y", "Z", "W"] + r4 = r3.order_by ["Y", "W"] + r4.at "X" . to_vector . should_equal [2.0, 2.0, 2, 2] + r4.at "Y" . to_vector . should_equal [20, 20, 30, 30] + r4.at "Z" . to_vector . should_equal [2.0, 2.0, 2.0, 2.0] + r4.at "W" . to_vector . should_equal [1, 3, 1, 3] + expected_problems = [Floating_Point_Equality.Error "Z", Floating_Point_Equality.Error "X"] + Problems.get_attached_warnings r3 . should_contain_the_same_elements_as expected_problems Test.specify "should correctly handle nulls in equality conditions" pending=db_todo <| t1 = table_builder [["X", ["A", Nothing, "a", Nothing, "ą"]], ["Y", [0, 1, 2, 3, 4]]] @@ -650,6 +716,17 @@ spec setup = r3.at 3 . should_equal [2, 20, 2, Nothing, Nothing] r3.at 4 . 
should_equal [3, 30, 3, Nothing, Nothing] + t8 = table_builder [["X", [2, 99]], ["Y", [20, 99]], ["C", [5, 99]]] + t9 = t4_2.join t8 join_kind=Join_Kind.Full on=["X", "Y", "C"] + within_table t9 <| + t9.column_names . should_equal ["X", "Y", "C", "Right X", "Right Y", "Right C"] + r3 = materialize t9 . order_by ["X", "Right X"] . rows . map .to_vector + r3.length . should_equal 4 + r3.at 0 . should_equal [Nothing, Nothing, Nothing, 99, 99, 99] + r3.at 1 . should_equal [1, 10, 3, Nothing, Nothing, Nothing] + r3.at 2 . should_equal [2, 20, 5, 2, 20, 5] + r3.at 3 . should_equal [3, 30, 7, Nothing, Nothing, Nothing] + Test.specify "should gracefully handle tables from different backends" <| alternative_connection = Database.connect (SQLite In_Memory) t0 = (Table.new [["X", [1, 2, 4]], ["Z", [10, 20, 30]]]).select_into_database_table alternative_connection "T0" temporary=True diff --git a/test/Table_Tests/src/Helpers/Main.enso b/test/Table_Tests/src/Helpers/Main.enso index fcf8455901e..f2f3f68f06b 100644 --- a/test/Table_Tests/src/Helpers/Main.enso +++ b/test/Table_Tests/src/Helpers/Main.enso @@ -2,11 +2,13 @@ from Standard.Base import all from Standard.Test import Test_Suite +import project.Helpers.Sorted_List_Index_Spec import project.Helpers.Unique_Naming_Strategy_Spec import project.Helpers.Value_Type_Spec spec = Unique_Naming_Strategy_Spec.spec + Sorted_List_Index_Spec.spec Value_Type_Spec.spec main = Test_Suite.run_main spec diff --git a/test/Table_Tests/src/Helpers/Sorted_List_Index_Spec.enso b/test/Table_Tests/src/Helpers/Sorted_List_Index_Spec.enso new file mode 100644 index 00000000000..0433f397d86 --- /dev/null +++ b/test/Table_Tests/src/Helpers/Sorted_List_Index_Spec.enso @@ -0,0 +1,64 @@ +from Standard.Base import all + +# We need this import, to ensure that we depend on `Standard.Table`, so that the Java import of `org.enso.table` is valid. 
+from Standard.Table import all + +from Standard.Test import Test, Test_Suite +import Standard.Test.Extensions + +polyglot java import java.util.Comparator +polyglot java import org.enso.table.data.table.join.between.SortedListIndex + + +main = Test_Suite.run_main spec + +## White-box tests for the SortedListIndex, ensuring correctness of the + implementation - these are additional tests apart from + the `Join_Condition.Between` test cases, to ensure no off-by-one errors + or other bugs are present in the implementation. +spec = Test.group "SortedListIndex (used for SortJoin)" <| + make_index vec = SortedListIndex.build vec Comparator.naturalOrder + + v1 = [0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 10, 10, 10, 10, 11, 14, 17, 19] + v1_shuffled = v1.take (Index_Sub_Range.Sample v1.length) + index1 = make_index v1_shuffled + + Test.specify "should correctly handle empty matches" <| + Vector.from_polyglot_array (index1.findSubRange 9 9) . should_equal [] + Vector.from_polyglot_array (index1.findSubRange -10 -2) . should_equal [] + Vector.from_polyglot_array (index1.findSubRange 200 300) . should_equal [] + Vector.from_polyglot_array (index1.findSubRange 20 0) . should_equal [] + + Test.specify "should correctly handle single-element matches" <| + Vector.from_polyglot_array (index1.findSubRange 8 8) . should_equal [8] + Vector.from_polyglot_array (index1.findSubRange 12 16) . should_equal [14] + Vector.from_polyglot_array (index1.findSubRange 18 100) . should_equal [19] + Vector.from_polyglot_array (index1.findSubRange 19 100) . should_equal [19] + Vector.from_polyglot_array (index1.findSubRange 19 19) . should_equal [19] + + Test.specify "should correctly handle matches" <| + Vector.from_polyglot_array (index1.findSubRange 4 6) . should_equal [4, 5, 6] + Vector.from_polyglot_array (index1.findSubRange 3 5) . should_equal [3, 3, 4, 5] + + Vector.from_polyglot_array (index1.findSubRange 0 3) . 
should_equal [0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3] + Vector.from_polyglot_array (index1.findSubRange 2 4) . should_equal [2, 2, 2, 3, 3, 4] + Vector.from_polyglot_array (index1.findSubRange 8 10) . should_equal [8, 10, 10, 10, 10] + Vector.from_polyglot_array (index1.findSubRange 8 11) . should_equal [8, 10, 10, 10, 10, 11] + Vector.from_polyglot_array (index1.findSubRange 8 12) . should_equal [8, 10, 10, 10, 10, 11] + Vector.from_polyglot_array (index1.findSubRange 9 12) . should_equal [10, 10, 10, 10, 11] + + Test.specify "should correctly handle big all-equal ranges" <| + Vector.from_polyglot_array (index1.findSubRange 1 1) . should_equal [1, 1, 1, 1] + Vector.from_polyglot_array (index1.findSubRange 7 7) . should_equal [7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7] + + Test.specify "other cases: empty index" <| + index2 = make_index [] + Vector.from_polyglot_array (index2.findSubRange 1 5) . should_equal [] + + Test.specify "other cases: single element index" <| + index2 = make_index [5] + Vector.from_polyglot_array (index2.findSubRange 1 5) . should_equal [5] + Vector.from_polyglot_array (index2.findSubRange 5 5) . should_equal [5] + Vector.from_polyglot_array (index2.findSubRange 1 2) . should_equal [] + Vector.from_polyglot_array (index2.findSubRange 2 1) . should_equal [] + Vector.from_polyglot_array (index2.findSubRange 10 10) . should_equal [] diff --git a/test/Table_Tests/src/In_Memory/Join_Performance_Spec.enso b/test/Table_Tests/src/In_Memory/Join_Performance_Spec.enso deleted file mode 100644 index 5014a25555f..00000000000 --- a/test/Table_Tests/src/In_Memory/Join_Performance_Spec.enso +++ /dev/null @@ -1,118 +0,0 @@ -from Standard.Base import all - -from Standard.Table import all - -from Standard.Test import Test, Test_Suite -import Standard.Test.Extensions -from project.Util import all - -spec = - Test.group "[In-Memory] Table.join performance" <| - n = 10000 - Test.specify "should efficiently compute equality joins" <| - vec = 0.up_to n . 
to_vector - vec2 = 1.up_to n+1 . to_vector - t1 = Table.new [["X", vec], ["Y", 0.up_to n . map (_ % 2)]] - t2 = Table.new [["B", [0, 1]]] - t3 = Table.new [["X", vec.reverse], ["Z", vec2]] - - r1 = Duration.time_execution <| - t1.join t2 on=(Join_Condition.Equals "Y" "B") - - r2 = Duration.time_execution <| - t1.join t3 on="X" - t4 = r2.second . order_by ["X"] - t4.at "X" . to_vector . should_equal <| vec - t4.at "Z" . to_vector . should_equal <| vec2.reverse - - base_ms = r1.first.total_milliseconds - expected_max_time_ms = base_ms * 5 + 100 - runtime_ms = r2.first.total_milliseconds - if runtime_ms > expected_max_time_ms then - Test.fail "Expected a join of "+n.to_text+"x"+n.to_text+" with linear result size to be efficient, but it took "+runtime_ms.to_text+"ms while a join of 2x"+n.to_text+" with the same result size took "+base_ms.to_text+"ms. The maximum time threshold for this operation to be deemed efficient has been estimated at "+expected_max_time_ms.to_text+"ms." - - Test.specify "should efficiently compute equality joins mixed with other secondary conditions" <| - vec = 0.up_to n . to_vector - vec2 = 1.up_to n+1 . to_vector - t1 = Table.new [["X", vec], ["Y", 0.up_to n . map (_ % 2)], ["A", Vector.fill n "a"], ["B", Vector.fill n 9]] - t2 = Table.new [["B", [0, 1]], ["A", ["A", "A"]], ["l", [0, 0]], ["u", [20, 20]]] - t3 = Table.new [["X", vec.reverse], ["Z", vec2], ["A", Vector.fill n "a"], ["l", Vector.fill n 0], ["u", Vector.fill n 20]] - - secondary_conditions = [Join_Condition.Equals_Ignore_Case "A", Join_Condition.Between "B" "l" "u"] - - r1 = Duration.time_execution <| - t1.join t2 on=secondary_conditions+[Join_Condition.Equals "Y" "B"] - - r2 = Duration.time_execution <| - t1.join t3 on=secondary_conditions+[Join_Condition.Equals "X" "X"] - t4 = r2.second . order_by ["X"] - t4.at "X" . to_vector . should_equal <| vec - t4.at "Z" . to_vector . 
should_equal <| vec2.reverse - - base_ms = r1.first.total_milliseconds - expected_max_time_ms = base_ms * 5 + 100 - runtime_ms = r2.first.total_milliseconds - if runtime_ms > expected_max_time_ms then - Test.fail "Expected a join of "+n.to_text+"x"+n.to_text+" with linear result size to be efficient, but it took "+runtime_ms.to_text+"ms while a join of 2x"+n.to_text+" with the same result size took "+base_ms.to_text+"ms. The maximum time threshold for this operation to be deemed efficient has been estimated at "+expected_max_time_ms.to_text+"ms." - - Test.specify "should efficiently compute case-insensitive equality joins" <| - unique_text_for_number prefix i = - suffix = Text.from_utf_8 [97 + i%20] - prefix + i.to_text + "-" + suffix - lowers = 0.up_to n . map (unique_text_for_number "a") - uppers = 0.up_to n . map (unique_text_for_number "A") - t1 = Table.new [["X", lowers], ["Y", 0.up_to n . map i-> if i%2 == 0 then "a" else "b"], ["A", Vector.fill n 44], ["B", Vector.fill n 9], ["N", 0.up_to n . to_vector]] - t2 = Table.new [["B", ["A", "B", "a"]], ["A", [44, 44, 44]], ["l", [0, 0, 0]], ["u", [20, 20, 20]]] - t3 = Table.new [["X", uppers.reverse], ["Z", 1.up_to n+1 . to_vector], ["A", Vector.fill n 44], ["l", Vector.fill n 0], ["u", Vector.fill n 20]] - - secondary_conditions = [Join_Condition.Equals "A", Join_Condition.Between "B" "l" "u"] - - r1 = Duration.time_execution <| - t1.join t2 on=[Join_Condition.Equals_Ignore_Case "Y" "B"]+secondary_conditions - r1.second.row_count . should_equal (n + n/2) - - r2 = Duration.time_execution <| - t1.join t3 on=[Join_Condition.Equals_Ignore_Case "X" "X"]+secondary_conditions - t4 = r2.second . order_by "N" - t4.row_count . should_equal n - t4.at "X" . to_vector . should_equal lowers - t4.at "Right X" . to_vector . should_equal uppers - t4.at "Z" . to_vector . should_equal <| 1.up_to n+1 . to_vector . 
reverse - - base_ms = r1.first.total_milliseconds - expected_max_time_ms = base_ms * 5 + 100 - runtime_ms = r2.first.total_milliseconds - if runtime_ms > expected_max_time_ms then - Test.fail "Expected a join of "+n.to_text+"x"+n.to_text+" with linear result size to be efficient, but it took "+runtime_ms.to_text+"ms while a join of 3x"+n.to_text+" with the same result size took "+base_ms.to_text+"ms. The maximum time threshold for this operation to be deemed efficient has been estimated at "+expected_max_time_ms.to_text+"ms." - - Test.specify "should efficiently compute Between joins" pending="TODO in task https://www.pivotaltracker.com/story/show/183913337" <| - xs = 0.up_to n . map x-> x * 20 - ls = 0.up_to n . map x-> x * 20 - 20 - us = 0.up_to n . map x-> x * 20 + 5 - t1 = Table.new [["X", xs], ["A", Vector.fill n "a"], ["B", Vector.fill n 44]] - # We set up the ranges so that each entry of `t1` will match 2, apart from the first entry matched only once. - t2 = Table.new [["l", [0, 10]], ["u", [20 * n, 20 * n + 100]], ["A", ["a", "A"]], ["B", [44, 44]]] - # Here also, each range from `t3` will match 2 entries of `t1`, apart from the first one. - t3 = Table.new [["l", ls], ["u", us], ["A", Vector.fill n "A"], ["B", Vector.fill n 44]] - - conditions = [Join_Condition.Equals_Ignore_Case "A", Join_Condition.Between "X" "l" "u", Join_Condition.Equals "B"] - - r1 = Duration.time_execution <| - t1.join t2 on=conditions - r1.second.row_count . should_equal (2*n - 1) - - r2 = Duration.time_execution <| - t1.join t3 on=conditions - t4 = r2.second . order_by ["X", "l"] - t4.row_count . should_equal (2*n - 1) - - t4.at "X" . to_vector . should_equal ((xs.flat_map x-> [x, x]) . drop (Last 1)) - t4.at "l" . to_vector . should_equal (ls.zip (ls.drop 1) . 
flatten)+[ls.last] - - base_ms = r1.first.total_milliseconds - expected_max_time_ms = base_ms * 5 + 100 - runtime_ms = r2.first.total_milliseconds - if runtime_ms > expected_max_time_ms then - Test.fail "Expected a join of "+n.to_text+"x"+n.to_text+" with linear result size to be efficient, but it took "+runtime_ms.to_text+"ms while a join of 2x"+n.to_text+" with the same result size took "+base_ms.to_text+"ms. The maximum time threshold for this operation to be deemed efficient has been estimated at "+expected_max_time_ms.to_text+"ms." - -main = Test_Suite.run_main spec diff --git a/test/Table_Tests/src/In_Memory/Main.enso b/test/Table_Tests/src/In_Memory/Main.enso index c50fdbae606..9221dba475f 100644 --- a/test/Table_Tests/src/In_Memory/Main.enso +++ b/test/Table_Tests/src/In_Memory/Main.enso @@ -8,7 +8,6 @@ import project.In_Memory.Column_Spec import project.In_Memory.Column_Format_Spec import project.In_Memory.Common_Spec import project.In_Memory.Integer_Overflow_Spec -import project.In_Memory.Join_Performance_Spec import project.In_Memory.Lossy_Conversions_Spec import project.In_Memory.Parse_To_Table_Spec import project.In_Memory.Split_Tokenize_Spec @@ -29,7 +28,6 @@ spec = Table_Time_Of_Day_Spec.spec Aggregate_Column_Spec.spec Builders_Spec.spec - Join_Performance_Spec.spec Split_Tokenize_Spec.spec Parse_To_Table_Spec.spec diff --git a/test/Table_Tests/src/In_Memory/Table_Spec.enso b/test/Table_Tests/src/In_Memory/Table_Spec.enso index 698873f1364..023f31013d7 100644 --- a/test/Table_Tests/src/In_Memory/Table_Spec.enso +++ b/test/Table_Tests/src/In_Memory/Table_Spec.enso @@ -871,42 +871,6 @@ spec = t2.filter "Y" (Filter_Condition.Is_In in_vector) . at "Y" . to_vector . should_equal expected_neg_vector t2.filter "Y" (Filter_Condition.Is_In in_column) . at "Y" . to_vector . 
should_equal expected_neg_vector - Test.specify "should perform `Is_In` efficiently for builtin types" <| - first_day = Date_Time.new 2000 1 1 - make_date x = first_day + (Duration.new seconds=x) - init = Duration.time_execution <| - t = Table.new [["X", (200.up_to 10000 . to_vector)]] - vec = 4000.up_to 13000 . to_vector - expected_vector = 4000.up_to 10000 . to_vector - expected_vector_2 = 200.up_to 10000 . with_step 2 . to_vector - dates_vec = vec.map make_date - bool_vec = Vector.fill 7000 True - date_col = t.at "X" . map make_date - [t, vec, expected_vector, expected_vector_2, dates_vec, bool_vec, date_col] - t = init.second . at 0 - vec = init.second . at 1 - expected_vector = init.second . at 2 - expected_vector_2 = init.second . at 3 - dates_vec = init.second . at 4 - bool_vec = init.second . at 5 - date_col = init.second . at 6 - - expected_max_time_ms = init.first.total_milliseconds * 2 - check_timing name ~action = - res = Duration.time_execution action - runtime_ms = res.first.total_milliseconds - if runtime_ms > expected_max_time_ms then - Test.fail "Expected `Is_In` on "+name+" to be efficient, but it took "+runtime_ms.to_text+"ms while initialization itself took just "+expected_max_time_ms.to_text+"ms." - - check_timing "integers" <| - t.filter "X" (Filter_Condition.Is_In vec) . at "X" . to_vector . should_equal expected_vector - - check_timing "booleans" <| - t.filter (t.at "X" % 2 == 0) (Filter_Condition.Is_In bool_vec) . at "X" . to_vector . should_equal expected_vector_2 - - check_timing "dates" <| - t.filter date_col (Filter_Condition.Is_In dates_vec) . at "X" . to_vector . should_equal expected_vector - Test.group "[In-Memory-specific] Table.join" <| Test.specify "should correctly report unsupported cross-backend joins" <| t = Table.new [["X", [1, 2, 3]]]