diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/JoinStrategy.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/JoinStrategy.java index 7e0d93635d..325367e2f4 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/table/join/JoinStrategy.java +++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/JoinStrategy.java @@ -5,7 +5,8 @@ import org.enso.table.data.table.join.between.SortJoin; import org.enso.table.data.table.join.conditions.Between; import org.enso.table.data.table.join.conditions.HashableCondition; import org.enso.table.data.table.join.conditions.JoinCondition; -import org.enso.table.data.table.join.hashing.HashJoin; +import org.enso.table.data.table.join.hashing.CompoundHashJoin; +import org.enso.table.data.table.join.hashing.SimpleHashJoin; import org.enso.table.problems.ProblemAggregator; /** A strategy used for performing a join of two tables. */ @@ -31,12 +32,9 @@ public interface JoinStrategy { assert !betweenConditions.isEmpty(); return new SortJoin(betweenConditions, joinKind); } else if (betweenConditions.isEmpty()) { - return new HashJoin( - hashableConditions, - joinKind.wantsCommon ? 
new MatchAllStrategy() : new NoOpStrategy(), - joinKind); + return new SimpleHashJoin(hashableConditions, joinKind); } else { - return new HashJoin(hashableConditions, new SortJoin(betweenConditions, joinKind), joinKind); + return new CompoundHashJoin(hashableConditions, betweenConditions, joinKind); } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/MatchAllStrategy.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/MatchAllStrategy.java deleted file mode 100644 index 38a6f348d1..0000000000 --- a/std-bits/table/src/main/java/org/enso/table/data/table/join/MatchAllStrategy.java +++ /dev/null @@ -1,28 +0,0 @@ -package org.enso.table.data.table.join; - -import java.util.List; -import org.enso.table.problems.ProblemAggregator; -import org.graalvm.polyglot.Context; - -/** - * A pluggable strategy that can be used as the inner strategy for a join if there are no more join - * conditions to process - so all rows are matched with each other within a given group. 
- */ -public class MatchAllStrategy implements PluggableJoinStrategy { - @Override - public void joinSubsets( - List leftGroup, - List rightGroup, - JoinResult.Builder resultBuilder, - ProblemAggregator problemAggregator) { - Context context = Context.getCurrent(); - for (var leftRow : leftGroup) { - for (var rightRow : rightGroup) { - resultBuilder.addMatchedRowsPair(leftRow, rightRow); - context.safepoint(); - } - - context.safepoint(); - } - } -} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/NoOpStrategy.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/NoOpStrategy.java deleted file mode 100644 index df7036c011..0000000000 --- a/std-bits/table/src/main/java/org/enso/table/data/table/join/NoOpStrategy.java +++ /dev/null @@ -1,15 +0,0 @@ -package org.enso.table.data.table.join; - -import java.util.List; -import org.enso.table.problems.ProblemAggregator; - -public class NoOpStrategy implements PluggableJoinStrategy { - @Override - public void joinSubsets( - List leftGroup, - List rightGroup, - JoinResult.Builder resultBuilder, - ProblemAggregator problemAggregator) { - return; - } -} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/PluggableJoinStrategy.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/PluggableJoinStrategy.java deleted file mode 100644 index f9e3ea57f5..0000000000 --- a/std-bits/table/src/main/java/org/enso/table/data/table/join/PluggableJoinStrategy.java +++ /dev/null @@ -1,18 +0,0 @@ -package org.enso.table.data.table.join; - -import java.util.List; -import org.enso.table.problems.ProblemAggregator; - -/** - * A helper join strategy that can be used within another join strategy to perform a join of - * sub-sets of indices, stemming from already joining on other conditions. - */ -public interface PluggableJoinStrategy { - - /** Performs a join of two sub-sets of indices. 
*/ - void joinSubsets( - List leftGroup, - List rightGroup, - JoinResult.Builder resultBuilder, - ProblemAggregator problemAggregator); -} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/between/SortJoin.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/between/SortJoin.java index 148f8d7958..71b3013fff 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/table/join/between/SortJoin.java +++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/between/SortJoin.java @@ -10,12 +10,11 @@ import org.enso.table.data.index.OrderedMultiValueKey; import org.enso.table.data.table.join.JoinKind; import org.enso.table.data.table.join.JoinResult; import org.enso.table.data.table.join.JoinStrategy; -import org.enso.table.data.table.join.PluggableJoinStrategy; import org.enso.table.data.table.join.conditions.Between; import org.enso.table.problems.ProblemAggregator; import org.graalvm.polyglot.Context; -public class SortJoin implements JoinStrategy, PluggableJoinStrategy { +public class SortJoin implements JoinStrategy { public SortJoin(List conditions, JoinKind joinKind) { JoinStrategy.ensureConditionsNotEmpty(conditions); @@ -83,7 +82,6 @@ public class SortJoin implements JoinStrategy, PluggableJoinStrategy { return resultBuilder.buildAndInvalidate(); } - @Override public void joinSubsets( List leftGroup, List rightGroup, @@ -184,7 +182,6 @@ public class SortJoin implements JoinStrategy, PluggableJoinStrategy { // Note: we cannot just use `compareTo`, because we are now not checking that the key is between // the bounds in lexicographic order. // Instead, we are checking if the key is between the bounds for all dimensions. 
-
     int n = key.getNumberOfColumns();
     for (int i = 0; i < n; i++) {
       var keyValue = key.get(i);
diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/CompoundHashJoin.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/CompoundHashJoin.java
new file mode 100644
index 0000000000..b2751a7034
--- /dev/null
+++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/CompoundHashJoin.java
@@ -0,0 +1,105 @@
+package org.enso.table.data.table.join.hashing;
+
+import java.util.List;
+import org.enso.table.data.index.MultiValueIndex;
+import org.enso.table.data.index.UnorderedMultiValueKey;
+import org.enso.table.data.table.join.JoinKind;
+import org.enso.table.data.table.join.JoinResult;
+import org.enso.table.data.table.join.JoinStrategy;
+import org.enso.table.data.table.join.between.SortJoin;
+import org.enso.table.data.table.join.conditions.Between;
+import org.enso.table.data.table.join.conditions.HashableCondition;
+import org.enso.table.problems.ProblemAggregator;
+import org.graalvm.polyglot.Context;
+
+/**
+ * A strategy that uses a hash-map to perform join on the equality conditions.
+ *
+ * <p>It then delegates to {@code SortJoin} to perform the remaining conditions on the matching
+ * pairs of row subsets.
+ */
+public class CompoundHashJoin implements JoinStrategy {
+
+  private final HashJoinConfig hashJoinConfig;
+  private final SortJoin sortJoin;
+  private final JoinKind joinKind;
+
+  /**
+   * Creates a join strategy combining hashable and between conditions.
+   *
+   * @param hashableConditions the equality conditions used to build the hash indexes
+   * @param betweenConditions the conditions handed to the inner {@code SortJoin}
+   * @param joinKind the kind of join; decides whether unmatched rows are reported
+   */
+  public CompoundHashJoin(
+      List<HashableCondition> hashableConditions,
+      List<Between> betweenConditions,
+      JoinKind joinKind) {
+    this.hashJoinConfig = new HashJoinConfig(hashableConditions);
+    this.sortJoin = new SortJoin(betweenConditions, joinKind);
+    this.joinKind = joinKind;
+  }
+
+  /**
+   * Indexes both tables by their equality key and runs the inner {@code SortJoin} on every pair
+   * of row groups that share a key.
+   *
+   * @param problemAggregator passed on to the index construction and to the inner join
+   * @return the result built from all recorded row pairs
+   */
+  @Override
+  public JoinResult join(ProblemAggregator problemAggregator) {
+    Context context = Context.getCurrent();
+
+    var leftIndex =
+        MultiValueIndex.makeUnorderedIndex(
+            hashJoinConfig.getLeftEquals(),
+            hashJoinConfig.getLeftNumRows(),
+            hashJoinConfig.getTextFoldingStrategies(),
+            problemAggregator);
+    var rightIndex =
+        MultiValueIndex.makeUnorderedIndex(
+            hashJoinConfig.getRightEquals(),
+            hashJoinConfig.getRightNumRows(),
+            hashJoinConfig.getTextFoldingStrategies(),
+            problemAggregator);
+
+    JoinResult.Builder resultBuilder = new JoinResult.Builder();
+    for (var leftEntry : leftIndex.mapping().entrySet()) {
+      UnorderedMultiValueKey leftKey = leftEntry.getKey();
+      List<Integer> leftRows = leftEntry.getValue();
+      // If any field of the key is null, it cannot match anything.
+      List<Integer> rightRows = leftKey.hasAnyNulls() ? null : rightIndex.get(leftKey);
+
+      if (rightRows != null) {
+        sortJoin.joinSubsets(leftRows, rightRows, resultBuilder, problemAggregator);
+      } else if (joinKind.wantsLeftUnmatched) {
+        for (int leftRow : leftRows) {
+          resultBuilder.addUnmatchedLeftRow(leftRow);
+          context.safepoint();
+        }
+      }
+
+      context.safepoint();
+    }
+
+    if (joinKind.wantsRightUnmatched) {
+      for (var rightEntry : rightIndex.mapping().entrySet()) {
+        UnorderedMultiValueKey rightKey = rightEntry.getKey();
+        // If any field of the key is null, it cannot match anything.
+        boolean wasCompletelyUnmatched = rightKey.hasAnyNulls() || !leftIndex.contains(rightKey);
+        if (wasCompletelyUnmatched) {
+          for (int rightRow : rightEntry.getValue()) {
+            resultBuilder.addUnmatchedRightRow(rightRow);
+            // Poll per row, like the left-unmatched loop above does.
+            context.safepoint();
+          }
+        }
+
+        context.safepoint();
+      }
+    }
+
+    return resultBuilder.buildAndInvalidate();
+  }
+}
diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/HashJoin.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/HashJoin.java
deleted file mode 100644
index 711781ffc5..0000000000
--- a/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/HashJoin.java
+++ /dev/null
@@ -1,116 +0,0 @@
-package org.enso.table.data.table.join.hashing;
-
-import java.util.List;
-import org.enso.base.text.TextFoldingStrategy;
-import org.enso.table.data.index.MultiValueIndex;
-import org.enso.table.data.index.UnorderedMultiValueKey;
-import org.enso.table.data.table.Column;
-import org.enso.table.data.table.join.JoinKind;
-import org.enso.table.data.table.join.JoinResult;
-import org.enso.table.data.table.join.JoinStrategy;
-import org.enso.table.data.table.join.PluggableJoinStrategy;
-import org.enso.table.data.table.join.conditions.Equals;
-import org.enso.table.data.table.join.conditions.EqualsIgnoreCase;
-import org.enso.table.data.table.join.conditions.HashableCondition;
-import org.enso.table.problems.ProblemAggregator;
-import org.graalvm.polyglot.Context;
-
-/**
- * A strategy that uses a hash-map to perform join on the equality conditions.
- *
- *

It then delegates to {@code remainingMatcher} to perform the remaining conditions on the - * matching pairs of row subsets. - */ -public class HashJoin implements JoinStrategy { - public HashJoin( - List conditions, - PluggableJoinStrategy remainingMatcher, - JoinKind joinKind) { - JoinStrategy.ensureConditionsNotEmpty(conditions); - this.remainingMatcher = remainingMatcher; - this.joinKind = joinKind; - - List equalConditions = - conditions.stream().map(HashJoin::makeHashEqualityCondition).toList(); - - if (equalConditions.isEmpty()) { - throw new IllegalArgumentException( - "EqualityHashJoin is applicable if there is at least one equality condition."); - } - - leftEquals = equalConditions.stream().map(HashEqualityCondition::left).toArray(Column[]::new); - rightEquals = equalConditions.stream().map(HashEqualityCondition::right).toArray(Column[]::new); - textFoldingStrategies = - equalConditions.stream().map(HashEqualityCondition::textFoldingStrategy).toList(); - } - - private final Column[] leftEquals, rightEquals; - private final List textFoldingStrategies; - private final PluggableJoinStrategy remainingMatcher; - private final JoinKind joinKind; - - @Override - public JoinResult join(ProblemAggregator problemAggregator) { - Context context = Context.getCurrent(); - - var leftIndex = - MultiValueIndex.makeUnorderedIndex( - leftEquals, leftEquals[0].getSize(), textFoldingStrategies, problemAggregator); - var rightIndex = - MultiValueIndex.makeUnorderedIndex( - rightEquals, rightEquals[0].getSize(), textFoldingStrategies, problemAggregator); - - JoinResult.Builder resultBuilder = new JoinResult.Builder(); - for (var leftEntry : leftIndex.mapping().entrySet()) { - UnorderedMultiValueKey leftKey = leftEntry.getKey(); - List leftRows = leftEntry.getValue(); - // If any field of the key is null, it cannot match anything. - List rightRows = leftKey.hasAnyNulls() ? 
null : rightIndex.get(leftKey);
-
-      if (rightRows != null) {
-        remainingMatcher.joinSubsets(leftRows, rightRows, resultBuilder, problemAggregator);
-      } else {
-        if (joinKind.wantsLeftUnmatched) {
-          for (int leftRow : leftRows) {
-            resultBuilder.addUnmatchedLeftRow(leftRow);
-            context.safepoint();
-          }
-        }
-      }
-
-      context.safepoint();
-    }
-
-    if (joinKind.wantsRightUnmatched) {
-      for (var rightEntry : rightIndex.mapping().entrySet()) {
-        UnorderedMultiValueKey rightKey = rightEntry.getKey();
-        // If any field of the key is null, it cannot match anything.
-        boolean wasCompletelyUnmatched =
-            rightKey.hasAnyNulls() ? true : !leftIndex.contains(rightKey);
-        if (wasCompletelyUnmatched) {
-          for (int rightRow : rightEntry.getValue()) {
-            resultBuilder.addUnmatchedRightRow(rightRow);
-          }
-        }
-      }
-    }
-
-    return resultBuilder.buildAndInvalidate();
-  }
-
-  private static HashEqualityCondition makeHashEqualityCondition(HashableCondition eq) {
-    switch (eq) {
-      case Equals e -> {
-        return new HashEqualityCondition(
-            e.left(), e.right(), TextFoldingStrategy.unicodeNormalizedFold);
-      }
-      case EqualsIgnoreCase e -> {
-        return new HashEqualityCondition(
-            e.left(), e.right(), TextFoldingStrategy.caseInsensitiveFold(e.locale()));
-      }
-    }
-  }
-
-  private record HashEqualityCondition(
-      Column left, Column right, TextFoldingStrategy textFoldingStrategy) {}
-}
diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/HashJoinConfig.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/HashJoinConfig.java
new file mode 100644
index 0000000000..48ef24082b
--- /dev/null
+++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/HashJoinConfig.java
@@ -0,0 +1,98 @@
+package org.enso.table.data.table.join.hashing;
+
+import java.util.List;
+import org.enso.base.text.TextFoldingStrategy;
+import org.enso.table.data.table.Column;
+import org.enso.table.data.table.join.JoinStrategy;
+import org.enso.table.data.table.join.conditions.Equals;
+import org.enso.table.data.table.join.conditions.EqualsIgnoreCase;
+import org.enso.table.data.table.join.conditions.HashableCondition;
+
+/**
+ * The key columns of both tables and the text folding strategy of each key, as used by the hash
+ * join strategies.
+ */
+public class HashJoinConfig {
+
+  private final Column[] leftEquals;
+  private final Column[] rightEquals;
+  private final List<TextFoldingStrategy> textFoldingStrategies;
+
+  /**
+   * Derives the key columns and folding strategies from the given conditions.
+   *
+   * @param conditions the conditions, validated by {@code JoinStrategy.ensureConditionsNotEmpty}
+   */
+  public HashJoinConfig(List<HashableCondition> conditions) {
+    JoinStrategy.ensureConditionsNotEmpty(conditions);
+    List<HashEqualityCondition> equalConditions =
+        conditions.stream().map(HashJoinConfig::makeHashEqualityCondition).toList();
+
+    this.leftEquals =
+        equalConditions.stream().map(HashEqualityCondition::left).toArray(Column[]::new);
+    this.rightEquals =
+        equalConditions.stream().map(HashEqualityCondition::right).toArray(Column[]::new);
+    this.textFoldingStrategies =
+        equalConditions.stream().map(HashEqualityCondition::textFoldingStrategy).toList();
+  }
+
+  /**
+   * Creates a configuration from already separated key columns.
+   *
+   * @param leftEquals the key columns of the left table
+   * @param rightEquals the key columns of the right table
+   * @param textFoldingStrategies the text folding strategies of the keys
+   * @throws IllegalArgumentException if either column array is empty
+   */
+  public HashJoinConfig(
+      Column[] leftEquals, Column[] rightEquals, List<TextFoldingStrategy> textFoldingStrategies) {
+    // The row counts are read from the first column of each side, so both must have one.
+    if (leftEquals.length == 0 || rightEquals.length == 0) {
+      throw new IllegalArgumentException(
+          "HashJoinConfig requires at least one key column on each side.");
+    }
+    this.leftEquals = leftEquals;
+    this.rightEquals = rightEquals;
+    this.textFoldingStrategies = textFoldingStrategies;
+  }
+
+  /** Returns the key columns of the left table. */
+  public Column[] getLeftEquals() {
+    return leftEquals;
+  }
+
+  /** Returns the key columns of the right table. */
+  public Column[] getRightEquals() {
+    return rightEquals;
+  }
+
+  /** Returns the size of the first left key column, used as the left row count. */
+  public int getLeftNumRows() {
+    return leftEquals[0].getSize();
+  }
+
+  /** Returns the size of the first right key column, used as the right row count. */
+  public int getRightNumRows() {
+    return rightEquals[0].getSize();
+  }
+
+  /** Returns the text folding strategies of the keys. */
+  public List<TextFoldingStrategy> getTextFoldingStrategies() {
+    return textFoldingStrategies;
+  }
+
+  /** Pairs the columns of a condition with the text folding strategy it implies. */
+  private static HashEqualityCondition makeHashEqualityCondition(HashableCondition eq) {
+    return switch (eq) {
+      case Equals e ->
+          new HashEqualityCondition(
+              e.left(), e.right(), TextFoldingStrategy.unicodeNormalizedFold);
+      case EqualsIgnoreCase e ->
+          new HashEqualityCondition(
+              e.left(), e.right(), TextFoldingStrategy.caseInsensitiveFold(e.locale()));
+    };
+  }
+
+  private record HashEqualityCondition(
+      Column left, Column right, TextFoldingStrategy textFoldingStrategy) {}
+}
diff --git
a/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/SimpleHashJoin.java b/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/SimpleHashJoin.java
new file mode 100644
index 0000000000..fa63e46e48
--- /dev/null
+++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/SimpleHashJoin.java
@@ -0,0 +1,214 @@
+package org.enso.table.data.table.join.hashing;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import org.enso.table.data.column.storage.Storage;
+import org.enso.table.data.index.MultiValueIndex;
+import org.enso.table.data.index.UnorderedMultiValueKey;
+import org.enso.table.data.table.Column;
+import org.enso.table.data.table.join.JoinKind;
+import org.enso.table.data.table.join.JoinResult;
+import org.enso.table.data.table.join.JoinStrategy;
+import org.enso.table.data.table.join.conditions.HashableCondition;
+import org.enso.table.problems.ColumnAggregatedProblemAggregator;
+import org.enso.table.problems.ProblemAggregator;
+import org.graalvm.polyglot.Context;
+
+/**
+ * A strategy that uses a hash-map to perform join on the equality conditions.
+ *
+ * <p>Only one table is indexed; the other one is scanned row by row. The scanned table is always
+ * the one with at least as many rows, so the tables may be swapped internally. Row indices are
+ * swapped back when they are added to the result.
+ */
+public class SimpleHashJoin implements JoinStrategy {
+
+  private final HashJoinConfig hashJoinConfig;
+  private final JoinKind joinKind;
+  private final boolean flipLeftAndRight;
+
+  /**
+   * Creates the strategy, swapping the tables if the right one has more rows than the left one.
+   *
+   * @param conditions the equality conditions to join on
+   * @param joinKind the kind of join requested by the caller
+   */
+  public SimpleHashJoin(List<HashableCondition> conditions, JoinKind joinKind) {
+    var tempHashJoinConfig = new HashJoinConfig(conditions);
+
+    // The algorithm assumes that the left table is the big table.
+    // If it is not, we flip the left and right tables over to do the join.
+    boolean flip = tempHashJoinConfig.getLeftNumRows() < tempHashJoinConfig.getRightNumRows();
+    this.flipLeftAndRight = flip;
+    if (flip) {
+      // Flip left and right inside of HashJoinConfig.
+      this.hashJoinConfig =
+          new HashJoinConfig(
+              tempHashJoinConfig.getRightEquals(),
+              tempHashJoinConfig.getLeftEquals(),
+              tempHashJoinConfig.getTextFoldingStrategies());
+      this.joinKind = flipJoinKind(joinKind);
+    } else {
+      this.hashJoinConfig = tempHashJoinConfig;
+      this.joinKind = joinKind;
+    }
+  }
+
+  /**
+   * Performs the join.
+   *
+   * @param problemAggregator receives the problems found while indexing and building keys
+   * @return the joined row indices, expressed in terms of the caller's left and right tables
+   */
+  @Override
+  public JoinResult join(ProblemAggregator problemAggregator) {
+    // The algorithm assumes that the left table is the big table.
+    // If it was not, the tables have been flipped round to do the join.
+    // If you are debugging, your left table might not be your left table here.
+    // The result builder flips the indexes back as you add them.
+    assert (hashJoinConfig.getLeftNumRows() >= hashJoinConfig.getRightNumRows());
+
+    var groupingProblemAggregator = new ColumnAggregatedProblemAggregator(problemAggregator);
+    var rightIndex =
+        MultiValueIndex.makeUnorderedIndex(
+            hashJoinConfig.getRightEquals(),
+            hashJoinConfig.getRightNumRows(),
+            hashJoinConfig.getTextFoldingStrategies(),
+            problemAggregator);
+    var storage =
+        Arrays.stream(hashJoinConfig.getLeftEquals())
+            .map(Column::getStorage)
+            .toArray(Storage[]::new);
+    Set<UnorderedMultiValueKey> matchedRightKeys = new HashSet<>();
+    // Created per call: buildAndInvalidate() consumes the builder, so keeping one on the
+    // instance would make a second join() call operate on an invalidated builder.
+    var resultBuilder = new SimpleHashJoinResultBuilder(flipLeftAndRight);
+
+    Context context = Context.getCurrent();
+    int leftNumRows = hashJoinConfig.getLeftNumRows();
+    for (int leftRow = 0; leftRow < leftNumRows; leftRow++) {
+      var leftKey = makeLeftKey(storage, leftRow, groupingProblemAggregator);
+      // If any field of the key is null, it cannot match anything.
+      List<Integer> rightRows = leftKey.hasAnyNulls() ? null : rightIndex.get(leftKey);
+      if (rightRows != null) {
+        if (joinKind.wantsCommon) {
+          addAll(leftRow, rightRows, resultBuilder);
+        }
+        if (joinKind.wantsRightUnmatched) {
+          matchedRightKeys.add(leftKey);
+        }
+      } else if (joinKind.wantsLeftUnmatched) {
+        resultBuilder.addUnmatchedLeftRow(leftRow);
+      }
+      context.safepoint();
+    }
+
+    if (joinKind.wantsRightUnmatched) {
+      addUnmatchedRightRows(rightIndex, matchedRightKeys, resultBuilder);
+    }
+
+    return resultBuilder.buildAndInvalidate();
+  }
+
+  /** Adds every right row whose key was not matched by any left row as an unmatched row. */
+  private static void addUnmatchedRightRows(
+      MultiValueIndex<UnorderedMultiValueKey> rightIndex,
+      Set<UnorderedMultiValueKey> matchedRightKeys,
+      SimpleHashJoinResultBuilder resultBuilder) {
+    Context context = Context.getCurrent();
+    for (var rightEntry : rightIndex.mapping().entrySet()) {
+      UnorderedMultiValueKey rightKey = rightEntry.getKey();
+      boolean wasCompletelyUnmatched = !matchedRightKeys.contains(rightKey);
+      if (wasCompletelyUnmatched) {
+        for (int rightRow : rightEntry.getValue()) {
+          resultBuilder.addUnmatchedRightRow(rightRow);
+          context.safepoint();
+        }
+      }
+      context.safepoint();
+    }
+  }
+
+  /**
+   * Builds the key of a row of the (possibly flipped) left table and runs the key's floating
+   * point equality check on it.
+   *
+   * @param storage the storages of the left key columns
+   * @param rowNumber the index of the row to build the key for
+   * @param groupingProblemAggregator passed to the floating point equality check
+   * @return the key of the row
+   */
+  public UnorderedMultiValueKey makeLeftKey(
+      Storage[] storage,
+      int rowNumber,
+      ColumnAggregatedProblemAggregator groupingProblemAggregator) {
+    var leftEquals = hashJoinConfig.getLeftEquals();
+    var leftKey =
+        new UnorderedMultiValueKey(storage, rowNumber, hashJoinConfig.getTextFoldingStrategies());
+    leftKey.checkAndReportFloatingEquality(
+        groupingProblemAggregator, columnIx -> leftEquals[columnIx].getName());
+    return leftKey;
+  }
+
+  /** Adds a matched pair for the given left row and every row of the right group. */
+  private static void addAll(
+      int leftRow, List<Integer> rightGroup, SimpleHashJoinResultBuilder resultBuilder) {
+    Context context = Context.getCurrent();
+    for (var rightRow : rightGroup) {
+      resultBuilder.addMatchedRowsPair(leftRow, rightRow);
+      context.safepoint();
+    }
+  }
+
+  /** Returns the join kind to use once the left and right tables have been swapped. */
+  private static JoinKind flipJoinKind(JoinKind joinKind) {
+    return switch (joinKind) {
+      case LEFT_OUTER -> JoinKind.RIGHT_OUTER;
+      case RIGHT_OUTER -> JoinKind.LEFT_OUTER;
+      case LEFT_ANTI -> JoinKind.RIGHT_ANTI;
+      case RIGHT_ANTI -> JoinKind.LEFT_ANTI;
+      default -> joinKind;
+    };
+  }
+
+  /** Wraps a {@code JoinResult.Builder}, swapping the sides back if the tables were flipped. */
+  private static final class SimpleHashJoinResultBuilder {
+
+    private final JoinResult.Builder resultBuilder = new JoinResult.Builder();
+    private final boolean flipLeftAndRight;
+
+    SimpleHashJoinResultBuilder(boolean flipLeftAndRight) {
+      this.flipLeftAndRight = flipLeftAndRight;
+    }
+
+    void addMatchedRowsPair(int leftIndex, int rightIndex) {
+      if (flipLeftAndRight) {
+        resultBuilder.addMatchedRowsPair(rightIndex, leftIndex);
+      } else {
+        resultBuilder.addMatchedRowsPair(leftIndex, rightIndex);
+      }
+    }
+
+    void addUnmatchedLeftRow(int leftIndex) {
+      if (flipLeftAndRight) {
+        resultBuilder.addUnmatchedRightRow(leftIndex);
+      } else {
+        resultBuilder.addUnmatchedLeftRow(leftIndex);
+      }
+    }
+
+    void addUnmatchedRightRow(int rightIndex) {
+      if (flipLeftAndRight) {
+        resultBuilder.addUnmatchedLeftRow(rightIndex);
+      } else {
+        resultBuilder.addUnmatchedRightRow(rightIndex);
+      }
+    }
+
+    JoinResult buildAndInvalidate() {
+      return resultBuilder.buildAndInvalidate();
+    }
+  }
+}