Split HashJoin to SimpleHashJoin and CompoundHashJoin (#8850)

Completes #8342. Creates a SimpleHashJoin and a CompoundHashJoin.

# Important Notes
Creates SimpleHashJoin and CompoundHashJoin.

CompoundHashJoin is what was HashJoin.
SimpleHashJoin is a new implementation that only indexes the smaller of the two tables being joined together.

The rest is refactoring and clean-up of the shared join code.
This commit is contained in:
AdRiley 2024-02-01 18:48:44 +00:00 committed by GitHub
parent ed65af7005
commit 340a3eec4e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 344 additions and 187 deletions

View File

@ -5,7 +5,8 @@ import org.enso.table.data.table.join.between.SortJoin;
import org.enso.table.data.table.join.conditions.Between;
import org.enso.table.data.table.join.conditions.HashableCondition;
import org.enso.table.data.table.join.conditions.JoinCondition;
import org.enso.table.data.table.join.hashing.HashJoin;
import org.enso.table.data.table.join.hashing.CompoundHashJoin;
import org.enso.table.data.table.join.hashing.SimpleHashJoin;
import org.enso.table.problems.ProblemAggregator;
/** A strategy used for performing a join of two tables. */
@ -31,12 +32,9 @@ public interface JoinStrategy {
assert !betweenConditions.isEmpty();
return new SortJoin(betweenConditions, joinKind);
} else if (betweenConditions.isEmpty()) {
return new HashJoin(
hashableConditions,
joinKind.wantsCommon ? new MatchAllStrategy() : new NoOpStrategy(),
joinKind);
return new SimpleHashJoin(hashableConditions, joinKind);
} else {
return new HashJoin(hashableConditions, new SortJoin(betweenConditions, joinKind), joinKind);
return new CompoundHashJoin(hashableConditions, betweenConditions, joinKind);
}
}

View File

@ -1,28 +0,0 @@
package org.enso.table.data.table.join;
import java.util.List;
import org.enso.table.problems.ProblemAggregator;
import org.graalvm.polyglot.Context;
/**
 * Inner strategy for a join that has no further conditions left to evaluate: within a given group,
 * every left row is matched with every right row.
 */
public class MatchAllStrategy implements PluggableJoinStrategy {
  /**
   * Emits the complete cross product of the two groups as matched pairs.
   *
   * @param leftGroup row indices coming from the left side
   * @param rightGroup row indices coming from the right side
   * @param resultBuilder receives one matched pair per (left, right) combination
   * @param problemAggregator not used by this strategy
   */
  @Override
  public void joinSubsets(
      List<Integer> leftGroup,
      List<Integer> rightGroup,
      JoinResult.Builder resultBuilder,
      ProblemAggregator problemAggregator) {
    final Context polyglotContext = Context.getCurrent();
    for (Integer leftIndex : leftGroup) {
      for (Integer rightIndex : rightGroup) {
        resultBuilder.addMatchedRowsPair(leftIndex, rightIndex);
        // Poll after each emitted pair; the cross product can be large.
        polyglotContext.safepoint();
      }
      // Poll once per left row as well, which also covers an empty right group.
      polyglotContext.safepoint();
    }
  }
}

View File

@ -1,15 +0,0 @@
package org.enso.table.data.table.join;
import java.util.List;
import org.enso.table.problems.ProblemAggregator;
/** A pluggable join strategy that ignores both groups and contributes nothing to the result. */
public class NoOpStrategy implements PluggableJoinStrategy {
  /**
   * Does nothing.
   *
   * @param leftGroup ignored
   * @param rightGroup ignored
   * @param resultBuilder left untouched
   * @param problemAggregator ignored
   */
  @Override
  public void joinSubsets(
      List<Integer> leftGroup,
      List<Integer> rightGroup,
      JoinResult.Builder resultBuilder,
      ProblemAggregator problemAggregator) {
    // Intentionally empty: this strategy never adds rows to the result.
  }
}

View File

@ -1,18 +0,0 @@
package org.enso.table.data.table.join;
import java.util.List;
import org.enso.table.problems.ProblemAggregator;
/**
* A helper join strategy that can be used within another join strategy to perform a join of
* sub-sets of indices, stemming from already joining on other conditions.
*
* <p>An outer strategy first narrows both sides down to two groups of indices and then hands those
* groups to an implementation of this interface to finish the job.
*/
public interface PluggableJoinStrategy {
/**
* Performs a join of two sub-sets of indices.
*
* @param leftGroup the indices selected from the left side
* @param rightGroup the indices selected from the right side
* @param resultBuilder the builder that the outcome of joining the two groups is written to
* @param problemAggregator aggregator made available to the implementation (presumably for
*     reporting problems found while joining - confirm against implementations)
*/
void joinSubsets(
List<Integer> leftGroup,
List<Integer> rightGroup,
JoinResult.Builder resultBuilder,
ProblemAggregator problemAggregator);
}

View File

@ -10,12 +10,11 @@ import org.enso.table.data.index.OrderedMultiValueKey;
import org.enso.table.data.table.join.JoinKind;
import org.enso.table.data.table.join.JoinResult;
import org.enso.table.data.table.join.JoinStrategy;
import org.enso.table.data.table.join.PluggableJoinStrategy;
import org.enso.table.data.table.join.conditions.Between;
import org.enso.table.problems.ProblemAggregator;
import org.graalvm.polyglot.Context;
public class SortJoin implements JoinStrategy, PluggableJoinStrategy {
public class SortJoin implements JoinStrategy {
public SortJoin(List<Between> conditions, JoinKind joinKind) {
JoinStrategy.ensureConditionsNotEmpty(conditions);
@ -83,7 +82,6 @@ public class SortJoin implements JoinStrategy, PluggableJoinStrategy {
return resultBuilder.buildAndInvalidate();
}
@Override
public void joinSubsets(
List<Integer> leftGroup,
List<Integer> rightGroup,
@ -184,7 +182,6 @@ public class SortJoin implements JoinStrategy, PluggableJoinStrategy {
// Note: we cannot just use `compareTo`, because we are now not checking that the key is between
// the bounds in lexicographic order.
// Instead, we are checking if the key is between the bounds for all dimensions.
int n = key.getNumberOfColumns();
for (int i = 0; i < n; i++) {
var keyValue = key.get(i);

View File

@ -0,0 +1,90 @@
package org.enso.table.data.table.join.hashing;
import java.util.List;
import org.enso.table.data.index.MultiValueIndex;
import org.enso.table.data.index.UnorderedMultiValueKey;
import org.enso.table.data.table.join.JoinKind;
import org.enso.table.data.table.join.JoinResult;
import org.enso.table.data.table.join.JoinStrategy;
import org.enso.table.data.table.join.between.SortJoin;
import org.enso.table.data.table.join.conditions.Between;
import org.enso.table.data.table.join.conditions.HashableCondition;
import org.enso.table.problems.ProblemAggregator;
import org.graalvm.polyglot.Context;
/**
 * A strategy that uses a hash-map to perform join on the equality conditions.
 *
 * <p>Both sides are indexed by their equality key. For every key that occurs on both sides, the
 * two row groups are handed to {@code SortJoin} which evaluates the remaining {@code Between}
 * conditions on that pair of row subsets.
 */
public class CompoundHashJoin implements JoinStrategy {
  private final HashJoinConfig hashJoinConfig;
  private final SortJoin sortJoin;
  private final JoinKind joinKind;

  /**
   * @param hashableConditions equality conditions used to build the hash indexes
   * @param betweenConditions range conditions delegated to the inner {@code SortJoin}
   * @param joinKind decides whether unmatched left and/or right rows are reported
   */
  public CompoundHashJoin(
      List<HashableCondition> hashableConditions,
      List<Between> betweenConditions,
      JoinKind joinKind) {
    this.hashJoinConfig = new HashJoinConfig(hashableConditions);
    this.sortJoin = new SortJoin(betweenConditions, joinKind);
    this.joinKind = joinKind;
  }

  /**
   * Runs the join.
   *
   * @param problemAggregator passed on to index construction and to the inner {@code SortJoin}
   * @return the result assembled from the matched and (if requested) unmatched rows
   */
  @Override
  public JoinResult join(ProblemAggregator problemAggregator) {
    Context context = Context.getCurrent();

    var leftIndex =
        MultiValueIndex.makeUnorderedIndex(
            hashJoinConfig.getLeftEquals(),
            hashJoinConfig.getLeftNumRows(),
            hashJoinConfig.getTextFoldingStrategies(),
            problemAggregator);
    var rightIndex =
        MultiValueIndex.makeUnorderedIndex(
            hashJoinConfig.getRightEquals(),
            hashJoinConfig.getRightNumRows(),
            hashJoinConfig.getTextFoldingStrategies(),
            problemAggregator);

    JoinResult.Builder resultBuilder = new JoinResult.Builder();
    for (var leftEntry : leftIndex.mapping().entrySet()) {
      UnorderedMultiValueKey leftKey = leftEntry.getKey();
      List<Integer> leftRows = leftEntry.getValue();
      // If any field of the key is null, it cannot match anything.
      List<Integer> rightRows = leftKey.hasAnyNulls() ? null : rightIndex.get(leftKey);

      if (rightRows != null) {
        // Same key on both sides: the rows inside the two groups are left to SortJoin.
        sortJoin.joinSubsets(leftRows, rightRows, resultBuilder, problemAggregator);
      } else if (joinKind.wantsLeftUnmatched) {
        for (int leftRow : leftRows) {
          resultBuilder.addUnmatchedLeftRow(leftRow);
          context.safepoint();
        }
      }

      context.safepoint();
    }

    if (joinKind.wantsRightUnmatched) {
      for (var rightEntry : rightIndex.mapping().entrySet()) {
        UnorderedMultiValueKey rightKey = rightEntry.getKey();
        // If any field of the key is null, it cannot match anything.
        boolean wasCompletelyUnmatched = rightKey.hasAnyNulls() || !leftIndex.contains(rightKey);
        if (wasCompletelyUnmatched) {
          for (int rightRow : rightEntry.getValue()) {
            resultBuilder.addUnmatchedRightRow(rightRow);
            // Poll here too, as the left-hand loop does, so a long unmatched tail does not run
            // without reaching a safepoint.
            context.safepoint();
          }
        }

        context.safepoint();
      }
    }

    return resultBuilder.buildAndInvalidate();
  }
}

View File

@ -1,116 +0,0 @@
package org.enso.table.data.table.join.hashing;
import java.util.List;
import org.enso.base.text.TextFoldingStrategy;
import org.enso.table.data.index.MultiValueIndex;
import org.enso.table.data.index.UnorderedMultiValueKey;
import org.enso.table.data.table.Column;
import org.enso.table.data.table.join.JoinKind;
import org.enso.table.data.table.join.JoinResult;
import org.enso.table.data.table.join.JoinStrategy;
import org.enso.table.data.table.join.PluggableJoinStrategy;
import org.enso.table.data.table.join.conditions.Equals;
import org.enso.table.data.table.join.conditions.EqualsIgnoreCase;
import org.enso.table.data.table.join.conditions.HashableCondition;
import org.enso.table.problems.ProblemAggregator;
import org.graalvm.polyglot.Context;
/**
 * Join strategy that groups the rows of both sides by their equality key using hash-maps.
 *
 * <p>Whenever a key is found on both sides, the two row groups are forwarded to {@code
 * remainingMatcher}, which is responsible for whatever conditions are still left to check.
 */
public class HashJoin implements JoinStrategy {
  private final Column[] leftEquals;
  private final Column[] rightEquals;
  private final List<TextFoldingStrategy> textFoldingStrategies;
  private final PluggableJoinStrategy remainingMatcher;
  private final JoinKind joinKind;

  /**
   * @param conditions the equality conditions to hash on
   * @param remainingMatcher strategy applied to each pair of row groups sharing a key
   * @param joinKind decides whether unmatched left and/or right rows are reported
   * @throws IllegalArgumentException if no equality condition results from {@code conditions}
   */
  public HashJoin(
      List<HashableCondition> conditions,
      PluggableJoinStrategy remainingMatcher,
      JoinKind joinKind) {
    JoinStrategy.ensureConditionsNotEmpty(conditions);
    this.remainingMatcher = remainingMatcher;
    this.joinKind = joinKind;

    List<HashEqualityCondition> resolved =
        conditions.stream().map(HashJoin::makeHashEqualityCondition).toList();
    if (resolved.isEmpty()) {
      throw new IllegalArgumentException(
          "EqualityHashJoin is applicable if there is at least one equality condition.");
    }

    this.leftEquals = resolved.stream().map(HashEqualityCondition::left).toArray(Column[]::new);
    this.rightEquals = resolved.stream().map(HashEqualityCondition::right).toArray(Column[]::new);
    this.textFoldingStrategies =
        resolved.stream().map(HashEqualityCondition::textFoldingStrategy).toList();
  }

  /**
   * Runs the join.
   *
   * @param problemAggregator passed on to index construction and to {@code remainingMatcher}
   * @return the result assembled from the matched and (if requested) unmatched rows
   */
  @Override
  public JoinResult join(ProblemAggregator problemAggregator) {
    Context polyglotContext = Context.getCurrent();

    var leftIndex =
        MultiValueIndex.makeUnorderedIndex(
            leftEquals, leftEquals[0].getSize(), textFoldingStrategies, problemAggregator);
    var rightIndex =
        MultiValueIndex.makeUnorderedIndex(
            rightEquals, rightEquals[0].getSize(), textFoldingStrategies, problemAggregator);

    JoinResult.Builder builder = new JoinResult.Builder();
    for (var leftGroup : leftIndex.mapping().entrySet()) {
      UnorderedMultiValueKey key = leftGroup.getKey();
      List<Integer> leftRows = leftGroup.getValue();

      // A key with a null in any field cannot match anything, so skip the lookup entirely.
      List<Integer> rightRows = null;
      if (!key.hasAnyNulls()) {
        rightRows = rightIndex.get(key);
      }

      if (rightRows != null) {
        remainingMatcher.joinSubsets(leftRows, rightRows, builder, problemAggregator);
      } else if (joinKind.wantsLeftUnmatched) {
        for (int unmatchedLeft : leftRows) {
          builder.addUnmatchedLeftRow(unmatchedLeft);
          polyglotContext.safepoint();
        }
      }

      polyglotContext.safepoint();
    }

    if (joinKind.wantsRightUnmatched) {
      for (var rightGroup : rightIndex.mapping().entrySet()) {
        UnorderedMultiValueKey key = rightGroup.getKey();
        // A key with a null in any field cannot match anything.
        boolean wasCompletelyUnmatched = key.hasAnyNulls() || !leftIndex.contains(key);
        if (!wasCompletelyUnmatched) {
          continue;
        }
        for (int unmatchedRight : rightGroup.getValue()) {
          builder.addUnmatchedRightRow(unmatchedRight);
        }
      }
    }

    return builder.buildAndInvalidate();
  }

  /** Pairs the two columns of a hashable condition with the text folding it calls for. */
  private static HashEqualityCondition makeHashEqualityCondition(HashableCondition eq) {
    return switch (eq) {
      case Equals exact -> new HashEqualityCondition(
          exact.left(), exact.right(), TextFoldingStrategy.unicodeNormalizedFold);
      case EqualsIgnoreCase insensitive -> new HashEqualityCondition(
          insensitive.left(),
          insensitive.right(),
          TextFoldingStrategy.caseInsensitiveFold(insensitive.locale()));
    };
  }

  /** One equality condition: its two columns and the folding applied to text keys. */
  private record HashEqualityCondition(
      Column left, Column right, TextFoldingStrategy textFoldingStrategy) {}
}

View File

@ -0,0 +1,72 @@
package org.enso.table.data.table.join.hashing;
import java.util.List;
import org.enso.base.text.TextFoldingStrategy;
import org.enso.table.data.table.Column;
import org.enso.table.data.table.join.JoinStrategy;
import org.enso.table.data.table.join.conditions.Equals;
import org.enso.table.data.table.join.conditions.EqualsIgnoreCase;
import org.enso.table.data.table.join.conditions.HashableCondition;
/**
 * Holds what a hash join needs to know about its equality conditions: the key columns of the left
 * side, the corresponding key columns of the right side, and the text folding strategy for each
 * pair.
 */
public class HashJoinConfig {
  private final Column[] leftEquals;
  private final Column[] rightEquals;
  private final List<TextFoldingStrategy> textFoldingStrategies;

  /**
   * Derives the configuration from a list of hashable conditions.
   *
   * @param conditions the equality conditions; checked with {@code
   *     JoinStrategy.ensureConditionsNotEmpty} before anything else
   */
  public HashJoinConfig(List<HashableCondition> conditions) {
    JoinStrategy.ensureConditionsNotEmpty(conditions);

    List<HashEqualityCondition> resolved =
        conditions.stream().map(HashJoinConfig::makeHashEqualityCondition).toList();

    int count = resolved.size();
    Column[] lefts = new Column[count];
    Column[] rights = new Column[count];
    for (int i = 0; i < count; i++) {
      HashEqualityCondition condition = resolved.get(i);
      lefts[i] = condition.left();
      rights[i] = condition.right();
    }

    this.leftEquals = lefts;
    this.rightEquals = rights;
    this.textFoldingStrategies =
        resolved.stream().map(HashEqualityCondition::textFoldingStrategy).toList();
  }

  /**
   * Creates a configuration directly from already prepared parts; the arguments are stored as
   * given.
   *
   * @param leftEquals key columns of the left side
   * @param rightEquals key columns of the right side
   * @param textFoldingStrategies folding strategy for each key column pair
   */
  public HashJoinConfig(
      Column[] leftEquals, Column[] rightEquals, List<TextFoldingStrategy> textFoldingStrategies) {
    this.leftEquals = leftEquals;
    this.rightEquals = rightEquals;
    this.textFoldingStrategies = textFoldingStrategies;
  }

  /** Returns the key columns of the left side. */
  public Column[] getLeftEquals() {
    return this.leftEquals;
  }

  /** Returns the key columns of the right side. */
  public Column[] getRightEquals() {
    return this.rightEquals;
  }

  /** Returns the size of the first left key column. */
  public int getLeftNumRows() {
    Column firstLeft = this.leftEquals[0];
    return firstLeft.getSize();
  }

  /** Returns the size of the first right key column. */
  public int getRightNumRows() {
    Column firstRight = this.rightEquals[0];
    return firstRight.getSize();
  }

  /** Returns the text folding strategies, one per key column pair. */
  public List<TextFoldingStrategy> getTextFoldingStrategies() {
    return this.textFoldingStrategies;
  }

  /** Pairs the two columns of a hashable condition with the text folding it calls for. */
  private static HashEqualityCondition makeHashEqualityCondition(HashableCondition eq) {
    return switch (eq) {
      case Equals exact -> new HashEqualityCondition(
          exact.left(), exact.right(), TextFoldingStrategy.unicodeNormalizedFold);
      case EqualsIgnoreCase insensitive -> new HashEqualityCondition(
          insensitive.left(),
          insensitive.right(),
          TextFoldingStrategy.caseInsensitiveFold(insensitive.locale()));
    };
  }

  /** One equality condition: its two columns and the folding applied to text keys. */
  private record HashEqualityCondition(
      Column left, Column right, TextFoldingStrategy textFoldingStrategy) {}
}

View File

@ -0,0 +1,177 @@
package org.enso.table.data.table.join.hashing;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.index.MultiValueIndex;
import org.enso.table.data.index.UnorderedMultiValueKey;
import org.enso.table.data.table.Column;
import org.enso.table.data.table.join.JoinKind;
import org.enso.table.data.table.join.JoinResult;
import org.enso.table.data.table.join.JoinStrategy;
import org.enso.table.data.table.join.conditions.HashableCondition;
import org.enso.table.problems.ColumnAggregatedProblemAggregator;
import org.enso.table.problems.ProblemAggregator;
import org.graalvm.polyglot.Context;
/**
 * A strategy that uses a hash-map to perform join on the equality conditions.
 *
 * <p>Only one side is indexed: the side with fewer rows. The other side is walked row by row and
 * each row's key is looked up in that index. The algorithm is written as if the left table were
 * the bigger one; when it is not, the two sides are swapped internally and the result builder
 * swaps the indices back as rows are added.
 */
public class SimpleHashJoin implements JoinStrategy {
  private final HashJoinConfig hashJoinConfig;
  private final JoinKind joinKind;

  /** Whether the two sides were swapped so that the internal "left" is the bigger table. */
  private final boolean flipLeftAndRight;

  /**
   * @param conditions the equality conditions to hash on
   * @param joinKind the join kind as requested by the caller (in terms of the caller's left and
   *     right)
   */
  public SimpleHashJoin(List<HashableCondition> conditions, JoinKind joinKind) {
    var requestedConfig = new HashJoinConfig(conditions);
    // algorithm assumes that left table is the big table.
    // If not we will flip the left and right tables over to do the join
    this.flipLeftAndRight = requestedConfig.getLeftNumRows() < requestedConfig.getRightNumRows();
    if (flipLeftAndRight) {
      // flip left and right inside of HashJoinConfig
      this.hashJoinConfig =
          new HashJoinConfig(
              requestedConfig.getRightEquals(),
              requestedConfig.getLeftEquals(),
              requestedConfig.getTextFoldingStrategies());
      this.joinKind = flipJoinKind(joinKind);
    } else {
      this.hashJoinConfig = requestedConfig;
      this.joinKind = joinKind;
    }
  }

  /**
   * Runs the join.
   *
   * <p>A fresh result builder is created on every call, so the outcome of one call does not depend
   * on a builder that an earlier call has already built and invalidated.
   *
   * @param problemAggregator receives problems from index construction and, through a wrapping
   *     {@code ColumnAggregatedProblemAggregator}, from the per-row key checks
   * @return the join result, expressed in the caller's original left/right orientation
   */
  @Override
  public JoinResult join(ProblemAggregator problemAggregator) {
    // algorithm assumes that left table is the big table.
    // If not we have flipped the tables round to do the join.
    // If you are debugging your left table might not be your left table here.
    // The result builder flips the indexes back as you add them
    assert hashJoinConfig.getLeftNumRows() >= hashJoinConfig.getRightNumRows();

    var resultBuilder = new SimpleHashJoinResultBuilder(flipLeftAndRight);
    var groupingProblemAggregator = new ColumnAggregatedProblemAggregator(problemAggregator);

    var rightIndex =
        MultiValueIndex.makeUnorderedIndex(
            hashJoinConfig.getRightEquals(),
            hashJoinConfig.getRightNumRows(),
            hashJoinConfig.getTextFoldingStrategies(),
            problemAggregator);

    var storage =
        Arrays.stream(hashJoinConfig.getLeftEquals())
            .map(Column::getStorage)
            .toArray(Storage[]::new);
    Set<UnorderedMultiValueKey> matchedRightKeys = new HashSet<>();

    Context context = Context.getCurrent();
    int leftNumRows = hashJoinConfig.getLeftNumRows();
    for (int leftRow = 0; leftRow < leftNumRows; leftRow++) {
      var leftKey = makeLeftKey(storage, leftRow, groupingProblemAggregator);
      // If any field of the key is null, it cannot match anything.
      List<Integer> rightRows = leftKey.hasAnyNulls() ? null : rightIndex.get(leftKey);
      if (rightRows != null) {
        if (joinKind.wantsCommon) {
          addAll(leftRow, rightRows, resultBuilder);
        }
        if (joinKind.wantsRightUnmatched) {
          // Remember the key so its right rows are not reported as unmatched afterwards.
          matchedRightKeys.add(leftKey);
        }
      } else if (joinKind.wantsLeftUnmatched) {
        resultBuilder.addUnmatchedLeftRow(leftRow);
      }
      context.safepoint();
    }

    if (joinKind.wantsRightUnmatched) {
      addUnmatchedRightRows(rightIndex, matchedRightKeys, resultBuilder);
    }

    return resultBuilder.buildAndInvalidate();
  }

  /** Adds every right row whose key was never hit by a left row as an unmatched right row. */
  private static void addUnmatchedRightRows(
      MultiValueIndex<UnorderedMultiValueKey> rightIndex,
      Set<UnorderedMultiValueKey> matchedRightKeys,
      SimpleHashJoinResultBuilder resultBuilder) {
    Context context = Context.getCurrent();
    for (var rightEntry : rightIndex.mapping().entrySet()) {
      UnorderedMultiValueKey rightKey = rightEntry.getKey();
      // Keys containing nulls are never recorded as matched, so they always end up here.
      boolean wasCompletelyUnmatched = !matchedRightKeys.contains(rightKey);
      if (wasCompletelyUnmatched) {
        for (int rightRow : rightEntry.getValue()) {
          resultBuilder.addUnmatchedRightRow(rightRow);
          context.safepoint();
        }
      }
      context.safepoint();
    }
  }

  /**
   * Builds the key of one row of the (internal) left side and runs the floating point equality
   * check on it.
   *
   * @param storage storages of the left key columns
   * @param rowNumber the row to build the key for
   * @param groupingProblemAggregator receives what the floating point equality check reports
   * @return the key for {@code rowNumber}
   */
  public UnorderedMultiValueKey makeLeftKey(
      Storage[] storage,
      int rowNumber,
      ColumnAggregatedProblemAggregator groupingProblemAggregator) {
    var leftEquals = hashJoinConfig.getLeftEquals();
    var leftKey =
        new UnorderedMultiValueKey(storage, rowNumber, hashJoinConfig.getTextFoldingStrategies());
    leftKey.checkAndReportFloatingEquality(
        groupingProblemAggregator, columnIx -> leftEquals[columnIx].getName());
    return leftKey;
  }

  /** Pairs one left row with every row of the matching right group. */
  private static void addAll(
      int leftRow, List<Integer> rightGroup, SimpleHashJoinResultBuilder resultBuilder) {
    Context context = Context.getCurrent();
    for (var rightRow : rightGroup) {
      resultBuilder.addMatchedRowsPair(leftRow, rightRow);
      context.safepoint();
    }
    context.safepoint();
  }

  /** Translates a join kind to the equivalent one after the two sides have been swapped. */
  private static JoinKind flipJoinKind(JoinKind joinKind) {
    return switch (joinKind) {
      case LEFT_OUTER -> JoinKind.RIGHT_OUTER;
      case RIGHT_OUTER -> JoinKind.LEFT_OUTER;
      case LEFT_ANTI -> JoinKind.RIGHT_ANTI;
      case RIGHT_ANTI -> JoinKind.LEFT_ANTI;
      // The remaining kinds treat both sides alike, so swapping does not change them.
      default -> joinKind;
    };
  }

  /**
   * Wraps a {@code JoinResult.Builder} and, when the sides were swapped, swaps them back on every
   * added row so the final result is in the caller's orientation.
   *
   * <p>Static on purpose: it needs nothing from the enclosing {@code SimpleHashJoin} instance.
   */
  private static final class SimpleHashJoinResultBuilder {
    private final JoinResult.Builder delegate;
    private final boolean flipLeftAndRight;

    SimpleHashJoinResultBuilder(boolean flipLeftAndRight) {
      this.flipLeftAndRight = flipLeftAndRight;
      this.delegate = new JoinResult.Builder();
    }

    void addMatchedRowsPair(int leftIndex, int rightIndex) {
      if (flipLeftAndRight) {
        delegate.addMatchedRowsPair(rightIndex, leftIndex);
      } else {
        delegate.addMatchedRowsPair(leftIndex, rightIndex);
      }
    }

    void addUnmatchedLeftRow(int leftIndex) {
      // Use the builder's own unmatched-row methods instead of encoding "no match" by hand.
      if (flipLeftAndRight) {
        delegate.addUnmatchedRightRow(leftIndex);
      } else {
        delegate.addUnmatchedLeftRow(leftIndex);
      }
    }

    void addUnmatchedRightRow(int rightIndex) {
      if (flipLeftAndRight) {
        delegate.addUnmatchedLeftRow(rightIndex);
      } else {
        delegate.addUnmatchedRightRow(rightIndex);
      }
    }

    JoinResult buildAndInvalidate() {
      return delegate.buildAndInvalidate();
    }
  }
}