Implement relational NULL/Nothing for join for in-memory tables (#8849)

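Implements relational `NULL` semantics for `Nothing` in joins of in-memory tables, for all `Join_Kind`s: a join key containing `Nothing` never matches any row, not even another `Nothing`.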
This commit is contained in:
GregoryTravis 2024-01-29 11:19:07 -05:00 committed by GitHub
parent 0b16db4399
commit 7436848e90
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 144 additions and 67 deletions

View File

@ -603,8 +603,10 @@
- [Added text_length to Column][8606]
- [Added none delimiter option for Data.Read][8627]
- [Added text_left and text_right to Column][8691]
- [Implement relational `NULL` semantics for `Nothing` for in-memory Column
operations.][5156]
- [Implement relational `NULL` semantics for `Nothing` for in-memory `Column`
operations.][8816]
- [Implement relational `NULL` semantics for `Nothing` for in-memory `Table`
join operations.][8849]
- [Attach a warning when Nothing is used as a value in a comparison or `is_in`
`Filter_Condition`.][8865]
@ -774,7 +776,6 @@
[4120]: https://github.com/enso-org/enso/pull/4120
[4050]: https://github.com/enso-org/enso/pull/4050
[4072]: https://github.com/enso-org/enso/pull/4072
[5156]: https://github.com/enso-org/enso/pull/5156
[5582]: https://github.com/enso-org/enso/pull/5582
[5645]: https://github.com/enso-org/enso/pull/5645
[5646]: https://github.com/enso-org/enso/pull/5646
@ -871,6 +872,8 @@
[8606]: https://github.com/enso-org/enso/pull/8606
[8627]: https://github.com/enso-org/enso/pull/8627
[8691]: https://github.com/enso-org/enso/pull/8691
[8816]: https://github.com/enso-org/enso/pull/8816
[8849]: https://github.com/enso-org/enso/pull/8849
[8865]: https://github.com/enso-org/enso/pull/8865
#### Enso Compiler

View File

@ -64,7 +64,8 @@ public class HashJoin implements JoinStrategy {
for (var leftEntry : leftIndex.mapping().entrySet()) {
UnorderedMultiValueKey leftKey = leftEntry.getKey();
List<Integer> leftRows = leftEntry.getValue();
List<Integer> rightRows = rightIndex.get(leftKey);
// If any field of the key is null, it cannot match anything.
List<Integer> rightRows = leftKey.hasAnyNulls() ? null : rightIndex.get(leftKey);
if (rightRows != null) {
remainingMatcher.joinSubsets(leftRows, rightRows, resultBuilder, problemAggregator);
@ -83,7 +84,9 @@ public class HashJoin implements JoinStrategy {
if (joinKind.wantsRightUnmatched) {
for (var rightEntry : rightIndex.mapping().entrySet()) {
UnorderedMultiValueKey rightKey = rightEntry.getKey();
boolean wasCompletelyUnmatched = !leftIndex.contains(rightKey);
// Relational NULL semantics: a key with any null field cannot match anything,
// so its rows are always unmatched; the left index is consulted only for null-free keys.
boolean wasCompletelyUnmatched = rightKey.hasAnyNulls() || !leftIndex.contains(rightKey);
if (wasCompletelyUnmatched) {
for (int rightRow : rightEntry.getValue()) {
resultBuilder.addUnmatchedRightRow(rightRow);

View File

@ -12,6 +12,8 @@ from Standard.Test_New import all
from project.Common_Table_Operations.Util import expect_column_names, run_default_backend, within_table
main = run_default_backend add_specs
type My_Type
Value x y
@ -24,7 +26,6 @@ type My_Type_Comparator
Comparable.from (_:My_Type) = My_Type_Comparator
type Data
Value ~data
@ -50,7 +51,7 @@ add_specs suite_builder setup =
table_builder = setup.table_builder
create_connection_fn = setup.create_connection_func
materialize = setup.materialize
db_todo = if setup.is_database.not then Nothing else "ToDo: handling NULLs in equality conditions."
suite_builder.group prefix+"Table.join" group_builder->
data = Data.setup create_connection_fn table_builder
@ -381,17 +382,16 @@ add_specs suite_builder setup =
t3.at "Right X" . to_vector . should_equal [2, 3, Nothing, 0, 1, 1, 2]
t3.at "Right A" . to_vector . should_equal ["X", "E", Nothing, "B", "C", "C", "D"]
t4 = table_builder [["X", [Nothing, "a", "B"]], ["Y", ["ą", "b", Nothing]], ["Z", [1, 2, 3]]]
t4 = table_builder [["X", [Nothing, "a", "B", "c"]], ["Y", ["ą", "b", "C", Nothing]], ["Z", [1, 2, 3, 4]]]
t5 = t4.join t4 join_kind=Join_Kind.Inner on=(Join_Condition.Equals_Ignore_Case left="Y" right="X") |> materialize |> _.order_by ["Y"]
expect_column_names ["X", "Y", "Z", "Right X", "Right Y", "Right Z"] t5
# TODO enable once we handle nothing properly
# t5.at "Y" . to_vector . should_equal [Nothing, "b"]
# t5.at "Right X" . to_vector . should_equal [Nothing, "B"]
# t5.at "X" . to_vector . should_equal ["B", "a"]
# t5.at "Z" . to_vector . should_equal [3, 2]
# t5.at "Right Y" . to_vector . should_equal ["ą", Nothing]
# t5.at "Right Z" . to_vector . should_equal [1, 3]
within_table t5 <|
t5.at "X" . to_vector . should_equal ["B", "a"]
t5.at "Y" . to_vector . should_equal ["C", "b"]
t5.at "Z" . to_vector . should_equal [3, 2]
t5.at "Right X" . to_vector . should_equal ["c", "B"]
t5.at "Right Y" . to_vector . should_equal [Nothing, "C"]
t5.at "Right Z" . to_vector . should_equal [4, 3]
group_builder.specify "should gracefully handle unmatched columns in Join_Conditions" <|
t1 = table_builder [["X", [1, 2]], ["Y", [3, 4]]]
@ -486,38 +486,111 @@ add_specs suite_builder setup =
expected_problems = [Floating_Point_Equality.Error "Z", Floating_Point_Equality.Error "X"]
Problems.get_attached_warnings r3 . should_contain_the_same_elements_as expected_problems
group_builder.specify "should correctly handle nulls in equality conditions" pending=db_todo <|
t1 = table_builder [["X", ["A", Nothing, "a", Nothing, "ą"]], ["Y", [0, 1, 2, 3, 4]]]
t2 = table_builder [["X", ["a", Nothing, Nothing]], ["Z", [10, 20, 30]]]
group_builder.specify "should correctly handle nulls in equality conditions" <|
t1 = table_builder [["X", ["A", Nothing, "a", Nothing, "ą", "b"]], ["Y", [0, 1, 2, 3, 4, 5]]]
t2 = table_builder [["X", ["a", Nothing, Nothing, "b"]], ["Z", [10, 20, 30, 50]]]
r1 = t1.join t2 join_kind=Join_Kind.Inner |> materialize |> _.order_by ["Y"]
expect_column_names ["X", "Y", "Z"] r1
r1.at "X" . to_vector . should_equal [Nothing, Nothing, "a", Nothing, Nothing]
r1.at "Y" . to_vector . should_equal [1, 1, 2, 3, 3]
r1.at "Z" . to_vector . should_equal [20, 30, 10, 20, 30]
r1.at "X" . to_vector . should_equal ["a", "b"]
r1.at "Y" . to_vector . should_equal [2, 5]
r1.at "Z" . to_vector . should_equal [10, 50]
group_builder.specify "should correctly handle nulls in case-insensitive equality conditions" pending=db_todo <|
t1 = table_builder [["X", ["A", Nothing, "a", Nothing, "ą"]], ["Y", [0, 1, 2, 3, 4]]]
t2 = table_builder [["X", ["a", Nothing, Nothing]], ["Z", [10, 20, 30]]]
group_builder.specify "should correctly handle nulls in equality conditions in outer joins" <|
t1 = table_builder [["X", ["A", Nothing, "a", Nothing, "ą", "b"]], ["Y", [0, 1, 2, 3, 4, 5]]]
t2 = table_builder [["X", ["a", Nothing, Nothing, "b"]], ["Z", [10, 20, 30, 50]]]
r2 = t1.join t2 join_kind=Join_Kind.Left_Outer |> materialize |> _.order_by ["Y"]
expect_column_names ["X", "Y", "Right X", "Z"] r2
vs2 = r2 . rows . map .to_vector
within_table r2 <|
vs2.at 0 . to_vector . should_equal ["A", 0, Nothing, Nothing]
vs2.at 1 . to_vector . should_equal [Nothing, 1, Nothing, Nothing]
vs2.at 2 . to_vector . should_equal ["a", 2, "a", 10]
vs2.at 3 . to_vector . should_equal [Nothing, 3, Nothing, Nothing]
vs2.at 4 . to_vector . should_equal ["ą", 4, Nothing, Nothing]
vs2.at 5 . to_vector . should_equal ["b", 5, "b", 50]
r3 = t1.join t2 join_kind=Join_Kind.Right_Outer |> materialize |> _.order_by ["Z"]
expect_column_names ["X", "Y", "Right X", "Z"] r3
vs3 = r3 . rows . map .to_vector
within_table r3 <|
vs3.at 0 . to_vector . should_equal ["a", 2, "a", 10]
vs3.at 1 . to_vector . should_equal [Nothing, Nothing, Nothing, 20]
vs3.at 2 . to_vector . should_equal [Nothing, Nothing, Nothing, 30]
vs3.at 3 . to_vector . should_equal ["b", 5, "b", 50]
group_builder.specify "should correctly handle nulls in case-insensitive equality conditions" <|
t1 = table_builder [["X", ["A", Nothing, "a", Nothing, "ą", "b"]], ["Y", [0, 1, 2, 3, 4, 5]]]
t2 = table_builder [["X", ["a", Nothing, Nothing, "b"]], ["Z", [10, 20, 30, 50]]]
r1 = t1.join t2 join_kind=Join_Kind.Inner on=(Join_Condition.Equals_Ignore_Case "X") |> materialize |> _.order_by ["Y"]
expect_column_names ["X", "Y", "Right X", "Z"] r1
r1.at "X" . to_vector . should_equal ["A", Nothing, Nothing, "a", Nothing, Nothing]
r1.at "Right X" . to_vector . should_equal ["a", Nothing, Nothing, "a", Nothing, Nothing]
r1.at "Y" . to_vector . should_equal [0, 1, 1, 2, 3, 3]
r1.at "Z" . to_vector . should_equal [10, 20, 30, 10, 20, 30]
r1.at "X" . to_vector . should_equal ["A", "a", "b"]
r1.at "Y" . to_vector . should_equal [0, 2, 5]
r1.at "Right X" . to_vector . should_equal ["a", "a", "b"]
r1.at "Z" . to_vector . should_equal [10, 10, 50]
group_builder.specify "should correctly handle nulls in case-insensitive equality conditions in outer joins" <|
t1 = table_builder [["X", ["A", Nothing, "a", Nothing, "ą", "b"]], ["Y", [0, 1, 2, 3, 4, 5]]]
t2 = table_builder [["X", ["a", Nothing, Nothing, "b"]], ["Z", [10, 20, 30, 50]]]
r2 = t1.join t2 join_kind=Join_Kind.Left_Outer on=(Join_Condition.Equals_Ignore_Case "X") |> materialize |> _.order_by ["Y"]
expect_column_names ["X", "Y", "Right X", "Z"] r2
vs2 = r2 . rows . map .to_vector
within_table r2 <|
vs2.at 0 . to_vector . should_equal ["A", 0, "a", 10]
vs2.at 1 . to_vector . should_equal [Nothing, 1, Nothing, Nothing]
vs2.at 2 . to_vector . should_equal ["a", 2, "a", 10]
vs2.at 3 . to_vector . should_equal [Nothing, 3, Nothing, Nothing]
vs2.at 4 . to_vector . should_equal ["ą", 4, Nothing, Nothing]
vs2.at 5 . to_vector . should_equal ["b", 5, "b", 50]
r3 = t1.join t2 join_kind=Join_Kind.Right_Outer on=(Join_Condition.Equals_Ignore_Case "X") |> materialize |> _.order_by ["Z", "Y"]
expect_column_names ["X", "Y", "Right X", "Z"] r3
vs3 = r3 . rows . map .to_vector
within_table r3 <|
vs3.at 0 . to_vector . should_equal ["A", 0, "a", 10]
vs3.at 1 . to_vector . should_equal ["a", 2, "a", 10]
vs3.at 2 . to_vector . should_equal [Nothing, Nothing, Nothing, 20]
vs3.at 3 . to_vector . should_equal [Nothing, Nothing, Nothing, 30]
vs3.at 4 . to_vector . should_equal ["b", 5, "b", 50]
group_builder.specify "should correctly handle nulls in Between conditions" <|
t1 = table_builder [["X", [1, Nothing, 2, Nothing]], ["Y", [0, 1, 2, 3]]]
t2 = table_builder [["l", [Nothing, 0, 1]], ["u", [100, 10, Nothing]], ["Z", [10, 20, 30]]]
t1 = table_builder [["X", [1, Nothing, 2, Nothing, 20]], ["Y", [0, 1, 2, 3, 4]]]
t2 = table_builder [["l", [Nothing, 0, 1, 20]], ["u", [100, 10, Nothing, 100]], ["Z", [10, 20, 30, 40]]]
r1 = t1.join t2 join_kind=Join_Kind.Inner on=(Join_Condition.Between "X" "l" "u") |> materialize |> _.order_by ["Y"]
expect_column_names ["X", "Y", "l", "u", "Z"] r1
r1.at "X" . to_vector . should_equal [1, 2]
r1.at "Y" . to_vector . should_equal [0, 2]
r1.at "l" . to_vector . should_equal [0, 0]
r1.at "u" . to_vector . should_equal [10, 10]
r1.at "Z" . to_vector . should_equal [20, 20]
vs1 = r1 . rows . map .to_vector
within_table r1 <|
vs1.at 0 . should_equal [1, 0, 0, 10, 20]
vs1.at 1 . should_equal [2, 2, 0, 10, 20]
vs1.at 2 . should_equal [20, 4, 20, 100, 40]
group_builder.specify "should correctly handle nulls in Between conditions in outer joins" <|
t1 = table_builder [["X", [1, Nothing, 2, Nothing, 20]], ["Y", [0, 1, 2, 3, 4]]]
t2 = table_builder [["l", [Nothing, 0, 1, 20]], ["u", [100, 10, Nothing, 100]], ["Z", [10, 20, 30, 40]]]
r1 = t1.join t2 join_kind=Join_Kind.Left_Outer on=(Join_Condition.Between "X" "l" "u") |> materialize |> _.order_by ["Y"]
expect_column_names ["X", "Y", "l", "u", "Z"] r1
vs1 = r1 . rows . map .to_vector
within_table r1 <|
vs1.at 0 . should_equal [1, 0, 0, 10, 20]
vs1.at 1 . should_equal [Nothing, 1, Nothing, Nothing, Nothing]
vs1.at 2 . should_equal [2, 2, 0, 10, 20]
vs1.at 3 . should_equal [Nothing, 3, Nothing, Nothing, Nothing]
vs1.at 4 . should_equal [20, 4, 20, 100, 40]
r2 = t1.join t2 join_kind=Join_Kind.Right_Outer on=(Join_Condition.Between "X" "l" "u") |> materialize |> _.order_by ["Z", "Y"]
expect_column_names ["X", "Y", "l", "u", "Z"] r2
vs2 = r2 . rows . map .to_vector
within_table r2 <|
vs2.at 0 . should_equal [Nothing, Nothing, Nothing, 100, 10]
vs2.at 1 . should_equal [1, 0, 0, 10, 20]
vs2.at 2 . should_equal [2, 2, 0, 10, 20]
vs2.at 3 . should_equal [Nothing, Nothing, 1, Nothing, 30]
vs2.at 4 . should_equal [20, 4, 20, 100, 40]
group_builder.specify "should rename columns of the right table to avoid duplicates" <|
t1 = table_builder [["X", [1, 2]], ["Y", [3, 4]], ["Right Y", [5, 6]]]
@ -576,7 +649,7 @@ add_specs suite_builder setup =
data.t1.join error . should_fail_with Illegal_State
data.t1.join data.t2 on=[error, "X"] . should_fail_with Illegal_State
group_builder.specify "should correctly handle all null rows" pending=db_todo <|
group_builder.specify "should correctly handle all null rows" <|
t1 = table_builder [["A", [Nothing, 2, Nothing, 1]], ["B", [Nothing, 3, 4, 7]]]
t2 = table_builder [["C", [Nothing, 2, Nothing, 4]], ["D", [Nothing, 5, 6, Nothing]]]
@ -584,12 +657,8 @@ add_specs suite_builder setup =
expect_column_names ["A", "B", "C", "D"] t3
r3 = materialize t3 . order_by ["A", "B", "D"] . rows . map .to_vector
within_table t3 <|
r3.length . should_equal 5
r3.at 0 . should_equal [Nothing, Nothing, Nothing, Nothing]
r3.at 1 . should_equal [Nothing, Nothing, Nothing, 6]
r3.at 2 . should_equal [Nothing, 4, Nothing, Nothing]
r3.at 3 . should_equal [Nothing, 4, Nothing, 6]
r3.at 4 . should_equal [2, 3, 2, 5]
r3.length . should_equal 1
r3.at 0 . should_equal [2, 3, 2, 5]
t4 = t1.join t2 on=[Join_Condition.Equals "A" "C"] join_kind=Join_Kind.Full
expect_column_names ["A", "B", "C", "D"] t4
@ -597,10 +666,10 @@ add_specs suite_builder setup =
within_table t4 <|
r4.length . should_equal 7
r4.at 0 . should_equal [Nothing, Nothing, Nothing, Nothing]
r4.at 1 . should_equal [Nothing, Nothing, 4, Nothing]
r4.at 2 . should_equal [Nothing, Nothing, Nothing, 6]
r4.at 3 . should_equal [Nothing, 4, Nothing, Nothing]
r4.at 4 . should_equal [Nothing, 4, Nothing, 6]
r4.at 1 . should_equal [Nothing, Nothing, Nothing, Nothing]
r4.at 2 . should_equal [Nothing, Nothing, 4, Nothing]
r4.at 3 . should_equal [Nothing, Nothing, Nothing, 6]
r4.at 4 . should_equal [Nothing, 4, Nothing, Nothing]
r4.at 5 . should_equal [1, 7, Nothing, Nothing]
r4.at 6 . should_equal [2, 3, 2, 5]
@ -608,37 +677,39 @@ add_specs suite_builder setup =
expect_column_names ["A", "B", "C", "D"] t4_2
r4_2 = materialize t4_2 . order_by ["A", "B", "D", "C"] . rows . map .to_vector
within_table t4_2 <|
r4_2.length . should_equal 6
r4_2.length . should_equal 4
r4_2.at 0 . should_equal [Nothing, Nothing, Nothing, Nothing]
r4_2.at 1 . should_equal [Nothing, Nothing, Nothing, 6]
r4_2.at 2 . should_equal [Nothing, 4, Nothing, Nothing]
r4_2.at 3 . should_equal [Nothing, 4, Nothing, 6]
r4_2.at 4 . should_equal [1, 7, Nothing, Nothing]
r4_2.at 5 . should_equal [2, 3, 2, 5]
r4_2.at 1 . should_equal [Nothing, 4, Nothing, Nothing]
r4_2.at 2 . should_equal [1, 7, Nothing, Nothing]
r4_2.at 3 . should_equal [2, 3, 2, 5]
t4_3 = t1.join t2 on=[Join_Condition.Equals "A" "C"] join_kind=Join_Kind.Right_Outer
expect_column_names ["A", "B", "C", "D"] t4_3
r4_3 = materialize t4_3 . order_by ["A", "B", "D", "C"] . rows . map .to_vector
within_table t4_3 <|
r4_3.length . should_equal 6
r4_3 = materialize t4_3 . order_by ["A", "B", "C", "D"] . rows . map .to_vector
# Pass the joined table `t4_3` (not the extracted row vector `r4_3`) as the context, matching the sibling checks in this spec.
within_table t4_3 <|
r4_3.length . should_equal 4
r4_3.at 0 . should_equal [Nothing, Nothing, Nothing, Nothing]
r4_3.at 1 . should_equal [Nothing, Nothing, 4, Nothing]
r4_3.at 2 . should_equal [Nothing, Nothing, Nothing, 6]
r4_3.at 3 . should_equal [Nothing, 4, Nothing, Nothing]
r4_3.at 4 . should_equal [Nothing, 4, Nothing, 6]
r4_3.at 5 . should_equal [2, 3, 2, 5]
r4_3.at 1 . should_equal [Nothing, Nothing, Nothing, 6]
r4_3.at 2 . should_equal [Nothing, Nothing, 4, Nothing]
r4_3.at 3 . should_equal [2, 3, 2, 5]
t5 = t1.join t2 on=[Join_Condition.Equals "A" "C"] join_kind=Join_Kind.Left_Exclusive
expect_column_names ["A", "B"] t5
r5 = materialize t5 . order_by ["A", "B"] . rows . map .to_vector
within_table t5 <|
expect_column_names ["A", "B"] t5
t5.at "A" . to_vector . should_equal [1]
t5.at "B" . to_vector . should_equal [7]
r5.length . should_equal 3
r5.at 0 . should_equal [Nothing, Nothing]
r5.at 1 . should_equal [Nothing, 4]
r5.at 2 . should_equal [1, 7]
t6 = t1.join t2 on=[Join_Condition.Equals "A" "C"] join_kind=Join_Kind.Right_Exclusive
expect_column_names ["C", "D"] t6
r6 = materialize t6 . order_by ["C", "D"] . rows . map .to_vector
within_table t6 <|
expect_column_names ["C", "D"] t6
t6.at "C" . to_vector . should_equal [4]
t6.at "D" . to_vector . should_equal [Nothing]
r6.length . should_equal 3
r6.at 0 . should_equal [Nothing, Nothing]
r6.at 1 . should_equal [Nothing, 6]
r6.at 2 . should_equal [4, Nothing]
t7 = table_builder [["A", [Nothing, 2]], ["B", [Nothing, 3]]]
t8 = table_builder [["C", [2, 3]], ["D", [4, 5]]]