From 7436848e90c71184a2b95f1e1f88978413790b8d Mon Sep 17 00:00:00 2001 From: GregoryTravis Date: Mon, 29 Jan 2024 11:19:07 -0500 Subject: [PATCH] Implement relational NULL/Nothing for join for in-memory tables (#8849) Implements relational NULL for join, for all `Join_Kind`s. --- CHANGELOG.md | 9 +- .../data/table/join/hashing/HashJoin.java | 7 +- .../Join/Join_Spec.enso | 195 ++++++++++++------ 3 files changed, 144 insertions(+), 67 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5fb1a3873b..e9bb9065149 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -603,8 +603,10 @@ - [Added text_length to Column][8606] - [Added none delimiter option for Data.Read][8627] - [Added text_left and text_right to Column][8691] -- [Implement relational `NULL` semantics for `Nothing` for in-memory Column - operations.][5156] +- [Implement relational `NULL` semantics for `Nothing` for in-memory `Column` + operations.][8816] +- [Implement relational `NULL` semantics for `Nothing` for in-memory `Table` + join operations.][8849] - [Attach a warning when Nothing is used as a value in a comparison or `is_in` `Filter_Condition`.][8865] @@ -774,7 +776,6 @@ [4120]: https://github.com/enso-org/enso/pull/4120 [4050]: https://github.com/enso-org/enso/pull/4050 [4072]: https://github.com/enso-org/enso/pull/4072 -[5156]: https://github.com/enso-org/enso/pull/5156 [5582]: https://github.com/enso-org/enso/pull/5582 [5645]: https://github.com/enso-org/enso/pull/5645 [5646]: https://github.com/enso-org/enso/pull/5646 @@ -871,6 +872,8 @@ [8606]: https://github.com/enso-org/enso/pull/8606 [8627]: https://github.com/enso-org/enso/pull/8627 [8691]: https://github.com/enso-org/enso/pull/8691 +[8816]: https://github.com/enso-org/enso/pull/8816 +[8849]: https://github.com/enso-org/enso/pull/8849 [8865]: https://github.com/enso-org/enso/pull/8865 #### Enso Compiler diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/HashJoin.java 
b/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/HashJoin.java index 9d79d8ad5ec..711781ffc52 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/HashJoin.java +++ b/std-bits/table/src/main/java/org/enso/table/data/table/join/hashing/HashJoin.java @@ -64,7 +64,8 @@ public class HashJoin implements JoinStrategy { for (var leftEntry : leftIndex.mapping().entrySet()) { UnorderedMultiValueKey leftKey = leftEntry.getKey(); List leftRows = leftEntry.getValue(); - List rightRows = rightIndex.get(leftKey); + // If any field of the key is null, it cannot match anything. + List rightRows = leftKey.hasAnyNulls() ? null : rightIndex.get(leftKey); if (rightRows != null) { remainingMatcher.joinSubsets(leftRows, rightRows, resultBuilder, problemAggregator); @@ -83,7 +84,9 @@ public class HashJoin implements JoinStrategy { if (joinKind.wantsRightUnmatched) { for (var rightEntry : rightIndex.mapping().entrySet()) { UnorderedMultiValueKey rightKey = rightEntry.getKey(); - boolean wasCompletelyUnmatched = !leftIndex.contains(rightKey); + // If any field of the key is null, it cannot match anything. + boolean wasCompletelyUnmatched = + rightKey.hasAnyNulls() ? 
true : !leftIndex.contains(rightKey); if (wasCompletelyUnmatched) { for (int rightRow : rightEntry.getValue()) { resultBuilder.addUnmatchedRightRow(rightRow); diff --git a/test/Table_Tests/src/Common_Table_Operations/Join/Join_Spec.enso b/test/Table_Tests/src/Common_Table_Operations/Join/Join_Spec.enso index c6639367ea1..174cc18eaa7 100644 --- a/test/Table_Tests/src/Common_Table_Operations/Join/Join_Spec.enso +++ b/test/Table_Tests/src/Common_Table_Operations/Join/Join_Spec.enso @@ -12,6 +12,8 @@ from Standard.Test_New import all from project.Common_Table_Operations.Util import expect_column_names, run_default_backend, within_table +main = run_default_backend add_specs + type My_Type Value x y @@ -24,7 +26,6 @@ type My_Type_Comparator Comparable.from (_:My_Type) = My_Type_Comparator - type Data Value ~data @@ -50,7 +51,7 @@ add_specs suite_builder setup = table_builder = setup.table_builder create_connection_fn = setup.create_connection_func materialize = setup.materialize - db_todo = if setup.is_database.not then Nothing else "ToDo: handling NULLs in equality conditions." + suite_builder.group prefix+"Table.join" group_builder-> data = Data.setup create_connection_fn table_builder @@ -381,17 +382,16 @@ add_specs suite_builder setup = t3.at "Right X" . to_vector . should_equal [2, 3, Nothing, 0, 1, 1, 2] t3.at "Right A" . to_vector . should_equal ["X", "E", Nothing, "B", "C", "C", "D"] - t4 = table_builder [["X", [Nothing, "a", "B"]], ["Y", ["ą", "b", Nothing]], ["Z", [1, 2, 3]]] + t4 = table_builder [["X", [Nothing, "a", "B", "c"]], ["Y", ["ą", "b", "C", Nothing]], ["Z", [1, 2, 3, 4]]] t5 = t4.join t4 join_kind=Join_Kind.Inner on=(Join_Condition.Equals_Ignore_Case left="Y" right="X") |> materialize |> _.order_by ["Y"] expect_column_names ["X", "Y", "Z", "Right X", "Right Y", "Right Z"] t5 - # TODO enable once we handle nothing properly - # t5.at "Y" . to_vector . should_equal [Nothing, "b"] - # t5.at "Right X" . to_vector . 
should_equal [Nothing, "B"] - - # t5.at "X" . to_vector . should_equal ["B", "a"] - # t5.at "Z" . to_vector . should_equal [3, 2] - # t5.at "Right Y" . to_vector . should_equal ["ą", Nothing] - # t5.at "Right Z" . to_vector . should_equal [1, 3] + within_table t5 <| + t5.at "X" . to_vector . should_equal ["B", "a"] + t5.at "Y" . to_vector . should_equal ["C", "b"] + t5.at "Z" . to_vector . should_equal [3, 2] + t5.at "Right X" . to_vector . should_equal ["c", "B"] + t5.at "Right Y" . to_vector . should_equal [Nothing, "C"] + t5.at "Right Z" . to_vector . should_equal [4, 3] group_builder.specify "should gracefully handle unmatched columns in Join_Conditions" <| t1 = table_builder [["X", [1, 2]], ["Y", [3, 4]]] @@ -486,38 +486,111 @@ add_specs suite_builder setup = expected_problems = [Floating_Point_Equality.Error "Z", Floating_Point_Equality.Error "X"] Problems.get_attached_warnings r3 . should_contain_the_same_elements_as expected_problems - group_builder.specify "should correctly handle nulls in equality conditions" pending=db_todo <| - t1 = table_builder [["X", ["A", Nothing, "a", Nothing, "ą"]], ["Y", [0, 1, 2, 3, 4]]] - t2 = table_builder [["X", ["a", Nothing, Nothing]], ["Z", [10, 20, 30]]] + group_builder.specify "should correctly handle nulls in equality conditions" <| + t1 = table_builder [["X", ["A", Nothing, "a", Nothing, "ą", "b"]], ["Y", [0, 1, 2, 3, 4, 5]]] + t2 = table_builder [["X", ["a", Nothing, Nothing, "b"]], ["Z", [10, 20, 30, 50]]] r1 = t1.join t2 join_kind=Join_Kind.Inner |> materialize |> _.order_by ["Y"] expect_column_names ["X", "Y", "Z"] r1 - r1.at "X" . to_vector . should_equal [Nothing, Nothing, "a", Nothing, Nothing] - r1.at "Y" . to_vector . should_equal [1, 1, 2, 3, 3] - r1.at "Z" . to_vector . should_equal [20, 30, 10, 20, 30] + r1.at "X" . to_vector . should_equal ["a", "b"] + r1.at "Y" . to_vector . should_equal [2, 5] + r1.at "Z" . to_vector . 
should_equal [10, 50] - group_builder.specify "should correctly handle nulls in case-insensitive equality conditions" pending=db_todo <| - t1 = table_builder [["X", ["A", Nothing, "a", Nothing, "ą"]], ["Y", [0, 1, 2, 3, 4]]] - t2 = table_builder [["X", ["a", Nothing, Nothing]], ["Z", [10, 20, 30]]] + group_builder.specify "should correctly handle nulls in equality conditions in outer joins" <| + t1 = table_builder [["X", ["A", Nothing, "a", Nothing, "ą", "b"]], ["Y", [0, 1, 2, 3, 4, 5]]] + t2 = table_builder [["X", ["a", Nothing, Nothing, "b"]], ["Z", [10, 20, 30, 50]]] + + r2 = t1.join t2 join_kind=Join_Kind.Left_Outer |> materialize |> _.order_by ["Y"] + expect_column_names ["X", "Y", "Right X", "Z"] r2 + vs2 = r2 . rows . map .to_vector + within_table r2 <| + vs2.at 0 . to_vector . should_equal ["A", 0, Nothing, Nothing] + vs2.at 1 . to_vector . should_equal [Nothing, 1, Nothing, Nothing] + vs2.at 2 . to_vector . should_equal ["a", 2, "a", 10] + vs2.at 3 . to_vector . should_equal [Nothing, 3, Nothing, Nothing] + vs2.at 4 . to_vector . should_equal ["ą", 4, Nothing, Nothing] + vs2.at 5 . to_vector . should_equal ["b", 5, "b", 50] + + r3 = t1.join t2 join_kind=Join_Kind.Right_Outer |> materialize |> _.order_by ["Z"] + expect_column_names ["X", "Y", "Right X", "Z"] r3 + vs3 = r3 . rows . map .to_vector + within_table r3 <| + vs3.at 0 . to_vector . should_equal ["a", 2, "a", 10] + vs3.at 1 . to_vector . should_equal [Nothing, Nothing, Nothing, 20] + vs3.at 2 . to_vector . should_equal [Nothing, Nothing, Nothing, 30] + vs3.at 3 . to_vector . 
should_equal ["b", 5, "b", 50] + + group_builder.specify "should correctly handle nulls in case-insensitive equality conditions" <| + t1 = table_builder [["X", ["A", Nothing, "a", Nothing, "ą", "b"]], ["Y", [0, 1, 2, 3, 4, 5]]] + t2 = table_builder [["X", ["a", Nothing, Nothing, "b"]], ["Z", [10, 20, 30, 50]]] r1 = t1.join t2 join_kind=Join_Kind.Inner on=(Join_Condition.Equals_Ignore_Case "X") |> materialize |> _.order_by ["Y"] expect_column_names ["X", "Y", "Right X", "Z"] r1 - r1.at "X" . to_vector . should_equal ["A", Nothing, Nothing, "a", Nothing, Nothing] - r1.at "Right X" . to_vector . should_equal ["a", Nothing, Nothing, "a", Nothing, Nothing] - r1.at "Y" . to_vector . should_equal [0, 1, 1, 2, 3, 3] - r1.at "Z" . to_vector . should_equal [10, 20, 30, 10, 20, 30] + r1.at "X" . to_vector . should_equal ["A", "a", "b"] + r1.at "Y" . to_vector . should_equal [0, 2, 5] + r1.at "Right X" . to_vector . should_equal ["a", "a", "b"] + r1.at "Z" . to_vector . should_equal [10, 10, 50] + + group_builder.specify "should correctly handle nulls in case-insensitive equality conditions in outer joins" <| + t1 = table_builder [["X", ["A", Nothing, "a", Nothing, "ą", "b"]], ["Y", [0, 1, 2, 3, 4, 5]]] + t2 = table_builder [["X", ["a", Nothing, Nothing, "b"]], ["Z", [10, 20, 30, 50]]] + + r2 = t1.join t2 join_kind=Join_Kind.Left_Outer on=(Join_Condition.Equals_Ignore_Case "X") |> materialize |> _.order_by ["Y"] + expect_column_names ["X", "Y", "Right X", "Z"] r2 + vs2 = r2 . rows . map .to_vector + within_table r2 <| + vs2.at 0 . to_vector . should_equal ["A", 0, "a", 10] + vs2.at 1 . to_vector . should_equal [Nothing, 1, Nothing, Nothing] + vs2.at 2 . to_vector . should_equal ["a", 2, "a", 10] + vs2.at 3 . to_vector . should_equal [Nothing, 3, Nothing, Nothing] + vs2.at 4 . to_vector . should_equal ["ą", 4, Nothing, Nothing] + vs2.at 5 . to_vector . 
should_equal ["b", 5, "b", 50] + + r3 = t1.join t2 join_kind=Join_Kind.Right_Outer on=(Join_Condition.Equals_Ignore_Case "X") |> materialize |> _.order_by ["Z", "Y"] + expect_column_names ["X", "Y", "Right X", "Z"] r3 + vs3 = r3 . rows . map .to_vector + within_table r3 <| + vs3.at 0 . to_vector . should_equal ["A", 0, "a", 10] + vs3.at 1 . to_vector . should_equal ["a", 2, "a", 10] + vs3.at 2 . to_vector . should_equal [Nothing, Nothing, Nothing, 20] + vs3.at 3 . to_vector . should_equal [Nothing, Nothing, Nothing, 30] + vs3.at 4 . to_vector . should_equal ["b", 5, "b", 50] group_builder.specify "should correctly handle nulls in Between conditions" <| - t1 = table_builder [["X", [1, Nothing, 2, Nothing]], ["Y", [0, 1, 2, 3]]] - t2 = table_builder [["l", [Nothing, 0, 1]], ["u", [100, 10, Nothing]], ["Z", [10, 20, 30]]] + t1 = table_builder [["X", [1, Nothing, 2, Nothing, 20]], ["Y", [0, 1, 2, 3, 4]]] + t2 = table_builder [["l", [Nothing, 0, 1, 20]], ["u", [100, 10, Nothing, 100]], ["Z", [10, 20, 30, 40]]] r1 = t1.join t2 join_kind=Join_Kind.Inner on=(Join_Condition.Between "X" "l" "u") |> materialize |> _.order_by ["Y"] expect_column_names ["X", "Y", "l", "u", "Z"] r1 - r1.at "X" . to_vector . should_equal [1, 2] - r1.at "Y" . to_vector . should_equal [0, 2] - r1.at "l" . to_vector . should_equal [0, 0] - r1.at "u" . to_vector . should_equal [10, 10] - r1.at "Z" . to_vector . should_equal [20, 20] + vs1 = r1 . rows . map .to_vector + within_table r1 <| + vs1.at 0 . should_equal [1, 0, 0, 10, 20] + vs1.at 1 . should_equal [2, 2, 0, 10, 20] + vs1.at 2 . 
should_equal [20, 4, 20, 100, 40] + + group_builder.specify "should correctly handle nulls in Between conditions in outer joins" <| + t1 = table_builder [["X", [1, Nothing, 2, Nothing, 20]], ["Y", [0, 1, 2, 3, 4]]] + t2 = table_builder [["l", [Nothing, 0, 1, 20]], ["u", [100, 10, Nothing, 100]], ["Z", [10, 20, 30, 40]]] + + r1 = t1.join t2 join_kind=Join_Kind.Left_Outer on=(Join_Condition.Between "X" "l" "u") |> materialize |> _.order_by ["Y"] + expect_column_names ["X", "Y", "l", "u", "Z"] r1 + vs1 = r1 . rows . map .to_vector + within_table r1 <| + vs1.at 0 . should_equal [1, 0, 0, 10, 20] + vs1.at 1 . should_equal [Nothing, 1, Nothing, Nothing, Nothing] + vs1.at 2 . should_equal [2, 2, 0, 10, 20] + vs1.at 3 . should_equal [Nothing, 3, Nothing, Nothing, Nothing] + vs1.at 4 . should_equal [20, 4, 20, 100, 40] + + r2 = t1.join t2 join_kind=Join_Kind.Right_Outer on=(Join_Condition.Between "X" "l" "u") |> materialize |> _.order_by ["Z", "Y"] + expect_column_names ["X", "Y", "l", "u", "Z"] r2 + vs2 = r2 . rows . map .to_vector + within_table r2 <| + vs2.at 0 . should_equal [Nothing, Nothing, Nothing, 100, 10] + vs2.at 1 . should_equal [1, 0, 0, 10, 20] + vs2.at 2 . should_equal [2, 2, 0, 10, 20] + vs2.at 3 . should_equal [Nothing, Nothing, 1, Nothing, 30] + vs2.at 4 . should_equal [20, 4, 20, 100, 40] group_builder.specify "should rename columns of the right table to avoid duplicates" <| t1 = table_builder [["X", [1, 2]], ["Y", [3, 4]], ["Right Y", [5, 6]]] @@ -576,7 +649,7 @@ add_specs suite_builder setup = data.t1.join error . should_fail_with Illegal_State data.t1.join data.t2 on=[error, "X"] . 
should_fail_with Illegal_State - group_builder.specify "should correctly handle all null rows" pending=db_todo <| + group_builder.specify "should correctly handle all null rows" <| t1 = table_builder [["A", [Nothing, 2, Nothing, 1]], ["B", [Nothing, 3, 4, 7]]] t2 = table_builder [["C", [Nothing, 2, Nothing, 4]], ["D", [Nothing, 5, 6, Nothing]]] @@ -584,12 +657,8 @@ add_specs suite_builder setup = expect_column_names ["A", "B", "C", "D"] t3 r3 = materialize t3 . order_by ["A", "B", "D"] . rows . map .to_vector within_table t3 <| - r3.length . should_equal 5 - r3.at 0 . should_equal [Nothing, Nothing, Nothing, Nothing] - r3.at 1 . should_equal [Nothing, Nothing, Nothing, 6] - r3.at 2 . should_equal [Nothing, 4, Nothing, Nothing] - r3.at 3 . should_equal [Nothing, 4, Nothing, 6] - r3.at 4 . should_equal [2, 3, 2, 5] + r3.length . should_equal 1 + r3.at 0 . should_equal [2, 3, 2, 5] t4 = t1.join t2 on=[Join_Condition.Equals "A" "C"] join_kind=Join_Kind.Full expect_column_names ["A", "B", "C", "D"] t4 @@ -597,10 +666,10 @@ add_specs suite_builder setup = within_table t4 <| r4.length . should_equal 7 r4.at 0 . should_equal [Nothing, Nothing, Nothing, Nothing] - r4.at 1 . should_equal [Nothing, Nothing, 4, Nothing] - r4.at 2 . should_equal [Nothing, Nothing, Nothing, 6] - r4.at 3 . should_equal [Nothing, 4, Nothing, Nothing] - r4.at 4 . should_equal [Nothing, 4, Nothing, 6] + r4.at 1 . should_equal [Nothing, Nothing, Nothing, Nothing] + r4.at 2 . should_equal [Nothing, Nothing, 4, Nothing] + r4.at 3 . should_equal [Nothing, Nothing, Nothing, 6] + r4.at 4 . should_equal [Nothing, 4, Nothing, Nothing] r4.at 5 . should_equal [1, 7, Nothing, Nothing] r4.at 6 . should_equal [2, 3, 2, 5] @@ -608,37 +677,39 @@ add_specs suite_builder setup = expect_column_names ["A", "B", "C", "D"] t4_2 r4_2 = materialize t4_2 . order_by ["A", "B", "D", "C"] . rows . map .to_vector within_table t4_2 <| - r4_2.length . should_equal 6 + r4_2.length . should_equal 4 r4_2.at 0 . 
should_equal [Nothing, Nothing, Nothing, Nothing] - r4_2.at 1 . should_equal [Nothing, Nothing, Nothing, 6] - r4_2.at 2 . should_equal [Nothing, 4, Nothing, Nothing] - r4_2.at 3 . should_equal [Nothing, 4, Nothing, 6] - r4_2.at 4 . should_equal [1, 7, Nothing, Nothing] - r4_2.at 5 . should_equal [2, 3, 2, 5] + r4_2.at 1 . should_equal [Nothing, 4, Nothing, Nothing] + r4_2.at 2 . should_equal [1, 7, Nothing, Nothing] + r4_2.at 3 . should_equal [2, 3, 2, 5] t4_3 = t1.join t2 on=[Join_Condition.Equals "A" "C"] join_kind=Join_Kind.Right_Outer expect_column_names ["A", "B", "C", "D"] t4_3 - r4_3 = materialize t4_3 . order_by ["A", "B", "D", "C"] . rows . map .to_vector - within_table t4_3 <| - r4_3.length . should_equal 6 + r4_3 = materialize t4_3 . order_by ["A", "B", "C", "D"] . rows . map .to_vector + within_table t4_3 <| + r4_3.length . should_equal 4 r4_3.at 0 . should_equal [Nothing, Nothing, Nothing, Nothing] - r4_3.at 1 . should_equal [Nothing, Nothing, 4, Nothing] - r4_3.at 2 . should_equal [Nothing, Nothing, Nothing, 6] - r4_3.at 3 . should_equal [Nothing, 4, Nothing, Nothing] - r4_3.at 4 . should_equal [Nothing, 4, Nothing, 6] - r4_3.at 5 . should_equal [2, 3, 2, 5] + r4_3.at 1 . should_equal [Nothing, Nothing, Nothing, 6] + r4_3.at 2 . should_equal [Nothing, Nothing, 4, Nothing] + r4_3.at 3 . should_equal [2, 3, 2, 5] t5 = t1.join t2 on=[Join_Condition.Equals "A" "C"] join_kind=Join_Kind.Left_Exclusive + expect_column_names ["A", "B"] t5 + r5 = materialize t5 . order_by ["A", "B"] . rows . map .to_vector within_table t5 <| - expect_column_names ["A", "B"] t5 - t5.at "A" . to_vector . should_equal [1] - t5.at "B" . to_vector . should_equal [7] + r5.length . should_equal 3 + r5.at 0 . should_equal [Nothing, Nothing] + r5.at 1 . should_equal [Nothing, 4] + r5.at 2 . should_equal [1, 7] t6 = t1.join t2 on=[Join_Condition.Equals "A" "C"] join_kind=Join_Kind.Right_Exclusive + expect_column_names ["C", "D"] t6 + r6 = materialize t6 . order_by ["C", "D"] . rows . 
map .to_vector within_table t6 <| - expect_column_names ["C", "D"] t6 - t6.at "C" . to_vector . should_equal [4] - t6.at "D" . to_vector . should_equal [Nothing] + r6.length . should_equal 3 + r6.at 0 . should_equal [Nothing, Nothing] + r6.at 1 . should_equal [Nothing, 6] + r6.at 2 . should_equal [4, Nothing] t7 = table_builder [["A", [Nothing, 2]], ["B", [Nothing, 3]]] t8 = table_builder [["C", [2, 3]], ["D", [4, 5]]]