Add duplicates component (#10323)

* Update existing behaviour to match new

* Add signatures

* Red test

* First test green

* sbt javafmtAll

* In-Memory working

* Not implemented for In-Db

* Docs

* Disable tests for in-db

* Changelog

* Code review changes

* Fix

* Fix

* Fix tests
This commit is contained in:
AdRiley 2024-06-24 11:29:03 +01:00 committed by GitHub
parent 791dba6729
commit c324c78e23
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 238 additions and 40 deletions

View File

@ -42,6 +42,7 @@
- [Implemented `.cast` to and from `Decimal` columns for the in-memory
database.][10206]
- [Implemented fallback to Windows-1252 encoding for `Encoding.Default`.][10190]
- [Added `Table.duplicates` component.][10323]
[debug-shortcuts]:
@ -50,6 +51,7 @@
[10130]: https://github.com/enso-org/enso/pull/10130
[10206]: https://github.com/enso-org/enso/pull/10206
[10190]: https://github.com/enso-org/enso/pull/10190
[10323]: https://github.com/enso-org/enso/pull/10323
<br/>![Release Notes](/docs/assets/tags/release_notes.svg)

View File

@ -357,9 +357,9 @@ type Array
first duplicate appeared in the input.
> Example
Removing repeating entries.
Removing unique entries.
[1, 3, 1, 2, 2, 1].to_array . duplicates == [1, 2].to_array
[1, 3, 1, 2, 2, 1].to_array . duplicates == [1, 1, 2, 2, 1].to_array
duplicates : (Any -> Any) -> Vector Any
duplicates self (on = x->x) =
Array_Like_Helpers.duplicates self on

View File

@ -1227,9 +1227,9 @@ type Vector a
first duplicate appeared in the input.
> Example
Removing repeating entries.
Removing unique entries.
[1, 3, 1, 2, 2, 1] . duplicates == [1, 2]
[1, 3, 1, 2, 2, 1] . duplicates == [1, 1, 2, 2, 1]
duplicates : (Any -> Any) -> Vector Any
duplicates self (on = x->x) =
Array_Like_Helpers.duplicates self on

View File

@ -165,11 +165,14 @@ distinct vector on =
existing.insert key True
duplicates vector on = Vector.build builder->
vector.fold Map.empty current-> item->
counts = vector.fold Map.empty current-> item->
key = on item
count = current.get key 0
if count == 1 then builder.append item
current.insert key count+1
vector.map item->
key = on item
count = counts.get key 0
if count != 1 then builder.append item
take vector range = case range of
## We are using a specialized implementation for `take Sample`, because

View File

@ -1323,9 +1323,7 @@ type DB_Table
raised as an error regardless of the problem behavior, because it is
not possible to create a table without any columns.
- If a column in `columns` is not in the input table, a
`Missing_Input_Columns` is raised as an error, unless
`error_on_missing_columns` is set to `False`, in which case the
problem is reported according to the `on_problems` setting.
`Missing_Input_Columns` is raised as an error.
- If no valid columns are selected, a `No_Input_Columns_Selected`, is
reported as a dataflow error regardless of setting.
- If floating points values are present in the distinct columns, a
@ -1333,13 +1331,46 @@ type DB_Table
setting.
@columns Widget_Helpers.make_column_name_multi_selector
distinct : Vector (Integer | Text | Regex) | Text | Integer | Regex -> Case_Sensitivity -> Boolean -> Problem_Behavior -> DB_Table ! No_Output_Columns | Missing_Input_Columns | No_Input_Columns_Selected | Floating_Point_Equality
distinct self columns=self.column_names case_sensitivity:Case_Sensitivity=..Default error_on_missing_columns:Boolean=True on_problems:Problem_Behavior=Report_Warning =
key_columns = self.columns_helper.select_columns columns Case_Sensitivity.Default reorder=True error_on_missing_columns=error_on_missing_columns on_problems=on_problems . catch No_Output_Columns _->
distinct self columns=self.column_names case_sensitivity:Case_Sensitivity=..Default on_problems:Problem_Behavior=Report_Warning =
key_columns = self.columns_helper.select_columns columns Case_Sensitivity.Default reorder=True error_on_missing_columns=True on_problems=on_problems . catch No_Output_Columns _->
Error.throw No_Input_Columns_Selected
problem_builder = Problem_Builder.new
new_table = self.connection.dialect.prepare_distinct self key_columns case_sensitivity problem_builder
problem_builder.attach_problems_before on_problems new_table
## GROUP Standard.Base.Selections
   ICON preparation
   Returns the set of rows which are duplicated within the specified columns
   from the input table.

   When multiple rows have the same values within the specified columns all
   of those rows are returned. Rows which are unique within the specified
   columns are removed.

   Arguments:
   - columns: The columns of the table to use for distinguishing the rows.
   - case_sensitivity: Specifies if the text values should be compared case
     sensitively.
   - on_problems: Specifies how to handle if a problem occurs, raising as a
     warning by default.

   ! Error Conditions

     - This operation is not implemented yet for the Database backends: an
       `Unsupported_Database_Operation` error is always raised and the
       arguments are ignored.
@columns Widget_Helpers.make_column_name_multi_selector
duplicates : Vector (Integer | Text | Regex) | Text | Integer | Regex -> Case_Sensitivity -> Problem_Behavior -> DB_Table ! Unsupported_Database_Operation
duplicates self columns=self.column_names case_sensitivity:Case_Sensitivity=..Default on_problems:Problem_Behavior=..Report_Warning =
    # The arguments are accepted only to keep the signature parallel to the
    # in-memory implementation; binding them to `_` marks them as used.
    _ = [columns, case_sensitivity, on_problems]
    Error.throw (Unsupported_Database_Operation.Error "DB_Table.duplicates is not implemented yet for the Database backends.")
## GROUP Standard.Base.Calculations
ICON join
Joins two tables according to the specified join conditions.

View File

@ -336,7 +336,7 @@ rename_columns (naming_helper : Column_Naming_Helper) (internal_columns:Vector)
## Attempt to treat as Map
map = Map.from_vector mapping error_on_duplicates=False
if map.length == mapping.length then rename_columns naming_helper internal_columns map case_sensitivity error_on_missing_columns on_problems else
duplicates = mapping.duplicates on=_.first . map p->p.first.to_text
duplicates = mapping.duplicates on=_.first . map p->p.first.to_text . distinct
duplicate_text = if duplicates.length < 5 then duplicates.to_vector . join ", " else
duplicates.take 3 . to_vector . join ", " + (", ... " + (duplicates.length - 3).to_text + " others")
Error.throw (Illegal_Argument.Error "duplicate old name mappings ("+duplicate_text+").")

View File

@ -939,9 +939,7 @@ type Table
raised as an error regardless of the problem behavior, because it is
not possible to create a table without any columns.
- If a column in `columns` is not in the input table, a
`Missing_Input_Columns` is raised as an error, unless
`error_on_missing_columns` is set to `False`, in which case the
problem is reported according to the `on_problems` setting.
`Missing_Input_Columns` is raised as an error.
- If no valid columns are selected, a `No_Input_Columns_Selected`, is
reported as a dataflow error regardless of setting.
- If floating points values are present in the distinct columns, a
@ -949,8 +947,8 @@ type Table
setting.
@columns Widget_Helpers.make_column_name_multi_selector
distinct : Vector (Integer | Text | Regex) | Text | Integer | Regex -> Case_Sensitivity -> Boolean -> Problem_Behavior -> Table ! No_Output_Columns | Missing_Input_Columns | No_Input_Columns_Selected | Floating_Point_Equality
distinct self (columns = self.column_names) case_sensitivity:Case_Sensitivity=Case_Sensitivity.Default error_on_missing_columns:Boolean=True on_problems:Problem_Behavior=..Report_Warning =
key_columns = self.columns_helper.select_columns columns Case_Sensitivity.Default reorder=True error_on_missing_columns=error_on_missing_columns on_problems=on_problems . catch No_Output_Columns _->
distinct self (columns = self.column_names) case_sensitivity:Case_Sensitivity=Case_Sensitivity.Default on_problems:Problem_Behavior=..Report_Warning =
key_columns = self.columns_helper.select_columns columns Case_Sensitivity.Default reorder=True error_on_missing_columns=True on_problems=on_problems . catch No_Output_Columns _->
Error.throw No_Input_Columns_Selected
java_columns = key_columns.map c->c.java_column
text_folding_strategy = Case_Sensitivity.folding_strategy case_sensitivity
@ -959,6 +957,45 @@ type Table
self.java_table.distinct java_columns text_folding_strategy java_aggregator
Table.Value java_table
## GROUP Standard.Base.Selections
   ICON preparation
   Returns the set of rows which are duplicated within the specified columns
   from the input table.

   When multiple rows have the same values within the specified columns all
   of those rows are returned. Rows which are unique within the specified
   columns are removed.

   Arguments:
   - columns: The columns of the table to use for distinguishing the rows.
   - case_sensitivity: Specifies if the text values should be compared case
     sensitively.
   - on_problems: Specifies how to handle if a problem occurs, raising as a
     warning by default.

   ! Error Conditions

     - If there are no columns in the output table, a `No_Output_Columns` is
       raised as an error regardless of the problem behavior, because it is
       not possible to create a table without any columns.
     - If a column in `columns` is not in the input table, a
       `Missing_Input_Columns` is raised as an error.
     - If no valid columns are selected, a `No_Input_Columns_Selected` is
       reported as a dataflow error regardless of setting.
     - If floating point values are present in the key columns, a
       `Floating_Point_Equality` is reported according to the `on_problems`
       setting.
@columns Widget_Helpers.make_column_name_multi_selector
duplicates : Vector (Integer | Text | Regex) | Text | Integer | Regex -> Case_Sensitivity -> Problem_Behavior -> Table ! No_Output_Columns | Missing_Input_Columns | No_Input_Columns_Selected | Floating_Point_Equality
duplicates self (columns = self.column_names) case_sensitivity:Case_Sensitivity=..Default on_problems:Problem_Behavior=..Report_Warning =
    # Missing columns are always an error here; an empty selection is
    # re-reported as `No_Input_Columns_Selected`.
    key_columns = self.columns_helper.select_columns columns Case_Sensitivity.Default reorder=True error_on_missing_columns=True on_problems=on_problems . catch No_Output_Columns _->
        Error.throw No_Input_Columns_Selected
    java_columns = key_columns.map c->c.java_column
    text_folding_strategy = Case_Sensitivity.folding_strategy case_sensitivity
    java_table = Illegal_Argument.handle_java_exception <|
        Java_Problems.with_problem_aggregator on_problems java_aggregator->
            self.java_table.duplicates java_columns text_folding_strategy java_aggregator
    Table.Value java_table
## GROUP Standard.Base.Conversions
ICON convert
Parses columns within a `Table` to a specific value type.

View File

@ -239,6 +239,30 @@ public class Table {
return new Table(newColumns);
}
/**
 * Creates a new table keeping only the rows whose key occurs more than once.
 *
 * <p>The rows to keep are selected by {@code Distinct.buildDuplicatesRowsMask}; the resulting mask
 * is then applied as a filter to every column of this table.
 *
 * @param keyColumns set of columns whose values form the key of a row
 * @param textFoldingStrategy a strategy for folding text columns
 * @param problemAggregator an aggregator for problems
 * @return a table containing only the rows selected by the duplicates mask
 */
public Table duplicates(
    Column[] keyColumns,
    TextFoldingStrategy textFoldingStrategy,
    ProblemAggregator problemAggregator) {
  var duplicateRows =
      Distinct.buildDuplicatesRowsMask(
          rowCount(), keyColumns, textFoldingStrategy, problemAggregator);
  // The number of set bits is the row count of every filtered column.
  int keptRowCount = duplicateRows.cardinality();
  Column[] filtered = new Column[this.columns.length];
  int target = 0;
  for (Column column : this.columns) {
    filtered[target++] = column.applyFilter(duplicateRows, keptRowCount);
  }
  return new Table(filtered);
}
/**
* Selects a subset of columns of this table, by names.
*

View File

@ -2,8 +2,10 @@ package org.enso.table.operations;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import org.enso.base.text.TextFoldingStrategy;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.index.MultiValueKeyBase;
@ -15,6 +17,7 @@ import org.enso.table.util.ConstantList;
import org.graalvm.polyglot.Context;
public class Distinct {
/** Creates a row mask containing only the first row from sets of rows grouped by key columns. */
public static BitSet buildDistinctRowsMask(
int tableSize,
@ -50,4 +53,42 @@ public class Distinct {
return mask;
}
/**
 * Creates a row mask containing every row whose key, built from the key columns, occurs more than
 * once.
 *
 * <p>Both the first row seen with a given key and every later row sharing that key are marked.
 * When no key columns are given, all rows in {@code [0, tableSize)} are marked.
 *
 * @param tableSize number of rows to mark when there are no key columns
 * @param keyColumns columns whose values form the key of each row
 * @param textFoldingStrategy folding strategy applied to every key column
 * @param problemAggregator parent aggregator; it is wrapped in a {@code
 *     ColumnAggregatedProblemAggregator} that is handed to the floating-point equality check
 * @return the mask of rows to keep
 */
public static BitSet buildDuplicatesRowsMask(
    int tableSize,
    Column[] keyColumns,
    TextFoldingStrategy textFoldingStrategy,
    ProblemAggregator problemAggregator) {
  ColumnAggregatedProblemAggregator floatingEqualityProblems =
      new ColumnAggregatedProblemAggregator(problemAggregator);
  Context context = Context.getCurrent();
  BitSet duplicateRows = new BitSet();

  if (keyColumns.length == 0) {
    // If there are no columns to distinct-by we just return the whole table.
    duplicateRows.set(0, tableSize);
    return duplicateRows;
  }

  // Maps each key to the index of the first row in which it was seen.
  Map<MultiValueKeyBase, Integer> firstRowByKey = new HashMap<>();
  int rowCount = keyColumns[0].getSize();
  Storage<?>[] storages =
      Arrays.stream(keyColumns).map(Column::getStorage).toArray(Storage[]::new);
  List<TextFoldingStrategy> foldingStrategies =
      ConstantList.make(textFoldingStrategy, storages.length);

  for (int row = 0; row < rowCount; row++) {
    UnorderedMultiValueKey key = new UnorderedMultiValueKey(storages, row, foldingStrategies);
    key.checkAndReportFloatingEquality(
        floatingEqualityProblems, columnIx -> keyColumns[columnIx].getName());

    // putIfAbsent returns null when the key is new, otherwise the first row that carried it.
    Integer firstRow = firstRowByKey.putIfAbsent(key, row);
    if (firstRow != null) {
      duplicateRows.set(row);
      duplicateRows.set(firstRow);
    }

    context.safepoint();
  }
  return duplicateRows;
}
}

View File

@ -858,11 +858,11 @@ type_spec suite_builder name alter = suite_builder.group name group_builder->
alter [1, 1.0, 2, 2.0] . distinct . should_equal [1, 2]
alter [] . distinct . should_equal []
group_builder.specify "should return a vector containing only duplicate elements" <|
alter [1, 3, 1, 2, 2, 1] . duplicates . should_equal [1, 2]
alter ["a", "a", "a"] . duplicates . should_equal ["a"]
alter ['ś', 's', 's\u0301'] . duplicates . should_equal ['s\u0301']
alter [1, 1.0, 2, 2.0] . duplicates . should_equal [1.0, 2.0]
group_builder.specify "should return a vector containing duplicate elements" <|
alter [1, 3, 1, 2, 2, 1] . duplicates . should_equal [1, 1, 2, 2, 1]
alter ["a", "a", "a"] . duplicates . should_equal ["a", "a", "a"]
alter ['ś', 's', 's\u0301'] . duplicates . should_equal ['ś', 's\u0301']
alter [1, 1.0, 2, 2.0] . duplicates . should_equal [1, 1.0, 2, 2.0]
alter [] . duplicates . should_equal []
group_builder.specify "should be able to handle distinct on different primitive values" <|

View File

@ -128,21 +128,81 @@ add_specs suite_builder setup =
t5 = t1.distinct [] on_problems=pb
t5.should_fail_with No_Input_Columns_Selected
t6 = t1.distinct [] error_on_missing_columns=False on_problems=pb
t6.should_fail_with No_Input_Columns_Selected
db_todo = if setup.is_database.not then Nothing else "Table.duplicates is not implemented yet in Database."
suite_builder.group setup.prefix+"Table.duplicates" pending=db_todo group_builder->
data = Data.setup create_connection_fn
# When not erroring about missing columns, No_Input_Columns_Selected will still yield an error.
t7 = t1.distinct ["Y", "Z"] error_on_missing_columns=False on_problems=Problem_Behavior.Report_Warning
t7.should_fail_with No_Input_Columns_Selected
group_builder.teardown <|
data.teardown
action2 = t1.distinct ["X", "Y"] error_on_missing_columns=False on_problems=_
tester2 table =
table.at "X" . to_vector . should_equal [1, 2, 3]
problems2 = [Missing_Input_Columns.Error ["Y"]]
Problems.test_problem_handling action2 problems2 tester2
table_builder cols =
setup.table_builder cols connection=data.connection
action3 = t1.distinct [0, 42] error_on_missing_columns=False on_problems=_
tester3 table =
table.at "X" . to_vector . should_equal [1, 2, 3]
problems3 = [Missing_Input_Columns.Error [42]]
Problems.test_problem_handling action3 problems3 tester3
group_builder.specify "should group by all columns by default" <|
a = ["A", ["a", "a", "b", "b", "c"]]
b = ["B", [1, 1, 1, 2, 1]]
t = table_builder [a, b]
r = t.duplicates on_problems=Report_Error |> materialize |> _.order_by ["A", "B"]
r.at "A" . to_vector . should_equal ["a", "a"]
r.at "B" . to_vector . should_equal [1, 1]
group_builder.specify "should allow to select duplicates rows based on a subset of columns, returning any row from each group" <|
a = ["A", ["a", "a", "a", "a", "a", "a"]]
b = ["B", [1, 1, 2, 2, 1, 3]]
c = ["C", [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]]
t = table_builder [a, b, c]
r1 = t.duplicates ["A"] on_problems=Report_Error |> materialize
r1.at "A" . to_vector . should_equal ["a", "a", "a", "a", "a", "a"]
r1.at "B" . to_vector . should_equal [1, 1, 2, 2, 1, 3]
r1.at "C" . to_vector . should_equal [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
r2 = t.duplicates ["A", "B"] on_problems=Report_Error |> materialize
r2.at "A" . to_vector . should_equal ["a", "a", "a", "a", "a"]
r2.at "B" . to_vector . should_equal [1, 1, 2, 2, 1]
r2.at "C" . to_vector . should_equal [0.1, 0.2, 0.3, 0.4, 0.5]
group_builder.specify "should allow to control case-sensitivity of keys" <|
x = ["X", ['A', 'a', 'enso', 'Enso', 'A']]
t1 = table_builder [x]
d1 = t1.duplicates ["X"] on_problems=Report_Error |> materialize |> _.order_by ["X"]
d1.at "X" . to_vector . should_equal ['A', 'A']
d2 = t1.duplicates ["X"] case_sensitivity=Case_Sensitivity.Insensitive on_problems=Report_Error |> materialize |> _.order_by ["X"]
d2.at "X" . to_vector . should_equal ['A', 'A', 'Enso', 'a', 'enso']
group_builder.specify "should report a warning if the key contains floating point values" <|
t1 = table_builder [["X", [3.0, 1.0, 2.0, 2.0, 1.0]]]
action1 = t1.duplicates on_problems=_
tester1 table =
v = table.at "X" . to_vector
v.length . should_equal 4
v.fold 0 (+) . should_equal 6.0
problems1 = [Floating_Point_Equality.Error "X"]
Problems.test_problem_handling action1 problems1 tester1
group_builder.specify "should handle nulls correctly" <|
a = ["A", ["a", Nothing, "b", Nothing]]
b = ["B", [1, 2, 3, 4]]
t = table_builder [a, b]
r = t.duplicates ["A"] on_problems=Report_Error |> materialize |> _.order_by "B"
r.at "A" . to_vector . should_equal [Nothing, Nothing]
r.at "B" . to_vector . should_equal [2, 4]
group_builder.specify "should report missing input columns" <|
t1 = table_builder [["X", [1, 2, 3, 2, 2]]]
[Problem_Behavior.Ignore, Problem_Behavior.Report_Warning, Problem_Behavior.Report_Error].each pb->
t2 = t1.duplicates ["Y", "Z"] on_problems=pb
t2.should_fail_with Missing_Input_Columns
t2.catch . should_equal (Missing_Input_Columns.Error ["Y", "Z"])
t3 = t1.duplicates ["X", "Y"] on_problems=pb
t3.should_fail_with Missing_Input_Columns
t3.catch . should_equal (Missing_Input_Columns.Error ["Y"])
t4 = t1.duplicates [0, 42] on_problems=pb
t4.should_fail_with Missing_Input_Columns
t4.catch . should_equal (Missing_Input_Columns.Error [42])
t5 = t1.duplicates [] on_problems=pb
t5.should_fail_with No_Input_Columns_Selected

View File

@ -584,10 +584,10 @@ add_specs suite_builder setup =
result.catch Any . message . should_equal message
test_duplicate_names [["Alpha", "1"], ["Alpha", "2"]] "duplicate old name mappings (Alpha)."
test_duplicate_names [["Alpha", "1"], ["Beta", "2"], ["Gamma", "3"], ["Beta", "4"], ["Alpha", "5"]] "duplicate old name mappings (Beta, Alpha)."
test_duplicate_names [["Alpha", "1"], ["Beta", "2"], ["Gamma", "3"], ["Beta", "4"], ["Alpha", "5"]] "duplicate old name mappings (Alpha, Beta)."
test_duplicate_names [["Alpha", "1"], ["Alpha", "2"], ["Alpha", "3"]] "duplicate old name mappings (Alpha)."
test_duplicate_names [["Alpha", "1"], ["Beta", "2"], ["Gamma", "3"], ["Beta", "4"], ["Alpha", "5"], ["Gamma","6"], ["Delta","7"], ["Delta","8"]] "duplicate old name mappings (Beta, Alpha, Gamma, Delta)."
test_duplicate_names [["Alpha", "1"], ["Beta", "2"], ["Gamma", "3"], ["Beta", "4"], ["Alpha", "5"], ["Gamma","6"], ["Delta","7"], ["Delta","8"], ["Echo","9"], ["Echo","10"]] "duplicate old name mappings (Beta, Alpha, Gamma, ... 2 others)."
test_duplicate_names [["Alpha", "1"], ["Beta", "2"], ["Gamma", "3"], ["Beta", "4"], ["Alpha", "5"], ["Gamma","6"], ["Delta","7"], ["Delta","8"]] "duplicate old name mappings (Alpha, Beta, Gamma, Delta)."
test_duplicate_names [["Alpha", "1"], ["Beta", "2"], ["Gamma", "3"], ["Beta", "4"], ["Alpha", "5"], ["Gamma","6"], ["Delta","7"], ["Delta","8"], ["Echo","9"], ["Echo","10"]] "duplicate old name mappings (Alpha, Beta, Gamma, ... 2 others)."
group_builder.specify "should correctly handle problems: unmatched names" <|
weird_name = '.*?-!@#!"'