Rework Invalid Aggregations (#5579)

Closes #5108
This commit is contained in:
Radosław Waśko 2023-02-08 19:39:09 +01:00 committed by GitHub
parent 4405be8a74
commit 4f90946d1e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 140 additions and 73 deletions

View File

@ -35,14 +35,15 @@ type Problem_Behavior
dataflow error. If the value already contained any dataflow error, that
error takes precedence.
attach_problem_after : Any -> Any -> Any
attach_problem_after self decorated_value ~problem = case self of
attach_problem_after self decorated_value problem = case self of
Ignore ->
decorated_value
if decorated_value.is_error then decorated_value else
if problem.is_error then problem else decorated_value
Report_Warning ->
Warning.attach problem decorated_value
Report_Error ->
case decorated_value of
_ -> Error.throw problem
if decorated_value.is_error then decorated_value else
Error.throw problem
## ADVANCED
UNSTABLE
@ -53,13 +54,11 @@ type Problem_Behavior
If it is set to Report_Warning, the value is returned with the problem
attached as a warning.
If it is set to Report_Error, the problem is returned in the form of
a dataflow error. The problem takes precedence over any errors that may
have been contained in the value - in such case the `decorated_value` is
not computed at all.
a dataflow error.
attach_problem_before : Any -> Any -> Any
attach_problem_before self problem ~decorated_value = case self of
Ignore ->
decorated_value
if problem.is_error then problem else decorated_value
Report_Warning ->
Warning.attach problem decorated_value
Report_Error ->
@ -88,7 +87,7 @@ type Problem_Behavior
attach_problems_before : Vector -> Any -> Any
attach_problems_before self problems ~decorated_value = case self of
Ignore ->
decorated_value
if problems.is_error then problems else decorated_value
Report_Warning ->
problems.fold decorated_value acc-> problem->
Warning.attach problem acc
@ -116,14 +115,15 @@ type Problem_Behavior
problem_behavior.attach_problems_after result <|
perform_post_process_checks_and_return_problems
attach_problems_after : Any -> Vector -> Any
attach_problems_after self decorated_value ~problems = case self of
attach_problems_after self decorated_value problems = case self of
Ignore ->
decorated_value
if decorated_value.is_error then decorated_value else
if problems.is_error then problems else decorated_value
Report_Warning ->
problems.fold decorated_value acc-> problem->
Warning.attach problem acc
Report_Error -> case decorated_value of
_ -> if problems.is_empty then decorated_value else
Report_Error ->
if decorated_value.is_error || problems.is_empty then decorated_value else
Error.throw problems.first
## ADVANCED

View File

@ -826,8 +826,8 @@ type Table
join self right join_kind=Join_Kind.Inner on=[Join_Condition.Equals 0 0] right_prefix="Right_" on_problems=Report_Warning =
can_proceed = if Table_Helpers.is_table right . not then Error.throw (Type_Error.Error Table right "right") else
same_backend = case right of
_ : Table -> True
_ -> False
_ : Table -> True
_ -> False
if same_backend . not then Error.throw (Illegal_Argument.Error "Currently cross-backend joins are not supported. You need to upload the in-memory table before joining it with a database one, or materialize this table.") else
True
if can_proceed then
@ -1087,6 +1087,8 @@ type Table
`Group_By` columns are reported as dataflow errors regardless of
these settings, as a missing grouping will completely change
semantics of the query.
- If an aggregation fails, an `Invalid_Aggregation` dataflow error is
raised.
- The following additional problems may be reported according to the
`on_problems` settings:
- If there are invalid column names in the output table,
@ -1095,7 +1097,6 @@ type Table
a `Duplicate_Output_Column_Names`.
- If grouping on or computing the `Mode` on a floating point number,
a `Floating_Point_Grouping`.
- If an aggregation fails, an `Invalid_Aggregation`.
- If when concatenating values there is an quoted delimited,
an `Unquoted_Delimiter`
- If there are more than 10 issues with a single column,
@ -1193,11 +1194,12 @@ type Table
- If a column selector in `values` given as a `Text` and it does not
match any columns in the input table nor is it a valid expression, an
`Invalid_Aggregate_Column` dataflow error is raised.
- If an aggregation fails, an `Invalid_Aggregation` dataflow error is
raised.
- Additionally, the following problems may be reported according to the
`on_problems` setting:
- If grouping on, using as the column name, or computing the `Mode` on
a floating point number, a `Floating_Point_Grouping`.
- If an aggregation fails, an `Invalid_Aggregation`.
- If when concatenating values there is an quoted delimited,
an `Unquoted_Delimiter`
- If there are more than 10 issues with a single column,

View File

@ -555,6 +555,8 @@ type Table
`Group_By` columns are reported as dataflow errors regardless of
these settings, as a missing grouping will completely change
semantics of the query.
- If an aggregation fails, an `Invalid_Aggregation` dataflow error is
raised.
- Additionally, the following problems may be reported according to the
`on_problems` setting:
- If there are invalid column names in the output table,
@ -563,7 +565,6 @@ type Table
a `Duplicate_Output_Column_Names`.
- If grouping on or computing the `Mode` on a floating point number,
a `Floating_Point_Grouping`.
- If an aggregation fails, an `Invalid_Aggregation`.
- If when concatenating values there is an quoted delimited,
an `Unquoted_Delimiter`
- If there are more than 10 issues with a single column,
@ -1508,11 +1509,12 @@ type Table
- If a column selector in `values` given as a `Text` and it does not
match any columns in the input table nor is it a valid expression, an
`Invalid_Aggregate_Column` dataflow error is raised.
- If an aggregation fails, an `Invalid_Aggregation` dataflow error is
raised.
- Additionally, the following problems may be reported according to the
`on_problems` setting:
- If grouping on, using as the column name, or computing the `Mode` on
a floating point number, a `Floating_Point_Grouping`.
- If an aggregation fails, an `Invalid_Aggregation`.
- If when concatenating values there is an quoted delimited,
an `Unquoted_Delimiter`
- If there are more than 10 issues with a single column,
@ -1550,7 +1552,8 @@ type Table
c -> Aggregate_Column_Helper.default_aggregate_column_name c all_same
data_columns = validated_values.map c->
col_name = c.new_name.if_nothing (name_mapper c)
col_name = c.new_name.if_nothing <|
Aggregate_Column_Helper.default_aggregate_column_name c
Aggregate_Column_Helper.java_aggregator col_name c
result = case matched_name.is_empty of
@ -1558,7 +1561,9 @@ type Table
group_by = grouping.map g->(Aggregate_Column_Helper.java_aggregator g.name (Aggregate_Column.Group_By g))
index.makeTable (group_by + data_columns).to_array
False ->
index.makeCrossTabTable java_key_columns.to_array matched_name.first.java_column data_columns.to_array
aggregate_names = validated_values.map c->
c.new_name.if_nothing (name_mapper c)
index.makeCrossTabTable java_key_columns matched_name.first.java_column data_columns aggregate_names
on_problems.attach_problems_after (Table.Value result) <|
problems = result.getProblems

View File

@ -19,7 +19,8 @@ polyglot java import org.enso.table.util.problems.InvalidNames
Convert a Java problem into its Enso equivalent.
translate_problem p = case p of
_ : InvalidAggregation ->
Invalid_Aggregation.Error p.getColumnName (Vector.from_polyglot_array p.getRows) p.getMessage
err = Invalid_Aggregation.Error p.getColumnName (Vector.from_polyglot_array p.getRows) p.getMessage
Error.throw err
_ : FloatingPointGrouping ->
Floating_Point_Grouping.Error p.getColumnName
_ : UnquotedDelimiter ->

View File

@ -118,7 +118,10 @@ public class MultiValueIndex<KeyType extends MultiValueKeyBase> {
}
public Table makeCrossTabTable(
Column[] groupingColumns, Column nameColumn, Aggregator[] aggregates) {
Column[] groupingColumns,
Column nameColumn,
Aggregator[] aggregates,
String[] aggregateNames) {
final int size = locs.size();
var nameIndex =
@ -187,7 +190,7 @@ public class MultiValueIndex<KeyType extends MultiValueKeyBase> {
for (int i = 0; i < aggregates.length; i++) {
output[offset + i] =
new Column((name + " " + aggregates[i].getName()).trim(), storage[offset + i].seal());
new Column((name + " " + aggregateNames[i]).trim(), storage[offset + i].seal());
}
offset += aggregates.length;

View File

@ -43,10 +43,26 @@ public class Table {
throw new IllegalArgumentException("A Table must have at least one column.");
}
if (!checkUniqueColumns(columns)) {
throw new IllegalArgumentException("Column names must be unique within a Table.");
}
this.columns = columns;
this.problems = problems;
}
private static boolean checkUniqueColumns(Column[] columns) {
HashSet<String> names = new HashSet<>();
for (Column column : columns) {
boolean wasNew = names.add(column.getName());
if (!wasNew) {
return false;
}
}
return true;
}
/** @return the number of rows in this table */
public int rowCount() {
return columns[0].getSize();

View File

@ -1227,7 +1227,7 @@ spec setup =
table_builder [col1, col2]
Test.specify "should fail if there are no output columns, and promote any warnings to errors" <|
[Problem_Behavior.Ignore, Problem_Behavior.Report_Warning, Problem_Behavior.Report_Error].each pb->
[Problem_Behavior.Ignore, Problem_Behavior.Report_Warning, Problem_Behavior.Report_Error].each pb-> Test.with_clue "Problem_Behavior="+pb.to_text+": " <|
t1 = table.aggregate [] on_problems=pb
t1.should_fail_with No_Output_Columns
@ -1296,6 +1296,12 @@ spec setup =
tester = expect_column_names ["Value_1", "Value"]
Problems.test_problem_handling action problems tester
Test.specify "should raise a warning when duplicate column names" <|
action = table.aggregate [Sum "Value" new_name="AGG1", Count new_name="AGG1"] on_problems=_
problems = [Duplicate_Output_Column_Names.Error ["AGG1"]]
tester = expect_column_names ["AGG1", "AGG1_1"]
Problems.test_problem_handling action problems tester
Test.specify "should allow partial matches on Count_Distinct" <|
action = table.aggregate [Count_Distinct (By_Name ["Missing", "Value"])] on_problems=_
problems = [Missing_Input_Columns.Error ["Missing"]]
@ -1324,46 +1330,46 @@ spec setup =
Problems.test_problem_handling action problems tester
Test.specify "should warn if totaling on a non number" <|
action = table.aggregate [Sum "Text"] on_problems=_
problems = [Invalid_Aggregation.Error "Sum Text" [0] "Cannot convert to a number."]
tester = expect_column_names ["Sum Text"]
Problems.test_problem_handling action problems tester
[Problem_Behavior.Report_Error, Problem_Behavior.Report_Warning, Problem_Behavior.Ignore].each pb-> Test.with_clue "Problem_Behavior="+pb.to_text+" " <|
err = table.aggregate [Sum "Text"] on_problems=pb
err.should_fail_with Invalid_Aggregation
err.catch . should_equal (Invalid_Aggregation.Error "Sum Text" [0] "Cannot convert to a number.")
Test.specify "should warn if averaging on a non number" <|
action = table.aggregate [Average "Text"] on_problems=_
problems = [Invalid_Aggregation.Error "Average Text" [0] "Cannot convert to a number."]
tester = expect_column_names ["Average Text"]
Problems.test_problem_handling action problems tester
[Problem_Behavior.Report_Error, Problem_Behavior.Report_Warning, Problem_Behavior.Ignore].each pb-> Test.with_clue "Problem_Behavior="+pb.to_text+" " <|
err = table.aggregate [Average "Text"] on_problems=pb
err.should_fail_with Invalid_Aggregation
err.catch . should_equal (Invalid_Aggregation.Error "Average Text" [0] "Cannot convert to a number.")
Test.specify "should warn if calculating standard deviation on a non number" <|
action = table.aggregate [Standard_Deviation "Text"] on_problems=_
problems = [Invalid_Aggregation.Error "Standard Deviation Text" [0] "Cannot convert to a number."]
tester = expect_column_names ["Standard Deviation Text"]
Problems.test_problem_handling action problems tester
[Problem_Behavior.Report_Error, Problem_Behavior.Report_Warning, Problem_Behavior.Ignore].each pb-> Test.with_clue "Problem_Behavior="+pb.to_text+" " <|
err = table.aggregate [Standard_Deviation "Text"] on_problems=pb
err.should_fail_with Invalid_Aggregation
err.catch . should_equal (Invalid_Aggregation.Error "Standard Deviation Text" [0] "Cannot convert to a number.")
Test.specify "should warn if median on a non number" <|
action = table.aggregate [Median "Text"] on_problems=_
problems = [Invalid_Aggregation.Error "Median Text" [0] "Cannot convert to a number."]
tester = expect_column_names ["Median Text"]
Problems.test_problem_handling action problems tester
[Problem_Behavior.Report_Error, Problem_Behavior.Report_Warning, Problem_Behavior.Ignore].each pb-> Test.with_clue "Problem_Behavior="+pb.to_text+" " <|
err = table.aggregate [Median "Text"] on_problems=pb
err.should_fail_with Invalid_Aggregation
err.catch . should_equal (Invalid_Aggregation.Error "Median Text" [0] "Cannot convert to a number.")
Test.specify "should warn if trying shortest on a non text" <|
action = table.aggregate [Shortest "Index"] on_problems=_
problems = [Invalid_Aggregation.Error "Shortest Index" [0] "Not a text value."]
tester = expect_column_names ["Shortest Index"]
Problems.test_problem_handling action problems tester
[Problem_Behavior.Report_Error, Problem_Behavior.Report_Warning, Problem_Behavior.Ignore].each pb-> Test.with_clue "Problem_Behavior="+pb.to_text+" " <|
err = table.aggregate [Shortest "Index"] on_problems=pb
err.should_fail_with Invalid_Aggregation
err.catch . should_equal (Invalid_Aggregation.Error "Shortest Index" [0] "Not a text value.")
Test.specify "should warn if trying count empties on a non text" <|
action = table.aggregate [Count_Empty "Index"] on_problems=_
problems = [Invalid_Aggregation.Error "Count Empty Index" [0] "Not a text value."]
tester = expect_column_names ["Count Empty Index"]
Problems.test_problem_handling action problems tester
[Problem_Behavior.Report_Error, Problem_Behavior.Report_Warning, Problem_Behavior.Ignore].each pb-> Test.with_clue "Problem_Behavior="+pb.to_text+" " <|
err = table.aggregate [Count_Empty "Index"] on_problems=pb
err.should_fail_with Invalid_Aggregation
err.catch . should_equal (Invalid_Aggregation.Error "Count Empty Index" [0] "Not a text value.")
Test.specify "should warn if trying concatenate on a non text" <|
action = table.aggregate [Concatenate "Index"] on_problems=_
problems = [Invalid_Aggregation.Error "Concatenate Index" [0] "Not a text value."]
tester = expect_column_names ["Concatenate Index"]
Problems.test_problem_handling action problems tester
[Problem_Behavior.Report_Error, Problem_Behavior.Report_Warning, Problem_Behavior.Ignore].each pb-> Test.with_clue "Problem_Behavior="+pb.to_text+" " <|
err = table.aggregate [Concatenate "Index"] on_problems=pb
err.should_fail_with Invalid_Aggregation
err.catch . should_equal (Invalid_Aggregation.Error "Concatenate Index" [0] "Not a text value.")
Test.specify "should warn if trying concatenate unquoted delimiters" <|
column = Concatenate "Text" separator=","
@ -1373,10 +1379,10 @@ spec setup =
Problems.test_problem_handling action problems tester
Test.specify "should warn if can't compare value for Min or Max" <|
action = table.aggregate [Maximum "Mixed"] on_problems=_
problems = [Invalid_Aggregation.Error "Maximum Mixed" [1] "Cannot compare values."]
tester = expect_column_names ["Maximum Mixed"]
Problems.test_problem_handling action problems tester
[Problem_Behavior.Report_Error, Problem_Behavior.Report_Warning, Problem_Behavior.Ignore].each pb-> Test.with_clue "Problem_Behavior="+pb.to_text+" " <|
err = table.aggregate [Maximum "Mixed"] on_problems=pb
err.should_fail_with Invalid_Aggregation
err.catch . should_equal (Invalid_Aggregation.Error "Maximum Mixed" [1] "Cannot compare values.")
Test.group prefix+"Table.aggregate should merge warnings when issues computing aggregation" pending=(resolve_pending test_selection.aggregation_problems) <|
table =
@ -1387,11 +1393,10 @@ spec setup =
Test.specify "should merge Invalid Aggregation warnings" <|
new_table = table.aggregate [Group_By "Key", Concatenate "Value"]
problems = Problems.get_attached_warnings new_table
problems.length . should_equal 1
problems.at 0 . is_a Invalid_Aggregation.Error . should_be_true
problems.at 0 . column . should_equal "Concatenate Value"
problems.at 0 . rows . length . should_equal 15
err = new_table.catch
err . should_be_a Invalid_Aggregation.Error
err.column . should_equal "Concatenate Value"
err.rows . length . should_equal 15
Test.specify "should merge Floating Point Grouping warnings" <|
new_table = table.aggregate [Group_By "Float", Count]

View File

@ -2,11 +2,13 @@ from Standard.Base import all
import Standard.Base.Error.Illegal_Argument.Illegal_Argument
from Standard.Table import Column_Selector
from Standard.Table.Data.Aggregate_Column.Aggregate_Column import Sum, Count, Group_By
from Standard.Table.Data.Aggregate_Column.Aggregate_Column import Average, Count, Group_By, Sum
import Standard.Table.Data.Expression.Expression_Error
from Standard.Table.Errors import Missing_Input_Columns, Column_Indexes_Out_Of_Range, No_Such_Column, Invalid_Aggregate_Column
from Standard.Table.Errors import all
from Standard.Test import Test
from Standard.Database.Errors import SQL_Error
from Standard.Test import Test, Problems
import Standard.Test.Extensions
from project.Common_Table_Operations.Util import run_default_backend
@ -22,24 +24,24 @@ spec setup =
table2 = table_builder [["Group", ["A","B","A","B","A","B","A","B","A"]], ["Key", ["x", "x", "x", "x", "y", "y", "y", "z", "z"]], ["Value", [1, 2, 3, 4, 5, 6, 7, 8, 9]]]
Test.specify "should cross_tab counts by default using first column as names" <|
t1 = table.cross_tab
t1.column_names . should_equal ["x", "y", "z"]
t1.row_count . should_equal 1
t1.column_count . should_equal 3
t1.at "x" . to_vector . should_equal [4]
t1.at "y" . to_vector . should_equal [3]
t1.at "z" . to_vector . should_equal [2]
Test.specify "should allow a different aggregate" <|
t1 = table.cross_tab values=[Sum "Value"]
t1.column_names . should_equal ["x", "y", "z"]
t1.row_count . should_equal 1
t1.column_count . should_equal 3
t1.at "x" . to_vector . should_equal [10]
t1.at "y" . to_vector . should_equal [18]
t1.at "z" . to_vector . should_equal [17]
Test.specify "should allow a custom expression for the aggregate" <|
t1 = table.cross_tab values=[Sum "[Value]*[Value]"]
t1.column_names . should_equal ["x", "y", "z"]
t1.row_count . should_equal 1
t1.column_count . should_equal 3
t1.at "x" . to_vector . should_equal [30]
t1.at "y" . to_vector . should_equal [110]
t1.at "z" . to_vector . should_equal [145]
@ -47,23 +49,23 @@ spec setup =
Test.specify "should allow a chosen column" <|
t = table_builder [["Group", ["A","B","A","B","A","B","A","B","A"]], ["Species", ["x", "x", "x", "x", "y", "y", "y", "z", "z"]], ["Value", [1, 2, 3, 4, 5, 6, 7, 8, 9]]]
t1 = t.cross_tab [] "Species"
t1.column_names . should_equal ["x", "y", "z"]
t1.row_count . should_equal 1
t1.column_count . should_equal 3
t1.at "x" . to_vector . should_equal [4]
t1.at "y" . to_vector . should_equal [3]
t1.at "z" . to_vector . should_equal [2]
t2 = t.cross_tab [] 1
t2.column_names . should_equal ["x", "y", "z"]
t2.row_count . should_equal 1
t2.column_count . should_equal 3
t2.at "x" . to_vector . should_equal [4]
t2.at "y" . to_vector . should_equal [3]
t2.at "z" . to_vector . should_equal [2]
Test.specify "should allow a grouping" <|
t1 = table2.cross_tab ["Group"] "Key"
t1.column_names . should_equal ["Group", "x", "y", "z"]
t1.row_count . should_equal 2
t1.column_count . should_equal 4
t1.at "Group" . to_vector . should_equal ["A", "B"]
t1.at "x" . to_vector . should_equal [2, 2]
t1.at "y" . to_vector . should_equal [2, 1]
@ -71,8 +73,8 @@ spec setup =
Test.specify "should allow a grouping by text" <|
t1 = table2.cross_tab "Group" "Key"
t1.column_names . should_equal ["Group", "x", "y", "z"]
t1.row_count . should_equal 2
t1.column_count . should_equal 4
t1.at "Group" . to_vector . should_equal ["A", "B"]
t1.at "x" . to_vector . should_equal [2, 2]
t1.at "y" . to_vector . should_equal [2, 1]
@ -80,8 +82,8 @@ spec setup =
Test.specify "should allow multiple values aggregates" <|
t1 = table.cross_tab values=[Count, Sum "Value"]
t1.row_count . should_equal 1
t1.column_names . should_equal ["x Count", "x Sum Value", "y Count", "y Sum Value", "z Count", "z Sum Value"]
t1.row_count . should_equal 1
t1.at "x Count" . to_vector . should_equal [4]
t1.at "x Sum Value" . to_vector . should_equal [10]
t1.at "y Count" . to_vector . should_equal [3]
@ -130,3 +132,30 @@ spec setup =
Test.specify "should not allow Group_By for values" <|
err1 = table.cross_tab [] "Key" values=[Count, Group_By "Value"] on_problems=Problem_Behavior.Ignore
err1.should_fail_with Illegal_Argument.Error
Test.specify "should gracefully handle duplicate aggregate names" pending="TODO: this should be fixed as part of https://github.com/enso-org/enso/issues/5151" <|
action = table.cross_tab [] "Key" values=[Count new_name="Agg1", Sum "Value" new_name="Agg1"]
tester table =
table.column_names . should_equal ["x Agg1", "x Agg1_1", "y Agg1", "y Agg1_1", "z Agg1", "z Agg1_1"]
problems = [Duplicate_Output_Column_Names.Error ["Agg1"]]
Problems.test_problem_handling action problems tester
Test.specify "should fail on invalid aggregations" <|
table = table_builder [["Key", ["x", "x", "x", "x", "y", "y", "y", "z", "z"]], ["TextValue", ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']], ["Value", [1, 2, 3, 4, 5, 6, 7, 8, 9]]]
[Problem_Behavior.Report_Error, Problem_Behavior.Report_Warning, Problem_Behavior.Ignore].each pb-> Test.with_clue "Problem_Behavior="+pb.to_text+" " <|
err = table.cross_tab [] "Key" values=[Average "TextValue"] on_problems=pb
case setup.is_database of
True ->
err.should_fail_with SQL_Error
False ->
err.should_fail_with Invalid_Aggregation
err.catch . should_equal (Invalid_Aggregation.Error "Average TextValue" [0, 4, 7] "Cannot convert to a number.")
[Problem_Behavior.Report_Error, Problem_Behavior.Report_Warning, Problem_Behavior.Ignore].each pb-> Test.with_clue "Problem_Behavior="+pb.to_text+" " <|
err = table.cross_tab [] "Key" values=[Average "Value", Sum "TextValue"] on_problems=pb
case setup.is_database of
True ->
err.should_fail_with SQL_Error
False ->
err.should_fail_with Invalid_Aggregation
err.catch . should_equal (Invalid_Aggregation.Error "Sum TextValue" [0, 4, 7] "Cannot convert to a number.")

View File

@ -20,6 +20,9 @@ import Standard.Test.Extensions
from project.Util import all
polyglot java import java.lang.IllegalArgumentException
polyglot java import org.enso.table.data.table.Table as Java_Table
type My
Data x y
@ -84,6 +87,9 @@ spec =
Table.from_rows [] [] . should_fail_with Illegal_Argument.Error
Table.from_rows [] [[]] . should_fail_with Illegal_Argument.Error
Test.specify "should be internally guarded against creating a table without columns" <|
Test.expect_panic_with (Java_Table.new []) IllegalArgumentException
Test.specify "should correctly infer storage types" <|
varied_type_table.at "strs" . storage_type . should_equal Storage.Text
varied_type_table.at "ints" . storage_type . should_equal Storage.Integer