AdRiley 2024-04-30 16:30:40 +03:00 committed by GitHub
parent 6655d5fbb2
commit d1bf4cb771
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 283 additions and 23 deletions

View File

@ -144,6 +144,30 @@ type Invalid_Aggregation
to_display_text self =
"The "+self.column+" could not be calculated at "+self.rows.short_display_text+": "+self.message
## Indicates that NaN values were found and ignored.
type Ignored_NaN_Values
    ## PRIVATE
    Warning (column:Text) (rows:(Vector Integer))

    ## PRIVATE

       Build the human-readable description of this warning, naming the
       column and the rows in which the NaN values were found.
    to_display_text : Text
    to_display_text self =
        affected_rows = self.rows.short_display_text
        "The column " + self.column + " contained NaN values in rows " + affected_rows + " which were ignored."
## Indicates that Nothing values were found and ignored.
type Ignored_Nothing_Values
    ## PRIVATE
    Warning (column:Text) (rows:(Vector Integer))

    ## PRIVATE

       Build the human-readable description of this warning, naming the
       column and the rows in which the Nothing values were found.
    to_display_text : Text
    to_display_text self =
        affected_rows = self.rows.short_display_text
        "The column " + self.column + " contained Nothing values in rows " + affected_rows + " which were ignored."
## Indicates that some operation relies on equality on floating-point values,
which is not recommended.
type Floating_Point_Equality

View File

@ -14,6 +14,8 @@ polyglot java import org.enso.table.data.column.operation.map.MapOperationProble
polyglot java import org.enso.table.data.table.problems.ArithmeticError
polyglot java import org.enso.table.data.table.problems.ArithmeticOverflow
polyglot java import org.enso.table.data.table.problems.FloatingPointGrouping
polyglot java import org.enso.table.data.table.problems.IgnoredNaN
polyglot java import org.enso.table.data.table.problems.IgnoredNothing
polyglot java import org.enso.table.data.table.problems.IllegalArgumentError
polyglot java import org.enso.table.data.table.problems.InvalidAggregation
polyglot java import org.enso.table.data.table.problems.UnquotedCharactersInOutput
@ -35,6 +37,10 @@ translate_problem p = case p of
Arithmetic_Error.Error p.getMessage
_ : IllegalArgumentError ->
Illegal_Argument.Error p.getMessage
_ : IgnoredNaN ->
Ignored_NaN_Values.Warning p.getLocationName (Vector.from_polyglot_array p.getRows)
_ : IgnoredNothing ->
Ignored_Nothing_Values.Warning p.getLocationName (Vector.from_polyglot_array p.getRows)
_ : FloatingPointGrouping ->
Floating_Point_Equality.Error p.getLocationName
_ : LossOfIntegerPrecision ->

View File

@ -0,0 +1,27 @@
package org.enso.table.data.table.problems;
/** A problem reporting that NaN values were found in a column and ignored. */
public class IgnoredNaN extends ColumnAggregatedProblem {
  /**
   * Creates the problem for a single row.
   *
   * @param locationName name reported in the message as the affected column
   * @param row row index passed on to the base class
   */
  public IgnoredNaN(String locationName, Integer row) {
    super(locationName, row);
  }

  /**
   * Absorbs the rows of {@code another} when it is also an {@code IgnoredNaN} reported for the
   * same location name.
   *
   * @param another the problem to try to merge into this one
   * @return {@code true} if the rows were absorbed, {@code false} if the problems do not match
   */
  @Override
  public boolean merge(ColumnAggregatedProblem another) {
    boolean mergeable =
        another instanceof IgnoredNaN
            && this.getLocationName().equals(another.getLocationName());
    if (!mergeable) {
      return false;
    }
    this.rows.addAll(another.rows);
    return true;
  }

  /** Returns the message naming the column and the (truncated) list of affected rows. */
  @Override
  public String getMessage() {
    String subject = "The column " + getLocationName();
    return subject
        + " contained NaN values in rows "
        + makeTruncatedRowsString()
        + " which were ignored.";
  }
}

View File

@ -0,0 +1,28 @@
package org.enso.table.data.table.problems;
/** A problem reporting that Nothing values were found in a column and ignored. */
public class IgnoredNothing extends ColumnAggregatedProblem {
  /**
   * Creates the problem for a single row.
   *
   * @param locationName name reported in the message as the affected column
   * @param row row index passed on to the base class
   */
  public IgnoredNothing(String locationName, Integer row) {
    super(locationName, row);
  }

  /**
   * Absorbs the rows of {@code another} when it is also an {@code IgnoredNothing} reported for
   * the same location name.
   *
   * @param another the problem to try to merge into this one
   * @return {@code true} if the rows were absorbed, {@code false} if the problems do not match
   */
  @Override
  public boolean merge(ColumnAggregatedProblem another) {
    boolean mergeable =
        another instanceof IgnoredNothing
            && this.getLocationName().equals(another.getLocationName());
    if (!mergeable) {
      return false;
    }
    this.rows.addAll(another.rows);
    return true;
  }

  /** Returns the message naming the column and the (truncated) list of affected rows. */
  @Override
  public String getMessage() {
    String subject = "The column " + getLocationName();
    return subject
        + " contained Nothing values in rows "
        + makeTruncatedRowsString()
        + " which were ignored.";
  }
}

View File

@ -26,6 +26,7 @@ public class AddRunning {
var ret =
new DoubleStorage(
runningGenerator.result, sourceColumn.getSize(), runningGenerator.isNothing);
return ret;
}
@ -72,16 +73,21 @@ public class AddRunning {
increment(value);
}
}
return !isInitialized ? null : getCurrent();
return isInitialized ? getCurrent() : null;
}
public void initialize(double value) {
@Override
public Double currentValue() {
return isInitialized ? getCurrent() : null;
}
protected void initialize(double value) {
current = value;
}
public abstract void increment(double value);
protected abstract void increment(double value);
public double getCurrent() {
protected double getCurrent() {
return current;
}
}

View File

@ -14,26 +14,42 @@ import org.enso.table.data.index.MultiValueIndex;
import org.enso.table.data.index.OrderedMultiValueKey;
import org.enso.table.data.index.UnorderedMultiValueKey;
import org.enso.table.data.table.Column;
import org.enso.table.data.table.problems.IgnoredNaN;
import org.enso.table.data.table.problems.IgnoredNothing;
import org.enso.table.problems.ColumnAggregatedProblemAggregator;
import org.enso.table.problems.ProblemAggregator;
import org.enso.table.util.ConstantList;
abstract class RunningGenerator {
Storage<?> sourceStorage;
Column sourceColumn;
long[] result;
BitSet isNothing;
ColumnAggregatedProblemAggregator columnAggregatedProblemAggregator;
RunningGenerator(Column sourceColumn) {
this.sourceStorage = sourceColumn.getStorage();
RunningGenerator(Column sourceColumn, ProblemAggregator problemAggregator) {
this.sourceColumn = sourceColumn;
result = new long[sourceColumn.getSize()];
isNothing = new BitSet();
columnAggregatedProblemAggregator = new ColumnAggregatedProblemAggregator(problemAggregator);
}
void calculateNextValue(int i, RunningIterator it) {
Object value = sourceStorage.getItemBoxed(i);
Object value = sourceColumn.getStorage().getItemBoxed(i);
if (value == null) {
columnAggregatedProblemAggregator.reportColumnAggregatedProblem(
new IgnoredNothing(sourceColumn.getName(), i));
}
Double dValue = NumericConverter.tryConvertingToDouble(value);
Double dNextValue = it.next(dValue);
Double dNextValue;
if (dValue != null && dValue.equals(Double.NaN)) {
columnAggregatedProblemAggregator.reportColumnAggregatedProblem(
new IgnoredNaN(sourceColumn.getName(), i));
dNextValue = it.currentValue();
} else {
dNextValue = it.next(dValue);
}
if (dNextValue == null) {
isNothing.set(i);
} else {
@ -59,9 +75,11 @@ abstract class RunningGenerator {
runningGenerator =
new GroupingNoOrderingRunning(sourceColumn, groupingColumns, problemAggregator);
} else if (orderingColumns.length > 0) {
runningGenerator = new NoGroupingOrderingRunning(sourceColumn, orderingColumns, directions);
runningGenerator =
new NoGroupingOrderingRunning(
sourceColumn, orderingColumns, directions, problemAggregator);
} else {
runningGenerator = new NoGroupingNoOrderingRunning(sourceColumn);
runningGenerator = new NoGroupingNoOrderingRunning(sourceColumn, problemAggregator);
}
return runningGenerator;
}
@ -69,8 +87,8 @@ abstract class RunningGenerator {
class NoGroupingNoOrderingRunning extends RunningGenerator {
NoGroupingNoOrderingRunning(Column sourceColumn) {
super(sourceColumn);
NoGroupingNoOrderingRunning(Column sourceColumn, ProblemAggregator problemAggregator) {
super(sourceColumn, problemAggregator);
}
@Override
@ -92,7 +110,7 @@ class GroupingNoOrderingRunning extends RunningGenerator {
public GroupingNoOrderingRunning(
Column sourceColumn, Column[] groupingColumns, ProblemAggregator problemAggregator) {
super(sourceColumn);
super(sourceColumn, problemAggregator);
this.groupingColumns = groupingColumns;
groupingStorages =
Arrays.stream(groupingColumns).map(Column::getStorage).toArray(Storage[]::new);
@ -120,8 +138,11 @@ class NoGroupingOrderingRunning extends RunningGenerator {
private final List<OrderedMultiValueKey> keys;
public NoGroupingOrderingRunning(
Column sourceColumn, Column[] orderingColumns, int[] directions) {
super(sourceColumn);
Column sourceColumn,
Column[] orderingColumns,
int[] directions,
ProblemAggregator problemAggregator) {
super(sourceColumn, problemAggregator);
int n = orderingColumns[0].getSize();
orderingStorages =
Arrays.stream(orderingColumns).map(Column::getStorage).toArray(Storage[]::new);
@ -158,7 +179,7 @@ class GroupingOrderingRunning extends RunningGenerator {
Column[] orderingColumns,
int[] directions,
ProblemAggregator problemAggregator) {
super(sourceColumn);
super(sourceColumn, problemAggregator);
this.groupingColumns = groupingColumns;
this.orderingColumns = orderingColumns;
this.directions = directions;

View File

@ -3,4 +3,6 @@ package org.enso.table.operations;
/**
 * Produces the values of a running statistic one input at a time.
 *
 * <p>RunningGenerator drives an instance row by row: ordinary values go through {@link
 * #next(Double)}, while NaN inputs are skipped and {@link #currentValue()} is stored instead.
 */
public interface RunningIterator {
/**
 * Consumes the next input and returns the running value after it.
 *
 * @param value the next input; the caller passes the cell converted to a Double, so this is
 *     presumably null for a Nothing cell (verify against NumericConverter)
 * @return the running value, or null, in which case the caller marks the output row as Nothing
 */
Double next(Double value);
/**
 * Returns the running value without consuming an input.
 *
 * @return the current running value; the implementation visible in AddRunning returns null while
 *     it has not been initialized yet
 */
Double currentValue();
}

View File

@ -217,7 +217,9 @@ add_specs suite_builder =
# 4 | SG0456 | E | 73.77 | 5
expected_table = table.zip expected_column
result.should_equal expected_table
group_builder.specify "Running sum works ignores nothing values" <|
warnings = Problems.get_attached_warnings result
warnings.not_empty . should_be_false
group_builder.specify "Running sum works ignores nothing values and warns" <|
result = table.running Statistic.Sum "Ticket Price"
expected_column = Column.from_vector "Running Sum of Ticket Price" [100.5, 676.49, 676.49, 676.49, 750.26]
# | Flight | Passenger | Ticket Price | Running Sum of Ticket Price
@ -226,10 +228,27 @@ add_specs suite_builder =
# 1 | BA0123 | B | 575.99 | 676.49
# 2 | SG0456 | A | Nothing | 676.49
# 3 | BA0123 | C | Nothing | 676.49
# 4 | SG0456 | E | 73.77 | 649.76
# 4 | SG0456 | E | 73.77 | 750.26
expected_table = table.zip expected_column
result.should_equal expected_table
group_builder.specify "Running min ignores nothing values and works with grouping" <|
w = Problems.expect_warning Ignored_Nothing_Values result
w.column.should_equal "Ticket Price"
w.rows.should_equal [2, 3]
group_builder.specify "Running sum works ignores nothing values and warnings can be hushed" <|
result = table.running Statistic.Sum "Ticket Price" on_problems=..Ignore
expected_column = Column.from_vector "Running Sum of Ticket Price" [100.5, 676.49, 676.49, 676.49, 750.26]
# | Flight | Passenger | Ticket Price | Running Sum of Ticket Price
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 676.49
# 2 | SG0456 | A | Nothing | 676.49
# 3 | BA0123 | C | Nothing | 676.49
# 4 | SG0456 | E | 73.77 | 750.26
expected_table = table.zip expected_column
result.should_equal expected_table
warnings = Problems.get_attached_warnings result
warnings.not_empty . should_be_false
group_builder.specify "Running min ignores nothing values and works with grouping and warns" <|
result = table.running Statistic.Minimum "Ticket Price" "Running" ["Flight"]
expected_column = Column.from_vector "Running" [100.5, 100.5, Nothing, 100.5, 73.77]
# | Flight | Passenger | Ticket Price | Running
@ -241,7 +260,10 @@ add_specs suite_builder =
# 4 | SG0456 | E | 73.77 | 73.77
expected_table = table.zip expected_column
result.should_equal expected_table
group_builder.specify "Running max ignores nothing values and works with grouping" <|
w = Problems.expect_warning Ignored_Nothing_Values result
w.column.should_equal "Ticket Price"
w.rows.should_equal [2, 3]
group_builder.specify "Running max ignores nothing values and works with grouping and warns" <|
result = table.running Statistic.Maximum "Ticket Price" "Running" ["Flight"]
expected_column = Column.from_vector "Running" [100.5, 575.99, Nothing, 575.99, 73.77]
# | Flight | Passenger | Ticket Price | Running
@ -253,7 +275,10 @@ add_specs suite_builder =
# 4 | SG0456 | E | 73.77 | 73.77
expected_table = table.zip expected_column
result.should_equal expected_table
group_builder.specify "Running mean ignores nothing values" <|
w = Problems.expect_warning Ignored_Nothing_Values result
w.column.should_equal "Ticket Price"
w.rows.should_equal [2, 3]
group_builder.specify "Running mean ignores nothing values and warns" <|
result = table.running Statistic.Mean "Ticket Price" "Running"
expected_column = Column.from_vector "Running" [100.5, 338.245, 338.245, 338.245, 250.08666666666667]
# | Flight | Passenger | Ticket Price | Running
@ -265,7 +290,10 @@ add_specs suite_builder =
# 4 | SG0456 | E | 73.77 | 250.08666666666667
expected_table = table.zip expected_column
result.should_equal expected_table
group_builder.specify "Running mean ignores nothing values and works when first value is Nothing" <|
w = Problems.expect_warning Ignored_Nothing_Values result
w.column.should_equal "Ticket Price"
w.rows.should_equal [2, 3]
group_builder.specify "Running mean ignores nothing values and works when first value is Nothing and warns" <|
result = table.running Statistic.Mean "Ticket Price" "Running" ["Flight"]
expected_column = Column.from_vector "Running" [100.5, 338.245, Nothing, 338.245, 73.77]
# | Flight | Passenger | Ticket Price | Running
@ -277,6 +305,124 @@ add_specs suite_builder =
# 4 | SG0456 | E | 73.77 | 73.77
expected_table = table.zip expected_column
result.should_equal expected_table
w = Problems.expect_warning Ignored_Nothing_Values result
w.column.should_equal "Ticket Price"
w.rows.should_equal [2, 3]
suite_builder.group "NaN handling" group_builder->
# | Flight | Passenger | Ticket Price
#---+--------+-----------+--------------
# 0 | BA0123 | A | 100.5
# 1 | BA0123 | B | 575.99
# 2 | SG0456 | A | NaN
# 3 | BA0123 | C | NaN
# 4 | SG0456 | E | 73.77
flight = ["Flight", ["BA0123", "BA0123", "SG0456", "BA0123", "SG0456"]]
passenger = ["Passenger", ["A", "B", "A", "C", "E"]]
ticket_price = ["Ticket Price", [100.50, 575.99, Number.nan, Number.nan, 73.77]]
table = Table.new [flight, passenger, ticket_price]
# The count runs over "Passenger", so the NaN entries in "Ticket Price" must not change it or attach a warning.
group_builder.specify "Running count doesn't care about NaN values" <|
result = table.running Statistic.Count "Passenger"
expected_column = Column.from_vector "Running Count of Passenger" [1, 2, 3, 4, 5]
# | Flight | Passenger | Ticket Price | Running Count of Passenger
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 1
# 1 | BA0123 | B | 575.99 | 2
# 2 | SG0456 | A | NaN | 3
# 3 | BA0123 | C | NaN | 4
# 4 | SG0456 | E | 73.77 | 5
expected_table = table.zip expected_column
result.should_equal expected_table
warnings = Problems.get_attached_warnings result
warnings.not_empty . should_be_false
group_builder.specify "Running sum works ignores NaN values and warns" <|
result = table.running Statistic.Sum "Ticket Price"
expected_column = Column.from_vector "Running Sum of Ticket Price" [100.5, 676.49, 676.49, 676.49, 750.26]
# | Flight | Passenger | Ticket Price | Running Sum of Ticket Price
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 676.49
# 2 | SG0456 | A | NaN | 676.49
# 3 | BA0123 | C | NaN | 676.49
# 4 | SG0456 | E | 73.77 | 750.26
expected_table = table.zip expected_column
result.should_equal expected_table
w = Problems.expect_warning Ignored_NaN_Values result
w.column.should_equal "Ticket Price"
w.rows.should_equal [2, 3]
group_builder.specify "Running sum works ignores NaN values and warnings can be hushed" <|
result = table.running Statistic.Sum "Ticket Price" on_problems=..Ignore
expected_column = Column.from_vector "Running Sum of Ticket Price" [100.5, 676.49, 676.49, 676.49, 750.26]
# | Flight | Passenger | Ticket Price | Running Sum of Ticket Price
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 676.49
# 2 | SG0456 | A | NaN | 676.49
# 3 | BA0123 | C | NaN | 676.49
# 4 | SG0456 | E | 73.77 | 750.26
expected_table = table.zip expected_column
result.should_equal expected_table
warnings = Problems.get_attached_warnings result
warnings.not_empty . should_be_false
group_builder.specify "Running min ignores NaN values and works with grouping and warns" <|
result = table.running Statistic.Minimum "Ticket Price" "Running" ["Flight"]
expected_column = Column.from_vector "Running" [100.5, 100.5, Number.nan, 100.5, 73.77]
# | Flight | Passenger | Ticket Price | Running
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 100.5
# 2 | SG0456 | A | NaN | NaN
# 3 | BA0123 | C | NaN | 100.5
# 4 | SG0456 | E | 73.77 | 73.77
expected_table = table.zip expected_column
result.should_equal expected_table
w = Problems.expect_warning Ignored_NaN_Values result
w.column.should_equal "Ticket Price"
w.rows.should_equal [2, 3]
group_builder.specify "Running max ignores NaN values and works with grouping and warns" <|
result = table.running Statistic.Maximum "Ticket Price" "Running" ["Flight"]
expected_column = Column.from_vector "Running" [100.5, 575.99, Number.nan, 575.99, 73.77]
# | Flight | Passenger | Ticket Price | Running
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 575.99
# 2 | SG0456 | A | NaN | NaN
# 3 | BA0123 | C | NaN | 575.99
# 4 | SG0456 | E | 73.77 | 73.77
expected_table = table.zip expected_column
result.should_equal expected_table
w = Problems.expect_warning Ignored_NaN_Values result
w.column.should_equal "Ticket Price"
w.rows.should_equal [2, 3]
group_builder.specify "Running mean ignores NaN values and warns" <|
result = table.running Statistic.Mean "Ticket Price" "Running"
expected_column = Column.from_vector "Running" [100.5, 338.245, 338.245, 338.245, 250.08666666666667]
# | Flight | Passenger | Ticket Price | Running
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 338.245
# 2 | SG0456 | A | NaN | 338.245
# 3 | BA0123 | C | NaN | 338.245
# 4 | SG0456 | E | 73.77 | 250.08666666666667
expected_table = table.zip expected_column
result.should_equal expected_table
w = Problems.expect_warning Ignored_NaN_Values result
w.column.should_equal "Ticket Price"
w.rows.should_equal [2, 3]
group_builder.specify "Running mean ignores NaN values and works when first value is NaN and warns" <|
result = table.running Statistic.Mean "Ticket Price" "Running" ["Flight"]
expected_column = Column.from_vector "Running" [100.5, 338.245, Number.nan, 338.245, 73.77]
# | Flight | Passenger | Ticket Price | Running
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 338.245
# 2 | SG0456 | A | NaN | NaN
# 3 | BA0123 | C | NaN | 338.245
# 4 | SG0456 | E | 73.77 | 73.77
expected_table = table.zip expected_column
result.should_equal expected_table
w = Problems.expect_warning Ignored_NaN_Values result
w.column.should_equal "Ticket Price"
w.rows.should_equal [2, 3]
suite_builder.group "different types" group_builder->
# | Flight | Passenger | Ticket Price
#---+--------+-----------+--------------