Add table running functionality for Sum, Mean, Min, Max. (#9577)

* Add Table.Running

* Code Review fixes

* Code Review changes

* Change null handling
This commit is contained in:
AdRiley 2024-04-23 09:45:43 +01:00 committed by GitHub
parent d665f4d9c2
commit 4a97bfa31f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 665 additions and 15 deletions

View File

@ -654,6 +654,7 @@
- [Added `Decimal.floor`, `.ceil`, and `.trunc`.][9694]
- [Added `recursive` option to `File.delete`.][9719]
- [Added `Vector.build`.][9725]
- [Added `Table.running` method.][9577]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -958,6 +959,7 @@
[9716]: https://github.com/enso-org/enso/pull/9716
[9719]: https://github.com/enso-org/enso/pull/9719
[9725]: https://github.com/enso-org/enso/pull/9725
[9577]: https://github.com/enso-org/enso/pull/9577
#### Enso Compiler

View File

@ -23,6 +23,7 @@ polyglot java import java.lang.NullPointerException
polyglot java import org.enso.base.CompareException
polyglot java import org.enso.base.statistics.CorrelationStatistics
polyglot java import org.enso.base.statistics.Rank
polyglot java import org.enso.base.statistics.Statistic as Java_Statistic
## Specifies how to handle ranking of equal values.
type Rank_Method
@ -137,6 +138,22 @@ type Statistic
Statistic.Kurtosis -> 4
_ -> Nothing
## PRIVATE
   Converts this statistic into the constant of the same name in the Java
   `org.enso.base.statistics.Statistic` enum. Any arguments carried by the
   Enso constructor (e.g. the series of `Covariance`) are dropped.
to_java self = case self of
    Statistic.Count -> Java_Statistic.Count
    Statistic.Minimum -> Java_Statistic.Minimum
    Statistic.Maximum -> Java_Statistic.Maximum
    Statistic.Sum -> Java_Statistic.Sum
    Statistic.Mean -> Java_Statistic.Mean
    Statistic.Variance _ -> Java_Statistic.Variance
    # The Java enum constant is spelled with an underscore (`Standard_Deviation`).
    Statistic.Standard_Deviation _ -> Java_Statistic.Standard_Deviation
    Statistic.Skew _ -> Java_Statistic.Skew
    Statistic.Kurtosis -> Java_Statistic.Kurtosis
    Statistic.Covariance _ -> Java_Statistic.Covariance
    Statistic.Pearson _ -> Java_Statistic.Pearson
    Statistic.Spearman _ -> Java_Statistic.Spearman
    Statistic.R_Squared _ -> Java_Statistic.R_Squared
## PRIVATE
Compute a single statistic on a vector like object.
@ -191,14 +208,7 @@ type Statistic
- statistics: Set of statistics to calculate.
running_bulk : Vector -> Vector Statistic -> Vector Any
running_bulk data statistics=[Statistic.Count, Statistic.Sum] =
is_unsupported s = case s of
Statistic.Covariance _ -> True
Statistic.Pearson _ -> True
Statistic.Spearman _ -> True
Statistic.R_Squared _ -> True
_ -> False
if statistics.any is_unsupported then Error.throw (Illegal_Argument.Error ("Unsupported Statistics ( " + (statistics.filter is_unsupported . to_text) ") for running calculations.")) else
check_running_support statistics <|
moment_order = statistics.map on_problems=No_Wrap .order
has_min_max = statistics.any (s-> s == Statistic.Minimum || s == Statistic.Maximum)
max_moment_order = moment_order.filter (v-> v != Nothing) . fold 0 .max
@ -270,6 +280,21 @@ type Statistic
rank_data input method=Rank_Method.Average =
method.compute input
## PRIVATE
   Guards a running calculation.

   Evaluates `action` only when none of the requested statistics is
   `Covariance`, `Pearson`, `Spearman` or `R_Squared`. Otherwise an
   `Illegal_Argument` error listing the rejected statistics is raised and
   `action` is never evaluated.
check_running_support : Vector Statistic -> Any -> Any
check_running_support statistics ~action =
    is_unsupported stat = case stat of
        Statistic.Covariance _ -> True
        Statistic.Pearson _ -> True
        Statistic.Spearman _ -> True
        Statistic.R_Squared _ -> True
        _ -> False
    rejected = statistics.filter is_unsupported
    if rejected.is_empty then action else
        Error.throw (Illegal_Argument.Error ("Unsupported Statistics ( " + rejected.to_text + ") for running calculations."))
## PRIVATE
wrap_java_call : Any -> Any
wrap_java_call ~function =

View File

@ -0,0 +1,44 @@
from Standard.Base import all
import Standard.Base.Errors.Common.Unsupported_Argument_Types
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import project.Column.Column
import project.Set_Mode.Set_Mode
import project.Sort_Column.Sort_Column
import project.Table.Table
import project.Internal.Add_Row_Number
import project.Internal.Java_Problems
import project.Internal.Problem_Builder.Problem_Builder
import project.Internal.Table_Helpers
from project.Errors import Duplicate_Output_Column_Names
import project.Value_Type.Value_Type
polyglot java import java.lang.ArithmeticException
polyglot java import org.enso.table.data.column.storage.numeric.LongRangeStorage
polyglot java import org.enso.table.operations.AddRunning
## PRIVATE
   Adds a column holding a running `statistic` of the `of` column to `table`.

   Statistics rejected by `check_running_support` raise an error before any
   work is done. `Statistic.Count` is delegated to
   `Add_Row_Number.add_row_number`; every other statistic requires `of` to be
   a numeric column and is computed by the Java `AddRunning.create_running`
   helper, optionally per group (`group_by`) and in a given order
   (`order_by`).

   When `as` is empty the new column is named
   `'Running ' + statistic.to_text + ' of ' + of_col.name`.
add_running : Table -> Statistic -> (Text | Integer) -> Text -> Vector (Text | Integer | Regex) | Text | Integer | Regex -> Vector (Text | Sort_Column) | Text -> Problem_Behavior -> Table
add_running table (statistic:Statistic=Statistic.Count) (of:Text|Integer=0) (as:Text='') (group_by:(Vector | Text | Integer | Regex)=[]) (order_by:(Vector | Text)=[]) (on_problems:Problem_Behavior=Problem_Behavior.Report_Warning) =
    check_running_support [statistic] <|
        of_col = table.at of
        new_name = if as.is_empty then 'Running ' + statistic.to_text + ' of ' + of_col.name else as
        case statistic of
            Statistic.Count ->
                # A running count is a row number starting at 1 with step 1.
                Add_Row_Number.add_row_number table new_name 1 1 group_by order_by on_problems
            _ ->
                Value_Type.expect_numeric of_col <|
                    # NOTE(review): problems collected in `problem_builder` are not
                    # attached or raised anywhere in this function - confirm intended.
                    problem_builder = Problem_Builder.new error_on_missing_columns=True
                    grouping_columns = table.columns_helper.select_columns_helper group_by Case_Sensitivity.Default True problem_builder
                    ordering = Table_Helpers.resolve_order_by table.columns order_by problem_builder
                    source_java_column = of_col.java_column
                    grouping_java_columns = grouping_columns.map .java_column
                    ordering_java_columns = ordering.map c->
                        c.column.java_column
                    # One sign per ordering column; the Java side requires both
                    # arrays to have the same length.
                    directions = ordering.map c->
                        c.associated_selector.direction.to_sign
                    Java_Problems.with_problem_aggregator on_problems java_problem_aggregator->
                        new_storage = AddRunning.create_running statistic.to_java source_java_column grouping_java_columns ordering_java_columns directions java_problem_aggregator
                        new_column = Column.from_storage new_name new_storage
                        table.set new_column new_name set_mode=Set_Mode.Add

View File

@ -31,6 +31,7 @@ import project.Expression.Expression
import project.Expression.Expression_Error
import project.Extensions.Table_Conversions
import project.Internal.Add_Row_Number
import project.Internal.Add_Running
import project.Internal.Aggregate_Column_Helper
import project.Internal.Column_Naming_Helper.Column_Naming_Helper
import project.Internal.Constant_Column.Constant_Column
@ -2867,8 +2868,7 @@ type Table
transformer col = col.text_replace resolved_term resolved_new_text case_sensitivity only_first
Table_Helpers.replace_columns_with_transformed_columns self columns transformer
## PRIVATE
ALIAS cumulative
## ALIAS cumulative
GROUP Standard.Base.Values
ICON dataframe_map_column
Adds a new column to the table with a running calculation.
@ -2905,10 +2905,7 @@ type Table
@of Widget_Helpers.make_column_name_selector
running : Statistic -> (Text | Integer) -> Text -> Vector (Text | Integer | Regex) | Text | Integer | Regex -> Vector (Text | Sort_Column) | Text -> Problem_Behavior -> Table
running self (statistic:Statistic=Statistic.Count) (of:(Text | Integer)=0) (as:Text='') (group_by:(Vector | Text | Integer | Regex)=[]) (order_by:(Vector | Text)=[]) (on_problems:Problem_Behavior=Problem_Behavior.Report_Warning) =
if statistic != Statistic.Count then Error.throw (Illegal_Argument.Error ("Currently only Statistic.Count is supported in Table.running.")) else
of_col = self.at of
new_name = if as == '' then 'Running ' + statistic.to_text + ' of ' + of_col.name else as
Add_Row_Number.add_row_number self new_name 1 1 group_by order_by on_problems
Add_Running.add_running self statistic of as group_by order_by on_problems
## PRIVATE
column_naming_helper : Column_Naming_Helper

View File

@ -0,0 +1,17 @@
package org.enso.base.statistics;
/**
 * Names of the statistics that can be requested from Java code.
 *
 * <p>Each constant identifies a statistic by name only; it carries no parameters.
 */
public enum Statistic {
Count,
Minimum,
Maximum,
Sum,
Mean,
Variance,
Standard_Deviation,
Skew,
Kurtosis,
Covariance,
Pearson,
Spearman,
R_Squared
}

View File

@ -0,0 +1,134 @@
package org.enso.table.operations;
import org.enso.base.statistics.Statistic;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.numeric.DoubleStorage;
import org.enso.table.data.table.Column;
import org.enso.table.problems.ProblemAggregator;
/** Computes running (cumulative) statistics over a column, optionally grouped and ordered. */
public class AddRunning {

  /**
   * Builds a storage holding the running {@code statistic} of {@code sourceColumn}.
   *
   * @param statistic the statistic to accumulate; only Sum, Mean, Minimum and Maximum have an
   *     iterator, anything else fails with {@link IllegalArgumentException} as soon as an
   *     iterator is requested
   * @param sourceColumn the column whose values are accumulated
   * @param groupingColumns columns defining independent groups (may be empty)
   * @param orderingColumns columns defining the visiting order (may be empty)
   * @param directions one direction per ordering column
   * @param problemAggregator receives problems reported while grouping
   * @return a double storage with one entry per row of {@code sourceColumn}
   * @throws IllegalArgumentException if {@code orderingColumns} and {@code directions} differ in
   *     length
   */
  public static Storage<Double> create_running(
      Statistic statistic,
      Column sourceColumn,
      Column[] groupingColumns,
      Column[] orderingColumns,
      int[] directions,
      ProblemAggregator problemAggregator) {
    if (orderingColumns.length != directions.length) {
      throw new IllegalArgumentException(
          "The number of ordering columns and directions must be the same.");
    }

    RunningGenerator generator =
        RunningGenerator.createGenerator(
            sourceColumn, groupingColumns, orderingColumns, directions, problemAggregator);
    RunningIteratorFactory iteratorFactory = new RunningIteratorFactoryImpl(statistic);
    generator.generate(iteratorFactory);

    // The generator stores each double as its raw long bits, which is what DoubleStorage takes.
    return new DoubleStorage(generator.result, sourceColumn.getSize(), generator.isNothing);
  }

  /** Hands out a fresh accumulator for the configured statistic on every call. */
  private static class RunningIteratorFactoryImpl implements RunningIteratorFactory {
    Statistic statistic;

    RunningIteratorFactoryImpl(Statistic statistic) {
      this.statistic = statistic;
    }

    @Override
    public RunningIterator getIterator() {
      return switch (statistic) {
        case Sum -> new RunningSumIterator();
        case Mean -> new RunningMeanIterator();
        case Minimum -> new RunningMinIterator();
        case Maximum -> new RunningMaxIterator();
        default -> throw new IllegalArgumentException("Unsupported statistic: " + statistic);
      };
    }
  }

  /**
   * Shared skeleton of the accumulators: null inputs are skipped, the first non-null input
   * initializes the state, and later non-null inputs update it. Until a non-null input has been
   * seen the result is null.
   */
  private abstract static class RunningIteratorBase implements RunningIterator {
    protected double current;
    private boolean isInitialized = false;

    @Override
    public Double next(Double value) {
      if (value != null) {
        if (isInitialized) {
          increment(value);
        } else {
          isInitialized = true;
          initialize(value);
        }
      }
      if (!isInitialized) {
        return null;
      }
      return getCurrent();
    }

    /** Seeds the state with the first non-null value. */
    public void initialize(double value) {
      current = value;
    }

    /** Folds a subsequent non-null value into the state. */
    public abstract void increment(double value);

    /** Returns the statistic for the values seen so far. */
    public double getCurrent() {
      return current;
    }
  }

  /** Running total. */
  private static class RunningSumIterator extends RunningIteratorBase {
    @Override
    public void increment(double value) {
      current += value;
    }
  }

  /** Running arithmetic mean; {@code current} holds the total and is divided on read. */
  private static class RunningMeanIterator extends RunningIteratorBase {
    private int currentCount;

    @Override
    public void initialize(double value) {
      current = value;
      currentCount = 1;
    }

    @Override
    public void increment(double value) {
      current += value;
      currentCount++;
    }

    @Override
    public double getCurrent() {
      return current / currentCount;
    }
  }

  /** Running minimum. */
  private static class RunningMinIterator extends RunningIteratorBase {
    @Override
    public void increment(double value) {
      current = Math.min(current, value);
    }
  }

  /** Running maximum. */
  private static class RunningMaxIterator extends RunningIteratorBase {
    @Override
    public void increment(double value) {
      current = Math.max(current, value);
    }
  }
}

View File

@ -0,0 +1,194 @@
package org.enso.table.operations;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.IntStream;
import org.enso.base.polyglot.NumericConverter;
import org.enso.base.text.TextFoldingStrategy;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.index.MultiValueIndex;
import org.enso.table.data.index.OrderedMultiValueKey;
import org.enso.table.data.index.UnorderedMultiValueKey;
import org.enso.table.data.table.Column;
import org.enso.table.problems.ColumnAggregatedProblemAggregator;
import org.enso.table.problems.ProblemAggregator;
import org.enso.table.util.ConstantList;
/**
 * Base class for the strategies that walk a source column and record a running value per row.
 *
 * <p>Results are written into {@link #result} as the raw long bits of a double; rows for which
 * the iterator produced no value are flagged in {@link #isNothing}.
 */
abstract class RunningGenerator {
  Storage<?> sourceStorage;
  long[] result;
  BitSet isNothing;

  RunningGenerator(Column sourceColumn) {
    this.sourceStorage = sourceColumn.getStorage();
    this.result = new long[sourceColumn.getSize()];
    this.isNothing = new BitSet();
  }

  /**
   * Feeds row {@code i} of the source column to {@code it} and stores the iterator's answer at
   * position {@code i}.
   */
  void calculateNextValue(int i, RunningIterator it) {
    Double input = NumericConverter.tryConvertingToDouble(sourceStorage.getItemBoxed(i));
    Double output = it.next(input);
    if (output != null) {
      result[i] = Double.doubleToRawLongBits(output);
      return;
    }
    isNothing.set(i);
  }

  /** Subclasses implement this to decide the order in which rows are visited. */
  public abstract void generate(RunningIteratorFactory factory);

  /** Picks the generator matching which of grouping and ordering columns were supplied. */
  public static RunningGenerator createGenerator(
      Column sourceColumn,
      Column[] groupingColumns,
      Column[] orderingColumns,
      int[] directions,
      ProblemAggregator problemAggregator) {
    boolean grouped = groupingColumns.length > 0;
    boolean ordered = orderingColumns.length > 0;

    if (grouped && ordered) {
      return new GroupingOrderingRunning(
          sourceColumn, groupingColumns, orderingColumns, directions, problemAggregator);
    }
    if (grouped) {
      return new GroupingNoOrderingRunning(sourceColumn, groupingColumns, problemAggregator);
    }
    if (ordered) {
      return new NoGroupingOrderingRunning(sourceColumn, orderingColumns, directions);
    }
    return new NoGroupingNoOrderingRunning(sourceColumn);
  }
}
/** Visits every row in storage order with a single iterator. */
class NoGroupingNoOrderingRunning extends RunningGenerator {

  NoGroupingNoOrderingRunning(Column sourceColumn) {
    super(sourceColumn);
  }

  @Override
  public void generate(RunningIteratorFactory factory) {
    RunningIterator iterator = factory.getIterator();
    int rowCount = result.length;
    for (int row = 0; row < rowCount; row++) {
      calculateNextValue(row, iterator);
    }
  }
}
/**
 * Visits rows in storage order, keeping one iterator per distinct combination of grouping
 * values.
 */
class GroupingNoOrderingRunning extends RunningGenerator {
  private final Column[] groupingColumns;
  private final Storage<?>[] groupingStorages;
  private final ColumnAggregatedProblemAggregator groupingProblemAggregator;
  private final List<TextFoldingStrategy> textFoldingStrategy;
  private final Map<UnorderedMultiValueKey, RunningIterator> groups;

  public GroupingNoOrderingRunning(
      Column sourceColumn, Column[] groupingColumns, ProblemAggregator problemAggregator) {
    super(sourceColumn);
    this.groupingColumns = groupingColumns;
    this.groupingStorages =
        Arrays.stream(groupingColumns).map(Column::getStorage).toArray(Storage[]::new);
    this.groupingProblemAggregator = new ColumnAggregatedProblemAggregator(problemAggregator);
    this.textFoldingStrategy =
        ConstantList.make(TextFoldingStrategy.unicodeNormalizedFold, groupingStorages.length);
    this.groups = new HashMap<>();
  }

  /** Name of the grouping column at {@code columnIx}, used when reporting problems. */
  private String groupingColumnName(int columnIx) {
    return groupingColumns[columnIx].getName();
  }

  @Override
  public void generate(RunningIteratorFactory factory) {
    int rowCount = result.length;
    for (int row = 0; row < rowCount; row++) {
      UnorderedMultiValueKey groupKey =
          new UnorderedMultiValueKey(groupingStorages, row, textFoldingStrategy);
      groupKey.checkAndReportFloatingEquality(
          groupingProblemAggregator, columnIx -> groupingColumnName(columnIx));
      // The first row of a group creates that group's iterator; later rows reuse it.
      RunningIterator groupIterator =
          groups.computeIfAbsent(groupKey, unused -> factory.getIterator());
      calculateNextValue(row, groupIterator);
    }
  }
}
/** Visits all rows with a single iterator, in the order given by the ordering columns. */
class NoGroupingOrderingRunning extends RunningGenerator {
  private final Storage<?>[] orderingStorages;
  private final List<OrderedMultiValueKey> keys;

  public NoGroupingOrderingRunning(
      Column sourceColumn, Column[] orderingColumns, int[] directions) {
    super(sourceColumn);
    int n = orderingColumns[0].getSize();
    orderingStorages =
        Arrays.stream(orderingColumns).map(Column::getStorage).toArray(Storage[]::new);

    // One key per row, then sorted by the keys' natural ordering.
    List<OrderedMultiValueKey> sortedKeys = new ArrayList<>(n);
    IntStream.range(0, n)
        .mapToObj(row -> new OrderedMultiValueKey(orderingStorages, row, directions))
        .forEach(sortedKeys::add);
    sortedKeys.sort(null);
    keys = sortedKeys;
  }

  @Override
  public void generate(RunningIteratorFactory factory) {
    RunningIterator iterator = factory.getIterator();
    for (OrderedMultiValueKey key : keys) {
      calculateNextValue(key.getRowIndex(), iterator);
    }
  }
}
/**
 * Splits rows into groups by the grouping columns, then visits each group's rows in the order
 * given by the ordering columns, using a fresh iterator per group.
 */
class GroupingOrderingRunning extends RunningGenerator {
  private final Column[] groupingColumns;
  private final Column[] orderingColumns;
  private final int[] directions;
  private final Storage<?>[] orderingStorages;
  private final ProblemAggregator problemAggregator;

  public GroupingOrderingRunning(
      Column sourceColumn,
      Column[] groupingColumns,
      Column[] orderingColumns,
      int[] directions,
      ProblemAggregator problemAggregator) {
    super(sourceColumn);
    this.groupingColumns = groupingColumns;
    this.orderingColumns = orderingColumns;
    this.directions = directions;
    // Grouping works on the columns directly (see generate), so only the ordering storages
    // need to be extracted here.
    this.orderingStorages =
        Arrays.stream(orderingColumns).map(Column::getStorage).toArray(Storage[]::new);
    this.problemAggregator = problemAggregator;
  }

  @Override
  public void generate(RunningIteratorFactory factory) {
    int n = orderingColumns[0].getSize();
    var groupIndex =
        MultiValueIndex.makeUnorderedIndex(
            groupingColumns, n, TextFoldingStrategy.unicodeNormalizedFold, problemAggregator);
    for (List<Integer> indices : groupIndex.mapping().values()) {
      // Sort this group's rows by their ordering key (natural ordering of the keys).
      List<OrderedMultiValueKey> orderingKeys =
          new ArrayList<>(
              indices.stream()
                  .map(i -> new OrderedMultiValueKey(orderingStorages, i, directions))
                  .toList());
      orderingKeys.sort(null);

      // Each group accumulates independently.
      RunningIterator it = factory.getIterator();
      for (OrderedMultiValueKey key : orderingKeys) {
        calculateNextValue(key.getRowIndex(), it);
      }
    }
  }
}

View File

@ -0,0 +1,6 @@
package org.enso.table.operations;
/** Stateful accumulator that is fed one value per row and reports a running result. */
public interface RunningIterator {
/**
 * Consumes the next value and returns the running result after it.
 *
 * @param value the next value; callers in this package pass null for rows that have no numeric
 *     value
 * @return the running result, or null when there is no result for this row
 */
Double next(Double value);
}

View File

@ -0,0 +1,6 @@
package org.enso.table.operations;
/** Supplies {@link RunningIterator} instances to the running generators. */
public interface RunningIteratorFactory {
/** Returns an iterator to accumulate over one sequence of rows (e.g. one group). */
RunningIterator getIterator();
}

View File

@ -3,6 +3,7 @@ from Standard.Table import Column, Table
from Standard.Test import all
from Standard.Table.Errors import all
import Standard.Base.Errors.Common.Type_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
from project.Util import all
@ -26,7 +27,7 @@ type Data
Data.Value make_table
add_specs suite_builder =
suite_builder.group "running" group_builder->
suite_builder.group "running count" group_builder->
data = Data.setup
group_builder.specify "Defaults add running count of first column" <|
result = data.table.running
@ -88,6 +89,230 @@ add_specs suite_builder =
# 4 | SG0456 | E | 73.77 | 1
expected_table = data.table.zip expected_column
result.should_equal expected_table
group_builder.specify "Can provide running count based on order by without grouping" <|
result = data.table.running Statistic.Count "Passenger" "Ranked ticket cost" [] ["Ticket Price"]
expected_column = Column.from_vector "Ranked ticket cost" [3, 5, 1, 4, 2]
# | Flight | Passenger | Ticket Price | Ranked ticket cost
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 3
# 1 | BA0123 | B | 575.99 | 5
# 2 | SG0456 | A | 73.23 | 1
# 3 | BA0123 | C | 112.34 | 4
# 4 | SG0456 | E | 73.77 | 2
expected_table = data.table.zip expected_column
result.should_equal expected_table
suite_builder.group "running sum" group_builder->
data = Data.setup
group_builder.specify "Not setting the as name gives default name based on of column" <|
result = data.table.running Statistic.Sum "Ticket Price"
expected_column = Column.from_vector "Running Sum of Ticket Price" [100.5, 676.49, 749.72, 862.0600000000001, 935.83]
# | Flight | Passenger | Ticket Price | Running Sum of Ticket Price
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 676.49
# 2 | SG0456 | A | 73.23 | 749.72
# 3 | BA0123 | C | 112.34 | 862.06
# 4 | SG0456 | E | 73.77 | 935.83
expected_table = data.table.zip expected_column
result.should_equal expected_table
group_builder.specify "Can group by and provide running sum per group" <|
result = data.table.running Statistic.Sum "Ticket Price" "Running" ["Flight"]
expected_column = Column.from_vector "Running" [100.5, 676.49, 73.23, 788.83, 147]
# | Flight | Passenger | Ticket Price | Running
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 676.49
# 2 | SG0456 | A | 73.23 | 73.23
# 3 | BA0123 | C | 112.34 | 788.83
# 4 | SG0456 | E | 73.77 | 147
expected_table = data.table.zip expected_column
result.should_equal expected_table
group_builder.specify "Can group by and provide running sum per group based on order by" <|
result = data.table.running Statistic.Sum "Ticket Price" "Sum ticket cost per pass" ["Passenger"] ["Ticket Price"]
expected_column = Column.from_vector "Sum ticket cost per pass" [173.73000000000002, 575.99, 73.23, 112.34, 73.77]
# | Flight | Passenger | Ticket Price | Sum ticket cost per pass
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 173.73
# 1 | BA0123 | B | 575.99 | 575.99
# 2 | SG0456 | A | 73.23 | 73.23
# 3 | BA0123 | C | 112.34 | 112.34
# 4 | SG0456 | E | 73.77 | 73.77
expected_table = data.table.zip expected_column
result.should_equal expected_table
group_builder.specify "Can provide running sum based on order by without grouping" <|
result = data.table.running Statistic.Sum "Ticket Price" "Sum ticket cost" [] ["Ticket Price"]
expected_column = Column.from_vector "Sum ticket cost" [247.5, 935.83, 73.23, 359.84000000000003, 147]
# | Flight | Passenger | Ticket Price | Sum ticket cost
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 247.5
# 1 | BA0123 | B | 575.99 | 935.83
# 2 | SG0456 | A | 73.23 | 73.23
# 3 | BA0123 | C | 112.34 | 359.84
# 4 | SG0456 | E | 73.77 | 147
expected_table = data.table.zip expected_column
result.should_equal expected_table
suite_builder.group "running mean" group_builder->
data = Data.setup
group_builder.specify "Not setting the as name gives default name based on of column" <|
result = data.table.running Statistic.Mean "Ticket Price"
expected_column = Column.from_vector "Running Mean of Ticket Price" [100.5, 338.245, 249.90666666666667, 215.51500000000001, 187.166]
# | Flight | Passenger | Ticket Price | Running Mean of Ticket Price
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 338.245
# 2 | SG0456 | A | 73.23 | 249.90666666666667
# 3 | BA0123 | C | 112.34 | 215.51500000000001
# 4 | SG0456 | E | 73.77 | 187.166
expected_table = data.table.zip expected_column
result.should_equal expected_table
suite_builder.group "running max" group_builder->
data = Data.setup
group_builder.specify "Not setting the as name gives default name based on of column" <|
result = data.table.running Statistic.Maximum "Ticket Price"
expected_column = Column.from_vector "Running Maximum of Ticket Price" [100.5, 575.99, 575.99, 575.99, 575.99]
# | Flight | Passenger | Ticket Price | Running Maximum of Ticket Price
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 575.99
# 2 | SG0456 | A | 73.23 | 575.99
# 3 | BA0123 | C | 112.34 | 575.99
# 4 | SG0456 | E | 73.77 | 575.99
expected_table = data.table.zip expected_column
result.should_equal expected_table
suite_builder.group "running min" group_builder->
data = Data.setup
group_builder.specify "Not setting the as name gives default name based on of column" <|
result = data.table.running Statistic.Minimum "Ticket Price"
expected_column = Column.from_vector "Running Minimum of Ticket Price" [100.5, 100.5, 73.23, 73.23, 73.23]
# | Flight | Passenger | Ticket Price | Running Minimum of Ticket Price
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 100.5
# 2 | SG0456 | A | 73.23 | 73.23
# 3 | BA0123 | C | 112.34 | 73.23
# 4 | SG0456 | E | 73.77 | 73.23
expected_table = data.table.zip expected_column
result.should_equal expected_table
suite_builder.group "nothing handling" group_builder->
# | Flight | Passenger | Ticket Price
#---+--------+-----------+--------------
# 0 | BA0123 | A | 100.5
# 1 | BA0123 | B | 575.99
# 2 | SG0456 | A | nothing
# 3 | BA0123 | C | nothing
# 4 | SG0456 | E | 73.77
flight = ["Flight", ["BA0123", "BA0123", "SG0456", "BA0123", "SG0456"]]
passenger = ["Passenger", ["A", "B", "A", "C", "E"]]
ticket_price = ["Ticket Price", [100.50, 575.99, Nothing, Nothing, 73.77]]
table = Table.new [flight, passenger, ticket_price]
group_builder.specify "Running count doesn't care about nothing values" <|
result = table.running Statistic.Count "Passenger"
expected_column = Column.from_vector "Running Count of Passenger" [1, 2, 3, 4, 5]
# | Flight | Passenger | Ticket Price | Running Count of Passenger
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 1
# 1 | BA0123 | B | 575.99 | 2
# 2 | SG0456 | A | nothing | 3
# 3 | BA0123 | C | nothing | 4
# 4 | SG0456 | E | 73.77 | 5
expected_table = table.zip expected_column
result.should_equal expected_table
group_builder.specify "Running sum works ignores nothing values" <|
result = table.running Statistic.Sum "Ticket Price"
expected_column = Column.from_vector "Running Sum of Ticket Price" [100.5, 676.49, 676.49, 676.49, 750.26]
# | Flight | Passenger | Ticket Price | Running Sum of Ticket Price
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 676.49
# 2 | SG0456 | A | Nothing | 676.49
# 3 | BA0123 | C | Nothing | 676.49
# 4 | SG0456 | E | 73.77 | 750.26
expected_table = table.zip expected_column
result.should_equal expected_table
group_builder.specify "Running min ignores nothing values and works with grouping" <|
result = table.running Statistic.Minimum "Ticket Price" "Running" ["Flight"]
expected_column = Column.from_vector "Running" [100.5, 100.5, Nothing, 100.5, 73.77]
# | Flight | Passenger | Ticket Price | Running
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 100.5
# 2 | SG0456 | A | Nothing | Nothing
# 3 | BA0123 | C | Nothing | 100.5
# 4 | SG0456 | E | 73.77 | 73.77
expected_table = table.zip expected_column
result.should_equal expected_table
group_builder.specify "Running max ignores nothing values and works with grouping" <|
result = table.running Statistic.Maximum "Ticket Price" "Running" ["Flight"]
expected_column = Column.from_vector "Running" [100.5, 575.99, Nothing, 575.99, 73.77]
# | Flight | Passenger | Ticket Price | Running
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 575.99
# 2 | SG0456 | A | Nothing | Nothing
# 3 | BA0123 | C | Nothing | 575.99
# 4 | SG0456 | E | 73.77 | 73.77
expected_table = table.zip expected_column
result.should_equal expected_table
group_builder.specify "Running mean ignores nothing values" <|
result = table.running Statistic.Mean "Ticket Price" "Running"
expected_column = Column.from_vector "Running" [100.5, 338.245, 338.245, 338.245, 250.08666666666667]
# | Flight | Passenger | Ticket Price | Running
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 338.245
# 2 | SG0456 | A | Nothing | 338.245
# 3 | BA0123 | C | Nothing | 338.245
# 4 | SG0456 | E | 73.77 | 250.08666666666667
expected_table = table.zip expected_column
result.should_equal expected_table
group_builder.specify "Running mean ignores nothing values and works when first value is Nothing" <|
result = table.running Statistic.Mean "Ticket Price" "Running" ["Flight"]
expected_column = Column.from_vector "Running" [100.5, 338.245, Nothing, 338.245, 73.77]
# | Flight | Passenger | Ticket Price | Running
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 100.5
# 1 | BA0123 | B | 575.99 | 338.245
# 2 | SG0456 | A | Nothing | Nothing
# 3 | BA0123 | C | Nothing | 338.245
# 4 | SG0456 | E | 73.77 | 73.77
expected_table = table.zip expected_column
result.should_equal expected_table
suite_builder.group "different types" group_builder->
# | Flight | Passenger | Ticket Price
#---+--------+-----------+--------------
# 0 | BA0123 | A | 1
# 1 | BA0123 | B | 2
# 2 | SG0456 | A | 3
# 3 | BA0123 | C | 4
# 4 | SG0456 | E | 5
flight = ["Flight", ["BA0123", "BA0123", "SG0456", "BA0123", "SG0456"]]
passenger = ["Passenger", ["A", "B", "A", "C", "E"]]
ticket_price = ["Ticket Price", [1, 2, 3, 4, 5]]
table = Table.new [flight, passenger, ticket_price]
group_builder.specify "Running sum works over an integer column" <|
result = table.running Statistic.Sum "Ticket Price"
expected_column = Column.from_vector "Running Sum of Ticket Price" [1.0, 3.0, 6.0, 10.0, 15.0]
# | Flight | Passenger | Ticket Price | Running Sum of Ticket Price
#---+--------+-----------+--------------+------------------------------
# 0 | BA0123 | A | 1 | 1.0
# 1 | BA0123 | B | 2 | 3.0
# 2 | SG0456 | A | 3 | 6.0
# 3 | BA0123 | C | 4 | 10.0
# 4 | SG0456 | E | 5 | 15.0
expected_table = table.zip expected_column
result.should_equal expected_table
group_builder.specify "Running sum does not work over a string column" <|
(table.running Statistic.Sum "Passenger").should_fail_with Invalid_Value_Type
suite_builder.group "Unsupported statistics" group_builder->
data = Data.setup
group_builder.specify "RSquared is not supported" <|
(data.table.running (Statistic.R_Squared [1, 2 ,3]) "Ticket Price").should_fail_with Illegal_Argument
group_builder.specify "Covariance is not supported" <|
(data.table.running (Statistic.Covariance []) "Ticket Price").should_fail_with Illegal_Argument
group_builder.specify "Pearson is not supported" <|
(data.table.running (Statistic.Pearson []) "Ticket Price").should_fail_with Illegal_Argument
group_builder.specify "Spearman is not supported" <|
(data.table.running (Statistic.Spearman []) "Ticket Price").should_fail_with Illegal_Argument
main filter=Nothing =
suite = Test.build suite_builder->