mirror of
https://github.com/enso-org/enso.git
synced 2024-11-26 17:06:48 +03:00
Add table running functionality for Sum, Mean, Min, Max. (#9577)
* Add Table.Running * Code Review fixes * Code Review changes * Change null handling
This commit is contained in:
parent
d665f4d9c2
commit
4a97bfa31f
@ -654,6 +654,7 @@
|
||||
- [Added `Decimal.floor`, `.ceil`, and `.trunc`.][9694]
|
||||
- [Added `recursive` option to `File.delete`.][9719]
|
||||
- [Added `Vector.build`.][9725]
|
||||
- [Added `Table.running` method.][9577]
|
||||
|
||||
[debug-shortcuts]:
|
||||
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
|
||||
@ -958,6 +959,7 @@
|
||||
[9716]: https://github.com/enso-org/enso/pull/9716
|
||||
[9719]: https://github.com/enso-org/enso/pull/9719
|
||||
[9725]: https://github.com/enso-org/enso/pull/9725
|
||||
[9577]: https://github.com/enso-org/enso/pull/9577
|
||||
|
||||
#### Enso Compiler
|
||||
|
||||
|
@ -23,6 +23,7 @@ polyglot java import java.lang.NullPointerException
|
||||
polyglot java import org.enso.base.CompareException
|
||||
polyglot java import org.enso.base.statistics.CorrelationStatistics
|
||||
polyglot java import org.enso.base.statistics.Rank
|
||||
polyglot java import org.enso.base.statistics.Statistic as Java_Statistic
|
||||
|
||||
## Specifies how to handle ranking of equal values.
|
||||
type Rank_Method
|
||||
@ -137,6 +138,22 @@ type Statistic
|
||||
Statistic.Kurtosis -> 4
|
||||
_ -> Nothing
|
||||
|
||||
## PRIVATE
   Converts this statistic into the corresponding constant of the Java enum
   `org.enso.base.statistics.Statistic` (imported as `Java_Statistic`).

   Any arguments carried by the Enso constructor (matched with `_`) are not
   forwarded; only the kind of statistic is passed on.
to_java self = case self of
    Statistic.Count -> Java_Statistic.Count
    Statistic.Minimum -> Java_Statistic.Minimum
    Statistic.Maximum -> Java_Statistic.Maximum
    Statistic.Sum -> Java_Statistic.Sum
    Statistic.Mean -> Java_Statistic.Mean
    Statistic.Variance _ -> Java_Statistic.Variance
    # The Java enum constant is spelled `Standard_Deviation` (with an
    # underscore); `StandardDeviation` does not exist on the Java side.
    Statistic.Standard_Deviation _ -> Java_Statistic.Standard_Deviation
    Statistic.Skew _ -> Java_Statistic.Skew
    Statistic.Kurtosis -> Java_Statistic.Kurtosis
    Statistic.Covariance _ -> Java_Statistic.Covariance
    Statistic.Pearson _ -> Java_Statistic.Pearson
    Statistic.Spearman _ -> Java_Statistic.Spearman
    Statistic.R_Squared _ -> Java_Statistic.R_Squared
|
||||
|
||||
## PRIVATE
|
||||
Compute a single statistic on a vector like object.
|
||||
|
||||
@ -191,14 +208,7 @@ type Statistic
|
||||
- statistics: Set of statistics to calculate.
|
||||
running_bulk : Vector -> Vector Statistic -> Vector Any
|
||||
running_bulk data statistics=[Statistic.Count, Statistic.Sum] =
|
||||
is_unsupported s = case s of
|
||||
Statistic.Covariance _ -> True
|
||||
Statistic.Pearson _ -> True
|
||||
Statistic.Spearman _ -> True
|
||||
Statistic.R_Squared _ -> True
|
||||
_ -> False
|
||||
|
||||
if statistics.any is_unsupported then Error.throw (Illegal_Argument.Error ("Unsupported Statistics ( " + (statistics.filter is_unsupported . to_text) ") for running calculations.")) else
|
||||
check_running_support statistics <|
|
||||
moment_order = statistics.map on_problems=No_Wrap .order
|
||||
has_min_max = statistics.any (s-> s == Statistic.Minimum || s == Statistic.Maximum)
|
||||
max_moment_order = moment_order.filter (v-> v != Nothing) . fold 0 .max
|
||||
@ -270,6 +280,21 @@ type Statistic
|
||||
rank_data input method=Rank_Method.Average =
|
||||
method.compute input
|
||||
|
||||
## PRIVATE
   Guards a running calculation: evaluates `action` only when none of the
   requested statistics is one that running calculations reject.

   Arguments:
   - statistics: The statistics that were requested.
   - action: Suspended computation to run when all statistics are accepted.

   Returns the result of `action`, or an `Illegal_Argument` error listing the
   rejected statistics.
check_running_support : Vector Statistic -> Any -> Any
check_running_support statistics ~action =
    rejected_for_running statistic = case statistic of
        Statistic.Covariance _ -> True
        Statistic.Pearson _ -> True
        Statistic.Spearman _ -> True
        Statistic.R_Squared _ -> True
        _ -> False

    rejected = statistics.filter rejected_for_running
    if rejected.is_empty then action else
        message = "Unsupported Statistics ( " + rejected.to_text + ") for running calculations."
        Error.throw (Illegal_Argument.Error message)
|
||||
|
||||
|
||||
## PRIVATE
|
||||
wrap_java_call : Any -> Any
|
||||
wrap_java_call ~function =
|
||||
|
@ -0,0 +1,44 @@
|
||||
from Standard.Base import all
|
||||
import Standard.Base.Errors.Common.Unsupported_Argument_Types
|
||||
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
||||
|
||||
import project.Column.Column
|
||||
import project.Set_Mode.Set_Mode
|
||||
import project.Sort_Column.Sort_Column
|
||||
import project.Table.Table
|
||||
import project.Internal.Add_Row_Number
|
||||
import project.Internal.Java_Problems
|
||||
import project.Internal.Problem_Builder.Problem_Builder
|
||||
import project.Internal.Table_Helpers
|
||||
from project.Errors import Duplicate_Output_Column_Names
|
||||
import project.Value_Type.Value_Type
|
||||
|
||||
polyglot java import java.lang.ArithmeticException
|
||||
polyglot java import org.enso.table.data.column.storage.numeric.LongRangeStorage
|
||||
polyglot java import org.enso.table.operations.AddRunning
|
||||
|
||||
## PRIVATE
   Adds a new column to `table` holding a running calculation of `statistic`
   over the column `of`.

   Arguments:
   - table: The table to extend.
   - statistic: The running statistic to compute. `Statistic.Count` is
     delegated to `Add_Row_Number.add_row_number`; any other statistic
     requires `of` to be a numeric column and is computed by the Java
     `AddRunning.create_running` helper.
   - of: Name or index of the source column.
   - as: Name of the new column. When empty, the name
     `Running <statistic> of <column name>` is used.
   - group_by: Columns to group by; the running calculation is performed
     separately within each group.
   - order_by: Columns that determine the order in which rows are visited.
   - on_problems: How problems found while resolving the columns or while
     computing the values are reported.
add_running : Table -> Statistic -> (Text | Integer) -> Text -> Vector (Text | Integer | Regex) | Text | Integer | Regex -> Vector (Text | Sort_Column) | Text -> Problem_Behavior -> Table
add_running table (statistic:Statistic=Statistic.Count) (of:Text|Integer=0) (as:Text='') (group_by:(Vector | Text | Integer | Regex)=[]) (order_by:(Vector | Text)=[]) (on_problems:Problem_Behavior=Problem_Behavior.Report_Warning) =
    check_running_support [statistic] <|
        of_col = table.at of
        new_name = if as.is_empty then 'Running ' + statistic.to_text + ' of ' + of_col.name else as
        case statistic of
            Statistic.Count ->
                Add_Row_Number.add_row_number table new_name 1 1 group_by order_by on_problems
            _ ->
                Value_Type.expect_numeric of_col <|
                    problem_builder = Problem_Builder.new error_on_missing_columns=True
                    grouping_columns = table.columns_helper.select_columns_helper group_by Case_Sensitivity.Default True problem_builder
                    ordering = Table_Helpers.resolve_order_by table.columns order_by problem_builder
                    # Surface whatever was collected while resolving `group_by`
                    # and `order_by` (e.g. missing columns) before computing;
                    # without this the collected problems are silently lost.
                    problem_builder.attach_problems_before on_problems <|
                        source_java_column = of_col.java_column
                        grouping_java_columns = grouping_columns.map .java_column
                        ordering_java_columns = ordering.map c->
                            c.column.java_column
                        directions = ordering.map c->
                            c.associated_selector.direction.to_sign

                        Java_Problems.with_problem_aggregator on_problems java_problem_aggregator->
                            new_storage = AddRunning.create_running statistic.to_java source_java_column grouping_java_columns ordering_java_columns directions java_problem_aggregator
                            new_column = Column.from_storage new_name new_storage
                            table.set new_column new_name set_mode=Set_Mode.Add
|
@ -31,6 +31,7 @@ import project.Expression.Expression
|
||||
import project.Expression.Expression_Error
|
||||
import project.Extensions.Table_Conversions
|
||||
import project.Internal.Add_Row_Number
|
||||
import project.Internal.Add_Running
|
||||
import project.Internal.Aggregate_Column_Helper
|
||||
import project.Internal.Column_Naming_Helper.Column_Naming_Helper
|
||||
import project.Internal.Constant_Column.Constant_Column
|
||||
@ -2867,8 +2868,7 @@ type Table
|
||||
transformer col = col.text_replace resolved_term resolved_new_text case_sensitivity only_first
|
||||
Table_Helpers.replace_columns_with_transformed_columns self columns transformer
|
||||
|
||||
## PRIVATE
|
||||
ALIAS cumulative
|
||||
## ALIAS cumulative
|
||||
GROUP Standard.Base.Values
|
||||
ICON dataframe_map_column
|
||||
Adds a new column to the table with a running calculation.
|
||||
@ -2905,10 +2905,7 @@ type Table
|
||||
@of Widget_Helpers.make_column_name_selector
|
||||
running : Statistic -> (Text | Integer) -> Text -> Vector (Text | Integer | Regex) | Text | Integer | Regex -> Vector (Text | Sort_Column) | Text -> Problem_Behavior -> Table
|
||||
running self (statistic:Statistic=Statistic.Count) (of:(Text | Integer)=0) (as:Text='') (group_by:(Vector | Text | Integer | Regex)=[]) (order_by:(Vector | Text)=[]) (on_problems:Problem_Behavior=Problem_Behavior.Report_Warning) =
|
||||
if statistic != Statistic.Count then Error.throw (Illegal_Argument.Error ("Currently only Statistic.Count is supported in Table.running.")) else
|
||||
of_col = self.at of
|
||||
new_name = if as == '' then 'Running ' + statistic.to_text + ' of ' + of_col.name else as
|
||||
Add_Row_Number.add_row_number self new_name 1 1 group_by order_by on_problems
|
||||
Add_Running.add_running self statistic of as group_by order_by on_problems
|
||||
|
||||
## PRIVATE
|
||||
column_naming_helper : Column_Naming_Helper
|
||||
|
@ -0,0 +1,17 @@
|
||||
package org.enso.base.statistics;
|
||||
|
||||
/**
 * Kinds of statistic that can be requested from Java code.
 *
 * <p>The constant names (including the underscores) are part of the interface: callers refer to
 * them by name, so they must not be renamed or reordered.
 */
public enum Statistic {
  Count,
  Minimum,
  Maximum,
  Sum,
  Mean,
  Variance,
  Standard_Deviation,
  Skew,
  Kurtosis,
  Covariance,
  Pearson,
  Spearman,
  R_Squared
}
|
@ -0,0 +1,134 @@
|
||||
package org.enso.table.operations;
|
||||
|
||||
import org.enso.base.statistics.Statistic;
|
||||
import org.enso.table.data.column.storage.Storage;
|
||||
import org.enso.table.data.column.storage.numeric.DoubleStorage;
|
||||
import org.enso.table.data.table.Column;
|
||||
import org.enso.table.problems.ProblemAggregator;
|
||||
|
||||
public class AddRunning {
|
||||
|
||||
public static Storage<Double> create_running(
|
||||
Statistic statistic,
|
||||
Column sourceColumn,
|
||||
Column[] groupingColumns,
|
||||
Column[] orderingColumns,
|
||||
int[] directions,
|
||||
ProblemAggregator problemAggregator) {
|
||||
if (orderingColumns.length != directions.length) {
|
||||
throw new IllegalArgumentException(
|
||||
"The number of ordering columns and directions must be the same.");
|
||||
}
|
||||
var runningGenerator =
|
||||
RunningGenerator.createGenerator(
|
||||
sourceColumn, groupingColumns, orderingColumns, directions, problemAggregator);
|
||||
runningGenerator.generate(new RunningIteratorFactoryImpl(statistic));
|
||||
var ret =
|
||||
new DoubleStorage(
|
||||
runningGenerator.result, sourceColumn.getSize(), runningGenerator.isNothing);
|
||||
return ret;
|
||||
}
|
||||
|
||||
private static class RunningIteratorFactoryImpl implements RunningIteratorFactory {
|
||||
|
||||
Statistic statistic;
|
||||
|
||||
RunningIteratorFactoryImpl(Statistic statistic) {
|
||||
this.statistic = statistic;
|
||||
}
|
||||
|
||||
@Override
|
||||
public RunningIterator getIterator() {
|
||||
switch (statistic) {
|
||||
case Sum -> {
|
||||
return new RunningSumIterator();
|
||||
}
|
||||
case Mean -> {
|
||||
return new RunningMeanIterator();
|
||||
}
|
||||
case Minimum -> {
|
||||
return new RunningMinIterator();
|
||||
}
|
||||
case Maximum -> {
|
||||
return new RunningMaxIterator();
|
||||
}
|
||||
default -> throw new IllegalArgumentException("Unsupported statistic: " + statistic);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private abstract static class RunningIteratorBase implements RunningIterator {
|
||||
|
||||
protected double current;
|
||||
private boolean isInitialized = false;
|
||||
|
||||
@Override
|
||||
public Double next(Double value) {
|
||||
if (value != null) {
|
||||
if (!isInitialized) {
|
||||
isInitialized = true;
|
||||
initialize(value);
|
||||
} else {
|
||||
increment(value);
|
||||
}
|
||||
}
|
||||
return !isInitialized ? null : getCurrent();
|
||||
}
|
||||
|
||||
public void initialize(double value) {
|
||||
current = value;
|
||||
}
|
||||
|
||||
public abstract void increment(double value);
|
||||
|
||||
public double getCurrent() {
|
||||
return current;
|
||||
}
|
||||
}
|
||||
|
||||
private static class RunningSumIterator extends RunningIteratorBase {
|
||||
|
||||
@Override
|
||||
public void increment(double value) {
|
||||
current += value;
|
||||
}
|
||||
}
|
||||
|
||||
private static class RunningMeanIterator extends RunningIteratorBase {
|
||||
|
||||
private int currentCount;
|
||||
|
||||
@Override
|
||||
public void increment(double value) {
|
||||
current += value;
|
||||
currentCount++;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initialize(double value) {
|
||||
current = value;
|
||||
currentCount = 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getCurrent() {
|
||||
return current / currentCount;
|
||||
}
|
||||
}
|
||||
|
||||
private static class RunningMinIterator extends RunningIteratorBase {
|
||||
|
||||
@Override
|
||||
public void increment(double value) {
|
||||
current = Math.min(current, value);
|
||||
}
|
||||
}
|
||||
|
||||
private static class RunningMaxIterator extends RunningIteratorBase {
|
||||
|
||||
@Override
|
||||
public void increment(double value) {
|
||||
current = Math.max(current, value);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,194 @@
|
||||
package org.enso.table.operations;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.BitSet;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.IntStream;
|
||||
import org.enso.base.polyglot.NumericConverter;
|
||||
import org.enso.base.text.TextFoldingStrategy;
|
||||
import org.enso.table.data.column.storage.Storage;
|
||||
import org.enso.table.data.index.MultiValueIndex;
|
||||
import org.enso.table.data.index.OrderedMultiValueKey;
|
||||
import org.enso.table.data.index.UnorderedMultiValueKey;
|
||||
import org.enso.table.data.table.Column;
|
||||
import org.enso.table.problems.ColumnAggregatedProblemAggregator;
|
||||
import org.enso.table.problems.ProblemAggregator;
|
||||
import org.enso.table.util.ConstantList;
|
||||
|
||||
abstract class RunningGenerator {
|
||||
|
||||
Storage<?> sourceStorage;
|
||||
long[] result;
|
||||
BitSet isNothing;
|
||||
|
||||
RunningGenerator(Column sourceColumn) {
|
||||
this.sourceStorage = sourceColumn.getStorage();
|
||||
result = new long[sourceColumn.getSize()];
|
||||
isNothing = new BitSet();
|
||||
}
|
||||
|
||||
void calculateNextValue(int i, RunningIterator it) {
|
||||
Object value = sourceStorage.getItemBoxed(i);
|
||||
Double dValue = NumericConverter.tryConvertingToDouble(value);
|
||||
Double dNextValue = it.next(dValue);
|
||||
if (dNextValue == null) {
|
||||
isNothing.set(i);
|
||||
} else {
|
||||
result[i] = Double.doubleToRawLongBits(dNextValue);
|
||||
}
|
||||
}
|
||||
|
||||
// implement this method in subclasses to control the order you want to iterate over the data
|
||||
public abstract void generate(RunningIteratorFactory factory);
|
||||
|
||||
public static RunningGenerator createGenerator(
|
||||
Column sourceColumn,
|
||||
Column[] groupingColumns,
|
||||
Column[] orderingColumns,
|
||||
int[] directions,
|
||||
ProblemAggregator problemAggregator) {
|
||||
RunningGenerator runningGenerator;
|
||||
if (groupingColumns.length > 0 && orderingColumns.length > 0) {
|
||||
runningGenerator =
|
||||
new GroupingOrderingRunning(
|
||||
sourceColumn, groupingColumns, orderingColumns, directions, problemAggregator);
|
||||
} else if (groupingColumns.length > 0) {
|
||||
runningGenerator =
|
||||
new GroupingNoOrderingRunning(sourceColumn, groupingColumns, problemAggregator);
|
||||
} else if (orderingColumns.length > 0) {
|
||||
runningGenerator = new NoGroupingOrderingRunning(sourceColumn, orderingColumns, directions);
|
||||
} else {
|
||||
runningGenerator = new NoGroupingNoOrderingRunning(sourceColumn);
|
||||
}
|
||||
return runningGenerator;
|
||||
}
|
||||
}
|
||||
|
||||
class NoGroupingNoOrderingRunning extends RunningGenerator {
|
||||
|
||||
NoGroupingNoOrderingRunning(Column sourceColumn) {
|
||||
super(sourceColumn);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void generate(RunningIteratorFactory factory) {
|
||||
var it = factory.getIterator();
|
||||
for (int i = 0; i < result.length; i++) {
|
||||
calculateNextValue(i, it);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class GroupingNoOrderingRunning extends RunningGenerator {
|
||||
|
||||
private final Column[] groupingColumns;
|
||||
private final Storage<?>[] groupingStorages;
|
||||
private final ColumnAggregatedProblemAggregator groupingProblemAggregator;
|
||||
private final List<TextFoldingStrategy> textFoldingStrategy;
|
||||
private final Map<UnorderedMultiValueKey, RunningIterator> groups;
|
||||
|
||||
public GroupingNoOrderingRunning(
|
||||
Column sourceColumn, Column[] groupingColumns, ProblemAggregator problemAggregator) {
|
||||
super(sourceColumn);
|
||||
this.groupingColumns = groupingColumns;
|
||||
groupingStorages =
|
||||
Arrays.stream(groupingColumns).map(Column::getStorage).toArray(Storage[]::new);
|
||||
groupingProblemAggregator = new ColumnAggregatedProblemAggregator(problemAggregator);
|
||||
textFoldingStrategy =
|
||||
ConstantList.make(TextFoldingStrategy.unicodeNormalizedFold, groupingStorages.length);
|
||||
groups = new HashMap<>();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void generate(RunningIteratorFactory factory) {
|
||||
for (int i = 0; i < result.length; i++) {
|
||||
var key = new UnorderedMultiValueKey(groupingStorages, i, textFoldingStrategy);
|
||||
key.checkAndReportFloatingEquality(
|
||||
groupingProblemAggregator, columnIx -> groupingColumns[columnIx].getName());
|
||||
RunningIterator it = groups.computeIfAbsent(key, k -> factory.getIterator());
|
||||
calculateNextValue(i, it);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class NoGroupingOrderingRunning extends RunningGenerator {
|
||||
|
||||
private final Storage<?>[] orderingStorages;
|
||||
private final List<OrderedMultiValueKey> keys;
|
||||
|
||||
public NoGroupingOrderingRunning(
|
||||
Column sourceColumn, Column[] orderingColumns, int[] directions) {
|
||||
super(sourceColumn);
|
||||
int n = orderingColumns[0].getSize();
|
||||
orderingStorages =
|
||||
Arrays.stream(orderingColumns).map(Column::getStorage).toArray(Storage[]::new);
|
||||
keys =
|
||||
new ArrayList<>(
|
||||
IntStream.range(0, n)
|
||||
.mapToObj(i -> new OrderedMultiValueKey(orderingStorages, i, directions))
|
||||
.toList());
|
||||
keys.sort(null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void generate(RunningIteratorFactory factory) {
|
||||
var it = factory.getIterator();
|
||||
for (var key : keys) {
|
||||
var i = key.getRowIndex();
|
||||
calculateNextValue(i, it);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class GroupingOrderingRunning extends RunningGenerator {
|
||||
|
||||
private final Column[] groupingColumns;
|
||||
private final Column[] orderingColumns;
|
||||
private final int[] directions;
|
||||
private final Storage<?>[] groupingStorages;
|
||||
private final Storage<?>[] orderingStorages;
|
||||
private final ProblemAggregator problemAggregator;
|
||||
|
||||
public GroupingOrderingRunning(
|
||||
Column sourceColumn,
|
||||
Column[] groupingColumns,
|
||||
Column[] orderingColumns,
|
||||
int[] directions,
|
||||
ProblemAggregator problemAggregator) {
|
||||
super(sourceColumn);
|
||||
this.groupingColumns = groupingColumns;
|
||||
this.orderingColumns = orderingColumns;
|
||||
this.directions = directions;
|
||||
groupingStorages =
|
||||
Arrays.stream(groupingColumns).map(Column::getStorage).toArray(Storage[]::new);
|
||||
ConstantList.make(TextFoldingStrategy.unicodeNormalizedFold, groupingStorages.length);
|
||||
orderingStorages =
|
||||
Arrays.stream(orderingColumns).map(Column::getStorage).toArray(Storage[]::new);
|
||||
this.problemAggregator = problemAggregator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void generate(RunningIteratorFactory factory) {
|
||||
int n = orderingColumns[0].getSize();
|
||||
var groupIndex =
|
||||
MultiValueIndex.makeUnorderedIndex(
|
||||
groupingColumns, n, TextFoldingStrategy.unicodeNormalizedFold, problemAggregator);
|
||||
for (var entry : groupIndex.mapping().entrySet()) {
|
||||
List<Integer> indices = entry.getValue();
|
||||
List<OrderedMultiValueKey> orderingKeys =
|
||||
new ArrayList<>(
|
||||
indices.stream()
|
||||
.map(i -> new OrderedMultiValueKey(orderingStorages, i, directions))
|
||||
.toList());
|
||||
orderingKeys.sort(null);
|
||||
RunningIterator it = factory.getIterator();
|
||||
for (OrderedMultiValueKey key : orderingKeys) {
|
||||
var i = key.getRowIndex();
|
||||
calculateNextValue(i, it);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,6 @@
|
||||
package org.enso.table.operations;
|
||||
|
||||
/** A stateful accumulator that is fed values one at a time. */
public interface RunningIterator {

  /**
   * Consumes the next value and reports the running result.
   *
   * @param value the next input; may be null
   * @return the running result after this input, or null when there is none
   */
  Double next(Double value);
}
|
@ -0,0 +1,6 @@
|
||||
package org.enso.table.operations;
|
||||
|
||||
public interface RunningIteratorFactory {
|
||||
|
||||
RunningIterator getIterator();
|
||||
}
|
@ -3,6 +3,7 @@ from Standard.Table import Column, Table
|
||||
from Standard.Test import all
|
||||
from Standard.Table.Errors import all
|
||||
import Standard.Base.Errors.Common.Type_Error
|
||||
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
||||
|
||||
from project.Util import all
|
||||
|
||||
@ -26,7 +27,7 @@ type Data
|
||||
Data.Value make_table
|
||||
|
||||
add_specs suite_builder =
|
||||
suite_builder.group "running" group_builder->
|
||||
suite_builder.group "running count" group_builder->
|
||||
data = Data.setup
|
||||
group_builder.specify "Defaults add running count of first column" <|
|
||||
result = data.table.running
|
||||
@ -88,6 +89,230 @@ add_specs suite_builder =
|
||||
# 4 | SG0456 | E | 73.77 | 1
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
group_builder.specify "Can provide running count based on order by without grouping" <|
|
||||
result = data.table.running Statistic.Count "Passenger" "Ranked ticket cost" [] ["Ticket Price"]
|
||||
expected_column = Column.from_vector "Ranked ticket cost" [3, 5, 1, 4, 2]
|
||||
# | Flight | Passenger | Ticket Price | Ranked ticket cost
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 3
|
||||
# 1 | BA0123 | B | 575.99 | 5
|
||||
# 2 | SG0456 | A | 73.23 | 1
|
||||
# 3 | BA0123 | C | 112.34 | 4
|
||||
# 4 | SG0456 | E | 73.77 | 2
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
suite_builder.group "running sum" group_builder->
|
||||
data = Data.setup
|
||||
group_builder.specify "Not setting the as name gives default name based on of column" <|
|
||||
result = data.table.running Statistic.Sum "Ticket Price"
|
||||
expected_column = Column.from_vector "Running Sum of Ticket Price" [100.5, 676.49, 749.72, 862.0600000000001, 935.83]
|
||||
# | Flight | Passenger | Ticket Price | Running Sum of Ticket Price
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 100.5
|
||||
# 1 | BA0123 | B | 575.99 | 676.49
|
||||
# 2 | SG0456 | A | 73.23 | 749.72
|
||||
# 3 | BA0123 | C | 112.34 | 862.06
|
||||
# 4 | SG0456 | E | 73.77 | 935.83
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
group_builder.specify "Can group by and provide running sum per group" <|
|
||||
result = data.table.running Statistic.Sum "Ticket Price" "Running" ["Flight"]
|
||||
expected_column = Column.from_vector "Running" [100.5, 676.49, 73.23, 788.83, 147]
|
||||
# | Flight | Passenger | Ticket Price | Running
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 100.5
|
||||
# 1 | BA0123 | B | 575.99 | 676.49
|
||||
# 2 | SG0456 | A | 73.23 | 73.23
|
||||
# 3 | BA0123 | C | 112.34 | 788.83
|
||||
# 4 | SG0456 | E | 73.77 | 147
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
group_builder.specify "Can group by and provide running sum per group based on order by" <|
|
||||
result = data.table.running Statistic.Sum "Ticket Price" "Sum ticket cost per pass" ["Passenger"] ["Ticket Price"]
|
||||
expected_column = Column.from_vector "Sum ticket cost per pass" [173.73000000000002, 575.99, 73.23, 112.34, 73.77]
|
||||
# | Flight | Passenger | Ticket Price | Sum ticket cost per pass
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 173.73
|
||||
# 1 | BA0123 | B | 575.99 | 575.99
|
||||
# 2 | SG0456 | A | 73.23 | 73.23
|
||||
# 3 | BA0123 | C | 112.34 | 112.34
|
||||
# 4 | SG0456 | E | 73.77 | 73.77
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
group_builder.specify "Can provide running sum based on order by without grouping" <|
|
||||
result = data.table.running Statistic.Sum "Ticket Price" "Sum ticket cost" [] ["Ticket Price"]
|
||||
expected_column = Column.from_vector "Sum ticket cost" [247.5, 935.83, 73.23, 359.84000000000003, 147]
|
||||
# | Flight | Passenger | Ticket Price | Sum ticket cost
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 247.5
|
||||
# 1 | BA0123 | B | 575.99 | 935.83
|
||||
# 2 | SG0456 | A | 73.23 | 73.23
|
||||
# 3 | BA0123 | C | 112.34 | 359.84
|
||||
# 4 | SG0456 | E | 73.77 | 147
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
suite_builder.group "running mean" group_builder->
|
||||
data = Data.setup
|
||||
group_builder.specify "Not setting the as name gives default name based on of column" <|
|
||||
result = data.table.running Statistic.Mean "Ticket Price"
|
||||
expected_column = Column.from_vector "Running Mean of Ticket Price" [100.5, 338.245, 249.90666666666667, 215.51500000000001, 187.166]
|
||||
# | Flight | Passenger | Ticket Price | Running Mean of Ticket Price
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 100.5
|
||||
# 1 | BA0123 | B | 575.99 | 338.245
|
||||
# 2 | SG0456 | A | 73.23 | 249.90666666666667
|
||||
# 3 | BA0123 | C | 112.34 | 215.51500000000001
|
||||
# 4 | SG0456 | E | 73.77 | 187.166
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
suite_builder.group "running max" group_builder->
|
||||
data = Data.setup
|
||||
group_builder.specify "Not setting the as name gives default name based on of column" <|
|
||||
result = data.table.running Statistic.Maximum "Ticket Price"
|
||||
expected_column = Column.from_vector "Running Maximum of Ticket Price" [100.5, 575.99, 575.99, 575.99, 575.99]
|
||||
# | Flight | Passenger | Ticket Price | Running Maximum of Ticket Price
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 100.5
|
||||
# 1 | BA0123 | B | 575.99 | 575.99
|
||||
# 2 | SG0456 | A | 73.23 | 575.99
|
||||
# 3 | BA0123 | C | 112.34 | 575.99
|
||||
# 4 | SG0456 | E | 73.77 | 575.99
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
suite_builder.group "running min" group_builder->
|
||||
data = Data.setup
|
||||
group_builder.specify "Not setting the as name gives default name based on of column" <|
|
||||
result = data.table.running Statistic.Minimum "Ticket Price"
|
||||
expected_column = Column.from_vector "Running Minimum of Ticket Price" [100.5, 100.5, 73.23, 73.23, 73.23]
|
||||
# | Flight | Passenger | Ticket Price | Running Minimum of Ticket Price
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 100.5
|
||||
# 1 | BA0123 | B | 575.99 | 100.5
|
||||
# 2 | SG0456 | A | 73.23 | 73.23
|
||||
# 3 | BA0123 | C | 112.34 | 73.23
|
||||
# 4 | SG0456 | E | 73.77 | 73.23
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
suite_builder.group "nothing handling" group_builder->
|
||||
# | Flight | Passenger | Ticket Price
|
||||
#---+--------+-----------+--------------
|
||||
# 0 | BA0123 | A | 100.5
|
||||
# 1 | BA0123 | B | 575.99
|
||||
# 2 | SG0456 | A | nothing
|
||||
# 3 | BA0123 | C | nothing
|
||||
# 4 | SG0456 | E | 73.77
|
||||
flight = ["Flight", ["BA0123", "BA0123", "SG0456", "BA0123", "SG0456"]]
|
||||
passenger = ["Passenger", ["A", "B", "A", "C", "E"]]
|
||||
ticket_price = ["Ticket Price", [100.50, 575.99, Nothing, Nothing, 73.77]]
|
||||
table = Table.new [flight, passenger, ticket_price]
|
||||
group_builder.specify "Running count doesn't care about nothing values" <|
|
||||
result = table.running Statistic.Count "Passenger"
|
||||
expected_column = Column.from_vector "Running Count of Passenger" [1, 2, 3, 4, 5]
|
||||
# | Flight | Passenger | Ticket Price | Running Count of Passenger
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 1
|
||||
# 1 | BA0123 | B | 575.99 | 2
|
||||
# 2 | SG0456 | A | nothing | 3
|
||||
# 3 | BA0123 | C | nothing | 4
|
||||
# 4 | SG0456 | E | 73.77 | 5
|
||||
expected_table = table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
group_builder.specify "Running sum works ignores nothing values" <|
|
||||
result = table.running Statistic.Sum "Ticket Price"
|
||||
expected_column = Column.from_vector "Running Sum of Ticket Price" [100.5, 676.49, 676.49, 676.49, 750.26]
|
||||
# | Flight | Passenger | Ticket Price | Running Sum of Ticket Price
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 100.5
|
||||
# 1 | BA0123 | B | 575.99 | 676.49
|
||||
# 2 | SG0456 | A | Nothing | 676.49
|
||||
# 3 | BA0123 | C | Nothing | 676.49
|
||||
# 4 | SG0456 | E | 73.77 | 750.26
|
||||
expected_table = table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
group_builder.specify "Running min ignores nothing values and works with grouping" <|
|
||||
result = table.running Statistic.Minimum "Ticket Price" "Running" ["Flight"]
|
||||
expected_column = Column.from_vector "Running" [100.5, 100.5, Nothing, 100.5, 73.77]
|
||||
# | Flight | Passenger | Ticket Price | Running
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 100.5
|
||||
# 1 | BA0123 | B | 575.99 | 100.5
|
||||
# 2 | SG0456 | A | Nothing | Nothing
|
||||
# 3 | BA0123 | C | Nothing | 100.5
|
||||
# 4 | SG0456 | E | 73.77 | 73.77
|
||||
expected_table = table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
group_builder.specify "Running max ignores nothing values and works with grouping" <|
|
||||
result = table.running Statistic.Maximum "Ticket Price" "Running" ["Flight"]
|
||||
expected_column = Column.from_vector "Running" [100.5, 575.99, Nothing, 575.99, 73.77]
|
||||
# | Flight | Passenger | Ticket Price | Running
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 100.5
|
||||
# 1 | BA0123 | B | 575.99 | 575.99
|
||||
# 2 | SG0456 | A | Nothing | Nothing
|
||||
# 3 | BA0123 | C | Nothing | 575.99
|
||||
# 4 | SG0456 | E | 73.77 | 73.77
|
||||
expected_table = table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
group_builder.specify "Running mean ignores nothing values" <|
|
||||
result = table.running Statistic.Mean "Ticket Price" "Running"
|
||||
expected_column = Column.from_vector "Running" [100.5, 338.245, 338.245, 338.245, 250.08666666666667]
|
||||
# | Flight | Passenger | Ticket Price | Running
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 100.5
|
||||
# 1 | BA0123 | B | 575.99 | 338.245
|
||||
# 2 | SG0456 | A | Nothing | 338.245
|
||||
# 3 | BA0123 | C | Nothing | 338.245
|
||||
# 4 | SG0456 | E | 73.77 | 250.08666666666667
|
||||
expected_table = table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
group_builder.specify "Running mean ignores nothing values and works when first value is Nothing" <|
|
||||
result = table.running Statistic.Mean "Ticket Price" "Running" ["Flight"]
|
||||
expected_column = Column.from_vector "Running" [100.5, 338.245, Nothing, 338.245, 73.77]
|
||||
# | Flight | Passenger | Ticket Price | Running
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 100.5
|
||||
# 1 | BA0123 | B | 575.99 | 338.245
|
||||
# 2 | SG0456 | A | Nothing | Nothing
|
||||
# 3 | BA0123 | C | Nothing | 338.245
|
||||
# 4 | SG0456 | E | 73.77 | 73.77
|
||||
expected_table = table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
suite_builder.group "different types" group_builder->
|
||||
# | Flight | Passenger | Ticket Price
|
||||
#---+--------+-----------+--------------
|
||||
# 0 | BA0123 | A | 1
|
||||
# 1 | BA0123 | B | 2
|
||||
# 2 | SG0456 | A | 3
|
||||
# 3 | BA0123 | C | 4
|
||||
# 4 | SG0456 | E | 5
|
||||
flight = ["Flight", ["BA0123", "BA0123", "SG0456", "BA0123", "SG0456"]]
|
||||
passenger = ["Passenger", ["A", "B", "A", "C", "E"]]
|
||||
ticket_price = ["Ticket Price", [1, 2, 3, 4, 5]]
|
||||
table = Table.new [flight, passenger, ticket_price]
|
||||
group_builder.specify "Running sum works over an integer column" <|
|
||||
result = table.running Statistic.Sum "Ticket Price"
|
||||
expected_column = Column.from_vector "Running Sum of Ticket Price" [1.0, 3.0, 6.0, 10.0, 15.0]
|
||||
# | Flight | Passenger | Ticket Price | Running Sum of Ticket Price
|
||||
#---+--------+-----------+--------------+------------------------------
|
||||
# 0 | BA0123 | A | 1 | 1.0
|
||||
# 1 | BA0123 | B | 2 | 3.0
|
||||
# 2 | SG0456 | A | 3 | 6.0
|
||||
# 3 | BA0123 | C | 4 | 10.0
|
||||
# 4 | SG0456 | E | 5 | 15.0
|
||||
expected_table = table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
group_builder.specify "Running sum does not work over a string column" <|
|
||||
(table.running Statistic.Sum "Passenger").should_fail_with Invalid_Value_Type
|
||||
suite_builder.group "Unsupported statistics" group_builder->
|
||||
data = Data.setup
|
||||
group_builder.specify "RSquared is not supported" <|
|
||||
(data.table.running (Statistic.R_Squared [1, 2 ,3]) "Ticket Price").should_fail_with Illegal_Argument
|
||||
group_builder.specify "Covariance is not supported" <|
|
||||
(data.table.running (Statistic.Covariance []) "Ticket Price").should_fail_with Illegal_Argument
|
||||
group_builder.specify "Pearson is not supported" <|
|
||||
(data.table.running (Statistic.Pearson []) "Ticket Price").should_fail_with Illegal_Argument
|
||||
group_builder.specify "Spearman is not supported" <|
|
||||
(data.table.running (Statistic.Spearman []) "Ticket Price").should_fail_with Illegal_Argument
|
||||
|
||||
main filter=Nothing =
|
||||
suite = Test.build suite_builder->
|
||||
|
Loading…
Reference in New Issue
Block a user