mirror of
https://github.com/enso-org/enso.git
synced 2024-12-23 04:43:26 +03:00
Add table running variance skew sd and kurtosis (#9854)
Adds support for Variance, Skew, Standard Deviation and Kurtosis to Table.Running.
This commit is contained in:
parent
d395168bc5
commit
e25ec96aaa
@ -145,9 +145,12 @@ type Statistic
|
||||
Statistic.Maximum -> Java_Statistic.Maximum
|
||||
Statistic.Sum -> Java_Statistic.Sum
|
||||
Statistic.Mean -> Java_Statistic.Mean
|
||||
Statistic.Variance _ -> Java_Statistic.Variance
|
||||
Statistic.Standard_Deviation _ -> Java_Statistic.StandardDeviation
|
||||
Statistic.Skew _ -> Java_Statistic.Skew
|
||||
Statistic.Variance True -> Java_Statistic.VariancePopulation
|
||||
Statistic.Variance False -> Java_Statistic.VarianceSample
|
||||
Statistic.Standard_Deviation True -> Java_Statistic.StandardDeviationPopulation
|
||||
Statistic.Standard_Deviation False -> Java_Statistic.StandardDeviationSample
|
||||
Statistic.Skew True -> Java_Statistic.SkewPopulation
|
||||
Statistic.Skew False -> Java_Statistic.SkewSample
|
||||
Statistic.Kurtosis -> Java_Statistic.Kurtosis
|
||||
Statistic.Covariance _ -> Java_Statistic.Covariance
|
||||
Statistic.Pearson _ -> Java_Statistic.Pearson
|
||||
|
@ -6,9 +6,12 @@ public enum Statistic {
|
||||
Maximum,
|
||||
Sum,
|
||||
Mean,
|
||||
Variance,
|
||||
Standard_Deviation,
|
||||
Skew,
|
||||
VariancePopulation,
|
||||
VarianceSample,
|
||||
StandardDeviationPopulation,
|
||||
StandardDeviationSample,
|
||||
SkewPopulation,
|
||||
SkewSample,
|
||||
Kurtosis,
|
||||
Covariance,
|
||||
Pearson,
|
||||
|
@ -54,6 +54,28 @@ public class AddRunning {
|
||||
}
|
||||
return new RunningMaxStatistic(sourceColumn, problemAggregator);
|
||||
}
|
||||
case VariancePopulation -> {
|
||||
return new RunningVarianceStatistic(sourceColumn, problemAggregator, true);
|
||||
}
|
||||
case VarianceSample -> {
|
||||
return new RunningVarianceStatistic(sourceColumn, problemAggregator, false);
|
||||
}
|
||||
case StandardDeviationPopulation -> {
|
||||
return new RunningStandardDeviationStatistic(sourceColumn, problemAggregator, true);
|
||||
}
|
||||
case StandardDeviationSample -> {
|
||||
return new RunningStandardDeviationStatistic(sourceColumn, problemAggregator, false);
|
||||
}
|
||||
case SkewPopulation -> {
|
||||
return new RunningSkewStatistic(sourceColumn, problemAggregator, true);
|
||||
}
|
||||
case SkewSample -> {
|
||||
return new RunningSkewStatistic(sourceColumn, problemAggregator, false);
|
||||
}
|
||||
case Kurtosis -> {
|
||||
return new RunningKurtosisStatistic(sourceColumn, problemAggregator);
|
||||
}
|
||||
|
||||
default -> throw new IllegalArgumentException("Unsupported statistic: " + statistic);
|
||||
}
|
||||
}
|
||||
@ -199,14 +221,6 @@ public class AddRunning {
|
||||
public RunningIterator<Double> getNewIterator() {
|
||||
return new RunningSumIterator();
|
||||
}
|
||||
|
||||
private static class RunningSumIterator extends RunningIteratorBase {
|
||||
|
||||
@Override
|
||||
public void increment(double value) {
|
||||
current += value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static class RunningMeanStatistic extends RunningStatisticBase<Double> {
|
||||
@ -219,27 +233,240 @@ public class AddRunning {
|
||||
public RunningIterator<Double> getNewIterator() {
|
||||
return new RunningMeanIterator();
|
||||
}
|
||||
}
|
||||
|
||||
private static class RunningMeanIterator extends RunningIteratorBase {
|
||||
private static class RunningVarianceStatistic extends RunningStatisticBase<Double> {
|
||||
|
||||
private int currentCount;
|
||||
private final boolean isPopulationVariance;
|
||||
|
||||
@Override
|
||||
public void increment(double value) {
|
||||
current += value;
|
||||
currentCount++;
|
||||
/**
 * Creates a running-variance statistic over {@code sourceColumn}.
 *
 * @param sourceColumn column whose values are aggregated
 * @param problemAggregator passed through to the base class
 * @param isPopulationVariance stored and later handed to each iterator this statistic creates
 */
RunningVarianceStatistic(
    Column sourceColumn, ProblemAggregator problemAggregator, boolean isPopulationVariance) {
  super(sourceColumn, problemAggregator, new DoubleHandler());
  this.isPopulationVariance = isPopulationVariance;
}
|
||||
|
||||
@Override
|
||||
public RunningIterator<Double> getNewIterator() {
|
||||
return new RunningVarianceIterator(isPopulationVariance);
|
||||
}
|
||||
}
|
||||
|
||||
private static class RunningStandardDeviationStatistic extends RunningStatisticBase<Double> {
|
||||
|
||||
private final boolean isPopulation;
|
||||
|
||||
RunningStandardDeviationStatistic(
|
||||
Column sourceColumn, ProblemAggregator problemAggregator, boolean isPopulation) {
|
||||
super(sourceColumn, problemAggregator, new DoubleHandler());
|
||||
this.isPopulation = isPopulation;
|
||||
}
|
||||
|
||||
@Override
|
||||
public RunningIterator<Double> getNewIterator() {
|
||||
return new RunningStandardDeviationIterator(isPopulation);
|
||||
}
|
||||
}
|
||||
|
||||
private static class RunningSkewStatistic extends RunningStatisticBase<Double> {
|
||||
|
||||
private final boolean isPopulation;
|
||||
|
||||
RunningSkewStatistic(
|
||||
Column sourceColumn, ProblemAggregator problemAggregator, boolean isPopulation) {
|
||||
super(sourceColumn, problemAggregator, new DoubleHandler());
|
||||
this.isPopulation = isPopulation;
|
||||
}
|
||||
|
||||
@Override
|
||||
public RunningIterator<Double> getNewIterator() {
|
||||
return new RunningSkewIterator(isPopulation);
|
||||
}
|
||||
}
|
||||
|
||||
private static class RunningKurtosisStatistic extends RunningStatisticBase<Double> {
|
||||
|
||||
RunningKurtosisStatistic(Column sourceColumn, ProblemAggregator problemAggregator) {
|
||||
super(sourceColumn, problemAggregator, new DoubleHandler());
|
||||
}
|
||||
|
||||
@Override
|
||||
public RunningIterator<Double> getNewIterator() {
|
||||
return new RunningKurtosisIterator();
|
||||
}
|
||||
}
|
||||
|
||||
private static class RunningSumIterator extends RunningIteratorBase {
|
||||
|
||||
protected double sum;
|
||||
|
||||
@Override
|
||||
public void initialize(double value) {
|
||||
super.initialize(value);
|
||||
sum = value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void increment(double value) {
|
||||
sum += value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getCurrent() {
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
|
||||
private static class RunningMeanIterator extends RunningSumIterator {
|
||||
|
||||
protected int currentCount;
|
||||
|
||||
@Override
|
||||
public void increment(double value) {
|
||||
super.increment(value);
|
||||
currentCount++;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initialize(double value) {
|
||||
super.initialize(value);
|
||||
currentCount = 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getCurrent() {
|
||||
return sum / currentCount;
|
||||
}
|
||||
}
|
||||
|
||||
private static class RunningVarianceIterator extends RunningMeanIterator {
|
||||
|
||||
protected double sumSquares;
|
||||
protected boolean isPopulation;
|
||||
|
||||
RunningVarianceIterator(boolean isPopulation) {
|
||||
this.isPopulation = isPopulation;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void increment(double value) {
|
||||
super.increment(value);
|
||||
sumSquares += value * value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initialize(double value) {
|
||||
super.initialize(value);
|
||||
sumSquares = value * value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getCurrent() {
|
||||
double mean = super.getCurrent();
|
||||
double denominator = isPopulation ? currentCount : currentCount - 1;
|
||||
return (sumSquares - 2 * mean * sum + currentCount * mean * mean) / denominator;
|
||||
}
|
||||
}
|
||||
|
||||
private static class RunningStandardDeviationIterator extends RunningVarianceIterator {
|
||||
|
||||
RunningStandardDeviationIterator(boolean isPopulation) {
|
||||
super(isPopulation);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getCurrent() {
|
||||
return Math.sqrt(super.getCurrent());
|
||||
}
|
||||
}
|
||||
|
||||
private static class RunningSkewIterator extends RunningStandardDeviationIterator {
|
||||
|
||||
protected double sumCubes;
|
||||
|
||||
RunningSkewIterator(boolean isPopulation) {
|
||||
super(isPopulation);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void increment(double value) {
|
||||
super.increment(value);
|
||||
sumCubes += value * value * value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initialize(double value) {
|
||||
super.initialize(value);
|
||||
sumCubes = value * value * value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getCurrent() {
|
||||
if (currentCount <= 2) {
|
||||
return Double.NaN;
|
||||
}
|
||||
double mean = sum / currentCount;
|
||||
double standardDeviation = super.getCurrent();
|
||||
double denominator =
|
||||
isPopulation
|
||||
? currentCount
|
||||
: ((double) ((currentCount - 1) * (currentCount - 2)) / (double) currentCount);
|
||||
double scale =
|
||||
1.0 / (standardDeviation * standardDeviation * standardDeviation) / denominator;
|
||||
double skew = (sumCubes - 3 * mean * sumSquares + 2 * mean * mean * sum) * scale;
|
||||
return skew;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initialize(double value) {
|
||||
current = value;
|
||||
currentCount = 1;
|
||||
}
|
||||
private static class RunningKurtosisIterator extends RunningVarianceIterator {
|
||||
|
||||
@Override
|
||||
public double getCurrent() {
|
||||
return current / currentCount;
|
||||
private double sumCubes;
|
||||
private double sumQuads;
|
||||
|
||||
/**
 * Creates a kurtosis iterator. The {@code false} passed to the variance iterator selects its
 * sample (count - 1) denominator.
 */
RunningKurtosisIterator() {
  super(false);
}
|
||||
|
||||
@Override
|
||||
public void increment(double value) {
|
||||
super.increment(value);
|
||||
sumCubes += value * value * value;
|
||||
sumQuads += value * value * value * value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initialize(double value) {
|
||||
super.initialize(value);
|
||||
sumCubes = value * value * value;
|
||||
sumQuads = value * value * value * value;
|
||||
currentCount = 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getCurrent() {
|
||||
if (currentCount <= 3) {
|
||||
return Double.NaN;
|
||||
}
|
||||
double mean = sum / currentCount;
|
||||
double variance = super.getCurrent();
|
||||
double scale =
|
||||
(double) (currentCount * (currentCount + 1))
|
||||
/ (double)
|
||||
((currentCount - 1)
|
||||
* (currentCount - 2)
|
||||
* (currentCount - 3)
|
||||
* variance
|
||||
* variance);
|
||||
double shift =
|
||||
(double) (3 * (currentCount - 1) * (currentCount - 1))
|
||||
/ (double) ((currentCount - 2) * (currentCount - 3));
|
||||
double kurtosis =
|
||||
(sumQuads
|
||||
- 4 * mean * sumCubes
|
||||
+ 6 * mean * mean * sumSquares
|
||||
- 3 * mean * mean * mean * sum)
|
||||
* scale
|
||||
- shift;
|
||||
return kurtosis;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -245,7 +245,7 @@ add_specs suite_builder =
|
||||
# 4 | SG0456 | E | 73.77 | 73.23
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
group_builder.specify "Can provide running mmin of integer columns (returning column of integers)" <|
|
||||
group_builder.specify "Can provide running min of integer columns (returning column of integers)" <|
|
||||
result = data.integer_table.running Statistic.Minimum "Ticket Price" "Min ticket cost"
|
||||
expected_column = Column.from_vector "Min ticket cost" [101, 101, 73, 73, 73]
|
||||
# | Flight | Passenger | Ticket Price | Min ticket cost
|
||||
@ -271,6 +271,98 @@ add_specs suite_builder =
|
||||
# 4 | SG0456 | E | 74 | 73
|
||||
expected_table = int16_table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
suite_builder.group "running variance" group_builder->
|
||||
data = Data.setup
|
||||
group_builder.specify "Can calculate Variance with population" <|
|
||||
result = data.table.running (Statistic.Variance True) "Ticket Price"
|
||||
expected_column = Column.from_vector "Running (Variance True) of Ticket Price" [0.0, 56522.685024999984, 53289.11228888889, 43515.194424999994, 38026.81874399999]
|
||||
# | Flight | Passenger | Ticket Price | Running (Variance True) of Ticket Price
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 0.0
|
||||
# 1 | BA0123 | B | 575.99 | 56522.685024999984
|
||||
# 2 | SG0456 | A | 73.23 | 53289.11228888889
|
||||
# 3 | BA0123 | C | 112.34 | 43515.194424999994
|
||||
# 4 | SG0456 | E | 73.77 | 38026.81874399999
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
group_builder.specify "Can calculate Variance without population" <|
|
||||
result = data.table.running (Statistic.Variance False) "Ticket Price"
|
||||
expected_column = Column.from_vector "Running (Variance False) of Ticket Price" [Number.nan, 113045.37004999997, 79933.66843333334, 58020.25923333332, 47533.52342999999]
|
||||
# | Flight | Passenger | Ticket Price | Running (Variance False) of Ticket Price
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | NaN
|
||||
# 1 | BA0123 | B | 575.99 | 113045.37004999997
|
||||
# 2 | SG0456 | A | 73.23 | 79933.66843333334
|
||||
# 3 | BA0123 | C | 112.34 | 58020.25923333332
|
||||
# 4 | SG0456 | E | 73.77 | 47533.52342999999
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
suite_builder.group "running standard deviation" group_builder->
|
||||
data = Data.setup
|
||||
group_builder.specify "Can calculate Standard Deviation with population" <|
|
||||
result = data.table.running (Statistic.Standard_Deviation True) "Ticket Price"
|
||||
expected_column = Column.from_vector "Running (Standard_Deviation True) of Ticket Price" [0.0, 237.74499999999998, 230.84434645208208, 208.60295881171004, 195.00466339039176]
|
||||
# | Flight | Passenger | Ticket Price | Running (Standard_Deviation True) of Ticket Price
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | 0.0
|
||||
# 1 | BA0123 | B | 575.99 | 237.74499999999998
|
||||
# 2 | SG0456 | A | 73.23 | 230.84434645208208
|
||||
# 3 | BA0123 | C | 112.34 | 208.60295881171004
|
||||
# 4 | SG0456 | E | 73.77 | 195.00466339039176
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
group_builder.specify "Can calculate Standard Deviation without population" <|
|
||||
result = data.table.running (Statistic.Standard_Deviation False) "Ticket Price"
|
||||
expected_column = Column.from_vector "Running (Standard_Deviation False) of Ticket Price" [Number.nan, 336.2222033863914, 282.7254294069307, 240.8739488473864, 218.02184163519027]
|
||||
# | Flight | Passenger | Ticket Price | Running (Standard_Deviation False) of Ticket Price
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | NaN
|
||||
# 1 | BA0123 | B | 575.99 | 336.2222033863914
|
||||
# 2 | SG0456 | A | 73.23 | 282.7254294069307
|
||||
# 3 | BA0123 | C | 112.34 | 240.8739488473864
|
||||
# 4 | SG0456 | E | 73.77 | 218.02184163519027
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
suite_builder.group "running skew" group_builder->
|
||||
data = Data.setup
|
||||
group_builder.specify "Can calculate skew with population" <|
|
||||
result = data.table.running (Statistic.Skew True) "Ticket Price"
|
||||
expected_column = Column.from_vector "Running (Skew True) of Ticket Price" [Number.nan, Number.nan, 0.6997131676317522, 1.138558183958172, 1.4773820041422983]
|
||||
# | Flight | Passenger | Ticket Price | Running (Skew True) of Ticket Price
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | NaN
|
||||
# 1 | BA0123 | B | 575.99 | NaN
|
||||
# 2 | SG0456 | A | 73.23 | 0.6997131676317522
|
||||
# 3 | BA0123 | C | 112.34 | 1.138558183958172
|
||||
# 4 | SG0456 | E | 73.77 | 1.4773820041422983
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
group_builder.specify "Can calculate skew without population" <|
|
||||
result = data.table.running (Statistic.Skew False) "Ticket Price"
|
||||
expected_column = Column.from_vector "Running (Skew False) of Ticket Price" [Number.nan, Number.nan, 1.7139402270043034, 1.9720406219889064, 2.202351059998037]
|
||||
# | Flight | Passenger | Ticket Price | Running (Skew False) of Ticket Price
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | NaN
|
||||
# 1 | BA0123 | B | 575.99 | NaN
|
||||
# 2 | SG0456 | A | 73.23 | 1.7139402270043034
|
||||
# 3 | BA0123 | C | 112.34 | 1.9720406219889064
|
||||
# 4 | SG0456 | E | 73.77 | 2.202351059998037
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
suite_builder.group "running kurtosis" group_builder->
|
||||
data = Data.setup
|
||||
group_builder.specify "Can calculate kurtosis" <|
|
||||
result = data.table.running Statistic.Kurtosis "Ticket Price"
|
||||
expected_column = Column.from_vector "Running Kurtosis of Ticket Price" [Number.nan, Number.nan, Number.nan, 3.910697052351704, 4.878357490643253]
|
||||
# | Flight | Passenger | Ticket Price | Running Kurtosis of Ticket Price
|
||||
#---+--------+-----------+--------------+-------------------------
|
||||
# 0 | BA0123 | A | 100.5 | NaN
|
||||
# 1 | BA0123 | B | 575.99 | NaN
|
||||
# 2 | SG0456 | A | 73.23 | NaN
|
||||
# 3 | BA0123 | C | 112.34 | 3.910697052351704
|
||||
# 4 | SG0456 | E | 73.77 | 4.878357490643253
|
||||
expected_table = data.table.zip expected_column
|
||||
result.should_equal expected_table
|
||||
suite_builder.group "nothing handling" group_builder->
|
||||
# | Flight | Passenger | Ticket Price
|
||||
#---+--------+-----------+--------------
|
||||
|
Loading…
Reference in New Issue
Block a user