Add table running variance skew sd and kurtosis (#9854)

Adds support for Variance, Skew, Standard Deviation and Kurtosis to Table.Running.
This commit is contained in:
AdRiley 2024-05-09 11:45:29 +03:00 committed by GitHub
parent d395168bc5
commit e25ec96aaa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 354 additions and 29 deletions

View File

@ -145,9 +145,12 @@ type Statistic
Statistic.Maximum -> Java_Statistic.Maximum
Statistic.Sum -> Java_Statistic.Sum
Statistic.Mean -> Java_Statistic.Mean
Statistic.Variance _ -> Java_Statistic.Variance
Statistic.Standard_Deviation _ -> Java_Statistic.StandardDeviation
Statistic.Skew _ -> Java_Statistic.Skew
Statistic.Variance True -> Java_Statistic.VariancePopulation
Statistic.Variance False -> Java_Statistic.VarianceSample
Statistic.Standard_Deviation True -> Java_Statistic.StandardDeviationPopulation
Statistic.Standard_Deviation False -> Java_Statistic.StandardDeviationSample
Statistic.Skew True -> Java_Statistic.SkewPopulation
Statistic.Skew False -> Java_Statistic.SkewSample
Statistic.Kurtosis -> Java_Statistic.Kurtosis
Statistic.Covariance _ -> Java_Statistic.Covariance
Statistic.Pearson _ -> Java_Statistic.Pearson

View File

@ -6,9 +6,12 @@ public enum Statistic {
Maximum,
Sum,
Mean,
Variance,
Standard_Deviation,
Skew,
VariancePopulation,
VarianceSample,
StandardDeviationPopulation,
StandardDeviationSample,
SkewPopulation,
SkewSample,
Kurtosis,
Covariance,
Pearson,

View File

@ -54,6 +54,28 @@ public class AddRunning {
}
return new RunningMaxStatistic(sourceColumn, problemAggregator);
}
case VariancePopulation -> {
return new RunningVarianceStatistic(sourceColumn, problemAggregator, true);
}
case VarianceSample -> {
return new RunningVarianceStatistic(sourceColumn, problemAggregator, false);
}
case StandardDeviationPopulation -> {
return new RunningStandardDeviationStatistic(sourceColumn, problemAggregator, true);
}
case StandardDeviationSample -> {
return new RunningStandardDeviationStatistic(sourceColumn, problemAggregator, false);
}
case SkewPopulation -> {
return new RunningSkewStatistic(sourceColumn, problemAggregator, true);
}
case SkewSample -> {
return new RunningSkewStatistic(sourceColumn, problemAggregator, false);
}
case Kurtosis -> {
return new RunningKurtosisStatistic(sourceColumn, problemAggregator);
}
default -> throw new IllegalArgumentException("Unsupported statistic: " + statistic);
}
}
@ -199,14 +221,6 @@ public class AddRunning {
public RunningIterator<Double> getNewIterator() {
return new RunningSumIterator();
}
/**
 * Running-sum iterator (earlier form of this class): each increment adds the incoming value onto
 * {@code current}.
 *
 * <p>NOTE(review): {@code current} is not declared here; presumably it is an accumulator field
 * inherited from {@code RunningIteratorBase} - confirm against the base class.
 */
private static class RunningSumIterator extends RunningIteratorBase {
@Override
public void increment(double value) {
// Fold the new value into the accumulated total.
current += value;
}
}
}
private static class RunningMeanStatistic extends RunningStatisticBase<Double> {
@ -219,27 +233,240 @@ public class AddRunning {
public RunningIterator<Double> getNewIterator() {
return new RunningMeanIterator();
}
}
private static class RunningMeanIterator extends RunningIteratorBase {
private static class RunningVarianceStatistic extends RunningStatisticBase<Double> {
private int currentCount;
private final boolean isPopulationVariance;
@Override
public void increment(double value) {
current += value;
currentCount++;
/**
 * Creates the running-variance statistic.
 *
 * <p>The column and problem aggregator are forwarded to the base class together with a new
 * {@code DoubleHandler}; the flag is stored so it can be handed to each iterator created later.
 *
 * @param sourceColumn column forwarded to the base class
 * @param problemAggregator problem aggregator forwarded to the base class
 * @param isPopulationVariance flag passed on to every {@code RunningVarianceIterator} created
 */
RunningVarianceStatistic(
    Column sourceColumn,
    ProblemAggregator problemAggregator,
    boolean isPopulationVariance) {
  super(sourceColumn, problemAggregator, new DoubleHandler());
  this.isPopulationVariance = isPopulationVariance;
}
/** Builds a fresh {@code RunningVarianceIterator} configured with this statistic's flag. */
@Override
public RunningIterator<Double> getNewIterator() {
  final RunningIterator<Double> iterator = new RunningVarianceIterator(isPopulationVariance);
  return iterator;
}
}
/**
 * Running-statistic descriptor for the running standard deviation of a column.
 *
 * <p>Remembers the population/sample choice and hands it to every iterator it creates.
 */
private static class RunningStandardDeviationStatistic extends RunningStatisticBase<Double> {
  /** Flag handed unchanged to each {@code RunningStandardDeviationIterator}. */
  private final boolean isPopulation;

  /**
   * @param sourceColumn column forwarded to the base class
   * @param problemAggregator problem aggregator forwarded to the base class
   * @param isPopulation {@code true} for the population form, {@code false} for the sample form
   */
  RunningStandardDeviationStatistic(
      Column sourceColumn,
      ProblemAggregator problemAggregator,
      boolean isPopulation) {
    super(sourceColumn, problemAggregator, new DoubleHandler());
    this.isPopulation = isPopulation;
  }

  /** Builds a fresh standard-deviation iterator carrying this statistic's flag. */
  @Override
  public RunningIterator<Double> getNewIterator() {
    final RunningIterator<Double> iterator = new RunningStandardDeviationIterator(isPopulation);
    return iterator;
  }
}
/**
 * Running-statistic descriptor for the running skew of a column.
 *
 * <p>Remembers the population/sample choice and hands it to every iterator it creates.
 */
private static class RunningSkewStatistic extends RunningStatisticBase<Double> {
  /** Flag handed unchanged to each {@code RunningSkewIterator}. */
  private final boolean isPopulation;

  /**
   * @param sourceColumn column forwarded to the base class
   * @param problemAggregator problem aggregator forwarded to the base class
   * @param isPopulation {@code true} for the population form, {@code false} for the sample form
   */
  RunningSkewStatistic(
      Column sourceColumn,
      ProblemAggregator problemAggregator,
      boolean isPopulation) {
    super(sourceColumn, problemAggregator, new DoubleHandler());
    this.isPopulation = isPopulation;
  }

  /** Builds a fresh skew iterator carrying this statistic's flag. */
  @Override
  public RunningIterator<Double> getNewIterator() {
    final RunningIterator<Double> iterator = new RunningSkewIterator(isPopulation);
    return iterator;
  }
}
/**
 * Running-statistic descriptor for the running kurtosis of a column.
 *
 * <p>Unlike the variance, standard-deviation and skew descriptors it takes no population flag.
 */
private static class RunningKurtosisStatistic extends RunningStatisticBase<Double> {
  /**
   * @param sourceColumn column forwarded to the base class
   * @param problemAggregator problem aggregator forwarded to the base class
   */
  RunningKurtosisStatistic(Column sourceColumn, ProblemAggregator problemAggregator) {
    super(sourceColumn, problemAggregator, new DoubleHandler());
  }

  /** Builds a fresh {@code RunningKurtosisIterator}. */
  @Override
  public RunningIterator<Double> getNewIterator() {
    final RunningIterator<Double> iterator = new RunningKurtosisIterator();
    return iterator;
  }
}
/**
 * Iterator that keeps a running total of the values it has been given.
 *
 * <p>The total lives in {@link #sum}, which subclasses read to derive further statistics.
 */
private static class RunningSumIterator extends RunningIteratorBase {
  /** Total of all values seen since the last {@link #initialize(double)}. */
  protected double sum;

  /** Returns the total accumulated so far. */
  @Override
  public double getCurrent() {
    return sum;
  }

  /** Lets the base class initialise itself, then restarts the total at {@code value}. */
  @Override
  public void initialize(double value) {
    super.initialize(value);
    sum = value;
  }

  /** Adds {@code value} to the total. */
  @Override
  public void increment(double value) {
    sum = sum + value;
  }
}
/**
 * Iterator producing the running arithmetic mean.
 *
 * <p>Builds on the running sum by also counting how many values have been folded in.
 */
private static class RunningMeanIterator extends RunningSumIterator {
  /** Number of values seen since the last {@link #initialize(double)}. */
  protected int currentCount;

  /** Restarts the sum at {@code value} and the count at one. */
  @Override
  public void initialize(double value) {
    super.initialize(value);
    currentCount = 1;
  }

  /** Adds {@code value} to the sum and bumps the count. */
  @Override
  public void increment(double value) {
    super.increment(value);
    currentCount += 1;
  }

  /** Returns the sum divided by the number of values seen. */
  @Override
  public double getCurrent() {
    final double total = sum;
    return total / currentCount;
  }
}
/**
 * Iterator producing the running variance.
 *
 * <p>Extends the running mean with a running sum of squares, so the variance can be derived from
 * three accumulators (count, sum, sum of squares) without revisiting earlier values.
 */
private static class RunningVarianceIterator extends RunningMeanIterator {
  /** Sum of the squares of all values seen since the last {@link #initialize(double)}. */
  protected double sumSquares;

  /** {@code true} divides by the count (population), {@code false} by count - 1 (sample). */
  protected boolean isPopulation;

  /**
   * @param isPopulation {@code true} for population variance, {@code false} for sample variance
   */
  RunningVarianceIterator(boolean isPopulation) {
    this.isPopulation = isPopulation;
  }

  /** Folds {@code value} into the count, the sum and the sum of squares. */
  @Override
  public void increment(double value) {
    super.increment(value);
    sumSquares += value * value;
  }

  /** Restarts all accumulators from a single first value. */
  @Override
  public void initialize(double value) {
    super.initialize(value);
    sumSquares = value * value;
  }

  /**
   * Returns the variance of the values seen so far.
   *
   * <p>For the sample form with a single value the divisor is zero, so the result is not a
   * finite number (0.0 / 0.0 yields NaN).
   */
  @Override
  public double getCurrent() {
    double mean = super.getCurrent();
    double denominator = isPopulation ? currentCount : currentCount - 1;
    // Expanded form of sum((x - mean)^2); evaluation order is kept unchanged so results for
    // well-conditioned data stay bit-identical.
    double sumSquaredDeviations = sumSquares - 2 * mean * sum + currentCount * mean * mean;
    // Cancellation between large, nearly equal terms can leave a tiny negative residue for
    // (near-)constant data. A variance can never be negative, and a negative value would turn
    // into NaN once a subclass takes its square root, so clamp at zero. NaN inputs stay NaN.
    return Math.max(0.0, sumSquaredDeviations) / denominator;
  }
}
/**
 * Iterator producing the running standard deviation, i.e. the square root of the running
 * variance computed by the parent class.
 */
private static class RunningStandardDeviationIterator extends RunningVarianceIterator {
  /**
   * @param isPopulation forwarded to the variance iterator to pick population or sample form
   */
  RunningStandardDeviationIterator(boolean isPopulation) {
    super(isPopulation);
  }

  /** Returns the square root of the parent's current variance. */
  @Override
  public double getCurrent() {
    final double variance = super.getCurrent();
    return Math.sqrt(variance);
  }
}
/**
 * Iterator producing the running skew.
 *
 * <p>Extends the running standard deviation with a running sum of cubes; the third central
 * moment is reconstructed from the count, sum, sum of squares and sum of cubes.
 */
private static class RunningSkewIterator extends RunningStandardDeviationIterator {
  /** Sum of the cubes of all values seen since the last {@link #initialize(double)}. */
  protected double sumCubes;

  /**
   * @param isPopulation {@code true} for the population form, {@code false} for the sample form
   */
  RunningSkewIterator(boolean isPopulation) {
    super(isPopulation);
  }

  /** Folds {@code value} into the inherited accumulators and the sum of cubes. */
  @Override
  public void increment(double value) {
    super.increment(value);
    sumCubes += value * value * value;
  }

  /** Restarts all accumulators from a single first value. */
  @Override
  public void initialize(double value) {
    super.initialize(value);
    sumCubes = value * value * value;
  }

  /**
   * Returns the skew of the values seen so far, or NaN while fewer than three values have been
   * seen.
   */
  @Override
  public double getCurrent() {
    if (currentCount <= 2) {
      return Double.NaN;
    }
    // Work with the count as a double: the int product (n - 1) * (n - 2) wraps around once the
    // count exceeds roughly 46,000, which silently corrupted the sample-skew divisor.
    double count = currentCount;
    double mean = sum / count;
    double standardDeviation = super.getCurrent();
    double denominator = isPopulation ? count : ((count - 1) * (count - 2)) / count;
    double scale =
        1.0 / (standardDeviation * standardDeviation * standardDeviation) / denominator;
    // Expanded form of sum((x - mean)^3), using count * mean == sum.
    return (sumCubes - 3 * mean * sumSquares + 2 * mean * mean * sum) * scale;
  }
}
@Override
public void initialize(double value) {
current = value;
currentCount = 1;
}
private static class RunningKurtosisIterator extends RunningVarianceIterator {
@Override
public double getCurrent() {
return current / currentCount;
private double sumCubes;
private double sumQuads;
/**
 * Creates the kurtosis iterator.
 *
 * <p>Passes {@code false} as the parent's {@code isPopulation} flag, so the inherited variance
 * uses the sample (count - 1) divisor.
 */
RunningKurtosisIterator() {
  super(false);
}
/** Folds {@code value} into the inherited accumulators and the sums of cubes and fourth powers. */
@Override
public void increment(double value) {
  super.increment(value);
  // (value * value) * value, then times value once more: same evaluation order as writing the
  // products out in full.
  final double cube = value * value * value;
  sumCubes += cube;
  sumQuads += cube * value;
}
/**
 * Restarts all accumulators from a single first value.
 *
 * <p>The inherited {@code initialize} chain already resets the count, the sum and the sum of
 * squares, so only the two accumulators owned by this class are set here.
 */
@Override
public void initialize(double value) {
  super.initialize(value);
  sumCubes = value * value * value;
  sumQuads = value * value * value * value;
}
/**
 * Returns the kurtosis of the values seen so far, or NaN while fewer than four values have been
 * seen.
 *
 * <p>With n the count, s² the inherited variance and m4 the sum of fourth powers of deviations
 * from the mean, the result is
 * {@code n(n+1) / ((n-1)(n-2)(n-3)) * m4 / s^4 - 3(n-1)^2 / ((n-2)(n-3))}.
 */
@Override
public double getCurrent() {
  if (currentCount <= 3) {
    return Double.NaN;
  }
  // All count arithmetic is done in double. As int products, (n-1)*(n-2)*(n-3) wraps around
  // from roughly 1,300 values onwards (and the other products somewhat later), which silently
  // produced wrong results on longer columns. For small counts the products are exact in
  // double, so results there are unchanged.
  double n = currentCount;
  double mean = sum / n;
  double variance = super.getCurrent();
  double scale = (n * (n + 1)) / ((n - 1) * (n - 2) * (n - 3) * variance * variance);
  double shift = (3 * (n - 1) * (n - 1)) / ((n - 2) * (n - 3));
  // Expanded form of sum((x - mean)^4), using count * mean == sum.
  double fourthMomentSum =
      sumQuads
          - 4 * mean * sumCubes
          + 6 * mean * mean * sumSquares
          - 3 * mean * mean * mean * sum;
  return fourthMomentSum * scale - shift;
}
}

View File

@ -245,7 +245,7 @@ add_specs suite_builder =
# 4 | SG0456 | E | 73.77 | 73.23
expected_table = data.table.zip expected_column
result.should_equal expected_table
group_builder.specify "Can provide running mmin of integer columns (returning column of integers)" <|
group_builder.specify "Can provide running min of integer columns (returning column of integers)" <|
result = data.integer_table.running Statistic.Minimum "Ticket Price" "Min ticket cost"
expected_column = Column.from_vector "Min ticket cost" [101, 101, 73, 73, 73]
# | Flight | Passenger | Ticket Price | Min ticket cost
@ -271,6 +271,98 @@ add_specs suite_builder =
# 4 | SG0456 | E | 74 | 73
expected_table = int16_table.zip expected_column
result.should_equal expected_table
# Specs for `Table.running` with `Statistic.Variance`, covering both values of its Boolean argument.
# Each spec compares the table returned by `running` with the input table zipped with a column of
# hard-coded expected values; the table in the comments shows the same values row by row.
suite_builder.group "running variance" group_builder->
data = Data.setup
# `Variance True`: the first row is expected to be 0.0.
group_builder.specify "Can calculate Variance with population" <|
result = data.table.running (Statistic.Variance True) "Ticket Price"
expected_column = Column.from_vector "Running (Variance True) of Ticket Price" [0.0, 56522.685024999984, 53289.11228888889, 43515.194424999994, 38026.81874399999]
# | Flight | Passenger | Ticket Price | Running (Variance True) of Ticket Price
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 0.0
# 1 | BA0123 | B | 575.99 | 56522.685024999984
# 2 | SG0456 | A | 73.23 | 53289.11228888889
# 3 | BA0123 | C | 112.34 | 43515.194424999994
# 4 | SG0456 | E | 73.77 | 38026.81874399999
expected_table = data.table.zip expected_column
result.should_equal expected_table
# `Variance False`: the first row is expected to be NaN.
group_builder.specify "Can calculate Variance without population" <|
result = data.table.running (Statistic.Variance False) "Ticket Price"
expected_column = Column.from_vector "Running (Variance False) of Ticket Price" [Number.nan, 113045.37004999997, 79933.66843333334, 58020.25923333332, 47533.52342999999]
# | Flight | Passenger | Ticket Price | Running (Variance False) of Ticket Price
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | NaN
# 1 | BA0123 | B | 575.99 | 113045.37004999997
# 2 | SG0456 | A | 73.23 | 79933.66843333334
# 3 | BA0123 | C | 112.34 | 58020.25923333332
# 4 | SG0456 | E | 73.77 | 47533.52342999999
expected_table = data.table.zip expected_column
result.should_equal expected_table
# Specs for `Table.running` with `Statistic.Standard_Deviation`, covering both values of its
# Boolean argument. Each spec compares the table returned by `running` with the input table zipped
# with a column of hard-coded expected values.
suite_builder.group "running standard deviation" group_builder->
data = Data.setup
# `Standard_Deviation True`: the first row is expected to be 0.0.
group_builder.specify "Can calculate Standard Deviation with population" <|
result = data.table.running (Statistic.Standard_Deviation True) "Ticket Price"
expected_column = Column.from_vector "Running (Standard_Deviation True) of Ticket Price" [0.0, 237.74499999999998, 230.84434645208208, 208.60295881171004, 195.00466339039176]
# | Flight | Passenger | Ticket Price | Running (Standard_Deviation True) of Ticket Price
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | 0.0
# 1 | BA0123 | B | 575.99 | 237.74499999999998
# 2 | SG0456 | A | 73.23 | 230.84434645208208
# 3 | BA0123 | C | 112.34 | 208.60295881171004
# 4 | SG0456 | E | 73.77 | 195.00466339039176
expected_table = data.table.zip expected_column
result.should_equal expected_table
# `Standard_Deviation False`: the first row is expected to be NaN.
group_builder.specify "Can calculate Standard Deviation without population" <|
result = data.table.running (Statistic.Standard_Deviation False) "Ticket Price"
expected_column = Column.from_vector "Running (Standard_Deviation False) of Ticket Price" [Number.nan, 336.2222033863914, 282.7254294069307, 240.8739488473864, 218.02184163519027]
# | Flight | Passenger | Ticket Price | Running (Standard_Deviation False) of Ticket Price
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | NaN
# 1 | BA0123 | B | 575.99 | 336.2222033863914
# 2 | SG0456 | A | 73.23 | 282.7254294069307
# 3 | BA0123 | C | 112.34 | 240.8739488473864
# 4 | SG0456 | E | 73.77 | 218.02184163519027
expected_table = data.table.zip expected_column
result.should_equal expected_table
# Specs for `Table.running` with `Statistic.Skew`, covering both values of its Boolean argument.
# In both specs the first two rows are expected to be NaN; real values start at the third row.
suite_builder.group "running skew" group_builder->
data = Data.setup
# `Skew True`.
group_builder.specify "Can calculate skew with population" <|
result = data.table.running (Statistic.Skew True) "Ticket Price"
expected_column = Column.from_vector "Running (Skew True) of Ticket Price" [Number.nan, Number.nan, 0.6997131676317522, 1.138558183958172, 1.4773820041422983]
# | Flight | Passenger | Ticket Price | Running (Skew True) of Ticket Price
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | NaN
# 1 | BA0123 | B | 575.99 | NaN
# 2 | SG0456 | A | 73.23 | 0.6997131676317522
# 3 | BA0123 | C | 112.34 | 1.138558183958172
# 4 | SG0456 | E | 73.77 | 1.4773820041422983
expected_table = data.table.zip expected_column
result.should_equal expected_table
# `Skew False`.
group_builder.specify "Can calculate skew without population" <|
result = data.table.running (Statistic.Skew False) "Ticket Price"
expected_column = Column.from_vector "Running (Skew False) of Ticket Price" [Number.nan, Number.nan, 1.7139402270043034, 1.9720406219889064, 2.202351059998037]
# | Flight | Passenger | Ticket Price | Running (Skew False) of Ticket Price
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | NaN
# 1 | BA0123 | B | 575.99 | NaN
# 2 | SG0456 | A | 73.23 | 1.7139402270043034
# 3 | BA0123 | C | 112.34 | 1.9720406219889064
# 4 | SG0456 | E | 73.77 | 2.202351059998037
expected_table = data.table.zip expected_column
result.should_equal expected_table
# Spec for `Table.running` with `Statistic.Kurtosis` (which takes no argument here).
# The first three rows are expected to be NaN; real values start at the fourth row.
suite_builder.group "running kurtosis" group_builder->
data = Data.setup
group_builder.specify "Can calculate kurtosis" <|
result = data.table.running Statistic.Kurtosis "Ticket Price"
expected_column = Column.from_vector "Running Kurtosis of Ticket Price" [Number.nan, Number.nan, Number.nan, 3.910697052351704, 4.878357490643253]
# | Flight | Passenger | Ticket Price | Running Kurtosis of Ticket Price
#---+--------+-----------+--------------+-------------------------
# 0 | BA0123 | A | 100.5 | NaN
# 1 | BA0123 | B | 575.99 | NaN
# 2 | SG0456 | A | 73.23 | NaN
# 3 | BA0123 | C | 112.34 | 3.910697052351704
# 4 | SG0456 | E | 73.77 | 4.878357490643253
expected_table = data.table.zip expected_column
result.should_equal expected_table
suite_builder.group "nothing handling" group_builder->
# | Flight | Passenger | Ticket Price
#---+--------+-----------+--------------