From e25ec96aaa9f57e407af01540e132906d6c5e4fb Mon Sep 17 00:00:00 2001 From: AdRiley Date: Thu, 9 May 2024 11:45:29 +0300 Subject: [PATCH] Add table running variance skew sd and kurtosis (#9854) Adds support for Variance, Skew, Standard Deviation and Kurtosis to Table.Running. --- .../Base/0.0.0-dev/src/Data/Statistics.enso | 9 +- .../org/enso/base/statistics/Statistic.java | 9 +- .../org/enso/table/operations/AddRunning.java | 271 ++++++++++++++++-- .../src/In_Memory/Table_Running_Spec.enso | 94 +++++- 4 files changed, 354 insertions(+), 29 deletions(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso index e62e9714512..5806b456860 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Statistics.enso @@ -145,9 +145,12 @@ type Statistic Statistic.Maximum -> Java_Statistic.Maximum Statistic.Sum -> Java_Statistic.Sum Statistic.Mean -> Java_Statistic.Mean - Statistic.Variance _ -> Java_Statistic.Variance - Statistic.Standard_Deviation _ -> Java_Statistic.StandardDeviation - Statistic.Skew _ -> Java_Statistic.Skew + Statistic.Variance True -> Java_Statistic.VariancePopulation + Statistic.Variance False -> Java_Statistic.VarianceSample + Statistic.Standard_Deviation True -> Java_Statistic.StandardDeviationPopulation + Statistic.Standard_Deviation False -> Java_Statistic.StandardDeviationSample + Statistic.Skew True -> Java_Statistic.SkewPopulation + Statistic.Skew False -> Java_Statistic.SkewSample Statistic.Kurtosis -> Java_Statistic.Kurtosis Statistic.Covariance _ -> Java_Statistic.Covariance Statistic.Pearson _ -> Java_Statistic.Pearson diff --git a/std-bits/base/src/main/java/org/enso/base/statistics/Statistic.java b/std-bits/base/src/main/java/org/enso/base/statistics/Statistic.java index 036dc20e152..fe99ee6bef2 100644 --- 
a/std-bits/base/src/main/java/org/enso/base/statistics/Statistic.java +++ b/std-bits/base/src/main/java/org/enso/base/statistics/Statistic.java @@ -6,9 +6,12 @@ public enum Statistic { Maximum, Sum, Mean, - Variance, - Standard_Deviation, - Skew, + VariancePopulation, + VarianceSample, + StandardDeviationPopulation, + StandardDeviationSample, + SkewPopulation, + SkewSample, Kurtosis, Covariance, Pearson, diff --git a/std-bits/table/src/main/java/org/enso/table/operations/AddRunning.java b/std-bits/table/src/main/java/org/enso/table/operations/AddRunning.java index 9e25ff0c1e9..853361b0d84 100644 --- a/std-bits/table/src/main/java/org/enso/table/operations/AddRunning.java +++ b/std-bits/table/src/main/java/org/enso/table/operations/AddRunning.java @@ -54,6 +54,28 @@ public class AddRunning { } return new RunningMaxStatistic(sourceColumn, problemAggregator); } + case VariancePopulation -> { + return new RunningVarianceStatistic(sourceColumn, problemAggregator, true); + } + case VarianceSample -> { + return new RunningVarianceStatistic(sourceColumn, problemAggregator, false); + } + case StandardDeviationPopulation -> { + return new RunningStandardDeviationStatistic(sourceColumn, problemAggregator, true); + } + case StandardDeviationSample -> { + return new RunningStandardDeviationStatistic(sourceColumn, problemAggregator, false); + } + case SkewPopulation -> { + return new RunningSkewStatistic(sourceColumn, problemAggregator, true); + } + case SkewSample -> { + return new RunningSkewStatistic(sourceColumn, problemAggregator, false); + } + case Kurtosis -> { + return new RunningKurtosisStatistic(sourceColumn, problemAggregator); + } + default -> throw new IllegalArgumentException("Unsupported statistic: " + statistic); } } @@ -199,14 +221,6 @@ public class AddRunning { public RunningIterator getNewIterator() { return new RunningSumIterator(); } - - private static class RunningSumIterator extends RunningIteratorBase { - - @Override - public void increment(double value) 
{ - current += value; - } - } } private static class RunningMeanStatistic extends RunningStatisticBase { @@ -219,27 +233,240 @@ public class AddRunning { public RunningIterator getNewIterator() { return new RunningMeanIterator(); } + } - private static class RunningMeanIterator extends RunningIteratorBase { + private static class RunningVarianceStatistic extends RunningStatisticBase { - private int currentCount; + private final boolean isPopulationVariance; - @Override - public void increment(double value) { - current += value; - currentCount++; + RunningVarianceStatistic( + Column sourceColumn, ProblemAggregator problemAggregator, boolean isPopulationVariance) { + super(sourceColumn, problemAggregator, new DoubleHandler()); + this.isPopulationVariance = isPopulationVariance; + } + + @Override + public RunningIterator getNewIterator() { + return new RunningVarianceIterator(isPopulationVariance); + } + } + + private static class RunningStandardDeviationStatistic extends RunningStatisticBase { + + private final boolean isPopulation; + + RunningStandardDeviationStatistic( + Column sourceColumn, ProblemAggregator problemAggregator, boolean isPopulation) { + super(sourceColumn, problemAggregator, new DoubleHandler()); + this.isPopulation = isPopulation; + } + + @Override + public RunningIterator getNewIterator() { + return new RunningStandardDeviationIterator(isPopulation); + } + } + + private static class RunningSkewStatistic extends RunningStatisticBase { + + private final boolean isPopulation; + + RunningSkewStatistic( + Column sourceColumn, ProblemAggregator problemAggregator, boolean isPopulation) { + super(sourceColumn, problemAggregator, new DoubleHandler()); + this.isPopulation = isPopulation; + } + + @Override + public RunningIterator getNewIterator() { + return new RunningSkewIterator(isPopulation); + } + } + + private static class RunningKurtosisStatistic extends RunningStatisticBase { + + RunningKurtosisStatistic(Column sourceColumn, ProblemAggregator 
problemAggregator) { + super(sourceColumn, problemAggregator, new DoubleHandler()); + } + + @Override + public RunningIterator getNewIterator() { + return new RunningKurtosisIterator(); + } + } + + private static class RunningSumIterator extends RunningIteratorBase { + + protected double sum; + + @Override + public void initialize(double value) { + super.initialize(value); + sum = value; + } + + @Override + public void increment(double value) { + sum += value; + } + + @Override + public double getCurrent() { + return sum; + } + } + + private static class RunningMeanIterator extends RunningSumIterator { + + protected int currentCount; + + @Override + public void increment(double value) { + super.increment(value); + currentCount++; + } + + @Override + public void initialize(double value) { + super.initialize(value); + currentCount = 1; + } + + @Override + public double getCurrent() { + return sum / currentCount; + } + } + + private static class RunningVarianceIterator extends RunningMeanIterator { + + protected double sumSquares; + protected boolean isPopulation; + + RunningVarianceIterator(boolean isPopulation) { + this.isPopulation = isPopulation; + } + + @Override + public void increment(double value) { + super.increment(value); + sumSquares += value * value; + } + + @Override + public void initialize(double value) { + super.initialize(value); + sumSquares = value * value; + } + + @Override + public double getCurrent() { + double mean = super.getCurrent(); + double denominator = isPopulation ? 
currentCount : currentCount - 1; + return (sumSquares - 2 * mean * sum + currentCount * mean * mean) / denominator; + } + } + + private static class RunningStandardDeviationIterator extends RunningVarianceIterator { + + RunningStandardDeviationIterator(boolean isPopulation) { + super(isPopulation); + } + + @Override + public double getCurrent() { + return Math.sqrt(super.getCurrent()); + } + } + + private static class RunningSkewIterator extends RunningStandardDeviationIterator { + + protected double sumCubes; + + RunningSkewIterator(boolean isPopulation) { + super(isPopulation); + } + + @Override + public void increment(double value) { + super.increment(value); + sumCubes += value * value * value; + } + + @Override + public void initialize(double value) { + super.initialize(value); + sumCubes = value * value * value; + } + + @Override + public double getCurrent() { + if (currentCount <= 2) { + return Double.NaN; } + double mean = sum / currentCount; + double standardDeviation = super.getCurrent(); + double denominator = + isPopulation + ? 
currentCount + : ((double) (currentCount - 1) * (currentCount - 2) / (double) currentCount); + double scale = + 1.0 / (standardDeviation * standardDeviation * standardDeviation) / denominator; + double skew = (sumCubes - 3 * mean * sumSquares + 2 * mean * mean * sum) * scale; + return skew; + } + } - @Override - public void initialize(double value) { - current = value; - currentCount = 1; - } + private static class RunningKurtosisIterator extends RunningVarianceIterator { - @Override - public double getCurrent() { - return current / currentCount; + private double sumCubes; + private double sumQuads; + + RunningKurtosisIterator() { + super(false); + } + + @Override + public void increment(double value) { + super.increment(value); + sumCubes += value * value * value; + sumQuads += value * value * value * value; + } + + @Override + public void initialize(double value) { + super.initialize(value); + sumCubes = value * value * value; + sumQuads = value * value * value * value; + currentCount = 1; + } + /** NaN below four values; count products are computed in double to avoid int overflow. */ + @Override + public double getCurrent() { + if (currentCount <= 3) { + return Double.NaN; } + double mean = sum / currentCount; + double variance = super.getCurrent(); + double scale = + ((double) currentCount * (currentCount + 1)) + / (double) + ((double) (currentCount - 1) + * (currentCount - 2) + * (currentCount - 3) + * variance + * variance); + double shift = + (3.0 * (currentCount - 1) * (currentCount - 1)) + / ((double) (currentCount - 2) * (currentCount - 3)); + double kurtosis = + (sumQuads + - 4 * mean * sumCubes + + 6 * mean * mean * sumSquares + - 3 * mean * mean * mean * sum) + * scale + - shift; + return kurtosis; } } diff --git a/test/Table_Tests/src/In_Memory/Table_Running_Spec.enso b/test/Table_Tests/src/In_Memory/Table_Running_Spec.enso index 742d276d36e..51ab3f13da5 100644 --- a/test/Table_Tests/src/In_Memory/Table_Running_Spec.enso +++ b/test/Table_Tests/src/In_Memory/Table_Running_Spec.enso @@ -245,7 +245,7 @@ add_specs suite_builder = # 4 | SG0456 | 
E | 73.77 | 73.23 expected_table = data.table.zip expected_column result.should_equal expected_table - group_builder.specify "Can provide running mmin of integer columns (returning column of integers)" <| + group_builder.specify "Can provide running min of integer columns (returning column of integers)" <| result = data.integer_table.running Statistic.Minimum "Ticket Price" "Min ticket cost" expected_column = Column.from_vector "Min ticket cost" [101, 101, 73, 73, 73] # | Flight | Passenger | Ticket Price | Min ticket cost @@ -271,6 +271,98 @@ add_specs suite_builder = # 4 | SG0456 | E | 74 | 73 expected_table = int16_table.zip expected_column result.should_equal expected_table + suite_builder.group "running variance" group_builder-> + data = Data.setup + group_builder.specify "Can calculate Variance with population" <| + result = data.table.running (Statistic.Variance True) "Ticket Price" + expected_column = Column.from_vector "Running (Variance True) of Ticket Price" [0.0, 56522.685024999984, 53289.11228888889, 43515.194424999994, 38026.81874399999] + # | Flight | Passenger | Ticket Price | Running (Variance True) of Ticket Price + #---+--------+-----------+--------------+------------------------- + # 0 | BA0123 | A | 100.5 | 0.0 + # 1 | BA0123 | B | 575.99 | 56522.685024999984 + # 2 | SG0456 | A | 73.23 | 53289.11228888889 + # 3 | BA0123 | C | 112.34 | 43515.194424999994 + # 4 | SG0456 | E | 73.77 | 38026.81874399999 + expected_table = data.table.zip expected_column + result.should_equal expected_table + group_builder.specify "Can calculate Variance without population" <| + result = data.table.running (Statistic.Variance False) "Ticket Price" + expected_column = Column.from_vector "Running (Variance False) of Ticket Price" [Number.nan, 113045.37004999997, 79933.66843333334, 58020.25923333332, 47533.52342999999] + # | Flight | Passenger | Ticket Price | Running (Variance False) of Ticket Price + #---+--------+-----------+--------------+------------------------- + 
# 0 | BA0123 | A | 100.5 | NaN + # 1 | BA0123 | B | 575.99 | 113045.37004999997 + # 2 | SG0456 | A | 73.23 | 79933.66843333334 + # 3 | BA0123 | C | 112.34 | 58020.25923333332 + # 4 | SG0456 | E | 73.77 | 47533.52342999999 + expected_table = data.table.zip expected_column + result.should_equal expected_table + suite_builder.group "running standard deviation" group_builder-> + data = Data.setup + group_builder.specify "Can calculate Standard Deviation with population" <| + result = data.table.running (Statistic.Standard_Deviation True) "Ticket Price" + expected_column = Column.from_vector "Running (Standard_Deviation True) of Ticket Price" [0.0, 237.74499999999998, 230.84434645208208, 208.60295881171004, 195.00466339039176] + # | Flight | Passenger | Ticket Price | Running (Standard_Deviation True) of Ticket Price + #---+--------+-----------+--------------+------------------------- + # 0 | BA0123 | A | 100.5 | 0.0 + # 1 | BA0123 | B | 575.99 | 237.74499999999998 + # 2 | SG0456 | A | 73.23 | 230.84434645208208 + # 3 | BA0123 | C | 112.34 | 208.60295881171004 + # 4 | SG0456 | E | 73.77 | 195.00466339039176 + expected_table = data.table.zip expected_column + result.should_equal expected_table + group_builder.specify "Can calculate Standard Deviation without population" <| + result = data.table.running (Statistic.Standard_Deviation False) "Ticket Price" + expected_column = Column.from_vector "Running (Standard_Deviation False) of Ticket Price" [Number.nan, 336.2222033863914, 282.7254294069307, 240.8739488473864, 218.02184163519027] + # | Flight | Passenger | Ticket Price | Running (Standard_Deviation False) of Ticket Price + #---+--------+-----------+--------------+------------------------- + # 0 | BA0123 | A | 100.5 | NaN + # 1 | BA0123 | B | 575.99 | 336.2222033863914 + # 2 | SG0456 | A | 73.23 | 282.7254294069307 + # 3 | BA0123 | C | 112.34 | 240.8739488473864 + # 4 | SG0456 | E | 73.77 | 218.02184163519027 + expected_table = data.table.zip expected_column + 
result.should_equal expected_table + suite_builder.group "running skew" group_builder-> + data = Data.setup + group_builder.specify "Can calculate skew with population" <| + result = data.table.running (Statistic.Skew True) "Ticket Price" + expected_column = Column.from_vector "Running (Skew True) of Ticket Price" [Number.nan, Number.nan, 0.6997131676317522, 1.138558183958172, 1.4773820041422983] + # | Flight | Passenger | Ticket Price | Running (Skew True) of Ticket Price + #---+--------+-----------+--------------+------------------------- + # 0 | BA0123 | A | 100.5 | NaN + # 1 | BA0123 | B | 575.99 | NaN + # 2 | SG0456 | A | 73.23 | 0.6997131676317522 + # 3 | BA0123 | C | 112.34 | 1.138558183958172 + # 4 | SG0456 | E | 73.77 | 1.4773820041422983 + expected_table = data.table.zip expected_column + result.should_equal expected_table + group_builder.specify "Can calculate skew without population" <| + result = data.table.running (Statistic.Skew False) "Ticket Price" + expected_column = Column.from_vector "Running (Skew False) of Ticket Price" [Number.nan, Number.nan, 1.7139402270043034, 1.9720406219889064, 2.202351059998037] + # | Flight | Passenger | Ticket Price | Running (Skew False) of Ticket Price + #---+--------+-----------+--------------+------------------------- + # 0 | BA0123 | A | 100.5 | NaN + # 1 | BA0123 | B | 575.99 | NaN + # 2 | SG0456 | A | 73.23 | 1.7139402270043034 + # 3 | BA0123 | C | 112.34 | 1.9720406219889064 + # 4 | SG0456 | E | 73.77 | 2.202351059998037 + expected_table = data.table.zip expected_column + result.should_equal expected_table + suite_builder.group "running kurtosis" group_builder-> + data = Data.setup + group_builder.specify "Can calculate kurtosis" <| + result = data.table.running Statistic.Kurtosis "Ticket Price" + expected_column = Column.from_vector "Running Kurtosis of Ticket Price" [Number.nan, Number.nan, Number.nan, 3.910697052351704, 4.878357490643253] + # | Flight | Passenger | Ticket Price | Running Kurtosis of Ticket 
Price + #---+--------+-----------+--------------+------------------------- + # 0 | BA0123 | A | 100.5 | NaN + # 1 | BA0123 | B | 575.99 | NaN + # 2 | SG0456 | A | 73.23 | NaN + # 3 | BA0123 | C | 112.34 | 3.910697052351704 + # 4 | SG0456 | E | 73.77 | 4.878357490643253 + expected_table = data.table.zip expected_column + result.should_equal expected_table suite_builder.group "nothing handling" group_builder-> # | Flight | Passenger | Ticket Price #---+--------+-----------+--------------