mirror of
https://github.com/enso-org/enso.git
synced 2024-11-23 08:08:34 +03:00
Rank Data, Correlation, Covariance, R Squared (#3484)
- Added new `Statistic`s: Covariance, Pearson, Spearman, R Squared - Added `covariance_matrix` function - Added `pearson_correlation` function to compute correlation matrix - Added `rank_data` and Rank_Method type to create rankings of a Vector - Added `spearman_correlation` function to compute Spearman Rank correlation matrix # Important Notes - Added `Panic.throw_wrapped_if_error` and `Panic.handle_wrapped_dataflow_error` to help with errors within a loop. - Removed `Array.set_at` use from `Table.Vector_Builder`
This commit is contained in:
parent
dac49a44b5
commit
1aa0bb3552
@ -131,6 +131,7 @@
|
||||
and made it the default.][3472]
|
||||
- [Implemented a `Table.from Text` conversion allowing to parse strings
|
||||
representing `Delimited` files without storing them on the filesystem.][3478]
|
||||
- [Added rank data, correlation and covariance statistics for `Vector`][3484]
|
||||
|
||||
[debug-shortcuts]:
|
||||
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
|
||||
@ -204,6 +205,7 @@
|
||||
[3472]: https://github.com/enso-org/enso/pull/3472
|
||||
[3486]: https://github.com/enso-org/enso/pull/3486
|
||||
[3478]: https://github.com/enso-org/enso/pull/3478
|
||||
[3484]: https://github.com/enso-org/enso/pull/3484
|
||||
|
||||
#### Enso Compiler
|
||||
|
||||
|
@ -1,16 +1,25 @@
|
||||
from Standard.Base import Boolean, True, False, Nothing, Vector, Number, Any, Error, Array, Panic, Illegal_Argument_Error, Unsupported_Argument_Types
|
||||
|
||||
from Standard.Base.Data.Vector import Empty_Error
|
||||
|
||||
import Standard.Base.Data.Ordering.Comparator
|
||||
|
||||
import Standard.Base.Data.Statistics.Rank_Method
|
||||
|
||||
polyglot java import org.enso.base.statistics.Moments
|
||||
polyglot java import org.enso.base.statistics.CountMinMax
|
||||
polyglot java import org.enso.base.statistics.CorrelationStatistics
|
||||
polyglot java import org.enso.base.statistics.Rank
|
||||
|
||||
polyglot java import java.lang.IllegalArgumentException
|
||||
polyglot java import java.lang.ClassCastException
|
||||
polyglot java import java.lang.NullPointerException
|
||||
|
||||
type Statistic
|
||||
## PRIVATE
|
||||
Convert the Enso Statistic into Java equivalent.
|
||||
to_java : SingleValue
|
||||
to_java = case this of
|
||||
to_moment_statistic : SingleValue
|
||||
to_moment_statistic = case this of
|
||||
Sum -> Moments.SUM
|
||||
Mean -> Moments.MEAN
|
||||
Variance p -> if p then Moments.VARIANCE_POPULATION else Moments.VARIANCE
|
||||
@ -52,6 +61,32 @@ type Statistic
|
||||
## The sample kurtosis of the values.
|
||||
type Kurtosis
|
||||
|
||||
## Calculate the Covariance between data and series.

   Pairs in which either value is `Nothing` or NaN are skipped. The result
   is divided by the number of remaining pairs, and is NaN when fewer than
   two pairs remain.

   Arguments:
   - series: the series to compute the covariance with. It must have the
     same length as the data.
type Covariance (series:Vector)
|
||||
|
||||
## Calculate the Pearson Correlation between data and series.

   Pairs in which either value is `Nothing` or NaN are skipped. The result
   is NaN when fewer than two pairs remain.

   Arguments:
   - series: the series to compute the correlation with. It must have the
     same length as the data.
type Pearson (series:Vector)
|
||||
|
||||
## Calculate the Spearman Rank Correlation between data and series.

   Pairs in which either value is `Nothing` or NaN are skipped. The
   remaining values of each series are ranked (equal values share the
   average of their ranks) and the Pearson correlation of the ranks is
   returned.

   Arguments:
   - series: the series to compute the correlation with. It must have the
     same length as the data.
type Spearman (series:Vector)
|
||||
|
||||
## Calculate the coefficient of determination between data and predicted
   series.

   The value is computed as the square of the Pearson correlation between
   the two series.

   Arguments:
   - predicted: the series to compute the r_squared with. It must have the
     same length as the data.
type R_Squared (predicted:Vector)
|
||||
|
||||
|
||||
## Compute a single statistic on a vector like object.
|
||||
|
||||
Arguments:
|
||||
@ -69,11 +104,11 @@ compute data statistic=Count =
|
||||
- statistics: Set of statistics to calculate.
|
||||
compute_bulk : Vector -> [Statistic] -> [Any]
|
||||
compute_bulk data statistics=[Count, Sum] =
|
||||
|
||||
count_min_max = statistics.any s->((s.is_a Count) || (s.is_a Minimum) || (s.is_a Maximum))
|
||||
|
||||
java_stats = statistics.map .to_java
|
||||
java_stats = statistics.map .to_moment_statistic
|
||||
skip_java_stats = java_stats.all s->s.is_nothing
|
||||
|
||||
report_invalid _ =
|
||||
statistics.map_with_index i->v->
|
||||
if java_stats.at i . is_nothing then Nothing else
|
||||
@ -97,8 +132,88 @@ compute_bulk data statistics=[Count, Sum] =
|
||||
Maximum ->
|
||||
if count_min_max_values.comparatorError then (Error.throw Vector.Incomparable_Values_Error) else
|
||||
count_min_max_values.maximum
|
||||
Covariance s -> here.calculate_correlation_statistics data s . covariance
|
||||
Pearson s -> here.calculate_correlation_statistics data s . pearsonCorrelation
|
||||
Spearman s -> here.calculate_spearman_rank data s
|
||||
R_Squared s -> here.calculate_correlation_statistics data s . rSquared
|
||||
_ -> stats_array.at i
|
||||
|
||||
|
||||
## Builds the variance-covariance matrix of a set of series.

   The entry at row `i` and column `j` is the covariance between series `i`
   and series `j`.

   Arguments:
   - data: The input data sets
covariance_matrix : [Vector] -> [Vector]
covariance_matrix data =
    correlation_rows = here.calculate_correlation_statistics_matrix data
    correlation_rows.map row->(row.map .covariance)
|
||||
|
||||
|
||||
## Builds the Pearson correlation matrix of a set of series.

   The entry at row `i` and column `j` is the Pearson correlation between
   series `i` and series `j`.

   Arguments:
   - data: The input data sets
pearson_correlation : [Vector] -> [Vector]
pearson_correlation data =
    correlation_rows = here.calculate_correlation_statistics_matrix data
    correlation_rows.map row->(row.map .pearsonCorrelation)
|
||||
|
||||
|
||||
## Calculate a Spearman Rank correlation matrix between the input series.

   The matrix is built row by row. Diagonal entries are `1`, entries below
   the diagonal are copied from the rows already built, and the remaining
   entries are computed from the corresponding pair of series.

   Arguments:
   - data: The input data sets
spearman_correlation : [Vector] -> [Vector]
spearman_correlation data =
    # A dataflow error raised for any cell is wrapped into a panic inside the
    # loop and turned back into a dataflow error here.
    Panic.handle_wrapped_dataflow_error <|
        output = Vector.new_builder data.length

        0.up_to data.length . each i->
            output.append <|
                Vector.new data.length j->
                    if j == i then 1 else
                        # Reuse the value computed for the mirrored cell (j, i)
                        # from an earlier row.
                        if j < i then (output.at j . at i) else
                            Panic.throw_wrapped_if_error <|
                                here.calculate_spearman_rank (data.at i) (data.at j)

        output.to_vector
|
||||
|
||||
|
||||
## PRIVATE
   Runs the given Java call, converting an `Unsupported_Argument_Types` or
   `IllegalArgumentException` panic into an `Illegal_Argument_Error`
   dataflow error.
wrap_java_call : Any -> Any
wrap_java_call ~function =
    illegal_to_error caught_panic = Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage)
    catch_illegal = Panic.catch IllegalArgumentException handler=illegal_to_error

    unsupported_to_error _ = Error.throw (Illegal_Argument_Error ("Can only compute correlations on numerical data sets."))
    catch_unsupported = Panic.catch Unsupported_Argument_Types handler=unsupported_to_error

    catch_unsupported <| catch_illegal <| function
|
||||
|
||||
|
||||
## PRIVATE
   Given two series, get a computed CorrelationStatistics object.

   Both series are converted to arrays and passed to the Java
   `CorrelationStatistics.compute`; panics from that call are handled by
   `wrap_java_call`.
calculate_correlation_statistics : Vector -> Vector -> CorrelationStatistics
calculate_correlation_statistics x_data y_data =
    here.wrap_java_call <| CorrelationStatistics.compute x_data.to_array y_data.to_array
|
||||
|
||||
|
||||
## PRIVATE
   Given two series, compute the Spearman Rank correlation.

   Both series are converted to arrays and passed to the Java
   `CorrelationStatistics.spearmanRankCorrelation`; panics from that call are
   handled by `wrap_java_call`.
calculate_spearman_rank : Vector -> Vector -> Decimal
calculate_spearman_rank x_data y_data =
    here.wrap_java_call <| CorrelationStatistics.spearmanRankCorrelation x_data.to_array y_data.to_array
|
||||
|
||||
|
||||
## PRIVATE
   Given a set of series get CorrelationStatistics objects.

   NOTE(review): the result is built as a Vector of Vectors (one inner
   Vector per input series), although the signature below reads
   `[CorrelationStatistics]` - confirm which is intended.
calculate_correlation_statistics_matrix : [Vector] -> [CorrelationStatistics]
calculate_correlation_statistics_matrix data =
    # Convert the Vector of Vectors into an array of arrays for the Java call.
    data_array = Vector.new data.length i->(data.at i).to_array . to_array
    stats_array = here.wrap_java_call <| CorrelationStatistics.computeMatrix data_array
    # Wrap every row of the returned matrix in a Vector.
    Vector.new stats_array.length i->(Vector.Vector (stats_array.at i))
|
||||
|
||||
|
||||
## Compute a single statistic on the vector.
|
||||
|
||||
Arguments:
|
||||
@ -115,3 +230,26 @@ Vector.Vector.compute statistic=Count =
|
||||
Vector.Vector.compute_bulk : [Statistic] -> [Any]
|
||||
Vector.Vector.compute_bulk statistics=[Count, Sum] =
|
||||
here.compute_bulk this statistics
|
||||
|
||||
|
||||
## Assigns a rank to each value of input, dealing with equal values according
   to the method. The largest value receives rank 1.

   Arguments:
   - input: Input data to rank.
   - method: Method used to deal with equal values.

   Returns a dataflow `Illegal_Argument_Error` if the input contains
   `Nothing`, and `Vector.Incomparable_Values_Error` if the values cannot be
   compared with each other.
rank_data : Vector -> Rank_Method -> Vector
rank_data input method=Rank_Method.Average =
    java_method = case method of
        Rank_Method.Minimum -> Rank.Method.MINIMUM
        Rank_Method.Maximum -> Rank.Method.MAXIMUM
        Rank_Method.Average -> Rank.Method.AVERAGE
        Rank_Method.Ordinal -> Rank.Method.ORDINAL
        Rank_Method.Dense -> Rank.Method.DENSE

    report_nullpointer caught_panic = Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage)
    handle_nullpointer = Panic.catch NullPointerException handler=report_nullpointer
    # The handler must be a function of the caught panic; passing an already
    # constructed dataflow error here only worked by accident.
    report_classcast _ = Error.throw Vector.Incomparable_Values_Error
    handle_classcast = Panic.catch ClassCastException handler=report_classcast

    handle_classcast <| handle_nullpointer <|
        java_ranks = Rank.rank input.to_array Comparator.new java_method
        Vector.Vector java_ranks
|
||||
|
@ -0,0 +1,18 @@
|
||||
|
||||
## Specifies how to handle ranking of equal values.
type Rank_Method
    ## Use the mean of all ranks for equal values.
    type Average

    ## Use the lowest of all ranks for equal values.
    type Minimum

    ## Use the highest of all ranks for equal values.
    type Maximum

    ## Use the same rank value for equal values; the next group receives the
       immediately following ranking number.
    type Dense

    ## Equal values are assigned consecutive ranks in the order that they
       occur.
    type Ordinal
|
@ -55,19 +55,22 @@ fill length ~item =
|
||||
A vector allows storing an arbitrary number of elements in linear memory. It
|
||||
is the recommended data structure for most applications.
|
||||
|
||||
Arguments:
|
||||
- capacity: Initial capacity of the Vector.Builder
|
||||
|
||||
> Example
|
||||
Construct a vector using a builder that contains the items 1 to 10.
|
||||
|
||||
example_new_builder =
|
||||
builder = Vector.new_builder
|
||||
builder = Vector.new_builder 10
|
||||
do_build start stop =
|
||||
builder.append start
|
||||
if start >= stop then Nothing else
|
||||
@Tail_Call do_build start+1 stop
|
||||
do_build 1 10
|
||||
builder.to_vector
|
||||
new_builder : Builder
|
||||
new_builder = Builder.new
|
||||
new_builder : Integer -> Builder
|
||||
new_builder (capacity=1) = Builder.new capacity
|
||||
|
||||
## ADVANCED
|
||||
|
||||
@ -141,13 +144,7 @@ type Vector
|
||||
at : Integer -> Any ! Index_Out_Of_Bounds_Error
|
||||
at index =
|
||||
actual_index = if index < 0 then this.length + index else index
|
||||
## TODO [RW] Ideally we do not want an additional check here, but we
|
||||
should catch a Invalid_Array_Index_Error panic. However, such a catch
|
||||
should still properly forward any other panics or dataflow errors
|
||||
which is not fully possible until the approach to handling Panics is
|
||||
improved, as described in the following Pivotal ticket:
|
||||
https://www.pivotaltracker.com/n/projects/2539304/stories/181029230
|
||||
if actual_index>=0 && actual_index<this.length then this.unsafe_at actual_index else
|
||||
Panic.catch Invalid_Array_Index_Error (this.unsafe_at actual_index) _->
|
||||
Error.throw (Index_Out_Of_Bounds_Error index this.length)
|
||||
|
||||
## ADVANCED
|
||||
@ -1015,12 +1012,15 @@ type Builder
|
||||
|
||||
## Creates a new builder.
|
||||
|
||||
Arguments:
|
||||
- capacity: Initial capacity of the Vector.Builder
|
||||
|
||||
> Example
|
||||
Make a new builder
|
||||
|
||||
Vector.new_builder
|
||||
new : Builder
|
||||
new = Builder (Array.new 1) 0
|
||||
new : Integer->Builder
|
||||
new (capacity=1) = Builder (Array.new capacity) 0
|
||||
|
||||
## Returns the current capacity (i.e. the size of the underlying storage)
|
||||
of this builder.
|
||||
@ -1088,6 +1088,18 @@ type Builder
|
||||
this.append item
|
||||
Nothing
|
||||
|
||||
## Gets an element from the vector at a specified index (0-based).

   Arguments:
   - index: The location in the vector to get the element from. The index is
     also allowed to be negative, then the elements are indexed from the back
     of the vector, i.e. -1 will correspond to the last element.

   Returns a dataflow `Index_Out_Of_Bounds_Error` if the index does not refer
   to an element that has already been appended.
at : Integer -> Any ! Index_Out_Of_Bounds_Error
at index =
    actual_index = if index < 0 then this.length + index else index
    # Check against the number of appended elements rather than relying on the
    # backing array to reject the index: the backing storage may be larger
    # than `length`, so an unchecked read could return an unfilled slot.
    if actual_index>=0 && actual_index<this.length then this.to_array.at actual_index else
        Error.throw (Index_Out_Of_Bounds_Error index this.length)
|
||||
|
||||
## Checks whether a predicate holds for at least one element of this builder.
|
||||
|
||||
Arguments:
|
||||
|
@ -386,6 +386,23 @@ type Panic
|
||||
True -> caught_panic.convert_to_dataflow_error
|
||||
False -> Panic.throw caught_panic
|
||||
|
||||
## If a dataflow error had occurred, wrap it in a `Wrapped_Dataflow_Error` and
   promote to a Panic. Otherwise the value is returned unchanged.

   Intended to be used together with `handle_wrapped_dataflow_error`, which
   converts the panic back into a dataflow error.

   Arguments:
   - value: value to return if not an error, or rethrow as a Panic.
throw_wrapped_if_error : Any -> Any
throw_wrapped_if_error ~value =
    # NOTE(review): `value` is a suspended argument and is referenced twice on
    # either path below; if suspended arguments are re-evaluated on every use,
    # the underlying computation runs twice - confirm.
    if value.is_error then Panic.throw (Wrapped_Dataflow_Error value.catch) else value
|
||||
|
||||
## Catch any `Wrapped_Dataflow_Error` Panic and rethrow it as a dataflow error.

   Other panics are not caught by this handler.

   Arguments:
   - action: The code to execute that potentially raised a Wrapped_Dataflow_Error.
handle_wrapped_dataflow_error : Any -> Any
handle_wrapped_dataflow_error ~action =
    Panic.catch Wrapped_Dataflow_Error action caught_panic->
        # The panic payload is the `Wrapped_Dataflow_Error`; presumably its own
        # `payload` field holds the original dataflow error payload.
        Error.throw caught_panic.payload.payload
|
||||
|
||||
## The runtime representation of a syntax error.
|
||||
|
||||
Arguments:
|
||||
|
@ -50,8 +50,7 @@ type Vector_Builder
|
||||
array = Array.new this.length
|
||||
go ix elem = case elem of
|
||||
Leaf vec ->
|
||||
vec.map_with_index vi-> elem->
|
||||
array.set_at ix+vi elem
|
||||
Array.copy vec.to_array 0 array ix vec.length
|
||||
ix + vec.length
|
||||
Append l r _ ->
|
||||
ix2 = go ix l
|
||||
|
@ -0,0 +1,96 @@
|
||||
package org.enso.base.statistics;
|
||||
|
||||
/** Class to compute covariance and correlations between series. */
|
||||
public class CorrelationStatistics {
|
||||
private long count = 0;
|
||||
private double totalX = 0.0;
|
||||
private double totalXX = 0.0;
|
||||
private double totalY = 0.0;
|
||||
private double totalYY = 0.0;
|
||||
private double totalXY = 0.0;
|
||||
|
||||
private void append(Double x, Double y) {
|
||||
if (x == null || x.isNaN() || y == null || y.isNaN()) {
|
||||
return;
|
||||
}
|
||||
|
||||
count++;
|
||||
totalX += x;
|
||||
totalXX += x * x;
|
||||
totalY += y;
|
||||
totalYY += y * y;
|
||||
totalXY += x * y;
|
||||
}
|
||||
|
||||
public double covariance() {
|
||||
if (count < 2) {
|
||||
return Double.NaN;
|
||||
}
|
||||
|
||||
return (totalXY - totalX * totalY / count) / count;
|
||||
}
|
||||
|
||||
public double pearsonCorrelation() {
|
||||
if (count < 2) {
|
||||
return Double.NaN;
|
||||
}
|
||||
|
||||
double n_stdev_x = Math.sqrt(count * totalXX - totalX * totalX);
|
||||
double n_stdev_y = Math.sqrt(count * totalYY - totalY * totalY);
|
||||
return (count * totalXY - totalX * totalY) / (n_stdev_x * n_stdev_y);
|
||||
}
|
||||
|
||||
public double rSquared() {
|
||||
double correl = this.pearsonCorrelation();
|
||||
return correl * correl;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create the CorrelationStats between two series
|
||||
*
|
||||
* @param x Array of X values
|
||||
* @param y Array of Y values
|
||||
* @return CorrelationStats object for the 2 series.
|
||||
*/
|
||||
public static CorrelationStatistics compute(Double[] x, Double[] y) {
|
||||
if (x.length != y.length) {
|
||||
throw new IllegalArgumentException("Left and right lengths are not the same.");
|
||||
}
|
||||
|
||||
CorrelationStatistics output = new CorrelationStatistics();
|
||||
for (int i = 0; i < x.length; i++) {
|
||||
output.append(x[i], y[i]);
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
public static CorrelationStatistics[][] computeMatrix(Double[][] data) {
|
||||
int len = data[0].length;
|
||||
|
||||
CorrelationStatistics[][] output = new CorrelationStatistics[data.length][];
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
if (data[i].length != len) {
|
||||
throw new IllegalArgumentException("Data lengths are not consistent.");
|
||||
}
|
||||
output[i] = new CorrelationStatistics[data.length];
|
||||
for (int j = 0; j < data.length; j++) {
|
||||
if (j < i) {
|
||||
output[i][j] = output[j][i];
|
||||
} else {
|
||||
output[i][j] = compute(data[i], data[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
public static double spearmanRankCorrelation(Double[] x, Double[] y) {
|
||||
double[][] pairedRanks = Rank.pairedRanks(x, y, Rank.Method.AVERAGE);
|
||||
|
||||
CorrelationStatistics computation = new CorrelationStatistics();
|
||||
for (int i = 0; i < pairedRanks[0].length; i++) {
|
||||
computation.append(pairedRanks[0][i], pairedRanks[1][i]);
|
||||
}
|
||||
return computation.pearsonCorrelation();
|
||||
}
|
||||
}
|
@ -0,0 +1,95 @@
|
||||
package org.enso.base.statistics;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class Rank {
|
||||
private static final Comparator<Object> DOUBLE_COMPARATOR = (a, b) -> Double.compare((Double)a, (Double)b);
|
||||
|
||||
public enum Method {
|
||||
AVERAGE,
|
||||
MINIMUM,
|
||||
MAXIMUM,
|
||||
DENSE,
|
||||
ORDINAL
|
||||
}
|
||||
|
||||
private record ValueWithIndex(Object value, int index) {
|
||||
}
|
||||
|
||||
public static double[] rank(Object[] input, Comparator<Object> comparator, Method method)
|
||||
throws NullPointerException, ClassCastException
|
||||
{
|
||||
List<ValueWithIndex> tuples = new ArrayList<>(input.length);
|
||||
for(int i = 0; i < input.length; i++) {
|
||||
if (input[i] == null) {
|
||||
throw new NullPointerException("Value is Nothing at index " + i);
|
||||
}
|
||||
tuples.add(new ValueWithIndex(input[i], i));
|
||||
}
|
||||
|
||||
return computeRankFromTuples(tuples, comparator, method);
|
||||
}
|
||||
|
||||
public static double[][] pairedRanks(Double[] x, Double[] y, Method method)
|
||||
throws IllegalArgumentException, NullPointerException, ClassCastException
|
||||
{
|
||||
if (x.length != y.length) {
|
||||
throw new IllegalArgumentException("Left and right lengths are not the same.");
|
||||
}
|
||||
|
||||
List<ValueWithIndex> x_tuples = new ArrayList<>(x.length);
|
||||
List<ValueWithIndex> y_tuples = new ArrayList<>(y.length);
|
||||
for (int i = 0; i < x.length; i++) {
|
||||
if (x[i] == null || Double.isNaN(x[i]) || y[i] == null || Double.isNaN(y[i])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
x_tuples.add(new ValueWithIndex(x[i], x_tuples.size()));
|
||||
y_tuples.add(new ValueWithIndex(y[i], y_tuples.size()));
|
||||
}
|
||||
|
||||
return new double[][] {
|
||||
computeRankFromTuples(x_tuples, DOUBLE_COMPARATOR, method),
|
||||
computeRankFromTuples(y_tuples, DOUBLE_COMPARATOR, method)
|
||||
};
|
||||
}
|
||||
|
||||
private static double[] computeRankFromTuples(List<ValueWithIndex> tuples, Comparator<Object> comparator, Method method)
|
||||
throws NullPointerException, ClassCastException
|
||||
{
|
||||
Comparator<ValueWithIndex> tupleComparator = (a, b) -> {
|
||||
int c = comparator.compare(a.value, b.value);
|
||||
return c == 0 ? Integer.compare(a.index, b.index) : -c;
|
||||
};
|
||||
tuples.sort(tupleComparator);
|
||||
|
||||
double[] output = new double[tuples.size()];
|
||||
|
||||
int index = 0;
|
||||
int dense = 0;
|
||||
while (index < tuples.size()) {
|
||||
dense++;
|
||||
int start = index;
|
||||
|
||||
// Find End of Equal Values
|
||||
while (index < tuples.size() && comparator.compare(tuples.get(start).value, tuples.get(index).value) == 0) {
|
||||
index++;
|
||||
}
|
||||
|
||||
// Build Rank
|
||||
for (int i = start; i < index; i++) {
|
||||
double rank = switch (method) {
|
||||
case MINIMUM -> start + 1;
|
||||
case MAXIMUM -> index;
|
||||
case DENSE -> dense;
|
||||
case AVERAGE -> (start + 1 + index) / 2.0;
|
||||
case ORDINAL -> i + 1;
|
||||
};
|
||||
|
||||
output[tuples.get(i).index] = rank;
|
||||
}
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
}
|
@ -1,6 +1,7 @@
|
||||
from Standard.Base import Nothing, Vector, Number, True, Illegal_Argument_Error, False
|
||||
from Standard.Base import Nothing, Vector, Number, Decimal, True, Illegal_Argument_Error, False
|
||||
|
||||
import Standard.Base.Data.Statistics
|
||||
import Standard.Base.Data.Statistics.Rank_Method
|
||||
from Standard.Base.Data.Statistics import all
|
||||
|
||||
import Standard.Test
|
||||
@ -17,18 +18,21 @@ type No_Ord number
|
||||
# Tests
|
||||
|
||||
spec =
|
||||
simple_set = [1, 2, 3, 4, 5]
|
||||
number_set = [0.4, -18.56, -16.99, -16.43, -45.84, 13.44, -6.85, 9.68, -8.55, 10.87, 10.38, 33.85, -41.02, 1.87, -26.52, -13.87, -39.06, 25.92, -16.01, 42.01]
|
||||
missing_set = number_set.map_with_index i->v->(if i % 5 == 4 then Nothing else v)
|
||||
with_nans_set = number_set.map_with_index i->v->(if i % 5 == 4 then (if i % 10 == 9 then Number.nan else Nothing) else v)
|
||||
text_set = ["A", "B", Nothing, "D"]
|
||||
|
||||
ord_set = [Ord 10, Ord 2, Nothing, Ord 9]
|
||||
no_ord_set = [No_Ord 10, No_Ord 2, Nothing, No_Ord 9]
|
||||
|
||||
double_error = 0.000001
|
||||
|
||||
vector_compare values expected =
|
||||
values.each_with_index i->v->
|
||||
case v of
|
||||
Decimal -> v.should_equal (expected.at i) epsilon=double_error
|
||||
_ -> v.should_equal (expected.at i)
|
||||
|
||||
Test.group "Statistics" <|
|
||||
simple_set = [1, 2, 3, 4, 5]
|
||||
number_set = [0.4, -18.56, -16.99, -16.43, -45.84, 13.44, -6.85, 9.68, -8.55, 10.87, 10.38, 33.85, -41.02, 1.87, -26.52, -13.87, -39.06, 25.92, -16.01, 42.01]
|
||||
missing_set = number_set.map_with_index i->v->(if i % 5 == 4 then Nothing else v)
|
||||
with_nans_set = number_set.map_with_index i->v->(if i % 5 == 4 then (if i % 10 == 9 then Number.nan else Nothing) else v)
|
||||
text_set = ["A", "B", Nothing, "D"]
|
||||
|
||||
Test.specify "should be able to count valid values" <|
|
||||
simple_set.compute . should_equal 5
|
||||
number_set.compute . should_equal 20
|
||||
@ -111,8 +115,9 @@ spec =
|
||||
stats = [Count, Minimum, Mean, Variance, Skew]
|
||||
expected = [20, -45.84, -5.064, 582.0137832, 0.165086552]
|
||||
values = number_set.compute_bulk stats
|
||||
values.map_with_index i->v->((expected.at i - v).abs < double_error) . any v->(v == True) . should_equal True
|
||||
vector_compare values expected
|
||||
|
||||
Test.group "Statistics - empty Vector " <|
|
||||
Test.specify "should be able to count and sum on empty Vector" <|
|
||||
[].compute . should_equal 0
|
||||
[].compute Sum . should_equal 0
|
||||
@ -127,6 +132,11 @@ spec =
|
||||
[].compute Skew . is_nan . should_equal True
|
||||
[].compute Kurtosis . is_nan . should_equal True
|
||||
|
||||
Test.group "Statistics - invalid input" <|
|
||||
text_set = ["A", "B", Nothing, "D"]
|
||||
ord_set = [Ord 10, Ord 2, Nothing, Ord 9]
|
||||
no_ord_set = [No_Ord 10, No_Ord 2, Nothing, No_Ord 9]
|
||||
|
||||
Test.specify "should fail with Illegal_Argument_Error on number based statistics for text Vector" <|
|
||||
text_set.compute Sum . should_fail_with Illegal_Argument_Error
|
||||
text_set.compute Mean . should_fail_with Illegal_Argument_Error
|
||||
@ -147,4 +157,95 @@ spec =
|
||||
Test.specify "should fail with Incomparable_Values_Error on mixed Vectors" <|
|
||||
[1, False].compute Minimum . should_fail_with Vector.Incomparable_Values_Error
|
||||
|
||||
Test.group "Rank Data" <|
|
||||
Test.specify "can rank a Decimal data series" <|
|
||||
values = [409.892906, 0.839952, 796.468572, 126.931298, -405.265005, -476.675817, 441.651325, 796.468572, 78.50094, 340.163324, 234.861926, 409.892906, 226.467105, 234.861926, 126.931298, 637.870512, -71.008044, -386.399663, -126.534337, -476.675817, 78.50094, -386.399663, 409.892906, 868.54485, 669.113037, 669.113037, 0.839952, 407.162613, -476.675817, 126.931298]
|
||||
Statistics.rank_data values . should_equal [9, 21.5, 2.5, 17, 27, 29, 7, 2.5, 19.5, 12, 13.5, 9, 15, 13.5, 17, 6, 23, 25.5, 24, 29, 19.5, 25.5, 9, 1, 4.5, 4.5, 21.5, 11, 29, 17]
|
||||
Statistics.rank_data values Rank_Method.Minimum . should_equal [8, 21, 2, 16, 27, 28, 7, 2, 19, 12, 13, 8, 15, 13, 16, 6, 23, 25, 24, 28, 19, 25, 8, 1, 4, 4, 21, 11, 28, 16]
|
||||
Statistics.rank_data values Rank_Method.Maximum . should_equal [10, 22, 3, 18, 27, 30, 7, 3, 20, 12, 14, 10, 15, 14, 18, 6, 23, 26, 24, 30, 20, 26, 10, 1, 5, 5, 22, 11, 30, 18]
|
||||
Statistics.rank_data values Rank_Method.Ordinal . should_equal [8, 21, 2, 16, 27, 28, 7, 3, 19, 12, 13, 9, 15, 14, 17, 6, 23, 25, 24, 29, 20, 26, 10, 1, 4, 5, 22, 11, 30, 18]
|
||||
Statistics.rank_data values Rank_Method.Dense . should_equal [6, 13, 2, 11, 17, 18, 5, 2, 12, 8, 9, 6, 10, 9, 11, 4, 14, 16, 15, 18, 12, 16, 6, 1, 3, 3, 13, 7, 18, 11]
|
||||
|
||||
Test.specify "can rank an Integer data series" <|
|
||||
values = [10, 1, 124, 10]
|
||||
Statistics.rank_data values . should_equal [2.5, 4, 1, 2.5]
|
||||
|
||||
Test.specify "can rank a Number data series" <|
|
||||
values = [10.0, 1, 12.4, 10]
|
||||
Statistics.rank_data values . should_equal [2.5, 4, 1, 2.5]
|
||||
|
||||
Test.specify "can rank a Text data series" <|
|
||||
values = ["G", "AA", "B", "G", "D"]
|
||||
Statistics.rank_data values . should_equal [1.5, 5, 4, 1.5, 3]
|
||||
|
||||
Test.specify "should fail with Incomparable_Values_Error on custom type without compare_to" <|
|
||||
values = [No_Ord 10, No_Ord 2, No_Ord 9]
|
||||
Statistics.rank_data values . should_fail_with Vector.Incomparable_Values_Error
|
||||
|
||||
Test.specify "should fail with Incomparable_Values_Error on mixed Vectors" <|
|
||||
Statistics.rank_data [1, "A"] . should_fail_with Vector.Incomparable_Values_Error
|
||||
|
||||
Test.specify "should fail with Illegal_Argument_Error on Vectors with Nothing" <|
|
||||
Statistics.rank_data [1, Nothing, 4] . should_fail_with Illegal_Argument_Error
|
||||
|
||||
Test.group "Correlation Statistics" <|
|
||||
series_a = [0.22345,0.258315,0.74663,Nothing,0.686843,0.692246,Nothing,0.401859,0.725442,Nothing,0.963527,0.520363,0.633053,0.397123,Nothing,0.458942,0.036499,0.368194,0.598939,0.296476,0.093746,0.609329]
|
||||
series_b = [0.140743,Nothing,0.574639,0.251683,0.902023,0.08723,0.251813,0.1669,0.234405,Nothing,0.28774,0.471757,0.280681,0.925207,0.919041,0.626234,0.429497,0.358597,0.566118,0.333606,0.828172,0.887829]
|
||||
series_c = [Nothing,0.769797,0.281678,0.462145,0.727132,0.327978,Nothing,0.648639,0.562636,Nothing,0.159836,0.367404,0.877087,0.365483,Nothing,0.931873,0.723546,0.558085,0.163396,0.940997,0.399685,0.617509]
|
||||
series = [series_a, series_b, series_c]
|
||||
|
||||
# Pairwise statistics between two series; expected values are compared within `double_error`.
Test.specify "can compute Covariance, Correlation and R Squared between a pair of series" <|
    series_a.compute (Covariance series_b) . should_equal -0.0053554 epsilon=double_error
    series_a.compute (Pearson series_b) . should_equal -0.08263943 epsilon=double_error
    series_a.compute (Spearman series_b) . should_equal -0.09313725 epsilon=double_error
    series_a.compute (R_Squared series_b) . should_equal 0.006829275 epsilon=double_error
|
||||
|
||||
Test.specify "can calculate a covariance matrix" <|
|
||||
matrix = Statistics.covariance_matrix series
|
||||
matrix.length . should_equal 3
|
||||
vector_compare (matrix.at 0) [0.0571699, -0.0053554, -0.02378204]
|
||||
vector_compare (matrix.at 1) [-0.0053554, 0.07707381, -0.00098274]
|
||||
vector_compare (matrix.at 2) [-0.02378204, -0.00098274, 0.05837098]
|
||||
|
||||
Test.specify "can calculate a pearson correlation matrix" <|
|
||||
matrix = Statistics.pearson_correlation series
|
||||
matrix.length . should_equal 3
|
||||
vector_compare (matrix.at 0) [1, -0.08263943, -0.40469045]
|
||||
vector_compare (matrix.at 1) [-0.08263943, 1, -0.01537537]
|
||||
vector_compare (matrix.at 2) [-0.40469045, -0.01537537, 1]
|
||||
|
||||
Test.specify "can calculate a spearman rank correlation matrix" <|
|
||||
matrix = Statistics.spearman_correlation series
|
||||
matrix.length . should_equal 3
|
||||
vector_compare (matrix.at 0) [1, -0.09313725, -0.43382353]
|
||||
vector_compare (matrix.at 1) [-0.09313725, 1, 0]
|
||||
vector_compare (matrix.at 2) [-0.43382353, 0, 1]
|
||||
|
||||
Test.specify "should fail with Illegal_Argument_Error if different lengths" <|
|
||||
data = [[1,2,3,4],[10,20,30]]
|
||||
data.first.compute (Covariance data.second) . should_fail_with Illegal_Argument_Error
|
||||
data.first.compute (Pearson data.second) . should_fail_with Illegal_Argument_Error
|
||||
data.first.compute (Spearman data.second) . should_fail_with Illegal_Argument_Error
|
||||
data.first.compute (R_Squared data.second) . should_fail_with Illegal_Argument_Error
|
||||
Statistics.covariance_matrix data . should_fail_with Illegal_Argument_Error
|
||||
Statistics.pearson_correlation data . should_fail_with Illegal_Argument_Error
|
||||
Statistics.spearman_correlation data . should_fail_with Illegal_Argument_Error
|
||||
|
||||
Test.specify "should fail with Illegal_Argument_Error if not number based" <|
|
||||
text = [["A","BC","CD"], ["0", "1", "2"], ["H", "I", "J"]]
|
||||
text.first.compute (Covariance text.second) . should_fail_with Illegal_Argument_Error
|
||||
text.first.compute (Pearson text.second) . should_fail_with Illegal_Argument_Error
|
||||
text.first.compute (Spearman text.second) . should_fail_with Illegal_Argument_Error
|
||||
text.first.compute (R_Squared text.second) . should_fail_with Illegal_Argument_Error
|
||||
Statistics.covariance_matrix text . should_fail_with Illegal_Argument_Error
|
||||
Statistics.pearson_correlation text . should_fail_with Illegal_Argument_Error
|
||||
Statistics.spearman_correlation text . should_fail_with Illegal_Argument_Error
|
||||
|
||||
Test.group "Statistics - invalid input" <|
|
||||
Test.specify "should fail with Illegal_Argument_Error on number based statistics for text Vector" <|
|
||||
series = [["A", "B", Nothing, "D"], ["A", "B", Nothing, "D"]]
|
||||
Statistics.covariance_matrix series . should_fail_with Illegal_Argument_Error
|
||||
Statistics.pearson_correlation series . should_fail_with Illegal_Argument_Error
|
||||
|
||||
|
||||
main = Test.Suite.run_main here.spec
|
||||
|
@ -34,6 +34,7 @@ import project.Data.Ref_Spec
|
||||
import project.Data.Text_Spec
|
||||
import project.Data.Time.Spec as Time_Spec
|
||||
import project.Data.Vector_Spec
|
||||
import project.Data.Statistics_Spec
|
||||
import project.Data.Text.Regex_Spec
|
||||
import project.Data.Text.Utils_Spec
|
||||
import project.Data.Text.Default_Regex_Engine_Spec
|
||||
@ -104,4 +105,5 @@ main = Test.Suite.run_main <|
|
||||
Time_Spec.spec
|
||||
Uri_Spec.spec
|
||||
Vector_Spec.spec
|
||||
Statistics_Spec.spec
|
||||
Warnings_Spec.spec
|
||||
|
Loading…
Reference in New Issue
Block a user