Carp/core/Statistics.carp

195 lines
6.2 KiB
Plaintext
Raw Permalink Normal View History

2017-10-26 18:37:44 +03:00
(use Int)
(use Double)
2018-06-26 11:03:34 +03:00
(use Array)
2017-10-26 18:37:44 +03:00
2021-07-05 15:48:35 +03:00
(doc Statistics "is a module for providing various statistical analyses on a
collection of values.")
2017-10-26 18:37:44 +03:00
(defmodule Statistics
2017-11-06 20:08:07 +03:00
(deftype Summary [
sum Double,
min Double,
max Double,
2017-11-06 20:08:07 +03:00
mean Double,
median Double,
var Double,
stdev Double,
stdev-pct Double,
median-abs-dev Double,
median-abs-dev-pct Double,
quartiles (Array Double),
iqr Double
])
(hidden Summary)
2017-11-06 20:08:07 +03:00
2018-06-14 12:21:59 +03:00
(doc mean "Compute the mean of the samples data.")
2017-10-26 18:37:44 +03:00
(defn mean [data]
2018-06-11 22:35:25 +03:00
(/ (Array.sum data) (from-int (Array.length data))))
2017-10-26 18:37:44 +03:00
2018-06-14 12:21:59 +03:00
(hidden _pp)
2017-10-26 18:37:44 +03:00
(defn _pp [a mean]
(let [sum 0.0]
(do
(for [i 0 (Array.length a)]
2019-10-31 12:23:23 +03:00
(let [tmp (Double.- (Double.copy (Array.unsafe-nth a i)) mean)]
(set! sum (Double.* tmp tmp))))
2017-10-26 18:37:44 +03:00
sum)))
2018-06-14 12:21:59 +03:00
(hidden _xx)
2017-10-26 18:37:44 +03:00
(defn _xx [a mean]
(let [sum 0.0]
(do
(for [i 0 (Array.length a)]
2019-10-31 12:23:23 +03:00
(set! sum (Double.- (Double.copy (Array.unsafe-nth a i)) mean)))
2017-10-26 18:37:44 +03:00
sum)))
2018-06-14 12:21:59 +03:00
(hidden _ss)
2017-10-26 18:37:44 +03:00
(defn _ss [data]
(let [m (mean data)
tmp (_xx data m)]
(Double.- (_pp data m)
(Double./ (Double.* tmp tmp)
(Double.from-int (Array.length data))))))
2017-10-26 18:37:44 +03:00
2018-06-14 12:21:59 +03:00
(doc median "Compute the median of the samples data.")
2017-10-26 18:37:44 +03:00
(defn median [data]
(let [n (Array.length data)
2018-06-26 11:03:34 +03:00
sorted (Array.sorted data)]
(cond (= n 0) 0.0
2019-10-31 12:23:23 +03:00
(= (mod n 2) 1) @(Array.unsafe-nth data (/ n 2))
(let [mid (/ n 2)] ; else
2019-10-31 12:23:23 +03:00
(/ (+ (the Double @(Array.unsafe-nth data (dec mid)))
@(Array.unsafe-nth data mid))
2.0)))))
2017-10-26 18:37:44 +03:00
2018-06-14 12:21:59 +03:00
(doc low-median "Compute the low median of the samples data.")
2017-10-26 18:37:44 +03:00
(defn low-median [data]
(let [n (Array.length data)
2018-06-26 11:03:34 +03:00
sorted (Array.sorted data)]
(cond (= n 0) 0.0
2019-10-31 12:23:23 +03:00
(= (mod n 2) 1) @(Array.unsafe-nth data (/ n 2))
@(Array.unsafe-nth data (dec (/ n 2)))))) ; else
2017-10-26 18:37:44 +03:00
2018-06-14 12:21:59 +03:00
(doc high-median "Compute the high median of the samples data.")
2017-10-26 18:37:44 +03:00
(defn high-median [data]
(let [n (Array.length data)
2018-06-26 11:03:34 +03:00
sorted (Array.sorted data)]
2017-10-26 18:37:44 +03:00
(if (= n 0)
0.0
2019-10-31 12:23:23 +03:00
@(Array.unsafe-nth data (/ n 2)))))
2017-10-26 18:37:44 +03:00
2018-06-14 12:21:59 +03:00
(doc grouped-median "Compute the grouped median of the samples data.")
2017-10-26 18:37:44 +03:00
(defn grouped-median [data interval]
(let [n (Array.length data)
2018-06-26 11:03:34 +03:00
sorted (Array.sorted data)]
(cond (= n 0) 0.0
2019-10-31 12:23:23 +03:00
(= n 1) @(Array.unsafe-nth data 0)
(let [x @(Array.unsafe-nth data (/ n 2)) ; else
l (- x (/ (from-int interval) 2.0))
cf (Maybe.from (Array.index-of data &x) -1)
f (Array.element-count data &x)]
(+ l (/ (* (from-int interval) (- (/ (from-int n) 2.0) (from-int cf)))
(from-int f)))))))
2017-10-26 18:37:44 +03:00
2018-06-14 12:21:59 +03:00
(doc variance "Compute the variance of the samples data.")
2017-10-26 18:37:44 +03:00
(defn variance [data]
(let [n (Array.length data)
2017-10-26 18:37:44 +03:00
ss (_ss data)]
(/ ss (from-int (dec n)))))
2018-06-14 12:21:59 +03:00
(doc pvariance "Compute the population variance of the samples data.")
2017-10-26 18:37:44 +03:00
(defn pvariance [data]
(let [n (Array.length data)
2017-10-26 18:37:44 +03:00
ss (_ss data)]
(/ ss (from-int n))))
2018-06-14 12:21:59 +03:00
(doc stdev "Compute the standard deviation of the samples data.")
2017-10-26 18:37:44 +03:00
(defn stdev [data]
(Double.sqrt (variance data)))
2018-06-14 12:21:59 +03:00
(doc pstdev "Compute the population standard deviation of the samples data.")
2017-10-26 18:37:44 +03:00
(defn pstdev [data]
(Double.sqrt (pvariance data)))
2017-11-06 20:08:07 +03:00
2018-06-14 12:21:59 +03:00
(doc pstdev-pct "Compute the standard deviation of the samples data as a percentile.")
2017-11-06 20:08:07 +03:00
(defn stdev-pct [data]
(* (/ (stdev data) (mean data)) 100.0))
2018-06-14 12:21:59 +03:00
(hidden median-abs-dev)
2017-11-06 20:08:07 +03:00
(defn median-abs-dev [data]
(let [med (median data)
zero 0.0
abs-devs (Array.replicate (Array.length data) &zero)
2017-11-06 20:08:07 +03:00
n 1.4826] ; taken from Rust and R, because thats how its done apparently
(do
(for [i 0 (Array.length data)]
2019-10-31 12:23:23 +03:00
(Array.aset! &abs-devs i (abs (- med @(Array.unsafe-nth data i)))))
2017-11-06 20:08:07 +03:00
(* (median &abs-devs) n))))
2018-06-14 12:21:59 +03:00
(hidden median-abs-dev-pct)
2017-11-06 20:08:07 +03:00
(defn median-abs-dev-pct [data]
(* (/ (median-abs-dev data) (median data)) 100.0))
2018-06-14 12:21:59 +03:00
(hidden percentile-of-sorted)
2017-11-06 20:08:07 +03:00
(defn percentile-of-sorted [sorted pct]
(cond
(Int.= 0 (Array.length sorted)) -1.0 ; should abort here
2017-11-06 20:08:07 +03:00
(Double.< pct 0.0) -1.0 ; should abort here
(Double.> pct 100.0) -1.0 ; should abort here
2019-10-31 12:23:23 +03:00
(Int.= 1 (Array.length sorted)) @(Array.unsafe-nth sorted 0)
(Double.= 100.0 pct) @(Array.unsafe-nth sorted (Int.dec (Array.length sorted)))
(let [len (Int.dec (Array.length sorted))
2017-11-06 20:08:07 +03:00
rank (Double.* (Double./ pct 100.0) (Double.from-int len))
lrank (Double.floor rank)
d (Double.- rank lrank)
n (Double.to-int lrank)
2019-10-31 12:23:23 +03:00
lo @(Array.unsafe-nth sorted n)
hi @(Array.unsafe-nth sorted (Int.inc n))]
2017-11-06 20:08:07 +03:00
(Double.+ lo (Double.* d (Double.- hi lo))))))
2018-06-14 12:21:59 +03:00
(doc quartiles "Compute the quartiles of the samples data.")
2017-11-06 20:08:07 +03:00
(defn quartiles [data]
2018-06-26 11:03:34 +03:00
(let [tmp (Array.sorted data)
2017-11-06 20:08:07 +03:00
first 25.0
second 50.0
third 75.0
a (percentile-of-sorted &tmp first)
b (percentile-of-sorted &tmp second)
c (percentile-of-sorted &tmp third)]
2017-11-06 20:08:07 +03:00
[a b c]))
2018-06-14 12:21:59 +03:00
(doc iqr "Compute the interquartile range.")
2017-11-06 20:08:07 +03:00
(defn iqr [data]
(let [s &(quartiles data)]
2019-10-31 12:23:23 +03:00
(the Double (- @(Array.unsafe-nth s 2) @(Array.unsafe-nth s 0)))))
2017-11-06 20:08:07 +03:00
2018-06-14 12:21:59 +03:00
(hidden winsorize)
2017-11-06 20:08:07 +03:00
(defn winsorize [samples pct]
2018-06-26 11:03:34 +03:00
(let [tmp &(Array.sorted samples)
2017-11-06 20:08:07 +03:00
lo (Statistics.percentile-of-sorted tmp pct)
hi (Statistics.percentile-of-sorted tmp (Double.- 100.0 pct))]
(do
(for [i 0 (Array.length tmp)]
2019-10-31 12:23:23 +03:00
(let [samp @(Array.unsafe-nth tmp i)]
2017-11-06 20:08:07 +03:00
(cond
(> samp hi) (Array.aset! tmp i hi)
(< samp lo) (Array.aset! tmp i lo)
())))
2017-11-06 20:08:07 +03:00
(Array.copy tmp))))
2018-06-14 12:21:59 +03:00
(doc summary "Compute a variety of statistical values from a list of samples.")
2017-11-06 20:08:07 +03:00
(defn summary [samples]
(Summary.init
2018-06-11 22:35:25 +03:00
(Array.sum samples)
(Maybe.from (Array.minimum samples) 0.0)
(Maybe.from (Array.maximum samples) 0.0)
2017-11-06 20:08:07 +03:00
(mean samples)
(median samples)
(variance samples)
(stdev samples)
(stdev-pct samples)
(median-abs-dev samples)
(median-abs-dev-pct samples)
(quartiles samples)
(iqr samples)))
2017-10-26 18:37:44 +03:00
)