2017-10-26 18:37:44 +03:00
|
|
|
|
(use Int)
|
|
|
|
|
(use Double)
|
2018-06-26 11:03:34 +03:00
|
|
|
|
(use Array)
|
2017-10-26 18:37:44 +03:00
|
|
|
|
|
2021-07-05 15:48:35 +03:00
|
|
|
|
(doc Statistics "is a module for providing various statistical analyses on a
|
|
|
|
|
collection of values.")
|
2017-10-26 18:37:44 +03:00
|
|
|
|
(defmodule Statistics
|
2017-11-06 20:08:07 +03:00
|
|
|
|
(deftype Summary [
|
|
|
|
|
sum Double,
|
2019-05-15 02:11:19 +03:00
|
|
|
|
min Double,
|
|
|
|
|
max Double,
|
2017-11-06 20:08:07 +03:00
|
|
|
|
mean Double,
|
|
|
|
|
median Double,
|
|
|
|
|
var Double,
|
|
|
|
|
stdev Double,
|
|
|
|
|
stdev-pct Double,
|
|
|
|
|
median-abs-dev Double,
|
|
|
|
|
median-abs-dev-pct Double,
|
|
|
|
|
quartiles (Array Double),
|
|
|
|
|
iqr Double
|
|
|
|
|
])
|
2021-06-11 14:02:52 +03:00
|
|
|
|
(hidden Summary)
|
2017-11-06 20:08:07 +03:00
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(doc mean "Compute the mean of the samples data.")
|
2017-10-26 18:37:44 +03:00
|
|
|
|
(defn mean [data]
|
2018-06-11 22:35:25 +03:00
|
|
|
|
(/ (Array.sum data) (from-int (Array.length data))))
|
2017-10-26 18:37:44 +03:00
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(hidden _pp)
|
2017-10-26 18:37:44 +03:00
|
|
|
|
(defn _pp [a mean]
|
|
|
|
|
(let [sum 0.0]
|
|
|
|
|
(do
|
2018-05-20 10:57:51 +03:00
|
|
|
|
(for [i 0 (Array.length a)]
|
2019-10-31 12:23:23 +03:00
|
|
|
|
(let [tmp (Double.- (Double.copy (Array.unsafe-nth a i)) mean)]
|
2018-02-02 09:19:10 +03:00
|
|
|
|
(set! sum (Double.* tmp tmp))))
|
2017-10-26 18:37:44 +03:00
|
|
|
|
sum)))
|
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(hidden _xx)
|
2017-10-26 18:37:44 +03:00
|
|
|
|
(defn _xx [a mean]
|
|
|
|
|
(let [sum 0.0]
|
|
|
|
|
(do
|
2018-05-20 10:57:51 +03:00
|
|
|
|
(for [i 0 (Array.length a)]
|
2019-10-31 12:23:23 +03:00
|
|
|
|
(set! sum (Double.- (Double.copy (Array.unsafe-nth a i)) mean)))
|
2017-10-26 18:37:44 +03:00
|
|
|
|
sum)))
|
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(hidden _ss)
|
2017-10-26 18:37:44 +03:00
|
|
|
|
(defn _ss [data]
|
|
|
|
|
(let [m (mean data)
|
|
|
|
|
tmp (_xx data m)]
|
|
|
|
|
(Double.- (_pp data m)
|
|
|
|
|
(Double./ (Double.* tmp tmp)
|
2018-05-20 10:57:51 +03:00
|
|
|
|
(Double.from-int (Array.length data))))))
|
2017-10-26 18:37:44 +03:00
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(doc median "Compute the median of the samples data.")
|
2017-10-26 18:37:44 +03:00
|
|
|
|
(defn median [data]
|
2018-05-20 10:57:51 +03:00
|
|
|
|
(let [n (Array.length data)
|
2018-06-26 11:03:34 +03:00
|
|
|
|
sorted (Array.sorted data)]
|
2017-11-01 13:20:06 +03:00
|
|
|
|
(cond (= n 0) 0.0
|
2019-10-31 12:23:23 +03:00
|
|
|
|
(= (mod n 2) 1) @(Array.unsafe-nth data (/ n 2))
|
2017-11-01 13:20:06 +03:00
|
|
|
|
(let [mid (/ n 2)] ; else
|
2019-10-31 12:23:23 +03:00
|
|
|
|
(/ (+ (the Double @(Array.unsafe-nth data (dec mid)))
|
|
|
|
|
@(Array.unsafe-nth data mid))
|
2017-11-01 13:20:06 +03:00
|
|
|
|
2.0)))))
|
2017-10-26 18:37:44 +03:00
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(doc low-median "Compute the low median of the samples data.")
|
2017-10-26 18:37:44 +03:00
|
|
|
|
(defn low-median [data]
|
2018-05-20 10:57:51 +03:00
|
|
|
|
(let [n (Array.length data)
|
2018-06-26 11:03:34 +03:00
|
|
|
|
sorted (Array.sorted data)]
|
2017-11-01 13:20:06 +03:00
|
|
|
|
(cond (= n 0) 0.0
|
2019-10-31 12:23:23 +03:00
|
|
|
|
(= (mod n 2) 1) @(Array.unsafe-nth data (/ n 2))
|
|
|
|
|
@(Array.unsafe-nth data (dec (/ n 2)))))) ; else
|
2017-10-26 18:37:44 +03:00
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(doc high-median "Compute the high median of the samples data.")
|
2017-10-26 18:37:44 +03:00
|
|
|
|
(defn high-median [data]
|
2018-05-20 10:57:51 +03:00
|
|
|
|
(let [n (Array.length data)
|
2018-06-26 11:03:34 +03:00
|
|
|
|
sorted (Array.sorted data)]
|
2017-10-26 18:37:44 +03:00
|
|
|
|
(if (= n 0)
|
|
|
|
|
0.0
|
2019-10-31 12:23:23 +03:00
|
|
|
|
@(Array.unsafe-nth data (/ n 2)))))
|
2017-10-26 18:37:44 +03:00
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(doc grouped-median "Compute the grouped median of the samples data.")
|
2017-10-26 18:37:44 +03:00
|
|
|
|
(defn grouped-median [data interval]
|
2018-05-20 10:57:51 +03:00
|
|
|
|
(let [n (Array.length data)
|
2018-06-26 11:03:34 +03:00
|
|
|
|
sorted (Array.sorted data)]
|
2017-11-01 13:20:06 +03:00
|
|
|
|
(cond (= n 0) 0.0
|
2019-10-31 12:23:23 +03:00
|
|
|
|
(= n 1) @(Array.unsafe-nth data 0)
|
|
|
|
|
(let [x @(Array.unsafe-nth data (/ n 2)) ; else
|
2017-11-01 13:20:06 +03:00
|
|
|
|
l (- x (/ (from-int interval) 2.0))
|
2019-05-14 19:26:19 +03:00
|
|
|
|
cf (Maybe.from (Array.index-of data &x) -1)
|
2018-03-12 14:11:33 +03:00
|
|
|
|
f (Array.element-count data &x)]
|
2017-11-01 13:20:06 +03:00
|
|
|
|
(+ l (/ (* (from-int interval) (- (/ (from-int n) 2.0) (from-int cf)))
|
|
|
|
|
(from-int f)))))))
|
2017-10-26 18:37:44 +03:00
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(doc variance "Compute the variance of the samples data.")
|
2017-10-26 18:37:44 +03:00
|
|
|
|
(defn variance [data]
|
2018-05-20 10:57:51 +03:00
|
|
|
|
(let [n (Array.length data)
|
2017-10-26 18:37:44 +03:00
|
|
|
|
ss (_ss data)]
|
|
|
|
|
(/ ss (from-int (dec n)))))
|
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(doc pvariance "Compute the population variance of the samples data.")
|
2017-10-26 18:37:44 +03:00
|
|
|
|
(defn pvariance [data]
|
2018-05-20 10:57:51 +03:00
|
|
|
|
(let [n (Array.length data)
|
2017-10-26 18:37:44 +03:00
|
|
|
|
ss (_ss data)]
|
|
|
|
|
(/ ss (from-int n))))
|
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(doc stdev "Compute the standard deviation of the samples data.")
|
2017-10-26 18:37:44 +03:00
|
|
|
|
(defn stdev [data]
|
|
|
|
|
(Double.sqrt (variance data)))
|
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(doc pstdev "Compute the population standard deviation of the samples data.")
|
2017-10-26 18:37:44 +03:00
|
|
|
|
(defn pstdev [data]
|
|
|
|
|
(Double.sqrt (pvariance data)))
|
2017-11-06 20:08:07 +03:00
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(doc pstdev-pct "Compute the standard deviation of the samples data as a percentile.")
|
2017-11-06 20:08:07 +03:00
|
|
|
|
(defn stdev-pct [data]
|
|
|
|
|
(* (/ (stdev data) (mean data)) 100.0))
|
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(hidden median-abs-dev)
|
2017-11-06 20:08:07 +03:00
|
|
|
|
(defn median-abs-dev [data]
|
|
|
|
|
(let [med (median data)
|
|
|
|
|
zero 0.0
|
2018-05-20 10:57:51 +03:00
|
|
|
|
abs-devs (Array.replicate (Array.length data) &zero)
|
2017-11-06 20:08:07 +03:00
|
|
|
|
n 1.4826] ; taken from Rust and R, because that’s how it’s done apparently
|
|
|
|
|
(do
|
2018-05-20 10:57:51 +03:00
|
|
|
|
(for [i 0 (Array.length data)]
|
2019-10-31 12:23:23 +03:00
|
|
|
|
(Array.aset! &abs-devs i (abs (- med @(Array.unsafe-nth data i)))))
|
2017-11-06 20:08:07 +03:00
|
|
|
|
(* (median &abs-devs) n))))
|
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(hidden median-abs-dev-pct)
|
2017-11-06 20:08:07 +03:00
|
|
|
|
(defn median-abs-dev-pct [data]
|
|
|
|
|
(* (/ (median-abs-dev data) (median data)) 100.0))
|
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(hidden percentile-of-sorted)
|
2017-11-06 20:08:07 +03:00
|
|
|
|
(defn percentile-of-sorted [sorted pct]
|
|
|
|
|
(cond
|
2018-05-20 10:57:51 +03:00
|
|
|
|
(Int.= 0 (Array.length sorted)) -1.0 ; should abort here
|
2017-11-06 20:08:07 +03:00
|
|
|
|
(Double.< pct 0.0) -1.0 ; should abort here
|
|
|
|
|
(Double.> pct 100.0) -1.0 ; should abort here
|
2019-10-31 12:23:23 +03:00
|
|
|
|
(Int.= 1 (Array.length sorted)) @(Array.unsafe-nth sorted 0)
|
|
|
|
|
(Double.= 100.0 pct) @(Array.unsafe-nth sorted (Int.dec (Array.length sorted)))
|
2018-05-20 10:57:51 +03:00
|
|
|
|
(let [len (Int.dec (Array.length sorted))
|
2017-11-06 20:08:07 +03:00
|
|
|
|
rank (Double.* (Double./ pct 100.0) (Double.from-int len))
|
|
|
|
|
lrank (Double.floor rank)
|
|
|
|
|
d (Double.- rank lrank)
|
|
|
|
|
n (Double.to-int lrank)
|
2019-10-31 12:23:23 +03:00
|
|
|
|
lo @(Array.unsafe-nth sorted n)
|
|
|
|
|
hi @(Array.unsafe-nth sorted (Int.inc n))]
|
2017-11-06 20:08:07 +03:00
|
|
|
|
(Double.+ lo (Double.* d (Double.- hi lo))))))
|
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(doc quartiles "Compute the quartiles of the samples data.")
|
2017-11-06 20:08:07 +03:00
|
|
|
|
(defn quartiles [data]
|
2018-06-26 11:03:34 +03:00
|
|
|
|
(let [tmp (Array.sorted data)
|
2017-11-06 20:08:07 +03:00
|
|
|
|
first 25.0
|
|
|
|
|
second 50.0
|
|
|
|
|
third 75.0
|
2018-01-15 18:47:22 +03:00
|
|
|
|
a (percentile-of-sorted &tmp first)
|
|
|
|
|
b (percentile-of-sorted &tmp second)
|
|
|
|
|
c (percentile-of-sorted &tmp third)]
|
2017-11-06 20:08:07 +03:00
|
|
|
|
[a b c]))
|
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(doc iqr "Compute the interquartile range.")
|
2017-11-06 20:08:07 +03:00
|
|
|
|
(defn iqr [data]
|
|
|
|
|
(let [s &(quartiles data)]
|
2019-10-31 12:23:23 +03:00
|
|
|
|
(the Double (- @(Array.unsafe-nth s 2) @(Array.unsafe-nth s 0)))))
|
2017-11-06 20:08:07 +03:00
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(hidden winsorize)
|
2017-11-06 20:08:07 +03:00
|
|
|
|
(defn winsorize [samples pct]
|
2018-06-26 11:03:34 +03:00
|
|
|
|
(let [tmp &(Array.sorted samples)
|
2017-11-06 20:08:07 +03:00
|
|
|
|
lo (Statistics.percentile-of-sorted tmp pct)
|
|
|
|
|
hi (Statistics.percentile-of-sorted tmp (Double.- 100.0 pct))]
|
|
|
|
|
(do
|
2018-05-20 10:57:51 +03:00
|
|
|
|
(for [i 0 (Array.length tmp)]
|
2019-10-31 12:23:23 +03:00
|
|
|
|
(let [samp @(Array.unsafe-nth tmp i)]
|
2017-11-06 20:08:07 +03:00
|
|
|
|
(cond
|
|
|
|
|
(> samp hi) (Array.aset! tmp i hi)
|
2017-12-15 19:43:06 +03:00
|
|
|
|
(< samp lo) (Array.aset! tmp i lo)
|
|
|
|
|
())))
|
2017-11-06 20:08:07 +03:00
|
|
|
|
(Array.copy tmp))))
|
|
|
|
|
|
2018-06-14 12:21:59 +03:00
|
|
|
|
(doc summary "Compute a variety of statistical values from a list of samples.")
|
2017-11-06 20:08:07 +03:00
|
|
|
|
(defn summary [samples]
|
|
|
|
|
(Summary.init
|
2018-06-11 22:35:25 +03:00
|
|
|
|
(Array.sum samples)
|
2019-05-15 02:11:19 +03:00
|
|
|
|
(Maybe.from (Array.minimum samples) 0.0)
|
|
|
|
|
(Maybe.from (Array.maximum samples) 0.0)
|
2017-11-06 20:08:07 +03:00
|
|
|
|
(mean samples)
|
|
|
|
|
(median samples)
|
|
|
|
|
(variance samples)
|
|
|
|
|
(stdev samples)
|
|
|
|
|
(stdev-pct samples)
|
|
|
|
|
(median-abs-dev samples)
|
|
|
|
|
(median-abs-dev-pct samples)
|
|
|
|
|
(quartiles samples)
|
|
|
|
|
(iqr samples)))
|
2017-10-26 18:37:44 +03:00
|
|
|
|
)
|