Add outlier detection

This commit is contained in:
sharkdp 2018-01-22 07:51:18 +01:00 committed by David Peter
parent cda5816dd3
commit 2ddc21c7d4
4 changed files with 128 additions and 0 deletions

View File

@ -9,6 +9,7 @@ use hyperfine::internal::{get_progress_bar, max, min, CmdFailureAction, Hyperfin
Warnings, MIN_EXECUTION_TIME};
use hyperfine::format::{format_duration, format_duration_unit};
use hyperfine::cputime::{cpu_time_interval, get_cpu_times};
use hyperfine::outlier_detection::{modified_zscores, OUTLIER_THRESHOLD};
/// Results from timing a single shell command
#[derive(Debug, Copy, Clone)]
@ -274,6 +275,14 @@ pub fn run_benchmark(
warnings.push(Warnings::NonZeroExitCode);
}
// Run outlier detection
let scores = modified_zscores(&times_real);
if scores[0] > OUTLIER_THRESHOLD {
warnings.push(Warnings::SlowInitialRun);
} else if scores.iter().any(|&s| s > OUTLIER_THRESHOLD) {
warnings.push(Warnings::OutliersDetected);
}
if !warnings.is_empty() {
eprintln!(" ");
for warning in &warnings {

View File

@ -66,6 +66,8 @@ pub fn get_progress_bar(length: u64, msg: &str) -> ProgressBar {
pub enum Warnings {
FastExecutionTime,
NonZeroExitCode,
SlowInitialRun,
OutliersDetected,
}
impl fmt::Display for Warnings {
@ -77,6 +79,20 @@ impl fmt::Display for Warnings {
MIN_EXECUTION_TIME * 1e3
),
Warnings::NonZeroExitCode => write!(f, "Ignoring non-zero exit code."),
Warnings::SlowInitialRun => write!(
f,
"The first benchmarking run for this command was significantly slower than the \
rest. This could be caused by (filesystem) caches that were not filled. You \
should consider using the '--warmup' option to fill those caches before the \
benchmark. Alternatively, use the '--prepare' option to clear the caches before \
each timing run."
),
Warnings::OutliersDetected => write!(
f,
"Statistical outliers were detected. Consider re-running this benchmark without \
any interferences from other programs. It might help to use the '--warmup' or \
'--prepare' options to mitigate outliers."
),
}
}
}

View File

@ -2,3 +2,4 @@ pub mod benchmark;
pub mod format;
pub mod internal;
pub mod cputime;
pub mod outlier_detection;

View File

@ -0,0 +1,102 @@
/// A module for statistical outlier detection.
///
/// References:
/// - Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and Handle Outliers",
/// The ASQC Basic References in Quality Control: Statistical Techniques, Edward F. Mykytka,
/// Ph.D., Editor.
use statistical::median;
/// Minimum modified Z-score for a datapoint to be an outlier. Here, 1.4826 is a factor that
/// converts the MAD to an estimator for the standard deviation. The second factor is the number
/// of standard deviations.
pub const OUTLIER_THRESHOLD: f64 = 1.4826 * 10.0;
/// Compute modifized Z-scores for a given sample. A (unmodified) Z-score is defined by
/// (x_i - x_mean)/x_stddev whereas the modified Z-score is defined by |x_i - x_median|/MAD where
/// MAD is the median average deviation.
///
/// References:
/// - https://en.wikipedia.org/wiki/Median_absolute_deviation
pub fn modified_zscores(xs: &[f64]) -> Vec<f64> {
assert!(!xs.is_empty());
// Compute sample median:
let x_median = median(&xs);
// Compute the absolute deviations from the median:
let deviations: Vec<f64> = xs.iter().map(|x| (x - x_median).abs()).collect();
// Compute median absolute deviation:
let mad = median(&deviations);
// Compute modified Z-scores (x_i - x_median) / MAD
xs.iter().map(|&x| (x - x_median) / mad).collect()
}
/// Return the number of outliers in a given sample. Outliers are defined as data points with a
/// modified Z-score that is larger than `OUTLIER_THRESHOLD`.
#[cfg(test)]
pub fn num_outliers(xs: &[f64]) -> usize {
if xs.is_empty() {
return 0;
}
let scores = modified_zscores(xs);
scores
.iter()
.filter(|&&s| s.abs() > OUTLIER_THRESHOLD)
.count()
}
#[test]
fn test_detect_outliers() {
// Should not detect outliers in small samples
assert_eq!(0, num_outliers(&[]));
assert_eq!(0, num_outliers(&[50.0]));
assert_eq!(0, num_outliers(&[1000.0, 0.0]));
// Should not detect outliers in low-variance samples
let xs = [-0.2, 0.0, 0.2];
assert_eq!(0, num_outliers(&xs));
// Should detect a single outlier
let xs = [-0.2, 0.0, 0.2, 4.0];
assert_eq!(1, num_outliers(&xs));
// Should detect a single outlier
let xs = [0.5, 0.30, 0.29, 0.31, 0.30];
assert_eq!(1, num_outliers(&xs));
// Should detect no outliers in sample drawn from normal distribution
let xs = [
2.33269488,
1.42195907,
-0.57527698,
-0.31293437,
2.2948158,
0.75813273,
-1.0712388,
-0.96394741,
-1.15897446,
1.10976285,
];
assert_eq!(0, num_outliers(&xs));
// Should detect two outliers that were manually added
let xs = [
2.33269488,
1.42195907,
-0.57527698,
-0.31293437,
2.2948158,
0.75813273,
-1.0712388,
-0.96394741,
-1.15897446,
1.10976285,
20.0,
-500.0,
];
assert_eq!(2, num_outliers(&xs));
}