From 14f0162688a4f1063c9f1a78d6b38623dfdd8362 Mon Sep 17 00:00:00 2001 From: Wez Furlong Date: Sun, 24 Jul 2022 12:48:02 -0700 Subject: [PATCH] rangeset: fix accidentally quadratic complexity When adding sparse ranges the cartesian product of range combinations was explored to find intersections, which is pretty awful if there are 1 million entries to be inserted. This commit employs binary search to reduce the complexity, at the expense of requiring that the internal range array is sorted. --- Cargo.lock | 1 + rangeset/Cargo.toml | 8 +++++ rangeset/benches/rangeset.rs | 43 +++++++++++++++++++++++ rangeset/src/lib.rs | 66 +++++++++++++++++++++++++++++------- 4 files changed, 105 insertions(+), 13 deletions(-) create mode 100644 rangeset/benches/rangeset.rs diff --git a/Cargo.lock b/Cargo.lock index 814f3213b..2b5bbd145 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3602,6 +3602,7 @@ dependencies = [ name = "rangeset" version = "0.1.0" dependencies = [ + "criterion", "num", ] diff --git a/rangeset/Cargo.toml b/rangeset/Cargo.toml index cfae51de2..a9886933c 100644 --- a/rangeset/Cargo.toml +++ b/rangeset/Cargo.toml @@ -6,3 +6,11 @@ edition = "2018" [dependencies] num = "0.3" + +[dev-dependencies] +criterion = "0.3" + +[[bench]] +name = "rangeset" +harness = false + diff --git a/rangeset/benches/rangeset.rs b/rangeset/benches/rangeset.rs new file mode 100644 index 000000000..b7c2c1832 --- /dev/null +++ b/rangeset/benches/rangeset.rs @@ -0,0 +1,43 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use rangeset::RangeSet; + +fn build_contig_rangeset(size: usize) -> RangeSet { + let mut set = RangeSet::new(); + for i in 0..size { + set.add(i); + } + set +} + +fn build_sparse_rangeset(size: usize) -> RangeSet { + let mut set = RangeSet::new(); + for i in (0..size).step_by(2) { + set.add(i); + } + set +} + +pub fn criterion_benchmark(c: &mut Criterion) { + c.bench_function("Contig 100", |b| { + b.iter(|| black_box(build_contig_rangeset(100))) + }); + c.bench_function("Contig 10000", |b| { + b.iter(|| black_box(build_contig_rangeset(10000))) + }); + c.bench_function("Contig 1000000", |b| { + b.iter(|| black_box(build_contig_rangeset(1000000))) + }); + + c.bench_function("Sparse 100", |b| { + b.iter(|| black_box(build_sparse_rangeset(100))) + }); + c.bench_function("Sparse 10000", |b| { + b.iter(|| black_box(build_sparse_rangeset(10000))) + }); + c.bench_function("Sparse 1000000", |b| { + b.iter(|| black_box(build_sparse_rangeset(1000000))) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/rangeset/src/lib.rs b/rangeset/src/lib.rs index 69cf326c9..167e8f0f3 100644 --- a/rangeset/src/lib.rs +++ b/rangeset/src/lib.rs @@ -1,5 +1,5 @@ use num::{Integer, ToPrimitive}; -use std::cmp::{max, min}; +use std::cmp::{max, min, Ordering}; use std::fmt::Debug; use std::ops::Range; @@ -9,6 +9,7 @@ use std::ops::Range; #[derive(Debug, Default, Clone, PartialEq, Eq)] pub struct RangeSet { ranges: Vec>, + needs_sort: bool, } pub fn range_is_empty(range: &Range) -> bool { @@ -92,7 +93,10 @@ impl From> for Vec impl RangeSet { /// Create a new set pub fn new() -> Self { - Self { ranges: vec![] } + Self { + ranges: vec![], + needs_sort: false, + } } /// Returns true if this set is empty @@ -220,6 +224,8 @@ impl RangeSet { return; } + self.sort_if_needed(); + match self.intersection_helper(&range) { (Some(a), Some(b)) if b == a + 1 => { // This range intersects with two or more adjacent ranges and will @@ -243,6 +249,7 @@ impl RangeSet { pub fn add_range_unchecked(&mut self, range: Range) { self.ranges.push(range); + self.needs_sort = true; } /// Add a set of ranges to this set @@ -258,28 +265,61 @@ impl RangeSet { } fn intersection_helper(&self, range: &Range) -> (Option, Option) { - let mut first = None; + if self.needs_sort { + panic!("rangeset needs sorting"); + } - for (idx, r) in self.ranges.iter().enumerate() { + let idx = match self.binary_search_ranges(range) { + Ok(idx) => idx, + Err(idx) => idx.saturating_sub(1), + }; + + let mut first = None; + if let Some(r) = self.ranges.get(idx) { if intersects_range(r, range) || r.end == range.start { - if first.is_some() { - return (first, Some(idx)); - } first = Some(idx); } } - + if let Some(r) = self.ranges.get(idx + 1) { + if intersects_range(r, range) || r.end == range.start { + if first.is_some() { + return (first, Some(idx + 1)); + } + } + } (first, None) } - fn insertion_point(&self, range: &Range) -> usize { - for (idx, r) in self.ranges.iter().enumerate() { - if range.end < r.start { - return idx; + pub fn sort_if_needed(&mut self) { + if self.needs_sort { + self.ranges.sort_by_key(|r| r.start); + self.needs_sort = false; + } + } + + fn binary_search_ranges(&self, range: &Range) -> Result { + self.ranges.binary_search_by(|r| { + if range.start >= r.start && range.end <= r.end { + Ordering::Equal + } else if range.start < r.start { + Ordering::Greater + } else if range.end > r.end { + Ordering::Less + } else { + unreachable!() } + }) + } + + fn insertion_point(&self, range: &Range) -> usize { + if self.needs_sort { + panic!("rangeset needs sorting"); } - self.ranges.len() + match self.binary_search_ranges(range) { + Ok(idx) => idx, + Err(idx) => idx, + } } /// Returns an iterator over the ranges that comprise the set