From 79a60403f7f239c86848fcdbd4693a3c9e3af6d7 Mon Sep 17 00:00:00 2001 From: Durham Goode Date: Tue, 23 Oct 2018 17:14:08 -0700 Subject: [PATCH] histpack: sort history entries before writing them Summary: The histpack format requires that entries in each file section be written in topological order, so that future readers can compute ancestors by just linearly scanning. Let's make the rust mutable history pack support this. Technically the rust historypack reader does not require this for now, but the python one does, so we need to enforce it. Reviewed By: kulshrax Differential Revision: D10441286 fbshipit-source-id: dfdb57182909270b760bd79a100873aa3903a2a5 --- lib/revisionstore/src/ancestors.rs | 6 +- lib/revisionstore/src/mutablehistorypack.rs | 143 +++++++++++++++++++- 2 files changed, 143 insertions(+), 6 deletions(-) diff --git a/lib/revisionstore/src/ancestors.rs b/lib/revisionstore/src/ancestors.rs index c3358ab2a2..138be80e46 100644 --- a/lib/revisionstore/src/ancestors.rs +++ b/lib/revisionstore/src/ancestors.rs @@ -225,14 +225,14 @@ mod tests { #[test] fn test_mergey_ancestor_iterator() { - /// Tests for exponential time complexity in a merge ancestory. This doesn't won't fail, - /// but may take a long time if there is bad time complexity. + // Tests for exponential time complexity in a merge ancestory. This doesn't won't fail, + // but may take a long time if there is bad time complexity. let mut rng = ChaChaRng::from_seed([0u8; 32]); let size = 5000; let mut ancestors = Ancestors::new(); let mut keys = vec![]; - for i in 0..size { + for _ in 0..size { keys.push(Key::new(Box::from([]), Node::random(&mut rng))); } diff --git a/lib/revisionstore/src/mutablehistorypack.rs b/lib/revisionstore/src/mutablehistorypack.rs index 1c2709fa93..f30e886e93 100644 --- a/lib/revisionstore/src/mutablehistorypack.rs +++ b/lib/revisionstore/src/mutablehistorypack.rs @@ -1,8 +1,9 @@ use byteorder::WriteBytesExt; use crypto::digest::Digest; use crypto::sha1::Sha1; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet, VecDeque}; use std::io::Write; +use std::iter::FromIterator; use std::path::{Path, PathBuf}; use tempfile::NamedTempFile; @@ -124,7 +125,8 @@ impl MutableHistoryPack { count: node_map.len() as u32, }.write(writer)?; - // TODO: Topo-sort nodes + // Sort the nodes in topological order (ancestors first), as required by the histpack spec + let node_map = topo_sort(node_map)?; // Write nodes for (key, node_info) in node_map.iter() { @@ -146,7 +148,7 @@ impl MutableHistoryPack { )?; node_locations.insert( - key.clone(), + (*key).clone(), NodeLocation { offset: node_offset as u64, }, @@ -158,6 +160,71 @@ impl MutableHistoryPack { } } +fn topo_sort(node_map: &HashMap) -> Result> { + // Sorts the given keys into newest-first topological order + let mut roots = Vec::<&Key>::new(); + + // Child map will be used to perform an oldest-first walk later. + let mut child_map = HashMap::<&Key, HashSet<&Key>>::with_capacity(node_map.len()); + // Parent count will be used to keep track of when all a commit's parents have been processed. + let mut parent_counts = HashMap::with_capacity(node_map.len()); + + for (key, info) in node_map.iter() { + let mut parent_count = 0; + for i in 0..2 { + let parent = &info.parents[i]; + + // Only record the relationship if the parent is also in the provided node_map. + // This also filters out null parents. + if node_map.contains_key(parent) { + parent_count += 1; + let children = child_map.entry(parent).or_default(); + children.insert(key); + } + } + + if parent_count == 0 { + roots.push(key); + } else { + parent_counts.insert(key, parent_count); + } + } + + // Sort the roots so things are deterministic. + roots.sort_unstable(); + + // Process roots, adding children to the queue once all their parents are processed. + let mut pending = VecDeque::<&Key>::from_iter(roots.iter().cloned()); + let mut results = Vec::new(); + while let Some(key) = pending.pop_front() { + results.push((key, node_map.get(key).unwrap())); + + if let Some(children) = child_map.get(key) { + for child in children.iter() { + let mut parent_count = parent_counts + .get(child) + .ok_or_else(|| { + MutableHistoryPackError(format!("missing {:?} during topo sort", child)) + })? + .clone(); + parent_count -= 1; + parent_counts.insert(child, parent_count); + if parent_count == 0 { + // If a child has no more parents, its a root and is ready for processing. + // Put it at the front so ancestor chains are processed contiguously. + pending.push_front(child); + } + } + } + } + + // We built the result in oldest first order, but we need it in newest first order. + results.reverse(); + + assert_eq!(results.len(), node_map.len()); + Ok(results) +} + impl HistoryStore for MutableHistoryPack { fn get_ancestors(&self, key: &Key) -> Result { AncestorIterator::new( @@ -201,8 +268,78 @@ mod tests { use rand::chacha::ChaChaRng; use tempfile::tempdir; + use historypack::HistoryPack; + use repack::IterableStore; use types::node::Node; + #[test] + fn test_topo_order() { + // Tests for exponential time complexity in a merge ancestory. This doesn't won't fail, + // but may take a long time if there is bad time complexity. + let mut rng = ChaChaRng::from_seed([0u8; 32]); + let tempdir = tempdir().unwrap(); + let mut muthistorypack = + MutableHistoryPack::new(tempdir.path(), HistoryPackVersion::One).unwrap(); + let null_key = Key::new(Box::from([]), Node::null_id().clone()); + + let chain_count = 2; + let chain_len = 3; + + let mut chains = HashMap::>::new(); + let mut entries = Vec::<(Key, NodeInfo)>::new(); + for _ in 0..chain_count { + let mut chain = Vec::<(Key, NodeInfo)>::new(); + for i in 0..chain_len { + let p1 = if i > 0 { + chain[i - 1].0.clone() + } else { + null_key.clone() + }; + let p2 = if i > 1 { + chain[i - 2].0.clone() + } else { + null_key.clone() + }; + + let key = Key::new(Box::from([]), Node::random(&mut rng)); + let info = NodeInfo { + parents: [p1, p2], + linknode: Node::random(&mut rng), + }; + entries.push((key.clone(), info.clone())); + chain.push((key.clone(), info.clone())); + if i == chain_len - 1 { + // Reverse it so the newest key is first. + chain.reverse(); + chains.insert(key, chain.clone()); + } + } + } + + // Add them in random order, so we can verify they get sorted correctly + rng.shuffle(&mut entries); + for (key, info) in entries.iter() { + muthistorypack.add(&key, &info).unwrap(); + } + let path = muthistorypack.close().unwrap(); + let pack = HistoryPack::new(&path).unwrap(); + + let actual_order = pack.iter().map(|x| x.unwrap()).collect::>(); + + // Compute the expected order + let mut chains = chains.iter().collect::>(); + chains.sort_unstable(); + chains.reverse(); + let mut expected_order = vec![]; + for (_, chain) in chains.iter() { + for (key, _) in chain.iter() { + expected_order.push(key.clone()); + } + } + + assert_eq!(actual_order, expected_order); + } + quickcheck! { fn test_get_ancestors(keys: Vec<(Key, bool)>) -> bool { let mut rng = ChaChaRng::from_seed([0u8; 32]);