Don't compute md5sums as much in the updater. #326

Dustin Carlino 2021-05-11 11:39:54 -07:00
parent 1e5771897f
commit e02a1c17f2

@@ -306,22 +306,24 @@ fn generate_manifest(truth: &Manifest) -> Manifest {
         Parallelism::Fastest,
         paths,
         |(orig_path, path)| {
-            let uncompressed_size_bytes = std::fs::metadata(&orig_path).unwrap().len();
-            // Always calculate the md5sum for files under 1GB.
-            let checksum = if uncompressed_size_bytes < 1024 * 1024 * 1024 {
-                md5sum(&orig_path)
-            } else if truth
-                .entries
-                .get(&path)
-                .map(|entry| entry.uncompressed_size_bytes == uncompressed_size_bytes)
-                .unwrap_or(false)
+            // If the file's modtime is newer than 3 hours or the uncompressed size has changed,
+            // calculate md5sum. Otherwise assume no change. This heuristic saves lots of time and
+            // doesn't stress my poor SSD as much.
+            let metadata = std::fs::metadata(&orig_path).unwrap();
+            let uncompressed_size_bytes = metadata.len();
+            let recent_modtime = metadata.modified().unwrap().elapsed().unwrap()
+                < std::time::Duration::from_secs(60 * 60 * 3);
+
+            let checksum = if recent_modtime
+                || truth
+                    .entries
+                    .get(&path)
+                    .map(|entry| entry.uncompressed_size_bytes != uncompressed_size_bytes)
+                    .unwrap_or(true)
             {
-                // For files larger than 1GB, don't recalculate the md5sum if the size hasn't
-                // changed. This saves substantial time for a few gigantic files in data/input that
-                // rarely change.
-                truth.entries[&path].checksum.clone()
-            } else {
                 md5sum(&orig_path)
+            } else {
+                truth.entries[&path].checksum.clone()
             };
             (
                 path,
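
Restated outside the diff, the new heuristic is: recompute a file's md5sum only if its modtime falls within the last 3 hours or its size differs from what the previous manifest recorded; otherwise reuse the old checksum. Below is a minimal standalone sketch of that logic; the function name `needs_checksum` and the `previous_size` parameter are illustrative only, not part of the repo, which consults a `Manifest` map instead.

```rust
use std::fs;
use std::io;
use std::path::Path;
use std::time::Duration;

// Recompute the checksum only when the file was modified within the last 3
// hours or its size no longer matches what the previous manifest recorded.
// `previous_size` is None for a file the manifest has never seen, which
// always forces a recomputation. (Hypothetical helper, not from the repo.)
fn needs_checksum(path: &Path, previous_size: Option<u64>) -> io::Result<bool> {
    let metadata = fs::metadata(path)?;
    // elapsed() fails if the modtime is in the future; treat that as "recent"
    // so a clock oddity forces a recomputation rather than skipping one.
    let recent_modtime = metadata
        .modified()?
        .elapsed()
        .map(|age| age < Duration::from_secs(60 * 60 * 3))
        .unwrap_or(true);
    let size_changed = previous_size != Some(metadata.len());
    Ok(recent_modtime || size_changed)
}

fn main() -> io::Result<()> {
    // Pretend the last manifest recorded 1234 bytes for this file.
    let recompute = needs_checksum(Path::new("Cargo.toml"), Some(1234))?;
    println!("recompute checksum: {}", recompute);
    Ok(())
}
```

One deliberate difference from the diff: the commit calls `.unwrap()` on `elapsed()`, while the sketch maps a future modtime to "recent", trading a possible panic for an occasional redundant md5sum.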