diff --git a/crates/cfsctl/src/main.rs b/crates/cfsctl/src/main.rs index 881b64c3..6c81c0ea 100644 --- a/crates/cfsctl/src/main.rs +++ b/crates/cfsctl/src/main.rs @@ -135,7 +135,14 @@ enum Command { name: String, }, /// Perform garbage collection - GC, + GC { + /// Additional roots to keep (image or stream names) + #[clap(long, short = 'r')] + root: Vec, + /// Preview what would be deleted without actually deleting + #[clap(long, short = 'n')] + dry_run: bool, + }, /// Imports a composefs image (unsafe!) ImportImage { reference: String }, /// Commands for dealing with OCI layers @@ -402,8 +409,26 @@ where println!("{}", object.to_id()); } } - Command::GC => { - repo.gc()?; + Command::GC { root, dry_run } => { + let roots: Vec<&str> = root.iter().map(|s| s.as_str()).collect(); + let result = if dry_run { + repo.gc_dry_run(&roots)? + } else { + repo.gc(&roots)? + }; + if dry_run { + println!("Dry run (no files deleted):"); + } + println!( + "Objects: {} removed ({} bytes)", + result.objects_removed, result.objects_bytes + ); + if result.images_pruned > 0 || result.streams_pruned > 0 { + println!( + "Pruned symlinks: {} images, {} streams", + result.images_pruned, result.streams_pruned + ); + } } #[cfg(feature = "http")] Command::Fetch { url, name } => { diff --git a/crates/composefs/src/repository.rs b/crates/composefs/src/repository.rs index 1006754c..00e2a319 100644 --- a/crates/composefs/src/repository.rs +++ b/crates/composefs/src/repository.rs @@ -5,24 +5,28 @@ //! verification and garbage collection support. 
use std::{ - collections::HashSet, - ffi::CStr, + collections::{HashMap, HashSet}, + ffi::{CStr, CString, OsStr, OsString}, fs::{canonicalize, File}, io::{Read, Write}, - os::fd::{AsFd, OwnedFd}, + os::{ + fd::{AsFd, OwnedFd}, + unix::ffi::OsStrExt, + }, path::{Path, PathBuf}, sync::Arc, thread::available_parallelism, }; +use log::{debug, trace}; use tokio::sync::Semaphore; use anyhow::{bail, ensure, Context, Result}; use once_cell::sync::OnceCell; use rustix::{ fs::{ - flock, linkat, mkdirat, open, openat, readlinkat, statat, syncfs, AtFlags, Dir, FileType, - FlockOperation, Mode, OFlags, CWD, + flock, linkat, mkdirat, open, openat, readlinkat, statat, syncfs, unlinkat, AtFlags, Dir, + FileType, FlockOperation, Mode, OFlags, CWD, }, io::{Errno, Result as ErrnoResult}, }; @@ -93,6 +97,27 @@ impl Drop for Repository { } } +/// For Repository::gc_category +enum GCCategoryWalkMode { + RefsOnly, + AllEntries, +} + +/// Statistics from a garbage collection operation. +/// +/// Returned by [`Repository::gc`] to report what was (or would be) removed. +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct GcResult { + /// Number of unreferenced objects removed (or that would be removed) + pub objects_removed: u64, + /// Total bytes of object data removed (or that would be removed) + pub objects_bytes: u64, + /// Number of broken symlinks removed in images/ + pub images_pruned: u64, + /// Number of broken symlinks removed in streams/ + pub streams_pruned: u64, +} + impl Repository { /// Return the objects directory. pub fn objects_dir(&self) -> ErrnoResult<&OwnedFd> { @@ -780,7 +805,7 @@ impl Repository { Ok(ObjectID::from_object_pathname(link_content.to_bytes())?) } - fn walk_symlinkdir(fd: OwnedFd, objects: &mut HashSet) -> Result<()> { + fn walk_symlinkdir(fd: OwnedFd, entry_digests: &mut HashSet) -> Result<()> { for item in Dir::read_from(&fd)? 
{ let entry = item?; // NB: the underlying filesystem must support returning filetype via direntry @@ -789,12 +814,25 @@ impl Repository { FileType::Directory => { let filename = entry.file_name(); if filename != c"." && filename != c".." { - let dirfd = openat(&fd, filename, OFlags::RDONLY, Mode::empty())?; - Self::walk_symlinkdir(dirfd, objects)?; + let dirfd = openat( + &fd, + filename, + OFlags::RDONLY | OFlags::CLOEXEC, + Mode::empty(), + )?; + Self::walk_symlinkdir(dirfd, entry_digests)?; } } FileType::Symlink => { - objects.insert(Self::read_symlink_hashvalue(&fd, entry.file_name())?); + let link_content = readlinkat(&fd, entry.file_name(), [])?; + let linked_path = Path::new(OsStr::from_bytes(link_content.as_bytes())); + if let Some(entry_name) = linked_path.file_name() { + entry_digests.insert(entry_name.to_os_string()); + } else { + // Does not have a proper file base name (i.e. "..") + // TODO: this case needs to be checked in fsck implementation + continue; + } } _ => { bail!("Unexpected file type encountered"); @@ -816,53 +854,194 @@ impl Repository { ) } - fn gc_category(&self, category: &str) -> Result> { - let mut objects = HashSet::new(); - + // For a GC category (images / streams), return underlying entry digests and + // object IDs for each entry + // Under RefsOnly mode, only entries explicitly referenced in `/refs` + // directory structure would be walked and returned + // Under AllEntries mode, all entires will be returned + // Note that this function assumes all`*/refs/` links link to 1st level entries + // and all 1st level entries link to object store + // TODO: fsck the above noted assumption + fn gc_category( + &self, + category: &str, + mode: GCCategoryWalkMode, + ) -> Result> { let Some(category_fd) = self .openat(category, OFlags::RDONLY | OFlags::DIRECTORY) .filter_errno(Errno::NOENT) - .context("Opening {category} dir in repository")? + .context(format!("Opening {category} dir in repository"))? 
else { - return Ok(objects); + return Ok(Vec::new()); }; - if let Some(refs) = openat( - &category_fd, - "refs", - OFlags::RDONLY | OFlags::DIRECTORY, - Mode::empty(), - ) - .filter_errno(Errno::NOENT) - .context("Opening {category}/refs dir in repository")? - { - Self::walk_symlinkdir(refs, &mut objects)?; + let mut entry_digests = HashSet::new(); + match mode { + GCCategoryWalkMode::RefsOnly => { + if let Some(refs) = openat( + &category_fd, + "refs", + OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC, + Mode::empty(), + ) + .filter_errno(Errno::NOENT) + .context(format!("Opening {category}/refs dir in repository"))? + { + Self::walk_symlinkdir(refs, &mut entry_digests)?; + } + } + GCCategoryWalkMode::AllEntries => { + // All first-level link entries should be directly object references + for item in Dir::read_from(&category_fd)? { + let entry = item?; + let filename = entry.file_name(); + if filename != c"refs" && filename != c"." && filename != c".." { + if entry.file_type() != FileType::Symlink { + bail!("category directory contains non-symlink"); + } + entry_digests.insert(OsString::from(&OsStr::from_bytes( + entry.file_name().to_bytes(), + ))); + } + } + } } - for item in Dir::read_from(&category_fd)? { + let objects = entry_digests + .into_iter() + .map(|entry_fn| { + Ok(( + Self::read_symlink_hashvalue( + &category_fd, + CString::new(entry_fn.as_bytes())?.as_c_str(), + )?, + entry_fn + .to_str() + .context("str conversion fails")? + .to_owned(), + )) + }) + .collect::>()?; + + Ok(objects) + } + + // Remove all broken links from a directory, may operate recursively + /// Remove broken symlinks from a directory. + /// If `dry_run` is true, counts but does not remove. Returns the count. + fn cleanup_broken_links(fd: &OwnedFd, recursive: bool, dry_run: bool) -> Result { + let mut count = 0; + for item in Dir::read_from(fd)? { let entry = item?; - let filename = entry.file_name(); - if filename != c"refs" && filename != c"." && filename != c".." 
{ - if entry.file_type() != FileType::Symlink { - bail!("category directory contains non-symlink"); + match entry.file_type() { + FileType::Directory => { + if !recursive { + continue; + } + let filename = entry.file_name(); + if filename != c"." && filename != c".." { + let dirfd = openat( + fd, + filename, + OFlags::RDONLY | OFlags::CLOEXEC, + Mode::empty(), + )?; + count += Self::cleanup_broken_links(&dirfd, recursive, dry_run)?; + } } - // TODO: we need to sort this out. the symlink itself might be a sha256 content ID - // (as for splitstreams), not an object/ to be preserved. - continue; - - /* - let mut value = Sha256HashValue::EMPTY; - hex::decode_to_slice(filename.to_bytes(), &mut value)?; + FileType::Symlink => { + let filename = entry.file_name(); + let result = statat(fd, filename, AtFlags::empty()) + .filter_errno(Errno::NOENT) + .context("Testing for broken links")?; + if result.is_none() { + count += 1; + if !dry_run { + unlinkat(fd, filename, AtFlags::empty()) + .context("Unlinking broken links")?; + } + } + } - if !objects.contains(&value) { - println!("rm {}/{:?}", category, filename); + _ => { + bail!("Unexpected file type encountered"); } - */ } } + Ok(count) + } - Ok(objects) + /// Clean up broken links in a gc category. Returns count of links removed. + fn cleanup_gc_category(&self, category: &'static str, dry_run: bool) -> Result { + let Some(category_fd) = self + .openat(category, OFlags::RDONLY | OFlags::DIRECTORY) + .filter_errno(Errno::NOENT) + .context(format!("Opening {category} dir in repository"))? 
+ else { + return Ok(0); + }; + // Always cleanup first-level first, then the refs + let mut count = Self::cleanup_broken_links(&category_fd, false, dry_run) + .context(format!("Cleaning up broken links in {category}/"))?; + let ref_fd = openat( + &category_fd, + "refs", + OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC, + Mode::empty(), + ) + .filter_errno(Errno::NOENT) + .context(format!("Opening {category}/refs to clean up broken links"))?; + if let Some(ref dirfd) = ref_fd { + count += Self::cleanup_broken_links(dirfd, true, dry_run).context(format!( + "Cleaning up broken links recursively in {category}/refs" + ))?; + } + Ok(count) + } + + // Traverse split streams to resolve all linked objects + fn walk_streams( + &self, + stream_name_map: &HashMap, + stream_name: &str, + walked_streams: &mut HashSet, + objects: &mut HashSet, + ) -> Result<()> { + if walked_streams.contains(stream_name) { + return Ok(()); + } + walked_streams.insert(stream_name.to_owned()); + + let mut split_stream = self.open_stream(stream_name, None, None)?; + // Plain object references, add to live objects set + split_stream.get_object_refs(|id| { + debug!(" with {id:?}"); + objects.insert(id.clone()); + })?; + // Collect all stream names from named references table to be walked next + let streams_to_walk: Vec<_> = split_stream.iter_named_refs().collect(); + // Note that stream name from the named references table is not stream name in repository + // In practice repository name is often table name prefixed with stream types (e.g. 
oci-config-) + // Here we always match objectID to be absolutely sure + for (stream_name_in_table, stream_object_id) in streams_to_walk { + debug!( + " named reference stream {stream_name_in_table} lives, with {stream_object_id:?}" + ); + objects.insert(stream_object_id.clone()); + if let Some(stream_name_in_repo) = stream_name_map.get(stream_object_id) { + self.walk_streams( + stream_name_map, + stream_name_in_repo, + walked_streams, + objects, + )?; + } else { + // stream is in table but not in repo, the repo is potentially broken, issue a warning + trace!("broken repo: named reference stream {stream_name_in_table} not found as stream in repo"); + } + } + Ok(()) } /// Given an image, return the set of all objects referenced by it. @@ -891,35 +1070,90 @@ impl Repository { tokio::task::spawn_blocking(move || self_.sync()).await? } - /// Perform a garbage collection operation. + /// Perform garbage collection, removing unreferenced objects. + /// + /// Objects reachable from `images/refs/` or `streams/refs/` are preserved, + /// plus any `additional_roots` (looked up in both images and streams). + /// Returns statistics about what was removed. /// /// # Locking /// /// An exclusive lock is held for the duration of this operation. - pub fn gc(&self) -> Result<()> { + pub fn gc(&self, additional_roots: &[&str]) -> Result { flock(&self.repository, FlockOperation::LockExclusive)?; + self.gc_impl(additional_roots, false) + } - let mut objects = HashSet::new(); + /// Preview what garbage collection would remove, without deleting. + /// + /// Returns the same statistics that [`gc`](Self::gc) would return, + /// but no files are actually deleted. + /// + /// # Locking + /// + /// A shared lock is held for the duration of this operation (readers + /// are not blocked). 
+ pub fn gc_dry_run(&self, additional_roots: &[&str]) -> Result { + // Shared lock is sufficient since we don't modify anything + flock(&self.repository, FlockOperation::LockShared)?; + self.gc_impl(additional_roots, true) + } - for ref object in self.gc_category("images")? { - println!("{object:?} lives as an image"); - objects.insert(object.clone()); - objects.extend(self.objects_for_image(&object.to_hex())?); + /// Internal GC implementation (lock must already be held). + fn gc_impl(&self, additional_roots: &[&str], dry_run: bool) -> Result { + let mut result = GcResult::default(); + let mut live_objects = HashSet::new(); + + // Build set of additional roots (checked in both images and streams) + let extra_roots: HashSet<_> = additional_roots.iter().map(|s| s.to_string()).collect(); + + // Collect images: those in images/refs plus caller-specified roots + let all_images = self.gc_category("images", GCCategoryWalkMode::AllEntries)?; + let root_images: Vec<_> = self + .gc_category("images", GCCategoryWalkMode::RefsOnly)? + .into_iter() + .chain( + all_images + .into_iter() + .filter(|(_, name)| extra_roots.contains(name)), + ) + .collect(); + + for ref image in root_images { + debug!("{image:?} lives as an image"); + live_objects.insert(image.0.clone()); + self.objects_for_image(&image.1)?.iter().for_each(|id| { + debug!(" with {id:?}"); + live_objects.insert(id.clone()); + }); } - /* TODO - for object in self.gc_category("streams")? 
{ - println!("{object:?} lives as a stream"); - objects.insert(object.clone()); - - let mut split_stream = self.open_stream(&object.to_hex(), None, None)?; - split_stream.get_object_refs(|id| { - println!(" with {id:?}"); - objects.insert(id.clone()); - })?; + // Collect all streams for the name map, then filter to roots + let all_streams = self.gc_category("streams", GCCategoryWalkMode::AllEntries)?; + let stream_name_map: HashMap<_, _> = all_streams.iter().cloned().collect(); + let root_streams: Vec<_> = self + .gc_category("streams", GCCategoryWalkMode::RefsOnly)? + .into_iter() + .chain( + all_streams + .into_iter() + .filter(|(_, name)| extra_roots.contains(name)), + ) + .collect(); + + let mut walked_streams = HashSet::new(); + for stream in root_streams { + debug!("{stream:?} lives as a stream"); + live_objects.insert(stream.0.clone()); + self.walk_streams( + &stream_name_map, + &stream.1, + &mut walked_streams, + &mut live_objects, + )?; } - */ + // Walk all objects and remove unreferenced ones for first_byte in 0x0..=0xff { let dirfd = match self.openat( &format!("objects/{first_byte:02x}"), @@ -929,25 +1163,819 @@ impl Repository { Err(Errno::NOENT) => continue, Err(e) => Err(e)?, }; - for item in Dir::new(dirfd)? { + for item in Dir::read_from(&dirfd)? { let entry = item?; let filename = entry.file_name(); if filename != c"." && filename != c".." 
{ let id = ObjectID::from_object_dir_and_basename(first_byte, filename.to_bytes())?; - if !objects.contains(&id) { - println!("rm objects/{first_byte:02x}/{filename:?}"); + if !live_objects.contains(&id) { + // Get file size before removing + if let Ok(stat) = statat(&dirfd, filename, AtFlags::empty()) { + result.objects_bytes += stat.st_size as u64; + } + result.objects_removed += 1; + + if !dry_run { + debug!("removing: objects/{first_byte:02x}/{filename:?}"); + unlinkat(&dirfd, filename, AtFlags::empty())?; + } } else { - println!("# objects/{first_byte:02x}/{filename:?} lives"); + trace!("objects/{first_byte:02x}/{filename:?} lives"); } } } } - Ok(flock(&self.repository, FlockOperation::LockShared)?) // XXX: finally { } ? + // Clean up broken symlinks + result.images_pruned = self.cleanup_gc_category("images", dry_run)?; + result.streams_pruned = self.cleanup_gc_category("streams", dry_run)?; + + // Downgrade to shared lock if we had exclusive (for actual GC) + if !dry_run { + flock(&self.repository, FlockOperation::LockShared)?; + } + Ok(result) } // fn fsck(&self) -> Result<()> { // unimplemented!() // } } + +#[cfg(test)] +mod tests { + use std::vec; + + use super::*; + use crate::fsverity::Sha512HashValue; + use crate::test::tempdir; + use rustix::fs::{statat, CWD}; + use tempfile::TempDir; + + /// Create a test repository in insecure mode (no fs-verity required). + fn create_test_repo(path: &Path) -> Result>> { + mkdirat(CWD, path, Mode::from_raw_mode(0o755))?; + let mut repo = Repository::open_path(CWD, path)?; + repo.set_insecure(true); + Ok(Arc::new(repo)) + } + + /// Generate deterministic test data of a given size. + fn generate_test_data(size: u64, seed: u8) -> Vec { + (0..size) + .map(|i| ((i as u8).wrapping_add(seed)).wrapping_mul(17)) + .collect() + } + + fn read_links_in_repo

(tmp: &TempDir, repo_sub_path: P) -> Result> + where + P: AsRef, + { + let full_path = tmp.path().join("repo").join(repo_sub_path); + match readlinkat(CWD, &full_path, Vec::new()) { + Ok(result) => Ok(Some(PathBuf::from(result.to_str()?))), + Err(rustix::io::Errno::NOENT) => Ok(None), + Err(e) => Err(e.into()), + } + } + + // Does not follow symlinks + fn test_path_exists_in_repo

(tmp: &TempDir, repo_sub_path: P) -> Result + where + P: AsRef, + { + let full_path = tmp.path().join("repo").join(repo_sub_path); + match statat(CWD, &full_path, AtFlags::SYMLINK_NOFOLLOW) { + Ok(_) => Ok(true), + Err(rustix::io::Errno::NOENT) => Ok(false), + Err(e) => Err(e.into()), + } + } + + fn test_object_exists(tmp: &TempDir, obj: &Sha512HashValue) -> Result { + let digest = obj.to_hex(); + let (first_two, remainder) = digest.split_at(2); + test_path_exists_in_repo(tmp, &format!("objects/{first_two}/{remainder}")) + } + + #[test] + fn test_gc_removes_one_stream() -> Result<()> { + let tmp = tempdir(); + let repo = create_test_repo(&tmp.path().join("repo"))?; + + let obj1 = generate_test_data(32 * 1024, 0xAE); + let obj2 = generate_test_data(64 * 1024, 0xEA); + + let obj1_id = repo.ensure_object(&obj1)?; + let obj2_id: Sha512HashValue = compute_verity(&obj2); + + let mut writer = repo.create_stream(0); + writer.write_external(&obj2)?; + let _stream_id = repo.write_stream(writer, "test-stream", None)?; + + repo.sync()?; + + assert!(test_object_exists(&tmp, &obj1_id)?); + assert!(test_object_exists(&tmp, &obj2_id)?); + assert!(test_path_exists_in_repo(&tmp, "streams/test-stream")?); + let link_target = + read_links_in_repo(&tmp, "streams/test-stream")?.expect("link is not broken"); + assert!(test_path_exists_in_repo( + &tmp, + PathBuf::from("streams").join(&link_target) + )?); + + // Now perform gc - should remove 2 objects (obj1 + obj2) and 1 stream symlink + let result = repo.gc(&[])?; + + assert!(!test_object_exists(&tmp, &obj1_id)?); + assert!(!test_object_exists(&tmp, &obj2_id)?); + assert!(!test_path_exists_in_repo(&tmp, "streams/test-stream")?); + + // Verify GcResult: 3 objects removed (obj1, obj2, splitstream), stream symlink pruned + assert_eq!(result.objects_removed, 3); + assert!(result.objects_bytes > 0); + assert_eq!(result.streams_pruned, 1); + assert_eq!(result.images_pruned, 0); + Ok(()) + } + + #[test] + fn test_gc_keeps_one_stream() -> 
Result<()> { + let tmp = tempdir(); + let repo = create_test_repo(&tmp.path().join("repo"))?; + + let obj1 = generate_test_data(32 * 1024, 0xAE); + let obj2 = generate_test_data(64 * 1024, 0xEA); + + let obj1_id = repo.ensure_object(&obj1)?; + let obj2_id: Sha512HashValue = compute_verity(&obj2); + + let mut writer = repo.create_stream(0); + writer.write_external(&obj2)?; + let _stream_id = repo.write_stream(writer, "test-stream", None)?; + + repo.sync()?; + + assert!(test_object_exists(&tmp, &obj1_id)?); + assert!(test_object_exists(&tmp, &obj2_id)?); + assert!(test_path_exists_in_repo(&tmp, "streams/test-stream")?); + let link_target = + read_links_in_repo(&tmp, "streams/test-stream")?.expect("link is not broken"); + assert!(test_path_exists_in_repo( + &tmp, + PathBuf::from("streams").join(&link_target) + )?); + + // Now perform gc - should remove only obj1, keep obj2 and stream + let result = repo.gc(&["test-stream"])?; + + assert!(!test_object_exists(&tmp, &obj1_id)?); + assert!(test_object_exists(&tmp, &obj2_id)?); + assert!(test_path_exists_in_repo(&tmp, "streams/test-stream")?); + let link_target = + read_links_in_repo(&tmp, "streams/test-stream")?.expect("link is not broken"); + assert!(test_path_exists_in_repo( + &tmp, + PathBuf::from("streams").join(&link_target) + )?); + + // Verify GcResult: only 1 object removed, no symlinks pruned + assert_eq!(result.objects_removed, 1); + assert!(result.objects_bytes > 0); + assert_eq!(result.streams_pruned, 0); + assert_eq!(result.images_pruned, 0); + Ok(()) + } + + #[test] + fn test_gc_keeps_one_stream_from_refs() -> Result<()> { + let tmp = tempdir(); + let repo = create_test_repo(&tmp.path().join("repo"))?; + + let obj1 = generate_test_data(32 * 1024, 0xAE); + let obj2 = generate_test_data(64 * 1024, 0xEA); + + let obj1_id = repo.ensure_object(&obj1)?; + let obj2_id: Sha512HashValue = compute_verity(&obj2); + + let mut writer = repo.create_stream(0); + writer.write_external(&obj2)?; + let _stream_id = 
repo.write_stream(writer, "test-stream", Some("ref-name"))?; + + repo.sync()?; + + assert!(test_object_exists(&tmp, &obj1_id)?); + assert!(test_object_exists(&tmp, &obj2_id)?); + assert!(test_path_exists_in_repo(&tmp, "streams/test-stream")?); + let link_target = + read_links_in_repo(&tmp, "streams/test-stream")?.expect("link is not broken"); + assert!(test_path_exists_in_repo( + &tmp, + PathBuf::from("streams").join(&link_target) + )?); + + // Now perform gc - stream is kept via ref, only obj1 removed + let result = repo.gc(&[])?; + + assert!(!test_object_exists(&tmp, &obj1_id)?); + assert!(test_object_exists(&tmp, &obj2_id)?); + assert!(test_path_exists_in_repo(&tmp, "streams/test-stream")?); + let link_target = + read_links_in_repo(&tmp, "streams/test-stream")?.expect("link is not broken"); + assert!(test_path_exists_in_repo( + &tmp, + PathBuf::from("streams").join(&link_target) + )?); + + // Verify GcResult: 1 object removed, no symlinks pruned (stream has ref) + assert_eq!(result.objects_removed, 1); + assert!(result.objects_bytes > 0); + assert_eq!(result.streams_pruned, 0); + assert_eq!(result.images_pruned, 0); + Ok(()) + } + + #[test] + fn test_gc_keeps_one_stream_from_two_overlapped() -> Result<()> { + let tmp = tempdir(); + let repo = create_test_repo(&tmp.path().join("repo"))?; + + let obj1 = generate_test_data(32 * 1024, 0xAE); + let obj2 = generate_test_data(64 * 1024, 0xEA); + let obj3 = generate_test_data(64 * 1024, 0xAA); + let obj4 = generate_test_data(64 * 1024, 0xEE); + + let obj1_id = repo.ensure_object(&obj1)?; + let obj2_id: Sha512HashValue = compute_verity(&obj2); + let obj3_id: Sha512HashValue = compute_verity(&obj3); + let obj4_id: Sha512HashValue = compute_verity(&obj4); + + let mut writer1 = repo.create_stream(0); + writer1.write_external(&obj2)?; + writer1.write_external(&obj3)?; + let _stream1_id = repo.write_stream(writer1, "test-stream1", None)?; + + let mut writer2 = repo.create_stream(0); + writer2.write_external(&obj2)?; + 
writer2.write_external(&obj4)?; + let _stream2_id = repo.write_stream(writer2, "test-stream2", None)?; + + repo.sync()?; + + assert!(test_object_exists(&tmp, &obj1_id)?); + assert!(test_object_exists(&tmp, &obj2_id)?); + assert!(test_object_exists(&tmp, &obj3_id)?); + assert!(test_object_exists(&tmp, &obj4_id)?); + assert!(test_path_exists_in_repo(&tmp, "streams/test-stream1")?); + let link_target = + read_links_in_repo(&tmp, "streams/test-stream1")?.expect("link is not broken"); + assert!(test_path_exists_in_repo( + &tmp, + PathBuf::from("streams").join(&link_target) + )?); + assert!(test_path_exists_in_repo(&tmp, "streams/test-stream2")?); + let link_target = + read_links_in_repo(&tmp, "streams/test-stream2")?.expect("link is not broken"); + assert!(test_path_exists_in_repo( + &tmp, + PathBuf::from("streams").join(&link_target) + )?); + + // Now perform gc - keep stream1, remove obj1, obj4, and stream2 + let result = repo.gc(&["test-stream1"])?; + + assert!(!test_object_exists(&tmp, &obj1_id)?); + assert!(test_object_exists(&tmp, &obj2_id)?); + assert!(test_object_exists(&tmp, &obj3_id)?); + assert!(!test_object_exists(&tmp, &obj4_id)?); + assert!(test_path_exists_in_repo(&tmp, "streams/test-stream1")?); + let link_target = + read_links_in_repo(&tmp, "streams/test-stream1")?.expect("link is not broken"); + assert!(test_path_exists_in_repo( + &tmp, + PathBuf::from("streams").join(&link_target) + )?); + assert!(!test_path_exists_in_repo(&tmp, "streams/test-stream2")?); + + // Verify GcResult: 3 objects removed (obj1, obj4, stream2's splitstream), 1 stream pruned + assert_eq!(result.objects_removed, 3); + assert!(result.objects_bytes > 0); + assert_eq!(result.streams_pruned, 1); + assert_eq!(result.images_pruned, 0); + Ok(()) + } + + #[test] + fn test_gc_keeps_named_references() -> Result<()> { + let tmp = tempdir(); + let repo = create_test_repo(&tmp.path().join("repo"))?; + + let obj1 = generate_test_data(32 * 1024, 0xAE); + let obj2 = generate_test_data(64 * 
1024, 0xEA); + + let obj1_id = repo.ensure_object(&obj1)?; + let obj2_id: Sha512HashValue = compute_verity(&obj2); + + let mut writer1 = repo.create_stream(0); + writer1.write_external(&obj2)?; + let stream1_id = repo.write_stream(writer1, "test-stream1", None)?; + + let mut writer2 = repo.create_stream(0); + writer2.add_named_stream_ref("test-stream1", &stream1_id); + let _stream2_id = repo.write_stream(writer2, "test-stream2", None)?; + + repo.sync()?; + + assert!(test_object_exists(&tmp, &obj1_id)?); + assert!(test_object_exists(&tmp, &obj2_id)?); + assert!(test_path_exists_in_repo(&tmp, "streams/test-stream1")?); + let link_target = + read_links_in_repo(&tmp, "streams/test-stream1")?.expect("link is not broken"); + assert!(test_path_exists_in_repo( + &tmp, + PathBuf::from("streams").join(&link_target) + )?); + assert!(test_path_exists_in_repo(&tmp, "streams/test-stream2")?); + let link_target = + read_links_in_repo(&tmp, "streams/test-stream2")?.expect("link is not broken"); + assert!(test_path_exists_in_repo( + &tmp, + PathBuf::from("streams").join(&link_target) + )?); + + // Now perform gc - stream2 refs stream1, both kept, only obj1 removed + let result = repo.gc(&["test-stream2"])?; + + assert!(!test_object_exists(&tmp, &obj1_id)?); + assert!(test_object_exists(&tmp, &obj2_id)?); + assert!(test_path_exists_in_repo(&tmp, "streams/test-stream1")?); + let link_target = + read_links_in_repo(&tmp, "streams/test-stream1")?.expect("link is not broken"); + assert!(test_path_exists_in_repo( + &tmp, + PathBuf::from("streams").join(&link_target) + )?); + assert!(test_path_exists_in_repo(&tmp, "streams/test-stream2")?); + let link_target = + read_links_in_repo(&tmp, "streams/test-stream2")?.expect("link is not broken"); + assert!(test_path_exists_in_repo( + &tmp, + PathBuf::from("streams").join(&link_target) + )?); + + // Verify GcResult: 1 object removed, no symlinks pruned + assert_eq!(result.objects_removed, 1); + assert!(result.objects_bytes > 0); + 
assert_eq!(result.streams_pruned, 0); + assert_eq!(result.images_pruned, 0); + Ok(()) + } + + #[test] + fn test_gc_keeps_named_references_with_different_table_name() -> Result<()> { + let tmp = tempdir(); + let repo = create_test_repo(&tmp.path().join("repo"))?; + + let obj1 = generate_test_data(32 * 1024, 0xAE); + let obj2 = generate_test_data(64 * 1024, 0xEA); + + let obj1_id = repo.ensure_object(&obj1)?; + let obj2_id: Sha512HashValue = compute_verity(&obj2); + + let mut writer1 = repo.create_stream(0); + writer1.write_external(&obj2)?; + let stream1_id = repo.write_stream(writer1, "test-stream1", None)?; + + let mut writer2 = repo.create_stream(0); + writer2.add_named_stream_ref("different-table-name-for-test-stream1", &stream1_id); + let _stream2_id = repo.write_stream(writer2, "test-stream2", None)?; + + repo.sync()?; + + assert!(test_object_exists(&tmp, &obj1_id)?); + assert!(test_object_exists(&tmp, &obj2_id)?); + assert!(test_path_exists_in_repo(&tmp, "streams/test-stream1")?); + let link_target = + read_links_in_repo(&tmp, "streams/test-stream1")?.expect("link is not broken"); + assert!(test_path_exists_in_repo( + &tmp, + PathBuf::from("streams").join(&link_target) + )?); + assert!(test_path_exists_in_repo(&tmp, "streams/test-stream2")?); + let link_target = + read_links_in_repo(&tmp, "streams/test-stream2")?.expect("link is not broken"); + assert!(test_path_exists_in_repo( + &tmp, + PathBuf::from("streams").join(&link_target) + )?); + + // Now perform gc - different table name, but same object ID links them + let result = repo.gc(&["test-stream2"])?; + + assert!(!test_object_exists(&tmp, &obj1_id)?); + assert!(test_object_exists(&tmp, &obj2_id)?); + assert!(test_path_exists_in_repo(&tmp, "streams/test-stream1")?); + let link_target = + read_links_in_repo(&tmp, "streams/test-stream1")?.expect("link is not broken"); + assert!(test_path_exists_in_repo( + &tmp, + PathBuf::from("streams").join(&link_target) + )?); + assert!(test_path_exists_in_repo(&tmp, 
"streams/test-stream2")?);
+        let link_target =
+            read_links_in_repo(&tmp, "streams/test-stream2")?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("streams").join(&link_target)
+        )?);
+
+        // Verify GcResult: 1 object removed, no symlinks pruned
+        assert_eq!(result.objects_removed, 1);
+        assert!(result.objects_bytes > 0);
+        assert_eq!(result.streams_pruned, 0);
+        assert_eq!(result.images_pruned, 0);
+        Ok(())
+    }
+
+    #[test]
+    fn test_gc_keeps_one_named_reference_from_two_overlapped() -> Result<()> {
+        let tmp = tempdir();
+        let repo = create_test_repo(&tmp.path().join("repo"))?;
+
+        let obj1 = generate_test_data(32 * 1024, 0xAE);
+        let obj2 = generate_test_data(64 * 1024, 0xEA);
+        let obj3 = generate_test_data(64 * 1024, 0xAA);
+        let obj4 = generate_test_data(64 * 1024, 0xEE);
+
+        let obj1_id = repo.ensure_object(&obj1)?;
+        let obj2_id: Sha512HashValue = compute_verity(&obj2);
+        let obj3_id: Sha512HashValue = compute_verity(&obj3);
+        let obj4_id: Sha512HashValue = compute_verity(&obj4);
+
+        let mut writer = repo.create_stream(0);
+        writer.write_external(&obj2)?;
+        let stream1_id = repo.write_stream(writer, "test-stream1", None)?;
+
+        let mut writer = repo.create_stream(0);
+        writer.write_external(&obj3)?;
+        let stream2_id = repo.write_stream(writer, "test-stream2", None)?;
+
+        let mut writer = repo.create_stream(0);
+        writer.write_external(&obj4)?;
+        let stream3_id = repo.write_stream(writer, "test-stream3", None)?;
+
+        let mut writer = repo.create_stream(0);
+        writer.add_named_stream_ref("test-stream1", &stream1_id);
+        writer.add_named_stream_ref("test-stream2", &stream2_id);
+        let _ref_stream1_id = repo.write_stream(writer, "ref-stream1", None)?;
+
+        let mut writer = repo.create_stream(0);
+        writer.add_named_stream_ref("test-stream1", &stream1_id);
+        writer.add_named_stream_ref("test-stream3", &stream3_id);
+        let _ref_stream2_id = repo.write_stream(writer, "ref-stream2", None)?;
+
+        repo.sync()?;
+
+        assert!(test_object_exists(&tmp, &obj1_id)?);
+        assert!(test_object_exists(&tmp, &obj2_id)?);
+        assert!(test_object_exists(&tmp, &obj3_id)?);
+        assert!(test_object_exists(&tmp, &obj4_id)?);
+        assert!(test_path_exists_in_repo(&tmp, "streams/test-stream1")?);
+        let link_target =
+            read_links_in_repo(&tmp, "streams/test-stream1")?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("streams").join(&link_target)
+        )?);
+        assert!(test_path_exists_in_repo(&tmp, "streams/test-stream2")?);
+        let link_target =
+            read_links_in_repo(&tmp, "streams/test-stream2")?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("streams").join(&link_target)
+        )?);
+        assert!(test_path_exists_in_repo(&tmp, "streams/test-stream3")?);
+        let link_target =
+            read_links_in_repo(&tmp, "streams/test-stream3")?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("streams").join(&link_target)
+        )?);
+        assert!(test_path_exists_in_repo(&tmp, "streams/ref-stream1")?);
+        let link_target =
+            read_links_in_repo(&tmp, "streams/ref-stream1")?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("streams").join(&link_target)
+        )?);
+        assert!(test_path_exists_in_repo(&tmp, "streams/ref-stream2")?);
+        let link_target =
+            read_links_in_repo(&tmp, "streams/ref-stream2")?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("streams").join(&link_target)
+        )?);
+
+        // Now perform gc - ref-stream1 refs stream1+stream2, so keep those and their objects
+        let result = repo.gc(&["ref-stream1"])?;
+
+        assert!(!test_object_exists(&tmp, &obj1_id)?);
+        assert!(test_object_exists(&tmp, &obj2_id)?);
+        assert!(test_object_exists(&tmp, &obj3_id)?);
+        assert!(!test_object_exists(&tmp, &obj4_id)?);
+        assert!(test_path_exists_in_repo(&tmp, "streams/test-stream1")?);
+        let link_target =
+            read_links_in_repo(&tmp, "streams/test-stream1")?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("streams").join(&link_target)
+        )?);
+        assert!(test_path_exists_in_repo(&tmp, "streams/test-stream2")?);
+        let link_target =
+            read_links_in_repo(&tmp, "streams/test-stream2")?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("streams").join(&link_target)
+        )?);
+        assert!(!test_path_exists_in_repo(&tmp, "streams/test-stream3")?);
+        assert!(test_path_exists_in_repo(&tmp, "streams/ref-stream1")?);
+        let link_target =
+            read_links_in_repo(&tmp, "streams/ref-stream1")?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("streams").join(&link_target)
+        )?);
+        assert!(!test_path_exists_in_repo(&tmp, "streams/ref-stream2")?);
+
+        // Verify GcResult: objects removed include obj1, obj4, plus splitstreams for stream3 and ref-stream2
+        assert_eq!(result.objects_removed, 4);
+        assert!(result.objects_bytes > 0);
+        assert_eq!(result.streams_pruned, 2);
+        assert_eq!(result.images_pruned, 0);
+
+        Ok(())
+    }
+
+    use crate::tree::{FileSystem, Inode, Leaf, LeafContent, RegularFile, Stat};
+
+    /// Create a default root stat for test filesystems
+    fn test_root_stat() -> Stat {
+        Stat {
+            st_mode: 0o755,
+            st_uid: 0,
+            st_gid: 0,
+            st_mtim_sec: 0,
+            xattrs: Default::default(),
+        }
+    }
+
+    /// Make a test in-memory filesystem that only contains one externally referenced object
+    fn make_test_fs(obj: &Sha512HashValue, size: u64) -> FileSystem {
+        let mut fs: FileSystem = FileSystem::new(test_root_stat());
+        let inode = Inode::Leaf(std::rc::Rc::new(Leaf {
+            stat: Stat {
+                st_mode: 0o644,
+                st_uid: 0,
+                st_gid: 0,
+                st_mtim_sec: 0,
+                xattrs: Default::default(),
+            },
+            content: LeafContent::Regular(RegularFile::External(obj.clone(), size)),
+        }));
+        fs.root.insert(OsStr::new("data"), inode);
+        fs
+    }
+
+    #[test]
+    fn test_gc_removes_one_image() -> Result<()> {
+        let tmp = tempdir();
+        let repo = create_test_repo(&tmp.path().join("repo"))?;
+
+        let obj1_size: u64 = 32 * 1024;
+        let obj1 = generate_test_data(obj1_size, 0xAE);
+        let obj2_size: u64 = 64 * 1024;
+        let obj2 = generate_test_data(obj2_size, 0xEA);
+
+        let obj1_id = repo.ensure_object(&obj1)?;
+        let obj2_id = repo.ensure_object(&obj2)?;
+
+        let fs = make_test_fs(&obj2_id, obj2_size);
+        let image1 = fs.commit_image(&repo, None)?;
+        let image1_path = format!("images/{}", image1.to_hex());
+
+        repo.sync()?;
+
+        assert!(test_object_exists(&tmp, &obj1_id)?);
+        assert!(test_object_exists(&tmp, &obj2_id)?);
+        assert!(test_path_exists_in_repo(&tmp, &image1_path)?);
+        let link_target = read_links_in_repo(&tmp, &image1_path)?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("images").join(&link_target)
+        )?);
+
+        // Now perform gc - no refs, so image and both objects removed
+        let result = repo.gc(&[])?;
+
+        assert!(!test_object_exists(&tmp, &obj1_id)?);
+        assert!(!test_object_exists(&tmp, &obj2_id)?);
+        assert!(!test_path_exists_in_repo(&tmp, &image1_path)?);
+
+        // Verify GcResult: 3 objects removed (obj1, obj2, image erofs), 1 image pruned
+        assert_eq!(result.objects_removed, 3);
+        assert!(result.objects_bytes > 0);
+        assert_eq!(result.images_pruned, 1);
+        assert_eq!(result.streams_pruned, 0);
+        Ok(())
+    }
+
+    #[test]
+    fn test_gc_keeps_one_image() -> Result<()> {
+        let tmp = tempdir();
+        let repo = create_test_repo(&tmp.path().join("repo"))?;
+
+        let obj1_size: u64 = 32 * 1024;
+        let obj1 = generate_test_data(obj1_size, 0xAE);
+        let obj2_size: u64 = 64 * 1024;
+        let obj2 = generate_test_data(obj2_size, 0xEA);
+
+        let obj1_id = repo.ensure_object(&obj1)?;
+        let obj2_id = repo.ensure_object(&obj2)?;
+
+        let fs = make_test_fs(&obj2_id, obj2_size);
+        let image1 = fs.commit_image(&repo, None)?;
+        let image1_path = format!("images/{}", image1.to_hex());
+
+        repo.sync()?;
+
+        assert!(test_object_exists(&tmp, &obj1_id)?);
+        assert!(test_object_exists(&tmp, &obj2_id)?);
+        assert!(test_path_exists_in_repo(&tmp, &image1_path)?);
+        let link_target = read_links_in_repo(&tmp, &image1_path)?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("images").join(&link_target)
+        )?);
+
+        // Now perform gc - keep image via additional_roots
+        let image1_hex = image1.to_hex();
+        let result = repo.gc(&[image1_hex.as_str()])?;
+
+        assert!(!test_object_exists(&tmp, &obj1_id)?);
+        assert!(test_object_exists(&tmp, &obj2_id)?);
+        assert!(test_path_exists_in_repo(&tmp, &image1_path)?);
+        let link_target = read_links_in_repo(&tmp, &image1_path)?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("images").join(&link_target)
+        )?);
+
+        // Verify GcResult: 1 object removed (obj1), no symlinks pruned
+        assert_eq!(result.objects_removed, 1);
+        assert!(result.objects_bytes > 0);
+        assert_eq!(result.images_pruned, 0);
+        assert_eq!(result.streams_pruned, 0);
+        Ok(())
+    }
+
+    #[test]
+    fn test_gc_keeps_one_image_from_refs() -> Result<()> {
+        let tmp = tempdir();
+        let repo = create_test_repo(&tmp.path().join("repo"))?;
+
+        let obj1_size: u64 = 32 * 1024;
+        let obj1 = generate_test_data(obj1_size, 0xAE);
+        let obj2_size: u64 = 64 * 1024;
+        let obj2 = generate_test_data(obj2_size, 0xEA);
+
+        let obj1_id = repo.ensure_object(&obj1)?;
+        let obj2_id = repo.ensure_object(&obj2)?;
+
+        let fs = make_test_fs(&obj2_id, obj2_size);
+        let image1 = fs.commit_image(&repo, Some("ref-name"))?;
+        let image1_path = format!("images/{}", image1.to_hex());
+
+        repo.sync()?;
+
+        assert!(test_object_exists(&tmp, &obj1_id)?);
+        assert!(test_object_exists(&tmp, &obj2_id)?);
+        assert!(test_path_exists_in_repo(&tmp, &image1_path)?);
+        let link_target = read_links_in_repo(&tmp, &image1_path)?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("images").join(&link_target)
+        )?);
+
+        // Now perform gc - image kept via ref, only obj1 removed
+        let result = repo.gc(&[])?;
+
+        assert!(!test_object_exists(&tmp, &obj1_id)?);
+        assert!(test_object_exists(&tmp, &obj2_id)?);
+        assert!(test_path_exists_in_repo(&tmp, &image1_path)?);
+        let link_target = read_links_in_repo(&tmp, &image1_path)?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("images").join(&link_target)
+        )?);
+
+        // Verify GcResult: 1 object removed, no symlinks pruned (image has ref)
+        assert_eq!(result.objects_removed, 1);
+        assert!(result.objects_bytes > 0);
+        assert_eq!(result.images_pruned, 0);
+        assert_eq!(result.streams_pruned, 0);
+        Ok(())
+    }
+
+    /// Like make_test_fs, but with a second externally referenced object
+    fn make_test_fs_with_two_files(
+        obj1: &Sha512HashValue,
+        size1: u64,
+        obj2: &Sha512HashValue,
+        size2: u64,
+    ) -> FileSystem {
+        let mut fs = make_test_fs(obj1, size1);
+        let inode = Inode::Leaf(std::rc::Rc::new(Leaf {
+            stat: Stat {
+                st_mode: 0o644,
+                st_uid: 0,
+                st_gid: 0,
+                st_mtim_sec: 0,
+                xattrs: Default::default(),
+            },
+            content: LeafContent::Regular(RegularFile::External(obj2.clone(), size2)),
+        }));
+        fs.root.insert(OsStr::new("extra_data"), inode);
+        fs
+    }
+
+    #[test]
+    fn test_gc_keeps_one_image_from_two_overlapped() -> Result<()> {
+        let tmp = tempdir();
+        let repo = create_test_repo(&tmp.path().join("repo"))?;
+
+        let obj1_size: u64 = 32 * 1024;
+        let obj1 = generate_test_data(obj1_size, 0xAE);
+        let obj2_size: u64 = 64 * 1024;
+        let obj2 = generate_test_data(obj2_size, 0xEA);
+        let obj3_size: u64 = 64 * 1024;
+        let obj3 = generate_test_data(obj3_size, 0xAA); // use obj3's own size (was obj2_size by copy-paste)
+        let obj4_size: u64 = 64 * 1024;
+        let obj4 = generate_test_data(obj4_size, 0xEE); // use obj4's own size (was obj2_size by copy-paste)
+
+        let obj1_id = repo.ensure_object(&obj1)?;
+        let obj2_id = repo.ensure_object(&obj2)?;
+        let obj3_id = repo.ensure_object(&obj3)?;
+        let obj4_id = repo.ensure_object(&obj4)?;
+
+        let fs = make_test_fs_with_two_files(&obj2_id, obj2_size, &obj3_id, obj3_size);
+        let image1 = fs.commit_image(&repo, None)?;
+        let image1_path = format!("images/{}", image1.to_hex());
+
+        let fs = make_test_fs_with_two_files(&obj2_id, obj2_size, &obj4_id, obj4_size);
+        let image2 = fs.commit_image(&repo, None)?;
+        let image2_path = format!("images/{}", image2.to_hex());
+
+        repo.sync()?;
+
+        assert!(test_object_exists(&tmp, &obj1_id)?);
+        assert!(test_object_exists(&tmp, &obj2_id)?);
+        assert!(test_object_exists(&tmp, &obj3_id)?);
+        assert!(test_object_exists(&tmp, &obj4_id)?);
+        assert!(test_path_exists_in_repo(&tmp, &image1_path)?);
+        let link_target = read_links_in_repo(&tmp, &image1_path)?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("images").join(&link_target)
+        )?);
+        assert!(test_path_exists_in_repo(&tmp, &image2_path)?);
+        let link_target = read_links_in_repo(&tmp, &image2_path)?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("images").join(&link_target)
+        )?);
+
+        // Now perform gc - keep image1, remove image2 and its unique objects
+        let image1_hex = image1.to_hex();
+        let result = repo.gc(&[image1_hex.as_str()])?;
+
+        assert!(!test_object_exists(&tmp, &obj1_id)?);
+        assert!(test_object_exists(&tmp, &obj2_id)?);
+        assert!(test_object_exists(&tmp, &obj3_id)?);
+        assert!(!test_object_exists(&tmp, &obj4_id)?);
+        assert!(test_path_exists_in_repo(&tmp, &image1_path)?);
+        let link_target = read_links_in_repo(&tmp, &image1_path)?.expect("link is not broken");
+        assert!(test_path_exists_in_repo(
+            &tmp,
+            PathBuf::from("images").join(&link_target)
+        )?);
+        assert!(!test_path_exists_in_repo(&tmp, &image2_path)?);
+
+        // Verify GcResult: 3 objects removed (obj1, obj4, image2 erofs), 1 image pruned
+        assert_eq!(result.objects_removed, 3);
+        assert!(result.objects_bytes > 0);
+        assert_eq!(result.images_pruned, 1);
+        assert_eq!(result.streams_pruned, 0);
+        Ok(())
+    }
+}