From 37bab752542222f15331c7116101d47817b34100 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 1 Feb 2025 21:06:32 +0000 Subject: [PATCH 1/4] Store data per fetched file --- dfetch/project/metadata.py | 41 +++++++++++++++++++++++++++++++++++++- dfetch/project/vcs.py | 33 ++++++++++++++++++++++++++++-- dfetch/util/util.py | 35 ++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 3 deletions(-) diff --git a/dfetch/project/metadata.py b/dfetch/project/metadata.py index 839fa26d..9798c760 100644 --- a/dfetch/project/metadata.py +++ b/dfetch/project/metadata.py @@ -2,6 +2,8 @@ import datetime import os +from dataclasses import dataclass +from typing import Iterable, Optional import yaml from typing_extensions import TypedDict @@ -15,6 +17,29 @@ """ +@dataclass +class FileInfo: + """Information about a single fetched file.""" + + path: str + hash: str + permissions: str # octal + + def __repr__(self) -> str: + return "|".join((self.path.replace("|", r"\|"), self.hash, self.permissions)) + + @staticmethod + def from_list(data: Iterable[str]) -> Iterable["FileInfo"]: + """Create a list of FileInfo objects from an iterable of "path|hash|permissions" strings.""" + parsed = [] + for entry in data: + path, hash_digest, permissions = ( + entry.split("|", maxsplit=2) + ["", "", ""] + )[:3] + parsed.append(FileInfo(path, hash_digest, permissions.zfill(3))) + return parsed + + class Options(TypedDict): # pylint: disable=too-many-ancestors """Argument types for Metadata class construction.""" @@ -26,6 +51,7 @@ class Options(TypedDict): # pylint: disable=too-many-ancestors destination: str hash: str patch: str + files: Optional[Iterable[FileInfo]] class Metadata: @@ -50,6 +76,9 @@ def __init__(self, kwargs: Options) -> None: self._destination: str = str(kwargs.get("destination", "")) self._hash: str = str(kwargs.get("hash", "")) self._patch: str = str(kwargs.get("patch", "")) + self._files: Optional[Iterable[FileInfo]] = FileInfo.from_list( + kwargs.get("files") or [] + ) @classmethod def from_project_entry(cls,
project: ProjectEntry) -> "Metadata": @@ -63,6 +92,7 @@ def from_project_entry(cls, project: ProjectEntry) -> "Metadata": "last_fetch": datetime.datetime(2000, 1, 1, 0, 0, 0), "hash": "", "patch": project.patch, + "files": [], } return cls(data) @@ -73,12 +103,19 @@ def from_file(cls, path: str) -> "Metadata": data: Options = yaml.safe_load(metadata_file)["dfetch"] return cls(data) - def fetched(self, version: Version, hash_: str = "", patch_: str = "") -> None: + def fetched( + self, + version: Version, + hash_: str = "", + patch_: str = "", + files: Optional[Iterable[FileInfo]] = None, + ) -> None: """Update metadata.""" self._last_fetch = datetime.datetime.now() self._version = version self._hash = hash_ self._patch = patch_ + self._files = files @property def version(self) -> Version: @@ -147,6 +184,7 @@ def __eq__(self, other: object) -> bool: other._version.revision == self._version.revision, other.hash == self.hash, other.patch == self.patch, + other._files == self._files, ] ) @@ -161,6 +199,7 @@ def dump(self) -> None: "tag": self._version.tag, "hash": self.hash, "patch": self.patch, + "files": [str(info) for info in self._files or []], } } diff --git a/dfetch/project/vcs.py b/dfetch/project/vcs.py index 373a4c9b..0b6c2b58 100644 --- a/dfetch/project/vcs.py +++ b/dfetch/project/vcs.py @@ -13,8 +13,13 @@ from dfetch.manifest.project import ProjectEntry from dfetch.manifest.version import Version from dfetch.project.abstract_check_reporter import AbstractCheckReporter -from dfetch.project.metadata import Metadata -from dfetch.util.util import hash_directory, safe_rm +from dfetch.project.metadata import FileInfo, Metadata +from dfetch.util.util import ( + hash_directory, + hash_file_normalized, + recursive_listdir, + safe_rm, +) from dfetch.util.versions import latest_tag_from_list logger = get_logger(__name__) @@ -130,10 +135,34 @@ def update(self, force: bool = False) -> None: else: logger.warning(f"Skipping non-existent patch {self.__project.patch}") + if 
os.path.isfile(self.local_path): + files_list = ( + FileInfo( + os.path.basename(self.local_path), + hash_file_normalized(self.local_path).hexdigest(), + oct(os.stat(self.local_path).st_mode)[-3:], + ), + ) + else: + all_files = ( + file_path + for file_path in recursive_listdir(self.local_path) + if os.path.basename(file_path) != self.__metadata.FILENAME + ) + files_list = ( + FileInfo( + os.path.relpath(file_path, self.local_path), + hash_file_normalized(file_path).hexdigest(), + oct(os.stat(file_path).st_mode)[-3:], + ) + for file_path in all_files + ) + self.__metadata.fetched( actually_fetched, hash_=hash_directory(self.local_path, skiplist=[self.__metadata.FILENAME]), patch_=applied_patch, + files=files_list, ) logger.debug(f"Writing repo metadata to: {self.__metadata.path}") diff --git a/dfetch/util/util.py b/dfetch/util/util.py index eabcd42b..19f7841c 100644 --- a/dfetch/util/util.py +++ b/dfetch/util/util.py @@ -3,6 +3,7 @@ import fnmatch import hashlib import os +import re import shutil import stat from contextlib import contextmanager @@ -104,6 +105,21 @@ def find_file(name: str, path: str = ".") -> List[str]: ] +def recursive_listdir(directory): + """Recursively yield the paths of all files below the given directory.""" + entries = os.listdir(directory) + + for entry in entries: + full_path = os.path.join(directory, entry) + + if os.path.isdir(full_path): + # If the entry is a directory, recurse into it + yield from recursive_listdir(full_path) + else: + # If the entry is a file, yield its path + yield full_path + + def hash_directory(path: str, skiplist: Optional[List[str]]) -> str: """Hash a directory with all its files.""" digest = hashlib.md5() # nosec @@ -131,3 +147,22 @@ def hash_file(file_path: str, digest: HASH) -> HASH: buf = f_obj.read(1024 * 1024) return digest + + +def hash_file_normalized(file_path: str) -> "hashlib._Hash": + """ + Hash a file's contents, ignoring line-ending differences (CRLF/CR are normalized to LF). + """ + digest =
hashlib.sha1(usedforsecurity=False) + + if os.path.isfile(file_path): + normalize_re = re.compile(b"\r\n|\r") + + with open(file_path, "rb") as f_obj: + tail = b"" + while buf := f_obj.read(1024 * 1024): + buf, tail = tail + buf, buf[-1:] if buf.endswith(b"\r") else b"" + digest.update(normalize_re.sub(b"\n", buf[: len(buf) - len(tail)])) # nosec + digest.update(normalize_re.sub(b"\n", tail)) + + return digest From d44017c65558487e48328df8416dbb066a785297 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 1 Feb 2025 21:48:33 +0000 Subject: [PATCH 2/4] Add list of files as last entry in metadata (important info first) --- dfetch/project/metadata.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dfetch/project/metadata.py b/dfetch/project/metadata.py index 9798c760..8b6dbbb1 100644 --- a/dfetch/project/metadata.py +++ b/dfetch/project/metadata.py @@ -194,15 +194,15 @@ def dump(self) -> None: "dfetch": { "remote_url": self.remote_url, "branch": self._version.branch, - "revision": self._version.revision, - "last_fetch": self.last_fetch_string(), - "tag": self._version.tag, "hash": self.hash, + "last_fetch": self.last_fetch_string(), "patch": self.patch, + "revision": self._version.revision, + "tag": self._version.tag, "files": [str(info) for info in self._files or []], } } with open(self.path, "w+", encoding="utf-8") as metadata_file: metadata_file.write(DONT_EDIT_WARNING) - yaml.dump(metadata, metadata_file) + yaml.dump(metadata, metadata_file, sort_keys=False) From 9b2f2e980dfe14896c2c3ec896e1d2883ecf143c Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 1 Feb 2025 21:52:31 +0000 Subject: [PATCH 3/4] Check for local changes on per-file basis --- dfetch/project/metadata.py | 5 +++++ dfetch/project/vcs.py | 32 +++++++++++++++++++++++++++--------- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/dfetch/project/metadata.py b/dfetch/project/metadata.py index 8b6dbbb1..df08a8c5 100644 --- a/dfetch/project/metadata.py +++ b/dfetch/project/metadata.py @@ -127,6 +127,11 @@ def branch(self) -> str: """Branch as stored in the
metadata.""" return self._version.branch + @property + def files(self) -> Iterable[FileInfo]: + """File info as stored in the metadata.""" + return self._files + @property def tag(self) -> str: """Tag as stored in the metadata.""" diff --git a/dfetch/project/vcs.py b/dfetch/project/vcs.py index 0b6c2b58..131803d2 100644 --- a/dfetch/project/vcs.py +++ b/dfetch/project/vcs.py @@ -4,7 +4,7 @@ import os import pathlib from abc import ABC, abstractmethod -from typing import List, Optional, Sequence, Tuple +from typing import Iterable, List, Optional, Sequence, Tuple from halo import Halo from patch_ng import fromfile @@ -318,23 +318,24 @@ def on_disk_version(self) -> Optional[Version]: ) return None - def _on_disk_hash(self) -> Optional[str]: + def _on_disk_hash(self) -> Tuple[Iterable[FileInfo], Optional[str]]: """Get the hash of the project on disk. Returns: Str: Could be None if no on disk version """ if not os.path.exists(self.__metadata.path): - return None + return [], None try: - return Metadata.from_file(self.__metadata.path).hash + metadata = Metadata.from_file(self.__metadata.path) + return metadata.files, metadata.hash except TypeError: logger.warning( f"{pathlib.Path(self.__metadata.path).relative_to(os.getcwd()).as_posix()}" " is an invalid metadata file, not checking local hash!" ) - return None + return [], None def _check_for_newer_version(self) -> Optional[Version]: """Check if a newer version is available on the given branch. @@ -374,11 +375,24 @@ def _are_there_local_changes(self) -> bool: Bool: True if there are local changes, false if no were detected or no hash was found. 
""" logger.debug(f"Checking if there were local changes in {self.local_path}") - on_disk_hash = self._on_disk_hash() - return bool(on_disk_hash) and on_disk_hash != hash_directory( - self.local_path, skiplist=[self.__metadata.FILENAME] - ) + file_info, on_disk_hash = self._on_disk_hash() + + if not file_info: + return bool(on_disk_hash) and on_disk_hash != hash_directory( + self.local_path, skiplist=[self.__metadata.FILENAME] + ) + + for file in file_info: + full_path = os.path.join(self.local_path, file.path) + if hash_file_normalized(full_path).hexdigest() != file.hash: + logger.debug(f"The hash of {full_path} changed!") + return True + if oct(os.stat(full_path).st_mode)[-3:] != file.permissions: + logger.debug(f"The file permissions of {full_path} changed!") + return True + + return False @abstractmethod def _fetch_impl(self, version: Version) -> Version: From 7db9b3e2ad874085e64880d12523bee86c3f8186 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 1 Feb 2025 22:04:10 +0000 Subject: [PATCH 4/4] Only remove fetched files --- dfetch/project/vcs.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/dfetch/project/vcs.py b/dfetch/project/vcs.py index 131803d2..b01b2d64 100644 --- a/dfetch/project/vcs.py +++ b/dfetch/project/vcs.py @@ -4,6 +4,7 @@ import os import pathlib from abc import ABC, abstractmethod +from contextlib import suppress from typing import Iterable, List, Optional, Sequence, Tuple from halo import Halo @@ -116,7 +117,20 @@ def update(self, force: bool = False) -> None: if os.path.exists(self.local_path): logger.debug(f"Clearing destination {self.local_path}") - safe_rm(self.local_path) + + with suppress(TypeError): + metadata_files = Metadata.from_file(self.__metadata.path).files + + if metadata_files: + for file in metadata_files: + full_path = os.path.join(self.local_path, file.path) + safe_rm(full_path) + parent_dir = os.path.dirname(full_path) + # remove parent if empty + if not os.listdir(parent_dir): + 
safe_rm(parent_dir) + else: + safe_rm(self.local_path) with Halo( text=f"Fetching {self.__project.name} {to_fetch}",