From 7e8504b7adc1bdd48eda7c0893566b5efe5b2112 Mon Sep 17 00:00:00 2001
From: Lalatendu Mohanty
Date: Fri, 4 Jul 2025 23:36:34 +0530
Subject: [PATCH] Replacing the Python email library parser with
 packaging.metadata

Replaces the Python email library parser with packaging.metadata.Metadata
for parsing wheel/package metadata.

Fixes: #561

Co-Authored-By: Claude
Signed-off-by: Lalatendu Mohanty
---
 pyproject.toml               |   3 +-
 src/fromager/bootstrapper.py |   7 +-
 src/fromager/candidate.py    |  45 ++++-----
 src/fromager/dependencies.py |  85 ++++++++++++++--
 tests/test_dependencies.py   | 190 +++++++++++++++++++++++++++++++++++
 tests/test_pep658_support.py |   9 +-
 6 files changed, 294 insertions(+), 45 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2356d2d4..de076a78 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,6 @@ dependencies = [
     "elfdeps>=0.2.0",
     "license-expression",
     "packaging",
-    "pkginfo",
     "psutil",
     "pydantic",
     "pypi_simple",
@@ -204,7 +203,7 @@ exclude = [
 
 [[tool.mypy.overrides]]
 # packages without typing annotations and stubs
-module = ["license_expression", "pyproject_hooks", "requests_mock", "resolver", "stevedore"]
+module = ["hatchling", "hatchling.build", "license_expression", "pyproject_hooks", "requests_mock", "resolver", "stevedore"]
 ignore_missing_imports = true
 
 [tool.basedpyright]
diff --git a/src/fromager/bootstrapper.py b/src/fromager/bootstrapper.py
index 2516fdca..9bdf837d 100644
--- a/src/fromager/bootstrapper.py
+++ b/src/fromager/bootstrapper.py
@@ -12,7 +12,6 @@
 import tempfile
 import typing
 import zipfile
-from email.parser import BytesParser
 from urllib.parse import urlparse
 
 from packaging.requirements import Requirement
@@ -1242,10 +1241,8 @@ def _get_version_from_package_metadata(
             config_settings=pbi.config_settings,
         )
         metadata_filename = source_dir.parent / metadata_dir_base / "METADATA"
-        with open(metadata_filename, "rb") as f:
-            p = BytesParser()
-            metadata = p.parse(f, headersonly=True)
-        return Version(metadata["Version"])
+        metadata = dependencies.parse_metadata(metadata_filename)
+        return metadata.version
 
     def _resolve_prebuilt_with_history(
         self,
diff --git a/src/fromager/candidate.py b/src/fromager/candidate.py
index 09a4658e..65c9f72f 100644
--- a/src/fromager/candidate.py
+++ b/src/fromager/candidate.py
@@ -2,12 +2,10 @@
 import datetime
 import logging
 import typing
-from email.message import EmailMessage, Message
-from email.parser import BytesParser
 from io import BytesIO
-from typing import TYPE_CHECKING
 from zipfile import ZipFile
 
+from packaging.metadata import Metadata
 from packaging.requirements import Requirement
 from packaging.utils import BuildTag, canonicalize_name
 from packaging.version import Version
@@ -16,13 +14,6 @@
 
 logger = logging.getLogger(__name__)
 
-# fix for runtime errors caused by inheriting classes that are generic in stubs but not runtime
-# https://mypy.readthedocs.io/en/latest/runtime_troubles.html#using-classes-that-are-generic-in-stubs-but-not-at-runtime
-if TYPE_CHECKING:
-    Metadata = Message[str, str]
-else:
-    Metadata = Message
-
 
 @dataclasses.dataclass(frozen=True, order=True, slots=True, repr=False, kw_only=True)
 class Candidate:
@@ -73,11 +64,10 @@ def metadata(self) -> Metadata:
         return self._metadata
 
     def _get_dependencies(self) -> typing.Iterable[Requirement]:
-        deps = self.metadata.get_all("Requires-Dist", [])
+        deps = self.metadata.requires_dist or []
         extras = self.extras if self.extras else [""]
 
-        for d in deps:
-            r = Requirement(d)
+        for r in deps:
             if r.marker is None:
                 yield r
             else:
@@ -95,19 +85,22 @@ def dependencies(self) -> list[Requirement]:
 
     @property
     def requires_python(self) -> str | None:
-        return self.metadata.get("Requires-Python")
+        spec = self.metadata.requires_python
+        return str(spec) if spec is not None else None
 
 
-def get_metadata_for_wheel(url: str, metadata_url: str | None = None) -> Metadata:
-    """
-    Get metadata for a wheel, supporting PEP 658 metadata endpoints.
+def get_metadata_for_wheel(
+    url: str, metadata_url: str | None = None, *, validate: bool = True
+) -> Metadata:
+    """Get metadata for a wheel, supporting PEP 658 metadata endpoints.
 
     Args:
         url: URL of the wheel file
         metadata_url: Optional URL of the metadata file (PEP 658)
+        validate: Whether to validate metadata (default: True)
 
     Returns:
-        Parsed metadata as a Message object
+        Parsed metadata as a Metadata object
     """
     # Try PEP 658 metadata endpoint first if available
     if metadata_url:
@@ -118,9 +111,9 @@ def get_metadata_for_wheel(url: str, metadata_url: str | None = None) -> Metadat
         response = session.get(metadata_url)
         response.raise_for_status()
 
-        # Parse metadata directly from the response content
-        p = BytesParser()
-        metadata = p.parse(BytesIO(response.content), headersonly=True)
+        # Parse metadata directly using packaging.metadata.Metadata
+        # (avoiding circular import with dependencies module)
+        metadata = Metadata.from_email(response.content, validate=validate)
 
         logger.debug(f"Successfully retrieved metadata via PEP 658 for {url}")
         return metadata
@@ -136,8 +129,10 @@ def get_metadata_for_wheel(url: str, metadata_url: str | None = None) -> Metadat
     with ZipFile(BytesIO(data)) as z:
         for n in z.namelist():
             if n.endswith(".dist-info/METADATA"):
-                p = BytesParser()
-                return p.parse(z.open(n), headersonly=True)
+                metadata_content = z.read(n)
+                # Parse metadata directly using packaging.metadata.Metadata
+                # (avoiding circular import with dependencies module)
+                return Metadata.from_email(metadata_content, validate=validate)
 
-    # If we didn't find the metadata, return an empty dict
-    return EmailMessage()
+    # If we didn't find the metadata, raise an error
+    raise ValueError(f"Could not find METADATA file in wheel: {url}")
diff --git a/src/fromager/dependencies.py b/src/fromager/dependencies.py
index bcc5932f..98764894 100644
--- a/src/fromager/dependencies.py
+++ b/src/fromager/dependencies.py
@@ -6,8 +6,8 @@
 import pathlib
 import tempfile
 import typing
+import zipfile
 
-import pkginfo
 import pyproject_hooks
 import tomlkit
 from packaging.metadata import Metadata
@@ -344,14 +344,23 @@ def default_get_install_dependencies_of_sdist(
     return set(metadata.requires_dist)
 
 
-def parse_metadata(metadata_file: pathlib.Path, *, validate: bool = True) -> Metadata:
-    """Parse a dist-info/METADATA file
+def parse_metadata(
+    metadata_source: pathlib.Path | bytes, *, validate: bool = True
+) -> Metadata:
+    """Parse metadata from a file path or bytes.
+
+    Args:
+        metadata_source: Path to METADATA file or bytes containing metadata
+        validate: Whether to validate metadata (default: True)
 
-    The default parse mode is 'strict'. It even fails for a mismatch of field
-    and core metadata version, e.g. a package with metadata 2.2 and
-    license-expression field (added in 2.4).
+    Returns:
+        Parsed Metadata object
     """
-    return Metadata.from_email(metadata_file.read_bytes(), validate=validate)
+    if isinstance(metadata_source, pathlib.Path):
+        metadata_bytes = metadata_source.read_bytes()
+    else:
+        metadata_bytes = metadata_source
+    return Metadata.from_email(metadata_bytes, validate=validate)
 
 
 def pep517_metadata_of_sdist(
@@ -418,9 +427,27 @@ def validate_dist_name_version(
 def get_install_dependencies_of_wheel(
     req: Requirement, wheel_filename: pathlib.Path, requirements_file_dir: pathlib.Path
 ) -> set[Requirement]:
+    """Get install dependencies from a wheel file.
+
+    Extracts and parses the METADATA file from the wheel to get the
+    Requires-Dist entries.
+
+    Args:
+        req: The requirement being processed
+        wheel_filename: Path to the wheel file
+        requirements_file_dir: Directory to write the requirements file
+
+    Returns:
+        Set of requirements from the wheel's metadata
+    """
     logger.info(f"getting installation dependencies from {wheel_filename}")
-    wheel = pkginfo.Wheel(str(wheel_filename))
-    deps = _filter_requirements(req, wheel.requires_dist)
+    # Disable validation because many third-party packages have metadata version
+    # mismatches (e.g., setuptools declares Metadata-Version: 2.2 but uses
+    # license-file which was introduced in 2.4). The old pkginfo library
+    # didn't validate this, so we maintain backward compatibility.
+    metadata = _get_metadata_from_wheel(wheel_filename, validate=False)
+    requires_dist = metadata.requires_dist or []
+    deps = _filter_requirements(req, requires_dist)
     _write_requirements_file(
         deps,
         requirements_file_dir / INSTALL_REQ_FILE_NAME,
@@ -428,6 +455,46 @@
     return deps
 
 
+def _get_metadata_from_wheel(
+    wheel_filename: pathlib.Path, *, validate: bool = True
+) -> Metadata:
+    """Extract and parse METADATA from a wheel file.
+
+    Args:
+        wheel_filename: Path to the wheel file
+        validate: Whether to validate metadata (default: True)
+
+    Returns:
+        Parsed Metadata object
+
+    Raises:
+        ValueError: If no METADATA file is found in the wheel
+    """
+    # Predict the dist-info directory name from the wheel filename
+    # Wheel format: {distribution}-{version}(-{build})?-{python}-{abi}-{platform}.whl
+    # Dist-info format: {distribution}-{version}.dist-info
+    # Note: We extract from the filename directly rather than using parse_wheel_filename
+    # because the dist-info directory uses the original (non-normalized) name
+    wheel_name_parts = wheel_filename.stem.split("-")
+    dist_name = wheel_name_parts[0]
+    dist_version = wheel_name_parts[1]
+    predicted_dist_info = f"{dist_name}-{dist_version}.dist-info/METADATA"
+
+    with zipfile.ZipFile(wheel_filename) as whl:
+        # Try predicted path first for efficiency
+        if predicted_dist_info in whl.namelist():
+            metadata_content = whl.read(predicted_dist_info)
+            return parse_metadata(metadata_content, validate=validate)
+
+        # Fallback to iterating if prediction fails (e.g., non-standard naming)
+        for entry in whl.namelist():
+            if entry.endswith(".dist-info/METADATA"):
+                metadata_content = whl.read(entry)
+                return parse_metadata(metadata_content, validate=validate)
+
+    raise ValueError(f"Could not find METADATA file in wheel: {wheel_filename}")
+
+
 def get_pyproject_contents(sdist_root_dir: pathlib.Path) -> dict[str, typing.Any]:
     pyproject_toml_filename = sdist_root_dir / "pyproject.toml"
     if not os.path.exists(pyproject_toml_filename):
diff --git a/tests/test_dependencies.py b/tests/test_dependencies.py
index cbc47485..e375471f 100644
--- a/tests/test_dependencies.py
+++ b/tests/test_dependencies.py
@@ -1,11 +1,14 @@
 import functools
 import itertools
+import os
 import pathlib
 import shutil
 import textwrap
 import typing
+import zipfile
 from unittest.mock import Mock, patch
 
+import hatchling.build
 import pytest
 from packaging.metadata import Metadata
 from packaging.requirements import Requirement
@@ -14,6 +17,77 @@
 
 from fromager import build_environment, context, dependencies
 
+
+def build_test_wheel(
+    tmp_path: pathlib.Path,
+    name: str,
+    version: str,
+    pkg_deps: list[str] | None = None,
+    optional_deps: dict[str, list[str]] | None = None,
+) -> pathlib.Path:
+    """Build a real wheel using hatchling for testing.
+
+    Args:
+        tmp_path: Temporary directory for building
+        name: Package name
+        version: Package version
+        pkg_deps: List of dependencies (e.g., ["requests>=2.0", "urllib3"])
+        optional_deps: Dict of extras (e.g., {"test": ["pytest"]})
+
+    Returns:
+        Path to the built wheel file
+    """
+    # Create package directory structure
+    pkg_dir = tmp_path / "pkg_source"
+    pkg_dir.mkdir()
+    src_dir = pkg_dir / "src" / name.replace("-", "_")
+    src_dir.mkdir(parents=True)
+    (src_dir / "__init__.py").write_text('"""Test package."""\n')
+
+    # Build pyproject.toml
+    deps_str = ""
+    if pkg_deps:
+        deps_list = ", ".join(f'"{d}"' for d in pkg_deps)
+        deps_str = f"dependencies = [{deps_list}]"
+
+    optional_deps_str = ""
+    if optional_deps:
+        optional_deps_str = "[project.optional-dependencies]\n"
+        for extra, extra_deps in optional_deps.items():
+            extra_deps_list = ", ".join(f'"{d}"' for d in extra_deps)
+            optional_deps_str += f"{extra} = [{extra_deps_list}]\n"
+
+    pyproject_content = textwrap.dedent(f"""\
+        [build-system]
+        requires = ["hatchling"]
+        build-backend = "hatchling.build"
+
+        [project]
+        name = "{name}"
+        version = "{version}"
+        {deps_str}
+
+        {optional_deps_str}
+
+        [tool.hatch.build.targets.wheel]
+        packages = ["src/{name.replace("-", "_")}"]
+        """)
+    (pkg_dir / "pyproject.toml").write_text(pyproject_content)
+
+    # Build the wheel using hatchling directly
+    wheel_dir = tmp_path / "wheels"
+    wheel_dir.mkdir()
+
+    original_cwd = pathlib.Path.cwd()
+    try:
+        os.chdir(pkg_dir)
+        wheel_filename: str = hatchling.build.build_wheel(str(wheel_dir))
+    finally:
+        os.chdir(original_cwd)
+
+    return wheel_dir / wheel_filename
+
+
 _fromager_root = pathlib.Path(__file__).parent.parent
 
 
@@ -308,3 +382,119 @@ def test_validate_dist_name_version(
     else:
         with pytest.raises(exc):
             validate()
+
+
+def test_get_install_dependencies_of_wheel(tmp_path: pathlib.Path) -> None:
+    """Test extracting install dependencies from a wheel file built with real tools."""
+    # Arrange: Build a real wheel with dependencies
+    wheel_file = build_test_wheel(
+        tmp_path,
+        name="test-pkg",
+        version="1.0.0",
+        pkg_deps=["requests>=2.0", "urllib3"],
+        optional_deps={"test": ["pytest"]},
+    )
+
+    req = Requirement("test-pkg")
+    output_dir = tmp_path / "output"
+    output_dir.mkdir()
+
+    # Act
+    result = dependencies.get_install_dependencies_of_wheel(req, wheel_file, output_dir)
+
+    # Assert: Should get requests and urllib3, but not pytest (extra not requested)
+    result_names = {r.name for r in result}
+    assert result_names == {"requests", "urllib3"}
+
+
+def test_get_install_dependencies_of_wheel_no_deps(tmp_path: pathlib.Path) -> None:
+    """Test extracting dependencies from a wheel with no dependencies."""
+    # Arrange: Build a real wheel without dependencies
+    wheel_file = build_test_wheel(
+        tmp_path,
+        name="simple-pkg",
+        version="1.0.0",
+    )
+
+    req = Requirement("simple-pkg")
+    output_dir = tmp_path / "output"
+    output_dir.mkdir()
+
+    # Act
+    result = dependencies.get_install_dependencies_of_wheel(req, wheel_file, output_dir)
+
+    # Assert
+    assert result == set()
+
+
+def test_get_metadata_from_wheel_missing_metadata(tmp_path: pathlib.Path) -> None:
+    """Test that missing METADATA file raises ValueError."""
+    # Arrange: Create a wheel file without METADATA
+    wheel_file = tmp_path / "broken_pkg-1.0.0-py3-none-any.whl"
+    with zipfile.ZipFile(wheel_file, "w") as zf:
+        zf.writestr("broken_pkg/__init__.py", "")
+
+    # Act & Assert
+    with pytest.raises(ValueError, match="Could not find METADATA file"):
+        dependencies._get_metadata_from_wheel(wheel_file)
+
+
+def test_get_metadata_from_wheel_with_build_tag(tmp_path: pathlib.Path) -> None:
+    """Test extracting metadata from a wheel with a build tag in the filename.
+
+    Wheel format: {distribution}-{version}-{build}-{python}-{abi}-{platform}.whl
+    The build tag is optional; when present it appears after the version, so the
+    predicted dist-info path is still derived from the name and version parts.
+    """
+    # Arrange: Create a wheel whose filename carries a build tag
+    # The wheel filename has a build tag (123) but dist-info uses standard naming
+    wheel_file = tmp_path / "mypkg-1.0.0-123-py3-none-any.whl"
+    metadata_content = textwrap.dedent("""\
+        Metadata-Version: 2.1
+        Name: mypkg
+        Version: 1.0.0
+        Requires-Dist: requests
+        """)
+    with zipfile.ZipFile(wheel_file, "w") as zf:
+        # dist-info directory doesn't include the build tag
+        zf.writestr("mypkg-1.0.0.dist-info/METADATA", metadata_content)
+        zf.writestr("mypkg-1.0.0.dist-info/WHEEL", "Wheel-Version: 1.0")
+        zf.writestr("mypkg/__init__.py", "")
+
+    # Act: the predicted path is still "mypkg-1.0.0.dist-info/METADATA" because
+    # the build tag comes after the name and version components in the filename
+    metadata = dependencies._get_metadata_from_wheel(wheel_file)
+
+    # Assert
+    assert metadata.name == "mypkg"
+    assert str(metadata.version) == "1.0.0"
+    assert metadata.requires_dist is not None
+    assert len(metadata.requires_dist) == 1
+    assert str(metadata.requires_dist[0]) == "requests"
+
+
+def test_get_metadata_from_wheel_validation_disabled(tmp_path: pathlib.Path) -> None:
+    """Test that validation can be disabled when parsing wheel metadata.
+
+    Some wheels may have metadata that doesn't strictly conform to PEP standards
+    but is still usable. The validate=False option allows parsing such metadata.
+    """
+    # Arrange: Create a wheel with slightly non-conformant metadata
+    # (missing Metadata-Version which is technically required)
+    wheel_file = tmp_path / "testpkg-1.0.0-py3-none-any.whl"
+    metadata_content = textwrap.dedent("""\
+        Name: testpkg
+        Version: 1.0.0
+        Requires-Dist: urllib3
+        """)
+    with zipfile.ZipFile(wheel_file, "w") as zf:
+        zf.writestr("testpkg-1.0.0.dist-info/METADATA", metadata_content)
+        zf.writestr("testpkg-1.0.0.dist-info/WHEEL", "Wheel-Version: 1.0")
+        zf.writestr("testpkg/__init__.py", "")
+
+    # Act: Parse with validation disabled
+    metadata = dependencies._get_metadata_from_wheel(wheel_file, validate=False)
+
+    # Assert: Should still parse the basic fields
+    assert metadata.name == "testpkg"
+    assert str(metadata.version) == "1.0.0"
diff --git a/tests/test_pep658_support.py b/tests/test_pep658_support.py
index 0bbaf77c..3af992ce 100644
--- a/tests/test_pep658_support.py
+++ b/tests/test_pep658_support.py
@@ -57,10 +57,11 @@ def test_get_metadata_with_pep658_success(self, mock_session: typing.Any) -> Non
         metadata = get_metadata_for_wheel(wheel_url, metadata_url)
 
         # Verify the metadata was parsed correctly
-        assert metadata["Name"] == "test-package"
-        assert metadata["Version"] == "1.0.0"
-        assert metadata["Summary"] == "A test package"
-        assert "requests >= 2.0.0" in metadata.get_all("Requires-Dist", [])
+        assert metadata.name == "test-package"
+        assert str(metadata.version) == "1.0.0"
+        assert metadata.summary == "A test package"
+        assert metadata.requires_dist is not None
+        assert any(str(req) == "requests>=2.0.0" for req in metadata.requires_dist)
 
         # Verify only the metadata URL was called, not the wheel URL
         mock_session.get.assert_called_once_with(metadata_url)
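
Illustrative sketch (not part of the diff): the packaging.metadata API that this
patch standardizes on can be exercised on its own. The example below assumes a
placeholder METADATA path; validate=False is the lenient mode the patch uses for
wheels whose declared Metadata-Version lags behind the fields they ship.

    import pathlib

    from packaging.metadata import Metadata

    # Read a METADATA file extracted from a wheel (placeholder path for illustration).
    raw = pathlib.Path("example-1.0.0.dist-info/METADATA").read_bytes()

    # validate=False tolerates field/metadata-version mismatches, matching the
    # lenient parsing used for wheels in get_install_dependencies_of_wheel().
    md = Metadata.from_email(raw, validate=False)

    print(md.name, md.version)
    for req in md.requires_dist or []:
        print("requires:", req)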