From cfdd0e6ce16bbe601ea0431abb3e6dae79e83fa2 Mon Sep 17 00:00:00 2001 From: Douglas Coburn Date: Thu, 11 Dec 2025 17:34:42 -0800 Subject: [PATCH 1/4] feat: add batched PURL endpoint calls and conditional license fetching - Add --max-purl-batch-size flag (default: 5000, range: 1-9999) to control batch size for license detail API calls - Skip PURL endpoint entirely when --generate-license is not set, improving performance for scans that don't need license attribution/details - Implement batching in get_license_text_via_purl() to process packages in configurable chunks, preventing API overload on large repos - Add validation for max_purl_batch_size parameter with clear error messages - Remove check_full_scans_status() and its call site in create_new_diff() (drops the readiness polling before diff generation) This change optimizes license data retrieval by: 1. Only calling PURL endpoint when license output is actually needed 2. Processing packages in manageable batches to avoid timeouts/limits 3. Providing tunable batch sizes for different repo sizes --- pyproject.toml | 2 +- socketsecurity/__init__.py | 2 +- socketsecurity/config.py | 14 ++++ socketsecurity/core/__init__.py | 126 ++++++++++++++------------------ 4 files changed, 70 insertions(+), 74 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9d22de8..22a656a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "hatchling.build" [project] name = "socketsecurity" -version = "2.2.51" +version = "2.2.52" requires-python = ">= 3.10" license = {"file" = "LICENSE"} dependencies = [ diff --git a/socketsecurity/__init__.py b/socketsecurity/__init__.py index afff87d..d75cde0 100644 --- a/socketsecurity/__init__.py +++ b/socketsecurity/__init__.py @@ -1,3 +1,3 @@ __author__ = 'socket.dev' -__version__ = '2.2.51' +__version__ = '2.2.52' USER_AGENT = f'SocketPythonCLI/{__version__}' diff --git a/socketsecurity/config.py b/socketsecurity/config.py index 28aa39b..c7af6df 100644 --- a/socketsecurity/config.py +++ b/socketsecurity/config.py 
@@ -78,6 +78,7 @@ class CliConfig: reach_additional_params: Optional[List[str]] = None only_facts_file: bool = False reach_use_only_pregenerated_sboms: bool = False + max_purl_batch_size: int = 5000 @classmethod def from_args(cls, args_list: Optional[List[str]] = None) -> 'CliConfig': @@ -141,6 +142,7 @@ def from_args(cls, args_list: Optional[List[str]] = None) -> 'CliConfig': 'reach_additional_params': args.reach_additional_params, 'only_facts_file': args.only_facts_file, 'reach_use_only_pregenerated_sboms': args.reach_use_only_pregenerated_sboms, + 'max_purl_batch_size': args.max_purl_batch_size, 'version': __version__ } try: @@ -187,6 +189,11 @@ def from_args(cls, args_list: Optional[List[str]] = None) -> 'CliConfig': logging.error("--reach-concurrency must be >= 1") exit(1) + # Validate max_purl_batch_size is within allowed range + if args.max_purl_batch_size < 1 or args.max_purl_batch_size > 9999: + logging.error("--max-purl-batch-size must be between 1 and 9999") + exit(1) + return cls(**config_args) def to_dict(self) -> dict: @@ -446,6 +453,13 @@ def create_argument_parser() -> argparse.ArgumentParser: action="store_true", help="Exclude license details from the diff report (boosts performance for large repos)" ) + output_group.add_argument( + "--max-purl-batch-size", + dest="max_purl_batch_size", + type=int, + default=5000, + help="Maximum batch size for PURL endpoint calls when generating license info (default: 5000, min: 1, max: 9999)" + ) output_group.add_argument( "--disable-security-issue", diff --git a/socketsecurity/core/__init__.py b/socketsecurity/core/__init__.py index 2a2987c..a76a794 100644 --- a/socketsecurity/core/__init__.py +++ b/socketsecurity/core/__init__.py @@ -659,54 +659,6 @@ def create_full_scan_with_report_url( # Return result in the format expected by the user return diff - def check_full_scans_status(self, head_full_scan_id: str, new_full_scan_id: str) -> bool: - is_ready = False - current_timeout = self.config.timeout - 
self.sdk.set_timeout(0.5) - try: - self.sdk.fullscans.stream(self.config.org_slug, head_full_scan_id) - except Exception: - log.debug(f"Queued up full scan for processing ({head_full_scan_id})") - - try: - self.sdk.fullscans.stream(self.config.org_slug, new_full_scan_id) - except Exception: - log.debug(f"Queued up full scan for processing ({new_full_scan_id})") - self.sdk.set_timeout(current_timeout) - start_check = time.time() - head_is_ready = False - new_is_ready = False - while not is_ready: - head_full_scan_metadata = self.sdk.fullscans.metadata(self.config.org_slug, head_full_scan_id) - if head_full_scan_metadata: - head_state = head_full_scan_metadata.get("scan_state") - else: - head_state = None - new_full_scan_metadata = self.sdk.fullscans.metadata(self.config.org_slug, new_full_scan_id) - if new_full_scan_metadata: - new_state = new_full_scan_metadata.get("scan_state") - else: - new_state = None - if head_state and head_state == "resolve": - head_is_ready = True - if new_state and new_state == "resolve": - new_is_ready = True - if head_is_ready and new_is_ready: - is_ready = True - current_time = time.time() - if current_time - start_check >= self.config.timeout: - log.debug( - f"Timeout reached while waiting for full scans to be ready " - f"({head_full_scan_id}, {new_full_scan_id})" - ) - break - total_time = time.time() - start_check - if is_ready: - log.info(f"Full scans are ready in {total_time:.2f} seconds") - else: - log.warning(f"Full scans are not ready yet ({head_full_scan_id}, {new_full_scan_id})") - return is_ready - def get_full_scan(self, full_scan_id: str) -> FullScan: """ Get a FullScan object for an existing full scan including sbom_artifacts and packages. 
@@ -846,28 +798,54 @@ def update_package_values(pkg: Package) -> Package: pkg.url += f"/{pkg.name}/overview/{pkg.version}" return pkg - def get_license_text_via_purl(self, packages: dict[str, Package]) -> dict: - components = [] + def get_license_text_via_purl(self, packages: dict[str, Package], batch_size: int = 5000) -> dict: + """Get license attribution and details via PURL endpoint in batches. + + Args: + packages: Dictionary of packages to get license info for + batch_size: Maximum number of packages to process per API call (1-9999) + + Returns: + Updated packages dictionary with licenseAttrib and licenseDetails populated + """ + # Validate batch size + batch_size = max(1, min(9999, batch_size)) + + # Build list of all components + all_components = [] for purl in packages: full_purl = f"pkg:/{purl}" - components.append({"purl": full_purl}) - results = self.sdk.purl.post( - license=True, - components=components, - licenseattrib=True, - licensedetails=True - ) - purl_packages = [] - for result in results: - ecosystem = result["type"] - name = result["name"] - package_version = result["version"] - licenseDetails = result.get("licenseDetails") - licenseAttrib = result.get("licenseAttrib") - purl = f"{ecosystem}/{name}@{package_version}" - if purl not in purl_packages and purl in packages: - packages[purl].licenseAttrib = licenseAttrib - packages[purl].licenseDetails = licenseDetails + all_components.append({"purl": full_purl}) + + # Process in batches + total_components = len(all_components) + log.debug(f"Processing {total_components} packages in batches of {batch_size}") + + for i in range(0, total_components, batch_size): + batch_components = all_components[i:i + batch_size] + batch_num = (i // batch_size) + 1 + total_batches = (total_components + batch_size - 1) // batch_size + log.debug(f"Processing batch {batch_num}/{total_batches} ({len(batch_components)} packages)") + + results = self.sdk.purl.post( + license=True, + components=batch_components, + 
licenseattrib=True, + licensedetails=True + ) + + purl_packages = [] + for result in results: + ecosystem = result["type"] + name = result["name"] + package_version = result["version"] + licenseDetails = result.get("licenseDetails") + licenseAttrib = result.get("licenseAttrib") + purl = f"{ecosystem}/{name}@{package_version}" + if purl not in purl_packages and purl in packages: + packages[purl].licenseAttrib = licenseAttrib + packages[purl].licenseDetails = licenseDetails + return packages def get_added_and_removed_packages( @@ -960,7 +938,14 @@ def get_added_and_removed_packages( log.error(f"Artifact details - name: {artifact.name}, version: {artifact.version}") log.error("No matching packages found in head_full_scan") - packages = self.get_license_text_via_purl(packages) + # Only fetch license details if generate_license is enabled + if self.cli_config and self.cli_config.generate_license: + log.debug("Fetching license details via PURL endpoint") + batch_size = self.cli_config.max_purl_batch_size if self.cli_config else 5000 + packages = self.get_license_text_via_purl(packages, batch_size=batch_size) + else: + log.debug("Skipping PURL endpoint call (--generate-license not set)") + return added_packages, removed_packages, packages def create_new_diff( @@ -1092,9 +1077,6 @@ def create_new_diff( log.warning(f"Failed to clean up temporary file {temp_file}: {e}") # Handle diff generation - now we always have both scans - scans_ready = self.check_full_scans_status(head_full_scan_id, new_full_scan.id) - if scans_ready is False: - log.error(f"Full scans did not complete within {self.config.timeout} seconds") ( added_packages, removed_packages, From 8a079947f93375e2753eafaf22a16a1457869164 Mon Sep 17 00:00:00 2001 From: Douglas Coburn Date: Thu, 11 Dec 2025 17:41:31 -0800 Subject: [PATCH 2/4] Fixing --ignore-commit-files to properly work again --- pyproject.toml | 2 +- socketsecurity/__init__.py | 2 +- socketsecurity/socketcli.py | 37 +++++++++++++++++++++++-------------- 
3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 22a656a..8d14ede 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "hatchling.build" [project] name = "socketsecurity" -version = "2.2.52" +version = "2.2.53" requires-python = ">= 3.10" license = {"file" = "LICENSE"} dependencies = [ diff --git a/socketsecurity/__init__.py b/socketsecurity/__init__.py index d75cde0..ef7533a 100644 --- a/socketsecurity/__init__.py +++ b/socketsecurity/__init__.py @@ -1,3 +1,3 @@ __author__ = 'socket.dev' -__version__ = '2.2.52' +__version__ = '2.2.53' USER_AGENT = f'SocketPythonCLI/{__version__}' diff --git a/socketsecurity/socketcli.py b/socketsecurity/socketcli.py index 00b64a7..e62d770 100644 --- a/socketsecurity/socketcli.py +++ b/socketsecurity/socketcli.py @@ -356,6 +356,7 @@ def main_code(): # Determine files to check based on the new logic files_to_check = [] force_api_mode = False + force_diff_mode = False if files_explicitly_specified: # Case 2: Files are specified - use them and don't check commit details @@ -365,10 +366,20 @@ def main_code(): # Case 1: Files not specified and --ignore-commit-files not set - try to find changed files from commit files_to_check = git_repo.changed_files log.debug(f"Using changed files from commit: {files_to_check}") + elif config.ignore_commit_files and is_repo: + # Case 3: Git repo with --ignore-commit-files - force diff mode + files_to_check = [] + force_diff_mode = True + log.debug("Git repo with --ignore-commit-files: forcing diff mode") else: - # ignore_commit_files is set or not a repo - scan everything but force API mode if no supported files + # Case 4: Not a git repo (ignore_commit_files was auto-set to True) files_to_check = [] - log.debug("No files to check from commit (ignore_commit_files=True or not a repo)") + # If --enable-diff is set, force diff mode for non-git repos + if config.enable_diff: + force_diff_mode = True + log.debug("Non-git repo 
with --enable-diff: forcing diff mode") + else: + log.debug("Non-git repo without --enable-diff: will use full scan mode") # Check if we have supported manifest files has_supported_files = files_to_check and core.has_manifest_files(files_to_check) @@ -389,22 +400,19 @@ def main_code(): has_supported_files = False # Case 3: If no supported files or files are empty, force API mode (no PR comments) - if not has_supported_files: + # BUT: Don't force API mode if we're in force_diff_mode + if not has_supported_files and not force_diff_mode: force_api_mode = True log.debug("No supported manifest files found, forcing API mode") # Determine scan behavior should_skip_scan = False # Always perform scan, but behavior changes based on supported files - if config.ignore_commit_files and not files_explicitly_specified: - # Force full scan when ignoring commit files and no explicit files - should_skip_scan = False - log.debug("Forcing full scan due to ignore_commit_files") - elif not has_supported_files: - # No supported files - still scan but in API mode + if not has_supported_files and not force_diff_mode: + # No supported files and not forcing diff - still scan but in API mode should_skip_scan = False log.debug("No supported files but will scan in API mode") else: - log.debug("Found supported manifest files, proceeding with normal scan") + log.debug("Found supported manifest files or forcing diff mode, proceeding with normal scan") org_slug = core.config.org_slug if config.repo_is_public: @@ -531,14 +539,15 @@ def main_code(): output_handler.handle_output(diff) - elif config.enable_diff and not force_api_mode: - # New logic: --enable-diff forces diff mode even with --integration api (no SCM) + elif (config.enable_diff or force_diff_mode) and not force_api_mode: + # New logic: --enable-diff or force_diff_mode (from --ignore-commit-files in git repos) forces diff mode log.info("Diff mode enabled without SCM integration") diff = core.create_new_diff(scan_paths, params, 
no_change=should_skip_scan, save_files_list_path=config.save_submitted_files_list, save_manifest_tar_path=config.save_manifest_tar, base_paths=base_paths, explicit_files=sbom_files_to_submit) output_handler.handle_output(diff) - elif config.enable_diff and force_api_mode: - # User requested diff mode but no manifest files were detected + elif (config.enable_diff or force_diff_mode) and force_api_mode: + # User requested diff mode but no manifest files were detected - this should not happen with new logic + # but keeping as a safety net log.warning("--enable-diff was specified but no supported manifest files were detected in the changed files. Falling back to full scan mode.") log.info("Creating Socket Report (full scan)") serializable_params = { From c81618a6471787d88194538cee2aa4bd8ea68367 Mon Sep 17 00:00:00 2001 From: Douglas Coburn Date: Thu, 11 Dec 2025 17:52:18 -0800 Subject: [PATCH 3/4] Properly pass the enable_diff param through to the main module --- pyproject.toml | 2 +- socketsecurity/__init__.py | 2 +- socketsecurity/config.py | 1 + socketsecurity/socketcli.py | 4 ++++ 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8d14ede..58d3994 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "hatchling.build" [project] name = "socketsecurity" -version = "2.2.53" +version = "2.2.54" requires-python = ">= 3.10" license = {"file" = "LICENSE"} dependencies = [ diff --git a/socketsecurity/__init__.py b/socketsecurity/__init__.py index ef7533a..805b432 100644 --- a/socketsecurity/__init__.py +++ b/socketsecurity/__init__.py @@ -1,3 +1,3 @@ __author__ = 'socket.dev' -__version__ = '2.2.53' +__version__ = '2.2.54' USER_AGENT = f'SocketPythonCLI/{__version__}' diff --git a/socketsecurity/config.py b/socketsecurity/config.py index c7af6df..8ebda71 100644 --- a/socketsecurity/config.py +++ b/socketsecurity/config.py @@ -107,6 +107,7 @@ def from_args(cls, args_list: Optional[List[str]] = None) -> 
'CliConfig': 'commit_sha': args.commit_sha, 'generate_license': args.generate_license, 'enable_debug': args.enable_debug, + 'enable_diff': args.enable_diff, 'allow_unverified': args.allow_unverified, 'enable_json': args.enable_json, 'enable_sarif': args.enable_sarif, diff --git a/socketsecurity/socketcli.py b/socketsecurity/socketcli.py index e62d770..644a967 100644 --- a/socketsecurity/socketcli.py +++ b/socketsecurity/socketcli.py @@ -375,6 +375,7 @@ def main_code(): # Case 4: Not a git repo (ignore_commit_files was auto-set to True) files_to_check = [] # If --enable-diff is set, force diff mode for non-git repos + log.debug(f"Case 4: Non-git repo - config.enable_diff={config.enable_diff}, type={type(config.enable_diff)}") if config.enable_diff: force_diff_mode = True log.debug("Non-git repo with --enable-diff: forcing diff mode") @@ -401,9 +402,11 @@ def main_code(): # Case 3: If no supported files or files are empty, force API mode (no PR comments) # BUT: Don't force API mode if we're in force_diff_mode + log.debug(f"files_to_check={files_to_check}, has_supported_files={has_supported_files}, force_diff_mode={force_diff_mode}, config.enable_diff={config.enable_diff}") if not has_supported_files and not force_diff_mode: force_api_mode = True log.debug("No supported manifest files found, forcing API mode") + log.debug(f"force_api_mode={force_api_mode}") # Determine scan behavior should_skip_scan = False # Always perform scan, but behavior changes based on supported files @@ -465,6 +468,7 @@ def main_code(): diff.report_url = "" # Handle SCM-specific flows + log.debug(f"Flow decision: scm={scm is not None}, force_diff_mode={force_diff_mode}, force_api_mode={force_api_mode}, enable_diff={config.enable_diff}") if scm is not None and scm.check_event_type() == "comment": # FIXME: This entire flow should be a separate command called "filter_ignored_alerts_in_comments" # It's not related to scanning or diff generation - it just: From 
93ef52cdba52e6715b3f8e52732c64750335376b Mon Sep 17 00:00:00 2001 From: Douglas Coburn Date: Thu, 11 Dec 2025 17:56:35 -0800 Subject: [PATCH 4/4] Adding NPM CLI to Dockerfile --- Dockerfile | 2 +- pyproject.toml | 2 +- socketsecurity/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index a5ef959..4f31e10 100644 --- a/Dockerfile +++ b/Dockerfile @@ -57,7 +57,7 @@ RUN if [ "$DOTNET_VERSION" = "6" ]; then \ fi # Install additional tools -RUN npm install @coana-tech/cli -g && \ +RUN npm install @coana-tech/cli socket -g && \ gem install bundler && \ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \ . ~/.cargo/env && \ diff --git a/pyproject.toml b/pyproject.toml index 58d3994..12fa227 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "hatchling.build" [project] name = "socketsecurity" -version = "2.2.54" +version = "2.2.55" requires-python = ">= 3.10" license = {"file" = "LICENSE"} dependencies = [ diff --git a/socketsecurity/__init__.py b/socketsecurity/__init__.py index 805b432..0feb3be 100644 --- a/socketsecurity/__init__.py +++ b/socketsecurity/__init__.py @@ -1,3 +1,3 @@ __author__ = 'socket.dev' -__version__ = '2.2.54' +__version__ = '2.2.55' USER_AGENT = f'SocketPythonCLI/{__version__}'