diff --git a/.github/ETHICS_QUESTIONNAIRE.MD b/.github/ETHICS_QUESTIONNAIRE.MD
index 57223a3..53337ba 100644
--- a/.github/ETHICS_QUESTIONNAIRE.MD
+++ b/.github/ETHICS_QUESTIONNAIRE.MD
@@ -1,36 +1,40 @@
-**Ethics & Regulatory Questionnaire**
-*This PR cannot be merged until this form is completed.*
-
-Please reply to this comment and answer all questions below (you can copy-paste and fill it).
-
-1. Does this change involve any of the following? (check all that apply)
-   - [ ] Training or fine-tuning of AI/ML models
-   - [ ] Inference/serving of AI/ML models in production
-   - [ ] Processing of personal data (PII, health, biometric, financial, children’s data, etc.)
-   - [ ] Dual-use or military-applicable technology
-   - [ ] Safety-critical systems (medical device, aviation, automotive, etc.)
-   - [ ] High-impact algorithmic decision-making (credit, hiring, criminal justice, etc.)
-   - [ ] None of the above (pure docs, tests, CI, formatting, etc.)
-
-2. Estimated risk level (your honest assessment)
-   - [ ] Low – no ethical or regulatory impact
-   - [ ] Medium – possible fairness/privacy concerns
-   - [ ] High – potential for serious harm or legal non-compliance
-
-3. Brief description of any ethical/regulatory impact (or write “None”)
-
+**Pull Request Ethics, Security & Compliance Checklist**
+*(Mandatory for all pull requests. This helps you develop professional habits required in industry and academia.)*
+
+1. Nature of the contribution (select all that apply)
+   - [ ] Purely non-functional changes (documentation, formatting, tests, CI/CD, refactoring without behavioral change)
+   - [ ] Introduction or modification of cryptographic functions or security-critical code
+   - [ ] Collection, storage, transmission, or processing of personal data (PII, health data, biometric data, location, etc.)
+   - [ ] Use or processing of data belonging to children under 13 (or reasonably likely to include such data)
+   - [ ] Implementation or modification of AI/ML models (training, fine-tuning, inference, prompt engineering)
+   - [ ] Code interacting with external networks, APIs, or third-party services
+
+2. Security & responsible practice assessment
+   - [ ] No security, privacy, or ethical implications identified
+   - [ ] Potential security or privacy implications present (e.g., input validation, error handling, data exposure)
+   - [ ] High-risk implications (potential for unauthorized access, data leakage, bias, or legal non-compliance)
+
+3. Relevant laws, regulations, and frameworks considered (list all that may apply; if none, refer to the D2L Ethics module)
+   Examples:
+   - Australian Privacy Act 1988 (Cth) & Privacy Amendment (Notifiable Data Breaches) Act 2017
+   - GDPR (EU), CCPA/CPRA (California), COPPA (US children’s privacy)
+   - University Human Research Ethics requirements (HREC/NHMRC)
+   - NIST Cybersecurity Framework / OWASP Top 10 / ASVS
+   - ISO/IEC 27001, NIST AI Risk Management Framework (AI RMF)
+   - Export controls (ITAR, EAR, Australian Defence Export Controls)
 >
-4. Relevant regulations / standards considered (e.g., EU AI Act, GDPR, HIPAA, NIST AI RMF, export controls, etc.)
-   List them or write “N/A”
-
+4. Security and responsible engineering measures implemented (select and describe)
+   - [ ] Yes — specify below (e.g., input sanitization, prepared statements, least-privilege access, data minimization, anonymization, secure defaults, dependency scanning, bias audit, consent mechanism, etc.)
+   - [ ] Partially implemented — further review recommended
+   - [ ] No — measures appear necessary
+   - [ ] Not applicable
 >
-5. Have mitigation measures been implemented (bias testing, data minimization, consent flows, etc.)?
-   - [ ] Yes → describe below
-   - [ ] No
-   - [ ] Not applicable
-
+5. Additional notes (optional)
+   Any references to secure coding guidelines followed (e.g., ISO27001, OWASP Secure Coding Practices, university security standards), vulnerability scans performed, or ethical review status.
 >
-Thank you! The ethics gate will evaluate your answers automatically.
\ No newline at end of file
+By submitting this pull request, I confirm I have considered the security, privacy, and ethical implications of my contribution in accordance with university policy and applicable legal and professional standards.
+
+Thank you for helping maintain a secure and responsible codebase.
\ No newline at end of file
diff --git a/.github/workflows/ethics-gate.yaml b/.github/workflows/ethics-gate.yaml
index 769d821..ca06dd9 100644
--- a/.github/workflows/ethics-gate.yaml
+++ b/.github/workflows/ethics-gate.yaml
@@ -1,18 +1,14 @@
 on:
   pull_request_target:
     types: [opened, reopened, synchronize]
-  issue_comment:
-    types: [created]
 
 permissions:
   contents: read        # needed for checkout
-  pull-requests: write  # needed for commenting & reviews (gh) when running in pull_request_target
-  checks: write         # needed to create check runs
+  pull-requests: write  # needed for posting comments
 
 jobs:
-  # Job that posts the questionnaire (runs in the trusted pull_request_target context).
   post-questionnaire:
-    if: github.event_name == 'pull_request_target' && github.event.pull_request.draft == false
+    if: github.event.pull_request.draft == false
     runs-on: ubuntu-latest
     steps:
       - name: Checkout base repo (safe; do NOT checkout PR head here)
@@ -47,103 +43,3 @@ jobs:
           fi
           gh pr comment ${{ github.event.pull_request.number }} --body-file .github/ETHICS_QUESTIONNAIRE.MD
           echo "Posted ethics questionnaire to PR #${{ github.event.pull_request.number }}."
-
-  # Ethics engine: collects comments, runs evaluation, posts a check, and requests changes for HIGH risk.
-  # This job runs in the trusted context for pull_request_target and also on issue_comment (untrusted).
-  # For untrusted issue_comment runs, write actions (requesting changes) may be skipped if permissions are restricted.
-  ethics-engine:
-    runs-on: ubuntu-latest
-    needs: post-questionnaire
-    steps:
-      - name: Checkout base repo (we run parser from base repo)
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.pull_request.base.sha || github.ref }}
-          fetch-depth: 0
-
-      - name: Authenticate gh CLI with GITHUB_TOKEN
-        run: |
-          echo "${{ secrets.GITHUB_TOKEN }}" | gh auth login --with-token
-
-      - name: Determine PR number
-        id: prnumber
-        run: |
-          # Determine PR number whether triggered by pull_request_target or issue_comment
-          PR_NUMBER=$(jq -r 'if .pull_request then .pull_request.number elif .issue then .issue.number else empty end' "$GITHUB_EVENT_PATH")
-          if [[ -z "$PR_NUMBER" ]]; then
-            echo "No PR number found in event payload; exiting."
- echo "risk=UNKNOWN" >> $GITHUB_OUTPUT - exit 0 - fi - echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT - - - name: Collect comments - id: collect - run: | - PR=${{ steps.prnumber.outputs.pr_number }} - # Gather all PR comments into a single string (robust to empty) - ANSWERS=$(gh pr view "$PR" --json comments --jq '[.comments[].body] | join("\n\n")' 2>/dev/null || true) - echo "$ANSWERS" > answers.txt - # Expose the answers (trim to avoid huge output) - echo "answers=$(echo "$ANSWERS" | head -c 32768 | sed -e 's/"/'"'"'"/g')" >> $GITHUB_OUTPUT - - - name: Run ethics parser & evaluator (safe runs code from base repo) - id: run_engine - env: - PR_NUMBER: ${{ steps.prnumber.outputs.pr_number }} - run: | - # Ensure parser exists - if [[ ! -f .github/workflows/parse_and_evaluate.py ]]; then - echo "Parser .github/workflows/parse_and_evaluate.py not found in base repo; aborting." - echo "RISK_LEVEL=UNKNOWN" > result.txt - else - python3 .github/workflows/parse_and_evaluate.py "$(cat answers.txt)" > result.txt || true - fi - cat result.txt - # Extract RISK_LEVEL=XYZ from result.txt if present - RISK=$(grep -m1 '^RISK_LEVEL=' result.txt | cut -d= -f2 || echo "LOW") - echo "risk=$RISK" >> $GITHUB_OUTPUT - - - name: Create/update "Ethics Review" check run - uses: actions/github-script@v7 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - script: | - const risk = "${{ steps.run_engine.outputs.risk }}".trim(); - const conclusions = { - "LOW": "success", - "MEDIUM": "action_required", - "HIGH": "failure" - }; - const conclusion = conclusions[risk] || "failure"; - const head_sha = (context.payload.pull_request && context.payload.pull_request.head && context.payload.pull_request.head.sha) || (context.payload.issue && context.payload.issue.pull_request && context.payload.issue.number ? undefined : undefined) || github.event.pull_request?.head?.sha; - await github.rest.checks.create({ - owner: context.repo.owner, - repo: context.repo.repo, - name: "Ethics Review", - head_sha: head_sha || context.sha, - status: "completed", - conclusion, - output: { - title: risk === "LOW" ? "Ethics cleared" : `Ethics review: ${risk}`, - summary: risk === "LOW" ? "Low risk – automatically approved" : `Risk level ${risk} – review required` - } - }); - - - name: Request changes on HIGH risk (trusted-only; skip on untrusted events) - if: steps.run_engine.outputs.risk == 'HIGH' - run: | - PR=${{ steps.prnumber.outputs.pr_number }} - # Only attempt to request changes when running in pull_request_target context (trusted). - if [[ "${GITHUB_EVENT_NAME}" != "pull_request_target" ]]; then - echo "Not in pull_request_target context; skipping request-changes (insufficient permissions for fork PRs)." - exit 0 - fi - # Request changes using gh (GITHUB_TOKEN from pull_request_target has write rights) - gh pr review "$PR" --request-changes -b "@ethics-team Required manual review for high-risk change" - echo "Requested changes on PR #$PR due to HIGH risk." - - - name: Final status message - run: | - echo "Ethics engine completed. Risk level: ${{ steps.run_engine.outputs.risk }}" - echo "Ethics engine completed. 
diff --git a/.github/workflows/redeengine.py b/.github/workflows/redeengine.py
deleted file mode 100644
index c9f5be2..0000000
--- a/.github/workflows/redeengine.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import os
-import json
-import sys
-
-def evaluate_risk(answers):
-    risk_score = 0
-    flags = []
-
-    if answers.get("involves_ai", False):
-        risk_score += 3
-        flags.append("AI/ML component")
-    if answers.get("processes_pii", False):
-        risk_score += 5
-        flags.append("Personal data")
-    if answers.get("dual_use", False):
-        risk_score += 10
-        flags.append("🚨 Dual-use technology")
-    if answers.get("safety_critical", False):
-        risk_score += 8
-        flags.append("Safety-critical")
-
-    if "purely documentation" in answers.get("safe_changes", []):
-        return "LOW", "No ethical concerns detected."
-
-    if risk_score >= 10:
-        return "HIGH", " | ".join(flags)
-    elif risk_score >= 5:
-        return "MEDIUM", " | ".join(flags)
-    else:
-        return "LOW", "Minor changes"
-
-# Parse comment or form submission here (simplified)
-# In real use, you'd parse the actual comment body
-answers = json.loads(sys.argv[1])  # passed from workflow
-level, reason = evaluate_risk(answers)
-
-print(f"RISK_LEVEL={level}")
-print(f"REASON={reason}")
\ No newline at end of file
diff --git a/README.md b/README.md
index 22fc73f..f099a5a 100644
Binary files a/README.md and b/README.md differ
diff --git a/asset-scanner/file_handler.py b/asset-scanner/file_handler.py
index abd571c..4fec9c5 100644
--- a/asset-scanner/file_handler.py
+++ b/asset-scanner/file_handler.py
@@ -4,12 +4,21 @@ from scan_media import extract_text_from_file
 
 
 def find_files(directory, exts=None):
-    exts = exts or []
+    """Find all matching files in a directory, optionally filtered by extensions."""
+    exts = exts or []  # List of extensions to filter files
     matches = []
-    for dirpath, _, filenames in os.walk(directory):
-        for fn in filenames:
-            if not exts or any(fn.lower().endswith(e) for e in exts):
-                matches.append(os.path.join(dirpath, fn))
+
+    try:
+        for dirpath, _, filenames in os.walk(directory):  # Traverse directory recursively
+            for fn in filenames:
+                # Skip files not matching the desired extensions
+                if not exts or any(fn.lower().endswith(e.lower()) for e in exts):
+                    matches.append(os.path.join(dirpath, fn))
+    except PermissionError:
+        print(f"[!] Permission denied for directory: {directory}. Skipping...")
Skipping...") + # Continue to the next folder + pass + return matches def read_file(path): diff --git a/asset-scanner/scan_report.json b/asset-scanner/scan_report.json deleted file mode 100644 index e69de29..0000000 diff --git a/asset-scanner/scanner.py b/asset-scanner/scanner.py index 04103cd..93cc4fc 100644 --- a/asset-scanner/scanner.py +++ b/asset-scanner/scanner.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - #!/usr/bin/env python3 """ scanner.py — Redback Ethics PII & Secrets Scanner (Presidio-powered) @@ -140,34 +138,55 @@ def scan_text(text: str, file_path: str, analyzer: AnalyzerEngine, patterns_meta return findings +def scan_folder_or_file(file=None, root=None, extensions=None): + """Scan either a single file or all relevant files in the folder.""" + if file: + # Single file scan + return [file] # Return as a single-element list for compatibility + else: + # Scan entire folder + return find_files(root, exts=extensions) + # file scanner -def scan_paths(paths: Iterable[str], analyzer: AnalyzerEngine, patterns_meta: Dict) -> List[Dict[str, Any]]: +def scan_paths(paths, analyzer, patterns_meta): + """Process and scan all provided files.""" all_findings = [] - for path in paths: - print(f"\n[i] Reading: {path}") - content = read_file(path) - if isinstance(content, bytes): - try: - content = content.decode("utf-8") - except: - content = content.decode("latin-1", errors="ignore") - if isinstance(content, str) and content.strip(): - print(f" → Extracted {len(content):,} characters") - all_findings.extend(scan_text(content, path, analyzer, patterns_meta)) - else: - print(" → No text extracted (image-only PDF?)") + + for path in paths: # Loop through `paths`, one file at a time + print(f"[i] Scanning file: {path}") # Log the file being scanned + try: + content = read_file(path) # Pass a single file to `read_file()` + if not content.strip(): # Skip empty or unsupported files + print(f"[i] Skipping unsupported or empty file: {path}") + continue + + # Scan file content + findings = scan_text(content, path, analyzer, patterns_meta) + all_findings.extend(findings) # Collect results + except Exception as e: + print(f"[!] 
+            continue  # Skip to the next file on error
+
     return all_findings
 
 # CLI & main
 def parse_args(argv=None):
-    # parse_args function
     ap = argparse.ArgumentParser(description="Sensitive data scanner")
-    ap.add_argument("--file", help="Single file to scan")
-    ap.add_argument("--root", default=".", help="Root directory")
-    ap.add_argument("--patterns", default=DEFAULT_PATTERNS_FILE)
-    ap.add_argument("--out", default=DEFAULT_OUT)
-    ap.add_argument("--ext", nargs="*", default=DEFAULT_TARGET_EXTS)
-    ap.add_argument("--no-console", action="store_true")
+    ap.add_argument(
+        "--file", nargs="*", help="One or more specific files to scan (space-separated list)"
+    )  # `nargs="*"` allows multiple files
+    ap.add_argument(
+        "--root", nargs="*", help="One or more directories for recursive scanning"
+    )
+    ap.add_argument(
+        "--patterns", default="patterns.json", help="Path to patterns.json"
+    )
+    ap.add_argument(
+        "--ext", nargs="*", default=[".txt", ".json"], help="File extensions to include (e.g., .txt .pdf)"
+    )
+    ap.add_argument(
+        "--out", default="scan_report.local.json", help="Output file for results"
+    )
     return ap.parse_args(argv or sys.argv[1:])
 
 def get_valid_path():
@@ -180,24 +199,39 @@
         print("Invalid path, try again.")
 
 def main():
-    ns = parse_args()
+    # Parse arguments from the CLI
+    ns = parse_args()  # Contains file, root, patterns, ext, and out args
+
+    # Load patterns and initialize the analyzer
     patterns_meta = load_patterns(ns.patterns)
     analyzer = get_analyzer()
 
-    if ns.file:
-        paths = [ns.file]
-        print(f"[i] Scanning single file: {ns.file}")
-    else:
-        directory = get_valid_path()
-        paths = list(find_files(directory, ns.ext))
-        print(f"[i] Found {len(paths)} files to scan in {directory}")
+    # Determine files to scan (using --file and --root)
+    paths = []  # Initialize an empty list to store all files
+    if ns.file:  # Add files passed using the --file argument
+        paths.extend(ns.file)  # ns.file is already a list of files
+
+    if ns.root:  # Add files from folders passed using --root
+        for folder in ns.root:
+            folder_files = find_files(folder, exts=ns.ext)  # Recursively find files
+            paths.extend(folder_files)
+
+    # Validate if any files were found
+    if not paths:
+        print("[!] No files found to scan. Please check your input.")
+        return 0
+
+    print(f"[i] Found {len(paths)} files to scan.")
 
+    # Scan the files
     findings = scan_paths(paths, analyzer, patterns_meta)
+    # Write the scan results to an output report file
     enriched = write_report(findings, out_path=ns.out)
     print(f"\n[i] Full report (with paths & raw PII) saved locally → {ns.out}")
     print(" This file is git-ignored and must NEVER be committed.")
 
+    # Handle scan results and risk evaluation
     if any(f.get("risk") == "High" for f in enriched):
         print("\n[!] HIGH-RISK PII DETECTED → SCAN FAILED")
         return 1
@@ -205,7 +239,7 @@
         print(f"\n[i] {len(findings)} findings → check {ns.out}")
     else:
         print("\n[Success] NO PII FOUND!")
-        return 0
+        return
 
 if __name__ == "__main__":
     raise SystemExit(main())
\ No newline at end of file
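
For reference, a possible invocation of the reworked scanner CLI above, run from inside asset-scanner/ (the file and directory names here are purely illustrative and not part of the repo):

    # Scan two explicit files plus two folders recursively, matching .txt and .pdf
    python3 scanner.py --file notes.txt summary.pdf --root ./docs ./exports --ext .txt .pdf --out scan_report.local.json

Both --file and --root accept multiple space-separated values and can be combined; extension matching is case-insensitive because find_files lowercases both the filename and each entry in exts.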