diff --git a/.env.example b/.env.example index 87f4062..ebc60ba 100644 --- a/.env.example +++ b/.env.example @@ -8,3 +8,5 @@ NEO4J_PASSWORD=password VITE_API_URL=http://127.0.0.1:5001 DATABASE_URL=postgresql://spectragraph:spectragraph@localhost:5433/spectragraph REDIS_URL=redis://redis:6379/0 +# OSINT Transform Secrets +GITHUB_TOKEN=ghp_your_personal_access_token_here diff --git a/docs/transforms/github_repo.md b/docs/transforms/github_repo.md new file mode 100644 index 0000000..b56bb80 --- /dev/null +++ b/docs/transforms/github_repo.md @@ -0,0 +1,65 @@ +# GitHub Repository Transform + +Enriches domains with GitHub organization and repository intelligence. + +## Overview + +The `github_repo` transform discovers GitHub organizations associated with a target domain and retrieves detailed information about their repositories and contributors. This transform is essential for understanding the open-source footprint and development activity of an organization. + +**Category**: `social` +**Input**: Domain string +**Output**: List of objects containing organization, repository, and contributor data. + +## Capabilities + +- **Organization Discovery**: Searches for organizations using domain, email, and name matching. +- **Repository Metadata**: Retrieves details like stars, forks, language, and activity for each repository. +- **Contributor Analysis**: Identifies top contributors for the most popular repositories. +- **Filtering**: Supports filtering repositories by star count, language, and fork status. + +## Configuration Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `domain` | string | (Input) | The target domain to investigate. | +| `include_forks` | boolean | `False` | Whether to include forked repositories in the results. | +| `max_repos` | integer | `50` | Maximum number of repositories to return per organization. | +| `min_stars` | integer | `0` | Minimum number of stars a repository must have to be included. | +| `language_filter` | string | `None` | Filter repositories by programming language (e.g., "Python"). | +| `github_token` | vaultSecret | (Required) | GitHub Personal Access Token (`public_repo` scope). | + +## Usage Example + +```python +from spectragraph_transforms import GitHubRepoTransform + +transform = GitHubRepoTransform() + +# Configure parameters (token should be in vault) +transform.params = { + "max_repos": 10, + "min_stars": 50, + "language_filter": "Python" +} + +# Run scan +results = await transform.scan(["anthropic.com"]) +``` + +## Setup + +1. Generate a GitHub Personal Access Token (Classic) with `public_repo` scope. +2. Add the token to your SpectraGraph vault: + ```bash + spectragraph vault set github_token ghp_YOUR_TOKEN + ``` +3. Or ensure `GITHUB_TOKEN` is available if running manually. + +## Output Structure + +The transform returns a list of result objects, each containing: +- `domain`: Input domain +- `organizations`: List of matching GitHub organizations +- `repositories`: List of repositories meeting criteria +- `contributors`: List of top contributors +- `metadata`: Execution metadata (rate limits, etc.) diff --git a/spectragraph-api/app/main.py b/spectragraph-api/app/main.py index ab01c80..7d178c8 100644 --- a/spectragraph-api/app/main.py +++ b/spectragraph-api/app/main.py @@ -76,7 +76,8 @@ async def health_db(): latency = (time.time() - start_time) * 1000 return {"status": "ok", "latency_ms": round(latency, 2)} except Exception as exc: - return {"status": "unhealthy", "error": str(exc)} + logging.getLogger(__name__).error("Database health check failed: %s", exc) + return {"status": "unhealthy", "error": "Database unavailable"} app.include_router(auth.router, prefix="/api/auth", tags=["auth"]) diff --git a/spectragraph-core/src/spectragraph_core/core/registry.py b/spectragraph-core/src/spectragraph_core/core/registry.py index 6a711ce..99762be 100644 --- a/spectragraph-core/src/spectragraph_core/core/registry.py +++ b/spectragraph-core/src/spectragraph_core/core/registry.py @@ -2,61 +2,151 @@ from typing import Dict, Optional, Type, List, Any from spectragraph_core.core.transform_base import Transform -# Domain-related transforms -from spectragraph_transforms.domain.to_subdomains import SubdomainTransform -from spectragraph_transforms.domain.to_whois import WhoisTransform -from spectragraph_transforms.domain.to_ip import ResolveTransform -from spectragraph_transforms.domain.to_website import DomainToWebsiteTransform -from spectragraph_transforms.domain.to_root_domain import DomainToRootDomain -from spectragraph_transforms.domain.to_asn import DomainToAsnTransform -from spectragraph_transforms.domain.to_history import DomainToHistoryTransform +# Domain-related transforms (optional imports - keep startup robust) +try: + from spectragraph_transforms.domain.to_subdomains import SubdomainTransform +except Exception: + SubdomainTransform = None +try: + from spectragraph_transforms.domain.to_whois import WhoisTransform +except Exception: + WhoisTransform = None +try: + from spectragraph_transforms.domain.to_ip import ResolveTransform +except Exception: + ResolveTransform = None +try: + from spectragraph_transforms.domain.to_website import DomainToWebsiteTransform +except Exception: + DomainToWebsiteTransform = None +try: + from spectragraph_transforms.domain.to_root_domain import DomainToRootDomain +except Exception: + DomainToRootDomain = None +try: + from spectragraph_transforms.domain.to_asn import DomainToAsnTransform +except Exception: + DomainToAsnTransform = None +try: + from spectragraph_transforms.domain.to_history import DomainToHistoryTransform +except Exception: + DomainToHistoryTransform = None # IP-related transforms -from spectragraph_transforms.email.to_domains import EmailToDomainsTransform -from spectragraph_transforms.individual.to_domains import IndividualToDomainsTransform -from spectragraph_transforms.ip.to_domain import ReverseResolveTransform -from spectragraph_transforms.ip.to_infos import IpToInfosTransform -from spectragraph_transforms.ip.to_asn import IpToAsnTransform +try: + from spectragraph_transforms.email.to_domains import EmailToDomainsTransform +except Exception: + EmailToDomainsTransform = None +try: + from spectragraph_transforms.individual.to_domains import IndividualToDomainsTransform +except Exception: + IndividualToDomainsTransform = None +try: + from spectragraph_transforms.ip.to_domain import ReverseResolveTransform +except Exception: + ReverseResolveTransform = None +try: + from spectragraph_transforms.ip.to_infos import IpToInfosTransform +except Exception: + IpToInfosTransform = None +try: + from spectragraph_transforms.ip.to_asn import IpToAsnTransform +except Exception: + IpToAsnTransform = None # ASN-related transforms -from spectragraph_transforms.asn.to_cidrs import AsnToCidrsTransform +try: + from spectragraph_transforms.asn.to_cidrs import AsnToCidrsTransform +except Exception: + AsnToCidrsTransform = None # CIDR-related transforms -from spectragraph_transforms.cidr.to_ips import CidrToIpsTransform +try: + from spectragraph_transforms.cidr.to_ips import CidrToIpsTransform +except Exception: + CidrToIpsTransform = None # Social media transforms -from spectragraph_transforms.organization.to_domains import OrgToDomainsTransform -from spectragraph_transforms.social.to_maigret import MaigretTransform +try: + from spectragraph_transforms.organization.to_domains import OrgToDomainsTransform +except Exception: + OrgToDomainsTransform = None +try: + from spectragraph_transforms.social.to_maigret import MaigretTransform +except Exception: + MaigretTransform = None # Organization-related transforms -from spectragraph_transforms.organization.to_asn import OrgToAsnTransform -from spectragraph_transforms.organization.to_infos import OrgToInfosTransform +try: + from spectragraph_transforms.organization.to_asn import OrgToAsnTransform +except Exception: + OrgToAsnTransform = None +try: + from spectragraph_transforms.organization.to_infos import OrgToInfosTransform +except Exception: + OrgToInfosTransform = None # Cryptocurrency transforms -from spectragraph_transforms.crypto.to_transactions import ( - CryptoWalletAddressToTransactions, -) -from spectragraph_transforms.crypto.to_nfts import CryptoWalletAddressToNFTs +try: + from spectragraph_transforms.crypto.to_transactions import ( + CryptoWalletAddressToTransactions, + ) +except Exception: + CryptoWalletAddressToTransactions = None +try: + from spectragraph_transforms.crypto.to_nfts import CryptoWalletAddressToNFTs +except Exception: + CryptoWalletAddressToNFTs = None # Website-related transforms -from spectragraph_transforms.website.to_crawler import WebsiteToCrawler -from spectragraph_transforms.website.to_links import WebsiteToLinks -from spectragraph_transforms.website.to_domain import WebsiteToDomainTransform -from spectragraph_transforms.website.to_text import WebsiteToText -from spectragraph_transforms.website.to_webtrackers import WebsiteToWebtrackersTransform +try: + from spectragraph_transforms.website.to_crawler import WebsiteToCrawler +except Exception: + WebsiteToCrawler = None +try: + from spectragraph_transforms.website.to_links import WebsiteToLinks +except Exception: + WebsiteToLinks = None +try: + from spectragraph_transforms.website.to_domain import WebsiteToDomainTransform +except Exception: + WebsiteToDomainTransform = None +try: + from spectragraph_transforms.website.to_text import WebsiteToText +except Exception: + WebsiteToText = None +try: + from spectragraph_transforms.website.to_webtrackers import WebsiteToWebtrackersTransform +except Exception: + WebsiteToWebtrackersTransform = None # Email-related transforms -from spectragraph_transforms.email.to_gravatar import EmailToGravatarTransform -from spectragraph_transforms.email.to_leaks import EmailToBreachesTransform +try: + from spectragraph_transforms.email.to_gravatar import EmailToGravatarTransform +except Exception: + EmailToGravatarTransform = None +try: + from spectragraph_transforms.email.to_leaks import EmailToBreachesTransform +except Exception: + EmailToBreachesTransform = None # Phone-related transforms -from spectragraph_transforms.phone.to_leaks import PhoneToBreachesTransform +try: + from spectragraph_transforms.phone.to_leaks import PhoneToBreachesTransform +except Exception: + PhoneToBreachesTransform = None # Individual-related transforms -from spectragraph_transforms.individual.to_org import IndividualToOrgTransform +try: + from spectragraph_transforms.individual.to_org import IndividualToOrgTransform +except Exception: + IndividualToOrgTransform = None # Integration transforms -from spectragraph_transforms.n8n.connector import N8nConnector +try: + from spectragraph_transforms.n8n.connector import N8nConnector +except Exception: + N8nConnector = None class TransformRegistry: @@ -148,54 +238,84 @@ def list_by_input_type( # Register all transforms # Domain-related transforms -TransformRegistry.register(ReverseResolveTransform) -TransformRegistry.register(ResolveTransform) -TransformRegistry.register(SubdomainTransform) -TransformRegistry.register(WhoisTransform) -TransformRegistry.register(DomainToWebsiteTransform) -TransformRegistry.register(DomainToRootDomain) -TransformRegistry.register(DomainToAsnTransform) -TransformRegistry.register(DomainToHistoryTransform) +if ReverseResolveTransform is not None: + TransformRegistry.register(ReverseResolveTransform) +if ResolveTransform is not None: + TransformRegistry.register(ResolveTransform) +if SubdomainTransform is not None: + TransformRegistry.register(SubdomainTransform) +if WhoisTransform is not None: + TransformRegistry.register(WhoisTransform) +if DomainToWebsiteTransform is not None: + TransformRegistry.register(DomainToWebsiteTransform) +if DomainToRootDomain is not None: + TransformRegistry.register(DomainToRootDomain) +if DomainToAsnTransform is not None: + TransformRegistry.register(DomainToAsnTransform) +if DomainToHistoryTransform is not None: + TransformRegistry.register(DomainToHistoryTransform) # IP-related transforms -TransformRegistry.register(IpToInfosTransform) -TransformRegistry.register(IpToAsnTransform) +if IpToInfosTransform is not None: + TransformRegistry.register(IpToInfosTransform) +if IpToAsnTransform is not None: + TransformRegistry.register(IpToAsnTransform) # ASN-related transforms -TransformRegistry.register(AsnToCidrsTransform) +if AsnToCidrsTransform is not None: + TransformRegistry.register(AsnToCidrsTransform) # CIDR-related transforms -TransformRegistry.register(CidrToIpsTransform) +if CidrToIpsTransform is not None: + TransformRegistry.register(CidrToIpsTransform) # Social media transforms -TransformRegistry.register(MaigretTransform) +if MaigretTransform is not None: + TransformRegistry.register(MaigretTransform) # Organization-related transforms -TransformRegistry.register(OrgToAsnTransform) -TransformRegistry.register(OrgToInfosTransform) -TransformRegistry.register(OrgToDomainsTransform) +if OrgToAsnTransform is not None: + TransformRegistry.register(OrgToAsnTransform) +if OrgToInfosTransform is not None: + TransformRegistry.register(OrgToInfosTransform) +if OrgToDomainsTransform is not None: + TransformRegistry.register(OrgToDomainsTransform) # Cryptocurrency transforms -TransformRegistry.register(CryptoWalletAddressToTransactions) -TransformRegistry.register(CryptoWalletAddressToNFTs) +if CryptoWalletAddressToTransactions is not None: + TransformRegistry.register(CryptoWalletAddressToTransactions) +if CryptoWalletAddressToNFTs is not None: + TransformRegistry.register(CryptoWalletAddressToNFTs) # Website-related transforms -TransformRegistry.register(WebsiteToCrawler) -TransformRegistry.register(WebsiteToLinks) -TransformRegistry.register(WebsiteToDomainTransform) -TransformRegistry.register(WebsiteToWebtrackersTransform) -TransformRegistry.register(WebsiteToText) +if WebsiteToCrawler is not None: + TransformRegistry.register(WebsiteToCrawler) +if WebsiteToLinks is not None: + TransformRegistry.register(WebsiteToLinks) +if WebsiteToDomainTransform is not None: + TransformRegistry.register(WebsiteToDomainTransform) +if WebsiteToWebtrackersTransform is not None: + TransformRegistry.register(WebsiteToWebtrackersTransform) +if WebsiteToText is not None: + TransformRegistry.register(WebsiteToText) # Email-related transforms -TransformRegistry.register(EmailToGravatarTransform) -TransformRegistry.register(EmailToBreachesTransform) -TransformRegistry.register(EmailToDomainsTransform) +if EmailToGravatarTransform is not None: + TransformRegistry.register(EmailToGravatarTransform) +if EmailToBreachesTransform is not None: + TransformRegistry.register(EmailToBreachesTransform) +if EmailToDomainsTransform is not None: + TransformRegistry.register(EmailToDomainsTransform) # Phone-related transforms -TransformRegistry.register(PhoneToBreachesTransform) +if PhoneToBreachesTransform is not None: + TransformRegistry.register(PhoneToBreachesTransform) # Individual-related transforms -TransformRegistry.register(IndividualToOrgTransform) -TransformRegistry.register(IndividualToDomainsTransform) +if IndividualToOrgTransform is not None: + TransformRegistry.register(IndividualToOrgTransform) +if IndividualToDomainsTransform is not None: + TransformRegistry.register(IndividualToDomainsTransform) # Integration transforms -TransformRegistry.register(N8nConnector) +if N8nConnector is not None: + TransformRegistry.register(N8nConnector) diff --git a/spectragraph-transforms/README.md b/spectragraph-transforms/README.md index 9dc5d2c..7446558 100644 --- a/spectragraph-transforms/README.md +++ b/spectragraph-transforms/README.md @@ -1,4 +1,33 @@ # spectragraph-transforms The repository containing open-source transforms for spectragraph. -> ⚠️ 🚧 Work in progress !. +> ⚠️ 🚧 Work in progress ! + +## Available Transforms + +### GitHub Repository Transform + +Enriches domains with GitHub organization and repository intelligence. + +**Category:** Social/Development Platform +**Input:** Domain +**Output:** Organizations, repositories, contributors, metrics + +**Use Cases:** +- Security research and vulnerability discovery +- Threat intelligence mapping +- Corporate technology stack analysis +- Developer activity tracking + +**Configuration:** +```python +params = { + "domain": "example.com", + "include_forks": False, + "max_repos": 50, + "min_stars": 0, + "language_filter": "Python" +} +``` + +**Documentation:** [docs/transforms/github_repo.md](../docs/transforms/github_repo.md) diff --git a/spectragraph-transforms/src/spectragraph_transforms/__init__.py b/spectragraph-transforms/src/spectragraph_transforms/__init__.py index 0bf0d18..8905cf0 100644 --- a/spectragraph-transforms/src/spectragraph_transforms/__init__.py +++ b/spectragraph-transforms/src/spectragraph_transforms/__init__.py @@ -5,4 +5,8 @@ __version__ = "0.1.0" __author__ = "sr-857 " -__all__ = [] \ No newline at end of file +from .social.github_repo import GitHubRepoTransform + +__all__ = [ + "GitHubRepoTransform", +] \ No newline at end of file diff --git a/spectragraph-transforms/src/spectragraph_transforms/social/github_repo.py b/spectragraph-transforms/src/spectragraph_transforms/social/github_repo.py new file mode 100644 index 0000000..660edca --- /dev/null +++ b/spectragraph-transforms/src/spectragraph_transforms/social/github_repo.py @@ -0,0 +1,391 @@ +"""GitHub repository transform for OSINT enrichment.""" + +from typing import Dict, List, Any, Optional +import httpx +from pydantic import BaseModel, Field +from datetime import datetime + +from spectragraph_core.core.transform_base import Transform +from spectragraph_core.exceptions import TransformError + + +class GitHubRepoParams(BaseModel): + """Parameters for GitHub repository transform.""" + + domain: str = Field(..., description="Target domain to investigate") + include_forks: bool = Field(default=False, description="Include forked repositories") + max_repos: int = Field(default=50, ge=1, le=100, description="Maximum repos to return") + min_stars: int = Field(default=0, ge=0, description="Minimum stars filter") + language_filter: Optional[str] = Field(default=None, description="Filter by programming language") + + +class GitHubRepoTransform(Transform): + """ + Transform that enriches domains with GitHub organization information. + + Capabilities: + - Search for GitHub organizations matching domain + - Retrieve organization details and metrics + - List public repositories with detailed metadata + - Identify top contributors per repository + - Track repository activity and health + - Detect security indicators (Dependabot, policies, etc.) + + Use Cases: + - Security research and vulnerability discovery + - Threat intelligence and attacker infrastructure mapping + - Corporate intelligence and technology stack analysis + - OSINT investigations of development patterns + """ + + # Define types as class attributes + InputType = List[Dict[str, Any]] + OutputType = List[Dict[str, Any]] + + name_str = "github_repo" + + @classmethod + def name(cls) -> str: + return cls.name_str + + @classmethod + def category(cls) -> str: + return "social" + + @classmethod + def key(cls) -> str: + return "domain" + + description = "Discover GitHub organizations and repositories associated with a domain" + params_schema = [ + {"name": "domain", "type": "string", "required": False, "description": "Target domain to investigate (overridden by input)"}, + {"name": "include_forks", "type": "boolean", "default": False, "description": "Include forked repositories"}, + {"name": "max_repos", "type": "integer", "default": 50, "description": "Maximum repos to return"}, + {"name": "min_stars", "type": "integer", "default": 0, "description": "Minimum stars filter"}, + {"name": "language_filter", "type": "string", "default": None, "description": "Filter by programming language"}, + {"name": "github_token", "type": "vaultSecret", "required": True, "description": "GitHub Personal Access Token"} + ] + + def __init__(self): + super().__init__() + self.api_base = "https://api.github.com" + self.headers = {} + self.timeout = 30.0 + + async def preprocess(self, params: List[Any]) -> List[str]: + """ + Validate parameters and setup GitHub API client. + We expect a list of domain strings or objects. + """ + # Resolve GitHub token from vault (populated in self.params via async_init) + github_token = self.params.get("github_token") + if not github_token: + # Try environment variable fallback or just rely on vault being required + pass + + if github_token: + self.headers = { + "Accept": "application/vnd.github+json", + "Authorization": f"Bearer {github_token}", + "X-GitHub-Api-Version": "2022-11-28", + "User-Agent": "SpectraGraph-OSINT/1.0" + } + + # Normalize input to list of domain strings + clean_input = [] + for p in params: + if isinstance(p, str): + clean_input.append(p) + elif isinstance(p, dict) and "domain" in p: + clean_input.append(p["domain"]) + # Handle other types if necessary + + return clean_input + + async def scan(self, values: List[str]) -> List[Dict[str, Any]]: + """ + Execute GitHub API queries to gather repository intelligence. + """ + results_list = [] + + # Self.params contains the merged parameters (default + user input + vault) + + for domain in values: + result = { + "domain": domain, + "timestamp": datetime.utcnow().isoformat(), + "organizations": [], + "repositories": [], + "contributors": [], + "metadata": { + "transform": self.name(), + "version": "1.0.0" + } + } + + try: + # Combine domain with config params for full validation + full_params_dict = { + "domain": domain, + **{k: v for k, v in self.params.items() if k in GitHubRepoParams.model_fields} + } + validated_params = GitHubRepoParams(**full_params_dict) + + async with httpx.AsyncClient(timeout=self.timeout) as client: + # Step 1: Search for organizations + orgs = await self._search_organizations(client, domain) + result["organizations"] = orgs + + # Step 2: Get reps + for org in orgs[:3]: + org_login = org["login"] + repos = await self._get_org_repositories( + client, + org_login, + validated_params.max_repos, + validated_params.include_forks, + validated_params.min_stars, + validated_params.language_filter + ) + result["repositories"].extend(repos) + + # Step 3: Top contributors + top_repos = sorted( + repos, + key=lambda r: r["stargazers_count"], + reverse=True + )[:5] + + for repo in top_repos: + contributors = await self._get_contributors( + client, + org_login, + repo["name"] + ) + result["contributors"].extend(contributors) + + # Step 4: Rate limit + result["metadata"]["rate_limit"] = await self._get_rate_limit(client) + + except Exception as e: + # Log error but continue + # We could log this via self.logger.error(...) if available + pass + + results_list.append(result) + + return results_list + + async def _search_organizations( + self, + client: httpx.AsyncClient, + domain: str + ) -> List[Dict[str, Any]]: + """ + Search for GitHub organizations matching the domain. + + Uses multiple search strategies: + - Email domain matching + - Organization name matching + - Exact domain string matching + """ + search_queries = [ + f"{domain} in:email", + f"{domain.split('.')[0]} in:name", # Company name from domain + f'"{domain}"' + ] + + orgs = [] + seen = set() + + for query in search_queries: + try: + response = await client.get( + f"{self.api_base}/search/users", + params={"q": query, "type": "org", "per_page": 10}, + headers=self.headers + ) + response.raise_for_status() + + data = response.json() + for item in data.get("items", []): + if item["login"] not in seen: + # Get full org details + org_detail = await self._get_organization(client, item["login"]) + orgs.append(org_detail) + seen.add(item["login"]) + + except httpx.HTTPStatusError as e: + if e.response.status_code == 403: + break + continue + + return orgs + + async def _get_organization( + self, + client: httpx.AsyncClient, + org_login: str + ) -> Dict[str, Any]: + """Get detailed organization information.""" + response = await client.get( + f"{self.api_base}/orgs/{org_login}", + headers=self.headers + ) + response.raise_for_status() + + data = response.json() + return { + "login": data["login"], + "name": data.get("name"), + "description": data.get("description"), + "blog": data.get("blog"), + "location": data.get("location"), + "email": data.get("email"), + "twitter_username": data.get("twitter_username"), + "public_repos": data["public_repos"], + "public_gists": data["public_gists"], + "followers": data["followers"], + "following": data["following"], + "created_at": data["created_at"], + "updated_at": data["updated_at"], + "html_url": data["html_url"], + "type": data["type"] + } + + async def _get_org_repositories( + self, + client: httpx.AsyncClient, + org_login: str, + max_repos: int, + include_forks: bool, + min_stars: int, + language_filter: Optional[str] + ) -> List[Dict[str, Any]]: + """Get repositories for an organization with filtering.""" + repos = [] + page = 1 + per_page = min(max_repos, 100) + + while len(repos) < max_repos: + try: + response = await client.get( + f"{self.api_base}/orgs/{org_login}/repos", + params={ + "type": "public", + "sort": "updated", + "per_page": per_page, + "page": page + }, + headers=self.headers + ) + response.raise_for_status() + + data = response.json() + if not data: + break + + for repo in data: + # Apply filters + if not include_forks and repo["fork"]: + continue + if repo["stargazers_count"] < min_stars: + continue + if language_filter and repo.get("language") != language_filter: + continue + + repos.append({ + "name": repo["name"], + "full_name": repo["full_name"], + "description": repo.get("description"), + "html_url": repo["html_url"], + "homepage": repo.get("homepage"), + "language": repo.get("language"), + "stargazers_count": repo["stargazers_count"], + "forks_count": repo["forks_count"], + "open_issues_count": repo["open_issues_count"], + "watchers_count": repo["watchers_count"], + "size": repo["size"], + "default_branch": repo["default_branch"], + "topics": repo.get("topics", []), + "created_at": repo["created_at"], + "updated_at": repo["updated_at"], + "pushed_at": repo["pushed_at"], + "fork": repo["fork"], + "archived": repo["archived"], + "disabled": repo["disabled"], + "has_issues": repo["has_issues"], + "has_projects": repo["has_projects"], + "has_downloads": repo["has_downloads"], + "has_wiki": repo["has_wiki"], + "has_pages": repo["has_pages"], + "has_discussions": repo.get("has_discussions", False), + "license": repo.get("license", {}).get("name") if repo.get("license") else None, + "visibility": repo["visibility"], + "organization": org_login + }) + + if len(repos) >= max_repos: + break + + page += 1 + + except httpx.HTTPStatusError as e: + if e.response.status_code == 403: + break + raise + + return repos + + async def _get_contributors( + self, + client: httpx.AsyncClient, + owner: str, + repo: str + ) -> List[Dict[str, Any]]: + """Get top contributors for a repository.""" + try: + response = await client.get( + f"{self.api_base}/repos/{owner}/{repo}/contributors", + params={"per_page": 10}, + headers=self.headers + ) + response.raise_for_status() + + data = response.json() + return [ + { + "login": contributor["login"], + "contributions": contributor["contributions"], + "type": contributor["type"], + "html_url": contributor.get("html_url"), + "repository": f"{owner}/{repo}" + } + for contributor in data + ] + except httpx.HTTPError as e: + return [] + + async def _get_rate_limit( + self, + client: httpx.AsyncClient + ) -> Dict[str, Any]: + """Check current API rate limit status.""" + try: + response = await client.get( + f"{self.api_base}/rate_limit", + headers=self.headers + ) + response.raise_for_status() + + data = response.json() + core = data["resources"]["core"] + + return { + "limit": core["limit"], + "remaining": core["remaining"], + "reset": core["reset"], + "reset_time": datetime.fromtimestamp(core["reset"]).isoformat() + } + except httpx.HTTPError as e: + return {} diff --git a/spectragraph-transforms/tests/test_github_repo.py b/spectragraph-transforms/tests/test_github_repo.py new file mode 100644 index 0000000..de1673b --- /dev/null +++ b/spectragraph-transforms/tests/test_github_repo.py @@ -0,0 +1,250 @@ +"""Tests for GitHub repository transform.""" + +import pytest +from unittest.mock import AsyncMock, patch, MagicMock +from datetime import datetime +import httpx + +from spectragraph_transforms.social.github_repo import ( + GitHubRepoTransform, + GitHubRepoParams +) +from spectragraph_core.exceptions import TransformError + + +@pytest.fixture +def transform(): + """Create transform instance.""" + t = GitHubRepoTransform() + # Mock params to simulate async_init resolution + t.params = {"github_token": "ghp_fake_token"} + return t + + +@pytest.fixture +def sample_params(): + """Sample valid parameters.""" + return { + "domain": "anthropic.com", + "include_forks": False, + "max_repos": 10, + "min_stars": 0, + "language_filter": None + } + + +@pytest.fixture +def mock_org_response(): + """Mock GitHub organization API response.""" + return { + "login": "anthropics", + "name": "Anthropic", + "description": "AI safety company", + "blog": "anthropic.com", + "location": "San Francisco", + "email": "hello@anthropic.com", + "twitter_username": "AnthropicAI", + "public_repos": 42, + "public_gists": 0, + "followers": 1250, + "following": 0, + "created_at": "2021-01-15T00:00:00Z", + "updated_at": "2024-01-01T00:00:00Z", + "html_url": "https://github.com/anthropics", + "type": "Organization" + } + + +@pytest.fixture +def mock_repo_response(): + """Mock GitHub repository API response.""" + return { + "name": "claude-api", + "full_name": "anthropics/claude-api", + "description": "Python client for Claude API", + "html_url": "https://github.com/anthropics/claude-api", + "homepage": "https://anthropic.com", + "language": "Python", + "stargazers_count": 3400, + "forks_count": 280, + "open_issues_count": 15, + "watchers_count": 3400, + "size": 1024, + "default_branch": "main", + "topics": ["ai", "llm", "api"], + "created_at": "2023-03-15T00:00:00Z", + "updated_at": "2024-01-05T00:00:00Z", + "pushed_at": "2024-01-05T00:00:00Z", + "fork": False, + "archived": False, + "disabled": False, + "has_issues": True, + "has_projects": True, + "has_downloads": True, + "has_wiki": True, + "has_pages": False, + "has_discussions": True, + "license": {"name": "MIT"}, + "visibility": "public" + } + + +class TestParams: + """Test parameter validation.""" + + def test_valid_params(self): + """Test valid parameter combinations.""" + # Note: We are testing the Pydantic model directly here + params = GitHubRepoParams( + domain="example.com", + max_repos=25, + min_stars=10, + include_forks=True + ) + assert params.domain == "example.com" + assert params.max_repos == 25 + assert params.min_stars == 10 + assert params.include_forks is True + + def test_defaults(self): + """Test default parameter values.""" + params = GitHubRepoParams(domain="test.com") + assert params.include_forks is False + assert params.max_repos == 50 + assert params.min_stars == 0 + assert params.language_filter is None + + def test_max_repos_validation(self): + """Test max_repos boundary validation.""" + # Too high + with pytest.raises(ValueError): + GitHubRepoParams(domain="test.com", max_repos=150) + + # Too low + with pytest.raises(ValueError): + GitHubRepoParams(domain="test.com", max_repos=0) + + # Valid boundaries + params = GitHubRepoParams(domain="test.com", max_repos=1) + assert params.max_repos == 1 + + params = GitHubRepoParams(domain="test.com", max_repos=100) + assert params.max_repos == 100 + + def test_min_stars_validation(self): + """Test min_stars cannot be negative.""" + with pytest.raises(ValueError): + GitHubRepoParams(domain="test.com", min_stars=-5) + + params = GitHubRepoParams(domain="test.com", min_stars=0) + assert params.min_stars == 0 + + +class TestPreprocess: + """Test preprocessing logic.""" + + @pytest.mark.asyncio + async def test_success(self, transform): + """Test successful preprocessing.""" + # Preprocess converts input list to cleaned list of domains + input_data = ["anthropic.com", {"domain": "example.com"}] + result = await transform.preprocess(input_data) + + assert len(result) == 2 + assert "anthropic.com" in result + assert "example.com" in result + + # Check token usage in headers + assert "Authorization" in transform.headers + assert "Bearer ghp_fake_token" in transform.headers["Authorization"] + + +class TestSearch: + """Test organization search.""" + + @pytest.mark.asyncio + async def test_search_organizations(self, transform, mock_org_response): + """Test successful organization search.""" + mock_response = MagicMock() + mock_response.json.return_value = { + "items": [{"login": "anthropics", "type": "Organization"}] + } + mock_response.raise_for_status = MagicMock() + + mock_client = AsyncMock() + mock_client.get.return_value = mock_response + + with patch.object(transform, '_get_organization', return_value=mock_org_response): + orgs = await transform._search_organizations(mock_client, "anthropic.com") + + assert len(orgs) >= 1 + assert orgs[0]["login"] == "anthropics" + assert orgs[0]["name"] == "Anthropic" + + @pytest.mark.asyncio + async def test_search_no_results(self, transform): + """Test search with no results.""" + mock_response = MagicMock() + mock_response.json.return_value = {"items": []} + mock_response.raise_for_status = MagicMock() + + mock_client = AsyncMock() + mock_client.get.return_value = mock_response + + orgs = await transform._search_organizations(mock_client, "nonexistent.com") + assert len(orgs) == 0 + + +class TestRepositories: + """Test repository retrieval.""" + + @pytest.mark.asyncio + async def test_get_repositories(self, transform, mock_repo_response): + """Test successful repository retrieval.""" + mock_response = MagicMock() + mock_response.json.return_value = [mock_repo_response] + mock_response.raise_for_status = MagicMock() + + mock_client = AsyncMock() + mock_client.get.return_value = mock_response + + repos = await transform._get_org_repositories( + mock_client, + "anthropics", + max_repos=10, + include_forks=False, + min_stars=0, + language_filter=None + ) + + assert len(repos) == 1 + assert repos[0]["name"] == "claude-api" + assert repos[0]["stargazers_count"] == 3400 + assert repos[0]["language"] == "Python" + + +class TestFullScan: + """Test complete scan workflow.""" + + @pytest.mark.asyncio + async def test_successful_scan(self, transform, mock_org_response, mock_repo_response): + """Test complete scan execution.""" + + with patch('httpx.AsyncClient') as mock_client_class: + mock_client = AsyncMock() + mock_client_class.return_value.__aenter__.return_value = mock_client + + with patch.object(transform, '_search_organizations', return_value=[mock_org_response]), \ + patch.object(transform, '_get_org_repositories', return_value=[mock_repo_response]), \ + patch.object(transform, '_get_contributors', return_value=[{"login": "dev1", "contributions": 100}]), \ + patch.object(transform, '_get_rate_limit', return_value={"remaining": 4999}): + + result = await transform.scan(["anthropic.com"]) + + assert len(result) == 1 + assert result[0]["domain"] == "anthropic.com" + assert "timestamp" in result[0] + assert len(result[0]["organizations"]) == 1 + assert len(result[0]["repositories"]) == 1 + assert "metadata" in result[0] + assert result[0]["metadata"]["transform"] == "github_repo" diff --git a/test_github_api.py b/test_github_api.py new file mode 100644 index 0000000..0f2ee79 --- /dev/null +++ b/test_github_api.py @@ -0,0 +1,34 @@ +import os + +async def test_github_access(): + token = os.getenv("GITHUB_TOKEN") + if not token: + print("Error: GITHUB_TOKEN environment variable not set") + return + headers = { + "Authorization": f"Bearer {token}", + "Accept": "application/vnd.github+json" + } + + async with httpx.AsyncClient() as client: + print("Testing rate limit...") + # Test rate limit + response = await client.get( + "https://api.github.com/rate_limit", + headers=headers + ) + print(f"Status: {response.status_code}") + print(response.json()) + + print("\nTesting search...") + # Test search + response = await client.get( + "https://api.github.com/search/users", + params={"q": "anthropic.com", "type": "org"}, + headers=headers + ) + print(f"Status: {response.status_code}") + print(response.json()) + +if __name__ == "__main__": + asyncio.run(test_github_access()) diff --git a/test_manual.py b/test_manual.py new file mode 100644 index 0000000..a9a87b8 --- /dev/null +++ b/test_manual.py @@ -0,0 +1,93 @@ +"""Manual testing script for GitHub transform.""" + +import asyncio +import os +from spectragraph_transforms.social.github_repo import GitHubRepoTransform + + +async def test_github_transform(): + """Test the GitHub transform with real API calls.""" + + transform = GitHubRepoTransform() + # Manually inject params since we aren't using the full vault infra in this script + # User's token from previous context + token = os.environ.get("GITHUB_TOKEN") + if not token: + print("Error: GITHUB_TOKEN environment variable not set. Please run: export GITHUB_TOKEN='your_token'") + return + transform.params = {"github_token": token} + + # Test 1: Basic scan + print("=== Test 1: Basic Domain Scan ===") + + try: + # Preprocess manually (usually done by orchestrator) + input_data = ["anthropic.com"] + validated_input = await transform.preprocess(input_data) + print(f"✓ Preprocessing successful") + + # Override default params for this run if needed, but since scan takes Values, + # config is in transform.params. Let's set max_repos there. + transform.params["max_repos"] = 5 + + results = await transform.scan(validated_input) + result = results[0] + + print(f"✓ Found {len(result['organizations'])} organizations") + print(f"✓ Found {len(result['repositories'])} repositories") + print(f"✓ Found {len(result['contributors'])} contributors") + print(f"✓ Rate limit remaining: {result['metadata']['rate_limit']['remaining']}") + except Exception as e: + print(f"✗ Error: {e}") + import traceback + traceback.print_exc() + + # Test 2: With filters + print("\n=== Test 2: Filtered Scan (OpenAI, Python, >100 stars) ===") + + try: + transform.params["max_repos"] = 10 + transform.params["min_stars"] = 100 + transform.params["language_filter"] = "Python" + + input_data = ["openai.com"] + validated_input = await transform.preprocess(input_data) + results = await transform.scan(validated_input) + result = results[0] + + repos = result['repositories'] + print(f"✓ Found {len(repos)} Python repos with 100+ stars") + + if repos: + top_repo = max(repos, key=lambda r: r['stargazers_count']) + print(f"✓ Top repo: {top_repo['full_name']} ({top_repo['stargazers_count']} stars)") + except Exception as e: + print(f"✗ Error: {e}") + + # Test 3: Include forks + print("\n=== Test 3: Include Forks (github.com) ===") + + try: + transform.params["include_forks"] = True + transform.params["max_repos"] = 5 + # Reset other filters + transform.params["min_stars"] = 0 + transform.params["language_filter"] = None + + input_data = ["github.com"] + validated_input = await transform.preprocess(input_data) + results = await transform.scan(validated_input) + result = results[0] + + forks = [r for r in result['repositories'] if r['fork']] + originals = [r for r in result['repositories'] if not r['fork']] + + print(f"✓ Total repos: {len(result['repositories'])}") + print(f"✓ Forks: {len(forks)}") + print(f"✓ Original: {len(originals)}") + except Exception as e: + print(f"✗ Error: {e}") + + +if __name__ == "__main__": + asyncio.run(test_github_transform())