From ecdbaefeaad1128387f9ea5f6d8583aa9b464eaa Mon Sep 17 00:00:00 2001 From: quinnwai Date: Fri, 9 Jan 2026 12:45:21 -0800 Subject: [PATCH 1/3] lovely day --- README.md | 89 +++++--- docs/commands.md | 440 ++++++++++++++++++++++++++++++++++++++++ docs/configuration.md | 62 ++++++ docs/getting-started.md | 188 +++++++++++++++++ docs/metadata.md | 254 +++++++++++++++++++++++ metadata/directory.go | 1 - metadata/meta.go | 4 +- 7 files changed, 1007 insertions(+), 31 deletions(-) create mode 100644 docs/commands.md create mode 100644 docs/configuration.md create mode 100644 docs/getting-started.md create mode 100644 docs/metadata.md diff --git a/README.md b/README.md index 9ccf2f6..a0aff7b 100644 --- a/README.md +++ b/README.md @@ -1,47 +1,80 @@ -# Meta +# Forge -Metadata handling for CALPYR data platform +FHIR metadata management for CALYPR Gen3 data repositories. -## Workflow -- General Design Paramters +Forge works alongside [git-drs](https://github.com/calypr/git-drs/blob/main/README.md) to generate and publish FHIR-compliant metadata, making your datasets discoverable on the CALYPR platform. -This repo is designed to produce git hook commands that take care of metadata additions / subtractions that are run before or after certain git commands like commit and push. 
Draft workflow currently: +## Quick Start -## Example user workflow +```bash +# Verify your connection to CALYPR +forge ping -``` -git clone repo -forge init -- exactly same as git-drs init, just a wrapper around it -git add files -git commit -m "test" -- same as git-drs -git push origin main -- same as git-drs -forge publish [github personal access token] +# Publish metadata to CALYPR +forge publish ghp_your_github_token + +# Monitor the job +forge list +forge status ``` -To generate a personal access token for a github repo check these docs: -https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens +## What Forge Does -## Command descriptions +**1. Manage Project Metadata** +- `forge publish` - Generate and upload metadata to CALYPR +- `forge empty` - Remove project metadata +- `forge meta` - Preview metadata locally +- `forge validate` - Check metadata validity -### ping +**2. Monitor Platform State** +- `forge ping` - Check connection and credentials +- `forge list` - View all processing jobs +- `forge status` - Check specific job status +- `forge output` - View job logs -Same as ping in g3t +**3. Configure Portal Frontend** +- `forge config` - Generate a CALYPR explorer template -### meta +## Installation + +```bash +git clone https://github.com/calypr/forge.git +cd forge +go build -o forge +sudo mv forge /usr/local/bin/ +``` -Generates metadata from non checked in .meta files. If .meta files are already checked in you can regen metadata with -r flag. 
This command is run as part of the pre-commit command +## Prerequisites -### validate +- Git DRS installed and configured +- Data files pushed to CALYPR via git-drs +- Gen3 credentials (configured through git-drs) +- GitHub Personal Access Token ([create token](https://github.com/settings/tokens)) -Validates metadata against the jsonschema in grip +## Documentation -### precommit +- [Getting Started](docs/getting-started.md) - Setup and basic workflows +- [Command Reference](docs/commands.md) - Detailed command documentation +- [Configuration Guide](docs/configuration.md) - Git-drs configuration +- [Metadata Structure](docs/metadata.md) - Understanding FHIR resources -Runs meta init command then locates all .ndjson files in META directory and validates each file. +## Example Workflow -### publish +```bash +# Use git-drs to track and push files +git lfs track "*.fastq.gz" +git add data/sample.fastq.gz +git commit -m "Add sequencing data" +git push + +# Publish metadata to CALYPR +forge publish ghp_abc123def456 + +# Monitor the job +forge list +# Uid: job-xyz789 Name: fhir_import_export Status: Succeeded +``` -Validates that your Personal Access token exists and is valid -Packages together relevent information used to init the git repo in a remote job -Kicks off a sower job to process the metadata files that you have just pushed up +## Support -No git hook for publish, users are expected to run that themselves. +Part of the CALYPR data commons ecosystem. diff --git a/docs/commands.md b/docs/commands.md new file mode 100644 index 0000000..9d27857 --- /dev/null +++ b/docs/commands.md @@ -0,0 +1,440 @@ +# Command Reference + +Detailed documentation for all forge commands. + +**Quick Navigation:** [publish](#publish) | [meta](#meta) | [validate](#validate) | [empty](#empty) | [ping](#ping) | [list](#list) | [status](#status) | [output](#output) | [config](#config) + +--- + +## Manage Project Metadata + +### publish + +Create a metadata upload job for your project. 
+ +**Usage:** +```bash +forge publish [--remote REMOTE_NAME] +``` + +**What it does:** + +This is the main command you'll use. It dispatches a Sower job on the CALYPR platform that: +1. Validates your GitHub token +2. Clones your git repository +3. Generates FHIR metadata from your git-drs tracked files +4. Validates the metadata against FHIR schemas +5. Uploads the metadata to CALYPR + +**Example:** +```bash +$ forge publish ghp_abc123def456 + +Using remote: production +Uid: job-xyz789-abc123 Name: fhir_import_export Status: Pending +``` + +**Flags:** +- `--remote`, `-r` - Specify which CALYPR remote to use (default: default_remote) + +**Common errors:** +- `Error: invalid token` - Your GitHub token is expired or missing the `repo` scope +- `Error: repository has no origin` - Push your repository to GitHub first +- `Error: could not locate remote` - Run this in a git-drs initialized repository + +**When to use:** After pushing new or updated files via git-drs, run this to make them discoverable on CALYPR. + +--- + +### meta + +Generate FHIR metadata locally for debugging. + +**Usage:** +```bash +forge meta [--remote REMOTE_NAME] +``` + +**What it does:** + +Generates FHIR metadata files locally in the `META/` directory. This is useful for debugging what metadata will be created, but it's not required for normal workflows (metadata is generated automatically during the publish job). + +The command: +1. Queries your CALYPR project for all DRS objects +2. Reads git-lfs tracked files in your repository +3. Matches them by SHA256 hash +4. Creates DocumentReference resources (one per file) +5. Creates Directory resources (one per folder) +6. Creates or updates the ResearchStudy resource (one per project) +7. Writes NDJSON files to `META/` + +**Example:** +```bash +$ forge meta + +Loaded existing ResearchStudy from ./META/ResearchStudy.ndjson with ID abc123... +Processed 15 records +Finished writing all DocumentReference records. +Finished writing all Directory records. 
+``` + +**Output structure:** +``` +your-repo/ +├── META/ +│ ├── DocumentReference.ndjson +│ ├── Directory.ndjson +│ └── ResearchStudy.ndjson +``` + +**Flags:** +- `--remote`, `-r` - Specify which CALYPR remote to use (default: default_remote) + +**When to use:** When you want to inspect metadata before publishing, or debug why validation is failing. + +--- + +### validate + +Validate metadata and configuration files. + +#### validate data + +Validate FHIR NDJSON metadata files against schemas. + +**Usage:** +```bash +forge validate data [PATH] [--remote REMOTE_NAME] +``` + +**What it does:** + +Checks that NDJSON files in the `META/` directory conform to FHIR R5 schemas. It validates: +- Required fields are present +- Field types are correct +- Values follow FHIR constraints +- JSON structure is valid + +**Example (success):** +```bash +$ forge validate data + +Validating NDJSON files in META/... +✓ META/DocumentReference.ndjson (15 resources validated) +✓ META/Directory.ndjson (8 resources validated) +✓ META/ResearchStudy.ndjson (1 resource validated) + +All files valid! +``` + +**Example (error):** +```bash +$ forge validate data + +Error in META/DocumentReference.ndjson line 3: + Missing required field: status +Validation failed. +``` + +**Arguments:** +- `PATH` - Path to NDJSON files directory (default: `./META`) + +**Flags:** +- `--remote`, `-r` - Specify which CALYPR remote to use (default: default_remote) + +**When to use:** Before publishing, or when debugging failed publish jobs. + +--- + +#### validate config + +Validate explorer configuration files. + +**Usage:** +```bash +forge validate config [PATH] [--remote REMOTE_NAME] +``` + +**What it does:** + +Validates JSON configuration files used by the CALYPR explorer frontend. Checks that the structure matches the expected schema. 
+ +**Example:** +```bash +$ forge validate config ./CONFIG/my-project.json + +✓ Configuration valid +``` + +**Arguments:** +- `PATH` - Path to config file (default: `./CONFIG`) + +**Flags:** +- `--remote`, `-r` - Specify which CALYPR remote to use (default: default_remote) + +**When to use:** After generating or editing explorer config files. + +--- + +#### validate edge + +Check for orphaned edges in metadata graph. + +**Usage:** +```bash +forge validate edge [PATH] [--remote REMOTE_NAME] [--export-vertices] [--export-edges] +``` + +**What it does:** + +Validates that all references between FHIR resources are valid. For example, checks that DocumentReference resources reference valid Directory resources, and that all Directory references point to existing directories. + +**Example:** +```bash +$ forge validate edge + +Checking graph integrity... +✓ No orphaned edges found +All references valid. +``` + +**Flags:** +- `--remote`, `-r` - Specify which CALYPR remote to use (default: default_remote) +- `--export-vertices` - Export all vertices to a file for inspection +- `--export-edges` - Export all edges to a file for inspection + +**When to use:** When debugging complex directory structures or reference issues. + +--- + +### empty + +Remove all metadata for your project from CALYPR. + +**Usage:** +```bash +forge empty [--remote REMOTE_NAME] +``` + +**What it does:** + +Dispatches a Sower job that deletes all FHIR metadata for your project from the CALYPR platform. Your data files remain in storage, but the metadata that makes them discoverable is removed. + +**Example:** +```bash +$ forge empty + +Using remote: production +Uid: job-delete-xyz789 Name: fhir_import_export Status: Pending +``` + +**Flags:** +- `--remote`, `-r` - Specify which CALYPR remote to use (default: default_remote) + +**Warning:** This operation cannot be undone. Use with caution. + +**When to use:** When you need to clear out old metadata before re-publishing, or when decommissioning a project. 
+ +--- + +## Monitor Platform State + +### ping + +Check your connection and credentials. + +**Usage:** +```bash +forge ping [--remote REMOTE_NAME] +``` + +**What it does:** + +Verifies that: +- Your credentials are valid +- You can authenticate with the CALYPR platform +- Your project configuration is correct +- You have access to the configured storage buckets + +**Example:** +```bash +$ forge ping +profile: production +username: researcher +endpoint: https://calypr-public.ohsu.edu +bucket_programs: + bucket_name: program_name +your_access: + /programs/my_program/projects/my_project: '*,create,delete,update,write-storage,file_upload' +``` + +**Flags:** +- `--remote`, `-r` - Specify which CALYPR remote to ping (default: default_remote) + +**When to use:** As a first step to verify your setup, or when debugging authentication issues. + +--- + +### list + +View all Sower jobs for your project. + +**Usage:** +```bash +forge list [--remote REMOTE_NAME] +``` + +**What it does:** + +Shows all processing jobs that have been dispatched for your project, including their status. + +**Example:** +```bash +$ forge list + +Using remote: production +Uid: job-abc123-456def Name: fhir_import_export Status: Completed +Uid: job-xyz789-012ghi Name: fhir_import_export Status: Running +Uid: job-old111-222jkl Name: fhir_import_export Status: Failed +``` + +**Flags:** +- `--remote`, `-r` - Specify which CALYPR remote to use (default: default_remote) + +**Job statuses:** +- `Pending` - Job is queued, waiting to start +- `Running` - Job is currently executing +- `Succeeded` - Job completed successfully +- `Failed` - Job encountered an error + +**When to use:** After running `forge publish` to check if your job has completed. + +--- + +### status + +Check the status of a specific job. + +**Usage:** +```bash +forge status [--remote REMOTE_NAME] +``` + +**What it does:** + +Shows the current status of a specific Sower job. Use the UID from the `forge list` or `forge publish` output. 
+ +**Example:** +```bash +$ forge status job-abc123-456def + +Using remote: production +Uid: job-abc123-456def Name: fhir_import_export Status: Completed +``` + +**Arguments:** +- `UID` - The job UID (get this from `forge list` or `forge publish`) + +**Flags:** +- `--remote`, `-r` - Specify which CALYPR remote to use (default: default_remote) + +**When to use:** To check on a specific job without listing all jobs. + +--- + +### output + +View logs from a specific job. + +**Usage:** +```bash +forge output [--remote REMOTE_NAME] +``` + +**What it does:** + +Retrieves and displays the output logs from a Sower job. This is essential for debugging failed jobs. + +**Example (successful job):** +```bash +$ forge list + +Using remote: production +Uid: job-abc123-456def Name: fhir_import_export Status: Completed +``` + +**Example (failed job):** +```bash +$ forge output job-xyz789-fail + +Using remote: production +Logs: +Cloning repository... +Generating metadata... +Error: Validation failed for META/DocumentReference.ndjson line 5 +Missing required field: status +Job failed. +``` + +**Arguments:** +- `UID` - The job UID (get this from `forge list` or `forge publish`) + +**Flags:** +- `--remote`, `-r` - Specify which CALYPR remote to use (default: default_remote) + +**When to use:** When a job fails, or when you need detailed information about what happened during processing. + +--- + +## Configure Portal Frontend + +### config + +Generate an explorer configuration template. + +**Usage:** +```bash +forge config [--remote REMOTE_NAME] +``` + +**What it does:** + +Creates a skeleton configuration file for the CALYPR explorer frontend. This file controls how your project appears in the CALYPR data portal. + +The command generates a template at `CONFIG/[projectID].json` that you can customize. 
+ +**Example:** +```bash +$ forge config + +Using remote: production +Created configuration template at CONFIG/my-project-123.json +``` + +**Output:** +``` +your-repo/ +├── CONFIG/ +│ └── my_program-my_project.json +``` + +**Flags:** +- `--remote`, `-r` - Specify which CALYPR remote to use (default: default_remote) + +**When to use:** When setting up a new project and you need to configure how it appears in the CALYPR portal. + +--- + +## Global Flags + +All commands support these flags: + +- `--remote`, `-r` - Specify which git-drs remote to use (dev, staging, production, etc.) +- `--help`, `-h` - Show help for a command + +**Examples:** +```bash +forge publish ghp_token --remote staging +forge ping --remote dev +forge list --remote production +``` \ No newline at end of file diff --git a/docs/configuration.md b/docs/configuration.md new file mode 100644 index 0000000..67f0559 --- /dev/null +++ b/docs/configuration.md @@ -0,0 +1,62 @@ +# Configuration Guide + +Forge uses Git DRS configuration for all its settings: no separate configuration needed. + +## Configuration Files + +### `.drs/config.yaml` + +This is the main configuration file created by Git DRS. It contains: +- Remote server configurations (URLs, credentials, project IDs, buckets) +- Default remote settings +- Authentication profiles + +Forge reads this file to determine which CALYPR instance to connect to and how to authenticate. + +**Location:** `.drs/config.yaml` in your git repository root + + +### CALYPR Credentials + +CALYPR credentials are provided by your platform administrator. 
You'll typically receive: +- A website URL +- A project ID +- Credentials (retrieved from the /Profile page on the website) + +These can be used in your git-drs repository to authenticate yourself. + +### Git Credentials + +You can store your GitHub Personal Access Token in a shell startup file such as ~/.bash_profile or ~/.zshrc by adding the line: + +```sh +export GH_PAT= +``` + +You can then reference it when publishing projects: + +```sh +forge publish $GH_PAT +``` + +## Explorer Configuration + +The `forge config` command generates a configuration template for the CALYPR explorer frontend. This is separate from the Git DRS configuration and controls how your project appears on the CALYPR platform. + +```bash +forge config +``` + +This creates `CONFIG/[projectID].json` which you can customize to: +- Define visible fields +- Configure filters +- Set up shared filters across metadata +- Specify custom data visualizations + +See the CALYPR documentation for details on explorer configuration options. + +## See Also + +- [Git DRS Documentation](https://github.com/calypr/git-drs/blob/main/README.md) - Complete Git DRS setup guide +- [Commands Reference](commands.md) - All forge commands +- [Getting Started](getting-started.md) - Quick start guide \ No newline at end of file diff --git a/docs/getting-started.md b/docs/getting-started.md new file mode 100644 index 0000000..ac783e8 --- /dev/null +++ b/docs/getting-started.md @@ -0,0 +1,188 @@ +# Getting Started with Forge + +Forge manages FHIR metadata for datasets in the CALYPR Gen3 platform. It works alongside git-drs to make your data files discoverable and searchable through the CALYPR platform. 
+ +## Prerequisites + +Before using forge, you'll need: + +- Git DRS installed and configured (see the [git-drs documentation](https://github.com/calypr/git-drs/blob/main/README.md)) +- A git-drs repository with data files already pushed to CALYPR +- CALYPR credentials set up through git-drs +- A GitHub Personal Access Token with `repo` scope ([create one here](https://github.com/settings/tokens)) +- Go 1.21+ to build from source + +## Installation + +```bash +# Clone the repository and build +git clone https://github.com/calypr/forge.git +cd forge +go build -o forge + +# Optionally, move to your PATH +sudo mv forge /usr/local/bin/ + +# Verify it works +forge --help +``` + +## Quick Start + +Here's the basic flow to publish your dataset metadata: + +```bash +# First, check your connection to CALYPR +forge ping + +# Publish metadata (this kicks off a processing job) +forge publish ghp_your_github_token_here + +# Monitor the job status +forge list # provides you the job UID +forge status +forge output +``` + +Once the job succeeds, your dataset will be searchable on the CALYPR platform! + +## What Forge Does + +Forge provides three main capabilities: + +### 1. Manage Project Metadata +- **`forge publish `** - Generate and upload FHIR metadata to CALYPR +- **`forge empty`** - Remove all metadata for your project from CALYPR +- **`forge meta`** - Preview what metadata will be generated (for debugging) +- **`forge validate data`** - Check that metadata is valid before publishing + +### 2. Monitor Platform State +- **`forge ping`** - Verify your connection and credentials +- **`forge list`** - See all your processing jobs +- **`forge status `** - Check if a specific job succeeded or failed +- **`forge output `** - View detailed logs from a job + +### 3. 
Configure Portal Frontend +- **`forge config`** - Generate a template configuration file for the CALYPR explorer page + +## Common Workflows + +### Publishing Your First Dataset + +Make sure your files are already tracked and pushed through git-drs: + +```bash +# Verify files are tracked +git lfs ls-files + +# Publish the metadata +forge publish ghp_your_token_here + +# Check the job status +forge list +# You'll see: Uid: job-abc123 Name: fhir_import_export Status: Succeeded +``` + +### Adding More Files Later + +When you add new data files, just push them with git-drs via git and re-publish: + +```bash +# Add and push new files through git-drs +git add new-data/*.fastq.gz +git commit -m "Add more sequencing data" +git push origin main + +# Update the metadata +forge publish ghp_token +``` + +### Debugging Metadata Before Publishing + +If you want to see what metadata will be generated before publishing: +```bash +# Generate metadata locally +forge meta + + +# Look at what was created +ls -la META/ +cat META/DocumentReference.ndjson | jq . +``` + +If you are supplying your own metadata and want to validate it before publishing: + +```sh +# Validate it +forge validate data +``` + +### Working with Multiple Environments + +If you need to push to a different environment: + +```bash +# Specify which remote to use +forge ping --remote dev +forge publish ghp_token --remote staging +forge list --remote production +``` + +## How It Works + +Here's what happens under the hood: + +1. Your data files are tracked with git-drs and uploaded to CALYPR storage +2. You run `forge publish` which dispatches a job in the background +3. The job clones your repository, generates FHIR metadata, and uploads it to CALYPR +4. Once the job completes, your dataset appears in the CALYPR platform + +**Important:** Metadata generation happens on the server during the publish job. The `forge meta` command is just for local debugging. 
+ +## What Gets Generated + +Forge creates three types of FHIR R5 resources: + +- **DocumentReference** - One for each file, with metadata like size, hash, URL, and creation date +- **Directory** - One for each folder, showing the directory structure +- **ResearchStudy** - One for the entire project, linking everything together + +These resources are stored as NDJSON files in the `META/` directory. + +## Troubleshooting + +**"Error: could not locate remote"** + +You need to run forge in a git-drs initialized repository: +```bash +git drs remote list +``` + +**"Error: no credentials found"** + +Git-drs needs to be configured with your Gen3 credentials: +```bash +git drs remote add gen3 production \ + --cred ~/.gen3/credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket +``` + +**Job shows "Failed" status** + +Check the logs to see what went wrong: +```bash +forge output + +# Common issues: +# - Validation errors: try running `forge validate data` locally +# - Missing files: make sure `git push` completed successfully +# - Credential problems: verify with `forge ping` +``` + +## Next Steps + +- [Commands Reference](commands.md) - Detailed documentation for all commands +- [Configuration Guide](configuration.md) - Understanding git-drs configuration +- [Metadata Structure](metadata.md) - Deep dive into FHIR resources diff --git a/docs/metadata.md b/docs/metadata.md new file mode 100644 index 0000000..ef42043 --- /dev/null +++ b/docs/metadata.md @@ -0,0 +1,254 @@ +# FHIR Metadata Structure + +Forge generates FHIR R5 (Fast Healthcare Interoperability Resources) metadata to describe your data files in a standardized format. This makes your datasets discoverable and searchable through the Gen3 portal. + +## What is FHIR? + +FHIR is a healthcare data standard that provides a common way to represent and exchange information. 
While it was designed for healthcare, its structured approach works well for any scientific data that needs rich metadata. + +Forge uses FHIR because Gen3 can index and search FHIR resources, making your data discoverable through the portal's search interface. + +## Generated Resources + +Forge creates three types of FHIR resources: + +### 1. DocumentReference (one per file) + +Represents a single data file in your repository. + +**What it contains:** +- Unique identifier (deterministic UUID based on file hash) +- DRS object ID for retrieving the file +- File metadata: name, size, MIME type, creation date +- Hash values: MD5, SHA256, SHA512 +- Storage URL (DRS endpoint) +- Reference to the parent ResearchStudy + +**Example:** +```json +{ + "resourceType": "DocumentReference", + "id": "abc123-def456-...", + "identifier": [{ + "system": "https://calypr-public.ohsu.edu/drs", + "value": "drs://dg.4503/abc123..." + }], + "status": "current", + "date": "2024-01-15T10:30:00Z", + "content": [{ + "attachment": { + "title": "sample_001.fastq.gz", + "contentType": "application/gzip", + "url": "drs://calypr-public.ohsu.edu/abc123...", + "size": 1073741824, + "creation": "2024-01-15T10:30:00Z" + } + }], + "subject": { + "reference": "ResearchStudy/project-xyz789" + } +} +``` + +**Key fields:** +- `id` - Generated from SHA1 hash of endpoint + filename +- `identifier` - DRS object ID for file retrieval +- `status` - Always "current" for active files +- `content.attachment` - File details (name, size, URL, type) +- `subject` - Links to the parent ResearchStudy + +### 2. Directory (one per folder) + +Represents a folder in your project's directory structure. 
+ +**What it contains:** +- Unique identifier for the directory +- Directory name +- References to child directories and files +- Position in the hierarchy + +**Example:** +```json +{ + "resourceType": "Directory", + "id": "dir-abc123-...", + "name": "sequencing-data", + "child": [ + {"reference": "DocumentReference/abc123..."}, + {"reference": "DocumentReference/def456..."}, + {"reference": "Directory/subdir-xyz789..."} + ] +} +``` + +**Key fields:** +- `id` - Generated from SHA1 hash of endpoint + directory path +- `name` - Folder name +- `child` - Array of references to files and subdirectories + +**Note:** Directory is not a standard FHIR R5 resource - it's a custom extension used by Gen3 to represent file system structure. + +### 3. ResearchStudy (one per project) + +Represents your entire project or dataset. + +**What it contains:** +- Project-level identifier +- Gen3 project ID +- Project description +- Status +- Reference to the root directory + +**Example:** +```json +{ + "resourceType": "ResearchStudy", + "id": "project-abc123", + "identifier": [{ + "use": "official", + "system": "https://calypr-public.ohsu.edu/my-project-123", + "value": "my-project-123" + }], + "status": "active", + "description": "Skeleton ResearchStudy for my-project-123", + "rootDir": { + "reference": "Directory/root-dir-id" + } +} +``` + +**Key fields:** +- `id` - Generated from endpoint + project ID +- `identifier.value` - Your Gen3 project ID +- `status` - "active" for current projects +- `rootDir` - Custom extension linking to root Directory + +**Note:** The `rootDir` field is a custom extension, not part of standard FHIR. 
+ +## File Format: NDJSON + +Metadata is stored as NDJSON (Newline Delimited JSON) files: +- One JSON object per line +- Each line is a complete FHIR resource +- No commas between lines +- Files are stored in the `META/` directory + +**Example NDJSON file:** +``` +{"resourceType":"DocumentReference","id":"abc123","status":"current",...} +{"resourceType":"DocumentReference","id":"def456","status":"current",...} +{"resourceType":"DocumentReference","id":"ghi789","status":"current",...} +``` + +**Generated files:** +- `META/DocumentReference.ndjson` - All file metadata +- `META/Directory.ndjson` - All directory metadata +- `META/ResearchStudy.ndjson` - Project metadata + +## How Files Are Mapped + +When forge generates metadata, it follows this process: + +### 1. Discover Files + +Queries Gen3 IndexD to find all DRS objects in your project, then reads git-lfs tracked files in your local repository. + +### 2. Match by Hash + +Matches DRS objects to local files using SHA256 hashes. This ensures each file is correctly identified even if filenames change. + +### 3. Generate DocumentReference + +For each matched file, creates a DocumentReference resource with: +- File path and name from git-lfs +- Size and hashes from DRS object +- Creation date from git-lfs metadata +- DRS URL for retrieval + +### 4. Build Directory Tree + +Parses file paths to construct the directory hierarchy. Creates a Directory resource for each unique folder path. + +### 5. Link Everything + +Connects DocumentReferences to their parent Directories, Directories to parent Directories, and all top-level Directories to the ResearchStudy via the rootDir field. + +## ID Generation + +All resource IDs are deterministic, meaning the same input always produces the same ID. This ensures consistency across metadata updates. 
+ +**ID generation algorithm:** +``` +ID = SHA1(SHA1(endpoint) + resource_path) +``` + +**Examples:** +- DocumentReference: `SHA1(SHA1(endpoint) + file_path)` +- Directory: `SHA1(SHA1(endpoint) + directory_path)` +- ResearchStudy: `SHA1(SHA1(endpoint) + "ResearchStudy" + project_id)` + +This approach ensures: +- IDs are globally unique +- The same file always gets the same ID +- No collisions between different resources + +## Custom Extensions + +Forge adds some non-standard FHIR fields for Gen3 integration: + +### rootDir (in ResearchStudy) + +Links the ResearchStudy to the root Directory resource. + +```json +{ + "resourceType": "ResearchStudy", + "rootDir": { + "reference": "Directory/root-id" + } +} +``` + +This allows Gen3 to navigate the entire directory tree starting from the project level. + +### Directory Resource + +The entire Directory resource type is a custom extension. It's not part of FHIR R5, but follows FHIR conventions for structure and references. + +## Validation + +Forge validates metadata against FHIR R5 schemas to ensure: +- Required fields are present +- Field types are correct +- Values follow FHIR constraints +- References point to valid resources + +**Run validation:** +```bash +forge validate data +``` + +**Common validation errors:** +- Missing required fields (status, id, resourceType) +- Invalid field types (string vs number) +- Invalid references (pointing to non-existent resources) +- Malformed dates or timestamps + +## Updating Metadata + +When you add or modify files and run `forge publish` again, Forge either uses the metadata provided or regenerates all metadata: + +1. Existing DocumentReferences are updated with new information +2. New files get new DocumentReference resources +3. Deleted files have their DocumentReferences removed +4. Directory structure is rebuilt to reflect current state + +Because IDs are deterministic, the same files keep the same IDs across updates. 
+ + +## See Also + +- [Commands Reference](commands.md) - Using `forge meta` and `forge validate` +- [Getting Started](getting-started.md) - Basic workflow +- [FHIR R5 Specification](https://hl7.org/fhir/R5/) - Official FHIR documentation +- [Gen3 Documentation](https://gen3.org) - Gen3 platform details diff --git a/metadata/directory.go b/metadata/directory.go index 7ce73a6..aa153e0 100644 --- a/metadata/directory.go +++ b/metadata/directory.go @@ -166,7 +166,6 @@ func BuildDirectoryTreeFromDocRef(endpoint string, docRef *drpb.DocumentReferenc } if !isAlreadyLinked { - fmt.Println("HELLO WE HERE") parentDir.Child = append(parentDir.Child, fileRef) } } diff --git a/metadata/meta.go b/metadata/meta.go index 47268e6..18d9e7f 100644 --- a/metadata/meta.go +++ b/metadata/meta.go @@ -76,7 +76,7 @@ func CreateMeta(outPath string, remote config.Remote) error { idxCl, ok := val.(*indexd_client.IndexDClient) if !ok { - return fmt.Errorf("Config is not IndexDClient") + return fmt.Errorf("config is not IndexDClient") } marshaller, err := jsonformat.NewMarshaller(false, "", "", fver.R5) @@ -364,7 +364,7 @@ func processDRSRecordsAndUpdateFHIR(drsRecords []*drs.DRSObject, LfsRecords []LF containedResource = templateDocRef(drsRecord, endpoint, project, researchStudyID) } - if foundMatch == true { + if foundMatch { break } } From b9ffacdebda022f4c6806c0626ab673b7b55f84c Mon Sep 17 00:00:00 2001 From: quinnwai Date: Fri, 9 Jan 2026 12:50:14 -0800 Subject: [PATCH 2/3] bonk --- metadata/meta.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata/meta.go b/metadata/meta.go index 18d9e7f..bc13be3 100644 --- a/metadata/meta.go +++ b/metadata/meta.go @@ -274,7 +274,7 @@ func getResearchStudy(fhirDirectory string, projectId string, endpoint string, m return id, nil } -type LSFIles struct { +type LSFiles struct { Files []LFSRecord `json:"files"` } @@ -297,7 +297,7 @@ func findLFSRecords() ([]LFSRecord, error) { } return nil, fmt.Errorf("failed to run git-lfs 
command: %w", err) } - var records LSFIles + var records LSFiles if err := json.Unmarshal(output, &records); err != nil { return nil, fmt.Errorf("failed to unmarshal JSON output: %w", err) } From 8a45b54c28fac23232f6854e448d5ba006f71171 Mon Sep 17 00:00:00 2001 From: quinnwai Date: Fri, 9 Jan 2026 12:57:16 -0800 Subject: [PATCH 3/3] refactor validate to use paths --- cmd/validate/main.go | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/cmd/validate/main.go b/cmd/validate/main.go index 2cbf89e..497b93b 100644 --- a/cmd/validate/main.go +++ b/cmd/validate/main.go @@ -23,13 +23,20 @@ import ( // Holds the value of the --out-dir flag var outputDir string +var dataPath string +var edgePath string +var configPath string + var ValidateParentCmd = &cobra.Command{ Use: "validate", Short: "Contains subcommands for validating config, data, and edges", } func init() { + ValidateDataCmd.Flags().StringVarP(&dataPath, "path", "p", META_PATH, "Path to metadata file(s) to validate") + ValidateEdgeCmd.Flags().StringVarP(&edgePath, "path", "p", META_PATH, "Path to metadata files directory") ValidateEdgeCmd.Flags().StringVarP(&outputDir, "out-dir", "o", "", "Directory to save vertices and edges files") + ValidateConfigCmd.Flags().StringVarP(&configPath, "path", "p", CONFIG_PATH, "Path to config file to validate") } const META_PATH = "META" @@ -37,14 +44,12 @@ const CONFIG_PATH = "CONFIG" // ValidateCmd remains unchanged var ValidateDataCmd = &cobra.Command{ - Use: "data ", - Short: "data data files given a jsonschema and a ndjson data target file or directory", - Args: cobra.MaximumNArgs(1), + Use: "data", + Short: "validate metadata files given a jsonschema and a ndjson data target file or directory", + Args: cobra.NoArgs, + Long: "Validates metadata files. 
Use --path to specify a file or directory (defaults to META if not provided)", RunE: func(cmd *cobra.Command, args []string) error { - path := META_PATH - if len(args) > 0 { - path = args[0] - } + path := dataPath sch, err := schema.NewSchema() if err != nil { return errors.Wrap(err, "failed to create schema") @@ -115,15 +120,12 @@ var ValidateDataCmd = &cobra.Command{ } var ValidateEdgeCmd = &cobra.Command{ - Use: "edge ", + Use: "edge", Short: "Check for orphaned edges in graph data from FHIR .ndjson files", - Long: "Generates graph elements from FHIR .ndjson files and checks for edges referencing non-existent vertices", - Args: cobra.MaximumNArgs(1), + Long: "Generates graph elements from FHIR .ndjson files and checks for edges referencing non-existent vertices. Use --path to specify directory (defaults to META if not provided)", + Args: cobra.NoArgs, RunE: func(cmd *cobra.Command, args []string) error { - path := META_PATH - if len(args) > 0 { - path = args[0] - } + path := edgePath sch, err := schema.NewSchema() if err != nil { return errors.Wrap(err, "failed to create schema") @@ -361,14 +363,12 @@ var ValidateEdgeCmd = &cobra.Command{ } var ValidateConfigCmd = &cobra.Command{ - Use: "config ", - Short: "config explorer config file", - Args: cobra.MaximumNArgs(1), + Use: "config", + Short: "validate explorer config file", + Long: "Validates explorer config file. Use --path to specify config file (defaults to CONFIG if not provided)", + Args: cobra.NoArgs, RunE: func(cmd *cobra.Command, args []string) error { - path := CONFIG_PATH - if len(args) > 0 { - path = args[0] - } + path := configPath info, err := os.Stat(path) if err != nil {