diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..5fd1d7f --- /dev/null +++ b/.gitmodules @@ -0,0 +1,9 @@ +[submodule "products/funnel"] + path = products/funnel + url = https://github.com/ohsu-comp-bio/funnel.git +[submodule "products/grip"] + path = products/grip + url = https://github.com/bmeg/grip.git +[submodule "products/git-drs"] + path = products/git-drs + url = https://github.com/calypr/git-drs.git diff --git a/docs/.nav.yaml b/docs/.nav.yaml new file mode 100644 index 0000000..5a6659e --- /dev/null +++ b/docs/.nav.yaml @@ -0,0 +1,19 @@ + +nav: +- index.md +- getting-started +- Funnel: + - funnel/* +- GRIP: + - grip/* +- Git-DRS: [] +- CALYPR: + - Getting Started: + - requirements.md + - getting-started.md + - Data Management: + - data-management/git-drs.md + - data-management/meta-data.md + - data-model/integration.md + - data-model/introduction.md + - data-model/metadata.md diff --git a/docs/assets/banner.png b/docs/assets/banner.png new file mode 100644 index 0000000..2d2525d Binary files /dev/null and b/docs/assets/banner.png differ diff --git a/docs/assets/banner_fade.png b/docs/assets/banner_fade.png new file mode 100644 index 0000000..b66a8cf Binary files /dev/null and b/docs/assets/banner_fade.png differ diff --git a/docs/assets/calypr_family.png b/docs/assets/calypr_family.png new file mode 100644 index 0000000..c33677c Binary files /dev/null and b/docs/assets/calypr_family.png differ diff --git a/docs/assets/funnel.png b/docs/assets/funnel.png new file mode 100644 index 0000000..6830652 Binary files /dev/null and b/docs/assets/funnel.png differ diff --git a/docs/assets/git-drs.png b/docs/assets/git-drs.png new file mode 100644 index 0000000..3ed8d9d Binary files /dev/null and b/docs/assets/git-drs.png differ diff --git a/docs/assets/grip.png b/docs/assets/grip.png new file mode 100644 index 0000000..aeb6f9b Binary files /dev/null and b/docs/assets/grip.png differ diff --git a/docs/calypr/.nav.yaml b/docs/calypr/.nav.yaml new file mode 100644 index 0000000..2696c40 --- /dev/null +++ b/docs/calypr/.nav.yaml @@ -0,0 +1 @@ +title: CALYPR \ No newline at end of file diff --git a/docs/workflows/query.md b/docs/calypr/analysis/query.md similarity index 100% rename from docs/workflows/query.md rename to docs/calypr/analysis/query.md diff --git a/docs/workflows/add-users.md b/docs/calypr/calypr-admin/add-users.md similarity index 85% rename from docs/workflows/add-users.md rename to docs/calypr/calypr-admin/add-users.md index 86cedac..b2f06cd 100644 --- a/docs/workflows/add-users.md +++ b/docs/calypr/calypr-admin/add-users.md @@ -12,12 +12,12 @@ There are two ways to request the addition additional users to the project: To give another user full access to the project, run the following: ```sh -g3t collaborator add --write user-can-write@example.com +calypr-admin collaborator add --write user-can-write@example.com ``` Alternatively, to give another user read access only (without the ability to upload to the project), run the following: ```sh -g3t collaborator add user-read-only@example.com +calypr-admin collaborator add user-read-only@example.com ``` diff --git a/docs/workflows/approve-requests.md b/docs/calypr/calypr-admin/approve-requests.md similarity index 90% rename from docs/workflows/approve-requests.md rename to docs/calypr/calypr-admin/approve-requests.md index 0c836d0..9a10684 100644 --- a/docs/workflows/approve-requests.md +++ b/docs/calypr/calypr-admin/approve-requests.md @@ -16,8 +16,8 @@ * Ony users with the steward role can approve and sign 
a request ```text -g3t collaborator approve --help -Usage: g3t collaborator approve [OPTIONS] +calypr-admin collaborator approve --help +Usage: calypr-admin collaborator approve [OPTIONS] Sign an existing request (privileged). @@ -40,9 +40,9 @@ Note: This example uses the ohsu program, but the same process applies to all pr ```text ## As an admin, I need to grant data steward privileges add the requester reader and updater role on a program to an un-privileged user -g3t collaborator add add data_steward_example@.edu --resource_path /programs//projects --steward +calypr-admin collaborator add add data_steward_example@.edu --resource_path /programs//projects --steward # As an admin, approve that request -g3t collaborator approve +calypr-admin collaborator approve diff --git a/docs/workflows/creating-project.md b/docs/calypr/calypr-admin/creating-project.md similarity index 97% rename from docs/workflows/creating-project.md rename to docs/calypr/calypr-admin/creating-project.md index 0239524..bd98872 100644 --- a/docs/workflows/creating-project.md +++ b/docs/calypr/calypr-admin/creating-project.md @@ -2,12 +2,12 @@ title: Creating a Project --- -{% include '/note.md' %} + ## CLI ```bash -$ g3t init --help +$ git-drs init --help Usage: g3t init [OPTIONS] [PROJECT_ID] diff --git a/docs/workflows/common-errors.md b/docs/calypr/calypr-projects/common-errors.md similarity index 92% rename from docs/workflows/common-errors.md rename to docs/calypr/calypr-projects/common-errors.md index eea553e..efb7c08 100644 --- a/docs/workflows/common-errors.md +++ b/docs/calypr/calypr-projects/common-errors.md @@ -1,11 +1,11 @@ # Common Errors ## .ndjson is out of date -**Error:** After `g3t` adding and committing a file, when you go to submit your data, "DocumentReference.ndjson is out of date", +**Error:** After `git-drs` adding and committing a file, when you go to submit your data, "DocumentReference.ndjson is out of date", ```sh -$ g3t add file.txt -$ g3t commit -m "adding file.txt" -$ g3t push +$ git add file.txt +$ git commit -m "adding file.txt" +$ git push Please correct issues before pushing. Command `g3t status` failed with error code 1, stderr: WARNING: DocumentReference.ndjson is out of date 1969-12-31T16:00:00. The most recently changed file is MANIFEST/file.txt.dvc 2025-02-28T09:24:46.283870. Please check DocumentReferences.ndjson No data file changes. diff --git a/docs/workflows/metadata.md b/docs/calypr/calypr-projects/metadata.md similarity index 97% rename from docs/workflows/metadata.md rename to docs/calypr/calypr-projects/metadata.md index f44204a..1d5693b 100644 --- a/docs/workflows/metadata.md +++ b/docs/calypr/calypr-projects/metadata.md @@ -43,8 +43,7 @@ Depending on if a `patient` or `specimen` flag was specified, other resources ca To add a cram file that's associated with a subject, sample, and particular task ```sh -g3t add myfile.cram --patient P0 --specimen P0-BoneMarrow --task_id P0-Sequencing -g3t meta init +git add myfile.cram --patient P0 --specimen P0-BoneMarrow --task_id P0-Sequencing ``` This will produce metadata with the following relationships: @@ -54,8 +53,8 @@ This will produce metadata with the following relationships: When the project is committed, the system will validate new or changed records. You may validate the metadata on demand by: ```sh -$ g3t meta validate --help -Usage: g3t meta validate [OPTIONS] DIRECTORY +$ forge meta validate --help +Usage: forge meta validate [OPTIONS] DIRECTORY Validate FHIR data in DIRECTORY. 
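For reference, the on-demand check described above can be pointed at the conventional `META/` directory; a minimal invocation, following the usage shown, would be:

```sh
# Validate the project's FHIR metadata before committing.
# META is the conventional metadata directory; adjust if your layout differs.
forge meta validate META
```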
diff --git a/docs/calypr/data-management/git-drs.md b/docs/calypr/data-management/git-drs.md new file mode 100644 index 0000000..9cdc96a --- /dev/null +++ b/docs/calypr/data-management/git-drs.md @@ -0,0 +1,69 @@ + +## **3.5: Commit and Upload you files** + +\# Commit files (creates DRS records via pre-commit hook) +``` +git commit -m "Add genomic data files" +``` + +\# Upload to object store +``` +git push +``` + +What happens during push: + +1. Git-DRS creates DRS records for each tracked file +2. Files are uploaded to the configured S3 bucket +3. DRS URIs are registered in the Gen3 system +4. Pointer files are committed to the repository + +## + +### 3.5.1 Verifying upload + +``` +git lfs ls-files +``` + +Files should now show \* prefix (localized/uploaded): + +``` +* data/sample1.bam +* data/sample2.bam +* results/analysis.vcf.gz +``` + +The \- prefix means files are staged but not yet committed. + +After completing the workflow: + +* Files visible in Git repository (as LFS pointers) +* DRS records created (check .drs/ logs) +* Files accessible via git lfs pull +* Can share DRS URIs with collaborators +* Files NOT searchable in CALYPR web interface (expected) + +## 4.5: Committing Changes + +``` +# Stage all changes +git add . +``` + +``` +# Commit (triggers forge precommit hook) +git commit \-m "Register S3 files with custom FHIR metadata" +``` + +``` +# Push to register DRS records +git push +``` + +What happens during push: + +1. Git-DRS creates DRS records pointing to S3 +2. DRS URIs are registered +3. No file upload occurs +4. Pointer files committed to repository \ No newline at end of file diff --git a/docs/calypr/data-management/meta-data.md b/docs/calypr/data-management/meta-data.md new file mode 100644 index 0000000..5b2ba65 --- /dev/null +++ b/docs/calypr/data-management/meta-data.md @@ -0,0 +1,174 @@ +# Managing Metadata + +Metadata in Calypr is formatted using the Fast Healthcare Interoperability Resources (FHIR) schema. If you choose to bring your own FHIR newline delimited json data, you will need to create a directory called “META” in your git-drs repository in the same directory that you initialized your git-drs repository, and place your metadata files in that directory. +The META/ folder contains newline-delimited JSON (.ndjson) files representing FHIR resources describing the project, its data, and related entities. Large files are tracked using Git LFS, with a required correlation between each data file and a DocumentReference resource. This project follows a standardized structure to manage large research data files and associated FHIR metadata in a version-controlled, DRS and FHIR compatible format. +Each file must contain only one type of FHIR resource type, for example META/ResearchStudy.ndjson only contains research study resource typed FHIR objects. The name of the file doesn’t have to match the resource type name, unless you bring your own document references, then you must use DocumentReference.ndjson. For all other FHIR file types this is simply a good organizational practice for organizing your FHIR metadata. + +## META/ResearchStudy.ndjson + +* The File directory structure root research study is based on the 1st Research Study in the document. This research study is the research study that the autogenerated document references are connected to. Any additional research studies that are provided will be ignored when populating the miller table file tree. +* Contains at least one FHIR ResearchStudy resource describing the project. 
+* Defines project identifiers, title, description, and key attributes. + +## META/DocumentReference.ndjson + +* Contains one FHIR DocumentReference resource per Git LFS-managed file. +* Each DocumentReference.content.attachment.url field: + * Must exactly match the relative path of the corresponding file in the repository. + * Example: + +{ + "resourceType": "DocumentReference", + "id": "docref-file1", + "status": "current", + "content": \[ + { + "attachment": { + "url": "data/file1.bam", + "title": "BAM file for Sample X" + } + } + \] +} + +Place your custom FHIR ndjson files in the META/ directory: + +\# Copy your prepared FHIR metadata +cp \~/my-data/patients.ndjson META/ +cp \~/my-data/observations.ndjson META/ +cp \~/my-data/specimens.ndjson META/ +cp \~/my-data/document-references.ndjson META/ + +## Other FHIR data + +\[TODO More intro text here\] + +* Patient.ndjson: Participant records. +* Specimen.ndjson: Biological specimens. +* ServiceRequest.ndjson: Requested procedures. +* Observation.ndjson: Measurements or results. +* Other valid FHIR resource types as required. + +## Link Files to Metadata + +Ensure your FHIR DocumentReference resources reference the DRS URIs: + +Example DocumentReference linking to S3 file: + +{ + "resourceType": "DocumentReference", + "id": "doc-001", + "status": "current", + "content": \[{ + "attachment": { + "url": "drs://calypr-public.ohsu.edu/your-drs-id", + "title": "sample1.bam", + "contentType": "application/octet-stream" + } + }\], + "subject": { + "reference": "Patient/patient-001" + } +} + + +--- + +## Validating Metadata + +To ensure that the FHIR files you have added to the project are correct and pass schema checking, you can use the forge software. + +forge validate + +Successful output: + +✓ Validating META/patients.ndjson... OK +✓ Validating META/observations.ndjson... OK +✓ Validating META/specimens.ndjson... OK +✓ Validating META/document-references.ndjson... OK +All metadata files are valid. + +Fix any validation errors and re-run until all files pass. + + +### Forge Data Quality Assurance Command Line Commands + +If you have provided your own FHIR resources there are two commands that might be useful to you for ensuring that your FHIR metadata will appear on the CALYPR data platform as expected. These commands are validate and check-edge + +**Validate-** Example: + +\`\`\`forge validate META\`\`\` or \`\`\`forge validate META/DocumentReference.ndjson\`\`\` + +Validate checks to see if the provided directory or file will be accepted by the CALYPR data platform or whether there are validation errors that make it not accepted into the data platform. Validation errors range from improper JSON formatting to FHIR schema validation errors. We are currently using FHIR version R5 so the earlier version will not validate against our schema. + +**Check-edge-** Example: + +\`\`\`forge check-edge META\`\`\` or \`\`\`forge validate META/DocumentReference.ndjson\`\`\` + +Check edge emulates exactly what will happen during data submission to your FHIR files. Your FHIR files will be loaded into a graph database. In order to create the graph edges must be generated from the references specified in your FHIR data to connect your vertices, which are essentially the rest of the NDJSON FHIR files that have been provided. + +Check edge aims to ensure that the references that have been specified in the files do connect to known vertices and aren’t ‘orphaned’. 
Check edge does not take into account existing vertices that are already in the CALYPR graph and could potentially claim certain edges do not connect to anything if they are connecting to vertices that are in CALYPR but outside of the data that is provided when doing an edge check. + +### Validation Process + +#### 1\. Schema Validation + +* Each .ndjson file in META/ (like ResearchStudy.ndjson, DocumentReference.ndjson, etc.) is read line by line. +* Every line is parsed as JSON and checked against the corresponding FHIR schema for that resourceType. +* Syntax errors, missing required fields, or invalid FHIR values trigger clear error messages with line numbers. + +#### 2\. Mandatory Files Presence + +* Confirms that: + * ResearchStudy.ndjson exists and has at least one valid record. + * DocumentReference.ndjson exists and contains at least one record. +* If either is missing or empty, validation fails. + +#### 3\. One-to-One Mapping of Files to DocumentReference + +* Scans the working directory for Git LFS-managed files in expected locations (e.g., data/). +* For each file, locates a corresponding DocumentReference resource whose content.attachment.url matches the file’s relative path. +* Validates: + * All LFS files have a matching DocumentReference. + * All DocumentReferences point to existing files. + +#### 4\. Project-level Referential Checks + +* Validates that DocumentReference resources reference the same ResearchStudy via relatesTo or other linking mechanisms. +* If FHIR resources like Patient, Specimen, ServiceRequest, Observation are present, ensures: + * Their id fields are unique. + * DocumentReference correctly refers to those resources (e.g., via subject or related fields). + +#### 5\. Cross-Entity Consistency + +* If multiple optional FHIR .ndjson files exist: + * Confirms IDs referenced in one file exist in others. + * Detects dangling references (e.g., a DocumentReference.patient ID that's not in Patient.ndjson). + +--- + +#### ✅ Example Error Output + +ERROR META/DocumentReference.ndjson line 4: url "data/some\_missing.bam" does not resolve to an existing file +ERROR META/Specimen.ndjson line 2: id "specimen-123" referenced in Observation.ndjson but not defined + +--- + +#### 🎯 Purpose & Benefits + +* Ensures all files and metadata are in sync before submission. +* Prevents submission failures due to missing pointers or invalid FHIR payloads. +* Enables CI integration, catching issues early in the development workflow. + +--- + +#### Validation Requirements + +Automated tools or CI processes must: + +* Verify presence of META/ResearchStudy.ndjson with at least one record. +* Verify presence of META/DocumentReference.ndjson with one record per LFS-managed file. +* Confirm every DocumentReference.url matches an existing file path. +* Check proper .ndjson formatting. 
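Before handing the metadata to forge, a quick local sanity check can catch broken file pointers early. The sketch below assumes `jq` is installed and only covers attachment URLs that are relative paths in the repository (DRS, S3, or HTTP URLs are skipped):

```sh
# Report DocumentReference attachment URLs that do not resolve to a file
# in the working tree. NDJSON is processed one JSON object per line.
jq -r '.content[].attachment.url' META/DocumentReference.ndjson \
  | grep -v '://' \
  | while read -r url; do
      [ -f "$url" ] || echo "MISSING: $url"
    done
```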
+ +--- \ No newline at end of file diff --git a/docs/data-model/integration.md b/docs/calypr/data-model/integration.md similarity index 100% rename from docs/data-model/integration.md rename to docs/calypr/data-model/integration.md diff --git a/docs/data-model/introduction.md b/docs/calypr/data-model/introduction.md similarity index 100% rename from docs/data-model/introduction.md rename to docs/calypr/data-model/introduction.md diff --git a/docs/data-model/metadata.md b/docs/calypr/data-model/metadata.md similarity index 100% rename from docs/data-model/metadata.md rename to docs/calypr/data-model/metadata.md diff --git a/docs/calypr/getting-started/environment.md b/docs/calypr/getting-started/environment.md new file mode 100644 index 0000000..222f306 --- /dev/null +++ b/docs/calypr/getting-started/environment.md @@ -0,0 +1,189 @@ +# Environment Initialization + +All tools can be installed via pip, conda, or binary releases. +The following steps assume a Unix‑like shell (bash/zsh). + +## Install Git & Git‑LFS & Git-DRS + +Calypr project management is handled using git. If you already have that installed, you'll need the Large File Storage (LFS) plugin that allows git to track files that are bigger than the standard text source code it was originally designed to work with. You'll also need the git-drs plugin, that talks directly to Calyp's storage and indexing system. +``` +# Install Git +sudo apt-get update && sudo apt-get install \-y git + +# Install Git‑LFS +curl \-s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash +sudo apt-get install \-y git-lfs + +# Enable Git‑LFS for the repo +git lfs install + +# Install git-drs +\[TODO\] Put instructions here +``` + +Once these elements are set up, you'll need to copy in the API credentials you obtained in section 2\. + +# Initialize git-drs +git drs init \--cred \~/Downloads/calypr-credentials.json \--profile calypr + +| g3t\_etl | Given spreadsheets-style metadata, convert it into a standardized graph model | +| :---- | :---- | +| git-drs | Given a set of files, register them with CALYPR | +| forge | Given a set of metadata, publish it to users on the CALYPR platform | +| configurator | Given a set of metadata, customize how it’s displayed on the platform | + +Table: Tools that are part of Calypr project management + +## Clone project repository + +With your environment set up, you can clone in the project. To ensure that you don't automatically download all of the large files associated with the project (which could be several TBs and takes days to complete) make sure that you've run \[TODO: add git-lfs command here\] +``` +# Clone repository +git clone https://github.com/your-org/your-calypr-repo.git +cd your-calypr-repo +``` + +### Formatting a new project + +If you are creating a new project, you may need to initialize some of the storage parameters. These define how the DRS system stores files related to your project. + +``` +# Clone new repository +git clone https://github.com/your-org/new-calypr-repo.git +cd new-calypr-repo + +# Initialize with full configuration +git drs init \--profile calypr \\ + \--url https://calypr-public.ohsu.edu/ \\ + \--cred \~/Downloads/calypr-credentials.json \\ + \--project my-project-id \\ + \--bucket my-bucket-name +``` +Get project details from your data coordinator if needed. 
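For the clone step in the previous section, the exact command is still marked TODO above, but a standard Git LFS option (not CALYPR-specific) keeps large files as pointer files until you ask for them:

```sh
# Skip automatic LFS downloads so clones fetch pointer files only.
git lfs install --skip-smudge
git clone https://github.com/your-org/your-calypr-repo.git
cd your-calypr-repo

# Alternatively, for a single clone: GIT_LFS_SKIP_SMUDGE=1 git clone <url>

# Later, pull only the large files you actually need.
git lfs pull --include "data/sample1.bam"
```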
+ +#### Directory Structure +``` +/ +├── .gitattributes +├── .gitignore +├── META/ +│ ├── ResearchStudy.ndjson +│ ├── DocumentReference.ndjson +│ ├── Patient.ndjson (optional) +│ ├── Specimen.ndjson (optional) +│ ├── ServiceRequest.ndjson (optional) +│ ├── Observation.ndjson (optional) +│ └── \.ndjson (optional) +├── data/ +│ ├── file1.bam +│ ├── file2.fastq.gz +│ └── \ +``` + +--- + +#### Example Minimal Project +``` +my-project/ +├── .gitattributes +├── META/ +│ ├── ResearchStudy.ndjson \# 1 record +│ ├── DocumentReference.ndjson \# 2 records, one per file below +├── data/ +│ ├── sample1.bam +│ ├── sample2.fastq.gz +``` + +## Verify configuration + +You'll want to double check your storage settings, to ensure you know where files are being stored. First, use the DRS config list command: + +``` +git drs list-config +``` + +The expected output would be: + +``` +current\_server: gen3 +servers: + gen3: + endpoint: https://calypr-public.ohsu.edu/ + project\_id: my-project-id + bucket: my-bucket-name +``` + +Next you'll need check with files LFS is tracking. If LFS doesn't track a file, it could be uploaded to Github. This should be avoided because it isn't managed by the Calypr project access control system and it isn't designed to store large files. + +To view the current files that are being tracked: +``` +git lfs track +``` + +You can add more files to be tracked using the `git lfs track` command +``` +# Track specific file extensions +git lfs track "\*.bam" +git lfs track "\*.vcf.gz" +git lfs track "\*.fastq.gz" + +# Track entire directories +git lfs track "data/\*\*" + +# Commit tracking configuration +git add .gitattributes +git commit \-m "Configure LFS file tracking" +git push +``` + +--- + +## Add Your Files + +By using git-lfs and git-drs you will have a number of different options to add new files to a project. You can 1\) add a file that exists within your workspace, 2\) Add a file that has already been uploaded to an S3 bucket and 3\) Add a file that has already been registered with DRS. + +### Add local files +``` +# Add data files +git add data/sample1.bam +git add data/sample2.bam +git add results/analysis.vcf.gz + +# Verify LFS is tracking them +git lfs ls-files +``` + +Expected output: + +``` +- data/sample1.bam +- data/sample2.bam +- results/analysis.vcf.gz +``` + +### Register S3 Files + +Using Environment Variables +``` + +# Set AWS credentials +export AWS\_ACCESS\_KEY\_ID="your-access-key" +export AWS\_SECRET\_ACCESS\_KEY="your-secret-key" +# Register file +git drs add-url s3://bucket-name/path/to/file.bam \ + --sha256 abc123def456… +``` + +Using Command Flags +``` +# Register file with inline credentials +git drs add-url s3://bucket-name/path/to/file.bam \ + --sha256 abc123def456... \ + --aws-access-key "your-access-key" \ + --aws-secret-key "your-secret-key" +``` + +Using AWS Profile +\[WIP\] + +📖 More details: [Git-DRS Add-URL Docs](https://github.com/calypr/git-drs/blob/main/docs/adding-s3-files.md) diff --git a/docs/getting-started.md b/docs/calypr/getting-started/getting-started.md similarity index 95% rename from docs/getting-started.md rename to docs/calypr/getting-started/getting-started.md index 6aa94c3..3a250b4 100644 --- a/docs/getting-started.md +++ b/docs/calypr/getting-started/getting-started.md @@ -2,7 +2,8 @@ title: Getting Started --- -{% include '/note.md' %} +!!! note + The tools listed here are under development and may be subject to change. 
Use case: As an analyst, in order to share data with collaborators, I need a way to create a project, upload files and associate those files with metadata. The system should be capable of adding files in an incremental manner. diff --git a/docs/calypr/getting-started/login.md b/docs/calypr/getting-started/login.md new file mode 100644 index 0000000..6437417 --- /dev/null +++ b/docs/calypr/getting-started/login.md @@ -0,0 +1,8 @@ + + + +# Calypr Login + +To get started you'll need to log into the website and get a copy of API keys. + +\[TODO\] Instructions on how to go to website and get an API \ No newline at end of file diff --git a/docs/calypr/index.md b/docs/calypr/index.md new file mode 100644 index 0000000..98f792b --- /dev/null +++ b/docs/calypr/index.md @@ -0,0 +1,7 @@ + + +This is the page \ No newline at end of file diff --git a/docs/calypr/project-management/create-project.md b/docs/calypr/project-management/create-project.md new file mode 100644 index 0000000..474f144 --- /dev/null +++ b/docs/calypr/project-management/create-project.md @@ -0,0 +1,17 @@ + + +# Create a Project (gen3 \+ GitHub) + +Status: *Manual and DevOps‑only at the moment* + +The standard way to start a new Calypr project is to create a Git repository that will hold your FHIR NDJSON files and a set of Git‑LFS tracked files. + +For now you will need to ask a Calypr management team to create the project and provide you with the following: + +* GitHub repository URL +* Calypr project ID +* Initial git config settings (branch, remotes, etc.) + +Future Work: Automate this step with a CLI wizard. + +TODO – Write the DevOps‑only project creation guide. diff --git a/docs/calypr/project-management/custom-views.md b/docs/calypr/project-management/custom-views.md new file mode 100644 index 0000000..c9e2709 --- /dev/null +++ b/docs/calypr/project-management/custom-views.md @@ -0,0 +1,200 @@ + +# Project Customization + +## Dataframer Configuration + +The dataframer is used to render the FHIR .ndjson files into the tabular space that is used in the explorer page table. If you want to customize your project’s explorer page you will need to specify database field names that are defined in the dataframer, thus you will need to run the dataframer on your data ahead of time in order to know these field names. + +See below steps for setting up git-drs and running dataframer commands: + +\`\`\` + +python \-m venv venv + +source venv/bin/activate + +pip install gen3-tracker==0.0.7rc27 + +git-drs meta dataframe DocumentReference + +\`\`\` + +The explorer config is a large JSON document. This will be explained in further detail later but, one of the keys of note is that “guppyConfig”. The guppyConfig key is used to specify what index is to be used for the explorer page tab that you have defined. Notice that when you run \`\`\`git-drs meta dataframe\`\`\` it outputs + +\`\`\` + +Usage: git-drs meta dataframe \[OPTIONS\] {Specimen|DocumentReference|ResearchSubjec + + t|MedicationAdministration|GroupMember} + + \[DIRECTORY\_PATH\] \[OUTPUT\_PATH\] + +Try 'git-drs meta dataframe \--help' for help. + +\`\`\` + +Where Specimen|DocumentReference… are the supported indices that can be run in the dataframe and defined in the explorerConfig under ‘guppyConfig’ key name. + +Note that the guppyConfig index names use snake\_case formatting whereas the dataframer uses uppercase for each word. 
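Putting the steps above together, a concrete run could look like the following. The directory and output paths are illustrative rather than required names; the comment shows the corresponding snake_case index name used in the explorer config:

```sh
# Build the DocumentReference dataframe (paths are examples only).
git-drs meta dataframe DocumentReference META/ document_reference.xlsx

# The matching explorer tab would then point at the snake_case index:
#   "guppyConfig": { "dataType": "document_reference" }
```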
+ +## 5.2 Explorer Page Configuration + +Forge currently supports customization of explorer pages by routing to: [https://commons-url/Explorer/\[program\]-\[project](https://commons-url/Explorer/[program]-[project)\] + +Explorer Configs can be customized by running "\`\`Forge config init\`\`\` and then filling out the template config that has been provided to you. + +The first thing you should notice is that the explorer config is a JSON document with 1 top level key called “explorerConfig” which can host a list of “tab” configs. Looking at the image above the tabs called “Patient”, “Specimen”, and “File” Denote a list element in this explorer config. + +In this example if you look at the “guppyConfig” key you will notice that the dataType specified for this tab is “document\_reference”, this is why we ran the DocumentReference dataframer command above. It will create the document reference data frame so that you can select database field names from the Excel spreadsheet that is created from running this command. + +\`\`\`{ + "explorerConfig": \[ + { + "tabTitle": "TEST", + "guppyConfig": { + "dataType": "document\_reference", + "nodeCountTitle": "file Count", + "fieldMapping": \[\] + }, + "filters": { + "tabs": \[ + { + "title": "Filters", + "fields": \[ + "document\_reference\_assay", + "document\_reference\_creation", + "project\_id" + \], + "fieldsConfig": { + "project\_id": { + "field": "project\_id", + "dataField": "", + "index": "", + "label": "Project Id", + "type": "enum" + }, + "assay": { + "field": "document\_reference\_assay", + "dataField": "", + "index": "", + "label": "Assay", + "type": "enum" + }, + "creation": { + "field": "document\_reference\_creation", + "dataField": "", + "index": "", + "label": "Creation", + "type": "enum" + } + } + } + \] + }, + "table": { + "enabled": true, + "fields": \[ + "project\_id", + "document\_reference\_assay", + "document\_reference\_creation" + \], + "columns": { + "project\_id": { + "field": "project\_id", + "title": "Project ID" + }, + "assay": { + "field": "document\_reference\_assay", + "title": "Assay" + }, + "creation": { + "field": "document\_reference\_creation", + "title": "Creation" + } + } + }, + "dropdowns": {}, + "buttons": \[\], + "loginForDownload": false + } + \] +} +\`\`\` +And here is what this config looks like in the frontend: + +Note that since there is only one element in the explorerConfig there is only one tab called “TEST” in the explorer page which is housed as “tabTitle” in the config. + +#### Filters + +The next important section is the “filters” key. This defines the filters column on the left-hand side of the page. Within that block there is the “fields” key and the “fieldsConfig” key. The fields key is used to specify the names of the fields that you want to filter on. In order to get the names of the fields you will need to install git-drs via PYPI and run a dataframer command which essentially creates this explorer table dataframe, so that you can configure in the frontend what parts of this dataframe you want to be shown. + +Now, going back to the configuration, these fields that were specified come directly from the column names at the top of the excel spreadsheet that are generated from running the dataframer command. You can choose any number / combination of these column names, but note that in any list that is specified in this config, the elements in the list are rendered in the frontend in that exact order that is specified. 
+ +The “fieldsConfig” key is a decorator dict that is optional but can be applied to every filter that is specified. Notice that the “label” key is used to denote the preferred display name that is to be used for the database key name that was taken from the dataframer excel spreadsheet. + +#### Table + +The last import section is the “table” key. Like with the filters structure, “fields” is used to denote all of the database column names that should be displayed in the explorer table. Also similar to the filters structure, “columns” is where you specify the label that you want displayed for the database field. In this case it is “field” is the db name and “title” is the label display name. + +The rest of the config is templating that is needed for the explorer page to load, but not anything that is directly useful. + +#### Shared Filters + +Imagine you want to filter on multiple index facets, similar to a RESTFUL join operation. Like for example give me all of the PATIENTS who belong on this project\_id that also have a specimen that matches this project\_id. + +This is known as “shared filtering” because you are making the assumption that you want to carry your filters over to the new node when you click a new tab. This only works if there exists an equivalent field on the other index/tab, so it must be configurable and is not applicable for all normal filterable fields. + +It sounds complex but setting it up isn;’t that complex at all. Simply specify a filter that you want to do shared filtering on, ie: “project\_id”, then specify the indices and the field names for each index that the field is shared on. For our purposes project\_id is known as project\_id on all indices but this may not always be the case, and proper inspection or knowledge of the dataset may be required to determine this. + +Then you simply specify each “shared filter” as a JSON dictionary list element under the field that you have specified and you have successfully setup shared filtering on that field. In order to define additional shared filters, it is as simple as adding another key under the “defined” dictionary key and specifying a list of indices and fields that the shared filter can be joined on. See the example below for details. + +"sharedFilters": { + "defined": { + "project\_id": \[ + { "index": "research\_subject", "field": "project\_id" }, + { "index": "specimen", "field": "project\_id" }, + { "index": "document\_reference", "field": "project\_id" } + \], + } + }, + +## 5.3 Configurator + +Now that you have the basics down tThis frontend GUI might start to make some sense. Notice this is the exact same config that was shown earlier, except it is customizable via the GUI so that you don’t need to wrestle with the JSON to get a working, correctly formatted config. Notice also that there is a 3rd column here: Charts. Charts are defined very simply: + + "charts": { + + "specimen\_collection": { + + "chartType": "fullPie", + + "title": "Metastasis Site" + + }, + +Just provide the DB column name as the parent key, and then the chart type and the label title of the chart. The chart will generate a binned histogram counts style chart. 
Currently only “fullPie”, “bar” or “donut” type charts are supported but in the future other chart types might be added + +As stated earlier, configs have a very specific naming convention: \[program\]-\[project\].json and will be rejected if you do not have write permissions on the program, project configuration that is specified or if the name of the configuration is not of that form. You can also load any configs that you have access to too, an edit them and then repost them. + +All customizable explorer pages are viewable when routing to /Explorer/\[program\]-\[project\] assuming that all database fields that are specified exist in the db. + +# **Advanced Docs** + +--- + +# **🧬 Managing Identifiers with calypr meta** + +This guide explains how to manage dataset identifiers, both manually and through the command line, and how those identifiers integrate with Git-LFS and git-drs for reproducible, FAIR-compliant data management. + +### 🧭 Introduction: Where This Fits in Your Research Data Lifecycle + +This document applies once you’ve begun organizing data files for a research study and are ready to make their metadata machine-readable and FAIR-compliant. Researchers typically progress through several stages: + +1. **Files only**: you start with a set of raw or processed data files associated with a research study. +2. **Files with identifiers**: each file is linked to key entities such as Patients, Specimens, or Assays using META/identifiers.tsv. +3. **Files with identifiers \+ attributes**: you begin adding structured tabular metadata (e.g., Patient.tsv, Specimen.tsv, Observation.tsv) describing those entities. +4. **Files with complete FHIR metadata**: you can now transform these TSVs into fully-formed FHIR resources (Patient.ndjson, Specimen.ndjson, etc.) suitable for sharing, indexing, and integration with clinical or genomic data platforms. + +This guide focuses on stage 2,3 — converting well-structured TSV metadata files into standard FHIR resources, while validating that every entity’s identifier corresponds to the entries defined in META/identifiers.tsv. + +--- \ No newline at end of file diff --git a/docs/calypr/project-management/publishing-project.md b/docs/calypr/project-management/publishing-project.md new file mode 100644 index 0000000..f068e64 --- /dev/null +++ b/docs/calypr/project-management/publishing-project.md @@ -0,0 +1,49 @@ + +## 4.6: Publishing changes to Gen3 + +In order to publish metadata to CALYPR, regardless of whether you have provided your own metadata or you are simply uploading files to the system, if you want these files to be viewable in the CALYPR site, you will need to publish your data. Publishing data is done with the Forge command line utility. + +Since forge relies on your Github repository in order to know which files should have metadata records on the CALYPR platform, a Github personal access token is needed for [source.ohsu.edu](http://source.ohsu.edu) . To create your own personal access token login to [https://source.ohsu.edu/settings/tokens](https://source.ohsu.edu/settings/tokens) click “generate new token”. Make sure the token has clone permissions at the minimum. + +Then run the forge publish command: + +\`\`\`forge publish \[your\_generated\_access\_token\]\`\`\` + +\[Insert Basic information on explorer page and what it is used for here\] + +forge publish \ + +What happens: + +1. Forge validates your GitHub Personal Access Token +2. Packages repository information +3. Submits a Sower job to Gen3 +4. 
Gen3 ingests FHIR metadata from META/ +5. Metadata becomes searchable in CALYPR + +Successful output: + +✓ Personal Access Token validated +✓ Repository information packaged +✓ Sower job submitted: job-id-12345 +✓ Metadata ingestion started + +Check job status: forge status \ +Get all job ids: forge list + +📖 More details: [Forge Publish Command](https://github.com/copilot/tools/forge/commands.md#forge-publish) + +--- + +### Verification Checklist + +After completing the workflow: + +* LFS pointer files in Git repository +* DRS records created (check .drs/ logs) +* DRS URIs point to S3 locations +* Metadata files validated successfully +* Sower job completed without errors +* Data searchable in CALYPR web interface +* Can query patients/observations in Gen3 +* Files accessible via S3 (no duplicate storage) \ No newline at end of file diff --git a/docs/requirements.md b/docs/calypr/requirements.md similarity index 100% rename from docs/requirements.md rename to docs/calypr/requirements.md diff --git a/docs/workflows/download-single-file.png b/docs/calypr/website/download-single-file.png similarity index 100% rename from docs/workflows/download-single-file.png rename to docs/calypr/website/download-single-file.png diff --git a/docs/workflows/explorer.png b/docs/calypr/website/explorer.png similarity index 100% rename from docs/workflows/explorer.png rename to docs/calypr/website/explorer.png diff --git a/docs/workflows/file-list.png b/docs/calypr/website/file-list.png similarity index 100% rename from docs/workflows/file-list.png rename to docs/calypr/website/file-list.png diff --git a/docs/workflows/file-manifest.png b/docs/calypr/website/file-manifest.png similarity index 100% rename from docs/workflows/file-manifest.png rename to docs/calypr/website/file-manifest.png diff --git a/docs/workflows/portal-download.md b/docs/calypr/website/portal-download.md similarity index 97% rename from docs/workflows/portal-download.md rename to docs/calypr/website/portal-download.md index b3047f4..b75773b 100644 --- a/docs/workflows/portal-download.md +++ b/docs/calypr/website/portal-download.md @@ -2,8 +2,6 @@ title: Download --- -{% include '/note.md' %} - There are two main ways to download files: 1. Individually through the browser or through the command line with the `gen3-client` diff --git a/docs/workflows/portal-explore.md b/docs/calypr/website/portal-explore.md similarity index 100% rename from docs/workflows/portal-explore.md rename to docs/calypr/website/portal-explore.md diff --git a/docs/index.md b/docs/index.md index 7ad8184..6a4fed8 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,16 +1,69 @@ -# Welcome to the CALYPR Documentation +--- +template: home.html +hide: + - navigation + - toc + - header +--- -![CALYPR site](./images/website_header.png) + + -This documentation will walk you through the steps for submitting data to the [CALYPR Data Commons](https://calypr.ohsu.edu.org). +
+ +
+
+ CALYPR +
+
+

CALYPR

+

Scalable genomics data science platform for biological insights.

+

Next-generation genomics data science platform with scalable cloud / on-prem hybrid infrastructure, streamlining the journey from raw data to discovery.

+ Learn more +
+
-## About -The [gen3-tracker](https://github.com/CALYPR/gen3_util/) (g3t) command line utility is a combination of tools that facilitate data sharing on the CALYPR platform. It allows you to create a unified data project, upload files, and associate those files with metadata in an incremental manner. Submitted data with g3t gives you all the benefits the data platform offers: data indexing, data exploration, consolidated access, and more! + +
+
+ GRIP +
+
+

GRIP

+

Graph-based data integration for complex research datasets.

+

High-performance graph query engine that provides a unified interface across MongoDB, SQL, and key-value stores. Ideal for complex relational discovery in genomics.

+ Learn more +
+
-The following guide details the steps a data contributor must take to submit a project to the CALYPR data commons. + +
+
+ Funnel +
+
+

Funnel

+

Distributed task execution for petabyte-scale pipelines.

+

Standardized batch computing using the GA4GH TES API. Run Docker-based tasks seamlessly across AWS, Google Cloud, and Kubernetes at any scale.

+ Learn more +
+
-## Getting Started + +
+
+ Git-DRS +
+
+

Git-DRS

+

Secure data repository system with version control.

+

Manage large-scale genomic data with integrated versioning and metadata management, ensuring reproducibility and data integrity throughout research cycles.

+ Learn more +
+
+
-To navigate through each page, use pages list in the top left or using the navigation arrow on the bottom left and right! Otherwise, check out our [requirements](requirements.md) page to get started. - -![Main landing page for CALYPR IDP](./images/main-page.png) diff --git a/docs/note.md b/docs/note.md deleted file mode 100644 index 1b65636..0000000 --- a/docs/note.md +++ /dev/null @@ -1,2 +0,0 @@ -!!! note - The tools listed here are under development and may be subject to change. diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css index 638e7e8..d73b224 100644 --- a/docs/stylesheets/extra.css +++ b/docs/stylesheets/extra.css @@ -1,10 +1,27 @@ +@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap'); + /* Prevent the '$' character in shell blocks from being copied */ .gp { user-select: none; } +:root { + --md-primary-fg-color: #0057B7; + --md-primary-fg-color--light: #4698CA; + --card-background: #ffffff; + --card-shadow: 0 4px 20px rgba(0, 0, 0, 0.08); + --card-shadow-hover: 0 12px 30px rgba(0, 0, 0, 0.12); + --text-muted: #64748b; + --transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1); +} + +body { + font-family: 'Inter', -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; +} + h1, h2, h3 { - font-weight: bold !important; + font-weight: 700 !important; + letter-spacing: -0.02em; } /* horizontal dividers */ @@ -13,13 +30,160 @@ h1, h2, h3 { display: block; width: 100%; height: 1px; - background-color: lightgrey; + background-color: #e2e8f0; margin-top: 0.5em; margin-bottom: 1.5em; } -/* colors */ -:root > * { - --md-primary-fg-color: #0057B7; - --md-primary-fg-color--light: #4698CA; +/* Hero section container */ +.md-hero { + background-image: linear-gradient(135deg, var(--md-primary-fg-color), #1e40af); + background: + linear-gradient(rgba(0, 48, 102, 0.4), rgba(0, 48, 102, 0.4)), + url("../assets/banner_fade.png"); + background-size: cover; + background-position: center; + color: white; + padding: 6rem 0; + clip-path: ellipse(150% 100% at 50% 0%); +} + +.md-hero__inner { + display: flex; + flex-direction: column; + align-items: center; + text-align: center; +} + +.md-hero__content h1 { + font-size: 3rem; + font-weight: 800; + margin-bottom: 1rem; + text-shadow: 0 2px 10px rgba(0,0,0,0.1); +} + +.md-hero__content div { + font-size: 1.25rem; + max-width: 40rem; + margin-bottom: 2rem; + opacity: 0.95; + font-weight: 400; +} + +/* Product Grid */ +.product-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); + gap: 2rem; + max-width: 1100px; + margin: -3rem auto 4rem; + padding: 0 1rem; + position: relative; + z-index: 10; +} + +/* Professional Product Card */ +.product-card { + background: var(--card-background); + border-radius: 12px; + box-shadow: var(--card-shadow); + overflow: hidden; + transition: var(--transition); + border: 1px solid rgba(226, 232, 240, 0.8); + display: flex; + flex-direction: column; +} + +.product-card:hover { + transform: translateY(-8px); + box-shadow: var(--card-shadow-hover); + border-color: var(--md-primary-fg-color--light); +} + +.product-card__image-wrap { + position: relative; + width: 100%; + height: 180px; + background: #f8fafc; + border-bottom: 1px solid #f1f5f9; + display: flex; + align-items: center; + justify-content: center; + overflow: hidden; +} + +.product-card__image { + max-width: 80%; + max-height: 80%; + object-fit: contain; + transition: var(--transition); +} + +.product-card:hover .product-card__image { + transform: 
scale(1.05); +} + +.product-card__content { + padding: 1.5rem; + flex-grow: 1; + display: flex; + flex-direction: column; +} + +.product-card__title { + color: #0f172a; + font-size: 1.5rem; + font-weight: 700; + margin-bottom: 0.75rem; +} + +.product-card__summary { + color: #334155; + font-size: 0.95rem; + font-weight: 500; + margin-bottom: 0.75rem; + line-height: 1.4; +} + +.product-card__description { + color: var(--text-muted); + font-size: 0.875rem; + line-height: 1.6; + margin-bottom: 1.5rem; + flex-grow: 1; +} + +.product-card__link { + display: inline-flex; + align-items: center; + color: var(--md-primary-fg-color); + font-weight: 600; + font-size: 0.95rem; + text-decoration: none; + transition: var(--transition); +} + +.product-card__link i { + margin-left: 0.25rem; + transition: var(--transition); +} + +.product-card__link:hover { + color: #1a73e8; +} + +.product-card__link:hover i { + transform: translateX(4px); +} + +/* Responsive */ +@media screen and (max-width: 768px) { + .product-grid { + grid-template-columns: 1fr; + margin-top: 2rem; + } + + .md-hero__content h1 { + font-size: 2.25rem; + } } diff --git a/docs/tools/.nav.yaml b/docs/tools/.nav.yaml new file mode 100644 index 0000000..508b931 --- /dev/null +++ b/docs/tools/.nav.yaml @@ -0,0 +1 @@ +title: Tools \ No newline at end of file diff --git a/docs/tools/funnel/docs.md b/docs/tools/funnel/docs.md new file mode 100644 index 0000000..fa2a781 --- /dev/null +++ b/docs/tools/funnel/docs.md @@ -0,0 +1,82 @@ +--- +title: Overview +menu: + main: + identifier: docs + weight: -1000 +--- + +# Overview + +Funnel makes distributed, batch processing easier by providing a simple task API and a set of +components which can easily adapted to a vareity of platforms. + +### Task + +A task defines a unit of work: metadata, input files to download, a sequence of Docker containers + commands to run, +output files to upload, state, and logs. The API allows you to create, get, list, and cancel tasks. + +Tasks are accessed via the `funnel task` command. There's an HTTP client in the [client package][clientpkg], +and a set of utilities and a gRPC client in the [proto/tes package][tespkg]. + +There's a lot more you can do with the task API. See the [tasks docs](/docs/tasks/) for more. + +### Server + +The server serves the task API, web dashboard, and optionally runs a task scheduler. +It serves both HTTP/JSON and gRPC/Protobuf. + +The server is accessible via the `funnel server` command and the [server package][serverpkg]. + +### Storage + +Storage provides access to file systems such as S3, Google Storage, and local filesystems. +Tasks define locations where files should be downloaded from and uploaded to. Workers handle +the downloading/uploading. + +See the [storage docs](/docs/storage/) for more information on configuring storage backends. +The storage clients are available in the [storage package][storagepkg]. + +### Worker + +A worker is reponsible for executing a task. There is one worker per task. A worker: + +- downloads the inputs +- runs the sequence of executors (usually via Docker) +- uploads the outputs + +Along the way, the worker writes logs to event streams and databases: + +- start/end time +- state changes (initializing, running, error, etc) +- executor start/end times +- executor exit codes +- executor stdout/err logs +- a list of output files uploaded, with sizes +- system logs, such as host name, docker command, system error messages, etc. 
+ +The worker is accessible via the `funnel worker` command and the [worker package][workerpkg]. + +### Node Scheduler + +A node is a service that stays online and manages a pool of task workers. A Funnel cluster +runs a node on each VM. Nodes communicate with a Funnel scheduler, which assigns tasks +to nodes based on available resources. Nodes handle starting workers when for each assigned +task. + +Nodes aren't always required. In some cases it often makes sense to rely on an existing, +external system for scheduling tasks and managing cluster resources, such as AWS Batch +or HPC systems like HTCondor, Slurm, Grid Engine, etc. Funnel provides integration with +these services that doesn't include nodes or scheduling by Funnel. + +See [Deploying a cluster](/docs/compute/deployment/) for more information about running a cluster of nodes. + +The node is accessible via the `funnel node` command and the [scheduler package][schedpkg]. + +[tes]: https://github.com/ga4gh/task-execution-schemas +[serverpkg]: https://github.com/ohsu-comp-bio/funnel/tree/master/server +[workerpkg]: https://github.com/ohsu-comp-bio/funnel/tree/master/worker +[schedpkg]: https://github.com/ohsu-comp-bio/funnel/tree/master/compute/scheduler +[clientpkg]: https://github.com/ohsu-comp-bio/funnel/tree/master/client +[tespkg]: https://github.com/ohsu-comp-bio/funnel/tree/master/proto/tes +[storagepkg]: https://github.com/ohsu-comp-bio/funnel/tree/master/storage diff --git a/docs/tools/funnel/docs/compute.md b/docs/tools/funnel/docs/compute.md new file mode 100644 index 0000000..3a1a7f6 --- /dev/null +++ b/docs/tools/funnel/docs/compute.md @@ -0,0 +1,8 @@ +--- +title: Compute +menu: + main: + weight: -5 +--- + +# Compute diff --git a/docs/tools/funnel/docs/compute/aws-batch.md b/docs/tools/funnel/docs/compute/aws-batch.md new file mode 100644 index 0000000..bebc256 --- /dev/null +++ b/docs/tools/funnel/docs/compute/aws-batch.md @@ -0,0 +1,100 @@ +--- +title: AWS Batch +menu: + main: + parent: Compute + weight: 20 +--- + +# AWS Batch + +This guide covers deploying a Funnel server that leverages [DynamoDB][0] for storage +and [AWS Batch][1] for task execution. + +## Setup + +Get started by creating a compute environment, job queue and job definition using either +the Funnel CLI or the AWS Batch web console. To manage the permissions of instanced +AWS Batch jobs create a new IAM role. For the Funnel configuration outlined +in this document, this role will need to provide read and write access to both S3 and DynamoDB. + +_Note_: We recommend creating the Job Definition with Funnel by running: `funnel aws batch create-job-definition`. +Funnel expects the JobDefinition to start a Funnel worker process with a specific configuration. +Only advanced users should consider making any substantial changes to this Job Definition. + +AWS Batch tasks, by default, launch the ECS Optimized AMI which includes +an 8GB volume for the operating system and a 22GB volume for Docker image and metadata +storage. The default Docker configuration allocates up to 10GB of this storage to +each container instance. [Read more about the default AMI][8]. Due to these limitations, we +recommend [creating a custom AMI][7]. Because AWS Batch has the same requirements for your +AMI as Amazon ECS, use the default Amazon ECS-optimized Amazon Linux AMI as a base and change it +to better suit your tasks. 
+ +### Steps +* [Create a Compute Environment][3] +* (_Optional_) [Create a custom AMI][7] +* [Create a Job Queue][4] +* [Create an EC2ContainerTaskRole with policies for managing access to S3 and DynamoDB][5] +* [Create a Job Definition][6] + +For more information check out AWS Batch's [getting started guide][2]. + +### Quickstart + +``` +$ funnel aws batch create-all-resources --region us-west-2 + +``` + +This command will create a compute environment, job queue, IAM role and job definition. + +## Configuring the Funnel Server + +Below is an example configuration. Note that the `Key` +and `Secret` fields are left blank in the configuration of the components. This is because +Funnel will, by default, try to automatically load credentials from the environment. +Alternatively, you may explicitly set the credentials in the config. + +```YAML +Database: "dynamodb" +Compute: "aws-batch" +EventWriters: + - "log" + +Dynamodb: + TableBasename: "funnel" + Region: "us-west-2" + Key: "" + Secret: "" + +Batch: + JobDefinition: "funnel-job-def" + JobQueue: "funnel-job-queue" + Region: "us-west-2" + Key: "" + Secret: "" + +AmazonS3: + Key: "" + Secret: "" +``` + +### Start the server + +```sh +funnel server run --config /path/to/config.yaml +``` + +### Known issues + +The `Task.Resources.DiskGb` field does not have any effect. See [issue 317](https://github.com/ohsu-comp-bio/funnel/issues/317). + +[0]: http://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Introduction.html +[1]: http://docs.aws.amazon.com/batch/latest/userguide/what-is-batch.html +[2]: http://docs.aws.amazon.com/batch/latest/userguide/Batch_GetStarted.html +[3]: https://us-west-2.console.aws.amazon.com/batch/home?region=us-west-2#/compute-environments/new +[4]: https://us-west-2.console.aws.amazon.com/batch/home?region=us-west-2#/queues/new +[5]: https://console.aws.amazon.com/iam/home?region=us-west-2#/roles$new?step=permissions&selectedService=EC2ContainerService&selectedUseCase=EC2ContainerTaskRole +[6]: https://us-west-2.console.aws.amazon.com/batch/home?region=us-west-2#/job-definitions/new +[7]: http://docs.aws.amazon.com/batch/latest/userguide/create-batch-ami.html +[8]: http://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-optimized_AMI.html diff --git a/docs/tools/funnel/docs/compute/deployment.md b/docs/tools/funnel/docs/compute/deployment.md new file mode 100644 index 0000000..2ea266f --- /dev/null +++ b/docs/tools/funnel/docs/compute/deployment.md @@ -0,0 +1,79 @@ +--- +title: Deploying a cluster +menu: + main: + parent: Compute + weight: -50 +--- + +# Deploying a cluster + +This guide describes the basics of starting a cluster of Funnel nodes. +This guide is a work in progress. + +A node is a service +which runs on each machine in a cluster. The node connects to the Funnel server and reports +available resources. The Funnel scheduler process assigns tasks to nodes. When a task is +assigned, a node will start a worker process. There is one worker process per task. + +Nodes aren't always required. In some cases it makes sense to rely on an existing, +external system for scheduling tasks and managing cluster resources, such as AWS Batch, +HTCondor, Slurm, Grid Engine, etc. Funnel provides integration with +these services without using nodes or the scheduler. + +### Usage + +Nodes are available via the `funnel node` command. To start a node, run +```sh +funnel node run --config node.config.yml +``` + +To activate the Funnel scheduler, use the `manual` backend in the config. 
+ +The available scheduler and node config: +```yaml +# Activate the Funnel scheduler. +Compute: manual + +Scheduler: + # How often to run a scheduler iteration. + ScheduleRate: 1s + + # How many tasks to schedule in one iteration. + ScheduleChunk: 10 + + # How long to wait between updates before marking a node dead. + NodePingTimeout: 1m + + # How long to wait for a node to start, before marking the node dead. + NodeInitTimeout: 5m + + +Node: + # If empty, a node ID will be automatically generated using the hostname. + ID: "" + + # If the node has been idle for longer than the timeout, it will shut down. + # -1 means there is no timeout. 0 means timeout immediately after the first task. + Timeout: -1s + + # A Node will automatically try to detect what resources are available to it. + # Defining Resources in the Node configuration overrides this behavior. + Resources: + # CPUs available. + # Cpus: 0 + # RAM available, in GB. + # RamGb: 0.0 + # Disk space available, in GB. + # DiskGb: 0.0 + + # For low-level tuning. + # How often to sync with the Funnel server. + UpdateRate: 5s + +Logger: + # Logging levels: debug, info, error + Level: info + # Write logs to this path. If empty, logs are written to stderr. + OutputFile: "" +``` diff --git a/docs/tools/funnel/docs/compute/grid-engine.md b/docs/tools/funnel/docs/compute/grid-engine.md new file mode 100644 index 0000000..98139d1 --- /dev/null +++ b/docs/tools/funnel/docs/compute/grid-engine.md @@ -0,0 +1,36 @@ +--- +title: Grid Engine +menu: + main: + parent: Compute + weight: 20 +--- +# Grid Engine + +Funnel can be configured to submit workers to [Grid Engine][ge] by making calls +to `qsub`. + +The Funnel server needs to run on a submission node. +Configure Funnel to use Grid Engine by including the following config: + +It is recommended to update the submit file template so that the +`funnel worker run` command takes a config file as an argument +(e.g. `funnel worker run --config /opt/funnel_config.yml --taskID {{.TaskId}}`) + +```YAML +{{< gridengine-template >}} +``` +The following variables are available for use in the template: + +| Variable | Description | +|:------------|:-------------| +|TaskId | funnel task id | +|WorkDir | funnel working directory | +|Cpus | requested cpu cores | +|RamGb | requested ram | +|DiskGb | requested free disk space | +|Zone | requested zone (could be used for queue name) | + +See https://golang.org/pkg/text/template for information on creating templates. + +[ge]: http://gridscheduler.sourceforge.net/documentation.html diff --git a/docs/tools/funnel/docs/compute/htcondor.md b/docs/tools/funnel/docs/compute/htcondor.md new file mode 100644 index 0000000..bd5db3d --- /dev/null +++ b/docs/tools/funnel/docs/compute/htcondor.md @@ -0,0 +1,36 @@ +--- +title: HTCondor +menu: + main: + parent: Compute + weight: 20 +--- +# HTCondor + +Funnel can be configured to submit workers to [HTCondor][htcondor] by making +calls to `condor_submit`. + +The Funnel server needs to run on a submission node. +Configure Funnel to use HTCondor by including the following config: + +It is recommended to update the submit file template so that the +`funnel worker run` command takes a config file as an argument +(e.g. 
`funnel worker run --config /opt/funnel_config.yml --taskID {{.TaskId}}`) + +```YAML +{{< htcondor-template >}} +``` +The following variables are available for use in the template: + +| Variable | Description | +|:------------|:-------------| +|TaskId | funnel task id | +|WorkDir | funnel working directory | +|Cpus | requested cpu cores | +|RamGb | requested ram | +|DiskGb | requested free disk space | +|Zone | requested zone (could be used for queue name) | + +See https://golang.org/pkg/text/template for information on creating templates. + +[htcondor]: https://research.cs.wisc.edu/htcondor/ diff --git a/docs/tools/funnel/docs/compute/kubernetes.md b/docs/tools/funnel/docs/compute/kubernetes.md new file mode 100644 index 0000000..b484fda --- /dev/null +++ b/docs/tools/funnel/docs/compute/kubernetes.md @@ -0,0 +1,119 @@ +--- +title: Kubernetes +menu: + main: + parent: Compute + weight: 20 +--- + +> Funnel on Kubernetes is in active development and may involve frequent updates 🚧 + +# Quick Start + +## 1. Deploying with Helm ⚡️ + +```sh +helm repo add ohsu https://ohsu-comp-bio.github.io/helm-charts +helm repo update +helm upgrade --install ohsu funnel +``` + +{{< details title="(Alternative) Deploying with `kubectl` ⚙️" >}} + +### 1. Create a Service: + +Deploy it: + +```sh +kubectl apply -f funnel-service.yml +``` + +### 2. Create Funnel config files + +> *[funnel-server.yaml](https://github.com/ohsu-comp-bio/funnel/blob/develop/deployments/kubernetes/funnel-server.yaml)* + +> *[funnel-worker.yaml](https://github.com/ohsu-comp-bio/funnel/blob/develop/deployments/kubernetes/funnel-worker.yaml)* + +Get the clusterIP: + +```sh +export HOSTNAME=$(kubectl get services funnel --output=jsonpath='{.spec.clusterIP}') + +sed -i "s|\${HOSTNAME}|${HOSTNAME}|g" funnel-worker.yaml +``` + +### 3. Create a ConfigMap + +```sh +kubectl create configmap funnel-config --from-file=funnel-server.yaml --from-file=funnel-worker.yaml +``` + +### 4. Create a Service Account for Funnel + +Define a Role and RoleBinding: + +> *[role.yml](https://github.com/ohsu-comp-bio/funnel/blob/develop/deployments/kubernetes/role.yml)* + +> *[role_binding.yml](https://github.com/ohsu-comp-bio/funnel/blob/develop/deployments/kubernetes/role_binding.yml)* + +```sh +kubectl create serviceaccount funnel-sa --namespace default +kubectl apply -f role.yml +kubectl apply -f role_binding.yml +``` + +### 5. Create a Persistent Volume Claim + +> *[funnel-storage-pvc.yml](https://github.com/ohsu-comp-bio/funnel/blob/develop/deployments/kubernetes/funnel-storage-pvc.yml)* + +```sh +kubectl apply -f funnel-storage-pvc.yml +``` + +### 6. Create a Deployment + +> *[funnel-deployment.yml](https://github.com/ohsu-comp-bio/funnel/blob/develop/deployments/kubernetes/funnel-deployment.yml)* + +```sh +kubectl apply -f funnel-deployment.yml +``` + +{{< /details >}} + +# 2. Proxy the Service for local testing + +```sh +kubectl port-forward service/funnel 8000:8000 +``` + +Now the funnel server can be accessed as if it were running locally. 
This can be verified by listing all tasks, which will return an empty JSON list: + +```sh +funnel task list +# {} +``` + +A task can then be submitted following the [standard workflow](../../tasks): + +```sh +funnel examples hello-world > hello-world.json + +funnel task create hello-world.json +# +``` + +# Storage Architecture + + + + + +# Additional Resources 📚 + +- [Helm Repo](https://ohsu-comp-bio.github.io/helm-charts) + +- [Helm Repo Source](https://github.com/ohsu-comp-bio/helm-charts) + +- [Helm Charts](https://github.com/ohsu-comp-bio/funnel/tree/develop/deployments/kubernetes/helm) + +- [The Chart Best Practices Guide](https://helm.sh/docs/chart_best_practices/) diff --git a/docs/tools/funnel/docs/compute/pbs-torque.md b/docs/tools/funnel/docs/compute/pbs-torque.md new file mode 100644 index 0000000..f55e621 --- /dev/null +++ b/docs/tools/funnel/docs/compute/pbs-torque.md @@ -0,0 +1,36 @@ +--- +title: PBS/Torque +menu: + main: + parent: Compute + weight: 20 +--- +# PBS/Torque + +Funnel can be configured to submit workers to [PBS/Torque][pbs] by making calls +to `qsub`. + +The Funnel server needs to run on a submission node. +Configure Funnel to use PBS by including the following config: + +It is recommended to update the submit file template so that the +`funnel worker run` command takes a config file as an argument +(e.g. `funnel worker run --config /opt/funnel_config.yml --taskID {{.TaskId}}`) + +```YAML +{{< pbs-template >}} +``` +The following variables are available for use in the template: + +| Variable | Description | +|:------------|:-------------| +|TaskId | funnel task id | +|WorkDir | funnel working directory | +|Cpus | requested cpu cores | +|RamGb | requested ram | +|DiskGb | requested free disk space | +|Zone | requested zone (could be used for queue name) | + +See https://golang.org/pkg/text/template for information on creating templates. + +[pbs]: http://www.adaptivecomputing.com/products/open-source/torque/ diff --git a/docs/tools/funnel/docs/compute/slurm.md b/docs/tools/funnel/docs/compute/slurm.md new file mode 100644 index 0000000..fc7b8b6 --- /dev/null +++ b/docs/tools/funnel/docs/compute/slurm.md @@ -0,0 +1,36 @@ +--- +title: Slurm +menu: + main: + parent: Compute + weight: 20 +--- +# Slurm + +Funnel can be configured to submit workers to [Slurm][slurm] by making calls +to `sbatch`. + +The Funnel server needs to run on a submission node. +Configure Funnel to use Slurm by including the following config: + +It is recommended to update the submit file template so that the +`funnel worker run` command takes a config file as an argument +(e.g. `funnel worker run --config /opt/funnel_config.yml --taskID {{.TaskId}}`) + +```YAML +{{< slurm-template >}} +``` +The following variables are available for use in the template: + +| Variable | Description | +|:------------|:-------------| +|TaskId | funnel task id | +|WorkDir | funnel working directory | +|Cpus | requested cpu cores | +|RamGb | requested ram | +|DiskGb | requested free disk space | +|Zone | requested zone (could be used for queue name) | + +See https://golang.org/pkg/text/template for information on creating templates. 
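+
+For illustration, a customized submit template might look like the sketch below. This is an assumption for clarity only — the shipped default is whatever the `{{< slurm-template >}}` shortcode above renders — so treat the specific `#SBATCH` directives and the `/opt/funnel_config.yml` path as placeholders:
+
+```sh
+#!/bin/bash
+# Illustrative sketch of a customized Slurm submit template (not the shipped default).
+#SBATCH --job-name {{.TaskId}}
+#SBATCH --chdir {{.WorkDir}}
+#SBATCH --cpus-per-task {{.Cpus}}
+# The Zone variable could map to a partition/queue name, e.g.:
+# #SBATCH --partition {{.Zone}}
+
+# Pass a config file to the worker, as recommended above.
+funnel worker run --config /opt/funnel_config.yml --taskID {{.TaskId}}
+```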
+ +[slurm]: https://slurm.schedmd.com/ diff --git a/docs/tools/funnel/docs/databases.md b/docs/tools/funnel/docs/databases.md new file mode 100644 index 0000000..5eeb638 --- /dev/null +++ b/docs/tools/funnel/docs/databases.md @@ -0,0 +1,8 @@ +--- +title: Databases +menu: + main: + weight: 5 +--- + +# Databases diff --git a/docs/tools/funnel/docs/databases/boltdb.md b/docs/tools/funnel/docs/databases/boltdb.md new file mode 100644 index 0000000..ea5885e --- /dev/null +++ b/docs/tools/funnel/docs/databases/boltdb.md @@ -0,0 +1,24 @@ +--- +title: Embedded +menu: + main: + parent: Databases + weight: -10 +--- + +# Embedded + +By default, Funnel uses an embedded database named [BoltDB][bolt] to store task +and scheduler data. This is great for development and a simple server without +external dependencies, but it doesn't scale well to larger clusters. + +Available config: +```yaml +Database: boltdb + +BoltDB: + # Path to database file + Path: ./funnel-work-dir/funnel.db +``` + +[bolt]: https://github.com/boltdb/bolt diff --git a/docs/tools/funnel/docs/databases/datastore.md b/docs/tools/funnel/docs/databases/datastore.md new file mode 100644 index 0000000..ea31d8c --- /dev/null +++ b/docs/tools/funnel/docs/databases/datastore.md @@ -0,0 +1,94 @@ +--- +title: Datastore +menu: + main: + parent: Databases +--- + +# Google Cloud Datastore + +Funnel supports storing tasks (but not scheduler data) in Google Cloud Datastore. + +This implementation currently doesn't work with Appengine, since Appengine places +special requirements on the context of requests and requires a separate library. + +Two entity types are used, "Task" and "TaskPart" (for larger pieces of task content, +such as stdout/err logs). + +Funnel will, by default, try to automatically load credentials from the +environment. Alternatively, you may explicitly set the credentials in the config. +You can read more about providing the credentials +[here](https://cloud.google.com/docs/authentication/application-default-credentials). + +Config: +```yaml +Database: datastore + +Datastore: + Project: "" + # Path to account credentials file. + # Optional. If possible, credentials will be automatically discovered + # from the environment. + CredentialsFile: "" +``` + +Please also import some [composite +indexes](https://cloud.google.com/datastore/docs/concepts/indexes?hl=en) +to support the task-list queries. 
+This is typically done through command-line by referencing an **index.yaml** +file (do not change the filename) with the following content: + +```shell +gcloud datastore indexes create path/to/index.yaml --database='funnel' +``` + +```yaml +indexes: + +- kind: Task + properties: + - name: Owner + - name: State + - name: TagStrings + - name: CreationTime + direction: desc + +- kind: Task + properties: + - name: Owner + - name: State + - name: CreationTime + direction: desc + +- kind: Task + properties: + - name: Owner + - name: TagStrings + - name: CreationTime + direction: desc + +- kind: Task + properties: + - name: Owner + - name: CreationTime + direction: desc + +- kind: Task + properties: + - name: State + - name: TagStrings + - name: CreationTime + direction: desc + +- kind: Task + properties: + - name: State + - name: CreationTime + direction: desc + +- kind: Task + properties: + - name: TagStrings + - name: CreationTime + direction: desc +``` \ No newline at end of file diff --git a/docs/tools/funnel/docs/databases/dynamodb.md b/docs/tools/funnel/docs/databases/dynamodb.md new file mode 100644 index 0000000..3e536c2 --- /dev/null +++ b/docs/tools/funnel/docs/databases/dynamodb.md @@ -0,0 +1,30 @@ +--- +title: DynamoDB +menu: + main: + parent: Databases +--- + +# DynamoDB + +Funnel supports storing task data in DynamoDB. Storing scheduler data is not supported currently, so using the node scheduler with DynamoDB won't work. Using AWS Batch for compute scheduling may be a better option. +Funnel will, by default, try to automatically load credentials from the environment. Alternatively, you may explicitly set the credentials in the config. + +Available Config: +```yaml +Database: dynamodb + +DynamoDB: + # Basename to use for dynamodb tables + TableBasename: "funnel" + # AWS region + Region: "us-west-2" + # AWS Access key ID + Key: "" + # AWS Secret Access Key + Secret: "" +``` + +### Known issues + +Dynamo does not store scheduler data. See [issue 340](https://github.com/ohsu-comp-bio/funnel/issues/340). diff --git a/docs/tools/funnel/docs/databases/elasticsearch.md b/docs/tools/funnel/docs/databases/elasticsearch.md new file mode 100644 index 0000000..e397348 --- /dev/null +++ b/docs/tools/funnel/docs/databases/elasticsearch.md @@ -0,0 +1,30 @@ +--- +title: Elasticsearch +menu: + main: + parent: Databases +--- + +# Elasticsearch + +Funnel supports storing tasks and scheduler data in Elasticsearch (v8). + +Config: +```yaml +Database: elastic + +Elastic: + # Prefix to use for indexes + IndexPrefix: "funnel" + URL: http://localhost:9200 + # Optional. Username for HTTP Basic Authentication. + Username: + # Optional. Password for HTTP Basic Authentication. + Password: + # Optional. Endpoint for the Elastic Service (https://elastic.co/cloud). + CloudID: + # Optional. Base64-encoded token for authorization; if set, overrides username/password and service token. + APIKey: + # Optional. Service token for authorization; if set, overrides username/password. + ServiceToken: +``` diff --git a/docs/tools/funnel/docs/databases/mongodb.md b/docs/tools/funnel/docs/databases/mongodb.md new file mode 100644 index 0000000..4a6e8ab --- /dev/null +++ b/docs/tools/funnel/docs/databases/mongodb.md @@ -0,0 +1,24 @@ +--- +title: MongoDB +menu: + main: + parent: Databases +--- + +# MongoDB + +Funnel supports storing tasks and scheduler data in MongoDB. + +Config: +```yaml +Database: mongodb + +MongoDB: + # Addresses for the seed servers. 
+ Addrs: + - "localhost" + # Database name used within MongoDB to store funnel data. + Database: "funnel" + Username: "" + Password: "" +``` diff --git a/docs/tools/funnel/docs/development.md b/docs/tools/funnel/docs/development.md new file mode 100644 index 0000000..1412f0d --- /dev/null +++ b/docs/tools/funnel/docs/development.md @@ -0,0 +1,8 @@ +--- +title: Development +menu: + main: + weight: 30 +--- + +# Development diff --git a/docs/tools/funnel/docs/development/developers.md b/docs/tools/funnel/docs/development/developers.md new file mode 100644 index 0000000..2d19f0a --- /dev/null +++ b/docs/tools/funnel/docs/development/developers.md @@ -0,0 +1,97 @@ +--- +title: Funnel Developers + +menu: + main: + parent: Development + weight: 30 +--- + +# Developers + +This page contains a rough collection of notes for people wanting to build Funnel from source and/or edit the code. + +### Building the Funnel source + +1. Install [Go 1.21+][go]. Check the version with `go version`. +2. Ensure GOPATH is set. See [the docs][gopath] for help. Also, you probably want to add `$GOPATH/bin` to your `PATH`. +3. Clone funnel and build + + ```shell + $ git clone https://github.com/ohsu-comp-bio/funnel.git + $ cd funnel + $ make + ``` + +4. Funnel is now downloaded and installed. Try `funnel version`. +5. You can edit the code and run `make install` to recompile. + +### Developer Tools + +A Funnel development environment includes: + +- [Go 1.21+][go] for the majority of the code. +- [Task Execution Schemas][tes] for task APIs. +- [Protobuf][protobuf] + [gRPC][grpc] for RPC communication. +- [gRPC Gateway][gateway] for HTTP communication. +- [Angular][angular] and [SASS][sass] for the web dashboard. +- [GNU Make][make] for development tasks. +- [Docker][docker] for executing task containers (tested with v1.12, v1.13). +- [dep][dep] for Go dependency vendoring. +- [Make][make] for development/build commands. +- [NodeJS][node] and [NPM][npm] for web dashboard development. + +### Makefile + +Most development tasks are run through `make` commands, including build, release, testing, website docs, lint, tidy, webdash dev, and more. See the [Makefile](https://github.com/ohsu-comp-bio/funnel/blob/master/Makefile) for an up-to-date list of commands. + +### Go Tests + +Run all tests: `make test` +Run the worker tests: `go test ./worker/...` +Run the worker tests with "Cancel" in the name: `go test ./worker -run Cancel` + +You get the idea. See the `go test` docs for more. + +### Mocking + +The [testify][testify] and [mockery][mockery] tools are used to generate and use +mock interfaces in test code, for example, to mock the Google Cloud APIs. + +[go]: https://golang.org +[angular]: https://angularjs.org/ +[protobuf]: https://github.com/google/protobuf +[grpc]: http://www.grpc.io/ +[sass]: http://sass-lang.com/ +[make]: https://www.gnu.org/software/make/ +[docker]: https://docker.io +[python]: https://www.python.org/ +[dep]: https://golang.github.io/dep/ +[node]: https://nodejs.org +[npm]: https://www.npmjs.com/ +[gateway]: https://github.com/grpc-ecosystem/grpc-gateway +[tes]: https://github.com/ga4gh/task-execution-schemas +[testify]: https://github.com/stretchr/testify +[mockery]: https://github.com/vektra/mockery +[gopath]: https://golang.org/doc/code.html#GOPATH + +### Making a release + +- Update Makefile, edit `FUNNEL_VERSION` and `LAST_PR_NUMBER` + - `LAST_PR_NUMBER` can be found by looking at the previous release notes + from the previous release. 
+- Run `make website`, which updates the download links and other content. + - Check the website locally by running `make website-dev` +- Commit these changes. + - Because goreleaser requires a clean working tree in git + - This is a special case where it's easiest to commit to master. +- Create a git tag: `git tag X.Y.Z` +- Run `make release` + - This will build cross-platform binaries, build release notes, + and draft an unpublished GitHub release. + - Check the built artifacts by downloading the tarballs from the GitHub draft release + and running `funnel version`. +- `git push origin master` to push your website and release changes. +- A tagged docker image for the release will be built automatically on [dockerhub](https://hub.docker.com/repository/docker/quay.io/ohsu-comp-bio/funnel). +- Publish the draft release on GitHub. +- Copy `build/release/funnel.rb` to the `ohsu-comp-bio/homebrew-formula/Formula/funnel.rb` Homebrew formula repo, and push those changes to master. diff --git a/docs/tools/funnel/docs/events.md b/docs/tools/funnel/docs/events.md new file mode 100644 index 0000000..87941ef --- /dev/null +++ b/docs/tools/funnel/docs/events.md @@ -0,0 +1,7 @@ +--- +title: Events +menu: + main: + weight: 5 +--- +# Events diff --git a/docs/tools/funnel/docs/events/kafka.md b/docs/tools/funnel/docs/events/kafka.md new file mode 100644 index 0000000..242ce93 --- /dev/null +++ b/docs/tools/funnel/docs/events/kafka.md @@ -0,0 +1,22 @@ +--- +title: Kafka +menu: + main: + parent: Events +--- + +# Kafka + +Funnel supports writing task events to a Kafka topic. To use this, add an event +writer to the config: + +```yaml +EventWriters: + - kafka + - log + +Kafka: + Servers: + - localhost:9092 + Topic: funnel-events +``` diff --git a/docs/tools/funnel/docs/integrations/nextflow.md b/docs/tools/funnel/docs/integrations/nextflow.md new file mode 100644 index 0000000..3090a94 --- /dev/null +++ b/docs/tools/funnel/docs/integrations/nextflow.md @@ -0,0 +1,100 @@ +--- +title: Nextflow +menu: + main: + parent: Integrations +--- + +> ⚠️ Nextflow support is currently in development and requires a few additional steps to run which are included below. + +# Nextflow + +[Nextflow](https://nextflow.io/) is a workflow engine with a [rich ecosystem]() of pipelines centered around biological analysis. + +> Nextflow enables scalable and reproducible scientific workflows using software containers. It allows the adaptation of pipelines written in the most common scripting languages. + +> Its fluent DSL simplifies the implementation and the deployment of complex parallel and reactive workflows on clouds and clusters. + +Since Nextflow [includes support](https://www.nextflow.io/docs/latest/executor.html#ga4gh-tes) for the TES API, it can be used in conjunction with Funnel to run tasks or to interact with a common TES endpoint. + +## Getting Started + +To set up Nextflow to use Funnel as the TES executor, run the following steps: + +### 1. Install Nextflow + +*Adapted from the [Nextflow Documentation](https://nextflow.io/docs/latest/install.html)* + +#### a. Install Nextflow: + +```sh +curl -s https://get.nextflow.io | bash +``` + +This will create the nextflow executable in the current directory. + +#### b. Make Nextflow executable: + +```sh +chmod +x nextflow +``` + +#### c. Move Nextflow into an executable path: + +```sh +sudo mv nextflow /usr/local/bin +``` + +#### d. Confirm that Nextflow is installed correctly: + +```sh +nextflow info +``` + +### 2. 
Update Nextflow Config + +Add the following to your `nextflow.config` in order to use the GA4GH TES plugin: + +```yaml +cat <> nextflow.config +plugins { + id 'nf-ga4gh' +} + +process.executor = 'tes' +tes.endpoint = 'http://localhost:8000' # <--- Funnel's default address +EOF +``` + +### 3. Start the Funnel Server + +Start the Funnel server: + +```sh +funnel server run +``` + +### 4. Run Nextflow + +In another window, run the workflow: + +```sh +nextflow run main.nf -c nextflow.config +``` + +## Additional Resources + +- [Nextflow Homepage](https://nextflow.io/) + +- [Nextflow Documentation](https://www.nextflow.io/docs) + +- [Nextflow's TES Support](https://www.nextflow.io/docs/latest/executor.html#ga4gh-tes) + +- [nf-core](https://nf-co.re/) + > A community effort to collect a curated set of analysis pipelines built using Nextflow. + +- [nf-canary](https://github.com/seqeralabs/nf-canary) + > A minimal Nextflow workflow for testing infrastructure. + +- [Nextflow Patterns](https://nextflow-io.github.io/patterns/) + > A curated collection of Nextflow implementation patterns diff --git a/docs/tools/funnel/docs/integrations/py-tes.md b/docs/tools/funnel/docs/integrations/py-tes.md new file mode 100644 index 0000000..7b12061 --- /dev/null +++ b/docs/tools/funnel/docs/integrations/py-tes.md @@ -0,0 +1,50 @@ +--- +title: py-tes +menu: + main: + parent: Integrations +--- + +> ⚠️ py-tes support is in active development and may be subject to change. + +# py-tes + +[py-tes](https://github.com/ohsu-comp-bio/py-tes) is a library for interacting with servers implementing the [GA4GH Task Execution Schema](https://github.com/ga4gh/task-execution-schemas). + +## Getting Started + +### Install + +Available on [PyPI](https://pypi.org/project/py-tes/). + +```sh +pip install py-tes +``` + +### Example Python Script + +```py +import tes + +task = tes.Task( + executors=[ + tes.Executor( + image="alpine", + command=["echo", "hello"] + ) + ] +) + +cli = tes.HTTPClient("http://funnel.example.com", timeout=5) +task_id = cli.create_task(task) +res = cli.get_task(task_id) +cli.cancel_task(task_id) +``` + +## Additional Resources + +- [py-tes Homepage](https://github.com/ohsu-comp-bio/py-tes) + +- [py-tes Documentation](https://ohsu-comp-bio.github.io/py-tes/) + +- [py-tes on PyPi](https://pypi.org/project/py-tes/) diff --git a/docs/tools/funnel/docs/metrics.md b/docs/tools/funnel/docs/metrics.md new file mode 100644 index 0000000..1077112 --- /dev/null +++ b/docs/tools/funnel/docs/metrics.md @@ -0,0 +1,8 @@ +--- +title: Metrics +menu: + main: + identifier: Metrics + weight: 6 +--- +# Metrics diff --git a/docs/tools/funnel/docs/metrics/prometheus.md b/docs/tools/funnel/docs/metrics/prometheus.md new file mode 100644 index 0000000..1b3495b --- /dev/null +++ b/docs/tools/funnel/docs/metrics/prometheus.md @@ -0,0 +1,36 @@ +--- +title: Prometheus +menu: + main: + parent: Metrics +--- + +# Prometheus + +[Prometheus][prom] is a monitoring and metrics collection service. It pulls metrics +from various "exporters", collects them in a time-series database, provides +a query langauge for access that data, and integrates closely with tools +such as [Grafana][graf] for visualization and dashboard building. + +Funnel exports these metrics: + +- `funnel_tasks_state_count`: the number of tasks + in each state (queued, running, etc). +- `funnel_nodes_state_count`: the number of nodes + in each state (alive, dead, draining, etc). +- `funnel_nodes_total_cpus`: the total number + of CPUs available by all nodes. 
+- `funnel_nodes_total_ram_bytes`: the total number + of bytes of RAM available by all nodes. +- `funnel_nodes_total_disk_bytes`: the total number + of bytes of disk space available by all nodes. +- `funnel_nodes_available_cpus`: the available number + of CPUs available by all nodes. +- `funnel_nodes_available_ram_bytes`: the available number + of bytes of RAM available by all nodes. +- `funnel_nodes_available_disk_bytes`: the available number + of bytes of disk space available by all nodes. + +[prom]: https://prometheus.io/ +[gauge]: https://prometheus.io/docs/concepts/metric_types/#gauge +[graf]: https://grafana.com/ diff --git a/docs/tools/funnel/docs/security.md b/docs/tools/funnel/docs/security.md new file mode 100644 index 0000000..c3dba45 --- /dev/null +++ b/docs/tools/funnel/docs/security.md @@ -0,0 +1,8 @@ +--- +title: Security +menu: + main: + weight: 10 +--- + +# Security diff --git a/docs/tools/funnel/docs/security/advanced.md b/docs/tools/funnel/docs/security/advanced.md new file mode 100644 index 0000000..3864e34 --- /dev/null +++ b/docs/tools/funnel/docs/security/advanced.md @@ -0,0 +1,29 @@ +--- +title: Advanced Auth +menu: + main: + parent: Security + weight: 10 +--- + +# Overview 🔐 + +Thanks to our collaborators at CTDS — Funnel is currently adding support for "Per-User/Per-Bucket" credentials to allow Users to access S3 Buckets without having to store their credentials in the Funnel Server. + +The high level overview of this feature will be such Funnel will be able to speak with a custom credential "Wrapper Script" that will: + +- Take the User Credentials +- Create an S3 Bucket +- Generate a Key (optionally for use in Nextflow Config) +- Send the Key to Funnel + +In this way this Wrapper can manage the bucket and the keys (the Wrapper would be the middleware between the User and Funnel). + +Stay tuned for this feature's development! This feature is being tracked with the following: + +- GitHub Branch: https://github.com/ohsu-comp-bio/funnel/tree/feature/credentials +- Pull Request: https://github.com/ohsu-comp-bio/funnel/pull/1098 + +# Credits 🙌 + +This feature and its development would not be possible without our continuing collaboration with [Pauline Ribeyre](https://github.com/paulineribeyre), [Jawad Qureshi](https://github.com/jawadqur), [Michael Fitzsimons](https://www.linkedin.com/in/michael-fitzsimons-ab8a6111), and the entire [CTDS](https://ctds.uchicago.edu) team at the [University of Chicago](https://www.uchicago.edu/)! diff --git a/docs/tools/funnel/docs/security/basic.md b/docs/tools/funnel/docs/security/basic.md new file mode 100644 index 0000000..0b19e07 --- /dev/null +++ b/docs/tools/funnel/docs/security/basic.md @@ -0,0 +1,59 @@ +--- +title: Basic Auth +menu: + main: + parent: Security + weight: 10 +--- +# Basic Auth + +By default, a Funnel server allows open access to its API endpoints, but it +can be configured to require basic password authentication. 
To enable this, +include users and passwords in your config file: + +```yaml +Server: + BasicAuth: + - User: admin + Password: someReallyComplexSecret + Admin: true + - User: funnel + Password: abc123 + + TaskAccess: OwnerOrAdmin +``` + +The `TaskAccess` property configures the visibility and access-mode for tasks: + +* `All` (default) - all tasks are visible to everyone +* `Owner` - tasks are visible to the users who created them +* `OwnerOrAdmin` - extends `Owner` by allowing Admin-users (`Admin: true`) + access everything + +As new tasks are created, the username behind the request is recorded as the +owner of the task. Depending on the `TaskAccess` property, if owner-based +acces-mode is enabled, the owner of the task is compared to username of current +request to decide if the user may see and interact with the task. + +If you are using BoltDB or Badger, the Funnel worker communicates to the server via gRPC +so you will also need to configure the RPC client. + +```yaml +RPCClient: + User: funnel + Password: abc123 +``` + +Make sure to properly protect the configuration file so that it's not readable +by everyone: + +```bash +$ chmod 600 funnel.config.yml +``` + +To use the password, set the `FUNNEL_SERVER_USER` and `FUNNEL_SERVER_PASSWORD` environment variables: +```bash +$ export FUNNEL_SERVER_USER=funnel +$ export FUNNEL_SERVER_PASSWORD=abc123 +$ funnel task list +``` diff --git a/docs/tools/funnel/docs/security/oauth2.md b/docs/tools/funnel/docs/security/oauth2.md new file mode 100644 index 0000000..4b4232d --- /dev/null +++ b/docs/tools/funnel/docs/security/oauth2.md @@ -0,0 +1,74 @@ +--- +title: OAuth2 +menu: + main: + parent: Security + weight: 10 +--- +# OAuth2 + +By default, a Funnel server allows open access to its API endpoints, but in +addition to Basic authentication it can also be configured to require a valid +JWT in the request. + +Funnel itself does not redirect users to perform the login. +It just validates that the presented token is issued by a trusted service +(specified in the YAML configuration file) and the token has not expired. +In addition, if the OIDC provides a token introspection endpoint (in its +configuration JSON), Funnel server also calls that endpoint to make sure the +token is still active (i.e., no token invalidation before expiring). + +Optionally, Funnel can also validate the scope and audience claims to contain +specific values. + +To enable JWT authentication, specify `OidcAuth` section in your config file: + +```yaml +Server: + OidcAuth: + # URL of the OIDC service configuration: + ServiceConfigURL: "https://my.oidc.service/.well-known/openid-configuration" + + # Client ID and secret are sent with the token introspection request + # (Basic authentication): + ClientId: your-client-id + ClientSecret: your-client-secret + + # Optional: if specified, this scope value must be in the token: + RequireScope: funnel-id + + # Optional: if specified, this audience value must be in the token: + RequireAudience: tes-api + + # The URL where OIDC should redirect after login (keep the path '/login') + RedirectURL: "http://localhost:8000/login" + + # List of OIDC subjects promoted to Admin status. 
+ Admins: + - user.one@example.org + - user.two@example.org + + TaskAccess: OwnerOrAdmin +``` + +The `TaskAccess` property configures the visibility and access-mode for tasks: + +* `All` (default) - all tasks are visible to everyone +* `Owner` - tasks are visible to the users who created them +* `OwnerOrAdmin` - extends `Owner` by allowing Admin-users (defined under + `Admins`) access everything + +As new tasks are created, the username behind the request is recorded as the +owner of the task. Depending on the `TaskAccess` property, if owner-based +acces-mode is enabled, the owner of the task is compared to username of current +request to decide if the user may see and interact with the task. + +Make sure to properly protect the configuration file so that it's not readable +by everyone: + +```bash +$ chmod 600 funnel.config.yml +``` + +Note that the Funnel UI supports login through an OIDC service. However, OIDC +authentication is not supported at command-line. diff --git a/docs/tools/funnel/docs/storage.md b/docs/tools/funnel/docs/storage.md new file mode 100644 index 0000000..9297161 --- /dev/null +++ b/docs/tools/funnel/docs/storage.md @@ -0,0 +1,8 @@ +--- +title: Storage +menu: + main: + identifier: Storage + weight: -10 +--- +# Storage diff --git a/docs/tools/funnel/docs/storage/ftp.md b/docs/tools/funnel/docs/storage/ftp.md new file mode 100644 index 0000000..79b2439 --- /dev/null +++ b/docs/tools/funnel/docs/storage/ftp.md @@ -0,0 +1,38 @@ +--- +title: FTP +menu: + main: + parent: Storage +--- + +# FTP + +Funnel supports download and uploading files via FTP. + +Currently authentication credentials are take from the URL, e.g. `ftp://username:password@ftp.host.tld`. This will be improved soon to allow credentials to be added to the configuration file. + +The FTP storage client is enabled by default, but may be explicitly disabled in the +worker config: + +```yaml +FTPStorage: + Disabled: false +``` + +### Example task +```json +{ + "name": "Hello world", + "inputs": [{ + "url": "ftp://my.ftpserver.xyz/hello.txt", + "path": "/inputs/hello.txt" + }, { + "url": "ftp://user:mypassword123@my.ftpserver.xyz/hello.txt", + "path": "/inputs/hello.txt" + }], + "executors": [{ + "image": "alpine", + "command": ["cat", "/inputs/hello.txt"], + }] +} +``` diff --git a/docs/tools/funnel/docs/storage/google-storage.md b/docs/tools/funnel/docs/storage/google-storage.md new file mode 100644 index 0000000..d8fde4f --- /dev/null +++ b/docs/tools/funnel/docs/storage/google-storage.md @@ -0,0 +1,43 @@ +--- +title: Google Storage +menu: + main: + parent: Storage +--- + +# Google Storage + +Funnel supports using [Google Storage][gs] (GS) for file storage. + +The Google storage client is enabled by default, and will try to automatically +load credentials from the environment. Alternatively, you +may explicitly set the credentials in the worker config: + +```yaml +GoogleStorage: + Disabled: false + # Path to account credentials file. 
+ AccountFile: "" +``` + +### Example task +```json +{ + "name": "Hello world", + "inputs": [{ + "url": "gs://funnel-bucket/hello.txt", + "path": "/inputs/hello.txt" + }], + "outputs": [{ + "url": "gs://funnel-bucket/output.txt", + "path": "/outputs/hello-out.txt" + }], + "executors": [{ + "image": "alpine", + "command": ["cat", "/inputs/hello.txt"], + "stdout": "/outputs/hello-out.txt", + }] +} +``` + +[gs]: https://cloud.google.com/storage/ diff --git a/docs/tools/funnel/docs/storage/http.md b/docs/tools/funnel/docs/storage/http.md new file mode 100644 index 0000000..8192205 --- /dev/null +++ b/docs/tools/funnel/docs/storage/http.md @@ -0,0 +1,37 @@ +--- +title: HTTP(S) +menu: + main: + parent: Storage +--- + +# HTTP(S) + +Funnel supports downloading files from public URLs via GET requests. No authentication +mechanism is allowed. This backend can be used to fetch objects from cloud storage +providers exposed using presigned URLs. + +The HTTP storage client is enabled by default, but may be explicitly disabled in the +worker config: + +```yaml +HTTPStorage: + Disabled: false + # Timeout for http(s) GET requests. + Timeout: 30s +``` + +### Example task +```json +{ + "name": "Hello world", + "inputs": [{ + "url": "http://fakedomain.com/hello.txt", + "path": "/inputs/hello.txt" + }], + "executors": [{ + "image": "alpine", + "command": ["cat", "/inputs/hello.txt"], + }] +} +``` diff --git a/docs/tools/funnel/docs/storage/local.md b/docs/tools/funnel/docs/storage/local.md new file mode 100644 index 0000000..eb68669 --- /dev/null +++ b/docs/tools/funnel/docs/storage/local.md @@ -0,0 +1,63 @@ +--- +title: Local +menu: + main: + parent: Storage + weight: -10 +--- + +# Local + +Funnel supports using the local filesystem for file storage. + +Funnel limits which directories may be accessed, by default only allowing directories +under the current working directory of the Funnel worker. + +Config: +```yaml +LocalStorage: + # Whitelist of local directory paths which Funnel is allowed to access. + AllowedDirs: + - ./ + - /path/to/allowed/dir + - ...etc +``` + +### Example task + +Files must be absolute paths in `file:///path/to/file.txt` URL form. + +``` +{ + "name": "Hello world", + "inputs": [{ + "url": "file:///path/to/funnel-data/hello.txt", + "path": "/inputs/hello.txt" + }], + "outputs": [{ + "url": "file:///path/to/funnel-data/output.txt", + "path": "/outputs/hello-out.txt" + }], + "executors": [{ + "image": "alpine", + "command": ["cat", "/inputs/hello.txt"], + "stdout": "/outputs/hello-out.txt", + }] +} +``` + +### File hard linking behavior + +For efficiency, Funnel will attempt not to copy the input files, instead trying +create a hard link to the source file. In some cases this isn't possible. For example, +if the source file is on a network file system mount (e.g. NFS) but the Funnel worker's +working directory is on the local scratch disk, a hard link would cross a file system +boundary, which is not possible. In this case, Funnel will copy the file. + +### File ownership behavior + +One difficult area of files and Docker containers is file owner/group management. +If a Docker container runs as root, it's likely that the file will end up being owned +by root on the host system. In this case, some step (Funnel or another task) will +likely fail to access it. This is a tricky problem with no good solution yet. +See [issue 66](https://github.com/ohsu-comp-bio/funnel/issues/66). 
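+
+If you are unsure whether hard linking will apply, one quick check (a sketch assuming GNU coreutils; on macOS use `stat -f %d`) is to compare the device IDs of an input file and the worker's working directory — different IDs mean different file systems, so Funnel will copy instead of link:
+
+```sh
+# Hard links cannot cross file systems; differing device IDs force a copy.
+stat -c %d /path/to/funnel-data/hello.txt
+stat -c %d ./funnel-work-dir
+```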
diff --git a/docs/tools/funnel/docs/storage/s3.md b/docs/tools/funnel/docs/storage/s3.md new file mode 100644 index 0000000..75a0271 --- /dev/null +++ b/docs/tools/funnel/docs/storage/s3.md @@ -0,0 +1,96 @@ +--- +title: S3 +menu: + main: + parent: Storage +--- + +# S3 + +## Amazon S3 + +Funnel supports using [AWS S3](https://aws.amazon.com/s3/) for file storage. + +The Amazon S3 storage client is enabled by default, and will try to automatically +load credentials from the environment. Alternatively, you +may explicitly set the credentials in the worker config: + +```yaml +AmazonS3: + Disabled: false + # The maximum number of times that a request will be retried for failures. + MaxRetries: 10 + Key: "" + Secret: "" +``` + +The Amazon S3 storage client also supports SSE-KMS and SSE-C configurations. + +For SSE-KMS as long as your credentials can access the KMS key used for the +given bucket, no special configuration is required. However, you can specifiy a +specific KMS key if desired: + +```yaml +AmazonS3: + SSE: + KMSKey: "1a03ce70-5f03-484e-8396-0e97de661b79" +``` + +For SSE-C: + +Generate a key file: + +```sh +openssl rand -out sse-c.key 32 +``` + +Then configure the storage client to use it: + +```yaml +AmazonS3: + SSE: + CustomerKeyFile: "./sse-c.key" +``` + +Note that this file will need to be available to all Funnel workers. + +## Other S3 API Providers + +Funnel also supports using non-Amazon S3 API providers ([Ceph][ceph], +[Cleversafe][cleversafe], [Minio][minio], etc.) for file storage. + +These other S3 storage clients are NOT enabled by default. You must configure them. + +This storage client also supports the [version 4 signing process](https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-authenticating-requests.html). + +```yaml +GenericS3: + - Disabled: false + Endpoint: "" + Key: "" + Secret: "" +``` + +### Example task +```json +{ + "name": "Hello world", + "inputs": [{ + "url": "s3://funnel-bucket/hello.txt", + "path": "/inputs/hello.txt" + }], + "outputs": [{ + "url": "s3://funnel-bucket/output.txt", + "path": "/outputs/hello-out.txt" + }], + "executors": [{ + "image": "alpine", + "command": ["cat", "/inputs/hello.txt"], + "stdout": "/outputs/hello-out.txt" + }] +} +``` + +[ceph]: http://ceph.com/ +[cleversafe]: https://www.ibm.com/cloud/object-storage +[minio]: https://minio.io/ diff --git a/docs/tools/funnel/docs/storage/swift.md b/docs/tools/funnel/docs/storage/swift.md new file mode 100644 index 0000000..3de323a --- /dev/null +++ b/docs/tools/funnel/docs/storage/swift.md @@ -0,0 +1,53 @@ +--- +title: OpenStack Swift +menu: + main: + parent: Storage +--- + +# OpenStack Swift + +Funnel supports using [OpenStack Swift][swift] for file storage. + +The Swift storage client is enabled by default, and will try to automatically +load credentials from the environment. Alternatively, you +may explicitly set the credentials in the worker config: + +```yaml +Swift: + Disabled: false + UserName: "" + Password: "" + AuthURL: "" + TenantName: "" + TenantID: "" + RegionName: "" + # 500 MB + ChunkSizeBytes: 500000000 +``` + +### Example task +```json +{ + "name": "Hello world", + "inputs": [{ + "url": "swift://funnel-bucket/hello.txt", + "path": "/inputs/hello.txt" + }], + "outputs": [{ + "url": "swift://funnel-bucket/output.txt", + "path": "/outputs/hello-out.txt" + }], + "executors": [{ + "image": "alpine", + "command": ["cat", "/inputs/hello.txt"], + "stdout": "/outputs/hello-out.txt", + }] +} +``` + +### Known Issues: + +The config currently only supports OpenStack v2 auth. 
See [issue #336](https://github.com/ohsu-comp-bio/funnel/issues/336). + +[swift]: https://docs.openstack.org/swift/latest/ diff --git a/docs/tools/funnel/docs/tasks.md b/docs/tools/funnel/docs/tasks.md new file mode 100644 index 0000000..ed9d25f --- /dev/null +++ b/docs/tools/funnel/docs/tasks.md @@ -0,0 +1,494 @@ +--- +title: Tasks +menu: + main: + identifier: tasks + weight: -70 +--- + +# Tasks + +A task defines a unit of work: + +- metadata +- input files to download +- a sequence of Docker containers + commands to run, +- output files to upload +- state +- logs + +The example task below downloads a file named `hello.txt` from S3 and calls `cat hello.txt` using the [alpine][alpine] container. This task also writes the executor's stdout to a file, and uploads the stdout to s3. + +``` +{ + "name": "Hello world", + "inputs": [{ + # URL to download file from. + "url": "s3://funnel-bucket/hello.txt", + # Path to download file to. + "path": "/inputs/hello.txt" + }], + "outputs": [{ + # URL to upload file to. + "url": "s3://funnel-bucket/output.txt", + # Local path to upload file from. + "path": "/outputs/stdout" + }], + "executors": [{ + # Container image name. + "image": "alpine", + # Command to run (argv). + "command": ["cat", "/inputs/hello.txt"], + # Capture the stdout of the command to /outputs/stdout + "stdout": "/outputs/stdout" + }] +} +``` + +Tasks have multiple "executors"; containers and commands run in a sequence. +Funnel runs executors via Docker. + +Tasks also have state and logs: +``` +{ + "id": "b85khc2rl6qkqbhg8vig", + "state": "COMPLETE", + "name": "Hello world", + "inputs": [ + { + "url": "s3://funnel-bucket/hello.txt", + "path": "/inputs/hello.txt" + } + ], + "outputs": [ + { + "url": "s3://funnel-bucket/output.txt", + "path": "/outputs/stdout" + } + ], + "executors": [ + { + "image": "alpine", + "command": [ + "cat", + "/inputs/hello.txt" + ], + "stdout": "/outputs/stdout" + } + ], + "logs": [ + { + "logs": [ + { + "startTime": "2017-11-14T11:49:05.127885125-08:00", + "endTime": "2017-11-14T11:49:08.484461502-08:00", + "stdout": "Hello, Funnel!\n" + } + ], + "startTime": "2017-11-14T11:49:04.433593468-08:00", + "endTime": "2017-11-14T11:49:08.487707039-08:00" + } + ], + "creationTime": "2017-11-14T11:49:04.427163701-08:00" +} +``` + +There are logs for each task attempt and each executor. Notice that the stdout is +conveniently captured by `logs[0].logs[0].stdout`. + +### Task API + +The API lets you create, get, list, and cancel tasks. + +### Create +``` +POST /v1/tasks +{ + "name": "Hello world", + "inputs": [{ + "url": "s3://funnel-bucket/hello.txt", + "path": "/inputs/hello.txt" + }], + "outputs": [{ + "url": "s3://funnel-bucket/output.txt", + "path": "/outputs/stdout" + }], + "executors": [{ + "image": "alpine", + "command": ["cat", "/inputs/hello.txt"], + "stdout": "/outputs/stdout" + }] +} + + +# The response is a task ID: +b85khc2rl6qkqbhg8vig +``` + +### Get +``` +GET /v1/tasks/b85khc2rl6qkqbhg8vig + +{"id": "b85khc2rl6qkqbhg8vig", "state": "COMPLETE"} +``` + +By default, the minimal task view is returned which describes only the ID and state. 
+In order to get the original task with some basic logs, use the "BASIC" task view: +``` +GET /v1/tasks/b85khc2rl6qkqbhg8vig?view=BASIC +{ + "id": "b85khc2rl6qkqbhg8vig", + "state": "COMPLETE", + "name": "Hello world", + "inputs": [ + { + "url": "gs://funnel-bucket/hello.txt", + "path": "/inputs/hello.txt" + } + ], + "outputs": [ + { + "url": "s3://funnel-bucket/output.txt", + "path": "/outputs/stdout" + } + ], + "executors": [ + { + "image": "alpine", + "command": [ + "cat", + "/inputs/hello.txt" + ], + "stdout": "/outputs/stdout", + } + ], + "logs": [ + { + "logs": [ + { + "startTime": "2017-11-14T11:49:05.127885125-08:00", + "endTime": "2017-11-14T11:49:08.484461502-08:00", + } + ], + "startTime": "2017-11-14T11:49:04.433593468-08:00", + "endTime": "2017-11-14T11:49:08.487707039-08:00" + } + ], + "creationTime": "2017-11-14T11:49:04.427163701-08:00" +} +``` + +The "BASIC" doesn't include some fields such as stdout/err logs, because these fields may be potentially large. +In order to get everything, use the "FULL" view: +``` +GET /v1/tasks/b85khc2rl6qkqbhg8vig?view=FULL +{ + "id": "b85khc2rl6qkqbhg8vig", + "state": "COMPLETE", + "name": "Hello world", + "inputs": [ + { + "url": "gs://funnel-bucket/hello.txt", + "path": "/inputs/hello.txt" + } + ], + "executors": [ + { + "image": "alpine", + "command": [ + "cat", + "/inputs/hello.txt" + ], + "stdout": "/outputs/stdout", + } + ], + "logs": [ + { + "logs": [ + { + "startTime": "2017-11-14T11:49:05.127885125-08:00", + "endTime": "2017-11-14T11:49:08.484461502-08:00", + "stdout": "Hello, Funnel!\n" + } + ], + "startTime": "2017-11-14T11:49:04.433593468-08:00", + "endTime": "2017-11-14T11:49:08.487707039-08:00" + } + ], + "creationTime": "2017-11-14T11:49:04.427163701-08:00" +} +``` + +### List +``` +GET /v1/tasks +{ + "tasks": [ + { + "id": "b85l8tirl6qkqbhg8vj0", + "state": "COMPLETE" + }, + { + "id": "b85khc2rl6qkqbhg8vig", + "state": "COMPLETE" + }, + { + "id": "b85kgt2rl6qkpuptua70", + "state": "SYSTEM_ERROR" + }, + { + "id": "b857gnirl6qjfou61fh0", + "state": "SYSTEM_ERROR" + } + ] +} +``` + +List has the same task views as Get: MINIMAL, BASIC, and FULL. + +The task list is paginated: +``` +GET /v1/tasks?page_token=1h123h12j2h3k +{ + "next_page_token": "1n3n1j23k12n3k123", + "tasks": [ + { + "id": "b85l8tirl6qkqbhg8vj0", + "state": "COMPLETE" + }, + # ... more tasks here ... + ] +} +``` + +### Cancel + +Tasks cannot be modified by the user after creation, with one exception – they can be canceled. +``` +POST /v1/tasks/b85l8tirl6qkqbhg8vj0:cancel +``` + + +### Full task spec + +Here's a more detailed description of a task. +For a full, in-depth spec, read the TES standard's [task_execution.proto](https://github.com/ga4gh/task-execution-schemas/blob/master/task_execution.proto). + +``` +{ + # The task's ID. Set by the server. + # Output only. + "id": "1234567", + + # The task's state. Possible states: + # QUEUED + # INITILIZING + # RUNNING + # PAUSED + # COMPLETE + # EXECUTOR_ERROR + # SYSTEM_ERROR + # CANCELED + # + # Output only. + "state": "QUEUED", + + # Metadata + "name": "Task name.", + "description": "Task description.", + "tags": { + "custom-tag-1": "tag-value-1", + "custom-tag-2": "tag-value-2", + }, + + # Resource requests + "resources": { + # Number of CPU cores requested. + "cpuCores": 1, + + # RAM request, in gigabytes. + "ramGb": 1.0, + + # Disk space request, in gigabytes. + "diskGb": 100.0, + + # Request preemptible machines, + # e.g. preemptible VM in Google Cloud, an instance from the AWS Spot Market, etc. 
+ "preemptible": false, + + # Request that the task run in these compute zones. + "zones": ["zone1", "zone2"], + }, + + # Input files will be downloaded by the worker. + # This example uses s3, but Funnel supports multiple filesystems. + "inputs": [ + { + "name": "Input file.", + "description": "Input file description.", + + # URL to download file from. + "url": "s3://my-bucket/object/path/file.txt", + # Path to download file to. + "path": "/container/input.txt" + }, + { + "name": "Input directory.", + "description": "Directories are also supported.", + "url": "s3://my-bucket/my-data/", + "path": "/inputs/my-data/", + "type": "DIRECTORY" + }, + + # A task may include the file content directly in the task message. + # This is sometimes useful for small files such as scripts, + # which you want to include without talking directly to the filesystem. + { + "path": "/inputs/script.py", + "content": "import socket; print socket.gethostname()" + } + ], + + # Output files will be uploaded to storage by the worker. + "outputs": [ + { + "name": "Output file.", + "description": "Output file description.", + "url": "s3://my-bucket/output-data/results.txt", + "path": "/outputs/results.txt" + }, + { + "name": "Output directory.", + "description": "Directories are also supported.", + "url": "s3://my-bucket/output-data/output-dir/", + "path": "/outputs/data-dir/", + "type": "DIRECTORY" + } + ], + + # Executors define a sequence of containers + commands to run. + # Execution stop on the first non-zero exit code. + "executors": [ + { + # Container image name. + # Funnel supports running executor containers via Docker. + "image": "ubuntu", + + # Command arguments (argv). + # The first item is the executable to run. + "command": ["my-tool-1", "/container/input"], + + # Local file path to read stdin from. + "stdin": "/inputs/stdin.txt", + + # Local file path to write stdout to. + "stdout": "/container/output", + + # Local file path to write stderr to. + "stderr": "/container/stderr", + + # Set the working directory before executing the command. + "workdir": "/data/workdir", + + # Environment variables + "env": { + "ENV1": "value1", + "ENV2": "value2", + } + }, + + # Second executor runs after the first completes, on the same machine. + { + "image": "ubuntu", + "command": ["cat", "/container/input"], + "stdout": "/container/output", + "stderr": "/container/stderr", + "workdir": "/tmp" + } + ] + + # Date/time the task was created. + # Set the the server. + # Output only. + "creationTime": "2017-11-14T11:49:04.427163701-08:00" + + # Task logs. + # Output only. + # + # If there's a system error, the task may be attempted multiple times, + # so this field is a list of attempts. In most cases, there will be only + # one or zero entries here. + "logs": [ + + # Attempt start/end times, in RFC3339 format. + "startTime": "2017-11-14T11:49:04.433593468-08:00", + "endTime": "2017-11-14T11:49:08.487707039-08:00" + + # Arbitrary metadata set by Funnel. + "metadata": { + "hostname": "worker-1", + }, + + # Arbitrary system logs which Funnel thinks are useful to the user. + "systemLogs": [ + "task was assigned to worker 1", + "docker command: docker run -v /vol:/data alpine cmd arg1 arg2", + ], + + # Log of files uploaded to storage by the worker, + # including all files in directories, with file sizes. 
+ "outputs": [ + { + "url": "s3://my-bucket/output-data/results.txt", + "path": "/outputs/results.txt", + "sizeBytes": 123 + }, + { + "url": "s3://my-bucket/output-data/output-dir/file1.txt", + "path": "/outputs/data-dir/file1.txt", + "sizeBytes": 123 + }, + { + "url": "s3://my-bucket/output-data/output-dir/file2.txt", + "path": "/outputs/data-dir/file2.txt", + "sizeBytes": 123 + } + { + "url": "s3://my-bucket/output-data/output-dir/subdir/file3.txt", + "path": "/outputs/data-dir/subdir/file3.txt", + "sizeBytes": 123 + } + ], + + # Executor logs. One entry per executor. + "logs": [ + { + # Executor start/end time, in RFC3339 format. + "startTime": "2017-11-14T11:49:05.127885125-08:00", + "endTime": "2017-11-14T11:49:08.484461502-08:00", + + # Executor stdout/err. Only available in the FULL task view. + # + # There is a size limit for these fields, which is configurable + # and defaults to 10KB. If more than 10KB is generated, only the + # tail will be logged. If the full output is needed, the task + # may use Executor.stdout and an output to upload the full content + # to storage. + "stdout": "Hello, Funnel!", + "stderr": "", + + # Exit code + "exit_code": 0, + }, + { + "startTime": "2017-11-14T11:49:05.127885125-08:00", + "endTime": "2017-11-14T11:49:08.484461502-08:00", + "stdout": "Hello, Funnel!\n" + } + ], + } + ], +} +``` + +[alpine]: https://hub.docker.com/_/alpine/ diff --git a/docs/tools/funnel/download.md b/docs/tools/funnel/download.md new file mode 100644 index 0000000..cded4a2 --- /dev/null +++ b/docs/tools/funnel/download.md @@ -0,0 +1,38 @@ +--- +title: Download 0.11.0 +menu: + main: + weight: -2000 +--- + +{{< download-links >}} + +Funnel is a single binary. +Funnel requires [Docker][docker]. +Funnel is beta quality. APIs might break, bugs exist, data might be lost. + +### Homebrew + +```sh +brew tap ohsu-comp-bio/formula +brew install funnel@0.11 +``` + +
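+After installing, you can confirm the binary is on your `PATH` by printing the version:
+
+```sh
+funnel version
+```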

+### Build the latest development version (optional)

+ +In order to build the latest code, run: +```shell +$ git clone https://github.com/ohsu-comp-bio/funnel.git +$ cd funnel +$ make +``` + +Funnel requires Go 1.21+. Check out the [development docs][dev] for more detail. + +### Release History + +See the [Releases](https://github.com/ohsu-comp-bio/funnel/releases) page for release history. + + +[dev]: /docs/development/developers/ +[docker]: https://docker.io diff --git a/docs/tools/funnel/index.md b/docs/tools/funnel/index.md new file mode 100644 index 0000000..b65f002 --- /dev/null +++ b/docs/tools/funnel/index.md @@ -0,0 +1,10 @@ +## Funnel Tool Documentation + +The Funnel tool is designed to streamline data processing workflows, enabling efficient data transformation and analysis. Key features include: + +- **S3 Integration**: Seamlessly add and manage files from Amazon S3. +- **Data Transformation**: Predefined pipelines for common data processing tasks. +- **Automation**: Schedule and automate repetitive data workflows. +- **Monitoring**: Track the status and performance of data jobs in real-time. +- **Workflow engine compatibile**: Compatible with Nextflow + diff --git a/docs/tools/git-drs/adding-s3-files.md b/docs/tools/git-drs/adding-s3-files.md new file mode 100644 index 0000000..c44d6a7 --- /dev/null +++ b/docs/tools/git-drs/adding-s3-files.md @@ -0,0 +1,209 @@ +# Adding S3 Files to Git DRS + +The `git drs add-url` command allows you to associate an S3 URL with a Git DRS repository without moving the actual data. This command registers the S3 file location in the Gen3 indexd service and creates a Git LFS pointer file. + +## Use Cases + +There are two main use cases for adding S3 files: + +### 1. Adding S3 Files from Gen3-Registered Buckets +If the S3 bucket is already registered in Gen3, the system can automatically retrieve the region and endpoint information from the Gen3 configuration. You only need to supply AWS credentials. + +### 2. Adding S3 Files from Non-Registered Buckets +If the S3 bucket is not registered in Gen3, you must provide both AWS credentials and bucket configuration (region and endpoint URL). + +## AWS Configuration + +This command follows the standard AWS CLI authentication and configuration precedence as documented in the [AWS CLI Authentication Guide](https://docs.aws.amazon.com/cli/v1/userguide/cli-chap-authentication.html) + +### Configuration Priority (Highest to Lowest) + +1. **Command-line flags**: `--aws-access-key-id`, `--aws-secret-access-key`, `--region`, `--endpoint-url` +2. **Environment variables**: `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_REGION`, `AWS_ENDPOINT_URL` +3. **AWS configuration files**: `~/.aws/credentials` first, then `~/.aws/config` +4. **Gen3 bucket registration**: For registered buckets, region and endpoint are retrieved from Gen3 +5. **IAM roles**: For EC2 instances or containers with attached IAM roles + +See the [AWS CLI Configuration Guide](https://github.com/aws/aws-cli#configuration) for the various ways to set up your credentials. 
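+
+If it is unclear which credentials or region the standard resolution chain will produce, the AWS CLI (assuming it is installed; `git drs` itself only needs the resolved values) can show you:
+
+```sh
+# Show each resolved setting and where it came from (flag, env, config file, ...).
+aws configure list
+
+# Confirm which identity the resolved credentials belong to.
+aws sts get-caller-identity
+```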
+ +## Prerequisites + +- Git LFS tracking must be configured for the file +- AWS credentials with read access to the S3 bucket +- For non-registered buckets: AWS region and endpoint URL + +## Command Syntax + +```bash +git drs add-url s3://bucket/path/to/file --sha256 [options] +``` + +### Required Parameters + +- `s3://bucket/path/to/file`: The S3 URL of the file to be added +- `--sha256 `: The SHA256 hash of the file (64-character hexadecimal string) + +### Optional Parameters + +- `--aws-access-key-id `: AWS access key for authentication +- `--aws-secret-access-key `: AWS secret key for authentication +- `--region `: AWS region (e.g., `us-west-2`, `us-east-1`) + - Required for buckets not registered in Gen3 (unless configured in AWS config file) +- `--endpoint-url `: S3 endpoint URL (e.g., `https://s3.example.com`) + - Required for buckets not registered in Gen3 (unless configured in AWS config file) + +## Examples + +### Example 1: Gen3-Registered Bucket with Command-Line Credentials + +If your bucket is registered in Gen3, you only need to provide AWS credentials: + +```bash +# Track the file with Git LFS +git lfs track "my-file" +git add .gitattributes + +# Add the S3 file using command-line credentials +git drs add-url s3://my-registered-bucket/path/to/my-file \ + --sha256 1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef \ + --aws-access-key-id myAccessKey \ + --aws-secret-access-key mySecretKey + +# Commit and push +git commit -m "Add file from registered bucket" +git push +``` + +### Example 2: Gen3-Registered Bucket with Environment Variables + +```bash +# Set AWS credentials via environment variables +export AWS_ACCESS_KEY_ID=myAccessKey +export AWS_SECRET_ACCESS_KEY=mySecretKey + +# Track the file with Git LFS +git lfs track "my-file" +git add .gitattributes + +# Add the S3 file (credentials from environment) +git drs add-url s3://my-registered-bucket/path/to/my-file \ + --sha256 1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef + +# Commit and push +git commit -m "Add file from registered bucket" +git push +``` + +### Example 3: Non-Registered Bucket with Command-Line Credentials + +For buckets not registered in Gen3, provide region and endpoint: + +```bash +# Set credentials via environment variables +export AWS_ACCESS_KEY_ID=myAccessKey +export AWS_SECRET_ACCESS_KEY=mySecretKey + +# Track the file with Git LFS +git lfs track "my-file" +git add .gitattributes + +# Add the S3 file with region and endpoint +git drs add-url s3://my-custom-bucket/path/to/my-file \ + --sha256 1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef \ + --region us-west-2 \ + --endpoint-url https://s3.custom-provider.com + +# Commit and push +git commit -m "Add file from custom bucket" +git push +``` + +### Example 4: Non-Registered Bucket with AWS Configuration Files + +You can also configure AWS credentials and settings in `~/.aws/credentials` and `~/.aws/config`: + +**~/.aws/credentials:** +```ini +[default] +aws_access_key_id = myAccessKey +aws_secret_access_key = mySecretKey +``` + +**~/.aws/config:** +```ini +[default] +region = us-west-2 +s3 = + endpoint_url = https://s3.custom-provider.com +``` + +Then run the command without any credential flags: + +```bash +git lfs track "my-file" +git add .gitattributes + +# Credentials and configuration loaded from ~/.aws/ files +git drs add-url s3://my-bucket/path/to/my-file \ + --sha256 1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef + +git commit -m "Add file using AWS config files" +git push 
+``` + +### Example 5: Multiple Files from Registered Bucket + +```bash +# Track all files in a directory +git lfs track "data-directory/**" +git add .gitattributes + +# Set credentials once +export AWS_ACCESS_KEY_ID=myAccessKey +export AWS_SECRET_ACCESS_KEY=mySecretKey + +# Add multiple files +git drs add-url s3://my-bucket/data-directory/file-1.dat \ + --sha256 1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef + +git drs add-url s3://my-bucket/data-directory/subdir/file-2.dat \ + --sha256 abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890 + +git drs add-url s3://my-bucket/data-directory/file-3.dat \ + --sha256 fedcba0987654321fedcba0987654321fedcba0987654321fedcba0987654321 + +# Commit all at once +git commit -m "Add multiple data files" +git push +``` + + +## Notes + +- **Git LFS Tracking**: Files must be tracked by Git LFS before running `add-url`. Use `git lfs track ` to configure tracking. +- **SHA256 Hash**: You must calculate the SHA256 hash of your file beforehand. Use `shasum -a 256 ` or similar tools. +- **Credentials Security**: Avoid putting credentials directly in command-line history. Use environment variables or AWS configuration files. +- **Bucket Registration**: For frequently used buckets, consider registering them in Gen3 to simplify the process. +- **Multiple URLs**: If a file is already registered, running `add-url` with a different S3 URL will add that URL to the existing record. +- **Project Isolation**: Each Git DRS project maintains separate indexd records, even for identical file hashes. + +## Troubleshooting + +### "file is not tracked by LFS" +Run `git lfs track ` to track the file pattern, then `git add .gitattributes`. + +### "Unable to get bucket details" +This means the bucket is not registered in Gen3. Provide `--region` and `--endpoint-url` flags or configure them in your AWS config file. + +### "unable to load AWS SDK config" +Check your AWS configuration: +- Verify credentials are set (via flags, environment, or `~/.aws/credentials`) +- Ensure `~/.aws/config` file is valid if you're using it +- Check that IAM roles are properly configured if running on EC2/ECS + +### "failed to head object" +This usually means: +- Credentials don't have permission to access the object +- The S3 URL is incorrect +- The endpoint or region is misconfigured +- Network connectivity issues diff --git a/docs/tools/git-drs/commands.md b/docs/tools/git-drs/commands.md new file mode 100644 index 0000000..23db84c --- /dev/null +++ b/docs/tools/git-drs/commands.md @@ -0,0 +1,507 @@ +# Commands Reference + +Complete reference for Git DRS and related Git LFS commands. + +> **Navigation:** [Getting Started](getting-started.md) → **Commands Reference** → [Troubleshooting](troubleshooting.md) + +## Git DRS Commands + +### `git drs init` + +Initialize Git DRS in a repository. Sets up Git LFS custom transfer hooks and configures `.gitignore` patterns. + +**Usage:** + +```bash +git drs init [flags] +``` + +**Options:** + +- `--transfers `: Number of concurrent transfers (default: 4) + +**Example:** + +```bash +git drs init +``` + +**What it does:** + +- Creates `.drs/` directory structure +- Configures Git LFS custom transfer agent +- Updates `.gitignore` to exclude DRS cache files +- Stages `.gitignore` changes automatically + +**Note:** Run this before adding remotes. + +### `git drs remote` + +Manage DRS remote server configurations. Git DRS supports multiple remotes for working with development, staging, and production servers. 
+ +#### `git drs remote add gen3 ` + +Add a Gen3 DRS server configuration. + +**Usage:** + +```bash +git drs remote add gen3 \ + --url \ + --cred \ + --project \ + --bucket +``` + +**Options:** + +- `--url `: Gen3 server endpoint (required) +- `--cred `: Path to credentials JSON file (required) +- `--token `: Token for temporary access (alternative to --cred) +- `--project `: Project ID in format `-` (required) +- `--bucket `: S3 bucket name (required) + +**Examples:** + +```bash +# Add production remote +git drs remote add gen3 production \ + --url https://calypr-public.ohsu.edu \ + --cred /path/to/credentials.json \ + --project my-project \ + --bucket my-bucket + +# Add staging remote +git drs remote add gen3 staging \ + --url https://staging.calypr.ohsu.edu \ + --cred /path/to/staging-credentials.json \ + --project staging-project \ + --bucket staging-bucket +``` + +**Note:** The first remote you add automatically becomes the default remote. + +#### `git drs remote add anvil ` + +Add an AnVIL/Terra DRS server configuration. + +> **Note:** AnVIL support is under active development. For production use, we recommend Gen3 workflows or version 0.2.2 for AnVIL functionality. + +**Usage:** + +```bash +git drs remote add anvil --terraProject +``` + +**Options:** + +- `--terraProject `: Terra/Google Cloud project ID (required) + +**Example:** + +```bash +git drs remote add anvil development --terraProject my-terra-project +``` + +#### `git drs remote list` + +List all configured DRS remotes. + +**Usage:** + +```bash +git drs remote list +``` + +**Example Output:** + +``` +* production gen3 https://calypr-public.ohsu.edu + staging gen3 https://staging.calypr.ohsu.edu + development gen3 https://dev.calypr.ohsu.edu +``` + +The `*` indicates the default remote used by all commands unless specified otherwise. + +#### `git drs remote set ` + +Set the default DRS remote for all operations. + +**Usage:** + +```bash +git drs remote set +``` + +**Examples:** + +```bash +# Switch to staging for testing +git drs remote set staging + +# Switch back to production +git drs remote set production + +# Verify change +git drs remote list +``` + +### `git drs fetch [remote-name]` + +Fetch DRS object metadata from remote server. Downloads metadata only, not actual files. + +**Usage:** + +```bash +# Fetch from default remote +git drs fetch + +# Fetch from specific remote +git drs fetch staging +git drs fetch production +``` + +**Note:** `fetch` and `push` are commonly used together for cross-remote workflows. See `git drs push` below. + +**What it does:** + +- Identifies remote and project from configuration +- Transfers all DRS records for a given project from the server to the local `.drs/lfs/objects/` directory + +### `git drs push [remote-name]` + +Push local DRS objects to server. Uploads new files and registers metadata. 
+ +**Usage:** + +```bash +# Push to default remote +git drs push + +# Push to specific remote +git drs push staging +git drs push production +``` + +**What it does:** + +- Checks local `.drs/lfs/objects/` for DRS metadata +- For each object, uploads file to bucket if file exists locally +- If file doesn't exist locally (metadata only), registers metadata without upload +- This enables cross-remote promotion workflows + +**Cross-Remote Promotion:** + +Transfer DRS records from one remote to another (eg staging to production) without re-uploading files: + +```bash +# Fetch metadata from staging +git drs fetch staging + +# Push metadata to production (no file upload since files don't exist locally) +git drs push production +``` + +This is useful when files are already in the production bucket with matching SHA256 hashes. It can also be used to reupload files given that the files are pulled to the repo first. + +**Note:** `fetch` and `push` are commonly used together. `fetch` pulls metadata from one remote, `push` registers it to another. + +### `git drs add-url` + +Add a file reference via S3 URL without copying the data. + +**Usage:** + +```bash +# Use default remote +git drs add-url s3://bucket/path/file --sha256 + +# Use specific remote +git drs add-url s3://bucket/path/file --sha256 --remote staging +``` + +**With AWS Credentials:** + +```bash +git drs add-url s3://bucket/path/file \ + --sha256 \ + --aws-access-key \ + --aws-secret-key +``` + +**Options:** + +- `--sha256 `: Required SHA256 hash of the file +- `--remote `: Target remote (default: default_remote) +- `--aws-access-key `: AWS access key +- `--aws-secret-key `: AWS secret key +- `--endpoint `: Custom S3 endpoint +- `--region `: AWS region + +### `git drs create-cache` + +Create a cache from a manifest file (Terra/AnVIL). + +```bash +git drs create-cache manifest.tsv +``` + +### `git drs version` + +Display Git DRS version information. + +```bash +git drs version +``` + +### Internal Commands + +These commands are called automatically by Git hooks: + +- `git drs precommit`: Process staged files during commit +- `git drs transfer`: Handle file transfers during push/pull +- `git drs transferref`: Handle reference transfers (AnVIL/Terra) + +## Git LFS Commands + +### `git lfs track` + +Manage file tracking patterns. + +**View Tracked Patterns:** + +```bash +git lfs track +``` + +**Track New Pattern:** + +```bash +git lfs track "*.bam" +git lfs track "data/**" +git lfs track "specific-file.txt" +``` + +**Untrack Pattern:** + +```bash +git lfs untrack "*.bam" +``` + +### `git lfs ls-files` + +List LFS-tracked files in the repository. + +**All Files:** + +```bash +git lfs ls-files +``` + +**Specific Pattern:** + +```bash +git lfs ls-files -I "*.bam" +git lfs ls-files -I "data/**" +``` + +**Output Format:** + +- `*` prefix: File is localized (downloaded) +- `-` prefix: File is not localized +- No prefix: File status unknown + +### `git lfs pull` + +Download LFS-tracked files. + +**All Files:** + +```bash +git lfs pull +``` + +**Specific Files:** + +```bash +git lfs pull -I "*.bam" +git lfs pull -I "data/important.txt" +git lfs pull -I "results/**" +``` + +**Multiple Patterns:** + +```bash +git lfs pull -I "*.bam" -I "*.vcf" +``` + +### `git lfs install` + +Configure Git LFS for the system or repository. 
+ +**System-wide:** + +```bash +git lfs install --skip-smudge +``` + +**Repository-only:** + +```bash +git lfs install --local --skip-smudge +``` + +The `--skip-smudge` option prevents automatic downloading of all LFS files during clone/checkout. + +## Standard Git Commands + +Git DRS integrates with standard Git commands: + +### `git add` + +Stage files for commit. LFS-tracked files are automatically processed. + +```bash +git add myfile.bam +git add data/ +git add . +``` + +### `git commit` + +Commit changes. Git DRS pre-commit hook runs automatically. + +```bash +git commit -m "Add new data files" +``` + +### `git push` + +Push commits to remote. Git DRS automatically uploads new files to DRS server. + +```bash +git push +git push origin main +``` + +### `git clone` + +Clone repository. Use with Git DRS initialization: + +```bash +git clone +cd +git drs init +git drs remote add gen3 production --cred /path/to/credentials.json --url ... --project ... --bucket ... +``` + +## Workflow Examples + +### Complete File Addition Workflow + +```bash +# 1. Ensure file type is tracked +git lfs track "*.bam" +git add .gitattributes + +# 2. Add your file +git add mydata.bam + +# 3. Verify tracking +git lfs ls-files -I "mydata.bam" + +# 4. Commit (creates DRS record) +git commit -m "Add analysis results" + +# 5. Push (uploads to default DRS server) +git push +``` + +### Selective File Download + +```bash +# Check what's available +git lfs ls-files + +# Download specific files +git lfs pull -I "results/*.txt" +git lfs pull -I "important-dataset.bam" + +# Verify download +git lfs ls-files -I "results/*.txt" +``` + +### Repository Setup from Scratch + +```bash +# 1. Create and clone repo +git clone +cd + +# 2. Initialize Git DRS +git drs init + +# 3. Add DRS remote +git drs remote add gen3 production \ + --url https://calypr-public.ohsu.edu \ + --cred /path/to/credentials.json \ + --project my-project \ + --bucket my-bucket + +# 4. Set up file tracking +git lfs track "*.bam" +git lfs track "*.vcf.gz" +git lfs track "data/**" +git add .gitattributes +git commit -m "Configure LFS tracking" +git push + +# 5. Add data files +git add data/sample1.bam +git commit -m "Add sample data" +git push +``` + +### Cross-Remote Promotion Workflow + +```bash +# 1. Add multiple remotes +git drs remote add gen3 staging \ + --url https://staging.calypr.ohsu.edu \ + --cred /path/to/staging-credentials.json \ + --project staging-project \ + --bucket staging-bucket + +git drs remote add gen3 production \ + --url https://calypr-public.ohsu.edu \ + --cred /path/to/prod-credentials.json \ + --project prod-project \ + --bucket prod-bucket + +# 2. Fetch metadata from staging +git drs fetch staging + +# 3. 
Push metadata to production (no re-upload) +git drs push production +``` + +## Environment Variables + +Git DRS respects these environment variables: + +- `AWS_ACCESS_KEY_ID`: AWS access key (for S3 operations) +- `AWS_SECRET_ACCESS_KEY`: AWS secret key (for S3 operations) +- `GOOGLE_PROJECT`: Google Cloud project ID (for AnVIL) +- `WORKSPACE_BUCKET`: Terra workspace bucket (for AnVIL) + +## Help and Documentation + +Use `--help` with any command for detailed usage: + +```bash +git-drs --help +git-drs init --help +git-drs add-url --help +git lfs --help +git lfs track --help +``` diff --git a/docs/tools/git-drs/developer-guide.md b/docs/tools/git-drs/developer-guide.md new file mode 100644 index 0000000..c4553ea --- /dev/null +++ b/docs/tools/git-drs/developer-guide.md @@ -0,0 +1,185 @@ +# Developer Guide + +This guide covers Git DRS internals, architecture, and development information. + +## Architecture Overview + +Git DRS integrates with Git through several mechanisms: + +### Git Hooks Integration + +**Pre-commit Hook**: `git drs precommit` +- Triggered automatically before each commit +- Processes all staged LFS files +- Creates DRS records for new files +- Only processes files that don't already exist on the DRS server +- Prepares metadata for later upload during push + +**Custom Transfer Protocol** +- Git LFS uses custom transfers to communicate with Git DRS +- Handles both upload (push) and download (pull) operations +- Transfers run automatically during `git push` and `git lfs pull` +- Information passed through JSON protocol between Git LFS and Git DRS + +### File Processing Flow + +``` +1. Developer: git add file.bam +2. Developer: git commit -m "Add data" +3. Git Hook: git drs precommit + - Creates DRS object metadata + - Stores in .drs/ directory +4. Developer: git push +5. Git LFS: Initiates custom transfer +6. Git DRS: + - Registers file with DRS server (indexd record) + - Uploads file to configured bucket + - Updates transfer logs +``` + +## Custom Transfer Protocol + +Git DRS implements the [Git LFS Custom Transfer Protocol](https://github.com/git-lfs/git-lfs/blob/main/docs/custom-transfers.md). + +### Transfer Types + +**Upload Transfer (gen3)**: +- Creates indexd record on DRS server +- Uploads file to Gen3-registered S3 bucket +- Updates DRS object with access URLs + +**Download Transfer (gen3)**: +- Retrieves file metadata from DRS server +- Downloads file from configured storage +- Validates checksums + +**Reference Transfer**: +- Handles S3 URL references without data movement +- Links existing S3 objects to DRS records + +### Protocol Communication + +Git LFS and Git DRS communicate via JSON messages: + +```json +{ + "event": "init", + "operation": "upload", + "remote": "origin", + "concurrent": 3, + "concurrenttransfers": 3 +} +``` + +Response handling and logging occurs in transfer clients to avoid interfering with Git LFS stdout expectations. + +## Repository Structure + +### Core Components + +``` +cmd/ # CLI command implementations +├── initialize/ # Repository initialization +├── transfer/ # Custom transfer handlers +├── precommit/ # Pre-commit hook +├── addurl/ # S3 URL reference handling +└── ... 
+ +client/ # DRS client implementations +├── interface.go # Client interface definitions +├── indexd.go # Gen3/indexd client +├── anvil.go # AnVIL client +└── drs-map.go # File mapping utilities + +config/ # Configuration management +└── config.go # Config file handling + +drs/ # DRS object utilities +├── object.go # DRS object structures +└── util.go # Utility functions + +lfs/ # Git LFS integration +└── messages.go # LFS protocol messages + +utils/ # Shared utilities +├── common.go # Common functions +├── lfs-track.go # LFS tracking utilities +└── util.go # General utilities +``` + +### Configuration System + +**Repository Configuration**: `.drs/config` +```yaml +current_server: gen3 +servers: + gen3: + endpoint: "https://data.example.org/" + profile: "myprofile" + project: "project-123" + bucket: "data-bucket" +``` + +### DRS Object Management + +Objects are stored in `.drs/objects/` during pre-commit and referenced during transfers. + +## Development Setup + +### Prerequisites + +- Go 1.24+ +- Git LFS installed +- Access to a DRS server for testing + +### Building from Source + +```bash +# Clone repository +git clone https://github.com/calypr/git-drs.git +cd git-drs + +# Install dependencies +go mod download + +# Build +go build + +# Install locally +export PATH=$PATH:$(pwd) +``` + +### Development Workflow + +1. **Make changes** to source code +2. **Build and test**: + ```bash + go build + go test ./... + ``` +3. **Test with real repository**: + ```bash + cd /path/to/test-repo + /path/to/git-drs/git-drs --help + ``` + +## Debugging and Logging + +### Log Locations + +- **Commit logs**: `.drs/git-drs.log` +- **Transfer logs**: `.drs/git-drs.log` + + +## Testing + +### Unit Tests + +```bash +# Test specific functionality +go test ./utils -run TestLFSTrack +``` + +### Integration Tests + +**WIP** \ No newline at end of file diff --git a/docs/tools/git-drs/getting-started.md b/docs/tools/git-drs/getting-started.md new file mode 100644 index 0000000..85a4476 --- /dev/null +++ b/docs/tools/git-drs/getting-started.md @@ -0,0 +1,324 @@ +# Getting Started + +This guide walks you through setting up Git DRS and performing common workflows. + +> **Navigation:** [Installation](installation.md) → **Getting Started** → [Commands Reference](commands.md) → [Troubleshooting](troubleshooting.md) + +## Repository Initialization + +Every Git repository using Git DRS requires configuration, whether you're creating a new repo or cloning an existing one. + +### Cloning Existing Repository (Gen3) + +1. **Clone the Repository** + + ```bash + git clone .git + cd + ``` + +2. **Configure SSH** (if using SSH URLs) + + If using SSH URLs like `git@github.com:user/repo.git`, add to `~/.ssh/config`: + + ``` + Host github.com + TCPKeepAlive yes + ServerAliveInterval 30 + ``` + +3. **Get Credentials** + + - Log in to your data commons (e.g., https://calypr-public.ohsu.edu/) + - Profile → Create API Key → Download JSON + - **Note**: Credentials expire after 30 days + +4. **Initialize Repository** + + ```bash + git drs init + ``` + +5. **Verify Configuration** + + ```bash + git drs remote list + ``` + + Output: + ``` + * production gen3 https://calypr-public.ohsu.edu/ + ``` + + The `*` indicates this is the default remote. + +### New Repository Setup (Gen3) + +1. **Create and Clone Repository** + + ```bash + git clone .git + cd + ``` + +2. **Configure SSH** (if needed - same as above) + +3. **Get Credentials** (same as above) + +4. 
**Get Project Details** + + Contact your data coordinator for: + - DRS server URL + - Project ID + - Bucket name + +5. **Initialize Git DRS** + + ```bash + git drs init + ``` + +6. **Add Remote Configuration** + + ```bash + git drs remote add gen3 production \ + --cred /path/to/credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket + ``` + + **Note:** Since this is your first remote, it automatically becomes the default. No need to run `git drs remote set`. + +7. **Verify Configuration** + + ```bash + git drs remote list + ``` + + Output: + ``` + * production gen3 https://calypr-public.ohsu.edu + ``` + +**Managing Additional Remotes** + +You can add more remotes later for multi-environment workflows (development, staging, production): + +```bash +# Add staging remote +git drs remote add gen3 staging \ + --cred /path/to/staging-credentials.json \ + --url https://staging.calypr.ohsu.edu \ + --project staging-project \ + --bucket staging-bucket + +# View all remotes +git drs remote list + +# Switch default remote +git drs remote set staging + +# Or use specific remote for one command +git drs push production +git drs fetch staging +``` + +## File Tracking + +Git DRS uses Git LFS to track files. You must explicitly track file patterns before adding them. + +### View Current Tracking + +```bash +git lfs track +``` + +### Track Files + +**Single File** + +```bash +git lfs track path/to/specific-file.txt +git add .gitattributes +``` + +**File Pattern** + +```bash +git lfs track "*.bam" +git add .gitattributes +``` + +**Directory** + +```bash +git lfs track "data/**" +git add .gitattributes +``` + +### Untrack Files + +```bash +# View tracked patterns +git lfs track + +# Remove pattern +git lfs untrack "*.bam" + +# Stage changes +git add .gitattributes +``` + +## Basic Workflows + +### Adding and Pushing Files + +```bash +# Track file type (if not already tracked) +git lfs track "*.bam" +git add .gitattributes + +# Add your file +git add myfile.bam + +# Verify LFS is tracking it +git lfs ls-files + +# Commit and push +git commit -m "Add new data file" +git push +``` + +> **Note**: Git DRS automatically creates DRS records during commit and uploads files to the default remote during push. + +### Downloading Files + +**Single File** + +```bash +git lfs pull -I path/to/file.bam +``` + +**Pattern** + +```bash +git lfs pull -I "*.bam" +``` + +**All Files** + +```bash +git lfs pull +``` + +**Directory** + +```bash +git lfs pull -I "data/**" +``` + +### Checking File Status + +```bash +# List all LFS-tracked files +git lfs ls-files + +# Check specific pattern +git lfs ls-files -I "*.bam" + +# View localization status +# (-) = not localized, (*) = localized +git lfs ls-files +``` + +## Working with S3 Files + +You can add references to existing S3 files without copying them: + +```bash +# Track the file pattern first +git lfs track "myfile.txt" +git add .gitattributes + +# Add S3 reference +git drs add-url s3://bucket/path/to/file \ + --sha256 \ + --aws-access-key \ + --aws-secret-key + +# Commit and push +git commit -m "Add S3 file reference" +git push +``` + +See [S3 Integration Guide](adding-s3-files.md) for detailed examples. 
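+
+The `--sha256` value must be computed before running `git drs add-url`. A minimal way to do that locally, assuming standard command line hashing tools are available (the file name here is illustrative):
+
+```bash
+# macOS / BSD
+shasum -a 256 myfile.txt
+
+# Linux (GNU coreutils)
+sha256sum myfile.txt
+
+# Capture just the 64-character hash for use with --sha256
+HASH=$(shasum -a 256 myfile.txt | awk '{print $1}')
+echo "$HASH"
+```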
+ +## Configuration Management + +### View Configuration + +```bash +git drs remote list +``` + +### Update Configuration + +```bash +# Refresh credentials - re-add remote with new credentials +git drs remote add gen3 production \ + --cred /path/to/new-credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket + +# Switch default remote +git drs remote set staging +``` + +### View Logs + +- Logs location: `.drs/` directory + +## Command Summary + +| Action | Commands | +| ------------------ | ------------------------------------------- | +| **Initialize** | `git drs init` | +| **Add remote** | `git drs remote add gen3 --cred...` | +| **View remotes** | `git drs remote list` | +| **Set default** | `git drs remote set ` | +| **Track files** | `git lfs track "pattern"` | +| **Check tracked** | `git lfs ls-files` | +| **Add files** | `git add file.ext` | +| **Commit** | `git commit -m "message"` | +| **Push** | `git push` | +| **Download** | `git lfs pull -I "pattern"` | + +## Session Workflow + +For each work session: + +1. **Refresh credentials** (if expired - credentials expire after 30 days) + + ```bash + git drs remote add gen3 production \ + --cred /path/to/new-credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket + ``` + +2. **Work with files** (track, add, commit, push) + +3. **Download files as needed** + + ```bash + git lfs pull -I "required-files*" + ``` + +## Next Steps + +- [Commands Reference](commands.md) - Complete command documentation +- [Troubleshooting](troubleshooting.md) - Common issues and solutions +- [Developer Guide](developer-guide.md) - Advanced usage and internals diff --git a/docs/tools/git-drs/index.md b/docs/tools/git-drs/index.md new file mode 100644 index 0000000..a737352 --- /dev/null +++ b/docs/tools/git-drs/index.md @@ -0,0 +1,7 @@ +## git-drs Tool Documentation + +The git-drs (Git Data Repository Service) tool enables seamless integration of Git with data management workflows, providing version control for data files and collaborative data processing. Key features include: + +- **Git Integration**: Track changes to data files using Git version control. +- **Data Versioning**: Manage versions of datasets, configurations, and analysis results. +- **Collaboration Tools**: Facilitate team workflows with branching, merging, and pull requests. diff --git a/docs/tools/git-drs/installation.md b/docs/tools/git-drs/installation.md new file mode 100644 index 0000000..bc1c85a --- /dev/null +++ b/docs/tools/git-drs/installation.md @@ -0,0 +1,213 @@ +# Installation Guide + +This guide covers installation of Git DRS across different environments and target DRS servers. + +## Prerequisites + +All installations require [Git LFS](https://git-lfs.com/) to be installed first: + +```bash +# macOS +brew install git-lfs + +# Linux (download binary) +wget https://github.com/git-lfs/git-lfs/releases/download/v3.7.0/git-lfs-linux-amd64-v3.7.0.tar.gz +tar -xvf git-lfs-linux-amd64-v3.7.0.tar.gz +export PREFIX=$HOME +./git-lfs-v3.7.0/install.sh + +# Configure LFS +git lfs install --skip-smudge +``` + +## Local Installation (Gen3 Server) + +**Target Environment**: Local development machine targeting Gen3 data commons (e.g., CALYPR) + +### Steps + +1. **Install Git DRS** + ```bash + /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/calypr/git-drs/refs/heads/main/install.sh)" + ``` + +2. 
**Update PATH** + ```bash + # Add to ~/.bash_profile or ~/.zshrc + export PATH="$PATH:$HOME/.local/bin" + source ~/.bash_profile # or source ~/.zshrc + ``` + +3. **Verify Installation** + ```bash + git-drs --help + ``` + +4. **Get Credentials** + - Log in to your data commons (e.g., https://calypr-public.ohsu.edu/) + - Click your email → Profile → Create API Key → Download JSON + - Note the download path for later configuration + +## HPC Installation (Gen3 Server) + +**Target Environment**: High-performance computing systems targeting Gen3 servers + +### Steps + +1. **Install Git LFS on HPC** + ```bash + # Download and install Git LFS + wget https://github.com/git-lfs/git-lfs/releases/download/v3.7.1/git-lfs-linux-amd64-v3.7.1.tar.gz + tar -xvf git-lfs-linux-amd64-v3.7.1.tar.gz + export PREFIX=$HOME + ./git-lfs-3.7.1/install.sh + + # Make permanent + echo 'export PATH="$HOME/bin:$PATH"' >> ~/.bash_profile + source ~/.bash_profile + + # Configure + git lfs install --skip-smudge + + # Cleanup + rm git-lfs-linux-amd64-v3.7.0.tar.gz + rm -r git-lfs-3.7.0/ + ``` + +2. **Configure Git/SSH (if needed)** + ```bash + # Generate SSH key + ssh-keygen -t ed25519 -C "your_email@example.com" + + # Add to ssh-agent + eval "$(ssh-agent -s)" + ssh-add ~/.ssh/id_ed25519 + + # Add public key to GitHub/GitLab + cat ~/.ssh/id_ed25519.pub + ``` + +3. **Install Git DRS** + ```bash + /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/calypr/git-drs/refs/heads/fix/install-error-macos/install.sh)" + + # Update PATH + echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bash_profile + source ~/.bash_profile + ``` + +4. **Verify Installation** + ```bash + git-drs version + ``` + +## Terra/Jupyter Installation (AnVIL Server) + +**Target Environment**: Terra Jupyter notebooks targeting AnVIL DRS servers + +### Steps + +1. **Launch Jupyter Environment** in Terra + +2. **Open Terminal** in Jupyter + +3. **Install Dependencies** + ```bash + # Install Git DRS + /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/calypr/git-drs/refs/heads/fix/install-error-macos/install.sh)" + + # Install DRS Downloader + wget https://github.com/anvilproject/drs_downloader/releases/download/0.1.6-rc.4/drs_downloader + chmod 755 drs_downloader + ``` + +4. **Verify Installation** + ```bash + git-drs --help + drs_downloader --help + ``` + +5. **Example Workflow** + ```bash + # Clone example repository + git clone https://github.com/quinnwai/super-cool-anvil-analysis.git + cd super-cool-anvil-analysis/ + + # Initialize and configure for your Terra project + git drs init + git drs remote add anvil development --terraProject $GOOGLE_PROJECT + + # Work with manifests + gsutil cp $WORKSPACE_BUCKET/anvil-manifest.tsv . + git drs create-cache anvil-manifest.tsv + + # List and pull files + git lfs ls-files + git lfs pull -I data_tables_sequencing_dataset.tsv + ``` + +## Local Installation (AnVIL Server) + +**Target Environment**: Local development machine targeting AnVIL servers + +### Steps + +1. **Install Git DRS** (same as Gen3 local installation) + ```bash + /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/calypr/git-drs/refs/heads/fix/install-error-macos/install.sh)" + ``` + +2. **Get Terra Project ID** + - Log in to [AnVIL Workspaces](https://anvil.terra.bio/#workspaces) + - Select your workspace + - Copy the Google Project ID from "CLOUD INFORMATION" + +3. 
**Configure AnVIL Access** + ```bash + # Check existing configuration + git drs remote list + + # If no AnVIL server configured, add it + git drs init + git drs remote add anvil development --terraProject + ``` + +## Build from Source + +For development or custom builds: + +```bash +# Clone repository +git clone https://github.com/calypr/git-drs.git +cd git-drs + +# Build +go build + +# Make accessible +export PATH=$PATH:$(pwd) +``` + +## Post-Installation + +After installation, verify your setup: + +```bash +# Check Git DRS version +git-drs version + +# Check Git LFS +git lfs version + +# View configured remotes (after setup) +git drs remote list +``` + +## Next Steps + +After installation, see: + +> **Navigation:** [Installation](installation.md) → [Getting Started](getting-started.md) → [Commands Reference](commands.md) + +- **[Getting Started](getting-started.md)** - Repository setup and basic workflows +- **[Commands Reference](commands.md)** - Complete command documentation diff --git a/docs/workflows/remove-files.md b/docs/tools/git-drs/remove-files.md similarity index 95% rename from docs/workflows/remove-files.md rename to docs/tools/git-drs/remove-files.md index e97ec77..30c0cbc 100644 --- a/docs/workflows/remove-files.md +++ b/docs/tools/git-drs/remove-files.md @@ -6,12 +6,12 @@ title: Removing Files When removing data files from your project, it's crucial to also update the manifest and associated metadata to maintain consistency. -### 1. Remove File(s) Using `g3t rm` +### 1. Remove File(s) Using `git rm` -Use the `g3t rm` command to delete files and automatically update the manifest and metadata: +Use the `git rm` command to delete files and automatically update the manifest and metadata: ```bash -g3t rm DATA/subject-123/vcf/sample1.vcf.gz +git rm DATA/subject-123/vcf/sample1.vcf.gz ``` This command performs the following actions: diff --git a/docs/tools/git-drs/troubleshooting.md b/docs/tools/git-drs/troubleshooting.md new file mode 100644 index 0000000..9423564 --- /dev/null +++ b/docs/tools/git-drs/troubleshooting.md @@ -0,0 +1,421 @@ +# Troubleshooting + +Common issues and solutions when working with Git DRS. 
+ +> **Navigation:** [Getting Started](getting-started.md) → [Commands Reference](commands.md) → **Troubleshooting** + +## When to Use Which Tool + +Understanding when to use Git, Git LFS, or Git DRS commands: + +### Git DRS Commands + +**Use for**: Repository and remote configuration + +- `git drs init` - Initialize Git LFS hooks +- `git drs remote add` - Configure DRS server connections +- `git drs remote list` - View configured remotes +- `git drs add-url` - Add S3 file references + +**When**: + +- Setting up a new repository +- Adding/managing DRS remotes +- Refreshing expired credentials +- Adding external file references + +### Git LFS Commands + +**Use for**: File tracking and management + +- `git lfs track` - Define which files to track +- `git lfs ls-files` - See tracked files and status +- `git lfs pull` - Download specific files +- `git lfs untrack` - Stop tracking file patterns + +**When**: + +- Managing which files are stored externally +- Downloading specific files +- Checking file localization status + +### Standard Git Commands + +**Use for**: Version control operations + +- `git add` - Stage files for commit +- `git commit` - Create commits +- `git push` - Upload commits and trigger file uploads +- `git pull` - Get latest commits + +**When**: + +- Normal development workflow +- Git DRS runs automatically in the background + +## Common Error Messages + +### Authentication Errors + +**Error**: `Upload error: 403 Forbidden` or `401 Unauthorized` + +**Cause**: Expired or invalid credentials + +**Solution**: + +```bash +# Download new credentials from your data commons +# Then refresh them by re-adding the remote +git drs remote add gen3 production \ + --cred /path/to/new-credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket +``` + +**Prevention**: + +- Credentials expire after 30 days +- Set a reminder to refresh them regularly + +--- + +**Error**: `Upload error: 503 Service Unavailable` + +**Cause**: DRS server is temporarily unavailable or credentials expired + +**Solutions**: + +1. Wait and retry the operation +2. Refresh credentials: + ```bash + git drs remote add gen3 production \ + --cred /path/to/credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket + ``` +3. 
If persistent, download new credentials from the data commons + +### Network Errors + +**Error**: `net/http: TLS handshake timeout` + +**Cause**: Network connectivity issues + +**Solution**: + +- Simply retry the command +- These are usually temporary network issues + +--- + +**Error**: Git push timeout during large file uploads + +**Cause**: Long-running operations timing out + +**Solution**: Add to `~/.ssh/config`: + +``` +Host github.com + TCPKeepAlive yes + ServerAliveInterval 30 +``` + +### File Tracking Issues + +**Error**: Files not being tracked by LFS + +**Symptoms**: + +- Large files committed directly to Git +- `git lfs ls-files` doesn't show your files + +**Solution**: + +```bash +# Check what's currently tracked +git lfs track + +# Track your file type +git lfs track "*.bam" +git add .gitattributes + +# Remove from Git and re-add +git rm --cached large-file.bam +git add large-file.bam +git commit -m "Track large file with LFS" +``` + +--- + +**Error**: `[404] Object does not exist on the server` + +**Symptoms**: + +- After clone, git pull fails + +**Solution**: + +```bash +# confirm repo has complete configuration +git drs list-config + +# init your git drs project +git drs init --cred /path/to/cred/file --profile + +# attempt git pull again +git lfs pull -I path/to/file +``` + +--- + +**Error**: `git lfs ls-files` shows files but they won't download + +**Cause**: Files may not have been properly uploaded or DRS records missing + +**Solution**: + +```bash +# Check repository status +git drs list-config + +# Try pulling with verbose output +git lfs pull -I "problematic-file*" --verbose + +# Check logs +cat .drs/*.log +``` + +### Configuration Issues + +**Error**: `git drs remote list` shows empty or incomplete configuration + +**Cause**: Repository not properly initialized or no remotes configured + +**Solution**: + +```bash +# Initialize repository if needed +git drs init + +# Add Gen3 remote +git drs remote add gen3 production \ + --cred /path/to/credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket + +# For AnVIL +git drs remote add anvil development --terraProject + +# Verify configuration +git drs remote list +``` + +--- + +**Error**: Configuration exists but commands fail + +**Cause**: Mismatched configuration between global and local settings, or expired credentials + +**Solution**: + +```bash +# Check configuration +git drs remote list + +# Refresh credentials by re-adding the remote +git drs remote add gen3 production \ + --cred /path/to/new-credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket +``` + +### Remote Configuration Issues + +**Error**: `no default remote configured` + +**Cause**: Repository initialized but no remotes added yet + +**Solution**: + +```bash +# Add your first remote (automatically becomes default) +git drs remote add gen3 production \ + --cred /path/to/credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket +``` + +--- + +**Error**: `default remote 'X' not found` + +**Cause**: Default remote was deleted or configuration is corrupted + +**Solution**: + +```bash +# List available remotes +git drs remote list + +# Set a different remote as default +git drs remote set staging + +# Or add a new remote +git drs remote add gen3 production \ + --cred /path/to/credentials.json \ + --url https://calypr-public.ohsu.edu \ + --project my-project \ + --bucket my-bucket +``` + +--- + +**Error**: Commands 
using wrong remote + +**Cause**: Default remote is not the one you want to use + +**Solution**: + +```bash +# Check current default +git drs remote list + +# Option 1: Change default remote +git drs remote set production + +# Option 2: Specify remote for single command +git drs push staging +git drs fetch production +``` + +## Undoing Changes + +### Untrack LFS Files + +If you accidentally tracked the wrong files: + +```bash +# See current tracking +git lfs track + +# Remove incorrect pattern +git lfs untrack "wrong-dir/**" + +# Add correct pattern +git lfs track "correct-dir/**" + +# Stage the changes +git add .gitattributes +git commit -m "Fix LFS tracking patterns" +``` + +### Undo Git Add + +Remove files from staging area: + +```bash +# Check what's staged +git status + +# Unstage specific files +git restore --staged file1.bam file2.bam + +# Unstage all files +git restore --staged . +``` + +### Undo Last Commit + +To retry a commit with different files: + +```bash +# Undo last commit, keep files in working directory +git reset --soft HEAD~1 + +# Or undo and unstage files +git reset HEAD~1 + +# Or completely undo commit and changes (BE CAREFUL!) +git reset --hard HEAD~1 +``` + +### Remove Files from LFS History + +If you committed large files directly to Git by mistake: + +```bash +# Remove from Git history (use carefully!) +git filter-branch --tree-filter 'rm -f large-file.dat' HEAD + +# Then track properly with LFS +git lfs track "*.dat" +git add .gitattributes +git add large-file.dat +git commit -m "Track large file with LFS" +``` + +## Diagnostic Commands + +### Check System Status + +```bash +# Git DRS version and help +git-drs version +git-drs --help + +# Configuration +git drs remote list + +# Repository status +git status +git lfs ls-files +``` + +### View Logs + +```bash +# Git DRS logs (in repository) +ls -la .drs/ +cat .drs/*.log +``` + +### Test Connectivity + +```bash +# Test basic Git operations +git lfs pull --dry-run + +# Test DRS configuration +git drs remote list +``` + +## Getting Help + +### Log Analysis + +When reporting issues, include: + +```bash +# System information +git-drs version +git lfs version +git --version + +# Configuration +git drs remote list + +# Recent logs +tail -50 .drs/*.log +``` + +## Prevention Best Practices + +1. **Test in small batches** - Don't commit hundreds of files at once +2. **Verify tracking** - Always check `git lfs ls-files` after adding files +3. **Use .gitignore** - Prevent accidental commits of temporary files +4. **Monitor repository size** - Keep an eye on `.git` directory size diff --git a/docs/tools/grip/clients.md b/docs/tools/grip/clients.md new file mode 100644 index 0000000..00abc55 --- /dev/null +++ b/docs/tools/grip/clients.md @@ -0,0 +1,141 @@ +--- +title: Client Library +menu: + main: + identifier: clients + weight: 25 +--- + + +# Getting Started + +GRIP has an API for making graph queries using structured data. Queries are defined using a series of step [operations](/docs/queries/operations). + +## Install the Python Client + +Available on [PyPI](https://pypi.org/project/gripql/). + +``` +pip install gripql +``` + +Or install the latest development version: + +``` +pip install "git+https://github.com/bmeg/grip.git#subdirectory=gripql/python" +``` + + +## Using the Python Client + +Let's go through the features currently supported in the python client. 
+
+First, import the client and create a connection to a GRIP server:
+
+```python
+import gripql
+G = gripql.Connection("https://bmeg.io").graph("bmeg")
+```
+
+Some GRIP servers may require authorization to access their API endpoints. The client can be configured to pass authorization headers in its requests.
+
+```python
+import gripql
+
+# Basic Auth Header - {'Authorization': 'Basic dGVzdDpwYXNzd29yZA=='}
+G = gripql.Connection("https://bmeg.io", user="test", password="password").graph("bmeg")
+
+# Bearer Token - {'Authorization': 'Bearer iamnotarealtoken'}
+G = gripql.Connection("https://bmeg.io", token="iamnotarealtoken").graph("bmeg")
+
+# OAuth2 / Custom - {"OauthEmail": "fake.user@gmail.com", "OauthAccessToken": "iamnotarealtoken", "OauthExpires": 1551985931}
+G = gripql.Connection("https://bmeg.io", credential_file="~/.grip_token.json").graph("bmeg")
+```
+
+Now that we have a connection to a graph instance, we can use this to make all of our queries.
+
+One of the first things you probably want to do is find some vertex out of all of the vertexes available in the system. In order to do this, we need to know something about the vertex we are looking for. To start, let's see if we can find a specific gene:
+
+```python
+result = G.V().hasLabel("Gene").has(gripql.eq("symbol", "TP53")).execute()
+print(result)
+```
+
+A couple of things about this first and simplest query. We start with `G`, our GRIP client instance connected to the "bmeg" graph, and create a new query with `G.V()`. This query is now being constructed. You can chain along as many operations as you want, and nothing will actually get sent to the server until you call `.execute()`.
+
+Once we make this query, we get a result:
+
+```python
+[
+  {
+    u'_id': u'ENSG00000141510',
+    u'_label': u'Gene',
+    u'end': 7687550,
+    u'description': u'tumor protein p53 [Source:HGNC Symbol%3BAcc:HGNC:11998]',
+    u'symbol': u'TP53',
+    u'start': 7661779,
+    u'seqId': u'17',
+    u'strand': u'-',
+    u'id': u'ENSG00000141510',
+    u'chromosome': u'17'
+  }
+]
+```
+
+This represents the vertex we queried for above. All vertexes in the system will have a similar structure, basically:
+
+* _\_id_: This represents the global identifier for this vertex. In order to draw edges between different vertexes from different data sets we need an identifier that can be constructed from available data. Often, the `_id` will be the field that you query on as a starting point for a traversal.
+* _\_label_: The label represents the type of the vertex. All vertexes with a given label will share many property keys and edge labels, and form a logical group within the system.
+
+The data on a query result can be accessed as properties on the result object; for example `result[0].data.symbol` would return:
+
+```python
+u'TP53'
+```
+
+You can also do a `has` query with a list of items using `gripql.within([...])` (other conditions exist; see the full list of query [operations](/docs/queries/operations)):
+
+```python
+result = G.V().hasLabel("Gene").has(gripql.within("symbol", ["TP53", "BRCA1"])).render({"_id": "_id", "symbol":"symbol"}).execute()
+print(result)
+```
+
+This returns both Gene vertexes:
+
+```
+[
+    {u'symbol': u'TP53', u'_id': u'ENSG00000141510'},
+    {u'symbol': u'BRCA1', u'_id': u'ENSG00000012048'}
+]
+```
+
+Once you are on a vertex, you can travel through that vertex's edges to find the vertexes it is connected to. Sometimes you don't even need to go all the way to the next vertex; the information on the edge between them may be sufficient.
+
+Edges in the graph are directional, so there are both incoming and outgoing edges from each vertex, leading to other vertexes in the graph. Edges also have a _label_, which distinguishes the kind of connections different vertexes can have with one another.
+
+Starting with gene TP53, let's see what kind of other vertexes it is connected to.
+
+```python
+result = G.V().hasLabel("Gene").has(gripql.eq("symbol", "TP53")).in_("TranscriptFor").render({"id": "_id", "label":"_label"}).execute()
+print(result)
+```
+
+Here we have introduced a couple of new steps. The first is `.in_()`. This starts from wherever you are in the graph at the moment and travels along all the incoming edges.
+Additionally, we have provided `TranscriptFor` as an argument to `.in_()`. This limits the returned vertices to only those connected to the `Gene` vertices by edges labeled `TranscriptFor`.
+
+
+```
+[
+    {u'_label': u'Transcript', u'_id': u'ENST00000413465'},
+    {u'_label': u'Transcript', u'_id': u'ENST00000604348'},
+    ...
+]
+```
+
+View a list of all available query operations [here](/docs/queries/operations).
+
+### Using the command line
+
+The GRIP command line query syntax is defined in `gripql/javascript/gripql.js`.
diff --git a/docs/tools/grip/commands.md b/docs/tools/grip/commands.md
new file mode 100644
index 0000000..ed56bc0
--- /dev/null
+++ b/docs/tools/grip/commands.md
@@ -0,0 +1,8 @@
+---
+title: GRIP Commands
+menu:
+  main:
+    identifier: commands
+    weight: 10
+---
+
diff --git a/docs/tools/grip/commands/create.md b/docs/tools/grip/commands/create.md
new file mode 100644
index 0000000..bde4aa8
--- /dev/null
+++ b/docs/tools/grip/commands/create.md
@@ -0,0 +1,27 @@
+
+---
+title: create
+
+menu:
+  main:
+    parent: commands
+    weight: 2
+---
+
+# `create`
+
+## Usage
+
+```bash
+gripql-cli create <graph-name> --host <server-url>
+```
+
+- `<graph-name>`: The name of the graph to be created (required).
+- `--host <server-url>`: The URL of the GripQL server (default is "localhost:8202").
+
+## Example
+
+```bash
+gripql-cli create my_new_graph --host myserver.com:8202
+```
+
diff --git a/docs/tools/grip/commands/delete.md b/docs/tools/grip/commands/delete.md
new file mode 100644
index 0000000..e4deeab
--- /dev/null
+++ b/docs/tools/grip/commands/delete.md
@@ -0,0 +1,42 @@
+---
+title: delete
+menu:
+  main:
+    parent: commands
+    weight: 3
+---
+
+# `delete` Command
+
+## Usage
+
+```bash
+gripql-cli delete <graph-name> --host <server-url> --file <json-file> --edges <edge-ids> --vertices <vertex-ids>
+```
+
+### Options
+
+- `<graph-name>`: Name of the graph (required)
+- `--host <server-url>`: GripQL server URL (default: "localhost:8202")
+- `--file <json-file>`: Path to a JSON file containing data to delete
+- `--edges <edge-ids>`: Comma-separated list of edge IDs to delete (ignored if JSON file is provided)
+- `--vertices <vertex-ids>`: Comma-separated list of vertex IDs to delete (ignored if JSON file is provided)
+
+## Example
+
+```bash
+gripql-cli delete my_graph --host myserver.com:8202 --edges edge1,edge2 --vertices vertex3,vertex4
+```
+
+## JSON File Format
+
+JSON file format for data to be deleted:
+
+```json
+{
+  "graph": "graph_name",
+  "edges": ["list of edge ids"],
+  "vertices": ["list of vertex ids"]
+}
+```
+
diff --git a/docs/tools/grip/commands/drop.md b/docs/tools/grip/commands/drop.md
new file mode 100644
index 0000000..c43aa0b
--- /dev/null
+++ b/docs/tools/grip/commands/drop.md
@@ -0,0 +1,14 @@
+---
+title: drop
+
+menu:
+  main:
+    parent: commands
+    weight: 4
+---
+
+```
+grip drop <graph>
+```
+
+Deletes a graph.
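+
+## Example
+
+A minimal invocation, assuming a graph named `my_graph` already exists on the server (the graph name is illustrative):
+
+```bash
+grip drop my_graph
+```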
diff --git a/docs/tools/grip/commands/er.md b/docs/tools/grip/commands/er.md new file mode 100644 index 0000000..28d34ee --- /dev/null +++ b/docs/tools/grip/commands/er.md @@ -0,0 +1,49 @@ +--- +title: er + +menu: + main: + parent: commands + weight: 6 +--- + +``` +grip er +``` + +The *External Resource* system allows GRIP to plug into existing data systems and +integrate them into queriable graphs. The `grip er` sub command acts as a client +to the external resource plugin proxies, issues command and displays the results. +This is often useful for debugging external resources before making them part of +an actual graph. + + +List collections provided by external resource +``` +grip er list +``` + +Get info about a collection +``` +grip er info +``` + +List ids from a collection +``` +grip er ids +``` + +List rows from a collection +``` +grip er rows +``` + +List rows with field match +``` +grip get +``` + +List rows with field match +``` +grip er query +``` diff --git a/docs/tools/grip/commands/list.md b/docs/tools/grip/commands/list.md new file mode 100644 index 0000000..f018b6a --- /dev/null +++ b/docs/tools/grip/commands/list.md @@ -0,0 +1,40 @@ +--- +title: list + +menu: + main: + parent: commands + weight: 3 +--- + +The `list tables` command is used to display all available tables in the grip server. Each table is represented by its source, name, fields, and link map. Here's a breakdown of how to use this command: + +- **Usage:** `gripql list tables` +- **Short Description:** List all available tables in the grip server. +- **Long Description:** This command connects to the grip server and retrieves information about all available tables. It then prints each table's source, name, fields, and link map to the console. +- **Arguments:** None +- **Flags:** + - `--host`: The URL of the grip server (default: "localhost:8202") + +## `gripql list graphs` Command Documentation + +The `list graphs` command is used to display all available graphs in the grip server. Here's a breakdown of how to use this command: + +- **Usage:** `gripql list graphs` +- **Short Description:** List all available graphs in the grip server. +- **Long Description:** This command connects to the grip server and retrieves information about all available graphs. It then prints each graph's name to the console. +- **Arguments:** None +- **Flags:** + - `--host`: The URL of the grip server (default: "localhost:8202") + +## `gripql list labels` Command Documentation + +The `list labels` command is used to display all available vertex and edge labels in a specific graph. Here's a breakdown of how to use this command: + +- **Usage:** `gripql list labels ` +- **Short Description:** List the vertex and edge labels in a specific graph. +- **Long Description:** This command takes one argument, the name of the graph, and connects to the grip server. It retrieves information about all available vertex and edge labels in that graph and prints them to the console in JSON format. +- **Arguments:** + - ``: The name of the graph to list labels for. 
+- **Flags:** + - `--host`: The URL of the grip server (default: "localhost:8202") \ No newline at end of file diff --git a/docs/tools/grip/commands/mongoload.md b/docs/tools/grip/commands/mongoload.md new file mode 100644 index 0000000..e9372d7 --- /dev/null +++ b/docs/tools/grip/commands/mongoload.md @@ -0,0 +1,12 @@ +--- +title: mongoload + +menu: + main: + parent: commands + weight: 4 +--- + +``` +grip mongoload +``` diff --git a/docs/tools/grip/commands/query.md b/docs/tools/grip/commands/query.md new file mode 100644 index 0000000..bb4f7ec --- /dev/null +++ b/docs/tools/grip/commands/query.md @@ -0,0 +1,20 @@ +--- +title: query + +menu: + main: + parent: commands + weight: 2 +--- + +``` +grip query +``` + +Run a query on a graph. + +Examples +```bash +grip query pc12 'V().hasLabel("Pathway").count()' +``` + diff --git a/docs/tools/grip/commands/server.md b/docs/tools/grip/commands/server.md new file mode 100644 index 0000000..fb7e82a --- /dev/null +++ b/docs/tools/grip/commands/server.md @@ -0,0 +1,53 @@ +--- +title: server +menu: + main: + parent: commands + weight: 1 +--- + +# `server` +The server command starts up a graph server and waits for incoming requests. + +## Default Configuration +If invoked with no arguments or config files, GRIP will start up in embedded mode, using a Badger based graph driver. + +## Networking +By default the GRIP server operates on 2 ports, `8201` is the HTTP based interface. Port `8202` is a GRPC based interface. Python, R and Javascript clients are designed to connect to the HTTP interface on `8201`. The `grip` command will often use port `8202` in order to complete operations. For example if you call `grip list graphs` it will contact port `8202`, rather then using the HTTP port. This means that if you are working with a server that is behind a firewall, and only the HTTP port is available, then the grip command line program will not be able to issue commands, even if the server is visible to client libraries. + +## CLI Usage +The `server` command can take several flags for configuration: +- `--config` or `-c` - Specifies a YAML config file with server settings. This overwrites all other settings. Defaults to "" (empty string). +- `--http-port` - Sets the port used by the HTTP interface. Defaults to "8201". +- `--rpc-port` - Sets the port used by the GRPC interface. Defaults to "8202". +- `--read-only` - Start server in read-only mode. Defaults to false. +- `--log-level` or `--log-format` - Set logging level and format, respectively. Defaults are "info" for log level and "text" for format. +- `--log-requests` - Log all requests. Defaults to false. +- `--verbose` - Sets the log level to debug if true. +- `--plugins` or `-p` - Specifies a directory with GRIPPER plugins to load. If not specified, no plugins will be loaded by default. +- `--driver` or `-d` - Specifies the default driver for graph storage. Defaults to "badger". Other possible options are: "pebble", "mongo", "grids", and "sqlite". +- `--endpoint` or `-w` - Load a web endpoint plugin. Use multiple times to load multiple plugins. The format is key=value where key is the plugin name and value is the configuration string for the plugin. +- `--endpoint-config` or `-l` - Configure a loaded web endpoint plugin. Use multiple times to configure multiple plugins. The format is key=value where key is in the form 'pluginname:key' and value is the configuration value for that key. +- `--er` or `-e` - Set GRIPPER source addresses. This flag can be used multiple times to specify multiple addresses. 
Defaults to an empty map. + +## Examples + +```bash +# Load server with a specific config file +grip server --config /path/to/your_config.yaml + +# Set the HTTP port to 9001 +grip server --http-port 9001 + +# Start in read-only mode +grip server --read-only + +# Enable verbose logging (sets log level to debug) +grip server --verbose + +# Load a web endpoint plugin named 'foo' with configuration string 'config=value' +grip server --endpoint foo=config=value + +# Configure the loaded 'foo' web endpoint plugin, setting its key 'key1' to value 'val1' +grip server --endpoint-config foo:key1=val1 +``` diff --git a/docs/tools/grip/databases.md b/docs/tools/grip/databases.md new file mode 100644 index 0000000..3ad6b01 --- /dev/null +++ b/docs/tools/grip/databases.md @@ -0,0 +1,120 @@ +--- +title: Database Configuration +menu: + main: + identifier: Databases + weight: 20 +--- + + +# Embedded Key Value Stores + +GRIP supports storing vertices and edges in a variety of key-value stores including: + + * [Pebble](https://github.com/cockroachdb/pebble) + * [Badger](https://github.com/dgraph-io/badger) + * [BoltDB](https://github.com/boltdb/bolt) + * [LevelDB](https://github.com/syndtr/goleveldb) + +Config: + +```yaml +Default: kv + +Driver: + kv: + Badger: grip.db +``` + +---- + +# MongoDB + +GRIP supports storing vertices and edges in [MongoDB][mongo]. + +Config: + +```yaml +Default: mongo + +Drivers: + mongo: + MongoDB: + URL: "mongodb://localhost:27000" + DBName: "gripdb" + Username: "" + Password: "" + UseCorePipeline: False + BatchSize: 0 +``` + +[mongo]: https://www.mongodb.com/ + +`UseCorePipeline` - Default is to use Mongo pipeline API to do graph traversals. +By enabling `UseCorePipeline`, GRIP will do the traversal logic itself, only using +Mongo for graph storage. + +`BatchSize` - For core engine operations, GRIP dispatches element lookups in +batches to minimize query overhead. If missing from config file (which defaults to 0) +the engine will default to 1000. + +---- + + +# GRIDS + +This is an indevelopment high performance graph storage system. + +Config: + +```yaml +Default: db + +Drivers: + db: + Grids: grip-grids.db + +``` + +---- + +# PostgreSQL + +GRIP supports storing vertices and edges in [PostgreSQL][psql]. 
+ +Config: + +```yaml +Default: psql + +Drivers: + psql: + PSQL: + Host: localhost + Port: 15432 + User: "" + Password: "" + DBName: "grip" + SSLMode: disable +``` + +[psql]: https://www.postgresql.org/ + +--- + +# SQLite + +GRIP supports storing vertices and edges in [SQLite] + +Config: + +```yaml +Default: sqlite + +Drivers: + sqlite: + Sqlite: + DBName: tester/sqliteDB +``` + +[psql]: https://sqlite.org/ diff --git a/docs/tools/grip/developer/architecture.d2 b/docs/tools/grip/developer/architecture.d2 new file mode 100644 index 0000000..ecbd726 --- /dev/null +++ b/docs/tools/grip/developer/architecture.d2 @@ -0,0 +1,90 @@ + + +gripql-python: "gripql/python" { + text: |md +# gripql + +Python library +| +} + +gripql-python -> gripql.http + +grip-client : "cmd/" { + graph { + create + drop + stream + list + schema + } + + data { + kvload + load + dump + mongoload + query + delete + } + + config { + mapping + plugin + info + } + + jobs { + job + } +} + +grip-client -> gripql.grpc + +gripql : "gripql/" { + + text: |md +Protobuf defined code +| + grpc + grpc-gateway + + http -> grpc-gateway + grpc-gateway -> grpc : protobuf via network + + http -> grpc-dgw +} + + +gripql.grpc -> server +gripql.grpc-dgw -> server + +server : "server/" { + +} + +server -> pipeline + +pipeline { + gripql-parser + compiler +} + +gdbi { + mongo + mongo-core + pebble +} + +pipeline.compiler -> gdbi + +server -> jobs + +jobs { + store + search + drivers : { + opensearch + flat file + } +} \ No newline at end of file diff --git a/docs/tools/grip/developers.md b/docs/tools/grip/developers.md new file mode 100644 index 0000000..5208ffb --- /dev/null +++ b/docs/tools/grip/developers.md @@ -0,0 +1,6 @@ +--- +title: Developers +menu: + main: + weight: 100 +--- diff --git a/docs/tools/grip/graphql.md b/docs/tools/grip/graphql.md new file mode 100644 index 0000000..5504bde --- /dev/null +++ b/docs/tools/grip/graphql.md @@ -0,0 +1,7 @@ +--- +title: GraphQL +menu: + main: + identifier: graphql + weight: 70 +--- diff --git a/docs/tools/grip/graphql/graph_schemas.md b/docs/tools/grip/graphql/graph_schemas.md new file mode 100644 index 0000000..f68bff4 --- /dev/null +++ b/docs/tools/grip/graphql/graph_schemas.md @@ -0,0 +1,37 @@ +--- +title: Graph Schemas +menu: + main: + parent: graphql + weight: 30 +--- + +# Graph Schemas + +Most GRIP based graphs are not required to have a strict schema. However, GraphQL requires +a graph schema as part of it's API. To utilize the GraphQL endpoint, there must be a +Graph Schema provided to be used by the GRIP engine to determine how to render a GraphQL endpoint. +Graph schemas are themselves an instance of a graph. As such, they can be traversed like any other graph. +The schemas are automatically added to the database following the naming pattern. `{graph-name}__schema__` + +## Get the Schema of a Graph + +The schema of a graph can be accessed via a GET request to `/v1/graph/{graph-name}/schema` + +Alternatively, you can use the grip CLI. `grip schema get {graph-name}` + +## Post a graph schema + +A schema can be attached to an existing graph via a POST request to `/v1/graph/{graph-name}/schema` + +Alternatively, you can use the grip CLI. `grip schema post [graph_name] --jsonSchema {file}` + +Schemas must be loaded as a json file in JSON schema format. see [jsonschema](https://json-schema.org/) spec for more details + +## Raw bulk loading + +Once a schema is attached to a graph, raw json records can be loaded directly to grip without having to be in native grip vertex/edge format. 
+Schema validation is enforced when using this POST `/v1/rawJson` method.
+
+A grip CLI alternative is also available with `grip jsonload [ndjson_file_path] [graph_name]`.
+See https://github.com/bmeg/grip/blob/develop/conformance/tests/ot_bulk_raw.py for a full example using the gripql Python package.
diff --git a/docs/tools/grip/graphql/graphql.md b/docs/tools/grip/graphql/graphql.md
new file mode 100644
index 0000000..1de2f34
--- /dev/null
+++ b/docs/tools/grip/graphql/graphql.md
@@ -0,0 +1,29 @@
+---
+title: GraphQL
+menu:
+  main:
+    parent: graphql
+    weight: 25
+---
+
+# GraphQL
+
+GRIP GraphQL tools are defined as Go standard library plugins and are located at https://github.com/bmeg/grip-graphql.
+A schema-based approach is used to define the read plugins.
+
+## JSON Schema
+
+GRIP also supports using JSON Schema with hypermedia extensions. Given an existing graph called TEST:
+
+```
+./grip schema post TEST --jsonSchema schema.json
+```
+
+This schema is attached to the TEST graph, and subsequent calls to the bulkAddRaw method with raw JSON
+matching the attached JSON Schema will load directly into GRIP.
+
+See conformance/tests/ot_bulk_raw.py for an example.
+
+## Future work
+
+In the future, autogenerated JSON Schemas may be added back to GRIP to continue to support GraphQL queries. Currently there is no GraphQL support in base GRIP without using the plugin repository listed above.
diff --git a/docs/tools/grip/gripper.md b/docs/tools/grip/gripper.md
new file mode 100644
index 0000000..40a181f
--- /dev/null
+++ b/docs/tools/grip/gripper.md
@@ -0,0 +1,7 @@
+---
+title: GRIP Pluggable External Resources
+menu:
+  main:
+    identifier: gripper
+    weight: 120
+---
diff --git a/docs/tools/grip/gripper/graphmodel.md b/docs/tools/grip/gripper/graphmodel.md
new file mode 100644
index 0000000..cd678cf
--- /dev/null
+++ b/docs/tools/grip/gripper/graphmodel.md
@@ -0,0 +1,255 @@
+---
+title: Graph Model
+
+menu:
+  main:
+    parent: gripper
+    weight: 3
+---
+
+# GRIPPER
+
+GRIP Pluggable External Resources
+
+## Graph Model
+
+The graph model describes how GRIP will access multiple GRIPPER servers. The mapping
+of these data resources is done using a graph. The `vertices` represent how each vertex
+type will be mapped, and the `edges` describe how edges will be created. The `_id`
+of each vertex represents the prefix domain of all vertices that can be found in that
+source.
+
+The `sources` referenced by the graph are provided to GRIP at run time; each named resource is a
+different GRIPPER plugin that abstracts an external resource.
+The `vertices` section describes how the different collections
+found in these sources will be turned into vertices in the graph. Finally, the
+`edges` section describes the different kinds of rules that can be used to build the
+edges in the graph.
+
+Edges can be built from two rules: `fieldToField` and `edgeTable`. In `fieldToField`,
+a field value found in one vertex can be used to look up matching destination vertices
+by using an indexed field found in another collection that has been mapped to a vertex.
+For `edgeTable` connections, there is a single collection that represents a connection between
+two other collections that have been mapped to vertices.
+
+## Runtime External Resource Config
+
+External resources are passed to GRIP as command line options.
For the command line: + +``` +grip server config.yaml --er tableServer=localhost:50051 --er pfb=localhost:50052 +``` + +`tableServer` is a ER plugin that serves table data (see `gripper/test-graph`) +while `pfb` parses PFB based files (see https://github.com/bmeg/grip_pfb ) + +The `config.yaml` is + +``` +Default: badger + +Drivers: + badger: + Badger: grip-badger.db + + swapi-driver: + Gripper: + ConfigFile: ./swapi.yaml + Graph: swapi + +``` + +This runs with a default `badger` based driver, but also provides a GRIPPER based +graph from the `swapi` mapping (see example graph map below). + +## Example graph map + +``` +vertices: + - _id: "Character:" + _label: Character + source: tableServer + collection: Character + + - _id: "Planet:" + _label: Planet + collection: Planet + source: tableServer + + - _id: "Film:" + _label: Film + collection: Film + source: tableServer + + - _id: "Species:" + _label: Species + source: tableServer + collection: Species + + - _id: "Starship:" + _label: Starship + source: tableServer + collection: Starship + + - _id: "Vehicle:" + _label: Vehicle + source: tableServer + collection: Vehicle + +edges: + - _id: "homeworld" + _from: "Character:" + _to: "Planet:" + _label: homeworld + fieldToField: + fromField: $.homeworld + toField: $.id + + - _id: species + _from: "Character:" + _to: "Species:" + _label: species + fieldToField: + fromField: $.species + toField: $.id + + - _id: people + _from: "Species:" + _to: "Character:" + _label: people + edgeTable: + source: tableServer + collection: speciesCharacter + fromField: $.from + toField: $.to + + - _id: residents + _from: "Planet:" + _to: "Character:" + _label: residents + edgeTable: + source: tableServer + collection: planetCharacter + fromField: $.from + toField: $.to + + - _id: filmVehicles + _from: "Film:" + _to: "Vehicle:" + _label: "vehicles" + edgeTable: + source: tableServer + collection: filmVehicles + fromField: "$.from" + toField: "$.to" + + - _id: vehicleFilms + _to: "Film:" + _from: "Vehicle:" + _label: "films" + edgeTable: + source: tableServer + collection: filmVehicles + toField: "$.from" + fromField: "$.to" + + - _id: filmStarships + _from: "Film:" + _to: "Starship:" + _label: "starships" + edgeTable: + source: tableServer + collection: filmStarships + fromField: "$.from" + toField: "$.to" + + - _id: starshipFilms + _to: "Film:" + _from: "Starship:" + _label: "films" + edgeTable: + source: tableServer + collection: filmStarships + toField: "$.from" + fromField: "$.to" + + - _id: filmPlanets + _from: "Film:" + _to: "Planet:" + _label: "planets" + edgeTable: + source: tableServer + collection: filmPlanets + fromField: "$.from" + toField: "$.to" + + - _id: planetFilms + _to: "Film:" + _from: "Planet:" + _label: "films" + edgeTable: + source: tableServer + collection: filmPlanets + toField: "$.from" + fromField: "$.to" + + - _id: filmSpecies + _from: "Film:" + _to: "Species:" + _label: "species" + edgeTable: + source: tableServer + collection: filmSpecies + fromField: "$.from" + toField: "$.to" + + - _id: speciesFilms + _to: "Film:" + _from: "Species:" + _label: "films" + edgeTable: + source: tableServer + collection: filmSpecies + toField: "$.from" + fromField: "$.to" + + - _id: filmCharacters + _from: "Film:" + _to: "Character:" + _label: characters + edgeTable: + source: tableServer + collection: filmCharacters + fromField: "$.from" + toField: "$.to" + + - _id: characterFilms + _from: "Character:" + _to: "Film:" + _label: films + edgeTable: + source: tableServer + collection: filmCharacters + 
toField: "$.from"
+      fromField: "$.to"
+
+  - _id: characterStarships
+    _from: "Character:"
+    _to: "Starship:"
+    _label: "starships"
+    edgeTable:
+      source: tableServer
+      collection: characterStarships
+      fromField: "$.from"
+      toField: "$.to"
+
+  - _id: starshipCharacters
+    _to: "Character:"
+    _from: "Starship:"
+    _label: "pilots"
+    edgeTable:
+      source: tableServer
+      collection: characterStarships
+      toField: "$.from"
+      fromField: "$.to"
+```
diff --git a/docs/tools/grip/gripper/gripper.md b/docs/tools/grip/gripper/gripper.md
new file mode 100644
index 0000000..ab41e83
--- /dev/null
+++ b/docs/tools/grip/gripper/gripper.md
@@ -0,0 +1,22 @@
+---
+title: Intro
+
+menu:
+  main:
+    parent: gripper
+    weight: 1
+---
+
+# GRIPPER
+## GRIP Plugin External Resources
+
+GRIP Plugin External Resources (GRIPPERs) are GRIP drivers that take external
+resources and allow GRIP to access them as part of a unified graph.
+To integrate new resources into the graph, you
+first deploy gripper proxies that plug into the external resources. They are unique
+and configured to access specific resources. These provide a view into external
+resources as a series of document collections. For example, an SQL gripper would
+plug into an SQL server and provide its tables as a set of collections, with each
+row a document. A gripper is written as a gRPC server.
+
+![GRIPPER Architecture](/img/gripper_architecture.png)
diff --git a/docs/tools/grip/gripper/proxy.md b/docs/tools/grip/gripper/proxy.md
new file mode 100644
index 0000000..e232715
--- /dev/null
+++ b/docs/tools/grip/gripper/proxy.md
@@ -0,0 +1,50 @@
+---
+title: External Resource Proxies
+
+menu:
+  main:
+    parent: gripper
+    weight: 2
+---
+
+# GRIPPER
+
+## GRIPPER proxy
+
+With the external resources normalized to a single data model, the graph model
+describes how to connect the set of collections into a graph. Each GRIPPER
+is required to provide a gRPC interface that allows access to the collections stored
+in the resource.
+
+The required functions include:
+
+```
+rpc GetCollections(Empty) returns (stream Collection);
+```
+`GetCollections` returns a list of all of the collections accessible via this server.
+
+```
+rpc GetCollectionInfo(Collection) returns (CollectionInfo);
+```
+`GetCollectionInfo` provides information about a collection, such as its list of indexed fields.
+
+```
+rpc GetIDs(Collection) returns (stream RowID);
+```
+`GetIDs` returns a stream of all of the IDs found in a collection.
+
+```
+rpc GetRows(Collection) returns (stream Row);
+```
+`GetRows` returns a stream of all of the rows in a collection.
+
+```
+rpc GetRowsByID(stream RowRequest) returns (stream Row);
+```
+`GetRowsByID` accepts a stream of row requests, each one requesting a single row
+by its ID, and then returns a stream of results.
+
+```
+rpc GetRowsByField(FieldRequest) returns (stream Row);
+```
+`GetRowsByField` searches a collection, looking for values found in an indexed field.
diff --git a/docs/tools/grip/index.md b/docs/tools/grip/index.md
new file mode 100644
index 0000000..a9093d4
--- /dev/null
+++ b/docs/tools/grip/index.md
@@ -0,0 +1,10 @@
+## GRIP Tool Documentation
+
+GRIP (GRaph Integration Platform) is a powerful framework for building and managing distributed data processing systems. Key features include:
+
+- **Distributed Computing**: Scalable processing across multiple nodes.
+- **Database Integration**: Built-in support for MongoDB, PostgreSQL, and SQL databases.
+- **API Endpoints**: RESTful APIs for managing data workflows and monitoring. +- **Flexible Query Language**: GRIPQL for complex data queries and transformations. +- **Job Management**: Schedule, monitor, and manage data processing jobs in real-time. + diff --git a/docs/tools/grip/jobs_api.md b/docs/tools/grip/jobs_api.md new file mode 100644 index 0000000..7589015 --- /dev/null +++ b/docs/tools/grip/jobs_api.md @@ -0,0 +1,63 @@ +--- +title: Jobs API +menu: + main: + identifier: Jobs + weight: 40 +--- + +# Jobs API + +Not all queries return instantaneously, additionally some queries elements are used +repeatedly. The query Jobs API provides a mechanism to submit graph traversals +that will be evaluated asynchronously and can be retrieved at a later time. + + +### Submitting a job + +``` +job = G.V().hasLabel("Planet").out().submit() +``` + +### Getting job status +``` +jinfo = G.getJob(job["id"]) +``` + +Example job info: +```json +{ + "id": "job-326392951", + "graph": "test_graph_qd7rs7", + "state": "COMPLETE", + "count": "12", + "query": [{"v": []}, {"hasLabel": ["Planet"]}, {"as": "a"}, {"out": []}], + "timestamp": "2021-03-30T23:12:01-07:00" +} +``` + +### Reading job results +``` +for row in G.readJob(job["id"]): + print(row) +``` + +### Search for jobs + +Find jobs that match the prefix of the current request (example should find job from G.V().hasLabel("Planet").out()) + +``` +jobs = G.V().hasLabel("Planet").out().out().count().searchJobs() +``` + +If there are multiple jobs that match the prefix of the search, all of them will be returned. It will be a client side +job to decide which of the jobs to use as a starting point. This can either be the job with the longest matching prefix, or +the most recent job. Note, that if the underlying database has changed since the job was run, adding additional steps to the +traversal may produce inaccurate results. + +Once `job` has been selected from the returned list you can use these existing results and continue the traversal. + +``` +for res in G.resume(job["id"]).out().count(): + print(res) +``` diff --git a/docs/tools/grip/queries.md b/docs/tools/grip/queries.md new file mode 100644 index 0000000..fcf6fbe --- /dev/null +++ b/docs/tools/grip/queries.md @@ -0,0 +1,7 @@ +--- +title: Query a Graph +menu: + main: + identifier: Queries + weight: 30 +--- diff --git a/docs/tools/grip/queries/aggregation.md b/docs/tools/grip/queries/aggregation.md new file mode 100644 index 0000000..0012a81 --- /dev/null +++ b/docs/tools/grip/queries/aggregation.md @@ -0,0 +1,84 @@ +--- +title: Aggregation +menu: + main: + parent: Queries + weight: 6 +--- + +# Aggregation + +These methods provide a powerful way to analyze and summarize data in your GripQL graph database. They allow you to perform various types of aggregations, including term frequency, histograms, percentiles, and more. By combining these with other traversal functions like `has`, `hasLabel`, etc., you can create complex queries that extract specific insights from your data. + +## `.aggregate([aggregations])` +Groups and summarizes data from the graph. It allows you to perform calculations on vertex or edge properties. The following aggregation types are available: + +## Aggregation Types +### `.gripql.term(name, field, size)` +Return top n terms and their counts for a field. +```python +G.V().hasLabel("Person").aggregate(gripql.term("top-names", "name", 10)) +``` +Counts `name` occurrences across `Person` vertices and returns the 10 most frequent `name` values. 
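+
+For reference, here is a minimal sketch of running one of these aggregations from the Python client. The server URL and graph name are placeholders, and the exact shape of the returned aggregation rows depends on your GRIP version, so inspect the output before relying on specific field names:
+
+```python
+import gripql
+
+# Connect to a GRIP server and select a graph (both values are placeholders)
+conn = gripql.Connection("http://localhost:8201")
+G = conn.graph("example-graph")
+
+# Run the term aggregation and print each returned row as-is;
+# inspect the rows to see how bucket keys and counts are reported.
+for row in G.V().hasLabel("Person").aggregate(gripql.term("top-names", "name", 10)):
+    print(row)
+```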
+
+### `.gripql.histogram(name, field, interval)`
+Return binned counts for a field.
+```python
+G.V().hasLabel("Person").aggregate(gripql.histogram("age-hist", "age", 5))
+```
+Creates a histogram of `age` values with bins of width 5 across `Person` vertices.
+
+### `.gripql.percentile(name, field, percents=[])`
+Return percentiles for a field.
+```python
+G.V().hasLabel("Person").aggregate(gripql.percentile("age-percentiles", "age", [25, 50, 75]))
+```
+Calculates the 25th, 50th, and 75th percentiles for `age` values across `Person` vertices.
+
+### `.gripql.field("fields", "$")`
+Returns all of the fields found in the data structure. Use `$` to get a listing of all fields found at the root level of the `data` property of vertices or edges.
+
+---
+
+## `.count()`
+Returns the total number of elements in the traversal.
+```python
+G.V().hasLabel("Person").count()
+```
+This query returns the total number of vertices with the label "Person".
+
+---
+
+## `.distinct([fields])`
+Filters the traversal to return only unique elements. If `fields` are provided, uniqueness is determined by the combination of values in those fields; otherwise, the `_id` is used.
+```python
+G.V().hasLabel("Person").distinct(["name", "age"])
+```
+This query returns only unique "Person" vertices, where uniqueness is determined by the combination of "name" and "age" values.
+
+---
+
+## `.sort([fields])`
+Sort the output using the field values. You can sort in ascending or descending order by providing `descending=True` as an argument to the `sort()` method.
+```python
+G.V().hasLabel("Person").sort("age")
+```
+This query sorts "Person" vertices based on their age in ascending order.
+
+## `.limit(n)`
+Limits the number of results returned by your query.
+```python
+G.V().hasLabel("Person").limit(10)
+```
+This query limits the results to the first 10 "Person" vertices found.
+
+---
+
+## `.skip(n)`
+Offsets the results returned by your query.
+```python
+G.V().hasLabel("Person").skip(5)
+```
+This query skips the first 5 "Person" vertices and returns the rest.
+
+
diff --git a/docs/tools/grip/queries/filtering.md b/docs/tools/grip/queries/filtering.md
new file mode 100644
index 0000000..2baac70
--- /dev/null
+++ b/docs/tools/grip/queries/filtering.md
@@ -0,0 +1,151 @@
+---
+title: Filtering
+menu:
+  main:
+    parent: Queries
+    weight: 4
+---
+
+# Filtering in GripQL
+
+GripQL provides powerful filtering capabilities using the `.has()` method and various condition functions.
+Here's a comprehensive guide.
+
+## `.has()`
+
+The `.has()` method is used to filter elements (vertices or edges) based on specified conditions.
+
+Conditions are functions provided by the gripql module that define the filtering criteria.
+
+## Comparison Operators
+
+### `gripql.eq(variable, value)`
+Equal to (==)
+
+```
+G.V().has(gripql.eq("symbol", "TP53"))
+# Returns vertices where the 'symbol' property is equal to 'TP53'.
+```
+
+### `gripql.neq(variable, value)`
+Not equal to (!=)
+
+```
+G.V().has(gripql.neq("symbol", "TP53"))
+# Returns vertices where the 'symbol' property is not equal to 'TP53'.
+```
+
+### `gripql.gt(variable, value)`
+Greater than (>)
+
+```
+G.V().has(gripql.gt("age", 45))
+# Returns vertices where the 'age' property is greater than 45.
+```
+
+### `gripql.lt(variable, value)`
+Less than (<)
+```
+G.V().has(gripql.lt("age", 45))
+# Returns vertices where the 'age' property is less than 45.
+``` + +### `gripql.gte(variable, value)` +Greater than or equal to (>=) +``` +G.V().has(gripql.gte("age", 45)) +# Returns vertices where the 'age' property is greater than or equal to 45. +``` + +### `gripql.lte(variable, value)` +Less than or equal to (<=) + +``` +G.V().has(gripql.lte("age", 45)) +# Returns vertices where the 'age' property is less than or equal to 45. +``` + +--- + +## Range Operators + +### `gripql.inside(variable, [lower_bound, upper_bound])` +lower_bound < variable < upper_bound (exclusive) + +``` +G.V().has(gripql.inside("age", [30, 45])) +# Returns vertices where the 'age' property is greater than 30 and less than 45. +``` + +### `gripql.outside(variable, [lower_bound, upper_bound])` +variable < lower_bound OR variable > upper_bound + +``` +G.V().has(gripql.outside("age", [30, 45])) +# Returns vertices where the 'age' property is less than 30 or greater than 45. +``` + +### `gripql.between(variable, [lower_bound, upper_bound])` +lower_bound <= variable < upper_bound + +``` +G.V().has(gripql.between("age", [30, 45])) +# Returns vertices where the 'age' property is greater than or equal to 30 and less than 45. +``` + +--- + +## Set Membership Operators + +### `gripql.within(variable, values)` +variable is in values + +``` +G.V().has(gripql.within("symbol", ["TP53", "BRCA1"])) +# Returns vertices where the 'symbol' property is either 'TP53' or 'BRCA1'. +``` + +### `gripql.without(variable, values)` +variable is not in values + +``` +G.V().has(gripql.without("symbol", ["TP53", "BRCA1"])) +# Returns vertices where the 'symbol' property is neither 'TP53' nor 'BRCA1'. +``` + +--- + +## String/Array Containment + +### `gripql.contains(variable, value)` +The variable (which is typically a list/array) contains value. + +``` +G.V().has(gripql.contains("groups", "group1")) +# Returns vertices where the 'groups' property (which is a list) contains the value "group1". +# Example: {"groups": ["group1", "group2", "group3"]} would match. +``` + +--- + +## Logical Operators + +### `gripql.and_([condition1, condition2, ...])` +Logical AND; all conditions must be true. + +``` +G.V().has(gripql.and_([gripql.lte("age", 45), gripql.gte("age", 35)])) +# Returns vertices where the 'age' property is less than or equal to 45 AND greater than or equal to 35. +``` + +### `gripql.or_([condition1, condition2, ...])` +Logical OR; at least one condition must be true. + +``` +G.V().has(gripql.or_([gripql.eq("symbol", "TP53"), gripql.eq("symbol", "BRCA1")])) +# Returns vertices where the 'symbol' property is either 'TP53' OR 'BRCA1'. +``` + +### `gripql.not_(condition)` +Logical NOT; negates the condition + +``` +G.V().has(gripql.not_(gripql.eq("symbol", "TP53"))) +# Returns vertices where the 'symbol' property is NOT equal to 'TP53'. +``` diff --git a/docs/tools/grip/queries/iterations.md b/docs/tools/grip/queries/iterations.md new file mode 100644 index 0000000..5af26c9 --- /dev/null +++ b/docs/tools/grip/queries/iterations.md @@ -0,0 +1,61 @@ +--- +title: Iteration +menu: + main: + parent: Queries + weight: 16 +--- + +# Iteration Commands + +A common operation in graph search is the ability to iteratively repeat a search pattern. For example, a 'friend of a friend' search may become a 'friend of a friend of a friend' search. In the GripQL language cycles, iterations and conditional operations are encoded using 'mark' and 'jump' based interface. This operation is similar to using a 'goto' statement in traditional programming languages. 
While more primitive than the repeat mechanisms seen in Gremlin, this pattern allows for much simpler query compilation and implementation.
+
+However, due to security concerns regarding potential denial-of-service attacks that could be created with the use of 'mark' and 'jump', these operations are restricted in most accounts. The server enforces this by rejecting, without execution, any query from an unauthorized user that uses these commands. In future upgrades, a proposed security feature will also allow the server to track the total number of iterations a traveler has made in a cycle and enforce a hard cutoff. For example, a query could be limited to a maximum of 5 iterations.
+
+## Operation Commands
+### `.mark(name)`
+Mark a segment in the stream processor, with a name, that can receive jumps. This command is used to label sections of the query operation list that can accept travelers from the `jump` command.
+
+**Parameters:**
+- `name` (str): The name given to the marked segment.
+
+### `.jump(dest, condition, emit)`
+If the condition is true, send the traveler to the named mark. If `emit` is True, also send a copy down the processing chain. If `condition` is None, always do the jump. This command is used to move travelers from one marked segment to another based on a specified condition.
+
+**Parameters:**
+- `dest` (str): The name of the destination mark segment. Travelers are moved to this point when they satisfy the `condition` parameter.
+- `condition` (_expr_ or None): An expression that determines if the traveler should jump. If it evaluates to True, the traveler jumps to the specified destination. If None, the traveler always jumps to the specified destination.
+- `emit` (bool): Determines whether a copy of the traveler is emitted down the processing chain after jumping. If False, only the original traveler is processed.
+
+### `.set(field, value)`
+Set values within the traveler's memory. These values can be used to store cycle counts. This command sets a field in the traveler's memory to a specified value.
+
+**Parameters:**
+- `field` (str): The name of the field to set.
+- `value` (_expr_): The value to set for the specified field. This can be any valid GripQL expression that resolves to a scalar value.
+
+### `.increment(field, value)`
+Increment a field by a specified value. This command increments a field in the traveler's memory by a specified amount.
+
+**Parameters:**
+- `field` (str): The name of the field to increment.
+- `value` (_expr_): The amount to increment the specified field by. This can be any valid GripQL expression that resolves to an integer value.
+
+## Example Queries
+The following examples demonstrate how to use these commands in a query:
+
+```python
+q = G.V("Character:1").set("count", 0).as_("start").mark("a").out().increment("$start.count")
+q = q.has(gripql.lt("$start.count", 2))
+q = q.jump("a", None, True)
+```
+This query starts from the vertex with the ID "Character:1". It sets a field named "count" to 0 and annotates this vertex as "start". Then it marks this position in the operation list for future reference. The `out` command moves travelers along the outgoing edges of their current positions, incrementing the "count" field each time. If the count is less than 2, the traveler jumps back to the marked location, effectively creating a loop.
+ +```python +q = G.V("Character:1").set("count", 0).as_("start").mark("a").out().increment("$start.count") +q = q.has(gripql.lt("$start.count", 2)) +q = q.jump("a", None, False) +``` +This query is similar to the previous one, but in this case, the traveler only jumps back without emitting a copy down the processing chain. The result is that only one vertex will be included in the output, even though there are multiple iterations due to the jump command. + +In both examples, the use of `mark` and `jump` commands create an iterative pattern within the query operation list, effectively creating a 'friend of a friend' search that can repeat as many times as desired. These patterns are crucial for complex graph traversals in GripQL. diff --git a/docs/tools/grip/queries/jobs.md b/docs/tools/grip/queries/jobs.md new file mode 100644 index 0000000..49e132f --- /dev/null +++ b/docs/tools/grip/queries/jobs.md @@ -0,0 +1,26 @@ + + + +## .submit() +Post the traversal as an asynchronous job and get a job ID. + +Example: Submit a query to be processed in the background + +```python +job_id = G.V('vertexID').hasLabel('Vertex').submit() +print(job_id) # print job ID +``` +--- + +## .searchJobs() +Find jobs that match this query and get their status and results if available. + +Example: Search for jobs with the specified query and print their statuses and results + +```python +for result in G.V('vertexID').hasLabel('Vertex').searchJobs(): + print(result['status']) # print job status + if 'results' in result: + print(result['results']) # print job results +``` +--- diff --git a/docs/tools/grip/queries/jsonpath.md b/docs/tools/grip/queries/jsonpath.md new file mode 100644 index 0000000..2f26511 --- /dev/null +++ b/docs/tools/grip/queries/jsonpath.md @@ -0,0 +1,84 @@ +--- +title: Referencing Fields +menu: + main: + parent: Queries + weight: 2 +--- + +# Referencing Vertex/Edge Properties + +Several operations (where, fields, render, etc.) reference properties of the vertices/edges during the traversal. +GRIP uses a variation on JSONPath syntax as described in http://goessner.net/articles/ to reference fields during traversals. + +The following query: + +``` +O.V(["ENSG00000012048"]).as_("gene").out("variant") +``` + +Starts at vertex `ENSG00000012048` and marks as `gene`: + +```json +{ + "_id": "ENSG00000012048", + "_label": "gene", + "symbol": { + "ensembl": "ENSG00000012048", + "hgnc": 1100, + "entrez": 672, + "hugo": "BRCA1" + }, + "transcipts": ["ENST00000471181.7", "ENST00000357654.8", "ENST00000493795.5"] +} +``` + +as "gene" and traverses the graph to: + +```json +{ + "_id": "NM_007294.3:c.4963_4981delTGGCCTGACCCCAGAAG", + "_label": "variant", + "type": "deletion", + "publications": [ + { + "pmid": 29480828, + "doi": "10.1097/MD.0000000000009380" + }, + { + "pmid": 23666017, + "doi": "10.1097/IGC.0b013e31829527bd" + } + ] +} +``` + +Below is a table of field and the values they would reference in subsequent traversal operations. 
+ +| jsonpath | result | +| :------------------------- | :------------------- | +| _id | "NM_007294.3:c.4963_4981delTGGCCTGACCCCAGAAG" | +| _label | "variant" | +| type | "deletion" | +| publications[0].pmid | 29480828 | +| publications[:].pmid | [29480828, 23666017] | +| publications.pmid | [29480828, 23666017] | +| $gene.symbol.hugo | "BRCA1" | +| $gene.transcripts[0] | "ENST00000471181.7" | + + +## Usage Example: + +``` +O.V(["ENSG00000012048"]).as_("gene").out("variant").render({"variant_id": "_id", "variant_type": "type", "gene_id": "$gene._id"}) +``` + +returns + +``` +{ + "variant_id": "NM_007294.3:c.4963_4981delTGGCCTGACCCCAGAAG", + "variant_type": "deletion", + "gene_id": "ENSG00000012048" +} +``` diff --git a/docs/tools/grip/queries/output.md b/docs/tools/grip/queries/output.md new file mode 100644 index 0000000..964e776 --- /dev/null +++ b/docs/tools/grip/queries/output.md @@ -0,0 +1,74 @@ +--- +title: Output Control +menu: + main: + parent: Queries + weight: 10 +--- + +--- + +# Output control + +## `.limit(count)` +Limit number of total output rows +```python +G.V().limit(5) +``` +--- +## `.skip(count)` +Start return after offset + +Example: +```python +G.V().skip(10).limit(5) + +``` +This query skips the first 10 vertices and then returns the next 5. +--- +## `.range(start, stop)` +Selects a subset of the results based on their index. `start` is inclusive, and `stop` is exclusive. +Example: +```python +G.V().range(5, 10) +``` +--- +## `.fields([fields])` +Specifies which fields of a vertex or edge to include or exclude in the output. By default, `_id`, `_label`, `_from`, and `_to` are included. + +If `fields` is empty, all properties are excluded. +If `fields` contains field names, only those properties are included. +If `fields` contains field names prefixed with `-`, those properties are excluded, and all others are included. + +Examples: + +Include only the 'symbol' property: +```python +G.V("vertex1").fields(["symbol"]) +``` + +Exclude the 'symbol' property: +```python +G.V("vertex1").fields(["-symbol"]) +``` +Exclude all properties: +```python +G.V("vertex1").fields([]) +``` + +--- + +## `.render(template)` + +Transforms the current selection into an arbitrary data structure defined by the `template`. The `template` is a string that can include placeholders for vertex/edge properties. + +Example: +```python +G.V("vertex1").render( {"node_info" : {"id": "$._id", "label": "$._label"}, "data" : {"whatToExpect": "$.climate"}} ) +``` + +Assuming `vertex1` has `_id`, `_label`, and `symbol` properties, this would return a JSON object with those fields. + +```json +{"node_info" : {"id" :"Planet:2", "label":"Planet"}, "data":{"whatToExpect":"arid"} } +``` diff --git a/docs/tools/grip/queries/record_transforms.md b/docs/tools/grip/queries/record_transforms.md new file mode 100644 index 0000000..bf3d589 --- /dev/null +++ b/docs/tools/grip/queries/record_transforms.md @@ -0,0 +1,131 @@ +--- +title: Record Transforms +menu: + main: + parent: Queries + weight: 5 +--- + + +# Record Manipulation + +## `.unwind(fields)` +Expands an array-valued field into multiple rows, one for each element in the array. 
+Example:
+
+Graph
+```json
+{"vertex" : {"_id":"1", "_label":"Thing", "stuff" : ["1", "2", "3"]}}
+```
+
+Query
+```python
+G.V("1").unwind("stuff")
+```
+
+Result
+```json
+{"_id":"1", "_label":"Thing", "stuff" : "1"}
+{"_id":"1", "_label":"Thing", "stuff" : "2"}
+{"_id":"1", "_label":"Thing", "stuff" : "3"}
+```
+
+## `.group({"dest":"field"})`
+Collect all travelers that are on the same element while aggregating specific fields.
+
+For example:
+```python
+G.V().hasLabel("Planet").as_("planet").out("residents").as_("character").select("planet").group( {"people" : "$character.name"} )
+```
+
+All of the travelers that start on the same planet go out to the residents, which are collected
+using `as_`, and then return to the origin planet using the `select` statement. The group statement
+aggregates the `name` fields from the character vertices that were visited and collects them
+into a list named `people` that is added to the current planet vertex.
+
+Output:
+```json
+{
+  "vertex": {
+    "_id": "Planet:2",
+    "_label": "Planet",
+    "climate": "temperate",
+    "diameter": 12500,
+    "gravity": null,
+    "name": "Alderaan",
+    "orbital_period": 364,
+    "people": [
+      "Leia Organa",
+      "Raymus Antilles"
+    ],
+    "population": 2000000000,
+    "rotation_period": 24,
+    "surface_water": 40,
+    "system": {
+      "created": "2014-12-10T11:35:48.479000Z",
+      "edited": "2014-12-20T20:58:18.420000Z"
+    },
+    "terrain": [
+      "grasslands",
+      "mountains"
+    ],
+    "url": "https://swapi.co/api/planets/2/"
+  }
+}
+{
+  "vertex": {
+    "_id": "Planet:1",
+    "_label": "Planet",
+    "climate": "arid",
+    "diameter": 10465,
+    "gravity": null,
+    "name": "Tatooine",
+    "orbital_period": 304,
+    "people": [
+      "Luke Skywalker",
+      "C-3PO",
+      "Darth Vader",
+      "Owen Lars",
+      "Beru Whitesun lars",
+      "R5-D4",
+      "Biggs Darklighter"
+    ],
+    "population": 200000,
+    "rotation_period": 23,
+    "surface_water": 1,
+    "system": {
+      "created": "2014-12-09T13:50:49.641000Z",
+      "edited": "2014-12-21T20:48:04.175778Z"
+    },
+    "terrain": [
+      "desert"
+    ],
+    "url": "https://swapi.co/api/planets/1/"
+  }
+}
+```
+
+## `.pivot(id, key, value)`
+
+Aggregate fields across multiple records into a single record using a pivot operation. A pivot is
+an operation where a two-column matrix, with one column for keys and another column for values, is
+transformed so that the keys are used to name the columns and the values are put in those columns.
+ +So the stream of vertices: + +``` +{"_id":"observation_a1", "_label":"Observation", "subject":"Alice", "key":"age", "value":36} +{"_id":"observation_a2", "_label":"Observation", "subject":"Alice", "key":"sex", "value":"Female"} +{"_id":"observation_a3", "_label":"Observation", "subject":"Alice", "key":"blood_pressure", "value":"111/78"} +{"_id":"observation_b1", "_label":"Observation", "subject":"Bob", "key":"age", "value":42} +{"_id":"observation_b2", "_label":"Observation", "subject":"Bob", "key":"sex", "value":"Male"} +{"_id":"observation_b3", "_label":"Observation", "subject":"Bob", "key":"blood_pressure", "value":"120/80"} +``` + +with `.pivot("subject", "key", "value")` will produce: + +``` +{"_id":"Alice", "age":36, "sex":"Female", "blood_pressure":"111/78"} +{"_id":"Bob", "age":42, "sex":"Male", "blood_pressure":"120/80"} +``` diff --git a/docs/tools/grip/queries/traversal_start.md b/docs/tools/grip/queries/traversal_start.md new file mode 100644 index 0000000..6a4fd1d --- /dev/null +++ b/docs/tools/grip/queries/traversal_start.md @@ -0,0 +1,30 @@ + +--- +title: Start a Traversal +menu: + main: + parent: Queries + weight: 1 +--- + +# Start a Traversal + +All traversal based queries must start with a `V()` command, starting the travalers on the vertices of the graph. + +## `.V([ids])` +Start query from Vertex + +```python +G.V() +``` + +Returns all vertices in graph + +```python +G.V(["vertex1"]) +``` + +Returns: +```json +{"_id" : "vertex1", "_label":"TestVertex"} +``` diff --git a/docs/tools/grip/queries/traverse_graph.md b/docs/tools/grip/queries/traverse_graph.md new file mode 100644 index 0000000..568f27d --- /dev/null +++ b/docs/tools/grip/queries/traverse_graph.md @@ -0,0 +1,76 @@ +--- +title: Traverse the Graph +menu: + main: + parent: Queries + weight: 3 +--- + +# Traverse the graph +To move travelers between different elements of the graph, the traversal commands `in_` and `out` move along the edges, respecting the directionality. The `out` commands follow `_from` to `_to`, while the `in_` command follows `_to` to `_from`. + +## `.in_(), inV()` +Following incoming edges. Optional argument is the edge label (or list of labels) that should be followed. If no argument is provided, all incoming edges. + +```python +G.V().in_(label=['edgeLabel1', 'edgeLabel2']) +``` +--- + +## `.out(), .outV()` +Following outgoing edges. Optional argument is the edge label (or list of labels) that should be followed. If no argument is provided, all outgoing edges. + +```python +G.V().out(label='edgeLabel') +``` +--- + +## `.both(), .bothV()` +Following all edges (both in and out). Optional argument is the edge label (or list of labels) that should be followed. If no argument is provided, all edges. + +```python +G.V().outE().both(label='edgeLabel') +``` +--- + +## `.inE()` +Following incoming edges, but return the edge as the next element. This can be used to inspect edge properties. Optional argument is the edge label (or list of labels) that should be followed. To return back to a vertex, use `.in_` or `.out` + +```python +G.V().inE(label='edgeLabel') +``` +--- + +## `.outE()` +Following outgoing edges, but return the edge as the next element. This can be used to inspect edge properties. Optional argument is the edge label (or list of labels) that should be followed. To return back to a vertex, use `.in_` or `.out` + +```python +G.V().outE(label='edgeLabel') +``` +--- + +## `.bothE()` +Following all edges, but return the edge as the next element. 
This can be used to inspect edge properties. Optional argument is the edge label (or list of labels) that should be followed. To return back to a vertex, use `.in_` or `.out` + +```python +G.V().bothE(label='edgeLabel') +``` +--- + +# AS and SELECT + +The `as_` and `select` commands allow a traveler to mark a step in the traversal and return to it as a later step. + +## `.as_(name)` +Store current row for future reference + +```python +G.V().as_("a").out().as_("b") +``` + +## `.select(name)` +Move traveler to previously marked position + +```python +G.V().mark("a").out().mark("b").select("a") +``` diff --git a/docs/tools/grip/security.md b/docs/tools/grip/security.md new file mode 100644 index 0000000..cbb9f7b --- /dev/null +++ b/docs/tools/grip/security.md @@ -0,0 +1,7 @@ +--- +title: Security +menu: + main: + identifier: Security + weight: 50 +--- diff --git a/docs/tools/grip/security/basic.md b/docs/tools/grip/security/basic.md new file mode 100644 index 0000000..4bf232e --- /dev/null +++ b/docs/tools/grip/security/basic.md @@ -0,0 +1,60 @@ +--- +title: Basic Auth + +menu: + main: + parent: Security + weight: 1 +--- + +# Basic Auth + +By default, an GRIP server allows open access to its API endpoints, but it +can be configured to require basic password authentication. To enable this, +include users and passwords in your config file: + +```yaml +Server: + BasicAuth: + - User: testuser + Password: abc123 +``` + +Make sure to properly protect the configuration file so that it's not readable +by everyone: + +```bash +$ chmod 600 grip.config.yml +``` + +To use the password, set the `GRIP_USER` and `GRIP_PASSWORD` environment variables: +```bash +$ export GRIP_USER=testuser +$ export GRIP_PASSWORD=abc123 +$ grip list +``` + +## Using the Python Client + +Some GRIP servers may require authorizaiton to access its API endpoints. The client can be configured to pass +authorization headers in its requests: + +```python +import gripql + +# Basic Auth Header - {'Authorization': 'Basic dGVzdDpwYXNzd29yZA=='} +G = gripql.Connection("https://bmeg.io", user="test", password="password").graph("bmeg") +``` + +Although GRIP only supports basic password authentication, some servers may be proctected via a nginx or apache +server. 
The python client can be configured to handle these cases as well: + +```python +import gripql + +# Bearer Token - {'Authorization': 'Bearer iamnotarealtoken'} +G = gripql.Connection("https://bmeg.io", token="iamnotarealtoken").graph("bmeg") + +# OAuth2 / Custom - {"OauthEmail": "fake.user@gmail.com", "OauthAccessToken": "iamnotarealtoken", "OauthExpires": 1551985931} +G = gripql.Connection("https://bmeg.io", credential_file="~/.grip_token.json").graph("bmeg") +``` diff --git a/docs/tools/grip/tutorials.md b/docs/tools/grip/tutorials.md new file mode 100644 index 0000000..d3b8f1f --- /dev/null +++ b/docs/tools/grip/tutorials.md @@ -0,0 +1,7 @@ +--- +title: Tutorials +menu: + main: + identifier: Tutorials + weight: 40 +--- diff --git a/docs/tools/grip/tutorials/amazon.md b/docs/tools/grip/tutorials/amazon.md new file mode 100644 index 0000000..f215d7c --- /dev/null +++ b/docs/tools/grip/tutorials/amazon.md @@ -0,0 +1,75 @@ +--- +title: Amazon Purchase Network + +menu: + main: + parent: Tutorials + weight: 1 +--- + +# Explore Amazon Product Co-Purchasing Network Metadata + +Download the data + +``` +curl -O http://snap.stanford.edu/data/bigdata/amazon/amazon-meta.txt.gz +``` + +Convert the data into vertices and edges + +``` +python $GOPATH/src/github.com/bmeg/grip/example/amazon_convert.py amazon-meta.txt.gz amazon.data +``` + +Turn on grip and create a graph called 'amazon' + +``` +grip server & ; sleep 1 ; grip create amazon +``` + +Load the vertices/edges into the graph + +``` +grip load amazon --edge amazon.data.edge --vertex amazon.data.vertex +``` + +Query the graph + +_command line client_ + +``` +grip query amazon 'V().hasLabel("Video").out()' +``` + +The full command syntax and command list can be found at grip/gripql/javascript/gripql.js + +_python client_ + +Initialize a virtual environment and install gripql python package + +``` +python -m venv venv ; source venv/bin/activate +pip install -e gripql/python +``` + +Example code + +```python +import gripql + +conn = gripql.Connection("http://localhost:8201") + +g = conn.graph("amazon") + +# Count the Vertices +print("Total vertices: ", g.V().count().execute()) +# Count the Edges +print("Total edges: ", g.V().outE().count().execute()) + +# Try simple travesral +print("Edges connected to 'B00000I06U' vertex: %s" %g.V("B00000I06U").outE().execute()) + +# Find every Book that is similar to a DVD +for result in g.V().has(gripql.eq("group", "Book")).as_("a").out("similar").has(gripql.eq("group", "DVD")).as_("b").select("a"): + print(result) +``` diff --git a/docs/tools/grip/tutorials/pathway-commons.md b/docs/tools/grip/tutorials/pathway-commons.md new file mode 100644 index 0000000..d0d2308 --- /dev/null +++ b/docs/tools/grip/tutorials/pathway-commons.md @@ -0,0 +1,11 @@ + + +Get Pathway Commons release +``` +curl -O http://www.pathwaycommons.org/archives/PC2/v10/PathwayCommons10.All.BIOPAX.owl.gz +``` + +Convert to Property Graph +``` +grip rdf --dump --gzip pc PathwayCommons10.All.BIOPAX.owl.gz -m "http://pathwaycommons.org/pc2/#=pc:" -m "http://www.biopax.org/release/biopax-level3.owl#=biopax:" +``` diff --git a/docs/tools/grip/tutorials/tcga-rna.md b/docs/tools/grip/tutorials/tcga-rna.md new file mode 100644 index 0000000..3098295 --- /dev/null +++ b/docs/tools/grip/tutorials/tcga-rna.md @@ -0,0 +1,133 @@ +--- +title: TCGA RNA Expression + +menu: + main: + parent: Tutorials + weight: 2 +--- + +### Explore TCGA RNA Expression Data + +Create the graph + +``` +grip create tcga-rna +``` + +Get the data + +``` +curl -O 
http://download.cbioportal.org/gbm_tcga_pub2013.tar.gz +tar xvzf gbm_tcga_pub2013.tar.gz +``` + +Load clinical data + +``` +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_clinical.txt --row-label 'Donor' +``` + +Load RNASeq data + +``` +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt -t --index-col 1 --row-label RNASeq --row-prefix "RNA:" --exclude RNA:Hugo_Symbol +``` + +Connect RNASeq data to Clinical data + +``` +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt -t --index-col 1 --no-vertex --edge 'RNA:{_id}' rna +``` + +Connect Clinical data to subtypes + +``` +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_clinical.txt --no-vertex -e "{EXPRESSION_SUBTYPE}" subtype --dst-vertex "{EXPRESSION_SUBTYPE}" Subtype +``` + +Load Hugo Symbol to EntrezID translation table from RNA matrix annotations + +``` +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_RNA_Seq_v2_expression_median.txt --column-include Entrez_Gene_Id --row-label Gene +``` + +Load Mutation Information + +``` +./example/load_matrix.py tcga-rna gbm_tcga_pub2013/data_mutations_extended.txt --skiprows 1 --index-col -1 --regex Matched_Norm_Sample_Barcode '\-\d\d$' '' --edge '{Matched_Norm_Sample_Barcode}' variantIn --edge '{Hugo_Symbol}' effectsGene --column-exclude ma_func.impact ma_fi.score MA_FI.score MA_Func.Impact MA:link.MSA MA:FImpact MA:protein.change MA:link.var MA:FIS MA:link.PDB --row-label Variant +``` + +Load Proneural samples into a matrix + +```python +import pandas +import gripql + +conn = gripql.Connection("http://localhost:8201") +g = conn.graph("tcga-rna") +genes = {} +for k, v in g.V().hasLabel("Gene").render(["_id", "Hugo_Symbol"]): + genes[k] = v +data = {} +for row in g.V("Proneural").in_().out("rna").render(["_id", "_data"]): + data[row[0]] = row[1] +samples = pandas.DataFrame(data).rename(genes).transpose().fillna(0.0) +``` + +# Matrix Load project + +``` +usage: load_matrix.py [-h] [--sep SEP] [--server SERVER] + [--row-label ROW_LABEL] [--row-prefix ROW_PREFIX] [-t] + [--index-col INDEX_COL] [--connect] + [--col-label COL_LABEL] [--col-prefix COL_PREFIX] + [--edge-label EDGE_LABEL] [--edge-prop EDGE_PROP] + [--columns [COLUMNS [COLUMNS ...]]] + [--column-include COLUMN_INCLUDE] [--no-vertex] + [-e EDGE EDGE] [--dst-vertex DST_VERTEX DST_VERTEX] + [-x EXCLUDE] [-d] + db input + +positional arguments: + db Destination Graph + input Input File + +optional arguments: + -h, --help show this help message and exit + --sep SEP TSV delimiter + --server SERVER Server Address + --row-label ROW_LABEL + Vertex Label used when loading rows + --row-prefix ROW_PREFIX + Prefix added to row vertex id + -t, --transpose Transpose matrix + --index-col INDEX_COL + Column number to use as index (and id for vertex + load) + --connect Switch to 'fully connected mode' and load matrix cell + values on edges between row and column names + --col-label COL_LABEL + Column vertex label in 'connect' mode + --col-prefix COL_PREFIX + Prefix added to col vertex id in 'connect' mode + --edge-label EDGE_LABEL + Edge label for edges in 'connect' mode + --edge-prop EDGE_PROP + Property name for storing value when in 'connect' mode + --columns [COLUMNS [COLUMNS ...]] + Rename columns in TSV + --column-include COLUMN_INCLUDE + List subset of columns to use from TSV + --no-vertex Do not load row as vertex + -e EDGE EDGE, --edge EDGE EDGE + Create an edge the connected the current row vertex + args: + --dst-vertex DST_VERTEX DST_VERTEX + Create a 
destination vertex, args: + + -x EXCLUDE, --exclude EXCLUDE + Exclude row id + -d Run in debug mode. Print actions and make no changes + +``` diff --git a/docs/workflows/add-files.md b/docs/workflows/add-files.md deleted file mode 100644 index 91b54b7..0000000 --- a/docs/workflows/add-files.md +++ /dev/null @@ -1,120 +0,0 @@ ---- -title: Adding Files ---- - -{% include '/note.md' %} - -## Background - -Adding files to a project is a two-step process: - -1. Adding file metadata entries to the manifest -2. Creating FHIR-compliant metadata using the manifest - -This page will guide you through the first step, detailing the multiple ways to add file metadata to the manifest. - -## Overview - -A manifest is a collection of file metadata entries. Just as a ship's manifest is an inventory of its cargo, the `MANIFEST/` directory is an inventory of your file metadata. We update that manifest using `g3t add`. When you `g3t add` a file, an entry is written to a `.dvc` file in the `MANIFEST` directory, where the dvc file path mirrors the original file path relative to the root of the project. For instance, `g3t add path/to/file.txt` writes an entry to `MANIFEST/path/to/file/txt`. This manifest is then used by `g3t meta init` to create FHIR-complaint metadata used to populate the data platform. - -Here are a couple ways to add file metadata to the manifest. - - -## Adding a local file to the manifest - -To add a single file from your current working directory to the manifest, - -```bash -g3t add path/to/file -``` - -In this command, `g3t` creates a metadata entry for the specified data file, automatically calculating metadata like the file's md5sum, type, date modified, size, and path. - -## Adding a remote file to the manifest - -Sometimes you might want to generate metadata for a remote file. To add a file in an Amazon S3 bucket to the manifest, - -```sh -g3t add s3:/// \ - --etag {ETag} \ - --modified {system_time} \ - --size {system_size} \ - {static_parameters} -``` - -Since the file is not localized, we need to manually provide some file information, specifically... -1. **Hash:** a unique fixe-length string generated from the file's contents - 1. `etag` is used here but `md5`, `sha1`, `sha256`, `sha512`, and `crc` are also valid. -2. **Date modified:** Date modified in system time -3. **Size:** File size in bytes - -To get the ETag, size, and date modified for a remote file mirrored on S3, run the following `mc stat` command using the [MinIO client](https://min.io/docs/minio/linux/reference/minio-mc.html): -```sh -mc stat --json ceph/example-bucket/example.bam -{ - "status": "success", - "name": "example.bam", - "lastModified": "2024-02-21T09:20:24-08:00", - "size": 148299010745, - "etag": "17a5275404b41f52b042b43eb351f5ba-8840", - "type": "file", - "metadata": { - "Content-Type": "application/gzip" - } -} -``` - -Then to add the remote file, run the following: - -```sh -g3t add s3://example-bucket/file.bam \ - --etag "17a5275404b41f52b042b43eb351f5ba-8840" \ - --size 148299010745 \ - --modified "2024-02-21T09:20:24-08:00" \ -``` - -## Modifying an existing file in the manifest - -You can use `g3t add` to also modify existing files that are already tracked in the manifest. When you do so, the `g3t add` command will maintain the existing information and update any values that are passed in as flags. For local files, it will re-populate the file metadata according to its current state in the repo. For remote files, you will have to manually update those fields yourself. 
- -## Associating files with other entities - -In some cases, you might want to associate a file to a particular entity like a subject or sample. This can be done with a flag: - -```bash -g3t add path/to/file --patient patient_abc -g3t add path/to/file --specimen specimen_001 -``` - -The flag name corresponds to a type of FHIR resource specifically either a [patient](https://build.fhir.org/patient.html), [specimen](https://build.fhir.org/specimen.html), [task](https://build.fhir.org/task.html), or [observation](https://build.fhir.org/observation.html). The info passed into the command after the flag represents the identifier that will be used to group file data on a patient. This can be combined with the above methods as an additional flag. - -## Adding multiple files to the manifest - -Adding multiple files at once is possible as well, here are some examples - -```bash -g3t add "dir/*" # recursively add all files in the top level of dir directory to manifest -g3t add "*.txt" # add all .txt files in the current directory to manifest -``` - -Make sure to surround your wildcard string in quotes, as it will only add the first matching file otherwise. - -## Migration of existing project files - -If you have an existing project that you want to migrate using g3t, you can do so by following these steps: - -* Create a new repository using `g3t init`. See [Creating a project](creating-project.md) -* Either move or copy your existing data files into the new repository. Alternatively, symbolic links are supported. - -## A note on data directories - -When creating metadata, all paths referring to data directories are stored **relative to the root of the project**. For instance, when doing `g3t add path/to/file.txt`, the output is stored in `MANIFEST/path/to/file.txt`. Here's a brief explanation of this convention: - -* Portability: Relative paths make your project more portable, allowing it to be moved to different locations or shared with others without causing issues with file references. This is particularly important in data engineering projects where datasets and files may be stored in different locations. -* Ease of Collaboration: When working on a data engineering project with multiple team members, using relative paths ensures that everyone can run the code without having to modify file paths based on their local directory structure. This promotes smoother collaboration. -* Consistency Across Environments: Data engineering projects often involve processing large datasets, and the code needs to run consistently across different environments (e.g., development, testing, production). Relative paths help maintain this consistency by allowing the code to reference files and directories relative to the project's root. - - -## Next steps - -* See [metadata workflow](metadata.md) for more information on how to create and upload metadata. \ No newline at end of file diff --git a/docs/workflows/clone.md b/docs/workflows/clone.md deleted file mode 100644 index f657b15..0000000 --- a/docs/workflows/clone.md +++ /dev/null @@ -1,19 +0,0 @@ - -# Cloning a Project - -The `g3t clone` command is used to clone a project from the remote repository. Here's a brief explanation of what happens when you use g3t clone: - -* A subdirectory is created for the project, it is named after the `project_id`. -* The project is initialized locally, including the `.g3t` and `META` directories. -* The current metadata is downloaded from the remote repository. 
-* By default, data files are not downloaded by default - -```sh -g3t clone --help -Usage: g3t clone [OPTIONS] PROJECT_ID - - Clone meta and files from remote. - -Options: - --help Show this message and exit. -``` diff --git a/docs/workflows/commit-push.md b/docs/workflows/commit-push.md deleted file mode 100644 index 1eee44d..0000000 --- a/docs/workflows/commit-push.md +++ /dev/null @@ -1,47 +0,0 @@ -# Publishing a Project - -The following page will outline how to publish your project to the data platform. - -## Committing Changes - -The `g3t commit` command saves your changes to the local repository. Here's a brief explanation of what happens when you use g3t commit: - -- Like git, this command bundles the staged files into a single set of changes. - - The `-m` flag provides a commit message detailing the changes - - If the commit is successful, you will see a summary of the changes logged -- As a reminder, the files committed to git are the FHIR metadata in `META/` and the file metadata entries in `MANIFEST/`, not the data files themselves. -- See `g3t commit --help` for more info - -You can confirm all your changes have been staged and committed using `g3t status`. This will ensure that your manifest data and FHIR metadata is up to date. - -## Pushing Changes - -### How to Push Changes - -The `g3t push` command uploads your changes to the data platform. -Here's a brief explanation of what happens when you use `g3t push`: - -1. Checks that all files are committed before pushing -2. Checks that the FHIR metadata in `META/` is valid -3. Indexes the data files using the file metadata in the `MANIFEST/` directory -4. Uploads the FHIR metadata to our databases -5. Once the job is complete: - * Changes are available on the platform - * Changes are available for other users to download - * Job logs are available in the logs directory - -### Updating Files - -When pushing data, `g3t` checks the manifest (`MANIFEST/` directory) to see if there are any new files to index. If no new files have been added, then the push will not go through. To update the metadata for a file that has already been pushed or update the FHIR metadata, use the `--overwrite` flag: - -```console -$ g3t push --overwrite -``` - -### Logging - -Make sure to check the logs generated by the command. - -* If a job is successful, you will get a green success message. -* If a job fails, you will get a red error message: look for more information in the specified logs directory. -* The logs directory stores rolling logs, where each line is a JSON representing a single submission attempt. \ No newline at end of file diff --git a/docs/workflows/quick-start-guide.md b/docs/workflows/quick-start-guide.md deleted file mode 100644 index 9327fa1..0000000 --- a/docs/workflows/quick-start-guide.md +++ /dev/null @@ -1,222 +0,0 @@ - -# Quickstart Guide - -{% include '/note.md' %} - -## About - -gen3 tracker, or g3t, is a command line tool for the CALYPR platform. It provides a set of utilities for users to upload data to and download data from the platform. The following tutorial will walk you through the steps for two different use cases: - -1. Uploading files for a new project to the platform -2. Downloading an existing project from the platform - -Each step will outline the command to execute followed by a brief description of the command's functionality. - -## Requirements - -Please ensure you have completed the following setup from the [Requirements](/requirements) page: - -1. Installed gen3-client -2. 
Configured a gen3-client profile with credentials -3. Installed gen3-tracker - - -To confirm all dependencies are set up as expected, run - -```sh -g3t --profile ping -``` - -You should get a message like this - -> msg: 'Configuration OK: Connected using profile:calypr' -endpoint: https://calypr.ohsu.edu.org -username: someone@example.com - -along with the set of projects you have been provided access to. - -## General Usage - -```sh -g3t [OPTIONS] COMMAND [ARGS]... -``` - -g3t is built on git, so many commands behave similarly to git with some key differences. These differences will be outlined for each step in the submission process. - -## 1. Upload Data to a Newly Approved Project - -The first use case we will cover is how to add data to a new project on the CALYPR. - -!!! note - The following examples will use the `calypr` program with a project called `myproject` and an `calypr` g3t profile. - -### Check Project Permissions - -To start, check what projects you have access to using the command - -```sh -g3t projects ls -``` - -Check that you have permission to edit `calypr-myproject`. This is what allows you to push data up to the platform. If you do not have the correct permissions, please contact a system administrator. - -### Specify a gen3 Profile - -For most g3t commands, you need to specify the gen3-client profile you want to use. This ensures that you are uploading projects to the right platform with the right credentials. There are two ways to set your profile... - -To set a profile using an environmental variable: -```sh -export G3T_PROFILE=calypr -``` - -To pass the profile as a flag to the `ping` command for example: - -```sh -g3t --profile calypr ping -``` - -For the rest of the tutorial, we will assume you have exported a `G3T_PROFILE` environment variable so we don't have to use the `--profile` flag each time. - -### Initialize a new project - -To initialize your new project locally, you can use `g3t init` - -```bash -mkdir calypr-myproject -cd calypr-myproject -g3t init calypr-myproject -``` - -* Similar to `git init`, this command creates a new project in the current directory -* Within the project, there are a couple important directories... - * `MANIFEST/`: stores file metadata entries - * `META/`: stores metadata converted into the [FHIR](https://hl7.org/fhir/) standard - * `.g3t/`: hidden, stores and manages g3t state for the project -* The project ID is `calypr-myproject` made from the program name `calypr` and project name `myproject`. Specifically, - * **Program name:** is predefined by the institution, defining what remote data buckets and endpoints you have access to - * **Project name:** must be unique within the server, be alphanumeric, and contain no spaces or hyphens -* For more information, see [creating a project](creating-project.md) - -### Add files to the manifest - -Once your project is initialized, you can add files to the project's manifest. For example, let's say you have tsv files in a `folder/` directory within your current repository. Each of the tsv files are associated with a particular subject, say `patient_1` and `patient_2`. To add them using `g3t add`, - -```bash -g3t add folder/file.tsv --patient patient_1 -g3t add folder/file2.tsv --patient patient_2 -``` - -* Each `g3t add` above creates a metadata entry for the specified data file, automatically calculating metadata like the file's md5sum, type, date modified, size, and path. 
- - Just as a ship's manifest is an inventory of its cargo, the `MANIFEST/` directory is an inventory for each file's metadata - - Each metadata entry is stored as a `.dvc` file in the `MANIFEST` directory, where the dvc file path mirrors the original file path - - **Example:** `folder/file.tsv` creates a `MANIFEST/folder/file.tsv.dvc` entry -* Using the patient flag is one way to associate a file with a particular subject, in this case associating each file with a specified patient identifier. -* `g3t add` varies from `git add`, as the `.dvc` file is what gets staged rather than the potentially large data file -* Multiple files can be added at the same time by wrapping a wildcard string in quotes, for example, `g3t add "*.csv"`. -* For more information on usage, such as adding entries for remote files or how to associate files with a sample, see [adding files](add-files.md) - - -### Create metadata - -Now that your files have been staged with metadata entries, you can create FHIR-compliant metadata using the `g3t meta init` command - -```bash -g3t meta init -``` - -* Using the file metadata entries created by the `g3t add` command, `g3t meta init` creates FHIR-compliant metadata files in the `META/` directory, where each file corresponds to a [FHIR resource](https://build.fhir.org/resourcelist.html). At a minimum, the directory will contain: - -| File | Contents | -|--------------------------|----------------------------| -| ResearchStudy.ndjson | Description of the project | -| DocumentReference.ndjson | File information | - -- Additional metadata files for patient, specimen, and other entities will be generated based on options provided to the `add` command. - -| File | Contents | -|------------------------|------------------------| -| Patient.ndjson | Patient information | -| ResearchSubject.ndjson | Enrollment information | -| Specimen.ndjson | Sample information | - -- `meta init` is a good example of where g3t differs from git! While you might go from `git add` straight to `git commit` in a git workflow, we have to do `g3t add` > `g3t meta init` > `g3t commit` to track both the files and each file's metadata in g3t. -- `meta init` focuses on creating metadata specific to the files you added. For your particular use case, you may also want to supply your own FHIR data, see [adding FHIR metadata](metadata.md) - -### Check that the metadata is valid - -To ensure that the FHIR data has been properly formatted, you can call `g3t meta validate`. - -```bash -g3t meta validate -``` - -- The system will print summary counts and informative messages if the metadata is invalid. - - -### Check that the expected files are queued for upload - -You can double-check that all of your files have been staged with `g3t status` - -```bash -g3t status -``` - -### Commit files - -With all checks complete, you can commit the metadata we created using `g3t commit`. - -```bash -g3t commit -m "adding tsv metadata" -``` - -- Like git, this command bundles the staged files into a single set of changes. 
- - The `-m` flag adds a commit message to the changes - - If the commit is successful, you will see a summary of the changes logged -- As a reminder, the files that are committed to git are the FHIR metadata in META/ and the .dvc entries in MANIFEST/, not the data files themselves -- See [publishing a project](commit-push.md) for more info - -### Push to CALYPR - -To submit the files and metadata to the data platform, we can use `g3t push` - -```bash -g3t push -``` - -* This command launches a job to upload project data to the specified data platform. -* Specifically, it... - 1. Checks that all files are committed before pushing - 1. Checks that the `META/` metadata is valid - 2. Indexes the data files using the file metadata in the `MANIFEST/` directory - 3. Uploads the FHIR metadata in the `META/` directory into our databases -* A push will fail if no new files are being submitted. If you need to update existing files in the manifest or update the FHIR metadata, use the `--overwrite` option to force an upload. -* A job is successful if you get a green success message. -* For other publishing options and specialized use cases, see [publishing a project](commit-push.md) - -### View the Data on the Platform - -Congratulations, you have submitted data to the platform! To check that your data was uploaded, login and navigate to the Exploration page on [calypr.ohsu.edu.org](https://calypr.ohsu.edu.org)! - -## 2. Bulk Download Data from a Project on CALYPR - -Sometimes you might want the most recent version of a data project that has already been published to the platform. To download the metadata for an existing project, use the `g3t clone` command. - -```sh -g3t clone calypr-myproject -``` - -- The clone command will download the metadata associated with the project into a new directory -- Specifically, it downloads the metadata `.dvc` entries in `MANIFEST/` and the FHIR-compliant metadata in `META/` - -To retrieve the actual data files described by manifest as opposed to just the file metadata, use the pull command. - -```bash -g3t clone calypr-myproject -cd calypr-myproject -g3t pull -``` - -- The pull command will retrieve the actual data files associated with the metadata. - - -To download only a subset of files, refer to the downloads [page](https://calypr.github.io/workflows/portal-download/). For more information on other commands or use cases, see the Use Cases & Workflows section. \ No newline at end of file diff --git a/docs/workflows/status.md b/docs/workflows/status.md deleted file mode 100644 index 0525406..0000000 --- a/docs/workflows/status.md +++ /dev/null @@ -1,16 +0,0 @@ -# g3t status - -## Show the working tree status - -The `g3t status` command is used to view the state of your working directory. It functions the same as `git status`, providing info on: - -* Untracked files: files that have not been staged -* Changes to be committed: files that have been staged (added) but not committed - - -``` - g3t status --help -Usage: g3t status - - Show the working tree status. -``` diff --git a/docs/workflows/tabular.md b/docs/workflows/tabular.md deleted file mode 100644 index 4edb1eb..0000000 --- a/docs/workflows/tabular.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: Create Tabular Metadata ---- - -{% include '/note.md' %} - -## Creating Tabular Data - -On the Explorer page on the data platform, FHIR metadata gets flattened out from a graph structure to a tabular format so that it is more easily visualized by users. 
For complex use cases, you might want to see what the flattened version of the metadata looks like before submitting the data through `g3t push`. This can be done using `g3t meta dataframe` - -```sh -Usage: g3t meta dataframe [OPTIONS] {Specimen|DocumentReference|ResearchSubjec - t|MedicationAdministration|GroupMember} - [DIRECTORY_PATH] [OUTPUT_PATH] - - Render a metadata dataframe. - - DIRECTORY_PATH: The directory path to the metadata. - OUTPUT_PATH: The output path for the dataframe. Optional, defaults to "{Specimen|DocumentReference|ResearchSubject|MedicationAdministration|GroupMember}.csv" - -Options: - --dtale Open the graph in a browser using the dtale package for interactive - data exploration. Requires pip install dtale - --debug - --help Show this message and exit -``` diff --git a/mkdocs.yml b/mkdocs.yml index 4e64651..317571f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,54 +4,33 @@ # https://www.mkdocs.org/ # https://mkdocstrings.github.io/recipes/ -site_name: CALYPR Documentation - -nav: -- index.md -- Getting Started: - - requirements.md - - workflows/quick-start-guide.md -- CLI Reference: - - workflows/creating-project.md - - workflows/add-files.md - - Adding FHIR Metadata: - - data-model/introduction.md - - workflows/metadata.md - - workflows/remove-files.md - - data-model/integration.md - - workflows/commit-push.md - - workflows/clone.md - - Exploring the Platform: - - workflows/portal-explore.md - - workflows/portal-download.md - - workflows/query.md - - Utilities: - - workflows/approve-requests.md - - workflows/add-users.md - - Experimental Features: - - workflows/tabular.md - - workflows/common-errors.md -- Status Monitor ↗: https://calypr.github.io/status-monitor +site_name: CALYPR +site_url: https://calypr.org plugins: - search - macros + - awesome-nav - open-in-new-tab # automatically open ext links in new tab - -repo_url: https://github.com/calypr/calypr.github.io -repo_name: calypr.github.io theme: name: material + custom_dir: overrides palette: - scheme: default primary: custom features: - navigation.indexes - - navigation.footer + # - navigation.footer - content.code.copy + - navigation.tabs + - navigation.sections + - navigation.top + - header.autohide markdown_extensions: + - attr_list + - md_in_html - admonition - pymdownx.superfences - pymdownx.tabbed: diff --git a/overrides/home.html b/overrides/home.html new file mode 100644 index 0000000..e6631b0 --- /dev/null +++ b/overrides/home.html @@ -0,0 +1,22 @@ +{% extends "main.html" %} + +{% block tabs %} + {{ super() }} +
+  <div class="md-grid md-typeset">
+    <h1>Next-Generation Genomics Data Science</h1>
+    <p>Unlocking biological insights with scalable, cloud/on-prem hybrid infrastructure.</p>
+  </div>
+{% endblock %} + + +{% block content %} + {{ super() }} +{% endblock %} + +{% block footer %} + {{ super() }} +{% endblock %} \ No newline at end of file diff --git a/products/funnel b/products/funnel new file mode 160000 index 0000000..c811d1b --- /dev/null +++ b/products/funnel @@ -0,0 +1 @@ +Subproject commit c811d1b43188e03f6ef77a1a04467bfcf9b754cb diff --git a/products/git-drs b/products/git-drs new file mode 160000 index 0000000..0dbe7e5 --- /dev/null +++ b/products/git-drs @@ -0,0 +1 @@ +Subproject commit 0dbe7e55d1f34435e25eadc319bf8bddf21b2083 diff --git a/products/grip b/products/grip new file mode 160000 index 0000000..d54d9a0 --- /dev/null +++ b/products/grip @@ -0,0 +1 @@ +Subproject commit d54d9a03cb7d407187d78499b2f45eb97e5d29f2 diff --git a/requirements.txt b/requirements.txt index 7f415fe..f608f75 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ mkdocs-material mkdocs-macros-plugin mkdocs-open-in-new-tab mkdocs-linkcheck - +mkdocs-awesome-nav diff --git a/sync_product_docs.py b/sync_product_docs.py new file mode 100755 index 0000000..2fa58ce --- /dev/null +++ b/sync_product_docs.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python + +import subprocess + +def sync_funnel_docs(): + subprocess.check_call(["rsync", "-av", "products/funnel/website/content/", "./docs/tools/funnel"]) + +def sync_grip_docs(): + subprocess.check_call(["rsync", "-av", "products/grip/website/content/docs/", "./docs/tools/grip"]) + +def sync_git_drs_docs(): + subprocess.check_call(["rsync", "-av", "products/git-drs/docs/", "./docs/tools/git-drs"]) + +if __name__ == "__main__": + sync_funnel_docs() + sync_grip_docs() + sync_git_drs_docs()
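
For context, the new submodules, the `mkdocs-awesome-nav` dependency, and `sync_product_docs.py` work together as a docs build pipeline. The sketch below is a minimal local workflow, not something wired up by this change; it assumes the submodules are checked out under `products/` and that the script is run from the repository root.

```sh
# Fetch the funnel, grip, and git-drs submodules declared in .gitmodules
git submodule update --init --recursive

# Install the MkDocs plugins pinned in requirements.txt (material, macros, awesome-nav, ...)
pip install -r requirements.txt

# rsync each product's docs into docs/tools/ (see sync_product_docs.py)
./sync_product_docs.py

# Build the site; navigation is driven by the awesome-nav plugin and the .nav.yaml files
mkdocs build
```

Running the sync step before `mkdocs build` pulls each product's upstream documentation into the site without maintaining hand-copied duplicates in this repository.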