diff --git a/docusaurus.config.js b/docusaurus.config.js index 40d653018..9ff467a88 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -277,11 +277,13 @@ export default async function createConfigAsync() { "json", "yaml", "groovy", + "java", "ini", "nginx", "bash", "docker", "python", + "r", "typescript", ], }, diff --git a/platform_versioned_docs/version-23.4/enterprise/configuration/overview.mdx b/platform_versioned_docs/version-23.4/enterprise/configuration/overview.mdx index 56aed36e1..505e8fddd 100644 --- a/platform_versioned_docs/version-23.4/enterprise/configuration/overview.mdx +++ b/platform_versioned_docs/version-23.4/enterprise/configuration/overview.mdx @@ -495,7 +495,11 @@ Do not replace the [Seqera-provided default image](../../functionality_matrix/fu ## Seqera API -Enable the API endpoints to host the Seqera Enterprise OpenAPI specification and use the [tw CLI](https://github.com/seqeralabs/tower-cli). +Enable the API endpoints to host the Seqera Enterprise OpenAPI specification and use the [tw CLI](https://github.com/seqeralabs/tower-cli). Set custom API rate limits and timeouts. + +:::note +To configure API rate limit environment variables, you must add `ratelim` to the `MICRONAUT_ENVIRONMENTS`. Without `ratelim` being set, the rate limit configuration variables below are ignored. +::: @@ -503,6 +507,9 @@ Enable the API endpoints to host the Seqera Enterprise OpenAPI specification and | Environment variable | Description | Value | | ---------------------- | ----------------------------------------------------------------------------- | --------------- | | `TOWER_ENABLE_OPENAPI` | Enable the OpenAPI documentation endpoint, e.g., [cloud.seqera.io/openapi/index.html](https://cloud.seqera.io/openapi/index.html). | Default: `true` | +| `TOWER_RATELIMIT_PERIOD` | Specify the maximum number of HTTP requests that can be made during the `TOWER_RATELIMIT_REFRESH` period. 
| Default: `20` | +| `TOWER_RATELIMIT_REFRESH` | API rate limit refresh period. | Default: `1s` | +| `TOWER_RATELIMIT_TIMEOUT` | The waiting period before rejecting requests over the `TOWER_RATELIMIT_PERIOD` limit during the refresh period. | Default: `500ms` | diff --git a/platform_versioned_docs/version-23.4/getting-started/_images/cpu-table-2.png b/platform_versioned_docs/version-23.4/getting-started/_images/cpu-table-2.png new file mode 100644 index 000000000..919aec152 Binary files /dev/null and b/platform_versioned_docs/version-23.4/getting-started/_images/cpu-table-2.png differ diff --git a/platform_versioned_docs/version-23.4/getting-started/_images/create-ce.gif b/platform_versioned_docs/version-23.4/getting-started/_images/create-ce.gif new file mode 100644 index 000000000..fa2d48f04 Binary files /dev/null and b/platform_versioned_docs/version-23.4/getting-started/_images/create-ce.gif differ diff --git a/platform_versioned_docs/version-23.4/getting-started/_images/create-ds.gif b/platform_versioned_docs/version-23.4/getting-started/_images/create-ds.gif new file mode 100644 index 000000000..bce8331d9 Binary files /dev/null and b/platform_versioned_docs/version-23.4/getting-started/_images/create-ds.gif differ diff --git a/platform_versioned_docs/version-23.4/getting-started/_images/launch-form-1.gif b/platform_versioned_docs/version-23.4/getting-started/_images/launch-form-1.gif new file mode 100644 index 000000000..f863ccaee Binary files /dev/null and b/platform_versioned_docs/version-23.4/getting-started/_images/launch-form-1.gif differ diff --git a/platform_versioned_docs/version-23.4/getting-started/_images/launch-form-2.gif b/platform_versioned_docs/version-23.4/getting-started/_images/launch-form-2.gif new file mode 100644 index 000000000..435236bd9 Binary files /dev/null and b/platform_versioned_docs/version-23.4/getting-started/_images/launch-form-2.gif differ diff --git 
a/platform_versioned_docs/version-23.4/getting-started/_images/launch-form-3.gif b/platform_versioned_docs/version-23.4/getting-started/_images/launch-form-3.gif new file mode 100644 index 000000000..f59bd2c69 Binary files /dev/null and b/platform_versioned_docs/version-23.4/getting-started/_images/launch-form-3.gif differ diff --git a/platform_versioned_docs/version-23.4/getting-started/_images/nf-core-rnaseq_metro_map_grey_static.svg b/platform_versioned_docs/version-23.4/getting-started/_images/nf-core-rnaseq_metro_map_grey_static.svg new file mode 100644 index 000000000..a0e7a4ccc --- /dev/null +++ b/platform_versioned_docs/version-23.4/getting-started/_images/nf-core-rnaseq_metro_map_grey_static.svg @@ -0,0 +1,239 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/platform_versioned_docs/version-23.4/getting-started/_images/pipelines-add.gif b/platform_versioned_docs/version-23.4/getting-started/_images/pipelines-add.gif new file mode 100644 index 000000000..e37292541 Binary files /dev/null and b/platform_versioned_docs/version-23.4/getting-started/_images/pipelines-add.gif differ diff --git a/platform_versioned_docs/version-23.4/getting-started/_images/process-runtime-2.png b/platform_versioned_docs/version-23.4/getting-started/_images/process-runtime-2.png new file mode 100644 index 000000000..1139ad7aa Binary files /dev/null and b/platform_versioned_docs/version-23.4/getting-started/_images/process-runtime-2.png differ diff --git a/platform_versioned_docs/version-23.4/getting-started/_images/rstudio.gif 
b/platform_versioned_docs/version-23.4/getting-started/_images/rstudio.gif new file mode 100644 index 000000000..31a66d4b8 Binary files /dev/null and b/platform_versioned_docs/version-23.4/getting-started/_images/rstudio.gif differ diff --git a/platform_versioned_docs/version-23.4/getting-started/_images/staging-vs-real.png b/platform_versioned_docs/version-23.4/getting-started/_images/staging-vs-real.png new file mode 100644 index 000000000..261585e9a Binary files /dev/null and b/platform_versioned_docs/version-23.4/getting-started/_images/staging-vs-real.png differ diff --git a/platform_versioned_docs/version-23.4/getting-started/quickstart-demo/add-pipelines.mdx b/platform_versioned_docs/version-23.4/getting-started/quickstart-demo/add-pipelines.mdx index 461939279..4ff125054 100644 --- a/platform_versioned_docs/version-23.4/getting-started/quickstart-demo/add-pipelines.mdx +++ b/platform_versioned_docs/version-23.4/getting-started/quickstart-demo/add-pipelines.mdx @@ -37,7 +37,7 @@ To launch pipelines directly with CLI tools, select the **Launch Pipeline** tab ![Launch Seqera Pipeline](assets/seqera-pipelines-launch-cli.png) ::: -### Add nf-core/rnaseq from the Launchpad +### Add nf-core/rnaseq manually ![Add nf-core/rnaseq pipeline](assets/sp-cloud-add-rnaseq.gif) diff --git a/platform_versioned_docs/version-23.4/getting-started/rnaseq.mdx b/platform_versioned_docs/version-23.4/getting-started/rnaseq.mdx new file mode 100644 index 000000000..cc968f820 --- /dev/null +++ b/platform_versioned_docs/version-23.4/getting-started/rnaseq.mdx @@ -0,0 +1,752 @@ +--- +title: "RNA-Seq" +description: "An introduction to running nf-core/rnaseq in Seqera Platform" +date: "21 Jul 2024" +tags: [platform, seqera pipelines, data studios, rnaseq, compute environment, aws] +toc_max_heading_level: 2 +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +This guide details how to run bulk RNA sequencing (RNA-Seq) data analysis, from quality control to 
differential expression analysis, on an AWS Batch compute environment in Platform. It includes: + +- Creating an AWS Batch compute environment to run your pipeline and analysis environment +- Adding pipelines to your workspace +- Importing your pipeline input data +- Launching the pipeline and monitoring execution from your workspace +- Setting up a custom analysis environment with Data Studios +- Resource allocation guidance for RNA-Seq data + +:::info[**Prerequisites**] +You will need the following to get started: + +- [Admin](../orgs-and-teams/roles.mdx) permissions in an existing organization workspace. See [Set up your workspace](./workspace-setup.mdx) to create an organization and workspace from scratch. +- An existing AWS cloud account with access to the AWS Batch service. +- Existing access credentials with permissions to create and manage resources in your AWS account. See [IAM](../compute-envs/aws-batch.mdx#iam) for guidance to set up IAM permissions for Platform. +::: + +## Compute environment + +Compute and storage requirements for RNA-Seq analysis are dependent on the number of samples and the sequencing depth of your input data. See [RNA-Seq data and requirements](#rna-seq-data-and-requirements) for details on RNA-Seq datasets and the CPU and memory requirements for important steps of RNA-Seq pipelines. + +In this guide, you will create an AWS Batch compute environment with sufficient resources allocated to run the [nf-core/rnaseq](https://github.com/nf-core/rnaseq) pipeline with a large dataset. This compute environment will also be used to run a Data Studios RStudio environment for tertiary analysis of the resulting pipeline data. + +:::note +The compute recommendations below are based on internal benchmarking performed by Seqera. See [RNA-Seq data and requirements](#rna-seq-data-and-requirements) for more information. 
+::: + +### Recommended compute environment resources + +The following compute resources are recommended for production RNA-Seq pipelines, depending on the size of your input dataset: + +| **Setting** | **Value** | +|--------------------------------|---------------------------------------| +| **Instance Types** | `m5,r5` | +| **vCPUs** | 2 - 8 | +| **Memory (GiB)** | 8 - 32 | +| **Max CPUs** | >500 | +| **Min CPUs** | 0 | + +#### Fusion file system + +The [Fusion](../supported_software/fusion/fusion.mdx) file system enables seamless read and write operations to cloud object stores, leading to +simpler pipeline logic and faster, more efficient execution. While Fusion is not required to run nf-core/rnaseq, it is recommended for optimal performance. See [nf-core/rnaseq performance in Platform](#nf-corernaseq-performance-in-platform) at the end of this guide. + +Fusion works best with AWS NVMe instances (fast instance storage) as this delivers the fastest performance when compared to environments using only AWS EBS (Elastic Block Store). Batch Forge selects instances automatically based on your compute environment configuration, but you can optionally specify instance types. To enable fast instance storage (see Create compute environment below), you must select EC2 instances with NVMe SSD storage (`m5d` or `r5d` families). + +:::note +Fusion requires a license for use in Seqera Platform compute environments or directly in Nextflow. Fusion can be trialed at no cost. [Contact Seqera](https://seqera.io/contact-us/) for more details. +::: + +### Create compute environment + +![Add Platform compute environment](./_images/create-ce.gif) + +From the **Compute Environments** tab in your organization workspace, select **Add compute environment** and complete the following fields: + +| **Field** | **Description** | +|---------------------------------------|------------------------------------------------------------| +| **Name** | A unique name for the compute environment. 
| +| **Platform** | AWS Batch | +| **Credentials** | Select existing credentials, or **+** to create new credentials:| +| **Access Key** | AWS access key ID. | +| **Secret Key** | AWS secret access key. | +| **Region** | The target execution region. | +| **Pipeline work directory** | An S3 bucket path in the same execution region. | +| **Enable Wave Containers** | Use the Wave containers service to provision containers. | +| **Enable Fusion v2** | Access your S3-hosted data via the Fusion v2 file system. | +| **Enable fast instance storage** | Use NVMe instance storage to speed up I/O and disk access. Requires Fusion v2.| +| **Config Mode** | Batch Forge | +| **Provisioning Model** | Choose between Spot and On-demand instances. | +| **Max CPUs** | Sensible values for production use range between 2000 and 5000.| +| **Enable Fargate for head job** | Run the Nextflow head job using the Fargate container service to speed up pipeline launch. Requires Fusion v2.| +| **Allowed S3 buckets** | Additional S3 buckets or paths to be granted read-write permission for this compute environment. Add data paths to be mounted in your data studio here, if different from your pipeline work directory.| +| **Resource labels** | `name=value` pairs to tag the AWS resources created by this compute environment.| + + +## Add pipeline to Platform + +:::info +The [nf-core/rnaseq](https://github.com/nf-core/rnaseq) pipeline is a highly configurable and robust workflow designed to analyze RNA-Seq data. It performs quality control, alignment and quantification. + +![nf-core/rnaseq subway map](./_images/nf-core-rnaseq_metro_map_grey_static.svg) +::: + +[Seqera Pipelines](https://seqera.io/pipelines) is a curated collection of quality open-source pipelines that can be imported directly to your workspace Launchpad in Platform. Each pipeline includes a curated test dataset to use in a test run to confirm compute environment compatibility in just a few steps. 
+
+To use Seqera Pipelines to import the `nf-core/rnaseq` pipeline to your workspace:
+
+![Seqera Pipelines add to Launchpad](./_images/pipelines-add.gif)
+
+1. Search for `nf-core/rnaseq` and select **Launch** next to the pipeline name in the list. In the **Add pipeline** tab, select **Cloud** or **Enterprise** depending on your Platform account type, then provide the information needed for Seqera Pipelines to access your Platform instance:
+    - **Seqera Cloud**: Paste your Platform **Access token** and select **Next**.
+    - **Seqera Enterprise**: Specify the **Seqera Platform URL** (hostname) and **Base API URL** for your Enterprise instance, then paste your Platform **Access token** and select **Next**.
+    :::tip
+    If you do not have a Platform access token, select **Get your access token from Seqera Platform** to open the Access tokens page in a new browser tab.
+    :::
+1. Select your Platform **Organization**, **Workspace**, and **Compute environment** for the imported pipeline.
+1. (Optional) Customize the **Pipeline Name** and **Pipeline Description**.
+1. Select **Add Pipeline**.
+
+:::info
+To add a custom pipeline not listed in Seqera Pipelines to your Platform workspace, see [Add pipelines](./quickstart-demo/add-pipelines.mdx) for manual Launchpad instructions.
+:::
+
+## Pipeline input data
+
+The [nf-core/rnaseq](https://github.com/nf-core/rnaseq) pipeline works with input datasets (samplesheets) containing sample names, FASTQ file locations (paths to FASTQ files in cloud or local storage), and strandedness. For example, the dataset used in the `test_full` profile is derived from the publicly available iGenomes collection of datasets, commonly used in bioinformatics analyses.
+
+This dataset represents RNA-Seq samples from various human cell lines (GM12878, K562, MCF7, and H1) with biological replicates, stored in an AWS S3 bucket (`s3://ngi-igenomes`) as part of the iGenomes resource.
These RNA-Seq datasets consist of paired-end sequencing reads, which can be used to study gene expression patterns in different cell types. + +
+ **nf-core/rnaseq test_full profile dataset** + + | sample | fastq_1 | fastq_2 | strandedness | + |--------|---------|---------|--------------| + | GM12878_REP1 | s3://ngi-igenomes/test-data/rnaseq/SRX1603629_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX1603629_T1_2.fastq.gz | reverse | + | GM12878_REP2 | s3://ngi-igenomes/test-data/rnaseq/SRX1603630_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX1603630_T1_2.fastq.gz | reverse | + | K562_REP1 | s3://ngi-igenomes/test-data/rnaseq/SRX1603392_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX1603392_T1_2.fastq.gz | reverse | + | K562_REP2 | s3://ngi-igenomes/test-data/rnaseq/SRX1603393_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX1603393_T1_2.fastq.gz | reverse | + | MCF7_REP1 | s3://ngi-igenomes/test-data/rnaseq/SRX2370490_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX2370490_T1_2.fastq.gz | reverse | + | MCF7_REP2 | s3://ngi-igenomes/test-data/rnaseq/SRX2370491_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX2370491_T1_2.fastq.gz | reverse | + | H1_REP1 | s3://ngi-igenomes/test-data/rnaseq/SRX2370468_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX2370468_T1_2.fastq.gz | reverse | + | H1_REP2 | s3://ngi-igenomes/test-data/rnaseq/SRX2370469_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX2370469_T1_2.fastq.gz | reverse | + +
+ +In Platform, samplesheets and other data can be made easily accessible in one of two ways: +- Use **Data Explorer** to browse and interact with remote data from AWS S3, Azure Blob Storage, and Google Cloud Storage repositories, directly in your organization workspace. +- Use **Datasets** to upload structured data to your workspace in CSV (Comma-Separated Values) or TSV (Tab-Separated Values) format. + +
+ **Add a cloud bucket via Data Explorer** + + Private cloud storage buckets accessible with the credentials in your workspace are added to Data Explorer automatically by default. However, you can also add custom directory paths within buckets to your workspace to simplify direct access. + + To add individual buckets (or directory paths within buckets): + + ![Add public bucket](./quickstart-demo/assets/data-explorer-add-bucket.gif) + + 1. From the **Data Explorer** tab, select **Add cloud bucket**. + 1. Specify the bucket details: + - The cloud **Provider**. + - An existing cloud **Bucket path**. + - A unique **Name** for the bucket. + - The **Credentials** used to access the bucket. For public cloud buckets, select **Public**. + - An optional bucket **Description**. + 1. Select **Add**. + + You can now select data directly from this bucket as input when launching your pipeline, without the need to interact with cloud consoles or CLI tools. + +
+ +
+ **Add a dataset** + + From the **Datasets** tab, select **Add Dataset**. + + ![Add a dataset](./quickstart-demo/assets/sp-cloud-add-a-dataset.gif) + + Specify the following dataset details: + + - A **Name** for the dataset, such as `nf-core-rnaseq-dataset`. + - A **Description** for the dataset. + - Select the **First row as header** option to prevent Platform from parsing the header row of the samplesheet as sample data. + - Select **Upload file** and browse to your CSV or TSV samplesheet file in local storage, or simply drag and drop it into the box. + + The dataset is now listed in your organization workspace datasets and can be selected as input when launching your pipeline. + + :::info + Platform does not store the data used for analysis in pipelines. The dataset must specify the locations of data stored on your own infrastructure. + ::: + +
+ +## Launch pipeline + +:::note +This guide is based on version 3.15.1 of the nf-core/rnaseq pipeline. Launch form parameters and tools may differ in other versions. +::: + +With your compute environment created, nf-core/rnaseq added to your workspace Launchpad, and your samplesheet accessible in Platform, you are ready to launch your pipeline. Navigate to the Launchpad and select **Launch** next to `nf-core-rnaseq` to open the launch form. + +The launch form consists of **General config**, **Run parameters**, and **Advanced options** sections to specify your run parameters before execution, and an execution summary. Use section headings or select the **Previous** and **Next** buttons at the bottom of the page to navigate between sections. + +### General config + +![General config tab](./_images/launch-form-2.gif) + +- **Pipeline to launch**: The pipeline Git repository name or URL. For saved pipelines, this is prefilled and cannot be edited. +- **Revision number**: A valid repository commit ID, tag, or branch name. For saved pipelines, this is prefilled and cannot be edited. +- **Config profiles**: One or more [configuration profile](https://www.nextflow.io/docs/latest/config.html#config-profiles) names to use for the execution. Config profiles must be defined in the `nextflow.config` file in the pipeline repository. +- **Workflow run name**: An identifier for the run, pre-filled with a random name. This can be customized. +- **Labels**: Assign new or existing [labels](../labels/overview.mdx) to the run. +- **Compute environment**: Your AWS Batch compute environment. +- **Work directory**: The cloud storage path where pipeline scratch data is stored. Platform will create a scratch sub-folder if only a cloud bucket location is specified. + :::note + The credentials associated with the compute environment must have access to the work directory. 
+ ::: + +### Run parameters + +![Run parameters](./_images/launch-form-3.gif) + +There are three ways to enter **Run parameters** prior to launch: + +- The **Input form view** displays form fields to enter text or select attributes from lists, and browse input and output locations with [Data Explorer](../data/data-explorer.mdx). +- The **Config view** displays raw configuration text that you can edit directly. Select JSON or YAML format from the **View as** list. +- **Upload params file** allows you to upload a JSON or YAML file with run parameters. + +Platform uses the `nextflow_schema.json` file in the root of the pipeline repository to dynamically create a form with the necessary pipeline parameters. + +Specify your pipeline input and output and modify other pipeline parameters as needed. + +
+  **input**
+
+  Use **Browse** to select your pipeline input data:
+
+  - In the **Data Explorer** tab, select the existing cloud bucket that contains your samplesheet, then browse or search for the samplesheet file. Select the chain icon to copy the file path, close the data selection window, and paste the path in the input field.
+  - In the **Datasets** tab, search for and select your existing dataset.
+
+</details>
+
+  **outdir**
+
+  Use the `outdir` parameter to specify where the pipeline outputs are published. Specify a unique `outdir` for each pipeline run — if you reuse the same `outdir`, the results of previous runs are overwritten.
+
+  **Browse** and copy cloud storage directory paths using Data Explorer, or enter a path manually.
+
+</details>
+ +Modify other parameters to customize the pipeline execution through the parameters form. For example, under **Read trimming options**, change the `trimmer` and select `fastp` instead of `trimgalore`. + +![Read trimming options](./quickstart-demo/assets/trimmer-settings.png) + +### Advanced settings + +- Use [resource labels](../resource-labels/overview.mdx) to tag the computing resources created during the workflow execution. While resource labels for the run are inherited from the compute environment and pipeline, workspace admins can override them from the launch form. Applied resource label names must be unique. +- [Pipeline secrets](../secrets/overview.mdx) store keys and tokens used by workflow tasks to interact with external systems. Enter the names of any stored user or workspace secrets required for the workflow execution. +- See [Advanced options](../launch/advanced.mdx) for more details. + +After you have filled the necessary launch details, select **Launch**. The **Runs** tab shows your new run in a **submitted** status at the top of the list. Select the run name to navigate to the [**View Workflow Run**](../monitoring/overview.mdx) page and view the configuration, parameters, status of individual tasks, and run report. + +
+ **Run monitoring** + + Select your new run from the **Runs** tab list to view the run details. + + #### Run details page + + As the pipeline runs, run details will populate with the following tabs: + + - **Command-line**: The Nextflow command invocation used to run the pipeline. This includes details about the pipeline version (`-r` flag) and profile, if specified (`-profile` flag). + - **Parameters**: The exact set of parameters used in the execution. This is helpful for reproducing the results of a previous run. + - **Resolved Nextflow configuration**: The full Nextflow configuration settings used for the run. This includes parameters, but also settings specific to task execution (such as memory, CPUs, and output directory). + - **Execution Log**: A summarized Nextflow log providing information about the pipeline and the status of the run. + - **Datasets**: Link to datasets, if any were used in the run. + - **Reports**: View pipeline outputs directly in the Platform. + + ![View the nf-core/rnaseq run](./quickstart-demo/assets/sp-cloud-run-info.gif) + + #### View reports + + Most Nextflow pipelines generate reports or output files which are useful to inspect at the end of the pipeline execution. Reports can contain quality control (QC) metrics that are important to assess the integrity of the results. + + ![Reports tab](./quickstart-demo/assets/reports-tab.png) + + For example, for the nf-core/rnaseq pipeline, view the [MultiQC](https://docs.seqera.io/multiqc) report generated. MultiQC is a helpful reporting tool to generate aggregate statistics and summaries from bioinformatics tools. + + ![Reports MultiQC preview](./quickstart-demo/assets/reports-preview.png) + + The paths to report files point to a location in cloud storage (in the `outdir` directory specified during launch), but you can view the contents directly and download each file without navigating to the cloud or a remote filesystem. 
+ + :::info + See [Reports](../reports/overview.mdx) for more information. + ::: + + #### View general information + + The run details page includes general information about who executed the run, when it was executed, the Git commit ID and/or tag used, and additional details about the compute environment and Nextflow version used. + + ![General run information](./quickstart-demo/assets/general-run-details.gif) + + #### View details for a task + + Scroll down the page to view: + + - The progress of individual pipeline **Processes** + - **Aggregated stats** for the run (total walltime, CPU hours) + - **Workflow metrics** (CPU efficiency, memory efficiency) + - A **Task details** table for every task in the workflow + + The task details table provides further information on every step in the pipeline, including task statuses and metrics. + + #### Task details + + Select a task in the task table to open the **Task details** dialog. The dialog has three tabs: + + ![Task details window](./quickstart-demo/assets/task-details.gif) + + - The **About** tab contains extensive task execution details. + - The **Execution log** tab provides a real-time log of the selected task's execution. Task execution and other logs (such as stdout and stderr) are available for download from here, if still available in your compute environment. + - The **Data Explorer** tab allows you to view the task working directory directly in Platform. + + Nextflow hash-addresses each task of the pipeline and creates unique directories based on these hashes. Data Explorer allows you to view the log files and output files generated for each task in its working directory, directly within Platform. You can view, download, and retrieve the link for these intermediate files in cloud storage from the **Data Explorer** tab to simplify troubleshooting. + + ![Task Data Explorer](./quickstart-demo/assets/sp-cloud-task-data-explorer.gif) + +
+ +## Tertiary analysis with Data Studios + +**Data Studios** streamlines the process of creating interactive analysis environments for Platform users. With built-in templates for platforms like Jupyter Notebook, RStudio, and VSCode, creating a data studio is as simple as adding and sharing pipelines or datasets. The data studio URL can also be shared with any user with the [Connect role](../orgs-and-teams/roles.mdx) for real-time access and collaboration. + +For the purposes of this guide, an RStudio environment will be used to normalize the pipeline output data, perform differential expression analysis, and visualize the data with exploratory plots. + +### Prepare your data + +#### Gene counts + +Salmon is the default tool used during the `pseudo-aligner` step of the nf-core/rnaseq pipeline. In the pipeline output data, the `/salmon` directory contains the tool's output, including a `salmon.merged.gene_counts_length_scaled.tsv` file. + +#### Sample info + +The analysis script provided in this section requires a sample information file to parse the counts data in the `salmon.merged.gene_counts_length_scaled.tsv` file. nf-core/rnaseq does not produce this sample information file automatically. See below to create a sample information file based on the genes in your `salmon.merged.gene_counts_length_scaled.tsv` file. + +
+ **Create a sample info file** + + 1. Note the names of the columns (excluding the first column, which typically contains gene IDs) in your `salmon.merged.gene_counts_length_scaled.tsv` file. These are your sample names. + 1. Identify the group or condition that each sample belongs to. This information should come from your experimental design. + 1. Create a new text file named `sampleinfo.txt`, with two columns: + - First column header: Sample + - Second column header: Group + 1. For each sample in your `salmon.merged.gene_counts_length_scaled.tsv` file: + - In the "Sample" column, write the exact sample name as it appears in the gene counts file. + - In the "Group" column, write the corresponding group name. + + For example, for the dataset used in a `test_full` run of nf-core/rnaseq, the `sampleinfo.txt` looks like this: + + ``` + Sample Group + GM12878_REP1 GM12878 + GM12878_REP2 GM12878 + H1_REP1 H1 + H1_REP2 H1 + K562_REP1 K562 + K562_REP2 K562 + MCF7_REP1 MCF7 + MCF7_REP2 MCF7 + ``` + + To make your `sampleinfo.txt` file accessible to the data studio, upload it to the directory that contains your pipeline output data. Select this bucket or directory when you **Mount data** during data studio setup. + +
+ +### Create an RStudio analysis environment with Data Studios + +![Add data studio](./_images/create-ds.gif) + +From the **Data Studios** tab, select **Add a data studio** and complete the following: +- Select the latest **RStudio** container image template from the list. +- Select your AWS Batch compute environment. +:::note +Data studios compete for computing resources when sharing compute environments. Ensure your compute environment has sufficient resources to run both your pipelines and data studio sessions. The default CPU and memory allocation for a data studio is 2 CPUs and 8192 MB RAM. +::: +- Mount data using Data Explorer: Mount the S3 bucket or directory path that contains the pipeline work directory of your RNA-Seq run. +- Optional: Enter CPU and memory allocations. The default values are 2 CPUs and 8192 MB memory (RAM). +- Select **Add**. +- Once the data studio has been created, select the options menu next to it and select **Start**. +- When the data studio is in a running state, **Connect** to it. + +### Perform the analysis and explore results + +The RStudio environment can be configured with the packages you wish to install and the R script you wish to run. For the purposes of this guide, run the following scripts in the RStudio console to install the necessary packages and perform the analysis: + +1. Install and load the necessary packages and libraries: + + ```r + # Install required packages + if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + BiocManager::install(c("limma", "edgeR", "ggplot2", "gplots")) + + # Load required libraries + library(limma) + library(edgeR) + library(ggplot2) + library(gplots) + ``` + +1. Read and convert the count data and sample information: + + :::info + Replace `` and `` with the paths to your `salmon.merged.gene_counts_length_scaled.tsv` and `sampleinfo.txt` files. 
+   :::
+
+   ```r
+   # Read in the count data
+   counts <- read.delim(
+     file = "/workspace/data/salmon.merged.gene_counts_length_scaled.tsv",
+     row.names = 1
+   )
+
+   # Remove the gene_name column if it exists
+   if ("gene_name" %in% colnames(counts)) {
+     counts <- counts[, -which(colnames(counts) == "gene_name")]
+   }
+
+   # Convert to matrix
+   counts <- as.matrix(counts)
+
+   # Read in the sample information
+   targets <- read.table(
+     file = "/workspace/data/sampleinfo.txt",
+     header = TRUE,
+     stringsAsFactors = FALSE,
+     sep = "",
+     check.names = FALSE
+   )
+
+   # Ensure column names are correct
+   colnames(targets) <- c("Sample", "Group")
+   ```
+
+1. Create a DGEList object and filter out low-count genes:
+
+   ```r
+   # Create a DGEList object
+   y <- DGEList(counts, group = targets$Group)
+
+   # Calculate CPM (counts per million) values
+   mycpm <- cpm(y)
+
+   # Filter low count genes
+   thresh <- mycpm > 0.5
+   keep <- rowSums(thresh) >= 2
+   y <- y[keep, , keep.lib.sizes = FALSE]
+   ```
+
+1. Normalize the data:
+
+   ```r
+   # Normalize the data
+   y <- calcNormFactors(y)
+   ```
+
+1. Print a summary of the filtered data:
+
+   ```r
+   # Print summary of filtered data
+   print(dim(y))
+   print(y$samples)
+   ```
+
+1. Create an MDS plot, displayed in RStudio plots viewer (`a`) and saved as a PNG file (`b`):
+
+   :::info
+   MDS plots are used to visualize the overall similarity between RNA-Seq samples based on their gene expression profiles, helping to identify sample clusters and potential batch effects.
+   :::
+
+   ```r
+   # Create MDS plot
+   # a. Display in RStudio
+   plotMDS(y, col = as.numeric(factor(targets$Group)), labels = targets$Group)
+   legend(
+     "topright",
+     legend = levels(factor(targets$Group)),
+     col = 1:nlevels(factor(targets$Group)),
+     pch = 20
+   )
+
+   # b. 
Save MDS plot to file (change `png` to `pdf` to create a PDF file) + png("MDS_plot.png", width = 800, height = 600) + plotMDS(y, col = as.numeric(factor(targets$Group)), labels = targets$Group) + legend( + "topright", + legend = levels(factor(targets$Group)), + col = 1:nlevels(factor(targets$Group)), + pch = 20 + ) + dev.off() + ``` + +1. Perform differential expression analysis: + + ```r + # Design matrix + design <- model.matrix( ~ 0 + group, data = y$samples) + colnames(design) <- levels(y$samples$group) + + # Estimate dispersion + y <- estimateDisp(y, design) + + # Fit the model + fit <- glmQLFit(y, design) + + # Define contrasts + my.contrasts <- makeContrasts( + GM12878vsH1 = GM12878 - H1, + GM12878vsK562 = GM12878 - K562, + GM12878vsMCF7 = GM12878 - MCF7, + H1vsK562 = H1 - K562, + H1vsMCF7 = H1 - MCF7, + K562vsMCF7 = K562 - MCF7, + levels = design + ) + + # Perform differential expression analysis for each contrast + results <- lapply(colnames(my.contrasts), function(contrast) { + qlf <- glmQLFTest(fit, contrast = my.contrasts[, contrast]) + topTags(qlf, n = Inf) + }) + names(results) <- colnames(my.contrasts) + ``` + + :::info + This script is written for the analysis of human data, based on nf-core/rnaseq's `test_full` dataset. To adapt the script for your data, modify the contrasts based on the comparisons you want to make between your sample groups: + + ```r + my.contrasts <- makeContrasts( + Sample1vsSample2 = Sample1 - Sample2, + Sample2vsSample3 = Sample2 - Sample3, + ... + levels = design + ) + ``` + ::: + +1. 
Print the number of differentially expressed genes for each comparison and save the results to CSV files: + + ```r + # Print the number of differentially expressed genes for each comparison + for (name in names(results)) { + de_genes <- sum(results[[name]]$table$FDR < 0.05) + print(paste("Number of DE genes in", name, ":", de_genes)) + } + + # Save results + for (name in names(results)) { + write.csv(results[[name]], file = paste0("DE_genes_", name, ".csv")) + } + ``` + +1. Create volcano plots for each differential expression comparison, displayed in RStudio plots viewer and saved as PNG files: + + :::info + Volcano plots in RNA-Seq analysis display the magnitude of gene expression changes (log2 fold change) against their statistical significance. This allows for quick identification of significantly up- and down-regulated genes between two conditions. + ::: + + ```r + # Create volcano plots for differential expression comparisons + # Function to create a volcano plot + create_volcano_plot <- function(res, title) { + ggplot(res$table, aes(x = logFC, y = -log10(FDR))) + + geom_point(aes(color = FDR < 0.05 & + abs(logFC) > 1), size = 0.5) + + scale_color_manual(values = c("black", "red")) + + labs(title = title, x = "Log2 Fold Change", y = "-Log10 FDR") + + theme_minimal() + } + + # Create volcano plots for each comparison + for (name in names(results)) { + p <- create_volcano_plot(results[[name]], name) + # Display in RStudio + print(p) + # Save to file (change `.png` to `.pdf` to create PDF files) + ggsave( + paste0("volcano_plot_", name, ".png"), + p, + width = 8, + height = 6, + dpi = 300 + ) + } + ``` + +1. Create a heatmap of the top 50 differentially expressed genes: + + :::info + Heatmaps in RNA-Seq analysis provide a color-coded representation of gene expression levels across multiple samples or conditions, enabling the visualization of expression patterns and sample clustering based on similarity. 
+ ::: + + ```r + # Create a heatmap of top 50 differentially expressed genes + # Get top 50 DE genes from each comparison + top_genes <- unique(unlist(lapply(results, function(x) + rownames(x$table)[1:50]))) + + # Get log-CPM values for these genes + log_cpm <- cpm(y, log = TRUE) + top_gene_expr <- log_cpm[top_genes, ] + + # Print dimensions of top_gene_expr + print(dim(top_gene_expr)) + + # Create a color palette + my_palette <- colorRampPalette(c("blue", "white", "red"))(100) + + # Create a heatmap using heatmap.2 + # Display in RStudio + heatmap.2( + as.matrix(top_gene_expr), + scale = "row", + col = my_palette, + trace = "none", + dendrogram = "column", + margins = c(5, 10), + labRow = FALSE, + ColSideColors = rainbow(length(unique(y$samples$group)))[factor(y$samples$group)], + main = "Top DE Genes Across Samples" + ) + + # Save heatmap to file (change `png` to `pdf` to create a PDF file) + png("heatmap_top_DE_genes.png", + width = 1000, + height = 1200) + heatmap.2( + as.matrix(top_gene_expr), + scale = "row", + col = my_palette, + trace = "none", + dendrogram = "column", + margins = c(5, 10), + labRow = FALSE, + ColSideColors = rainbow(length(unique(y$samples$group)))[factor(y$samples$group)], + main = "Top DE Genes Across Samples" + ) + dev.off() + + # Print the number of top genes in the heatmap + print(paste("Number of top DE genes in heatmap:", length(top_genes))) + ``` + +![RStudio plots](./_images/rstudio.gif) + +### Collaborate in the data studio + +To share your results or allow colleagues to perform exploratory analysis, share a link to the data studio by selecting the options menu for the data studio you want to share, then select **Copy data studio URL**. With this link, other authenticated users with the **Connect** [role](../orgs-and-teams/roles.mdx) (or greater) can access the session directly. + +## RNA-Seq data and requirements + +RNA-Seq data typically consists of raw sequencing reads from high-throughput sequencing technologies. 
These reads are used to quantify gene expression levels and discover novel transcripts. A typical RNA-Seq dataset can range from a few GB to several hundred GB, depending on the number of samples and the sequencing depth. + +### nf-core/rnaseq performance in Platform + +The compute recommendations in this guide are based on internal benchmarking performed by Seqera. Benchmark runs of [nf-core/rnaseq](https://github.com/nf-core/rnaseq) used profile `test_full`, consisting of an input dataset with 16 FASTQ files (8 paired-end samples) and a total size of approximately 123.5 GB. + +This benchmark compares pipeline run metrics between single nf-core/rnaseq runs in an AWS Batch compute environment with Fusion file system and fast instance storage enabled (**Fusion** group) and an identical AWS Batch compute environment using S3 storage without Fusion (**AWS S3** group). + +### Pipeline steps and computing resource requirements + +The nf-core/rnaseq pipeline involves several key steps, each with distinct computational requirements. Resource needs in this table are based on the `test_full` runs detailed previously: + +| **Pipeline step** | **Tools** | **Resource needs** | **Description** | +|-------------------------------------|---------------------------|------------------------------|---------------------------------------------------------------------------------------------------| +| **Quality Control (QC)** | FastQC, MultiQC | Low-moderate CPU (50-200% single-core usage), low memory (1-7 GB peak) | Initial quality checks of raw reads to assess sequencing quality and identify potential issues. | +| **Read Trimming** | Trim Galore! | High CPU (up to 700% single-core usage), low memory (6 GB peak) | Removal of adapter sequences and low-quality bases to prepare reads for alignment. 
| +| **Read Alignment** | HISAT2, STAR | Moderate-high CPU (480-600% single-core usage), high memory (36 GB peak) | Alignment of trimmed reads to a reference genome, typically the most resource-intensive step. | +| **Pseudoalignment** | Salmon, Kallisto | Moderate-high CPU (420% single-core usage), moderate memory (18 GB peak) | A faster, more accurate method of gene expression quantification than alignment using read compatibility. | +| **Quantification** | featureCounts, Salmon | Moderate-high CPU (500-600% single-core usage), moderate memory (18 GB peak) | Counting the number of reads mapped to each gene or transcript to measure expression levels. | +| **Differential Expression Analysis**| DESeq2, edgeR | High CPU (650% single-core usage), low memory (up to 2 GB peak ) | Statistical analysis to identify genes with significant changes in expression between conditions. | + +#### Overall run metrics + +**Total pipeline run cost (USD)**: + +- Fusion file system with fast instance storage: $34.90 +- Plain S3 storage without Fusion: $58.40 + +**Pipeline runtime**: + +The Fusion file system used with NVMe instance storage contributed to a 34% improvement in total pipeline runtime and a 49% reduction in CPU hours. + +![Run metrics overview](./_images/cpu-table-2.png) + +#### Process run time + +The Fusion file system demonstrates significant performance improvements for most processes in the nf-core/rnaseq pipeline, particularly for I/O-intensive tasks: + +- The most time-consuming processes see improvements of 36.07% to 70.15%, saving hours of runtime in a full pipeline execution. +- Most processes show significant performance improvements with Fusion, with time savings ranging from 35.57% to 99.14%. +- The most substantial improvements are seen in I/O-intensive tasks like SAMTOOLS_FLAGSTAT (95.20% faster) and SAMTOOLS_IDXSTATS (99.14% faster). +- SALMON_INDEX shows a notable 70.15% improvement, reducing runtime from 102.18 minutes to 30.50 minutes. 
+- STAR_ALIGN_IGENOMES, one of the most time-consuming processes, is 53.82% faster with Fusion, saving nearly an hour of runtime. + +![Average runtime of nf-core/rnaseq processes for eight samples using the Fusion file system and plain S3 storage. Error bars = standard deviation of the mean.](./_images/process-runtime-2.png) + +| Process | S3 Runtime (min) | Fusion Runtime (min) | Time Saved (min) | Improvement (%) | +|---------|------------------|----------------------|------------------|-----------------| +| SAMTOOLS_IDXSTATS | 18.54 | 0.16 | 18.38 | 99.14% | +| SAMTOOLS_FLAGSTAT | 22.94 | 1.10 | 21.84 | 95.20% | +| SAMTOOLS_STATS | 22.54 | 3.18 | 19.36 | 85.89% | +| SALMON_INDEX | 102.18 | 30.50 | 71.68 | 70.15% | +| BEDTOOLS_GENOMECOV_FW | 19.53 | 7.10 | 12.43 | 63.64% | +| BEDTOOLS_GENOMECOV_REV | 18.88 | 7.35 | 11.53 | 61.07% | +| PICARD_MARKDUPLICATES | 102.15 | 41.60 | 60.55 | 59.27% | +| STRINGTIE | 17.63 | 7.60 | 10.03 | 56.89% | +| RSEQC_READDISTRIBUTION | 16.33 | 7.19 | 9.14 | 55.97% | +| STAR_ALIGN_IGENOMES | 106.42 | 49.15 | 57.27 | 53.82% | +| SALMON_QUANT | 30.83 | 15.58 | 15.25 | 49.46% | +| RSEQC_READDUPLICATION | 19.42 | 12.15 | 7.27 | 37.44% | +| QUALIMAP_RNASEQ | 141.40 | 90.40 | 51.00 | 36.07% | +| TRIMGALORE | 51.22 | 33.00 | 18.22 | 35.57% | +| DUPRADAR | 49.04 | 77.81 | -28.77 | -58.67% | + +
+ **Pipeline optimization** + + Seqera Platform's task-level resource usage metrics allow you to determine the resources requested for a task and what was actually used. This information helps you fine-tune your configuration more accurately. + + However, manually adjusting resources for every task in your pipeline is impractical. Instead, you can leverage the pipeline optimization feature on the Launchpad. + + Pipeline optimization analyzes resource usage data from previous runs to optimize the resource allocation for future runs. After a successful run, optimization becomes available, indicated by the lightbulb icon next to the pipeline turning black. + + #### Optimize nf-core/rnaseq + + Select the lightbulb icon next to nf-core/rnaseq in your workspace Launchpad to view the optimized profile. You have the flexibility to tailor the optimization's target settings and incorporate a retry strategy as needed. + + #### View optimized configuration + + When you select the lightbulb, you can access an optimized configuration profile in the second tab of the **Customize optimization profile** window. + + This profile consists of Nextflow configuration settings for each process and each resource directive (where applicable): **cpus**, **memory**, and **time**. The optimized setting for a given process and resource directive is based on the maximum use of that resource across all tasks in that process. + + Once optimization is selected, subsequent runs of that pipeline will inherit the optimized configuration profile, indicated by the black lightbulb icon with a checkmark. + + :::info + Optimization profiles are generated from one run at a time, defaulting to the most recent run, and _not_ an aggregation of previous runs. 
+ ::: + + ![Optimized configuration](./quickstart-demo/assets/optimize-configuration.gif) + + Verify the optimized configuration of a given run by inspecting the resource usage plots for that run and these fields in the run's task table: + + | Description | Key | + | ------------ | ---------------------- | + | CPU usage | `pcpu` | + | Memory usage | `peakRss` | + | Runtime | `start` and `complete` | + +
\ No newline at end of file diff --git a/platform_versioned_docs/version-24.1/compute-envs/azure-batch.mdx b/platform_versioned_docs/version-24.1/compute-envs/azure-batch.mdx index d2efd23ad..e52e967ac 100644 --- a/platform_versioned_docs/version-24.1/compute-envs/azure-batch.mdx +++ b/platform_versioned_docs/version-24.1/compute-envs/azure-batch.mdx @@ -8,116 +8,172 @@ tags: [azure, batch, compute environment] :::note This guide assumes you already have an Azure account with a valid Azure Subscription. For details, visit [Azure Free Account][az-create-account]. -Ensure you have sufficient permissions to create resource groups, an Azure Storage account, and a Batch account. +Ensure you have sufficient permissions to create resource groups, an Azure Storage account, and an Azure Batch account. ::: -## Concepts +## Azure concepts -### Accounts +#### Regions -Seqera Platform relies on an existing Azure Storage and Azure Batch account. You need at least 1 valid Azure Storage account and Azure Batch account within your subscription. +Azure regions are specific geographic locations around the world where Microsoft has established data centers to host its cloud services. Each Azure region is a collection of data centers that provide users with high availability, fault tolerance, and low latency for cloud services. Each region offers a wide range of Azure services that can be chosen to optimize performance, ensure data residency compliance, and meet regulatory requirements. Azure regions also enable redundancy and disaster recovery options by allowing resources to be replicated across different regions, enhancing the resilience of applications and data. -Azure uses 'accounts' for each service. For example, an [Azure Storage account][az-learn-storage] will house a collection of blob containers, file shares, queues, and tables. 
While you can have multiple Azure Storage and Azure Batch accounts in an Azure subscription, each compute environment on the platform can only use one of each (one storage and one Batch account). You can set up multiple compute environments on the platform with different credentials, storage accounts, and Batch accounts. +#### Resource groups -### Resource group +An Azure resource group is a logical container that holds related Azure resources such as virtual machines, storage accounts, databases, and more. A resource group serves as a management boundary to organize, deploy, monitor, and manage the resources within it as a single entity. Resources in a resource group share the same lifecycle, meaning they can be deployed, updated, and deleted together. This also enables easier access control, monitoring, and cost management, making resource groups a foundational element in organizing and managing cloud infrastructure in Azure. -To create Azure Batch and Azure Storage accounts, first create a [resource group][az-learn-rg] in your preferred region. +#### Accounts -:::note -A resource group can be created while creating an Azure Storage Account or Azure Batch account. -::: +Azure uses accounts for each service. For example, an [Azure Storage account][az-learn-storage] will house a collection of blob containers, file shares, queues, and tables. An Azure subscription can have multiple Azure Storage and Azure Batch accounts - however, a Platform compute environment can only use one of each. Multiple Platform compute environments can be created to use separate credentials, Azure Storage accounts, and Azure Batch accounts. -### Regions +#### Service principals -Azure resources can operate across regions, but this incurs additional costs and security requirements. It is recommended to place all resources in the same region. See the [Azure product page on data residency][az-data-residency] for more information. 
+An Azure service principal is an identity created specifically for applications, hosted services, or automated tools to access Azure resources. It acts like a user identity with a defined set of permissions, enabling resources authenticated through the service principal to perform actions within the Azure account. The platform can utilize an Azure service principal to authenticate and access Azure Batch for job execution and Azure Storage for data management. -## Resource group +## Create Azure resources -A resource group in Azure is a unit of related resources in Azure. As a rule of thumb, resources that have a similar lifecycle should be within the same resource group. You can delete a resource group and all associated components together. We recommend placing all platform compute resources in the same resource group, but this is not necessary. +### Resource group -### Create a resource group +Create a resource group to link your Azure Batch and Azure Storage account: -1. Log in to your Azure account, go to the [Create Resource group][az-create-rg] page, and select **Create new resource group**. -2. Enter a name for the resource group, e.g., _towerrg_. -3. Choose the preferred region. -4. Select **Review and Create** to proceed. -5. Select **Create**. +:::note +A resource group can be created while creating an Azure Storage account or Azure Batch account. +::: -## Storage account +1. Log in to your Azure account, go to the [Create Resource group][az-create-rg] page, and select **Create new resource group**. +1. Enter a name for the resource group, such as _seqeracompute_. +1. Choose the preferred region. +1. Select **Review and Create** to proceed. +1. Select **Create**. -After creating a resource group, set up an [Azure storage account][az-learn-storage]. +### Storage account -### Create a storage account +After creating a resource group, set up an [Azure Storage account][az-learn-storage]: 1. 
Log in to your Azure account, go to the [Create storage account][az-create-storage] page, and select **Create a storage account**. - :::note If you haven't created a resource group, you can do so now. ::: - -2. Enter a name for the storage account (e.g., _towerrgstorage_). -3. Choose the preferred region (same as the Batch account). -4. The platform supports any performance or redundancy settings — select the most appropriate settings for your use case. -5. Select **Next: Advanced**. -6. Enable _storage account key access_. -7. Select **Next: Networking**. +1. Enter a name for the storage account, such as _seqeracomputestorage_. +1. Choose the preferred region. This must be the same region as the Batch account. +1. Platform supports all performance or redundancy settings — select the most appropriate settings for your use case. +1. Select **Next: Advanced**. +1. Enable _storage account key access_. +1. Select **Next: Networking**. - Enable public access from all networks. You can enable public access from selected virtual networks and IP addresses, but you will be unable to use Forge to create compute resources. Disabling public access is not supported. -8. Select **Data protection**. +1. Select **Data protection**. - Configure appropriate settings. All settings are supported by the platform. -9. Select **Encryption**. +1. Select **Encryption**. - Only Microsoft-managed keys (MMK) are supported. -10. In **tags**, add any required tags for the storage account. -11. Select **Review and Create**. -12. Select **Create** to create the Azure Storage account. -13. You will need at least one blob storage container to act as a working directory for Nextflow. -14. Go to your new storage account and select **+ Container** to create a new Blob storage container. A new container dialogue will open. Enter a suitable name, e.g., _towerrgstorage-container_. -15. Go to the **Access Keys** section of your new storage account (_towerrgstorage_ in this example). -16. 
Store the access keys for your Azure Storage account, to be used when you create a Seqera compute environment. +1. In **tags**, add any required tags for the storage account. +1. Select **Review and Create**. +1. Select **Create** to create the Azure Storage account. + - You will need at least one Blob Storage container to act as a working directory for Nextflow. +1. Go to your new storage account and select **+ Container** to create a new Blob Storage container. A new container dialogue will open. Enter a suitable name, such as _seqeracomputestorage-container_. +1. Go to the **Access Keys** section of your new storage account (_seqeracomputestorage_ in this example). +1. Store the access keys for your Azure Storage account, to be used when you create a Seqera compute environment. :::caution Blob container storage credentials are associated with the Batch pool configuration. Avoid changing these credentials in your Seqera instance after you have created the compute environment. ::: -## Batch account - -After you have created a resource group and storage account, create a [Batch account][az-learn-batch]. +### Batch account -### Create a Batch account +After you have created a resource group and Storage account, create a [Batch account][az-learn-batch]: 1. Log in to your Azure account and select **Create a batch account** on [this page][az-create-batch]. -2. Select the existing resource group or create a new one. -3. Enter a name for the Batch account, e.g., _towerrgbatch_. -4. Choose the preferred region (same as the storage account). -5. Select **Advanced**. -6. For _Pool allocation mode_, select Batch service. -7. For _Authentication mode_, ensure _Shared Key_ is selected. -8. Select **Networking**. Ensure networking access is sufficient for the platform and any additional required resources. -9. In **tags**, add any required tags for the Batch account. -10. Select **Review and Create**. -11. Select **Create**. -12. 
Go to your new Batch account, then select **Access Keys**. -13. Store the access keys for your Azure Batch account, to be used when you create a Seqera compute environment. - +1. Select the existing resource group or create a new one. +1. Enter a name for the Batch account, such as _seqeracomputebatch_. +1. Choose the preferred region. This must be the same region as the Storage account. +1. Select **Advanced**. +1. For **Pool allocation mode**, select **Batch service**. +1. For **Authentication mode**, select _Shared Key_. +1. Select **Networking**. Ensure networking access is sufficient for Platform and any additional required resources. +1. Add any **Tags** to the Batch account, if needed. +1. Select **Review and Create**. +1. Select **Create**. +1. Go to your new Batch account, then select **Access Keys**. +1. Store the access keys for your Azure Batch account, to be used when you create a Seqera compute environment. :::caution A newly-created Azure Batch account may not be entitled to create virtual machines without making a service request to Azure. See [Azure Batch service quotas and limits][az-batch-quotas] for more information. ::: - -14. Select the **+ Quotas** tab of the Azure Batch account to check and increase existing quotas if necessary. -15. Select **+ Request quota increase** and add the quantity of resources you require. Here is a brief guideline: - +1. Select the **+ Quotas** tab of the Azure Batch account to check and increase existing quotas if necessary. +1. Select **+ Request quota increase** and add the quantity of resources you require. Here is a brief guideline: - **Active jobs and schedules**: Each Nextflow process will require an active Azure Batch job per pipeline while running, so increase this number to a high level. See [here][az-learn-jobs] to learn more about jobs in Azure Batch. - **Pools**: Each platform compute environment requires one Azure Batch pool. Each pool is composed of multiple machines of one virtual machine size. 
- :::note To use separate pools for head and compute nodes, see [this FAQ entry](../faqs.mdx#azure). ::: - - **Batch accounts per region per subscription**: Set this to the number of Azure Batch accounts per region per subscription. Only one is required. - **Spot/low-priority vCPUs**: Platform does not support spot or low-priority machines when using Forge, so when using Forge this number can be zero. When manually setting up a pool, select an appropriate number of concurrent vCPUs here. - **Total Dedicated vCPUs per VM series**: See the Azure documentation for [virtual machine sizes][az-vm-sizes] to help determine the machine size you need. We recommend the latest version of the ED series available in your region as a cost-effective and appropriately-sized machine for running Nextflow. However, you will need to select alternative machine series that have additional requirements, such as those with additional GPUs or faster storage. Increase the quota by the number of required concurrent CPUs. In Azure, machines are charged per cpu minute so there is no additional cost for a higher number. -### Compute environment +### Credentials + +There are two types of Azure credentials available: access keys and Entra service principals. + +Access keys are simple to use but have several limitations: +- Access keys are long-lived. +- Access keys provide full access to the Azure Storage and Azure Batch accounts. +- Azure allows only two access keys per account, making them a single point of failure. + +Entra service principals are accounts which can be granted access to Azure Batch and Azure Storage resources: +- Service principals enable role-based access control with more precise permissions. +- Service principals map to a many-to-many relationship with Azure Batch and Azure Storage accounts. +- Some Azure Batch features are only available when using a service principal. + +:::note +The two Azure credential types use different authentication methods. 
You can add more than one credential to a workspace, but Platform compute environments use only one credential at any given time. While separate credentials can be used by separate compute environments concurrently, they are not cross-compatible — access granted by one credential will not be shared with the other. +::: + +#### Access keys + +:::info +Batch Forge compute environments must use access keys for authentication. Service principals are only supported in manual compute environments. +::: + +To create an access key: + +1. Navigate to the Azure Portal and sign in. +1. Locate the Azure Batch account and select **Keys** under **Account management**. The Primary and Secondary keys are listed here. Copy one of the keys and save it in a secure location for later use. +1. Locate the Azure Storage account and, under the **Security and Networking** section, select **Access keys**. Key1 and Key2 options are listed here. Copy one of them and save it in a secure location for later use. +1. In your Platform workspace **Credentials** tab, select the **Add credentials** button and complete the following fields: + - Enter a **Name** for the credentials + - **Provider**: Azure + - Select the **Shared key** tab + - Add the **Batch account** and **Blob Storage account** names and access keys to the relevant fields. +1. Delete the copied keys from their temporary location after they have been added to a credential in Platform. + +#### Entra service principal + +:::info +Batch Forge compute environments must use access keys for authentication. Service principals are only supported in manual compute environments. + +The use of Entra service principals in manual compute environments requires the use of a [managed identity](#managed-identity). +::: + +See [Create a service principal][az-create-sp] for more details. + +To create an Entra service principal: + +1. In the Azure Portal, navigate to **Microsoft Entra ID**. Under **App registrations**, select **New registration**. +1. 
Provide a name for the application. The application will automatically have a service principal associated with it.
+1. Assign roles to the service principal:
+    1. Go to the Azure Storage account. Under **Access Control (IAM)**, select **Add role assignment**.
+    1. Select the **Storage Blob Data Reader** and **Storage Blob Data Contributor** roles.
+    1. Select **Members**, then **Select Members**. Search for your newly created service principal and assign the role.
+    1. Repeat the same process for the Azure Batch account, using the **Azure Batch Contributor** role.
+1. Platform will need credentials to authenticate as the service principal:
+    1. Navigate back to the app registration. On the **Overview** page, save the **Application (client) ID** and **Directory (tenant) ID** values for use in Platform.
+    1. Select **Certificates & secrets**, then **New client secret**. A new secret is created containing a value and secret ID. Save both values securely for use in Platform.
+1. In your Platform workspace **Credentials** tab, select the **Add credentials** button and complete the following fields:
+    - Enter a **Name** for the credentials
+    - **Provider**: Azure
+    - Select the **Entra** tab
+    - Complete the remaining fields: **Batch account name**, **Blob Storage account name**, **Tenant ID** (Directory (tenant) ID in Azure), **Client ID** (Application (client) ID in Azure), **Client secret** (Client secret value in Azure).
+1. Delete the ID and secret values from their temporary location after they have been added to a credential in Platform.
+
+## Platform compute environment
 
 There are two ways to create an Azure Batch compute environment in Seqera Platform:
 
@@ -133,16 +189,15 @@ Batch Forge automatically creates resources that you may be charged for in your
 
 Create a Batch Forge Azure Batch compute environment:
 
 1. In a workspace, select **Compute Environments > New Environment**.
-1. Enter a descriptive name, e.g., _Azure Batch (east-us)_.
+1. Enter a descriptive name, such as _Azure Batch (east-us)_. 
1. Select **Azure Batch** as the target platform. -1. Choose existing Azure credentials or add a new credential. If you are using existing credentials, skip to step 7. - :::tip - You can create multiple credentials in your Seqera environment. +1. Choose existing Azure credentials or add a new credential. + :::info + Batch Forge compute environments must use access keys for authentication. Entra service principals are only supported in manual compute environments. ::: -1. Enter a name for the credentials, e.g., _Azure Credentials_. 1. Add the **Batch account** and **Blob Storage** account names and access keys. -1. Select a **Region**, e.g., _eastus_. -1. In the **Pipeline work directory** field, enter the Azure blob container created previously, e.g., `az://towerrgstorage-container/work`. +1. Select a **Region**, such as _eastus_. +1. In the **Pipeline work directory** field, enter the Azure blob container created previously. For example, `az://seqeracomputestorage-container/work`. :::note When you specify a Blob Storage bucket as your work directory, this bucket is used for the Nextflow [cloud cache](https://www.nextflow.io/docs/latest/cache-and-resume.html#cache-stores) by default. You can specify an alternative cache location with the **Nextflow config file** field on the pipeline [launch](../launch/launchpad.mdx#launch-form) form. ::: @@ -198,19 +253,17 @@ This section is for users with a pre-configured Azure Batch pool. This requires Your Seqera compute environment uses resources that you may be charged for in your Azure account. See [Cloud costs](../monitoring/cloud-costs.mdx) for guidelines to manage cloud resources effectively and prevent unexpected costs. ::: -**Create a manual Seqera Azure Batch compute environment** +Create a manual Seqera Azure Batch compute environment: 1. In a workspace, select **Compute Environments > New Environment**. -1. Enter a descriptive name for this environment, e.g., _Azure Batch (east-us)_. -1. 
Select **Azure Batch** as the target platform. -1. Select your existing Azure credentials or select **+** to add new credentials. If you choose to use existing credentials, skip to step 7. - :::tip - You can create multiple credentials in your Seqera environment. +1. Enter a descriptive name for this environment, such as _Azure Batch (east-us)_. +1. For **Provider**, select **Azure Batch**. +1. Select your existing Azure credentials (access keys or Entra service principal) or select **+** to add new credentials. + :::note + To authenticate using an Entra service principal, you must include a user-assigned managed identity. See [Managed identity](#managed-identity) below. ::: -1. Enter a name, e.g., _Azure Credentials_. -1. Add the **Batch account** and **Blob Storage** credentials you created previously. -1. Select a **Region**, e.g., _eastus (East US)_. -1. In the **Pipeline work directory** field, add the Azure blob container created previously, e.g., `az://towerrgstorage-container/work`. +1. Select a **Region**, such as _eastus (East US)_. +1. In the **Pipeline work directory** field, add the Azure blob container created previously. For example, `az://seqeracomputestorage-container/work`. :::note When you specify a Blob Storage bucket as your work directory, this bucket is used for the Nextflow [cloud cache](https://www.nextflow.io/docs/latest/cache-and-resume.html#cache-stores) by default. You can specify an alternative cache location with the **Nextflow config file** field on the pipeline [launch](../launch/launchpad.mdx#launch-form) form. ::: @@ -242,13 +295,16 @@ Your Seqera compute environment uses resources that you may be charged for in yo 1. Set the **Config mode** to **Manual**. 1. Enter the **Compute Pool name**. This is the name of the Azure Batch pool you created previously in the Azure Batch account. :::note - The default Azure Batch implementation uses a single pool for head and compute nodes. 
To use separate pools for head and compute nodes (e.g., to use low-priority VMs for compute jobs), see [this FAQ entry](../faqs.mdx#azure). + The default Azure Batch implementation uses a single pool for head and compute nodes. To use separate pools for head and compute nodes (for example, to use low-priority VMs for compute jobs), see [this FAQ entry](../faqs.mdx#azure). ::: 1. Enter a user-assigned **Managed identity client ID**, if one is attached to your Azure Batch pool. See [Managed Identity](#managed-identity) below. 1. Apply [**Resource labels**](../resource-labels/overview.mdx). This will populate the **Metadata** fields of the Azure Batch pool. 1. Expand **Staging options** to include: - Optional [pre- or post-run Bash scripts](../launch/advanced.mdx#pre--post-run-scripts) that execute before or after the Nextflow pipeline execution in your environment. - Global Nextflow configuration settings for all pipeline runs launched with this compute environment. Configuration settings in this field override the same values in the pipeline Nextflow config file. + :::info + To use managed identities, Platform requires Nextflow version 24.06.0-edge or later. Add `export NXF_VER=24.06.0-edge` to the **Global Nextflow config** field for your compute environment to use this Nextflow version by default. + ::: 1. Define custom **Environment Variables** for the **Head Job** and/or **Compute Jobs**. 1. Configure any necessary advanced options: - Use **Jobs cleanup policy** to control how Nextflow process jobs are deleted on completion. Active jobs consume the quota of the Azure Batch account. By default, jobs are terminated by Nextflow and removed from the quota when all tasks successfully complete. If set to _Always_, all jobs are deleted by Nextflow after pipeline completion. If set to _Never_, jobs are never deleted. If set to _On success_, successful tasks are removed but failed tasks will be left for debugging purposes. 
@@ -261,13 +317,17 @@ See [Launch pipelines](../launch/launchpad.mdx) to start executing workflows in ### Managed identity +:::info +To use managed identities, Platform requires Nextflow version 24.06.0-edge or later. Add `export NXF_VER=24.06.0-edge` to the **Global Nextflow config** field in advanced options for your compute environment to use this Nextflow version by default (see manual instructions above). +::: + Nextflow can authenticate to Azure services using a managed identity. This method offers enhanced security compared to access keys, but must run on Azure infrastructure. -When you use a manually configured compute environment with a managed identity attached to the Azure Batch Pool, Nextflow can use this managed identity for authentication. However, Platform still needs to use access keys to submit the initial task to Azure Batch to run Nextflow, which will then proceed with the managed identity for subsequent authentication. +When you use a manually configured compute environment with a managed identity attached to the Azure Batch Pool, Nextflow can use this managed identity for authentication. However, Platform still needs to use access keys or an Entra service principal to submit the initial task to Azure Batch to run Nextflow, which will then proceed with the managed identity for subsequent authentication. 1. In Azure, create a user-assigned managed identity. See [Manage user-assigned managed identities](https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/how-manage-user-assigned-managed-identities) for detailed steps. After creation, record the Client ID of the managed identity. 1. The user-assigned managed identity must have the necessary access roles for Nextflow. See [Required role assignments](https://www.nextflow.io/docs/latest/azure.html#required-role-assignments) for more information. -1. Associate the user-assigned managed identity with the Azure Batch Pool. 
See [Set up managed identity in your batch pool](https://learn.microsoft.com/en-us/troubleshoot/azure/hpc/batch/use-managed-identities-azure-batch-account-pool#set-up-managed-identity-in-your-batch-pool) for more information. +1. Associate the user-assigned managed identity with the Azure Batch Pool. See [Set up managed identity in your Batch pool](https://learn.microsoft.com/en-us/troubleshoot/azure/hpc/batch/use-managed-identities-azure-batch-account-pool#set-up-managed-identity-in-your-batch-pool) for more information. 1. When you set up the Platform compute environment, select the Azure Batch pool by name and enter the managed identity client ID in the specified field as instructed above. When you submit a pipeline to this compute environment, Nextflow will authenticate using the managed identity associated with the Azure Batch node it runs on, rather than relying on access keys. @@ -283,6 +343,7 @@ When you submit a pipeline to this compute environment, Nextflow will authentica [az-learn-jobs]: https://learn.microsoft.com/en-us/azure/batch/jobs-and-tasks [az-create-rg]: https://portal.azure.com/#create/Microsoft.ResourceGroup [az-create-storage]: https://portal.azure.com/#create/Microsoft.StorageAccount-ARM +[az-create-sp]: https://learn.microsoft.com/en-us/entra/identity-platform/howto-create-service-principal-portal [wave-docs]: https://docs.seqera.io/wave [fusion-docs]: https://docs.seqera.io/fusion diff --git a/platform_versioned_docs/version-24.1/compute-envs/google-cloud-batch.mdx b/platform_versioned_docs/version-24.1/compute-envs/google-cloud-batch.mdx index 3eb4f810b..ce05e8a12 100644 --- a/platform_versioned_docs/version-24.1/compute-envs/google-cloud-batch.mdx +++ b/platform_versioned_docs/version-24.1/compute-envs/google-cloud-batch.mdx @@ -48,7 +48,7 @@ Alternatively, you can enable each API manually by selecting your project in the ### IAM -Seqera requires a service account with appropriate permissions to interact with your Google Cloud 
resources. As an IAM user, you must have access to the service account that will be submitting Batch jobs. +Seqera requires a service account with appropriate permissions to interact with your Google Cloud resources. As an IAM user, you must have access to the service account that submits Batch jobs. :::caution By default, Google Cloud Batch uses the default Compute Engine service account to submit jobs. This service account is granted the Editor (`roles/Editor`) role. While this service account has the necessary permissions needed by Seqera, this role is not recommended for production environments. Control job access using a custom service account with only the permissions necessary for Seqera to execute Batch jobs instead. @@ -86,7 +86,7 @@ To configure a credential in Seqera, you must first create a [service account JS 4. Select **JSON** as the key type. 5. Select **Create**. -A JSON file will be downloaded to your computer. This file contains the credential needed to configure the compute environment in Seqera. +A JSON file is downloaded to your computer. This file contains the credential needed to configure the compute environment in Seqera. You can manage your key from the **Service Accounts** page. @@ -176,7 +176,7 @@ Select **Enable Fusion v2** to allow access to your Google Cloud Storage data vi When Fusion v2 is enabled, the following virtual machine settings are applied: - A 375 GB local NVMe SSD is selected for all compute jobs. - - If you do not specify a machine type, a VM from families that support local SSDs will be selected. + - If you do not specify a machine type, a VM from families that support local SSDs is selected. - Any machine types you specify in the Nextflow config must support local SSDs. - Local SSDs are only offered in multiples of 375 GB. You can increment the number of SSDs used per process with the `disk` directive to request multiples of 375 GB. To work with files larger than 100 GB, use at least two SSDs (750 GB or more). 
- Fusion v2 can also use persistent disks for caching. Override the disk requested by Fusion using the `disk` directive and the `type: pd-standard`. @@ -210,8 +210,8 @@ If you use VM instance templates for the head or compute jobs (see step 6 below) ::: 1. Enable **Use Private Address** to ensure that your Google Cloud VMs aren't accessible to the public internet. -1. Use **Boot disk size** to control the boot disk size of VMs. -1. Use **Head Job CPUs** and **Head Job Memory** to specify the CPUs and memory allocated for head jobs. +1. Use **Boot disk size** to control the persistent disk size that each task and the head job are provided. +1. Use **Head Job CPUs** and **Head Job Memory** to specify the CPUs and memory allocated for the head job. 1. Use **Service Account email** to specify a service account email address other than the Compute Engine default to execute workflows with this compute environment (recommended for production environments). 1. Use **VPC** and **Subnet** to specify the name of a VPC network and subnet to be used by this compute environment. If your organization's VPC architecture relies on network tags, you can apply network tags to VM instance templates used for the Nextflow head and compute jobs (see below). :::note diff --git a/platform_versioned_docs/version-24.1/compute-envs/hpc.mdx b/platform_versioned_docs/version-24.1/compute-envs/hpc.mdx index 7105c21de..dd846e22b 100644 --- a/platform_versioned_docs/version-24.1/compute-envs/hpc.mdx +++ b/platform_versioned_docs/version-24.1/compute-envs/hpc.mdx @@ -27,9 +27,20 @@ Seqera requires SSH access to your HPC cluster to run pipelines. Use [managed id You can also use workspace [SSH credentials](../credentials/ssh_credentials.mdx) for cluster login, but this provides service account access to your HPC to all Platform users. This means that all users will be granted the same file system access, and all activity is logged under the same user account on your HPC cluster. 
-For HPC clusters that do not allow direct access through an SSH client, a secure connection can be authenticated with [Tower Agent](../supported_software/agent/agent.mdx). +For HPC clusters that do not allow direct access through an SSH client, a secure connection can be authenticated with [Tower Agent](../supported_software/agent/agent.mdx). -## Seqera HPC compute environment +## Work and launch directories + +For instances where the work directory or launch directory must be set dynamically at runtime, you can use variable expansion. This works in conjunction with Tower Agent. The path that results from variable expansion must exist before workflow execution as the agent does not create directories. + +For example, if the HPC cluster file system has a `/workspace` directory with subdirectories for each user that can run jobs, the value for the work directory can be the following: `/workspace/$TW_AGENT_USER`. For a user `user1`, the work directory resolves to the `/workspace/user1` directory. + +The following variables are supported: + +- `TW_AGENT_WORKDIR`: Resolves to the work directory for Tower Agent. By default, this directory resolves to the `${HOME}/work` path, where `HOME` is the home directory of the user that the agent runs as. The work directory can be overridden by specifying the `--work-dir` argument when configuring Tower Agent. For more information, see the [Tower Agent][agent] documentation. +- `TW_AGENT_USER`: Resolves to the username that the agent is running as. By default, this is the Unix username that the agent runs as. On systems where the agent cannot determine which user it runs as, it falls back to the value of the `USER` environment variable. + +## HPC compute environment To create a new **HPC** compute environment: @@ -37,9 +48,9 @@ To create a new **HPC** compute environment: 1. Enter a descriptive name for this environment. Use only alphanumeric characters, dashes, and underscores. 1. 
Select your HPC environment from the **Platform** dropdown menu. 1. Select your existing managed identity, SSH, or Tower Agent credentials, or select **+** and **SSH** or **Tower Agent** to add new credentials. -1. Enter the absolute path of the **Work directory** to be used on the cluster. +1. Enter the absolute path of the **Work directory** to be used on the cluster. You can use the `TW_AGENT_WORKDIR` and `TW_AGENT_USER` variables in the file system path. - :::caution + :::caution All managed identity users must be a part of the same Linux user group. The group must have access to the HPC compute environment work directory. Set group permissions for the work directory as follows (replace `sharedgroupname` and `<work-directory>` with your group name and work directory): ```bash @@ -74,3 +85,7 @@ To create a new **HPC** compute environment: 1. Select **Create** to finalize the creation of the compute environment. See [Launch pipelines](../launch/launchpad.mdx) to start executing workflows in your HPC compute environment. + + + +[agent]: ../supported_software/agent/agent.mdx diff --git a/platform_versioned_docs/version-24.1/enterprise/configuration/overview.mdx b/platform_versioned_docs/version-24.1/enterprise/configuration/overview.mdx index 9ab0fcb20..b9e8f82f9 100644 --- a/platform_versioned_docs/version-24.1/enterprise/configuration/overview.mdx +++ b/platform_versioned_docs/version-24.1/enterprise/configuration/overview.mdx @@ -489,7 +489,11 @@ Do not replace the [Seqera-provided default image](../../functionality_matrix/fu ## Seqera API -Enable the API endpoints to host the Seqera Enterprise OpenAPI specification and use the [tw CLI](https://github.com/seqeralabs/tower-cli). +Enable the API endpoints to host the Seqera Enterprise OpenAPI specification and use the [tw CLI](https://github.com/seqeralabs/tower-cli). Set custom API rate limits and timeouts. + +:::note +To configure API rate limit environment variables, you must add `ratelim` to the `MICRONAUT_ENVIRONMENTS`. 
Without `ratelim` being set, the rate limit configuration variables below are ignored. +::: @@ -497,6 +501,9 @@ Enable the API endpoints to host the Seqera Enterprise OpenAPI specification and | Environment variable | Description | Value | | ---------------------- | ----------------------------------------------------------------------------- | --------------- | | `TOWER_ENABLE_OPENAPI` | Enable the OpenAPI documentation endpoint, e.g., [cloud.seqera.io/openapi/index.html](https://cloud.seqera.io/openapi/index.html). | Default: `true` | +| `TOWER_RATELIMIT_PERIOD` | Specify the maximum number of HTTP requests that can be made during the `TOWER_RATELIMIT_REFRESH` period. | Default: `20` | +| `TOWER_RATELIMIT_REFRESH` | API rate limit refresh period. | Default: `1s` | +| `TOWER_RATELIMIT_TIMEOUT` | The waiting period before rejecting requests over the `TOWER_RATELIMIT_PERIOD` limit during the refresh period. | Default: `500ms` | diff --git a/platform_versioned_docs/version-24.1/getting-started/_images/cpu-table-2.png b/platform_versioned_docs/version-24.1/getting-started/_images/cpu-table-2.png new file mode 100644 index 000000000..919aec152 Binary files /dev/null and b/platform_versioned_docs/version-24.1/getting-started/_images/cpu-table-2.png differ diff --git a/platform_versioned_docs/version-24.1/getting-started/_images/create-ce.gif b/platform_versioned_docs/version-24.1/getting-started/_images/create-ce.gif new file mode 100644 index 000000000..fa2d48f04 Binary files /dev/null and b/platform_versioned_docs/version-24.1/getting-started/_images/create-ce.gif differ diff --git a/platform_versioned_docs/version-24.1/getting-started/_images/create-ds.gif b/platform_versioned_docs/version-24.1/getting-started/_images/create-ds.gif new file mode 100644 index 000000000..bce8331d9 Binary files /dev/null and b/platform_versioned_docs/version-24.1/getting-started/_images/create-ds.gif differ diff --git 
a/platform_versioned_docs/version-24.1/getting-started/_images/launch-form-1.gif b/platform_versioned_docs/version-24.1/getting-started/_images/launch-form-1.gif new file mode 100644 index 000000000..f863ccaee Binary files /dev/null and b/platform_versioned_docs/version-24.1/getting-started/_images/launch-form-1.gif differ diff --git a/platform_versioned_docs/version-24.1/getting-started/_images/launch-form-2.gif b/platform_versioned_docs/version-24.1/getting-started/_images/launch-form-2.gif new file mode 100644 index 000000000..435236bd9 Binary files /dev/null and b/platform_versioned_docs/version-24.1/getting-started/_images/launch-form-2.gif differ diff --git a/platform_versioned_docs/version-24.1/getting-started/_images/launch-form-3.gif b/platform_versioned_docs/version-24.1/getting-started/_images/launch-form-3.gif new file mode 100644 index 000000000..f59bd2c69 Binary files /dev/null and b/platform_versioned_docs/version-24.1/getting-started/_images/launch-form-3.gif differ diff --git a/platform_versioned_docs/version-24.1/getting-started/_images/nf-core-rnaseq_metro_map_grey_static.svg b/platform_versioned_docs/version-24.1/getting-started/_images/nf-core-rnaseq_metro_map_grey_static.svg new file mode 100644 index 000000000..a0e7a4ccc --- /dev/null +++ b/platform_versioned_docs/version-24.1/getting-started/_images/nf-core-rnaseq_metro_map_grey_static.svg @@ -0,0 +1,239 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/platform_versioned_docs/version-24.1/getting-started/_images/pipelines-add.gif 
b/platform_versioned_docs/version-24.1/getting-started/_images/pipelines-add.gif new file mode 100644 index 000000000..e37292541 Binary files /dev/null and b/platform_versioned_docs/version-24.1/getting-started/_images/pipelines-add.gif differ diff --git a/platform_versioned_docs/version-24.1/getting-started/_images/process-runtime-2.png b/platform_versioned_docs/version-24.1/getting-started/_images/process-runtime-2.png new file mode 100644 index 000000000..1139ad7aa Binary files /dev/null and b/platform_versioned_docs/version-24.1/getting-started/_images/process-runtime-2.png differ diff --git a/platform_versioned_docs/version-24.1/getting-started/_images/rstudio.gif b/platform_versioned_docs/version-24.1/getting-started/_images/rstudio.gif new file mode 100644 index 000000000..31a66d4b8 Binary files /dev/null and b/platform_versioned_docs/version-24.1/getting-started/_images/rstudio.gif differ diff --git a/platform_versioned_docs/version-24.1/getting-started/rnaseq.mdx b/platform_versioned_docs/version-24.1/getting-started/rnaseq.mdx new file mode 100644 index 000000000..cc968f820 --- /dev/null +++ b/platform_versioned_docs/version-24.1/getting-started/rnaseq.mdx @@ -0,0 +1,752 @@ +--- +title: "RNA-Seq" +description: "An introduction to running nf-core/rnaseq in Seqera Platform" +date: "21 Jul 2024" +tags: [platform, seqera pipelines, data studios, rnaseq, compute environment, aws] +toc_max_heading_level: 2 +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +This guide details how to run bulk RNA sequencing (RNA-Seq) data analysis, from quality control to differential expression analysis, on an AWS Batch compute environment in Platform. 
It includes: + +- Creating an AWS Batch compute environment to run your pipeline and analysis environment +- Adding pipelines to your workspace +- Importing your pipeline input data +- Launching the pipeline and monitoring execution from your workspace +- Setting up a custom analysis environment with Data Studios +- Resource allocation guidance for RNA-Seq data + +:::info[**Prerequisites**] +You will need the following to get started: + +- [Admin](../orgs-and-teams/roles.mdx) permissions in an existing organization workspace. See [Set up your workspace](./workspace-setup.mdx) to create an organization and workspace from scratch. +- An existing AWS cloud account with access to the AWS Batch service. +- Existing access credentials with permissions to create and manage resources in your AWS account. See [IAM](../compute-envs/aws-batch.mdx#iam) for guidance to set up IAM permissions for Platform. +::: + +## Compute environment + +Compute and storage requirements for RNA-Seq analysis are dependent on the number of samples and the sequencing depth of your input data. See [RNA-Seq data and requirements](#rna-seq-data-and-requirements) for details on RNA-Seq datasets and the CPU and memory requirements for important steps of RNA-Seq pipelines. + +In this guide, you will create an AWS Batch compute environment with sufficient resources allocated to run the [nf-core/rnaseq](https://github.com/nf-core/rnaseq) pipeline with a large dataset. This compute environment will also be used to run a Data Studios RStudio environment for tertiary analysis of the resulting pipeline data. + +:::note +The compute recommendations below are based on internal benchmarking performed by Seqera. See [RNA-Seq data and requirements](#rna-seq-data-and-requirements) for more information. 
+::: + +### Recommended compute environment resources + +The following compute resources are recommended for production RNA-Seq pipelines, depending on the size of your input dataset: + +| **Setting** | **Value** | +|--------------------------------|---------------------------------------| +| **Instance Types** | `m5,r5` | +| **vCPUs** | 2 - 8 | +| **Memory (GiB)** | 8 - 32 | +| **Max CPUs** | >500 | +| **Min CPUs** | 0 | + +#### Fusion file system + +The [Fusion](../supported_software/fusion/fusion.mdx) file system enables seamless read and write operations to cloud object stores, leading to +simpler pipeline logic and faster, more efficient execution. While Fusion is not required to run nf-core/rnaseq, it is recommended for optimal performance. See [nf-core/rnaseq performance in Platform](#nf-corernaseq-performance-in-platform) at the end of this guide. + +Fusion works best with AWS NVMe instances (fast instance storage) as this delivers the fastest performance when compared to environments using only AWS EBS (Elastic Block Store). Batch Forge selects instances automatically based on your compute environment configuration, but you can optionally specify instance types. To enable fast instance storage (see Create compute environment below), you must select EC2 instances with NVMe SSD storage (`m5d` or `r5d` families). + +:::note +Fusion requires a license for use in Seqera Platform compute environments or directly in Nextflow. Fusion can be trialed at no cost. [Contact Seqera](https://seqera.io/contact-us/) for more details. +::: + +### Create compute environment + +![Add Platform compute environment](./_images/create-ce.gif) + +From the **Compute Environments** tab in your organization workspace, select **Add compute environment** and complete the following fields: + +| **Field** | **Description** | +|---------------------------------------|------------------------------------------------------------| +| **Name** | A unique name for the compute environment. 
| +| **Platform** | AWS Batch | +| **Credentials** | Select existing credentials, or **+** to create new credentials:| +| **Access Key** | AWS access key ID. | +| **Secret Key** | AWS secret access key. | +| **Region** | The target execution region. | +| **Pipeline work directory** | An S3 bucket path in the same execution region. | +| **Enable Wave Containers** | Use the Wave containers service to provision containers. | +| **Enable Fusion v2** | Access your S3-hosted data via the Fusion v2 file system. | +| **Enable fast instance storage** | Use NVMe instance storage to speed up I/O and disk access. Requires Fusion v2.| +| **Config Mode** | Batch Forge | +| **Provisioning Model** | Choose between Spot and On-demand instances. | +| **Max CPUs** | Sensible values for production use range between 2000 and 5000.| +| **Enable Fargate for head job** | Run the Nextflow head job using the Fargate container service to speed up pipeline launch. Requires Fusion v2.| +| **Allowed S3 buckets** | Additional S3 buckets or paths to be granted read-write permission for this compute environment. Add data paths to be mounted in your data studio here, if different from your pipeline work directory.| +| **Resource labels** | `name=value` pairs to tag the AWS resources created by this compute environment.| + + +## Add pipeline to Platform + +:::info +The [nf-core/rnaseq](https://github.com/nf-core/rnaseq) pipeline is a highly configurable and robust workflow designed to analyze RNA-Seq data. It performs quality control, alignment and quantification. + +![nf-core/rnaseq subway map](./_images/nf-core-rnaseq_metro_map_grey_static.svg) +::: + +[Seqera Pipelines](https://seqera.io/pipelines) is a curated collection of quality open-source pipelines that can be imported directly to your workspace Launchpad in Platform. Each pipeline includes a curated test dataset to use in a test run to confirm compute environment compatibility in just a few steps. 
+ +To use Seqera Pipelines to import the `nf-core/rnaseq` pipeline to your workspace: + +![Seqera Pipelines add to Launchpad](./_images/pipelines-add.gif) + +1. Search for `nf-core/rnaseq` and select **Launch** next to the pipeline name in the list. In the **Add pipeline** tab, select **Cloud** or **Enterprise** depending on your Platform account type, then provide the information needed for Seqera Pipelines to access your Platform instance: + - **Seqera Cloud**: Paste your Platform **Access token** and select **Next**. + - **Seqera Enterprise**: Specify the **Seqera Platform URL** (hostname) and **Base API URL** for your Enterprise instance, then paste your Platform **Access token** and select **Next**. + :::tip + If you do not have a Platform access token, select **Get your access token from Seqera Platform** to open the Access tokens page in a new browser tab. + ::: +1. Select your Platform **Organization**, **Workspace**, and **Compute environment** for the imported pipeline. +1. (Optional) Customize the **Pipeline Name** and **Pipeline Description**. +1. Select **Add Pipeline**. + +:::info +To add a custom pipeline not listed in Seqera Pipelines to your Platform workspace, see [Add pipelines](./quickstart-demo/add-pipelines.mdx#) for manual Launchpad instructions. +::: + +## Pipeline input data + +The [nf-core/rnaseq](https://github.com/nf-core/rnaseq) pipeline works with input datasets (samplesheets) containing sample names, FASTQ file locations (paths to FASTQ files in cloud or local storage), and strandedness. For example, the dataset used in the `test_full` profile is derived from the publicly available iGenomes collection of datasets, commonly used in bioinformatics analyses. + +This dataset represents RNA-Seq samples from various human cell lines (GM12878, K562, MCF7, and H1) with biological replicates, stored in an AWS S3 bucket (`s3://ngi-igenomes`) as part of the iGenomes resource. 
These RNA-Seq datasets consist of paired-end sequencing reads, which can be used to study gene expression patterns in different cell types. + +
+ **nf-core/rnaseq test_full profile dataset** + + | sample | fastq_1 | fastq_2 | strandedness | + |--------|---------|---------|--------------| + | GM12878_REP1 | s3://ngi-igenomes/test-data/rnaseq/SRX1603629_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX1603629_T1_2.fastq.gz | reverse | + | GM12878_REP2 | s3://ngi-igenomes/test-data/rnaseq/SRX1603630_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX1603630_T1_2.fastq.gz | reverse | + | K562_REP1 | s3://ngi-igenomes/test-data/rnaseq/SRX1603392_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX1603392_T1_2.fastq.gz | reverse | + | K562_REP2 | s3://ngi-igenomes/test-data/rnaseq/SRX1603393_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX1603393_T1_2.fastq.gz | reverse | + | MCF7_REP1 | s3://ngi-igenomes/test-data/rnaseq/SRX2370490_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX2370490_T1_2.fastq.gz | reverse | + | MCF7_REP2 | s3://ngi-igenomes/test-data/rnaseq/SRX2370491_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX2370491_T1_2.fastq.gz | reverse | + | H1_REP1 | s3://ngi-igenomes/test-data/rnaseq/SRX2370468_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX2370468_T1_2.fastq.gz | reverse | + | H1_REP2 | s3://ngi-igenomes/test-data/rnaseq/SRX2370469_T1_1.fastq.gz | s3://ngi-igenomes/test-data/rnaseq/SRX2370469_T1_2.fastq.gz | reverse | + +
+ +In Platform, samplesheets and other data can be made easily accessible in one of two ways: +- Use **Data Explorer** to browse and interact with remote data from AWS S3, Azure Blob Storage, and Google Cloud Storage repositories, directly in your organization workspace. +- Use **Datasets** to upload structured data to your workspace in CSV (Comma-Separated Values) or TSV (Tab-Separated Values) format. + +
+ **Add a cloud bucket via Data Explorer** + + Private cloud storage buckets accessible with the credentials in your workspace are added to Data Explorer automatically by default. However, you can also add custom directory paths within buckets to your workspace to simplify direct access. + + To add individual buckets (or directory paths within buckets): + + ![Add public bucket](./quickstart-demo/assets/data-explorer-add-bucket.gif) + + 1. From the **Data Explorer** tab, select **Add cloud bucket**. + 1. Specify the bucket details: + - The cloud **Provider**. + - An existing cloud **Bucket path**. + - A unique **Name** for the bucket. + - The **Credentials** used to access the bucket. For public cloud buckets, select **Public**. + - An optional bucket **Description**. + 1. Select **Add**. + + You can now select data directly from this bucket as input when launching your pipeline, without the need to interact with cloud consoles or CLI tools. + +
+ +
+ **Add a dataset** + + From the **Datasets** tab, select **Add Dataset**. + + ![Add a dataset](./quickstart-demo/assets/sp-cloud-add-a-dataset.gif) + + Specify the following dataset details: + + - A **Name** for the dataset, such as `nf-core-rnaseq-dataset`. + - A **Description** for the dataset. + - Select the **First row as header** option to prevent Platform from parsing the header row of the samplesheet as sample data. + - Select **Upload file** and browse to your CSV or TSV samplesheet file in local storage, or simply drag and drop it into the box. + + The dataset is now listed in your organization workspace datasets and can be selected as input when launching your pipeline. + + :::info + Platform does not store the data used for analysis in pipelines. The dataset must specify the locations of data stored on your own infrastructure. + ::: + +
+ +## Launch pipeline + +:::note +This guide is based on version 3.15.1 of the nf-core/rnaseq pipeline. Launch form parameters and tools may differ in other versions. +::: + +With your compute environment created, nf-core/rnaseq added to your workspace Launchpad, and your samplesheet accessible in Platform, you are ready to launch your pipeline. Navigate to the Launchpad and select **Launch** next to `nf-core-rnaseq` to open the launch form. + +The launch form consists of **General config**, **Run parameters**, and **Advanced options** sections to specify your run parameters before execution, and an execution summary. Use section headings or select the **Previous** and **Next** buttons at the bottom of the page to navigate between sections. + +### General config + +![General config tab](./_images/launch-form-2.gif) + +- **Pipeline to launch**: The pipeline Git repository name or URL. For saved pipelines, this is prefilled and cannot be edited. +- **Revision number**: A valid repository commit ID, tag, or branch name. For saved pipelines, this is prefilled and cannot be edited. +- **Config profiles**: One or more [configuration profile](https://www.nextflow.io/docs/latest/config.html#config-profiles) names to use for the execution. Config profiles must be defined in the `nextflow.config` file in the pipeline repository. +- **Workflow run name**: An identifier for the run, pre-filled with a random name. This can be customized. +- **Labels**: Assign new or existing [labels](../labels/overview.mdx) to the run. +- **Compute environment**: Your AWS Batch compute environment. +- **Work directory**: The cloud storage path where pipeline scratch data is stored. Platform will create a scratch sub-folder if only a cloud bucket location is specified. + :::note + The credentials associated with the compute environment must have access to the work directory. 
+ ::: + +### Run parameters + +![Run parameters](./_images/launch-form-3.gif) + +There are three ways to enter **Run parameters** prior to launch: + +- The **Input form view** displays form fields to enter text or select attributes from lists, and browse input and output locations with [Data Explorer](../data/data-explorer.mdx). +- The **Config view** displays raw configuration text that you can edit directly. Select JSON or YAML format from the **View as** list. +- **Upload params file** allows you to upload a JSON or YAML file with run parameters. + +Platform uses the `nextflow_schema.json` file in the root of the pipeline repository to dynamically create a form with the necessary pipeline parameters. + +Specify your pipeline input and output and modify other pipeline parameters as needed. + +
+ **input** + + Use **Browse** to select your pipeline input data: + + - In the **Data Explorer** tab, select the existing cloud bucket that contains your samplesheet, browse or search for the samplesheet file, and select the chain icon to copy the file path before closing the data selection window and pasting the file path in the input field. + - In the **Datasets** tab, search for and select your existing dataset. + +
+
+ **outdir** + + Use the `outdir` parameter to specify where the pipeline outputs are published. `outdir` must be unique for each pipeline run. Otherwise, your results will be overwritten. + + **Browse** and copy cloud storage directory paths using Data Explorer, or enter a path manually. + +
+
+Modify other parameters to customize the pipeline execution through the parameters form. For example, under **Read trimming options**, change the `trimmer` and select `fastp` instead of `trimgalore`.
+
+![Read trimming options](./quickstart-demo/assets/trimmer-settings.png)
+
+### Advanced options
+
+- Use [resource labels](../resource-labels/overview.mdx) to tag the computing resources created during the workflow execution. While resource labels for the run are inherited from the compute environment and pipeline, workspace admins can override them from the launch form. Applied resource label names must be unique.
+- [Pipeline secrets](../secrets/overview.mdx) store keys and tokens used by workflow tasks to interact with external systems. Enter the names of any stored user or workspace secrets required for the workflow execution.
+- See [Advanced options](../launch/advanced.mdx) for more details.
+
+After you have filled the necessary launch details, select **Launch**. The **Runs** tab shows your new run in a **submitted** status at the top of the list. Select the run name to navigate to the [**View Workflow Run**](../monitoring/overview.mdx) page and view the configuration, parameters, status of individual tasks, and run report.
+
+
+ **Run monitoring** + + Select your new run from the **Runs** tab list to view the run details. + + #### Run details page + + As the pipeline runs, run details will populate with the following tabs: + + - **Command-line**: The Nextflow command invocation used to run the pipeline. This includes details about the pipeline version (`-r` flag) and profile, if specified (`-profile` flag). + - **Parameters**: The exact set of parameters used in the execution. This is helpful for reproducing the results of a previous run. + - **Resolved Nextflow configuration**: The full Nextflow configuration settings used for the run. This includes parameters, but also settings specific to task execution (such as memory, CPUs, and output directory). + - **Execution Log**: A summarized Nextflow log providing information about the pipeline and the status of the run. + - **Datasets**: Link to datasets, if any were used in the run. + - **Reports**: View pipeline outputs directly in the Platform. + + ![View the nf-core/rnaseq run](./quickstart-demo/assets/sp-cloud-run-info.gif) + + #### View reports + + Most Nextflow pipelines generate reports or output files which are useful to inspect at the end of the pipeline execution. Reports can contain quality control (QC) metrics that are important to assess the integrity of the results. + + ![Reports tab](./quickstart-demo/assets/reports-tab.png) + + For example, for the nf-core/rnaseq pipeline, view the [MultiQC](https://docs.seqera.io/multiqc) report generated. MultiQC is a helpful reporting tool to generate aggregate statistics and summaries from bioinformatics tools. + + ![Reports MultiQC preview](./quickstart-demo/assets/reports-preview.png) + + The paths to report files point to a location in cloud storage (in the `outdir` directory specified during launch), but you can view the contents directly and download each file without navigating to the cloud or a remote filesystem. 
+ + :::info + See [Reports](../reports/overview.mdx) for more information. + ::: + + #### View general information + + The run details page includes general information about who executed the run, when it was executed, the Git commit ID and/or tag used, and additional details about the compute environment and Nextflow version used. + + ![General run information](./quickstart-demo/assets/general-run-details.gif) + + #### View details for a task + + Scroll down the page to view: + + - The progress of individual pipeline **Processes** + - **Aggregated stats** for the run (total walltime, CPU hours) + - **Workflow metrics** (CPU efficiency, memory efficiency) + - A **Task details** table for every task in the workflow + + The task details table provides further information on every step in the pipeline, including task statuses and metrics. + + #### Task details + + Select a task in the task table to open the **Task details** dialog. The dialog has three tabs: + + ![Task details window](./quickstart-demo/assets/task-details.gif) + + - The **About** tab contains extensive task execution details. + - The **Execution log** tab provides a real-time log of the selected task's execution. Task execution and other logs (such as stdout and stderr) are available for download from here, if still available in your compute environment. + - The **Data Explorer** tab allows you to view the task working directory directly in Platform. + + Nextflow hash-addresses each task of the pipeline and creates unique directories based on these hashes. Data Explorer allows you to view the log files and output files generated for each task in its working directory, directly within Platform. You can view, download, and retrieve the link for these intermediate files in cloud storage from the **Data Explorer** tab to simplify troubleshooting. + + ![Task Data Explorer](./quickstart-demo/assets/sp-cloud-task-data-explorer.gif) + +
+
+## Tertiary analysis with Data Studios
+
+**Data Studios** streamlines the process of creating interactive analysis environments for Platform users. With built-in templates for platforms like Jupyter Notebook, RStudio, and VS Code, creating a data studio is as simple as adding and sharing pipelines or datasets. The data studio URL can also be shared with any user with the [Connect role](../orgs-and-teams/roles.mdx) for real-time access and collaboration.
+
+For the purposes of this guide, an RStudio environment will be used to normalize the pipeline output data, perform differential expression analysis, and visualize the data with exploratory plots.
+
+### Prepare your data
+
+#### Gene counts
+
+Salmon is the default tool used during the `pseudo-aligner` step of the nf-core/rnaseq pipeline. In the pipeline output data, the `/salmon` directory contains the tool's output, including a `salmon.merged.gene_counts_length_scaled.tsv` file.
+
+#### Sample info
+
+The analysis script provided in this section requires a sample information file to parse the counts data in the `salmon.merged.gene_counts_length_scaled.tsv` file. nf-core/rnaseq does not produce this sample information file automatically. See below to create a sample information file based on the samples in your `salmon.merged.gene_counts_length_scaled.tsv` file.
+
+ **Create a sample info file** + + 1. Note the names of the columns (excluding the first column, which typically contains gene IDs) in your `salmon.merged.gene_counts_length_scaled.tsv` file. These are your sample names. + 1. Identify the group or condition that each sample belongs to. This information should come from your experimental design. + 1. Create a new text file named `sampleinfo.txt`, with two columns: + - First column header: Sample + - Second column header: Group + 1. For each sample in your `salmon.merged.gene_counts_length_scaled.tsv` file: + - In the "Sample" column, write the exact sample name as it appears in the gene counts file. + - In the "Group" column, write the corresponding group name. + + For example, for the dataset used in a `test_full` run of nf-core/rnaseq, the `sampleinfo.txt` looks like this: + + ``` + Sample Group + GM12878_REP1 GM12878 + GM12878_REP2 GM12878 + H1_REP1 H1 + H1_REP2 H1 + K562_REP1 K562 + K562_REP2 K562 + MCF7_REP1 MCF7 + MCF7_REP2 MCF7 + ``` + + To make your `sampleinfo.txt` file accessible to the data studio, upload it to the directory that contains your pipeline output data. Select this bucket or directory when you **Mount data** during data studio setup. + +
+ +### Create an RStudio analysis environment with Data Studios + +![Add data studio](./_images/create-ds.gif) + +From the **Data Studios** tab, select **Add a data studio** and complete the following: +- Select the latest **RStudio** container image template from the list. +- Select your AWS Batch compute environment. +:::note +Data studios compete for computing resources when sharing compute environments. Ensure your compute environment has sufficient resources to run both your pipelines and data studio sessions. The default CPU and memory allocation for a data studio is 2 CPUs and 8192 MB RAM. +::: +- Mount data using Data Explorer: Mount the S3 bucket or directory path that contains the pipeline work directory of your RNA-Seq run. +- Optional: Enter CPU and memory allocations. The default values are 2 CPUs and 8192 MB memory (RAM). +- Select **Add**. +- Once the data studio has been created, select the options menu next to it and select **Start**. +- When the data studio is in a running state, **Connect** to it. + +### Perform the analysis and explore results + +The RStudio environment can be configured with the packages you wish to install and the R script you wish to run. For the purposes of this guide, run the following scripts in the RStudio console to install the necessary packages and perform the analysis: + +1. Install and load the necessary packages and libraries: + + ```r + # Install required packages + if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + BiocManager::install(c("limma", "edgeR", "ggplot2", "gplots")) + + # Load required libraries + library(limma) + library(edgeR) + library(ggplot2) + library(gplots) + ``` + +1. Read and convert the count data and sample information: + + :::info + Replace `` and `` with the paths to your `salmon.merged.gene_counts_length_scaled.tsv` and `sampleinfo.txt` files. 
+ ::: + + ```r + # Read in the count data + counts <- read.delim(file = "/workspace/data/", row.names = + 1) + + # Remove the gene_name column if it exists + if ("gene_name" %in% colnames(counts)) { + counts <- counts[, -which(colnames(counts) == "gene_name")] + } + + # Convert to matrix + counts <- as.matrix(counts) + + # Read in the sample information + targets <- read.table( + file = "/workspace/data/", + header = TRUE, + stringsAsFactors = FALSE, + sep = "", + check.names = FALSE + ) + + # Ensure column names are correct + colnames(targets) <- c("Sample", "Group") + ``` + +1. Create a DGEList object and filter out low-count genes: + + ```r + # Create a DGEList object + y <- DGEList(counts, group = targets$Group) + + # Calculate CPM (counts per million) values + mycpm <- cpm(y) + + # Filter low count genes + thresh <- mycpm > 0.5 + keep <- rowSums(thresh) >= 2 + y <- y[keep, , keep.lib.sizes = FALSE] + ``` + +1. Normalize the data: + + ```r + # Normalize the data + y <- calcNormFactors(y) + ``` + +1. Print a summary of the filtered data: + + ```r + # Print summary of filtered data + print(dim(y)) + print(y$samples) + ``` + +1. Create an MDS plot, displayed in RStudio plots viewer (`a`) and saved as a PNG file (`b`): + + :::info + MDS plots are used to visualize the overall similarity between RNA-Seq samples based on their gene expression profiles, helping to identify sample clusters and potential batch effects. + ::: + + ```r + # Create MDS plot + # a. Display in RStudio + plotMDS(y, col = as.numeric(factor(targets$Group)), labels = targets$Group) + legend( + "topright", + legend = levels(factor(targets$Group)), + col = 1:nlevels(factor(targets$Group)), + pch = 20 + ) + + # b. 
Save MDS plot to file (change `png` to `pdf` to create a PDF file) + png("MDS_plot.png", width = 800, height = 600) + plotMDS(y, col = as.numeric(factor(targets$Group)), labels = targets$Group) + legend( + "topright", + legend = levels(factor(targets$Group)), + col = 1:nlevels(factor(targets$Group)), + pch = 20 + ) + dev.off() + ``` + +1. Perform differential expression analysis: + + ```r + # Design matrix + design <- model.matrix( ~ 0 + group, data = y$samples) + colnames(design) <- levels(y$samples$group) + + # Estimate dispersion + y <- estimateDisp(y, design) + + # Fit the model + fit <- glmQLFit(y, design) + + # Define contrasts + my.contrasts <- makeContrasts( + GM12878vsH1 = GM12878 - H1, + GM12878vsK562 = GM12878 - K562, + GM12878vsMCF7 = GM12878 - MCF7, + H1vsK562 = H1 - K562, + H1vsMCF7 = H1 - MCF7, + K562vsMCF7 = K562 - MCF7, + levels = design + ) + + # Perform differential expression analysis for each contrast + results <- lapply(colnames(my.contrasts), function(contrast) { + qlf <- glmQLFTest(fit, contrast = my.contrasts[, contrast]) + topTags(qlf, n = Inf) + }) + names(results) <- colnames(my.contrasts) + ``` + + :::info + This script is written for the analysis of human data, based on nf-core/rnaseq's `test_full` dataset. To adapt the script for your data, modify the contrasts based on the comparisons you want to make between your sample groups: + + ```r + my.contrasts <- makeContrasts( + Sample1vsSample2 = Sample1 - Sample2, + Sample2vsSample3 = Sample2 - Sample3, + ... + levels = design + ) + ``` + ::: + +1. 
Print the number of differentially expressed genes for each comparison and save the results to CSV files: + + ```r + # Print the number of differentially expressed genes for each comparison + for (name in names(results)) { + de_genes <- sum(results[[name]]$table$FDR < 0.05) + print(paste("Number of DE genes in", name, ":", de_genes)) + } + + # Save results + for (name in names(results)) { + write.csv(results[[name]], file = paste0("DE_genes_", name, ".csv")) + } + ``` + +1. Create volcano plots for each differential expression comparison, displayed in RStudio plots viewer and saved as PNG files: + + :::info + Volcano plots in RNA-Seq analysis display the magnitude of gene expression changes (log2 fold change) against their statistical significance. This allows for quick identification of significantly up- and down-regulated genes between two conditions. + ::: + + ```r + # Create volcano plots for differential expression comparisons + # Function to create a volcano plot + create_volcano_plot <- function(res, title) { + ggplot(res$table, aes(x = logFC, y = -log10(FDR))) + + geom_point(aes(color = FDR < 0.05 & + abs(logFC) > 1), size = 0.5) + + scale_color_manual(values = c("black", "red")) + + labs(title = title, x = "Log2 Fold Change", y = "-Log10 FDR") + + theme_minimal() + } + + # Create volcano plots for each comparison + for (name in names(results)) { + p <- create_volcano_plot(results[[name]], name) + # Display in RStudio + print(p) + # Save to file (change `.png` to `.pdf` to create PDF files) + ggsave( + paste0("volcano_plot_", name, ".png"), + p, + width = 8, + height = 6, + dpi = 300 + ) + } + ``` + +1. Create a heatmap of the top 50 differentially expressed genes: + + :::info + Heatmaps in RNA-Seq analysis provide a color-coded representation of gene expression levels across multiple samples or conditions, enabling the visualization of expression patterns and sample clustering based on similarity. 
+ ::: + + ```r + # Create a heatmap of top 50 differentially expressed genes + # Get top 50 DE genes from each comparison + top_genes <- unique(unlist(lapply(results, function(x) + rownames(x$table)[1:50]))) + + # Get log-CPM values for these genes + log_cpm <- cpm(y, log = TRUE) + top_gene_expr <- log_cpm[top_genes, ] + + # Print dimensions of top_gene_expr + print(dim(top_gene_expr)) + + # Create a color palette + my_palette <- colorRampPalette(c("blue", "white", "red"))(100) + + # Create a heatmap using heatmap.2 + # Display in RStudio + heatmap.2( + as.matrix(top_gene_expr), + scale = "row", + col = my_palette, + trace = "none", + dendrogram = "column", + margins = c(5, 10), + labRow = FALSE, + ColSideColors = rainbow(length(unique(y$samples$group)))[factor(y$samples$group)], + main = "Top DE Genes Across Samples" + ) + + # Save heatmap to file (change `png` to `pdf` to create a PDF file) + png("heatmap_top_DE_genes.png", + width = 1000, + height = 1200) + heatmap.2( + as.matrix(top_gene_expr), + scale = "row", + col = my_palette, + trace = "none", + dendrogram = "column", + margins = c(5, 10), + labRow = FALSE, + ColSideColors = rainbow(length(unique(y$samples$group)))[factor(y$samples$group)], + main = "Top DE Genes Across Samples" + ) + dev.off() + + # Print the number of top genes in the heatmap + print(paste("Number of top DE genes in heatmap:", length(top_genes))) + ``` + +![RStudio plots](./_images/rstudio.gif) + +### Collaborate in the data studio + +To share your results or allow colleagues to perform exploratory analysis, share a link to the data studio by selecting the options menu for the data studio you want to share, then select **Copy data studio URL**. With this link, other authenticated users with the **Connect** [role](../orgs-and-teams/roles.mdx) (or greater) can access the session directly. + +## RNA-Seq data and requirements + +RNA-Seq data typically consists of raw sequencing reads from high-throughput sequencing technologies. 
These reads are used to quantify gene expression levels and discover novel transcripts. A typical RNA-Seq dataset can range from a few GB to several hundred GB, depending on the number of samples and the sequencing depth. + +### nf-core/rnaseq performance in Platform + +The compute recommendations in this guide are based on internal benchmarking performed by Seqera. Benchmark runs of [nf-core/rnaseq](https://github.com/nf-core/rnaseq) used profile `test_full`, consisting of an input dataset with 16 FASTQ files (8 paired-end samples) and a total size of approximately 123.5 GB. + +This benchmark compares pipeline run metrics between single nf-core/rnaseq runs in an AWS Batch compute environment with Fusion file system and fast instance storage enabled (**Fusion** group) and an identical AWS Batch compute environment using S3 storage without Fusion (**AWS S3** group). + +### Pipeline steps and computing resource requirements + +The nf-core/rnaseq pipeline involves several key steps, each with distinct computational requirements. Resource needs in this table are based on the `test_full` runs detailed previously: + +| **Pipeline step** | **Tools** | **Resource needs** | **Description** | +|-------------------------------------|---------------------------|------------------------------|---------------------------------------------------------------------------------------------------| +| **Quality Control (QC)** | FastQC, MultiQC | Low-moderate CPU (50-200% single-core usage), low memory (1-7 GB peak) | Initial quality checks of raw reads to assess sequencing quality and identify potential issues. | +| **Read Trimming** | Trim Galore! | High CPU (up to 700% single-core usage), low memory (6 GB peak) | Removal of adapter sequences and low-quality bases to prepare reads for alignment. 
|
+| **Read Alignment** | HISAT2, STAR | Moderate-high CPU (480-600% single-core usage), high memory (36 GB peak) | Alignment of trimmed reads to a reference genome, typically the most resource-intensive step. |
+| **Pseudoalignment** | Salmon, Kallisto | Moderate-high CPU (420% single-core usage), moderate memory (18 GB peak) | A faster, more accurate method of gene expression quantification than alignment using read compatibility. |
+| **Quantification** | featureCounts, Salmon | Moderate-high CPU (500-600% single-core usage), moderate memory (18 GB peak) | Counting the number of reads mapped to each gene or transcript to measure expression levels. |
+| **Differential Expression Analysis** | DESeq2, edgeR | High CPU (650% single-core usage), low memory (up to 2 GB peak) | Statistical analysis to identify genes with significant changes in expression between conditions. |
+
+#### Overall run metrics
+
+**Total pipeline run cost (USD)**:
+
+- Fusion file system with fast instance storage: $34.90
+- Plain S3 storage without Fusion: $58.40
+
+**Pipeline runtime**:
+
+The Fusion file system used with NVMe instance storage contributed to a 34% improvement in total pipeline runtime and a 49% reduction in CPU hours.
+
+![Run metrics overview](./_images/cpu-table-2.png)
+
+#### Process run time
+
+The Fusion file system demonstrates significant performance improvements for most processes in the nf-core/rnaseq pipeline, particularly for I/O-intensive tasks:
+
+- The most time-consuming processes see improvements of 36.07% to 70.15%, saving hours of runtime in a full pipeline execution.
+- Most processes show significant performance improvements with Fusion, with time savings ranging from 35.57% to 99.14%.
+- The most substantial improvements are seen in I/O-intensive tasks like SAMTOOLS_FLAGSTAT (95.20% faster) and SAMTOOLS_IDXSTATS (99.14% faster).
+- SALMON_INDEX shows a notable 70.15% improvement, reducing runtime from 102.18 minutes to 30.50 minutes.
+- STAR_ALIGN_IGENOMES, one of the most time-consuming processes, is 53.82% faster with Fusion, saving nearly an hour of runtime. + +![Average runtime of nf-core/rnaseq processes for eight samples using the Fusion file system and plain S3 storage. Error bars = standard deviation of the mean.](./_images/process-runtime-2.png) + +| Process | S3 Runtime (min) | Fusion Runtime (min) | Time Saved (min) | Improvement (%) | +|---------|------------------|----------------------|------------------|-----------------| +| SAMTOOLS_IDXSTATS | 18.54 | 0.16 | 18.38 | 99.14% | +| SAMTOOLS_FLAGSTAT | 22.94 | 1.10 | 21.84 | 95.20% | +| SAMTOOLS_STATS | 22.54 | 3.18 | 19.36 | 85.89% | +| SALMON_INDEX | 102.18 | 30.50 | 71.68 | 70.15% | +| BEDTOOLS_GENOMECOV_FW | 19.53 | 7.10 | 12.43 | 63.64% | +| BEDTOOLS_GENOMECOV_REV | 18.88 | 7.35 | 11.53 | 61.07% | +| PICARD_MARKDUPLICATES | 102.15 | 41.60 | 60.55 | 59.27% | +| STRINGTIE | 17.63 | 7.60 | 10.03 | 56.89% | +| RSEQC_READDISTRIBUTION | 16.33 | 7.19 | 9.14 | 55.97% | +| STAR_ALIGN_IGENOMES | 106.42 | 49.15 | 57.27 | 53.82% | +| SALMON_QUANT | 30.83 | 15.58 | 15.25 | 49.46% | +| RSEQC_READDUPLICATION | 19.42 | 12.15 | 7.27 | 37.44% | +| QUALIMAP_RNASEQ | 141.40 | 90.40 | 51.00 | 36.07% | +| TRIMGALORE | 51.22 | 33.00 | 18.22 | 35.57% | +| DUPRADAR | 49.04 | 77.81 | -28.77 | -58.67% | + +
+ **Pipeline optimization** + + Seqera Platform's task-level resource usage metrics allow you to determine the resources requested for a task and what was actually used. This information helps you fine-tune your configuration more accurately. + + However, manually adjusting resources for every task in your pipeline is impractical. Instead, you can leverage the pipeline optimization feature on the Launchpad. + + Pipeline optimization analyzes resource usage data from previous runs to optimize the resource allocation for future runs. After a successful run, optimization becomes available, indicated by the lightbulb icon next to the pipeline turning black. + + #### Optimize nf-core/rnaseq + + Select the lightbulb icon next to nf-core/rnaseq in your workspace Launchpad to view the optimized profile. You have the flexibility to tailor the optimization's target settings and incorporate a retry strategy as needed. + + #### View optimized configuration + + When you select the lightbulb, you can access an optimized configuration profile in the second tab of the **Customize optimization profile** window. + + This profile consists of Nextflow configuration settings for each process and each resource directive (where applicable): **cpus**, **memory**, and **time**. The optimized setting for a given process and resource directive is based on the maximum use of that resource across all tasks in that process. + + Once optimization is selected, subsequent runs of that pipeline will inherit the optimized configuration profile, indicated by the black lightbulb icon with a checkmark. + + :::info + Optimization profiles are generated from one run at a time, defaulting to the most recent run, and _not_ an aggregation of previous runs. 
+ ::: + + ![Optimized configuration](./quickstart-demo/assets/optimize-configuration.gif) + + Verify the optimized configuration of a given run by inspecting the resource usage plots for that run and these fields in the run's task table: + + | Description | Key | + | ------------ | ---------------------- | + | CPU usage | `pcpu` | + | Memory usage | `peakRss` | + | Runtime | `start` and `complete` | + +
\ No newline at end of file diff --git a/platform_versioned_docs/version-24.1/limits/limits.mdx b/platform_versioned_docs/version-24.1/limits/limits.mdx index e46121b3e..34fb23b72 100644 --- a/platform_versioned_docs/version-24.1/limits/limits.mdx +++ b/platform_versioned_docs/version-24.1/limits/limits.mdx @@ -36,10 +36,10 @@ Some Enterprise instances on older licenses are limited to 100 labels per worksp | File size | 10 MB | | Versions per dataset | 100 | -If you need higher limits and capabilities, [contact us](https://seqera.io/contact-us/) to discuss your application requirements. - ### Data Studios | Description | Value | | -------------------- | ----- | | Running Data Studio sessions | 100 | + +If you need higher limits and capabilities, [contact us](https://seqera.io/contact-us/) to discuss your application requirements. diff --git a/platform_versioned_docs/version-24.1/supported_software/agent/agent.mdx b/platform_versioned_docs/version-24.1/supported_software/agent/agent.mdx index 14bf28d62..7bfbdd40d 100644 --- a/platform_versioned_docs/version-24.1/supported_software/agent/agent.mdx +++ b/platform_versioned_docs/version-24.1/supported_software/agent/agent.mdx @@ -46,7 +46,7 @@ export TOWER_ACCESS_TOKEN= ### Tips - If you're using the agent with Seqera Platform Enterprise (on-prem), you can set the API URL using the `TOWER_API_ENDPOINT` environment variable or the `--url` option. -- By default, the Agent uses the folder `${HOME}/work` as the Nextflow work directory. You can change this using the `--work-dir` option. +- By default, the Agent uses the folder `${HOME}/work` as the Nextflow work directory. You can change this directory using the `--work-dir` option. - The work directory _must_ exist before running the agent. - You can also change the work directory in Seqera when you create a compute environment or launch a pipeline. 
diff --git a/platform_versioned_sidebars/version-23.4-sidebars.json b/platform_versioned_sidebars/version-23.4-sidebars.json index de36ec02b..833019108 100644 --- a/platform_versioned_sidebars/version-23.4-sidebars.json +++ b/platform_versioned_sidebars/version-23.4-sidebars.json @@ -6,7 +6,8 @@ "label": "Tutorials", "collapsed": true, "items": [ - "getting-started/quickstart-demo/comm-showcase" + "getting-started/quickstart-demo/comm-showcase", + "getting-started/rnaseq" ] }, { @@ -84,7 +85,6 @@ "type": "category", "label": "Data", "items": [ - "getting-started/quickstart-demo/add-data", "data/data-explorer", "data/datasets" ] @@ -93,7 +93,6 @@ "type": "category", "label": "Pipelines", "items": [ - "getting-started/quickstart-demo/add-pipelines", "pipeline-schema/overview", "launch/launchpad", "labels/overview", @@ -117,7 +116,6 @@ "type": "category", "label": "Administration", "items": [ - "getting-started/workspace-setup", "administration/overview", "orgs-and-teams/organizations", "orgs-and-teams/workspace-management", @@ -186,11 +184,6 @@ "label": "Developer tools", "collapsed": true, "items": [ - { - "type": "doc", - "id": "getting-started/quickstart-demo/automation", - "label": "Overview" - }, "api/overview", { "type": "category", diff --git a/platform_versioned_sidebars/version-24.1-sidebars.json b/platform_versioned_sidebars/version-24.1-sidebars.json index 5f542a464..c8316e0f3 100644 --- a/platform_versioned_sidebars/version-24.1-sidebars.json +++ b/platform_versioned_sidebars/version-24.1-sidebars.json @@ -6,8 +6,9 @@ "label": "Tutorials", "collapsed": true, "items": [ - "getting-started/quickstart-demo/comm-showcase" - ] + "getting-started/quickstart-demo/comm-showcase", + "getting-started/rnaseq" + ] }, { "type": "category", @@ -85,7 +86,6 @@ "type": "category", "label": "Data", "items": [ - "getting-started/quickstart-demo/add-data", "data/data-explorer", "data/datasets" ] @@ -94,7 +94,6 @@ "type": "category", "label": "Pipelines", "items": [ - 
"getting-started/quickstart-demo/add-pipelines", "pipeline-schema/overview", "launch/launchpad", "labels/overview", @@ -119,7 +118,6 @@ "type": "category", "label": "Administration", "items": [ - "getting-started/workspace-setup", "administration/overview", "orgs-and-teams/organizations", "orgs-and-teams/workspace-management", @@ -190,11 +188,6 @@ "label": "Developer tools", "collapsed": true, "items": [ - { - "type": "doc", - "id": "getting-started/quickstart-demo/automation", - "label": "Overview" - }, "api/overview", { "type": "category",