docs/clustering/pipeline.yml

# Pipeline pipeline_clustering_scanpy.py configuration file
# ==============================================

# compute resource options
# ------------------------
resources:
  # Number of threads requested for parallel jobs, in three tiers.
  # Each tier must come with enough memory for the tasks described below.
  #
  # high: enough memory to load your mudata and do computationally
  # intensive tasks
  threads_high: 2
  # medium: enough memory to load your mudata and do computationally
  # light tasks
  threads_medium: 2
  # low: enough memory to load text files and do plotting; requires much
  # less memory than the other two tiers
  threads_low: 2

  fewer_jobs: true
# Path to the conda env.
# Leave as null (blank) if running native or if your cluster automatically
# inherits the login node environment.
condaenv: null
# Start
# --------------------------
sample_prefix: teaseq
scaled_obj: ../teaseq_temp.h5mu
# full_obj is only applicable if you have filtered your scaled object by HVGs.
# In that case panpipes uses full_obj to calculate rank_genes_groups and to
# plot those genes, so it should contain all the genes you want to include in
# rank_genes_groups, plus logged_counts as a layer.
# If scaled_obj already contains all the genes, leave full_obj as null (blank).
full_obj: null

# Run clustering on each individual modality:
modalities:
  rna: true
  prot: true
  atac: true

# Multimodal clustering: if run_clustering is true, the pipeline will look
# for WNN or totalVI output.
multimodal:
  run_clustering: true
  # Integration method (WNN, mofa, totalVI); this tells the pipeline where
  # to look for the integration output.
  integration_method: wnn

# batch_correction: harmony  # None, harmony, scanorama, bbknn or combat
# ---------------------------------------
# parameters for find neighbours
# ---------------------------------------
# find neighbour parameters
#-----------------------------
# number of neighbors to use when calculating the graph for clustering and umap.
neighbors:
  rna:
    use_existing: true
    # dimensionality reduction (by name) to use for neighbours and umap
    dim_red: X_pca
    # how many components to use for clustering
    n_dim_red: 30
    # number of neighbours
    k: 30
    # distance metric: euclidean | cosine
    metric: euclidean
    # neighbours method: scanpy | hnsw (from scvelo)
    method: scanpy
  prot:
    use_existing: true
    # dimensionality reduction (by name) to use for neighbours and umap
    dim_red: X_pca
    # how many components to use for clustering
    n_dim_red: 30
    # number of neighbours
    k: 30
    # distance metric: euclidean | cosine
    metric: euclidean
    # neighbours method: scanpy | hnsw (from scvelo)
    method: scanpy
  atac:
    use_existing: true
    # dimensionality reduction (by name) to use for neighbours and umap
    dim_red: X_pca
    # how many components to use for clustering
    n_dim_red: 30
    # number of neighbours
    k: 30
    # distance metric: euclidean | cosine
    metric: euclidean
    # neighbours method: scanpy | hnsw (from scvelo)
    method: scanpy

# ---------------------------------------
# parameters for umap calculation
# ---------------------------------------
umap:
  # Set run to false if you are happy with the existing umap from integration.
  run: true
  # Each modality takes a list of mindist values.
  rna:
    mindist:
      - 0.25
      - 0.5
  prot:
    mindist:
      - 0.1
  atac:
    mindist:
      - 0.5
  multimodal:
    mindist:
      - 0.5

# UMAP reduced dimensions will be stored using the format 
# ---------------------------------------
# parameters for clustering
# ---------------------------------------
clusterspecs:
  # Each modality takes a list of clustering resolutions and an algorithm.
  rna:
    resolutions:
      - 0.2
      - 0.6
      - 1
    # algorithm: louvain or leiden
    algorithm: leiden
  prot:
    resolutions:
      - 0.2
      - 0.6
      - 1
    # algorithm: louvain or leiden
    algorithm: leiden
  atac:
    resolutions:
      - 0.2
      - 0.3
    # algorithm: louvain or leiden
    algorithm: leiden
  multimodal:
    resolutions:
      - 0.5
      - 0.7
    algorithm: leiden
# ---------------------------------------
# parameters for finding marker genes
# ---------------------------------------
# where pseudo_seurat is set to False
# we run https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html
# where pseudo_seurat is set to True, a python implementation of Seurat::FindMarkers (written by CRG) is used.

markerspecs:
  rna:
    run: true
    layer: logged_counts
    # method options: ['logreg', 't-test', 'wilcoxon', 't-test_overestim_var']
    method: t-test_overestim_var
    # if a cluster contains less than mincells cells then do not bother
    # doing marker analysis
    mincells: 10
    # pseudo_seurat set to false: we run
    #   https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html
    # pseudo_seurat set to true: a python implementation of
    #   Seurat::FindMarkers (written by CRG) is used
    pseudo_seurat: false
    # minpct and threshuse do not matter unless pseudo_seurat is set to true;
    # if applicable, look at the Seurat documentation for FindMarkers
    minpct: 0.1
    threshuse: 0.25

  prot:
    # NOTE(review): run is empty (null) here, unlike rna and atac - confirm
    # whether prot marker analysis is meant to be enabled
    run: null
    layer: clr
    # if a cluster contains less than mincells cells then do not bother
    # doing marker analysis
    mincells: 10
    # method options: ['logreg', 't-test', 'wilcoxon', 't-test_overestim_var']
    method: wilcoxon
    pseudo_seurat: false
    minpct: 0.1
    threshuse: 0.25

  atac:
    run: true
    layer: signac_norm
    mincells: 10
    # method options: ['logreg', 't-test', 'wilcoxon', 't-test_overestim_var']
    method: wilcoxon
    pseudo_seurat: false
    minpct: 0.1
    threshuse: 0.25

  multimodal:
    mincells: 10
    # method options: ['logreg', 't-test', 'wilcoxon', 't-test_overestim_var']
    method: wilcoxon
    pseudo_seurat: false
    minpct: 0.1
    threshuse: 0.25


# ---------------------------------------
# plot specs are used to define which metadata columns are used in the visualisations
# ---------------------------------------
plotspecs:
  # presumably the number of top markers to show per cluster - confirm
  top_n_markers: 10
  # one layer name per modality; these match the markerspecs layers above
  layers:
    rna: logged_counts
    prot: clr
    atac: signac_norm