From de6ae27a64418398325d95c00dc4f541841ad7f4 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Thu, 18 Apr 2024 12:30:58 +0000 Subject: [PATCH] add log info --- .../python_scripts/batch_correct_bbknn.py | 4 +- .../python_scripts/batch_correct_harmony.py | 31 +++++----- panpipes/python_scripts/batch_correct_mofa.py | 26 +++++---- .../python_scripts/batch_correct_multivi.py | 57 +++++++++--------- .../python_scripts/batch_correct_scanorama.py | 34 +++++------ panpipes/python_scripts/batch_correct_scvi.py | 44 +++++++------- .../python_scripts/batch_correct_totalvi.py | 58 +++++++++++-------- panpipes/python_scripts/batch_correct_wnn.py | 38 ++++++------ 8 files changed, 152 insertions(+), 140 deletions(-) diff --git a/panpipes/python_scripts/batch_correct_bbknn.py b/panpipes/python_scripts/batch_correct_bbknn.py index a3b5f8ff..2588c73c 100644 --- a/panpipes/python_scripts/batch_correct_bbknn.py +++ b/panpipes/python_scripts/batch_correct_bbknn.py @@ -68,7 +68,7 @@ sc.tl.pca(adata, n_comps=min(50,adata.var.shape[0]-1), svd_solver='arpack', random_state=0) if "X_pca" not in adata.obsm: - L.info("X_pca could not be found in adata.obsm. Computing PCA with default parameters.") + L.warning("X_pca could not be found in adata.obsm. Computing PCA with default parameters.") n_pcs = 50 if adata.var.shape[0] < n_pcs: L.info("You have less features than number of PCs you intend to calculate") @@ -103,7 +103,7 @@ n_pcs = int(args.neighbors_n_pcs), neighbors_within_batch=nnb) # calculates the neighbours -L.info("Calculating UMAP") +L.info("Computing UMAP") sc.tl.umap(adata) # write out diff --git a/panpipes/python_scripts/batch_correct_harmony.py b/panpipes/python_scripts/batch_correct_harmony.py index 28f75388..f093f932 100644 --- a/panpipes/python_scripts/batch_correct_harmony.py +++ b/panpipes/python_scripts/batch_correct_harmony.py @@ -56,14 +56,12 @@ args, opt = parser.parse_known_args() - -L.info("reading data and starting integration pipeline with script: ") -L.info(os.path.basename(__file__)) -L.info("Running with options: %s", args) +L.info(args) # this should be an object that contains the full log normalised data (adata_log1p.h5ad) # prior to hvgs and filtering #adata = read_anndata(args.input_anndata, use_muon=use_muon, modality=args.modality) +L.info("Reading in MuData from '%s'" % args.input_anndata) mdata = mu.read(args.input_anndata) adata = mdata.mod[args.modality] @@ -76,14 +74,16 @@ dimred = "X_lsi" if dimred not in adata.obsm: - L.info("i need a dimred to start, computing pca with default param") + L.warning("Dimred '%s' could not be found in adata.obsm. Computing PCA with default parameters." % dimred) dimred = "X_pca" n_pcs = 50 if adata.var.shape[0] < n_pcs: L.info("You have less features than number of PCs you intend to calculate") n_pcs = adata.var.shape[0] - 1 - L.info("Setting n PCS to %i" % int(n_pcs)) + L.info("Setting n PCS to %i" % int(n_pcs)) + L.info("Scaling data") sc.pp.scale(adata) + L.info("Computing PCA") sc.tl.pca(adata, n_comps=n_pcs, svd_solver='arpack', random_state=0) @@ -91,14 +91,14 @@ if len(columns)>1: - L.info("using 2 columns to integrate on more variables") + L.info("Using 2 columns to integrate on more variables") #comb_columns = "_".join(columns) adata.obs["comb_columns"] = adata.obs[columns].apply(lambda x: '|'.join(x), axis=1) # make sure that batch is a categorical adata.obs["comb_columns"] = adata.obs["comb_columns"].astype("category") # run harmony - + L.info("Running Harmony") ho = hm.run_harmony(adata.obsm[dimred][:,0:int(args.harmony_npcs)], adata.obs, ["comb_columns"], sigma = float(args.sigma_val),theta = float(args.theta_val),verbose=True,max_iter_kmeans=30, max_iter_harmony=40) @@ -107,6 +107,7 @@ # make sure that batch is a categorical adata.obs[args.integration_col] = adata.obs[args.integration_col].astype("category") # run harmony + L.info("Running Harmony") ho = hm.run_harmony(adata.obsm[dimred][:,0:int(args.harmony_npcs)], adata.obs, [args.integration_col], @@ -116,11 +117,9 @@ max_iter_harmony=40) -L.info("integration run now calculate umap") - +L.info("Saving harmony co-ords to .obsm['X_harmony']") adjusted_pcs = pd.DataFrame(ho.Z_corr).T adata.obsm['X_harmony']=adjusted_pcs.values -L.info("harmony co-ords derived") if int(args.neighbors_n_pcs) >adata.obsm['X_harmony'].shape[1]: L.warn(f"N PCs is larger than X_harmony dimensions, reducing n PCs to {adata.obsm['X_harmony'].shape[1] -1}") @@ -128,6 +127,7 @@ n_pcs= min(int(args.neighbors_n_pcs), adata.obsm['X_harmony'].shape[1]-1) # run neighbours and umap +L.info("Computing neighbors") run_neighbors_method_choice(adata, method=args.neighbors_method, n_neighbors=int(args.neighbors_k), @@ -137,10 +137,11 @@ nthreads=max([threads_available, 6])) -L.info("done n_neighbours, moving to umap") +L.info("Computing UMAP") sc.tl.umap(adata) -L.info("done umap, saving stuff") + # write out +L.info("Saving UMAP coordinates to csv file '%s" % args.output_csv) umap = pd.DataFrame(adata.obsm['X_umap'], adata.obs.index) umap.to_csv(args.output_csv) @@ -149,11 +150,9 @@ outfiletmp = ("tmp/harmony_scaled_adata_" + args.modality + ".h5ad" ) - -L.info("saving harmony corrected adata") +L.info("Saving AnnData to '%s'" % outfiletmp) write_anndata(adata, outfiletmp, use_muon=False, modality=args.modality) - L.info("Done") diff --git a/panpipes/python_scripts/batch_correct_mofa.py b/panpipes/python_scripts/batch_correct_mofa.py index 6a3fff6d..8a4d2d62 100644 --- a/panpipes/python_scripts/batch_correct_mofa.py +++ b/panpipes/python_scripts/batch_correct_mofa.py @@ -68,30 +68,31 @@ threads_available = multiprocessing.cpu_count() params = pp.io.read_yaml("pipeline.yml") +L.info("Reading in MuData from '%s'" % args.scaled_anndata) mdata = mu.read(args.scaled_anndata) if params['multimodal']['mofa']['modalities'] is not None: modalities= params['multimodal']['mofa']['modalities'] modalities = [x.strip() for x in modalities.split(",")] - L.info(f"using modalities :{modalities}") + L.info(f"Using modalities :{modalities}") removed_mods = None if all(x in modalities for x in mdata.mod.keys()): tmp = mdata.copy() - L.info('using all modalities') + L.info('Using all modalities') else: tmp = mdata.copy() removed_mods = list(set(mdata.mod.keys()) - set(modalities)) - L.info(f"removing modalities {removed_mods}") + L.info(f"Removing modalities {removed_mods}") for rmod in removed_mods: del tmp.mod[rmod] else: - L.warning("""you specified no modalities, so i will default to all available - this may be a problem if you have repertoire in here""") + L.warning("""No modalities were specified, therefore all available modalities will be used. + This may be problematic if repertoire is present as modality.""") removed_mods = None tmp = mdata.copy() -print(tmp.mod.keys()) +L.info("Intersecting modality obs before running mofa") mu.pp.intersect_obs(tmp) mofa_kwargs={} @@ -111,7 +112,6 @@ del mofa_kwargs['filter_by_hvg'] for mod in tmp.mod.keys(): if "highly_variable" not in tmp[mod].var.columns: - print(mod) tmp[mod].var["highly_variable"] = True tmp.update() @@ -148,20 +148,19 @@ if args.use_gpu is True or args.use_gpu=='True': mofa_kwargs['gpu_mode'] = True -L.info(mofa_kwargs) -L.info(tmp.var.columns) - +L.info("Running mofa") mu.tl.mofa(tmp, **mofa_kwargs) #This adds X_mofa embeddings to the .obsm slot of the MuData object #write the discovered latent rep to the original mudata object +L.info("Adding X_mofa to .obsm of MuData") mdata.obsm['X_mofa'] = tmp.obsm['X_mofa'].copy() if int(args.neighbors_n_pcs) > mdata.obsm['X_mofa'].shape[1]: L.warning(f"N PCs is larger than X_mofa dimensions, reducing n PCs to {mdata.obsm['X_mofa'].shape[1]-1}") n_pcs= min(int(args.neighbors_n_pcs), mdata.obsm['X_mofa'].shape[1]-1) - +L.info("Computing neighbors") run_neighbors_method_choice(mdata, method=args.neighbors_method, n_neighbors=int(args.neighbors_k), @@ -170,9 +169,12 @@ use_rep='X_mofa', nthreads=max([threads_available, 6])) +L.info("Computing UMAP") sc.tl.umap(mdata, min_dist=0.4) +L.info("Computing Leiden clustering") sc.tl.leiden(mdata, key_added="leiden_mofa") +L.info("Saving UMAP coordinates to csv file '%s" % args.output_csv) umap = pd.DataFrame(mdata.obsm['X_umap'], mdata.obs.index) umap.to_csv(args.output_csv) @@ -180,8 +182,8 @@ for rmd in removed_mods: tmp.mod[rmd] = mdata.mod[rmd].copy() +L.info("Saving MuData to 'tmp/mofa_scaled_adata.h5mu'") tmp.write("tmp/mofa_scaled_adata.h5mu") - L.info("Done") diff --git a/panpipes/python_scripts/batch_correct_multivi.py b/panpipes/python_scripts/batch_correct_multivi.py index 93032afe..38e776c6 100644 --- a/panpipes/python_scripts/batch_correct_multivi.py +++ b/panpipes/python_scripts/batch_correct_multivi.py @@ -51,6 +51,7 @@ args, opt = parser.parse_known_args() +L.info(args) # scanpy settings sc.set_figure_params(facecolor="white") @@ -69,8 +70,7 @@ params['MultiVI']['training_args']['max_epochs'] = 10 # ------------------------------------------------------------------ -L.info("Running multivi") - +L.info("Reading in MuData from '%s'" % args.scaled_anndata) mdata = mu.read(args.scaled_anndata) rna = mdata['rna'].copy() atac = mdata['atac'].copy() @@ -79,9 +79,9 @@ if check_for_bool(params["multimodal"]["MultiVI"]["lowmem"]): if 'hvg' in atac.uns.keys(): - L.info("subsetting atac to top HVF") + L.info("Subsetting ATAC to HVFs") atac = atac[:, atac.var.highly_variable].copy() - L.info("calculating and subsetting atac to top 25k HVF") + L.info("Calculating and subsetting ATAC to top 25k HVF") sc.pp.highly_variable_genes(atac, n_top_genes=25000) atac = atac[:, atac.var.highly_variable].copy() @@ -98,8 +98,8 @@ if rna.shape[0] == atac.shape[0]: n=int(rna.shape[0]) else: - sys.exit("rna and atac have different number of cells, \ - can't deal with this in this version of MultiVI integration") + sys.exit("RNA and ATAC have different number of cells, \ + Can't deal with this in this version of MultiVI integration") gc.collect() @@ -108,24 +108,27 @@ if "raw_counts" in rna.layers.keys(): - L.info("raw counts found in layer for RNA") + L.info("Found raw RNA counts in .layers['raw_counts']") elif X_is_raw(rna): # this means the X layer is already raw and we can make the layer we need - L.info("raw counts found in X for RNA") + L.info("Found raw RNA counts in .X. Saving raw RNA counts to .layers['raw_counts']") rna.layers["raw_counts"] = rna.X.copy() else: - sys.exit("please provide a mu/anndata with raw counts for RNA") + L.error("Could not find raw counts for RNA in .X and .layers['raw_counts']") + sys.exit("Could not find raw counts for RNA in .X and .layers['raw_counts']") + if "raw_counts" in atac.layers.keys(): - L.info("raw counts found in layer for ATAC") + L.info("Found raw ATAC counts in .layers['raw_counts']") elif X_is_raw(atac): # this means the X layer is already raw and we can make the layer we need - L.info("raw counts found in X for ATAC") + L.info("Found raw ATAC counts in .X. Saving raw ATAC counts to .layers['raw_counts']") atac.layers["raw_counts"] = atac.X.copy() else: - sys.exit("please provide a mu/anndata with raw counts for ATAC") + L.error("Could not find raw counts for ATAC in .X and .layers['raw_counts']") + sys.exit("Could not find raw counts for ATAC in .X and .layers['raw_counts']") -L.info("concatenating modalities to comply with multiVI") +L.info("Concatenating modalities to comply with multiVI") # adata_paired = ad.concat([rna, atac], join="outer") # adata_paired.var = pd.concat([rna.var,atac.var]) if rna.is_view: @@ -134,10 +137,6 @@ if atac.is_view: L.info("ATAC is view") atac = atac.copy() - -L.info(atac.is_view) - - adata_paired = ad.concat([rna.T, atac.T]).T rna_cols=rna.obs.columns @@ -152,7 +151,6 @@ if "modality" not in adata_paired.obs.columns: adata_paired.obs["modality"] = "paired" -print(adata_paired) # adata = ad.concat([rna,atac],join="outer") # adata.var = pd.concat([rna.var,atac.var]) @@ -160,6 +158,8 @@ del [rna , atac ] gc.collect() + +L.info("Organizing multiome AnnDatas") adata_mvi = scvi.data.organize_multiome_anndatas(adata_paired) @@ -170,10 +170,8 @@ columns = [] for cc in cols: if cc in rna_cols: - print("categorical batch covariate in rna") columns.append("rna:"+cc) elif cc in atac_cols: - print("categorical batch covariate in atac") columns.append("atac:"+cc) if args.integration_col_continuous is not None : if args.integration_col_continuous in rna_cols: @@ -187,7 +185,7 @@ if columns is not None: print(columns) if len(columns) > 1: - L.info("using 2 columns to integrate on more variables") + L.info("Using 2 columns to integrate on more variables") # bc_batch = "_".join(columns) adata_mvi.obs["bc_batch"] = adata_mvi.obs[columns].apply(lambda x: '|'.join(x), axis=1) # make sure that batch is a categorical @@ -204,10 +202,10 @@ adata_mvi.obs['bc_batch_continuous'] = adata_mvi.obs[args.integration_col_continuous] kwargs['continuous_covariate_keys'] = ["bc_batch_continuous"] -print(kwargs) # 1 setup anndata #scvi.model.MULTIVI.setup_anndata(adata_mvi, batch_key='modality', **kwargs) +L.info("Setting up AnnData") scvi.model.MULTIVI.setup_anndata( adata_mvi, batch_key='modality', @@ -219,8 +217,7 @@ else: multivi_model_args = {k: v for k, v in params["multimodal"]['MultiVI']['model_args'].items() if v is not None} -print(multivi_model_args) - +L.info("Defining model") mvi = scvi.model.MULTIVI( adata_mvi, n_genes=n_genes, @@ -234,23 +231,21 @@ multivi_training_args={} else: multivi_training_args = {k: v for k, v in params["multimodal"]['MultiVI']['training_args'].items() if v is not None} -L.info("multivi training args") -print(multivi_training_args) if params["multimodal"]['MultiVI']['training_plan'] is None: multivi_training_plan = {} else: multivi_training_plan = {k: v for k, v in params["multimodal"]['MultiVI']['training_plan'].items() if v is not None} -L.info("multivi training plan") -print(multivi_training_plan) mvi.view_anndata_setup() +L.info("Running multiVI") mvi.train( **multivi_training_args, **multivi_training_plan) mvi.save(os.path.join("batch_correction", "multivi_model"), anndata=False) +L.info("Plotting ELBO") plt.plot(mvi.history["elbo_train"], label="train") plt.plot(mvi.history["elbo_validation"], label="validation") plt.title("Negative ELBO over training epochs") @@ -267,6 +262,7 @@ """) mdata = mu.read(args.scaled_anndata) +L.info("Extracting latent space and saving latent to X_MultiVI") mdata.obsm["X_MultiVI"] = mvi.get_latent_representation() #adata_mvi.obsm["X_MultiVI"] = mvi.get_latent_representation() @@ -274,6 +270,7 @@ L.warn(f"N PCs is larger than X_MultiVI dimensions, reducing n PCs to {mdata.obsm['X_MultiVI'].shape[1] -1}") n_pcs= min(int(args.neighbors_n_pcs), mdata.obsm['X_MultiVI'].shape[1]-1) +L.info("Computing neighbors") run_neighbors_method_choice(mdata, method=args.neighbors_method, n_neighbors=int(args.neighbors_k), @@ -281,12 +278,16 @@ metric=args.neighbors_metric, use_rep='X_MultiVI', nthreads=max([threads_available, 6])) +L.info("Computing UMAP") sc.tl.umap(mdata, min_dist=0.4) +L.info("Computing Leiden clustering") sc.tl.leiden(mdata, key_added="leiden_multiVI") +L.info("Saving UMAP coordinates to csv file '%s" % args.output_csv) umap = pd.DataFrame(mdata.obsm['X_umap'], mdata.obs.index) umap.to_csv(args.output_csv) +L.info("Saving MuData to 'tmp/multivi_scaled_adata.h5mu'") write_anndata(mdata, "tmp/multivi_scaled_adata.h5mu",use_muon=False) L.info("Done") diff --git a/panpipes/python_scripts/batch_correct_scanorama.py b/panpipes/python_scripts/batch_correct_scanorama.py index 26980ee9..ecc8d64c 100644 --- a/panpipes/python_scripts/batch_correct_scanorama.py +++ b/panpipes/python_scripts/batch_correct_scanorama.py @@ -57,16 +57,12 @@ args, opt = parser.parse_known_args() - - -L.info("Running with options: %s", args) +L.info(args) # Scanorama is designed to be used in scRNA-seq pipelines downstream of noise-reduction methods, # including those for imputation and highly-variable gene filtering. -L.info("reading data and starting integration pipeline with script: ") -L.info(os.path.basename(__file__)) - #adata = read_anndata(args.input_anndata, use_muon=use_muon, modality="rna") +L.info("Reading in MuData from '%s'" % args.input_anndata) mdata = mu.read(args.input_anndata) adata = mdata.mod[args.modality] bcs = adata.obs_names.tolist() @@ -74,7 +70,7 @@ # scanorama can't integrate on 2+ variables, so create a fake column with combined information columns = [x.strip() for x in args.integration_col.split(",")] if len(columns) > 1: - L.info("using 2 columns to integrate on more variables") + L.info("Using 2 columns to integrate on more variables") # comb_columns = "_".join(columns) adata.obs["comb_columns"] = adata.obs[columns].apply(lambda x: '|'.join(x), axis=1) @@ -86,22 +82,22 @@ adata.obs['batch'] = adata.obs[args.integration_col] batches = adata.obs['batch'].unique() -L.info("define scanorama inputs based on batches and number of PCs") # need contiguous batches scanorama_input = adata[adata.obs.sort_values(by="batch").index.tolist(), :] # filter by HVGs to make it equivalent to the old scripts, # which inputted the scaled object after filtering by hvgs. +L.info("Filtering data by HVGs") scanorama_input = scanorama_input[:, scanorama_input.var.highly_variable] #also filter the X_PCA to be the number of PCs we actually want to use +L.info("Subsetting X_pca to %s PCs" % args.neighbors_n_pcs) scanorama_input.obsm['X_pca'] = scanorama_input.obsm['X_pca'][:,0:int(args.neighbors_n_pcs)] # run scanoramam using the scanpy integrated approach -L.info("run_scanorama") +L.info("Running scanorama") sce.pp.scanorama_integrate(scanorama_input, key='batch', batch_size=int(args.batch_size)) -L.info("scanorama done") # not integrated @@ -118,22 +114,22 @@ scanorama_input = scanorama_input[bcs, :] # check it worked if all(scanorama_input.obs_names == bcs): - L.info("barcode order is correct") + L.info("Barcode order is correct") else: - L.debug("barcode order in batch corrected object is incorrect") - sys.exit("barcode order in batch corrected object is incorrect") + L.error("Barcode order in batch corrected object is incorrect") + sys.exit("Barcode order in batch corrected object is incorrect") -L.info(adata) +L.info("Saving scanorama embedding to X_scanorama") # add to the AnnData object adata.obsm["X_scanorama"] = scanorama_input.obsm["X_scanorama"] -L.info("integration run now calculate umap") if int(args.neighbors_n_pcs) > adata.obsm['X_scanorama'].shape[1]: L.warn(f"N PCs is larger than X_scanorama dimensions, reducing n PCs to {adata.obsm['X_scanorama'].shape[1] -1}") n_pcs= min(int(args.neighbors_n_pcs), adata.obsm['X_scanorama'].shape[1]-1) # run neighbours and umap +L.info("Computing neighbors") run_neighbors_method_choice(adata, method=args.neighbors_method, n_neighbors=int(args.neighbors_k), @@ -142,15 +138,17 @@ use_rep='X_scanorama', nthreads=max([threads_available, 6])) - +L.info("Computing UMAP") sc.tl.umap(adata) -L.info("done umap, saving stuff") + # write out umap = pd.DataFrame(adata.obsm['X_umap'], adata.obs.index) +L.info("Saving UMAP coordinates to csv file '%s" % args.output_csv) umap.to_csv(args.output_csv) # save the scanorama dim reduction in case scanorama is our favourite +L.info("Saving AnnData to 'tmp/scanorama_scaled_adata.h5ad'") adata.write("tmp/scanorama_scaled_adata.h5ad") -L.info("done") +L.info("Done") diff --git a/panpipes/python_scripts/batch_correct_scvi.py b/panpipes/python_scripts/batch_correct_scvi.py index 6c6ac24b..f8b73383 100644 --- a/panpipes/python_scripts/batch_correct_scvi.py +++ b/panpipes/python_scripts/batch_correct_scvi.py @@ -53,8 +53,6 @@ # load parameters params = read_yaml("pipeline.yml") -L.info("Running scvi script") -L.info("threads available: %s", threads_available) # scanpy settings sc.set_figure_params(facecolor="white") @@ -72,9 +70,7 @@ # load an process the scaled snRNAseq dataset -------------------------------------------------- -L.info("loading in anndata object ") - - +L.info("Reading in Data from '%s'" % args.scaled_anndata) mdata = mu.read(args.scaled_anndata) if type(mdata) is mu.MuData: rna = mdata['rna'] @@ -83,8 +79,8 @@ rna = mdata L.info(rna) else: - L.error("unknown file type") - sys.exit("file not MuData or Anndata") + L.error("Unknown file type. File not MuData or Anndata") + sys.exit("Unknown file type. File not MuData or Anndata") # this is a copy so we can subset vars and obs without changing the original object @@ -92,7 +88,7 @@ # in case of more than 1 variable, create a fake column with combined information columns = [x.strip() for x in args.integration_col.split(",")] if len(columns) > 1: - L.info("using 2 columns to integrate on more variables") + L.info("Using 2 columns to integrate on more variables") # bc_batch = "_".join(columns) rna.obs["bc_batch"] = rna.obs[columns].apply(lambda x: '|'.join(x), axis=1) @@ -105,22 +101,26 @@ # add in raw counts as a layer # add in test to see if raw layer exists alread if "raw_counts" in rna.layers.keys(): - L.info("raw counts found in layer") + L.info("Found raw counts in .layers['raw_counts']") elif X_is_raw(rna): + L.info("Found raw counts in .X. Saving raw counts to .layers['raw_counts']") rna.layers["raw_counts"] = rna.X.copy() else: - L.info("merge in raw counts") + L.info("Merging in raw counts from '%s" % args.raw_anndata) sc_raw = read_anndata(args.raw_anndata, use_muon=True, modality="rna") #filter by barcodes in the scaled object sc_raw = sc_raw[sc_raw.obs_names.isin(rna.obs_names),: ] + L.info("Saving raw counts to .layers['raw_counts]") rna.layers["raw_counts"] = sc_raw.X.copy() # filter out mitochondria if params['rna']['scvi']['exclude_mt_genes']: + L.info("Filtering out mitochondrial genes") rna = rna[:, ~rna.var[params['rna']['scvi']['mt_column']]] # filter by Hvgs +L.info("Filtering by HVGs") rna = rna[:, rna.var.highly_variable] @@ -131,31 +131,28 @@ rna = rna.copy() -# scvi.model.data.setup_anndata( + +L.info("Setting up AnnData") scvi.model.SCVI.setup_anndata( rna, layer="raw_counts", batch_key='bc_batch' ) - scvi_model_args = {k: v for k, v in params['rna']['scvi']['model_args'].items() if v is not None} -print(scvi_model_args) scvi_training_args = {k: v for k, v in params['rna']['scvi']['training_args'].items() if v is not None} -print(scvi_training_args) scvi_training_plan = {k: v for k, v in params['rna']['scvi']['training_plan'].items() if v is not None} -print(scvi_training_plan) - - +L.info("Defining model") vae = scvi.model.SCVI(rna, **scvi_model_args) - +L.info("Running scVI") vae.train(**scvi_training_args, plan_kwargs=scvi_training_plan) vae.save(os.path.join("batch_correction", "scvi_model"), anndata=False) # no early stopping? +L.info("Plotting ELBO") vae.history["elbo_train"].plot() plt.savefig(os.path.join(args.figdir, "scvi_elbo_train.png")) @@ -168,12 +165,13 @@ plt.savefig(os.path.join(args.figdir, "scvi_metrics.png")) # vae.history['elbo_train']['elbo_train'].to_list() - +L.info("Extracting latent space") latent = vae.get_latent_representation() - +L.info("Saving latent to X_scVI") rna.obsm["X_scVI"] = latent rna.obs['bc_batch'] = rna.obs['bc_batch'] +L.info("Plotting latent") sc.pl.embedding(rna, "X_scVI", color="bc_batch", save="_batch.png") plot_df = pd.DataFrame(latent[:, 0:2]) @@ -184,9 +182,9 @@ n_pcs= min(int(args.neighbors_n_pcs), rna.obsm['X_scVI'].shape[1]-1) -print(rna.obsm['X_scVI'].shape) # use scVI latent space for UMAP generation # sc.pp.neighbors(rna,n_neighbors=int(args.n_neighbors), use_rep="X_scVI") +L.info("Computing neighbors") run_neighbors_method_choice(rna, method=args.neighbors_method, n_neighbors=int(args.neighbors_k), @@ -195,18 +193,22 @@ use_rep='X_scVI', nthreads=max([threads_available, 6])) +L.info("Computing UMAP") sc.tl.umap(rna) +L.info("Plotting UMAP") sc.pl.umap( rna, color=["bc_batch",], frameon=False, save="_scvi_batch.png" ) +L.info("Saving UMAP coordinates to csv file '%s" % args.output_csv) umap = pd.DataFrame(rna.obsm['X_umap'], rna.obs.index) umap.to_csv(args.output_csv) # save anndata to be used by other scvi tools applications +L.info("Saving AnnData to 'tmp/scvi_scaled_adata_rna.h5ad'") rna.write(os.path.join("tmp", "scvi_scaled_adata_rna.h5ad")) L.info("Done") diff --git a/panpipes/python_scripts/batch_correct_totalvi.py b/panpipes/python_scripts/batch_correct_totalvi.py index b7214484..d3fd34d7 100644 --- a/panpipes/python_scripts/batch_correct_totalvi.py +++ b/panpipes/python_scripts/batch_correct_totalvi.py @@ -70,11 +70,11 @@ params['multimodal']['totalvi']['training_args']['max_epochs'] = 10 # ------------------------------------------------------------------ -L.info("Running totalvi script") - +L.info("Reading in MuData from '%s'" % args.scaled_anndata) mdata = mu.read(args.scaled_anndata) # we need to intersect the prot and rna modalities. +L.info("Intersecting prot and rna modalities") intersect_obs_by_mod(mdata, ['rna', 'prot']) # this is a copy so we can subset vars and obs without changing the original object @@ -87,7 +87,7 @@ columns = [x.strip() for x in args.integration_col_categorical.split(",")] columns =[ x.replace("rna:","") for x in columns] if len(columns) > 1: - L.info("using 2 columns to integrate on more variables") + L.info("Using 2 columns to integrate on more variables") # bc_batch = "_".join(columns) rna.obs["bc_batch"] = rna.obs[columns].apply(lambda x: '|'.join(x), axis=1) # make sure that batch is a categorical @@ -103,24 +103,25 @@ rna.obs['bc_batch_continuous'] = rna.obs[args.integration_col_continuous] kwargs["continuous_covariate_keys"] = ["bc_batch_continuous"] -L.debug(kwargs) # add in raw counts as a layer # try to find or load the raw counts if "counts" in rna.layers.keys(): - L.info("raw counts found in layer") + L.info("Found raw RNA counts in .layers['counts']") elif "raw_counts" in rna.layers.keys(): + L.info("Found raw RNA counts in .layers['raw_counts']") + L.info("Saving raw RNA counts to .layers['counts']") rna.layers["counts"] = rna.layers["raw_counts"].copy() - L.info("raw counts found in layer") elif X_is_raw(rna): # this means the X layer is already raw and we can make the layer we need - L.info("raw counts found in X") + L.info("Found raw RNA counts in .X. Saving raw RNA counts to .layers['counts']") rna.layers["counts"] = rna.X.copy() else: - L.info("merge in raw counts") + L.info("Merging in raw RNA counts from '%s" % args.raw_anndata) sc_raw = read_anndata(args.raw_anndata, use_muon=True, modality="rna") #filter by barcodes in the scaled object sc_raw = sc_raw[sc_raw.obs_names.isin(rna.obs_names),: ] + L.info("Saving raw RNA counts to .layers['counts]") rna.layers["counts"] = sc_raw.X.copy() # filter prot outliers @@ -128,23 +129,25 @@ # for this to work the user needs to (manually) make a column called prot_outliers # actually there is a thing in the qc pipe that calculates outliers, I don't like it very much though if "prot_outliers" in mdata['prot'].columns: + L.info("Filtering out prot outliers") mu.pp.filter_obs(mdata, "prot_outliers") else: - raise ValueError("prot_outliers column not found in mdata['prot'].obs") + raise ValueError("'prot_outliers' column not found in mdata['prot'].obs") # exluding isotypes if 'isotype' in prot.var.columns: + L.info("Excluding isotypes") prot = prot[:, ~prot.var.isotype] # filter out mitochondria if params['multimodal']['totalvi']['exclude_mt_genes']: - L.info("excluding mt genes") + L.info("Filtering out mitochondrial genes") rna = rna[:, ~rna.var[params['multimodal']['totalvi']['mt_column']]] # filter by Hvgs if params['multimodal']['totalvi']['filter_by_hvg']: - L.info("filtering by highly_variable") + L.info("Filtering by HVGs") rna = rna[:, rna.var.highly_variable] mdata.update() @@ -152,24 +155,27 @@ #make protein obsm pandas # need to find the raw counts in prot if X_is_raw(prot): + L.info("Found raw prot counts in .X") X_array = prot.X.copy() elif "raw_counts" in prot.layers.keys(): + L.info("Found raw prot counts in .layers['raw_counts']") X_array = prot.layers['raw_counts'].copy() else: - raise AttributeError("raw counts not found in prot, \ - store in either X or in 'raw_counts' layer") + raise AttributeError("Raw counts not found in prot, \ + Store in either .X or in 'raw_counts' layer") X_df = pd.DataFrame(X_array.todense(), index=prot.obs_names, columns=prot.var_names) if X_df.shape[0] == rna.X.shape[0]: - L.info("adding protein_expression to obsm") + L.info("Adding protein_expression to .obsm['protein_expression']") #check the obs are in the correct order X_df = X_df.loc[rna.obs_names,:] X_df = X_df.astype('int') rna.obsm['protein_expression'] = X_df else: - L.error("dimensions do not match, cannot integrate protein") + L.error("Dimension 0 of protein expression matrix and RNA expression matrix do not match") + sys.exit("Dimension 0 of protein expression matrix and RNA expression matrix do not match") # clear up to preserve ram del X_array, X_df @@ -177,7 +183,7 @@ mdata.update() -L.debug(rna.obs.columns) +L.info("Setting up AnnData") scvi.model.TOTALVI.setup_anndata( rna, layer="counts", @@ -191,31 +197,28 @@ else: totalvi_model_args = {k: v for k, v in params['multimodal']['totalvi']['model_args'].items() if v is not None} -print(totalvi_model_args) if params['multimodal']['totalvi']['training_args'] is None: totalvi_training_args = {} else: totalvi_training_args = {k: v for k, v in params['multimodal']['totalvi']['training_args'].items() if v is not None} -print(totalvi_training_args) if params['multimodal']['totalvi']['training_plan'] is None: totalvi_training_plan = {} else: totalvi_training_plan = {k: v for k, v in params['multimodal']['totalvi']['training_plan'].items() if v is not None} -print(totalvi_training_plan) - - +L.info("Defining model") vae = scvi.model.TOTALVI(rna, **totalvi_model_args) - +L.info("Running totalVI") vae.train(**totalvi_training_args, plan_kwargs=totalvi_training_plan) vae.save(os.path.join("batch_correction", "totalvi_model"), anndata=False, overwrite=True ) +L.info("Plotting ELBO") plt.plot(vae.history["elbo_train"], label="train") plt.plot(vae.history["elbo_validation"], label="validation") plt.title("Negative ELBO over training epochs") @@ -234,10 +237,11 @@ Mudata obsm slot, and any single modality processing in its own modality slot """) + +L.info("Extracting latent space and saving latent to X_totalVI") mdata.obsm["X_totalVI"] = vae.get_latent_representation() if batch_categories is not None: - L.debug(batch_categories) if type(batch_categories) is not list: batch_categories = [batch_categories] normX, protein = vae.get_normalized_expression( @@ -245,7 +249,7 @@ return_mean=True, transform_batch=batch_categories ) - + L.info("Saving denoised data to .obsm['totalvi_denoised_rna'] and .obsm['totalvi_denoised_protein']") # this is stored in obsm, because if we have filtered by hvg, it no longer fits specifications for a layer mdata['rna'].obsm["totalvi_denoised_rna"] = normX # reorder if rna and prot are in different orders @@ -256,6 +260,7 @@ return_mean=True, transform_batch=batch_categories ) + L.info("Saving protein foreground prob to .obsm['totalvi_protein_foreground_prob']") mdata['prot'].obsm["totalvi_protein_foreground_prob"] = df.loc[mdata['prot'].obs_names,:] mdata.update() @@ -268,6 +273,7 @@ # neighbors function has been made to magically calculate mudata neighbors using sc.pp.neighbors # because we want to use_rep = X_totalvi (as a pseudo single modality) not run multimodal mu.pp.neighbors +L.info("Computing neighbors") run_neighbors_method_choice(mdata, method=args.neighbors_method, n_neighbors=int(args.neighbors_k), @@ -276,12 +282,16 @@ use_rep='X_totalVI', nthreads=max([threads_available, 6])) +L.info("Computing UMAP") sc.tl.umap(mdata, min_dist=0.4) +L.info("Computing Leiden clustering") sc.tl.leiden(mdata, key_added="leiden_totalVI") +L.info("Saving UMAP coordinates to csv file '%s" % args.output_csv) umap = pd.DataFrame(mdata.obsm['X_umap'], mdata['rna'].obs.index) umap.to_csv(args.output_csv) +L.info("Saving MuData to 'tmp/totalvi_scaled_adata.h5mu'") mdata.write("tmp/totalvi_scaled_adata.h5mu") L.info("Done") diff --git a/panpipes/python_scripts/batch_correct_wnn.py b/panpipes/python_scripts/batch_correct_wnn.py index ccee3892..3032ec75 100644 --- a/panpipes/python_scripts/batch_correct_wnn.py +++ b/panpipes/python_scripts/batch_correct_wnn.py @@ -56,14 +56,15 @@ if params['multimodal']['WNN']['modalities'] is not None: modalities= params['multimodal']['WNN']['modalities'] modalities = [x.strip() for x in modalities.split(",")] - L.info(f"using modalities :{modalities}") + L.info(f"Using modalities :{modalities}") -L.info("running with batch corrections:") +L.info("Running with batch corrections:") wnn_params_bc = params['multimodal']['WNN']['batch_corrected'] if modalities is not None: wnn_params_bc= {k: wnn_params_bc[k] for k in wnn_params_bc.keys() & modalities} L.info( wnn_params_bc ) +L.info("Reading in MuData from '%s'" % args.scaled_anndata) mdata = mu.read(args.scaled_anndata) if all(x in modalities for x in mdata.mod.keys()): @@ -72,11 +73,11 @@ else: tmp = mdata.copy() removed_mods = list(set(mdata.mod.keys()) - set(modalities)) - L.info(f"removing modalities {removed_mods}") + L.info(f"Removing modalities {removed_mods}") for rmod in removed_mods: del tmp.mod[rmod] -L.info("intersecting modality obs before running wnn") +L.info("Intersecting modality obs before running WNN") mu.pp.intersect_obs(tmp) @@ -90,7 +91,6 @@ else: dict_graph[x]["obsm"] = None -L.debug(dict_graph) if dict_graph["rna"]["obsm"] == "X_scvi": dict_graph["rna"]["obsm"] = "X_scVI" @@ -100,13 +100,13 @@ pkmod=params['multimodal']['WNN']['knn'][kmod] if dict_graph[kmod]["obsm"] is not None: if dict_graph[kmod]["obsm"] not in tmp.mod[kmod].obsm.keys(): - L.info("provided mdata doesn't have the desired obsm, just checking if it's bbknn you want.") + L.info("Provided mdata doesn't have the desired obsm, just checking if it's bbknn you want.") if dict_graph[kmod]["obsm"] == "X_bbknn": if len(tmp.mod[kmod].obsp.keys()) > 0 and "neighbors" in tmp.mod[kmod].uns.keys() : - L.info("i found a populated obsp slot and I assume it's bbknn") + L.info("Populated obsp slot found. Assuming it's bbknn") else: if dict_graph[kmod]["anndata"] is not None: - L.info("reading precomputed connectivities for bbknn") + L.info("Reading precomputed connectivities for bbknn") adata = mu.read(dict_graph[kmod]["anndata"]) tmp.mod[kmod].obsp = adata.obsp.copy() tmp.mod[kmod].obsm[dict_graph[kmod]["obsm"]] = adata.obsm[dict_graph[kmod]["obsm"]].copy() @@ -114,7 +114,7 @@ tmp.update() else: if dict_graph[kmod]["anndata"] is not None: - L.info("provided mdata doesn't have the desired obsm. reading the batch corrected data from another stored object") + L.info("Provided mdata doesn't have the desired obsm. reading the batch corrected data from another stored object") adata = mu.read(dict_graph[kmod]["anndata"]) L.debug(kmod + "object") L.debug(adata) @@ -124,15 +124,15 @@ tmp.update() repuse = dict_graph[kmod]["obsm"] else: - L.info("could not find the desired obsm and the anndata slot is empty, will calculate on the flight") + L.warning("Could not find the desired obsm and the anndata slot is empty, will calculate on the flight") if kmod =="atac": if "X_lsi" in tmp.mod[kmod].obsm.keys(): repuse = "X_lsi" else: repuse = "X_pca" - L.info("falling back on %s" %(repuse) ) + L.info("Falling back on %s" %(repuse) ) - L.info("calculating neighbours") + L.info("Computing neighbours") if repuse != "X_bbknn": run_neighbors_method_choice(tmp.mod[kmod], method=pkmod['method'], @@ -145,7 +145,7 @@ else: L.info("Using %s" %(dict_graph[kmod]["obsm"])) else: - L.info("could not find the desired obsm and the anndata slot is empty, will calculate on the flight") + L.warning("Could not find the desired obsm and the anndata slot is empty, will calculate on the flight") repuse ="X_pca" if kmod =="atac": if "X_lsi" in tmp.mod[kmod].obsm.keys(): @@ -154,8 +154,7 @@ repuse = "X_pca" L.info("falling back on %s" %(repuse) ) - L.info("calculating neighbours") - + L.info("Computing neighbours") run_neighbors_method_choice(tmp.mod[kmod], method=pkmod['method'], n_neighbors=int(pkmod['k']), @@ -165,11 +164,9 @@ use_rep=repuse, nthreads=max([threads_available, 6])) -L.debug(tmp) tmp.update() -L.debug(tmp) -L.info("Now running WNN") +L.info("Running WNN") mu.pp.neighbors(tmp, n_neighbors= int(args.n_neighbors), n_bandwidth_neighbors= int(args.n_bandwidth_neighbors), @@ -177,12 +174,14 @@ metric= args.metric, low_memory= check_for_bool(args.low_memory), key_added='wnn') -L.info("WNN finished now calculate umap") +L.info("Computing UMAP") mu.tl.umap(tmp,min_dist=0.4, neighbors_key='wnn') #For taking use of mdata.obsp['connectivities'], it’s scanpy.tl.leiden() that should be used. not muon.tl.leiden +L.info("Computing Leiden clustering") sc.tl.leiden(tmp, neighbors_key='wnn', key_added='leiden_wnn') +L.info("Saving UMAP coordinates to csv file '%s" % args.output_csv) umap = pd.DataFrame(tmp.obsm['X_umap'], tmp.obs.index) umap.to_csv(args.output_csv) @@ -192,6 +191,7 @@ for rmd in removed_mods: tmp.mod[rmd] = mdata.mod[rmd].copy() +L.info("Saving MuData to 'tmp/wnn_scaled_adata.h5mu'") tmp.write("tmp/wnn_scaled_adata.h5mu") L.info("Done")