From de6ae27a64418398325d95c00dc4f541841ad7f4 Mon Sep 17 00:00:00 2001
From: SarahOuologuem <sarah.ouologuem@gmx.de>
Date: Thu, 18 Apr 2024 12:30:58 +0000
Subject: [PATCH] add log info

---
 .../python_scripts/batch_correct_bbknn.py     |  4 +-
 .../python_scripts/batch_correct_harmony.py   | 31 +++++-----
 panpipes/python_scripts/batch_correct_mofa.py | 26 +++++----
 .../python_scripts/batch_correct_multivi.py   | 57 +++++++++---------
 .../python_scripts/batch_correct_scanorama.py | 34 +++++------
 panpipes/python_scripts/batch_correct_scvi.py | 44 +++++++-------
 .../python_scripts/batch_correct_totalvi.py   | 58 +++++++++++--------
 panpipes/python_scripts/batch_correct_wnn.py  | 38 ++++++------
 8 files changed, 152 insertions(+), 140 deletions(-)

diff --git a/panpipes/python_scripts/batch_correct_bbknn.py b/panpipes/python_scripts/batch_correct_bbknn.py
index a3b5f8ff..2588c73c 100644
--- a/panpipes/python_scripts/batch_correct_bbknn.py
+++ b/panpipes/python_scripts/batch_correct_bbknn.py
@@ -68,7 +68,7 @@
         sc.tl.pca(adata, n_comps=min(50,adata.var.shape[0]-1), svd_solver='arpack', random_state=0) 
 
 if "X_pca" not in adata.obsm:
-    L.info("X_pca could not be found in adata.obsm. Computing PCA with default parameters.")
+    L.warning("X_pca could not be found in adata.obsm. Computing PCA with default parameters.")
     n_pcs = 50
     if adata.var.shape[0] < n_pcs:
         L.info("You have less features than number of PCs you intend to calculate")
@@ -103,7 +103,7 @@
                         n_pcs = int(args.neighbors_n_pcs),
                         neighbors_within_batch=nnb)  # calculates the neighbours
 
-L.info("Calculating UMAP")
+L.info("Computing UMAP")
 sc.tl.umap(adata)
 
 # write out
diff --git a/panpipes/python_scripts/batch_correct_harmony.py b/panpipes/python_scripts/batch_correct_harmony.py
index 28f75388..f093f932 100644
--- a/panpipes/python_scripts/batch_correct_harmony.py
+++ b/panpipes/python_scripts/batch_correct_harmony.py
@@ -56,14 +56,12 @@
 
 args, opt = parser.parse_known_args()
 
-
-L.info("reading data and starting integration pipeline with script: ")
-L.info(os.path.basename(__file__))
-L.info("Running with options: %s", args)
+L.info(args)
 
 # this should be an object that contains the full log normalised data (adata_log1p.h5ad)
 # prior to hvgs and filtering
 #adata = read_anndata(args.input_anndata, use_muon=use_muon, modality=args.modality)
+L.info("Reading in MuData from '%s'" % args.input_anndata)
 mdata = mu.read(args.input_anndata)
 adata = mdata.mod[args.modality] 
 
@@ -76,14 +74,16 @@
     dimred = "X_lsi"
 
 if dimred not in adata.obsm:
-    L.info("i need a dimred to start, computing pca with default param")
+    L.warning("Dimred '%s' could not be found in adata.obsm. Computing PCA with default parameters." % dimred)
     dimred = "X_pca" 
     n_pcs = 50
     if adata.var.shape[0] < n_pcs:
         L.info("You have less features than number of PCs you intend to calculate")
         n_pcs = adata.var.shape[0] - 1
-        L.info("Setting n PCS to %i" % int(n_pcs))    
+        L.info("Setting n PCS to %i" % int(n_pcs)) 
+    L.info("Scaling data")   
     sc.pp.scale(adata)
+    L.info("Computing PCA")
     sc.tl.pca(adata, n_comps=n_pcs, 
                     svd_solver='arpack', 
                     random_state=0) 
@@ -91,14 +91,14 @@
 
 
 if len(columns)>1: 
-    L.info("using 2 columns to integrate on more variables")
+    L.info("Using 2 columns to integrate on more variables")
     #comb_columns = "_".join(columns)
     adata.obs["comb_columns"] = adata.obs[columns].apply(lambda x: '|'.join(x), axis=1)
 
     # make sure that batch is a categorical
     adata.obs["comb_columns"] = adata.obs["comb_columns"].astype("category")
     # run harmony
-
+    L.info("Running Harmony")
     ho = hm.run_harmony(adata.obsm[dimred][:,0:int(args.harmony_npcs)], adata.obs, ["comb_columns"], 
                                        sigma = float(args.sigma_val),theta = float(args.theta_val),verbose=True,max_iter_kmeans=30, 
                                        max_iter_harmony=40)
@@ -107,6 +107,7 @@
     # make sure that batch is a categorical
     adata.obs[args.integration_col] = adata.obs[args.integration_col].astype("category")
     # run harmony
+    L.info("Running Harmony")
     ho = hm.run_harmony(adata.obsm[dimred][:,0:int(args.harmony_npcs)],
                         adata.obs,
                         [args.integration_col],
@@ -116,11 +117,9 @@
                         max_iter_harmony=40)
 
 
-L.info("integration run now calculate umap")
-
+L.info("Saving harmony co-ords to .obsm['X_harmony']")
 adjusted_pcs = pd.DataFrame(ho.Z_corr).T
 adata.obsm['X_harmony']=adjusted_pcs.values
-L.info("harmony co-ords derived")
 
 if int(args.neighbors_n_pcs) >adata.obsm['X_harmony'].shape[1]:
     L.warn(f"N PCs is larger than X_harmony dimensions, reducing n PCs to  {adata.obsm['X_harmony'].shape[1] -1}")
@@ -128,6 +127,7 @@
 n_pcs= min(int(args.neighbors_n_pcs), adata.obsm['X_harmony'].shape[1]-1)
 
 # run neighbours and umap 
+L.info("Computing neighbors")
 run_neighbors_method_choice(adata, 
     method=args.neighbors_method, 
     n_neighbors=int(args.neighbors_k), 
@@ -137,10 +137,11 @@
     nthreads=max([threads_available, 6]))
 
 
-L.info("done n_neighbours, moving to umap")
+L.info("Computing UMAP")
 sc.tl.umap(adata)
-L.info("done umap, saving stuff")
+
 # write out
+L.info("Saving UMAP coordinates to csv file '%s" % args.output_csv)
 umap = pd.DataFrame(adata.obsm['X_umap'], adata.obs.index)
 umap.to_csv(args.output_csv)
 
@@ -149,11 +150,9 @@
 
 
 outfiletmp = ("tmp/harmony_scaled_adata_" + args.modality + ".h5ad" )
-
-L.info("saving harmony corrected adata")
+L.info("Saving AnnData to '%s'" % outfiletmp)
 write_anndata(adata, outfiletmp, use_muon=False, modality=args.modality)
 
-
 L.info("Done")
 
 
diff --git a/panpipes/python_scripts/batch_correct_mofa.py b/panpipes/python_scripts/batch_correct_mofa.py
index 6a3fff6d..8a4d2d62 100644
--- a/panpipes/python_scripts/batch_correct_mofa.py
+++ b/panpipes/python_scripts/batch_correct_mofa.py
@@ -68,30 +68,31 @@
 threads_available = multiprocessing.cpu_count()
 params = pp.io.read_yaml("pipeline.yml")
 
+L.info("Reading in MuData from '%s'" % args.scaled_anndata)
 mdata = mu.read(args.scaled_anndata)
 
 if params['multimodal']['mofa']['modalities'] is not None:
     modalities= params['multimodal']['mofa']['modalities']
     modalities = [x.strip() for x in modalities.split(",")]
-    L.info(f"using modalities :{modalities}")
+    L.info(f"Using modalities :{modalities}")
     removed_mods = None
     if all(x in modalities for x in mdata.mod.keys()):
         tmp = mdata.copy()
-        L.info('using all modalities')
+        L.info('Using all modalities')
     else:
         tmp = mdata.copy()
         removed_mods = list(set(mdata.mod.keys()) - set(modalities))
-        L.info(f"removing modalities {removed_mods}")
+        L.info(f"Removing modalities {removed_mods}")
         for rmod in removed_mods:
             del tmp.mod[rmod]
 else:
-    L.warning("""you specified no modalities, so i will default to all available
-                this may be a problem if you have repertoire in here""")
+    L.warning("""No modalities were specified, therefore all available modalities will be used.
+                This may be problematic if repertoire is present as modality.""")
     removed_mods = None
     tmp = mdata.copy()  
 
-print(tmp.mod.keys())
 
+L.info("Intersecting modality obs before running mofa")
 mu.pp.intersect_obs(tmp)
 
 mofa_kwargs={}
@@ -111,7 +112,6 @@
     del mofa_kwargs['filter_by_hvg']
     for mod in tmp.mod.keys():
         if "highly_variable" not in tmp[mod].var.columns:
-            print(mod)
             tmp[mod].var["highly_variable"] = True
         
     tmp.update()
@@ -148,20 +148,19 @@
 if args.use_gpu is True or args.use_gpu=='True':
      mofa_kwargs['gpu_mode'] = True
 
-L.info(mofa_kwargs)
-L.info(tmp.var.columns)
-
 
+L.info("Running mofa")
 mu.tl.mofa(tmp, **mofa_kwargs)
 #This adds X_mofa embeddings to the .obsm slot of the MuData object
 #write the discovered latent rep to the original mudata object
+L.info("Adding X_mofa to .obsm of MuData")
 mdata.obsm['X_mofa'] = tmp.obsm['X_mofa'].copy()   
 
 if int(args.neighbors_n_pcs) > mdata.obsm['X_mofa'].shape[1]:
     L.warning(f"N PCs is larger than X_mofa dimensions, reducing n PCs to  {mdata.obsm['X_mofa'].shape[1]-1}")
 n_pcs= min(int(args.neighbors_n_pcs), mdata.obsm['X_mofa'].shape[1]-1)
 
-
+L.info("Computing neighbors")
 run_neighbors_method_choice(mdata, 
     method=args.neighbors_method, 
     n_neighbors=int(args.neighbors_k), 
@@ -170,9 +169,12 @@
     use_rep='X_mofa',
     nthreads=max([threads_available, 6]))
 
+L.info("Computing UMAP")
 sc.tl.umap(mdata, min_dist=0.4)
+L.info("Computing Leiden clustering")
 sc.tl.leiden(mdata,  key_added="leiden_mofa")
 
+L.info("Saving UMAP coordinates to csv file '%s" % args.output_csv)
 umap = pd.DataFrame(mdata.obsm['X_umap'], mdata.obs.index)
 umap.to_csv(args.output_csv)
 
@@ -180,8 +182,8 @@
     for rmd in removed_mods:
         tmp.mod[rmd] = mdata.mod[rmd].copy()
 
+L.info("Saving MuData to 'tmp/mofa_scaled_adata.h5mu'")
 tmp.write("tmp/mofa_scaled_adata.h5mu")
 
-
 L.info("Done")
 
diff --git a/panpipes/python_scripts/batch_correct_multivi.py b/panpipes/python_scripts/batch_correct_multivi.py
index 93032afe..38e776c6 100644
--- a/panpipes/python_scripts/batch_correct_multivi.py
+++ b/panpipes/python_scripts/batch_correct_multivi.py
@@ -51,6 +51,7 @@
 
 
 args, opt = parser.parse_known_args()
+L.info(args)
 
 # scanpy settings
 sc.set_figure_params(facecolor="white")
@@ -69,8 +70,7 @@
     params['MultiVI']['training_args']['max_epochs'] = 10
 
 # ------------------------------------------------------------------
-L.info("Running multivi")
-
+L.info("Reading in MuData from '%s'" % args.scaled_anndata)
 mdata = mu.read(args.scaled_anndata)
 rna = mdata['rna'].copy()
 atac = mdata['atac'].copy()
@@ -79,9 +79,9 @@
 
 if check_for_bool(params["multimodal"]["MultiVI"]["lowmem"]):
     if 'hvg' in atac.uns.keys():
-        L.info("subsetting atac to top HVF")
+        L.info("Subsetting ATAC to HVFs")
         atac = atac[:, atac.var.highly_variable].copy()
-    L.info("calculating and subsetting atac to top 25k HVF")
+    L.info("Calculating and subsetting ATAC to top 25k HVF")
     sc.pp.highly_variable_genes(atac, n_top_genes=25000)
     atac = atac[:, atac.var.highly_variable].copy()
 
@@ -98,8 +98,8 @@
 if rna.shape[0] == atac.shape[0]:
     n=int(rna.shape[0])
 else:
-    sys.exit("rna and atac have different number of cells, \
-        can't deal with this in this version of MultiVI integration")
+    sys.exit("RNA and ATAC have different number of cells, \
+        Can't deal with this in this version of MultiVI integration")
 
 gc.collect()
 
@@ -108,24 +108,27 @@
 
 
 if "raw_counts" in rna.layers.keys():
-    L.info("raw counts found in layer for RNA")
+    L.info("Found raw RNA counts in .layers['raw_counts']")
 elif X_is_raw(rna):
     # this means the X layer is already raw and we can make the layer we need
-    L.info("raw counts found in X for RNA")
+    L.info("Found raw RNA counts in .X. Saving raw RNA counts to .layers['raw_counts']")
     rna.layers["raw_counts"] = rna.X.copy()
 else:
-    sys.exit("please provide a mu/anndata with raw counts for RNA")
+    L.error("Could not find raw counts for RNA in .X and .layers['raw_counts']")
+    sys.exit("Could not find raw counts for RNA in .X and .layers['raw_counts']")
+
 
 if "raw_counts" in atac.layers.keys():
-    L.info("raw counts found in layer for ATAC")
+     L.info("Found raw ATAC counts in .layers['raw_counts']")
 elif X_is_raw(atac):
     # this means the X layer is already raw and we can make the layer we need
-    L.info("raw counts found in X for ATAC")
+    L.info("Found raw ATAC counts in .X. Saving raw ATAC counts to .layers['raw_counts']")
     atac.layers["raw_counts"] = atac.X.copy()
 else:
-    sys.exit("please provide a mu/anndata with raw counts for ATAC")
+    L.error("Could not find raw counts for ATAC in .X and .layers['raw_counts']")
+    sys.exit("Could not find raw counts for ATAC in .X and .layers['raw_counts']")
 
-L.info("concatenating modalities to comply with multiVI")
+L.info("Concatenating modalities to comply with multiVI")
 # adata_paired = ad.concat([rna, atac], join="outer")
 # adata_paired.var = pd.concat([rna.var,atac.var])
 if rna.is_view:
@@ -134,10 +137,6 @@
 if atac.is_view:
     L.info("ATAC is view")
     atac = atac.copy()
-
-L.info(atac.is_view)
-
-
 adata_paired = ad.concat([rna.T, atac.T]).T
 
 rna_cols=rna.obs.columns
@@ -152,7 +151,6 @@
 if "modality" not in adata_paired.obs.columns:
     adata_paired.obs["modality"] = "paired" 
 
-print(adata_paired)
 
 # adata = ad.concat([rna,atac],join="outer")
 # adata.var = pd.concat([rna.var,atac.var])
@@ -160,6 +158,8 @@
 del [rna , atac ]
 gc.collect()
 
+
+L.info("Organizing multiome AnnDatas")
 adata_mvi = scvi.data.organize_multiome_anndatas(adata_paired)
 
 
@@ -170,10 +170,8 @@
     columns = []
     for cc in cols:
         if cc in rna_cols:
-            print("categorical batch covariate in rna")
             columns.append("rna:"+cc)
         elif cc in atac_cols:
-            print("categorical batch covariate in atac")
             columns.append("atac:"+cc)
 if args.integration_col_continuous is not None :
     if args.integration_col_continuous in rna_cols:
@@ -187,7 +185,7 @@
 if columns is not None:
     print(columns)
     if len(columns) > 1:
-        L.info("using 2 columns to integrate on more variables")
+        L.info("Using 2 columns to integrate on more variables")
         # bc_batch = "_".join(columns)
         adata_mvi.obs["bc_batch"] = adata_mvi.obs[columns].apply(lambda x: '|'.join(x), axis=1) 
         # make sure that batch is a categorical
@@ -204,10 +202,10 @@
     adata_mvi.obs['bc_batch_continuous'] = adata_mvi.obs[args.integration_col_continuous]
     kwargs['continuous_covariate_keys'] = ["bc_batch_continuous"]
 
-print(kwargs)
 
 # 1 setup anndata
 #scvi.model.MULTIVI.setup_anndata(adata_mvi, batch_key='modality', **kwargs)
+L.info("Setting up AnnData")
 scvi.model.MULTIVI.setup_anndata(
     adata_mvi, 
     batch_key='modality',
@@ -219,8 +217,7 @@
 else:
     multivi_model_args =  {k: v for k, v in params["multimodal"]['MultiVI']['model_args'].items() if v is not None}
 
-print(multivi_model_args)
-
+L.info("Defining model")
 mvi = scvi.model.MULTIVI(
     adata_mvi,
     n_genes=n_genes,
@@ -234,23 +231,21 @@
     multivi_training_args={}
 else:
     multivi_training_args =  {k: v for k, v in params["multimodal"]['MultiVI']['training_args'].items() if v is not None}
-L.info("multivi training args")
-print(multivi_training_args)
 
 if params["multimodal"]['MultiVI']['training_plan'] is None:
     multivi_training_plan = {}
 else:
     multivi_training_plan =  {k: v for k, v in params["multimodal"]['MultiVI']['training_plan'].items() if v is not None}
-L.info("multivi training plan")
-print(multivi_training_plan)
 
 mvi.view_anndata_setup()
 
+L.info("Running multiVI")
 mvi.train( **multivi_training_args, **multivi_training_plan)
 
 mvi.save(os.path.join("batch_correction", "multivi_model"), 
                   anndata=False)
 
+L.info("Plotting ELBO")
 plt.plot(mvi.history["elbo_train"], label="train")
 plt.plot(mvi.history["elbo_validation"], label="validation")
 plt.title("Negative ELBO over training epochs")
@@ -267,6 +262,7 @@
             """)
 
 mdata = mu.read(args.scaled_anndata)
+L.info("Extracting latent space and saving latent to X_MultiVI")
 mdata.obsm["X_MultiVI"] = mvi.get_latent_representation()
 #adata_mvi.obsm["X_MultiVI"] = mvi.get_latent_representation()
 
@@ -274,6 +270,7 @@
     L.warn(f"N PCs is larger than X_MultiVI dimensions, reducing n PCs to  {mdata.obsm['X_MultiVI'].shape[1] -1}")
 n_pcs= min(int(args.neighbors_n_pcs), mdata.obsm['X_MultiVI'].shape[1]-1)
 
+L.info("Computing neighbors")
 run_neighbors_method_choice(mdata, 
     method=args.neighbors_method, 
     n_neighbors=int(args.neighbors_k), 
@@ -281,12 +278,16 @@
     metric=args.neighbors_metric, 
     use_rep='X_MultiVI',
     nthreads=max([threads_available, 6]))
+L.info("Computing UMAP")
 sc.tl.umap(mdata, min_dist=0.4)
+L.info("Computing Leiden clustering")
 sc.tl.leiden(mdata, key_added="leiden_multiVI")
 
+L.info("Saving UMAP coordinates to csv file '%s" % args.output_csv)
 umap = pd.DataFrame(mdata.obsm['X_umap'], mdata.obs.index)
 umap.to_csv(args.output_csv)
 
+L.info("Saving MuData to 'tmp/multivi_scaled_adata.h5mu'")
 write_anndata(mdata, "tmp/multivi_scaled_adata.h5mu",use_muon=False)
 
 L.info("Done")
diff --git a/panpipes/python_scripts/batch_correct_scanorama.py b/panpipes/python_scripts/batch_correct_scanorama.py
index 26980ee9..ecc8d64c 100644
--- a/panpipes/python_scripts/batch_correct_scanorama.py
+++ b/panpipes/python_scripts/batch_correct_scanorama.py
@@ -57,16 +57,12 @@
 
 args, opt = parser.parse_known_args()
 
-
-
-L.info("Running with options: %s", args)
+L.info(args)
 
 # Scanorama is designed to be used in scRNA-seq pipelines downstream of noise-reduction methods,
 # including those for imputation and highly-variable gene filtering.
-L.info("reading data and starting integration pipeline with script: ")
-L.info(os.path.basename(__file__))
-
 #adata = read_anndata(args.input_anndata, use_muon=use_muon, modality="rna")
+L.info("Reading in MuData from '%s'" % args.input_anndata)
 mdata = mu.read(args.input_anndata)
 adata = mdata.mod[args.modality] 
 bcs = adata.obs_names.tolist()
@@ -74,7 +70,7 @@
 # scanorama can't integrate on 2+ variables, so create a fake column with combined information
 columns = [x.strip() for x in args.integration_col.split(",")]
 if len(columns) > 1:
-    L.info("using 2 columns to integrate on more variables")
+    L.info("Using 2 columns to integrate on more variables")
     # comb_columns = "_".join(columns)
     adata.obs["comb_columns"] = adata.obs[columns].apply(lambda x: '|'.join(x), axis=1)
 
@@ -86,22 +82,22 @@
         adata.obs['batch'] = adata.obs[args.integration_col]
 
 batches = adata.obs['batch'].unique()
-L.info("define scanorama inputs based on batches and number of PCs")
 # need contiguous batches
 scanorama_input = adata[adata.obs.sort_values(by="batch").index.tolist(), :]
 
 # filter by HVGs to make it equivalent to the old scripts,
 # which inputted the scaled object after filtering by hvgs.
+L.info("Filtering data by HVGs")
 scanorama_input = scanorama_input[:, scanorama_input.var.highly_variable]
 #also filter the X_PCA to be the number of PCs we actually want to use
+L.info("Subsetting X_pca to %s PCs" % args.neighbors_n_pcs)
 scanorama_input.obsm['X_pca'] = scanorama_input.obsm['X_pca'][:,0:int(args.neighbors_n_pcs)]
 
 
 # run scanoramam using the scanpy integrated approach
-L.info("run_scanorama")
+L.info("Running scanorama")
 
 sce.pp.scanorama_integrate(scanorama_input, key='batch', batch_size=int(args.batch_size))
-L.info("scanorama done")
 
 # not integrated
 
@@ -118,22 +114,22 @@
 scanorama_input = scanorama_input[bcs, :]
 # check it worked
 if all(scanorama_input.obs_names == bcs):
-    L.info("barcode order is correct")
+    L.info("Barcode order is correct")
 else:
-    L.debug("barcode order in  batch corrected object is incorrect")
-    sys.exit("barcode order in  batch corrected object is incorrect")
+    L.error("Barcode order in  batch corrected object is incorrect")
+    sys.exit("Barcode order in  batch corrected object is incorrect")
 
-L.info(adata)
+L.info("Saving scanorama embedding to X_scanorama")
 # add to the AnnData object
 adata.obsm["X_scanorama"] = scanorama_input.obsm["X_scanorama"]
 
-L.info("integration run now calculate umap")
 
 if int(args.neighbors_n_pcs) > adata.obsm['X_scanorama'].shape[1]:
     L.warn(f"N PCs is larger than X_scanorama dimensions, reducing n PCs to  {adata.obsm['X_scanorama'].shape[1] -1}")
 n_pcs= min(int(args.neighbors_n_pcs), adata.obsm['X_scanorama'].shape[1]-1)
 
 # run neighbours and umap 
+L.info("Computing neighbors")
 run_neighbors_method_choice(adata, 
     method=args.neighbors_method, 
     n_neighbors=int(args.neighbors_k), 
@@ -142,15 +138,17 @@
     use_rep='X_scanorama',
     nthreads=max([threads_available, 6]))
 
-
+L.info("Computing UMAP")
 sc.tl.umap(adata)
-L.info("done umap, saving stuff")
+
 # write out
 umap = pd.DataFrame(adata.obsm['X_umap'], adata.obs.index)
+L.info("Saving UMAP coordinates to csv file '%s" % args.output_csv)
 umap.to_csv(args.output_csv)
 
 # save the scanorama dim reduction in case scanorama is our favourite
+L.info("Saving AnnData to 'tmp/scanorama_scaled_adata.h5ad'")
 adata.write("tmp/scanorama_scaled_adata.h5ad")
 
-L.info("done")
+L.info("Done")
 
diff --git a/panpipes/python_scripts/batch_correct_scvi.py b/panpipes/python_scripts/batch_correct_scvi.py
index 6c6ac24b..f8b73383 100644
--- a/panpipes/python_scripts/batch_correct_scvi.py
+++ b/panpipes/python_scripts/batch_correct_scvi.py
@@ -53,8 +53,6 @@
 
 # load parameters
 params = read_yaml("pipeline.yml")
-L.info("Running scvi script")
-L.info("threads available: %s", threads_available)
 
 # scanpy settings
 sc.set_figure_params(facecolor="white")
@@ -72,9 +70,7 @@
 # load an process the scaled snRNAseq dataset --------------------------------------------------
 
 
-L.info("loading in anndata object ")
-
-
+L.info("Reading in Data from '%s'" % args.scaled_anndata)
 mdata = mu.read(args.scaled_anndata)
 if type(mdata) is mu.MuData:
     rna = mdata['rna']
@@ -83,8 +79,8 @@
     rna = mdata
     L.info(rna)
 else:
-    L.error("unknown file type")
-    sys.exit("file not MuData or Anndata")
+    L.error("Unknown file type. File not MuData or Anndata")
+    sys.exit("Unknown file type. File not MuData or Anndata")
 
 # this is a copy so we can subset vars and obs without changing the original object
 
@@ -92,7 +88,7 @@
 # in case of more than 1 variable, create a fake column with combined information
 columns = [x.strip() for x in args.integration_col.split(",")]
 if len(columns) > 1:
-    L.info("using 2 columns to integrate on more variables")
+    L.info("Using 2 columns to integrate on more variables")
     # bc_batch = "_".join(columns)
     rna.obs["bc_batch"] = rna.obs[columns].apply(lambda x: '|'.join(x), axis=1)
 
@@ -105,22 +101,26 @@
 # add in raw counts as a layer 
 # add in test to see if raw layer exists alread
 if "raw_counts" in rna.layers.keys():
-    L.info("raw counts found in layer")
+    L.info("Found raw counts in .layers['raw_counts']")
 elif X_is_raw(rna):
+    L.info("Found raw counts in .X. Saving raw counts to .layers['raw_counts']")
     rna.layers["raw_counts"] = rna.X.copy()
 else:
-    L.info("merge in raw counts")
+    L.info("Merging in raw counts from '%s" % args.raw_anndata)
     sc_raw = read_anndata(args.raw_anndata, use_muon=True, modality="rna")
     #filter by barcodes in the scaled object
     sc_raw = sc_raw[sc_raw.obs_names.isin(rna.obs_names),: ]
+    L.info("Saving raw counts to .layers['raw_counts]")
     rna.layers["raw_counts"] = sc_raw.X.copy()
 
 
 # filter out mitochondria
 if params['rna']['scvi']['exclude_mt_genes']:
+    L.info("Filtering out mitochondrial genes")
     rna = rna[:, ~rna.var[params['rna']['scvi']['mt_column']]]
 
 # filter by Hvgs
+L.info("Filtering by HVGs")
 rna = rna[:, rna.var.highly_variable]
 
 
@@ -131,31 +131,28 @@
 
 rna = rna.copy()
 
-# scvi.model.data.setup_anndata(
+
+L.info("Setting up AnnData")
 scvi.model.SCVI.setup_anndata(
     rna,
     layer="raw_counts",
     batch_key='bc_batch'
 )
 
-
 scvi_model_args =  {k: v for k, v in params['rna']['scvi']['model_args'].items() if v is not None}
-print(scvi_model_args)
 scvi_training_args =  {k: v for k, v in params['rna']['scvi']['training_args'].items() if v is not None}
-print(scvi_training_args)
 scvi_training_plan =  {k: v for k, v in params['rna']['scvi']['training_plan'].items() if v is not None}
-print(scvi_training_plan)
-
-
 
+L.info("Defining model")
 vae = scvi.model.SCVI(rna, **scvi_model_args) 
-
+L.info("Running scVI")
 vae.train(**scvi_training_args, plan_kwargs=scvi_training_plan) 
 
 vae.save(os.path.join("batch_correction", "scvi_model"), 
                   anndata=False)
 
 # no early stopping?
+L.info("Plotting ELBO")
 vae.history["elbo_train"].plot()
 plt.savefig(os.path.join(args.figdir, "scvi_elbo_train.png"))
 
@@ -168,12 +165,13 @@
 plt.savefig(os.path.join(args.figdir, "scvi_metrics.png"))
 
 # vae.history['elbo_train']['elbo_train'].to_list()
-
+L.info("Extracting latent space")
 latent = vae.get_latent_representation()
-
+L.info("Saving latent to X_scVI")
 rna.obsm["X_scVI"] = latent
 rna.obs['bc_batch'] = rna.obs['bc_batch']
 
+L.info("Plotting latent")
 sc.pl.embedding(rna, "X_scVI", color="bc_batch", save="_batch.png")
 
 plot_df = pd.DataFrame(latent[:, 0:2])
@@ -184,9 +182,9 @@
 
 n_pcs= min(int(args.neighbors_n_pcs), rna.obsm['X_scVI'].shape[1]-1)
 
-print(rna.obsm['X_scVI'].shape)
 # use scVI latent space for UMAP generation
 # sc.pp.neighbors(rna,n_neighbors=int(args.n_neighbors), use_rep="X_scVI")
+L.info("Computing neighbors")
 run_neighbors_method_choice(rna, 
     method=args.neighbors_method, 
     n_neighbors=int(args.neighbors_k), 
@@ -195,18 +193,22 @@
     use_rep='X_scVI',
     nthreads=max([threads_available, 6]))
 
+L.info("Computing UMAP")
 sc.tl.umap(rna)
 
+L.info("Plotting UMAP")
 sc.pl.umap(
     rna,
     color=["bc_batch",],
     frameon=False, save="_scvi_batch.png"
 )
 
+L.info("Saving UMAP coordinates to csv file '%s" % args.output_csv)
 umap = pd.DataFrame(rna.obsm['X_umap'], rna.obs.index)
 umap.to_csv(args.output_csv)
 
 # save anndata to be used by other scvi tools applications
+L.info("Saving AnnData to 'tmp/scvi_scaled_adata_rna.h5ad'")
 rna.write(os.path.join("tmp", "scvi_scaled_adata_rna.h5ad"))
 
 L.info("Done")
diff --git a/panpipes/python_scripts/batch_correct_totalvi.py b/panpipes/python_scripts/batch_correct_totalvi.py
index b7214484..d3fd34d7 100644
--- a/panpipes/python_scripts/batch_correct_totalvi.py
+++ b/panpipes/python_scripts/batch_correct_totalvi.py
@@ -70,11 +70,11 @@
     params['multimodal']['totalvi']['training_args']['max_epochs'] = 10
 
 # ------------------------------------------------------------------
-L.info("Running totalvi script")
-
+L.info("Reading in MuData from '%s'" % args.scaled_anndata)
 mdata = mu.read(args.scaled_anndata)
 
 # we need to intersect the prot and rna modalities.
+L.info("Intersecting prot and rna modalities")
 intersect_obs_by_mod(mdata, ['rna', 'prot'])
 
 # this is a copy so we can subset vars and obs without changing the original object
@@ -87,7 +87,7 @@
     columns = [x.strip() for x in args.integration_col_categorical.split(",")]
     columns =[ x.replace("rna:","") for x in columns]
     if len(columns) > 1:
-        L.info("using 2 columns to integrate on more variables")
+        L.info("Using 2 columns to integrate on more variables")
         # bc_batch = "_".join(columns)
         rna.obs["bc_batch"] = rna.obs[columns].apply(lambda x: '|'.join(x), axis=1)
         # make sure that batch is a categorical
@@ -103,24 +103,25 @@
         rna.obs['bc_batch_continuous'] = rna.obs[args.integration_col_continuous]
         kwargs["continuous_covariate_keys"] = ["bc_batch_continuous"]
 
-L.debug(kwargs)
 
 # add in raw counts as a layer 
 # try to find or load the raw counts
 if "counts" in rna.layers.keys():
-    L.info("raw counts found in layer")
+    L.info("Found raw RNA counts in .layers['counts']")
 elif "raw_counts" in rna.layers.keys():
+    L.info("Found raw RNA counts in .layers['raw_counts']")
+    L.info("Saving raw RNA counts to .layers['counts']")
     rna.layers["counts"] = rna.layers["raw_counts"].copy() 
-    L.info("raw counts found in layer")
 elif X_is_raw(rna):
     # this means the X layer is already raw and we can make the layer we need
-    L.info("raw counts found in X")
+    L.info("Found raw RNA counts in .X. Saving raw RNA counts to .layers['counts']")
     rna.layers["counts"] = rna.X.copy()
 else:
-    L.info("merge in raw counts")
+    L.info("Merging in raw RNA counts from '%s" % args.raw_anndata)
     sc_raw = read_anndata(args.raw_anndata, use_muon=True, modality="rna")
     #filter by barcodes in the scaled object
     sc_raw = sc_raw[sc_raw.obs_names.isin(rna.obs_names),: ]
+    L.info("Saving raw RNA counts to .layers['counts]")
     rna.layers["counts"] = sc_raw.X.copy()
 
 # filter prot outliers 
@@ -128,23 +129,25 @@
     # for this to work the user needs to (manually) make a column called prot_outliers
     # actually there is a thing in the qc pipe that calculates outliers, I don't like it very much though
     if "prot_outliers" in mdata['prot'].columns:
+        L.info("Filtering out prot outliers")
         mu.pp.filter_obs(mdata, "prot_outliers")
     else:
-        raise ValueError("prot_outliers column not found in mdata['prot'].obs")
+        raise ValueError("'prot_outliers' column not found in mdata['prot'].obs")
 
 # exluding isotypes
 if 'isotype' in prot.var.columns:
+    L.info("Excluding isotypes")
     prot = prot[:, ~prot.var.isotype]
 
 # filter out mitochondria
 if params['multimodal']['totalvi']['exclude_mt_genes']:
-    L.info("excluding mt genes")
+    L.info("Filtering out mitochondrial genes")
     rna = rna[:, ~rna.var[params['multimodal']['totalvi']['mt_column']]]
     
 # filter by Hvgs
 
 if params['multimodal']['totalvi']['filter_by_hvg']:
-    L.info("filtering by highly_variable")
+    L.info("Filtering by HVGs")
     rna = rna[:, rna.var.highly_variable]
 
 mdata.update()
@@ -152,24 +155,27 @@
 #make protein obsm pandas
 # need to find the raw counts in prot
 if X_is_raw(prot):
+    L.info("Found raw prot counts in .X")
     X_array = prot.X.copy()
 elif "raw_counts" in prot.layers.keys(): 
+    L.info("Found raw prot counts in .layers['raw_counts']")
     X_array = prot.layers['raw_counts'].copy()
 else:
-    raise AttributeError("raw counts not found in prot, \
-                        store in either X or in 'raw_counts' layer")
+    raise AttributeError("Raw counts not found in prot, \
+                        Store in either .X or in 'raw_counts' layer")
 
 
 X_df = pd.DataFrame(X_array.todense(), index=prot.obs_names, columns=prot.var_names)
 
 if X_df.shape[0] == rna.X.shape[0]:
-    L.info("adding protein_expression to obsm")
+    L.info("Adding protein_expression to .obsm['protein_expression']")
     #check the obs are in the correct order
     X_df = X_df.loc[rna.obs_names,:]
     X_df = X_df.astype('int')
     rna.obsm['protein_expression'] = X_df 
 else:
-    L.error("dimensions do not match, cannot integrate protein")
+    L.error("Dimension 0 of protein expression matrix and RNA expression matrix do not match")
+    sys.exit("Dimension 0 of protein expression matrix and RNA expression matrix do not match")
 
 # clear up to preserve ram
 del X_array, X_df
@@ -177,7 +183,7 @@
 
 mdata.update()
 
-L.debug(rna.obs.columns)
+L.info("Setting up AnnData")
 scvi.model.TOTALVI.setup_anndata(
     rna,
     layer="counts",
@@ -191,31 +197,28 @@
 else:
     totalvi_model_args =  {k: v for k, v in params['multimodal']['totalvi']['model_args'].items() if v is not None}
 
-print(totalvi_model_args)
 
 if params['multimodal']['totalvi']['training_args'] is None:
     totalvi_training_args = {}
 else:
     totalvi_training_args =  {k: v for k, v in params['multimodal']['totalvi']['training_args'].items() if v is not None}
 
-print(totalvi_training_args)
 
 if params['multimodal']['totalvi']['training_plan'] is None:
     totalvi_training_plan = {}
 else:
     totalvi_training_plan =  {k: v for k, v in params['multimodal']['totalvi']['training_plan'].items() if v is not None}
 
-print(totalvi_training_plan)
-
-
+L.info("Defining model")
 vae = scvi.model.TOTALVI(rna, **totalvi_model_args)
-
+L.info("Running totalVI")
 vae.train(**totalvi_training_args, plan_kwargs=totalvi_training_plan)
 
 vae.save(os.path.join("batch_correction", "totalvi_model"), 
                   anndata=False, overwrite=True )
 
 
+L.info("Plotting ELBO")
 plt.plot(vae.history["elbo_train"], label="train")
 plt.plot(vae.history["elbo_validation"], label="validation")
 plt.title("Negative ELBO over training epochs")
@@ -234,10 +237,11 @@
         Mudata obsm slot, and any single modality processing in
         its own modality slot
             """)
+
+L.info("Extracting latent space and saving latent to X_totalVI")
 mdata.obsm["X_totalVI"] = vae.get_latent_representation()
 
 if batch_categories is not None:
-    L.debug(batch_categories)
     if type(batch_categories) is not list:
         batch_categories = [batch_categories]
     normX, protein = vae.get_normalized_expression(
@@ -245,7 +249,7 @@
         return_mean=True,
         transform_batch=batch_categories
     )
-
+    L.info("Saving denoised data to .obsm['totalvi_denoised_rna'] and .obsm['totalvi_denoised_protein']")
     # this is stored in obsm, because if we have filtered by hvg, it no longer fits specifications for a layer
     mdata['rna'].obsm["totalvi_denoised_rna"] = normX
     # reorder if rna and prot are in different orders
@@ -256,6 +260,7 @@
         return_mean=True,
         transform_batch=batch_categories
     )
+    L.info("Saving protein foreground prob to .obsm['totalvi_protein_foreground_prob']")
     mdata['prot'].obsm["totalvi_protein_foreground_prob"] = df.loc[mdata['prot'].obs_names,:]
 
 mdata.update()
@@ -268,6 +273,7 @@
 
 # neighbors function has been made to magically calculate mudata neighbors using sc.pp.neighbors
 # because we want to use_rep = X_totalvi (as a pseudo single modality) not run multimodal mu.pp.neighbors
+L.info("Computing neighbors")
 run_neighbors_method_choice(mdata, 
     method=args.neighbors_method, 
     n_neighbors=int(args.neighbors_k), 
@@ -276,12 +282,16 @@
     use_rep='X_totalVI',
     nthreads=max([threads_available, 6]))
 
+L.info("Computing UMAP")
 sc.tl.umap(mdata, min_dist=0.4)
+L.info("Computing Leiden clustering")
 sc.tl.leiden(mdata, key_added="leiden_totalVI")
 
+L.info("Saving UMAP coordinates to csv file '%s" % args.output_csv)
 umap = pd.DataFrame(mdata.obsm['X_umap'], mdata['rna'].obs.index)
 umap.to_csv(args.output_csv)
 
+L.info("Saving MuData to 'tmp/totalvi_scaled_adata.h5mu'")
 mdata.write("tmp/totalvi_scaled_adata.h5mu")
 
 L.info("Done")
diff --git a/panpipes/python_scripts/batch_correct_wnn.py b/panpipes/python_scripts/batch_correct_wnn.py
index ccee3892..3032ec75 100644
--- a/panpipes/python_scripts/batch_correct_wnn.py
+++ b/panpipes/python_scripts/batch_correct_wnn.py
@@ -56,14 +56,15 @@
 if params['multimodal']['WNN']['modalities'] is not None:
     modalities= params['multimodal']['WNN']['modalities']
     modalities = [x.strip() for x in modalities.split(",")]
-    L.info(f"using modalities :{modalities}")
+    L.info(f"Using modalities :{modalities}")
 
-L.info("running with batch corrections:")
+L.info("Running with batch corrections:")
 wnn_params_bc = params['multimodal']['WNN']['batch_corrected'] 
 if modalities is not None:
     wnn_params_bc= {k: wnn_params_bc[k] for k in wnn_params_bc.keys() & modalities}
 L.info( wnn_params_bc )
 
+L.info("Reading in MuData from '%s'" % args.scaled_anndata)
 mdata = mu.read(args.scaled_anndata)
 
 if all(x in modalities for x in mdata.mod.keys()):
@@ -72,11 +73,11 @@
 else:
     tmp = mdata.copy()
     removed_mods = list(set(mdata.mod.keys()) - set(modalities))
-    L.info(f"removing modalities {removed_mods}")
+    L.info(f"Removing modalities {removed_mods}")
     for rmod in removed_mods:
         del tmp.mod[rmod]
 
-L.info("intersecting modality obs before running wnn")
+L.info("Intersecting modality obs before running WNN")
 mu.pp.intersect_obs(tmp)
 
 
@@ -90,7 +91,6 @@
     else: 
         dict_graph[x]["obsm"] = None
 
-L.debug(dict_graph)
 
 if dict_graph["rna"]["obsm"] == "X_scvi":
     dict_graph["rna"]["obsm"] = "X_scVI"
@@ -100,13 +100,13 @@
     pkmod=params['multimodal']['WNN']['knn'][kmod]
     if dict_graph[kmod]["obsm"] is not None:
         if dict_graph[kmod]["obsm"] not in tmp.mod[kmod].obsm.keys():
-            L.info("provided mdata doesn't have the desired obsm, just checking if it's bbknn you want.")
+            L.info("Provided mdata doesn't have the desired obsm, just checking if it's bbknn you want.")
             if dict_graph[kmod]["obsm"] == "X_bbknn":
                 if len(tmp.mod[kmod].obsp.keys()) > 0 and "neighbors" in tmp.mod[kmod].uns.keys() : 
-                     L.info("i found a populated obsp slot and I assume it's bbknn")
+                     L.info("Populated obsp slot found. Assuming it's bbknn")
                 else:
                     if dict_graph[kmod]["anndata"] is not None:
-                        L.info("reading precomputed connectivities for bbknn")
+                        L.info("Reading precomputed connectivities for bbknn")
                         adata = mu.read(dict_graph[kmod]["anndata"])
                         tmp.mod[kmod].obsp = adata.obsp.copy()
                         tmp.mod[kmod].obsm[dict_graph[kmod]["obsm"]] = adata.obsm[dict_graph[kmod]["obsm"]].copy()
@@ -114,7 +114,7 @@
                         tmp.update()
             else:
                 if dict_graph[kmod]["anndata"] is not None:
-                    L.info("provided mdata doesn't have the desired obsm. reading the batch corrected data from another stored object")
+                    L.info("Provided mdata doesn't have the desired obsm. reading the batch corrected data from another stored object")
                     adata = mu.read(dict_graph[kmod]["anndata"])
                     L.debug(kmod + "object")
                     L.debug(adata)
@@ -124,15 +124,15 @@
                     tmp.update()
                     repuse = dict_graph[kmod]["obsm"] 
                 else:
-                    L.info("could not find the desired obsm and the anndata slot is empty, will calculate on the flight")
+                    L.warning("Could not find the desired obsm and the anndata slot is empty, will calculate on the flight")
                     if kmod =="atac":
                         if "X_lsi" in tmp.mod[kmod].obsm.keys():
                             repuse = "X_lsi"
                         else:
                             repuse = "X_pca"
-                        L.info("falling back on %s" %(repuse) )
+                        L.info("Falling back on %s" %(repuse) )
             
-                    L.info("calculating neighbours")
+                    L.info("Computing neighbours")
                     if repuse != "X_bbknn":
                         run_neighbors_method_choice(tmp.mod[kmod], 
                             method=pkmod['method'], 
@@ -145,7 +145,7 @@
         else:
             L.info("Using %s" %(dict_graph[kmod]["obsm"]))            
     else:
-        L.info("could not find the desired obsm and the anndata slot is empty, will calculate on the flight")
+        L.warning("Could not find the desired obsm and the anndata slot is empty, will calculate on the flight")
         repuse ="X_pca"
         if kmod =="atac":
             if "X_lsi" in tmp.mod[kmod].obsm.keys():
@@ -154,8 +154,7 @@
                 repuse = "X_pca"
             L.info("falling back on %s" %(repuse) )
 
-        L.info("calculating neighbours")
-        
+        L.info("Computing neighbours")
         run_neighbors_method_choice(tmp.mod[kmod], 
             method=pkmod['method'], 
             n_neighbors=int(pkmod['k']), 
@@ -165,11 +164,9 @@
             use_rep=repuse,
             nthreads=max([threads_available, 6]))
 
-L.debug(tmp)
 tmp.update()
-L.debug(tmp)
-L.info("Now running WNN")
 
+L.info("Running WNN")
 mu.pp.neighbors(tmp, 
                 n_neighbors= int(args.n_neighbors),
                 n_bandwidth_neighbors= int(args.n_bandwidth_neighbors),
@@ -177,12 +174,14 @@
                 metric= args.metric,
                 low_memory= check_for_bool(args.low_memory),   
                 key_added='wnn')
-L.info("WNN finished now calculate umap")
 
+L.info("Computing UMAP")
 mu.tl.umap(tmp,min_dist=0.4, neighbors_key='wnn')
 #For taking use of mdata.obsp['connectivities'], it’s scanpy.tl.leiden() that should be used. not muon.tl.leiden
+L.info("Computing Leiden clustering")
 sc.tl.leiden(tmp, neighbors_key='wnn', key_added='leiden_wnn') 
 
+L.info("Saving UMAP coordinates to csv file '%s" % args.output_csv)
 umap = pd.DataFrame(tmp.obsm['X_umap'], tmp.obs.index)
 umap.to_csv(args.output_csv)
 
@@ -192,6 +191,7 @@
     for rmd in removed_mods:
         tmp.mod[rmd] = mdata.mod[rmd].copy()
 
+L.info("Saving MuData to 'tmp/wnn_scaled_adata.h5mu'")
 tmp.write("tmp/wnn_scaled_adata.h5mu")
 
 L.info("Done")