add log info

SarahOuologuem · SarahOuologuem · commit e3de2cfb1c77 · 2024-04-16T11:16:06.000Z
diff --git a/panpipes/R_scripts/plotclustree.R b/panpipes/R_scripts/plotclustree.R
@@ -24,16 +24,14 @@ opt <- parse_args(OptionParser(option_list=option_list))
 # 2. name columns
 # 3. run clustree
 
-message("Running with options:")
-
-print(opt)
 
 # # run clustree
 m = readr::read_tsv(opt$infile)
 # this is a little dodge, but works ;) 
 example_column=colnames(m)[2]
 col_prefix=substr(example_column, 1, nchar(example_column)-3 )
 # run clustree
+print("Running Clustree")
 gg <- clustree(m, prefix =col_prefix) + ggtitle(opt$plot_title)
 
 
@@ -42,6 +40,7 @@ if (!(dir.exists(dirname(opt$outfile)))){
 }
 
 # save
+print("Saving Clustree")
 ggsave(gg, filename=opt$outfile, height=10,width=12, type="cairo")
 
-message("clustree done")
+print("Done")
diff --git a/panpipes/python_scripts/aggregate_csvs.py b/panpipes/python_scripts/aggregate_csvs.py
@@ -13,7 +13,7 @@
 formatter = logging.Formatter('%(asctime)s: %(levelname)s - %(message)s')
 log_handler.setFormatter(formatter)
 L.addHandler(log_handler)
-L.debug("test logging message")
+
 
 # parse arguments
 parser = argparse.ArgumentParser()
@@ -36,17 +36,20 @@
 
 infiles = re.split(',', args.input_files_str)
 if args.clusters_or_markers == "clusters":
+    L.info("Aggregating cluster columns")
     combined_csv = pd.concat([pd.read_csv(f, sep='\t', index_col=0) for f in infiles], axis=1)
     # get colnames
     cnames = []
     for f in infiles:
         alg = extract_parameter_from_fname(f, 'alg', prefix=args.sample_prefix)
         res = extract_parameter_from_fname(f, 'res', prefix=args.sample_prefix)
         cnames.append(alg + '_res_' + str(res))
+    L.info("Saving combined cluster columns to tsv file '%s'" % args.output_file)
     combined_csv.to_csv(args.output_file, sep='\t', header=cnames, index=True)
 
 
 if args.clusters_or_markers == "markers":
+    L.info("Aggregating marker files")
     li = []
     all_markers_file = re.sub("_top", "_all", args.output_file)
     excel_file = re.sub("_top.txt.gz", "_all.xlsx", args.output_file)
@@ -69,6 +72,7 @@
     frame.to_csv(all_markers_file, sep='\t', header=True, index=False)
     frame_sub = frame[frame['p.adj.bonferroni'] < 0.05]
     frame_sub = frame_sub[frame_sub['avg_logFC'] > 0]
+    L.info("Saving combined marker files to tsv file '%s'" % args.output_file)
     frame_sub.to_csv(args.output_file, sep='\t', header=True, index=False)
 
 
diff --git a/panpipes/python_scripts/collate_mdata.py b/panpipes/python_scripts/collate_mdata.py
@@ -31,10 +31,13 @@
                     default="mdata.h5mu",
                     help="file name, format: .h5mu")
 args, opt = parser.parse_known_args()
+
 L.info(args)
+
+L.info("Reading in MuData from '%s'" % args.input_mudata)
 mdata = mu.read(args.input_mudata)
 
-L.info("loading clusters")
+L.info("Reading in cluster information")
 cf = pd.read_csv(args.clusters_files_csv)
 
 if isinstance(mdata, MuData):
@@ -44,7 +47,7 @@
     if len(mds)>1:
         sys.exit("You have clustered multiple modalities but are providing only a unimodal anndata")
     else:
-        L.warn("found one modality, converting to mudata: %s " % mds[0] )    
+        L.warn("Found one modality, converting to mudata: %s " % mds[0] )    
         tmp = MuData({mds[0]:mdata})
         del mdata
         mdata = tmp
@@ -53,7 +56,7 @@
 # add in the clusters
 
 
-
+L.info("Adding cluster information to MuData")
 for i in range(cf.shape[0]):
     cf_df = pd.read_csv(cf['fpath'][i], sep='\t', index_col=0) 
     cf_df['clusters'] = cf_df['clusters'].astype('str').astype('category')
@@ -64,8 +67,9 @@
     else:
         mdata.obs = mdata.obs.merge(cf_df, left_index=True, right_index=True)
 
-uf = pd.read_csv(args.umap_files_csv)
 
+L.info("Adding UMAP coordinates to MuData")
+uf = pd.read_csv(args.umap_files_csv)
 
 for i in range(uf.shape[0]):
     uf_df = pd.read_csv(uf['fpath'][i], sep='\t', index_col=0) 
@@ -75,17 +79,22 @@
         if all(mdata[mod].obs_names == uf_df.index):
             mdata[mod].obsm[new_key] =  uf_df.to_numpy()
         else:
-            L.warn("cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] )
+            L.warn("Cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] )
     else:
         # check the observations are the same
         if set(mdata.obs_names).difference(uf_df.index) == set():
             # put the observations in the same order
             uf_df = uf_df.loc[mdata.obs_names,:]
             mdata.obsm[new_key] =  uf_df.to_numpy()
         else:
-            L.warning("cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] )
+            L.warning("Cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] )
+
 
+L.info("Saving updated MuData to '%s'" % args.output_mudata)
 mdata.write(args.output_mudata)
-mdata.obs.to_csv(re.sub(".h5mu", "_cell_metdata.tsv", args.output_mudata), sep='\t')
 
-L.info("done")
+output_csv = re.sub(".h5mu", "_cell_metdata.tsv", args.output_mudata)
+L.info("Saving metadata to '%s'" % output_csv)
+mdata.obs.to_csv(output_csv, sep='\t')
+
+L.info("Done")
diff --git a/panpipes/python_scripts/plot_cluster_umaps.py b/panpipes/python_scripts/plot_cluster_umaps.py
@@ -33,7 +33,6 @@
                     help="list of modalities to search for UMAPs in")
 args, opt = parser.parse_known_args()
 
-L.info("running with:")
 L.info(args)
 # args = argparse.Namespace(infile='mdata_clustered.h5mu', figdir=None)
 # ---------
@@ -42,11 +41,11 @@ def main(adata,figdir):
     # get all possible umap coords
     pattern="X_umap(.*)"
     obsm_keys = [x for x in adata.obsm.keys() if re.search(pattern, x)]
-    L.info("Umap keys founds %s" % obsm_keys)
-    # get all possible clustersclusters
+    L.info("UMAP keys found: %s" % obsm_keys)
+    # get all possible clusters
     pattern=re.compile(r'^leiden|^louvain') 
     cluster_keys  = [x for x in adata.obs.columns if re.search(pattern, x)]
-    L.info("Cluster keys founds %s" % cluster_keys)
+    L.info("Cluster keys found: %s" % cluster_keys)
     if len(obsm_keys) == 0  or len(cluster_keys) == 0:
         return
     
@@ -55,22 +54,24 @@ def main(adata,figdir):
         adata.obs[ck] = adata.obs[ck].astype('category')
     # plot all the umaps
     for ok in obsm_keys:
+        L.info("Plotting UMAP on %s coloured by %s" % (ok, cluster_keys))
         fig = sc.pl.embedding(adata, basis = ok,color=cluster_keys,
          show=False, return_fig=True, legend_loc='on data')
         for ax in fig.axes:
             ax.set(xlabel="UMAP_1", ylabel="UMAP_2")
         fig.suptitle(ok, y=1.0)
+        L.info("Saving figure to '%s'" % os.path.join(figdir, ok +  "_clusters.png"))
         fig.savefig(os.path.join(figdir, ok +  "_clusters.png"))
 
 def plot_spatial(adata,figdir):
     # get all possible umap coords
     pattern="spatial(.*)"
     obsm_keys = [x for x in adata.obsm.keys() if re.search(pattern, x)]
-    L.info("Umap keys founds %s" % obsm_keys)
+    L.info("UMAP keys found: %s" % obsm_keys)
     # get all possible clustersclusters
     pattern=re.compile(r'^leiden|^louvain') 
     cluster_keys  = [x for x in adata.obs.columns if re.search(pattern, x)]
-    L.info("Cluster keys founds %s" % cluster_keys)
+    L.info("Cluster keys found: %s" % cluster_keys)
     if len(obsm_keys) == 0  or len(cluster_keys) == 0:
         return
     
@@ -79,15 +80,18 @@ def plot_spatial(adata,figdir):
         adata.obs[ck] = adata.obs[ck].astype('category')
     # plot all the umaps
     for ok in obsm_keys:
+        L.info("Plotting UMAP on %s coloured by %s" % (ok, cluster_keys))
         fig = sc.pl.embedding(adata, basis = ok,color=cluster_keys,
          show=False, return_fig=True, legend_loc='on data')
         for ax in fig.axes:
             ax.set(xlabel="spatial1", ylabel="spatial2")
         fig.suptitle(ok, y=1.0)
+        L.info("Saving figure to '%s'" % os.path.join(figdir, ok +  "_clusters.png"))
         fig.savefig(os.path.join(figdir, ok +  "_clusters.png"))
 
 
-L.debug("load data")
+
+L.info("Reading in MuData from '%s'" % args.infile)
 mdata = read(args.infile)
 
 mods = args.modalities.split(',')
@@ -97,14 +101,15 @@ def plot_spatial(adata,figdir):
 if 'multimodal' in mods:
     if os.path.exists("multimodal/figures") is False:
         os.makedirs("multimodal/figures")
+    L.info("Plotting multimodal figures")
     main(mdata, figdir="multimodal/figures")
 
 
 # we also need to plot per modality
 if type(mdata) is MuData:
     for mod in mdata.mod.keys():
         if mod in mods:
-            L.info("plotting for modality: %s" % mod)
+            L.info("Plotting for modality: %s" % mod)
             figdir  = os.path.join(mod, "figures")
             if os.path.exists(figdir) is False:
                 os.makedirs(figdir)
@@ -115,6 +120,4 @@ def plot_spatial(adata,figdir):
 
 
 
-
-
-L.info('done')
+L.info('Done')
diff --git a/panpipes/python_scripts/plot_scanpy_markers.py b/panpipes/python_scripts/plot_scanpy_markers.py
@@ -71,15 +71,17 @@ def calc_dendrogram(adata, group_col):
 
 
 def do_plots(adata, mod, group_col, mf, n=10, layer=None):
-    L.debug("check layers")
     # get markers for plotting
+    L.info("Subsetting on markers with avg logFC > 0")
     mf = mf[mf['avg_logFC'] > 0]
+    L.info("Extracting top markers for each cluster")
+    mf['scores'] = pd.to_numeric(mf["scores"])
     df = mf.groupby(group_col).apply(lambda x: x.nlargest(n, ['scores'])).reset_index(drop=True)
     marker_list={str(k): list(v) for k,v in df.groupby(group_col)["gene"]}
     # add cluseter col to obs
     # check whether a dendrogram is computed/
     incl_dendrogram = calc_dendrogram(adata, group_col)
-    L.info("start plotting")
+    L.info("Plotting stacked violin")
     sc.pl.stacked_violin(adata,
                 marker_list,
                 groupby=group_col,
@@ -88,18 +90,21 @@ def do_plots(adata, mod, group_col, mf, n=10, layer=None):
                 dendrogram=incl_dendrogram,
                 # figsize=(24, 5)
                 )
+    L.info("Plotting matrix plot")
     sc.pl.matrixplot(adata,
                     marker_list,
                     groupby=group_col,
                     save=  '_top_markers'+ mod +'.png',
                     dendrogram=incl_dendrogram,
                     figsize=(24, 5))
+    L.info("Plotting dotplot")
     sc.pl.dotplot(adata,
                 marker_list,
                 groupby=group_col,
                 save=  '_top_markers'+ mod +'.png',
                 dendrogram=incl_dendrogram,
                 figsize=(24, 5))
+    L.info("Plotting heatmap")
     sc.pl.heatmap(adata,
                 marker_list,
                 groupby=group_col,
@@ -110,6 +115,7 @@ def do_plots(adata, mod, group_col, mf, n=10, layer=None):
 
 
 # read data
+L.info("Reading in MuData from '%s'" % args.infile)
 mdata = mu.read(args.infile)
 
 if type(mdata) is AnnData:
@@ -118,10 +124,10 @@ def do_plots(adata, mod, group_col, mf, n=10, layer=None):
 elif type(mdata) is mu.MuData and args.modality is not None:
     adata = mdata[args.modality]    
 else:
-    sys.exit('if inputting a mudata object, you need to specify a modality')
-    
+    L.error("If the input is a MuData object, a modality needs to be specified")
+    sys.exit('If the input is a MuData object, a modality needs to be specified')
 
-L.info("load marker file")
+L.info("Loading marker information from '%s'" % args.marker_file)
 mf = pd.read_csv(args.marker_file, sep='\t' )
 mf[args.group_col] = mf['cluster'].astype('category')
 
diff --git a/panpipes/python_scripts/rerun_find_neighbors_for_clustering.py b/panpipes/python_scripts/rerun_find_neighbors_for_clustering.py
@@ -43,41 +43,44 @@
 
 
 for mod in neighbor_dict.keys():
-    if neighbor_dict[mod]['use_existing']:
-        L.info('Using existing neighbors graph for %s' % mod)
-        pass
-    else:
-        L.info("Computing new neighbors for %s" % mod)
-        if type(mdata) is MuData:
-            adata=mdata[mod]
-        if (neighbor_dict[mod]['dim_red'] == "X_pca") and ("X_pca" not in adata.obsm.keys()):
-            L.info("X_pca not found, computing it using default parameters")
-            sc.tl.pca(adata)
-            if (mod == "atac") and (neighbor_dict[mod]['dim_remove'] is not None):
-                dimrem = int(neighbor_dict[mod]['dim_remove'])
-                adata.obsm['X_pca'] = adata.obsm['X_pca'][:, dimrem:]
-                adata.varm["PCs"] = adata.varm["PCs"][:, dimrem:]
-        if mod == "atac":
-            if (neighbor_dict[mod]['dim_red'] == "X_lsi") and ("X_lsi" not in adata.obsm.keys()):
-                L.info("X_lsi not found, computing it using default parameters")
-                lsi(adata=adata, num_components=50)
-                if neighbor_dict[mod]['dim_remove'] is not None:
+    if mod in mdata.mod.keys():
+        if neighbor_dict[mod]['use_existing']:
+            L.info('Using existing neighbors graph for %s' % mod)
+            pass
+        else:
+            L.info("Computing new neighbors for modality %s on %s" % (mod, neighbor_dict[mod]['dim_red']))
+            if type(mdata) is MuData:
+                adata=mdata[mod]
+            if (neighbor_dict[mod]['dim_red'] == "X_pca") and ("X_pca" not in adata.obsm.keys()):
+                L.info("X_pca not found, computing it using default parameters")
+                sc.tl.pca(adata)
+                if (mod == "atac") and (neighbor_dict[mod]['dim_remove'] is not None):
                     dimrem = int(neighbor_dict[mod]['dim_remove'])
-                    adata.obsm['X_lsi'] = adata.obsm['X_lsi'][:, dimrem:]
-                    adata.varm["LSI"] = adata.varm["LSI"][:, dimrem:]
-                    adata.uns["lsi"]["stdev"] = adata.uns["lsi"]["stdev"][dimrem:]
+                    adata.obsm['X_pca'] = adata.obsm['X_pca'][:, dimrem:]
+                    adata.varm["PCs"] = adata.varm["PCs"][:, dimrem:]
+            if mod == "atac":
+                if (neighbor_dict[mod]['dim_red'] == "X_lsi") and ("X_lsi" not in adata.obsm.keys()):
+                    L.info("X_lsi not found, computing it using default parameters")
+                    lsi(adata=adata, num_components=50)
+                    if neighbor_dict[mod]['dim_remove'] is not None:
+                        L.info("Removing dimension %s from X_lsi" % neighbor_dict[mod]['dim_remove'])
+                        dimrem = int(neighbor_dict[mod]['dim_remove'])
+                        adata.obsm['X_lsi'] = adata.obsm['X_lsi'][:, dimrem:]
+                        adata.varm["LSI"] = adata.varm["LSI"][:, dimrem:]
+                        adata.uns["lsi"]["stdev"] = adata.uns["lsi"]["stdev"][dimrem:]
 
-        # run command
-        opts = dict(method=neighbor_dict[mod]['method'],
-                    n_neighbors=int(neighbor_dict[mod]['k']),
-                    n_pcs=int(neighbor_dict[mod]['n_dim_red']),
-                    metric=neighbor_dict[mod]['metric'],
-                    nthreads=args.n_threads,
-                    use_rep=neighbor_dict[mod]['dim_red'])
+            # run command
+            opts = dict(method=neighbor_dict[mod]['method'],
+                        n_neighbors=int(neighbor_dict[mod]['k']),
+                        n_pcs=int(neighbor_dict[mod]['n_dim_red']),
+                        metric=neighbor_dict[mod]['metric'],
+                        nthreads=args.n_threads,
+                        use_rep=neighbor_dict[mod]['dim_red'])
 
 
-        run_neighbors_method_choice(adata,**opts)
-        mdata.update()
+            run_neighbors_method_choice(adata,**opts)
+            mdata.mod[mod] = adata
+            mdata.update()
 
 
 
diff --git a/panpipes/python_scripts/run_clustering.py b/panpipes/python_scripts/run_clustering.py
@@ -46,17 +46,17 @@
 # check sc.pp.neihgbours has been run
 if uns_key not in adata.uns.keys():
     # sys.exit("Error: sc.pp.neighbours has not been run on this object")
-    L.warning("Running neighbors with default parameters since no neighbors graph found in this data object")
+    L.warning("Running neighbors for modality %s with default parameters since no neighbors graph found in this data object" % args.modality)
     sc.pp.neighbors(adata)
     uns_key="neighbors"
 
 
 # run command
 if args.algorithm == "louvain":
-    L.info("Running Louvain clustering")
+    L.info("Running Louvain clustering for modality %s and resolution %s on %s", (args.modality, args.resolution, uns_key))
     sc.tl.louvain(adata, resolution=float(args.resolution), key_added='clusters', neighbors_key=uns_key)
 elif args.algorithm == "leiden":
-    L.info("Running Leiden clustering")
+    L.info("Running Leiden clustering for modality %s and resolution %s on %s", (args.modality, args.resolution, uns_key))
     sc.tl.leiden(adata, resolution=float(args.resolution), key_added='clusters', neighbors_key=uns_key)
 else:
     L.error("Could not find clustering algorithm '%s'. Please specify 'louvain' or 'leiden'" % args.algorithm)
diff --git a/panpipes/python_scripts/run_find_markers_multi.py b/panpipes/python_scripts/run_find_markers_multi.py
diff --git a/panpipes/python_scripts/run_umap.py b/panpipes/python_scripts/run_umap.py