From 921126e292e4bc91adb97d8aa7f2fd5f5535bcf4 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Fri, 4 Oct 2024 10:47:13 +0200 Subject: [PATCH 01/57] create spatialdata visium --- .../make_mudataspatial_from_csv.py | 75 ++++++++++++------- 1 file changed, 47 insertions(+), 28 deletions(-) diff --git a/panpipes/python_scripts/make_mudataspatial_from_csv.py b/panpipes/python_scripts/make_mudataspatial_from_csv.py index 06453411..afad522c 100644 --- a/panpipes/python_scripts/make_mudataspatial_from_csv.py +++ b/panpipes/python_scripts/make_mudataspatial_from_csv.py @@ -8,6 +8,7 @@ import warnings from muon._atac.tools import add_peak_annotation, locate_fragments import squidpy as sq +import spatialdata_io as sd_io from mudata import MuData import os """ @@ -52,6 +53,15 @@ parser.add_argument('--spatial_counts', default=None, help='') +parser.add_argument('--scalefactors_file', + default=None, + help='') +parser.add_argument('--fullres_image_file', + default=None, + help='') +parser.add_argument('--tissue_positions_file', + default=None, + help='') parser.add_argument('--spatial_metadata', default=None, help='') @@ -64,21 +74,24 @@ L.info("Running with params: %s", args) # unimodal mu (check if all the modalities) -if isinstance(args.mode_dictionary, dict): - mode_dictionary = args.mode_dictionary -else: - mode_dictionary = read_yaml(args.mode_dictionary) +#if isinstance(args.mode_dictionary, dict): +# mode_dictionary = args.mode_dictionary +#else: +# mode_dictionary = read_yaml(args.mode_dictionary) #{'spatialT': True} -permf = [key for key, value in mode_dictionary.items() if value == True] +#permf = [key for key, value in mode_dictionary.items() if value == True] all_files = { - "spatial":[args.spatial_infile, #path, mandatory for squidpy + "spatial":[args.spatial_infile, #path args.spatial_filetype, #needed for the load_adata_in function to call one of vizgen,visium args.spatial_counts, #name of the counts file, mandatory for squidpy - args.spatial_metadata, #name of the metadata file, mandatory for squidpy - args.spatial_transformation]} + args.fullres_image_file, # visium + args.tissue_positions_file, #visium + args.scalefactors_file]} # visium +# args.spatial_metadata, #name of the metadata file, mandatory for squidpy +# args.spatial_transformation]} #subset to the modalities we want from permf (in this case only spatial) -all_files = {nm: x for (nm, x) in all_files.items() if nm in permf} +#all_files = {nm: x for (nm, x) in all_files.items() if nm in permf} #[check_filetype(x[0], x[1]) for x in all_files.values()] # read the spatial data with one of the functions inside @@ -125,16 +138,22 @@ def check_dir_transform(infile_path, transform_file): adata.uns["spatial"][str(args.sample_id)]["scalefactors"]["transformation_matrix"].columns = adata.uns["spatial"][str(args.sample_id)]["scalefactors"]["transformation_matrix"].columns.astype(str) elif args.spatial_filetype =="visium": L.info("Reading in Visium data with squidpy.read.visium() into AnnData from directory " + args.spatial_infile) - adata = sq.read.visium(path = args.spatial_infile, #path, mandatory for squidpy - counts_file=args.spatial_counts, #name of the counts file, mandatory for squidpy - library_id = str(args.sample_id) - ) #this also has kwargs for read_10x_h5 but keep simple + sdata = sd_io.visium(path=args.spatial_infile, + dataset_id=str(args.sample_id), + counts_file=args.spatial_counts, + fullres_image_file=args.fullres_image_file, + tissue_positions_file=args.tissue_positions_file, + 
scalefactors_file=args.scalefactors_file) + #adata = sq.read.visium(path = args.spatial_infile, #path, mandatory for squidpy + # counts_file=args.spatial_counts, #name of the counts file, mandatory for squidpy + # library_id = str(args.sample_id) + # ) #this also has kwargs for read_10x_h5 but keep simple -L.info("Resulting AnnData is:") -L.info(adata) -L.info("Creating MuData with .mod['spatial']") +L.info("Resulting SpatialData is:") +L.info(sdata) +#L.info("Creating MuData with .mod['spatial']") -mdata = MuData({"spatial": adata}) +#mdata = MuData({"spatial": adata}) #--------------- @@ -143,25 +162,25 @@ def check_dir_transform(infile_path, transform_file): L.info("Making var names unique") #make var names unique -for mm in mdata.mod.keys(): - mdata[mm].var_names_make_unique() +#for mm in mdata.mod.keys(): +sdata["table"].var_names_make_unique() L.info("Adding sample_id '%s'to MuData.obs and MuData.mod['spatial'].obs" % args.sample_id) -mdata.obs['sample_id'] = str(args.sample_id) +sdata["table"].obs['sample_id'] = str(args.sample_id) # copy the sample_id to each modality -for mm in mdata.mod.keys(): +#for mm in mdata.mod.keys(): # mdata[mm].obs['sample_id'] = mdata.obs['sample_id'] - mdata[mm].obs['sample_id'] = mdata.obs.loc[mdata[mm].obs_names,:]['sample_id'] +sdata["table"].obs['sample_id'] = sdata["table"].obs.loc[sdata["table"].obs_names,:]['sample_id'] -mdata.update() +#mdata.update() -L.info("Resulting MuData is:") -L.info(mdata) +L.info("Resulting SpatialData is:") +L.info(sdata) -L.info("Saving MuData to '%s'" % args.output_file) -L.debug(mdata) -mdata.write(args.output_file) +L.info("Saving SpatialData to '%s'" % args.output_file) +L.debug(sdata) +sdata.write(args.output_file) L.info("Done") From 43e0c1e23823fa336705459613821e505ed74c84 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Fri, 4 Oct 2024 10:53:25 +0200 Subject: [PATCH 02/57] create spatialdata vizgen --- .../python_scripts/make_mudataspatial_from_csv.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/panpipes/python_scripts/make_mudataspatial_from_csv.py b/panpipes/python_scripts/make_mudataspatial_from_csv.py index afad522c..95c06ff0 100644 --- a/panpipes/python_scripts/make_mudataspatial_from_csv.py +++ b/panpipes/python_scripts/make_mudataspatial_from_csv.py @@ -130,12 +130,13 @@ def check_dir_transform(infile_path, transform_file): if args.spatial_filetype=="vizgen": L.info("Reading in Vizgen data with squidpy.read.vizgen() into AnnData from directory " + args.spatial_infile) - adata = sq.read.vizgen(path = args.spatial_infile, #path, mandatory for squidpy - counts_file=args.spatial_counts, #name of the counts file, mandatory for squidpy - meta_file = args.spatial_metadata, #name of the metadata file, mandatory for squidpy - transformation_file=args.spatial_transformation, - library_id = str(args.sample_id)) #this also has kwargs for read_10x_h5 but keep simple - adata.uns["spatial"][str(args.sample_id)]["scalefactors"]["transformation_matrix"].columns = adata.uns["spatial"][str(args.sample_id)]["scalefactors"]["transformation_matrix"].columns.astype(str) + sdata = sd_io.merscope(path = args.spatial_infile) +# adata = sq.read.vizgen(path = args.spatial_infile, #path, mandatory for squidpy +# counts_file=args.spatial_counts, #name of the counts file, mandatory for squidpy +# meta_file = args.spatial_metadata, #name of the metadata file, mandatory for squidpy +# transformation_file=args.spatial_transformation, +# library_id = str(args.sample_id)) #this also has kwargs for 
read_10x_h5 but keep simple +# adata.uns["spatial"][str(args.sample_id)]["scalefactors"]["transformation_matrix"].columns = adata.uns["spatial"][str(args.sample_id)]["scalefactors"]["transformation_matrix"].columns.astype(str) elif args.spatial_filetype =="visium": L.info("Reading in Visium data with squidpy.read.visium() into AnnData from directory " + args.spatial_infile) sdata = sd_io.visium(path=args.spatial_infile, From c1e823a471e06375d24f9a364fb3c8768d6e576b Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 8 Oct 2024 09:52:45 +0200 Subject: [PATCH 03/57] rename python file --- panpipes/panpipes/pipeline_qc_spatial.py | 2 +- ...e_mudataspatial_from_csv.py => make_spatialData_from_csv.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename panpipes/python_scripts/{make_mudataspatial_from_csv.py => make_spatialData_from_csv.py} (100%) diff --git a/panpipes/panpipes/pipeline_qc_spatial.py b/panpipes/panpipes/pipeline_qc_spatial.py index 538cf67c..f0ceac69 100644 --- a/panpipes/panpipes/pipeline_qc_spatial.py +++ b/panpipes/panpipes/pipeline_qc_spatial.py @@ -98,7 +98,7 @@ def load_mudatas(spatial_path, outfile, assays[outfile] = spatial_filetype cmd = """ - python %(py_path)s/make_mudataspatial_from_csv.py + python %(py_path)s/make_spatialData_from_csv.py --mode_dictionary "%(modality_dict)s" --sample_id %(sample_id)s --output_file %(outfile)s diff --git a/panpipes/python_scripts/make_mudataspatial_from_csv.py b/panpipes/python_scripts/make_spatialData_from_csv.py similarity index 100% rename from panpipes/python_scripts/make_mudataspatial_from_csv.py rename to panpipes/python_scripts/make_spatialData_from_csv.py From e3d6f2e5ff512d64e01005b999aabf880457e358 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 8 Oct 2024 10:27:26 +0200 Subject: [PATCH 04/57] adjust spatial loading for spatialData --- panpipes/funcs/io.py | 53 ++++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/panpipes/funcs/io.py b/panpipes/funcs/io.py index 77d74d2b..96b4dca2 100644 --- a/panpipes/funcs/io.py +++ b/panpipes/funcs/io.py @@ -159,33 +159,52 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): if caf['spatial_filetype'][nn]=="vizgen": spatial_filetype = caf['spatial_filetype'][nn] #path, counts and metadata are mandatory - if pd.notna(caf["spatial_counts"][nn]): - spatial_counts= caf["spatial_counts"][nn] - else: - spatial_counts = None - if pd.notna(caf["spatial_metadata"][nn]): - spatial_metadata = caf["spatial_metadata"][nn] - else: - spatial_metadata = None + #if pd.notna(caf["spatial_counts"][nn]): + # spatial_counts= caf["spatial_counts"][nn] + #else: + # spatial_counts = None + #if pd.notna(caf["spatial_metadata"][nn]): + # spatial_metadata = caf["spatial_metadata"][nn] + #else: + # spatial_metadata = None #transformation is optional - if pd.notna(caf["spatial_transformation"][nn]): - spatial_transformation = caf["spatial_transformation"][nn] - else: - spatial_transformation = None + #if pd.notna(caf["spatial_transformation"][nn]): + # spatial_transformation = caf["spatial_transformation"][nn] + #else: + # spatial_transformation = None elif caf['spatial_filetype'][nn]=="visium": - spatial_metadata= None - spatial_transformation = None + #spatial_metadata= None + #spatial_transformation = None spatial_filetype = caf['spatial_filetype'][nn] + #counts file if pd.notna(caf["spatial_counts"][nn]): spatial_counts= caf["spatial_counts"][nn] else: spatial_counts = None + # fullres image + if 
pd.notna(caf["spatial_fullres_image_file"][nn]): + spatial_fullres_image_file= caf["spatial_fullres_image_file"][nn] + else: + spatial_fullres_image_file = None + # tissue position + if pd.notna(caf["spatial_tissue_positions_file"][nn]): + spatial_tissue_positions_file= caf["spatial_tissue_positions_file"][nn] + else: + spatial_tissue_positions_file = None + # scalefactor + if pd.notna(caf["spatial_scalefactors_file"][nn]): + spatial_scalefactors_file= caf["spatial_scalefactors_file"][nn] + else: + spatial_scalefactors_file = None else: spatial_path= None spatial_filetype = None spatial_counts = None - spatial_metadata = None - spatial_transformation = None + spatial_fullres_image_file = None + spatial_tissue_positions_file = None + spatial_scalefactors_file = None + #spatial_metadata = None + #spatial_transformation = None if 'barcode_mtd_path' in caf.columns: cell_mtd_path = caf['barcode_mtd_path'][nn] #not yielding this right now! @@ -199,7 +218,7 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): outfile = outfile + ".h5mu" sample_id = caf['sample_id'][nn] yield spatial_path, outfile, \ - sample_id, spatial_filetype, spatial_counts, spatial_metadata, spatial_transformation + sample_id, spatial_filetype, spatial_counts, spatial_fullres_image_file, spatial_tissue_positions_file, spatial_scalefactors_file #spatial_metadata, spatial_transformation def read_anndata( From 07844b5d1b37f3d413e6200e7c109a1bf95a3602 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 8 Oct 2024 10:28:03 +0200 Subject: [PATCH 05/57] adjust spatial loading for spatialData --- panpipes/panpipes/pipeline_qc_spatial.py | 35 +++++++++++++----------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/panpipes/panpipes/pipeline_qc_spatial.py b/panpipes/panpipes/pipeline_qc_spatial.py index f0ceac69..c84ade7a 100644 --- a/panpipes/panpipes/pipeline_qc_spatial.py +++ b/panpipes/panpipes/pipeline_qc_spatial.py @@ -73,12 +73,9 @@ def gen_load_spatial_anndata_jobs(): @follows(mkdir("logs")) @follows(mkdir("tmp")) @files(gen_load_spatial_anndata_jobs) -def load_mudatas(spatial_path, outfile, - sample_id, - spatial_filetype, - spatial_counts, - spatial_metadata, - spatial_transformation): +def load_mudatas(spatial_path, outfile, + sample_id, spatial_filetype, spatial_counts, + spatial_fullres_image_file, spatial_tissue_positions_file, spatial_scalefactors_file): path_dict = {'spatial':spatial_path} @@ -86,12 +83,17 @@ def load_mudatas(spatial_path, outfile, print('sample_id = %s' % str(sample_id)) print('outfile = %s' % str(outfile)) print('spatial_filetype = %s' % str(spatial_filetype)) - print('spatial_counts = %s' % str(spatial_counts)) - if spatial_filetype == "vizgen": - print('spatial_metadata = %s' % str(spatial_metadata)) - print('spatial_transformation = %s' % str(spatial_transformation)) - else: - print("visium") + #print('spatial_counts = %s' % str(spatial_counts)) + #if spatial_filetype == "vizgen": + # print('spatial_metadata = %s' % str(spatial_metadata)) + # print('spatial_transformation = %s' % str(spatial_transformation)) + #else: + # print("visium") + if spatial_filetype == "visium": + print('spatial_counts = %s' % str(spatial_counts)) + print('spatial_fullres_image_file= %s' % str(spatial_fullres_image_file)) + print('spatial_tissue_positions_file= %s' % str(spatial_tissue_positions_file)) + print('spatial_scalefactors_file= %s' % str(spatial_scalefactors_file)) modality_dict = {k:True if path_dict[k] is not None else False for k,v in {'spatial': True}.items() } 
print(modality_dict) @@ -104,12 +106,13 @@ def load_mudatas(spatial_path, outfile, --output_file %(outfile)s --spatial_filetype %(spatial_filetype)s --spatial_infile %(spatial_path)s - --spatial_counts %(spatial_counts)s """ - if spatial_filetype == "vizgen": + if spatial_filetype == "visium": cmd += """ - --spatial_metadata %(spatial_metadata)s - --spatial_transformation %(spatial_transformation)s + --spatial_counts %(spatial_counts)s + --scalefactors_file %(spatial_scalefactors_file)s + --fullres_image_file %(spatial_fullres_image_file)s + --tissue_positions_file %(spatial_tissue_positions_file)s """ cmd += " > logs/1_make_mudatas_%(sample_id)s.log" job_kwargs["job_threads"] = PARAMS['resources_threads_medium'] From 2c92e9a4eeb4e4f50632a278b1048531913e8e8a Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 8 Oct 2024 10:28:22 +0200 Subject: [PATCH 06/57] remove squidpy merfish parameters --- panpipes/python_scripts/make_spatialData_from_csv.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/panpipes/python_scripts/make_spatialData_from_csv.py b/panpipes/python_scripts/make_spatialData_from_csv.py index 95c06ff0..81050809 100644 --- a/panpipes/python_scripts/make_spatialData_from_csv.py +++ b/panpipes/python_scripts/make_spatialData_from_csv.py @@ -62,12 +62,12 @@ parser.add_argument('--tissue_positions_file', default=None, help='') -parser.add_argument('--spatial_metadata', - default=None, - help='') -parser.add_argument('--spatial_transformation', - default=None, - help='') +#parser.add_argument('--spatial_metadata', +# default=None, +# help='') +#parser.add_argument('--spatial_transformation', +# default=None, +# help='') parser.set_defaults(verbose=True) args, opt = parser.parse_known_args() From d198b1150fa53c909d93fb65e14b0d025b09946c Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 8 Oct 2024 11:41:44 +0200 Subject: [PATCH 07/57] add parameter=None for vizgen --- panpipes/funcs/io.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/panpipes/funcs/io.py b/panpipes/funcs/io.py index 96b4dca2..f2edb8ef 100644 --- a/panpipes/funcs/io.py +++ b/panpipes/funcs/io.py @@ -157,24 +157,12 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): else: spatial_path = caf["spatial_path"][nn] if caf['spatial_filetype'][nn]=="vizgen": + spatial_counts = None + spatial_fullres_image_file = None + spatial_tissue_positions_file = None + spatial_scalefactors_file = None spatial_filetype = caf['spatial_filetype'][nn] - #path, counts and metadata are mandatory - #if pd.notna(caf["spatial_counts"][nn]): - # spatial_counts= caf["spatial_counts"][nn] - #else: - # spatial_counts = None - #if pd.notna(caf["spatial_metadata"][nn]): - # spatial_metadata = caf["spatial_metadata"][nn] - #else: - # spatial_metadata = None - #transformation is optional - #if pd.notna(caf["spatial_transformation"][nn]): - # spatial_transformation = caf["spatial_transformation"][nn] - #else: - # spatial_transformation = None elif caf['spatial_filetype'][nn]=="visium": - #spatial_metadata= None - #spatial_transformation = None spatial_filetype = caf['spatial_filetype'][nn] #counts file if pd.notna(caf["spatial_counts"][nn]): @@ -203,8 +191,6 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): spatial_fullres_image_file = None spatial_tissue_positions_file = None spatial_scalefactors_file = None - #spatial_metadata = None - #spatial_transformation = None if 'barcode_mtd_path' in caf.columns: cell_mtd_path = 
caf['barcode_mtd_path'][nn] #not yielding this right now! @@ -217,8 +203,9 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): else: outfile = outfile + ".h5mu" sample_id = caf['sample_id'][nn] + yield spatial_path, outfile, \ - sample_id, spatial_filetype, spatial_counts, spatial_fullres_image_file, spatial_tissue_positions_file, spatial_scalefactors_file #spatial_metadata, spatial_transformation + sample_id, spatial_filetype, spatial_counts, spatial_fullres_image_file, spatial_tissue_positions_file, spatial_scalefactors_file def read_anndata( From 2b4c01bf0043f2f49ab8e7ee76e8ec79294f5fc6 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 8 Oct 2024 11:47:46 +0200 Subject: [PATCH 08/57] change h5mu to zarr --- panpipes/funcs/io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/panpipes/funcs/io.py b/panpipes/funcs/io.py index f2edb8ef..c6ff2c5c 100644 --- a/panpipes/funcs/io.py +++ b/panpipes/funcs/io.py @@ -199,11 +199,11 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): # create the output file outfile = "./tmp/" + caf['sample_id'][nn] if load_raw: - outfile = outfile + "_raw.h5mu" + outfile = outfile + "_raw.zarr" else: - outfile = outfile + ".h5mu" + outfile = outfile + ".zarr" sample_id = caf['sample_id'][nn] - + yield spatial_path, outfile, \ sample_id, spatial_filetype, spatial_counts, spatial_fullres_image_file, spatial_tissue_positions_file, spatial_scalefactors_file From 459d07fc32640e6dfcdb958e3a1aacf9bb9fa6d2 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 15 Oct 2024 10:51:59 +0200 Subject: [PATCH 09/57] change from mudata to spatialdata --- panpipes/python_scripts/plot_qc_spatial.py | 43 ++++++++++--------- .../python_scripts/run_scanpyQC_spatial.py | 36 ++++++++-------- 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/panpipes/python_scripts/plot_qc_spatial.py b/panpipes/python_scripts/plot_qc_spatial.py index 6434b3b4..558219c0 100644 --- a/panpipes/python_scripts/plot_qc_spatial.py +++ b/panpipes/python_scripts/plot_qc_spatial.py @@ -13,6 +13,7 @@ import sys import logging import re +import spatialdata as sd L = logging.getLogger() L.setLevel(logging.INFO) log_handler = logging.StreamHandler(sys.stdout) @@ -57,15 +58,16 @@ sc.settings.figdir = figdir sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5)) -L.info("Reading in MuData from '%s'" % args.input_mudata) -mdata = mu.read(args.input_mudata) -spatial = mdata.mod['spatial'] +L.info("Reading in SpatialData from '%s'" % args.input_mudata) +sdata = sd.read_zarr(args.input_mudata) +#mdata = mu.read(args.input_mudata) +#spatial = mdata.mod['spatial'] input_data = os.path.basename(args.input_mudata) -pattern = r"_filtered.h5(.*)" +pattern = r"_filtered.zarr" match = re.search(pattern, input_data) if match is None: - match = re.search(r"_unfilt.h5(.*)", input_data) + match = re.search(r"_unfilt.zarr", input_data) sprefix = input_data[:match.start()] # convert string to list of strings @@ -74,15 +76,16 @@ # check if metrics in adata.obs or adata.var -qc_metrics = [metric if metric in spatial.obs.columns or metric in spatial.var.columns else L.warning("Variable '%s' not found in adata.var or adata.obs, will not be plotted" % metric) for metric in qc_metrics] +qc_metrics = [metric if metric in + sdata["table"].obs.columns or metric in sdata["table"].var.columns else L.warning("Variable '%s' not found in adata.var or adata.obs, will not be plotted" % metric) for metric in qc_metrics] qc_metrics = 
[metric for metric in qc_metrics if metric is not None] # check that group_vars are in adata.obs -group_var = [group if group in spatial.obs.columns else L.warning("group_var '%s' not found in adata.obs, will be ignored" % group) for group in group_var] +group_var = [group if group in sdata["table"].obs.columns else L.warning("group_var '%s' not found in adata.obs, will be ignored" % group) for group in group_var] group_var = [group for group in group_var if group is not None] # make sure that it's saved as categorical for group in group_var: - spatial.obs[group] = spatial.obs[group].astype("category") + sdata["table"].obs[group] = sdata["table"].obs[group].astype("category") if group_var == []: group_var = None @@ -93,34 +96,34 @@ for metric in qc_metrics: # check if in adata.obs: - if metric in spatial.obs.columns: + if metric in sdata["table"].obs.columns: # check that it's a numeric column, so that it can be plotted: - if metric not in spatial.obs._get_numeric_data().columns: + if metric not in sdata["table"].obs._get_numeric_data().columns: L.warning("Variable '%s' not numerical in adata.obs, will not be plotted" % metric) else: L.info("Creating violin plot for '%s' of .obs" % metric) if group_var is None: - sc.pl.violin(spatial, keys = metric, xlabel = metric+ " in .obs", + sc.pl.violin(sdata["table"], keys = metric, xlabel = metric+ " in .obs", save = "_obs_" + metric+ "_" + "."+sprefix + ".png", show = False) else: #plot violin for each group for group in group_var: - sc.pl.violin(spatial, keys = metric,groupby = group, xlabel = group + ", "+ metric+ " in .obs", + sc.pl.violin(sdata["table"], keys = metric,groupby = group, xlabel = group + ", "+ metric+ " in .obs", save = "_obs_" + metric+ "_" + group+ "."+sprefix +".png", show = False) #plot spatial L.info("Creating spatial embedding plot for '%s' of .obs" % metric) - sc.pl.embedding(spatial,basis="spatial", color = metric, save = "_spatial_" + metric + "."+sprefix +".png", show = False) + sc.pl.embedding(sdata["table"],basis="spatial", color = metric, save = "_spatial_" + metric + "."+sprefix +".png", show = False) #check if in adata.var: - if metric in spatial.var.columns: + if metric in sdata["table"].var.columns: - if metric not in spatial.var._get_numeric_data().columns: + if metric not in sdata["table"].var._get_numeric_data().columns: L.warning("Variable '%s' not numerical in adata.var, will not be plotted" % metric) else: # plot violins L.info("Creating violin plot for '%s' of .var" % metric) ax = sns.violinplot( - data=spatial.var[[metric]], + data=sdata["table"].var[[metric]], orient='vertical', ) ax.set(xlabel=metric+ " in .var" ) @@ -135,28 +138,28 @@ axs[0].set_title("Total transcripts per cell") sns.histplot( - spatial.obs["total_counts"], + sdata["table"].obs["total_counts"], kde=False, ax=axs[0], ) axs[1].set_title("Unique transcripts per cell") sns.histplot( - spatial.obs["n_genes_by_counts"], + sdata["table"].obs["n_genes_by_counts"], kde=False, ax=axs[1], ) axs[2].set_title("Transcripts per FOV") sns.histplot( - spatial.obs.groupby('fov')[['total_counts']].sum(), + sdata["table"].obs.groupby('fov')[['total_counts']].sum(), kde=False, ax=axs[2], ) axs[3].set_title("Volume of segmented cells") sns.histplot( - spatial.obs["volume"], + sdata["table"].obs["volume"], kde=False, ax=axs[3], ) diff --git a/panpipes/python_scripts/run_scanpyQC_spatial.py b/panpipes/python_scripts/run_scanpyQC_spatial.py index 3e3059c6..595d79da 100644 --- a/panpipes/python_scripts/run_scanpyQC_spatial.py +++ 
b/panpipes/python_scripts/run_scanpyQC_spatial.py @@ -18,6 +18,7 @@ import argparse import scanpy as sc import muon as mu +import spatialdata as sd from panpipes.funcs.io import write_obs @@ -64,14 +65,15 @@ sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5)) -L.info("Reading in MuData from '%s'" % args.input_anndata) +L.info("Reading in SpatialData from '%s'" % args.input_anndata) -mdata = mu.read(args.input_anndata) -spatial = mdata['spatial'] +#mdata = mu.read(args.input_anndata) +sdata = sd.read_zarr(args.input_anndata) +#spatial = mdata['spatial'] L.info("Spatial data is:") -print(spatial) -L.info("With sample id '%s'" % spatial.obs["sample_id"].unique()[0]) +print(sdata) +L.info("With sample id '%s'" % sdata["table"].obs["sample_id"].unique()[0]) qc_vars = [] @@ -94,7 +96,7 @@ for kk in calc_proportions: xname= kk gene_list = cat_dic[kk] - spatial.var[xname] = [x in gene_list for x in spatial.var_names] + sdata["table"].var[xname] = [x in gene_list for x in sdata["table"].var_names] qc_vars.append(xname) # Score genes @@ -105,7 +107,7 @@ L.info("Computing gene scores for '%s'" % kk) xname= kk gene_list = cat_dic[kk] - sc.tl.score_genes(spatial, gene_list , + sc.tl.score_genes(sdata["table"], gene_list , ctrl_size=min(len(gene_list), 50), gene_pool=None, n_bins=25, @@ -127,11 +129,11 @@ qc_info = " and calculating proportions for '%s'" % qc_vars L.info("Calculating QC metrics with scanpy.pp.calculate_qc_metrics()" + qc_info) percent_top = [50, 100, 200, 500] #default -percent_top = [x for x in percent_top if x <= spatial.n_vars] -sc.pp.calculate_qc_metrics(spatial, qc_vars=qc_vars, percent_top=percent_top, inplace=True) +percent_top = [x for x in percent_top if x <= sdata["table"].n_vars] +sc.pp.calculate_qc_metrics(sdata["table"], qc_vars=qc_vars, percent_top=percent_top, inplace=True) -if (args.spatial_filetype == "vizgen") and ("blank_genes" in spatial.obsm): - spatial.obsm["blank_genes"].to_numpy().sum() / spatial.var["total_counts"].sum() * 100 +if (args.spatial_filetype == "vizgen") and ("blank_genes" in sdata["table"].obsm): + sdata["table"].obsm["blank_genes"].to_numpy().sum() / sdata["table"].var["total_counts"].sum() * 100 # Calculate cc scores if args.ccgenes is not None: @@ -144,7 +146,7 @@ sgenes = ccgenes[ccgenes["cc_phase"] == "s"]["gene_name"].tolist() g2mgenes = ccgenes[ccgenes["cc_phase"] == "g2m"]["gene_name"].tolist() L.info("Calculating cell cycle scores") - sc.tl.score_genes_cell_cycle(spatial, s_genes=sgenes, g2m_genes=g2mgenes) + sc.tl.score_genes_cell_cycle(sdata["table"], s_genes=sgenes, g2m_genes=g2mgenes) else: L.error("The path of the cell cycle genes tsv file '%s' could not be found" % args.ccgenes) sys.exit("The path of the cell cycle genes tsv file '%s' could not be found" % args.ccgenes) @@ -153,15 +155,15 @@ #TODO: we now need to update the mdata object to pick the calc proportion outputs made on # spatial = mdata['spatial'] -mdata.update() +#mdata.update() single_id = os.path.basename(str(args.input_anndata)) single_id = single_id.replace("_raw.h5mu","") -L.info("Saving updated obs in a metadata tsv file to ./" + single_id + "_cell_metadata.tsv") -write_obs(mdata, output_prefix=single_id, output_suffix="_cell_metadata.tsv") -L.info("Saving updated MuData to '%s'" % args.outfile) -mdata.write(args.outfile) +#L.info("Saving updated obs in a metadata tsv file to ./" + single_id + "_cell_metadata.tsv") +#write_obs(mdata, output_prefix=single_id, output_suffix="_cell_metadata.tsv") +L.info("Saving updated SpatialData 
to '%s'" % args.outfile) +sdata.write(args.outfile) L.info("Done") From c3fd8d52ec4eb008c983ea9dc4cf3432eb38c2f4 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 15 Oct 2024 11:29:26 +0200 Subject: [PATCH 10/57] write obs spatialdata --- panpipes/python_scripts/run_scanpyQC_spatial.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/panpipes/python_scripts/run_scanpyQC_spatial.py b/panpipes/python_scripts/run_scanpyQC_spatial.py index 595d79da..57c35343 100644 --- a/panpipes/python_scripts/run_scanpyQC_spatial.py +++ b/panpipes/python_scripts/run_scanpyQC_spatial.py @@ -160,8 +160,8 @@ single_id = os.path.basename(str(args.input_anndata)) single_id = single_id.replace("_raw.h5mu","") -#L.info("Saving updated obs in a metadata tsv file to ./" + single_id + "_cell_metadata.tsv") -#write_obs(mdata, output_prefix=single_id, output_suffix="_cell_metadata.tsv") +L.info("Saving updated obs in a metadata tsv file to ./" + single_id + "_cell_metadata.tsv") +write_obs(sdata["table"], output_prefix=single_id, output_suffix="_cell_metadata.tsv") L.info("Saving updated SpatialData to '%s'" % args.outfile) sdata.write(args.outfile) From b8281663e282c0f762b5d15d638ef84fc02f402b Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 22 Oct 2024 11:59:59 +0200 Subject: [PATCH 11/57] adjust filtering to spatialdata --- panpipes/python_scripts/run_filter_spatial.py | 115 ++++++++++++------ 1 file changed, 79 insertions(+), 36 deletions(-) diff --git a/panpipes/python_scripts/run_filter_spatial.py b/panpipes/python_scripts/run_filter_spatial.py index 980f7070..c0734ba3 100644 --- a/panpipes/python_scripts/run_filter_spatial.py +++ b/panpipes/python_scripts/run_filter_spatial.py @@ -4,6 +4,7 @@ import re import muon as mu from anndata import AnnData +import spatialdata as sd import os # import scpipelines.funcs as scp from panpipes.funcs.processing import intersect_obs_by_mod, remove_unused_categories @@ -74,33 +75,73 @@ def test_matching_df_ignore_cat(new_df, old_df): # load mudata -L.info("Reading in MuData from '%s'" % args.input_mudata) +L.info("Reading in SpatialData from '%s'" % args.input_mudata) +sdata = sd.read_zarr(args.input_anndata) +#mdata = mu.read(args.input_mudata) -mdata = mu.read(args.input_mudata) +#if isinstance(mdata, AnnData): +# raise TypeError("Input '%s' should be of MuData format, not Anndata" % args.input_mudata) -if isinstance(mdata, AnnData): - raise TypeError("Input '%s' should be of MuData format, not Anndata" % args.input_mudata) +orig_obs = sdata["table"].obs.copy() -orig_obs = mdata.obs.copy() - -L.info("Before filtering: "+ str(mdata.n_obs) + " cells and " + str(mdata.n_vars) + " features") +L.info("Before filtering: "+ str(sdata["table"].n_obs) + " cells and " + str(sdata["table"].n_vars) + " features") # filter based on provided barcodes ----- if args.keep_barcodes is not None: - L.info("Filtering MuData by keep_barcodes file") + L.info("Filtering SpatialData by keep_barcodes file") keep_bc = pd.read_csv(args.keep_barcodes,header=None) - mdata = mdata[mdata.obs_names.isin(keep_bc[0]),:].copy() - remove_unused_categories(mdata.obs) - mdata.update() - L.info("Remaining cells: %d" % mdata.n_obs) + sdata["table"] = sdata["table"][sdata["table"].obs_names.isin(keep_bc[0]),:].copy() + remove_unused_categories(sdata["table"].obs) + #mdata.update() + L.info("Remaining cells: %d" % sdata["table"].n_obs) +# filter more than +if filter_dict['run']: + for marg in filter_dict["spatial"].keys(): + if marg == "obs": + if "max" in 
filter_dict["spatial"][marg].keys(): + for col, n in filter_dict["spatial"][marg]['max'].items(): + L.info("Filtering cells of modality '%s' by '%s' in .obs to less than %s" % ("spatial", col, n)) + mu.pp.filter_obs(sdata["table"], col, lambda x: x <= n) + L.info("Remaining cells: %d" % sdata["table"].n_obs) + if "min" in filter_dict["spatial"][marg].keys(): + for col, n in filter_dict["spatial"][marg]['min'].items(): + L.info("Filtering cells of modality '%s' by '%s' in .obs to more than %s" % ("spatial", col, n)) + mu.pp.filter_obs(sdata["table"], col, lambda x: x >= n) + L.info("Remaining cells: %d" % sdata["table"].n_obs) + if "bool" in filter_dict["spatial"][marg].keys(): + for col, n in filter_dict["spatial"][marg]['bool'].items(): + L.info("Filtering cells of modality '%s' by '%s' in .obs marked %s" % ("spatial", col, n)) + mu.pp.filter_obs(sdata["table"], col, lambda x: x == n) + L.info("Remaining cells: %d" % sdata["table"].n_obs) + if marg == "var": + if "max" in filter_dict["spatial"][marg].keys(): + for col, n in filter_dict["spatial"][marg]['max'].items(): + L.info("Filtering features of modality '%s' by '%s' in .var to less than %s" % ("spatial", col, n)) + mu.pp.filter_var(sdata["table"], col, lambda x: x <= n) + L.info("Remaining features: %d" % sdata["table"].n_vars) + + if "min" in filter_dict["spatial"][marg].keys(): + for col, n in filter_dict["spatial"][marg]['min'].items(): + L.info("Filtering features of modality '%s' by '%s' in .var to more than %s" % ("spatial", col, n)) + mu.pp.filter_var(sdata["table"], col, lambda x: x >= n) + L.info("Remaining features: %d" % sdata["table"].n_vars) + + if "bool" in filter_dict["spatial"][marg].keys(): + for col, n in filter_dict["spatial"][marg]['bool'].items(): + L.info("Filtering features of modality '%s' by '%s' in .var marked %s" % ("spatial", col, n)) + mu.pp.filter_var(sdata["table"], col, lambda x: x == n) + L.info("Remaining features: %d" % sdata["table"].n_vars) + + +''' # filter more than if filter_dict['run']: # this will go through the modalities one at a time, # then the categories max, min and bool - for mod in mdata.mod.keys(): + for mod in sdata["table"].mod.keys(): L.info(mod) if mod in filter_dict.keys(): for marg in filter_dict[mod].keys(): @@ -108,57 +149,59 @@ def test_matching_df_ignore_cat(new_df, old_df): if "max" in filter_dict[mod][marg].keys(): for col, n in filter_dict[mod][marg]['max'].items(): L.info("Filtering cells of modality '%s' by '%s' in .obs to less than %s" % (mod, col, n)) - mu.pp.filter_obs(mdata.mod[mod], col, lambda x: x <= n) - L.info("Remaining cells: %d" % mdata[mod].n_obs) + mu.pp.filter_obs(sdata["table"].mod[mod], col, lambda x: x <= n) + L.info("Remaining cells: %d" % sdata["table"][mod].n_obs) if "min" in filter_dict[mod][marg].keys(): for col, n in filter_dict[mod][marg]['min'].items(): L.info("Filtering cells of modality '%s' by '%s' in .obs to more than %s" % (mod, col, n)) - mu.pp.filter_obs(mdata.mod[mod], col, lambda x: x >= n) - L.info("Remaining cells: %d" % mdata[mod].n_obs) + mu.pp.filter_obs(sdata["table"].mod[mod], col, lambda x: x >= n) + L.info("Remaining cells: %d" % sdata["table"][mod].n_obs) if "bool" in filter_dict[mod][marg].keys(): for col, n in filter_dict[mod][marg]['bool'].items(): L.info("Filtering cells of modality '%s' by '%s' in .obs marked %s" % (mod, col, n)) - mu.pp.filter_obs(mdata.mod[mod], col, lambda x: x == n) - L.info("Remaining cells: %d" % mdata[mod].n_obs) + mu.pp.filter_obs(sdata["table"].mod[mod], col, lambda x: x == n) + 
L.info("Remaining cells: %d" % sdata["table"][mod].n_obs) if marg == "var": if "max" in filter_dict[mod][marg].keys(): for col, n in filter_dict[mod][marg]['max'].items(): L.info("Filtering features of modality '%s' by '%s' in .var to less than %s" % (mod, col, n)) - mu.pp.filter_var(mdata.mod[mod], col, lambda x: x <= n) - L.info("Remaining features: %d" % mdata[mod].n_vars) + mu.pp.filter_var(sdata["table"].mod[mod], col, lambda x: x <= n) + L.info("Remaining features: %d" % sdata["table"][mod].n_vars) if "min" in filter_dict[mod][marg].keys(): for col, n in filter_dict[mod][marg]['min'].items(): L.info("Filtering features of modality '%s' by '%s' in .var to more than %s" % (mod, col, n)) - mu.pp.filter_var(mdata.mod[mod], col, lambda x: x >= n) - L.info("Remaining features: %d" % mdata[mod].n_vars) + mu.pp.filter_var(sdata["table"].mod[mod], col, lambda x: x >= n) + L.info("Remaining features: %d" % sdata["table"][mod].n_vars) if "bool" in filter_dict[mod][marg].keys(): for col, n in filter_dict[mod][marg]['bool'].items(): L.info("Filtering features of modality '%s' by '%s' in .var marked %s" % (mod, col, n)) - mu.pp.filter_var(mdata.mod[mod], col, lambda x: x == n) - L.info("Remaining features: %d" % mdata[mod].n_vars) + mu.pp.filter_var(sdata["table"].mod[mod], col, lambda x: x == n) + L.info("Remaining features: %d" % sdata["table"][mod].n_vars) +''' + -mdata.update() +#mdata.update() -L.info("After filtering: "+ str(mdata.n_obs) + " cells and " + str(mdata.n_vars) + " features") +L.info("After filtering: "+ str(sdata["table"].n_obs) + " cells and " + str(sdata["table"].n_vars) + " features") -remove_unused_categories(mdata.obs) +remove_unused_categories(sdata["table"].obs) # run quick test before saving out. -assert test_matching_df_ignore_cat(mdata.obs, orig_obs) +assert test_matching_df_ignore_cat(sdata["table"].obs, orig_obs) # write out obs -output_prefix = re.sub(".h5mu", "", os.path.basename(args.output_mudata)) +output_prefix = re.sub(".zarr", "", os.path.basename(args.output_mudata)) L.info("Saving updated obs in a metadata tsv file to './tables/" + output_prefix + "_filtered_cell_metadata.tsv'") -write_obs(mdata, output_prefix=os.path.join("tables/",output_prefix), output_suffix="_filtered_cell_metadata.tsv") +write_obs(sdata["table"], output_prefix=os.path.join("tables/",output_prefix), output_suffix="_filtered_cell_metadata.tsv") # write out the per sample_id cell numbers cell_counts_dict={} -for mm in mdata.mod.keys(): - cell_counts_dict[mm] = mdata[mm].obs.sample_id.value_counts().to_frame('n_cells') +#for mm in mdata.mod.keys(): +cell_counts_dict["spatial"] = sdata["table"].obs.sample_id.value_counts().to_frame('n_cells') cell_counts = pd.concat(cell_counts_dict).reset_index().rename( columns={"level_0": "modality", "level_1": "sample_id"}) @@ -167,10 +210,10 @@ def test_matching_df_ignore_cat(new_df, old_df): L.info("Saving cell counts in a metadata csv file to './tables/" + output_prefix + "_cell_counts.csv'") cell_counts.to_csv("tables/" + output_prefix + "_cell_counts.csv", index=None) -mdata.update() +#mdata.update() -L.info("Saving updated MuData to '%s'" % args.output_mudata) -mdata.write(args.output_mudata) +L.info("Saving updated SpatialData to '%s'" % args.output_mudata) +sdata.write(args.output_mudata) L.info("Done") From a3251d75e1238ce7eb9bd2bdf22253f97a37f72d Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 22 Oct 2024 12:00:10 +0200 Subject: [PATCH 12/57] exchange h5mu for zarr --- panpipes/panpipes/pipeline_preprocess_spatial.py | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/panpipes/panpipes/pipeline_preprocess_spatial.py b/panpipes/panpipes/pipeline_preprocess_spatial.py index b4bf3f71..b7865b00 100644 --- a/panpipes/panpipes/pipeline_preprocess_spatial.py +++ b/panpipes/panpipes/pipeline_preprocess_spatial.py @@ -39,7 +39,7 @@ def gen_filter_jobs(): input_dir = "../qc.data" if not os.path.exists(input_dir): sys.exit("can't find input data") - input_paths=glob.glob(os.path.join(input_dir,"*unfilt.h5mu")) + input_paths=glob.glob(os.path.join(input_dir,"*unfilt.zarr")) for infile_path in input_paths: file_name = os.path.basename(infile_path) outfile = file_name.replace("unfilt","filtered") @@ -55,7 +55,7 @@ def gen_filter_jobs(): def filter_mudata(infile_path,outfile): print('processing file = %s' % str(infile_path)) log_file = os.path.basename(outfile) - log_file= "1_filtering."+log_file.replace("filtered.h5mu","") + ".log" + log_file= "1_filtering."+log_file.replace("filtered.zarr","") + ".log" filter_dict = dictionary_stripper(PARAMS['filtering']) From 5851feb4912a6820e14fc02a96b835cdf3d2f297 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Thu, 24 Oct 2024 16:04:53 +0200 Subject: [PATCH 13/57] bug fix --- panpipes/python_scripts/run_filter_spatial.py | 48 +------------------ 1 file changed, 1 insertion(+), 47 deletions(-) diff --git a/panpipes/python_scripts/run_filter_spatial.py b/panpipes/python_scripts/run_filter_spatial.py index c0734ba3..733c8cad 100644 --- a/panpipes/python_scripts/run_filter_spatial.py +++ b/panpipes/python_scripts/run_filter_spatial.py @@ -76,7 +76,7 @@ def test_matching_df_ignore_cat(new_df, old_df): # load mudata L.info("Reading in SpatialData from '%s'" % args.input_mudata) -sdata = sd.read_zarr(args.input_anndata) +sdata = sd.read_zarr(args.input_mudata) #mdata = mu.read(args.input_mudata) #if isinstance(mdata, AnnData): @@ -136,52 +136,6 @@ def test_matching_df_ignore_cat(new_df, old_df): L.info("Remaining features: %d" % sdata["table"].n_vars) -''' -# filter more than -if filter_dict['run']: - # this will go through the modalities one at a time, - # then the categories max, min and bool - for mod in sdata["table"].mod.keys(): - L.info(mod) - if mod in filter_dict.keys(): - for marg in filter_dict[mod].keys(): - if marg == "obs": - if "max" in filter_dict[mod][marg].keys(): - for col, n in filter_dict[mod][marg]['max'].items(): - L.info("Filtering cells of modality '%s' by '%s' in .obs to less than %s" % (mod, col, n)) - mu.pp.filter_obs(sdata["table"].mod[mod], col, lambda x: x <= n) - L.info("Remaining cells: %d" % sdata["table"][mod].n_obs) - if "min" in filter_dict[mod][marg].keys(): - for col, n in filter_dict[mod][marg]['min'].items(): - L.info("Filtering cells of modality '%s' by '%s' in .obs to more than %s" % (mod, col, n)) - mu.pp.filter_obs(sdata["table"].mod[mod], col, lambda x: x >= n) - L.info("Remaining cells: %d" % sdata["table"][mod].n_obs) - if "bool" in filter_dict[mod][marg].keys(): - for col, n in filter_dict[mod][marg]['bool'].items(): - L.info("Filtering cells of modality '%s' by '%s' in .obs marked %s" % (mod, col, n)) - mu.pp.filter_obs(sdata["table"].mod[mod], col, lambda x: x == n) - L.info("Remaining cells: %d" % sdata["table"][mod].n_obs) - if marg == "var": - if "max" in filter_dict[mod][marg].keys(): - for col, n in filter_dict[mod][marg]['max'].items(): - L.info("Filtering features of modality '%s' by '%s' in .var to less than %s" % (mod, col, n)) - mu.pp.filter_var(sdata["table"].mod[mod], col, lambda x: x <= n) - L.info("Remaining 
features: %d" % sdata["table"][mod].n_vars) - - if "min" in filter_dict[mod][marg].keys(): - for col, n in filter_dict[mod][marg]['min'].items(): - L.info("Filtering features of modality '%s' by '%s' in .var to more than %s" % (mod, col, n)) - mu.pp.filter_var(sdata["table"].mod[mod], col, lambda x: x >= n) - L.info("Remaining features: %d" % sdata["table"][mod].n_vars) - - if "bool" in filter_dict[mod][marg].keys(): - for col, n in filter_dict[mod][marg]['bool'].items(): - L.info("Filtering features of modality '%s' by '%s' in .var marked %s" % (mod, col, n)) - mu.pp.filter_var(sdata["table"].mod[mod], col, lambda x: x == n) - L.info("Remaining features: %d" % sdata["table"][mod].n_vars) - -''' - #mdata.update() From dc969bb8fede4b0a590b41ebd063b411a5ba6b57 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Thu, 24 Oct 2024 16:05:13 +0200 Subject: [PATCH 14/57] change mudata to spatialdata --- .../python_scripts/run_preprocess_spatial.py | 54 ++++++++++--------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/panpipes/python_scripts/run_preprocess_spatial.py b/panpipes/python_scripts/run_preprocess_spatial.py index bdf28cf6..d4e83f89 100644 --- a/panpipes/python_scripts/run_preprocess_spatial.py +++ b/panpipes/python_scripts/run_preprocess_spatial.py @@ -8,6 +8,7 @@ import scanpy as sc import muon as mu import scanpy.experimental as sce +import spatialdata as sd import os import argparse @@ -88,9 +89,10 @@ sc.settings.figdir = figdir sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5)) -L.info("Reading in MuData from '%s'" % args.input_mudata) -mdata = mu.read(args.input_mudata) -spatial = mdata.mod['spatial'] +L.info("Reading in SpatialData from '%s'" % args.input_mudata) +sdata = sd.read_zarr(args.input_mudata) +#mdata = mu.read(args.input_mudata) +#spatial = mdata.mod['spatial'] input_data = os.path.basename(args.input_mudata) pattern = r"_filtered.h5(.*)" @@ -101,12 +103,12 @@ # check if raw data is available #maybe layer of raw data as parameter L.info("Checking if raw data is available") -if X_is_raw(spatial): +if X_is_raw(sdata["table"]): L.info("Saving raw counts from .X to .layers['raw_counts']") - spatial.layers['raw_counts'] = spatial.X.copy() -elif "raw_counts" in spatial.layers : + sdata["table"].layers['raw_counts'] = sdata["table"].X.copy() +elif "raw_counts" in sdata["table"].layers : L.info(".layers['raw_counts'] already exists and copying it to .X") - spatial.X = spatial.layers['raw_counts'].copy() + sdata["table"].X = sdata["table"].layers['raw_counts'].copy() else: L.error("X is not raw data and 'raw_counts' layer not found") sys.exit("X is not raw data and 'raw_counts' layer not found") @@ -116,24 +118,24 @@ if args.norm_hvg_flavour == "squidpy": if args.squidpy_hvg_flavour == "seurat_v3": L.info("Running HVG selection with flavor seurat_v3") - sc.pp.highly_variable_genes(spatial, flavor="seurat_v3", n_top_genes=int(args.n_top_genes), subset=args.filter_by_hvg, + sc.pp.highly_variable_genes(sdata["table"], flavor="seurat_v3", n_top_genes=int(args.n_top_genes), subset=args.filter_by_hvg, batch_key=args.hvg_batch_key) L.info("Log-normalizing data") - sc.pp.normalize_total(spatial) - sc.pp.log1p(spatial) + sc.pp.normalize_total(sdata["table"]) + sc.pp.log1p(sdata["table"]) else: L.info("Log-normalizing data") - sc.pp.normalize_total(spatial) - sc.pp.log1p(spatial) + sc.pp.normalize_total(sdata["table"]) + sc.pp.log1p(sdata["table"]) L.info("Running HVG selection with flavor %s" % args.squidpy_hvg_flavour) - 
sc.pp.highly_variable_genes(spatial, flavor=args.squidpy_hvg_flavour, + sc.pp.highly_variable_genes(sdata["table"], flavor=args.squidpy_hvg_flavour, min_mean=float(args.min_mean), max_mean=float(args.max_mean), min_disp=float(args.min_disp), subset=args.filter_by_hvg, batch_key=args.hvg_batch_key) L.info("Saving log-normalized counts to .layers['lognorm']") - spatial.layers["lognorm"] = spatial.X.copy() + sdata["table"].layers["lognorm"] = sdata["table"].X.copy() # plot HVGs: - sc.pl.highly_variable_genes(spatial, show=False, save="_genes_highlyvar" + "."+ sprefix+ ".png") + sc.pl.highly_variable_genes(sdata["table"], show=False, save="_genes_highlyvar" + "."+ sprefix+ ".png") elif args.norm_hvg_flavour == "seurat": if args.clip is None: @@ -145,35 +147,35 @@ else: clip = float(args.clip) L.info("Running Pearson Residuals HVG selection") - sce.pp.highly_variable_genes(spatial, theta=float(args.theta), clip=clip, n_top_genes=int(args.n_top_genes), + sce.pp.highly_variable_genes(sdata["table"], theta=float(args.theta), clip=clip, n_top_genes=int(args.n_top_genes), batch_key=args.hvg_batch_key, flavor='pearson_residuals', layer="raw_counts", subset=args.filter_by_hvg) L.info("Running Pearson Residuals normalization") - sce.pp.normalize_pearson_residuals(spatial, theta=float(args.theta), clip=clip, layer="raw_counts") + sce.pp.normalize_pearson_residuals(sdata["table"], theta=float(args.theta), clip=clip, layer="raw_counts") L.info("Saving log-normalized counts to .layers['norm_pearson_resid']") - spatial.layers["norm_pearson_resid"] = spatial.X.copy() + sdata["table"].layers["norm_pearson_resid"] = sdata["table"].X.copy() else: # error or warning? L.warning("No normalization and HVG selection was performed! To perform, please specify the 'norm_hvg_flavour' as either 'squidpy' or 'seurat'") -if "highly_variable" in spatial.var: - L.info("You have %s Highly Variable Features", np.sum(spatial.var.highly_variable)) +if "highly_variable" in sdata["table"].var: + L.info("You have %s Highly Variable Features", np.sum(sdata["table"].var.highly_variable)) #PCA L.info("Running PCA") -sc.pp.pca(spatial, n_comps=int(args.n_pcs), svd_solver='arpack', random_state=0) +sc.pp.pca(sdata["table"], n_comps=int(args.n_pcs), svd_solver='arpack', random_state=0) L.info("Plotting PCA") -sc.pl.pca(spatial, save = "_vars" + "."+ sprefix+".png") -sc.pl.pca_variance_ratio(spatial, log=True, n_pcs=int(args.n_pcs), save= "."+ sprefix+".png") +sc.pl.pca(sdata["table"], save = "_vars" + "."+ sprefix+".png") +sc.pl.pca_variance_ratio(sdata["table"], log=True, n_pcs=int(args.n_pcs), save= "."+ sprefix+".png") -mdata.update() -L.info("Saving updated MuData to '%s'" % args.output_mudata) -mdata.write(args.output_mudata) +#mdata.update() +L.info("Saving updated SpatialData to '%s'" % args.output_mudata) +sdata["table"].write(args.output_mudata) L.info("Done") From 8920cdeeefd9818918e195dbda328c69cfbd3c2a Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Thu, 24 Oct 2024 16:27:46 +0200 Subject: [PATCH 15/57] bug fix --- panpipes/panpipes/pipeline_qc_spatial.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/panpipes/panpipes/pipeline_qc_spatial.py b/panpipes/panpipes/pipeline_qc_spatial.py index c84ade7a..93cd2fa8 100644 --- a/panpipes/panpipes/pipeline_qc_spatial.py +++ b/panpipes/panpipes/pipeline_qc_spatial.py @@ -127,7 +127,7 @@ def load_mudatas(spatial_path, outfile, @follows(mkdir("qc.data")) @follows(mkdir("./figures")) @transform(load_mudatas, - regex("./tmp/(.*)_raw.h5(.*)"), + 
regex("./tmp/(.*)_raw.zarr"), r"./logs/2_spatialQC_\1.log") def spatialQC(infile,log_file): spatial_filetype = assays[infile] @@ -175,7 +175,7 @@ def run_plotqc_query(pqc_dict): @follows(mkdir("./figures/spatial")) @active_if(run_plotqc_query(PARAMS['plotqc'])) @transform(load_mudatas, - regex("./tmp/(.*)_raw.h5(.*)"), + regex("./tmp/(.*)_raw.zarr"), r"./logs/3_qcplot.\1.log") def plotQC_spatial(unfilt_file,log_file): spatial_filetype = assays[unfilt_file] From f1ac7857a379f19dbe17b5ac94f7bbfdfb9abdd8 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Thu, 24 Oct 2024 16:36:47 +0200 Subject: [PATCH 16/57] bug fix --- panpipes/panpipes/pipeline_preprocess_spatial.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/panpipes/panpipes/pipeline_preprocess_spatial.py b/panpipes/panpipes/pipeline_preprocess_spatial.py index b7865b00..d7a4053a 100644 --- a/panpipes/panpipes/pipeline_preprocess_spatial.py +++ b/panpipes/panpipes/pipeline_preprocess_spatial.py @@ -85,7 +85,7 @@ def run_plotqc_query(pqc_dict): @active_if(run_plotqc_query(PARAMS['plotqc'])) @active_if(PARAMS['filtering_run']) @transform(filter_mudata, - regex("./filtered.data/(.*)_filtered.h5(.*)"), + regex("./filtered.data/(.*)_filtered.zarr"), r"./logs/2_postfilterplot.\1.log") def postfilterplot_spatial(filt_file,log_file): print(filt_file) @@ -109,7 +109,7 @@ def postfilterplot_spatial(filt_file,log_file): @transform(filter_mudata, - regex("./filtered.data/(.*)_filtered.h5(.*)"), + regex("./filtered.data/(.*)_filtered.zarr"), r"./logs/3_preprocess.\1.log") def spatial_preprocess(filt_file,log_file): if os.path.exists("figures/spatial") is False: From cb165266440d06083fccf7dedf737202f4ac7800 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Thu, 24 Oct 2024 16:37:12 +0200 Subject: [PATCH 17/57] bug fixes --- panpipes/python_scripts/run_preprocess_spatial.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/panpipes/python_scripts/run_preprocess_spatial.py b/panpipes/python_scripts/run_preprocess_spatial.py index d4e83f89..5e389e04 100644 --- a/panpipes/python_scripts/run_preprocess_spatial.py +++ b/panpipes/python_scripts/run_preprocess_spatial.py @@ -95,7 +95,7 @@ #spatial = mdata.mod['spatial'] input_data = os.path.basename(args.input_mudata) -pattern = r"_filtered.h5(.*)" +pattern = r"_filtered.zarr" match = re.search(pattern, input_data) sprefix = input_data[:match.start()] @@ -175,7 +175,7 @@ #mdata.update() L.info("Saving updated SpatialData to '%s'" % args.output_mudata) -sdata["table"].write(args.output_mudata) +sdata.write(args.output_mudata) L.info("Done") From 24f035ac9707641c3fd79beb4287b2072b6529d0 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Thu, 7 Nov 2024 15:05:26 +0100 Subject: [PATCH 18/57] add vpt parameters --- panpipes/funcs/io.py | 63 +++++++++++++++--------- panpipes/panpipes/pipeline_qc_spatial.py | 37 ++++++++------ 2 files changed, 62 insertions(+), 38 deletions(-) diff --git a/panpipes/funcs/io.py b/panpipes/funcs/io.py index c6ff2c5c..b50496da 100644 --- a/panpipes/funcs/io.py +++ b/panpipes/funcs/io.py @@ -157,40 +157,58 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): else: spatial_path = caf["spatial_path"][nn] if caf['spatial_filetype'][nn]=="vizgen": - spatial_counts = None - spatial_fullres_image_file = None - spatial_tissue_positions_file = None - spatial_scalefactors_file = None + visium_feature_bc_matrix = None + visium_fullres_image_file = None + visium_tissue_positions_file = None + visium_scalefactors_file = None 
spatial_filetype = caf['spatial_filetype'][nn] + if pd.notna(caf['vpt_cell_by_gene'][nn]): + vpt_cell_by_gene = caf['vpt_cell_by_gene'][nn] + else: + vpt_cell_by_gene = None + if pd.notna(caf['vpt_cell_metadata'][nn]): + vpt_cell_metadata = caf['vpt_cell_metadata'][nn] + else: + vpt_cell_metadata = None + if pd.notna(caf['vpt_cell_boundaries'][nn]): + vpt_cell_boundaries = caf['vpt_cell_boundaries'][nn] + else: + vpt_cell_boundaries = None elif caf['spatial_filetype'][nn]=="visium": + vpt_cell_by_gene = None + vpt_cell_metadata = None + vpt_cell_boundaries = None spatial_filetype = caf['spatial_filetype'][nn] #counts file - if pd.notna(caf["spatial_counts"][nn]): - spatial_counts= caf["spatial_counts"][nn] + if pd.notna(caf["visium_feature_bc_matrix"][nn]): + visium_feature_bc_matrix= caf["visium_feature_bc_matrix"][nn] else: - spatial_counts = None + visium_feature_bc_matrix = None # fullres image - if pd.notna(caf["spatial_fullres_image_file"][nn]): - spatial_fullres_image_file= caf["spatial_fullres_image_file"][nn] + if pd.notna(caf["visium_fullres_image_file"][nn]): + visium_fullres_image_file= caf["visium_fullres_image_file"][nn] else: - spatial_fullres_image_file = None + visium_fullres_image_file = None # tissue position - if pd.notna(caf["spatial_tissue_positions_file"][nn]): - spatial_tissue_positions_file= caf["spatial_tissue_positions_file"][nn] + if pd.notna(caf["visium_tissue_positions_file"][nn]): + visium_tissue_positions_file= caf["visium_tissue_positions_file"][nn] else: - spatial_tissue_positions_file = None + visium_tissue_positions_file = None # scalefactor - if pd.notna(caf["spatial_scalefactors_file"][nn]): - spatial_scalefactors_file= caf["spatial_scalefactors_file"][nn] + if pd.notna(caf["visium_scalefactors_file"][nn]): + visium_scalefactors_file= caf["visium_scalefactors_file"][nn] else: - spatial_scalefactors_file = None + visium_scalefactors_file = None else: spatial_path= None spatial_filetype = None - spatial_counts = None - spatial_fullres_image_file = None - spatial_tissue_positions_file = None - spatial_scalefactors_file = None + visium_feature_bc_matrix = None + visium_fullres_image_file = None + visium_tissue_positions_file = None + visium_scalefactors_file = None + vpt_cell_by_gene = None + vpt_cell_metadata = None + vpt_cell_boundaries = None if 'barcode_mtd_path' in caf.columns: cell_mtd_path = caf['barcode_mtd_path'][nn] #not yielding this right now! 
@@ -204,8 +222,9 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): outfile = outfile + ".zarr" sample_id = caf['sample_id'][nn] - yield spatial_path, outfile, \ - sample_id, spatial_filetype, spatial_counts, spatial_fullres_image_file, spatial_tissue_positions_file, spatial_scalefactors_file + yield spatial_path, outfile, sample_id, spatial_filetype, \ + visium_feature_bc_matrix, visium_fullres_image_file, visium_tissue_positions_file, visium_scalefactors_file, \ + vpt_cell_by_gene, vpt_cell_metadata, vpt_cell_boundaries def read_anndata( diff --git a/panpipes/panpipes/pipeline_qc_spatial.py b/panpipes/panpipes/pipeline_qc_spatial.py index 93cd2fa8..3bc90556 100644 --- a/panpipes/panpipes/pipeline_qc_spatial.py +++ b/panpipes/panpipes/pipeline_qc_spatial.py @@ -74,8 +74,8 @@ def gen_load_spatial_anndata_jobs(): @follows(mkdir("tmp")) @files(gen_load_spatial_anndata_jobs) def load_mudatas(spatial_path, outfile, - sample_id, spatial_filetype, spatial_counts, - spatial_fullres_image_file, spatial_tissue_positions_file, spatial_scalefactors_file): + sample_id, spatial_filetype, visium_feature_bc_matrix, visium_fullres_image_file, visium_tissue_positions_file, visium_scalefactors_file, + vpt_cell_by_gene, vpt_cell_metadata, vpt_cell_boundaries): path_dict = {'spatial':spatial_path} @@ -83,17 +83,16 @@ def load_mudatas(spatial_path, outfile, print('sample_id = %s' % str(sample_id)) print('outfile = %s' % str(outfile)) print('spatial_filetype = %s' % str(spatial_filetype)) - #print('spatial_counts = %s' % str(spatial_counts)) - #if spatial_filetype == "vizgen": - # print('spatial_metadata = %s' % str(spatial_metadata)) - # print('spatial_transformation = %s' % str(spatial_transformation)) - #else: - # print("visium") + if spatial_filetype == "visium": - print('spatial_counts = %s' % str(spatial_counts)) - print('spatial_fullres_image_file= %s' % str(spatial_fullres_image_file)) - print('spatial_tissue_positions_file= %s' % str(spatial_tissue_positions_file)) - print('spatial_scalefactors_file= %s' % str(spatial_scalefactors_file)) + print('visium_feature_bc_matrix = %s' % str(visium_feature_bc_matrix)) + print('visium_fullres_image_file= %s' % str(visium_fullres_image_file)) + print('visium_tissue_positions_file= %s' % str(visium_tissue_positions_file)) + print('visium_scalefactors_file= %s' % str(visium_scalefactors_file)) + if spatial_filetype == "vizgen": + print('vpt_cell_by_gene = %s' % str(vpt_cell_by_gene)) + print('vpt_cell_metadata= %s' % str(vpt_cell_metadata)) + print('vpt_cell_boundaries= %s' % str(vpt_cell_boundaries)) modality_dict = {k:True if path_dict[k] is not None else False for k,v in {'spatial': True}.items() } print(modality_dict) @@ -109,10 +108,16 @@ def load_mudatas(spatial_path, outfile, """ if spatial_filetype == "visium": cmd += """ - --spatial_counts %(spatial_counts)s - --scalefactors_file %(spatial_scalefactors_file)s - --fullres_image_file %(spatial_fullres_image_file)s - --tissue_positions_file %(spatial_tissue_positions_file)s + --visium_feature_bc_matrix %(visium_feature_bc_matrix)s + --scalefactors_file %(visium_scalefactors_file)s + --fullres_image_file %(visium_fullres_image_file)s + --tissue_positions_file %(visium_tissue_positions_file)s + """ + if spatial_filetype == "vizgen": + cmd += """ + --vpt_cell_by_gene %(vpt_cell_by_gene)s + --vpt_cell_metadata %(vpt_cell_metadata)s + --vpt_cell_boundaries %(vpt_cell_boundaries)s """ cmd += " > logs/1_make_mudatas_%(sample_id)s.log" job_kwargs["job_threads"] = 
PARAMS['resources_threads_medium'] From ac41674d896ff14e2ff94773e7eb809976b27f88 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Thu, 7 Nov 2024 15:25:52 +0100 Subject: [PATCH 19/57] add vpt output parameter --- .../make_spatialData_from_csv.py | 50 +++++++++++-------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/panpipes/python_scripts/make_spatialData_from_csv.py b/panpipes/python_scripts/make_spatialData_from_csv.py index 81050809..d34c4822 100644 --- a/panpipes/python_scripts/make_spatialData_from_csv.py +++ b/panpipes/python_scripts/make_spatialData_from_csv.py @@ -11,6 +11,7 @@ import spatialdata_io as sd_io from mudata import MuData import os +from pathlib import Path """ this script copies the make_adata_from_csv.py that creates ONE MUDATA PER SAMPLE, with in each ONE LAYER per modality @@ -50,7 +51,7 @@ parser.add_argument('--spatial_filetype', default=None, help='') -parser.add_argument('--spatial_counts', +parser.add_argument('--visium_feature_bc_matrix', default=None, help='') parser.add_argument('--scalefactors_file', @@ -62,12 +63,15 @@ parser.add_argument('--tissue_positions_file', default=None, help='') -#parser.add_argument('--spatial_metadata', -# default=None, -# help='') -#parser.add_argument('--spatial_transformation', -# default=None, -# help='') +parser.add_argument('--vpt_cell_by_gene', + default=None, + help='') +parser.add_argument('--vpt_cell_metadata', + default=None, + help='') +parser.add_argument('--vpt_cell_boundaries', + default=None, + help='') parser.set_defaults(verbose=True) args, opt = parser.parse_known_args() @@ -84,10 +88,13 @@ all_files = { "spatial":[args.spatial_infile, #path args.spatial_filetype, #needed for the load_adata_in function to call one of vizgen,visium - args.spatial_counts, #name of the counts file, mandatory for squidpy + args.visium_feature_bc_matrix, #name of the counts file, mandatory for squidpy args.fullres_image_file, # visium args.tissue_positions_file, #visium - args.scalefactors_file]} # visium + args.scalefactors_file, + args.vpt_cell_by_gene, + args.vpt_cell_metadata, + args.vpt_cell_boundaries ]} # visium # args.spatial_metadata, #name of the metadata file, mandatory for squidpy # args.spatial_transformation]} #subset to the modalities we want from permf (in this case only spatial) @@ -130,26 +137,25 @@ def check_dir_transform(infile_path, transform_file): if args.spatial_filetype=="vizgen": L.info("Reading in Vizgen data with squidpy.read.vizgen() into AnnData from directory " + args.spatial_infile) - sdata = sd_io.merscope(path = args.spatial_infile) -# adata = sq.read.vizgen(path = args.spatial_infile, #path, mandatory for squidpy -# counts_file=args.spatial_counts, #name of the counts file, mandatory for squidpy -# meta_file = args.spatial_metadata, #name of the metadata file, mandatory for squidpy -# transformation_file=args.spatial_transformation, -# library_id = str(args.sample_id)) #this also has kwargs for read_10x_h5 but keep simple -# adata.uns["spatial"][str(args.sample_id)]["scalefactors"]["transformation_matrix"].columns = adata.uns["spatial"][str(args.sample_id)]["scalefactors"]["transformation_matrix"].columns.astype(str) + # check that all vpt parameters are not None + if None not in (args.vpt_cell_by_gene, args.vpt_cell_metadata, args.vpt_cell_boundaries): + vpt_outputs = {'cell_by_gene': Path(args.vpt_cell_by_gene) , + 'cell_metadata': Path(args.vpt_cell_metadata) , + 'cell_boundaries': Path(args.vpt_cell_boundaries)} + sdata = sd_io.merscope(path = args.spatial_infile, 
vpt_outputs=vpt_outputs) + else: + sdata = sd_io.merscope(path = args.spatial_infile) + elif args.spatial_filetype =="visium": L.info("Reading in Visium data with squidpy.read.visium() into AnnData from directory " + args.spatial_infile) sdata = sd_io.visium(path=args.spatial_infile, dataset_id=str(args.sample_id), - counts_file=args.spatial_counts, + counts_file=args.visium_feature_bc_matrix, fullres_image_file=args.fullres_image_file, tissue_positions_file=args.tissue_positions_file, scalefactors_file=args.scalefactors_file) - #adata = sq.read.visium(path = args.spatial_infile, #path, mandatory for squidpy - # counts_file=args.spatial_counts, #name of the counts file, mandatory for squidpy - # library_id = str(args.sample_id) - # ) #this also has kwargs for read_10x_h5 but keep simple - + + L.info("Resulting SpatialData is:") L.info(sdata) #L.info("Creating MuData with .mod['spatial']") From 9acc5d0a3b39eeb539fb720e9ba6001d898683a5 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Mon, 11 Nov 2024 11:35:55 +0100 Subject: [PATCH 20/57] add xenium --- panpipes/funcs/io.py | 9 +++++++++ panpipes/python_scripts/make_spatialData_from_csv.py | 4 +++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/panpipes/funcs/io.py b/panpipes/funcs/io.py index b50496da..af41d6ff 100644 --- a/panpipes/funcs/io.py +++ b/panpipes/funcs/io.py @@ -156,6 +156,15 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): spatial_filetype = None else: spatial_path = caf["spatial_path"][nn] + if caf['spatial_filetype'][nn]=="xenium": + spatial_filetype = caf['spatial_filetype'][nn] + visium_feature_bc_matrix = None + visium_fullres_image_file = None + visium_tissue_positions_file = None + visium_scalefactors_file = None + vpt_cell_by_gene = None + vpt_cell_metadata = None + vpt_cell_boundaries = None if caf['spatial_filetype'][nn]=="vizgen": visium_feature_bc_matrix = None visium_fullres_image_file = None diff --git a/panpipes/python_scripts/make_spatialData_from_csv.py b/panpipes/python_scripts/make_spatialData_from_csv.py index d34c4822..0d22a5c6 100644 --- a/panpipes/python_scripts/make_spatialData_from_csv.py +++ b/panpipes/python_scripts/make_spatialData_from_csv.py @@ -155,7 +155,9 @@ def check_dir_transform(infile_path, transform_file): tissue_positions_file=args.tissue_positions_file, scalefactors_file=args.scalefactors_file) - +elif args.spatial_filetype =="xenium": + sdata = sd_io.xenium(path = args.spatial_infile) + L.info("Resulting SpatialData is:") L.info(sdata) #L.info("Creating MuData with .mod['spatial']") From 8ab1570e057ee3a8fe8a72d80470d429d51460c6 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 26 Nov 2024 11:06:05 +0100 Subject: [PATCH 21/57] adjust sample submission file --- .github/workflows/spatial_ingestion_visium-ci.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/spatial_ingestion_visium-ci.yml b/.github/workflows/spatial_ingestion_visium-ci.yml index 8fb57e05..a3bac8ca 100644 --- a/.github/workflows/spatial_ingestion_visium-ci.yml +++ b/.github/workflows/spatial_ingestion_visium-ci.yml @@ -78,12 +78,11 @@ jobs: - name: Preparing the submission file run: | cd spatial/ingestion - curl -o sample_file_qc_spatial.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/main/docs/ingesting_visium_data/sample_file_qc_spatial.txt - + curl -o sample_file_qc_spatial.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_visium_data/sample_file_qc_visium.txt - 
name: Preparing the yaml file run: | cd spatial/ingestion - curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/main/docs/ingesting_visium_data/pipeline.yml + curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_visium_data/pipeline.yml - name: File tree if: env.debug == 'true' From f3b8747b3ade6b9c851c105c55ce42325ede74d6 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 26 Nov 2024 11:08:41 +0100 Subject: [PATCH 22/57] add spatialdata --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2dc39d84..b888fba6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,9 @@ spatial = [ "scipy==1.12.0", "squidpy", "cell2location", - "tangram-sc" + "tangram-sc", + "spatialdata", + "spatialdata-io" ] refmap_old = [ From 1937c1a63b6961162ab75e8726132d8cd823c6b0 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 26 Nov 2024 11:14:34 +0100 Subject: [PATCH 23/57] change file name --- .github/workflows/spatial_ingestion_visium-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/spatial_ingestion_visium-ci.yml b/.github/workflows/spatial_ingestion_visium-ci.yml index a3bac8ca..77da20f2 100644 --- a/.github/workflows/spatial_ingestion_visium-ci.yml +++ b/.github/workflows/spatial_ingestion_visium-ci.yml @@ -78,7 +78,7 @@ jobs: - name: Preparing the submission file run: | cd spatial/ingestion - curl -o sample_file_qc_spatial.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_visium_data/sample_file_qc_visium.txt + curl -o sample_file_qc_visium.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_visium_data/sample_file_qc_visium.txt - name: Preparing the yaml file run: | cd spatial/ingestion From 5d172140fbc123afad9ebfa6cff9c705935e3b4c Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 26 Nov 2024 11:25:20 +0100 Subject: [PATCH 24/57] adjust submission file --- .github/workflows/spatial_ingestion_merfish-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/spatial_ingestion_merfish-ci.yml b/.github/workflows/spatial_ingestion_merfish-ci.yml index 89812404..a270bfbf 100644 --- a/.github/workflows/spatial_ingestion_merfish-ci.yml +++ b/.github/workflows/spatial_ingestion_merfish-ci.yml @@ -75,12 +75,12 @@ jobs: - name: Preparing the submission file run: | cd spatial/ingestion_merfish - curl -o sample_file_qc_spatial.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/main/docs/ingesting_merfish_data/sample_file_qc_spatial.txt + curl -o sample_file_qc_merfish.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_merfish_data/sample_file_qc_merfish.txt - name: Preparing the yaml file run: | cd spatial/ingestion_merfish - curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/main/docs/ingesting_merfish_data/pipeline.yml + curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_merfish_data/pipeline.yml - name: File tree if: env.debug == 'true' From 72f5d5bd2db9c9d3ae19fe4cddea97c2427cdf48 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 26 Nov 2024 11:33:54 +0100 Subject: [PATCH 25/57] change None to string --- panpipes/python_scripts/make_spatialData_from_csv.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/panpipes/python_scripts/make_spatialData_from_csv.py b/panpipes/python_scripts/make_spatialData_from_csv.py index 0d22a5c6..69303ea4 100644 --- a/panpipes/python_scripts/make_spatialData_from_csv.py +++ b/panpipes/python_scripts/make_spatialData_from_csv.py @@ -138,7 +138,7 @@ def check_dir_transform(infile_path, transform_file): if args.spatial_filetype=="vizgen": L.info("Reading in Vizgen data with squidpy.read.vizgen() into AnnData from directory " + args.spatial_infile) # check that all vpt parameters are not None - if None not in (args.vpt_cell_by_gene, args.vpt_cell_metadata, args.vpt_cell_boundaries): + if "None" not in (args.vpt_cell_by_gene, args.vpt_cell_metadata, args.vpt_cell_boundaries): vpt_outputs = {'cell_by_gene': Path(args.vpt_cell_by_gene) , 'cell_metadata': Path(args.vpt_cell_metadata) , 'cell_boundaries': Path(args.vpt_cell_boundaries)} From afd637858b14b44b0472b99e3bd56ede5c1fb9c9 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 26 Nov 2024 11:42:47 +0100 Subject: [PATCH 26/57] rename data files --- .github/workflows/spatial_ingestion_merfish-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/spatial_ingestion_merfish-ci.yml b/.github/workflows/spatial_ingestion_merfish-ci.yml index a270bfbf..e0b8ae06 100644 --- a/.github/workflows/spatial_ingestion_merfish-ci.yml +++ b/.github/workflows/spatial_ingestion_merfish-ci.yml @@ -58,11 +58,11 @@ jobs: run: | mkdir spatial spatial/ingestion_merfish spatial/ingestion_merfish/data cd spatial/ingestion_merfish/data - curl -L -o datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv https://figshare.com/ndownloader/files/45028624 - curl -L -o datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv https://figshare.com/ndownloader/files/45028621 + curl -L -o cell_by_gene.csv https://figshare.com/ndownloader/files/45028624 + curl -L -o cell_metadata.csv https://figshare.com/ndownloader/files/45028621 mkdir images cd images - curl -L -o datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_images_micron_to_mosaic_pixel_transform.csv https://figshare.com/ndownloader/files/45028645 + curl -L -o micron_to_mosaic_pixel_transform.csv https://figshare.com/ndownloader/files/45028645 # Note: we run the following to test that the commands works From 90ef6e605ff45f9e66f0777c90d401f4c7a911ef Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Sun, 1 Dec 2024 11:57:38 +0100 Subject: [PATCH 27/57] add ingest merscope action --- .../spatial_ingestion_merscope-ci.yml | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 .github/workflows/spatial_ingestion_merscope-ci.yml diff --git a/.github/workflows/spatial_ingestion_merscope-ci.yml b/.github/workflows/spatial_ingestion_merscope-ci.yml new file mode 100644 index 00000000..0d10ac0d --- /dev/null +++ b/.github/workflows/spatial_ingestion_merscope-ci.yml @@ -0,0 +1,105 @@ +name: Run tutorials (spatial ingest merscope) + +on: + push: + branches: + - main + pull_request: + branches: + - main + +env: + debug: 'true' + +jobs: + spatial_ingest_merscope: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest"] # , "macos-latest", "windows-latest" + python-version: ["3.10"] + + steps: + - uses: actions/checkout@v4 + + - name: File tree + if: env.debug == 'true' + run: tree + + - uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-version: latest + 
auto-activate-base: true + auto-update-conda: true + channels: conda-forge + channel-priority: strict + activate-environment: pipeline_env + environment-file: pipeline_env.yaml + + - name: Install Panpipes + shell: bash -el {0} + run: | + pip install .[spatial] + conda list + + - name: Conda info + if: env.debug == 'true' + shell: bash -el {0} + run: conda info + + - name: Conda list + if: env.debug == 'true' + shell: pwsh + run: conda list + + + - name: Preparing the data + run: | + mkdir spatial spatial/ingestion_merscope spatial/ingestion_merscope/data + cd spatial/ingestion_merscope/data + curl -L -o cell_by_gene.csv https://figshare.com/ndownloader/files/50899455 + curl -L -o cell_metadata.csv https://figshare.com/ndownloader/files/50899452 + curl -L -o cellpose_micron_space.parquet https://figshare.com/ndownloader/files/50899458 + curl -L -o detected_transcripts.csv https://figshare.com/ndownloader/files/50899476 + mkdir images + cd images + curl -L -o micron_to_mosaic_pixel_transform.csv https://figshare.com/ndownloader/files/50899449 + + + # Note: we run the following to test that the commands works + - name: Preparing the configuration file + shell: bash -el {0} + run: | + cd spatial/ingestion_merscope + panpipes qc_spatial config + + - name: Preparing the submission file + run: | + cd spatial/ingestion_merscope + curl -o sample_file_qc_merscope.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_merscope_data/sample_file_qc_merscope.txt + + - name: Preparing the yaml file + run: | + cd spatial/ingestion_merscope + curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_merscope_data/pipeline.yml + + - name: File tree + if: env.debug == 'true' + run: tree spatial/ingestion_merscope + + - name: Review pipeline tasks + shell: bash -el {0} + run: | + cd spatial/ingestion_merscope + panpipes qc_spatial show full --local + + - name: Run pipeline tasks + shell: bash -el {0} + run: | + cd spatial/ingestion_merscope + panpipes qc_spatial make full --local + + - name: File tree + if: env.debug == 'true' + run: tree spatial/ingestion_merscope From 5fef8cf3c5f7e7a6b3d5cd15e71718d6ed3d3f8f Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Sun, 1 Dec 2024 12:07:34 +0100 Subject: [PATCH 28/57] test to pin squidpy version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b888fba6..199bb99b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ spatial = [ "jaxlib==0.4.23", "scvi-tools==1.0.4", "scipy==1.12.0", - "squidpy", + "squidpy>1.6.1", "cell2location", "tangram-sc", "spatialdata", From 1f33c411ac3d16bee5675f62a90d261b460bfa57 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 17 Dec 2024 12:05:40 +0100 Subject: [PATCH 29/57] remove detected transcripts --- .github/workflows/spatial_ingestion_merscope-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/spatial_ingestion_merscope-ci.yml b/.github/workflows/spatial_ingestion_merscope-ci.yml index 0d10ac0d..3a2da28a 100644 --- a/.github/workflows/spatial_ingestion_merscope-ci.yml +++ b/.github/workflows/spatial_ingestion_merscope-ci.yml @@ -61,7 +61,7 @@ jobs: curl -L -o cell_by_gene.csv https://figshare.com/ndownloader/files/50899455 curl -L -o cell_metadata.csv https://figshare.com/ndownloader/files/50899452 curl -L -o cellpose_micron_space.parquet https://figshare.com/ndownloader/files/50899458 - curl 
-L -o detected_transcripts.csv https://figshare.com/ndownloader/files/50899476 + # curl -L -o detected_transcripts.csv https://figshare.com/ndownloader/files/50899476 mkdir images cd images curl -L -o micron_to_mosaic_pixel_transform.csv https://figshare.com/ndownloader/files/50899449 From 2f4392c4756320605cc59ca1382c8a7926976062 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 17 Dec 2024 12:05:56 +0100 Subject: [PATCH 30/57] remove squidpy version pin --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 199bb99b..b888fba6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ spatial = [ "jaxlib==0.4.23", "scvi-tools==1.0.4", "scipy==1.12.0", - "squidpy>1.6.1", + "squidpy", "cell2location", "tangram-sc", "spatialdata", From cf4ff073a8b074d7f7bae248bcacfb95b2b9fdeb Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 17 Dec 2024 12:06:09 +0100 Subject: [PATCH 31/57] remove unnecessary imports --- panpipes/python_scripts/make_spatialData_from_csv.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/panpipes/python_scripts/make_spatialData_from_csv.py b/panpipes/python_scripts/make_spatialData_from_csv.py index 69303ea4..d6c85ac6 100644 --- a/panpipes/python_scripts/make_spatialData_from_csv.py +++ b/panpipes/python_scripts/make_spatialData_from_csv.py @@ -1,15 +1,15 @@ import argparse import yaml # import scanpy as sc -import pandas as pd +#import pandas as pd # import numpy as np # from scipy.sparse import csr_matrix -import muon as mu -import warnings -from muon._atac.tools import add_peak_annotation, locate_fragments -import squidpy as sq +#import muon as mu +#import warnings +#from muon._atac.tools import add_peak_annotation, locate_fragments +#import squidpy as sq import spatialdata_io as sd_io -from mudata import MuData +#from mudata import MuData import os from pathlib import Path """ From 124d849656e32163eb4657d2839d7d8e6d011ef2 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 17 Dec 2024 12:06:22 +0100 Subject: [PATCH 32/57] add ingest xenium action --- .../workflows/spatial_ingestion_xenium.yml | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/workflows/spatial_ingestion_xenium.yml diff --git a/.github/workflows/spatial_ingestion_xenium.yml b/.github/workflows/spatial_ingestion_xenium.yml new file mode 100644 index 00000000..0cd3fcdb --- /dev/null +++ b/.github/workflows/spatial_ingestion_xenium.yml @@ -0,0 +1,106 @@ +name: Run tutorials (spatial ingest xenium) + +on: + push: + branches: + - main + pull_request: + branches: + - main + +env: + debug: 'true' + +jobs: + spatial_ingest_xenium: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest"] # , "macos-latest", "windows-latest" + python-version: ["3.10"] + + steps: + - uses: actions/checkout@v4 + + - name: File tree + if: env.debug == 'true' + run: tree + + - uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-version: latest + auto-activate-base: true + auto-update-conda: true + channels: conda-forge + channel-priority: strict + activate-environment: pipeline_env + environment-file: pipeline_env.yaml + + - name: Install Panpipes + shell: bash -el {0} + run: | + pip install .[spatial] + conda list + + - name: Conda info + if: env.debug == 'true' + shell: bash -el {0} + run: conda info + + - name: Conda list + if: env.debug == 'true' + shell: pwsh + run: conda list + + + - name: Preparing the data + run: | + mkdir spatial 
spatial/ingestion_xenium spatial/ingestion_xenium/data + cd spatial/ingestion_xenium/data + curl -L -o experiment.xenium https://figshare.com/ndownloader/files/51243614 + curl -L -o nucleus_boundaries.parquet https://figshare.com/ndownloader/files/51243605 + curl -L -o cell_boundaries.parquet https://figshare.com/ndownloader/files/51243596 + curl -L -o transcripts.parquet https://figshare.com/ndownloader/files/51243608 + curl -L -o cell_feature_matrix.h5 https://figshare.com/ndownloader/files/51243599 + curl -L -o cells.parquet https://figshare.com/ndownloader/files/51243620 + curl -L -o morphology_mip.ome.tif https://figshare.com/ndownloader/files/51243623 + curl -L -o morphology_focus.ome.tif https://figshare.com/ndownloader/files/51243626 + + + # Note: we run the following to test that the commands works + - name: Preparing the configuration file + shell: bash -el {0} + run: | + cd spatial/ingestion_xenium + panpipes qc_spatial config + + - name: Preparing the submission file + run: | + cd spatial/ingestion_xenium + curl -o sample_file_qc_xenium.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_xenium_data/sample_file_qc_xenium.txt + + - name: Preparing the yaml file + run: | + cd spatial/ingestion_xenium + curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_xenium_data/pipeline.yml + + - name: File tree + if: env.debug == 'true' + run: tree spatial/ingestion_xenium + + - name: Review pipeline tasks + shell: bash -el {0} + run: | + cd spatial/ingestion_xenium + panpipes qc_spatial show full --local + + - name: Run pipeline tasks + shell: bash -el {0} + run: | + cd spatial/ingestion_xenium + panpipes qc_spatial make full --local + + - name: File tree + if: env.debug == 'true' + run: tree spatial/ingestion_xenium From a79bc580134a9f312a5a275f30f92d4b7f8ae16d Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 17 Dec 2024 12:32:53 +0100 Subject: [PATCH 33/57] add cells.zarr file --- .github/workflows/spatial_ingestion_xenium.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/spatial_ingestion_xenium.yml b/.github/workflows/spatial_ingestion_xenium.yml index 0cd3fcdb..b33b7f93 100644 --- a/.github/workflows/spatial_ingestion_xenium.yml +++ b/.github/workflows/spatial_ingestion_xenium.yml @@ -66,6 +66,7 @@ jobs: curl -L -o cells.parquet https://figshare.com/ndownloader/files/51243620 curl -L -o morphology_mip.ome.tif https://figshare.com/ndownloader/files/51243623 curl -L -o morphology_focus.ome.tif https://figshare.com/ndownloader/files/51243626 + curl -L -o cells.zarr.zip https://figshare.com/ndownloader/files/51244049 # Note: we run the following to test that the commands works From 218f437676f33376cab63ee44bb805b12af76432 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 17 Dec 2024 12:41:32 +0100 Subject: [PATCH 34/57] add morphology focus folder --- .github/workflows/spatial_ingestion_xenium.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/spatial_ingestion_xenium.yml b/.github/workflows/spatial_ingestion_xenium.yml index b33b7f93..58ab69de 100644 --- a/.github/workflows/spatial_ingestion_xenium.yml +++ b/.github/workflows/spatial_ingestion_xenium.yml @@ -65,8 +65,10 @@ jobs: curl -L -o cell_feature_matrix.h5 https://figshare.com/ndownloader/files/51243599 curl -L -o cells.parquet https://figshare.com/ndownloader/files/51243620 curl -L -o morphology_mip.ome.tif 
https://figshare.com/ndownloader/files/51243623 - curl -L -o morphology_focus.ome.tif https://figshare.com/ndownloader/files/51243626 curl -L -o cells.zarr.zip https://figshare.com/ndownloader/files/51244049 + mkdir morphology_focus + cd morphology_focus + curl -L -o morphology_focus.ome.tif https://figshare.com/ndownloader/files/51243626 # Note: we run the following to test that the commands works From 89a19bfb44f99865e85362906a02d2d8079dd500 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 17 Dec 2024 12:41:49 +0100 Subject: [PATCH 35/57] adjust spatial preprocessing to spatialData --- .github/workflows/spatial_preprocess-ci.yml | 60 +++++++++++++++++---- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/.github/workflows/spatial_preprocess-ci.yml b/.github/workflows/spatial_preprocess-ci.yml index f9a6123f..fee80bce 100644 --- a/.github/workflows/spatial_preprocess-ci.yml +++ b/.github/workflows/spatial_preprocess-ci.yml @@ -53,19 +53,62 @@ jobs: shell: pwsh run: conda list - - name: Preparing the data run: | - mkdir spatial spatial/preprocess spatial/preprocess/data - cd spatial/preprocess/data - - curl -L -o V1_Human_Heart_unfilt.h5mu https://figshare.com/ndownloader/files/45031048 - curl -L -o V1_Human_Lymph_Node_unfilt.h5mu https://figshare.com/ndownloader/files/45031051 + mkdir spatial spatial/ingestion spatial/ingestion/data + cd spatial/ingestion/data + mkdir V1_Human_Heart V1_Human_Lymph_Node + cd V1_Human_Heart + curl -O https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Heart/V1_Human_Heart_filtered_feature_bc_matrix.h5 + curl -O https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Heart/V1_Human_Heart_spatial.tar.gz + tar -xf V1_Human_Heart_spatial.tar.gz + cd ../V1_Human_Lymph_Node + curl -O https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Lymph_Node/V1_Human_Lymph_Node_filtered_feature_bc_matrix.h5 + curl -O https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Lymph_Node/V1_Human_Lymph_Node_spatial.tar.gz + tar -xf V1_Human_Lymph_Node_spatial.tar.gz # Note: we run the following to test that the commands works - name: Preparing the configuration file shell: bash -el {0} run: | + cd spatial/ingestion + panpipes qc_spatial config + + - name: Preparing the submission file + run: | + cd spatial/ingestion + curl -o sample_file_qc_visium.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_visium_data/sample_file_qc_visium.txt + - name: Preparing the yaml file + run: | + cd spatial/ingestion + curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_visium_data/pipeline.yml + + - name: File tree + if: env.debug == 'true' + run: tree spatial/ingestion + + - name: Review pipeline tasks + shell: bash -el {0} + run: | + cd spatial/ingestion + panpipes qc_spatial show full --local + + - name: Run pipeline tasks + shell: bash -el {0} + run: | + cd spatial/ingestion + panpipes qc_spatial make full --local + + - name: File tree + if: env.debug == 'true' + run: tree spatial/ingestion + + + # Note: we run the following to test that the commands works + - name: Preparing the configuration file + shell: bash -el {0} + run: | + mkdir spatial/preprocess cd spatial/preprocess panpipes preprocess_spatial config @@ -74,11 +117,6 @@ jobs: cd spatial/preprocess curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/main/docs/preprocess_spatial_data/pipeline.yml - - name: Replace template contents in 
configuration file - run: | - cd spatial/preprocess - sed -i 's+../ingestion/qc.data/+./data/+g' pipeline.yml - - name: File tree if: env.debug == 'true' run: tree spatial/preprocess From 5fa72b0493d81cf933a33a072873c84da8653a7f Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 17 Dec 2024 13:02:53 +0100 Subject: [PATCH 36/57] adjust file name --- .github/workflows/spatial_ingestion_xenium.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/spatial_ingestion_xenium.yml b/.github/workflows/spatial_ingestion_xenium.yml index 58ab69de..87382353 100644 --- a/.github/workflows/spatial_ingestion_xenium.yml +++ b/.github/workflows/spatial_ingestion_xenium.yml @@ -68,7 +68,7 @@ jobs: curl -L -o cells.zarr.zip https://figshare.com/ndownloader/files/51244049 mkdir morphology_focus cd morphology_focus - curl -L -o morphology_focus.ome.tif https://figshare.com/ndownloader/files/51243626 + curl -L -o morphology_focus_0000.ome.tif https://figshare.com/ndownloader/files/51243626 # Note: we run the following to test that the commands works From 7e04d41dd39ce95c49bae24f85e3956dc0b9763c Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 17 Dec 2024 13:11:44 +0100 Subject: [PATCH 37/57] xenium adjust figshare links --- .github/workflows/spatial_ingestion_xenium.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/spatial_ingestion_xenium.yml b/.github/workflows/spatial_ingestion_xenium.yml index 87382353..366beb3d 100644 --- a/.github/workflows/spatial_ingestion_xenium.yml +++ b/.github/workflows/spatial_ingestion_xenium.yml @@ -58,17 +58,17 @@ jobs: run: | mkdir spatial spatial/ingestion_xenium spatial/ingestion_xenium/data cd spatial/ingestion_xenium/data - curl -L -o experiment.xenium https://figshare.com/ndownloader/files/51243614 - curl -L -o nucleus_boundaries.parquet https://figshare.com/ndownloader/files/51243605 - curl -L -o cell_boundaries.parquet https://figshare.com/ndownloader/files/51243596 - curl -L -o transcripts.parquet https://figshare.com/ndownloader/files/51243608 - curl -L -o cell_feature_matrix.h5 https://figshare.com/ndownloader/files/51243599 - curl -L -o cells.parquet https://figshare.com/ndownloader/files/51243620 - curl -L -o morphology_mip.ome.tif https://figshare.com/ndownloader/files/51243623 - curl -L -o cells.zarr.zip https://figshare.com/ndownloader/files/51244049 + curl -L -o experiment.xenium https://figshare.com/ndownloader/files/51244265 + curl -L -o nucleus_boundaries.parquet https://figshare.com/ndownloader/files/51244286 + curl -L -o cell_boundaries.parquet https://figshare.com/ndownloader/files/51244244 + curl -L -o transcripts.parquet https://figshare.com/ndownloader/files/51244283 + curl -L -o cell_feature_matrix.h5 https://figshare.com/ndownloader/files/51244247 + curl -L -o cells.parquet https://figshare.com/ndownloader/files/51244259 + curl -L -o morphology_mip.ome.tif https://figshare.com/ndownloader/files/51244415 + curl -L -o cells.zarr.zip https://figshare.com/ndownloader/files/51244262 mkdir morphology_focus cd morphology_focus - curl -L -o morphology_focus_0000.ome.tif https://figshare.com/ndownloader/files/51243626 + curl -L -o morphology_focus_0000.ome.tif https://figshare.com/ndownloader/files/51244277 # Note: we run the following to test that the commands works From 1e002b5532b82fe4b27170888369cfc9f6b82d8e Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 14 Jan 2025 10:54:26 +0100 Subject: [PATCH 38/57] update from mudata to spatialdata --- 
.../pipeline_deconvolution_spatial.py | 6 +- panpipes/python_scripts/run_cell2location.py | 95 ++++++++++--------- panpipes/python_scripts/run_tangram.py | 49 +++++----- 3 files changed, 78 insertions(+), 72 deletions(-) diff --git a/panpipes/panpipes/pipeline_deconvolution_spatial.py b/panpipes/panpipes/pipeline_deconvolution_spatial.py index 41e60dde..e7026dc4 100644 --- a/panpipes/panpipes/pipeline_deconvolution_spatial.py +++ b/panpipes/panpipes/pipeline_deconvolution_spatial.py @@ -30,12 +30,12 @@ def get_logger(): def gen_filter_jobs(): - input_paths_spatial=glob.glob(os.path.join(PARAMS["input_spatial"],"*.h5mu")) + input_paths_spatial=glob.glob(os.path.join(PARAMS["input_spatial"],"*.zarr")) input_singlecell = PARAMS["input_singlecell"] for input_spatial in input_paths_spatial: sample_prefix = os.path.basename(input_spatial) - sample_prefix = sample_prefix.replace(".h5mu","") - outfile_spatial = "cell2location.output/" + sample_prefix + "/Cell2Loc_spatial_output.h5mu" + sample_prefix = sample_prefix.replace(".zarr","") + outfile_spatial = "cell2location.output/" + sample_prefix + "/Cell2Loc_spatial_output.zarr" yield input_spatial, outfile_spatial, sample_prefix, input_singlecell diff --git a/panpipes/python_scripts/run_cell2location.py b/panpipes/python_scripts/run_cell2location.py index 3bd74d6c..f262b491 100644 --- a/panpipes/python_scripts/run_cell2location.py +++ b/panpipes/python_scripts/run_cell2location.py @@ -7,7 +7,7 @@ import cell2location as c2l import scanpy as sc import pandas as pd -import muon as mu +import spatialdata as sd import os import argparse @@ -20,6 +20,7 @@ from panpipes.funcs.scmethods import cell2loc_filter_genes + L = logging.getLogger() L.setLevel(logging.INFO) log_handler = logging.StreamHandler(sys.stdout) @@ -197,13 +198,15 @@ #1. 
read in the data #spatial: -L.info("Reading in spatial MuData from '%s'" % args.input_spatial) -mdata_spatial = mu.read(args.input_spatial) -adata_st = mdata_spatial.mod['spatial'] +L.info("Reading in spatial SpatialData from '%s'" % args.input_spatial) +sdata_st = sd.read_zarr(args.input_spatial) +#mdata_spatial = mu.read(args.input_spatial) +#adata_st = mdata_spatial.mod['spatial'] #single-cell: -L.info("Reading in reference MuData from '%s'" % args.input_singlecell) -mdata_singlecell = mu.read(args.input_singlecell) -adata_sc = mdata_singlecell.mod['rna'] +L.info("Reading in reference SpatialData from '%s'" % args.input_singlecell) +sdata_sc = sd.read_zarr(args.input_singlecell) +#mdata_singlecell = mu.read(args.input_singlecell) +#adata_sc = mdata_singlecell.mod['rna'] @@ -217,12 +220,12 @@ reduced_gene_set = pd.read_csv(args.gene_list, header = 0) reduced_gene_set.columns = ["HVGs"] L.info("Subsetting data on gene list") - adata_sc.var["selected_gene"] = adata_sc.var.index.isin(reduced_gene_set["HVGs"]) - adata_st.var["selected_gene"] = adata_st.var.index.isin(reduced_gene_set["HVGs"]) - adata_sc = adata_sc[:, adata_sc.var["selected_gene"]] - adata_st = adata_st[:, adata_st.var["selected_gene"]] + sdata_sc["table"].var["selected_gene"] = sdata_sc["table"].var.index.isin(reduced_gene_set["HVGs"]) + sdata_st["table"].var["selected_gene"] = sdata_st["table"].var.index.isin(reduced_gene_set["HVGs"]) + sdata_sc["table"] = sdata_sc["table"][:, sdata_sc["table"].var["selected_gene"]] + sdata_st["table"] = sdata_st["table"][:, sdata_st["table"].var["selected_gene"]] # check whether all genes are present in both, spatial & reference - if set(adata_st.var.index) != set(adata_sc.var.index): + if set(sdata_st["table"].var.index) != set(sdata_sc["table"].var.index): L.error( "Not all genes of the gene list %s are present in the reference as well as in the ST data. 
Please provide a gene list where all genes are present in both, reference and ST.", args.gene_list) sys.exit( @@ -231,34 +234,34 @@ else: # perform feature selection according to cell2loc if remove_mt is True: L.info("Removing MT genes") - adata_st.var["MT_gene"] = [gene.startswith("MT-") for gene in adata_st.var.index] - adata_st.obsm["MT"] = adata_st[:, adata_st.var["MT_gene"].values].X.toarray() - adata_st = adata_st[:, ~adata_st.var["MT_gene"].values] + sdata_st["table"].var["MT_gene"] = [gene.startswith("MT-") for gene in sdata_st["table"].var.index] + sdata_st["table"].obsm["MT"] = sdata_st["table"][:, sdata_st["table"].var["MT_gene"].values].X.toarray() + sdata_st["table"] = sdata_st["table"][:, ~sdata_st["table"].var["MT_gene"].values] # intersect vars of reference and spatial L.info("Intersecting vars of reference and spatial ") - shared_features = [feature for feature in adata_st.var_names if feature in adata_sc.var_names] - adata_sc = adata_sc[:, shared_features] - adata_st = adata_st[:, shared_features] + shared_features = [feature for feature in sdata_st["table"].var_names if feature in sdata_sc["table"].var_names] + sdata_sc["table"] = sdata_sc["table"][:, shared_features] + sdata_st["table"] = sdata_st["table"][:, shared_features] # select features L.info("Selecting features using 'cell2location.utils.filtering.filter_genes() function'") - selected = cell2loc_filter_genes(adata_sc, figdir + "/gene_filter.png", cell_count_cutoff=float(args.cell_count_cutoff), + selected = cell2loc_filter_genes(sdata_sc["table"], figdir + "/gene_filter.png", cell_count_cutoff=float(args.cell_count_cutoff), cell_percentage_cutoff2=float(args.cell_percentage_cutoff2), nonz_mean_cutoff=float(args.nonz_mean_cutoff)) L.info("Subsetting data on selected features") - adata_sc = adata_sc[:, selected] - adata_st = adata_st[:, selected] + sdata_sc["table"] = sdata_sc["table"][:, selected] + sdata_st["table"] = sdata_st["table"][:, selected] # 3. 
Fit regression model L.info("Setting up AnnData for the reference model") -c2l.models.RegressionModel.setup_anndata(adata=adata_sc, +c2l.models.RegressionModel.setup_anndata(adata=sdata_sc["table"], labels_key = args.labels_key_reference, layer= args.layer_reference, batch_key= args.batch_key_reference, categorical_covariate_keys = categorical_covariate_keys_reference, continuous_covariate_keys = continuous_covariate_keys_reference) -model_ref = c2l.models.RegressionModel(adata_sc) +model_ref = c2l.models.RegressionModel(sdata_sc["table"]) L.info("Training the reference model") model_ref.train(max_epochs=max_epochs_reference, use_gpu = use_gpu_reference) @@ -268,23 +271,23 @@ # export results L.info("Extracting the posterior of the reference model") -adata_sc = model_ref.export_posterior(adata_sc) -if "means_per_cluster_mu_fg" in adata_sc.varm.keys(): - inf_aver = adata_sc.varm["means_per_cluster_mu_fg"][[f"means_per_cluster_mu_fg_{i}" for i in adata_sc.uns["mod"]["factor_names"]]].copy() +sdata_sc["table"] = model_ref.export_posterior(sdata_sc["table"]) +if "means_per_cluster_mu_fg" in sdata_sc["table"].varm.keys(): + inf_aver = sdata_sc["table"].varm["means_per_cluster_mu_fg"][[f"means_per_cluster_mu_fg_{i}" for i in sdata_sc["table"].uns["mod"]["factor_names"]]].copy() else: - inf_aver = adata_sc.var[[f"means_per_cluster_mu_fg_{i}" for i in adata_sc.uns["mod"]["factor_names"]]].copy() -inf_aver.columns = adata_sc.uns["mod"]["factor_names"] + inf_aver = sdata_sc["table"].var[[f"means_per_cluster_mu_fg_{i}" for i in sdata_sc["table"].uns["mod"]["factor_names"]]].copy() +inf_aver.columns = sdata_sc["table"].uns["mod"]["factor_names"] inf_aver.to_csv(output_dir+"/Cell2Loc_inf_aver.csv") # plot QC L.info("Plotting QC plots") cell2loc_plot_QC_reference(model_ref, figdir + "/QC_reference_reconstruction_accuracy.png", figdir + "/QC_reference_expression signatures_vs_avg_expression.png") -# save model and update mudata -if adata_sc.var.index.names[0] in adata_sc.var.columns: - adata_sc.var.index.names = [None] -mdata_singlecell.mod["rna"] = adata_sc -mdata_singlecell.update() +# save model +if sdata_sc["table"].var.index.names[0] in sdata_sc["table"].var.columns: + sdata_sc["table"].var.index.names = [None] +#mdata_singlecell.mod["rna"] = adata_sc +#mdata_singlecell.update() if save_models is True: L.info("Saving reference model to '%s'" % output_dir) model_ref.save(output_dir +"/Reference_model", overwrite=True) @@ -293,7 +296,7 @@ # 4. 
Fit mapping model L.info("Setting up AnnData for the spatial model") -c2l.models.Cell2location.setup_anndata(adata=adata_st, +c2l.models.Cell2location.setup_anndata(adata=sdata_st["table"], labels_key = args.labels_key_st, layer= args.layer_st, batch_key= args.batch_key_st, @@ -301,7 +304,7 @@ continuous_covariate_keys = continuous_covariate_keys_st) -model_spatial = c2l.models.Cell2location(adata = adata_st, cell_state_df=inf_aver, +model_spatial = c2l.models.Cell2location(adata = sdata_st["table"], cell_state_df=inf_aver, N_cells_per_location=float(args.N_cells_per_location), detection_alpha=float(args.detection_alpha)) L.info("Training the spatial model") @@ -312,7 +315,7 @@ cell2loc_plot_history(model_spatial, figdir + "/ELBO_spatial_model.png") #extract posterior L.info("Extracting the posterior of the spatial model") -adata_st = model_spatial.export_posterior(adata_st) +sdata_st["table"] = model_spatial.export_posterior(sdata_st["table"]) #plot QC L.info("Plotting QC plots") cell2loc_plot_QC_reconstr(model_spatial, figdir + "/QC_spatial_reconstruction_accuracy.png") @@ -320,24 +323,24 @@ #plot output L.info("Plotting spatial embedding plot coloured by 'q05_cell_abundance_w_sf'") -adata_st.obs[adata_st.uns["mod"]["factor_names"]] = adata_st.obsm["q05_cell_abundance_w_sf"] -sc.pl.spatial(adata_st,color=adata_st.uns["mod"]["factor_names"], show = False, save = "_Cell2Loc_q05_cell_abundance_w_sf.png") +sdata_st["table"].obs[sdata_st["table"].uns["mod"]["factor_names"]] = sdata_st["table"].obsm["q05_cell_abundance_w_sf"] +sc.pl.spatial(sdata_st["table"],color=sdata_st["table"].uns["mod"]["factor_names"], show = False, save = "_Cell2Loc_q05_cell_abundance_w_sf.png") -# save model and update mudata -if adata_st.var.index.names[0] in adata_st.var.columns: - adata_st.var.index.names = [None] -mdata_spatial.mod["spatial"] = adata_st -mdata_spatial.update() +# save model +if sdata_st["table"].var.index.names[0] in sdata_st["table"].var.columns: + sdata_st["table"].var.index.names = [None] +#mdata_spatial.mod["spatial"] = adata_st +#mdata_spatial.update() if save_models is True: L.info("Saving spatial model to '%s'" % output_dir) model_spatial.save(output_dir+"/Spatial_mapping_model", overwrite=True) #6. save mudatas -L.info("Saving MuDatas to '%s'" % output_dir) -mdata_singlecell.write(output_dir+"/Cell2Loc_screference_output.h5mu") -mdata_spatial.write(output_dir+"/Cell2Loc_spatial_output.h5mu") +L.info("Saving SpatialDatas to '%s'" % output_dir) +sdata_sc.write(output_dir+"/Cell2Loc_screference_output.zarr") +sdata_st.write(output_dir+"/Cell2Loc_spatial_output.zarr") L.info("Done") diff --git a/panpipes/python_scripts/run_tangram.py b/panpipes/python_scripts/run_tangram.py index 6b2cc6a2..6cbe68e7 100644 --- a/panpipes/python_scripts/run_tangram.py +++ b/panpipes/python_scripts/run_tangram.py @@ -9,6 +9,7 @@ import scanpy as sc import tangram as tg import muon as mu +import spatialdata as sd import os import argparse @@ -100,13 +101,15 @@ #1. 
read in the data #spatial: -L.info("Reading in spatial MuData from '%s'" % args.input_spatial) -mdata_spatial = mu.read(args.input_spatial) -adata_st = mdata_spatial.mod['spatial'] +L.info("Reading in spatial SpatialData from '%s'" % args.input_spatial) +sdata_st = sd.read_zarr(args.input_spatial) +#mdata_spatial = mu.read(args.input_spatial) +#adata_st = mdata_spatial.mod['spatial'] #single-cell: -L.info("Reading in reference MuData from '%s'" % args.input_singlecell) -mdata_singlecell = mu.read(args.input_singlecell) -adata_sc = mdata_singlecell.mod['rna'] +L.info("Reading in reference SpatialData from '%s'" % args.input_singlecell) +sdata_sc = sd.read_zarr(args.input_singlecell) +#mdata_singlecell = mu.read(args.input_singlecell) +#adata_sc = mdata_singlecell.mod['rna'] #2. Perform gene selection: @@ -121,43 +124,43 @@ else: # perform feature selection using sc.tl.rank_genes_groups() L.info("Running 'scanpy.tl.rank_genes_groups()'") - sc.tl.rank_genes_groups(adata_sc, groupby=args.labels_key_rank_genes, layer=args.layer_rank_genes, method=args.method_rank_genes,corr_method = args.corr_method_rank_genes) + sc.tl.rank_genes_groups(sdata_sc["table"], groupby=args.labels_key_rank_genes, layer=args.layer_rank_genes, method=args.method_rank_genes,corr_method = args.corr_method_rank_genes) L.info("Plotting rank genes group") - sc.pl.rank_genes_groups(adata_sc, show = False, save = ".png") - markers_df = pd.DataFrame(adata_sc.uns["rank_genes_groups"]["names"]).iloc[0:int(args.n_genes_rank), :] + sc.pl.rank_genes_groups(sdata_sc["table"], show = False, save = ".png") + markers_df = pd.DataFrame(sdata_sc["table"].uns["rank_genes_groups"]["names"]).iloc[0:int(args.n_genes_rank), :] L.info("Saving rank genes to " + output_dir + "/rank_genes_groups.csv") markers_df.to_csv(output_dir + "/rank_genes_groups.csv") markers = list(np.unique(markers_df.melt().value.values)) # "Preprocess" anndatas L.info("Preprocessing AnnDatas") -tg.pp_adatas(adata_sc=adata_sc, adata_sp=adata_st, genes=markers) +tg.pp_adatas(adata_sc=sdata_sc["table"], adata_sp=sdata_st["table"], genes=markers) # 3. Run tangram L.info("Training model") adata_results = tg.mapping_utils.map_cells_to_space( - adata_sc=adata_sc, adata_sp=adata_st, num_epochs=int(args.num_epochs), device=args.device, **args.kwargs + adata_sc=sdata_sc["table"], adata_sp=sdata_st["table"], num_epochs=int(args.num_epochs), device=args.device, **args.kwargs ) # 3. 
Extract and plot results L.info("Extracting annotations") -tg.project_cell_annotations(adata_results, adata_st, annotation=args.labels_key_model) +tg.project_cell_annotations(adata_results, sdata_st["table"], annotation=args.labels_key_model) L.info("Plotting spatial embedding plot coloured by 'tangram_ct_pred'") -annotation_list = list(pd.unique(adata_sc.obs[args.labels_key_model])) -df = adata_st.obsm["tangram_ct_pred"][annotation_list] -tg.construct_obs_plot(df, adata_st, perc=0.05) -if "spatial" in adata_st.uns: - sc.pl.spatial(adata_st, color=annotation_list, cmap="viridis", show=False, frameon=False, ncols=3, save = "_tangram_ct_pred.png") +annotation_list = list(pd.unique(sdata_sc["table"].obs[args.labels_key_model])) +df = sdata_st["table"].obsm["tangram_ct_pred"][annotation_list] +tg.construct_obs_plot(df, sdata_st["table"], perc=0.05) +if "spatial" in sdata_st["table"].uns: + sc.pl.spatial(sdata_st["table"], color=annotation_list, cmap="viridis", show=False, frameon=False, ncols=3, save = "_tangram_ct_pred.png") else: - sc.pl.spatial(adata_st, color=annotation_list, cmap="viridis", show=False, frameon=False, ncols=3, save = "_tangram_ct_pred.png",spot_size=0.5) + sc.pl.spatial(sdata_st["table"], color=annotation_list, cmap="viridis", show=False, frameon=False, ncols=3, save = "_tangram_ct_pred.png",spot_size=0.5) -mdata_singlecell_results = mu.MuData({"rna": adata_sc}) -mdata_spatial_results = mu.MuData({"spatial": adata_st}) +#mdata_singlecell_results = mu.MuData({"rna": adata_sc}) +#mdata_spatial_results = mu.MuData({"spatial": adata_st}) -L.info("Saving MuDatas to '%s'" % output_dir) -mdata_singlecell_results.write(output_dir+"/Tangram_screference_output.h5mu") -mdata_spatial_results.write(output_dir+"/Tangram_spatial_output.h5mu") +L.info("Saving SpatialDatas to '%s'" % output_dir) +sdata_sc.write(output_dir+"/Tangram_screference_output.zarr") +sdata_st.write(output_dir+"/Tangram_spatial_output.zarr") L.info("Done") From 35d5ecb47c5e56c07090b48c50232c967e47598d Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 14 Jan 2025 11:00:21 +0100 Subject: [PATCH 39/57] correct data type of reference --- panpipes/python_scripts/run_cell2location.py | 46 ++++++++++---------- panpipes/python_scripts/run_tangram.py | 23 +++++----- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/panpipes/python_scripts/run_cell2location.py b/panpipes/python_scripts/run_cell2location.py index f262b491..893ca233 100644 --- a/panpipes/python_scripts/run_cell2location.py +++ b/panpipes/python_scripts/run_cell2location.py @@ -8,6 +8,7 @@ import scanpy as sc import pandas as pd import spatialdata as sd +import muon as mu import os import argparse @@ -203,10 +204,9 @@ #mdata_spatial = mu.read(args.input_spatial) #adata_st = mdata_spatial.mod['spatial'] #single-cell: -L.info("Reading in reference SpatialData from '%s'" % args.input_singlecell) -sdata_sc = sd.read_zarr(args.input_singlecell) -#mdata_singlecell = mu.read(args.input_singlecell) -#adata_sc = mdata_singlecell.mod['rna'] +L.info("Reading in reference MuData from '%s'" % args.input_singlecell) +mdata_singlecell = mu.read(args.input_singlecell) +adata_sc = mdata_singlecell.mod['rna'] @@ -220,12 +220,12 @@ reduced_gene_set = pd.read_csv(args.gene_list, header = 0) reduced_gene_set.columns = ["HVGs"] L.info("Subsetting data on gene list") - sdata_sc["table"].var["selected_gene"] = sdata_sc["table"].var.index.isin(reduced_gene_set["HVGs"]) + adata_sc.var["selected_gene"] = adata_sc.var.index.isin(reduced_gene_set["HVGs"]) 
sdata_st["table"].var["selected_gene"] = sdata_st["table"].var.index.isin(reduced_gene_set["HVGs"]) - sdata_sc["table"] = sdata_sc["table"][:, sdata_sc["table"].var["selected_gene"]] + adata_sc = adata_sc[:, adata_sc.var["selected_gene"]] sdata_st["table"] = sdata_st["table"][:, sdata_st["table"].var["selected_gene"]] # check whether all genes are present in both, spatial & reference - if set(sdata_st["table"].var.index) != set(sdata_sc["table"].var.index): + if set(sdata_st["table"].var.index) != set(adata_sc.var.index): L.error( "Not all genes of the gene list %s are present in the reference as well as in the ST data. Please provide a gene list where all genes are present in both, reference and ST.", args.gene_list) sys.exit( @@ -239,29 +239,29 @@ sdata_st["table"] = sdata_st["table"][:, ~sdata_st["table"].var["MT_gene"].values] # intersect vars of reference and spatial L.info("Intersecting vars of reference and spatial ") - shared_features = [feature for feature in sdata_st["table"].var_names if feature in sdata_sc["table"].var_names] - sdata_sc["table"] = sdata_sc["table"][:, shared_features] + shared_features = [feature for feature in sdata_st["table"].var_names if feature in adata_sc.var_names] + adata_sc = adata_sc[:, shared_features] sdata_st["table"] = sdata_st["table"][:, shared_features] # select features L.info("Selecting features using 'cell2location.utils.filtering.filter_genes() function'") - selected = cell2loc_filter_genes(sdata_sc["table"], figdir + "/gene_filter.png", cell_count_cutoff=float(args.cell_count_cutoff), + selected = cell2loc_filter_genes(adata_sc, figdir + "/gene_filter.png", cell_count_cutoff=float(args.cell_count_cutoff), cell_percentage_cutoff2=float(args.cell_percentage_cutoff2), nonz_mean_cutoff=float(args.nonz_mean_cutoff)) L.info("Subsetting data on selected features") - sdata_sc["table"] = sdata_sc["table"][:, selected] + adata_sc = adata_sc[:, selected] sdata_st["table"] = sdata_st["table"][:, selected] # 3. 
Fit regression model L.info("Setting up AnnData for the reference model") -c2l.models.RegressionModel.setup_anndata(adata=sdata_sc["table"], +c2l.models.RegressionModel.setup_anndata(adata=adata_sc, labels_key = args.labels_key_reference, layer= args.layer_reference, batch_key= args.batch_key_reference, categorical_covariate_keys = categorical_covariate_keys_reference, continuous_covariate_keys = continuous_covariate_keys_reference) -model_ref = c2l.models.RegressionModel(sdata_sc["table"]) +model_ref = c2l.models.RegressionModel(adata_sc) L.info("Training the reference model") model_ref.train(max_epochs=max_epochs_reference, use_gpu = use_gpu_reference) @@ -271,12 +271,12 @@ # export results L.info("Extracting the posterior of the reference model") -sdata_sc["table"] = model_ref.export_posterior(sdata_sc["table"]) -if "means_per_cluster_mu_fg" in sdata_sc["table"].varm.keys(): - inf_aver = sdata_sc["table"].varm["means_per_cluster_mu_fg"][[f"means_per_cluster_mu_fg_{i}" for i in sdata_sc["table"].uns["mod"]["factor_names"]]].copy() +adata_sc = model_ref.export_posterior(adata_sc) +if "means_per_cluster_mu_fg" in adata_sc.varm.keys(): + inf_aver = adata_sc.varm["means_per_cluster_mu_fg"][[f"means_per_cluster_mu_fg_{i}" for i in adata_sc.uns["mod"]["factor_names"]]].copy() else: - inf_aver = sdata_sc["table"].var[[f"means_per_cluster_mu_fg_{i}" for i in sdata_sc["table"].uns["mod"]["factor_names"]]].copy() -inf_aver.columns = sdata_sc["table"].uns["mod"]["factor_names"] + inf_aver = adata_sc.var[[f"means_per_cluster_mu_fg_{i}" for i in adata_sc.uns["mod"]["factor_names"]]].copy() +inf_aver.columns = adata_sc.uns["mod"]["factor_names"] inf_aver.to_csv(output_dir+"/Cell2Loc_inf_aver.csv") # plot QC @@ -284,10 +284,10 @@ cell2loc_plot_QC_reference(model_ref, figdir + "/QC_reference_reconstruction_accuracy.png", figdir + "/QC_reference_expression signatures_vs_avg_expression.png") # save model -if sdata_sc["table"].var.index.names[0] in sdata_sc["table"].var.columns: - sdata_sc["table"].var.index.names = [None] -#mdata_singlecell.mod["rna"] = adata_sc -#mdata_singlecell.update() +if adata_sc.var.index.names[0] in adata_sc.var.columns: + adata_sc.var.index.names = [None] +mdata_singlecell.mod["rna"] = adata_sc +mdata_singlecell.update() if save_models is True: L.info("Saving reference model to '%s'" % output_dir) model_ref.save(output_dir +"/Reference_model", overwrite=True) @@ -339,7 +339,7 @@ #6. save mudatas L.info("Saving SpatialDatas to '%s'" % output_dir) -sdata_sc.write(output_dir+"/Cell2Loc_screference_output.zarr") +mdata_singlecell.write(output_dir+"/Cell2Loc_screference_output.h5mu") sdata_st.write(output_dir+"/Cell2Loc_spatial_output.zarr") diff --git a/panpipes/python_scripts/run_tangram.py b/panpipes/python_scripts/run_tangram.py index 6cbe68e7..28eaeb0d 100644 --- a/panpipes/python_scripts/run_tangram.py +++ b/panpipes/python_scripts/run_tangram.py @@ -107,9 +107,8 @@ #adata_st = mdata_spatial.mod['spatial'] #single-cell: L.info("Reading in reference SpatialData from '%s'" % args.input_singlecell) -sdata_sc = sd.read_zarr(args.input_singlecell) -#mdata_singlecell = mu.read(args.input_singlecell) -#adata_sc = mdata_singlecell.mod['rna'] +mdata_singlecell = mu.read(args.input_singlecell) +adata_sc = mdata_singlecell.mod['rna'] #2. 
Perform gene selection: @@ -124,22 +123,22 @@ else: # perform feature selection using sc.tl.rank_genes_groups() L.info("Running 'scanpy.tl.rank_genes_groups()'") - sc.tl.rank_genes_groups(sdata_sc["table"], groupby=args.labels_key_rank_genes, layer=args.layer_rank_genes, method=args.method_rank_genes,corr_method = args.corr_method_rank_genes) + sc.tl.rank_genes_groups(adata_sc, groupby=args.labels_key_rank_genes, layer=args.layer_rank_genes, method=args.method_rank_genes,corr_method = args.corr_method_rank_genes) L.info("Plotting rank genes group") - sc.pl.rank_genes_groups(sdata_sc["table"], show = False, save = ".png") - markers_df = pd.DataFrame(sdata_sc["table"].uns["rank_genes_groups"]["names"]).iloc[0:int(args.n_genes_rank), :] + sc.pl.rank_genes_groups(adata_sc, show = False, save = ".png") + markers_df = pd.DataFrame(adata_sc.uns["rank_genes_groups"]["names"]).iloc[0:int(args.n_genes_rank), :] L.info("Saving rank genes to " + output_dir + "/rank_genes_groups.csv") markers_df.to_csv(output_dir + "/rank_genes_groups.csv") markers = list(np.unique(markers_df.melt().value.values)) # "Preprocess" anndatas L.info("Preprocessing AnnDatas") -tg.pp_adatas(adata_sc=sdata_sc["table"], adata_sp=sdata_st["table"], genes=markers) +tg.pp_adatas(adata_sc=adata_sc, adata_sp=sdata_st["table"], genes=markers) # 3. Run tangram L.info("Training model") adata_results = tg.mapping_utils.map_cells_to_space( - adata_sc=sdata_sc["table"], adata_sp=sdata_st["table"], num_epochs=int(args.num_epochs), device=args.device, **args.kwargs + adata_sc=adata_sc, adata_sp=sdata_st["table"], num_epochs=int(args.num_epochs), device=args.device, **args.kwargs ) # 3. Extract and plot results @@ -147,7 +146,7 @@ tg.project_cell_annotations(adata_results, sdata_st["table"], annotation=args.labels_key_model) L.info("Plotting spatial embedding plot coloured by 'tangram_ct_pred'") -annotation_list = list(pd.unique(sdata_sc["table"].obs[args.labels_key_model])) +annotation_list = list(pd.unique(adata_sc.obs[args.labels_key_model])) df = sdata_st["table"].obsm["tangram_ct_pred"][annotation_list] tg.construct_obs_plot(df, sdata_st["table"], perc=0.05) if "spatial" in sdata_st["table"].uns: @@ -156,11 +155,11 @@ sc.pl.spatial(sdata_st["table"], color=annotation_list, cmap="viridis", show=False, frameon=False, ncols=3, save = "_tangram_ct_pred.png",spot_size=0.5) -#mdata_singlecell_results = mu.MuData({"rna": adata_sc}) +mdata_singlecell_results = mu.MuData({"rna": adata_sc}) #mdata_spatial_results = mu.MuData({"spatial": adata_st}) -L.info("Saving SpatialDatas to '%s'" % output_dir) -sdata_sc.write(output_dir+"/Tangram_screference_output.zarr") +L.info("Saving SpatialData and MuData to '%s'" % output_dir) +mdata_singlecell_results.write(output_dir+"/Tangram_screference_output.h5mu") sdata_st.write(output_dir+"/Tangram_spatial_output.zarr") L.info("Done") From 358da951ccc97fd98dabd09d223643c18ad25b81 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 14 Jan 2025 11:51:14 +0100 Subject: [PATCH 40/57] adjust to spatialData --- .github/workflows/spatial_deconvolution-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/spatial_deconvolution-ci.yml b/.github/workflows/spatial_deconvolution-ci.yml index b22e40da..ac964e32 100644 --- a/.github/workflows/spatial_deconvolution-ci.yml +++ b/.github/workflows/spatial_deconvolution-ci.yml @@ -60,7 +60,9 @@ jobs: cd deconvolution/data curl -L -o Human_Heart_reference.h5mu https://figshare.com/ndownloader/files/44969677 cd spatial_data - curl -L -o 
Human_Heart.h5mu https://figshare.com/ndownloader/files/44969488 + curl -L -o Human_Heart.zarr.zip https://figshare.com/ndownloader/files/51667673 + unzip Human_Heart.zarr.zip + rm Human_Heart.zarr.zip # Note: we run the following to test that the commands works From 857d043416fb4f83a0d9af2412edfee3d4b9af80 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Mon, 20 Jan 2025 14:41:58 +0100 Subject: [PATCH 41/57] add tangram github action --- ...patial_deconvolution_cell2location-ci.yml} | 4 +- .../spatial_deconvolution_tangram-ci.yml | 98 +++++++++++++++++++ 2 files changed, 100 insertions(+), 2 deletions(-) rename .github/workflows/{spatial_deconvolution-ci.yml => spatial_deconvolution_cell2location-ci.yml} (96%) create mode 100644 .github/workflows/spatial_deconvolution_tangram-ci.yml diff --git a/.github/workflows/spatial_deconvolution-ci.yml b/.github/workflows/spatial_deconvolution_cell2location-ci.yml similarity index 96% rename from .github/workflows/spatial_deconvolution-ci.yml rename to .github/workflows/spatial_deconvolution_cell2location-ci.yml index ac964e32..b43215bb 100644 --- a/.github/workflows/spatial_deconvolution-ci.yml +++ b/.github/workflows/spatial_deconvolution_cell2location-ci.yml @@ -1,4 +1,4 @@ -name: Run tutorials (spatial deconvolution) +name: Run tutorials (spatial deconvolution cell2location) on: push: @@ -12,7 +12,7 @@ env: debug: 'true' jobs: - spatial_deconvolution: + spatial_deconvolution_cell2location: runs-on: ubuntu-latest strategy: fail-fast: false diff --git a/.github/workflows/spatial_deconvolution_tangram-ci.yml b/.github/workflows/spatial_deconvolution_tangram-ci.yml new file mode 100644 index 00000000..c06d8b75 --- /dev/null +++ b/.github/workflows/spatial_deconvolution_tangram-ci.yml @@ -0,0 +1,98 @@ +name: Run tutorials (spatial deconvolution tangram) + +on: + push: + branches: + - main + pull_request: + branches: + - main + +env: + debug: 'true' + +jobs: + spatial_deconvolution_tangram: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest"] # , "macos-latest", "windows-latest" + python-version: ["3.10"] + + steps: + - uses: actions/checkout@v4 + + - name: File tree + if: env.debug == 'true' + run: tree + + - uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-version: latest + auto-activate-base: true + auto-update-conda: true + channels: conda-forge + channel-priority: strict + activate-environment: pipeline_env + environment-file: pipeline_env.yaml + + - name: Install Panpipes + shell: bash -el {0} + run: | + pip install .[spatial] + conda list + + - name: Conda info + if: env.debug == 'true' + shell: bash -el {0} + run: conda info + + - name: Conda list + if: env.debug == 'true' + shell: pwsh + run: conda list + + + - name: Preparing the data + run: | + mkdir deconvolution_tangram deconvolution_tangram/data deconvolution_tangram/data/spatial_data + cd deconvolution_tangram/data + curl -L -o Human_Heart_reference.h5mu https://figshare.com/ndownloader/files/44969677 + cd spatial_data + curl -L -o Human_Heart.zarr.zip https://figshare.com/ndownloader/files/51667673 + unzip Human_Heart.zarr.zip + rm Human_Heart.zarr.zip + + + # Note: we run the following to test that the commands works + - name: Preparing the configuration file + shell: bash -el {0} + run: | + cd deconvolution_tangram + panpipes deconvolution_spatial config + + - name: Edit the submission file + run: | + cd deconvolution_tangram + curl -o pipeline.yml 
https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/deconvolution_tangram/pipeline.yml + + - name: File tree + if: env.debug == 'true' + run: tree deconvolution_tangram + + - name: Review pipeline tasks + shell: bash -el {0} + run: | + cd deconvolution_tangram + panpipes deconvolution_spatial show full --local + + - name: Run pipeline tasks + shell: bash -el {0} + run: | + cd deconvolution_tangram + panpipes deconvolution_spatial make full --local + + - name: File tree + if: env.debug == 'true' + run: tree deconvolution_tangram From 573fcc280a050adb058bb93d04f873a475521491 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 28 Jan 2025 10:50:24 +0100 Subject: [PATCH 42/57] pin spatialdata version --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b888fba6..ec9b794b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,8 +69,8 @@ spatial = [ "squidpy", "cell2location", "tangram-sc", - "spatialdata", - "spatialdata-io" + "spatialdata==0.2.6", + "spatialdata-io==0.1.6" ] refmap_old = [ From e4564c57ac491c7313437837514704fa9a5c975f Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 28 Jan 2025 10:58:16 +0100 Subject: [PATCH 43/57] pin dask --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ec9b794b..02285df5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,7 +70,8 @@ spatial = [ "cell2location", "tangram-sc", "spatialdata==0.2.6", - "spatialdata-io==0.1.6" + "spatialdata-io==0.1.6", + "dask==2024.12.1" ] refmap_old = [ From a72fbcb63194ab607c14cf5ce552f24beb88a50a Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 28 Jan 2025 11:47:47 +0100 Subject: [PATCH 44/57] remove outfile_spatial --- panpipes/panpipes/pipeline_deconvolution_spatial.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/panpipes/panpipes/pipeline_deconvolution_spatial.py b/panpipes/panpipes/pipeline_deconvolution_spatial.py index e7026dc4..eb970434 100644 --- a/panpipes/panpipes/pipeline_deconvolution_spatial.py +++ b/panpipes/panpipes/pipeline_deconvolution_spatial.py @@ -35,8 +35,7 @@ def gen_filter_jobs(): for input_spatial in input_paths_spatial: sample_prefix = os.path.basename(input_spatial) sample_prefix = sample_prefix.replace(".zarr","") - outfile_spatial = "cell2location.output/" + sample_prefix + "/Cell2Loc_spatial_output.zarr" - yield input_spatial, outfile_spatial, sample_prefix, input_singlecell + yield input_spatial, sample_prefix, input_singlecell @mkdir("logs") @@ -45,7 +44,7 @@ def gen_filter_jobs(): @mkdir("figures/Cell2Location") @mkdir("cell2location.output") @files(gen_filter_jobs) -def run_cell2location(input_spatial, outfile_spatial, sample_prefix, input_singlecell): +def run_cell2location(input_spatial, sample_prefix, input_singlecell): figdir = "./figures/Cell2Location/" + sample_prefix output_dir = "./cell2location.output/" + sample_prefix @@ -103,6 +102,8 @@ def run_cell2location(input_spatial, outfile_spatial, sample_prefix, input_singl if PARAMS['Cell2Location_save_models'] is not None: cmd += " --save_models %(Cell2Location_save_models)s" + if PARAMS['Cell2Location_export_gene_by_spot'] is not None: + cmd += " --export_gene_by_spot %(Cell2Location_export_gene_by_spot)s" cmd += " > logs/%(log_file)s " job_kwargs["job_threads"] = PARAMS['resources_threads_low'] @@ -116,7 +117,7 @@ def run_cell2location(input_spatial, outfile_spatial, sample_prefix, input_singl 
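With `outfile_spatial` removed from the generator, each deconvolution job is now described only by the `.zarr` slide path, the sample prefix derived from it, and the shared single-cell reference. A minimal sketch of that derivation, using placeholder paths rather than the pipeline's configured input directory:

```python
import os

# placeholder inputs; the workflow resolves these from its own configuration
input_paths_spatial = ["data/spatial_data/Human_Heart.zarr"]
input_singlecell = "data/Human_Heart_reference.h5mu"

for input_spatial in input_paths_spatial:
    # the sample prefix is the .zarr basename with its suffix stripped
    sample_prefix = os.path.basename(input_spatial).replace(".zarr", "")
    print(input_spatial, sample_prefix, input_singlecell)
```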
@mkdir("figures/Tangram") @mkdir("tangram.output") @files(gen_filter_jobs) -def run_tangram(input_spatial, outfile_spatial, sample_prefix, input_singlecell): +def run_tangram(input_spatial, sample_prefix, input_singlecell): figdir = "./figures/Tangram/" + sample_prefix output_dir = "./tangram.output/" + sample_prefix From 86fd9d49a2bad0e63a855c90ecc73bded2076d99 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 28 Jan 2025 11:48:30 +0100 Subject: [PATCH 45/57] add option to export gene by spot matrix --- .../pipeline_deconvolution_spatial/pipeline.yml | 1 + panpipes/python_scripts/run_cell2location.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/panpipes/panpipes/pipeline_deconvolution_spatial/pipeline.yml b/panpipes/panpipes/pipeline_deconvolution_spatial/pipeline.yml index 66d524f0..b9d6d430 100644 --- a/panpipes/panpipes/pipeline_deconvolution_spatial/pipeline.yml +++ b/panpipes/panpipes/pipeline_deconvolution_spatial/pipeline.yml @@ -89,6 +89,7 @@ Cell2Location: # ------------------------------- save_models: False # Default False; whether to save the reference and spatial mapping models + export_gene_by_spot: False # Default False; whether to save a gene by spot matrix for each cell type in a layer diff --git a/panpipes/python_scripts/run_cell2location.py b/panpipes/python_scripts/run_cell2location.py index 893ca233..7cb427d8 100644 --- a/panpipes/python_scripts/run_cell2location.py +++ b/panpipes/python_scripts/run_cell2location.py @@ -49,6 +49,9 @@ parser.add_argument("--save_models", default=False, help="whether to save the reference & spatial mapping models") +parser.add_argument("--export_gene_by_spot", + default=False, + help="whether to save a gene by spot matrix for each cell type in a layer") # parameters for feature selection: @@ -148,6 +151,11 @@ save_models = False else: save_models = True + +if (args.export_gene_by_spot is False) or (args.export_gene_by_spot == "False"): + export_gene_by_spot = False +else: + export_gene_by_spot = True if (args.remove_mt is True) or (args.remove_mt == "True"): remove_mt = True @@ -320,6 +328,14 @@ L.info("Plotting QC plots") cell2loc_plot_QC_reconstr(model_spatial, figdir + "/QC_spatial_reconstruction_accuracy.png") +# export a gene by spot matrix for each cell type +if export_gene_by_spot: + # Compute expected expression per cell type + expected_dict = model_spatial.module.model.compute_expected_per_cell_type(model_spatial.samples["post_sample_q05"], model_spatial.adata_manager) + # Add to anndata layers + for i, n in enumerate(model_spatial.factor_names_): + sdata_st["table"].layers[n] = expected_dict['mu'][i] + #plot output L.info("Plotting spatial embedding plot coloured by 'q05_cell_abundance_w_sf'") From 7e8bcf1b9639d7295244d6c68b0f406e7a5e49a4 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 4 Feb 2025 15:08:40 +0100 Subject: [PATCH 46/57] adjust docs to SpatialData --- docs/workflows/clustering_spatial.md | 2 +- docs/workflows/deconvolute_spatial.md | 7 ++++--- docs/workflows/ingest_spatial.md | 10 +++++----- docs/workflows/preprocess_spatial.md | 8 ++++---- docs/yaml_docs/spatial_deconvolution.md | 8 +++++--- docs/yaml_docs/spatial_preprocess.md | 10 +++++----- docs/yaml_docs/spatial_qc.md | 4 ++-- 7 files changed, 26 insertions(+), 23 deletions(-) diff --git a/docs/workflows/clustering_spatial.md b/docs/workflows/clustering_spatial.md index 3c63b62e..99ae8b49 100644 --- a/docs/workflows/clustering_spatial.md +++ b/docs/workflows/clustering_spatial.md @@ -1,6 +1,6 @@ # Clustering spatial data 
The `clustering` workflow accepts both cell suspension datasets and spatial transcriptomics data as input that have been ingested with the `qc_spatial` workflow and optionally filtered with the `spatial_preprocess` workflow. -The workflow expects a **single `MuData` object** with the spatial data saved in `mdata.mod["spatial"]`. +The workflow expects a **single `SpatialData` object**. Set `spatial: True` in the configuration file and customize the spatial modality clustering parameters exactly as you would for a single cell experiment. For more information check the [clustering workflow](./clustering.md) diff --git a/docs/workflows/deconvolute_spatial.md b/docs/workflows/deconvolute_spatial.md index e1233dba..d03f56b9 100644 --- a/docs/workflows/deconvolute_spatial.md +++ b/docs/workflows/deconvolute_spatial.md @@ -1,6 +1,6 @@ # Deconvoluting spatial data -With the `deconvolution_spatial` workflow, one or multiple spatial slides can be deconvoluted in one run. For that, a `MuData` object for each slide is expected, with the spatial data saved in `mdata.mod["spatial"]`. The spatial slides are deconvoluted using the same reference. For the reference, one `MuData` with the gene expression data saved in `mdata.mod["rna"]` is expected as input. +With the `deconvolution_spatial` workflow, one or multiple spatial slides can be deconvoluted in one run. For that, a `SpatialData` object for each slide is expected. The spatial slides are deconvoluted using the same reference. For the reference, one `MuData` with the gene expression data saved in `mdata.mod["rna"]` is expected as input. The workflow provides the possibility to run deconvolution using `Cell2Location` and `Tangram`. @@ -19,8 +19,9 @@ For the reference and each spatial slide the following steps are run. **Note, th - Regression/reference model is fitted and a plot of the training history as well as QC plots are saved in the `./figures/Cell2Location` directory. Additionally, a csv-file `Cell2Loc_inf_anver.csv` with the estimated expression of every gene in every cell type is saved in `./cell2location.output`. - (Optional) Reference model is saved in `./cell2location.output` - Spatial mapping model is fitted. Training history and QC plots are saved in the `./figures/Cell2Location` directory. Plot of the spatial embedding coloured by `q05_cell_abundance_w_sf` is also saved in `./figures/Cell2Location`. +- (Optional) A gene by spot matrix for each cell type is saved to a layer in the table of the `SpatialData` object - (Optional) Spatial mapping model is saved in `./cell2location.output` -- `MuData` objects of the spatial slide and the reference are saved in `./cell2location.output`. The `MuData` object of the spatial slide contains the estimated cell type abundances. +- The `SpatialData` object of the spatial slide and the `MuData` object of the reference are saved in `./cell2location.output`. The `SpatialData` object of the spatial slide contains the estimated cell type abundances. ### Tangram @@ -34,7 +35,7 @@ For the reference and each spatial slide the following steps are run. 
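Under the new layout each slide is read from a `SpatialData` zarr store while the reference stays a `MuData` file; a minimal loading sketch with placeholder file names:

```python
import muon as mu
import spatialdata as sd

sdata_st = sd.read_zarr("spatial_data/Human_Heart.zarr")  # one spatial slide
mdata_sc = mu.read("Human_Heart_reference.h5mu")          # shared single-cell reference

adata_st = sdata_st["table"]  # spot-by-gene AnnData used for deconvolution
adata_sc = mdata_sc["rna"]    # gene expression modality of the reference
print(adata_st.shape, adata_sc.shape)
```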
**Note, th - Data is preprocessed with [tangram.pp_adatas](https://tangram-sc.readthedocs.io/en/latest/classes/tangram.mapping_utils.pp_adatas.html) - Tangram model is fitted with [tangram.mapping_utils.map_cells_to_space](https://tangram-sc.readthedocs.io/en/latest/classes/tangram.mapping_utils.map_cells_to_space.html) and annotations are transfered from single-cell data onto space with [tangram.project_cell_annotations](https://tangram-sc.readthedocs.io/en/latest/classes/tangram.utils.project_cell_annotations.html) - Plot of the spatial embedding coloured by `tangram_ct_pred` is saved in `./figures/Tangram` -- `MuData` objects of the spatial slide and the reference are saved in `./tangram.output`. The `MuData` object of the spatial slide contains the deconvolution predictions. +- The `SpatialData` object of the spatial slide and the `MuData` object of the reference are saved in `./tangram.output`. The `SpatialData` object of the spatial slide contains the deconvolution predictions. diff --git a/docs/workflows/ingest_spatial.md b/docs/workflows/ingest_spatial.md index a9992d6d..f6ded09a 100644 --- a/docs/workflows/ingest_spatial.md +++ b/docs/workflows/ingest_spatial.md @@ -1,19 +1,19 @@ # Ingesting spatial data -Similar to the cell suspension workflow, `spatial_qc` ingests `Vizgen` and/or `Visium` data and saves the data into `MuData` objects. -A primary difference to the cell suspension `ingestion` workflow is that we are not concatenating the input data into a single matrix, but keeping the samples as separate `MuData` objects, each with a `spatial` layer. This ensures that the processing does not introduce any technical batch effect when tissue slides are very different in cell composition. In a future release, we will use [SpatialData](https://spatialdata.scverse.org/en/latest/tutorials/notebooks/notebooks.html) as a data format and framework to process multi-slides experiments. +The `spatial_qc` workflow ingests `Vizgen`, `Visium`, or `Xenium` data and saves the data into `SpatialData` objects. +A primary difference to the cell suspension `ingestion` workflow is that we are not concatenating the input data into a single matrix, but keeping the samples as separate `SpatialData` objects. This ensures that the processing does not introduce any technical batch effect when tissue slides are very different in cell composition. ## Steps -- Data is ingested into `MuData` objects with the modality `spatial`. The workflow generates one MuData per dataset. - - Raw `MuData` objects are saved into `./tmp` +- Data is ingested into `SpatialData` objects. The workflow generates one `SpatialData` per dataset. + - `SpatialData` objects of the raw data are saved into `./tmp` as `zarr` files - QC metrics are computed using `scanpy` functionalities: - Basic QC metrics are computed using `sc.pp.calculate_qc_metrics` - (Optional) Compute cell-cycle scores using `sc.tl.score_genes_cell_cycle`. For that, the [default gene list](../../panpipes/resources/cell_cycle_genes.tsv) can be used or a path to a tsv file can be specified. - (Optional) Custom genes actions. [Default gene list](../../panpipes/resources/qc_genelist_1.0.csv) can be used or a path to a csv file can be specified. - Calculate proportions of gene groups, e.g. 
mitochondrial genes - Score genes using `sc.tl.score_genes` - - `MuData` objects with calculated QC metrics are saved in `qc.data` + - `SpatialData` objects with calculated QC metrics are saved in `qc.data` - Metadata (`.obs`) is saved into the current directory as tsv files - Specified QC metrics are plotted in violin and spatial embedding plots - For `Vizgen` data, additional histograms are plotted diff --git a/docs/workflows/preprocess_spatial.md b/docs/workflows/preprocess_spatial.md index 925bd534..3bfae011 100644 --- a/docs/workflows/preprocess_spatial.md +++ b/docs/workflows/preprocess_spatial.md @@ -1,17 +1,17 @@ # Preprocessing spatial data -The `preprocess_spatial` workflow filters the data and preprocesses the data by normalization, HVG selection, and PCA computation. Multiple `MuData` objects of the same assay (`Visium` or `Vizgen`), each with a `spatial` modality, can be filtered and preprocessed in one run. +The `preprocess_spatial` workflow filters the data and preprocesses the data by normalization, HVG selection, and PCA computation. Multiple `SpatialData` objects of the same assay (`Visium`, `Vizgen`, or `Xenium`) can be filtered and preprocessed in one run. ## Steps -If multiple `MuData` objects are provided, the following steps are run for each **with the same parameter setting.** +If multiple `SpatialData` objects are provided, the following steps are run for each **with the same parameter setting.** -- `MuData` object is filtered by the specified thresholds in the pipeline.yml. Note, that the filtering step is **optional**. You can avoid filtering by setting the `run` parameter in the pipeline.yml under `filtering` to `False`. +- `SpatialData` object is filtered by the specified thresholds in the pipeline.yml. Note, that the filtering step is **optional**. You can avoid filtering by setting the `run` parameter in the pipeline.yml under `filtering` to `False`. - Post-filter plotting is performed (only when data was filtered, i.e. `run: True`). Specified metrics in the pipeline.yml are plotted in violin and spatial embedding plots. Plots are saved into the `./figures/spatial` directory. - Data is normalized and HVGs are selected. Before normalization, raw counts are saved into `.layers["raw_counts"]`, if not present already. Normalized counts are saved into `.X` and `.layers["lognorm"]` or `.layers["norm_pearson_resid"]`, depending on the chosen normalization. HVGs are saved into `.var["highly_variable"]`. - PCA is computed and plotted. PCA plots are also saved into the `./figures/spatial` directory. -- Final `MuData` object is saved into the `./filtered.data` directory +- Final `SpatialData` object is saved into the `./filtered.data` directory as a `zarr` file ## Steps to run diff --git a/docs/yaml_docs/spatial_deconvolution.md b/docs/yaml_docs/spatial_deconvolution.md index 29a4e93a..a1debae4 100644 --- a/docs/yaml_docs/spatial_deconvolution.md +++ b/docs/yaml_docs/spatial_deconvolution.md @@ -33,7 +33,7 @@ Specified by the following three parameters: - threads_medium `Integer`, Default: 1
Number of threads used for medium intensity computing tasks. - For each thread, there must be enough memory to load your mudata and do computationally light tasks. + For each thread, there must be enough memory to load your SpatialData and do computationally light tasks. - threads_low `Integer`, Default: 1
Number of threads used for low intensity computing tasks. @@ -46,12 +46,12 @@ Specified by the following three parameters: ## 1. Input Options -With the `deconvolution_spatial` workflow, one or multiple spatial slides can be deconvoluted in one run. For that, a `MuData` object for each slide is expected, with the spatial data saved in `mdata.mod["spatial"]`. The spatial slides are deconvoluted **using the same reference**. For the reference, one `MuData` with the gene expression data saved in `mdata.mod["rna"]` is expected as input. Please note, that the same parameter setting is used for each slide.
For the **spatial** input, the workflow, therefore, reads in **all `.h5mu` objects of a directory** (see below). **The spatial and single-cell data thus need to be saved in different folders.** +With the `deconvolution_spatial` workflow, one or multiple spatial slides can be deconvoluted in one run. For that, a `SpatialData` object for each slide is expected. The spatial slides are deconvoluted **using the same reference**. For the reference, one `MuData` with the gene expression data saved in `mdata.mod["rna"]` is expected as input. Please note, that the same parameter setting is used for each slide.
For the **spatial** input, the workflow, therefore, reads in **all `.zarr` objects of a directory** (see below).
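Because every `.zarr` store found in the spatial input folder is treated as one slide to deconvolute against the single reference, input discovery amounts to a directory scan, roughly as sketched below with placeholder paths:

```python
import glob
import os

spatial_dir = "data/spatial_data"        # placeholder folder holding the slides
singlecell_file = "data/reference.h5mu"  # placeholder reference MuData

slides = sorted(glob.glob(os.path.join(spatial_dir, "*.zarr")))
for slide in slides:
    print("deconvolute", os.path.basename(slide), "against", singlecell_file)
```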
 input
- spatial `String`, Mandatory parameter
- Path to folder containing one or multiple `MuDatas` of spatial data. The pipeline is reading in all `MuData` files in that folder and assuming that they are `MuDatas` of spatial slides. + Path to folder containing one or multiple `SpatialDatas` of spatial data. The pipeline is reading in all `SpatialData` files in that folder. - singlecell `String`, Mandatory parameter
Path to the MuData **file** (not folder) of the reference single-cell data. @@ -151,6 +151,8 @@ You can specify whether both models (spatial and reference) should be saved with save_models, Default: False
Whether to save the reference & spatial mapping models. +export_gene_by_spot, Default: False
+ Whether to save a gene by spot matrix for each cell type in a layer. ## 3. Tangram Options diff --git a/docs/yaml_docs/spatial_preprocess.md b/docs/yaml_docs/spatial_preprocess.md index 28270c99..2d0db62c 100644 --- a/docs/yaml_docs/spatial_preprocess.md +++ b/docs/yaml_docs/spatial_preprocess.md @@ -35,7 +35,7 @@ Specified by the following three parameters: - threads_medium `Integer`, Default: 1
Number of threads used for medium intensity computing tasks. - For each thread, there must be enough memory to load your mudata and do computationally light tasks. + For each thread, there must be enough memory to load your SpatialData and do computationally light tasks. - threads_low `Integer`, Default: 1
Number of threads used for low intensity computing tasks. @@ -48,14 +48,14 @@ Specified by the following three parameters: ## 1. Input Options -With the preprocess_spatial workflow, one or multiple `MuData` objects can be preprocessed in one run. The workflow **reads in all `.h5mu` objects of a directory**. The `MuData` objects in the directory need to be of the same assay (vizgen or visium). The workflow then runs the preprocessing of each `MuData` object separately with the same parameters that are specified in the yaml file. +With the preprocess_spatial workflow, one or multiple `SpatialData` objects can be preprocessed in one run. The workflow **reads in all `.zarr` objects of a directory**. The `SpatialData` objects in the directory need to be of the same assay (Vizgen, Visium, or Xenium). The workflow then runs the preprocessing of each `SpatialData` object separately with the same parameters that are specified in the yaml file.
input_dir `String`, Mandatory parameter
- Path to the folder containing all input `h5mu` files. + Path to the folder containing all input `zarr` files. assay [`'visium'`, `'vizgen'`], Default: `'visium'`
- Spatial transcriptomics assay of the `h5mu` files in `input_dir`. + Spatial transcriptomics assay of the `zarr` files in `input_dir`. @@ -70,7 +70,7 @@ With the preprocess_spatial workflow, one or multiple `MuData` objects can be pr
-With the parameters below you can specify thresholds for filtering. The filtering is fully customisable to any columns in `.obs` or `.var`. You are not restricted by the columns given as default. When specifying a column name, please make sure it exactly matches the column name in the h5mu object.
Please slso make sure, that the specified metrics are present in all `h5mu` objects of the `input_dir`, i.e. the `MuData` objects for that the preprocessing is run. +With the parameters below you can specify thresholds for filtering. The filtering is fully customisable to any columns in `.obs` or `.var`. You are not restricted by the columns given as default. When specifying a column name, please make sure it exactly matches the column name in the table of the `SpatialData` object.
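Since the thresholds map directly onto `.obs` and `.var` columns of the table, a filter can be pictured as a pair of boolean masks; a sketch with placeholder column names and cut-offs (any QC column present in the table works the same way):

```python
import spatialdata as sd

sdata = sd.read_zarr("qc.data/Human_Heart_unfilt.zarr")  # placeholder store
table = sdata["table"]

# hypothetical thresholds on standard scanpy QC columns
keep_obs = (table.obs["total_counts"] > 500) & (table.obs["pct_counts_mt"] < 20)
keep_var = table.var["n_cells_by_counts"] > 3

filtered = table[keep_obs, :][:, keep_var].copy()
print(f"kept {filtered.n_obs}/{table.n_obs} spots and {filtered.n_vars}/{table.n_vars} genes")
```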
Please also make sure, that the specified metrics are present in all `SpatialData` objects of the `input_dir`, i.e. the `SpatialData` objects for that the preprocessing is run. --- diff --git a/docs/yaml_docs/spatial_qc.md b/docs/yaml_docs/spatial_qc.md index 5a742ab0..baeb2850 100644 --- a/docs/yaml_docs/spatial_qc.md +++ b/docs/yaml_docs/spatial_qc.md @@ -32,11 +32,11 @@ Computing resources to use, specifically the number of threads used for parallel Specified by the following three parameters: - threads_high `Integer`, Default: 1
Number of threads used for high intensity computing tasks. - For each thread, there must be enough memory to load all your input files at once and create the MuData object. + For each thread, there must be enough memory to load all your input files at once and create the SpatialData object. - threads_medium `Integer`, Default: 1
Number of threads used for medium intensity computing tasks. - For each thread, there must be enough memory to load your mudata and do computationally light tasks. + For each thread, there must be enough memory to load your SpatialData and do computationally light tasks. - threads_low `Integer`, Default: 1
Number of threads used for low intensity computing tasks. From 463ba6f1f13e3633b270ec0df5fbde828d4659a5 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 4 Feb 2025 15:13:49 +0100 Subject: [PATCH 47/57] change to SpatialData --- panpipes/python_scripts/collate_mdata.py | 113 ++++++++++++------ panpipes/python_scripts/plot_cluster_umaps.py | 26 ++-- .../python_scripts/plot_scanpy_markers.py | 28 +++-- .../rerun_find_neighbors_for_clustering.py | 90 +++++++++----- panpipes/python_scripts/run_clustering.py | 21 ++-- .../python_scripts/run_find_markers_multi.py | 29 +++-- panpipes/python_scripts/run_umap.py | 21 ++-- 7 files changed, 216 insertions(+), 112 deletions(-) diff --git a/panpipes/python_scripts/collate_mdata.py b/panpipes/python_scripts/collate_mdata.py index cf500bcc..3134accc 100644 --- a/panpipes/python_scripts/collate_mdata.py +++ b/panpipes/python_scripts/collate_mdata.py @@ -34,8 +34,19 @@ L.info("Running with params: %s", args) -L.info("Reading in MuData from '%s'" % args.input_mudata) -mdata = mu.read(args.input_mudata) +#L.info("Reading in MuData from '%s'" % args.input_mudata) +#mdata = mu.read(args.input_mudata) +L.info("Reading in data from '%s'" % args.input_mudata) +if ".zarr" in args.input_mudata: + import spatialdata as sd + L.info("Reading in SpatialData from '%s'" % args.input_mudata) + mdata = sd.read_zarr(args.input_mudata) +else: + L.info("Reading in MuData from '%s'" % args.input_mudata) + mdata = mu.read(args.input_mudata) + + + L.info("Reading in cluster information") cf = pd.read_csv(args.clusters_files_csv) @@ -55,46 +66,78 @@ # add in the clusters +if isinstance(mdata, MuData): + L.info("Adding cluster information to MuData") + for i in range(cf.shape[0]): + cf_df = pd.read_csv(cf['fpath'][i], sep='\t', index_col=0) + cf_df['clusters'] = cf_df['clusters'].astype('str').astype('category') + cf_df = cf_df.rename(columns={"clusters":cf['new_key'][i]}) + + if cf['mod'][i] != "multimodal": + mdata[cf['mod'][i]].obs = mdata[cf['mod'][i]].obs.merge(cf_df, left_index=True, right_index=True) + else: + mdata.obs = mdata.obs.merge(cf_df, left_index=True, right_index=True) +elif isinstance(mdata, sd.SpatialData): + L.info("Adding cluster information to SpatialData") + for i in range(cf.shape[0]): + cf_df = pd.read_csv(cf['fpath'][i], sep='\t', index_col=0) + cf_df['clusters'] = cf_df['clusters'].astype('str').astype('category') + cf_df = cf_df.rename(columns={"clusters":cf['new_key'][i]}) + mdata["table"].obs = mdata["table"].obs.merge(cf_df, left_index=True, right_index=True) -L.info("Adding cluster information to MuData") -for i in range(cf.shape[0]): - cf_df = pd.read_csv(cf['fpath'][i], sep='\t', index_col=0) - cf_df['clusters'] = cf_df['clusters'].astype('str').astype('category') - cf_df = cf_df.rename(columns={"clusters":cf['new_key'][i]}) - - if cf['mod'][i] != "multimodal": - mdata[cf['mod'][i]].obs = mdata[cf['mod'][i]].obs.merge(cf_df, left_index=True, right_index=True) - else: - mdata.obs = mdata.obs.merge(cf_df, left_index=True, right_index=True) L.info("Adding UMAP coordinates to MuData") uf = pd.read_csv(args.umap_files_csv) -for i in range(uf.shape[0]): - uf_df = pd.read_csv(uf['fpath'][i], sep='\t', index_col=0) - mod = uf['mod'][i] - new_key = uf['new_key'][i] - if uf['mod'][i] != "multimodal": - if all(mdata[mod].obs_names == uf_df.index): - mdata[mod].obsm[new_key] = uf_df.to_numpy() +if isinstance(mdata, MuData): + for i in range(uf.shape[0]): + uf_df = pd.read_csv(uf['fpath'][i], sep='\t', index_col=0) + mod = uf['mod'][i] + new_key = 
uf['new_key'][i] + if uf['mod'][i] != "multimodal": + if all(mdata[mod].obs_names == uf_df.index): + mdata[mod].obsm[new_key] = uf_df.to_numpy() + else: + L.warn("Cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] ) else: - L.warn("Cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] ) - else: - # check the observations are the same - if set(mdata.obs_names).difference(uf_df.index) == set(): - # put the observations in the same order - uf_df = uf_df.loc[mdata.obs_names,:] - mdata.obsm[new_key] = uf_df.to_numpy() + # check the observations are the same + if set(mdata.obs_names).difference(uf_df.index) == set(): + # put the observations in the same order + uf_df = uf_df.loc[mdata.obs_names,:] + mdata.obsm[new_key] = uf_df.to_numpy() + else: + L.warning("Cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] ) +elif isinstance(mdata, sd.SpatialData): + for i in range(uf.shape[0]): + uf_df = pd.read_csv(uf['fpath'][i], sep='\t', index_col=0) + mod = uf['mod'][i] + new_key = uf['new_key'][i] + if uf['mod'][i] != "multimodal": + if all(mdata["table"].obs_names == uf_df.index): + mdata["table"].obsm[new_key] = uf_df.to_numpy() + else: + L.warn("Cannot integrate %s into adata as obs_names mismatch" % uf.iloc[i,:] ) else: - L.warning("Cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] ) - - -L.info("Saving updated MuData to '%s'" % args.output_mudata) -mdata.write(args.output_mudata) - -output_csv = re.sub(".h5mu", "_cell_metdata.tsv", args.output_mudata) -L.info("Saving metadata to '%s'" % output_csv) -mdata.obs.to_csv(output_csv, sep='\t') + # check the observations are the same + if set(mdata["table"].obs_names).difference(uf_df.index) == set(): + # put the observations in the same order + uf_df = uf_df.loc[mdata["table"].obs_names,:] + mdata["table"].obsm[new_key] = uf_df.to_numpy() + else: + L.warning("Cannot integrate %s into adata as obs_names mismatch" % uf.iloc[i,:] ) + +if isinstance(mdata, MuData): + L.info("Saving updated MuData to '%s'" % args.output_mudata) + mdata.write(args.output_mudata) + output_csv = re.sub(".h5mu", "_cell_metdata.tsv", args.output_mudata) + L.info("Saving metadata to '%s'" % output_csv) + mdata.obs.to_csv(output_csv, sep='\t') +elif isinstance(mdata, sd.SpatialData): + L.info("Saving updated SpatialData to '%s'" % args.output_mudata) + mdata.write(args.output_mudata) + output_csv = re.sub(".zarr", "_cell_metdata.tsv", args.output_mudata) + L.info("Saving metadata to '%s'" % output_csv) + mdata.obs.to_csv(output_csv, sep='\t') L.info("Done") diff --git a/panpipes/python_scripts/plot_cluster_umaps.py b/panpipes/python_scripts/plot_cluster_umaps.py index 39e73b19..18b804c3 100644 --- a/panpipes/python_scripts/plot_cluster_umaps.py +++ b/panpipes/python_scripts/plot_cluster_umaps.py @@ -90,9 +90,13 @@ def plot_spatial(adata,figdir): fig.savefig(os.path.join(figdir, ok + "_clusters.png")) - -L.info("Reading in MuData from '%s'" % args.infile) -mdata = read(args.infile) +if ".zarr" in args.infile: + import spatialdata as sd + L.info("Reading in SpatialData from '%s'" % args.infile) + data = sd.read_zarr(args.infile) +else: + L.info("Reading in MuData from '%s'" % args.infile) + data = read(args.infile) mods = args.modalities.split(',') # detemin initial figure directory based on object type @@ -102,21 +106,27 @@ def plot_spatial(adata,figdir): if os.path.exists("multimodal/figures") is False: os.makedirs("multimodal/figures") L.info("Plotting multimodal figures") - main(mdata, 
figdir="multimodal/figures") + main(data, figdir="multimodal/figures") # we also need to plot per modality -if type(mdata) is MuData: - for mod in mdata.mod.keys(): +if type(data) is MuData: + for mod in data.mod.keys(): if mod in mods: L.info("Plotting for modality: %s" % mod) figdir = os.path.join(mod, "figures") if os.path.exists(figdir) is False: os.makedirs(figdir) if mod == "spatial": # added separate function for spatial - plot_spatial(mdata[mod], figdir) + plot_spatial(data[mod], figdir) else: - main(mdata[mod], figdir) + main(data[mod], figdir) +elif isinstance(data, sd.SpatialData): + L.info("Plotting for modality: spatial") + figdir = os.path.join("spatial", "figures") + if os.path.exists(figdir) is False: + os.makedirs(figdir) + plot_spatial(data["table"], figdir) diff --git a/panpipes/python_scripts/plot_scanpy_markers.py b/panpipes/python_scripts/plot_scanpy_markers.py index 09fd4455..c7073b6c 100644 --- a/panpipes/python_scripts/plot_scanpy_markers.py +++ b/panpipes/python_scripts/plot_scanpy_markers.py @@ -115,17 +115,23 @@ def do_plots(adata, mod, group_col, mf, n=10, layer=None): # read data -L.info("Reading in MuData from '%s'" % args.infile) -mdata = mu.read(args.infile) - -if type(mdata) is AnnData: - adata = mdata - # main function only does rank_gene_groups on X, so -elif type(mdata) is mu.MuData and args.modality is not None: - adata = mdata[args.modality] -else: - L.error("If the input is a MuData object, a modality needs to be specified") - sys.exit('If the input is a MuData object, a modality needs to be specified') +if args.modality != "spatial": + L.info("Reading in MuData from '%s'" % args.infile) + mdata = mu.read(args.infile) + + if type(mdata) is AnnData: + adata = mdata + # main function only does rank_gene_groups on X, so + elif type(mdata) is mu.MuData and args.modality is not None: + adata = mdata[args.modality] + else: + L.error("If the input is a MuData object, a modality needs to be specified") + sys.exit('If the input is a MuData object, a modality needs to be specified') +else: + import spatialdata as sd + L.info("Reading in SpatialData from '%s'" % args.infile) + adata = sd.read_zarr(args.infile)["table"] + L.info("Loading marker information from '%s'" % args.marker_file) mf = pd.read_csv(args.marker_file, sep='\t' ) diff --git a/panpipes/python_scripts/rerun_find_neighbors_for_clustering.py b/panpipes/python_scripts/rerun_find_neighbors_for_clustering.py index ad675080..ba1af40a 100644 --- a/panpipes/python_scripts/rerun_find_neighbors_for_clustering.py +++ b/panpipes/python_scripts/rerun_find_neighbors_for_clustering.py @@ -4,6 +4,7 @@ import logging import scanpy as sc from muon import MuData, read + from panpipes.funcs.scmethods import run_neighbors_method_choice from panpipes.funcs.io import read_yaml from panpipes.funcs.scmethods import lsi @@ -37,53 +38,80 @@ sc.settings.n_jobs = int(args.n_threads) # read data -L.info("Reading in MuData from '%s'" % args.infile) -mdata = read(args.infile) +if ".zarr" in args.infile: + import spatialdata as sd + L.info("Reading in SpatialData from '%s'" % args.infile) + sdata = sd.read_zarr(args.infile) +else: + L.info("Reading in MuData from '%s'" % args.infile) + mdata = read(args.infile) for mod in neighbor_dict.keys(): - if mod in mdata.mod.keys(): + if mod != "spatial": + if mod in mdata.mod.keys(): + if neighbor_dict[mod]['use_existing']: + L.info('Using existing neighbors graph for %s' % mod) + pass + else: + L.info("Computing new neighbors for modality %s on %s" % (mod, 
neighbor_dict[mod]['dim_red'])) + if type(mdata) is MuData: + adata=mdata[mod] + if (neighbor_dict[mod]['dim_red'] == "X_pca") and ("X_pca" not in adata.obsm.keys()): + L.info("X_pca not found, computing it using default parameters") + sc.tl.pca(adata) + if (mod == "atac") and (neighbor_dict[mod]['dim_remove'] is not None): + dimrem = int(neighbor_dict[mod]['dim_remove']) + adata.obsm['X_pca'] = adata.obsm['X_pca'][:, dimrem:] + adata.varm["PCs"] = adata.varm["PCs"][:, dimrem:] + if mod == "atac": + if (neighbor_dict[mod]['dim_red'] == "X_lsi") and ("X_lsi" not in adata.obsm.keys()): + L.info("X_lsi not found, computing it using default parameters") + lsi(adata=adata, num_components=50) + if neighbor_dict[mod]['dim_remove'] is not None: + L.info("Removing dimension %s from X_lsi" % neighbor_dict[mod]['dim_remove']) + dimrem = int(neighbor_dict[mod]['dim_remove']) + adata.obsm['X_lsi'] = adata.obsm['X_lsi'][:, dimrem:] + adata.varm["LSI"] = adata.varm["LSI"][:, dimrem:] + adata.uns["lsi"]["stdev"] = adata.uns["lsi"]["stdev"][dimrem:] + + # run command + opts = dict(method=neighbor_dict[mod]['method'], + n_neighbors=int(neighbor_dict[mod]['k']), + n_pcs=int(neighbor_dict[mod]['n_dim_red']), + metric=neighbor_dict[mod]['metric'], + nthreads=args.n_threads, + use_rep=neighbor_dict[mod]['dim_red']) + + + run_neighbors_method_choice(adata,**opts) + mdata.mod[mod] = adata + mdata.update() + else: if neighbor_dict[mod]['use_existing']: L.info('Using existing neighbors graph for %s' % mod) pass else: L.info("Computing new neighbors for modality %s on %s" % (mod, neighbor_dict[mod]['dim_red'])) - if type(mdata) is MuData: - adata=mdata[mod] - if (neighbor_dict[mod]['dim_red'] == "X_pca") and ("X_pca" not in adata.obsm.keys()): + if (neighbor_dict[mod]['dim_red'] == "X_pca") and ("X_pca" not in sdata["table"].obsm.keys()): L.info("X_pca not found, computing it using default parameters") - sc.tl.pca(adata) - if (mod == "atac") and (neighbor_dict[mod]['dim_remove'] is not None): - dimrem = int(neighbor_dict[mod]['dim_remove']) - adata.obsm['X_pca'] = adata.obsm['X_pca'][:, dimrem:] - adata.varm["PCs"] = adata.varm["PCs"][:, dimrem:] - if mod == "atac": - if (neighbor_dict[mod]['dim_red'] == "X_lsi") and ("X_lsi" not in adata.obsm.keys()): - L.info("X_lsi not found, computing it using default parameters") - lsi(adata=adata, num_components=50) - if neighbor_dict[mod]['dim_remove'] is not None: - L.info("Removing dimension %s from X_lsi" % neighbor_dict[mod]['dim_remove']) - dimrem = int(neighbor_dict[mod]['dim_remove']) - adata.obsm['X_lsi'] = adata.obsm['X_lsi'][:, dimrem:] - adata.varm["LSI"] = adata.varm["LSI"][:, dimrem:] - adata.uns["lsi"]["stdev"] = adata.uns["lsi"]["stdev"][dimrem:] - - # run command + sc.tl.pca(sdata["table"]) opts = dict(method=neighbor_dict[mod]['method'], n_neighbors=int(neighbor_dict[mod]['k']), n_pcs=int(neighbor_dict[mod]['n_dim_red']), metric=neighbor_dict[mod]['metric'], nthreads=args.n_threads, use_rep=neighbor_dict[mod]['dim_red']) + # run command + run_neighbors_method_choice(sdata["table"],**opts) - run_neighbors_method_choice(adata,**opts) - mdata.mod[mod] = adata - mdata.update() - - +if ".zarr" in args.infile: + L.info("Saving updated SpatialData to '%s'" % args.outfile) + sdata.write(args.outfile) +else: + L.info("Saving updated MuData to '%s'" % args.outfile) + mdata.write(args.outfile) -L.info("Saving updated MuData to '%s'" % args.outfile) -mdata.write(args.outfile) -L.info("Done") \ No newline at end of file +L.info("Done") diff --git 
a/panpipes/python_scripts/run_clustering.py b/panpipes/python_scripts/run_clustering.py index fcd2e5c5..ee183c90 100644 --- a/panpipes/python_scripts/run_clustering.py +++ b/panpipes/python_scripts/run_clustering.py @@ -34,13 +34,20 @@ # read data L.info("Reading in data from '%s'" % args.infile) -mdata = mu.read(args.infile) -if type(mdata) is AnnData: - adata = mdata -elif args.modality is not None: - adata = mdata[args.modality] -else: - adata = mdata +if ".zarr" in args.infile: + import spatialdata as sd + L.info("Reading in SpatialData from '%s'" % args.infile) + sdata = sd.read_zarr(args.infile) + adata = sdata["table"] +else: + mdata = mu.read(args.infile) + if type(mdata) is AnnData: + adata = mdata + elif args.modality is not None: + adata = mdata[args.modality] + else: + adata = mdata + uns_key=args.neighbors_key # check sc.pp.neihgbours has been run diff --git a/panpipes/python_scripts/run_find_markers_multi.py b/panpipes/python_scripts/run_find_markers_multi.py index ba1422d7..0ae1d067 100644 --- a/panpipes/python_scripts/run_find_markers_multi.py +++ b/panpipes/python_scripts/run_find_markers_multi.py @@ -201,19 +201,22 @@ def main(adata, L.info("Running with params: %s", args) # read data -L.info("Reading in MuData from '%s'" % args.infile) -mdata = read(args.infile) - - -if type(mdata) is AnnData: - adata = mdata - # main function only does rank_gene_groups on X, so -elif type(mdata) is MuData and args.modality is not None: - adata = mdata[args.modality] -else: - L.error("If the input is a MuData object, a modality needs to be specified") - sys.exit('If the input is a MuData object, a modality needs to be specified') - +if args.modality != "spatial": + L.info("Reading in MuData from '%s'" % args.infile) + mdata = read(args.infile) + if type(mdata) is AnnData: + adata = mdata + # main function only does rank_gene_groups on X, so + elif type(mdata) is MuData and args.modality is not None: + adata = mdata[args.modality] + else: + L.error("If the input is a MuData object, a modality needs to be specified") + sys.exit('If the input is a MuData object, a modality needs to be specified') +else: + import spatialdata as sd + L.info("Reading in SpatialData from '%s'" % args.infile) + adata = sd.read_zarr(args.infile)["table"] + main(adata, mod=args.modality, diff --git a/panpipes/python_scripts/run_umap.py b/panpipes/python_scripts/run_umap.py index b70c19f3..112e9d6c 100644 --- a/panpipes/python_scripts/run_umap.py +++ b/panpipes/python_scripts/run_umap.py @@ -10,6 +10,7 @@ import muon as mu from anndata import AnnData + import sys import logging L = logging.getLogger() @@ -40,13 +41,19 @@ # read data L.info("Reading in data from '%s'" % args.infile) -mdata = mu.read(args.infile) -if type(mdata) is AnnData: - adata = mdata -elif args.modality is not None: - adata = mdata[args.modality] -else: - adata = mdata +if ".zarr" in args.infile: + import spatialdata as sd + L.info("Reading in SpatialData from '%s'" % args.infile) + sdata = sd.read_zarr(args.infile) + adata = sdata["table"] +else: + mdata = mu.read(args.infile) + if type(mdata) is AnnData: + adata = mdata + elif args.modality is not None: + adata = mdata[args.modality] + else: + adata = mdata # set seed From 5cb183bea023002a14efe4eb324fbca33c3c166b Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Mon, 10 Feb 2025 10:05:23 +0100 Subject: [PATCH 48/57] update to spatialData --- docs/usage/setup_for_spatial_workflows.md | 67 ++++++++++++++++++----- 1 file changed, 53 insertions(+), 14 deletions(-) diff --git 
a/docs/usage/setup_for_spatial_workflows.md b/docs/usage/setup_for_spatial_workflows.md index 248c33b9..2959dd9f 100644 --- a/docs/usage/setup_for_spatial_workflows.md +++ b/docs/usage/setup_for_spatial_workflows.md @@ -1,28 +1,67 @@ Sample submission file for the ingestion of spatial data =========================== -The spatial transcriptomics ingestion workflow requires a sample submission file that specifies the location of the input files. The sample submission file is a tab-separated file with one row per sample. Panpipes currently supports the ingestion of `Visium` and `Vizgen` data. +The spatial transcriptomics ingestion workflow requires a sample submission file that specifies the location of the input files. The sample submission file is a tab-separated file with one row per sample. Panpipes currently supports the ingestion of `Visium`, `Vizgen`, and `Xenium` data. The data of different technologies needs to be ingested separately with different sample submission files. -The 6 columns of the sample submission file are: + +The minimum required (non-optional) columns for each submission file are **sample id**: Unique sample ID. -**spatial_path**: The root directory containing the data files. Please note, that the folder structure of the root directory needs to be structured as expected by the [squidpy.read.visium](https://squidpy.readthedocs.io/en/stable/api/squidpy.read.visium.html) (for `Visium` data) or [squidpy.read.vizgen](https://squidpy.readthedocs.io/en/stable/api/squidpy.read.vizgen.html) (for `Vizgen` data) functions. +**spatial_path**: The root directory containing the data files. Please note, that the folder structure of the root directory needs to be structured as expected by the [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html) (for `Visium` data), [spatialdata_io.merscope](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.merscope.html) (for `Vizgen` data), or [spatialdata_io.xenium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.xenium.html) (for `Xenium` data) functions. + +**spatial_filetype**: Either "vizgen", "visium", or "xenium". + + +## Visium + +The 7 columns of the Visium sample submission file are: + +sample_id | spatial_path | spatial_filetype | visium_feature_bc_matrix | visium_fullres_image_file | visium_tissue_positions_file | visium_scalefactors_file +----------|----------|------------|-----------|----------|-------------|------------- + +The following 4 columns are **optional**: + +**visium_feature_bc_matrix**: Name of the counts file. Corresponds to the `counts_file` parameter of [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html) + +**visium_fullres_image_file**: Path to the full-resolution image. Corresponds to the `fullres_image_file` parameter of [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html) + +**visium_tissue_positions_file**: Path to the tissue positions file. Corresponds to the `tissue_positions_file` parameter of [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html) + +**visium_scalefactors_file**: Path to the scalefactors file. 
Corresponds to the `scalefactors_file` parameter of [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html) + +#### [Example submission file](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_visium_data/sample_file_qc_visium.txt) + + +## Vizgen + +The 6 columns of the Vizgen sample submission file are: + +sample_id | spatial_path | spatial_filetype | vpt_cell_by_gene | vpt_cell_metadata | vpt_cell_boundaries +----------|----------|------------|----------|-------------|------------- + +The following 3 columns are **optional**: + +**vpt_cell_by_gene**: The file name of the output of the vizgen-postprocessing-tool. See [spatialdata_io.merscope](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.merscope.html) + +**vpt_cell_metadata**: The file name of the output of the vizgen-postprocessing-tool. See [spatialdata_io.merscope](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.merscope.html) + +**vpt_cell_boundaries**: The file name of the output of the vizgen-postprocessing-tool. See [spatialdata_io.merscope](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.merscope.html) + + +#### Example submission files [MERFISH](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_merfish_data/sample_file_qc_merfish.txt) [MERSCOPE](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_merscope_data/sample_file_qc_merscope.txt) + +## Xenium + +The 3 columns of the Xenium sample submission file are: + +sample_id | spatial_path | spatial_filetype | +----------|----------|------------ -**spatial_filetype**: Either "vizgen" or "visium". +#### [Example submission file](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_xenium_data/sample_file_qc_xenium.txt) -**spatial_counts**: The count matrix file. Usually `filtered_feature_bc_matrix.h5` or `raw_feature_bc_matrix.h5` for a `Visium` dataset. For `Vizgen` inputs, this file typically ends with `_cell_by_gene.csv.` -**spatial_metadata**: The metadata csv-file for `Vizgen` data. Leave empty for `Visium` data. -**spatial_transformation**: The transformation csv-file for `Vizgen` data. This column is **optional** for `Vizgen` data. Leave empty for `Visium` data. -**Note, that the columns, `sample_id`, `spatial_path`, `spatial_filetype`, and `spatial_counts` are required for both, `Visium` and `Vizgen` data. 
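A small, hypothetical Visium submission table written with pandas illustrates the column layout described above; only the three required columns plus one optional Visium column are filled in, and all paths are placeholders:

```python
import pandas as pd

submission = pd.DataFrame(
    {
        "sample_id": ["V1_Human_Heart", "V1_Human_Lymph_Node"],
        "spatial_path": ["./data/V1_Human_Heart", "./data/V1_Human_Lymph_Node"],
        "spatial_filetype": ["visium", "visium"],
        "visium_feature_bc_matrix": [
            "V1_Human_Heart_filtered_feature_bc_matrix.h5",
            "V1_Human_Lymph_Node_filtered_feature_bc_matrix.h5",
        ],
    }
)
# the workflow expects a tab-separated file
submission.to_csv("sample_file_qc_visium.txt", sep="\t", index=False)
```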
The `spatial_metadata`(required) and `spatial_transformation`(optional) columns are `Vizgen`-specific and should be left empty for `Visium` data.** -### Example submission file -| sample_id | spatial_path | spatial_filetype | spatial_counts | spatial_metadata | spatial_transformation | -| --------- |--------------|------------------|-----------------------------------------|------------------------------------------|--------------------| -| V1_Human_Heart |./data_visium/V1_Human_Heart |visium |V1_Human_Heart_filtered_feature_bc_matrix.h5 | -| V1_Human_Lymph_Node |./data_visium/V1_Human_Lymph_Node| visium | V1_Human_Lymph_Node_filtered_feature_bc_matrix.h5 | -Mouse_Brain | ./data_vizgen | vizgen | cell_by_gene_S1R1.csv | cell_metadata_S1R1.csv | images_micron_to_mosaic_pixel_transform.csv From 96695cfbff37ce6e2cfb168e83bb309a706721d2 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Mon, 10 Feb 2025 10:13:13 +0100 Subject: [PATCH 49/57] check if columns exist --- panpipes/funcs/io.py | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/panpipes/funcs/io.py b/panpipes/funcs/io.py index af41d6ff..0a46f205 100644 --- a/panpipes/funcs/io.py +++ b/panpipes/funcs/io.py @@ -171,16 +171,19 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): visium_tissue_positions_file = None visium_scalefactors_file = None spatial_filetype = caf['spatial_filetype'][nn] - if pd.notna(caf['vpt_cell_by_gene'][nn]): - vpt_cell_by_gene = caf['vpt_cell_by_gene'][nn] + if "vpt_cell_by_gene" in caf[nn].columns: + if pd.notna(caf['vpt_cell_by_gene'][nn]): + vpt_cell_by_gene = caf['vpt_cell_by_gene'][nn] else: vpt_cell_by_gene = None - if pd.notna(caf['vpt_cell_metadata'][nn]): - vpt_cell_metadata = caf['vpt_cell_metadata'][nn] + if "vpt_cell_metadata" in caf[nn].columns: + if pd.notna(caf['vpt_cell_metadata'][nn]): + vpt_cell_metadata = caf['vpt_cell_metadata'][nn] else: vpt_cell_metadata = None - if pd.notna(caf['vpt_cell_boundaries'][nn]): - vpt_cell_boundaries = caf['vpt_cell_boundaries'][nn] + if "vpt_cell_boundaries" in caf[nn].columns: + if pd.notna(caf['vpt_cell_boundaries'][nn]): + vpt_cell_boundaries = caf['vpt_cell_boundaries'][nn] else: vpt_cell_boundaries = None elif caf['spatial_filetype'][nn]=="visium": @@ -189,23 +192,27 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): vpt_cell_boundaries = None spatial_filetype = caf['spatial_filetype'][nn] #counts file - if pd.notna(caf["visium_feature_bc_matrix"][nn]): - visium_feature_bc_matrix= caf["visium_feature_bc_matrix"][nn] + if "visium_feature_bc_matrix" in caf[nn].columns: + if pd.notna(caf["visium_feature_bc_matrix"][nn]): + visium_feature_bc_matrix= caf["visium_feature_bc_matrix"][nn] else: visium_feature_bc_matrix = None # fullres image - if pd.notna(caf["visium_fullres_image_file"][nn]): - visium_fullres_image_file= caf["visium_fullres_image_file"][nn] + if "visium_fullres_image_file" in caf[nn].columns: + if pd.notna(caf["visium_fullres_image_file"][nn]): + visium_fullres_image_file= caf["visium_fullres_image_file"][nn] else: visium_fullres_image_file = None # tissue position - if pd.notna(caf["visium_tissue_positions_file"][nn]): - visium_tissue_positions_file= caf["visium_tissue_positions_file"][nn] + if "visium_tissue_positions_file" in caf[nn].columns: + if pd.notna(caf["visium_tissue_positions_file"][nn]): + visium_tissue_positions_file= caf["visium_tissue_positions_file"][nn] else: visium_tissue_positions_file = None # scalefactor - if 
pd.notna(caf["visium_scalefactors_file"][nn]): - visium_scalefactors_file= caf["visium_scalefactors_file"][nn] + if "visium_scalefactors_file" in caf[nn].columns: + if pd.notna(caf["visium_scalefactors_file"][nn]): + visium_scalefactors_file= caf["visium_scalefactors_file"][nn] else: visium_scalefactors_file = None else: From 4d873f270d76ad6f52c9ed39a172631048aa93c1 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Mon, 10 Feb 2025 10:22:13 +0100 Subject: [PATCH 50/57] remove index --- panpipes/funcs/io.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/panpipes/funcs/io.py b/panpipes/funcs/io.py index 0a46f205..780228c6 100644 --- a/panpipes/funcs/io.py +++ b/panpipes/funcs/io.py @@ -171,17 +171,17 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): visium_tissue_positions_file = None visium_scalefactors_file = None spatial_filetype = caf['spatial_filetype'][nn] - if "vpt_cell_by_gene" in caf[nn].columns: + if "vpt_cell_by_gene" in caf.columns: if pd.notna(caf['vpt_cell_by_gene'][nn]): vpt_cell_by_gene = caf['vpt_cell_by_gene'][nn] else: vpt_cell_by_gene = None - if "vpt_cell_metadata" in caf[nn].columns: + if "vpt_cell_metadata" in caf.columns: if pd.notna(caf['vpt_cell_metadata'][nn]): vpt_cell_metadata = caf['vpt_cell_metadata'][nn] else: vpt_cell_metadata = None - if "vpt_cell_boundaries" in caf[nn].columns: + if "vpt_cell_boundaries" in caf.columns: if pd.notna(caf['vpt_cell_boundaries'][nn]): vpt_cell_boundaries = caf['vpt_cell_boundaries'][nn] else: @@ -192,25 +192,25 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): vpt_cell_boundaries = None spatial_filetype = caf['spatial_filetype'][nn] #counts file - if "visium_feature_bc_matrix" in caf[nn].columns: + if "visium_feature_bc_matrix" in caf.columns: if pd.notna(caf["visium_feature_bc_matrix"][nn]): visium_feature_bc_matrix= caf["visium_feature_bc_matrix"][nn] else: visium_feature_bc_matrix = None # fullres image - if "visium_fullres_image_file" in caf[nn].columns: + if "visium_fullres_image_file" in caf.columns: if pd.notna(caf["visium_fullres_image_file"][nn]): visium_fullres_image_file= caf["visium_fullres_image_file"][nn] else: visium_fullres_image_file = None # tissue position - if "visium_tissue_positions_file" in caf[nn].columns: + if "visium_tissue_positions_file" in caf.columns: if pd.notna(caf["visium_tissue_positions_file"][nn]): visium_tissue_positions_file= caf["visium_tissue_positions_file"][nn] else: visium_tissue_positions_file = None # scalefactor - if "visium_scalefactors_file" in caf[nn].columns: + if "visium_scalefactors_file" in caf.columns: if pd.notna(caf["visium_scalefactors_file"][nn]): visium_scalefactors_file= caf["visium_scalefactors_file"][nn] else: From 39d3c5730ba26474321233b4af08d3f0f63b60c9 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Mon, 10 Feb 2025 10:31:43 +0100 Subject: [PATCH 51/57] fix bug --- panpipes/funcs/io.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/panpipes/funcs/io.py b/panpipes/funcs/io.py index 780228c6..b5d635ab 100644 --- a/panpipes/funcs/io.py +++ b/panpipes/funcs/io.py @@ -171,22 +171,23 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): visium_tissue_positions_file = None visium_scalefactors_file = None spatial_filetype = caf['spatial_filetype'][nn] + vpt_cell_by_gene = None + vpt_cell_metadata = None + vpt_cell_boundaries = None if "vpt_cell_by_gene" in caf.columns: if pd.notna(caf['vpt_cell_by_gene'][nn]): vpt_cell_by_gene = 
caf['vpt_cell_by_gene'][nn] - else: - vpt_cell_by_gene = None if "vpt_cell_metadata" in caf.columns: if pd.notna(caf['vpt_cell_metadata'][nn]): vpt_cell_metadata = caf['vpt_cell_metadata'][nn] - else: - vpt_cell_metadata = None if "vpt_cell_boundaries" in caf.columns: if pd.notna(caf['vpt_cell_boundaries'][nn]): vpt_cell_boundaries = caf['vpt_cell_boundaries'][nn] - else: - vpt_cell_boundaries = None elif caf['spatial_filetype'][nn]=="visium": + visium_feature_bc_matrix = None + visium_fullres_image_file = None + visium_tissue_positions_file = None + visium_scalefactors_file = None vpt_cell_by_gene = None vpt_cell_metadata = None vpt_cell_boundaries = None @@ -195,26 +196,18 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): if "visium_feature_bc_matrix" in caf.columns: if pd.notna(caf["visium_feature_bc_matrix"][nn]): visium_feature_bc_matrix= caf["visium_feature_bc_matrix"][nn] - else: - visium_feature_bc_matrix = None # fullres image if "visium_fullres_image_file" in caf.columns: if pd.notna(caf["visium_fullres_image_file"][nn]): visium_fullres_image_file= caf["visium_fullres_image_file"][nn] - else: - visium_fullres_image_file = None # tissue position if "visium_tissue_positions_file" in caf.columns: if pd.notna(caf["visium_tissue_positions_file"][nn]): visium_tissue_positions_file= caf["visium_tissue_positions_file"][nn] - else: - visium_tissue_positions_file = None # scalefactor if "visium_scalefactors_file" in caf.columns: if pd.notna(caf["visium_scalefactors_file"][nn]): - visium_scalefactors_file= caf["visium_scalefactors_file"][nn] - else: - visium_scalefactors_file = None + visium_scalefactors_file= caf["visium_scalefactors_file"][nn] else: spatial_path= None spatial_filetype = None From 775e3346267d34306cf0f1594942ab4e7b7a3179 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Mon, 10 Feb 2025 10:39:39 +0100 Subject: [PATCH 52/57] use zip files --- .github/workflows/spatial_preprocess-ci.yml | 64 +++++---------------- 1 file changed, 15 insertions(+), 49 deletions(-) diff --git a/.github/workflows/spatial_preprocess-ci.yml b/.github/workflows/spatial_preprocess-ci.yml index fee80bce..ce322ee6 100644 --- a/.github/workflows/spatial_preprocess-ci.yml +++ b/.github/workflows/spatial_preprocess-ci.yml @@ -55,60 +55,21 @@ jobs: - name: Preparing the data run: | - mkdir spatial spatial/ingestion spatial/ingestion/data - cd spatial/ingestion/data - mkdir V1_Human_Heart V1_Human_Lymph_Node - cd V1_Human_Heart - curl -O https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Heart/V1_Human_Heart_filtered_feature_bc_matrix.h5 - curl -O https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Heart/V1_Human_Heart_spatial.tar.gz - tar -xf V1_Human_Heart_spatial.tar.gz - cd ../V1_Human_Lymph_Node - curl -O https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Lymph_Node/V1_Human_Lymph_Node_filtered_feature_bc_matrix.h5 - curl -O https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Lymph_Node/V1_Human_Lymph_Node_spatial.tar.gz - tar -xf V1_Human_Lymph_Node_spatial.tar.gz + mkdir spatial spatial/preprocess spatial/preprocess/data + cd spatial/preprocess/data + + curl -L -o V1_Human_Heart_unfilt.zarr.zip https://figshare.com/ndownloader/files/52236521 + unzip V1_Human_Heart_unfilt.zarr.zip + rm V1_Human_Heart_unfilt.zarr.zip + curl -L -o V1_Human_Lymph_Node_unfilt.zarr.zip https://figshare.com/ndownloader/files/52236575 + unzip V1_Human_Lymph_Node_unfilt.zarr.zip + rm V1_Human_Lymph_Node_unfilt.zarr.zip + # Note: we run the following to test 
that the commands works - name: Preparing the configuration file shell: bash -el {0} run: | - cd spatial/ingestion - panpipes qc_spatial config - - - name: Preparing the submission file - run: | - cd spatial/ingestion - curl -o sample_file_qc_visium.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_visium_data/sample_file_qc_visium.txt - - name: Preparing the yaml file - run: | - cd spatial/ingestion - curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_visium_data/pipeline.yml - - - name: File tree - if: env.debug == 'true' - run: tree spatial/ingestion - - - name: Review pipeline tasks - shell: bash -el {0} - run: | - cd spatial/ingestion - panpipes qc_spatial show full --local - - - name: Run pipeline tasks - shell: bash -el {0} - run: | - cd spatial/ingestion - panpipes qc_spatial make full --local - - - name: File tree - if: env.debug == 'true' - run: tree spatial/ingestion - - - # Note: we run the following to test that the commands works - - name: Preparing the configuration file - shell: bash -el {0} - run: | - mkdir spatial/preprocess cd spatial/preprocess panpipes preprocess_spatial config @@ -117,6 +78,11 @@ jobs: cd spatial/preprocess curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/main/docs/preprocess_spatial_data/pipeline.yml + - name: Replace template contents in configuration file + run: | + cd spatial/preprocess + sed -i 's+../ingestion/qc.data/+./data/+g' pipeline.yml + - name: File tree if: env.debug == 'true' run: tree spatial/preprocess From 66890d4e61f03f189b79f920ca29f992e8a6d6db Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Mon, 10 Feb 2025 10:52:12 +0100 Subject: [PATCH 53/57] change function and parameter names --- .../panpipes/pipeline_preprocess_spatial.py | 20 ++++++++--------- panpipes/panpipes/pipeline_qc_spatial.py | 16 +++++++------- panpipes/python_scripts/plot_qc_spatial.py | 12 +++++----- panpipes/python_scripts/run_filter_spatial.py | 22 +++++++++---------- .../python_scripts/run_preprocess_spatial.py | 20 ++++++++--------- 5 files changed, 45 insertions(+), 45 deletions(-) diff --git a/panpipes/panpipes/pipeline_preprocess_spatial.py b/panpipes/panpipes/pipeline_preprocess_spatial.py index d7a4053a..dbed8c9c 100644 --- a/panpipes/panpipes/pipeline_preprocess_spatial.py +++ b/panpipes/panpipes/pipeline_preprocess_spatial.py @@ -52,7 +52,7 @@ def gen_filter_jobs(): @mkdir("tables") @mkdir("filtered.data") @files(gen_filter_jobs) -def filter_mudata(infile_path,outfile): +def filter_spatialdata(infile_path,outfile): print('processing file = %s' % str(infile_path)) log_file = os.path.basename(outfile) log_file= "1_filtering."+log_file.replace("filtered.zarr","") + ".log" @@ -61,15 +61,15 @@ def filter_mudata(infile_path,outfile): filter_dict = dictionary_stripper(PARAMS['filtering']) cmd = """ python %(py_path)s/run_filter_spatial.py - --input_mudata %(infile_path)s - --output_mudata %(outfile)s + --input_spatialdata %(infile_path)s + --output_spatialdata %(outfile)s --filter_dict "%(filter_dict)s" """ if PARAMS['filtering_keep_barcodes'] is not None: cmd += " --keep_barcodes %(filtering_keep_barcodes)s" cmd += " > logs/%(log_file)s " job_kwargs["job_threads"] = PARAMS['resources_threads_low'] - log_msg = f"TASK: 'filter_mudata'" + f" IN CASE OF ERROR, PLEASE REFER TO : 'logs/{log_file}' FOR MORE INFORMATION." 
+ log_msg = f"TASK: 'filter_spatialdata'" + f" IN CASE OF ERROR, PLEASE REFER TO : 'logs/{log_file}' FOR MORE INFORMATION." get_logger().info(log_msg) P.run(cmd, **job_kwargs) @@ -84,7 +84,7 @@ def run_plotqc_query(pqc_dict): @active_if(run_plotqc_query(PARAMS['plotqc'])) @active_if(PARAMS['filtering_run']) -@transform(filter_mudata, +@transform(filter_spatialdata, regex("./filtered.data/(.*)_filtered.zarr"), r"./logs/2_postfilterplot.\1.log") def postfilterplot_spatial(filt_file,log_file): @@ -93,7 +93,7 @@ def postfilterplot_spatial(filt_file,log_file): spatial_filetype = PARAMS["assay"] cmd = """ python %(py_path)s/plot_qc_spatial.py - --input_mudata %(filt_file)s + --input_spatialdata %(filt_file)s --spatial_filetype %(spatial_filetype)s --figdir ./figures/spatial """ @@ -108,7 +108,7 @@ def postfilterplot_spatial(filt_file,log_file): P.run(cmd, **job_kwargs) -@transform(filter_mudata, +@transform(filter_spatialdata, regex("./filtered.data/(.*)_filtered.zarr"), r"./logs/3_preprocess.\1.log") def spatial_preprocess(filt_file,log_file): @@ -119,8 +119,8 @@ def spatial_preprocess(filt_file,log_file): write_output = os.path.join("./tmp/",os.path.basename(filt_file)) cmd = """ python %(py_path)s/run_preprocess_spatial.py - --input_mudata %(filt_file)s - --output_mudata %(write_output)s + --input_spatialdata %(filt_file)s + --output_spatialdata %(write_output)s --figdir ./figures/spatial """ if PARAMS['spatial_norm_hvg_flavour'] is not None: @@ -154,7 +154,7 @@ def spatial_preprocess(filt_file,log_file): get_logger().info(log_msg) P.run(cmd, **job_kwargs) -@follows(filter_mudata, postfilterplot_spatial, spatial_preprocess) +@follows(filter_spatialdata, postfilterplot_spatial, spatial_preprocess) @originate("cleanup_done.txt") def cleanup(file): # remove any ctmp fails diff --git a/panpipes/panpipes/pipeline_qc_spatial.py b/panpipes/panpipes/pipeline_qc_spatial.py index 3bc90556..6342e8f7 100644 --- a/panpipes/panpipes/pipeline_qc_spatial.py +++ b/panpipes/panpipes/pipeline_qc_spatial.py @@ -56,7 +56,7 @@ def set_up_dirs(log_file): pass # ----------------------------------------------------------------------------------------------- -## Creating h5mu from filtered data files +## Creating spatialData from filtered data files # ----------------------------------------------------------------------------------------------- @@ -73,7 +73,7 @@ def gen_load_spatial_anndata_jobs(): @follows(mkdir("logs")) @follows(mkdir("tmp")) @files(gen_load_spatial_anndata_jobs) -def load_mudatas(spatial_path, outfile, +def load_spatialdatas(spatial_path, outfile, sample_id, spatial_filetype, visium_feature_bc_matrix, visium_fullres_image_file, visium_tissue_positions_file, visium_scalefactors_file, vpt_cell_by_gene, vpt_cell_metadata, vpt_cell_boundaries): @@ -119,19 +119,19 @@ def load_mudatas(spatial_path, outfile, --vpt_cell_metadata %(vpt_cell_metadata)s --vpt_cell_boundaries %(vpt_cell_boundaries)s """ - cmd += " > logs/1_make_mudatas_%(sample_id)s.log" + cmd += " > logs/1_make_spatialdatas_%(sample_id)s.log" job_kwargs["job_threads"] = PARAMS['resources_threads_medium'] - log_msg = f"TASK: 'load_mudatas'" + f" IN CASE OF ERROR, PLEASE REFER TO : 'logs/1_make_mudatas_{sample_id}.log' FOR MORE INFORMATION." + log_msg = f"TASK: 'load_spatialdatas'" + f" IN CASE OF ERROR, PLEASE REFER TO : 'logs/1_make_spatialdatas_{sample_id}.log' FOR MORE INFORMATION." 
get_logger().info(log_msg) P.run(cmd, **job_kwargs) -@follows(load_mudatas) +@follows(load_spatialdatas) @follows(mkdir("qc.data")) @follows(mkdir("./figures")) -@transform(load_mudatas, +@transform(load_spatialdatas, regex("./tmp/(.*)_raw.zarr"), r"./logs/2_spatialQC_\1.log") def spatialQC(infile,log_file): @@ -179,7 +179,7 @@ def run_plotqc_query(pqc_dict): @follows(spatialQC) @follows(mkdir("./figures/spatial")) @active_if(run_plotqc_query(PARAMS['plotqc'])) -@transform(load_mudatas, +@transform(load_spatialdatas, regex("./tmp/(.*)_raw.zarr"), r"./logs/3_qcplot.\1.log") def plotQC_spatial(unfilt_file,log_file): @@ -188,7 +188,7 @@ def plotQC_spatial(unfilt_file,log_file): unfilt_file = unfilt_file.replace("tmp", "qc.data") cmd = """ python %(py_path)s/plot_qc_spatial.py - --input_mudata %(unfilt_file)s + --input_spatialdata %(unfilt_file)s --spatial_filetype %(spatial_filetype)s --figdir ./figures/spatial """ diff --git a/panpipes/python_scripts/plot_qc_spatial.py b/panpipes/python_scripts/plot_qc_spatial.py index 558219c0..205bf74a 100644 --- a/panpipes/python_scripts/plot_qc_spatial.py +++ b/panpipes/python_scripts/plot_qc_spatial.py @@ -28,8 +28,8 @@ parser = argparse.ArgumentParser() -parser.add_argument("--input_mudata", - default="mudata_unfilt.h5mu", +parser.add_argument("--input_spatialdata", + default="spatialdata_unfilt.h5mu", help="") parser.add_argument("--figdir", default="./figures/", @@ -58,12 +58,12 @@ sc.settings.figdir = figdir sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5)) -L.info("Reading in SpatialData from '%s'" % args.input_mudata) -sdata = sd.read_zarr(args.input_mudata) -#mdata = mu.read(args.input_mudata) +L.info("Reading in SpatialData from '%s'" % args.input_spatialdata) +sdata = sd.read_zarr(args.input_spatialdata) +#mdata = mu.read(args.input_spatialdata) #spatial = mdata.mod['spatial'] -input_data = os.path.basename(args.input_mudata) +input_data = os.path.basename(args.input_spatialdata) pattern = r"_filtered.zarr" match = re.search(pattern, input_data) if match is None: diff --git a/panpipes/python_scripts/run_filter_spatial.py b/panpipes/python_scripts/run_filter_spatial.py index 733c8cad..c3a05c91 100644 --- a/panpipes/python_scripts/run_filter_spatial.py +++ b/panpipes/python_scripts/run_filter_spatial.py @@ -42,10 +42,10 @@ def test_matching_df_ignore_cat(new_df, old_df): # parse arguments parser = argparse.ArgumentParser() -parser.add_argument('--input_mudata', +parser.add_argument('--input_spatialdata', default='gut_minus1_amp.h5ad', help='') -parser.add_argument('--output_mudata', +parser.add_argument('--output_spatialdata', default='', help='') parser.add_argument('--filter_dict', @@ -53,7 +53,7 @@ def test_matching_df_ignore_cat(new_df, old_df): help='this is pull') # cross modalities args parser.add_argument('--keep_barcodes', default=None, - help='1 column list of barcodes to keep, note that they should match the mudata input, this filtering happens first') + help='1 column list of barcodes to keep, note that they should match the spatialdata input, this filtering happens first') # load options @@ -73,14 +73,14 @@ def test_matching_df_ignore_cat(new_df, old_df): filter_dict = dictionary_stripper(filter_dict) L.info("Filter dictionary:\n %s" %filter_dict) -# load mudata +# load spatialdata -L.info("Reading in SpatialData from '%s'" % args.input_mudata) -sdata = sd.read_zarr(args.input_mudata) -#mdata = mu.read(args.input_mudata) +L.info("Reading in SpatialData from '%s'" % args.input_spatialdata) 
+sdata = sd.read_zarr(args.input_spatialdata) +#mdata = mu.read(args.input_spatialdata) #if isinstance(mdata, AnnData): -# raise TypeError("Input '%s' should be of MuData format, not Anndata" % args.input_mudata) +# raise TypeError("Input '%s' should be of spatialdata format, not Anndata" % args.input_spatialdata) orig_obs = sdata["table"].obs.copy() @@ -147,7 +147,7 @@ def test_matching_df_ignore_cat(new_df, old_df): assert test_matching_df_ignore_cat(sdata["table"].obs, orig_obs) # write out obs -output_prefix = re.sub(".zarr", "", os.path.basename(args.output_mudata)) +output_prefix = re.sub(".zarr", "", os.path.basename(args.output_spatialdata)) L.info("Saving updated obs in a metadata tsv file to './tables/" + output_prefix + "_filtered_cell_metadata.tsv'") write_obs(sdata["table"], output_prefix=os.path.join("tables/",output_prefix), output_suffix="_filtered_cell_metadata.tsv") @@ -166,8 +166,8 @@ def test_matching_df_ignore_cat(new_df, old_df): #mdata.update() -L.info("Saving updated SpatialData to '%s'" % args.output_mudata) -sdata.write(args.output_mudata) +L.info("Saving updated SpatialData to '%s'" % args.output_spatialdata) +sdata.write(args.output_spatialdata) L.info("Done") diff --git a/panpipes/python_scripts/run_preprocess_spatial.py b/panpipes/python_scripts/run_preprocess_spatial.py index 5e389e04..250057ec 100644 --- a/panpipes/python_scripts/run_preprocess_spatial.py +++ b/panpipes/python_scripts/run_preprocess_spatial.py @@ -32,11 +32,11 @@ parser = argparse.ArgumentParser() -parser.add_argument("--input_mudata", - default="mudata_unfilt.h5mu", +parser.add_argument("--input_spatialdata", + default="spatialdata_unfilt.h5mu", help="") -parser.add_argument("--output_mudata", - default="mudata_unfilt.h5mu", +parser.add_argument("--output_spatialdata", + default="spatialdata_unfilt.h5mu", help="") parser.add_argument("--figdir", default="./figures/", @@ -89,12 +89,12 @@ sc.settings.figdir = figdir sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5)) -L.info("Reading in SpatialData from '%s'" % args.input_mudata) -sdata = sd.read_zarr(args.input_mudata) -#mdata = mu.read(args.input_mudata) +L.info("Reading in SpatialData from '%s'" % args.input_spatialdata) +sdata = sd.read_zarr(args.input_spatialdata) +#mdata = mu.read(args.input_spatialdata) #spatial = mdata.mod['spatial'] -input_data = os.path.basename(args.input_mudata) +input_data = os.path.basename(args.input_spatialdata) pattern = r"_filtered.zarr" match = re.search(pattern, input_data) sprefix = input_data[:match.start()] @@ -174,8 +174,8 @@ #mdata.update() -L.info("Saving updated SpatialData to '%s'" % args.output_mudata) -sdata.write(args.output_mudata) +L.info("Saving updated SpatialData to '%s'" % args.output_spatialdata) +sdata.write(args.output_spatialdata) L.info("Done") From 33259c3afcc5491c0042b0d06c6121835c3a19af Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Mon, 10 Feb 2025 10:55:22 +0100 Subject: [PATCH 54/57] decrease font size --- docs/usage/setup_for_spatial_workflows.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/usage/setup_for_spatial_workflows.md b/docs/usage/setup_for_spatial_workflows.md index 2959dd9f..456d519f 100644 --- a/docs/usage/setup_for_spatial_workflows.md +++ b/docs/usage/setup_for_spatial_workflows.md @@ -30,7 +30,7 @@ The following 4 columns are **optional**: **visium_scalefactors_file**: Path to the scalefactors file. 
Corresponds to the `scalefactors_file` parameter of [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html) -#### [Example submission file](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_visium_data/sample_file_qc_visium.txt) +##### [Example submission file](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_visium_data/sample_file_qc_visium.txt) ## Vizgen @@ -49,7 +49,7 @@ The following 3 columns are **optional**: **vpt_cell_boundaries**: The file name of the output of the vizgen-postprocessing-tool. See [spatialdata_io.merscope](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.merscope.html) -#### Example submission files [MERFISH](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_merfish_data/sample_file_qc_merfish.txt) [MERSCOPE](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_merscope_data/sample_file_qc_merscope.txt) +##### Example submission files [MERFISH](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_merfish_data/sample_file_qc_merfish.txt) [MERSCOPE](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_merscope_data/sample_file_qc_merscope.txt) ## Xenium @@ -58,7 +58,7 @@ The 3 columns of the Xenium sample submission file are: sample_id | spatial_path | spatial_filetype | ----------|----------|------------ -#### [Example submission file](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_xenium_data/sample_file_qc_xenium.txt) +##### [Example submission file](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_xenium_data/sample_file_qc_xenium.txt) From b6844b560e4da46eb7890f56222e791902f623c5 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Mon, 10 Feb 2025 13:05:22 +0100 Subject: [PATCH 55/57] add changes to changelog --- CHANGELOG.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ac8d32d..4baadcb7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,24 @@ ## [latest] +### added +- moved from MuData to SpatialData +- xenium ingestion & ingest_xenium GitHub action +- `export_gene_by_spot` for Cell2Location +- separate sample submission files for the different spatial technologies +- separate GitHub actions for Cell2Location & Tangram +- separate GitHub actions for MERSCOPE & MERFISH + + +### fixed + + +### dependencies +- pinned "spatialdata==0.2.6", "spatialdata-io==0.1.6", "dask==2024.12.1" as temporary fix + + +## v1.1.0 + ### added ### fixed From bbf99ccf7f5f89516f7a7f10e791fdba5ce14bab Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Wed, 12 Feb 2025 15:26:14 +0100 Subject: [PATCH 56/57] update comment --- panpipes/python_scripts/make_spatialData_from_csv.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/panpipes/python_scripts/make_spatialData_from_csv.py b/panpipes/python_scripts/make_spatialData_from_csv.py index d6c85ac6..d9d8a02e 100644 --- a/panpipes/python_scripts/make_spatialData_from_csv.py +++ b/panpipes/python_scripts/make_spatialData_from_csv.py @@ -13,11 +13,8 @@ import os from pathlib import Path """ -this script copies the make_adata_from_csv.py that creates -ONE MUDATA PER SAMPLE, with in each ONE LAYER per modality -for cell-suspension, saves them to temp. 
-concatenation of the mudatas saved in tmp happens
-in the concat_anndata.py script
+This script is an adjustment of the make_adata_from_csv.py. It creates
+ONE SPATIALDATA PER SAMPLE and saves them to temp.
 """
 
 import sys

From a96ea4ca5806bda45c56ce375f38d43e8191c601 Mon Sep 17 00:00:00 2001
From: SarahOuologuem
Date: Wed, 12 Feb 2025 15:34:31 +0100
Subject: [PATCH 57/57] update logging info

---
 panpipes/python_scripts/run_cell2location.py | 2 +-
 panpipes/python_scripts/run_tangram.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/panpipes/python_scripts/run_cell2location.py b/panpipes/python_scripts/run_cell2location.py
index 7cb427d8..9be32a1d 100644
--- a/panpipes/python_scripts/run_cell2location.py
+++ b/panpipes/python_scripts/run_cell2location.py
@@ -207,7 +207,7 @@
 
 #1. read in the data
 #spatial:
-L.info("Reading in spatial SpatialData from '%s'" % args.input_spatial)
+L.info("Reading in SpatialData from '%s'" % args.input_spatial)
 sdata_st = sd.read_zarr(args.input_spatial)
 #mdata_spatial = mu.read(args.input_spatial)
 #adata_st = mdata_spatial.mod['spatial']
diff --git a/panpipes/python_scripts/run_tangram.py b/panpipes/python_scripts/run_tangram.py
index 28eaeb0d..6f545771 100644
--- a/panpipes/python_scripts/run_tangram.py
+++ b/panpipes/python_scripts/run_tangram.py
@@ -101,7 +101,7 @@
 
 #1. read in the data
 #spatial:
-L.info("Reading in spatial SpatialData from '%s'" % args.input_spatial)
+L.info("Reading in SpatialData from '%s'" % args.input_spatial)
 sdata_st = sd.read_zarr(args.input_spatial)
 #mdata_spatial = mu.read(args.input_spatial)
 #adata_st = mdata_spatial.mod['spatial']
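
For illustration only (not part of the patch series): a minimal sketch of the SpatialData read/annotate/write round trip that the updated *_spatial scripts follow. It uses only calls that already appear in the diffs above (spatialdata.read_zarr, the "table" element, SpatialData.write); the Zarr path and sample id are placeholder assumptions.

    # Minimal sketch, assuming a per-sample SpatialData Zarr store produced by the
    # qc_spatial step; "sample1_unfilt.zarr" and "sample1" are placeholders.
    import spatialdata as sd

    sdata = sd.read_zarr("sample1_unfilt.zarr")   # load the per-sample SpatialData
    table = sdata["table"]                        # AnnData table holding counts and .obs
    table.obs["sample_id"] = "sample1"            # annotate cells/spots with the sample id
    sdata.write("sample1_annotated.zarr")         # persist everything to a new Zarr store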