Skip to content

Commit

Permalink
add logging info
Browse files Browse the repository at this point in the history
  • Loading branch information
SarahOuologuem committed Mar 28, 2024
1 parent 4150e4b commit dca2e58
Showing 1 changed file with 32 additions and 29 deletions.
61 changes: 32 additions & 29 deletions panpipes/python_scripts/run_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from anndata import AnnData
from muon import MuData
import yaml
import os

# import scpipelines.funcs as scp
from panpipes.funcs.processing import intersect_obs_by_mod, remove_unused_categories
Expand Down Expand Up @@ -66,7 +67,6 @@ def test_matching_df_ignore_cat(new_df, old_df):
# load options
parser.set_defaults(verbose=True)
args, opt = parser.parse_known_args()
L.info('running with args:')
L.info(args)

# load the filtering dictionary
Expand All @@ -79,21 +79,29 @@ def test_matching_df_ignore_cat(new_df, old_df):
# remove Nones for each level of filter dict
filter_dict = map_nested_dicts_remove_none(filter_dict)
filter_dict = dictionary_stripper(filter_dict)
L.info("filter dictionary:\n %s" %filter_dict)
L.info("Filter dictionary:\n %s" %filter_dict)

# load mudata
L.info("Reading in MuData from '%s'" % args.input_mudata)
mdata = mu.read(args.input_mudata)

if isinstance(mdata, AnnData):
raise TypeError("input_mudata should be of MuData format, not Anndata")
raise TypeError("Input '%s' should be of MuData format, not Anndata" % args.input_mudata)

orig_obs = mdata.obs.copy()

L.info("Before filtering %d" % mdata.n_obs)
L.info("Before filtering: "+ str(mdata.n_obs) + " cells and " + str(mdata.n_vars) + " features")

# filter based on provided barcodes -----
if args.keep_barcodes is not None:
L.info("filtering all modalities by keep_barcodes file")
keep_bc = pd.read_csv(args.keep_barcodes,header=None)
if os.path.exists(args.keep_barcodes):
L.info("Reading in keep_barcodes file from '%s'" % args.keep_barcodes)
keep_bc = pd.read_csv(args.keep_barcodes,header=None)
else:
L.error("The path of the keep_barcodes file '%s' could not be found" % args.keep_barcodes)
sys.exit("The path of the keep_barcodes file '%s' could not be found" % args.keep_barcodes)

L.info("Filtering all modalities by keep_barcodes file")
mdata = mdata[mdata.obs_names.isin(keep_bc[0]),:].copy()
remove_unused_categories(mdata.obs)
mdata.update()
Expand All @@ -106,62 +114,57 @@ def test_matching_df_ignore_cat(new_df, old_df):
# this will go through the modalities one at a time,
# then the categories max, min and bool
for mod in mdata.mod.keys():
L.info(mod)
L.info("Filtering modality '%s'" % mod)
if mod in filter_dict.keys():
for marg in filter_dict[mod].keys():
if marg == "obs":
if "max" in filter_dict[mod][marg].keys():
for col, n in filter_dict[mod][marg]['max'].items():
L.info("filter %s %s to less than %s" % (mod, col, n))
L.info("Filtering cells of modality '%s' by '%s' in obs to less than %s" % (mod, col, n))
mu.pp.filter_obs(mdata.mod[mod], col, lambda x: x <= n)
L.info("Remaining cells %d" % mdata[mod].n_obs)
L.info("Remaining cells %d" % mdata[mod].shape[0])
if "min" in filter_dict[mod][marg].keys():
for col, n in filter_dict[mod][marg]['min'].items():
L.info("filter %s %s to more than %s" % (mod, col, n))
L.info("Filtering cells of modality '%s' by '%s' in obs to more than %s" % (mod, col, n))
mu.pp.filter_obs(mdata.mod[mod], col, lambda x: x >= n)
L.info("Remaining cells %d" % mdata[mod].n_obs)
if "bool" in filter_dict[mod][marg].keys():
for col, n in filter_dict[mod][marg]['bool'].items():
L.info("filter %s %s marked %s" % (mod, col, n))
L.info("Filtering cells of modality '%s' by '%s' in obs marked %s" % (mod, col, n))
mu.pp.filter_obs(mdata.mod[mod], col, lambda x: x == n)
L.info("Remaining cells %d" % mdata[mod].n_obs)
if marg == "var":
if "max" in filter_dict[mod][marg].keys():
for col, n in filter_dict[mod][marg]['max'].items():
L.info("filter %s %s to less than %s" % (mod, col, n))
L.info("Filtering features of modality '%s' by '%s' in .var to less than %s" % (mod, col, n))
mu.pp.filter_var(mdata.mod[mod], col, lambda x: x <= n)
L.info("Remaining cells %d" % mdata[mod].n_obs)
L.info("Remaining features: %d" % mdata[mod].n_vars)

if "min" in filter_dict[mod][marg].keys():
for col, n in filter_dict[mod][marg]['min'].items():
L.info("filter %s %s to more than %s" % (mod, col, n))
L.info("Filtering features of modality '%s' by '%s' in .var to more than %s" % (mod, col, n))
mu.pp.filter_var(mdata.mod[mod], col, lambda x: x >= n)
L.info("Remaining cells %d" % mdata[mod].n_obs)
L.info("Remaining features: %d" % mdata[mod].n_vars)

if "bool" in filter_dict[mod][marg].keys():
for col, n in filter_dict[mod][marg]['bool'].items():
L.info("filter %s %s marked %s" % (mod, col, n))
L.info("Filtering features of modality '%s' by '%s' in .var marked %s" % (mod, col, n))
mu.pp.filter_var(mdata.mod[mod], col, lambda x: x == n)
L.info("Remaining cells %d" % mdata[mod].n_obs)
L.info("Remaining features: %d" % mdata[mod].n_vars)

L.debug(mdata.obs['sample_id'].value_counts())
L.debug(mdata['rna'].obs['sample_id'].value_counts())

mdata.update()
L.debug(mdata.obs['sample_id'].value_counts())
L.debug(mdata['rna'].obs['sample_id'].value_counts())

# intersect specific modalities
# we don't want to just use mu.pp.intersect_obs,
# because we don't want to necessarily filter by repertoire
if args.intersect_mods is not None:
intersect_mods = args.intersect_mods.split(',')
intersect_mods= [a.strip() for a in intersect_mods]
L.info("intersecting barcodes in %s" % args.intersect_mods)
L.info("Intersecting barcodes of modalities %s" % args.intersect_mods)
intersect_obs_by_mod(mdata, mods = intersect_mods)
L.info("Remaining cells %d" % mdata.n_obs)
L.info(mdata.shape)


L.info("After filtering: "+ str(mdata.n_obs) + " cells and " + str(mdata.n_vars) + " features across all modalities")

remove_unused_categories(mdata.obs)

Expand All @@ -170,6 +173,8 @@ def test_matching_df_ignore_cat(new_df, old_df):

# write out obs
output_prefix = re.sub(".h5mu", "", args.output_mudata)

L.info("Saving updated MuData.obs in a metadata tsv file to '" + output_prefix + "_filtered_cell_metadata.tsv'")
write_obs(mdata, output_prefix=output_prefix, output_suffix="_filtered_cell_metadata.tsv")

# write out the per sample_id cell numbers
Expand All @@ -181,14 +186,12 @@ def test_matching_df_ignore_cat(new_df, old_df):
columns={"level_0": "modality", "level_1": "sample_id"})

L.info("cell_counts\n%s" %cell_counts)
L.info("Saving cell counts in a metadata csv file to '" + output_prefix + "_cell_counts.csv'")
cell_counts.to_csv(output_prefix + "_cell_counts.csv", index=None)


mdata.update()
L.info("writing mdata h5mu")
L.info(args.output_mudata)
L.debug(mdata.obs['sample_id'].value_counts())
L.debug(mdata['rna'].obs['sample_id'].value_counts())
L.info("Saving updated MuData to '%s'" % args.output_mudata)
mdata.write(args.output_mudata)

L.info("Done")
Expand Down

0 comments on commit dca2e58

Please sign in to comment.