Skip to content

Commit adb44ee

Browse files
committed
save IBD obj instead of individual dataframes
1 parent e399057 commit adb44ee

File tree

5 files changed

+40
-32
lines changed

5 files changed

+40
-32
lines changed

bin/proc_dist_ne.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,16 @@
2222
genome_14_100 = ibdutils.Genome.get_genome("simu_14chr_100cm")
2323
ibd = ibdutils.IBD(genome=genome_14_100, label=f"{idx}_orig")
2424
ibd.read_ibd(ibd_fn_lst=args.ibd_files)
25+
ibd.calc_ibd_cov()
26+
ibd.find_peaks()
2527

2628

2729
# output files:
28-
ofs_ibd_pq = f"{args.genome_set_id}_ibddist_ibd.pq"
30+
of_ibddist_obj = f"{args.genome_set_id}.ibddist.ibdobj.gz"
2931

3032

3133
# store combined IBD for IBD distribution analysis
32-
ibd._df.to_parquet(ofs_ibd_pq)
34+
ibd.pickle_dump(of_ibddist_obj)
3335

3436

3537
# remove highly related samples
@@ -53,13 +55,17 @@
5355
# calculate coverage and remove peaks
5456
ibd.calc_ibd_cov()
5557
ibd.find_peaks()
58+
of_orig_ibdne_obj = f"{args.genome_set_id}_orig.ibdne.ibdobj.gz"
59+
ibd.pickle_dump(of_orig_ibdne_obj)
5660

5761
ibd2 = ibd.duplicate(f"{idx}_rmpeaks")
5862
ibd2.remove_peaks()
5963
ibd2._df = ibd2.cut_and_split_ibd()
64+
of_rmpeaks_ibdne_obj = f"{args.genome_set_id}_rmpeaks.ibdne.ibdobj.gz"
65+
ibd2.pickle_dump(of_rmpeaks_ibdne_obj)
6066

6167
# link ibdne.jar file
62-
if not Path(f"ibdne.jar").exists():
68+
if not Path("ibdne.jar").exists():
6369
assert Path(args.ibdne_jar).exists()
6470
this = Path("ibdne.jar")
6571
target = Path(args.ibdne_jar).absolute()
@@ -90,13 +96,15 @@
9096
print(
9197
f"""
9298
output files:
93-
{ofs_ibd_pq}
99+
{of_ibddist_obj}
94100
ibdne.jar
95101
{idx}_orig.sh
96102
{idx}_orig.map
97103
{idx}_orig.ibd.gz
98104
{idx}_rmpeaks.sh
99105
{idx}_rmpeaks.map
100106
{idx}_rmpeaks.ibd.gz
107+
{of_orig_ibdne_obj}
108+
{of_rmpeaks_ibdne_obj}
101109
"""
102110
)

bin/proc_infomap.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717

1818

1919
# output files:
20-
ofs_ifm_orig_ibd_pq = f"{args.genome_set_id}_ifm_orig_ibd.pq"
21-
ofs_ifm_rmpeaks_ibd_pq = f"{args.genome_set_id}_ifm_rmpeaks_ibd.pq"
20+
of_ifm_orig_ibd_obj = f"{args.genome_set_id}_orig.ifm.ibdobj.gz"
21+
of_ifm_rmpeaks_ibd_obj = f"{args.genome_set_id}_rmpeaks.ifm.ibdobj.gz"
2222

2323

2424
# remove highly related samples
@@ -34,5 +34,5 @@
3434
ibd2 = ibd.duplicate("rmpeak")
3535
ibd2.remove_peaks()
3636

37-
ibd._df.to_parquet(ofs_ifm_orig_ibd_pq)
38-
ibd2._df.to_parquet(ofs_ifm_rmpeaks_ibd_pq)
37+
ibd.pickle_dump(of_ifm_orig_ibd_obj)
38+
ibd2.pickle_dump(of_ifm_rmpeaks_ibd_obj)

bin/run_infomap.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
#! /usr/bin/env python3
22

3-
import pandas as pd
4-
import numpy as np
5-
from ibdutils.utils.ibdutils import IBD, Genome
63
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
74

5+
import numpy as np
6+
import pandas as pd
7+
from ibdutils.utils.ibdutils import IBD
8+
89

910
def parse_args():
1011
p = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
11-
p.add_argument("--ibd_pq", type=str, required=True)
12+
p.add_argument("--ibd_obj", type=str, required=True)
1213
p.add_argument("--npop", type=int, required=True)
1314
p.add_argument("--nsam", type=int, required=True)
1415
p.add_argument("--genome_set_id", type=int, required=True)
@@ -26,9 +27,7 @@ def parse_args():
2627

2728
def run(args) -> pd.DataFrame:
2829

29-
g = Genome.get_genome("simu_14chr_100cm")
30-
ibd = IBD(genome=g, label=f"gsid_{args.genome_set_id}")
31-
ibd.read_ibd([args.ibd_pq], format="parquet")
30+
ibd = IBD.pickle_load(args.ibd_obj)
3231

3332
# make meta data
3433
meta = pd.DataFrame(
@@ -46,13 +45,10 @@ def run(args) -> pd.DataFrame:
4645
return member_df
4746

4847

49-
def main():
48+
if __name__ == "__main__":
5049
args = parse_args()
5150
member_df = run(args)
5251

5352
ofs = f"{args.genome_set_id}_{args.cut_mode}_member.pq"
5453
member_df.to_parquet(ofs)
5554
print(member_df)
56-
57-
58-
main()

main.nf

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,8 @@ process PROC_DIST_NE {
162162
publishDir "${resdir}/${genome_set_id}_${label}/ne_input/", pattern: "*.sh", mode: 'symlink'
163163
publishDir "${resdir}/${genome_set_id}_${label}/ne_input/", pattern: "*.map", mode: 'symlink'
164164
publishDir "${resdir}/${genome_set_id}_${label}/ne_input/", pattern: "*.ibd.gz", mode: 'symlink'
165-
publishDir "${resdir}/${genome_set_id}_${label}/ibddist_ibd/", pattern: "*_ibddist_ibd.pq", mode: 'symlink'
165+
publishDir "${resdir}/${genome_set_id}_${label}/ibddist_ibd/", pattern: "*.ibddist.ibdobj.gz", mode: 'symlink'
166+
publishDir "${resdir}/${genome_set_id}_${label}/ibdne_ibd/", pattern: "*.ibdne.ibdobj.gz", mode: 'symlink'
166167

167168
input:
168169
tuple val(label), path(ibd_lst), val(genome_set_id)
@@ -171,7 +172,8 @@ process PROC_DIST_NE {
171172
path("*_orig.map"), path("*_orig.ibd.gz"), emit: ne_input_orig
172173
tuple val(label), path("ibdne.jar"), path("*_rmpeaks.sh"), \
173174
path("*_rmpeaks.map"), path("*_rmpeaks.ibd.gz"), emit: ne_input_rmpeaks
174-
tuple val(label), path("*_ibddist_ibd.pq"), emit: ibddist_ibd_pq
175+
tuple val(label), path("*.ibddist.ibdobj.gz"), emit: ibddist_ibd_obj
176+
tuple val(label), path("*.ibdne.ibdobj.gz"), emit: ibdne_ibd_obj
175177
script:
176178
def args_local = [
177179
ibd_files: "${ibd_lst}", // path is a blank-separated list
@@ -185,7 +187,9 @@ process PROC_DIST_NE {
185187
touch ibdne.jar
186188
touch ${genome_set_id}{_orig.sh,_orig.map,_orig.ibd.gz}
187189
touch ${genome_set_id}{_rmpeaks.sh,_rmpeaks.map,_rmpeaks.ibd.gz}
188-
touch ${genome_set_id}_ibddist_ibd.pq
190+
touch ${genome_set_id}.ibddist.ibdobj.gz
191+
touch ${genome_set_id}_orig.ibdne.ibdobj.gz
192+
touch ${genome_set_id}_rmpeaks.ibdne.ibdobj.gz
189193
"""
190194
}
191195

@@ -197,8 +201,8 @@ process PROC_INFOMAP {
197201
input:
198202
tuple val(label), path(ibd_lst), val(genome_set_id)
199203
output:
200-
tuple val(label), path("*_ifm_orig_ibd.pq"), emit: ifm_orig_ibd_pq
201-
tuple val(label), path("*_ifm_rmpeaks_ibd.pq"), emit: ifm_rmpeaks_ibd_pq
204+
tuple val(label), path("*_orig.ifm.ibdobj.gz"), emit: ifm_orig_ibd_obj
205+
tuple val(label), path("*_rmpeaks.ifm.ibdobj.gz"), emit: ifm_rmpeaks_ibd_obj
202206
script:
203207
def args_local = [
204208
ibd_files: "${ibd_lst}", // path is a blank-separated list
@@ -209,7 +213,7 @@ process PROC_INFOMAP {
209213
"""
210214
stub:
211215
"""
212-
touch ${genome_set_id}{_ifm_orig_ibd.pq,_ifm_rmpeaks_ibd.pq}
216+
touch ${genome_set_id}{_orig.ifm.ibdobj.gz,_rmpeaks.ifm.ibdobj.gz}
213217
"""
214218
}
215219

@@ -238,13 +242,13 @@ process RUN_INFOMAP {
238242
tag "${args.genome_set_id}_${are_peaks_removed}"
239243
publishDir "${resdir}/${args.genome_set_id}_${label}/ifm_output/", mode: 'symlink'
240244
input:
241-
tuple val(label), path(ibd_pq), val(are_peaks_removed), val(args)
245+
tuple val(label), path(ibd_obj), val(are_peaks_removed), val(args)
242246
output:
243247
tuple val(label), val(are_peaks_removed), path("*_member.pq")
244248
script:
245249
def cut_mode = are_peaks_removed? 'rmpeaks': 'orig'
246250
def args_local = [
247-
ibd_pq: ibd_pq,
251+
ibd_obj: ibd_obj,
248252
npop: args.npop,
249253
nsam: args.nsam,
250254
genome_set_id: args.genome_set_id,
@@ -308,7 +312,7 @@ workflow WF_SP {
308312

309313
// Process IBD for ibd distribution and ne analyses
310314
PROC_DIST_NE(ch_ibd_per_genome)
311-
// PROC_DIST_NE.out.ibddist_ibd_pq.view{label, ibdpq -> [label, ibdpq.getName()]}
315+
// PROC_DIST_NE.out.ibddist_ibd_obj.view{label, ibdpq -> [label, ibdpq.getName()]}
312316

313317

314318
// RUN IBDNe actually
@@ -370,13 +374,13 @@ workflow WF_MP {
370374
// Process IBD for ibd distribution and ne analyses
371375
PROC_INFOMAP(ch_ibd_per_genome)
372376

373-
// PROC_INFOMAP.out.ifm_orig_ibd_pq.view{label, ibdpq -> [label, ibdpq.getName()]}
377+
// PROC_INFOMAP.out.ifm_orig_ibd_obj.view{label, ibdpq -> [label, ibdpq.getName()]}
374378

375379

376380
// RUN INFOMAP
377381
RUN_INFOMAP(
378-
PROC_INFOMAP.out.ifm_orig_ibd_pq.map{it -> it + false}.combine(ch_mp_params, by:0).concat(
379-
PROC_INFOMAP.out.ifm_rmpeaks_ibd_pq.map{it -> it + true}.combine(ch_mp_params, by:0)
382+
PROC_INFOMAP.out.ifm_orig_ibd_obj.map{it -> it + false}.combine(ch_mp_params, by:0).concat(
383+
PROC_INFOMAP.out.ifm_rmpeaks_ibd_obj.map{it -> it + true}.combine(ch_mp_params, by:0)
380384
)
381385
)
382386

nextflow.config

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ profiles{
2222
}
2323
process {
2424
// For faster testing
25-
conda = "/home/gbinux/mambaforge/envs/simulation"
25+
// conda = "/home/gbinux/mambaforge/envs/simulation"
2626

2727
errorStrategy = {task.attempt < 5 ? 'retry': 'finish'}
2828
maxRetries = 5

0 commit comments

Comments
 (0)