Skip to content

Commit 6951808

Browse files
committed
fix file naming issue
1 parent 4ec1e68 commit 6951808

File tree

3 files changed

+226
-226
lines changed

3 files changed

+226
-226
lines changed

gwaspy/check_alleles/check_alleles.py

+172
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
# Module metadata: conventional dunder name (two leading underscores).
__author__ = 'Lindo Nkambule'
2+
3+
import hailtop.batch as hb
4+
import hailtop.fs as hfs
5+
6+
from hailtop.batch.job import Job
7+
8+
9+
def size(file: str):
    """
    Report how large a file is, expressed in GiB

    :param file: path to file, str
    :return: file size in GiB
    """
    # 1 GiB = 1024^3 bytes; the `size` attribute of the stat result is divided by it
    bytes_per_gib = 1024 ** 3

    return hfs.stat(file).size / bytes_per_gib
19+
20+
21+
def check_alleles_workflow(
        batch: hb.Batch = None,
        input_path: str = None,
        reference_path: str = None,
        output_filename: str = None,
        step: str = "check",
        fix_mode: str = "top",
        output_path: str = None):
    """
    Queue `bcftools +fixref` jobs for the input VCF(s) and run the batch

    One job is created per input VCF. If `input_path` contains the placeholder
    `CNUMBER`, it is expanded to chromosomes 1..22 and one job is created per
    chromosome; otherwise `input_path` is treated as a single file.

    :param batch: Batch object the jobs are added to; it is run at the end
    :param input_path: path to the input VCF, optionally with a `CNUMBER`
        placeholder. Each VCF must have a `.tbi` or `.csi` index next to it
    :param reference_path: path to the reference FASTA; `<reference_path>.fai`
        is read as its index
    :param output_filename: base name of the output file(s). In per-chromosome
        mode `_chr<N>` is appended so the jobs do not overwrite each other
    :param step: "check" writes the fixref output to a stats file; any other
        value runs the allele-fixing job
    :param fix_mode: value passed to fixref's `-m` option when fixing
    :param output_path: directory under which `check_alleles/` outputs are written
    :return: None
    :raises SystemExit: if an input VCF has neither a `.tbi` nor a `.csi` index
    """

    def get_stats(
            b: hb.batch.Batch,
            job_name: str = None,
            vcf: hb.ResourceGroup = None,
            ref_fasta: hb.ResourceGroup = None,
            output_name: str = None,
            out_dir: str = None,
            ncpu: int = 8,
            memory: str = 'standard',
            storage: int = None,
            img: str = 'docker.io/lindonkambule/gwaspy_phase_impute:latest',
    ) -> Job:
        """
        Add a job that redirects the `bcftools +fixref` output to a stats file

        :return: the created job
        """
        j = b.new_job(name=f'Check alleles: {job_name}')

        j.image(img)
        j.memory(memory)
        j.cpu(ncpu)
        # only request storage when a size was given; otherwise 'NoneGi' would be requested
        if storage is not None:
            j.storage(f'{storage}Gi')

        j.command(
            f"""
            bcftools +fixref {vcf['vcf']} -- -f {ref_fasta['ref_fasta']} > {j.stats}
            """
        )

        b.write_output(j.stats,
                       f'{out_dir}/check_alleles/{output_name}.stats.txt')

        return j

    def fix_alleles(
            b: hb.batch.Batch,
            job_name: str = None,
            vcf: hb.ResourceGroup = None,
            ref_fasta: hb.ResourceGroup = None,
            allele_mode: str = "top",
            output_name: str = None,
            out_dir: str = None,
            ncpu: int = 8,
            memory: str = 'standard',
            storage: int = None,
            img: str = 'docker.io/lindonkambule/gwaspy_phase_impute:latest',
    ) -> Job:
        """
        Add a job that runs `bcftools +fixref -m <allele_mode>` and writes an indexed BCF

        :return: the created job
        """
        j = b.new_job(name=f'Fix alleles: {job_name}')

        j.image(img)
        j.memory(memory)
        j.cpu(ncpu)
        # only request storage when a size was given; otherwise 'NoneGi' would be requested
        if storage is not None:
            j.storage(f'{storage}Gi')

        j.declare_resource_group(
            fixed_file={
                'bcf': '{root}.bcf',
                'bcf.csi': '{root}.bcf.csi'
            }
        )

        # the index step creates the 'bcf.csi' member declared above,
        # which the fixref command alone does not produce
        j.command(
            f"""
            bcftools +fixref {vcf['vcf']} -Ob -o {j.fixed_file['bcf']} -- -f {ref_fasta['ref_fasta']} -m {allele_mode}
            bcftools index {j.fixed_file['bcf']}
            """
        )

        # write the resource group this job actually produces (not `j.stats`,
        # which no command in this job creates)
        b.write_output(j.fixed_file,
                       f'{out_dir}/check_alleles/{output_name}.alleles.fixed')

        return j

    # the key must be 'ref_fasta' because the jobs read ref_fasta['ref_fasta'];
    # the index key is 'ref_fasta.fai' so the index sits next to the FASTA as <fasta>.fai
    ref_fasta_in = batch.read_input_group(**{'ref_fasta': reference_path,
                                             'ref_fasta.fai': f'{reference_path}.fai'})
    ref_size = round(size(reference_path))

    if "CNUMBER" in input_path:  # input VCF is already split by chromosome
        # each chromosome gets its own output name so jobs do not write to the same path
        inputs = [(input_path.replace('CNUMBER', str(i)), f'{output_filename}_chr{i}')
                  for i in range(1, 23)]
    else:  # one input file with all the chromosomes
        inputs = [(input_path, output_filename)]

    for vcf_path, out_name in inputs:
        input_idx = f'{vcf_path}.tbi' if hfs.exists(f'{vcf_path}.tbi') else f'{vcf_path}.csi'

        if not hfs.exists(input_idx):
            raise SystemExit('Input file needs to be indexed (.tbi or .csi). Found none, exiting')

        chrom_vcf = batch.read_input_group(**{'vcf': vcf_path,
                                              'index': input_idx})
        vcf_size = round(size(vcf_path))
        # 5 GiB of headroom on top of the input and reference sizes
        disk_size = int(round(5.0 + vcf_size + ref_size))

        if step == "check":
            get_stats(
                b=batch,
                job_name=vcf_path,
                vcf=chrom_vcf,
                ref_fasta=ref_fasta_in,
                output_name=out_name,
                out_dir=output_path,
                storage=disk_size
            )
        else:
            fix_alleles(
                b=batch,
                job_name=vcf_path,
                vcf=chrom_vcf,
                ref_fasta=ref_fasta_in,
                allele_mode=fix_mode,
                output_name=out_name,
                out_dir=output_path,
                storage=disk_size
            )

    batch.run()

gwaspy/check_alleles/check_fix_alleles.py

-60
This file was deleted.

0 commit comments

Comments
 (0)