Skip to content

Commit 88fdcea

Browse files
committed
init version 2 from Wits
0 parents  commit 88fdcea

File tree

92 files changed

+11169
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

92 files changed

+11169
-0
lines changed

.gitignore

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
.git/
2+
.nextflow*
3+
.idea*
4+
work/
5+
output/*
6+
input/*
7+
/config-gen/nbproject/private/
8+
/config-gen/build/

README.md

+749
Large diffs are not rendered by default.

aux/check.py

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
2+
3+
4+
import pandas as pd
5+
6+
7+
# Print, one per line, the names of the manifest SNPs whose RefStrand is "+"
# and whose IlmnStrand is "TOP".
manifest = pd.read_csv(
    "/dataB/AWIGenGWAS/aux/H3Africa_2017_20021485_A3.csv",
    skiprows=7,
    usecols=["Name","IlmnStrand","RefStrand"],
    delimiter=",",
    dtype={"Chr":str},
)
#s = pd.read_csv("/dataB/AWIGenGWAS/aux/H3Africa_2017_20021485_A3_StrandReport_FT.txt",usecols=["SNP_Name","Forward_Allele1","Top_AlleleA"],delim_whitespace=True,comment="#",dtype={"Chr":str})

# Row mask: both strand conditions must hold.
plus_and_top = (manifest["RefStrand"] == "+") & (manifest["IlmnStrand"] == "TOP")
for snp in manifest.loc[plus_and_top, "Name"].values:
    print(snp)
13+
14+
15+

aux/fake.sh

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Cut the chip manifest and the strand report down to the rows matching $1.
# The leading lines of each file (8 for the manifest, 6 for the strand
# report -- presumably their headers) are copied first, then the matching
# rows are appended.  Results go to m.csv and s.csv in the current directory.
#
# Inputs: $1        pattern handed to grep
#         $manifest path of the chip manifest   (environment variable)
#         $strand   path of the strand report   (environment variable)

# Fail early: with an unset variable head/grep would silently read stdin.
: "${manifest:?manifest must be set to the chip manifest file}"
: "${strand:?strand must be set to the strand report file}"
: "${1:?usage: fake.sh PATTERN}"

# Quote every expansion so paths with spaces or glob characters survive, and
# use -- so a pattern starting with '-' is not parsed as a grep option.
head -n 8 "$manifest" > m.csv
grep -- "$1" "$manifest" >> m.csv
head -n 6 "$strand" > s.csv
grep -- "$1" "$strand" >> s.csv

aux/make_ref.py

+178
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
2+
from __future__ import print_function
3+
4+
import pandas as pd
5+
from Bio import SeqIO
6+
import glob
7+
import re
8+
import os
9+
import sys
10+
import gzip
11+
import distance
12+
import argparse
13+
14+
15+
# Field and line separators used when writing the report files.
TAB = "\t"
EOL = "\n"
17+
18+
def parseArguments():
    """Parse the command line and return the argparse namespace.

    Positional arguments, in order: reference_genome_dir, strand_report,
    chip_manifest, output.  Options: --chrom-only (stored as ``chrom_only``,
    False when absent) and --seg-len (stored as ``seg_len``, default 25).
    """
    cli = argparse.ArgumentParser()
    # The three input paths share the same declaration apart from the label.
    for dest, label in (
        ("reference_genome_dir", "referencegenomedir"),
        ("strand_report", "strand_report"),
        ("chip_manifest", "chip_manifest"),
    ):
        cli.add_argument(dest, type=str, metavar=label)
    cli.add_argument("output", type=str, metavar="output", help="output report")
    cli.add_argument("--chrom-only", type=str, default=False, dest="chrom_only")
    cli.add_argument("--seg-len", type=int, dest="seg_len", metavar="seg_len",
                     default=25, help="how much matches")
    return cli.parse_args()
29+
30+
31+
def getRef(path,build):
    """Load the per-chromosome reference sequences of one genome build.

    Files are looked up as ``<path>/<build>/genomic/*fa.gz``; the chromosome
    name is taken from the ``chromosome.<name>.fa.gz`` part of each file name.
    Only chromosomes listed in the module-level ``chroms`` are read; the name
    of each one read is printed, tab-separated, as progress output.

    Returns a dict mapping chromosome name to the record returned by
    ``SeqIO.read``.  Exits the program if a matched file name does not follow
    the expected pattern.
    """
    pattern = os.path.join(path,str(build),"genomic","*fa.gz")
    genome = {}
    for fname in glob.glob(pattern):
        # Raw string: "\." and "\w" are regex escapes, not string escapes.
        m = re.search(r".*.chromosome\.(\w+)\.fa.gz",fname)
        if not m:
            sys.exit("Illegal file "+fname)
        chrom = m.group(1)
        if chrom not in chroms:
            continue
        print(chrom,end="\t")
        # Close the gzip handle as soon as the record has been read.
        with gzip.open(fname,"rt") as handle:
            genome[chrom] = SeqIO.read(handle,"fasta")
    print()
    return genome
46+
47+
48+
def getData(directory, strand_fn, manifest_fn):
    """Read the reference genomes, the chip manifest and the strand report.

    directory   -- root directory handed to getRef() for each build
    strand_fn   -- whitespace-separated strand report; '#' starts a comment
    manifest_fn -- comma-separated chip manifest; its first 7 lines are skipped

    Returns ``(genome, mf, strand)``.  ``genome`` is a 40-slot list indexed by
    build number: slots 36 and 37 hold the dict returned by getRef(), every
    other slot is None.  ``mf`` and ``strand`` are DataFrames whose "Chr"
    column is read as str.
    """
    genome = [None]*40

    for build in [36,37]:
        print("Reading in build ",build)
        genome[build] = getRef(directory,build)

    mf = pd.read_csv(manifest_fn,delimiter=",",dtype={"Chr":str}, skiprows=7)
    # sep=r"\s+" is the supported spelling of the deprecated
    # delim_whitespace=True.
    strand = pd.read_csv(strand_fn,sep=r"\s+",dtype={"Chr":str}, comment="#")
    return (genome, mf, strand)
59+
60+
61+
# Complement of each upper-case base; any other symbol is left as it is by rc().
comp = {'A':'T','C':'G','G':'C','T':'A'}


def rc(seq):
    """Return the reverse complement of ``seq`` as a str.

    Symbols without an entry in ``comp`` (lower-case letters, N, ...) are kept
    unchanged and only reversed.
    """
    # A single join instead of repeatedly prepending to a growing string.
    return "".join(comp.get(base, base) for base in reversed(seq))
69+
70+
71+
def match(ref_left,probe_left,ref_right,probe_right,RC,dist):
    """Return the summed distance between probe flanks and reference flanks.

    At most ``seg_len`` (module-level) symbols next to the SNP are compared on
    each side: the tail of the left flank and the head of the right flank.
    When ``RC`` is true the probe flanks are swapped and reverse-complemented
    first.  ``ref_left`` / ``ref_right`` must support slicing and expose
    ``.seq``; ``dist`` is called as ``dist(reference, probe)``.

    If ``dist`` raises ValueError, diagnostics are printed (using the
    module-level ``snp``) and 50 is returned.
    """
    if RC:
        # Opposite strand: the flanks change sides and are reverse-complemented.
        probe_left, probe_right = rc(probe_right), rc(probe_left)
    left_n = min(seg_len, len(probe_left))
    right_n = min(seg_len, len(probe_right))
    pleft = probe_left[-left_n:]
    pright = probe_right[:right_n]
    rleft = ref_left[-left_n:].seq
    rright = ref_right[:right_n].seq
    try:
        total = dist(rleft,pleft) + dist(rright,pright)
    except ValueError:
        print("\nString length mismatch error,i,",snp["SNP_Name"])
        print("LEFT: ",rleft,len(rleft),pleft,len(pleft),"RC=",RC)
        print("RIGHT: ",rright,len(rright),pright,len(pright))
        return 50
    return total
91+
92+
93+
def warn(warnf,chrom_num,coord,snp,direc,score):
    """Write one tab-separated "WARN:" line to ``warnf`` when ``score`` > 4.

    Nothing is written for scores that are not greater than 4.
    """
    if not score>4:
        return
    fields = ["WARN:",chrom_num,coord,snp,direc,score]
    warnf.write(TAB.join(str(field) for field in fields)+EOL)
96+
97+
def alignSNP(warnf,chromosome,coord,snp,the_snp,probe_pre,probe_pst,dist=distance.hamming):
    """Decide on which strand the probe flanks match the reference at ``coord``.

    A window of ``seg_len`` (module-level) positions either side of ``coord``
    is cut from ``chromosome``.  The probe is first compared as given; only if
    that distance is 10 or more is the reverse complement tried.  Accepted
    matches are passed to warn(), which uses the module-level ``chrom_num``.

    Returns ``(align, base, score, fwd, rev)``:
      align -- "fwd", "rev", or "-" when neither distance is below 10
      base  -- the reference symbol at ``coord``
      score -- the accepted distance, or 1000 when nothing was accepted
      fwd   -- forward distance
      rev   -- reverse distance, or 2000 when the reverse was not tried
    For ``the_snp`` equal to "[D/I]" or "[I/D]" the result is forced to
    align "fwd" and base "I".
    """
    window = chromosome[coord-seg_len:coord+seg_len+1]
    base = window.seq[seg_len]
    ref_pre = window[:seg_len]
    ref_post = window[seg_len+1:]

    # Sentinel values reported when no acceptable match is found.
    align, score, rev = "-", 1000, 2000

    fwd = match(ref_pre,probe_pre,ref_post,probe_pst,False,dist)
    if fwd<10:
        align, score = "fwd", fwd
        warn(warnf,chrom_num,coord+1,snp["SNP_Name"],"+",fwd)
    else:
        rev = match(ref_pre,probe_pre,ref_post,probe_pst,True,dist)
        if rev<10:
            align, score = "rev", rev
            warn(warnf,chrom_num,coord+1,snp["SNP_Name"],"-",rev)

    if the_snp in ("[D/I]", "[I/D]"):
        align, base = "fwd", "I"
    return align, base, score, fwd, rev
120+
121+
122+
# Script body: for every SNP of the strand report, work out the reference base
# and the strand on which the probe aligns, and write three files:
#   <output>.ref  one line per SNP: name, chromosome, position, base, alignment
#   <output>.wrn  warnings written by warn()
#   <output>.err  SNPs with an unloaded build, an unknown chromosome, or no
#                 acceptable alignment
args = parseArguments()
# getRef() consults the module-level ``chroms`` to decide which files to load.
if args.chrom_only:
    chroms = [ args.chrom_only ]
else:
    chroms = list(map(str, range(1,23)))+['X','Y','MT']

(genome,mf,strand) = getData(args.reference_genome_dir,args.strand_report,args.chip_manifest)

# match() and alignSNP() read ``seg_len`` as a global; they also read the loop
# variables ``snp`` and ``chrom_num`` below, so all of this stays at module
# level.
seg_len = args.seg_len

# The context manager closes all three files even if a SNP raises.
with open(args.output+".ref","w") as g, \
     open(args.output+".wrn","w") as warnf, \
     open(args.output+".err","w") as errf:
    for i,snp in strand.iterrows():
        align = "fwd"
        base = snp["Top_AlleleA"] # if can't do better
        score = -1
        coord = snp["Coord"]-1    # zero-based; written back out as coord+1
        build = int(snp['Build'])
        snp_name = snp["SNP_Name"]
        # NOTE(review): assumes manifest and strand report share row labels.
        the_snp = mf.loc[i]["SNP"]
        chrom_num = snp["Chr"]
        # "XY" SNPs are looked up on X.  Map before the membership test,
        # otherwise "XY" is never found and the mapping is unreachable.
        ref_chrom = "X" if chrom_num == "XY" else chrom_num

        problem = None
        # Builds outside the table, or never loaded, cannot be checked.
        if not 0 <= build < len(genome) or genome[build] is None:
            problem = "build err"
        elif ref_chrom not in genome[build]:
            problem = "chrom_num_err"
        if problem is not None:
            output = TAB.join(map(str, [snp_name,chrom_num,str(coord+1),base, build,problem]))+EOL
            errf.write(output)
            # Still emit a .ref line using the defaults set above.
            output = TAB.join(map(str,[snp_name,chrom_num,coord+1,base,align]))+EOL
            g.write(output)
            continue

        # Top_Seq is "<left flank>[<alleles>]<right flank>".
        top_seq = snp["Top_Seq"].upper()
        top_pre = top_seq.index("[")
        top_pst = top_seq.index("]")+1
        probe_pre = top_seq[:top_pre]
        probe_pst = top_seq[top_pst:]
        chromosome = genome[build][ref_chrom]
        align, base, score, fwd, rev = alignSNP(warnf,chromosome,coord,snp,the_snp,probe_pre,probe_pst)
        if score==1000:
            # No match under the default distance: retry with edit distance.
            align, base, score, fwd,rev = alignSNP(warnf,chromosome,coord,snp,the_snp,probe_pre,probe_pst,distance.levenshtein)
            if score<10:
                warn(warnf,chrom_num,coord+1,snp_name,"BM",min(fwd,rev))
            else:
                output = TAB.join(map(str, [snp_name,chrom_num,str(coord+1),base, build,"non_align",fwd,rev]))+EOL
                errf.write(output)
        output = TAB.join(map(str,[snp_name,chrom_num,coord+1,base,align]))+EOL
        g.write(output)
178+

aux/updateFam.py

+80
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import pandas as pd
5+
import sys
6+
import re
7+
8+
def parseArguments():
    """Parse the command line and return the argparse namespace.

    Positional arguments, in order: samplesheet, updatesheet, oldfam, output.
    Exits the program when "<output>.fam" is the same name as ``oldfam``.
    """
    cli = argparse.ArgumentParser()
    for name in ("samplesheet", "updatesheet", "oldfam", "output"):
        cli.add_argument(name, type=str, metavar=name)
    args = cli.parse_args()
    # The new fam file is written to "<output>.fam"; it must not be the input.
    if args.oldfam == "%s.fam"%args.output:
        sys.exit("New and old fam cannot be the same")
    return args
18+
19+
# Line terminator for the report files.
EOL = "\n"
# Columns that together identify one individual in the fam table.
PIDS = ["FID", "IID"]
21+
22+
def getFam(famf):
    """Read a whitespace-separated, header-less fam file.

    The six columns are named FID, IID, FAT, MAT, SEX and PHE.  The original
    FID/IID pair is duplicated into OFID/OIID and used as the index, so a row
    can still be addressed by its old identifiers after FID/IID are edited.

    Returns ``(oldfam, fam)``: ``fam`` is the table meant to be edited and
    ``oldfam`` is an independent deep copy of it.
    """
    # sep=r"\s+" is the supported spelling of the deprecated
    # delim_whitespace=True.
    fam = pd.read_csv(famf,sep=r"\s+",header=None,
                      names=["FID","IID","FAT","MAT","SEX","PHE"])
    old = ['OFID','OIID']
    fam['OFID']=fam['FID']
    fam['OIID']=fam['IID']
    fam.set_index(old, inplace=True)
    oldfam = fam.copy(deep=True)
    return oldfam,fam
31+
32+
def getSmplLbl(data):
    """Return the part of a row's "Institute Sample Label" after the final "_".

    ``data`` is any mapping or row with an "Institute Sample Label" entry.
    The match is greedy, so the text after the last underscore that is
    followed by word characters is returned.  Exits the program when the
    label has no such part.
    """
    label = data["Institute Sample Label"]
    m = re.search(r".*_(\w+)", label)
    if not m:
        # Report the label itself; concatenating the whole row to a str fails.
        sys.exit("Failed parsing "+str(label))
    return m.group(1)
36+
37+
38+
39+
# Script body: compare the sample labels of the update sheet with those of the
# original sample sheet (matched on plate label and well) and rewrite the fam
# file for every position whose label changed.  Writes:
#   <output>.switch  one "old -> new" line per changed position
#   <output>.errs    new labels that are not present in the fam file
#   <output>.fam     the updated fam table
args = parseArguments()
oldfam,fam = getFam(args.oldfam)

orig = pd.read_excel(args.samplesheet)
orig.set_index(["Institute Plate Label","Well"],inplace=True)
orig['PID']=orig.apply(getSmplLbl,axis=1)
update = pd.read_excel(args.updatesheet,skiprows=3)
count=0

# The context manager closes both report files even if a row raises.
with open("%s.errs"%args.output,"w") as g, \
     open("%s.switch"%args.output,"w") as h:
    for i, row in update.iterrows():
        if "IlluminaControl" in row["Institute Sample Label"]: continue
        new_lbl=getSmplLbl(row)
        pos = tuple(row[['Institute Plate Label','Well']].values)
        oldid = getSmplLbl(orig.loc[pos])
        if oldid != new_lbl:
            h.write("%s -> %s \n"%(oldid,new_lbl))
            count=count+1
            # Index.contains() was removed from pandas; ``in`` is the
            # supported membership test.
            if (new_lbl,new_lbl) in fam.index:
                # The row at the old position takes over the identity and
                # the remaining columns of the individual called new_lbl.
                fam.loc[(oldid,oldid),["FID","IID"]] = new_lbl
                fam.loc[(oldid,oldid),["FAT","MAT","SEX","PHE"]] = oldfam.loc[(new_lbl,new_lbl),["FAT","MAT","SEX", "PHE"]]
            else:
                g.write(new_lbl+EOL)
                continue
            # NOTE(review): nesting of this debug print under the
            # "label changed" branch is assumed -- confirm.
            if "unknown" in row["Institute Sample Label"]:
                print("Pos=<%s>; Old =<%s>; New=<%s>"%(pos,oldid,new_lbl))
                print(fam.loc[(oldid,oldid)]['IID'])

# Report every (FID, IID) pair that occurs more than once after the update.
all_ids = fam[PIDS].to_records(index=False).tolist()
uniq = set()
orig.set_index('PID',inplace=True)
for x in all_ids:
    if x in uniq:
        print("Duplicated element ",x[0],orig.loc[x[0],"Institute Sample Label"])
    else:
        uniq.add(x)

fam.to_csv("%s.fam"%args.output,sep="\t",header=False,index=False)

bin/qc1logextract.py

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#!/usr/bin/env python3
2+
3+
from __future__ import print_function
4+
5+
6+
import sys
7+
import re
8+
9+
# "<kept> out of <total> variants loaded from .bim file."
LOADED_RE = re.compile(r"(\d+) out of (\d+) variants loaded from \.bim file\.")
# "<n> <what> removed due to <two-word reason>".  The first reason word may
# contain letters, digits and '-'; the range is spelled A-Za-z because A-z
# would also admit the punctuation between 'Z' and 'a'.
REMOVED_RE = re.compile(r"(\d+) (\w+) removed due to ([-A-Za-z0-9]+ \w+)")

# "*-" stands for a backslash and "##" for a dollar sign; both are replaced
# after the counts have been filled in.
REPORT_TEMPLATE = """
*-noindent
Using this approach,
*-begin{itemize}
*-item %d SNPs that are non-autosomal were removed;
*-item %d SNPs were removed due missing genotype threshold constraints;
*-item %d individuals were removed due to missing genotype constraints (the list of missing individuals, if any, can be found in the file *-url{%s});
*-item %d SNPs were removed as the MAF was too low.
*-item %d SNPs were removed as they were out of the specified Hardy-Weinberg equilibrium.
*-end{itemize}
"""


def parse_plink_log(lines):
    """Count what was removed according to a QC log.

    ``lines`` is any iterable of log lines (presumably a PLINK log, judging
    by the messages matched -- confirm).  Returns a dict with the keys
    ``nonautosome``, ``rem_snps``, ``rem_inds``, ``rem_maf`` and ``rem_hwe``.

    A "variants loaded" line only counts as non-autosomal removal when an
    "--autosome" line was seen since the last "Options in effect" line.
    Removed variants are split by reason into missing genotype, Hardy-Weinberg
    and everything else (counted as MAF); removals of anything other than
    "variants" are counted as individuals.
    """
    counts = {"nonautosome": 0, "rem_snps": 0, "rem_inds": 0,
              "rem_maf": 0, "rem_hwe": 0}
    autosome = False
    for line in lines:
        if "Options in effect" in line:
            # A new run starts: forget the options of the previous one.
            autosome = False
            continue
        if "--autosome" in line:
            autosome = True
            continue
        m = LOADED_RE.search(line)
        if m and autosome:
            counts["nonautosome"] = int(m.group(2))-int(m.group(1))
            continue
        m = REMOVED_RE.search(line)
        if not m:
            continue
        n = int(m.group(1))
        if m.group(2) != "variants":
            counts["rem_inds"] += n
        elif m.group(3) == "missing genotype":
            counts["rem_snps"] += n
        elif m.group(3) == "Hardy-Weinberg exact":
            counts["rem_hwe"] += n
        else:
            counts["rem_maf"] += n
    return counts


def format_report(counts, rem_name):
    """Return the LaTeX summary for ``counts`` as produced by parse_plink_log().

    ``rem_name`` is the file name inserted into the ``url`` macro.
    """
    text = REPORT_TEMPLATE % (counts["nonautosome"], counts["rem_snps"],
                              counts["rem_inds"], rem_name,
                              counts["rem_maf"], counts["rem_hwe"])
    return text.replace("*-",chr(92)).replace("##",chr(36))


def main(argv):
    """Read the log named by argv[1] and print the summary to stdout.

    argv[2] is the name of the file listing removed individuals.
    """
    log_name, rem_name = argv[1], argv[2]
    with open(log_name) as f:
        counts = parse_plink_log(f)
    print(format_report(counts, rem_name))


if __name__ == "__main__":
    main(sys.argv)

0 commit comments

Comments
 (0)