|
| 1 | + |
| 2 | +from __future__ import print_function |
| 3 | + |
| 4 | +import pandas as pd |
| 5 | +from Bio import SeqIO |
| 6 | +import glob |
| 7 | +import re |
| 8 | +import os |
| 9 | +import sys |
| 10 | +import gzip |
| 11 | +import distance |
| 12 | +import argparse |
| 13 | + |
| 14 | + |
| 15 | +TAB=chr(9) |
| 16 | +EOL=chr(10) |
| 17 | + |
| 18 | +def parseArguments(): |
| 19 | + parser=argparse.ArgumentParser() |
| 20 | + parser.add_argument('reference_genome_dir', type=str, metavar='referencegenomedir'), |
| 21 | + parser.add_argument('strand_report', type=str, metavar='strand_report'), |
| 22 | + parser.add_argument('chip_manifest', type=str, metavar='chip_manifest'), |
| 23 | + parser.add_argument('output', type=str, metavar='output',help="output report"), |
| 24 | + parser.add_argument('--chrom-only',type=str,default=False,dest="chrom_only") |
| 25 | + parser.add_argument('--seg-len', type=int, dest='seg_len', metavar='seg_len',\ |
| 26 | + default = 25, help="how much matches"), |
| 27 | + args = parser.parse_args() |
| 28 | + return args |
| 29 | + |
| 30 | + |
| 31 | +def getRef(path,build): |
| 32 | + match = os.path.join(path,str(build),"genomic","*fa.gz") |
| 33 | + all_files = glob.glob(match) |
| 34 | + genome = {} |
| 35 | + for fname in all_files: |
| 36 | + m = re.search(".*.chromosome\.(\w+)\.fa.gz",fname) |
| 37 | + if not m: |
| 38 | + sys.exit("Illegal file "+fname) |
| 39 | + chrom = m.group(1) |
| 40 | + if chrom not in chroms: continue |
| 41 | + print(chrom,end="\t") |
| 42 | + seq = SeqIO.read(gzip.open(fname,"rt"),"fasta") |
| 43 | + genome[chrom]=seq |
| 44 | + print() |
| 45 | + return genome |
| 46 | + |
| 47 | + |
| 48 | +def getData(directory, strand_fn, manifest_fn): |
| 49 | + genome = [None]*40 |
| 50 | + |
| 51 | + for build in [36,37]: |
| 52 | + print("Reading in build ",build) |
| 53 | + genome[build] = getRef(directory,build) |
| 54 | + |
| 55 | + mf = pd.read_csv(manifest_fn,delimiter=",",dtype={"Chr":str}, skiprows=7) |
| 56 | + strand = \ |
| 57 | + pd.read_csv(strand_fn,delim_whitespace=True,dtype={"Chr":str}, comment="#") |
| 58 | + return (genome, mf, strand) |
| 59 | + |
| 60 | + |
| 61 | +comp = {'A':'T','C':'G','G':'C','T':'A'} |
| 62 | + |
| 63 | + |
| 64 | +def rc(seq): |
| 65 | + m = "" |
| 66 | + for x in seq: |
| 67 | + m = comp.get(x,x)+m |
| 68 | + return m |
| 69 | + |
| 70 | + |
| 71 | +def match(ref_left,probe_left,ref_right,probe_right,RC,dist): |
| 72 | + if RC: |
| 73 | + tmp = probe_left |
| 74 | + probe_left = rc(probe_right) |
| 75 | + probe_right = rc(tmp) |
| 76 | + pl_len = len(probe_left) |
| 77 | + pl_rgt = len(probe_right) |
| 78 | + pleft = probe_left[-min(seg_len,pl_len):] |
| 79 | + pright = probe_right[:min(seg_len,pl_rgt)] |
| 80 | + rleft = ref_left[-min(seg_len,pl_len):].seq |
| 81 | + rright = ref_right[:min(seg_len,pl_rgt)].seq |
| 82 | + try: |
| 83 | + d1 = dist(rleft,pleft) |
| 84 | + d2 = dist(rright,pright) |
| 85 | + except ValueError: |
| 86 | + print("\nString length mismatch error,i,",snp["SNP_Name"]) |
| 87 | + print("LEFT: ",rleft,len(rleft),pleft,len(pleft),"RC=",RC) |
| 88 | + print("RIGHT: ",rright,len(rright),pright,len(pright)) |
| 89 | + return 50 |
| 90 | + return d1+d2 |
| 91 | + |
| 92 | + |
| 93 | +def warn(warnf,chrom_num,coord,snp,direc,score): |
| 94 | + if score>4: |
| 95 | + warnf.write(TAB.join(map(str,["WARN:",chrom_num,coord,snp,direc,score]))+EOL) |
| 96 | + |
| 97 | +def alignSNP(warnf,chromosome,coord,snp,the_snp,probe_pre,probe_pst,dist=distance.hamming): |
| 98 | + seq = chromosome[coord-seg_len:coord+seg_len+1] |
| 99 | + base = seq.seq[seg_len] |
| 100 | + ref_pre = seq[0:seg_len] |
| 101 | + ref_post = seq[seg_len+1:] |
| 102 | + fwd = match(ref_pre,probe_pre,ref_post,probe_pst,False,dist) |
| 103 | + rev = 2000 |
| 104 | + score = 1000 |
| 105 | + align = "-" |
| 106 | + if fwd<10: |
| 107 | + align="fwd" |
| 108 | + score=fwd |
| 109 | + warn(warnf,chrom_num,coord+1,snp["SNP_Name"],"+",fwd) |
| 110 | + else: |
| 111 | + rev=match(ref_pre,probe_pre,ref_post,probe_pst,True,dist) |
| 112 | + if rev<10: |
| 113 | + align="rev" |
| 114 | + score=rev |
| 115 | + warn(warnf,chrom_num,coord+1,snp["SNP_Name"],"-",rev) |
| 116 | + if the_snp in [ "[D/I]", "[I/D]"]: |
| 117 | + align="fwd" |
| 118 | + base ="I" |
| 119 | + return align, base, score, fwd, rev |
| 120 | + |
| 121 | + |
| 122 | +args = parseArguments() |
| 123 | +if args.chrom_only: |
| 124 | + chroms = [ args.chrom_only ] |
| 125 | +else: |
| 126 | + chroms = list(map(str, range(1,23)))+['X','Y','MT'] |
| 127 | + |
| 128 | +(genome,mf,strand) = getData(args.reference_genome_dir,args.strand_report,args.chip_manifest) |
| 129 | + |
| 130 | + |
| 131 | + |
| 132 | + |
| 133 | +g = open(args.output+".ref","w") |
| 134 | +warnf = open(args.output+".wrn","w") |
| 135 | +errf = open(args.output+".err","w") |
| 136 | +seg_len = args.seg_len |
| 137 | +for i,snp in strand.iterrows(): |
| 138 | + align = "fwd" |
| 139 | + base = snp["Top_AlleleA"] # if can't do better |
| 140 | + score = -1 |
| 141 | + coord = snp["Coord"]-1 |
| 142 | + build = int(snp['Build']) |
| 143 | + snp_name = snp["SNP_Name"] |
| 144 | + the_snp = mf.loc[i]["SNP"] |
| 145 | + chrom_num = snp["Chr"] |
| 146 | + if genome[build] == None: |
| 147 | + output = TAB.join(map(str, [snp_name,chrom_num,str(coord+1),base, build,"build err"]))+EOL |
| 148 | + errf.write(output) |
| 149 | + output = TAB.join(map(str, [snp["SNP_Name"],chrom_num,coord+1,base,align]))+EOL |
| 150 | + g.write(output) |
| 151 | + continue |
| 152 | + if chrom_num not in genome[build]: |
| 153 | + output = TAB.join(map(str, [snp_name,chrom_num,str(coord+1),base, build,"chrom_num_err"]))+EOL |
| 154 | + errf.write(output) |
| 155 | + output = TAB.join(map(str,[snp_name,chrom_num,coord+1,base,align]))+EOL |
| 156 | + g.write(output) |
| 157 | + continue |
| 158 | + top_seq = snp["Top_Seq"].upper() |
| 159 | + top_pre = top_seq.index("[") |
| 160 | + top_pst = top_seq.index("]")+1 |
| 161 | + alleles = top_seq[top_pre+1:top_pst-1].split("/") |
| 162 | + probe_pre = top_seq[:top_pre] |
| 163 | + probe_pst = top_seq[top_pst:] |
| 164 | + chromosome = genome[build]["X" if chrom_num == "XY" else chrom_num] |
| 165 | + align, base, score, fwd, rev = alignSNP(warnf,chromosome,coord,snp,the_snp,probe_pre,probe_pst) |
| 166 | + if score==1000: |
| 167 | + align, base, score, fwd,rev = alignSNP(warnf,chromosome,coord,snp,the_snp,probe_pre,probe_pst,distance.levenshtein) |
| 168 | + if score<10: |
| 169 | + warn(warnf,chrom_num,coord+1,snp["SNP_Name"],"BM",min(fwd,rev)) |
| 170 | + else: |
| 171 | + output = TAB.join(map(str, [snp_name,chrom_num,str(coord+1),base, build,"non_align",fwd,rev]))+EOL |
| 172 | + errf.write(output) |
| 173 | + output = TAB.join(map(str,[snp_name,chrom_num,coord+1,base,align]))+EOL |
| 174 | + g.write(output) |
| 175 | +g.close() |
| 176 | +errf.close() |
| 177 | +warnf.close() |
| 178 | + |
0 commit comments