|
# Sequence names (GFF column 1) whose CDS features are kept; features on any
# other sequence are ignored.
_cds_chromosomes = {f'SLmic1.0_chr{n}' for n in range(1, 13)}

# Tokenise every line of the annotation on whitespace. ``str.split()`` with no
# argument also discards the trailing newline.
with open('SLmic1.0_gene_models.gff') as fp:
    data = [line.split() for line in fp]

# cds_dict maps chromosome name -> list of [gene_id, start, end] CDS records.
# A chromosome that appears in the file gets a key even when it has no CDS
# rows, and keys keep their order of first appearance.
cds_dict = {}
for fields in data:
    # Blank lines and short header lines (e.g. "##gff-version 3") have fewer
    # than three tokens and previously raised IndexError; skip them together
    # with rows on sequences we do not care about.
    if len(fields) < 3 or fields[0] not in _cds_chromosomes:
        continue
    records = cds_dict.setdefault(fields[0], [])
    if fields[2] == 'CDS':
        # The identifier is characters 3..12 of the last whitespace-separated
        # token -- presumably the text following an "ID=" prefix in the
        # attributes column; TODO confirm against the GFF.
        # Coordinates: the file is 1-based, so start is shifted to 0-based.
        # The end is left as-is so that [start, end) works as a Python slice.
        records.append([fields[-1][3:13], int(fields[3]) - 1, int(fields[4])])
| 26 | + |
from Bio import SeqIO

# Output file format: (sequence), (chr), (location), (+/-), (gene)
chromo_list = [f'SLmic1.0_chr{n}' for n in range(1, 13)]

# cleavage_dict maps chromosome name -> list of candidate sites, each stored
# as [cleavage_position, window_start, 23-nt window, strand].
cleavage_dict = {name: [] for name in chromo_list}

for seq_record in SeqIO.parse("GCA_012431665.1_SLYMIC_genomic.fa", "fasta"):
    print(seq_record.id)

    # Only the twelve chromosome records are scanned; the membership test is
    # done once per record instead of once per base.
    if seq_record.id not in cleavage_dict:
        continue

    seq = str(seq_record.seq.upper())
    sites = cleavage_dict[seq_record.id]

    # '+' strand: "GG" at i, i+1 closes a 23-nt window that starts at i-21.
    # The recorded cleavage position is i-4.
    for i in range(21, len(seq) - 1):
        if seq[i] == 'G' and seq[i + 1] == 'G':
            sites.append([i - 4, i - 21, seq[i:i + 2] and seq[i - 21:i + 2], '+'])

    # '-' strand: "CC" at i, i+1 opens a 23-nt window seq[i:i+23].
    # The recorded cleavage position is i+5.
    # The last index with a full window is len(seq)-23, so the range stops at
    # len(seq)-22; the previous bound of len(seq)-24 skipped the final two
    # windows of every chromosome.
    # NOTE(review): the window is stored as plus-strand text, not
    # reverse-complemented -- confirm this is what downstream consumers expect.
    for i in range(len(seq) - 22):
        if seq[i] == 'C' and seq[i + 1] == 'C':
            sites.append([i + 5, i, seq[i:i + 23], '-'])
| 51 | + |
| 52 | + |
# Write every candidate site whose cleavage position falls inside a CDS.
# Output line format: (sequence), (chr), (location), (+/-), (gene)
# The context manager guarantees the file is closed even if a lookup or write
# fails part-way through.
with open("first.txt", 'w') as f:
    f.write('# (sequence), (chr), (location), (+/-), (gene)\n')
    for chrom, sites in cleavage_dict.items():
        print(chrom)
        # A chromosome with no entry in the annotation simply produces no
        # output rows instead of raising KeyError.
        for gene_id, cds_start, cds_end in cds_dict.get(chrom, []):
            for cut_pos, window_start, window_seq, strand in sites:
                # CDS intervals are half-open [cds_start, cds_end).
                if cds_start <= cut_pos < cds_end:
                    f.write(f'({window_seq}), ({chrom}), ({window_start}), ({strand}), ({gene_id})\n')
| 63 | + |
0 commit comments