forked from schneiderthomas/OTA-pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclipPrimers_manifest_speed.py
819 lines (703 loc) · 38.2 KB
/
clipPrimers_manifest_speed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
#!/usr/bin/python
import pysam
import csv
import Bio.Seq
import regex
import sys, getopt
# http://www.tutorialspoint.com/python/python_command_line_arguments.htm
# Returns "Start" if You need to look for the primer at the "start" of the read, returns "End" if you need to look for the primer at the end of the read
# parameters are passed by reference by default
#FOR REFERENCE NOT USING ANYMORE
def where_to_look_for_primer(selectedread):
if selectedread.is_reverse:
if selectedread.is_read1:
currentPrimerList = end_pos
else:
currentPrimerList = end_neg
else:
if selectedread.is_read1:
currentPrimerList = start_neg
else:
currentPrimerList = start_pos
return currentPrimerList
#FOR REFERENCE NOT USING ANYMORE
def getmatch(selectedread, primerList, target_portion_read):
count_end = len(primerList)-1
count = 0
endreadmatchlength = None
returnlist = None
foundPrimerLength = None
while (count <= count_end):
# CHECKING IF THE READ IS REVERSED, IF IT IS REVERSED THEN ENDING SEQUENCE CONTAINS THE PRIMER
# IF THE READ IS NOT REVERSED THEN THE BEGINNED SEQUENCE CONTAINS THE PRIMER
currentPrimer = primerList[count]
if selectedread.is_reverse:
endPrimer = currentPrimer[::-1]
endtestprimer = "(" + endPrimer + ")" + "{e<=2}"
end_reverse_read = target_portion_read[::-1]
match = regex.match(endtestprimer, end_reverse_read)
else:
# searching the beginning, with allowable error of 2 or less
testprimer = "(" + currentPrimer + ")" + "{e<=2}"
match = regex.match(testprimer, target_portion_read)
# index_of_primer_start = read.query_sequence.find(currentPrimer,0,40)
# if index_of_primer != -1:
if match != None:
count = count_end + 1
endreadmatchlength = match.span()[1]
foundPrimerLength = len(currentPrimer)
returnlist = [endreadmatchlength, foundPrimerLength]
else:
if count != count_end:
count = count + 1
else:
# THIS MEANS THERE IS NO MATCHES AND THEREFORE UNSURE WHAT PRIMER USED AND SO CUTTING OFF THE LENGTH OF THE LARGEST PRIMER WHICH IS THIRTY
count = count + 1
return returnlist
def getmatchall(readforward, readreverse, customPrimerList, usePosStrand, target_portion_read_forward, target_portion_read_reverse):
# forward and reverse fasta files should be the same length
currentChr=readforward.tid
currentPos=readforward.pos
count_end = len(customPrimerList)-1
count = 0
endreadmatchlengthforward = None
endreadmatchlengthreverse = None
returnlist = None
foundPrimerLengthforward = None
foundPrimerLengthreverse = None
while (count <= count_end):
if customPrimerList[count][0] == '-' and usePosStrand:
count= count +1
continue
elif int(customPrimerList[count][1]) != currentChr:
count= count +1
continue
elif abs(int(customPrimerList[count][3])-currentPos) > 20000:#note the tile is based on exon size, the ie the start and stop from the manifest cooresponds to exon size, the largest exon in the human genome is 18200 bp
count= count + 1
continue
# CHECKING IF THE READ IS REVERSED, IF IT IS REVERSED THEN ENDING SEQUENCE CONTAINS THE PRIMER
# IF THE READ IS NOT REVERSED THEN THE BEGINNED SEQUENCE CONTAINS THE PRIMER
currentPrimerforward = customPrimerList[count][4]
currentPrimerreverse = customPrimerList[count][5]
# if selectedread.is_reverse:
endPrimerreverse = currentPrimerreverse[::-1]
testprimerreverse = "(" + endPrimerreverse + ")" + "{e<=2}"
end_reverse_read = target_portion_read_reverse[::-1]
matchreverse = regex.match(testprimerreverse, end_reverse_read)
# else:
# searching the beginning, with allowable error of 2 or less
testprimerforward = "(" + currentPrimerforward + ")" + "{e<=2}"
matchforward = regex.match(testprimerforward, target_portion_read_forward)
# index_of_primer_start = read.query_sequence.find(currentPrimer,0,40)
# if index_of_primer != -1:
if matchreverse != None and matchforward != None:
count = count_end + 1
endreadmatchlengthforward = matchforward.span()[1]
endreadmatchlengthreverse = matchreverse.span()[1]
foundPrimerLengthforward = len(currentPrimerforward)
foundPrimerLengthreverse = len(currentPrimerreverse)
returnlist = [endreadmatchlengthforward, foundPrimerLengthforward, endreadmatchlengthreverse, foundPrimerLengthreverse]
elif matchreverse != None:
#print "Reverse matched but forward did not"
#print "The read forward is below"
#print readforward
#print target_portion_read_forward
#print "The read reverse is below"
#print readreverse
#print target_portion_read_reverse
#print "The fasta file forward is below"
#print primerListforward.filename
#print "The fasta file reverse is below"
#print primerListreverse.filename
count = count + 1
elif matchforward != None:
#print "Forward matched but reverse did not"
#print "The read forward is below"
#print readforward
#print target_portion_read_forward
#print "The read reverse is below"
#print readreverse
#print target_portion_read_reverse
#print "The fasta file forward is below"
#print primerListforward.filename
#print "The fasta file reverse is below"
#print primerListreverse.filename
count = count + 1
else:
count = count + 1
return returnlist
def change_cigar_either(booleanIsReverse,originalcigar, endreadmatchlength, numberlist, numberlistint, letterlist, expectedEndReadMatchLength, readLength, read):
# because changing read.position
global forwardread
global reverseread
oldAlignmentStart = 0
# print sum(numberlistint)
#if read.qname != "NS500796:7:H77VTAFXX:1:11107:3753:3856":
# return False
# print 'Found NS500796:7:H77VTAFXX:1:11107:3753:3856'
# print 'The endreadmatchlength is ' + str(endreadmatchlength)
if booleanIsReverse:
numberlist = numberlist[::-1]
numberlistint = numberlistint[::-1]
letterlist = letterlist[::-1]
else:
oldAlignmentStart = forwardread.qstart
# #THERE MAY BE A SITUATION IF THERE IS TANDEM DUPLICATION shortly after a probe and the duplicated entry gets a match even though THOUGH THERE ARE TOO MANY DELETIONS, NEED TO GET RID OF THIS SCENARIO BECAUSE ONLY EXPECTING 2 deletions at most
# IF IT IS NOT LESS THEN OR EQUAL TO endreadmatchlength
indexchanging = 0
deletionPresent = False
# numberToSubtract is dependent on whether or not there is a deletion present in the primer area, need to adjust position by the number of deletions present
# (shouldn't be more than 2 since only 2 deletions are allowed, if more then read should be thrown out)
# numberToSubtract and InsertionCount, only works if in the primer
DeletionCount = 0
DeletionCountExceptEnd=0
InsertionCount = 0
insertionAtEnd = 0
booleanDeletionAtEnd = False
booleanInsertionAtEnd = False
runningtotalBeforeDeletion=0
if not sum(numberlistint) <= endreadmatchlength:
runningTotal = 0
for index in range(len(numberlist)):
#
if endreadmatchlength <= numberlistint[index] + runningTotal:
indexchanging = index
runningTotal = runningTotal + numberlistint[index]
#if read.qname == "PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer":
# print 'Found PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer'
# print 'The endreadmatchlength is ' + str(endreadmatchlength)
# print 'The current letter is ' + letterlist[index]
# print 'The runningTotal is ' + str(runningTotal)
# print 'The DeletionCount is ' + str(DeletionCount)
if letterlist[index] == 'D' or letterlist[index] == 'H' or letterlist[index] == 'N' or letterlist[index] == 'P':
#print 'I am trying to modify a read with a deletion at the end of the primer'
#print 'The read is' + str(read.qname)
deletionPresent = True
booleanDeletionAtEnd = True
# print 'The number to Subtract is ' + str(DeletionCount)
#print 'The runningTotal is ' + str(runningTotal)
#print 'The endereadmatchlength is ' + str(endreadmatchlength)
DeletionCountExceptEnd = DeletionCount
DeletionCount = DeletionCount + (runningTotal - endreadmatchlength)+1
runningtotalBeforeDeletion = runningTotal - numberlistint[index]
if letterlist[index] == 'I':
# IF INSERTION IS AT END DON'T NEED TO MODIFY POSITION, SO DON'T ADD TO InsertionCount
# ie if 24M3I94M, and primer is 26 so want 26S, need 26S3I92M; it may not be perfectly aligned right, pathologist will need to adjust
insertionAtEnd = endreadmatchlength - (runningTotal - numberlistint[index])
#print 'There is an insertion at the end and the insertionAtEnd is' + str(insertionAtEnd)
#print 'The read name is ' + str(read.qname)
#print 'The original cigar is ' + str(originalcigar)
#print 'This is a reverse read ' + str(booleanIsReverse)
if DeletionCount + InsertionCount + insertionAtEnd >= 3:
# there are too many deletions or insertions in the primer region, can't accept
# ALSO CANT ACCEPT IF INSERTION AT END OF PRIMER LIKELY ERROR
#if read.qname == "M01382:52:000000000-AAECF:1:2102:21256:8889":
# print 'throwing out M01382:52:000000000-AAECF:1:2102:21256:8889'
# print 'InsertionCount is ' + str(InsertionCount)
return True
break
else:
if letterlist[index] == 'D' or letterlist[index] == 'H' or letterlist[index] == 'N' or letterlist[index] == 'P':
deletionPresent = True
DeletionCount = DeletionCount + numberlistint[index]
#if read.qname == "PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer":
# print 'Found PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer where it should not be'
# print 'The index is' + str(index)
# print 'The endreadmatchlength is' + str(endreadmatchlength)
elif letterlist[index] == 'I':
InsertionCount = InsertionCount + numberlistint[index]
runningTotal = runningTotal + numberlistint[index]
else:
return
#if read.qname == "PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer":
# print 'Found PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer 2'
# print 'The DeletionCount is ' + str(DeletionCount)
# check if deletion is present in the primer (if so need to adjust cigar so the cigar length is what is expected, need to substract from not the primer)
# ie if 100M1D21M, need it to be 94M27S not 95M27S
# if expected endreadmatchlength do not match that means there was either a 1 or 2 bp deletion or 1 or 2 bp insertion
# need to subtract or add on to the last
# IF INSERTION NEED TO MOVE POSITION BACK INSTEAD OF FORWARD
#print "The indexchanging is " + str(indexchanging)
#print "The runningTotal is " + str(runningTotal)
#print "The indexchanging letter is " + str(numberlistint[indexchanging])
#print "The current letter is " + str(letterlist[indexchanging])
#print "The endreadmatchlength is " + str(endreadmatchlength)
#TO MAKE THINGS EASIER
#numberToSubtractTotal = DeletionCount + InsertionCount + insertionAtEnd
TotalInsertionsPresent = InsertionCount + insertionAtEnd
# #PLEASENOTE: INSERTIONATEND DOES NOT CHANGE POSITION
newcigar = ""
newcigarnumberlist = []
newcigarletterlist = []
needtomodifynumber = False
numberToSubtractTotal = 0
for index in range(len(numberlist)):
if index == 0:
# newcigar=newcigar+str(endreadmatchlength)+'S'
# newcigarnumberlist.append(str(endreadmatchlength-numberToSubtract))
if booleanDeletionAtEnd == True:
newcigarnumberlist.append(str(runningtotalBeforeDeletion + InsertionCount -DeletionCountExceptEnd))
else:
newcigarnumberlist.append(str(endreadmatchlength + InsertionCount -DeletionCount))
newcigarletterlist.append('S')
if indexchanging == 0 and not (letterlist[indexchanging] == 'S' or letterlist[indexchanging] == 'H'):
# newcigar=newcigar+str(runningTotal-endreadmatchlength)+letterlist[indexchanging]
##TODO MAKE SUBFUNCTION AS CALLED TWICE
newnumber= runningTotal - (endreadmatchlength + InsertionCount)
if newnumber > 0:
newcigarnumberlist.append(str(newnumber))
newcigarletterlist.append(letterlist[indexchanging])
elif newnumber == 0:
continue
else:
# if read.qname == "PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer":
# print 'Found PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer 3'
numberToSubtractTotal = abs(newnumber)
needtomodifynumber= True
###END TODO
# else just keep the position the same
else:
if indexchanging == 0:
# IF THE CURRENT S or H is larger then the primer then get rid of the primer length added to the newcigarnumberlist
del newcigarnumberlist[len(newcigarnumberlist) - 1]
newcigarnumberlist.append(str(numberlistint[0]))
elif index < indexchanging:
continue
elif index == indexchanging:
# ##DO I NEED ENDREADMATCHLENGTH?? WHY NOT USE EXPECTED??? -> NEED IN CASE DELETION
if letterlist[indexchanging] == 'D' or letterlist[indexchanging] == 'H':
#print 'I am trying to modify a read with a deletion at the end of the primer'
#print 'The read is ' + str(read.qname)
needtomodifynumber = True
booleanDeletionAtEnd = True
newnumber= runningTotal - (endreadmatchlength + InsertionCount)
#print 'The newnumber is ' + str(newnumber)
if newnumber > 0:
newcigarnumberlist.append(str(newnumber))
newcigarletterlist.append(letterlist[indexchanging])
else:
numberToSubtractTotal = abs(newnumber)
# DO NOTHING, THE DELETION LETTER WILL BE REMOVES (ONLY TWO SCENARIOS REMEMBER CAN'T HAVE MORE THAN 2 INS OR DEL
elif letterlist[indexchanging] == 'I':
#NOTE: WE ARE NOT MODIFYING IF THERE IS AN INSERTION AT THE END
needtomodifynumber = True
booleanInsertionAtEnd = True
numberToSubtractTotal = TotalInsertionsPresent
newcigarnumberlist.append(numberlist[indexchanging])
newcigarletterlist.append(letterlist[indexchanging])
else:
# if read.qname == "PTENex6.chr10.89711875_Del_right_after_End_of_Primer":
# print 'Found PTENex6.chr10.89711875_Del_right_after_End_of_Primer'
# print 'The numberToSubtract is ' + str(numberToSubtractTotal)
# print 'The number of the index being changed is ' + str(numberlist[indexchanging])
# print 'The letter of the index being changed is ' + str(letterlist[indexchanging])
# print 'The answer to the calculation is ' + str(runningTotal - endreadmatchlength - numberToSubtractTotal)
##TODO MAKE SUBFUNCTION AS CALLED TWICE
#newnumber= runningTotal - (endreadmatchlength + InsertionCount - (DeletionCount*2))
newnumber= runningTotal - (endreadmatchlength + InsertionCount)
if newnumber > 0:
newcigarnumberlist.append(str(newnumber))
newcigarletterlist.append(letterlist[indexchanging])
elif newnumber == 0:
continue
else:
# if read.qname == "PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer":
# print 'Found PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer 4'
# print 'The number of the index being changed is ' + str(numberlist[indexchanging])
# print 'The letter of the index being changed is ' + str(letterlist[indexchanging])
# print 'runningTotal is ' + str(runningTotal)
# print 'numberToSubstract is' + str(numberToSubtractTotal)
# print 'endreadmatchlength is ' + str(endreadmatchlength)
numberToSubtractTotal = abs(newnumber)
needtomodifynumber= True
##END TODO
# newcigarnumberlist.append(str(runningTotal-endreadmatchlength))
# newcigarletterlist.append(letterlist[indexchanging])
else:
# newcigar=newcigar+numberlist[index]+letterlist[index]
# if read.qname == "PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer":
# print 'Found PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer 1'
# print 'The InsertionCount is' + str(InsertionCount)
# print 'The DeletionCount is' + str(DeletionCount)
# print 'The numberToSubtract is ' + str(numberToSubtractTotal)
# print 'Is there a deletion at the end' + str(booleanDeletionAtEnd)
if needtomodifynumber and letterlist[index] != 'D' and letterlist[index] !='I':
#if read.qname == "PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer":
# print 'Found PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer 2FFFFF'
# print 'The InsertionCount is' + str(InsertionCount)
# print 'The DeletionCount is' + str(DeletionCount)
#print 'The numberToSubtract is ' + str(numberToSubtractTotal)
#print 'Is there a deletion at the end' + str(booleanDeletionAtEnd)
#print 'The current number is ' + str(numberlistint[index])
#print 'The current letter is ' + str(letterlist[index])
newnumber = numberlistint[index] - numberToSubtractTotal
if newnumber > 0:
newcigarnumberlist.append(str(newnumber))
newcigarletterlist.append(letterlist[index])
needtomodifynumber = False
else:
numberToSubtractTotal = numberToSubtractTotal - numberlistint[index] + 1
newcigarnumberlist.append(str(1))
newcigarletterlist.append(letterlist[index])
elif needtomodifynumber and letterlist[index] == 'D' and InsertionCount >0:
#THIS IS A SPECIFIC SITUATION AND NEED TO ADD TO DELETIONCOUNT IN ORDER TO PROPERLY POSITION THE READ IF IT IS FORWARD
DeletionCount = DeletionCount + 1
#if read.qname == "PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer":
# print 'Found PTENex6.chr10.89711875_Del_at_End_of_Primer_plus_Ins_in_primer 2kl;jl'
# print 'The InsertionCount is' + str(InsertionCount)
# print 'The DeletionCount is' + str(DeletionCount)
# print 'The numberToSubtract is ' + str(numberToSubtractTotal)
# print 'Is there a deletion at the end' + str(booleanDeletionAtEnd)
# print 'The current number is ' + str(numberlistint[index])
# print 'The current letter is ' + str(letterlist[index])
newnumber = numberlistint[index] - numberToSubtractTotal
if newnumber > 0:
newcigarnumberlist.append(str(newnumber))
newcigarletterlist.append(letterlist[index])
needtomodifynumber = False
elif newnumber == 0:
#KNOW THAT THIS IS A CERTAIN SITUATION IN WHICH WILL WANT TO REDUCE SIZE OF CLIP
newcigarnumberlist[len(newcigarletterlist)-1]=str(int(newcigarnumberlist[len(newcigarletterlist)-1])-1)
needtomodifynumber = False
else:
numberToSubtractTotal = numberToSubtractTotal - numberlistint[index] + 1
newcigarnumberlist.append(str(1))
newcigarletterlist.append(letterlist[index])
else:
newcigarnumberlist.append(numberlist[index])
newcigarletterlist.append(letterlist[index])
if booleanIsReverse:
newcigarnumberlist = newcigarnumberlist[::-1]
newcigarletterlist = newcigarletterlist[::-1]
for index in range(len(newcigarletterlist)):
###TODO GET RID OF IF FOUND NOT TO HAVE ERRORS
##if int(newcigarnumberlist[index]) <= 0:
### continue
newcigar = newcigar + newcigarnumberlist[index] + newcigarletterlist[index]
# Sum of lengths of the M/I/S/=/X operations shall equal the length of SEQ
if booleanIsReverse:
reverseread.cigarstring = newcigar
if not booleanIsReverse:
forwardread.cigarstring = newcigar
# need to adjust position if a deletion was present in the primer area so adding + numbertosubtract
forwardread.pos = forwardread.pos + forwardread.qstart - oldAlignmentStart + DeletionCount - InsertionCount
return False
def main(argv):
global inputfile
global outputfile
global start_pos
global end_pos
global end_neg
global start_neg
global library_type
global reverseread
global forwardread
global current
inputfile = ''
outputfile = ''
library_type = ''
try:
opts, args = getopt.getopt(argv, "hi:o:t:m:", ["ifile=", "ofile=", "librarytype=", "manifest="])
except getopt.GetoptError:
print 'clipPrimers.py -i <inputfile> -o <outputfile> -t <library_type> -m <Illumina manifest file>'
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print 'clipPrimers.py -i <inputfile> -o <outputfile> -t <library_type> -m <Illumina manifest file>'
sys.exit()
elif opt in ("-i", "--ifile"):
inputfile = arg
elif opt in ("-o", "--ofile"):
outputfile = arg
elif opt in ("-t", "--librarytype"):
library_type = arg
elif opt in ("-m", "--manifest"):
manifest = arg
print 'The manifest file is "', manifest
print 'Input file is "', inputfile
print 'Output file "', outputfile
#start pos is postive strand ULSO sequences
start_pos=[]
#start neg is negative strand DLSO sequences, note these are reverse complement of sequences in manifest
start_neg=[]
#end pos is positive strand DLSO sequences
end_pos=[]
#end neg is negative strand ULSO sequences, note these are reverse complement of sequences in manifest
end_neg=[]
header=[]
#list where [strand,chr,startPosition,endPosition,currentStartPrimer,currentEndPrimer]
customPrimerList=[]
rowcount=0
file_read = csv.reader(open(manifest, 'r'), delimiter='\t');
for row in file_read:
if rowcount==1:
#this is a header row
header=row
rowcount=rowcount+1
continue
elif row[0]=='[Targets]':
break
elif rowcount==0:
rowcount=rowcount+1
continue
currentDLSO=row[header.index('DLSO Sequence')]
currentULSO=row[header.index('ULSO Sequence')]
startPosition=row[header.index('Start Position')]
endPosition=row[header.index('End Position')]
strand=row[header.index('Strand')]
chr=row[header.index('Chromosome')][3:]
currentStartPrimer=''
currentEndPrimer=''
if strand=='-':
currentStartPrimer=Bio.Seq.reverse_complement(currentDLSO)
currentEndPrimer=Bio.Seq.reverse_complement(currentULSO)
start_neg.append(currentStartPrimer)
end_neg.append(currentEndPrimer)
else:
currentStartPrimer=currentULSO
currentEndPrimer=currentDLSO
start_pos.append(currentStartPrimer)
end_pos.append(currentEndPrimer)
currentlist=[strand,chr,startPosition,endPosition,currentStartPrimer,currentEndPrimer]
customPrimerList.append((currentlist[:]))
rowcount=rowcount+1
samfile = pysam.AlignmentFile(inputfile)
print "The length of start_pos is" + str(len(start_pos))
print "The length of start_neg is" + str(len(start_neg))
print "The length of start_pos is" + str(len(end_pos))
print "The length of end_neg is" + str(len(end_neg))
print "The first primer of start_pos is" + str(start_pos[0])
print "The first primer of start_neg is" + str(start_neg[0])
print "The first primer of end_pos is" + str(end_pos[0])
print "The first primer of end_neg is" + str(end_neg[0])
# template copies the header from the other alignment file
newreads = pysam.AlignmentFile(outputfile, "wb", template=samfile)
readnum = 0
read1 = None
read2 = None
for read in samfile: # TMS KEY CHANGE (GET RID OF FETCH)
# have to process reads1 and reads2
# the read.rnext==read.rname is another double check they are on the same chromosome
if read.is_read1 and read.rnext == read.rname: # Store read1 of the pair and iterate the for loop.
read1 = read
continue
elif read.is_read2 and read.rnext == read.rname: # Store read2. Should now have read1 and read2 from a properly-paired read. Do a bit of QC-ing of the read-pair.
read2 = read
if read1 is None:
read1 = None
read2 = None
print "WARNING READ1 EMPTY WHILE READ2 WAS NOT, IF SEE ERROR MULTIPLE TIMES CHECK IF SORTED BY QUERYNAME"
#print "continue at elif read.is_read2 and read.rnext == read.rname:"
continue
elif read2.qname != read1.qname:
read1 = None
read2 = None
#print "continue at elif read2.qname != read1.qname"
continue
if read1 is None or read2 is None:
#print "continue at if read1 is None or read2 is None"
continue
# if read1.tid != read2.tid then that means the mate is mapped to a different chromosome, so getting rid of
#print read1.cigarstring
if (read1.query_length <= 35) or (read2.query_length <= 35) or (not isinstance(read.cigarstring, str)) or (read1.tid != read2.tid):
# newreads.write(read)
#print "continue at if (read1.query_length <= 35) or (read2.query_length <= 35) or (not isinstance(read.cigarstring, str)) or (read1.tid != read2.tid):"
continue
# ASSUMING READ1 AND READ2 are opposite in read.is_reverse
#if read1.qname != "NS500796:7:H77VTAFXX:1:11107:3753:3856" or read1.qname != "NS500796:7:H77VTAFXX:1:11107:3753:3856":
# continue
p = regex.compile(ur'\d+')
p2 = regex.compile(ur'[MIDNSHP=X]')
# would rather know which read is reverse and which is forward, will find out later if one is read1 or read2
#if read1reverse
usePosStrand=True
if read1.is_reverse:
reverseread = read1
forwardread = read2
currentPrimerListF=start_pos
currentPrimerListR=end_pos
usePosStrand=True
else:
reverseread = read2
forwardread = read1
currentPrimerListF=start_neg
currentPrimerListR=end_neg
usePosStrand=False
#if forwardread.cigarstring == '121M' and reverseread.cigarstring == '121M':
# print "The read is 121M on both sides, this read should be written."
# print "The read name is : " + read1.qname
numberlistR = regex.findall(p, reverseread.cigarstring)
letterlistR = regex.findall(p2, reverseread.cigarstring)
numberlistF = regex.findall(p, forwardread.cigarstring)
letterlistF = regex.findall(p2, forwardread.cigarstring)
# http://stackoverflow.com/questions/7368789/convert-all-strings-in-a-list-to-int
numberlistintR = map(int, numberlistR)
numberlistintF = map(int, numberlistF)
if letterlistR[len(letterlistR) - 1] == 'H':
if numberlistintR[len(numberlistintR) - 1] > 2:
# then it clipped the end and hence it won't match a primer, filtering out
# newreads.write(read)
#print "continue at if numberlistintR[len(numberlistintR) - 1] > 2:"
#print "the reverseread is :" + str(reverseread.cigarstring)
continue
if letterlistF[0] == 'H':
if numberlistintF[0] > 2:
# then it clipped the start and hence it won't match a primer, filtering out
#print "continue at if numberlistintF[0] > 2:"
#print "the forwardread is :" + str(forwardread.cigarstring)
continue
# if the primer is already soft-clipped, likely probe mismatch, should get rid of, poor quality
if letterlistR[len(letterlistR) - 1] == 'S':
if numberlistintR[len(numberlistintR) - 1] > 10:
#print "continue at if numberlistintR[len(numberlistintR) - 1] > 10:"
#print "the reverseread is :" + str(reverseread.cigarstring)
continue
if letterlistF[0] == 'S':
if numberlistintF[0] > 10:
#print "continue at if numberlistintF[0] > 10:"
#print "the forwardread is :" + str(forwardread.cigarstring)
continue
#throw out if heavily soft clipped at end
if letterlistR[0] == 'S':
if numberlistintR[0] > 50:
#print "continue at if numberlistintR[0] > 50:"
#print "the reverseread is :" + str(reverseread.cigarstring)
continue
if letterlistF[len(numberlistintF) - 1] == 'S':
if numberlistintF[len(numberlistintF) - 1] > 50:
#print "continue at if numberlistintF[0] > 10:"
#print "the forwardread is :" + str(forwardread.cigarstring)
continue
# get read to check on
# get the last 35 reads
target_portion_read_R = reverseread.query_sequence[reverseread.query_length - 35:]
# need read start
target_portion_read_F = forwardread.query_sequence[0:35]
# currentSequence = read.query_sequence
# returns a match object
# print read.query_sequence
# print read.query_alignment_end
# print read.next_reference_start
# print read.is_reverse
#currentPrimerListR = ''
#currentPrimerListR = where_to_look_for_primer(reverseread)
#currentPrimerListF = ''
#currentPrimerListF = where_to_look_for_primer(forwardread)
#[currentPrimerListF,currentPrimerListR] = where_to_look_for_primer_both(forwardread)
# First only matched forward then reverse, to increase sensitivity the forward and reverse match have to have same index (ie can't match primer of PTEN forward and TP53 reverse)
matchAllList = getmatchall(forwardread, reverseread, customPrimerList, usePosStrand, target_portion_read_F, target_portion_read_R)
# throwing out reads if the primer did not match, likely low quality reads
if matchAllList is None:
#print "continue at if matchAllList is None:"
#print "The target portion read forward is below"
#print target_portion_read_F
continue
else:
matchF = matchAllList[0]
matchFexpectedLength = matchAllList[1]
matchR = matchAllList[2]
matchRexpectedLength = matchAllList[3]
originalreversecigar = reverseread.cigarstring
originalforwardcigar = forwardread.cigarstring
# cigar_end_error=change_cigar_end(originalreversecigar,matchR-1,numberlistR,numberlistintR,letterlistR,matchRexpectedLength-1,reverseread.query_length)
#cigar_end_error = change_cigar_end(originalreversecigar, matchRexpectedLength - 1, numberlistR, numberlistintR, letterlistR, matchRexpectedLength - 1, reverseread.query_length, read)
cigar_end_error = change_cigar_either(True,originalreversecigar, matchRexpectedLength - 1, numberlistR, numberlistintR, letterlistR, matchRexpectedLength - 1, reverseread.query_length, read)
# cigar_start_error=change_cigar_start(originalforwardcigar,matchF-1,numberlistF,numberlistintF,letterlistF,matchFexpectedLength-1,forwardread.query_length)
#cigar_start_error = change_cigar_start(originalforwardcigar, matchFexpectedLength - 1, numberlistF, numberlistintF, letterlistF, matchFexpectedLength - 1, forwardread.query_length, read)
cigar_start_error = change_cigar_either(False,originalforwardcigar, matchFexpectedLength - 1, numberlistF, numberlistintF, letterlistF, matchFexpectedLength - 1, forwardread.query_length, read)
if cigar_end_error == True and cigar_start_error == True:
#print "There was an error in making both cigars"
read1 = None
read2 = None
#print "continue at if cigar_end_error == True and cigar_start_error == True:"
continue
if cigar_start_error == True:
#print "There was an error in making the start cigar"
read1 = None
read2 = None
#print "continue at if cigar_start_error == True"
continue
if cigar_end_error == True:
#print "There was an error in making the end cigar"
read1 = None
read2 = None
#print "continue at if cigar_end_error == True"
continue
if reverseread.cigarstring == "121M" or forwardread.cigarstring == "121M":
#print "WARNING A READ DIDN'T MATCH A PRIMER EVEN THOUGH IT SHOULD HAVE"
# should have match a primer because I already had a check for this
#print "continue at if reverseread.cigarstring == 121M or forwardread.cigarstring == 121M"
continue
#sys.exit()
# final double checking of CIGAR invalids
#if read.qname == "PTENex6.chr10.89711875_Del_right_after_End_of_Primer":
# print 'found PTENex6.chr10.89711875_Del_right_after_End_of_Primer'
# print 'The orignal reverse cigar is' + str(originalreversecigar)
# print 'The orignal forward cigar is' + str(originalforwardcigar)
# print 'The new reverse cigar is' + str(reverseread.cigarstring)
#print 'The new forward cigar is' + str(forwardread.cigarstring)
#print 'The target portion read R is' + str(target_portion_read_R)
#print 'The target portion read F is' + str(target_portion_read_F)
#print 'The match forward length is' + str(matchF)
#print 'The match forward expected length is' + str(matchFexpectedLength)
#print 'The match forward length is' + str(matchR)
#print 'The match forward expected length is' + str(matchRexpectedLength)
RealOnly = regex.compile(ur'[MIDN]')
RealLetterlistR = regex.findall(RealOnly, reverseread.cigarstring)
RealLetterlistF = regex.findall(RealOnly, forwardread.cigarstring)
p = regex.compile(ur'\d+')
p2 = regex.compile(ur'[MIDNSHP=X]')
# according to the same file specifications
# Sum of lengths of the M/I/S/=/X operations shall equal the length of SEQ
numberlistR = regex.findall(p, reverseread.cigarstring)
letterlistR = regex.findall(p2, reverseread.cigarstring)
numberlistF = regex.findall(p, forwardread.cigarstring)
letterlistF = regex.findall(p2, forwardread.cigarstring)
# http://stackoverflow.com/questions/7368789/convert-all-strings-in-a-list-to-int
numberlistintR = map(int, numberlistR)
numberlistintF = map(int, numberlistF)
calCigarLengthR = 0
calCigarLengthF = 0
for eachindex in range(0, len(letterlistR)):
if (letterlistR[eachindex] != 'D') and (letterlistR[eachindex] != 'H') and (letterlistR[eachindex] != 'N') and (letterlistR[eachindex] != 'P'):
calCigarLengthR = numberlistintR[eachindex] + calCigarLengthR
for eachindex in range(0, len(letterlistF)):
if (letterlistF[eachindex] != 'D') and (letterlistF[eachindex] != 'H') and (letterlistF[eachindex] != 'N') and (letterlistF[eachindex] != 'P'):
calCigarLengthF = numberlistintF[eachindex] + calCigarLengthF
# if letterlist has D,H,N,P; then don't write it down
if (calCigarLengthR != reverseread.query_length) or (calCigarLengthF != forwardread.query_length):
print "Cigar length did not match"
print reverseread.qname
print forwardread.query_length
print "the length of letterlistR is: " + str(len(letterlistR))
print "the length of the reverse matched endreadmatchlength is:" + str(matchR)
print "the length of the reverse expected matched primer is:" + str(matchRexpectedLength)
print "the length of the forward matched endreadmatchlength is:" + str(matchF)
print "the length of the forward expected primer is:" + str(matchFexpectedLength)
print "the new calculated reverse Cigar Length is: " + str(calCigarLengthR)
print "the reverse cigar length should be " + str(reverseread.query_length)
print "the original reverse cigar is: " + str(originalreversecigar)
print "the reverse cigarstring is: " + str(reverseread.cigarstring)
print "the new forward Cigar Length is: " + str(calCigarLengthF)
print "the forward cigar length should be " + str(forwardread.query_length)
print "the original forward cigar is: " + str(originalforwardcigar)
print "the forward cigarstring is: " + str(forwardread.cigarstring)
print "continue at if (calCigarLengthR != reverseread.query_length) or (calCigarLengthF != forwardread.query_length)"
continue
if (not RealLetterlistF) or (not RealLetterlistR):
#print "continue at if (not RealLetterlistF) or (not RealLetterlistR)"
continue
# print "THE ENDING CIGAR STRING IS : " + read.cigarstring
reverseread.mpos = forwardread.pos
#print "The read is " + str(reverseread.qname)
if reverseread.is_read1:
newreads.write(reverseread)
newreads.write(forwardread)
else:
newreads.write(forwardread)
newreads.write(reverseread)
# END FOR READ IN SAMFILE
newreads.close()
samfile.close()
if __name__ == "__main__":
main(sys.argv[1:])