@@ -333,9 +333,14 @@ def ensembl_to_bed_features(
333
333
written to the parent directory of "annotation_file"
334
334
using "features" as stem, and bed as extension.
335
335
336
- NOTE: "extended features" are used to validate consistency
337
- with the caller: "conventional features" are extracted
338
- on the fly, but intron and intergenic require special treatment.
336
+ NOTE: To merge features "gene-wise", GTF is first formatted
337
+ into bed-like format with name in the first field, sorted,
338
+ merged, then fields are re-ordered into standard bed-like
339
+ order and sorted. The equivalent command would be e.g.
340
+ awk 'OFS="\t " {print $4, $2, $3, $1, $5, $6, $7, $8}' exon.tmp |
341
+ sort -k1,1 -k2,2n | bedtools merge -i - -s -c 4,5,6,7,8 -o distinct |
342
+ awk 'OFS="\t " {print $4, $2, $3, $1, $5, $6, $7, $8}' |
343
+ sort -k1,1 -k2,2n > exon.bed.
339
344
340
345
:param annotation_file: Path to annotation file.
341
346
The format is implicitely assumed to be GTF.
@@ -349,32 +354,71 @@ def ensembl_to_bed_features(
349
354
parent = annotation_file .parent
350
355
annotation_bedtool = pybedtools .BedTool (annotation_file .as_posix ()).sort ()
351
356
357
+ def _get_gtf_attrs_by_name (feature ):
358
+ line = [
359
+ feature .attrs ["gene_name" ]
360
+ if "gene_name" in feature .attrs
361
+ else feature .attrs ["gene_id" ],
362
+ feature .start ,
363
+ feature .end ,
364
+ feature .chrom ,
365
+ feature .score ,
366
+ feature .strand ,
367
+ feature .attrs ["gene_id" ],
368
+ feature .attrs ["gene_biotype" ],
369
+ ]
370
+ return pybedtools .cbedtools .create_interval_from_list (line )
371
+
372
+ def _get_fields_by_name (feature ):
373
+ line = [
374
+ feature .name ,
375
+ feature .start ,
376
+ feature .end ,
377
+ feature .chrom ,
378
+ feature .score ,
379
+ feature .strand ,
380
+ feature .fields [6 ],
381
+ feature .fields [7 ],
382
+ ]
383
+ return pybedtools .cbedtools .create_interval_from_list (line )
384
+
385
+ def _annotation_to_stream (feature , function = _get_gtf_attrs_by_name ):
386
+ return annotation_bedtool .filter (lambda a : a .fields [2 ] == feature ).each (
387
+ function
388
+ )
389
+
352
390
logger .info (f"Preparing annotation and writing to { parent } ..." )
353
391
354
392
for feature in features ["conventional" ]:
355
393
logger .debug (f"Writing { feature } ..." )
356
394
357
395
file_name = Path (parent , f"{ feature } .bed" ).as_posix ()
358
396
_ = (
359
- self ._annotation_to_stream (annotation_bedtool , feature )
397
+ _annotation_to_stream (feature )
398
+ .sort ()
360
399
.merge (s = True , c = [4 , 5 , 6 , 7 , 8 ], o = "distinct" )
400
+ .each (_get_fields_by_name )
401
+ .sort ()
361
402
.moveto (file_name )
362
403
)
363
404
364
405
file_name = Path (parent , "exon.bed" ).as_posix ()
365
- exons = pybedtools .BedTool (file_name )
406
+ exons = pybedtools .BedTool (file_name ). each ( _get_fields_by_name ). sort ()
366
407
file_name = self ._check_feature ("intron" , features , parent )
367
- # why is the sort order lost after subtract?
368
408
_ = (
369
- self ._annotation_to_stream (annotation_bedtool , "gene" )
409
+ _annotation_to_stream ("gene" )
410
+ .sort ()
370
411
.subtract (exons , s = True , sorted = True )
371
412
.sort ()
372
413
.merge (s = True , c = [4 , 5 , 6 , 7 , 8 ], o = "distinct" )
373
- ).moveto (file_name )
414
+ .each (_get_fields_by_name )
415
+ .sort ()
416
+ .moveto (file_name )
417
+ )
374
418
375
419
file_name = self ._check_feature ("intergenic" , features , parent )
376
420
_ = (
377
- self . _annotation_to_stream (annotation_bedtool , "gene" )
421
+ _annotation_to_stream ("gene" , function = _get_gtf_attrs )
378
422
.complement (g = chrom_file .as_posix ())
379
423
.moveto (file_name )
380
424
)
@@ -559,12 +603,6 @@ def getfasta(
559
603
sequence = bedtool .sequence (fi = fasta , s = is_strand )
560
604
return sequence .seqfn
561
605
562
- @staticmethod
563
- def _annotation_to_stream (annotation_bedtool , feature ):
564
- return annotation_bedtool .filter (lambda a : a .fields [2 ] == feature ).each (
565
- _get_gtf_attrs
566
- )
567
-
568
606
@staticmethod
569
607
def _check_feature (feature , features , parent ):
570
608
if feature not in features ["extended" ]:
0 commit comments