Skip to content

Commit 080d968

Browse files
committed
FIX #139
1 parent 2f69fdd commit 080d968

File tree

1 file changed

+53
-15
lines changed

1 file changed

+53
-15
lines changed

Diff for: server/src/scimodom/services/bedtools.py

+53-15
Original file line numberDiff line numberDiff line change
@@ -333,9 +333,14 @@ def ensembl_to_bed_features(
333333
written to the parent directory of "annotation_file"
334334
using "features" as stem, and bed as extension.
335335
336-
NOTE: "extended features" are used to validate consistency
337-
with the caller: "conventional features" are extracted
338-
on the fly, but intron and intergenic require special treatment.
336+
NOTE: To merge features "gene-wise", GTF is first formatted
337+
into bed-like format with name in the first field, sorted,
338+
merged, then fields are re-ordered into standard bed-like
339+
order and sorted. The equivalent command would be e.g.
340+
awk 'OFS="\t" {print $4, $2, $3, $1, $5, $6, $7, $8}' exon.tmp |
341+
sort -k1,1 -k2,2n | bedtools merge -i - -s -c 4,5,6,7,8 -o distinct |
342+
awk 'OFS="\t" {print $4, $2, $3, $1, $5, $6, $7, $8}' |
343+
sort -k1,1 -k2,2n > exon.bed.
339344
340345
:param annotation_file: Path to annotation file.
341346
The format is implicitely assumed to be GTF.
@@ -349,32 +354,71 @@ def ensembl_to_bed_features(
349354
parent = annotation_file.parent
350355
annotation_bedtool = pybedtools.BedTool(annotation_file.as_posix()).sort()
351356

357+
def _get_gtf_attrs_by_name(feature):
358+
line = [
359+
feature.attrs["gene_name"]
360+
if "gene_name" in feature.attrs
361+
else feature.attrs["gene_id"],
362+
feature.start,
363+
feature.end,
364+
feature.chrom,
365+
feature.score,
366+
feature.strand,
367+
feature.attrs["gene_id"],
368+
feature.attrs["gene_biotype"],
369+
]
370+
return pybedtools.cbedtools.create_interval_from_list(line)
371+
372+
def _get_fields_by_name(feature):
373+
line = [
374+
feature.name,
375+
feature.start,
376+
feature.end,
377+
feature.chrom,
378+
feature.score,
379+
feature.strand,
380+
feature.fields[6],
381+
feature.fields[7],
382+
]
383+
return pybedtools.cbedtools.create_interval_from_list(line)
384+
385+
def _annotation_to_stream(feature, function=_get_gtf_attrs_by_name):
386+
return annotation_bedtool.filter(lambda a: a.fields[2] == feature).each(
387+
function
388+
)
389+
352390
logger.info(f"Preparing annotation and writing to {parent}...")
353391

354392
for feature in features["conventional"]:
355393
logger.debug(f"Writing {feature}...")
356394

357395
file_name = Path(parent, f"{feature}.bed").as_posix()
358396
_ = (
359-
self._annotation_to_stream(annotation_bedtool, feature)
397+
_annotation_to_stream(feature)
398+
.sort()
360399
.merge(s=True, c=[4, 5, 6, 7, 8], o="distinct")
400+
.each(_get_fields_by_name)
401+
.sort()
361402
.moveto(file_name)
362403
)
363404

364405
file_name = Path(parent, "exon.bed").as_posix()
365-
exons = pybedtools.BedTool(file_name)
406+
exons = pybedtools.BedTool(file_name).each(_get_fields_by_name).sort()
366407
file_name = self._check_feature("intron", features, parent)
367-
# why is the sort order lost after subtract?
368408
_ = (
369-
self._annotation_to_stream(annotation_bedtool, "gene")
409+
_annotation_to_stream("gene")
410+
.sort()
370411
.subtract(exons, s=True, sorted=True)
371412
.sort()
372413
.merge(s=True, c=[4, 5, 6, 7, 8], o="distinct")
373-
).moveto(file_name)
414+
.each(_get_fields_by_name)
415+
.sort()
416+
.moveto(file_name)
417+
)
374418

375419
file_name = self._check_feature("intergenic", features, parent)
376420
_ = (
377-
self._annotation_to_stream(annotation_bedtool, "gene")
421+
_annotation_to_stream("gene", function=_get_gtf_attrs)
378422
.complement(g=chrom_file.as_posix())
379423
.moveto(file_name)
380424
)
@@ -559,12 +603,6 @@ def getfasta(
559603
sequence = bedtool.sequence(fi=fasta, s=is_strand)
560604
return sequence.seqfn
561605

562-
@staticmethod
563-
def _annotation_to_stream(annotation_bedtool, feature):
564-
return annotation_bedtool.filter(lambda a: a.fields[2] == feature).each(
565-
_get_gtf_attrs
566-
)
567-
568606
@staticmethod
569607
def _check_feature(feature, features, parent):
570608
if feature not in features["extended"]:

0 commit comments

Comments
 (0)