fix: [#256] cancer and multiassay support pull_sheets.py

bihealth · Feb 13, 2025 · 64390ee · 64390ee
1 parent 22b8d6b
commit 64390ee
Show file tree

Hide file tree

Showing 2 changed files with 332 additions and 327 deletions.
diff --git a/src/cubi_tk/snappy/parse_sample_sheet.py b/src/cubi_tk/snappy/parse_sample_sheet.py
@@ -1,6 +1,14 @@
 """Common code to parse BioMedSheets"""
+import typing
 from logzero import logger
 
+from ..isa_support import (
+    IsaNodeVisitor,
+    first_value,
+)
+from biomedsheets import io_tsv
+from biomedsheets.naming import NAMING_ONLY_SECONDARY_ID
+import attr
 
 class ParseSampleSheet:
     """Class contains methods to parse BioMedSheet"""
@@ -256,3 +264,323 @@ def yield_donor(
                     # the sample sheet is sorted.
                     continue
             yield donor
+
+
+class SampleSheetBuilder(IsaNodeVisitor):
+    """Base ISA node visitor that gathers the records needed for a sample sheet.
+
+    Subclasses populate ``sources`` and ``samples`` in their visit hooks and
+    turn them into sheet lines in ``generateSheet``.
+    """
+
+    def __init__(self):
+        #: Source records, keyed by sample name.
+        self.sources = {}
+        #: Sample records, keyed by sample name.
+        self.samples = {}
+        #: The process visited most recently (``None`` until one is seen).
+        self.prev_process = None
+
+    def on_visit_material(self, material, node_path, study=None, assay=None):
+        """Hand the material visit over to the parent visitor unchanged."""
+        super().on_visit_material(material, node_path, study, assay)
+
+    def on_visit_process(self, process, node_path, study=None, assay=None):
+        """Hand the process over to the parent's generic ``on_visit_node`` hook."""
+        super().on_visit_node(process, study, assay)
+
+    def generateSheet(self):
+        """Emit a debug message; subclasses extend this to build the sheet lines."""
+        logger.debug("building sheet")
+
+    def get_libtype(self, splitted_lib, library):
+        """Return the library type that the last element of ``splitted_lib`` starts with.
+
+        :param splitted_lib: sequence of name parts; only the last one is inspected.
+        :param library: object whose ``name`` is used in the error message.
+        :raises Exception: if the last part starts with none of the known types.
+        """
+        last_part = splitted_lib[-1]
+        for known_type in ("WGS", "WES", "Panel_seq", "mRNA_seq", "RNA_seq"):
+            if last_part.startswith(known_type):
+                return known_type
+        raise Exception("Cannot infer library type from %s" % library.name)
+
+
+# Germline-specific classes, templates and constants
+
+#: Header lines of the to-be-generated germline sheet: metadata, custom field
+#: declarations and the tab-separated column names of the ``[Data]`` section.
+HEADER_TPL_GERMLINE = (
+    "[Metadata]",
+    "schema\tgermline_variants",
+    "schema_version\tv1",
+    "",
+    "[Custom Fields]",
+    "key\tannotatedEntity\tdocs\ttype\tminimum\tmaximum\tunit\tchoices\tpattern",
+    "batchNo\tbioEntity\tBatch No.\tinteger\t.\t.\t.\t.\t.",
+    "familyId\tbioEntity\tFamily\tstring\t.\t.\t.\t.\t.",
+    "projectUuid\tbioEntity\tProject UUID\tstring\t.\t.\t.\t.\t.",
+    "libraryKit\tngsLibrary\tEnrichment kit\tstring\t.\t.\t.\t.\t.",
+    "",
+    "[Data]",
+    (
+        "familyId\tpatientName\tfatherName\tmotherName\tsex\tisAffected\tlibraryType\tfolderName"
+        "\tbatchNo\thpoTerms\tprojectUuid\tseqPlatform\tlibraryKit"
+    ),
+)
+
+#: Mapping from (lower-case) ISA-tab sex to sample sheet sex; ``None`` maps to
+#: the placeholder ".".
+#: NOTE(review): the name is spelled "GERMLNE"; kept because other code refers to it.
+MAPPING_SEX_GERMLNE = {"female": "F", "male": "M", "unknown": "U", None: "."}
+
+#: Mapping from (lower-case) disease status to sample sheet ``isAffected``
+#: value; carriers are reported as affected, ``None`` maps to the placeholder ".".
+MAPPING_STATUS_GERMLINE = {"affected": "Y", "carrier": "Y", "unaffected": "N", "unknown": ".", None: "."}
+
+@attr.s(frozen=True, auto_attribs=True)
+class SourceGermline:
+    """Immutable record describing the source (donor) behind one germline sample.
+
+    The optional values are taken from the source's characteristics or comments
+    and are ``None`` when neither provides them.
+    """
+
+    #: Value of "Family", or ``None``.
+    family: typing.Optional[str]
+    #: Name of the source material.
+    source_name: str
+    #: Value of "Batch", or ``None``.
+    #: NOTE(review): annotated as ``int`` but filled from ``value[0]`` — confirm the type.
+    batch_no: int
+    #: Value of "Father", or ``None``.
+    father: str
+    #: Value of "Mother", or ``None``.
+    mother: str
+    #: Value of "Sex", or ``None``.
+    sex: str
+    #: Value of "Disease status", or ``None``.
+    affected: str
+    #: Name of the sample material derived from this source.
+    sample_name: str
+
+
+@attr.s(frozen=True, auto_attribs=True)
+class SampleGermline:
+    """Immutable record describing one library of a germline sample."""
+
+    #: Source record the sample belongs to.
+    source: SourceGermline
+    #: Name of the library material.
+    library_name: str
+    #: Library type as returned by ``get_libtype`` (e.g. "WGS", "WES").
+    library_type: str
+    #: "Folder name" value of the node path, or the library name when absent.
+    folder_name: str
+    #: "Platform" value of the node path.
+    seq_platform: str
+    #: "Library Kit" value of the node path.
+    library_kit: str
+
+class SampleSheetBuilderGermline(SampleSheetBuilder):
+    """Collect germline sources and samples from ISA nodes and render a germline sheet.
+
+    Call ``set_germline_specific_values`` before ``generateSheet`` so that the
+    library type filter, the project UUID and the batch range are known.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.config = None
+        self.project_uuid = ""
+        self.first_batch = 0
+        self.last_batch = 0
+
+    def set_germline_specific_values(self, config, project_uuid, first_batch, last_batch):
+        """Store the settings that ``generateSheet`` reads.
+
+        :param config: object whose ``library_types`` attribute restricts which
+            library types are written (a falsy value disables the filter).
+        :param project_uuid: value written to the ``projectUuid`` column.
+        :param first_batch: first batch handed to ``yield_sample_names``.
+        :param last_batch: last batch handed to ``yield_sample_names``.
+        """
+        self.config = config
+        self.project_uuid = project_uuid
+        self.first_batch = first_batch
+        self.last_batch = last_batch
+
+    def on_visit_material(self, material, node_path, study=None, assay=None):
+        """Record a source for study-level samples and a sample for libraries."""
+        super().on_visit_material(material, node_path, study, assay)
+        material_path = [x for x in node_path if hasattr(x, "type")]
+        source = material_path[0]
+        if material.type == "Sample Name" and assay is None:
+            characteristics = {c.name: c for c in source.characteristics}
+            comments = {c.name: c for c in source.comments}
+
+            def lookup(name):
+                # A characteristic takes precedence over a comment of the same name.
+                found = characteristics.get(name, comments.get(name))
+                return found.value[0] if found else None
+
+            self.sources[material.name] = SourceGermline(
+                family=lookup("Family"),
+                source_name=source.name,
+                batch_no=lookup("Batch"),
+                father=lookup("Father"),
+                mother=lookup("Mother"),
+                sex=lookup("Sex"),
+                affected=lookup("Disease status"),
+                sample_name=material.name,
+            )
+        elif material.type == "Library Name" or (
+            material.type == "Extract Name"
+            # No process may have been visited yet; do not dereference ``None``.
+            and self.prev_process is not None
+            and self.prev_process.protocol_ref.startswith("Library construction")
+        ):
+            library = material
+            sample = material_path[0]
+
+            splitted_lib = library.name.split("-")
+            library_type = self.get_libtype(splitted_lib, library)
+
+            # Fall back to the library name when no folder name is given.
+            folder_name = first_value("Folder name", node_path) or library.name
+            self.samples[sample.name] = SampleGermline(
+                source=self.sources[sample.name],
+                library_name=library.name,
+                library_type=library_type,
+                folder_name=folder_name,
+                seq_platform=first_value("Platform", node_path),
+                library_kit=first_value("Library Kit", node_path),
+            )
+
+    def on_visit_process(self, process, node_path, study=None, assay=None):
+        """Remember the process and update the platform on sequencing processes."""
+        # ``node_path`` must be forwarded; omitting it shifts ``study``/``assay``.
+        super().on_visit_process(process, node_path, study, assay)
+        self.prev_process = process
+        material_path = [x for x in node_path if hasattr(x, "type")]
+        sample = material_path[0]
+        if process.protocol_ref.startswith("Nucleic acid sequencing"):
+            self.samples[sample.name] = attr.evolve(
+                self.samples[sample.name], seq_platform=first_value("Platform", node_path)
+            )
+
+    def generateSheet(self):
+        """Return the germline sample sheet as a list of lines.
+
+        One data row is produced per collected source, unless its sample has a
+        library type outside ``config.library_types``. Rows whose patient name
+        is not yielded by ``yield_sample_names`` for the configured batch range
+        are kept but prefixed with "#". The list ends with an empty string.
+        """
+        super().generateSheet()
+        # Without a config (or with empty ``library_types``) nothing is filtered.
+        library_types = getattr(self.config, "library_types", None)
+        result = []
+        for sample_name, source in self.sources.items():
+            sample = self.samples.get(sample_name)
+            if library_types and sample and sample.library_type not in library_types:
+                continue
+            # Both mappings carry a ``None`` key for missing values, so only
+            # lower-case values that are actually present.
+            sex = source.sex.lower() if source.sex else None
+            affected = source.affected.lower() if source.affected else None
+            row = [
+                source.family or "FAM",
+                source.source_name or ".",
+                source.father or "0",
+                source.mother or "0",
+                MAPPING_SEX_GERMLNE[sex],
+                MAPPING_STATUS_GERMLINE[affected],
+                (sample.library_type if sample else None) or ".",
+                (sample.folder_name if sample else None) or ".",
+                "0" if source.batch_no is None else source.batch_no,
+                ".",
+                str(self.project_uuid),
+                (sample.seq_platform if sample else None) or ".",
+                (sample.library_kit if sample else None) or ".",
+            ]
+            # ``str()`` keeps non-string cells (e.g. a numeric batch) from breaking.
+            result.append("\t".join(str(c).strip() for c in row))
+
+        sheet = io_tsv.read_germline_tsv_sheet(
+            list(HEADER_TPL_GERMLINE) + result, naming_scheme=NAMING_ONLY_SECONDARY_ID
+        )
+        parser = ParseSampleSheet()
+        samples_in_batch = list(parser.yield_sample_names(sheet, self.first_batch, self.last_batch))
+        # Column 1 is ``patientName``; rows outside the batch range are commented out.
+        body = [
+            line if line.split("\t")[1] in samples_in_batch else "#" + line for line in result
+        ]
+        return list(HEADER_TPL_GERMLINE) + body + [""]
+
+# Cancer-specific classes, templates and constants
+
+#: Header lines of the to-be-generated cancer sheet: metadata, custom field
+#: declarations and the tab-separated column names of the ``[Data]`` section.
+HEADER_TPL_CANCER= (
+    "[Metadata]",
+    "schema\tcancer_matched",
+    "schema_version\tv1",
+    "",
+    "[Custom Fields]",
+    "key\tannotatedEntity\tdocs\ttype\tminimum\tmaximum\tunit\tchoices\tpattern",
+    "extractionType\ttestSample\textraction type\tstring\t.\t.\t.\t.\t.",
+    "libraryKit\tngsLibrary\texome enrichment kit\tstring\t.\t.\t.\t.\t.",
+    "",
+    "[Data]",
+    (
+        "patientName\tsampleName\textractionType\tlibraryType\tfolderName\tisTumor\tlibraryKit"
+    ),
+)
+
+@attr.s(frozen=True, auto_attribs=True)
+class SourceCancer:
+    """Immutable record describing the source (donor) behind one cancer sample."""
+
+    #: Name of the source material.
+    source_name: str
+    #: Name of the sample material derived from this source.
+    sample_name: str
+    #: Value of "Is tumor" from the sample's characteristics or the source's
+    #: comments; ``None`` when neither provides it.
+    is_tumor: str
+
+
+@attr.s(frozen=True, auto_attribs=True)
+class SampleCancer:
+    """Immutable record describing one library of a cancer sample."""
+
+    #: Source record the sample belongs to.
+    source: SourceCancer
+    #: "DNA" or "RNA", inferred from the second-to-last part of the library name.
+    extraction_type: str
+    #: "T" or "N", inferred from the third-to-last part of the library name.
+    sample_name_biomed :str
+    #: Name of the library material.
+    library_name: str
+    #: Library type as returned by ``get_libtype`` (e.g. "WGS", "WES").
+    library_type: str
+    #: "Folder name" value of the node path, or the library name when absent.
+    folder_name: str
+    #: "Library Kit" value of the node path.
+    library_kit: str
+
+class SampleSheetBuilderCancer(SampleSheetBuilder):
+    """Collect cancer sources and samples from ISA nodes and render a cancer sheet."""
+
+    def on_visit_material(self, material, node_path, study=None, assay=None):
+        """Record a source for study-level samples and a sample for libraries.
+
+        :raises Exception: if the library type, extraction type or biomed sample
+            name cannot be inferred from the "-"-separated library name.
+        """
+        super().on_visit_material(material, node_path, study, assay)
+        material_path = [x for x in node_path if hasattr(x, "type")]
+        source = material_path[0]
+        if material.type == "Sample Name" and assay is None:
+            # "Is tumor" is read from the sample's characteristics first and
+            # from the source's comments second.
+            characteristics_material = {c.name: c for c in material.characteristics}
+            comments = {c.name: c for c in source.comments}
+            tumor = characteristics_material.get("Is tumor", comments.get("Is tumor"))
+            self.sources[material.name] = SourceCancer(
+                source_name=source.name,
+                sample_name=material.name,
+                is_tumor=tumor.value[0] if tumor else None,
+            )
+        elif material.type == "Library Name" or (
+            material.type == "Extract Name"
+            # No process may have been visited yet; do not dereference ``None``.
+            and self.prev_process is not None
+            and self.prev_process.protocol_ref.startswith("Library construction")
+        ):
+            library = material
+            sample = material_path[0]
+            splitted_lib = library.name.split("-")
+            library_type = self.get_libtype(splitted_lib, library)
+
+            # The second-to-last name part carries the extraction type.
+            extr_type_string = splitted_lib[-2]
+            if extr_type_string.startswith("DNA"):
+                extraction_type = "DNA"
+            elif extr_type_string.startswith("RNA"):
+                extraction_type = "RNA"
+            else:
+                raise Exception("Cannot infer extraction type from %s" % library.name)
+
+            # The third-to-last name part tells tumor ("T") from normal ("N").
+            samp_name_string = splitted_lib[-3]
+            if samp_name_string.startswith("T"):
+                sample_name_biomed = "T"
+            elif samp_name_string.startswith("N"):
+                sample_name_biomed = "N"
+            else:
+                raise Exception("Cannot infer biomed sample name from %s" % library.name)
+
+            # Fall back to the library name when no folder name is given.
+            folder_name = first_value("Folder name", node_path) or library.name
+
+            self.samples[sample.name] = SampleCancer(
+                source=self.sources[sample.name],
+                sample_name_biomed=sample_name_biomed,
+                extraction_type=extraction_type,
+                library_name=library.name,
+                folder_name=folder_name,
+                library_type=library_type,
+                library_kit=first_value("Library Kit", node_path),
+            )
+
+    def on_visit_process(self, process, node_path, study=None, assay=None):
+        """Remember the visited process so the next material visit can inspect it."""
+        # ``node_path`` must be forwarded; omitting it shifts ``study``/``assay``.
+        super().on_visit_process(process, node_path, study, assay)
+        self.prev_process = process
+
+    def generateSheet(self):
+        """Return the cancer sample sheet as a list of lines.
+
+        One data row is produced per collected sample; missing values are
+        written as ".". The list ends with an empty string.
+        """
+        super().generateSheet()
+        result = []
+        for sample_name, sample in self.samples.items():
+            # Every sample carries its own source; use it if the lookup misses.
+            source = self.sources.get(sample_name, sample.source)
+            row = [
+                source.source_name or ".",
+                sample.sample_name_biomed or ".",
+                sample.extraction_type or ".",
+                sample.library_type or ".",
+                sample.folder_name or ".",
+                "." if source.is_tumor is None else source.is_tumor,
+                sample.library_kit or ".",
+            ]
+            # ``str()`` keeps non-string cells from breaking ``strip``.
+            result.append("\t".join(str(c).strip() for c in row))
+        return list(HEADER_TPL_CANCER) + result + [""]