diff --git a/README.md b/README.md index c9246bf..004d500 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ The library depends on the availability of the numpy package in the python insta > for (locus, genotype) in zip( names, genotypes ): >   sys.stdout.write( locus + "," + code2genotype[genotype] + "\n" ) -Also, see examples/gtc_final_report.py for an additional example of usage. +Also, see examples/gtc_final_report.py and examples/gtc_final_report_matrix.py for additional examples of usage. ## GTC File Format The specification for the GTC file format is provided in docs/GTC_File_Format_v5.pdf diff --git a/examples/gtc_custom_report.py b/examples/gtc_custom_report.py new file mode 100644 index 0000000..e3f61b0 --- /dev/null +++ b/examples/gtc_custom_report.py @@ -0,0 +1,50 @@ +import sys +import argparse +import os +from datetime import datetime +#import GenotypeCalls +from IlluminaBeadArrayFiles import GenotypeCalls + +delim = "\t" + +parser = argparse.ArgumentParser("Generate a custom report from a directory of GTC files") +parser.add_argument("gtc_directory", help="Directory containing GTC files") +parser.add_argument("output_file", help="Location to write report") + +args = parser.parse_args() + +if os.path.isfile(args.output_file): + sys.stderr.write("Output file already exists, please delete and re-run\n") + sys.exit(-1) + +with open(args.output_file, "w") as output_handle: + output_handle.write("[Header]\n") + output_handle.write(delim.join(["Processing Date", datetime.now().strftime("%m/%d/%Y %I:%M %p")])+ "\n") + samples = [] + for gtc_file in os.listdir(args.gtc_directory): + if gtc_file.lower().endswith(".gtc"): + samples.append(gtc_file) + index = 1 + output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n") + output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n") + output_handle.write("[Data]\n") + output_handle.write(delim.join(["Index","Sample ID", "Call Rate", "p05 Grn", "p50 Grn", "p90 Grn", "p05 Red", "p50 Red", \ + "p90 Red","p10 GC", "p50 GC"]) + "\n") + for gtc_file in samples: + sys.stderr.write("Processing " + gtc_file + "\n") + gtc_file = os.path.join(args.gtc_directory, gtc_file) + gtc = GenotypeCalls.GenotypeCalls(gtc_file) + call_rate = gtc.get_call_rate() + percentiles_grn = gtc.get_percentiles_x() + p05_grn = percentiles_grn[0] + p50_grn = percentiles_grn[1] + p90_grn = percentiles_grn[2] + percentiles_red = gtc.get_percentiles_y() + p05_red = percentiles_red[0] + p50_red = percentiles_red[1] + p90_red = percentiles_red[2] + p10_GC = gtc.get_gc10() + p50_GC = gtc.get_gc50() + output_handle.write(delim.join([str(index), os.path.basename(gtc_file)[:-4], str(call_rate), str(p05_grn), \ + str(p50_grn), str(p90_grn), str(p05_red), str(p50_red), str(p90_red), str(p10_GC), str(p50_GC)]) + "\n") + index +=1 diff --git a/examples/gtc_final_report.py b/examples/gtc_final_report.py index dd4174f..f88ce45 100644 --- a/examples/gtc_final_report.py +++ b/examples/gtc_final_report.py @@ -2,7 +2,7 @@ import argparse import os from datetime import datetime -from IlluminaBeadArrayFiles import GenotypeCalls, BeadPoolManifest, code2genotype +from IlluminaBeadArrayFiles import BeadPoolManifest, GenotypeCalls, code2genotype delim = "\t" @@ -18,7 +18,7 @@ sys.exit(-1) try: - manifest = BeadPoolManifest(args.manifest) + manifest = BeadPoolManifest.BeadPoolManifest(args.manifest) except: sys.stderr.write("Failed to read data from manifest\n") sys.exit(-1) @@ -39,15 +39,17 @@ output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n") output_handle.write("[Data]\n") - output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB", "Alleles - Plus", "Alleles - Forward"]) + "\n") + output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB", "Alleles - Plus", "Alleles - Forward", "Alleles - TOP", "Genotype Score"]) + "\n") for gtc_file in samples: sys.stderr.write("Processing " + gtc_file + "\n") gtc_file = os.path.join(args.gtc_directory, gtc_file) - gtc = GenotypeCalls(gtc_file) + gtc = GenotypeCalls.GenotypeCalls(gtc_file) genotypes = gtc.get_genotypes() plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands) forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands) + top_strand_genotypes = gtc.get_base_calls_TOP_strand(manifest.snps, manifest.ilmn_strands) + genotype_scores = gtc.get_genotype_scores() assert len(genotypes) == len(manifest.names) - for (name, chrom, map_info, genotype, ref_strand_genotype, source_strand_genotype) in zip(manifest.names, manifest.chroms, manifest.map_infos, genotypes, plus_strand_genotypes, forward_strand_genotypes): - output_handle.write(delim.join([name, os.path.basename(gtc_file)[:-4], chrom, str(map_info), code2genotype[genotype], ref_strand_genotype, source_strand_genotype]) + "\n") + for (name, chrom, map_info, genotype, ref_strand_genotype, source_strand_genotype, ilmn_strand_genotype, genotype_score) in zip(manifest.names, manifest.chroms, manifest.map_infos, genotypes, plus_strand_genotypes, forward_strand_genotypes, top_strand_genotypes, genotype_scores): + output_handle.write(delim.join([name, os.path.basename(gtc_file)[:-4], chrom, str(map_info), code2genotype[genotype], ref_strand_genotype, source_strand_genotype, ilmn_strand_genotype, str(genotype_score)]) + "\n") diff --git a/examples/gtc_final_report_matrix.py b/examples/gtc_final_report_matrix.py new file mode 100644 index 0000000..98533bc --- /dev/null +++ b/examples/gtc_final_report_matrix.py @@ -0,0 +1,147 @@ +import sys +import argparse +import os +from datetime import datetime +from IlluminaBeadArrayFiles import BeadPoolManifest, GenotypeCalls, code2genotype +from pandas import DataFrame +from numpy import around + +""" +This example will allow the user to create matrix report files identical to the ones exported from GenomeStudio +by chosing the genotype with or without the genotype scores. +""" + +def build_dict(matrix = None, samplename = None, names = None, genotypes = None, genotype_scores = None): + if genotype_scores is not None: + for (name, genotype, genotype_score) in zip(names, genotypes, genotype_scores): + if samplename in matrix: + matrix[samplename][name] = genotype + "|" + str(genotype_score) + else: + matrix[samplename] = {} + matrix[samplename][name] = genotype + "|" + str(genotype_score) + else: + for (name, genotype) in zip(names, genotypes): + if samplename in matrix: + matrix[samplename][name] = genotype + else: + matrix[samplename] = {} + matrix[samplename][name] = genotype + return matrix + +delim = "\t" +NUM_ARGUMENTS = 5 + +parser = argparse.ArgumentParser("Generate a final matrix report by specifying the report strand from a directory of GTC files") +parser.add_argument("manifest", help="BPM manifest file") +parser.add_argument("gtc_directory", help="Directory containing GTC files") +parser.add_argument("output_file", help="Location to write report") +parser.add_argument("--forward", action="store_true", default=False, help="python gtc_final_report_matrix.py --forward - print matrix with forward alleles") +parser.add_argument("--forward_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py --forward_GC - print matrix with forward alleles including genotype scores") +parser.add_argument("--top", action="store_true", default=False, help="python gtc_final_report_matrix.py --top - print matrix with top alleles") +parser.add_argument("--top_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py --top_GC - print matrix with top alleles including genotype scores") +parser.add_argument("--AB", action="store_true", default=False, help="python gtc_final_report_matrix.py --forward - print matrix with forward alleles") +parser.add_argument("--AB_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py --forward_GC - print matrix with forward alleles including genotype scores") +parser.add_argument("--plus", action="store_true", default=False, help="python gtc_final_report_matrix.py --top - print matrix with top alleles") +parser.add_argument("--plus_GC", action="store_true", default=False, help="python gtc_final_report_matrix.py --top_GC - print matrix with top alleles including genotype scores") + + +args = parser.parse_args() + +if len(sys.argv) != NUM_ARGUMENTS: + sys.stderr.write("For matrix report user needs to provide either forward or top strand parameter with or without genotype score, can only build one report!\n") + sys.exit(-1) + +if os.path.isfile(args.output_file): + sys.stderr.write("Output file already exists, please delete and re-run\n") + sys.exit(-1) + +try: + manifest = BeadPoolManifest.BeadPoolManifest(args.manifest) +except: + sys.stderr.write("Failed to read data from manifest\n") + sys.exit(-1) + +with open(args.output_file, "w") as output_handle: + output_handle.write("[Header]\n") + output_handle.write(delim.join(["Processing Date", datetime.now().strftime("%m/%d/%Y %I:%M %p")])+ "\n") + output_handle.write(delim.join(["Content", os.path.basename(args.manifest)]) + "\n") + output_handle.write(delim.join(["Num SNPs", str(len(manifest.names))]) + "\n") + output_handle.write(delim.join(["Total SNPs", str(len(manifest.names))]) + "\n") + + samples = [] + for gtc_file in os.listdir(args.gtc_directory): + if gtc_file.lower().endswith(".gtc"): + samples.append(gtc_file) + + output_handle.write(delim.join(["Num Samples", str(len(samples))]) + "\n") + output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n") + output_handle.write("[Data]\n") + + matrix_forward = {} + matrix_forward_GC = {} + matrix_top = {} + matrix_top_GC = {} + matrix_plus = {} + matrix_plus_GC = {} + matrix_AB = {} + matrix_AB_GC = {} + + for gtc_file in samples: + samplename = os.path.basename(gtc_file)[:-4] + sys.stderr.write("Processing " + gtc_file + "\n") + gtc_file = os.path.join(args.gtc_directory, gtc_file) + gtc = GenotypeCalls.GenotypeCalls(gtc_file) + genotypes = [code2genotype[genotype] for genotype in gtc.get_genotypes()] + assert len(genotypes) == len(manifest.names) + if args.plus or args.plus_GC: + plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands) + if args.forward or args.forward_GC: + forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands) + if args.top or args.top_GC: + top_strand_genotypes = gtc.get_base_calls_TOP_strand(manifest.snps, manifest.ilmn_strands) + if args.forward_GC or args.top_GC or args.plus_GC or args.AB_GC: + genotype_scores = around(gtc.get_genotype_scores(), decimals = 4) + #build dictionary for pandas + if args.forward_GC: + matrix_forward_GC = build_dict(matrix = matrix_forward_GC, samplename = samplename, \ + names = manifest.names, genotypes = forward_strand_genotypes, genotype_scores = genotype_scores) + elif args.forward: + matrix_forward = build_dict(matrix = matrix_forward, samplename = samplename, \ + names = manifest.names, genotypes = forward_strand_genotypes) + elif args.top: + matrix_top = build_dict(matrix = matrix_top, samplename = samplename, \ + names = manifest.names, genotypes = top_strand_genotypes) + elif args.top_GC: + matrix_top_GC = build_dict(matrix = matrix_top_GC, samplename = samplename, \ + names = manifest.names, genotypes = top_strand_genotypes, genotype_scores = genotype_scores) + elif args.plus_GC: + matrix_plus_GC = build_dict(matrix = matrix_plus_GC, samplename = samplename, \ + names = manifest.names, genotypes = plus_strand_genotypes, genotype_scores = genotype_scores) + elif args.plus: + matrix_plus = build_dict(matrix = matrix_plus, samplename = samplename, \ + names = manifest.names, genotypes = plus_strand_genotypes) + elif args.AB: + matrix_AB = build_dict(matrix = matrix_AB, samplename = samplename, \ + names = manifest.names, genotypes = genotypes) + elif args.AB_GC: + matrix_AB_GC = build_dict(matrix = matrix_AB_GC, samplename = samplename, \ + names = manifest.names, genotypes = genotypes, genotype_scores = genotype_scores) + #create pandas dataframe from dictionaryand append to file + if args.forward_GC: + df = DataFrame.from_dict(matrix_forward_GC) + elif args.forward: + df = DataFrame.from_dict(matrix_forward) + elif args.top: + df = DataFrame.from_dict(matrix_top) + elif args.top_GC: + df = DataFrame.from_dict(matrix_top_GC) + elif args.plus_GC: + df = DataFrame.from_dict(matrix_plus_GC) + elif args.plus: + df = DataFrame.from_dict(matrix_plus) + elif args.AB: + df = DataFrame.from_dict(matrix_AB) + elif args.AB_GC: + df = DataFrame.from_dict(matrix_AB_GC) + df = df.reindex(manifest.names) + df.to_csv(output_handle, sep = delim) diff --git a/module/BeadPoolManifest.py b/module/BeadPoolManifest.py index dbf6bba..354b18f 100644 --- a/module/BeadPoolManifest.py +++ b/module/BeadPoolManifest.py @@ -1,5 +1,6 @@ from .BeadArrayUtility import read_int, read_string, read_byte + class BeadPoolManifest(object): """ Class for parsing binary (BPM) manifest file. @@ -14,6 +15,7 @@ class BeadPoolManifest(object): list of normalization transforms read from GTC file ref_strands (list(int)): Reference strand annotation for loci (see RefStrand class) source_strands (list(int)): Source strand annotations for loci (see SourceStrand class) + ilmn_strands (list(int)): Ilumina strand annotations for loci (see IlmnStrand class) num_loci (int): Number of loci in manifest manifest_name (string): Name of manifest control_config (string): Control description from manifest @@ -39,6 +41,7 @@ def __init__(self, filename): self.normalization_lookups = [] self.ref_strands = [] self.source_strands = [] + self.ilmn_strands = [] self.num_loci = 0 self.manifest_name = "" self.control_config = "" @@ -100,6 +103,7 @@ def __parse_file(self, manifest_file): self.map_infos = [0] * self.num_loci self.ref_strands = [RefStrand.Unknown] * self.num_loci self.source_strands = [SourceStrand.Unknown] * self.num_loci + self.ilmn_strands = [IlmnStrand.Unknown] * self.num_loci for idx in xrange(self.num_loci): locus_entry = LocusEntry(manifest_handle) self.assay_types[name_lookup[locus_entry.name]] = locus_entry.assay_type @@ -109,6 +113,7 @@ def __parse_file(self, manifest_file): self.map_infos[name_lookup[locus_entry.name]] = locus_entry.map_info self.ref_strands[name_lookup[locus_entry.name]] = locus_entry.ref_strand self.source_strands[name_lookup[locus_entry.name]] = locus_entry.source_strand + self.ilmn_strands[name_lookup[locus_entry.name]] = locus_entry.ilmn_strand if len(self.normalization_ids) != len(self.assay_types): raise Exception( @@ -179,6 +184,71 @@ def from_string(source_strand): raise ValueError( "Unexpected value for source strand " + source_strand) + +class IlmnStrand(object): + Unknown = 0 + TOP = 1 + BOT = 2 + PLUS = 3 + MINUS = 4 + + @staticmethod + def to_string(ilmn_strand): + """Get an integer representation of Illumina strand annotation + + Args: + ilmn_strand (str) : string representation of Illumina strand annotation (e.g., "T") + + Returns: + int : int representation of Illumina strand annotation (e.g. IlmnStrand.TOP) + + Raises: + ValueError: Unexpected value for Illumina strand + """ + if ilmn_strand == IlmnStrand.Unknown: + return "U" + elif ilmn_strand == IlmnStrand.TOP: + return "T" + elif ilmn_strand == IlmnStrand.BOT: + return "B" + elif ilmn_strand == IlmnStrand.PLUS: + return "P" + elif ilmn_strand == IlmnStrand.MINUS: + return "M" + + else: + raise ValueError( + "Unexpected value for ilmn strand " + ilmn_strand) + + @staticmethod + def from_string(ilmn_strand): + """ + Get a string representation of Illumina strand annotation + + Args: + ilmn_strand (int) : int representation of Illumina strand (e.g., IlmnStrand.TOP) + + Returns: + str : string representation of Illumina strand annotation + + Raises: + ValueError: Unexpected value for Illumina strand + """ + if ilmn_strand == "U": + return IlmnStrand.Unknown + if ilmn_strand == "T": + return IlmnStrand.TOP + elif ilmn_strand == "B": + return IlmnStrand.BOT + if ilmn_strand == "P": + return IlmnStrand.PLUS + elif ilmn_strand == "M": + return IlmnStrand.MINUS + else: + raise ValueError( + "Unexpected value for ilmn strand " + ilmn_strand) + + class RefStrand(object): Unknown = 0 Plus = 1 @@ -188,13 +258,13 @@ class RefStrand(object): def to_string(ref_strand): """ Get a string reprensetation of ref strand annotation - + Args: ref_strand (int) : int representation of ref strand (e.g., RefStrand.Plus) - + Returns: str : string representation of reference strand annotation - + Raises: ValueError: Unexpected value for reference strand """ @@ -211,13 +281,13 @@ def to_string(ref_strand): @staticmethod def from_string(ref_strand): """Get an integer representation of ref strand annotation - + Args: ref_strand (str) : string representation of reference strand annotation (e.g., "+") - + Returns: int : int representation of reference strand annotation (e.g. RefStrand.Plus) - + Raises: ValueError: Unexpected value for reference strand """ @@ -235,7 +305,7 @@ class LocusEntry(object): """ Helper class representing a locus entry within a bead pool manifest. Current only support version 6,7, and 8. - + Attributes: ilmn_id (string) : IlmnID (probe identifier) of locus name (string): Name (variant identifier) of locus @@ -247,6 +317,7 @@ class LocusEntry(object): address_b (int) : AddressB ID of locus (0 if none) ref_strand (int) : See RefStrand class source_strand (int) : See SourceStrand class + ilmn_strand (int) : See IlmnStrand class """ def __init__(self, handle): @@ -269,6 +340,7 @@ def __init__(self, handle): self.address_b = -1 self.ref_strand = RefStrand.Unknown self.source_strand = SourceStrand.Unknown + self.ilmn_strand = IlmnStrand.Unknown self.__parse_file(handle) def __parse_file(self, handle): @@ -308,6 +380,8 @@ def __parse_locus_version_6(self, handle): self.ilmn_id = read_string(handle) self.source_strand = SourceStrand.from_string( self.ilmn_id.split("_")[-2]) + self.ilmn_strand = IlmnStrand.from_string( + self.ilmn_id.split("_")[-3]) self.name = read_string(handle) for idx in xrange(3): read_string(handle) diff --git a/module/GenotypeCalls.py b/module/GenotypeCalls.py index 80b12ec..fb10a7f 100644 --- a/module/GenotypeCalls.py +++ b/module/GenotypeCalls.py @@ -1,7 +1,7 @@ import struct from numpy import cos, sin, pi, arctan2, float32, uint16, int32, seterr, frombuffer, dtype from .BeadArrayUtility import read_int, read_string, read_byte, read_float, read_char, read_ushort, complement -from .BeadPoolManifest import RefStrand, SourceStrand +from .BeadPoolManifest import RefStrand, SourceStrand, IlmnStrand seterr(divide='ignore', invalid='ignore') nan = float32('nan') @@ -320,29 +320,35 @@ def get_base_calls_generic(self, snps, strand_annotations, report_strand, unknow Raises: ValueError: Number of SNPs or ref strand annotations not matched to entries in GTC file """ + if type(report_strand) != list: + report_strands = [report_strand] + else: + report_strands = report_strand + genotypes = self.get_genotypes() if len(genotypes) != len(snps): raise ValueError( "The number of SNPs must match the number of loci in the GTC file") - + if len(genotypes) != len(strand_annotations): raise ValueError( "The number of reference strand annotations must match the number of loci in the GTC file") - result = [] - for (genotype, snp, strand_annotation) in zip(genotypes, snps, strand_annotations): - ab_genotype = code2genotype[genotype] - a_nucleotide = snp[1] - b_nucleotide = snp[-2] - if a_nucleotide == "N" or b_nucleotide == "N" or strand_annotation == unknown_annotation or ab_genotype == "NC" or ab_genotype == "NULL": - result.append("-") - else: - report_strand_nucleotides = [] - for ab_allele in ab_genotype: - nucleotide_allele = a_nucleotide if ab_allele == "A" else b_nucleotide - report_strand_nucleotides.append( - nucleotide_allele if strand_annotation == report_strand else complement(nucleotide_allele)) - result.append("".join(report_strand_nucleotides)) + for report_strand in report_strands: + for (genotype, snp, strand_annotation) in zip(genotypes, snps, strand_annotations): + ab_genotype = code2genotype[genotype] + a_nucleotide = snp[1] + b_nucleotide = snp[-2] + if a_nucleotide == "N" or b_nucleotide == "N" or strand_annotation == unknown_annotation or ab_genotype == "NC" or ab_genotype == "NULL": + result.append("-") + else: + report_strand_nucleotides = [] + for ab_allele in ab_genotype: + nucleotide_allele = a_nucleotide if ab_allele == "A" else b_nucleotide + report_strand_nucleotides.append( + nucleotide_allele if strand_annotation == report_strand else complement(nucleotide_allele) + ) + result.append("".join(report_strand_nucleotides)) return result def get_base_calls_plus_strand(self, snps, ref_strand_annotations): @@ -374,6 +380,20 @@ def get_base_calls_forward_strand(self, snps, forward_strand_annotations): """ return self.get_base_calls_generic(snps, forward_strand_annotations, SourceStrand.Forward, RefStrand.Unknown) + def get_base_calls_TOP_strand(self, snps, ilmn_strand_annotations): + """ + Get base calls on the Illumina strand. + + Args: + snps (list(string)) : A list of string representing the snp on the design strand for the loci (e.g. [A/C]) + ilmn_strand_annotations (list(int)) : A list of strand annotations for the loci (e.g., IlmnStrand.TOP) + + Returns: + The genotype basecalls on the report strand as a list of strings. + The characters are A, C, G, T, or - for a no-call/null. + """ + return self.get_base_calls_generic(snps, ilmn_strand_annotations, [IlmnStrand.TOP, IlmnStrand.PLUS], IlmnStrand.Unknown) + def get_base_calls(self): """ Returns: