diff --git a/src/util/entropyCreateStateFromJsons.ts b/src/util/entropyCreateStateFromJsons.ts index 0745c532a..95beefe37 100644 --- a/src/util/entropyCreateStateFromJsons.ts +++ b/src/util/entropyCreateStateFromJsons.ts @@ -1,67 +1,79 @@ import { genotypeColors } from "./globals"; import { defaultEntropyState } from "../reducers/entropy"; -type JsonAnnotations = Record -// enum Strand {'+', '-'} // other GFF-valid options are '.' and '?' -type Strand = string; -type JsonSegmentRange = {start: number, end: number}; // Start is 1-based, End is 1-based closed (GFF) -interface JsonAnnotation { - /* Other properties are commonly set in the JSON structure, but the following are - the only ones read by Auspice */ - end?: number; - start?: number; - segments?: JsonSegmentRange[]; - strand: Strand; - gene?: string; - color?: string; - display_name?: string; - description?: string; +/** + * Object used for user-provided JSON. + * Stricter than simply `object` which implicitly types values as `any`. + */ +interface UnknownJsonObject { + [key: string]: unknown } -/* Specifies the range of the each segment's corresponding position in the genome, -or defines the range of the genome (chromosome) itself. -Start is always less than or equal to end. -Start is 1-based, End is 1-based closed. I.e. GFF. */ -type RangeGenome = [number, number]; -/* Same as RangeGenome but now relative to the nucleotides which make up the CDS -(i.e. after slippage, splicing etc). The first CDS segment's RangeLocal will always -start at 1, and the end value (of the last segment) corresponds to the number of nt in the CDS: -range_segLast[1] - range_seg1[0] + 1 = 3 * number_of_amino_acids_in_translated_CDS */ -type RangeLocal = [number, number]; -type ChromosomeMetadata = { - strandsObserved: Set, - posStrandStackHeight: number, - negStrandStackHeight: number, +type Strand = '+' | '-' // other GFF-valid options are '.' and '?' + +/** + * Specifies the range of the each segment's corresponding position in the genome, + * or defines the range of the genome (chromosome) itself. + * Start is always less than or equal to end. + * Start is 1-based, End is 1-based closed. I.e. GFF. + */ +type RangeGenome = [number, number] + +/** + * Same as RangeGenome but now relative to the nucleotides which make up the CDS + * (i.e. after slippage, splicing etc). The first CDS segment's RangeLocal will always + * start at 1, and the end value (of the last segment) corresponds to the number of nt in the CDS: + * range_segLast[1] - range_seg1[0] + 1 = 3 * number_of_amino_acids_in_translated_CDS + */ +type RangeLocal = [number, number] + +interface ChromosomeMetadata { + strandsObserved: Set + posStrandStackHeight: number + negStrandStackHeight: number } -type GenomeAnnotation = Chromosome[]; interface Chromosome { - name: string; - range: RangeGenome; - genes: Gene[]; + name: string + range: RangeGenome + genes: Gene[] metadata: ChromosomeMetadata } + interface Gene { - name: string; - cds: CDS[]; + name: string + cds: CDS[] } + interface CDS { - length: number; /* length of the CDS in nucleotides. Will be a multiple of 3 */ - segments: CdsSegment[]; - strand: Strand; - color: string; - name: string; - isWrapping: boolean; - displayName?: string; - description?: string; - stackPosition?: number; + /** length of the CDS in nucleotides. Will be a multiple of 3 */ + length: number + segments: CdsSegment[] + strand: Strand + color: string + name: string + isWrapping: boolean + displayName?: string + description?: string + stackPosition?: number } + +type Phase = 0 | 1 | 2 + +type Frame = 0 | 1 | 2 + interface CdsSegment { - rangeLocal: RangeLocal; - rangeGenome: RangeGenome; - segmentNumber: number; /* 1-based */ - phase: number; /* 0, 1 or 2. Indicates where the next codon begins relative to the 5' end of this segment */ - frame: number; /* 0, 1 or 2. The frame the codons are in, relative to the 5' end of the genome. It thus takes into account the phase */ + rangeLocal: RangeLocal + rangeGenome: RangeGenome + + /** 1-based */ + segmentNumber: number + + /** Indicates where the next codon begins relative to the 5' end of this segment */ + phase: Phase + + /** The frame the codons are in, relative to the 5' end of the genome. It thus takes into account the phase */ + frame: Frame } /** @@ -83,28 +95,58 @@ interface CdsSegment { * ¹ The exception being a single CDS which wraps around the origin, which we are able * to split into two segments here. */ -export const genomeMap = (annotations: JsonAnnotations): GenomeAnnotation => { +export const genomeMap = (annotations: UnknownJsonObject): Chromosome[] => { const nucAnnotation = Object.entries(annotations) .filter(([name,]) => name==='nuc') .map(([, annotation]) => annotation)[0]; - if (!nucAnnotation) throw new Error("Genome annotation missing 'nuc' definition") - if (!nucAnnotation.start || !nucAnnotation.end) throw new Error("Genome annotation for 'nuc' missing start or end") - if (nucAnnotation.strand==='-') throw new Error("Auspice can only display genomes represented as positive strand." + - "Note that -ve strand RNA viruses are typically annotated as 5' → 3'."); + + if (!nucAnnotation) { + throw new Error("Genome annotation missing 'nuc' definition"); + } + if (typeof nucAnnotation !== 'object') { + throw new Error("Genome annotation for 'nuc' is not a JSON object."); + } + if (!('start' in nucAnnotation) || !('end' in nucAnnotation)) { + throw new Error("Genome annotation for 'nuc' missing start or end"); + } + if (typeof nucAnnotation.start !== 'number' || typeof nucAnnotation.end !== 'number') { + throw new Error("Genome annotation for 'nuc.start' or 'nuc.end' is not a number."); + } + if ('strand' in nucAnnotation && nucAnnotation.strand === '-') { + throw new Error("Auspice can only display genomes represented as positive strand." + + "Note that -ve strand RNA viruses are typically annotated as 5' → 3'."); + } + const rangeGenome: RangeGenome = [nucAnnotation.start, nucAnnotation.end]; /* Group by genes -- most JSONs will not include this information, so it'll essentially be one CDS per gene, but that's just fine! */ - const annotationsPerGene: Record = {}; + const annotationsPerGene: Record> = {}; Object.entries(annotations) .filter(([name,]) => name!=='nuc') .map(([annotationKey, annotation]) => { - const geneName = annotation.gene || annotationKey; + + if (typeof annotation !== 'object') { + throw new Error(`Genome annotation for '${annotationKey}' is not a JSON object.`); + } + + let geneName = annotationKey; + + if ('gene' in annotation) { + if (typeof annotation.gene !== 'string') { + throw new Error(`Genome annotation '${annotationKey}.gene' is not a string.`); + } + geneName = annotation.gene; + } + if (!(geneName in annotationsPerGene)) annotationsPerGene[geneName] = {}; - const gene = annotationsPerGene[geneName]; - gene[annotationKey] = annotation; + + /* Assertion is safe: see docstring of UnknownJsonObject + */ + // eslint-disable-next-line @typescript-eslint/consistent-type-assertions + annotationsPerGene[geneName][annotationKey] = annotation as UnknownJsonObject; }) const nextColor = nextColorGenerator(); @@ -141,7 +183,7 @@ export const genomeMap = (annotations: JsonAnnotations): GenomeAnnotation => { return [chromosome]; } -export const entropyCreateState = (genomeAnnotations: JsonAnnotations) => { +export const entropyCreateState = (genomeAnnotations: UnknownJsonObject) => { if (genomeAnnotations) { try { return { @@ -159,8 +201,8 @@ export const entropyCreateState = (genomeAnnotations: JsonAnnotations) => { }; -function validColor(color:(string|undefined)) { - if (!color) return false; +function validColor(color: string | undefined | unknown) { + if (typeof color !== "string") return false; return color; // TODO XXX } @@ -175,7 +217,12 @@ function* nextColorGenerator() { /** * Returns a CDS object parsed from the provided JsonAnnotation block */ -function cdsFromAnnotation(cdsName: string, annotation: JsonAnnotation, rangeGenome: RangeGenome, defaultColor: (string|void)): CDS { +function cdsFromAnnotation( + cdsName: string, + annotation: UnknownJsonObject, + rangeGenome: RangeGenome, + defaultColor: string | void, +): CDS { const invalidCds: CDS = { name: '__INVALID__', length: 0, @@ -190,7 +237,7 @@ function cdsFromAnnotation(cdsName: string, annotation: JsonAnnotation, rangeGen * which are represented by augur as '?' and null, respectively. (null comes from `None` in python.) * In both cases it's not a good idea to make an assumption of strandedness, or to assume it's even a CDS. */ console.error(`[Genome annotation] ${cdsName} has strand ` + - (Object.prototype.hasOwnProperty.call(annotation, "strand") ? String(annotation.strand) : '(missing)') + + (annotation.strand !== undefined ? annotation.strand : '(missing)') + ". This CDS will be ignored."); return invalidCds; } @@ -199,6 +246,12 @@ function cdsFromAnnotation(cdsName: string, annotation: JsonAnnotation, rangeGen let length = 0; // rangeLocal length const segments: CdsSegment[] = []; if (annotation.start && annotation.end) { + + if (typeof annotation.start !== 'number' || typeof annotation.end !== 'number') { + console.error(`[Genome annotation] ${cdsName} start (${annotation.start}) and/or end (${annotation.end}) is not a number.`); + return invalidCds; + } + /* The simplest case is where a JSON annotation block defines a contiguous CDS, however it may be a wrapping CDS (i.e. cds end > genome end */ @@ -217,8 +270,12 @@ function cdsFromAnnotation(cdsName: string, annotation: JsonAnnotation, rangeGen {start: annotation.start, end: rangeGenome[1]}, {start: 1, end: annotation.end-rangeGenome[1]} ] - /* -ve strand segments are 3' -> 5', so segment[0] is at the start of the genome */ - if (!positive) annotation.segments.reverse(); + // TypeScript is unable to infer that annotation.segments is an array, + // hence the explicit type guard. + if (Array.isArray(annotation.segments)){ + /* -ve strand segments are 3' -> 5', so segment[0] is at the start of the genome */ + if (!positive) annotation.segments.reverse(); + } } } @@ -234,7 +291,7 @@ function cdsFromAnnotation(cdsName: string, annotation: JsonAnnotation, rangeGen For -ve strand that's 3' to 5'. The rangeGenome within each segment is always 5' to 3'. */ const segmentLength = segment.end - segment.start + 1; // in nucleotides /* phase is the number of nucs we need to add to the so-far-observed length to make it mod 3 */ - const phase = length%3===0 ? 0 : (length%3===1 ? 2 : 1); + const phase: Phase = length%3===0 ? 0 : (length%3===1 ? 2 : 1); const s: CdsSegment = { segmentNumber: segmentNumber++, @@ -280,17 +337,30 @@ function cdsFromAnnotation(cdsName: string, annotation: JsonAnnotation, rangeGen * Calculates the (open reading) frame the provided segment is in. * For +ve strand this is calculated 5'->3', for -ve strand it's 3'->5'. * The frame is calculated once the CDS is back in phase. - * start: 1 based, rangeGenome[0] of the segment - * end: 1 based, rangeGenome[1] of the segment - * start < end always - * genomeLength: 1 based - * positiveStrand: boolean - * Returns a number in the set {0, 1, 2} */ -function _frame(start:number, end:number, phase: number, genomeLength:number, positiveStrand:boolean) { - return positiveStrand ? +function _frame( + /** 1 based, rangeGenome[0] of the segment */ + start: number, + + /** + * 1 based, rangeGenome[1] of the segment. + * start < end always + */ + end: number, + + phase: Phase, + + /** 1 based */ + genomeLength: number, + positiveStrand: boolean, +): Frame { + /* TypeScript cannot infer the exact range of values from a modulo operation, + * so it is manually provided. + */ + // eslint-disable-next-line @typescript-eslint/consistent-type-assertions + return (positiveStrand ? (start+phase-1)%3 : - Math.abs((end-phase-genomeLength)%3); + Math.abs((end-phase-genomeLength)%3)) as 0 | 1 | 2; } /** @@ -300,16 +370,18 @@ function _frame(start:number, end:number, phase: number, genomeLength:number, po * refers to this being a stacking problem.) The stack position starts at 1. * Returns the maximum position observed. */ -function calculateStackPosition(genes: Gene[], strand: (Strand|null) = null):number { +function calculateStackPosition( + genes: Gene[], + strand: Strand, +): number { /* List of CDSs, sorted by their earliest occurrence in the genome (for any segment) */ - let cdss = genes.reduce((acc: CDS[], gene) => [...acc, ...gene.cds], []); - if (strand) { - cdss = cdss.filter((cds) => cds.strand===strand); - } - cdss.sort((a, b) => - Math.min(...a.segments.map((s) => s.rangeGenome[0])) < Math.min(...b.segments.map((s) => s.rangeGenome[0])) ? - -1 : 1 - ); + const cdss = genes + .reduce((acc: CDS[], gene) => [...acc, ...gene.cds], []) + .filter((cds) => cds.strand===strand) + .sort((a, b) => + Math.min(...a.segments.map((s) => s.rangeGenome[0])) < Math.min(...b.segments.map((s) => s.rangeGenome[0])) ? + -1 : 1 + ); let stack: CDS[] = []; // current CDSs in stack for (const newCds of cdss) { /* remove any CDS from the stack which has ended (completely) before this one starts */ @@ -350,7 +422,7 @@ function calculateStackPosition(genes: Gene[], strand: (Strand|null) = null):num * Given an array of sorted integers, if there are any spaces (starting with 1) * then return the value which can fill that space. Returns 0 if no spaces. */ -function _emptySlots(values: number[]):number { +function _emptySlots(values: number[]): number { if ((values[0] || 0) > 1) return 1; for (let i=1; i s.rangeGenome[0])); const b = Math.max(...newCds.segments.map((s) => s.rangeGenome[1])); for (const cds of existing) { @@ -400,10 +475,13 @@ function _fitCdssTogether(existing: CDS[], newCds: CDS):number { /* Does a CDS wrap the origin? */ -function _isCdsWrapping(strand: Strand, segments: CdsSegment[]): boolean { +function _isCdsWrapping( + strand: Strand, + segments: CdsSegment[], +): boolean { const positive = strand==='+'; // segments ordered to guarantee rangeLocal will always be greater (than the previous segment) - let prevSegment; + let prevSegment: CdsSegment; for (const segment of segments) { if (prevSegment) { if (positive && prevSegment.rangeGenome[0] > segment.rangeGenome[0]) {