Skip to content

Commit

Permalink
Merge pull request #1915: Update entropy types
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin authored Jan 16, 2025
2 parents de6290e + d3e89c0 commit 5270cf1
Showing 1 changed file with 166 additions and 88 deletions.
254 changes: 166 additions & 88 deletions src/util/entropyCreateStateFromJsons.ts
Original file line number Diff line number Diff line change
@@ -1,67 +1,79 @@
import { genotypeColors } from "./globals";
import { defaultEntropyState } from "../reducers/entropy";

type JsonAnnotations = Record<string, JsonAnnotation>
// enum Strand {'+', '-'} // other GFF-valid options are '.' and '?'
type Strand = string;
type JsonSegmentRange = {start: number, end: number}; // Start is 1-based, End is 1-based closed (GFF)
interface JsonAnnotation {
/* Other properties are commonly set in the JSON structure, but the following are
the only ones read by Auspice */
end?: number;
start?: number;
segments?: JsonSegmentRange[];
strand: Strand;
gene?: string;
color?: string;
display_name?: string;
description?: string;
/**
* Object used for user-provided JSON.
* Stricter than simply `object` which implicitly types values as `any`.
*/
interface UnknownJsonObject {
[key: string]: unknown
}

/* Specifies the range of the each segment's corresponding position in the genome,
or defines the range of the genome (chromosome) itself.
Start is always less than or equal to end.
Start is 1-based, End is 1-based closed. I.e. GFF. */
type RangeGenome = [number, number];
/* Same as RangeGenome but now relative to the nucleotides which make up the CDS
(i.e. after slippage, splicing etc). The first CDS segment's RangeLocal will always
start at 1, and the end value (of the last segment) corresponds to the number of nt in the CDS:
range_segLast[1] - range_seg1[0] + 1 = 3 * number_of_amino_acids_in_translated_CDS */
type RangeLocal = [number, number];
type ChromosomeMetadata = {
strandsObserved: Set<Strand>,
posStrandStackHeight: number,
negStrandStackHeight: number,
type Strand = '+' | '-' // other GFF-valid options are '.' and '?'

/**
* Specifies the range of the each segment's corresponding position in the genome,
* or defines the range of the genome (chromosome) itself.
* Start is always less than or equal to end.
* Start is 1-based, End is 1-based closed. I.e. GFF.
*/
type RangeGenome = [number, number]

/**
* Same as RangeGenome but now relative to the nucleotides which make up the CDS
* (i.e. after slippage, splicing etc). The first CDS segment's RangeLocal will always
* start at 1, and the end value (of the last segment) corresponds to the number of nt in the CDS:
* range_segLast[1] - range_seg1[0] + 1 = 3 * number_of_amino_acids_in_translated_CDS
*/
type RangeLocal = [number, number]

interface ChromosomeMetadata {
strandsObserved: Set<Strand>
posStrandStackHeight: number
negStrandStackHeight: number
}

type GenomeAnnotation = Chromosome[];
interface Chromosome {
name: string;
range: RangeGenome;
genes: Gene[];
name: string
range: RangeGenome
genes: Gene[]
metadata: ChromosomeMetadata
}

interface Gene {
name: string;
cds: CDS[];
name: string
cds: CDS[]
}

interface CDS {
length: number; /* length of the CDS in nucleotides. Will be a multiple of 3 */
segments: CdsSegment[];
strand: Strand;
color: string;
name: string;
isWrapping: boolean;
displayName?: string;
description?: string;
stackPosition?: number;
/** length of the CDS in nucleotides. Will be a multiple of 3 */
length: number
segments: CdsSegment[]
strand: Strand
color: string
name: string
isWrapping: boolean
displayName?: string
description?: string
stackPosition?: number
}

type Phase = 0 | 1 | 2

type Frame = 0 | 1 | 2

interface CdsSegment {
rangeLocal: RangeLocal;
rangeGenome: RangeGenome;
segmentNumber: number; /* 1-based */
phase: number; /* 0, 1 or 2. Indicates where the next codon begins relative to the 5' end of this segment */
frame: number; /* 0, 1 or 2. The frame the codons are in, relative to the 5' end of the genome. It thus takes into account the phase */
rangeLocal: RangeLocal
rangeGenome: RangeGenome

/** 1-based */
segmentNumber: number

/** Indicates where the next codon begins relative to the 5' end of this segment */
phase: Phase

/** The frame the codons are in, relative to the 5' end of the genome. It thus takes into account the phase */
frame: Frame
}

/**
Expand All @@ -83,28 +95,58 @@ interface CdsSegment {
* ¹ The exception being a single CDS which wraps around the origin, which we are able
* to split into two segments here.
*/
export const genomeMap = (annotations: JsonAnnotations): GenomeAnnotation => {
export const genomeMap = (annotations: UnknownJsonObject): Chromosome[] => {

const nucAnnotation = Object.entries(annotations)
.filter(([name,]) => name==='nuc')
.map(([, annotation]) => annotation)[0];
if (!nucAnnotation) throw new Error("Genome annotation missing 'nuc' definition")
if (!nucAnnotation.start || !nucAnnotation.end) throw new Error("Genome annotation for 'nuc' missing start or end")
if (nucAnnotation.strand==='-') throw new Error("Auspice can only display genomes represented as positive strand." +
"Note that -ve strand RNA viruses are typically annotated as 5' → 3'.");

if (!nucAnnotation) {
throw new Error("Genome annotation missing 'nuc' definition");
}
if (typeof nucAnnotation !== 'object') {
throw new Error("Genome annotation for 'nuc' is not a JSON object.");
}
if (!('start' in nucAnnotation) || !('end' in nucAnnotation)) {
throw new Error("Genome annotation for 'nuc' missing start or end");
}
if (typeof nucAnnotation.start !== 'number' || typeof nucAnnotation.end !== 'number') {
throw new Error("Genome annotation for 'nuc.start' or 'nuc.end' is not a number.");
}
if ('strand' in nucAnnotation && nucAnnotation.strand === '-') {
throw new Error("Auspice can only display genomes represented as positive strand." +
"Note that -ve strand RNA viruses are typically annotated as 5' → 3'.");
}

const rangeGenome: RangeGenome = [nucAnnotation.start, nucAnnotation.end];


/* Group by genes -- most JSONs will not include this information, so it'll essentially be
one CDS per gene, but that's just fine! */
const annotationsPerGene: Record<string,JsonAnnotations> = {};
const annotationsPerGene: Record<string,Record<string, UnknownJsonObject>> = {};
Object.entries(annotations)
.filter(([name,]) => name!=='nuc')
.map(([annotationKey, annotation]) => {
const geneName = annotation.gene || annotationKey;

if (typeof annotation !== 'object') {
throw new Error(`Genome annotation for '${annotationKey}' is not a JSON object.`);
}

let geneName = annotationKey;

if ('gene' in annotation) {
if (typeof annotation.gene !== 'string') {
throw new Error(`Genome annotation '${annotationKey}.gene' is not a string.`);
}
geneName = annotation.gene;
}

if (!(geneName in annotationsPerGene)) annotationsPerGene[geneName] = {};
const gene = annotationsPerGene[geneName];
gene[annotationKey] = annotation;

/* Assertion is safe: see docstring of UnknownJsonObject
*/
// eslint-disable-next-line @typescript-eslint/consistent-type-assertions
annotationsPerGene[geneName][annotationKey] = annotation as UnknownJsonObject;
})

const nextColor = nextColorGenerator();
Expand Down Expand Up @@ -141,7 +183,7 @@ export const genomeMap = (annotations: JsonAnnotations): GenomeAnnotation => {
return [chromosome];
}

export const entropyCreateState = (genomeAnnotations: JsonAnnotations) => {
export const entropyCreateState = (genomeAnnotations: UnknownJsonObject) => {
if (genomeAnnotations) {
try {
return {
Expand All @@ -159,8 +201,8 @@ export const entropyCreateState = (genomeAnnotations: JsonAnnotations) => {
};


function validColor(color:(string|undefined)) {
if (!color) return false;
function validColor(color: string | undefined | unknown) {
if (typeof color !== "string") return false;
return color; // TODO XXX
}

Expand All @@ -175,7 +217,12 @@ function* nextColorGenerator() {
/**
* Returns a CDS object parsed from the provided JsonAnnotation block
*/
function cdsFromAnnotation(cdsName: string, annotation: JsonAnnotation, rangeGenome: RangeGenome, defaultColor: (string|void)): CDS {
function cdsFromAnnotation(
cdsName: string,
annotation: UnknownJsonObject,
rangeGenome: RangeGenome,
defaultColor: string | void,
): CDS {
const invalidCds: CDS = {
name: '__INVALID__',
length: 0,
Expand All @@ -190,7 +237,7 @@ function cdsFromAnnotation(cdsName: string, annotation: JsonAnnotation, rangeGen
* which are represented by augur as '?' and null, respectively. (null comes from `None` in python.)
* In both cases it's not a good idea to make an assumption of strandedness, or to assume it's even a CDS. */
console.error(`[Genome annotation] ${cdsName} has strand ` +
(Object.prototype.hasOwnProperty.call(annotation, "strand") ? String(annotation.strand) : '(missing)') +
(annotation.strand !== undefined ? annotation.strand : '(missing)') +
". This CDS will be ignored.");
return invalidCds;
}
Expand All @@ -199,6 +246,12 @@ function cdsFromAnnotation(cdsName: string, annotation: JsonAnnotation, rangeGen
let length = 0; // rangeLocal length
const segments: CdsSegment[] = [];
if (annotation.start && annotation.end) {

if (typeof annotation.start !== 'number' || typeof annotation.end !== 'number') {
console.error(`[Genome annotation] ${cdsName} start (${annotation.start}) and/or end (${annotation.end}) is not a number.`);
return invalidCds;
}

/* The simplest case is where a JSON annotation block defines a
contiguous CDS, however it may be a wrapping CDS (i.e. cds end > genome
end */
Expand All @@ -217,8 +270,12 @@ function cdsFromAnnotation(cdsName: string, annotation: JsonAnnotation, rangeGen
{start: annotation.start, end: rangeGenome[1]},
{start: 1, end: annotation.end-rangeGenome[1]}
]
/* -ve strand segments are 3' -> 5', so segment[0] is at the start of the genome */
if (!positive) annotation.segments.reverse();
// TypeScript is unable to infer that annotation.segments is an array,
// hence the explicit type guard.
if (Array.isArray(annotation.segments)){
/* -ve strand segments are 3' -> 5', so segment[0] is at the start of the genome */
if (!positive) annotation.segments.reverse();
}
}
}

Expand All @@ -234,7 +291,7 @@ function cdsFromAnnotation(cdsName: string, annotation: JsonAnnotation, rangeGen
For -ve strand that's 3' to 5'. The rangeGenome within each segment is always 5' to 3'. */
const segmentLength = segment.end - segment.start + 1; // in nucleotides
/* phase is the number of nucs we need to add to the so-far-observed length to make it mod 3 */
const phase = length%3===0 ? 0 : (length%3===1 ? 2 : 1);
const phase: Phase = length%3===0 ? 0 : (length%3===1 ? 2 : 1);

const s: CdsSegment = {
segmentNumber: segmentNumber++,
Expand Down Expand Up @@ -280,17 +337,30 @@ function cdsFromAnnotation(cdsName: string, annotation: JsonAnnotation, rangeGen
* Calculates the (open reading) frame the provided segment is in.
* For +ve strand this is calculated 5'->3', for -ve strand it's 3'->5'.
* The frame is calculated once the CDS is back in phase.
* start: 1 based, rangeGenome[0] of the segment
* end: 1 based, rangeGenome[1] of the segment
* start < end always
* genomeLength: 1 based
* positiveStrand: boolean
* Returns a number in the set {0, 1, 2}
*/
function _frame(start:number, end:number, phase: number, genomeLength:number, positiveStrand:boolean) {
return positiveStrand ?
function _frame(
/** 1 based, rangeGenome[0] of the segment */
start: number,

/**
* 1 based, rangeGenome[1] of the segment.
* start < end always
*/
end: number,

phase: Phase,

/** 1 based */
genomeLength: number,
positiveStrand: boolean,
): Frame {
/* TypeScript cannot infer the exact range of values from a modulo operation,
* so it is manually provided.
*/
// eslint-disable-next-line @typescript-eslint/consistent-type-assertions
return (positiveStrand ?
(start+phase-1)%3 :
Math.abs((end-phase-genomeLength)%3);
Math.abs((end-phase-genomeLength)%3)) as 0 | 1 | 2;
}

/**
Expand All @@ -300,16 +370,18 @@ function _frame(start:number, end:number, phase: number, genomeLength:number, po
* refers to this being a stacking problem.) The stack position starts at 1.
* Returns the maximum position observed.
*/
function calculateStackPosition(genes: Gene[], strand: (Strand|null) = null):number {
function calculateStackPosition(
genes: Gene[],
strand: Strand,
): number {
/* List of CDSs, sorted by their earliest occurrence in the genome (for any segment) */
let cdss = genes.reduce((acc: CDS[], gene) => [...acc, ...gene.cds], []);
if (strand) {
cdss = cdss.filter((cds) => cds.strand===strand);
}
cdss.sort((a, b) =>
Math.min(...a.segments.map((s) => s.rangeGenome[0])) < Math.min(...b.segments.map((s) => s.rangeGenome[0])) ?
-1 : 1
);
const cdss = genes
.reduce((acc: CDS[], gene) => [...acc, ...gene.cds], [])
.filter((cds) => cds.strand===strand)
.sort((a, b) =>
Math.min(...a.segments.map((s) => s.rangeGenome[0])) < Math.min(...b.segments.map((s) => s.rangeGenome[0])) ?
-1 : 1
);
let stack: CDS[] = []; // current CDSs in stack
for (const newCds of cdss) {
/* remove any CDS from the stack which has ended (completely) before this one starts */
Expand Down Expand Up @@ -350,7 +422,7 @@ function calculateStackPosition(genes: Gene[], strand: (Strand|null) = null):num
* Given an array of sorted integers, if there are any spaces (starting with 1)
* then return the value which can fill that space. Returns 0 if no spaces.
*/
function _emptySlots(values: number[]):number {
function _emptySlots(values: number[]): number {
if ((values[0] || 0) > 1) return 1;
for (let i=1; i<values.length; i++) {
/* intermediate variables because of https://github.com/microsoft/TypeScript/issues/46253 */
Expand All @@ -365,7 +437,10 @@ function _emptySlots(values: number[]):number {
* between-segment space of an existing segment, then return the stackPosition
* of that existing CDS. Otherwise return 0;
*/
function _fitCdssTogether(existing: CDS[], newCds: CDS):number {
function _fitCdssTogether(
existing: CDS[],
newCds: CDS,
): number {
const a = Math.min(...newCds.segments.map((s) => s.rangeGenome[0]));
const b = Math.max(...newCds.segments.map((s) => s.rangeGenome[1]));
for (const cds of existing) {
Expand Down Expand Up @@ -400,10 +475,13 @@ function _fitCdssTogether(existing: CDS[], newCds: CDS):number {


/* Does a CDS wrap the origin? */
function _isCdsWrapping(strand: Strand, segments: CdsSegment[]): boolean {
function _isCdsWrapping(
strand: Strand,
segments: CdsSegment[],
): boolean {
const positive = strand==='+';
// segments ordered to guarantee rangeLocal will always be greater (than the previous segment)
let prevSegment;
let prevSegment: CdsSegment;
for (const segment of segments) {
if (prevSegment) {
if (positive && prevSegment.rangeGenome[0] > segment.rangeGenome[0]) {
Expand Down

0 comments on commit 5270cf1

Please sign in to comment.