-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.tex
951 lines (675 loc) · 119 KB
/
main.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
\documentclass[9pt,lineno]{elife}
\ifx\c@authorfn\undefined{}
\newcounter{authorfn}
\setcounter{authorfn}{1}
\newcommand{\authfn}[1]{%
\fnsymbolmult{\numexpr\value{authorfn}+#1}%
}
\else
\fi
\def\@correspondence{}
\def\@contribution{}
\def\@presentaddress{}
\def\@deceased{}
\ifx\c@correspondence\undefined{}
\newcommand{\corr}[2]{%
\ifx\empty\@correspondence\else\appto{\@correspondence}{; }{}{}\fi
\appto{\@correspondence}{%
\url{#1}%
\ifx\empty#2\else\space(#2)\fi
}{}{}%
}
\fi
\ifx\c@contribution\undefined{}
\newcommand{\contrib}[2][]{
\appto{\@contribution}{%
\ifx\empty#1\else\textsuperscript{#1}\fi
#2\\
}{}{}
}
\fi
\ifx\c@presentaddress\undefined{}
\newcommand{\presentadd}[2][]{
\ifx\empty\@presentaddress\else\appto{\@presentaddress}{; }{}{}\fi
\appto{\@presentaddress}{%
\ifx\empty#1\else\textsuperscript{#1}\fi
#2%
}{}{}
}
\fi
\ifx\c@deceased\undefined{}
\newcommand{\deceased}[1]{\def\@deceased{\textsuperscript{#1}Deceased}}
\fi
\reversemarginpar
\usepackage[symbol]{footmisc}
\renewcommand{\thefootnote}{\fnsymbol{footnote}}
\usepackage{pdfpages}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% table
\usepackage{multirow}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% font size %%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{anyfontsize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Maths %%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{amsmath}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% referencing %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{hyperref}
\usepackage{xcolor}
\hypersetup{
colorlinks,
%linkcolor={red!50!black},
linkcolor={black},
citecolor={blue!50!black},
urlcolor={blue!80!black}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% figure and table %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{graphicx}
\graphicspath{{./figures/}}
%\graphicspath{{}}
\usepackage[label font=bf,labelformat=simple, position = top]{subfig}
\newdimen\figrasterwd
\figrasterwd\textwidth
\usepackage{wrapfig}
\usepackage{lipsum}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% layout %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{setspace}
\linespread{1.5}
\usepackage{authblk}
%\usepackage{fancyhdr}
\providecommand{\keywords}[1]{\textbf{\textit{Key words:}} #1}
%\newcommand\@shorttitle{}
% define \theshorttitle to what is given
%\newcommand\shorttitle[1]{\renewcommand\@shorttitle{#1}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% todo %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage[disable]{todonotes}
\newcounter{todocounter}
\newcommand{\todonum}[2][]
{\stepcounter{todocounter}\todo[#1]{\thetodocounter: #2}}
\newcommand{\done}[2][]
{\todo[color=green!40, #1]{#2}}
\newcommand{\donenum}[2][]
{\stepcounter{todocounter}\done[#1]{\thetodocounter: #2}}
\newcommand{\narrative}[2][]
{\todo[color=blue!40, #1]{#2}}
\newcommand{\narrativenum}[2][]
{\stepcounter{todocounter}\narrative[#1]{\thetodocounter: #2}}
\newcommand{\donenarrative}[2][]
{\todo[color=green!40, #1]{#2}}
\newcommand{\donenarrativenum}[2][]
{\stepcounter{todocounter}\narrative[#1]{\thetodocounter: #2}}
%%%%%%%%%%%%%%%%%%%%%%%%%%% Supplement package
\usepackage{listings}
\begin{document}
\listoftodos
\clearpage
\setcounter{page}{1}
%\firstpage{1}
\title{The origins and relatedness structure of mixed infections vary with local prevalence of {\it P. falciparum} malaria}
\newcommand\shorttitle{Mixed infections in malaria}
\date{}
\author[1$\dagger$]{Sha Joe Zhu}
\author[1$\dagger$]{Jason A. Hendry}
\author[1,2,3,4]{Jacob Almagro-Garcia}
\author[2,3,4]{Richard D. Pearson}
\author[2,3,4]{Roberto Amato}
\author[1,2,3,4]{Alistair Miles}
\author[1]{Daniel J. Weiss}
\author[1]{Tim C.D. Lucas}
\author[1]{Michele Nguyen}
%\author[$ $]{Pf3k Consortium}
\author[1]{Peter W. Gething}
\author[1,2,3,4]{Dominic Kwiatkowski}
\author[1,3*]{Gil McVean}
\author[5]{for the Pf3k Project}
\affil[1]{Big Data Institute, Li Ka Shing Centre for Health Information and Discovery, University of Oxford}
\affil[2]{Wellcome Centre for Human Genetics, University of Oxford}
\affil[3]{Medical Research Council Centre for Genomics and Global Health, University of Oxford}
\affil[4]{Wellcome Sanger Institute}
\affil[5]{See Supplementary Note}
\contrib[$\dagger$]{These authors contributed equally to this work}
\corr{[email protected]}{GM}
\maketitle{}
\begin{abstract}
Individual malaria infections can carry multiple strains of {\it Plasmodium falciparum} with varying levels of relatedness. Yet, how local epidemiology affects the properties of such mixed infections remains unclear. Here, we develop an enhanced method for strain deconvolution from genome sequencing data, which estimates the number of strains, their proportions, identity-by-descent (IBD) profiles and individual haplotypes. Applying it to the Pf3k data set, we find that the rate of mixed infection varies from \textcolor{black}{29}\% \donenum{number change, was Senegal, now using Gambia} to 63\% across countries and that 51\% of mixed infections involve more than two strains. Furthermore, we estimate that 47\% of symptomatic dual infections contain sibling strains likely to have been co-transmitted from a single mosquito, and find evidence of mixed infections propagated over successive infection cycles. Finally, leveraging data from the Malaria Atlas Project, we find that prevalence correlates within Africa, but not Asia, with both the rate of mixed infection and the level of IBD.
\end{abstract}
\keywords{Malaria, genome, epidemiology, relatedness}
\section{Introduction}
Individuals infected with malaria-causing parasites of the genus {\it Plasmodium} often carry multiple, distinct strains of the same species ~\citep{Bell2006}. Such mixed infections, also known as complex infections, are likely indicative of intense local exposure rates, being common in regions of Africa with high rates of prevalence~\citep{Howes2016}. However, they have also been documented for {\it P.~vivax} and other malaria-causing parasites~\citep{Mueller2007, Collins2012}, even in regions of much lower prevalence~\citep{Howes2016, Steenkeste2010}. Mixed infections have been associated with increased disease severity \citep{deRoode2005} and also facilitate the generation of genomic diversity within the parasite, enabling co-transmission to the mosquito vector where sexual recombination occurs \citep{Mzilahowa2007}. The distribution of mixed infection duration, and whether the clearance of one or more strains results purely from host immunity~\citep{Borrmann2011} or can be influenced by interactions between the distinct strains~\citep{Enosse2006, Bushman2016}, are all open questions.
Although mixed infections can be studied from genetic barcodes~\citep{Galinsky2015}, genome sequencing provides a more powerful approach for detecting mixed infections~\citep{Jack2016, Chang2017}. Genetic differences between co-existing strains manifest as polymorphic loci in the DNA sequence of the isolate. The higher resolution of sequencing data allows the use of statistical methods for estimating the number of distinct strains, their relative proportions, and genome sequences~\citep{Zhu2017}. Although genomic approaches cannot identify individuals infected multiple times by identical strains, and are affected by sequencing errors and problems of incomplete or erroneous reference assemblies, they provide a rich characterisation of within host diversity~\citep{Manske2012, auburn2012characterization, Pearson2016}.
Previous research has highlighted that co-existing strains can be highly related~\citep{Nair2014, Trevino2017}. For example, in {\it P. vivax}, 58\% of mixed infections show long stretches of within host homozygosity~\citep{Pearson2016}. In addition, \citet{Nkhoma2012} reported an average of 78.7\% {\it P. falciparum} allele sharing in Malawi and 87.6\% sharing in Thailand. A mixed infection with related strains can arise through different mechanisms. Firstly, relatedness is created when distinct parasite strains undergo meiosis in a mosquito vector. A mosquito vector can acquire distinct strains by biting a single multiply-infected individual, or multiple infected individuals in close succession. Co-transmission of multiple meiotic progeny produces a mixed infection in a single-bite, containing related strains. Alternatively, relatedness in a mixed infection can result from multiple bites in a parasite population with low genetic diversity, such as is expected during the early stages of an outbreak or following severe population bottlenecks; for instance, those resulting from an intervention ~\citep{Mouzin2010, Wong2017, Daniels2015}. Interestingly, serial co-transmission of a mixed infection is akin to inbreeding, producing strains with relatedness levels well above those of standard siblings.
The rate and relatedness structure of mixed infections are therefore highly relevant for understanding regional epidemiology. However, progress towards utilising this source of information is limited by three problems. Firstly, while strain deconvolution within mixed infections has received substantial attention~\citep{Galinsky2015, Jack2016, Chang2017, Zhu2017}, currently, no methods perform both deconvolution of strains and estimation of relatedness. Because existing deconvolution methods assume equal relatedness along the genome, differences in relatedness that occur, for example through infection by sibling strains, can lead to errors in the estimation of the number, proportions and sequences of individual strains (Figure \ref{fig:fig1}). Recently, progress has been made in the case of dual-infections with balanced proportions~\citep{Henden2016}, but a general solution is lacking. The second problem is that little is known about how the rate and relatedness structure of mixed infections relates to underlying epidemiological parameters. Informally, mixed infections will occur when prevalence is high; an observation exploited by \citet{Cerqueira2017} when estimating changes in transmission over time. However, the quantitative nature of this relationship, the key parameters that influence mixed infection rates and how patterns of relatedness relate to infection dynamics are largely unexplored. Finally, an important issue, though not one addressed here, is the sampling design. Malaria parasites may be taken from individuals presenting with disease or as part of a surveillance programme. They are also often highly clustered in time and space. What impact different sampling approaches have on observed genomic variation is not clear. Nevertheless, because mixed infection rates are likely to respond rapidly to changes in prevalence~\citep{volkman2012}, exploring these challenges may render critical insights for malaria control in the field.
Here, we develop, test and apply an enhanced method for strain deconvolution called \texttt{DEploidIBD}, which builds on our previously-published \texttt{DEploid} software. The method separates estimation of strain number, proportions, and relatedness (specifically the identity-by-descent, or IBD, profile along the genome) from the problem of inferring genome sequences. This strategy provides substantial improvements to accuracy when strains are closely related. We apply \texttt{DEploidIBD} to 2,344 field isolates of {\it P. falciparum} collected from 13 countries over a range of years (2001-2014) and available through the Pf3k Project (see Appendix), and characterise the rate and relatedness patterns of mixed infections. In addition, we develop a statistical framework for characterising the processes underlying mixed infections, estimating that nearly half of symptomatic mixed infections arise from the transmission of sibling strains, as well as demonstrating the propagation of mixed infections through multiple cycles of host-vector transmission. Finally, we investigate the relationships between statistics of mixed infection and epidemiological estimates of pathogen prevalence \citep{MAP2017}, showing that, at a global level, regional rates of mixed infection and levels of background IBD are correlated with estimates of malaria parasite prevalence.
\section{Strain deconvolution in the presence of relatedness}
Existing methods for deconvolution of mixed infections typically assume that the different genetic strains present in mixed infections are unrelated. This assumption allows for efficient computation of priors for allele frequencies within samples, either through assuming independence of loci~\citep{Jack2016} or as sequences generated as imperfect mosaics of some (predefined) reference panel~\citep{Zhu2017}. However, when strains are related to each other, and particularly when patterns of IBD vary along the genome (for example through being siblings), the constraints imposed on within-sample allele frequencies through IBD can cause problems for deconvolution methods, which can try to fit complex strain combinations (with relatedness) as simpler configurations (without relatedness). Below we outline the approach we take to integrating IBD into \texttt{DEploid}. Further details are provided in the Appendix.
\subsection{Decoding genomic relatedness among strains}
A common approach to detecting IBD between two genomes is to employ a hidden Markov Model that transitions into and out of IBD states~\citep{Chang2015, Gusev2009, Gusev2011}. We have generalised this approach to the case of $K$ haploid {\it Plasmodium} genomes (strains). In this setting, there are $2^K$ possible genotype configurations, as each of the $K$ strains can be either reference (i.e. same as the reference genome used during assembly), or alternative (i.e. carry a different allele) at a given locus (we assume all variation is bi-allelic). In most cases, if each of the $K$ strains constitutes a unique proportion of the infection, each genotype configuration will produce a distinct alternative within sample allele frequency (WSAF; Figure~\ref{fig:fig1}A), which defines the expected fraction of total sequencing reads that are alternative at a given locus in the sequenced infection.
The effect of IBD among these $K$ strains is to limit the number of distinct genotype configurations possible, in a way that depends on the pattern of IBD sharing. Consider that, for any given locus, the $K$ strains in the infection are assigned to $j \leq K$ possible reference haplotypes. IBD exists when two or more strains are assigned to the same haplotype. In this scenario, the total number of possible patterns of IBD for a given $K$ is equal to $\sum_{j=1}^{K} S(K,j)$, where $S(K,j)$ is the number of ways $k$ objects can be split into $j$ subsets; a Stirling number of the second kind \citep{Ronald1988}. Thus, for two strains, there are two possible IBD states (IBD or non-IBD), for three strains there are five states (all IBD, none IBD and the three pairwise IBD configurations), for four strains there are fifteen states (see Appendix), and so on. We limit analysis to a maximum of four strains for computational efficiency. Finally, for a given IBD state, only $2^j$ rather than $2^K$ genotype configurations are possible, thereby restricting the set of possible WSAF values.
Moving along the genome, recombination can result in changes in IBD state, hence changing the set of possible WSAF values at loci (Figure~\ref{fig:fig1}B). To infer IBD states we use a hidden Markov model, which assumes linkage equilibrium between variants for computational efficiency, with a Gamma-Poisson emission model for read counts to account for over-dispersion (see Appendix). Population-level allele frequencies are estimated from isolates obtained from a similar geographic region. Given the structure of the hidden Markov model, we can compute the likelihood of the strain proportions by integrating over all possible IBD sharing patterns, yielding a Bayesian estimate for the number and proportions of strains (see Appendix~1.4 Implementation details). We then use posterior decoding to infer the relatedness structure across the genome (Figure~\ref{fig:fig1}B). To quantify relatedness, we compute the mean IBD between pairs of strains and statistics of IBD tract length (mean, median and N50, the length-weighted median IBD tract length, Figure~\ref{fig:fig1}C).
In contrast to our previous work, \texttt{DEploidIBD} infers strain structure in two steps. In the first we estimate the number and proportions of strains using Markov Chain Monte-Carlo (MCMC), allowing for IBD as described above. In the second, we infer the individual genomes of the strains, using the MCMC methodology of~\citet{Zhu2017}, which can account for linkage disequilibrium (LD) between variants, but without updating strain proportions. The choice of reference samples for deconvolution is described in~\citet{Zhu2017} and in the Appendix. During this step we do not use the inferred IBD constraints {\em per se}, though the inferred haplotypes will typically copy from the same (or identical) members of the reference panel within the IBD tract.
\begin{figure}[ht]
\begin{center}
\includegraphics[width=\textwidth]{Fig1.pdf}
\caption{Deconvolution of a complex field sample PD0577-C from Thailand. (A) Scatter-plot showing the number of reads supporting the reference (REF: x-axis) and alternative (ALT: y-axis) alleles. The multiple clusters indicate the presence of multiple strains, but cannot distinguish the exact number or proportions. (B) The profile of within-sample allele frequency along chromosomes 11 and 12 (red points) suggests a changing profile of IBD with three distinct strains, estimated to be with proportions of $22\%$, $52\%$ and $26\%$ respectively (other chromosomes omitted for clarity, see Figure~\ref{fig:fig1}-Supplement~\ref{figsupp:ring}); blue points indicate expected allele frequencies within the isolate. However, the strains are inferred to be siblings of each other: green segments indicate where all three strains are IBD (Note: green segments do not appear in this example, but occur in Figure~\ref{fig:strainIBD}); yellow, orange and dark orange segments indicate the regions where one pair of strains are IBD but the others are not. In no region are all three strains inferred to be distinct. (C) Statistics of IBD tract length, in particular illustrating the N50 segment length. A graphical description of the modules and workflows for \texttt{DEploidIBD} is given in Figure~\ref{fig:fig1}-Supplement~\ref{figsupp:workflow}.}\label{fig:fig1}
\end{center}
\figsupp{Whole genome deconvolution of field sample PD0577-C. The outer ring shows the expected within-sample allele frequency (WSAF) (blue) and observed WSAF (red) across the genome. Red and blue points indicate observed and expected allele frequencies within the isolate. The inner ring indicates the IBD states among the three strains: green segments indicate where all three strains are IBD; yellow, orange and dark orange segments indicate the regions where one pair of strains are IBD but the others are not. In no region are all three strains inferred to be distinct, suggesting that the three strains are siblings.}{\includegraphics[width=.7\textwidth]{{myring.ring}.pdf}}
\label{figsupp:ring}
\figsupp{A graphical overview of the data types and work flows for \texttt{DEploidIBD}. The boxes at the bottom represent final outputs of the pipeline. The rectangular boxes indicate when \texttt{DEploidIBD} is executed, with inputs highlighted by blue arrows. The process has three key steps: Step 1. A reference panel for the set of samples is constructed from high confidence clonal haplotypes, either identified from within a study or from an external resource, such as Pf3k. Step 2: \texttt{DEploidIBD}, using population level allele frequencies, is used to infer the number of strains, strain proportions and IBD profile within each sample. Step 3: \texttt{DEploidIBD} is re-run on each sample to infer haplotypes, but with the proportions estimated in Step 2 fixed and this time using the haplotype (LD-aware) method previously implemented in \texttt{DEploid}.}{\includegraphics[width=.8\textwidth]{scheme-v2.png}}
\label{figsupp:workflow}
\end{figure}
\section{Results}
\subsection{Method validation}
\subsubsection{Validation using experimentally generated mixed infections}
We first sought to characterise the behaviour of \texttt{DEploidIBD} and compare its performance to the previously published method, \texttt{DEploid}. To this end, we re-analysed a set of 27 experimentally generated mixed infections \citep{Wendler2015} that had been previously deconvoluted by \texttt{DEploid} \citep{Zhu2017} using \texttt{DEploidIBD} (Figure~\ref*{fig:benchmark}-supplement~\ref*{figsupp:invitro}). These mixed infections were created with combinations of two or three laboratory strains (selected from 3D7, Dd2, HB3 and 7G8), set at varying known proportions \citep{Wendler2015}, and therefore provide a simple framework for evaluating inference of the number of strains ($K$) and their proportions. Since the method allows deconvolution of mixed infections containing up to four strains, we augmented the experimental mixtures by combining all four lab strains {\it in silico} at differing proportions (see Appendix~2.1 {\it In silico} lab mixtures). Using this approach, we found that \texttt{DEploid} and \texttt{DEploidIBD} performed comparably, except in the case of three strains with equal proportions, where LD information is necessary to achieve accurate deconvolution and \texttt{DEploid} performed better. Both \texttt{DEploid} and \texttt{DEploidIBD} struggled to deconvolute our {\it in silico} mixtures of four strains, typically underestimating the number of strains present.
\begin{figure}[htp]
\begin{center}
\includegraphics[width=.9\textwidth]{Fig2.pdf}
\caption{Performance of \texttt{DEploidIBD} and \texttt{DEploid} on 100 {\it in silico} mixtures for each of three different scenarios. From the left to the right, the panels show the strain proportion compositions, distribution of inferred $K$ in a vertically-oriented histogram (top: $K=1$, bottom: $K=4$), using both methods: {\tt DEploid} in orange and {\tt DEploidIBD} in blue, effective number of strains, pairwise relatedness and IBD N50 (the latter two only for {\tt DEploidIBD}). From top to the bottom, cases are ordered from even strain proportions to the most imbalanced composition. Grey points identify experiments of low coverage data (median sequencing depth $<20$), and pink identify cases where $K$ is inferred incorrectly. (A) {\it In silico} mixtures of two African strains with high-relatedness (75\%) for 7,757 (s.d. 178) sites on Chromosome 14, Note that {\tt DEploid} underestimates the minor strain proportion if strains have high relatedness. In the extreme case, {\tt DEploid} misclassifies a $K=2$-mixture as clonal, whereas {\tt DEploidIBD} consistently estimates the correct proportions. (B) {\it In silico} mixtures of two Asian strains with high-relatedness (75\%) for 3,041 sites (s.d. 227) on Chromosome 14, Note that {\tt DEploid} underestimates strain number when the minor strain is low frequency, while {\tt DEploidIBD} typically performs well. (C) {\it In silico} mixtures of three African strains, where each pair is IBD over a distinct third of the chromosome. Note that both methods fail to deconvolute the case of equal proportions. However, for unbalanced mixtures, {\tt DEploidIBD} consistently performs better than {\tt DEploid}.
}\label{fig:benchmark}
\end{center}
\newcounter{fig2Counter}
\setcounter{fig2Counter}{\value{enumi}}
\end{figure}
\begin{figure}[htp]
\setcounter{enumi}{\value{fig2Counter}}
%\setcounter{figsupp}{1}
\figsupp{Validation of {\tt DEploidIBD} using 27 {\it in vitro} lab mixtures and 4 {\it in silico} mixtures. A reference panel of the laboratory strains (3D7, Dd2, HB3 and 7G8; Panel V) was used to deconvolute samples with {\tt DEploid}. Each experiment is performed with and without IBD inference and with the maximum number of 4 strains. Black crosses indicate the true effective number of strains. Coloured crosses ({\tt DEploid} in red, {\tt DEploidIBD} in purple) indicate median values obtained from 30 replicates using the algorithm indicated in the legend. The coloured dots show the inferred effective number of strains across replicates with intensity proportional to fraction. Note one sample where balanced proportions of three strains results in the LD-free ({\tt DEploid-IBD}) approach fitting the data as a mixture of two strains with proportions of $1\slash3$ and $2\slash3$. For {\it in silico } mixtures of four strains, {\tt DEploid} performs poorly. {\tt DEploidIBD} shows some improvement in unbalanced mixtures, though misclassifies $K=4$ mixtures as only having three strains.}{\includegraphics[width=0.6\textwidth]{Fig2inVitro.pdf}}
\label{figsupp:invitro}
\donenum[inline]{Minor Comment 13, on figure supplement 2}
\figsupp{Illustration of simulation study design. We conduct simulation studies to mimic $K$-mixtures (top row) as results of $b$-biting events, where $K \in \{2,3,4\}$ and $1 \leq b \leq K$. For each $K$-mixture, the left column illustrate the overall relationship between strains (black dots): connected dots imply strains are from the same mosquito bite. The level of relatedness between parasite strains is reflected by the haplotype segment copied from the parental strains within the mosquito. Each colour represents a unique strain within the mosquito, which we randomly draw from field clonal haplotypes. For example, when $K=2$, we consider the case that the two strains are from two independent mosquito bites; on the other hand, when two strains are from the same mosquito bite, we consider scenarios of low (25\%), moderate (50\%) and high (75\%) relatedness between two sibling strains. These events are represented in the second, third and forth rows respectively. For $K=3$, we consider mixed-infection events as products of three mosquito bites, two mosquito bites and a single bite. For $K=4$, we consider mixed infections as products of four mosquito bites, three mosquito bites, two mosquito bites and a single bite. We further divide the possibilities of the 2-bite event into the case that both bites pass on two strains (2+2) and the other possibility that one bite passes on a single strain and the other bit passes on three strains (1+3).}{\includegraphics[width = \textwidth]{Fig2SimScheme-v2.pdf}}
\label{figsupp:scheme}
\figsupp{Additional comparison of \texttt{DEploidIBD} and \texttt{DEploid} on 100 {\it in silico} mixtures of two strains from Africa with low and moderate relatedness, illustrated by sub panels (A) and (B), respectively. Detailed panel description can be found in the caption to Figure~\ref{fig:benchmark}. {\tt DEploid} generally performs well for samples of low within sample relatedness, though struggles when the minor strain proportion is below 30\%. In contrast, {\tt DEploidIBD} consitently performs well.}{\includegraphics[width = \textwidth]{Fig2Africa.pdf}}
\label{figsupp:africa-low-moderate}
\figsupp{Additional comparison of \texttt{DEploidIBD} and \texttt{DEploid} on {\it in silico} $b=2$ bite mixtures of $K=3$ strains from Africa and Asia, illustrated by sub panels (A) and (B), respectively. Detailed panel descriptions can be found in the caption to Figure~\ref{fig:benchmark}. The unrelated strain provides a strong signal in allele frequency imbalance for {\tt DEploid} to detect and therefore performs better than dealing with $b=1$ mixtures. Comparing (A) and (B), pairwise relatedness estimates are noisy in Asia because of the background IBD. However, background relatedness generates shorts segments of IBD and therefore leads to IBD N50 underestimation.}{\includegraphics[width = \textwidth]{Fig2k3.pdf}}
\label{figsupp:k3}
\figsupp{Comparison of \texttt{DEploidIBD} and \texttt{DEploid} on 100 {\it in silico} $b=3$ bite mixtures of four strains from Africa. Detailed panel descriptions can be found in the caption to Figure~\ref{fig:benchmark}. {\tt DEploid} performs poorly in all cases. In contrast, {\tt DEploidIBD} performs well when all four strains have unequal proportions, but is less accurate when some strains have equal proportion.}{\includegraphics[width = \textwidth]{Fig2k4.pdf}}
\label{figsupp:k4}
\end{figure}
\subsubsection{Validation against simulated mixed infections}
Validation using mixtures of lab strains has two limitations: (i) the strains comprising the mixed infection were part of the reference panel and (ii) no IBD was present. We therefore investigated the ability of \texttt{DEploidIBD} to recover IBD between strains within a mixed infection, in the context of a realistic reference panel, and with strains typical of those we observe in nature. To achieve this, we designed a validation framework where clonal samples from the Pf3k project were combined {\it in silico } to produce simulated mixed infections, allowing us to create examples with varying numbers of strains and proportions, and to introduce tracts of IBD, by copying selected sections of the genome between strains. Using this framework, we constructed a broad suite of simulated mixed infections, derived from clonal samples from Africa and Asia that were combined into mixtures of 2, 3 and 4 strains with variable proportions and IBD configurations.
We randomly selected 189 clonal samples of African origin and 204 clonal samples of Asian origin from which to construct our simulated mixed infections and restricted the analysis to chromosome 14 to reduce computational time. Starting with mixed infections containing two strains, we randomly took two samples of African or Asian origin and combined them at proportions ranging from highly imbalanced (10\% and 90\%) to exactly balanced (each 50\%) and used copying to produce either no (0\%), low (25\%), medium (50\%) or high (75\%) levels of IBD (note that background IBD between the two clonal strains may also exist). In total this resulted in 4,000 $K=2$ mixed infections, each of which was deconvoluted with \texttt{DEploid} and \texttt{DEploidIBD}. Outputs of \texttt{DEploidIBD} were compared to the true values for each simulated infection, including the inference of $K$, the effective $K_e$ (computed as $K_e = 1/\sum w_{i}^{2}$, where $w_i$ is the proportion of the $i$th strain, thus incorporating proportion inference), the average pairwise relatedness between strains (for $K=2$, this is the fraction of the genome inferred to be IBD), and the inference of IBD tract length, expressed as the IBD N50 metric.
For mixtures of two strains, both \texttt{DEploid} and \texttt{DEploidIBD} performed well in scenarios where the IBD between strains was low (<=25\%). In moderate or high IBD scenarios with imbalanced strain proportions, \texttt{DEploid} tended to underestimate the proportion of the minor strain resulting in underestimation of $K_e$, whereas \texttt{DEploidIBD} was able to infer the proportion of these mixtures correctly (Figure~\ref{fig:benchmark}). The main novelty of \texttt{DEploidIBD} is the calculation of an IBD profile between strains. We found that the IBD summary statistics produced by \texttt{DEploidIBD} were accurate across all two-strain mixed infections tested in Africa. In Asia, \texttt{DEploidIBD} tended to estimate more IBD than was simulated (Figure~\ref{fig:benchmark}B). However, this likely reflects the presence of higher background IBD in Asia rather than systematic error.
To simulate realistic mixed infections containing 3 or 4 strains, we first considered the different transmission scenarios under which they can arise. We modelled a mixed infection of $K$ strains as resulting from $b$ biting events, where $K \in \{3,4\}$ and $1~{\leq}~{b}~{\leq}~K$. When greater than one strain is transmitted in a single biting event, the co-transmitted strains will share IBD, as a consequence of meiosis occurring in the mosquito. Strains transmitted through independent bites, causing superinfection in the host, do not share any IBD beyond background. Following this paradigm, we generated a suite of mixed infection types: $K=3$, $b=1, 2, 3$ and $K=4$, $b=1, 2, 2, 3, 4$ (the first $b=2$ has two strains per bite, the second three and one); and simulated each of these across a variety of proportions, again using sets of clonal samples from Africa and Asia as starting strains.
As with the experimental validation, the balanced-proportion $K=3$ mixed infections generated {\it in silico} proved challenging to deconvolute, with both methods inferring the presence of two rather than three strains (Figure~\ref{fig:benchmark}C). In mixed infections with imbalanced proportions, we found that, in African samples with IBD ($b=1, 2$), \texttt{DEploid} tended to either underestimate the number of strains present, or infer proportions incorrectly. In Asian samples this is less of an issue as the reference panels can provide better prior information for deconvolution due to lower diversity. In contrast, \texttt{DEploidIBD} consistently gave the correct number of strains and proportions in such cases, and produced IBD statistics that were accurate as long as the median coverage of simulated infections was $>20$x. Both methods struggled to deconvolute mixed infections of four strains (Figure~\ref*{fig:benchmark}-supplement~\ref*{figsupp:invitro}), although performed better (i.e. inferred $K=4$ greater than 50\% of the time) for mixtures with less IBD ($b=3, 4$). However, even in these cases, estimates of the proportions and IBD statistics were variable, indicating that further work is needed before $K=4$ mixed infections can be reliably deconvoluted.
Finally, we used the \emph{in silico} approach to explore the quality of haplotypes inferred by \texttt{DEploidIBD}, focusing on $K=2$ infections across variable proportions. We compared the haplotype inferences between \texttt{DEploid} and \texttt{DEpoloidIBD} using the error model described in the Appendix, and found that rates of genotype error are similar for the two approaches in settings of low relatedness ({\tt DEploidIBD} has an error rate of 0.7\% per site per strain for 20/80 mixtures and 1.4\% for 50/50 mixtures). However, for the 20/80\% mixtures with high relatedness, genotype error for \texttt{DEploid} increased to 1.8\%, while remaining at 0.8\% for \texttt{DEploidIBD} (Figure~\ref{fig:error-analysis}A). Switch errors in haplotype estimation are comparable between the two methods and decrease with increasing relatedness due to higher homozygosity (Figure~\ref{fig:error-analysis}B). Finally, we identified a simple metric to compute on inferred haplotypes that can identify low quality haplotypes (see Appendix),
\begin{figure}[htp]
\begin{center}
\includegraphics[width=\textwidth]{Fig3.pdf}
\caption{Cumulative distribution of the average per site genotype error (left) and switch error (right) across simulated mixtures (measured at sites that are heterozygous in the sample or sample-specific reference panel). (A) Error rates of Asian {\it in silico} samples of three levels of IBD (25\%, 50\% and 75\%) for a $K=2$ mixture with proportions of $20/80\%$. Because {\tt DEploidIBD} estimates proportions more accurately, it enables better haplotype inference.
(B) Error rates of African {\it in silico} samples of three levels of IBD (25\%, 50\% and 75\%) for a $K=2$ mixture with proportions of $20/80\%$. Inference in Asia benefits from better reference panels (due to lower overall diversity) and therefore gives lower error rates than in Africa.
(C) {\tt DEploidIBD} error rates for African {\it in silico} samples of three mosquito biting scenarios for a $K=3$ mixture with proportions of $10/10/80\%$. The additional strain increases the difficulty of haplotype inference, particularly in the case of three independent bites.
} \label{fig:error-analysis}
\end{center}
\end{figure}
\subsection{Geographical variation in mixed infection rates and relatedness}
To investigate how the rate and relatedness structure of mixed infections varies among geographical regions with different epidemiological characteristics, we applied \texttt{DEploidIBD} to 2,344 field samples of {\it P. falciparum} released by the Pf3k project \citep{pf3k}. These samples were collected under a wide range of studies with differing designs, though the majority of samples were collected from symptomatic individuals seeking clinical treatment. An important exception are the samples from Senegal which, though collected passively at a clinic, were screened to contain only one strain by SNP barcode \citep{Daniels2015}. A summary of the data sources is presented in Table~\ref{tab:Pf3k} and full details regarding study designs can be found at \url{https://www.malariagen.net/projects/pf3k#sampling-locations}. Details of data processing are given in the Appendix. For deconvolution, samples were grouped into geographical regions by genetic similarity; four in Africa, and three in Asia. (Table~\ref{tab:Pf3k}). Reference panels were constructed from the clonal samples found in each region. Since previous research has uncovered strong population structure in Cambodia \citep{Miotto2013}, we stratified samples into West and North Cambodia when performing analysis at the country level. Diagnostic plots for the deconvolution of all samples can be found at \url{https://github.com/mcveanlab/mixedIBD-Supplement} and inferred haplotypes can be accessed at \url{ftp://ngs.sanger.ac.uk/production/pf3k/technical_working/release_5/mixedIBD_paper_haplotypes/}. We identified 787 samples where low sequencing coverage or the presence of low-frequency strains resulted in unusual haplotypes (see Appendix). Estimates of strain number, proportions and IBD states from these samples are used in subsequent analyses, but not the haplotypes. We also confirmed that reported results are not affected by the exclusion of samples with haplotypes with low confidence (data not shown). In all following analyses, only strains present with a proportion of >1\% in a sample are reported.
\begin{table}[btp]
\caption{Summary of Pf3k samples.}\label{tab:Pf3k}
{\small
\begin{tabular}{p{1.3cm} c p{1.8cm} c | c c c c p{2.7cm}}
\hline
\hline
%& & & & & & & & \\
Country & Year &Location & $PfPR$ & $ss$ & $\bar{D}$ (s.e.) & $\bar{\rho}$ & $\bar{K_e}$& Reference\\
\hline
Gambia & 2008 & Brikam & 0.06 & 65 & 129 ( 9.4 ) & \textcolor{black}{0.5}\donenum[inline]{was 0.53} & 1.3 &\citet{Amambua-Ngwa2012}\\
\hline
Ghana & 2009 & Navrongo & 0.79 & 121 & 86 ( 5.7 )& \textcolor{black}{0.21}\donenum[inline]{was 0.25} & 1.6&\multirow{3}{*}{\parbox{3.4cm}{\citet{Duffy2015,Kamau2015,eLife2016}}}\\
& 2010 & Navrongo & 0.79 & 171 & 127 ( 10.3 )& \textcolor{black}{0.23}\donenum[inline]{was 0.26} & 1.5 &\\
& 2011 & Navrongo & 0.72 & 97 & 76 ( 5.3 )& \textcolor{black}{0.21}\donenum[inline]{was 0.24} & 1.5& \\
& & Kintampo & 0.58 & 6 & 89 ( 13.5 )& \textcolor{black}{0.11}\donenum[inline]{was 0.16} & 1.5&\\
& 2012 & Navrongo & 0.52 & 47 & 111 ( 3.8 )& \textcolor{black}{0.29}\donenum[inline]{was 0.31} & 1.6&\\
& & Kintampo & 0.41 & 40 & 157 ( 8.1 )& \textcolor{black}{0.22}\donenum[inline]{was 0.25} & 1.6&\\
& 2013 & Navrongo & 0.31 & 88 & 119 ( 4 )& \textcolor{black}{0.26}\donenum[inline]{was 0.29} & 1.6&\\
& & Kintampo & 0.29 & 4 & 172 ( 38.4 )& \textcolor{black}{0.44}\donenum[inline]{was 0.53} & 1.1&\\
\hline
Malawi & 2011 & Chikwawa & 0.19 & 230 & 101 ( 3 )& \textcolor{black}{0.26}\donenum[inline]{was 0.28} & 1.7 &\citet{Ocholla2014}\\
& & Zomba & 0.34 & 35 & 89 ( 9.1 )& \textcolor{black}{0.24}\donenum[inline]{was 0.28} & 1.6 &\\
\hline
Mali & 2007 & Bandiagara & 0.43 & 9 & 95 ( 25.2 )& \textcolor{black}{0.39}\donenum[inline]{was 0.39} & 1.8 &\multirow{3}{*}{\parbox{3.4cm}{\citet{Mobegi2014,eLife2016}}}\\
& & Faladje & 0.37 & 36 & 75 ( 10.1 )& \textcolor{black}{0.27}\donenum[inline]{was 0.34} & 1.3 &\\
& & Kolle & 0.21 & 51 & 82 ( 10.5 )& \textcolor{black}{0.3}\donenum[inline]{was 0.36} & 1.6 &\\
\cline{1-8}
Guinea & 2011 & Nzerekore & 0.49 & 97 & 77 ( 4.6 )& \textcolor{black}{0.17}\donenum[inline]{was 0.21} & 1.4 &\\
\cline{1-8}
Congo DR & 2013 & Kinshasa & 0.24 & 113 & 49 ( 3.2 )& \textcolor{black}{0.31}\donenum[inline]{was 0.36} & 1.5\\
\hline
Senegal & 2004 & Thies & 0.09 & 2 & 130 ( 68.2 )& \textcolor{black}{0.01}\donenum[inline]{was 0.03} & 1.4 &\citet{Wong2017}\\
& 2009 & Thies & 0.04 & 43 & 175 ( 14.9 )& \textcolor{black}{0.43}\donenum[inline]{was 0.47} & 1.1 &\\
& 2010 & Thies & 0.04 & 24 & 159 ( 9.7 )& \textcolor{black}{0.3}\donenum[inline]{was 0.36} & 1.3 &\\
& 2011 & Thies & 0.03 & 32 & 97 ( 6 )& \textcolor{black}{0.33}\donenum[inline]{was 0.4} & 1.1 &\\
\hline
\hline
West & 2009 & Pursat & 0.0071 & 19 & 75 ( 8.8 )& \textcolor{black}{0.39}\donenum[inline]{was 0.39} & 1.3 &\multirow{3}{*}{\parbox{3.4cm}{\citet{Amato2017, eLife2016}}}\\
Cambodia & 2010 & Pursat & 0.0071 & 105 & 95 ( 6.8 )& \textcolor{black}{0.65}\donenum[inline]{was 0.65} & 1.2&\\
& 2011 & Pailin & 0.0025 & 49 & 54 ( 4.1 )& \textcolor{black}{0.43}\donenum[inline]{was 0.43} & 1.1&\\
& & Pursat & 0.0096 & 103 & 49 ( 3.1 )& \textcolor{black}{0.63}\donenum[inline]{was 0.63} & 1.2&\\
& 2012 & Pailin & 0.00096 & 31 & 46 ( 5.6 )& \textcolor{black}{0.43}\donenum[inline]{was 0.43} & 1.0&\\
& & Pursat & 0.0079 & 7 & 37 ( 19.1 )& \textcolor{black}{0.58}\donenum[inline]{was 0.58} & 1.4&\\
\cline{1-8}
North & 2010 & Ratanakiri & 0.0039 & 50 & 71 ( 6.1 )& \textcolor{black}{0.43}\donenum[inline]{was 0.44} & 1.3&\\
Cambodia & 2011 & Preah Vihear & 0.02 & 73 & 51 ( 5.3 )& \textcolor{black}{0.36}\donenum[inline]{was 0.36} & 1.2&\\
& & Ratanakiri & 0.0032 & 81 & 45 ( 4.3 )& \textcolor{black}{0.47}\donenum[inline]{was 0.48} & 1.4&\\
& 2012 & Preah Vihear & 0.0075 & 30 & 43 ( 6.7 )& \textcolor{black}{0.37}\donenum[inline]{was 0.38} & 1.0&\\
& & Ratanakiri & 0.0016 & 15 & 44 ( 8.9 )& \textcolor{black}{0.3}\donenum[inline]{was 0.32} & 1.3&\\
\hline
Thailand & 2011 & Mae Sot & 0.00011 & 35 & 66 ( 7.5 )& \textcolor{black}{0.35}\donenum[inline]{was 0.35} & 1.2 &\multirow{3}{*}{\parbox{3.4cm}{\citet{Miotto2013, eLife2016}}}\\
& & Sisakhet & 1e-04 & 5 & 112 ( 25.4 )& \textcolor{black}{0.17}\donenum[inline]{was 0.17} & 1.3 &\\
& 2012 & Mae Sot & 5.7e-05 & 69 & 83 ( 4.9 )& \textcolor{black}{0.58}\donenum[inline]{was 0.59} & 1.3 &\\
& & Ranong & 0.00018 & 11 & 82 ( 12.4 )& \textcolor{black}{0.38}\donenum[inline]{was 0.34} & 1.2 &\\
& & Sisakhet & 0 & 13 & 89 ( 13 )& \textcolor{black}{0.37}\donenum[inline]{was 0.37} & 1.1 &\\
& 2013 & Sisakhet & 0 & 3 & 62 ( 8.8 )& \textcolor{black}{0.09}\donenum[inline]{was 0.09} & 1.2&\\
\cline{1-8}
Bangladesh & 2012 & Ramu & 0.0021 & 50 & 53 ( 4.2 )& \textcolor{black}{0.45}\donenum[inline]{was 0.49} & 1.5 &\\
\cline{1-8}
Viet Nam & 2011 & Bu Gia Map & 0.0073 & 43 & 67 ( 5 )& \textcolor{black}{0.43}\donenum[inline]{was 0.44} & 1.3&\\
& & Phuoc Long & 0.0053 & 27 & 68 ( 7.2 )& \textcolor{black}{0.37}\donenum[inline]{was 0.38} & 1.2&\\
& 2012 & Bu Gia Map & 0.0072 & 19 & 115 ( 8 )& \textcolor{black}{0.67}\donenum[inline]{was 0.67} & 1.1&\\
& & Phuoc Long & 0.0048 & 5 & 107 ( 6.3 )& \textcolor{black}{0.81}\donenum[inline]{was 0.82} & 1.2&\\
\cline{1-8}
Myanmar & 2011 & Bago Division & 0.0076 & 12 & 59 ( 7.1 )& \textcolor{black}{0.24}\donenum[inline]{was 0.26} & 1.2 &\\
& 2012 & Bago Division & 0.0084 & 47 & 62 ( 5.2 )& \textcolor{black}{0.45}\donenum[inline]{was 0.46} & 1.2 &\\
\cline{1-8}
Laos & 2011 & Attapeu & 0.0094 & 59 & 71 ( 4.2 )& \textcolor{black}{0.36}\donenum[inline]{was 0.37} & 1.4 &\\
& 2012 & Attapeu & 0.02 & 25 & 77 ( 7.2 )& \textcolor{black}{0.68}\donenum[inline]{was 0.69} & 1.3 &\\
\hline
\hline
\end{tabular}
}
\tabledata{Summary of Pf3k samples in data release 5.1, where $D$ denotes mean read depth and $ss$ is sample size. Genotyping, including both indel and SNP variants, was performed using a pipeline based on GATK best practices, see Methods. Data available from \url{ftp://ngs.sanger.ac.uk/production/pf3k/release_5/5.1}. $PfPR$ is the inferred parasite prevalence rate in a 5 $\times$ 5 km resolution grid from the MAP project, centred at the Pf3k sample collection sites; Relatedness $\rho$ and effective number of strains $K_{e}$ are summary metrics from {\tt DEploidIBD} output.}
\end{table}
We find substantial variation in the rate and relatedness structure of mixed infections across continents and countries. Within Africa, rates of mixed infection vary from \textcolor{black}{29}\% in The Gambia to 63\% in Malawi (Figure~\ref{fig:mixInfPlot}A). Senegal has a rate of mixed infection (18\%) lower than The Gambia, however as these samples were screened by SNP barcode to be clonal, this rate should be an underestimate. In Southeast Asian samples, mixed infection rates are in general lower, though also vary considerably; from 21\% in Thailand to 54\% in Bangladesh. Where data for a location is available over multiple years, we find no evidence for significant fluctuation over time (though we note that these studies are typically not well powered to see temporal variations). We observe that between 5.1\% (Senegal) and 40\% (Malawi) of individuals have infections carrying more than two strains.
\begin{figure}[htp]
\begin{center}
\includegraphics[width=\textwidth]{Fig4.pdf}
\caption{Characterisation of mixed infections across 2,344 field samples of {\it Plasmodium falciparum}. (A) The fraction of samples, by population, inferred by \texttt{DEploidIBD} to be $K=1$ (clonal), $K=2$ (dual), $K=3$ (triple), or $K=4$ (More than 3). Populations are ordered by rate of mixed infections within each continent. We use shaded regions to indicate the distribution of 787 samples that have low-confidence deconvoluted haplotypes. Senegal is marked with an asterisks as these samples were screened to be clonal. (B) The distribution of average pairwise IBD sharing within mixed infections (including dual, triple and quad infections), broken down into unrelated (where the fraction of the genome inferred to be IBD, $\rho$, is $< 0.1$), low IBD ($0.1 \geq \rho < 0.3)$, sib-level ($0.3 \geq \rho <0.7$) and high ($\rho \geq 0.7$). Stars indicate the average IBD scaled between 0 and 1 from bottom to the top. Populations follow the same order as in Panel A. (C) The relationship between the rate of mixed infection and level of IBD. Populations are coloured by continent, with size reflecting sample size and error bars showing $\pm 1$ s.e.m.. The dotted line shows the slope of the regression from a linear model. Abbreviations: SN-Senegal, GM-The Gambia, NG-Nigeria, GN-Guinea, CD-The Democratic Republic of Congo, ML-Mali, GH-Ghana, MW-Malawi, MM-Myanmar, TH-Thailand, VN-Vietnam, KH-Cambodia, LA-Laos, BD-Bangladesh.} \label{fig:mixInfPlot}
\end{center}
\end{figure}
Relatedness between samples and populations also varies substantially. In dual infections, the average fraction of the genome inferred to be IBD ranges from 14\% in Guinea to 65\% in West Cambodia (Figure~\ref{fig:mixInfPlot}B). Asian populations show, on average, a higher level of relatedness within dual infections (\textcolor{black}{44}\donenum{change number, was 48}\%) compared to African populations ( \textcolor{black}{26}\donenum{change number, was 29}\%). Levels of IBD in samples with three or more strains are comparable to those seen in dual infections (average IBD being \textcolor{black}{45}\donenum{change number, was 50}\% in Asia and \textcolor{black}{37}\donenum{change number, was 29}\% in Africa) and significantly correlated at the country level, with correlation of \textcolor{black}{0.75}\donenum{change number, was 0.76} (P = \textcolor{black}{0.0019}\donenum{change number, was 0.0017}, weighted by the number of mixed samples). Overall, \textcolor{black}{51}\donenum{change number, was 53}\% of all mixed infections involve strains with over 30\% of the genome being IBD.
We next considered the relationship between mixed infection rate and the level of IBD. We find that populations with higher rates of mixed infection tend to have lower levels of IBD within mixed infections (linear model P = 0.06 after accounting for a continental level difference and weighted by sample size). However, the continental level effect is driven by Senegal, which has an unusual combination of low mixed infections and also low IBD. Excluding Senegal, we find a consistent pattern across populations (Figure~\ref{fig:mixInfPlot}C), with a strong negative correlation between mixed infection rate and the level of IBD (Pearson \textcolor{black}{$r = -0.65$}\donenum{change number, was $r = -0.84$}, P = $3\times10^{-4}$). Previous work has demonstrated how a recent and dramatic decline in \emph{P. falciparum} prevalence within Senegal has left an impact on patterns of genetic variation \citep{Daniels2015}, which may explain its unusual profile.
\begin{figure}[ht]
\begin{center}
\includegraphics[width=.9\textwidth]{Fig5.pdf}
\caption{Example IBD profiles in mixed infections. Plots showing the ALT versus REF plots (left hand side) and inferred IBD profiles along the genome for five strains of differing composition. From top to bottom: A dual infection of highly related strains ($\rho = 0.84$); a dual infection of two sibling strains ($\rho = 0.6$); a triple infection of three sibling strains (note the absence of stretches without IBD); a triple infection of two related strains and one unrelated strain; and a triple infection of three unrelated strains. The numbers below the sample IDs indicate the average pairwise IBD, $r$, the mean length of IBD segments, $l$, in kb and the inferred number of distinct strains, $K$, respectively.}\label{fig:strainIBD}
\end{center}
\end{figure}
\subsection{Inferring the origin of IBD in mixed infections}
The high levels of IBD observed in many mixed infections suggest the presence of sibling strains (Figure~\ref{fig:strainIBD}). To quantify the expected IBD patterns between siblings, we developed a meiosis simulator for {\it P. falciparum} ({\tt pf-meiosis}), incorporating relevant features of malaria biology that can impact the way IBD is produced in a mosquito and detected in a human host. Most importantly, a single infected mosquito can undergo multiple meioses in parallel, one occurring for each oocyst that forms on the mosquito midgut \citep{Gosh2000}. In a mosquito infected with two distinct strains, each oocyst can either self (the maternal and paternal strain are the same) or outbreed (the maternal and paternal strains are different). We model a $K = n$ mixed infection as a sample of $n$ strains (without replacement, as drawing identical strains yields $K = n-1$) from the pool of strains created by all oocysts. Studies of wild-caught \textit{Anopheles Gambiae} suggest that the distribution of oocysts is roughly geometric, with the majority of infected mosquitoes carrying only one oocyst \citep{Beir1991,Collins1984}. In such a case, we find that a $K=2$ infection will have an expected IBD of $1/3$, consistent with the observations of \citet{Wong2018}. Conditioning on at least one progeny originating from an outbred oocyst (such that a detectable recombination event has occurred), the expected IBD asymptotically approaches $1/2$ as the total number of oocysts grows (see Appendix).
Using this simulation framework, we sought to classify observed mixed infections based on their patterns of IBD. We used two summary statistics to perform the classification: mean IBD segment length and IBD fraction. We built empirical distributions for these two statistics for each country in Pf3k, by simulating meiosis between pairs of clonal samples from that country. In this way, we control for variation in genetic diversity (as background IBD between clonal samples) in each country. Starting from a pair of clonal samples ($M=0$, where $M$ indicates the number of meioses that have occurred), we simulated three successive rounds of meiosis ($M=1, 2, 3$), representing the creation and serial transmission of a mixed infection (Figure~\ref{fig:classify}A). Each round of meiosis increases the amount of observed IBD. For example, in Ghana, the mean IBD fraction for $M=0$ was 0.002, for $M=1$ was 0.41, for $M=2$ was 0.66, and for $M=3$ was 0.80 (Figure~\ref{fig:classify}B). West Cambodia, which has lower genetic diversity, had a mean IBD fraction of 0.08 for $M=0$ and consequently, the mean IBD fractions for higher values of $M$ were slightly increased, to 0.46, 0.68, 0.81 for $M=1, 2$ and $3$, respectively (Figure~\ref{fig:classify}B).
With these simulated distributions, we used Naive Bayes to classify $k=2$ mixed infections in Pf3k (Figure~\ref{fig:classify}C). Of the 393 $k = 2$ samples containing only high-quality haplotypes (see Appendix), 325 (83\%) had IBD statistics that fell within the range observed across all simulated $M$. Of these, nearly half (183, 47\%) were classified as siblings ($M > 0$). Moreover, we observe geographical differences in the rate at which sibling and unrelated mixed infections occur. Notably, in Asia a greater fraction of all mixed infections contained siblings (59\% vs. 41\% in Africa), driven by a higher frequency of $M=2$ and $M=3$ mixed infections (Figure~\ref{fig:classify}D). Mixed infections classified as $M>1$ are produced by serial co-transmission of parasite strains, i.e. a chain of mixed infections along which IBD increases.
\begin{figure}[htp]
\begin{center}
\includegraphics[width=\textwidth]{Fig6_corrected.pdf}
\caption{Identifying sibling strains within mixed infections. (A) Schematic showing how IBD fraction and IBD segment length distributions are created for $k=2$ mixed infections using \texttt{pf-meiosis}. Two clonal samples from a given country are combined to create an unrelated ($M=0$, where $M$ is number of meioses that have occurred) mixed infection. The $M=0$ infection is then passed through 3 rounds of \texttt{pf-meioses} to generate $M=1,2,3$ classes, representing serial transmission of the mixed infection ($M=1$ are siblings). (B) Simulated IBD distributions for $M=0,1,2,3$ for Ghana (top) and West Cambodia (bottom). A total of 10,000 mixed infections are simulated for each class, from 500 random pairs of clonal samples. (C) Classification results for 393 $K=2$ mixed infections from 13 countries. Undetermined indicates mixed infections with IBD statistics that were never observed in simulation. (D) Breakdown of class percentage by continent. Total number of samples is given above bars. Colours as in panel C ($M=0$, grey; $M=1$, purple; $M=2$, pink; $M=3$, orange; Undetermined, black). (E) Same as (D), but by country. Abbreviations as in Figure~\ref{fig:mixInfPlot}.}
\label{fig:classify}
\end{center}
\end{figure}
\subsection{Characteristics of mixed infections correlate with local parasite prevalence}
To assess how characteristics of mixed infections relate to local infection intensity, we obtained estimates of {\it P. falciparum} prevalence (standardised as $PfPR_{2-10}$, prevalence in the 2-to-10 year age range) from the Malaria Atlas Project \citep[see Table~\ref{tab:Pf3k}]{MAP2017}. The country-level prevalence estimates range from 0.01\% in Thailand to 55\% in Ghana, with African countries having up to two orders of magnitude greater values than Asian ones (mean of 36\% in Africa and 0.6\% in Asia). However, seasonal and geographic fluctuations in prevalence mean that, conditional on sampling an individual with malaria, local prevalence may be much higher than the longer-term (and more geographically widespread) country-level average, hence we extracted the individual pixel-level estimate of prevalence (corresponding to a 5x5km region) from MAP nearest to each genome collection point. We summarise mixed infection rates by the average effective number of strains, which reflects both the number and proportion of strains present.
Given that samples from Senegal were screened to be primarily single-genome \citep{Daniels2015}, we computed all correlations with prevalence including ($r_{S+}$) and excluding them ($r_{S-}$; Figure~\ref{fig:model}). We find that the effective number of strains is a significant predictor of $PfPR_{2-10}$ globally ($r_{S+} = 0.65, p < 10^{-5}$) and in African populations when Senegal is included ($r_{S+} = 0.48, p=0.04$; $r_{S-} = 0.18, p=0.51$), but is uncorrelated across Asia. Similarly, $PfPR_{2-10}$ is negatively correlated with background IBD globally ($r_{S+} = -0.43, p=0.004$) and across Africa but not in Asia. Surprisingly, the amount of IBD observed within $k=2$ mixed infections was not correlated with prevalence in Africa or Asia. The rate of sibling infection ($M=1$) is not correlated with the parasite prevalence (Asia: $r_{S+} = 0.23, p=0.2$, Africa: $r_{S+} = 0.16, p=0.5$). However, the rate of supersiblings ($k=2, M>1$) is significantly correlated with $PfPR_{2-10}$ ($r_{S+} = -0.31, p=0.04$) at the global scale, suggesting that serial co-transmission may occur more readily in low prevalence regions.
\begin{figure}[htp]
\centering{}
\includegraphics[width=\textwidth]{Fig7_mixed-prevalence_v4-2.pdf}
\caption{The relationship between {\it P. falciparum} prevalence and characteristics of mixed infection. Four mixed infection statistics are shown including the average effective number of strains (Effective k, first column), given by $k_e = (\sum w_i^2)^{-1}$, where $w_i$ is the proportion of the $i$th strain; background IBD observed between clonal samples (Background Fraction IBD, second column); fraction IBD within $K=2$ mixed infections (Fraction IBD, third column); and the rate of $K=2$ mixed infections classified as having $M>1$ (Supersibling Rate, fourth column). Each point relates to a row in Table~1 from different sampling locations and years. Pearson's $r$ is computed globally (shown at top in a grey box for each statistic), across Asian countries (upper panel) and across African countries (lower panel). Globally and for Africa, the correlations were computed including Senegal ($r_{S+}$) and excluding Senegal ($r_{S-}$). The slope and confidence intervals for the regression line excluding Senegal are drawn. Significant correlations ($p < 0.05$) are highlighted in red and significance levels indicated by asterisks (* $<0.05$, ** $<0.01$, *** $<0.001$)}
\donenum[inline]{Minor Comment 2, on unlabelled points in Fig 6}
\label{fig:model}
\end{figure}
\section{Discussion}
It has long been appreciated that mixed infections are an integral part of malaria biology. However, determining the number, proportions, and haplotypes of the strains that comprise them has proven a formidable challenge. Previously we developed an algorithm, \texttt{DEploid}, for deconvoluting mixed infections \citep{Zhu2017}. However, we subsequently noticed the presence of mixed infections with highly related strains in which the algorithm performed poorly, particularly with low-frequency minor strains. Mixed infections containing highly related strains represent an epidemiological scenario of particular interest, because they are likely to have been produced from a single mosquito bite, itself multiply infected, and in which meiosis has occurred to generate sibling strains. Thus, we developed an enhanced method, \texttt{DEploidIBD}, capable not only of deconvoluting highly related mixed infections, but also inferring IBD segments between all pairs of strains present in the infection. Validation work using simulated mixed infections illustrated that \texttt{DEploidIBD} performs well on infections of two or three strains and across a wide-range of IBD levels. We note that limitations and technical difficulties remain, including deconvoluting infections with more than three strains, handling mixed infections with highly symmetrical or asymmetrical strain proportions (e.g. $K=3$ with strains at 33\%, or $K=2$ with one strain at 2\%), analysing data with multiple infecting species, coping with low-coverage data, and selecting appropriate reference panels from the growing reference resources.
The application of \texttt{DEploidIBD} to the 2,344 samples in the Pf3k project has revealed the extent and structure of relatedness among malaria infections and how these characteristics vary between geographic locations. We found that 1,026 (44\%) of all samples in Pf3k were mixed, being comprised of 480 $K=2$ infections, 372 $K=3$ and 127 $K=4$ infections. Across the entire data set, the total number of genomes extracted from mixed infections is nearly double the number extracted from clonal infections (2,584 genomes from $K>1$ vs. 1,365 from $K=1$). We also found considerable variation, between countries and continents in the characteristics of mixed infections, suggesting that they are sensitive to local epidemiology. Previous work has highlighted the utility of mixed infection rate in discerning changes in regional prevalence, and we re-enforce that finding here, observing a significant correlation between the effective number of strains and parasite prevalence across Pf3k collection sites. Similarly, using \texttt{DEploidIBD} we also observe significant geographical variation in the relatedness profiles of strains within mixed infections. Interestingly, this variation is structured such that regions with high rates of mixed infection tend to contain strains that are less related, resulting in a significant negative correlation between mixed infection rate and mean relatedness within those infections.
The ability to identify the extent and genomic structure of IBD enables inference of the mechanisms by which mixed infections can arise. A mixed infection of $K$ strains can be produced by either $K$ independent infectious bites or by $j < K$ infectious bites. In the first case, parasites are delivered by separate vectors and no meiosis occurs between the distinct strains, thus any IBD observed in the mixed infection must have pre-existed as background IBD between the individual strains. In the second case, meiosis may occur between strains, resulting in long tracts of IBD. The exact amount of IBD produced by meiosis is a random variable, dependent on outcomes of meiotic processes, such as the number of recombination events, the distance between them, and the segregation of chromosomes. Importantly, the mean IBD produced during meiosis in \textit{P. falciparum} also depends on the number and type (selfed vs outbred) of oocysts in the infectious mosquito. Here, we have shown, from first principles, that the amount of IBD expected in a single-bite mixed infection produced from two unrelated parasites strains will always be slightly less than $1/2$, and potentially as low as $1/3$ (see Appendix).
To quantify the distribution of IBD statistics expected through different mechanisms of mixed infection, we developed a Monte Carlo simulation tool, \texttt{pf-meiosis}, which we used to infer the recent transmission history of individuals with dual ($K=2$) infections. We considered mixed infection chains, in which $M$ successive rounds of meiosis, transmission to host, and uptake by vector can result in sibling strain infections with very high levels of IBD. Using this approach, we found that 47\% of dual infections within the Pf3k Project likely arose through co-transmission events. Moreover, and particularly within Asian population samples, we found evidence for long mixed infection chains ($M>1$), representing repeated co-transmission without intervening superinfection. This observation is not a product of lower genetic diversity in Asia, as differences in background IBD between countries have been controlled for in the simulations. Rather, it reflects true differences in transmission epidemiology between continents. These findings have three important consequences. Firstly, they suggest that successful establishment of multiple strains through a single infection event is a major source of mixed infection. Second, they imply that the bottlenecks imposed at transmission (to host and vector) are relatively weak. Finally, they indicate that the differing mechanisms causing mixed infections reflect aspects of local epidemiology.
We note that a non-trivial fraction (17\%) of all mixed infections had patterns of IBD inconsistent with the simulations (typically with slightly higher IBD levels than background but lower than among siblings). We suggest three possible explanations. A first is that the unclassified samples result from the IBD profiles produced by DEploidIBD, in particular the overestimation of short IBD tracks, similar to the issue observed by \citep{Wong2018}. Alternatively, our estimate of background IBD, generated by combining pairs of random clonal samples from a given country into an artificial $M=0$ mixed infection, will underestimate true background IBD if there is very strong local population structure. Finally, we only simulated simple mixed infection transmission chains, at the exclusion of more complex transmission histories, such as those involving strains related at the level of cousins. The extent to which such complex histories can be inferred with certainty remains to be explored.
Lastly, our results show that the rate and relatedness structure of mixed infections correlate with estimated levels of parasite prevalence, at least within Africa, where prevalence is typically high \citep{SMITH199355}. In Asia, which has much lower overall prevalence, as well as greater temporal (and possibly spatial) fluctuations, we do not observe such correlations. However, it may well be that other genomic features that we do not consider in this work could provide much higher resolution, in space and time, for capturing changes in prevalence than traditional methods. Testing this hypothesis will lead to a much greater understanding of how genomic data can potentially be used to inform global efforts to control and eradicate malaria.
\section{Methods and Materials}
The data analysed within this paper were collected and made openly available to researchers by member of the Pf3k Consortium. Information about studies within the data set can be found at \url{https://www.malariagen.net/projects/pf3k#sampling-locations}. Detailed information about data processing can be found at \url{https://www.malariagen.net/data/pf3k-5}. Briefly, field isolates were sequenced to an average read depth of 86 (range 12.6 -- 192.5). After removing human-derived reads and mapping to the 3D7 reference genome, variants were called using GATK best practice and approximately one million variant sites were genotyped in each isolate. After filtering samples for low coverage and cross-species contamination, 2,344 samples remained. The Appendix provides details on the filters used and data availability. For deconvolution, samples were grouped into geographical regions by genetic similarity; four in Africa, and three in Asia. (Table~\ref{tab:Pf3k}). Reference panels were constructed from the clonal samples found at each region. Since previous research has uncovered severe population structure in Cambodia \citep{Miotto2013}, we stratified samples into West and North Cambodia when performing analysis at the country level.
\section{Acknowledgements}
This study was supported by the Wellcome Trust (206194, 090770, 204911, 100956/Z/13/Z to GM), the Medical Research Council (G0600718), the UK Department for International Development (M006212) and the Li Ka Shing Foundation (to GM). This study used data from the MalariaGEN Pf3k Project. Genome sequencing was done by the Wellcome Sanger Institute (WSI), and sample collections were coordinated by the MalariaGEN Resource Centre. The samples from Senegal were was supported by funding from the Bill and Melinda Gates Foundation to Dyann Wirth, and sequenced by the Broad Institute. We thank the staff of the WSI Sample Logistics, Sequencing, and Informatics facilities for their contribution; all patients and collaborators contributing samples and data to the Pf3k project. We acknowledge and appreciate the critical role of administrator team and research computing support team, especially Victoria Cornelius for keeping non-scientific business running smoothly.
\section{Data availability}
Metadata on samples is available from \url{ftp://ngs.sanger.ac.uk/production/pf3k/release_5/pf3k_release_5_metadata_20170804.txt.gz}. Sequence data (aligned to {\emph Plasmodium falciparum} strain 3D7 v3.1 reference genome sequences, for details see \url{ ftp://ftp.sanger.ac.uk/pub/project/pathogens/gff3/2015-08/Pfalciparum.genome.fasta.gz}) is available from \url{ftp://ngs.sanger.ac.uk/production/pf3k/release_5/5.1/}. Diagnostic plots for the deconvolution of all samples can be found at \url{https://github.com/mcveanlab/mixedIBD-Supplement} and deconvoluted haplotypes can be accessed at \url{ftp://ngs.sanger.ac.uk/production/pf3k/technical_working/release_5/mixedIBD_paper_haplotypes/}. Code implementing the algorithms described in this paper, \texttt{DEploidIBD}, is available at \url{https://github.com/DEploid-dev/DEploid}. Code to generate {\it in silico} lab mixture of 4 strains are available at \url{https://github.com/DEploid-dev/DEploid-Data-Benchmark-in_silico_lab_mixed_4s}. Code to generate {\it in silico} field mixtures of 2, 3, 4 strains are available at \url{https://github.com/DEploid-dev/DEploid-Data-Benchmark-in_silico_field}.
\section{Disclosure Declaration}
None declared.
\appendix
\begin{appendixbox}
\section{Deconvolution in the presence of IBD}
\subsection{Notation}
We use the same notations as \citet{Zhu2017} (see Table~\ref{tab:notation}). Our data, $D$, are the allele read counts of sample $j$ at a given site $i$, denoted as $r_{j,i}$ and $a_{j,i}$ for reference (REF) and alternative (ALT) alleles respectively. These have assigned values of $0$ and $1$, respectively. Here we consider only bi-allelic loci, though future extensions to the model to include multi-allelic sites could be accommodated. The empirical allele frequencies within a sample (WSAF) $p_{j,i}$ and at population level (PLAF) $f_i$ are calculated by $ \frac{a_{j,i}}{a_{j,i} + r_{j,i}}$ and $ \frac{\sum_j a_{j,i}}{\sum_j a_{j,i} + \sum_j r_{j,i}}$ respectively. Since all data in this section refers to the same sample, we drop the subscript $j$ from now on.
\subsection{Modelling mixed infections with IBD} \label{sect:prior}
We describe the mixed infection problem by considering the number of strains, $K$, the relative abundance of each strain, $\mathbf{w}$, and the allelic state of the $k$th strain at the $i$th locus, $\mathbf{h}_{k,i}$. In addition to \citet{Zhu2017}, we also infer the identity-by-descent (IBD) state $\mathcal{S}_{i}$, which describes the strain relationship with each other at site $i$. For example, for three strains, the IBD-state could be one of the five cases: (1) all strains are not IBD; (2) strains 1 and 2 are IBD; (3) strains 1 and 3 are IBD; (4) strains 2 and 3 are IBD; (5) all strains are IBD (see Table~\ref{tab:encode}). To simplify the problem, we assume independence between markers, and drop the subscript $i$ from now on. As previously, we use a Bayesian approach and define the posterior probabilities of $K$, $\mathbf{w}$, $\mathbf{h}$ and $\mathcal{S}$, as
\begin{equation}
P(K, \mathbf{w}, \mathbf{h}, \mathcal{S}| e, D) \propto L(K, \mathbf{w}, \mathbf{h}, \mathcal{S} | e, D) \times P(K, \mathbf{w}, \mathbf{h}, \mathcal{S}), \label{eqn:post}
\end{equation}
\noindent where $e$ is the read error rate. Moreover, we can decompose the joint prior as
\begin{equation}
P(K, \mathbf{w}, \mathbf{h}, \mathcal{S}) = P(K) \times P(\mathbf{w}|K) \times P(\mathcal{S}|K) \times P(\mathbf{h}|\mathcal{S}).
\end{equation}
\noindent The number of strains, $K$, and their proportions $\mathbf{w}$, are as in \citet{Zhu2017}: $K$ is fixed at a user-defined value (here, $K=4$) and strains below a fixed proportion threshold (here, $0.01$) are removed. To model IBD configurations and resulting haplotypes we introduce a parameter $\theta$, which is the probability of outbreeding (i.e. non-IBD) for a mixed infection with two strains. Specifically, for $K$ strains, the prior probability on configuration $\mathcal{S}$ is a function of the number of distinct strains in the configuration, $C_{\mathcal{S}}$, and the number of distinct configurations with the same number of distinct strains, $M_{\mathcal{C}}$:
\begin{equation}
P(\mathcal{S}|K) = \genfrac{(}{)}{0pt}{}{K-1}{C_{\mathcal{S}}-1} \frac{\theta^{C_{\mathcal{S}}-1}(1-\theta)^{K-C_{\mathcal{S}}+1}}{M_{\mathcal{C}}}.
\end{equation}
\noindent That is, the number of distinct strains is drawn from a binomial distribution with parameters $K$ and $\theta$ and the IBD configuration is uniformly drawn from among those with the same number of distinct strains. Consequently, the expected number of distinct haplotypes (i.e. non-IBD) in an infection with $K$ strains is $1+\theta (K-1)$. The value of $\theta$ estimated within the Markov chain Monte Carlo (see details below).
%Note that in implementation, the number of strains is typically fixed at some maximum value, $K_{max}$, but only those strains achieving some threshold proportion, typically 1\%, are actually reported.
Suppose that the probability of recombination event between two adjacent sites is $p_{rec}$ (see Implementation details).
To model the transitions between IBD states we make the approximation that the IBD state at the $i+1$th site is the same as at the $i$th site, with probability $1-p_{rec}$ and, if there is a recombination event, the IBD state is drawn from the stationary distribution described above.
To model allelic states (or genotype), conditional on IBD state, we assume that allelic states for each IBD group are independent Bernoulli draws given the population allele frequency (PLAF). For example, if $K=3$ and only strains 1 and 2 are IBD (state 0,0,1), the genotype at site $i$, $\mathbf{h_i}$, could be $\{0,0,0\}$; $\{0,0,1\}$; $\{1,1,0\}$ or $\{1,1,1\}$, with probabilities $(1-f)^2$, $(1-f)f$, $f(1-f)$ and $f^2$ respectively. The joint prior on IBD and allelic states is referred to as $\psi(\mathcal{S},\mathbf{h})$. Note that we ignore association (known as linkage disequilibrium or LD) between nearby alleles, though note that in implementation we run \texttt{DEploidIBD} twice; first as described here to estimate strain number and proportions, allowing for IBD, then subsequently with a reference panel, as for \texttt{DEploid}, but with the strain number and proportions fixed. This latter step does model LD and consequently results in better estimates of strain haplotypes.
Details of the emission model are the same as described in \citet{Zhu2017}. Briefly, the reference and alternative allele read counts at each site are modelled as being drawn from a beta-binomial distribution given the expected WSAF, $q = \mathbf{w}^T \mathbf{h}$. To incorporate sequencing error, we modify the expected WSAF such that allele are misread with probability $e$. Thus, the adjusted expected WSAF becomes
\begin{equation} \label{eqn:adj_q}
\pi_i = q_i + (1 - q_i)e - q_ie = q_i + (1 - 2q_i)e.
\end{equation}
\noindent As previously, we model over-dispersion in read counts relative to the Binomial using a Beta-binomial distribution.
\subsection{Parameter estimation using Markov chain Monte Carlo}
To infer the relatedness between strains within a mixed infection and their relative proportions we use a Markov chain Monte Carlo (MCMC) approach. We learn the relative abundance of each strain by exploiting signatures of within-sample allele frequency imbalance, using a Metropolis-Hastings algorithm, which samples proportions, $\mathbf{w}$, given IBD-configurations, $\mathcal{S}$. We then use a Gibbs sampler to update $\mathcal{S}$ and $\mathbf{h}$ for a given $\mathbf{w}$ by first sampling the IBD state from the posterior, and then sampling the allelic configuration (genotype) from the selected IBD state at each site to update haplotypes. Note that the hidden Markov model structure for IBD states along the chromosome leads to efficient algorithms for calculating quantities of importance. Notably, by summing over allelic configurations that are consistent with a given IBD configuration, we can - for a given set of strain proportions - calculate the likelihood integrated over all possible IBD configurations, which leads to efficient sampling.
\subsubsection{Metropolis-Hastings update for proportions}\label{sec:updateP}
We update strain proportions, $\mathbf{w}$, through modelling underlying log titres, $\mathbf{x}$, where $w_i \propto exp(x_i)$. As previously, log titres are assumed to be drawn from a $N(-log(K), \sigma^2)$ distribution under the prior. To update, we choose $i$ uniformly from $K$ and propose new $x_i'$s from $x_i' = x_i + \delta x$, where $\delta x \sim N(0, \sigma^2/s)$, and $s$ is a scaling factor. The new proposed proportion is therefore $\frac{\exp(x_i')}{\sum_{k=1}^K \exp(x_k')}$. Since the proposal distribution is symmetrical, the Hastings ratio is 1. The proposed update is accepted with probability
$$\min\left(1, \frac{P(\mathbf{w}'|K)}{P(\mathbf{w}|K)} \frac{L(\mathbf{w}', \mathbf{h} | e, D)}{L(\mathbf{w}, \mathbf{h}, | e, D)}\right).$$
\noindent Note that, conditional on the current estimate of strain haplotypes, $\mathbf{h}$, the likelihood is not a function of the specific IBD configuration.
\subsubsection{Gibbs sampling update for haplotype and IBD-configuration update}
As described above, in {\tt DEploidIBD}, allelic states at different sites within a haplotype are independent. However, IBD states are connected through a Markov process. We therefore update the IBD configuration and strain haplotypes in a two stage process. First, we use the Forward algorithm to first compute the integrated likelihood - that is the probability of observing the read-level data integrating over all possible IBD configurations and allelic states. Defining $F_{i,\mathcal{S},\mathbf{h}}$ to be the integrated likelihood for the set of paths ending in IBD configuration $\mathcal{S}$ at site $i$ and with allelic configuration $\mathbf{h}$, it follows that
\begin{equation}
F_{i,\mathcal{S},\mathbf{h}} = P(D_{i}|e,\mathbf{h}) \big[(1-p_{rec}) F_{i-1,\mathcal{S},\mathbf{h}} + \psi(\mathcal{S},\mathbf{h}) p_{rec} \sum_{m,n}F_{i-1,m,n}\big],
\end{equation}
\noindent where $\psi(\mathcal{S},\mathbf{h})$ is the prior on the combination of IBD configuration $\mathcal{S}$ and allelic configuration $\mathbf{h}$ as defined in Section \ref{sect:prior}. Note that the summation term is identical for all IBD / allelic configurations.
By storing the output of the Forward algorithm we can then sample from the posterior distribution of the IBD- and allelic-configurations (given current proportions). That is, we sample $\mathcal{S}_l,\mathbf{h}_l \propto F_{l,\mathcal{S}, \mathbf{h}}$, and then previous steps proportion to $F_{i,\mathcal{S}, \mathbf{h}}$ times the probability to transition to the sampled state at position $i+1$.
\vspace{.2cm}
\noindent{\bf Caveat: identifiability with balanced mixing}\\
Using a reference panel free approach means that it can be difficult or impossible to deconvolute samples containing strains with equal proportions. For example, it is impossible to distinguish between $\{\frac{1}{3},\frac{1}{3},\frac{1}{3}\}$ and $\{\frac{1}{3},\frac{2}{3}\}$, or
$\{\frac{1}{4},\frac{1}{4}, \frac{1}{4}, \frac{1}{4}\}$ and $\{\frac{1}{4},\frac{1}{4}, \frac{1}{2}\}$. Because of the prior we use, the model will prefer to merge haplotypes when possible, which can lead to underestimation of the number of strains. We advise users to apply {\tt DEploid} with multiple runs with and without the `-ibd' flag and see if such problem occurs.
\subsection{Gibbs sampling update for IBD parameter}
Given the sampled IBD configuration, the IBD parameter $\theta$ can be updated using Gibbs sampling. We first identify the path of the $D$ distinct IBD configurations along the chromosome and, for each, identify the number of distinct IBD groups, $\mathcal{C}_d$. For example, if the IBD state is 0,1,2,2 there are three distinct IBD groups (0,1,2; $\mathcal{C}_d = 3$), while if the IBD state is 0,0,1,1 there are two IBD groups (0,1; $\mathcal{C}_d = 2$). Under the prior, the number of distinct strains (minus 1) is drawn from a binomial distribution with parameters $K-1$ and $\theta$. With a uniform prior on $\theta$, a new value, $\theta'$ is therefore drawn from:
\begin{equation*}
\theta' \sim Beta \big(\mathcal{C}_D - D + 1, D K-\mathcal{C}_D + 1\big),
\end{equation*}
\noindent where $\mathcal{C}_D = \sum_{d=1}^D \mathcal{C}_d$.
\subsection{Implementation details}
Below we detail a number of implementation details.
\begin{itemize}
\item {\bf Approximating the likelihood surface}
At any site, the likelihood for the WSAF, $q$, induced by the allelic configuration and strain proportions derives from the beta-binomial model as implemented in \texttt{DEploid}. That is, the likelihood of $q_i$ is
\begin{equation}
L(q_{i}| e, D) \propto \frac{B(a_i + \pi c, r_i + (1-\pi) c)}{B(\pi c, (1-\pi) c)}, \label{eqn:llk}
\end{equation}
\noindent where $\pi_i$ is the adjusted WSAF $\pi = q(1-e)+(1-q)e$ and $c^{-1}$ determines the magnitude of over-dispersion relative to the binomial, with $c \to \infty$ recovering the binomial. Here, we use $c=100$.
For computational efficiency, rather than recomputing for every value of $q$, we first approximate the likelihood function for each site through a Beta distribution, matching the first two moments of the posterior on $q$ implied by Equation \ref{eqn:llk} within a uniform prior on $q$. The estimated parameters of the Beta distribution are then used to approximate the likelihood surface in all subsequent calculations.
\item {\bf MCMC parameters for deconvolution}
\begin{itemize}
\item {\bf Number of strains}. As described above, we aim to infer more strains than are actually present, starting the MCMC chain with a fixed $K$ (default of 5). In our experience, we find poor performance for $K>4$, hence use the flag {\tt -k 4} to specify the number of strains as 4. At the point of reporting, we keep strains with a proportion above a fixed threshold, typically $0.01$.
\item {\bf Parameters}. We typically set the log-titre variance $\sigma^2 = 5$, which can be adjusted when working with extremely unbalanced samples (see \citet{Zhu2017} Supplementary Material). We set the read error rate as $0.01$ and the rate of mis-copying as $0.01$.
\item {\bf Recombination rate and scaling}. We assume a uniform recombination map, where the genetic distance between loci $i$ and $i+1$ is computed by $\gamma_i = D_i / d_m$ where $D_i$ denotes the physical distance between loci $i$ and $i+1$ in nucleotides and $d_m$ denotes the average recombination rate in Morgans bp$^{-1}$. We use the recombination rate for {\it P. falciparum} of 13,500 base pairs per centiMorgan as reported by \citet{Miles2016}. The recombination rate is scaled by a factor $G$, which reflects the effective population size, rate of inbreeding, and relatedness of the reference panel. In practice, we deconvolute over 1 million markers in field samples. We tuned the parameter $G$ using {\it in vitro} lab mixtures, finding that a value of $G=20$ ensures small recombination probabilities between any two markers, with a mean of 0.015. A large value of $G$ relaxes the reference panel constraint, becoming an LD free model when $G$ is infinity. The scaled genetic distance $G \gamma$ is used to compute the transition probability of switching from copying reference haplotype $a$ to reference haplotype $b$. Given LD varies enormously between {\it P. falciparum} populations, we will investigate how to tune this parameter for future improvement. In the current release, our program allows users to apply the flag {\tt -G} to specify a specific value. We note that IBD deconvolution from error-prone data can benefit from even smaller values of $G$, such as $0.1$.
\item {\bf Reporting}. We aim to provide users with a single point estimate of the haplotypes, their proportions and IBD configurations, although the full chain is also available for analysis. To achieve this we report values at the last iteration - i.e. we report a single sample from the posterior. However, to measure robustness, we typically repeat the deconvolution with multiple random starting points. We use a majority vote rule on the inferred number of strains; we then select the chain with the lowest average deviance (after removing the burn-in) as our estimate. The deviance measures the difference in log likelihood between the fitted and saturated models, the latter being inferred by setting the WSAF to that of the observed values. These parameters can be modified by users to achieve a preferred balance between computational speed and confidence. By default, we set the MCMC sampling rate as 5, with the first 50\% of samples removed as burn in and 800 samples used for estimation.
\item {\bf Reference panel construction}. To infer clonal samples for the reference panel we use the Pf3k project data, running the algorithm without LD on all samples and identifying those with a dominant haplotype (proportion $>$ 0.99) as clonal. These clonal samples are grouped by region of sampling to form location-specific reference panels. In addition, we have included a number of reference strains, described in more detail below.
\end{itemize}
\item {\bf Summarising pairwise IBD}
At the end of a run, we obtain the maximum likelihood estimate of IBD configurations along the chromosome using the Viterbi algorithm. Patterns of IBD are then obtained for all pairs of strains and summarised through the mean fraction IBD and the N50 IBD tract, obtained by identifying all blocks of IBD, sorting them in decreasing size and finding the size of the block such that 50\% of all sites that are in IBD lie in blocks of at least this size. A larger N50 statistic indicates more recent common ancestry.
We note that it is also possible to obtain estimates of pairwise IBD from the full posterior distribution of pairwise IBD (using standard hidden Markov model practices). Typically these give very similar answers to the Viterbi solution, though are typically slightly larger due to the identification of low certainty regions. The posterior mean IBD between pairs is also provided to users as output.
\end{itemize}
\subsection{Commands}
Each isolate deconvolution is repeated 15 times, each time initialized with a random seed. For each run we obtain an estimate of the number of strains and we take the modal value to be the estimate. For reporting, we use the first run that estimated the consensus number of strains. The text below gives an example of an input script for deconvolution. Full details are available in the documentation at the Github page: \url{https://github.com/mcveanlab/DEploid/}.
\linespread{1}
\begin{lstlisting}
ref=PD0577-C_ref.txt
alt=PD0577-C_alt.txt
plaf=asiaGroup1_PLAF.txt
panel=asiaGroup1PanelMostDiverse10.csv
exludeAt=asiaGroup1_and_pf3k_bad_snp_in_at_least_50_samples.txt
prefix=PD0577-C_IBD
time dEploid -ref ${ref} -alt ${alt} -plaf ${plaf} -panel ${panel} \
-exclude ${exludeAt} -o ${prefix} -nSample 250 -rate 8 -burn 0.67 -ibd -k 4
interpretDEploid.r -ref ${ref} -alt ${alt} -plaf ${plaf} -o ${prefix} \
-dEprefix ${prefix} -exclude ${exludeAt}
\end{lstlisting}
\subsection{Error analysis}
For haplotype quality analysis, we compared the inferred haplotype ({\tt DEploid} output) with the true haplotype. In addition to mismatches between the testing haplotype and the truth, the deconvolution process also introduces switch errors, where the haplotypes are correct, but have undergone {\it in silico} recombination events. In addition, when one strain has low frequency, there might be insufficient data to make accurate inference, resulting in missing segments of the true haplotypes. We refer to this as dropout error. Dropout error can also be caused by the sequencing process, when parts of the genome are not well sequenced (low read count).
When comparing inferred haplotypes to true haplotypes we use a dynamic programming approach to find a description of the differences, optimising a cost function in which switch errors are twice as costly as mismatches and allele dropouts. Code for performing the optimal description, {\tt errorAnalysis.r}, is available at the Github page: \url{https://github.com/DEploid-dev/DEploid-Utilities/}. An example analysis for three strains is shown in Figure~\ref{fig:error}.
\end{appendixbox}
\begin{table}[htb]
\centering
\begin{tabular}{c|l}\hline
$i$ & Marker index\\
$j$ & Sample index \\
$r$ & Read count for reference allele \\
$a$ & Read count for alternative allele \\
$f$ & Population level allele frequency (PLAF) \\
$K$ & Number of distinct strains within sample \\
$l$ & Number of sites\\
$\mathbf{w}$ & Proportions of strains \\
$\mathbf{x}$ & Log titre of strains \\
$\mathbf{h}_{i}$ & Allelic states of $K$ parasite strains at site $i$ \\
$h_{k,i}$ & Allelic state of parasite strain $k$ at site $i$\\
$p$ & Observed within sample allele frequency (WSAF) \\
$q$ & Unadjusted expected WSAF \\
$\pi$ & Adjusted expected WSAF \\
%$\Xi$ & Reference panel\\
%$\xi_{k,i}$ & Allelic state of reference panel strain $k$ at site $i$\\
%$G$ & Scaling factor used for genetic map\\
$e$ & Probability of read error\\
$\mathcal{S}_{i}$ & IBD configuration at site $i$ \\
%$\mathcal{G}$ & Allelic configuration (genotype) at site $i$ \\
$\theta$ & Probability of non-IBD in a mixture of two strains \\
\hline
\end{tabular}
\vspace{.2cm}
\caption{Notation used in this article.}\label{tab:notation}
\end{table}
\begin{table}
%\centering
\begin{tabular}{c|ccc}
Index & \multicolumn{3}{c}{IBD state} \\
& K = 2& K = 3 & K = 4 \\ \hline
0 &0-1 & 0-1-2 & 0-1-2-3\\
1 &0-0 & 0-0-1 & 0-0-1-2 \\
2 & & 0-1-0 &0-1-0-2 \\
3 & & 0-1-1 &0-1-2-0 \\
4 & & 0-0-0 &0-1-1-2 \\
5 & & &0-1-2-1 \\
6 & & &0-1-2-2 \\
7 & & &0-0-1-1 \\
8 & & &0-1-0-1 \\
9 & & &0-1-1-0 \\
10 & & &0-0-0-1 \\
11 & & &0-0-1-0 \\
12 & & &0-1-0-0 \\
13 & & &0-1-1-1 \\
14 & & &0-0-0-0 \\
\end{tabular}
\caption{\textcolor{black}{IBD configurations for two, three and four strains, ordered top to bottom by the number of IBD pairs. The (zero-indexed) notation indicates the type assigned to each haplotype, thus 0-1 indicates non-IBD for two strains, while 0-1-2-2 indicates four strains in which the third and fourth are IBD.} }\label{tab:encode}
\end{table}
\begin{figure}[htp]
%\centering
\includegraphics[width=0.7\textwidth]{DEploid_IBD_haps_compare.pdf}
\caption{Comparison of true and inferred haplotypes for Chromosome 14 (2,369 SNPs) in the lab strain mixture sample PG0396-C after running \texttt{DEploidIBD} to infer strain number and proportions (top) and after subsequent refinement of haplotypes by running \texttt{DEploid} with Reference Panel V (bottom). The yellow, cyan and white backgrounds identify the haplotype segments from strains 7G8, HB3 and Dd2 respectively. Numbers in the titles indicate the inferred switch, mismatch and dropout errors identified by the dynamic programming approach, with the cost of switch errors being twice that of other errors.
}
\label{fig:error}
\end{figure}
\begin{appendixbox}
\section{Generating {\it in silico} mixtures for deconvolution benchmark}
\subsection{{\it In silico} mixtures of lab strains}
For {\it in silico} lab mixtures, we used the lab strain GB4 to provide a whole-genome read depth profile. We drew Bernoulli variables to simulate alternative allele counts, with the probability of success as the adjusted within sample allele frequency at each position (Equation~\ref{eqn:adj_q}). The expected WSAF is the dot product of the genotype of the lab strains: 3D7, Dd2, HB3, 7G8 and the proportions. For example, if the genotype is 0, 0, 1, 1 at an arbitrary position, and the mixture proportion of 0.1, 0.2, 0.4 and 0.3 will lead to an expected WSAF of 0.7. We then further adjust the WSAF with an error term (0.01) to account for all technical error/noise: $0.7 \times(1-0.01) + (1-0.7) \times.01 = 0.696$.
\vspace{.5cm}
To test the accuracy of \texttt{DEploidIBD} in a more realistic setting, we created {\it in silico} mixtures of 2, 3 and 4 strains given different transmission scenarios from mosquito bites. Let $K$ denote the number strains within each sample, and $b$ denote the number of independent mosquito bites. In such a scenario, a mixed infection containing $K$ strains can be treated as a result of $b$-bite events, where $K \in \{2,3,4\}$ and $1~{\leq}~{b}~{\leq}~K$. For example, a $K=2$ mixed infection can result from co-transmission, where two strains are passed in a single bite ($b=1$); or by superinfection, where each strain is delivered by a unique bite ($b=2$). A mixed infection containing 3 strains is more complex: besides co-infection (3 strains in a single bite, $b=1$) and superinfection (3 strains in 3 bites, $b=3$), we can also have a super-infection scenario made up from a co-infection plus a clonal-infection ($b=2$). The complete simulation procedure (code) is available at \url{https://github.com/DEploid-dev/DEploid-Data-Benchmark-in_silico_field}.
\subsection{{\it In silico} mixtures of two field strains}
To simulate a mixture of two strains, we randomly selected two strains from 189 clonal samples of African origin (proportions ranging from 10/90\% to 50/50\%) using Chromosome 14 data. A further 20 randomly chosen samples were used as the reference panel. In order to compare the accuracy of the two methods at different levels of relatedness, we set 0\%, 25\%, 50\% and 75\% of the second haplotype the same as the first haplotype to mimic scenarios of unrelated, low, medium and high relatedness respectively. This operation sets a lower limit to the relatedness between two strains, as background relatedness may also exist. We used empirical read depths and drew read counts for the two alleles from binomial proportions (the same approach for generating {\it in silico} lab mixtures). We excluded sites for analysis at zero alternative allele counts in both targeted samples and reference panel, kept and analysed around 7,757 polymorphic sites (standard deviation 178) for each {\it in silico} samples.
We repeated the \emph{in silico} experiment with mixtures of two strains from 204 clonal Asian samples, also with mixing proportions of ranging from 10/90\% to 50/50\%, using about 3,041 sites (s.d. 227) from Chromosome 14.
\subsection{{\it In silico} mixtures of three field strains}
We further extended benchmarking to \emph{in silico} mixtures of three strains in African populations with mixing proportions of 10/10/80\%, 10/25/65\%, 15/25/60\%, 10/40/50\%, 15/30/55\%, 20/30/50\%, 33/33/34\%. Two generate $b=1$ infections with three strains, we randomly selected two strains, namely parent A and parent B, from 189 clonal samples of Africa origin, and set the first 33\% of the first haplotype and the last 66\% of the second haplotype the same as parent A; the rest of the first and second haplotypes and the third haplotype the same as parent B, to mimic the scenario of a 1/3 pairwise relatedness within sample.
In addition to the basic co-infection and super-infection scenarios of 3 strains, we also consider more complex events when co-infection presents within a super-infection. This data simulation is similar to simulating 2 strains of 50\% relatedness, with one additional unrelated haplotype. Therefore, the overall pairwise relatedness is $1/2$ distributed over three possible pairs, which leads to $1/6$.
\subsection{{\it In silico} mixtures of four field strains}
We tested {\it In silico} deconvolution experiments on mixtures of 4 strains. From previous experiments, we have learned that strain compositions with even proportions are difficult to deconvolute. In this set of simulations, we experimented with various unbalanced and balanced proportions including 11/22/30/37\%, 25/25/25/25\%, 20/20/20/40\% and 30/30/30/10\%. For some cases, the data generation procedures can easily be modified from previous experiment: a $b=4$ event of for four strains is equivalent to a $3$-bite event of 3 strains with one extra random haplotype; a $b=3$ event of four strains is equivalent to a $2$-bite event of 3 strains with one extra random haplotype. For $2$-bite event of 4 strains, there are two possibilities: (i) both bites pass on 2 strains, which is essentially repeating $b=1$ event of 2 strains twice or (ii) three strains in one bite and one in another, which is essentially a $1$-bite event of 3 strains with one extra random haplotype.
\subsection{Haplotype quality for {\it in silico} mixtures}
We assessed the quality of all the haplotypes deconvolved with \texttt{DEploid} and \texttt{DEploidIBD} for the \textit{in silico} simulated mixtures (Figure \ref{fig:hap-quality-sim-classic}, Figure \ref{fig:hap-quality-sim-ibd}). Our results are consistent with the performance we observed in field samples. Complex mixtures with
balanced proportions or marginal strains (i.e. with a very low proportion) tend to produce chimeric haplotypes. Nonetheless, further research is needed to explore factors that result in deconvolution failure. \texttt{DEpoidIBD} performs slightly worse than the previous version of the software; we ascribe this to the fact that the simulations lack any complex IBD structure. See \ref{section:hap-quality} for more details about the procedure.
\end{appendixbox}
\begin{figure}[ht]
\centering
\includegraphics[width=\textwidth]{quality-classic.pdf}
\caption{Distribution of quality scores haplotypes deconvolved from \textit{in silico} mixtures using \texttt{DEploid}. Each row represents a different population (Africa and Asia). The left panels represent the overall distribution of z-scores whereas the right panels stratify results according to the entropy of mixture proportions (y-axis) and number of strains (color).}
\label{fig:hap-quality-sim-classic}
\end{figure}
\begin{figure}[ht]
\centering
\includegraphics[width=\textwidth]{quality-ibd.pdf}
\caption{Distribution of quality scores haplotypes deconvolved from \textit{in silico} mixtures using \texttt{DEploidIBD}. Each row represents a different population (Africa and Asia). The left panels represent the overall distribution of z-scores whereas the right panels stratify results according to the entropy of mixture proportions (y-axis) and number of strains (color).}
\label{fig:hap-quality-sim-ibd}
\end{figure}
\begin{appendixbox}
\section{Pf3k field sample analysis}
\subsection{Sample choice}
We used 2,640 samples from the Pf3k project (see \url{https://www.malariagen.net/projects/pf3k}). We excluded Nigerian samples from downstream analysis as there were only 5 samples from this country. We discarded samples containing mixed malaria species and those where sequencing coverage depth was below 30X or in which less than 30\% of sites were callable. Finally, we exclude all lab strains (including reference strains, artificial mixtures, and crosses samples) and duplicated samples. In total, we retained 2,344 field samples from 13 countries.
\subsection{Data filtering}
We ran {\tt DEploid-IBD} on high quality biallelic SNPs (both coding and non-coding, tagged with PASS at the QUAL column in the VCF file) from Pf3k~\citep{pf3k}. Before the additional filtering step described below, this set contained 1,057,830 SNPs.
\linespread{1}
\begin{lstlisting}
bcftools view \
--include 'FILTER="PASS"' \
--min-alleles 2 \
--max-alleles 2 \
--types snps \
--output-file SNP_INDEL_Pf3D7_01_v3.high_quality_biallelic_snps.vcf.gz \
--output-type z \
SNP_INDEL_Pf3D7_01_v3.combined.filtered.vcf.gz
\end{lstlisting}
\subsubsection{High leverage data points}
We found that markers with high coverage for both alleles could mislead our model, inducing it to fit additional strains. We used a threshold of $\geq$ 99.5\% coverage (default) to identify markers with extremely high allele counts. We further expanded this list of potential problematic markers by considering their nearest 10 neighbours on both flanks, and excluding those that were tagged more than once (see Figure~\ref{fig:fitering}). These poorly-genotyped variants are likely to be errors of mapping and genotype calling.
%\subsubsection{High nucleotide diversity regions}
%\noindent
\donenum{Sup. Mat. 5, Why are we calculating diversity}
To track down the causes of high leverage points, we assessed nucleotide diversity in the {\it P. falciparum} genome. We used clonal haplotypes to compute nucleotide diversity by running a sliding window along the genome. At each SNP, we use $n_{0}$ and $n_{1}$ to denote counts for reference and alternative alleles, respectively. Let $n = (n_{0} + n_{1})$ be the number of haplotypes in the population with a non-missing call. We computed the mean number of pairwise differences for this SNP as follows. First, we computed the total number of pairs as $n_{pairs} = n * (n - 1) / 2$. Then, we computed the number of pairs that were the same, $n_{same} = (n_{0} * (n_{0} - 1) / 2) + (n_{1} * (n_{1} - 1) / 2)$, and the number of pairs that were different, $n_{d}$ = $n_{pairs} - n_{same}$. Finally, we obtained the mean number of pairwise differences as $mpd = n_{d} / n_{pairs}$. To estimate nucleotide diversity $\pi$, we computed the sum of $mpd$ in a window of 20kbp centred on each SNP, and divided by the number of accessible bases, which produces the mean number of pairwise differences per base.
Regions containing high leverage points tended to be at the ends of chromosomes or within regions of high nucleotide diversity, where read mapping was problematic (see Figure~\ref{fig:nd}). We identified potential outliers in all samples, and filtered out common outliers in at least 50 samples -- 48,443 in total.
\subsection{Analysis preparation}
To improve the accuracy and efficiency of the deconvolution process, we first split the data into groups based on genetic similarity. We computed genetic distance between two samples as follows:
\begin{equation}
d(x, y) = \sum_{l}^{L}\textrm{WSAF}_{x,l} * (1-\textrm{WSAF}_{y,l}) + \textrm{WSAF}_{x,l} * (1-\textrm{WSAF}_{y,l})
\end{equation}
where $l$ represents an arbitrary locus, $L$ denotes the total number of loci, and $\textrm{WSAF}_{s,l}$ indicates the non-reference within-sample allele frequency for sample $s$ at locus $l$. $\textrm{WSAF}_{s,l}$ is then given by $\textrm{WSAF}_{s,l} = \frac{a_{s,l}}{r_{s,l}+a_{s,l}}$ where $a_{s,l}$ is the number of read counts supporting the alternative allele in sample $s$ at locus $l$, and $r_{s,l}$ is the number of read counts supporting the reference allele in sample $s$ at locus $l$.
We found that samples from the same geographical region differentiated into clear groups. We used this initial grouping as the basis for defining the reference panels that assisted the deconvolution. The geographical groups arising from this analysis are listed below. In order to reduce computational time, we only used polymorphic sites at each population group:
\begin{enumerate}
\item Malawi, Congo, with 349,242 sites.
\item Ghana (Navrongo), with 508,606 sites.
\item Nigeria, Senegal, Mali, with 210,819 sites.
\item The Gambia, Guinea, Ghana (Kintampo), with 250,827 sites.
\item Cambodia (Pursat), Cambodia (Pailin), Thailand (Sisakhet), with 44,317 sites.
\item Vietnam, Laos, Cambodia (Ratanakiri), Cambodia (Preah Vihear), with 88,410 sites.
\item Bangladesh, Myanmar, Thailand (Mae Sot), Thailand (Ranong), with 84,868 sites.
\end{enumerate}
\subsection{Haplotype quality assessment}
\label{section:hap-quality}
In this work, we also assessed the quality of the haplotypes inferred by \texttt{DEploidIBD}. Our goal was to establish to what degree our inferred haplotypes were statistically indistinguishable, given a suite of population genetics statistics, from the subset of clonal haplotypes that had the same geographical origin. Our assumption was that haplotypes found in mixed infections would have similar characteristics than those present in clonal samples. In our assessment, we found that the distribution of statistics for groups of deconvoluted haplotypes had extreme outliers and presented a higher variance when compared with the clonal population originating on the same region. We noticed that the painting process implemented by DEploid struggles when faced with challenging mixtures. For instance, mixed infections in which the co-existing strains have the same relative proportion (e.g. $k=4$ with each strain having a proportion of 25\%), or samples in which proportions are very unbalanced (e.g. k=2 with the marginal strain at 2\%). This often results in an excess of alternative calls being assigned to one of the strains, which in turn provokes a deficit of diversity on the remaining haplotypes, that cannot be explained in terms of their genetic relationship to the reference genome used for mapping and assembly (3D7). We defined our quality metric as a $z$-score that approximates how much a deconvoluted haplotype deviates from the mean genetic diversity of the clonal population present in the same geographical area.
For each population, we computed the distribution of alternative calls observed within the subset of clonal samples ($k=1$). Using this distribution as reference, we computed a $z$-score for each haplotype in the whole population following $$z_i = \frac{a_i - \bar{a}_r}{\sigma_r},$$ where $a_i$ denotes the number of alternative calls in the haplotype $i$, and $\bar{a}_r$ and $\sigma_r$ are, respectively, the mean and standard deviation of observed alternative calls in the clonal set of samples from the population of origin. We only considere as suitable haplotypes with a $z$-score in the range $(-3,3)$, thus discarding any strain that is three or more standard deviations away, in terms of alternative calls, from the mean observed for clonal samples. By using the set of clonal samples as the reference distribution, we approximated the number of alternative calls expected in a genome belonging to that population, which serves as a proxy for genetic diversity but is easier to compute. Supp. Figure \ref{fig:ghana-filtering} shows an example of this filtering process for the most problematic population in the dataset (Ghana). Supp. Table \ref{table:haps-discarded-by-country} lists the number of haplotypes discarded by population while Supp. Table \ref{table:haps-discarded-by-COI} describes the number of haplotypes discarded by COI level. Statistical deconvolution of haplotypes in mixed infections remains a challenging problem and requires further research. Nonetheless, our quality metric can guide other researchers in the process of discarding haplotypes that are clearly artefactual
\subsection{Combining clonal sample pairs for background IBD computation}
We combined randomly selected clonal sample pairs to create artificial mixed infections, as a way to generate a background IBD distributions for each country. We assumed these artificial mixed infections mimic infections generated from two independent mosquito bites. In this setting, strain proportions are determined by their median read depth, whereas sample coverage is obtained by accumulating the reference and alternative allele counts of two clonal samples. Similar to {\tt DEploidIBD} deconvolution, SNPs with very high coverage resulting from this process caused high leverage in the model. Additionally, the sample sequence depth and skewness were heterogeneous due to different sample preparation and sequencing protocols. We reduced the {\tt DEploidIBD} filtering threshold from 99.5\% to 80\%, and used low recombination probabilities to avoid false IBD breakpoint inference. We validated our method using lab crosses \citep{Miles2016}, and compared the IBD block detection using \citet{Li2003}'s painting with parental strains and the {\tt DEploidIBD} algorithm (Figure~\ref{fig:bgibd}).
\end{appendixbox}
\begin{figure}[ht]
\centering
\includegraphics[width=0.7\textwidth]{{PG0415-CaltVsRefAndWSAFvsPLAF}.png}
\caption{Identification of high leverage data points for filtering. (Top) Plot showing total allele counts across all markers for field isolate PG0415. We observe a small number of heterozygous sites with high coverage (shown as crosses on the bottom-left plot), which can potentially mislead our model to over-fit the data with additional strains (above the dotted line). We used a threshold of $\geq$ 99.5\% coverage to identify markers with high allele counts. Red crosses indicate markers that are filtered out. (Bottom-left) Scatter plot showing alternative against reference allele count. The marked black crosses refer to the outliers identified on the previous plot, which will cause the inference method to mistakenly identify the sample as being a mixed infection. (Bottom-middle) Histogram of allele frequency within sample. (Bottom-right) Allele frequency within sample (WSAF), compared against the population average (PLAF).}
\label{fig:fitering}
\end{figure}
\begin{figure}[ht]
\centering
\includegraphics[width=0.7\textwidth]{{nd_hist.pdf}}
\caption{Nucleotide diversity for a sliding window size of 20,000 base pairs. (Top) Histograms showing the heavy tail of ND beyond 0.0007. (Bottom) Figure showing ND along {\it P. falciparum} chromosome 1. Scattered Points mark chromosome positions of poorly genotyped SNPs which we exclude from the deconvolution process. These points are jitterred to ease visualization.
}
\label{fig:nd}
\end{figure}
\begin{figure}[ht]
\centering
\includegraphics[width=1\textwidth]{qualityGhana.pdf}
\caption{Diagnostic plots showing the distribution of haplotype quality ($z$-scores) for the Ghanian samples. Left. Scatterplot showing the relationship between haplotype $z$-score and strain proportion. The top axis shows the number of alternative calls below/above the mean of the subset of clonal samples that correspond to a given $z$-score. The vertical red line denotes a $z$-score of $0$ whereas the red-shaded area indicate the haplotypes we retain Point colors show the COI level of the sample. Right. Four views of the same plot in which the samples have been highlighted according to their COI level.}
\label{fig:ghana-filtering}
\end{figure}
\begin{table}[ht]
\centering
\begin{tabular}{r|r|r|r}
\textbf{Country} & \textbf{Discarded} & \textbf{Retained} &\textbf{Fraction Discarded} \\
\hline
Bangladesh & 25 & 69 & 0.27 \\
Cambodia & 108 & 697 & 0.13 \\
DR. of Congo & 62 & 155 & 0.29 \\
Ghana & 493 & 609 & 0.45 \\
Guinea & 79 & 88 & 0.47 \\
Laos & 28 & 110 & 0.20 \\
Malawi & 233 & 341 & 0.41 \\
Mali & 37 & 140 & 0.21 \\
Myanmar & 7 & 71 & 0.09 \\
Senegal & 2 & 167 & 0.01\\
Thailand & 28 & 169 & 0.14 \\
The Gambia & 22 & 73 & 0.23 \\
Vietnam & 23 & 113 & 0.17\\
\hline
\textbf{Total} & 1147 & 2802 & 0.29
\end{tabular}
\vspace{.2cm}
\caption{Number of haplotypes discarded and retained for each population in the Pf3k dataset.}
\label{table:haps-discarded-by-country}
\end{table}
\begin{table}[ht]
\centering
\begin{tabular}{r|r|r|r}
\textbf{COI} & \textbf{Retained} & \textbf{Discarded} & \textbf{Fraction Discarded} \\
\hline
1 & 1331 & 34 & 0.02 \\
2 & 669 & 291 & 0.30 \\
3 & 583 & 533 & 0.48 \\
4 & 219 & 289 & 0.57 \\
\hline
\textbf{Total} & 2802 & 1147 & \\
\hline
\textbf{Fraction} & 0.71 & 0.29 & \\
\end{tabular}
\vspace{.2cm}
\caption{Number of haplotypes retained and discarded stratified by COI level.}
\label{table:haps-discarded-by-COI}
\end{table}
\begin{figure}[ht]
\centering{}
\includegraphics[width = \textwidth]{app3fig4.pdf}
\caption{(A) Comparison of IBD block detection between {\tt DEploidIBD} (top) and ancestral state inference from \citet{Li2003} (bottom), using artificial mixtures of lab crosses PG0071-C and PG0058-C (last tract). (B) Scatter plot of IBD segment Nx values extracted by comparing clonal sample ancestry (using {\tt DEploidIBD}) on artificial mixtures.}\label{fig:bgibd}
\end{figure}
\begin{appendixbox}
\section{Expected levels of IBD in \textit{ P.falciparum } mixed infections}
The amount of IBD observed in a mixed infection is a function of the number of oocysts present in the biting mosquito. We will demonstrate this below.
First, let us briefly review the fundamentals of malaria meiosis. In our case, we imagine a mosquito bites a human host containing two distinct malaria strains. Call these strains $A$ and $B$. Some number of gametocytes of $A$ and $B$ are imbibed during the bite, differentiate into gametes, and undergo fertilization to produce zygotes (reviewed in \citet{Gosh2000}, \citet{Bennink2016}). Some fraction of these zygotes succeed in establishing themselves as oocysts on the mosquito midgut (\citet{Gosh2000}). Three products of fertilization are possible, and thus the oocysts can be either: $A+A$ or $B+B$ (inbred oocysts, $n_{ii}$), or $A+B$ (outbred oocysts, $n_{ij}$). The oocyst state of a mosquito can be characterized by ($n_{ij}$, $n_{ii}$). Which strain is maternal and paternal may vary from oocyst to oocyst, but this is of no consequence here.
A $k=2$ mixed infection is established when two distinct sporozoites, produced from the oocysts of this mosquito, infect a host. Each oocyst produces thousands of sporozoites (\citet{Beir1991}), of four types (discussed in \citet{McKenzie2001}), which pool in the mosquito salivary glands (\citet{Gosh2000}). Imagine drawing a $k=2$ mixed infection from a mosquito harbouring a single outbred oocyst ($n_{ij}$ = 1). In such a mosquito there are two copies of each of the two strains (two sets of sister chromatids; $A$, $A$, $B$, $B$). Thus, ignoring recombination for the present, there are two pairs with an IBD fraction of 1 and, if our original strains are unrelated, the remainder of the ${4 \choose 2}$ pairs will have an IBD of 0. Thus a single $n_{ij}$ oocyst has an expected IBD of $E[\rho]= 2/{4 \choose 2} = 1/3$. We draw pairs without replacement because if sporozoites of only one type seed the infection, it will be $k=1$. Importantly, neither recombination nor segregation change this result, as they only shuffle how the total identity is distributed between pairs, rather than create or destroy identity (identity is created by DNA replication and destroyed by mutation); the expectation is taken over all pairs and is thus unaffected.
Computing the expected IBD fraction for a mosquito possessing $n_{ij}$ outbred oocysts is an extension of the above. Again ignoring recombination, the expected IBD fraction $E[\rho|n_{ij}]$ is equal to the total number of pairs with an IBD of 1 (IBD pairs), over all possible pairs. In a mosquito with $n_{ij}$ oocysts, we have $2n_{ij}$ copies of each parental strain, thus we have ${2n_{ij} \choose 2}$ IBD pairs for each parental strain, thus $2 {2n_{ij} \choose 2}$ IBD pairs total. Dividing this by the total number of pairs amongst $n_{ij}$ oocysts we have
\begin{equation} \label{eq1}
E[\rho|n_{ij} > 0] = \frac{2{2n_{ij} \choose 2}}{{4n_{ij} \choose 2}} = \frac{2n_{ij} - 1}{4n_{ij} - 1}.
\end{equation}
The above yields $1/3$ for $n_{ij}=1$, approaching $1/2$ as $n_{ij}$ grows. This result has been validated with \texttt{pf-meiosis} in Figure~\ref{fig:validoocyst}.
Including $n_{ii}$ oocysts is somewhat involved, as some pairs (selected without replacement) may be identical (thus yielding $k=1$) or completely unrelated (yielding $k=2$, but effectively without having undergone meiosis or producing any detectable recombination breakpoints between parental strains). We are interested in the expected IBD produced as a result of meiosis between parental strains, and thus for the moment we exclude these pairs. In practice, this means the mosquito must have at least one outbred oocyst, and at least one of the infecting sporozoites must be from an outbred oocyst.
The derivation is as above: first ignoring recombination and segregation, then enumerating all IBD pairs (pairs with IBD fraction of 1) and dividing by the total number of pairs to compute the expectation. Note that the additional IBD pairs possible between an outbred and inbred oocyst are given by the term $8n_{ij}n_{ii}$, and that you can no longer use all possible pairs drawn without replacement as the denominator, but must exclude the pairs described above.
\begin{align}
E[\rho|n_{ij} > 0, n_{ii}] & = \frac{2{2n_{ij} \choose 2} + 8n_{ij}n_{ii}}{2{2n_{ij} \choose 2} + 16n_{ij}n_{ii} + 4n_{ij}^2} \nonumber\\
%& = \frac{{2n_{ij} \choose 2} + 4n_{ij}n_{ii}}{{2n_{ij} \choose 2} + 8n_{ij}n_{ii} + 2n_{ij}^2} \nonumber\\
%& = \frac{n_{ij}(2n_{ij} - 1) + 4n_{ij}n_{ii}}{n_{ij}(2n_{ij} - 1) + 8n_{ij}n_{ii} + 2n_{ij}^2} \nonumber\\
%& = \frac{2n_{ij} - 1 + 4n_{ii}}{2n_{ij} - 1 + 8n_{ii} + 2n_{ij}} \nonumber\\
& = \frac{2(n_{ij} + 2n_{ii}) - 1}{4(n_{ij} + 2n_{ii}) - 1} \label{eq2}
\end{align}
Which is of a similar form to above, but increases to $1/2$ quicker if more inbred oocysts are present. As before the equation is validated in Figure~\ref{fig:validinbred}A.
The expression for $E[\rho|n_{ij} > 0, n_{ii}]$ can also be derived by recognizing that there are three \textit{types} of pairs possible in a mosquito with a collection of $n_{ij}$ and $n_{ii}$ oocysts: (1) a pair can contain two strains from a single $n_{ij}$, ($n^{o=1}_{ij}$); (2) a pair can contain two strains from two different $n_{ij}$, ($n^{o=2}_{ij}$); or (3) a pair can contain one strain from an $n_{ij}$ oocyst and one from an $n_{ii}$ oocyst, $n^{o=2}_{ij, ii}$. Pair type (1) is unique to malaria and has an $E[\rho|n^{o=1}_{ij}]=1/3$, as shown above; pair type (2) are standard siblings with $E[\rho|n^{o=2}_{ij}]=1/2$; and pair type (3) represent a mother-daughter relationship, also with $E[\rho|n^{o=2}_{ij, ii}]=1/2$. The full IBD fraction and IBD segment length distributions of these pairs were generated using \texttt{pf-meiosis} and can be seen in Figure~\ref{fig:validinbred}B. We enumerate the number of each pair type given $n_{ij}$ and $n_{ii}$, weighted by their expectation, to derive $E[\rho|n_{ij} > 0, n_{ii}]$:
\begin{align}
E[\rho|n_{ij} > 0, n_{ii}] & = \frac{n^{o=1}_{ij}E[f|n^{o=1}_{ij}] + n^{o=2}_{ij}E[f|n^{o=2}_{ij}]
+ n^{o=2}_{ij, ii}E[f|n^{o=2}_{ij, ii}]}{n^{o=1}_{ij} + n^{o=2}_{ij} + n^{o=2}_{ij, ii}} \nonumber\\
& = \frac{ n_{ij}{4\choose 2}1/3 + 16{n_{ij}\choose 2}1/2 + 16n_{ij}n_{ii}1/2}{
n_{ij}{4\choose 2} + 16{n_{ij}\choose 2} + 16n_{ij}n_{ii}} \nonumber\\
%& = \frac{2n_{ij} + 4n_{ij}(n_{ij} - 1) + 8n_{ij}n_{ii}}{6n_{ij} + 8n_{ij}(n_{ij} - 1) + 16n_{ij}n_{ii}}\nonumber\\
%& = \frac{1 + 2(n_{ij} - 1) + 4n_{ii}}{3 + 4(n_{ij} - 1) + 8n_{ii}} \nonumber\\
& = \frac{2(n_{ij} + 2n_{ii}) - 1}{4(n_{ij} + 2n_{ii}) - 1}
\end{align}
As above.
\end{appendixbox}
\begin{figure}[pt]
\centering{}
\includegraphics[width = .75\textwidth]{supp-Fig1.pdf}
\caption{Exploring the relationship between number of outbred oocysts ($n_{ij}$) and IBD. (A) Joint IBD fraction and IBD segment length distributions for $k=2$ mixed infections simulated from two unrelated strains and a fixed number of outbred oocysts $n_{ij}$, using \texttt{pf-meiosis}. Mean values for each distribution are indicated by same-color dashed lines. Each distribution is created from 1000 simulated mixed infections. (B) Validation of theoretical result given in text (S1.8). Line plot compares trend in expected IBD fraction with the number of outbred oocysts, $n_{ij}$, for infections simulated in panel A, and analytical expression S1.8.} \label{fig:validoocyst}
\end{figure}
\begin{figure}[ht]
\centering{}
\includegraphics[width = .85\textwidth]{supp-Fig2.pdf}
\caption{Exploring expected IBD allowing for outbred ($n_{ij}$) and inbred ($n_{ii}$) oocysts (A) Validation of expression for expected IBD fraction conditional on outbred $n_{ij}$ and inbred $n_{ii}$ oocysts (S1.9). Line plot compares trend in expected IBD fraction with varying number of outbred (x-axis, $n_{ij}$) and inbred (line color, $n_{ii}$) oocysts and the analytical expression S1.9 (grey dashed lines). (B) Using \texttt{pf-meiosis} to simulate $k=2$ mixed infections generated from (1) two strains from the same outbred oocyst from ($n^{o=1}_{ij}$, 'Within oocyst'); (2) two strains different outbred oocysts($n^{o=2}_{ij}$, 'Standard Siblings'); (3) one strain from an outbred and one strain from an inbred oocyst ($n^{o=2}_{ij, ii}$, 'Mother-daughter').} \label{fig:validinbred}
\end{figure}
%\nocite{Zhu2017}
%\nocite{Miles2016}
%\nocite{Li2003}
%\nocite{pf3k}
%\nocite{Gosh2000}
%\nocite{Bennink2016}
%\nocite{McKenzie2001}
%\nocite{Beir1991}
\bibliography{mixedIBD.bib}
\end{document}