Skip to content

Commit 0ef5375

Browse files
author
gsc74
committed
PHI v1.0 code-base
0 parents  commit 0ef5375

File tree

126 files changed

+10820
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

126 files changed

+10820
-0
lines changed

Installdeps

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
#!/usr/bin/env bash
#
# Installdeps - build and install PHI's third-party dependencies.
#
# Usage: ./Installdeps        (run from the PHI repository root)
#
# Creates the "PHI" conda environment, then clones/downloads and builds
# htslib, samtools, bcftools, zlib, sdsl-lite, libhandlegraph, gbwt and
# gbwtgraph inside ./temp_bin, unpacks Gurobi 11.0.2 and downloads vg v1.60.0.
# Results are installed under ./extra/{bin,lib,include,plugins}.
# ./temp_bin is removed on success and kept on failure for inspection.
#
# Environment:
#   JOBS   number of parallel make jobs (default: 4)

# Print an error message to stderr and abort.
die() {
  printf 'Installdeps: %s\n' "$*" >&2
  exit 1
}

# Copy every non-directory entry of directory $1 into directory $2.
# Plain `cp dir/*` returns non-zero when dir contains sub-directories, which
# would abort the script under `set -e`; directories are skipped here instead.
copy_files() {
  local src=$1 dest=$2 entry
  for entry in "${src}"/*; do
    if [[ -d "${entry}" ]]; then
      continue
    fi
    cp -- "${entry}" "${dest}/"
  done
}

# --- conda environment -------------------------------------------------------

# ~/.bashrc is sourced before strict mode is enabled because user rc files are
# not written with `set -e` in mind.
if [[ -f "${HOME}/.bashrc" ]]; then
  # shellcheck disable=SC1091
  source "${HOME}/.bashrc"
fi

command -v conda >/dev/null 2>&1 \
  || die "conda not found; install Miniforge first"

# Define the `conda` shell function explicitly: a non-interactive shell may
# return from ~/.bashrc before reaching the `conda init` section, in which
# case `conda activate` would not work.
conda_hook=$(conda shell.bash hook) || die "could not initialise conda for bash"
eval "${conda_hook}"

# Abort on the first failing command from here on. `-u` is intentionally not
# used: activation scripts may reference unset variables.
set -eo pipefail

conda create -n PHI -c conda-forge gxx_linux-64=13.2.0 make libstdcxx-ng cmake zlib bzip2 xz lz4 zstd -y
conda activate PHI || die "could not activate the PHI conda environment"

# --- directory layout --------------------------------------------------------

readonly jobs="${JOBS:-4}"
readonly root_dir="${PWD}"
readonly prefix="${root_dir}/extra"
readonly build_dir="${root_dir}/temp_bin"

# Start from a clean slate. Both paths are anchored to the invocation
# directory so that the tree removed is the same tree populated below.
rm -rf -- "${prefix:?}" "${build_dir:?}"
mkdir -p "${build_dir}" \
  "${prefix}/plugins" "${prefix}/lib" "${prefix}/bin" "${prefix}/include"
cd "${build_dir}"

# --- sources -----------------------------------------------------------------

repos=(
  https://github.com/jltsiren/gbwtgraph.git
  https://github.com/vgteam/sdsl-lite.git
  https://github.com/vgteam/libhandlegraph.git
  https://github.com/jltsiren/gbwt.git
  https://github.com/samtools/htslib.git
  https://github.com/samtools/samtools.git
  https://github.com/samtools/bcftools.git
)
for url in "${repos[@]}"; do
  git clone "${url}"
done

# Each build runs in a subshell so a `cd` never leaks into the next step.

# htslib
(
  cd htslib
  git submodule update --init --recursive
  autoreconf -i
  ./configure --prefix="${prefix}"
  make -j"${jobs}"
  make install
)

# samtools
(
  cd samtools
  autoheader
  autoconf -Wno-syntax
  ./configure --prefix="${prefix}"
  make -j"${jobs}"
  make install
)

# bcftools
(
  cd bcftools
  make -j"${jobs}"
  cp bcftools "${prefix}/bin/"
  cp -r plugins/* "${prefix}/plugins/"
)

# Gurobi (TLS certificate verification is left enabled for all downloads)
wget https://packages.gurobi.com/11.0/gurobi11.0.2_linux64.tar.gz
tar -xvf gurobi11.0.2_linux64.tar.gz
copy_files gurobi1102/linux64/lib "${prefix}/lib"
copy_files gurobi1102/linux64/include "${prefix}/include"

# zlib
wget https://zlib.net/current/zlib.tar.gz
tar -xvf zlib.tar.gz
(
  cd zlib-*/
  ./configure --prefix="${prefix}"
  make -j"${jobs}"
  make install
)

# sdsl-lite
(
  cd sdsl-lite/build
  cmake ..
  make -j"${jobs}"
)

# libhandlegraph
(
  mkdir -p libhandlegraph/build
  cd libhandlegraph/build
  cmake ..
  make -j"${jobs}" CPPFLAGS="-I../sdsl-lite/include" LDFLAGS="-L../sdsl-lite/build/lib/ -L../sdsl-lite/build/external/libdivsufsort/lib/"
)

# gbwt
(
  cd gbwt
  make -j"${jobs}" CPPFLAGS="-I../sdsl-lite/include -I../libhandlegraph/src/include/" LDFLAGS="-L../sdsl-lite/build/lib/ -L../sdsl-lite/build/external/libdivsufsort/lib/ -L../libhandlegraph/build/"
)

# gbwtgraph
(
  cd gbwtgraph
  make CPPFLAGS="-I../sdsl-lite/include -I../gbwt/include -I../libhandlegraph/src/include/" \
    LDFLAGS="-L../gbwt/lib -L../sdsl-lite/build/lib/ -L../sdsl-lite/build/external/libdivsufsort/lib/ -L../libhandlegraph/build/" -j"${jobs}"
  cp bin/gfa2gbwt "${prefix}/bin/"
  copy_files lib "${prefix}/lib"
)

# --- collect the built libraries ----------------------------------------------

copy_files gbwt/lib "${prefix}/lib"
copy_files sdsl-lite/build/lib "${prefix}/lib"
copy_files sdsl-lite/build/external/libdivsufsort/lib "${prefix}/lib"
cp libhandlegraph/build/libhandlegraph.a "${prefix}/lib/"
cp libhandlegraph/build/libhandlegraph.so "${prefix}/lib/"

# get vg
wget https://github.com/vgteam/vg/releases/download/v1.60.0/vg -O "${prefix}/bin/vg"
chmod +x "${prefix}/bin/vg"

# --- cleanup -------------------------------------------------------------------

cd "${root_dir}"
rm -rf -- "${build_dir:?}"
98+
99+
100+
# # Define the paths
101+
# BIN_PATH="$(pwd)/extra/bin"
102+
# LIB_PATH="$(pwd)/extra/lib"
103+
104+
# # Check if BIN_PATH is already in .bashrc
105+
# if ! grep -q "$BIN_PATH" ~/.bashrc; then
106+
# echo "Adding $BIN_PATH to PATH in .bashrc"
107+
# echo "export PATH=\"$BIN_PATH:\$PATH\"" >> ~/.bashrc
108+
# else
109+
# echo "$BIN_PATH is already in PATH"
110+
# fi
111+
112+
# # Check if LIB_PATH is already in .bashrc
113+
# if ! grep -q "$LIB_PATH" ~/.bashrc; then
114+
# echo "Adding $LIB_PATH to LD_LIBRARY_PATH in .bashrc"
115+
# echo "export LD_LIBRARY_PATH=\"$LIB_PATH:\$LD_LIBRARY_PATH\"" >> ~/.bashrc
116+
# else
117+
# echo "$LIB_PATH is already in LD_LIBRARY_PATH"
118+
# fi
119+
120+
# echo "Done! Paths are now updated and applied. Please source ~/.bashrc to apply the changes."

LICENSE

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
This software constitutes a joint work and the contributions of individual
2+
authors are subject to different licenses. Contributions and licenses are
3+
listed in the applicable source files, with specific details on each
4+
individual contribution captured in the revision control system.
5+
6+
--
7+
For all code, except as indicated otherwise:
8+
9+
PUBLIC DOMAIN NOTICE
10+
11+
This software is freely available to the public for use
12+
without a copyright notice. Restrictions cannot be placed on its present or
13+
future use.
14+
15+
--
16+
For code used from minigraph:
17+
18+
URL: https://lh3.github.io/minigraph
19+
20+
The MIT License
21+
22+
Copyright (c) 2019- Dana-Farber Cancer Institute
23+
24+
Permission is hereby granted, free of charge, to any person obtaining
25+
a copy of this software and associated documentation files (the
26+
"Software"), to deal in the Software without restriction, including
27+
without limitation the rights to use, copy, modify, merge, publish,
28+
distribute, sublicense, and/or sell copies of the Software, and to
29+
permit persons to whom the Software is furnished to do so, subject to
30+
the following conditions:
31+
32+
The above copyright notice and this permission notice shall be
33+
included in all copies or substantial portions of the Software.
34+
35+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
36+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
37+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
38+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
39+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
40+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
41+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42+
SOFTWARE.
43+
44+
--
45+
For code used for Murmurhash3
46+
47+
Apache License, Version 2.0
48+
49+
Austin Appleby
50+
51+
--
52+
For vg toolkit
53+
54+
The MIT License (MIT)
55+
56+
Copyright (c) 2014 Erik Garrison
57+
58+
59+
--
60+
For GBWTgraph
61+
62+
Copyright (c) 2019, 2020, 2021, 2022, 2023, 2024 Jouni Siren and other authors

Makefile

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Build configuration for PHI.
#
# Headers and libraries (including Gurobi) are looked up under extra/,
# which is populated by ./Installdeps.
#
# Compile-time flags, include paths, library search paths and libraries are
# kept in separate variables so each command only receives what it needs.
CXX      = g++
CXXFLAGS = -std=c++11 -fopenmp -pthread -march=native -mtune=native -O3
INCLUDES = -Iextra/include
LDFLAGS  = -Lextra/lib
LDLIBS   = -lgurobi_c++ -lgurobi110 -lm -lz -lpthread -ldl

src_dir := src

OBJS = $(src_dir)/main.o $(src_dir)/gfa-io.o $(src_dir)/gfa-base.o \
	$(src_dir)/options.o $(src_dir)/kalloc.o \
	$(src_dir)/misc.o $(src_dir)/sys.o $(src_dir)/ILP_index.o \
	$(src_dir)/MurmurHash3.o

# `all` and `clean` do not produce files of the same name.
.PHONY: all clean

all: PHI

# Link step: libraries come after the objects that reference them.
PHI: $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(LDLIBS)

# Compile step: no linker inputs are passed here.
$(src_dir)/%.o: $(src_dir)/%.cpp
	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@

clean:
	rm -f $(src_dir)/*.o PHI

README.md

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
<div align="center">
2+
<img src="test/logo/logo_phi.png" alt="PHI Logo" width="200">
3+
</div>
4+
5+
## <div align="center"><span style="color:red;"><b>PHI</b></span> (<span style="color:red;"><b>P</b></span>angenome-based <span style="color:red;"><b>H</b></span>aplotype <span style="color:red;"><b>I</b></span>nference)</div>
6+
7+
8+
## <a name="started"></a>Getting Started
9+
10+
### Prerequisites
11+
12+
Before using PHI, please ensure that **Miniforge** is installed: [Miniforge Installation Guide](https://github.com/conda-forge/miniforge). This package installer is used for installing a few dependencies such as VG and samtools. To run PHI, you also need a Gurobi license. You can get a free academic license [here](https://www.gurobi.com/academia/academic-program-and-licenses/). You should download and save the `gurobi.lic` file in your home directory.
13+
14+
## <a name="get_phi"></a>Get PHI
15+
16+
```bash
17+
git clone https://github.com/at-cg/PHI
18+
cd PHI
19+
# Install dependencies (Miniforge is required)
20+
./Installdeps
21+
export PATH="$(pwd)/extra/bin:$PATH"
22+
export LD_LIBRARY_PATH="$(pwd)/extra/lib:$LD_LIBRARY_PATH"
23+
make
24+
25+
# test run
26+
./PHI -t32 -g test/MHC_4.gfa.gz -r test/CHM13_reads.fq.gz -o CHM13.fa
27+
28+
# test run with VCF file as input
29+
./vcf2gfa.py -v test/MHC_4.vcf.gz -r test/MHC-CHM13.0.fa.gz | bgzip > test/MHC_4_vcf.gfa.gz
30+
./PHI -t32 -g test/MHC_4_vcf.gfa.gz -r test/CHM13_reads.fq.gz -o CHM13.fa
31+
```
32+
33+
#### Adding Binary and Library Paths to `.bashrc`
34+
To ensure that the `extra/bin` and `extra/lib` directories are automatically loaded for every terminal session, you can export them to your `~/.bashrc`. This will make sure the required binaries and libraries for `PHI` are available.
35+
36+
```bash
37+
# Add extra/bin and extra/lib to .bashrc
38+
echo "export PATH=\"$(pwd)/extra/bin:\$PATH\"" >> ~/.bashrc
39+
echo "export LD_LIBRARY_PATH=\"$(pwd)/extra/lib:\$LD_LIBRARY_PATH\"" >> ~/.bashrc
40+
source ~/.bashrc
41+
```
42+
43+
## Table of Contents
44+
45+
- [Getting Started](#started)
46+
- [Get PHI](#get_phi)
47+
- [Introduction](#intro)
48+
- [Results](#results)
49+
- [Future work](#future)
50+
- [Publications](#pub)
51+
52+
## <a name="intro"></a>Introduction
53+
PHI is a pangenome-based genotyping method. It estimates complete haplotype sequence from low-coverage sequencing data (short-reads or long-reads of a haploid genome). Users should provide a pangenome graph reference in either:
54+
- Graph Format ([GFA v1.1](http://gfa-spec.github.io/GFA-spec/GFA1.html#gfa-11)): A sequence graph-based representation of the pangenome graph. Graph should be acyclic.
55+
- Variant Call Format ([VCF](https://samtools.github.io/hts-specs/VCFv4.2.pdf)): A list of multi-sample, multi-allelic phased variants along with a reference genome.
56+
57+
Output of PHI is the haplotype sequence (FASTA) associated with the optimal inferred path from the graph. It identifies a path in the pangenome graph that maximizes the matches between the path and read k-mers while minimizing recombination events (haplotype switches) along the path. We implemented integer programming to compute an optimal solution. The integer program is solved optimally using the [Gurobi optimizer](https://www.gurobi.com). Details of these formulations are described in our [paper](#pub).
58+
59+
60+
## <a name="results"></a>Results
61+
We benchmarked PHI (v1.0) using short-read datasets sampled from MHC sequences of five haplotypes (APD, DBB, MANN, QBL, and SSTO). This data was generated by [Houwaart et al. (2022)](https://doi.org/10.1111/tan.15020). These datasets were downsampled to various coverages ranging from 0.1x to 10x. We built a pangenome graph using [Minigraph-Cactus](https://github.com/ComparativeGenomicsToolkit/cactus/tree/master), comprising 49 complete [MHC sequences](https://doi.org/10.5281/zenodo.6617246). To assess the accuracy of PHI, we evaluated the edit distance between the inferred haplotype sequences and the MHC sequences from Houwaart et al. that were determined using de novo assembly and curation.
62+
63+
<p align="center" id="F1-score">
64+
<img src="data/edit_distances.jpg" width="700" alt="F1-score"/>
65+
</p>
66+
67+
> Edit distance between ground-truth haplotype sequences and the sequences estimated by different tools (PHI, VG, and PanGenie). Lower edit distance implies higher accuracy. PHI provides an advantage over existing methods on low-coverage inputs.
68+
69+
In PHI, we have implemented two integer programs (referred to as ILP and IQP respectively). They both solve the same problem, but differ in terms of their runtime and memory usage. IQP is generally faster but it requires more memory. Users can select between the two using a command-line argument (see `./PHI -h`).
70+
71+
<p align="center" id="F1-score">
72+
<img src="data/phi_vs_phi_ilp.jpg" width="700" alt="F1-score"/>
73+
</p>
74+
75+
> Performance comparison between ILP and IQP.
76+
77+
The scripts to reproduce the results are available [here](data).
78+
79+
80+
## <a name="future"></a>Future Work
81+
- Add support for diploid haplotype estimation.
82+
- Scale to pangenome graphs having a larger number of genomes.
83+
84+
85+
## <a name="pub"></a>Publications
86+
- **Ghanshyam Chandra, Md Helal Hossen, Stephan Scholz, Alexander T Dilthey, Daniel Gibney and Chirag Jain**. "[Integer programming framework for pangenome-based genome inference](https://www.biorxiv.org/)". *bioRxiv* 2024.

data/Ground_truth/APD.fasta.gz

1.36 MB
Binary file not shown.

data/Ground_truth/COX.fasta.gz

1.34 MB
Binary file not shown.

data/Ground_truth/DBB.fasta.gz

1.39 MB
Binary file not shown.

data/Ground_truth/KAS116.fasta.gz

1.35 MB
Binary file not shown.

data/Ground_truth/MANN.fasta.gz

1.39 MB
Binary file not shown.

data/Ground_truth/PGF.fasta.gz

1.39 MB
Binary file not shown.

data/Ground_truth/QBL.fasta.gz

1.35 MB
Binary file not shown.

data/Ground_truth/SSTO.fasta.gz

1.39 MB
Binary file not shown.

data/MHC.seqfile

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
CHM13.0 data/hprc_haps/MHC-CHM13.0.fa
2+
HG002.1 data/hprc_haps/MHC-HG002.1.fa
3+
HG002.2 data/hprc_haps/MHC-HG002.2.fa
4+
HG00438.1 data/hprc_haps/MHC-HG00438.1.fa
5+
HG00438.2 data/hprc_haps/MHC-HG00438.2.fa
6+
HG005.1 data/hprc_haps/MHC-HG005.1.fa
7+
HG005.2 data/hprc_haps/MHC-HG005.2.fa
8+
HG00621.1 data/hprc_haps/MHC-HG00621.1.fa
9+
HG00621.2 data/hprc_haps/MHC-HG00621.2.fa
10+
HG00741.1 data/hprc_haps/MHC-HG00741.1.fa
11+
HG00741.2 data/hprc_haps/MHC-HG00741.2.fa
12+
HG01106.1 data/hprc_haps/MHC-HG01106.1.fa
13+
HG01106.2 data/hprc_haps/MHC-HG01106.2.fa
14+
HG01109.1 data/hprc_haps/MHC-HG01109.1.fa
15+
HG01109.2 data/hprc_haps/MHC-HG01109.2.fa
16+
HG01123.1 data/hprc_haps/MHC-HG01123.1.fa
17+
HG01123.2 data/hprc_haps/MHC-HG01123.2.fa
18+
HG01258.1 data/hprc_haps/MHC-HG01258.1.fa
19+
HG01258.2 data/hprc_haps/MHC-HG01258.2.fa
20+
HG01358.1 data/hprc_haps/MHC-HG01358.1.fa
21+
HG01358.2 data/hprc_haps/MHC-HG01358.2.fa
22+
HG01361.1 data/hprc_haps/MHC-HG01361.1.fa
23+
HG01361.2 data/hprc_haps/MHC-HG01361.2.fa
24+
HG01891.1 data/hprc_haps/MHC-HG01891.1.fa
25+
HG01891.2 data/hprc_haps/MHC-HG01891.2.fa
26+
HG01928.1 data/hprc_haps/MHC-HG01928.1.fa
27+
HG01928.2 data/hprc_haps/MHC-HG01928.2.fa
28+
HG01952.1 data/hprc_haps/MHC-HG01952.1.fa
29+
HG01952.2 data/hprc_haps/MHC-HG01952.2.fa
30+
HG01978.1 data/hprc_haps/MHC-HG01978.1.fa
31+
HG01978.2 data/hprc_haps/MHC-HG01978.2.fa
32+
HG02080.1 data/hprc_haps/MHC-HG02080.1.fa
33+
HG02080.2 data/hprc_haps/MHC-HG02080.2.fa
34+
HG02257.1 data/hprc_haps/MHC-HG02257.1.fa
35+
HG02257.2 data/hprc_haps/MHC-HG02257.2.fa
36+
HG02486.1 data/hprc_haps/MHC-HG02486.1.fa
37+
HG02486.2 data/hprc_haps/MHC-HG02486.2.fa
38+
HG02559.1 data/hprc_haps/MHC-HG02559.1.fa
39+
HG02559.2 data/hprc_haps/MHC-HG02559.2.fa
40+
HG02622.1 data/hprc_haps/MHC-HG02622.1.fa
41+
HG02622.2 data/hprc_haps/MHC-HG02622.2.fa
42+
HG02717.1 data/hprc_haps/MHC-HG02717.1.fa
43+
HG02717.2 data/hprc_haps/MHC-HG02717.2.fa
44+
HG02886.1 data/hprc_haps/MHC-HG02886.1.fa
45+
HG02886.2 data/hprc_haps/MHC-HG02886.2.fa
46+
HG03540.1 data/hprc_haps/MHC-HG03540.1.fa
47+
HG03540.2 data/hprc_haps/MHC-HG03540.2.fa
48+
NA18906.1 data/hprc_haps/MHC-NA18906.1.fa
49+
NA18906.2 data/hprc_haps/MHC-NA18906.2.fa

0 commit comments

Comments
 (0)