Skip to content

Commit 267c8de

Browse files
awenocurgspowley
andauthored
Update to C++ accelerated IAF computation (#538)
* add dummy condition when IAF field is detected * add option for dataset-wide cohort for IAF computation * actually compute IAF from full dataset sample count when requested * add test for full sample scan * count 'ref' in variant stats array * examples appear to work * disable async variant stats reader and fix race * unify ref,alt representation * check stats table version * make clumsy types more parsimonious * clarify logic, flip default async value * update stats check to require a minimum version, update error message * temporary Quarto rendering fix --------- Co-authored-by: George Powley <george.powley@gmail.com>
1 parent 46caae3 commit 267c8de

File tree

15 files changed

+209
-29
lines changed

15 files changed

+209
-29
lines changed

.github/workflows/quarto-render.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ jobs:
1717
- name: "Quarto render"
1818
shell: bash
1919
run: |
20-
pip install quartodoc
20+
pip install quartodoc "pydantic<2"
2121
# create a symlink to the tiledbvcf python package, so it doesn't have to be installed
2222
ln -s apis/python/src/tiledbvcf
2323
quartodoc build

apis/python/src/tiledbvcf/binding/libtiledbvcf.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ PYBIND11_MODULE(libtiledbvcf, m) {
3939
.def("set_output_path", &Reader::set_output_path)
4040
.def("set_output_dir", &Reader::set_output_dir)
4141
.def("set_af_filter", &Reader::set_af_filter)
42+
.def("set_scan_all_samples", &Reader::set_scan_all_samples)
4243
.def("read", &Reader::read, py::arg("release_buffs") = true)
4344
.def("get_results_arrow", &Reader::get_results_arrow)
4445
.def("completed", &Reader::completed)

apis/python/src/tiledbvcf/binding/reader.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,12 +201,26 @@ void Reader::set_af_filter(const std::string& af_filter) {
201201
reader, tiledb_vcf_reader_set_af_filter(reader, af_filter.c_str()));
202202
}
203203

204+
void Reader::set_scan_all_samples(const bool scan_all_samples) {
205+
auto reader = ptr.get();
206+
check_error(
207+
reader, tiledb_vcf_reader_set_scan_all_samples(reader, scan_all_samples));
208+
}
209+
204210
void Reader::read(const bool release_buffs) {
205211
auto reader = ptr.get();
206212
bool af_filter_enabled = false;
207213
if (tiledb_vcf_reader_get_af_filter_exists(reader, &af_filter_enabled) ==
208214
TILEDB_VCF_ERR)
209215
throw std::runtime_error("TileDB-VCF-Py: Error finding AF filter.");
216+
if (!af_filter_enabled) {
217+
for (std::string attribute : attributes_) {
218+
if (attribute == "info_TILEDB_IAF") {
219+
tiledb_vcf_reader_set_af_filter(reader, ">=0");
220+
af_filter_enabled = true;
221+
}
222+
}
223+
}
210224
if (af_filter_enabled) {
211225
// add alleles buffer only if not already present
212226
bool add_alleles = true;

apis/python/src/tiledbvcf/binding/reader.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,9 @@ class Reader {
154154
/** Set internal allele frequency filtering expression */
155155
void set_af_filter(const std::string& af_filter);
156156

157+
/** Set whether to scan all samples in the dataset when computing frequency */
158+
void set_scan_all_samples(const bool scan_all_samples);
159+
157160
/** Set the TileDB query buffer memory percentage */
158161
void set_buffer_percentage(float buffer_percentage);
159162

apis/python/src/tiledbvcf/dataset.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@ def read_arrow(
212212
bed_file: str = None,
213213
skip_check_samples: bool = False,
214214
set_af_filter: str = "",
215+
scan_all_samples: bool = False,
215216
enable_progress_estimation: bool = False,
216217
) -> pa.Table:
217218
"""
@@ -239,6 +240,8 @@ def read_arrow(
239240
set_af_filter
240241
Filter variants by internal allele frequency. For example, to include
241242
variants with AF > 0.1, set this to ">0.1".
243+
scan_all_samples
244+
Scan all samples when computing internal allele frequency.
242245
enable_progress_estimation
243246
**DEPRECATED** - This parameter will be removed in a future release.
244247
@@ -264,6 +267,7 @@ def read_arrow(
264267
self.reader.set_attributes(attrs)
265268
self.reader.set_check_samples_exist(not skip_check_samples)
266269
self.reader.set_af_filter(set_af_filter)
270+
self.reader.set_scan_all_samples(scan_all_samples)
267271
self.reader.set_enable_progress_estimation(enable_progress_estimation)
268272

269273
if bed_file is not None:
@@ -280,6 +284,7 @@ def read(
280284
bed_file: str = None,
281285
skip_check_samples: bool = False,
282286
set_af_filter: str = "",
287+
scan_all_samples: bool = False,
283288
enable_progress_estimation: bool = False,
284289
) -> pd.DataFrame:
285290
"""
@@ -334,6 +339,7 @@ def read(
334339
self.reader.set_attributes(attrs)
335340
self.reader.set_check_samples_exist(not skip_check_samples)
336341
self.reader.set_af_filter(set_af_filter)
342+
self.reader.set_scan_all_samples(scan_all_samples)
337343
self.reader.set_enable_progress_estimation(enable_progress_estimation)
338344

339345
if bed_file is not None:
@@ -369,6 +375,11 @@ def export(
369375
URI of a BED file of genomic regions to be read.
370376
skip_check_samples
371377
Skip checking if the samples in `samples_file` exist in the dataset.
378+
set_af_filter
379+
Filter variants by internal allele frequency. For example, to include
380+
variants with AF > 0.1, set this to ">0.1".
381+
scan_all_samples
382+
Scan all samples when computing internal allele frequency.
372383
enable_progress_estimation
373384
**DEPRECATED** - This parameter will be removed in a future release.
374385
merge

apis/python/tests/test_tiledbvcf.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1168,6 +1168,17 @@ def test_ingest_with_stats(tmp_path):
11681168
data_frame[data_frame["sample_name"] == "second"]["info_TILEDB_IAF"].iloc[0][0]
11691169
== 0.0625
11701170
)
1171+
data_frame = ds.read(
1172+
samples=sample_names,
1173+
attrs=["contig", "pos_start", "id", "qual", "info_TILEDB_IAF", "sample_name"],
1174+
scan_all_samples=True,
1175+
)
1176+
assert (
1177+
data_frame[
1178+
(data_frame["sample_name"] == "second") & (data_frame["pos_start"] == 4)
1179+
]["info_TILEDB_IAF"].iloc[0][0]
1180+
== 0.0625
1181+
)
11711182

11721183

11731184
def test_ingest_mode_separate(tmp_path):

libtiledbvcf/src/c_api/tiledbvcf.cc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -942,6 +942,18 @@ int32_t tiledb_vcf_reader_set_af_filter(
942942
return TILEDB_VCF_OK;
943943
}
944944

945+
int32_t tiledb_vcf_reader_set_scan_all_samples(
946+
tiledb_vcf_reader_t* reader, bool scan_all_samples) {
947+
if (sanity_check(reader) == TILEDB_VCF_ERR)
948+
return TILEDB_VCF_ERR;
949+
950+
if (SAVE_ERROR_CATCH(
951+
reader, reader->reader_->set_scan_all_samples(scan_all_samples)))
952+
return TILEDB_VCF_ERR;
953+
954+
return TILEDB_VCF_OK;
955+
}
956+
945957
/* ********************************* */
946958
/* BED FILE */
947959
/* ********************************* */

libtiledbvcf/src/c_api/tiledbvcf.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -918,6 +918,13 @@ TILEDBVCF_EXPORT int32_t tiledb_vcf_reader_set_output_path(
918918
TILEDBVCF_EXPORT int32_t tiledb_vcf_reader_set_af_filter(
919919
tiledb_vcf_reader_t* reader, const char* af_filter);
920920

921+
/**
922+
* Sets whether to scan all samples for IAF computation
923+
*
924+
*/
925+
TILEDBVCF_EXPORT int32_t tiledb_vcf_reader_set_scan_all_samples(
926+
tiledb_vcf_reader_t* reader, bool scan_all_samples);
927+
921928
/**
922929
* Sets export output directory
923930
* @param reader VCF reader object

libtiledbvcf/src/read/reader.cc

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <iomanip>
2929
#include <random>
3030
#include <thread>
31+
#include <vector>
3132

3233
#include "dataset/attribute_buffer_set.h"
3334
#include "read/bcf_exporter.h"
@@ -1282,8 +1283,17 @@ bool Reader::process_query_results_v4() {
12821283

12831284
read_state_.query_results.af_values.clear();
12841285
int allele_index = 0;
1286+
size_t num_samples = 0;
1287+
if (params_.scan_all_samples) {
1288+
num_samples = dataset_->sample_names().size();
1289+
}
1290+
bool is_ref = true;
12851291
for (auto&& allele : alleles) {
1286-
auto [allele_passes, af] = af_filter_->pass(real_start, allele);
1292+
auto [allele_passes, af] = af_filter_->pass(
1293+
real_start,
1294+
is_ref ? "ref" : alleles[0] + "," + allele,
1295+
params_.scan_all_samples,
1296+
num_samples);
12871297

12881298
// If the allele is in GT, consider it in the pass computation
12891299
// TODO: when supporting greater than diploid organisms, expand the
@@ -1303,6 +1313,7 @@ bool Reader::process_query_results_v4() {
13031313
read_state_.query_results.af_values.push_back(af);
13041314

13051315
LOG_TRACE(" pass = {}", pass);
1316+
is_ref = false;
13061317
}
13071318

13081319
// If all alleles do not pass the af filter, continue
@@ -2562,6 +2573,10 @@ void Reader::set_af_filter(const std::string& af_filter) {
25622573
}
25632574
}
25642575

2576+
void Reader::set_scan_all_samples(bool scan_all_samples) {
2577+
params_.scan_all_samples = scan_all_samples;
2578+
}
2579+
25652580
void Reader::set_tiledb_query_config() {
25662581
assert(read_state_.query != nullptr);
25672582
assert(buffers_a != nullptr);

libtiledbvcf/src/read/reader.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,9 @@ struct ExportParams {
139139
// and VALUE = float
140140
// If empty, AF filtering is not applied
141141
std::string af_filter = "";
142+
143+
// Should all samples be scanned when computing internal allele frequency?
144+
bool scan_all_samples = false;
142145
};
143146

144147
/* ********************************* */
@@ -443,6 +446,12 @@ class Reader {
443446
*/
444447
void set_af_filter(const std::string& af_filter);
445448

449+
/**
450+
* sets whether to scan all samples in the dataset when computing IAF
451+
* @param af_filter setting
452+
*/
453+
void set_scan_all_samples(bool scan_all_samples);
454+
446455
/**
447456
* Sets disabling of progress estimation in verbose mode
448457
* @param enable_progress_estimation setting

0 commit comments

Comments
 (0)