Skip to content

Commit ec45c63

Browse files
authored
Add Hail-like AC, AN, and AF computation to variant stats V3 (#731)
* read more fields from variant stats * make use of V3 additions in variant stats reader, cache non-refs * make variant stats reader track entering/exiting ref blocks * fix sorting logic for ref blocks * update IAF with ref blocks for variant stats V3 * fix AF computation for variant stats V3 * fix data frame cardinality for variant stats reads * add TILEDB_IAC, TILEDB_IAN to data query * update variant stats reader for v3, fix variant stats timing * fix progressive VCF stats computation, update tests * fix variant stats v3 writer: correct ordering and END * read in max_length when reading variant stats * Refactor inclusion/exclusion of positions, add non-ref extension * info_TILEDB_IAN is pared down to a single encapsulated element * fix type mismatch * make tiledb_IAF_IAN a scalar value * minor fix to comparison * isolate variant stats v3 feature from v2 * refactored stats v3 writer in-place sorting to include samples * simplify in-place sort loop in variant stats writer * clean up extra string comparison * move AF computation to method file * variant stats writer sorting fix * fix IAC/IAF computation error for variant stats v3 * refactor * fix AF computations * fix test * restructure writer for normalizing alleles * add normalization for variant stats v3 * change normalization of variant stats V3 to pairwise * reimplement variant stats V3 pairwise normalization, including ref This reverts commit 81eda28. * address nits and add documentation * non-functional assignment change for variant stats end
1 parent be5a5c5 commit ec45c63

16 files changed

+752
-169
lines changed

apis/python/tests/test_tiledbvcf.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1241,7 +1241,7 @@ def test_ingest_with_stats_v3(tmp_path):
12411241
attrs=["contig", "pos_start", "id", "qual", "info_TILEDB_IAF", "sample_name"],
12421242
set_af_filter="<0.2",
12431243
)
1244-
assert data_frame.shape == (3, 8)
1244+
assert data_frame.shape == (1, 8)
12451245
assert data_frame.query("sample_name == 'second'")["qual"].iloc[0] == pytest.approx(
12461246
343.73
12471247
)
@@ -1262,17 +1262,16 @@ def test_ingest_with_stats_v3(tmp_path):
12621262
)
12631263
ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "stats_test"), mode="r")
12641264
df = ds.read_variant_stats("chr1:1-10000")
1265-
assert df.shape == (12, 5)
1265+
assert df.shape == (13, 5)
12661266
df = tiledbvcf.allele_frequency.read_allele_frequency(
12671267
os.path.join(tmp_path, "stats_test"), "chr1:1-10000"
12681268
)
12691269
assert df.pos.is_monotonic_increasing
12701270
df["an_check"] = (df.ac / df.af).round(0).astype("int32")
12711271
assert df.an_check.equals(df.an)
12721272
df = ds.read_variant_stats("chr1:1-10000")
1273-
assert df.shape == (12, 5)
1273+
assert df.shape == (13, 5)
12741274
df = df.to_pandas()
1275-
assert sum(sum((df[df["alleles"] == "nr"] == (9, "nr", 4, 4, 1.0)).values)) == 5
12761275
df = ds.read_allele_count("chr1:1-10000")
12771276
assert df.shape == (7, 6)
12781277
df = df.to_pandas()
@@ -1343,15 +1342,15 @@ def test_ingest_with_stats_v2(tmp_path):
13431342
)
13441343
ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "stats_test"), mode="r")
13451344
df = ds.read_variant_stats("chr1:1-10000")
1346-
assert df.shape == (12, 5)
1345+
assert df.shape == (13, 5)
13471346
df = tiledbvcf.allele_frequency.read_allele_frequency(
13481347
os.path.join(tmp_path, "stats_test"), "chr1:1-10000"
13491348
)
13501349
assert df.pos.is_monotonic_increasing
13511350
df["an_check"] = (df.ac / df.af).round(0).astype("int32")
13521351
assert df.an_check.equals(df.an)
13531352
df = ds.read_variant_stats("chr1:1-10000")
1354-
assert df.shape == (12, 5)
1353+
assert df.shape == (13, 5)
13551354
df = df.to_pandas()
13561355
df = ds.read_allele_count("chr1:1-10000")
13571356
assert df.shape == (7, 6)

libtiledbvcf/src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ set(TILEDB_VCF_SOURCES
6868
${CMAKE_CURRENT_SOURCE_DIR}/utils/bitmap.cc
6969
${CMAKE_CURRENT_SOURCE_DIR}/utils/buffer.cc
7070
${CMAKE_CURRENT_SOURCE_DIR}/utils/logger.cc
71+
${CMAKE_CURRENT_SOURCE_DIR}/utils/normalize.cc
7172
${CMAKE_CURRENT_SOURCE_DIR}/utils/sample_utils.cc
7273
${CMAKE_CURRENT_SOURCE_DIR}/utils/utils.cc
7374
${CMAKE_CURRENT_SOURCE_DIR}/vcf/bed_file.cc

libtiledbvcf/src/dataset/tiledbvcfdataset.cc

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -764,14 +764,35 @@ void TileDBVCFDataset::load_field_type_maps_v4(
764764
}
765765

766766
if (add_iaf) {
767+
int header_appended = bcf_hdr_append(
768+
// TODO: do something better than promoting this pointer type;
769+
// perhaps the header should be duplicated and later modified
770+
const_cast<bcf_hdr_t*>(hdr),
771+
"##INFO=<ID=TILEDB_IAF,Number=R,Type=Float,Description="
772+
"\"Internal Allele Frequency, computed over dataset by TileDB\">");
773+
if (header_appended < 0) {
774+
throw std::runtime_error(
775+
"Error appending to header for internal allele frequency.");
776+
}
767777
if (bcf_hdr_append(
768778
// TODO: do something better than promoting this pointer type;
769779
// perhaps the header should be duplicated and later modified
770780
const_cast<bcf_hdr_t*>(hdr),
771-
"##INFO=<ID=TILEDB_IAF,Number=R,Type=Float,Description=\"Internal "
772-
"Allele Frequency, computed over dataset by TileDB\">") < 0) {
781+
"##INFO=<ID=TILEDB_IAC,Number=R,Type=Integer,Description="
782+
"\"Internal Allele Count, computed over dataset by TileDB\">") <
783+
0) {
773784
throw std::runtime_error(
774-
"Error appending to header for internal allele frequency.");
785+
"Error appending to header for internal allele count.");
786+
}
787+
if (bcf_hdr_append(
788+
// TODO: do something better than promoting this pointer type;
789+
// perhaps the header should be duplicated and later modified
790+
const_cast<bcf_hdr_t*>(hdr),
791+
"##INFO=<ID=TILEDB_IAN,Number=R,Type=Integer,Description="
792+
"\"Internal Allele Number, computed over dataset by TileDB\">") <
793+
0) {
794+
throw std::runtime_error(
795+
"Error appending to header for internal allele number.");
775796
}
776797
info_iaf_field_type_added_ = true;
777798
if (bcf_hdr_sync(const_cast<bcf_hdr_t*>(hdr)) < 0) {
@@ -949,8 +970,8 @@ void TileDBVCFDataset::delete_samples(
949970
throw std::runtime_error("Sample not found in dataset: " + sample);
950971
}
951972

952-
// If a stats array exists, read the data with the delete exporter, which
953-
// adds negative counts to the stats arrays
973+
// If a stats array exists, read the data with the delete exporter,
974+
// which adds negative counts to the stats arrays
954975
if (stats_array_exists) {
955976
ExportParams args;
956977
args.tiledb_config = tiledb_config;
@@ -1132,7 +1153,8 @@ std::unordered_map<uint32_t, SafeBCFHdr> TileDBVCFDataset::fetch_vcf_headers_v4(
11321153
// first sample.
11331154
if (first_sample && num_rows == 0) {
11341155
LOG_DEBUG(
1135-
"[fetch_vcf_headers_v4] the first sample was deleted, resubmitting");
1156+
"[fetch_vcf_headers_v4] the first sample was deleted, "
1157+
"resubmitting");
11361158
mq = std::make_unique<ManagedQuery>(
11371159
vcf_header_array_, "fetch_vcf_headers_v4");
11381160
mq->select_columns({"sample", "header"});
@@ -2002,7 +2024,8 @@ const char* TileDBVCFDataset::materialized_attribute_name(
20022024

20032025
bool TileDBVCFDataset::is_attribute_materialized(
20042026
const std::string& attr) const {
2005-
if (attr == "info_TILEDB_IAF") {
2027+
if (attr == "info_TILEDB_IAF" || attr == "info_TILEDB_IAC" ||
2028+
attr == "info_TILEDB_IAN") {
20062029
return true;
20072030
}
20082031

libtiledbvcf/src/read/in_memory_exporter.cc

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
*/
2626

2727
#include "read/in_memory_exporter.h"
28+
#include <stdexcept>
2829
#include "enums/attr_datatype.h"
2930

3031
namespace tiledb {
@@ -594,7 +595,8 @@ bool InMemoryExporter::fixed_len_attr(const std::string& attr) {
594595
"fmt_PS",
595596
"fmt_PQ",
596597
"fmt_MQ",
597-
"fmt_MIN_DP"};
598+
"fmt_MIN_DP",
599+
"info_TILEDB_IAN"};
598600
return fixed_len.count(attr) > 0;
599601
}
600602

@@ -975,13 +977,35 @@ bool InMemoryExporter::copy_info_fmt_value(
975977
const std::string& field_name = dest->info_fmt_field_name;
976978
const bool is_gt = field_name == "GT";
977979
const bool is_iaf = field_name == "TILEDB_IAF";
980+
const bool is_iac = field_name == "TILEDB_IAC";
981+
const bool is_ian = field_name == "TILEDB_IAN";
982+
978983
const void* src = nullptr;
979984
uint64_t nbytes = 0, nelts = 0;
980-
if (is_iaf && !curr_query_results_->af_values.empty()) {
985+
auto& af_values = curr_query_results_->af_values;
986+
auto& ac_values = curr_query_results_->ac_values;
987+
uint32_t an_value = curr_query_results_->an_value;
988+
if ((is_iaf || is_iac || is_ian) && !af_values.empty()) {
989+
if (af_values.size() != ac_values.size()) {
990+
throw std::runtime_error(
991+
"inconsistent cardinality of IAC/IAN/IAF vectors in query results");
992+
}
981993
// assign source pointer, nbytes, and nelts from vector
982-
src = curr_query_results_->af_values.data();
983-
nelts = curr_query_results_->af_values.size();
984-
nbytes = nelts * sizeof(decltype(curr_query_results_->af_values.at(0)));
994+
if (is_iaf) {
995+
src = af_values.data();
996+
nelts = af_values.size();
997+
nbytes = nelts * sizeof(decltype(af_values.at(0)));
998+
} else {
999+
if (is_iac) {
1000+
src = ac_values.data();
1001+
nelts = ac_values.size();
1002+
nbytes = nelts * sizeof(decltype(ac_values.at(0)));
1003+
} else {
1004+
src = &an_value;
1005+
nelts = 1;
1006+
nbytes = nelts * sizeof(decltype(an_value));
1007+
}
1008+
}
9851009
} else {
9861010
get_info_fmt_value(dest, cell_idx, &src, &nbytes, &nelts);
9871011
}

libtiledbvcf/src/read/read_query_results.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ struct ReadQueryResults {
8888

8989
/** TileDB Internal Allele Frequency values*/
9090
std::vector<float> af_values;
91+
std::vector<uint32_t> ac_values;
92+
uint32_t an_value;
9193

9294
private:
9395
/** Pointer to buffer set holding the actual data. */

libtiledbvcf/src/read/reader.cc

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
#include "read/reader.h"
4242
#include "read/tsv_exporter.h"
4343
#include "utils/logger_public.h"
44+
#include "utils/normalize.h"
4445
#include "utils/utils.h"
4546

4647
namespace tiledb {
@@ -183,6 +184,10 @@ void Reader::init_af_filter() {
183184
Group group(*ctx_, dataset_->root_uri(), TILEDB_READ);
184185
af_filter_ = std::make_unique<VariantStatsReader>(ctx_, group);
185186
af_filter_->set_condition(params_.af_filter);
187+
// TODO: enable sorting only if needed for GVCF IAF feature
188+
if (af_filter_->array_version() > 2) {
189+
params_.sort_real_start_pos = true;
190+
}
186191
}
187192
}
188193

@@ -948,7 +953,7 @@ void Reader::prepare_variant_stats() {
948953
"Error preparing variant stats: there should be exactly one region "
949954
"specified");
950955
}
951-
af_filter_->add_region(params_.regions[0]);
956+
af_filter_->add_region(params_.regions[0], true);
952957
af_filter_->compute_af();
953958
}
954959

@@ -1372,12 +1377,20 @@ bool Reader::process_query_results_v4() {
13721377
}
13731378

13741379
read_state_.query_results.af_values.clear();
1380+
read_state_.query_results.ac_values.clear();
1381+
read_state_.query_results.an_value = 0;
13751382
int allele_index = 0;
13761383
bool is_ref = true;
1384+
uint32_t an = 0;
13771385
for (auto&& allele : alleles) {
1378-
auto [allele_passes, af] = af_filter_->pass(
1386+
std::string normalized_ref = alleles[0];
1387+
std::string normalized_allele = allele;
1388+
if (af_filter_->array_version() > 2) {
1389+
normalize(normalized_ref, normalized_allele);
1390+
}
1391+
auto [allele_passes, af, ac, allele_an] = af_filter_->pass(
13791392
real_start,
1380-
is_ref ? "ref" : alleles[0] + "," + allele,
1393+
is_ref ? "ref" : normalized_ref + "," + normalized_allele,
13811394
params_.scan_all_samples,
13821395
num_samples);
13831396

@@ -1399,10 +1412,13 @@ bool Reader::process_query_results_v4() {
13991412
// - build vector of AFs matching the order of the VCF record
14001413
// - all allele AF values are required, so do not exit this loop early
14011414
read_state_.query_results.af_values.push_back(af);
1415+
read_state_.query_results.ac_values.push_back(ac);
1416+
an = allele_an;
14021417

14031418
LOG_TRACE(" pass = {}", pass);
14041419
is_ref = false;
14051420
}
1421+
read_state_.query_results.an_value = an;
14061422

14071423
// If all alleles do not pass the af filter, continue
14081424
if (!pass) {

0 commit comments

Comments
 (0)