Skip to content

Commit 1563340

Browse files
authored
Merge pull request #171 from joergi-w/snp_indel_call
Calculate active regions based on Cigar information
2 parents 2844212 + e180d2a commit 1563340

File tree

6 files changed

+174
-63
lines changed

6 files changed

+174
-63
lines changed

include/variant_detection/snp_indel_detection.hpp

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,30 @@
22

33
#include <seqan3/std/filesystem>
44

5+
#include <seqan3/alphabet/cigar/cigar.hpp>
6+
57
/*!
68
* \brief Detect Single Nucleotide Polymorphisms (SNPs) and short deletions and insertions.
79
* \param[in] reads_filename - The file path where to find the sequenced reads.
10+
* \param[in] min_var_length - The length above which an indel/SNP is considered a variant.
11+
*/
12+
void detect_snp_and_indel(std::filesystem::path const & reads_filename, uint64_t min_var_length);
13+
14+
/*!
15+
* \brief Extract activity from SAM records by counting indels and soft clips.
16+
* \param[in,out] activity - The activity values for one reference genome.
17+
* \param[in] cigar_sequence - The cigar string of the SAM record.
18+
* \param[in] min_var_length - The length above which an indel/SNP is considered a variant.
19+
* \param[in] ref_pos - The start position of the alignment in the genome.
20+
*/
21+
void update_activity_for_record(std::vector<unsigned> & activity,
22+
std::vector<seqan3::cigar> const & cigar_sequence,
23+
uint64_t min_var_length,
24+
int32_t ref_pos);
25+
26+
/*!
27+
* \brief Extract active regions from activity profile.
28+
* \param[in] activity - The activity values for one reference genome.
29+
* \return a list of position intervals, where the activity is high.
830
*/
9-
void detect_snp_and_indel(std::filesystem::path const & reads_filename);
31+
std::vector<std::pair<size_t, size_t>> active_regions(std::vector<unsigned> const & activity);

src/iGenVar.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ void detect_variants_in_alignment_file(cmd_arguments const & args)
150150
if (!args.alignment_short_reads_file_path.empty() && !args.genome_file_path.empty())
151151
{
152152
seqan3::debug_stream << "Detect SNPs and indels in short reads...\n";
153-
detect_snp_and_indel(args.alignment_short_reads_file_path);
153+
detect_snp_and_indel(args.alignment_short_reads_file_path, args.min_var_length);
154154
}
155155

156156
std::sort(junctions.begin(), junctions.end());
Lines changed: 107 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,106 @@
11
#include <iostream>
22

3-
#include <seqan3/alphabet/cigar/cigar.hpp>
43
#include <seqan3/core/debug_stream.hpp>
54
#include <seqan3/io/sam_file/input.hpp>
65

76
#include "iGenVar.hpp"
87
#include "variant_detection/bam_functions.hpp" // check the sam flags
98
#include "variant_detection/snp_indel_detection.hpp" // detect_snp_and_indel
109

11-
void detect_snp_and_indel(std::filesystem::path const & reads_filename)
10+
void update_activity_for_record(std::vector<unsigned> & activity,
11+
std::vector<seqan3::cigar> const & cigar_sequence,
12+
uint64_t const min_var_length,
13+
int32_t ref_pos)
14+
{
15+
// Track the current position within the read.
16+
unsigned read_pos = 0;
17+
18+
// Step through the CIGAR string.
19+
for (auto && [length, operation] : cigar_sequence)
20+
{
21+
// Skip long variants.
22+
if (length >= min_var_length)
23+
{
24+
read_pos += length;
25+
ref_pos += length;
26+
continue;
27+
}
28+
29+
// Case distinction for cigar elements.
30+
switch (operation.to_char())
31+
{
32+
case 'I':
33+
case 'S': // insertion or soft clip
34+
{
35+
activity[ref_pos] += 1u;
36+
read_pos += length;
37+
break;
38+
}
39+
case 'D': // deletion
40+
{
41+
for (unsigned idx = 0; idx < length; ++idx)
42+
++activity[ref_pos + idx];
43+
ref_pos += length;
44+
break;
45+
}
46+
default: // ignore match, mismatch, hard clipping, padding, and skipped region
47+
{ // TODO (joergi-w 30.09.2021) mismatches are ignored but should be verified with the reference
48+
ref_pos += length;
49+
read_pos += length;
50+
}
51+
}
52+
}
53+
}
54+
55+
std::vector<std::pair<size_t, size_t>> active_regions(std::vector<unsigned> const & activity)
56+
{
57+
// Skip reference sequences that are not contained in the reads.
58+
if (activity.empty())
59+
return {};
60+
61+
// Some constants for the sliding window approach.
62+
size_t const window_width = 5u; // another idea: (min_var_length + 1) / 2;
63+
size_t const activity_threshold = 2; // TODO (joergi-w 25.10.2021) depend on read coverage
64+
65+
// Set up the result vector.
66+
std::vector<std::pair<size_t, size_t>> regions{};
67+
68+
size_t window_score = 0; // sum of the activities inside the window
69+
size_t pos; // the current position in the genome sequence
70+
bool active = false; // whether `pos` is inside an active region
71+
72+
// Initialisation of a window of length min_var_length.
73+
for (pos = 0; pos < window_width && pos < activity.size(); ++pos)
74+
window_score += activity[pos];
75+
76+
while (pos < activity.size())
77+
{
78+
// Check for start or end of an active region.
79+
if (!active && window_score >= activity_threshold)
80+
{
81+
regions.emplace_back(pos - window_width, 0);
82+
active = true;
83+
}
84+
else if (active && window_score < activity_threshold)
85+
{
86+
std::get<1>(regions.back()) = pos - 1u;
87+
active = false;
88+
}
89+
90+
// Advance the sliding window.
91+
window_score -= activity[pos - window_width];
92+
window_score += activity[pos];
93+
++pos;
94+
}
95+
96+
// Handle the case if the genome ends in an active region.
97+
if (active)
98+
std::get<1>(regions.back()) = pos - 1u;
99+
100+
return regions;
101+
}
102+
103+
void detect_snp_and_indel(std::filesystem::path const & reads_filename, uint64_t min_var_length)
12104
{
13105
// Get the header information and set the necessary fields.
14106
using sam_fields = seqan3::fields<seqan3::field::flag, // 2: FLAG
@@ -18,17 +110,14 @@ void detect_snp_and_indel(std::filesystem::path const & reads_filename)
18110
seqan3::field::cigar>; // 6: CIGAR
19111

20112
seqan3::sam_file_input reads_file{reads_filename, sam_fields{}};
21-
auto const & header = reads_file.header();
113+
auto & header = reads_file.header();
22114

23115
// Check that the file is sorted.
24116
if (header.sorting != "coordinate")
25117
throw seqan3::format_error{"ERROR: Input file must be sorted by coordinate (e.g. samtools sort)"};
26118

27119
// Prepare one activity vector per reference sequence.
28-
std::vector<std::vector<unsigned>> activity{};
29-
activity.resize(header.ref_id_info.size()); // allocate the number of reference sequences
30-
for (size_t idx = 0; idx < activity.size(); ++idx)
31-
activity[idx].resize(std::get<0>(header.ref_id_info[idx])); // allocate the length of the reference sequence
120+
std::vector<std::vector<unsigned>> activity_vec(header.ref_id_info.size()); // number of ref sequences
32121

33122
for (auto && record : reads_file)
34123
{
@@ -47,48 +136,18 @@ void detect_snp_and_indel(std::filesystem::path const & reads_filename)
47136
continue;
48137
}
49138

50-
// Track the current position within the read.
51-
unsigned read_pos = 0;
52-
53-
// Step through the CIGAR string.
54-
for (auto && [length, operation] : record.cigar_sequence())
55-
{
56-
// Skip long variants.
57-
if (length >= 30) // TODO (joergi-w 30.09.2021) Use the min_length parameter here.
58-
{
59-
read_pos += length;
60-
ref_pos += length;
61-
continue;
62-
}
139+
// Check if we have already allocated the length of the reference sequence.
140+
if (activity_vec[ref_id].empty())
141+
activity_vec[ref_id].resize(std::get<0>(header.ref_id_info[ref_id])); // length of reference sequence
63142

64-
// Case distinction for cigar elements.
65-
seqan3::cigar::operation const op = operation;
66-
switch (op.to_char())
67-
{
68-
case 'I':
69-
case 'S': // insertion or soft clip
70-
{
71-
activity[ref_id][ref_pos] += length;
72-
read_pos += length;
73-
break;
74-
}
75-
case 'D': // deletion
76-
{
77-
for (unsigned idx = 0; idx < length; ++idx)
78-
++activity[ref_id][ref_pos + idx];
79-
ref_pos += length;
80-
break;
81-
}
82-
default: // ignore match, mismatch, hard clipping, padding, and skipped region
83-
{ // TODO (joergi-w 30.09.2021) mismatches are ignored but should be verified with the reference
84-
ref_pos += length;
85-
read_pos += length;
86-
}
87-
}
88-
}
143+
update_activity_for_record(activity_vec[ref_id], record.cigar_sequence(), min_var_length, ref_pos);
89144
}
90145

91-
// Print the raw activity profiles.
92-
for (auto const & av : activity)
93-
seqan3::debug_stream << av << std::endl;
146+
// Extract active regions from activity profile.
147+
for (size_t ref_id = 0; ref_id < activity_vec.size(); ++ref_id)
148+
{
149+
auto regions = active_regions(activity_vec[ref_id]);
150+
if (!regions.empty())
151+
seqan3::debug_stream << "Active regions of " << header.ref_ids()[ref_id] << ": " << regions << std::endl;
152+
}
94153
}

test/api/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@ add_api_test (clustering_test.cpp)
99

1010
# add_api_test (refinement_test.cpp)
1111

12+
add_api_test (snp_indel_test.cpp)
1213
add_api_test (structures_test.cpp)

test/api/snp_indel_test.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#include <gtest/gtest.h>
2+
3+
#include <vector>
4+
5+
#include "variant_detection/snp_indel_detection.hpp"
6+
7+
TEST(activity_analysis, input_empty)
8+
{
9+
std::vector<unsigned> activity{};
10+
auto regions = active_regions(activity);
11+
EXPECT_TRUE(regions.empty());
12+
}
13+
14+
TEST(activity_analysis, input_zero)
15+
{
16+
std::vector<unsigned> activity(150, 0u);
17+
auto regions = active_regions(activity);
18+
EXPECT_TRUE(regions.empty());
19+
}
20+
21+
TEST(activity_analysis, input_active_everywhere)
22+
{
23+
std::vector<unsigned> activity(150, 9999u);
24+
auto regions = active_regions(activity);
25+
EXPECT_EQ(regions.size(), 1u);
26+
EXPECT_EQ(regions[0], (std::pair<size_t, size_t>{0, 149}));
27+
}
28+
29+
TEST(activity_analysis, input_interval)
30+
{
31+
std::vector<unsigned> activity(150, 0u);
32+
for (size_t pos = 34u; pos <= 44u; ++pos)
33+
activity[pos] = 6;
34+
for (size_t pos = 134u; pos <= 144u; ++pos)
35+
activity[pos] = 1;
36+
auto regions = active_regions(activity);
37+
EXPECT_EQ(regions.size(), 2u);
38+
EXPECT_EQ(regions[0], (std::pair<size_t, size_t>{30, 49}));
39+
EXPECT_EQ(regions[1], (std::pair<size_t, size_t>{131, 148}));
40+
}

test/cli/iGenVar_cli_test.cpp

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -302,19 +302,8 @@ TEST_F(iGenVar_cli_test, test_genome_input)
302302
"-g", data(default_genome_file_path),
303303
"-i", data("single_end_mini_example.sam"));
304304
std::string const expected_err = "Detect SNPs and indels in short reads...\n"
305-
"[0,0,0,0,0,0,0,0,0,0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,"
306-
"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,9,9,9,9,9,9,9,9,9,9,9,9,1,0,0,0,0,0,0,0,"
307-
"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"
308-
"0,0,0,0,0,0,0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"
309-
"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,33,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,"
310-
"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"
311-
"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,2,2,2,2,2,2,2,2,"
312-
"2,2,2,2,2,2,2,67,3,3,3,67,1,1,1,1,1,1,1,1,1,1,1,1,43,0,0,0,0,0,0,0,0,0,0,0,0,0,"
313-
"0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,4,4,4,4,4,4,4,4,4,4,4,4,4,0,0,0,"
314-
"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,"
315-
"1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"
316-
"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,52,"
317-
"0,0,0,0,0,0,0,0,0,0]\n"
305+
"Active regions of chr1: [(6,15),(53,74),(121,130),(176,185),(184,193),(262,304),"
306+
"(311,319),(332,354),(364,373),(381,398),(467,476)]\n"
318307
"Start clustering...\n"
319308
"Done with clustering. Found 0 junction clusters.\n"
320309
"No refinement was selected.\n";

0 commit comments

Comments
 (0)