Skip to content

Commit f5fae1c

Browse files
authored
Merge pull request #16 from eseiler/feature/file_input
[FEATURE] Accept file containing bin paths
2 parents 221f189 + 8146c2f commit f5fae1c

File tree

5 files changed

+113
-10
lines changed

5 files changed

+113
-10
lines changed

CHANGELOG.md

+10-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,14 @@
1+
# 1.1.0
2+
3+
## Features
4+
* Raptor accepts a text file containing the path to a bin on each line
5+
([\#16](https://github.com/seqan/raptor/pull/16)).
6+
7+
## Bug fixes
8+
* Threshold option not working ([\#14](https://github.com/seqan/raptor/pull/14)).
9+
110
# 1.0.1
211

312
## Bug fixes
413

5-
* Reduced the number of open file handles ([\#10](https://github.com/seqan/raptor/pull/10))
14+
* Reduced the number of open file handles ([\#10](https://github.com/seqan/raptor/pull/10)).

CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ cmake_minimum_required (VERSION 3.8)
33
## CUSTOMISE
44

55
# Define the application name and version.
6-
project (raptor VERSION 1.0.2)
6+
project (raptor VERSION 1.1.0)
77
set (CMAKE_CXX_STANDARD 17)
88
set (CMAKE_CXX_STANDARD_REQUIRED ON)
99

README.md

+17-1
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,13 @@ raptor build --kmer 19 --window 23 --size 8m --output index.raptor $(seq -f "exa
9999
# You can replace `$(seq -f "example_data/64/bins/bin_%02g.fasta" 0 1 63)` by `example_data/64/bins/bin_{00..63}.fasta` if your shell supports this syntax.
100100
# The equivalent command for 1,024 bins is `$(seq -f "example_data/1024/bins/bin_%04g.fasta" 0 1 1023)`
101101
```
102+
103+
You can also prepare a file that contains one file path per line (a line corresponds to a bin) and use this file as input:
104+
```
105+
seq -f "example_data/64/bins/bin_%02g.fasta" 0 1 63 > all_bin_paths.txt
106+
raptor build --kmer 19 --window 23 --size 8m --output another_index.raptor all_bin_paths.txt
107+
```
108+
102109
You may be prompted to enable or disable automatic update notifications. For questions, please consult [the SeqAn documentation](https://github.com/seqan/seqan3/wiki/Update-Notifications).
103110

104111
Afterwards, we can search for all reads from bin 1:
@@ -143,7 +150,16 @@ raptor build --kmer 19 --window 23 --size 8m --compute-minimiser --output precom
143150

144151
Then we run the build step again and use the computed minimisers as input:
145152
```
146-
raptor build --kmer 19 --window 23 --size 8m --output index.raptor $(seq -f "precomputed_minimisers/bin_%02g.minimiser" 0 1 63)
153+
raptor build --kmer 19 --window 23 --size 8m --output minimiser_index.raptor $(seq -f "precomputed_minimisers/bin_%02g.minimiser" 0 1 63)
154+
```
155+
156+
Alternatively, you can also prepare a file that contains one file path per line (a line corresponds to a bin)
157+
and use this file as input for both cases:
158+
```
159+
seq -f "example_data/64/bins/bin_%02g.fasta" 0 1 63 > all_bin_paths.txt
160+
raptor build --kmer 19 --window 23 --size 8m --compute-minimiser --output precomputed_minimisers/ all_bin_paths.txt
161+
seq -f "precomputed_minimisers/bin_%02g.minimiser" 0 1 63 > all_minimiser_paths.txt
162+
raptor build --kmer 19 --window 23 --size 8m --output another_minimiser_index.raptor all_minimiser_paths.txt
147163
```
148164

149165
## Authorship and Copyright

src/raptor.cpp

+52-6
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,22 @@ class bin_validator
117117
{
118118
if (value.extension() == ".minimiser")
119119
minimiser_file_validator(value);
120+
else if (values.size() == 1u)
121+
{
122+
std::ifstream list_of_files{value};
123+
std::string line;
124+
while (std::getline(list_of_files, line))
125+
{
126+
if (!line.empty())
127+
{
128+
std::filesystem::path bin_path{line};
129+
if (bin_path.extension() == ".minimiser")
130+
minimiser_file_validator(bin_path);
131+
else
132+
sequence_file_validator(bin_path);
133+
}
134+
}
135+
}
120136
else
121137
throw exception;
122138
}
@@ -131,9 +147,10 @@ class bin_validator
131147

132148
std::string get_help_page_message() const
133149
{
134-
return "The input file must exist and read permissions must be granted. Valid file extensions are: [minimiser],"
135-
" or [embl, fasta, fa, fna, ffn, faa, frn, fas, fastq, fq, genbank, gb, gbk, sam] possibly followed by: "
136-
"[gz, bgzf, bz2].";
150+
return "The input file must exist and read permissions must be granted. Valid file extensions for bin files are"
151+
" : [minimiser], or [embl, fasta, fa, fna, ffn, faa, frn, fas, fastq, fq, genbank, gb, gbk, sam] "
152+
"possibly followed by: [gz, bgzf, bz2]. All other extensions will be assumed to contain one line per"
153+
" path to a bin.";
137154
}
138155

139156
private:
@@ -148,7 +165,7 @@ inline void init_shared_meta(seqan3::argument_parser & parser)
148165
parser.info.citation = "Seiler, E. et al. (2020). Raptor: A fast and space-efficient pre-filter for"
149166
" querying very large collections of nucleotide sequences. bioRxiv 2020.10.08.330985. doi:"
150167
" https://doi.org/10.1101/2020.10.08.330985";
151-
parser.info.date = "12-10-2020";
168+
parser.info.date = "16-12-2020";
152169
parser.info.email = "[email protected]";
153170
parser.info.long_copyright = R"(BSD 3-Clause License
154171
@@ -182,7 +199,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.)";
182199
parser.info.short_copyright = "BSD 3-Clause License";
183200
parser.info.short_description = "A fast and space-efficient pre-filter for querying very large collections of nucleotide sequences.";
184201
parser.info.url = "https://github.com/seqan/raptor";
185-
parser.info.version = "1.0.2";
202+
parser.info.version = "1.1.0";
186203
}
187204

188205
void init_top_level_parser(seqan3::argument_parser & parser)
@@ -229,7 +246,8 @@ inline void init_build_parser(seqan3::argument_parser & parser, build_arguments
229246
init_shared_meta(parser);
230247
init_shared_options(parser, arguments);
231248
parser.add_positional_option(arguments.bin_path,
232-
"Provide a list of input files. One file per bin. ",
249+
"Provide a list of input files (one file per bin). Alternatively, provide a text file "
250+
"containing the paths to the bins (one line per path to a bin). ",
233251
bin_validator{});
234252
parser.add_option(arguments.out_path,
235253
'\0',
@@ -319,6 +337,34 @@ void run_build(seqan3::argument_parser & parser)
319337
init_build_parser(parser, arguments);
320338
try_parsing(parser);
321339

340+
// ==========================================
341+
// Process bin_path
342+
// ==========================================
343+
if (arguments.bin_path.size() == 1u) // Either only one bin or a file containing bin paths
344+
{
345+
auto & file = arguments.bin_path[0];
346+
347+
if (file.extension() != ".minimiser")
348+
{
349+
try
350+
{
351+
seqan3::input_file_validator<seqan3::sequence_file_input<>> validator;
352+
validator(file);
353+
}
354+
catch (seqan3::validation_error const & exception)
355+
{
356+
decltype(arguments.bin_path) new_values;
357+
std::ifstream list_of_files{file};
358+
std::string line;
359+
while (std::getline(list_of_files, line))
360+
new_values.emplace_back(line);
361+
while (new_values.back().empty())
362+
new_values.pop_back();
363+
arguments.bin_path = std::move(new_values);
364+
}
365+
}
366+
}
367+
322368
// ==========================================
323369
// Various checks.
324370
// ==========================================

test/cli/raptor_test.cpp

+33-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ inline std::string string_from_file(std::filesystem::path const & path, std::ios
1919
///////////////////////////////////////////////// raptor build tests ///////////////////////////////////////////////////
2020
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2121

22-
TEST_P(raptor_build, build)
22+
TEST_P(raptor_build, build_with_list)
2323
{
2424
auto const [number_of_bins, run_parallel] = GetParam();
2525

@@ -39,6 +39,38 @@ TEST_P(raptor_build, build)
3939
EXPECT_TRUE(expected == actual);
4040
}
4141

42+
// Builds an index from a text file that lists one bin path per line and checks
// that the result is byte-identical to the stored reference index.
TEST_P(raptor_build, build_with_file)
{
    auto const [number_of_bins, run_parallel] = GetParam();

    // Write the list file inside its own scope so that the stream is closed
    // before the application is executed.
    {
        std::string const bin_list = expand_bins(number_of_bins);
        std::ofstream list_file{"raptor_cli_test.txt"};

        // Turns each sub-range produced by the split on ' ' into a string_view.
        auto to_string_view = [](auto && token)
        {
            return std::string_view(&*token.begin(), ranges::distance(token));
        };
        auto bin_paths = bin_list | std::views::split(' ') | ranges::view::transform(to_string_view);

        for (auto && bin_path : bin_paths)
            list_file << bin_path << '\n';

        // Extra newline after the last path, i.e. the file ends with an empty line.
        list_file << '\n';
    }

    cli_test_result result = execute_app("raptor", "build",
                                         "--kmer 19",
                                         "--window 23",
                                         "--size 8m",
                                         "--threads ", run_parallel ? "2" : "1",
                                         "--output index.ibf",
                                         "raptor_cli_test.txt");
    ASSERT_EQ(result.exit_code, 0);
    ASSERT_EQ(result.out, std::string{});
    ASSERT_EQ(result.err, std::string{});

    // Compare the freshly built index with the expected one for this number of bins.
    auto const reference_path = data("expected_results/b" + std::to_string(number_of_bins) + "_k19_w23_s8m.ibf");
    std::string expected = string_from_file(reference_path, std::ios::binary);
    std::string actual = string_from_file("index.ibf", std::ios::binary);

    ASSERT_TRUE(expected == actual);
}
73+
4274
INSTANTIATE_TEST_SUITE_P(build_suite,
4375
raptor_build,
4476
testing::Combine(testing::Values(64, 1024), testing::Values(true, false)),

0 commit comments

Comments
 (0)