Skip to content

Commit 91d11e5

Browse files
smehringerseqan-actionseseiler
authored
[FEATURE] Allow writing and reading a sketch file (#254)
* [FEATURE] Read and write a sketch file in chopper layout. * [MISC] automatic linting * Review * [MISC] automatic linting --------- Co-authored-by: seqan-actions[bot] <[email protected]> Co-authored-by: Enrico Seiler <[email protected]>
1 parent 7cc49aa commit 91d11e5

13 files changed

+361
-230
lines changed

Diff for: include/chopper/sketch/read_hll_files_into.hpp

-23
This file was deleted.

Diff for: include/chopper/sketch/sketch_file.hpp

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// ---------------------------------------------------------------------------------------------------
2+
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
3+
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
4+
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5+
// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md
6+
// ---------------------------------------------------------------------------------------------------
7+
8+
#pragma once
9+
10+
#include <vector>
11+
12+
#include <cereal/types/vector.hpp>
13+
14+
#include <chopper/configuration.hpp>
15+
16+
#include <hibf/sketch/hyperloglog.hpp>
17+
#include <hibf/sketch/minhashes.hpp>
18+
19+
namespace chopper::sketch
20+
{
21+
22+
struct sketch_file
23+
{
24+
chopper::configuration chopper_config{};
25+
std::vector<std::vector<std::string>> filenames{};
26+
std::vector<seqan::hibf::sketch::hyperloglog> hll_sketches{};
27+
std::vector<seqan::hibf::sketch::minhashes> minHash_sketches{};
28+
29+
private:
30+
friend class cereal::access;
31+
32+
template <typename archive_t>
33+
void serialize(archive_t & archive)
34+
{
35+
uint32_t version{1};
36+
archive(CEREAL_NVP(version));
37+
38+
archive(CEREAL_NVP(chopper_config));
39+
archive(CEREAL_NVP(filenames));
40+
archive(CEREAL_NVP(hll_sketches));
41+
archive(CEREAL_NVP(minHash_sketches));
42+
}
43+
};
44+
45+
} // namespace chopper::sketch

Diff for: src/chopper_layout.cpp

+63-15
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,31 @@
2121
#include <chopper/sketch/check_filenames.hpp>
2222
#include <chopper/sketch/output.hpp>
2323
#include <chopper/sketch/read_data_file.hpp>
24+
#include <chopper/sketch/sketch_file.hpp>
2425

2526
#include <hibf/sketch/compute_sketches.hpp>
2627

2728
namespace chopper
2829
{
2930

31+
void validate_configuration(sharg::parser & parser,
32+
chopper::configuration & config,
33+
chopper::configuration const & sketch_config)
34+
{
35+
if (parser.is_option_set("sketch-bits"))
36+
throw sharg::parser_error{"You cannot set --sketch-bits when using a sketch file as input."};
37+
38+
if (parser.is_option_set("kmer") && config.k != sketch_config.k)
39+
{
40+
std::cerr << sharg::detail::to_string(
41+
"[WARNING] Given k-mer size (",
42+
config.k,
43+
") differs from k-mer size in the sketch file (",
44+
config.k,
45+
"). The results may be suboptimal. If this was a conscious decision, you can ignore this warning.\n");
46+
}
47+
}
48+
3049
int chopper_layout(chopper::configuration & config, sharg::parser & parser)
3150
{
3251
parser.parse();
@@ -36,41 +55,70 @@ int chopper_layout(chopper::configuration & config, sharg::parser & parser)
3655
else if (config.k > config.window_size)
3756
throw sharg::parser_error{"The k-mer size cannot be bigger than the window size."};
3857

58+
auto has_sketch_file_extension = [](std::filesystem::path const & path)
59+
{
60+
return path.string().ends_with(".sketch") || path.string().ends_with(".sketches");
61+
};
62+
3963
config.disable_sketch_output = !parser.is_option_set("output-sketches-to");
64+
if (!config.disable_sketch_output && !has_sketch_file_extension(config.sketch_directory))
65+
throw sharg::parser_error{"The sketch output file must have the extension \".sketch\" or \".sketches\"."};
66+
67+
bool const input_is_a_sketch_file = has_sketch_file_extension(config.data_file);
4068

4169
int exit_code{};
4270

4371
std::vector<std::vector<std::string>> filenames{};
72+
std::vector<seqan::hibf::sketch::hyperloglog> sketches{};
4473

45-
chopper::sketch::read_data_file(config, filenames);
74+
if (input_is_a_sketch_file)
75+
{
76+
chopper::sketch::sketch_file sin{};
77+
78+
{ // Deserialization is guaranteed to be complete when going out of scope.
79+
std::ifstream is{config.data_file};
80+
cereal::BinaryInputArchive iarchive{is};
81+
iarchive(sin);
82+
}
4683

47-
std::vector<seqan::hibf::sketch::hyperloglog> sketches;
84+
filenames = std::move(sin.filenames); // No need to call check_filenames because the files are not read.
85+
sketches = std::move(sin.hll_sketches);
86+
validate_configuration(parser, config, sin.chopper_config);
87+
}
88+
else
89+
{
90+
chopper::sketch::read_data_file(config, filenames);
4891

49-
if (filenames.empty())
50-
throw sharg::parser_error{
51-
sharg::detail::to_string("The file ", config.data_file.string(), " appears to be empty.")};
92+
if (filenames.empty())
93+
throw sharg::parser_error{
94+
sharg::detail::to_string("The file ", config.data_file.string(), " appears to be empty.")};
5295

53-
chopper::sketch::check_filenames(filenames, config);
96+
// Files need to exist because they will be read for sketching.
97+
chopper::sketch::check_filenames(filenames, config);
98+
}
5499

55100
config.hibf_config.input_fn =
56101
chopper::input_functor{filenames, config.precomputed_files, config.k, config.window_size};
57102
config.hibf_config.number_of_user_bins = filenames.size();
58103
config.hibf_config.validate_and_set_defaults();
59104

60-
config.compute_sketches_timer.start();
61-
seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches);
62-
config.compute_sketches_timer.stop();
105+
if (!input_is_a_sketch_file)
106+
{
107+
config.compute_sketches_timer.start();
108+
seqan::hibf::sketch::compute_sketches(config.hibf_config, sketches);
109+
config.compute_sketches_timer.stop();
110+
}
63111

64112
exit_code |= chopper::layout::execute(config, filenames, sketches);
65113

66114
if (!config.disable_sketch_output)
67115
{
68-
if (!std::filesystem::exists(config.sketch_directory))
69-
std::filesystem::create_directory(config.sketch_directory);
70-
71-
assert(filenames.size() == sketches.size());
72-
for (size_t i = 0; i < filenames.size(); ++i)
73-
chopper::sketch::write_sketch_file(filenames[i][0], sketches[i], config);
116+
chopper::sketch::sketch_file sout{.chopper_config = config,
117+
.filenames = std::move(filenames),
118+
.hll_sketches = std::move(sketches)};
119+
std::ofstream os{config.sketch_directory, std::ios::binary};
120+
cereal::BinaryOutputArchive oarchive{os};
121+
oarchive(sout);
74122
}
75123

76124
if (!config.output_timings.empty())

Diff for: src/set_up_parser.cpp

+6-5
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,10 @@ void set_up_parser(sharg::parser & parser, configuration & config)
4242
.short_id = '\0',
4343
.long_id = "input",
4444
.description =
45-
"The input must be a file containing paths to sequence data you wish to estimate; one filepath "
46-
"per line. If your file contains auxiliary information (e.g. species IDs), your file must be tab-"
47-
"separated.",
45+
"The input can either be (1) a sketch (binary) file (extensions: \".sketch\" or \".sketches\") that "
46+
"was produced by chopper or (2) a file containing paths to sequence data you wish to estimate; one "
47+
"filepath per line. If your file contains auxiliary information (e.g. species IDs), your file must be "
48+
"tab-separated.",
4849
.required = true});
4950
parser.add_list_item("", "Example file:");
5051
parser.add_list_item("", "```");
@@ -248,8 +249,8 @@ void set_up_parser(sharg::parser & parser, configuration & config)
248249
sharg::config{
249250
.long_id = "output-sketches-to",
250251
.description =
251-
"If you supply a directory path with this option, the hyperloglog sketches of your input will be "
252-
"stored in the respective path; one .hll file per input file.",
252+
"If supplied, a sketch file will be written to the specified directory. The sketch file can be used "
253+
"as input for the next run of chopper. The file extension must be \".sketch\" or \".sketches\".",
253254
.default_message = "None",
254255
.advanced = true});
255256

Diff for: src/sketch/CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
cmake_minimum_required (VERSION 3.18)
22

33
if (NOT TARGET chopper_sketch)
4-
add_library (chopper_sketch STATIC check_filenames.cpp output.cpp read_data_file.cpp read_hll_files_into.cpp)
4+
add_library (chopper_sketch STATIC check_filenames.cpp output.cpp read_data_file.cpp)
55

66
target_link_libraries (chopper_sketch PUBLIC chopper_shared)
77
endif ()

Diff for: src/sketch/read_hll_files_into.cpp

-52
This file was deleted.

Diff for: test/api/layout/execute_with_estimation_test.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
#include <chopper/configuration.hpp>
2323
#include <chopper/layout/execute.hpp>
24-
#include <chopper/sketch/read_hll_files_into.hpp>
2524

2625
#include <hibf/sketch/compute_sketches.hpp>
2726
#include <hibf/sketch/hyperloglog.hpp>

Diff for: test/api/sketch/CMakeLists.txt

-6
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,6 @@
77

88
cmake_minimum_required (VERSION 3.18)
99

10-
add_api_test (read_hll_files_into_test.cpp)
11-
target_use_datasources (read_hll_files_into_test FILES small.fa)
12-
target_use_datasources (read_hll_files_into_test FILES small2.fa)
13-
target_use_datasources (read_hll_files_into_test FILES small.hll)
14-
target_use_datasources (read_hll_files_into_test FILES small2.hll)
15-
1610
add_api_test (check_filenames_test.cpp)
1711
target_use_datasources (check_filenames_test FILES seq1.fa)
1812
target_use_datasources (check_filenames_test FILES seq2.fa)

0 commit comments

Comments
 (0)