Skip to content

Commit 2048bf0

Browse files
authored
Merge pull request #120 from MitraDarja/level_0
[MISC] cutoffs as variable.
2 parents d8adf45 + 2428d14 commit 2048bf0

11 files changed

+157
-111
lines changed

Diff for: include/ibf.h

+6-5
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ struct minimiser_arguments
2424
std::filesystem::path exclude_file; // Needs to be defined when minimisers appearing in this file should NOT be stored
2525
std::vector<int> samples{}; // Can be used to indicate that sequence files belong to the same experiment
2626
bool paired = false; // If true, then experiments are seen as paired-end experiments
27-
std::vector<uint8_t> cutoffs{};
2827
bool experiment_names = false; // Flag, if names of experiment should be stored in a txt file
2928
};
3029

@@ -62,23 +61,24 @@ void read_binary(std::filesystem::path filename, robin_hood::unordered_node_map<
6261
* \param args Min arguments.
6362
* \param filename The filename of the binary file.
6463
* \param num_of_minimisers Variable, where the number of minimisers should be stored.
65-
64+
* \param cutoff Variable, where the cutoff value read from the binary file should be stored.
6665
*/
67-
void read_binary_start(min_arguments & args, std::filesystem::path filename, uint64_t & num_of_minimisers);
66+
void read_binary_start(min_arguments & args, std::filesystem::path filename, uint64_t & num_of_minimisers, uint8_t & cutoff);
6867

6968
/*! \brief Creates IBFs.
7069
* \param sequence_files A vector of sequence file paths.
7170
* \param ibf_args The IBF specific arguments to use (bin size, number of hash functions, ...). See
7271
* struct ibf_arguments.
7372
* \param minimiser_args The minimiser specific arguments to use.
7473
* \param fpr The average false positive rate that should be used.
74+
* \param cutoffs List of cutoffs.
7575
* \param expression_by_genome_file File that contains the only minimisers that should be considered for the
7676
* determination of the expression thresholds.
7777
* \param num_hash The number of hash functions to use.
7878
* \returns The expression thresholds per experiment.
7979
*/
8080
std::vector<uint16_t> ibf(std::vector<std::filesystem::path> const & sequence_files, estimate_ibf_arguments & ibf_args,
81-
minimiser_arguments & minimiser_args, std::vector<double> & fpr,
81+
minimiser_arguments & minimiser_args, std::vector<double> & fpr, std::vector<uint8_t> & cutoffs,
8282
std::filesystem::path const expression_by_genome_file = "",
8383
size_t num_hash = 1);
8484

@@ -101,6 +101,7 @@ std::vector<uint16_t> ibf(std::vector<std::filesystem::path> const & minimiser_f
101101
* \param sequence_files A vector of sequence file paths.
102102
* \param args The minimiser arguments to use (seed, shape, window size).
103103
* \param minimiser_args The minimiser specific arguments to use.
104+
* \param cutoffs List of cutoffs.
104105
*/
105106
void minimiser(std::vector<std::filesystem::path> const & sequence_files, min_arguments const & args,
106-
minimiser_arguments & minimiser_args);
107+
minimiser_arguments & minimiser_args, std::vector<uint8_t> & cutoffs);

Diff for: src/estimate.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint
8181

8282
// Perform normalization by dividing by the threshold of the first level. Only works if multiple expressions were used.
8383
if constexpr (normalization & multiple_expressions)
84-
estimations_i[j] = estimations_i[j]/expressions[0][j];
84+
estimations_i[j] = estimations_i[j]/expressions[1][j];
8585
}
8686
else
8787
{

Diff for: src/ibf.cpp

+41-34
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,10 @@ inline bool check_for_fasta_format(std::vector<std::string> const & valid_extens
7575
// Determine cutoff for one experiment
7676
uint8_t calculate_cutoff(std::filesystem::path sequence_file, int samples)
7777
{
78-
// Cutoff according to Mantis paper, divided by two because we store expression thresholds and
79-
// -1 because we use "<" and not "<="
80-
uint16_t const default_cutoff{24};
78+
// Cutoff according to Mantis paper, -1 because we use "<" and not "<="
79+
uint16_t const default_cutoff{49};
8180
uint8_t cutoff{default_cutoff};
82-
std::array<uint16_t, 4> const cutoffs{0, 1, 4, 9};
81+
std::array<uint16_t, 4> const cutoffs{0, 2, 9, 19};
8382
std::array<uint64_t, 4> const cutoff_bounds{314'572'800, 524'288'000, 1'073'741'824, 3'221'225'472};
8483
cutoff = default_cutoff;
8584

@@ -208,6 +207,7 @@ void read_binary(std::filesystem::path filename, robin_hood::unordered_node_map<
208207
fin.open(filename, std::ios::binary);
209208
fin.read((char*)&buffer, sizeof(buffer));
210209
fin.read((char*)&small_buffer, sizeof(small_buffer));
210+
fin.read((char*)&small_buffer, sizeof(small_buffer));
211211
fin.read((char*)&window, sizeof(window));
212212
fin.read((char*)&buffer, sizeof(buffer));
213213
bool ungapped;
@@ -231,16 +231,19 @@ void read_binary(std::filesystem::path filename, robin_hood::unordered_node_map<
231231

232232
void read_binary_start(min_arguments & args,
233233
std::filesystem::path filename,
234-
uint64_t & num_of_minimisers)
234+
uint64_t & num_of_minimisers, uint8_t & cutoff)
235235
{
236236
std::ifstream fin;
237237

238238
uint32_t window;
239239
uint64_t buffer;
240+
uint8_t small_buffer;
240241
fin.open(filename, std::ios::binary);
241242
fin.read((char*)&buffer, sizeof(buffer));
242243
num_of_minimisers = buffer;
243244

245+
fin.read((char*)&small_buffer, sizeof(small_buffer));
246+
cutoff = small_buffer;
244247
fin.read((char*)&args.k, sizeof(args.k));
245248
fin.read((char*)&window, sizeof(window));
246249
args.w_size = seqan3::window_size{window};
@@ -332,7 +335,7 @@ void check_fpr(uint8_t const number_expression_thresholds, std::vector<double> &
332335
void get_expression_thresholds(uint8_t const number_expression_thresholds,
333336
robin_hood::unordered_node_map<uint64_t, uint16_t> const & hash_table,
334337
std::vector<uint16_t> & expression_thresholds, std::vector<uint64_t> & sizes,
335-
robin_hood::unordered_set<uint64_t> const & genome, bool all = true)
338+
robin_hood::unordered_set<uint64_t> const & genome, uint8_t cutoff, bool all = true)
336339
{
337340
// Calculate expression thresholds by taking median recursively
338341
std::vector<uint16_t> counts;
@@ -347,6 +350,8 @@ void get_expression_thresholds(uint8_t const number_expression_thresholds,
347350
auto prev_exp{0};
348351
auto exp{0};
349352
auto max_elem = *std::max_element(counts.begin(), counts.end());
353+
// Zero Level = cutoff + 1
354+
expression_thresholds.push_back(cutoff + 1);
350355
// First Level
351356
std::nth_element(counts.begin() + prev_pos, counts.begin() + prev_pos + counts.size()/dev, counts.end());
352357
exp = counts[prev_pos + counts.size()/dev];
@@ -371,6 +376,7 @@ void get_expression_thresholds(uint8_t const number_expression_thresholds,
371376

372377
prev_exp = exp;
373378
}
379+
sizes.push_back(prev_pos);
374380
// In case not all levels have a threshold, give the last levels a maximal threshold, which cannot be met by any minimiser.
375381
while(expression_thresholds.size() < number_expression_thresholds)
376382
expression_thresholds.push_back(max_elem + 1);
@@ -390,6 +396,7 @@ void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t co
390396
fin.open(filename, std::ios::binary);
391397
fin.read((char*)&buffer, sizeof(buffer));
392398
fin.read((char*)&small_buffer, sizeof(small_buffer));
399+
fin.read((char*)&small_buffer, sizeof(small_buffer));
393400
fin.read((char*)&window, sizeof(window));
394401
fin.read((char*)&buffer, sizeof(buffer));
395402
bool ungapped;
@@ -423,7 +430,8 @@ void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t co
423430
template<bool samplewise, bool minimiser_files_given = true>
424431
void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
425432
std::vector<double> const & fprs,
426-
estimate_ibf_arguments & ibf_args, size_t num_hash = 1, std::filesystem::path expression_by_genome_file = "",
433+
estimate_ibf_arguments & ibf_args, std::vector<uint8_t> & cutoffs = {},
434+
size_t num_hash = 1, std::filesystem::path expression_by_genome_file = "",
427435
minimiser_arguments const & minimiser_args = {})
428436
{
429437

@@ -437,8 +445,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
437445
std::vector<std::vector<uint64_t>> sizes{};
438446
sizes.assign(num_files, {});
439447

440-
bool const calculate_cutoffs = minimiser_args.cutoffs.empty();
441-
std::vector<uint8_t> file_cutoffs{};
448+
bool const calculate_cutoffs = cutoffs.empty();
442449

443450
robin_hood::unordered_set<uint64_t> include_set_table; // Storage for minimisers in include file
444451
robin_hood::unordered_set<uint64_t> exclude_set_table; // Storage for minimisers in exclude file
@@ -474,7 +481,9 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
474481

475482
if constexpr(minimiser_files_given)
476483
{
477-
read_binary_start(ibf_args, minimiser_files[i], filesize);
484+
uint8_t cutoff;
485+
read_binary_start(ibf_args, minimiser_files[i], filesize, cutoff);
486+
cutoffs.push_back(cutoff);
478487
}
479488
else
480489
{
@@ -484,22 +493,19 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
484493

485494
// Determine cutoffs
486495
if (calculate_cutoffs)
487-
file_cutoffs.push_back(calculate_cutoff(minimiser_files[file_iterator], minimiser_args.samples[i]));
496+
cutoffs.push_back(calculate_cutoff(minimiser_files[file_iterator], minimiser_args.samples[i]));
488497

489498
bool const is_compressed = minimiser_files[file_iterator].extension() == ".gz" || minimiser_files[file_iterator].extension() == ".bgzf" || minimiser_files[file_iterator].extension() == ".bz2";
490499
bool const is_fasta = is_compressed ? check_for_fasta_format(seqan3::format_fasta::file_extensions,minimiser_files[file_iterator].stem())
491500
: check_for_fasta_format(seqan3::format_fasta::file_extensions, minimiser_files[file_iterator].extension());
492501
filesize = std::filesystem::file_size(minimiser_files[file_iterator]) * minimiser_args.samples[i] * (is_fasta ? 2 : 1) / (is_compressed ? 1 : 3);
493-
if (calculate_cutoffs)
494-
filesize = filesize/((file_cutoffs[i] + 1) * (is_fasta ? 1 : 2));
495-
else
496-
filesize = filesize/((minimiser_args.cutoffs[i] + 1) * (is_fasta ? 1 : 2));
502+
filesize = filesize/((cutoffs[i] + 1) * (is_fasta ? 1 : 2));
497503
}
498504
// If set_expression_thresholds_samplewise is not set the expressions as determined by the first file are used for
499505
// all files.
500506
if constexpr (samplewise)
501507
{
502-
uint64_t diff{2};
508+
uint64_t diff{1};
503509
for (std::size_t c = 0; c < ibf_args.number_expression_thresholds - 1; c++)
504510
{
505511
diff = diff * 2;
@@ -579,12 +585,8 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
579585
for (unsigned f = 0; f < minimiser_args.samples[i]; f++)
580586
{
581587
seqan3::sequence_file_input<my_traits, seqan3::fields<seqan3::field::seq>> fin{minimiser_files[file_iterator+f]};
582-
if (calculate_cutoffs)
583-
fill_hash_table(ibf_args, fin, hash_table, cutoff_table, include_set_table, exclude_set_table,
584-
(minimiser_args.include_file != ""), file_cutoffs[i]);
585-
else
586-
fill_hash_table(ibf_args, fin, hash_table, cutoff_table, include_set_table, exclude_set_table,
587-
(minimiser_args.include_file != ""), minimiser_args.cutoffs[i]);
588+
fill_hash_table(ibf_args, fin, hash_table, cutoff_table, include_set_table, exclude_set_table,
589+
(minimiser_args.include_file != ""), cutoffs[i]);
588590
}
589591
cutoff_table.clear();
590592
}
@@ -598,6 +600,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
598600
expression_thresholds,
599601
sizes[i],
600602
genome,
603+
cutoffs[i],
601604
expression_by_genome);
602605
expressions[i] = expression_thresholds;
603606
}
@@ -667,14 +670,14 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
667670
// Create ibfs
668671
std::vector<uint16_t> ibf(std::vector<std::filesystem::path> const & sequence_files,
669672
estimate_ibf_arguments & ibf_args, minimiser_arguments & minimiser_args,
670-
std::vector<double> & fpr,
673+
std::vector<double> & fpr, std::vector<uint8_t> & cutoffs,
671674
std::filesystem::path const expression_by_genome_file, size_t num_hash)
672675
{
673676
// Declarations
674677
robin_hood::unordered_node_map<uint64_t, uint16_t> hash_table{}; // Storage for minimisers
675678
seqan3::concatenated_sequences<seqan3::dna4_vector> sequences; // Storage for sequences in experiment files
676679

677-
check_cutoffs_samples(sequence_files, minimiser_args.paired, minimiser_args.samples, minimiser_args.cutoffs);
680+
check_cutoffs_samples(sequence_files, minimiser_args.paired, minimiser_args.samples, cutoffs);
678681

679682

680683
check_expression(ibf_args.expression_thresholds, ibf_args.number_expression_thresholds, expression_by_genome_file);
@@ -696,9 +699,9 @@ std::vector<uint16_t> ibf(std::vector<std::filesystem::path> const & sequence_fi
696699
}
697700

698701
if (ibf_args.samplewise)
699-
ibf_helper<true, false>(sequence_files, fpr, ibf_args, num_hash, expression_by_genome_file, minimiser_args);
702+
ibf_helper<true, false>(sequence_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file, minimiser_args);
700703
else
701-
ibf_helper<false, false>(sequence_files, fpr, ibf_args, num_hash, expression_by_genome_file, minimiser_args);
704+
ibf_helper<false, false>(sequence_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file, minimiser_args);
702705

703706
store_args(ibf_args, std::string{ibf_args.path_out} + "IBF_Data");
704707

@@ -716,10 +719,11 @@ std::vector<uint16_t> ibf(std::vector<std::filesystem::path> const & minimiser_f
716719

717720
ibf_args.samplewise = (ibf_args.expression_thresholds.size() == 0);
718721

722+
std::vector<uint8_t> cutoffs{};
719723
if (ibf_args.samplewise)
720-
ibf_helper<true>(minimiser_files, fpr, ibf_args, num_hash, expression_by_genome_file);
724+
ibf_helper<true>(minimiser_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file);
721725
else
722-
ibf_helper<false>(minimiser_files, fpr, ibf_args, num_hash, expression_by_genome_file);
726+
ibf_helper<false>(minimiser_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file);
723727

724728
store_args(ibf_args, std::string{ibf_args.path_out} + "IBF_Data");
725729

@@ -732,7 +736,8 @@ void calculate_minimiser(std::vector<std::filesystem::path> const & sequence_fil
732736
robin_hood::unordered_set<uint64_t> const & exclude_set_table,
733737
min_arguments const & args,
734738
minimiser_arguments const & minimiser_args,
735-
unsigned const i)
739+
unsigned const i,
740+
std::vector<uint8_t> & cutoffs)
736741
{
737742
robin_hood::unordered_node_map<uint64_t, uint16_t> hash_table{}; // Storage for minimisers
738743
uint16_t count{0};
@@ -744,12 +749,12 @@ void calculate_minimiser(std::vector<std::filesystem::path> const & sequence_fil
744749
std::ofstream outfile;
745750
unsigned file_iterator = std::accumulate(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i, 0);
746751

747-
bool const calculate_cutoffs = minimiser_args.cutoffs.empty();
752+
bool const calculate_cutoffs = cutoffs.empty();
748753

749754
if (calculate_cutoffs)
750755
cutoff = calculate_cutoff(sequence_files[file_iterator], minimiser_args.samples[i]);
751756
else
752-
cutoff = minimiser_args.cutoffs[i];
757+
cutoff = cutoffs[i];
753758

754759
// Fill hash_table with minimisers.
755760
for (unsigned f = 0; f < minimiser_args.samples[i]; f++)
@@ -764,6 +769,7 @@ void calculate_minimiser(std::vector<std::filesystem::path> const & sequence_fil
764769
+ ".minimiser", std::ios::binary);
765770
auto hash_size = hash_table.size();
766771
outfile.write(reinterpret_cast<const char*>(&hash_size), sizeof(hash_size));
772+
outfile.write(reinterpret_cast<const char*>(&cutoff), sizeof(cutoff));
767773
outfile.write(reinterpret_cast<const char*>(&args.k), sizeof(args.k));
768774
outfile.write(reinterpret_cast<const char*>(&args.w_size.get()), sizeof(args.w_size.get()));
769775
outfile.write(reinterpret_cast<const char*>(&args.s.get()), sizeof(args.s.get()));
@@ -784,13 +790,14 @@ void calculate_minimiser(std::vector<std::filesystem::path> const & sequence_fil
784790
outfile.close();
785791
}
786792

787-
void minimiser(std::vector<std::filesystem::path> const & sequence_files, min_arguments const & args, minimiser_arguments & minimiser_args)
793+
void minimiser(std::vector<std::filesystem::path> const & sequence_files, min_arguments const & args,
794+
minimiser_arguments & minimiser_args, std::vector<uint8_t> & cutoffs)
788795
{
789796
// Declarations
790797
robin_hood::unordered_set<uint64_t> include_set_table{}; // Storage for minimisers in include file
791798
robin_hood::unordered_set<uint64_t> exclude_set_table{}; // Storage for minimisers in exclude file
792799

793-
check_cutoffs_samples(sequence_files, minimiser_args.paired, minimiser_args.samples, minimiser_args.cutoffs);
800+
check_cutoffs_samples(sequence_files, minimiser_args.paired, minimiser_args.samples, cutoffs);
794801

795802
if (minimiser_args.include_file != "")
796803
get_include_set_table(args, minimiser_args.include_file, include_set_table);
@@ -805,6 +812,6 @@ void minimiser(std::vector<std::filesystem::path> const & sequence_files, min_ar
805812
#pragma omp parallel for schedule(dynamic, chunk_size)
806813
for(unsigned i = 0; i < minimiser_args.samples.size(); i++)
807814
{
808-
calculate_minimiser(sequence_files, include_set_table, exclude_set_table, args, minimiser_args, i);
815+
calculate_minimiser(sequence_files, include_set_table, exclude_set_table, args, minimiser_args, i, cutoffs);
809816
}
810817
}

0 commit comments

Comments
 (0)