@@ -75,11 +75,10 @@ inline bool check_for_fasta_format(std::vector<std::string> const & valid_extens
75
75
// Determine cutoff for one experiment
76
76
uint8_t calculate_cutoff (std::filesystem::path sequence_file, int samples)
77
77
{
78
- // Cutoff according to Mantis paper, divided by two because we store expression thresholds and
79
- // -1 because we use "<" and not "<="
80
- uint16_t const default_cutoff{24 };
78
+ // Cutoff according to Mantis paper -1 because we use "<" and not "<="
79
+ uint16_t const default_cutoff{49 };
81
80
uint8_t cutoff{default_cutoff};
82
- std::array<uint16_t , 4 > const cutoffs{0 , 1 , 4 , 9 };
81
+ std::array<uint16_t , 4 > const cutoffs{0 , 2 , 9 , 19 };
83
82
std::array<uint64_t , 4 > const cutoff_bounds{314'572'800 , 524'288'000 , 1'073'741'824 , 3'221'225'472 };
84
83
cutoff = default_cutoff;
85
84
@@ -208,6 +207,7 @@ void read_binary(std::filesystem::path filename, robin_hood::unordered_node_map<
208
207
fin.open (filename, std::ios::binary);
209
208
fin.read ((char *)&buffer, sizeof (buffer));
210
209
fin.read ((char *)&small_buffer, sizeof (small_buffer));
210
+ fin.read ((char *)&small_buffer, sizeof (small_buffer));
211
211
fin.read ((char *)&window, sizeof (window));
212
212
fin.read ((char *)&buffer, sizeof (buffer));
213
213
bool ungapped;
@@ -231,16 +231,19 @@ void read_binary(std::filesystem::path filename, robin_hood::unordered_node_map<
231
231
232
232
void read_binary_start (min_arguments & args,
233
233
std::filesystem::path filename,
234
- uint64_t & num_of_minimisers)
234
+ uint64_t & num_of_minimisers, uint8_t & cutoff )
235
235
{
236
236
std::ifstream fin;
237
237
238
238
uint32_t window;
239
239
uint64_t buffer;
240
+ uint8_t small_buffer;
240
241
fin.open (filename, std::ios::binary);
241
242
fin.read ((char *)&buffer, sizeof (buffer));
242
243
num_of_minimisers = buffer;
243
244
245
+ fin.read ((char *)&small_buffer, sizeof (small_buffer));
246
+ cutoff = small_buffer;
244
247
fin.read ((char *)&args.k , sizeof (args.k ));
245
248
fin.read ((char *)&window, sizeof (window));
246
249
args.w_size = seqan3::window_size{window};
@@ -332,7 +335,7 @@ void check_fpr(uint8_t const number_expression_thresholds, std::vector<double> &
332
335
void get_expression_thresholds (uint8_t const number_expression_thresholds,
333
336
robin_hood::unordered_node_map<uint64_t , uint16_t > const & hash_table,
334
337
std::vector<uint16_t > & expression_thresholds, std::vector<uint64_t > & sizes,
335
- robin_hood::unordered_set<uint64_t > const & genome, bool all = true )
338
+ robin_hood::unordered_set<uint64_t > const & genome, uint8_t cutoff, bool all = true )
336
339
{
337
340
// Calculate expression thresholds by taking median recursively
338
341
std::vector<uint16_t > counts;
@@ -347,6 +350,8 @@ void get_expression_thresholds(uint8_t const number_expression_thresholds,
347
350
auto prev_exp{0 };
348
351
auto exp {0 };
349
352
auto max_elem = *std::max_element (counts.begin (), counts.end ());
353
+ // Zero Level = cutoff + 1
354
+ expression_thresholds.push_back (cutoff + 1 );
350
355
// First Level
351
356
std::nth_element (counts.begin () + prev_pos, counts.begin () + prev_pos + counts.size ()/dev, counts.end ());
352
357
exp = counts[prev_pos + counts.size ()/dev];
@@ -371,6 +376,7 @@ void get_expression_thresholds(uint8_t const number_expression_thresholds,
371
376
372
377
prev_exp = exp ;
373
378
}
379
+ sizes.push_back (prev_pos);
374
380
// In case not all levels have a threshold, give the last levels a maximal threshold, which can not be met by any minimiser.
375
381
while (expression_thresholds.size () < number_expression_thresholds)
376
382
expression_thresholds.push_back (max_elem + 1 );
@@ -390,6 +396,7 @@ void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t co
390
396
fin.open (filename, std::ios::binary);
391
397
fin.read ((char *)&buffer, sizeof (buffer));
392
398
fin.read ((char *)&small_buffer, sizeof (small_buffer));
399
+ fin.read ((char *)&small_buffer, sizeof (small_buffer));
393
400
fin.read ((char *)&window, sizeof (window));
394
401
fin.read ((char *)&buffer, sizeof (buffer));
395
402
bool ungapped;
@@ -423,7 +430,8 @@ void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t co
423
430
template <bool samplewise, bool minimiser_files_given = true >
424
431
void ibf_helper (std::vector<std::filesystem::path> const & minimiser_files,
425
432
std::vector<double > const & fprs,
426
- estimate_ibf_arguments & ibf_args, size_t num_hash = 1 , std::filesystem::path expression_by_genome_file = " " ,
433
+ estimate_ibf_arguments & ibf_args, std::vector<uint8_t > & cutoffs = {},
434
+ size_t num_hash = 1 , std::filesystem::path expression_by_genome_file = " " ,
427
435
minimiser_arguments const & minimiser_args = {})
428
436
{
429
437
@@ -437,8 +445,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
437
445
std::vector<std::vector<uint64_t >> sizes{};
438
446
sizes.assign (num_files, {});
439
447
440
- bool const calculate_cutoffs = minimiser_args.cutoffs .empty ();
441
- std::vector<uint8_t > file_cutoffs{};
448
+ bool const calculate_cutoffs = cutoffs.empty ();
442
449
443
450
robin_hood::unordered_set<uint64_t > include_set_table; // Storage for minimisers in include file
444
451
robin_hood::unordered_set<uint64_t > exclude_set_table; // Storage for minimisers in exclude file
@@ -474,7 +481,9 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
474
481
475
482
if constexpr (minimiser_files_given)
476
483
{
477
- read_binary_start (ibf_args, minimiser_files[i], filesize);
484
+ uint8_t cutoff;
485
+ read_binary_start (ibf_args, minimiser_files[i], filesize, cutoff);
486
+ cutoffs.push_back (cutoff);
478
487
}
479
488
else
480
489
{
@@ -484,22 +493,19 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
484
493
485
494
// Determine cutoffs
486
495
if (calculate_cutoffs)
487
- file_cutoffs .push_back (calculate_cutoff (minimiser_files[file_iterator], minimiser_args.samples [i]));
496
+ cutoffs .push_back (calculate_cutoff (minimiser_files[file_iterator], minimiser_args.samples [i]));
488
497
489
498
bool const is_compressed = minimiser_files[file_iterator].extension () == " .gz" || minimiser_files[file_iterator].extension () == " .bgzf" || minimiser_files[file_iterator].extension () == " .bz2" ;
490
499
bool const is_fasta = is_compressed ? check_for_fasta_format (seqan3::format_fasta::file_extensions,minimiser_files[file_iterator].stem ())
491
500
: check_for_fasta_format (seqan3::format_fasta::file_extensions, minimiser_files[file_iterator].extension ());
492
501
filesize = std::filesystem::file_size (minimiser_files[file_iterator]) * minimiser_args.samples [i] * (is_fasta ? 2 : 1 ) / (is_compressed ? 1 : 3 );
493
- if (calculate_cutoffs)
494
- filesize = filesize/((file_cutoffs[i] + 1 ) * (is_fasta ? 1 : 2 ));
495
- else
496
- filesize = filesize/((minimiser_args.cutoffs [i] + 1 ) * (is_fasta ? 1 : 2 ));
502
+ filesize = filesize/((cutoffs[i] + 1 ) * (is_fasta ? 1 : 2 ));
497
503
}
498
504
// If set_expression_thresholds_samplewise is not set the expressions as determined by the first file are used for
499
505
// all files.
500
506
if constexpr (samplewise)
501
507
{
502
- uint64_t diff{2 };
508
+ uint64_t diff{1 };
503
509
for (std::size_t c = 0 ; c < ibf_args.number_expression_thresholds - 1 ; c++)
504
510
{
505
511
diff = diff * 2 ;
@@ -579,12 +585,8 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
579
585
for (unsigned f = 0 ; f < minimiser_args.samples [i]; f++)
580
586
{
581
587
seqan3::sequence_file_input<my_traits, seqan3::fields<seqan3::field::seq>> fin{minimiser_files[file_iterator+f]};
582
- if (calculate_cutoffs)
583
- fill_hash_table (ibf_args, fin, hash_table, cutoff_table, include_set_table, exclude_set_table,
584
- (minimiser_args.include_file != " " ), file_cutoffs[i]);
585
- else
586
- fill_hash_table (ibf_args, fin, hash_table, cutoff_table, include_set_table, exclude_set_table,
587
- (minimiser_args.include_file != " " ), minimiser_args.cutoffs [i]);
588
+ fill_hash_table (ibf_args, fin, hash_table, cutoff_table, include_set_table, exclude_set_table,
589
+ (minimiser_args.include_file != " " ), cutoffs[i]);
588
590
}
589
591
cutoff_table.clear ();
590
592
}
@@ -598,6 +600,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
598
600
expression_thresholds,
599
601
sizes[i],
600
602
genome,
603
+ cutoffs[i],
601
604
expression_by_genome);
602
605
expressions[i] = expression_thresholds;
603
606
}
@@ -667,14 +670,14 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
667
670
// Create ibfs
668
671
std::vector<uint16_t > ibf (std::vector<std::filesystem::path> const & sequence_files,
669
672
estimate_ibf_arguments & ibf_args, minimiser_arguments & minimiser_args,
670
- std::vector<double > & fpr,
673
+ std::vector<double > & fpr, std::vector< uint8_t > & cutoffs,
671
674
std::filesystem::path const expression_by_genome_file, size_t num_hash)
672
675
{
673
676
// Declarations
674
677
robin_hood::unordered_node_map<uint64_t , uint16_t > hash_table{}; // Storage for minimisers
675
678
seqan3::concatenated_sequences<seqan3::dna4_vector> sequences; // Storage for sequences in experiment files
676
679
677
- check_cutoffs_samples (sequence_files, minimiser_args.paired , minimiser_args.samples , minimiser_args. cutoffs );
680
+ check_cutoffs_samples (sequence_files, minimiser_args.paired , minimiser_args.samples , cutoffs);
678
681
679
682
680
683
check_expression (ibf_args.expression_thresholds , ibf_args.number_expression_thresholds , expression_by_genome_file);
@@ -696,9 +699,9 @@ std::vector<uint16_t> ibf(std::vector<std::filesystem::path> const & sequence_fi
696
699
}
697
700
698
701
if (ibf_args.samplewise )
699
- ibf_helper<true , false >(sequence_files, fpr, ibf_args, num_hash, expression_by_genome_file, minimiser_args);
702
+ ibf_helper<true , false >(sequence_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file, minimiser_args);
700
703
else
701
- ibf_helper<false , false >(sequence_files, fpr, ibf_args, num_hash, expression_by_genome_file, minimiser_args);
704
+ ibf_helper<false , false >(sequence_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file, minimiser_args);
702
705
703
706
store_args (ibf_args, std::string{ibf_args.path_out } + " IBF_Data" );
704
707
@@ -716,10 +719,11 @@ std::vector<uint16_t> ibf(std::vector<std::filesystem::path> const & minimiser_f
716
719
717
720
ibf_args.samplewise = (ibf_args.expression_thresholds .size () == 0 );
718
721
722
+ std::vector<uint8_t > cutoffs{};
719
723
if (ibf_args.samplewise )
720
- ibf_helper<true >(minimiser_files, fpr, ibf_args, num_hash, expression_by_genome_file);
724
+ ibf_helper<true >(minimiser_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file);
721
725
else
722
- ibf_helper<false >(minimiser_files, fpr, ibf_args, num_hash, expression_by_genome_file);
726
+ ibf_helper<false >(minimiser_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file);
723
727
724
728
store_args (ibf_args, std::string{ibf_args.path_out } + " IBF_Data" );
725
729
@@ -732,7 +736,8 @@ void calculate_minimiser(std::vector<std::filesystem::path> const & sequence_fil
732
736
robin_hood::unordered_set<uint64_t > const & exclude_set_table,
733
737
min_arguments const & args,
734
738
minimiser_arguments const & minimiser_args,
735
- unsigned const i)
739
+ unsigned const i,
740
+ std::vector<uint8_t > & cutoffs)
736
741
{
737
742
robin_hood::unordered_node_map<uint64_t , uint16_t > hash_table{}; // Storage for minimisers
738
743
uint16_t count{0 };
@@ -744,12 +749,12 @@ void calculate_minimiser(std::vector<std::filesystem::path> const & sequence_fil
744
749
std::ofstream outfile;
745
750
unsigned file_iterator = std::accumulate (minimiser_args.samples .begin (), minimiser_args.samples .begin () + i, 0 );
746
751
747
- bool const calculate_cutoffs = minimiser_args. cutoffs .empty ();
752
+ bool const calculate_cutoffs = cutoffs.empty ();
748
753
749
754
if (calculate_cutoffs)
750
755
cutoff = calculate_cutoff (sequence_files[file_iterator], minimiser_args.samples [i]);
751
756
else
752
- cutoff = minimiser_args. cutoffs [i];
757
+ cutoff = cutoffs[i];
753
758
754
759
// Fill hash_table with minimisers.
755
760
for (unsigned f = 0 ; f < minimiser_args.samples [i]; f++)
@@ -764,6 +769,7 @@ void calculate_minimiser(std::vector<std::filesystem::path> const & sequence_fil
764
769
+ " .minimiser" , std::ios::binary);
765
770
auto hash_size = hash_table.size ();
766
771
outfile.write (reinterpret_cast <const char *>(&hash_size), sizeof (hash_size));
772
+ outfile.write (reinterpret_cast <const char *>(&cutoff), sizeof (cutoff));
767
773
outfile.write (reinterpret_cast <const char *>(&args.k ), sizeof (args.k ));
768
774
outfile.write (reinterpret_cast <const char *>(&args.w_size .get ()), sizeof (args.w_size .get ()));
769
775
outfile.write (reinterpret_cast <const char *>(&args.s .get ()), sizeof (args.s .get ()));
@@ -784,13 +790,14 @@ void calculate_minimiser(std::vector<std::filesystem::path> const & sequence_fil
784
790
outfile.close ();
785
791
}
786
792
787
- void minimiser (std::vector<std::filesystem::path> const & sequence_files, min_arguments const & args, minimiser_arguments & minimiser_args)
793
+ void minimiser (std::vector<std::filesystem::path> const & sequence_files, min_arguments const & args,
794
+ minimiser_arguments & minimiser_args, std::vector<uint8_t > & cutoffs)
788
795
{
789
796
// Declarations
790
797
robin_hood::unordered_set<uint64_t > include_set_table{}; // Storage for minimisers in include file
791
798
robin_hood::unordered_set<uint64_t > exclude_set_table{}; // Storage for minimisers in exclude file
792
799
793
- check_cutoffs_samples (sequence_files, minimiser_args.paired , minimiser_args.samples , minimiser_args. cutoffs );
800
+ check_cutoffs_samples (sequence_files, minimiser_args.paired , minimiser_args.samples , cutoffs);
794
801
795
802
if (minimiser_args.include_file != " " )
796
803
get_include_set_table (args, minimiser_args.include_file , include_set_table);
@@ -805,6 +812,6 @@ void minimiser(std::vector<std::filesystem::path> const & sequence_files, min_ar
805
812
#pragma omp parallel for schedule(dynamic, chunk_size)
806
813
for (unsigned i = 0 ; i < minimiser_args.samples .size (); i++)
807
814
{
808
- calculate_minimiser (sequence_files, include_set_table, exclude_set_table, args, minimiser_args, i);
815
+ calculate_minimiser (sequence_files, include_set_table, exclude_set_table, args, minimiser_args, i, cutoffs );
809
816
}
810
817
}
0 commit comments