@@ -372,9 +372,20 @@ RLCSA::locate(usint index, bool steps) const
372
372
return this ->directLocate (index + this ->number_of_sequences , steps);
373
373
}
374
374
375
+ usint
376
+ RLCSA::inverseLocate (usint location) const
377
+ {
378
+ if (!(this ->support_locate ) || location >= this ->data_size ) { return this ->data_size ; }
379
+
380
+ // Inverse-locate the given location in BWT space, and convert back to SA
381
+ // space before returning.
382
+ return this ->directInverseLocate (location) - this ->number_of_sequences ;
383
+ }
384
+
375
385
void
376
386
RLCSA::directLocate (pair_type range, usint* data, bool steps) const
377
387
{
388
+ // range is in SA coordinates, so first we need to convert to BWT coordinates.
378
389
this ->convertToBWTRange (range);
379
390
for (usint i = 0 , j = range.first ; j <= range.second ; i++, j++)
380
391
{
@@ -385,24 +396,69 @@ RLCSA::directLocate(pair_type range, usint* data, bool steps) const
385
396
/*
  Resolves a single position by following psi until a sample is found.

  index  position to resolve, given in BWT coordinates.
  steps  if true, return the number of psi steps that were needed to reach a
         sample; if false, return the sampled value minus that number of
         steps.

  Two kinds of samples are recognised: implicit samples, which are tested in
  BWT coordinates, and stored SA samples, which are tested after subtracting
  number_of_sequences from the position.
*/
usint
RLCSA::directLocate(usint index, bool steps) const
{
  // walked counts how many psi steps have been taken so far.
  for(usint walked = 0; ; ++walked)
  {
    // Implicit samples are checked first, while the position is still in BWT
    // coordinates.
    if(this->hasImplicitSample(index))
    {
      if(steps) { return walked; }
      return this->getImplicitSample(index) - walked;
    }

    // The stored samples are addressed in SA coordinates.
    const usint sa_index = index - this->number_of_sequences;
    if(this->sa_samples->isSampled(sa_index))
    {
      if(steps) { return walked; }
      return this->sa_samples->getSampleAt(sa_index) - walked;
    }

    // No sample here: move one character forward in the text. psi takes an SA
    // position and yields the BWT position of the following character, so the
    // next iteration starts in BWT coordinates again.
    index = this->psi(sa_index);
  }
}
405
433
434
+ usint
435
+ RLCSA::directInverseLocate (usint location) const
436
+ {
437
+ // Get the SA value and text location (in that order) of the last SA sample
438
+ // before the given text location.
439
+ pair_type last_sample = this ->sa_samples ->inverseSA (location);
440
+
441
+ // TODO: catch the (size, size) sentinel.
442
+
443
+ while (last_sample.second != location) {
444
+ // We're not at the desired text location, so we must be before it.
445
+
446
+ // Advance the text location by 1
447
+ last_sample.second += 1 ;
448
+
449
+ // Advance the SA position to that corresponding to the next character. Note
450
+ // that psi returns BWT coordinates, so we have to convert back to SA
451
+ // coordinates.
452
+ last_sample.first = (this ->psi (last_sample.first ) -
453
+ this ->number_of_sequences );
454
+ }
455
+
456
+ // Return the answer in BWT coordinates. It will probably be immediately
457
+ // converted back to SA coordinates, but it's worth it for consistency with
458
+ // the directLocate function, which takes in BWT coordinates.
459
+ return last_sample.first + this ->number_of_sequences ;
460
+ }
461
+
406
462
void
407
463
RLCSA::locateUnsafe (pair_type range, usint* data, bool steps) const
408
464
{
@@ -794,18 +850,52 @@ RLCSA::getSequenceForPosition(usint* values, usint len) const
794
850
/*
  Converts an absolute position in the concatenated texts into a
  (text number, offset within that text) pair.

  value  absolute position.

  The text number is the rank of value - 1 in the end_points vector (0 when
  value is 0). For any text other than the 0th, the offset is measured from
  the first multiple of sample_rate after the end of the preceding text.
*/
pair_type
RLCSA::getRelativePosition(usint value) const
{
  DeltaVector::Iterator endpoints(*(this->end_points));

  // Until shown otherwise, the position is in text 0 at offset value.
  pair_type relative(0, value);
  if(value == 0) { return relative; }

  // The text number is determined by the position just before this one.
  relative.first = endpoints.rank(value - 1);
  if(relative.first == 0) { return relative; }

  // Re-base the offset so that it counts from where this text starts: the
  // next multiple of the sample rate after the previous text's endpoint.
  relative.second -= nextMultipleOf(this->sample_rate, endpoints.select(relative.first - 1));
  return relative;
}
808
875
876
+ usint
877
+ RLCSA::getAbsolutePosition (pair_type position) const
878
+ {
879
+ // Get an iterator so we can use the vector of sequence endpoints.
880
+ DeltaVector::Iterator iter (*(this ->end_points ));
881
+
882
+ // Where is this position as an absolute position? Start off at the beginning.
883
+ usint value = 0 ;
884
+
885
+ if (position.first > 0 ) {
886
+ // Only the 0th text starts at 0. Find the absolute endpoint of the previous
887
+ // text, and then start at the next multiple of the sample rate after that.
888
+ // That is where this text is going to start.
889
+ value = nextMultipleOf (this ->sample_rate , iter.select (position.first - 1 ));
890
+ }
891
+
892
+ // Advance the start of the text by the index into the text.
893
+ value += position.second ;
894
+
895
+ // Return the resulting absolute position.
896
+ return value;
897
+ }
898
+
809
899
// --------------------------------------------------------------------------
810
900
811
901
uchar*
0 commit comments