Skip to content

Commit 4553f30

Browse files
committed
Adding inverseLocate to RLCSA, and some functions to make it useful
1 parent ff1684e commit 4553f30

File tree

2 files changed

+105
-0
lines changed

2 files changed

+105
-0
lines changed

rlcsa.cpp

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,9 +372,20 @@ RLCSA::locate(usint index, bool steps) const
372372
return this->directLocate(index + this->number_of_sequences, steps);
373373
}
374374

375+
usint
376+
RLCSA::inverseLocate(usint location) const
377+
{
378+
if(!(this->support_locate) || location >= this->data_size) { return this->data_size; }
379+
380+
// Inverse-locate the given location in BWT space, and convert back to SA
381+
// space before returning.
382+
return this->directInverseLocate(location) - this->number_of_sequences;
383+
}
384+
375385
void
376386
RLCSA::directLocate(pair_type range, usint* data, bool steps) const
377387
{
388+
// range is in SA coordinates, so first we need to convert to BWT coordinates.
378389
this->convertToBWTRange(range);
379390
for(usint i = 0, j = range.first; j <= range.second; i++, j++)
380391
{
@@ -385,24 +396,69 @@ RLCSA::directLocate(pair_type range, usint* data, bool steps) const
385396
usint
386397
RLCSA::directLocate(usint index, bool steps) const
387398
{
399+
// Note that index is in BWT coordinates initially.
400+
401+
// This keeps track of how far along the sequence we had to go to find an SA
402+
// sample.
388403
usint offset = 0;
389404
while(true)
390405
{
406+
// First try in BWT space, so we can account for the text end characters.
391407
if(this->hasImplicitSample(index))
392408
{
409+
// If we took an implicit sample at this index in the BWT (due to us being
410+
// in the range occupied by sequence start characters), we know where it
411+
// falls in the original sequences: at the endpoint of the appropriate
412+
// text.
393413
if(steps) { return offset; }
394414
else { return this->getImplicitSample(index) - offset; }
395415
}
416+
// Pop index into SA space, where the SA samples live
396417
index -= this->number_of_sequences;
397418
if(this->sa_samples->isSampled(index))
398419
{
420+
// If we took a real SA sample here, we know where it falls in the
421+
// original sequences.
399422
return (steps ? offset : this->sa_samples->getSampleAt(index) - offset);
400423
}
424+
425+
// If we get here, we couldn't map this position. Proceed forwards (towards
426+
// the end of the sequence), in hopes of hitting either a sample or the
427+
// sequence end character. Note that psi maps from SA position to the *BWT*
428+
// position of the subsequent character, popping index back into BWT space.
401429
index = this->psi(index);
402430
offset++;
403431
}
404432
}
405433

434+
usint
435+
RLCSA::directInverseLocate(usint location) const
436+
{
437+
// Get the SA value and text location (in that order) of the last SA sample
438+
// before the given text location.
439+
pair_type last_sample = this->sa_samples->inverseSA(location);
440+
441+
// TODO: catch the (size, size) sentinel.
442+
443+
while(last_sample.second != location) {
444+
// We're not at the desired text location, so we must be before it.
445+
446+
// Advance the text location by 1
447+
last_sample.second += 1;
448+
449+
// Advance the SA position to that corresponding to the next character. Note
450+
// that psi returns BWT coordinates, so we have to convert back to SA
451+
// coordinates.
452+
last_sample.first = (this->psi(last_sample.first) -
453+
this->number_of_sequences);
454+
}
455+
456+
// Return the answer in BWT coordinates. It will probably be immediately
457+
// converted back to SA coordinates, but it's worth it for consistency with
458+
// the directLocate function, which takes in BWT coordinates.
459+
return last_sample.first + this->number_of_sequences;
460+
}
461+
406462
void
407463
RLCSA::locateUnsafe(pair_type range, usint* data, bool steps) const
408464
{
@@ -794,18 +850,52 @@ RLCSA::getSequenceForPosition(usint* values, usint len) const
794850
pair_type
795851
RLCSA::getRelativePosition(usint value) const
796852
{
853+
// Get an iterator so we can use the vector of sequence endpoints.
797854
DeltaVector::Iterator iter(*(this->end_points));
855+
856+
// Start out saying we're in text 0 at index whatever our index in the whole
857+
// string of texts is.
798858
pair_type result(0, value);
799859

860+
// Adjust the text number to whatever text hadn't yet ended at the position
861+
// before the one we're interested in.
800862
if(value > 0) { result.first = iter.rank(value - 1); }
801863
if(result.first > 0)
802864
{
865+
// If we're not still in the 0th text, re-base our index in the text to
866+
// count from the first multiple of the SA sample rate after the end of the
867+
// text before this text (since we have declared text coordinates start on
868+
// SA sample multiples).
803869
result.second -= nextMultipleOf(this->sample_rate, iter.select(result.first - 1));
804870
}
805871

872+
// Return the fixed-up relative position.
806873
return result;
807874
}
808875

876+
usint
877+
RLCSA::getAbsolutePosition(pair_type position) const
878+
{
879+
// Get an iterator so we can use the vector of sequence endpoints.
880+
DeltaVector::Iterator iter(*(this->end_points));
881+
882+
// Where is this position as an absolute position? Start off at the beginning.
883+
usint value = 0;
884+
885+
if(position.first > 0) {
886+
// Only the 0th text starts at 0. Find the absolute endpoint of the previous
887+
// text, and then start at the next multiple of the sample rate after that.
888+
// That is where this text is going to start.
889+
value = nextMultipleOf(this->sample_rate, iter.select(position.first - 1));
890+
}
891+
892+
// Advance the start of the text by the index into the text.
893+
value += position.second;
894+
895+
// Return the resulting absolute position.
896+
return value;
897+
}
898+
809899
//--------------------------------------------------------------------------
810900

811901
uchar*

rlcsa.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,9 @@ class RLCSA
111111

112112
// Returns SA[index].
113113
usint locate(usint index, bool steps = false) const;
114+
115+
// Given SA[index], returns index.
116+
usint inverseLocate(usint location) const;
114117

115118
// Returns T^{sequence}[range]. User must free the buffer.
116119
// Third version uses buffer provided by the user.
@@ -144,6 +147,9 @@ class RLCSA
144147

145148
// Changes SA value to (sequence, offset).
146149
pair_type getRelativePosition(usint value) const;
150+
// Changes (sequence, offset) to SA value (not the same as SA index; it's a
151+
// position in the unified coordinate space over all texts).
152+
usint getAbsolutePosition(pair_type position) const;
147153

148154
// Returns the BWT of the collection including end of sequence markers.
149155
uchar* readBWT() const;
@@ -304,13 +310,22 @@ class RLCSA
304310
// INTERNAL VERSIONS OF QUERIES
305311
//--------------------------------------------------------------------------
306312

313+
// Locates the given SA range, one item at a time, storing results in the
314+
// array given by the data pointer. Finds actual locations if steps is
315+
// false, or the number of steps needed to determine each location if steps
316+
// is true.
307317
void directLocate(pair_type range, usint* data, bool steps) const;
318+
// Locates the given BWT position, returning either the actual location or
319+
// the steps needed to find it, depending on the value of steps.
308320
usint directLocate(usint index, bool steps) const;
309321
void locateUnsafe(pair_type range, usint* data, bool steps) const;
310322
bool processRun(pair_type run, usint* data, usint* offsets, bool* finished, PsiVector::Iterator** iters, bool steps) const;
311323
void displayUnsafe(pair_type range, uchar* data, bool get_ranks = false, usint* ranks = 0) const;
312324

313325
void locateRange(pair_type range, std::vector<usint>& vec) const;
326+
327+
// Given a sequence position, return the corresponding BWT position.
328+
usint directInverseLocate(usint location) const;
314329

315330
//--------------------------------------------------------------------------
316331
// INTERNAL VERSIONS OF BASIC OPERATIONS

0 commit comments

Comments
 (0)