diff --git a/src/copy_graph.cpp b/src/copy_graph.cpp index 2ed46e3..b9d81cb 100644 --- a/src/copy_graph.cpp +++ b/src/copy_graph.cpp @@ -61,7 +61,6 @@ void copy_path(const PathHandleGraph* from, const path_handle_t& from_path, from->get_sample_name(from_path), from->get_locus_name(from_path), from->get_haplotype(from_path), - from->get_phase_block(from_path), from->get_subrange(from_path), from->get_is_circular(from_path)); diff --git a/src/include/handlegraph/mutable_path_metadata.hpp b/src/include/handlegraph/mutable_path_metadata.hpp index 572f7ed..1efe360 100644 --- a/src/include/handlegraph/mutable_path_metadata.hpp +++ b/src/include/handlegraph/mutable_path_metadata.hpp @@ -30,8 +30,7 @@ class MutablePathMetadata : virtual public PathMetadata { /** * Add a path with the given metadata. Any item can be the corresponding - * unset sentinel (PathMetadata::NO_LOCUS_NAME, - * PathMetadata::NO_PHASE_BLOCK, etc.). + * unset sentinel (PathMetadata::NO_LOCUS_NAME, etc.). * * Implementations may refuse to store paths-or-threads of certain senses * when relevant fields are unset. @@ -43,7 +42,6 @@ class MutablePathMetadata : virtual public PathMetadata { const std::string& sample, const std::string& locus, const size_t& haplotype, - const size_t& phase_block, const subrange_t& subrange, bool is_circular = false); diff --git a/src/include/handlegraph/path_handle_graph.hpp b/src/include/handlegraph/path_handle_graph.hpp index 9b504a2..421c003 100644 --- a/src/include/handlegraph/path_handle_graph.hpp +++ b/src/include/handlegraph/path_handle_graph.hpp @@ -115,7 +115,7 @@ class PathHandleGraph : virtual public HandleGraph, virtual public PathMetadata /// visible here. Only reference or generic named paths should be visible. template bool for_each_step_on_handle(const handle_t& handle, const Iteratee& iteratee) const; - + //////////////////////////////////////////////////////////////////////////// // Backing protected virtual methods that need to be implemented //////////////////////////////////////////////////////////////////////////// @@ -154,6 +154,9 @@ class PathHandleGraph : virtual public HandleGraph, virtual public PathMetadata /// Returns true if the given path is empty, and false otherwise virtual bool is_empty(const path_handle_t& path_handle) const; + /// Measure the length of a path. + virtual size_t get_path_length(const path_handle_t& path_handle) const; + //////////////////////////////////////////////////////////////////////////// // Concrete utility methods //////////////////////////////////////////////////////////////////////////// @@ -186,7 +189,6 @@ bool PathHandleGraph::for_each_step_on_handle(const handle_t& handle, const Iter return for_each_step_on_handle_impl(handle, BoolReturningWrapper::wrap(iteratee)); } - template bool PathHandleGraph::for_each_step_in_path(const path_handle_t& path, const Iteratee& iteratee) const { diff --git a/src/include/handlegraph/path_metadata.hpp b/src/include/handlegraph/path_metadata.hpp index a054795..d08b3ac 100644 --- a/src/include/handlegraph/path_metadata.hpp +++ b/src/include/handlegraph/path_metadata.hpp @@ -22,13 +22,13 @@ namespace handlegraph { * * Our model is that paths come in different "senses": * - * - SENSE_GENERIC: a generic named path. Has a "locus" name. + * - PathSense::GENERIC: a generic named path. Has a "locus" name. * - * - SENSE_REFERENCE: a part of a reference assembly. Has a "sample" name, a - * "locus" name, and a haplotype number. + * - PathSense::REFERENCE: a part of a reference assembly. Has a "sample" name, + * a "locus" name, and a haplotype number. * - * - SENSE_HAPLOTYPE: a haplotype from a particular individual. Has a "sample" - * name, a "locus" name, a haplotype number, and a phase block identifier. + * - PathSense::HAPLOTYPE: a haplotype from a particular individual. Has a + * "sample" name, a "locus" name, a haplotype number. * * Paths of all sneses can represent subpaths, with bounds. * @@ -43,15 +43,11 @@ namespace handlegraph { * represented. GFA uses a convention where the presence of a haplotype 0 * implies that only one haplotype is present. * - * - Phase block identifier: Distinguishes fragments of a haplotype that are - * phased but not necessarily part of a single self-consistent scaffold (often - * due to self-contradictory VCF information). Must be unique within a sample, - * locus, and haplotype. May be a number or a start coordinate. + * - Subrange, for when a path as stored gives only a sub-range of a conceptually + * longer scaffold. Multiple items can be stored with identical metadata in the + * other fields if their subranges are non-overlapping. For haplotypes, the + * subrange coordinates may be synthetic. * - * - Bounds, for when a path as stored gives only a sub-range of a conceptually - * longer path. Multiple items can be stored with identical metadata in the - * other fields if their bounds are non-overlapping. - * TODO: Interaction with phase block in GBWT??? */ class PathMetadata { public: @@ -84,17 +80,11 @@ class PathMetadata { virtual std::string get_locus_name(const path_handle_t& handle) const; static const std::string NO_LOCUS_NAME; - /// Get the haplotype number (0 or 1, for diploid) of the path-or-thread, + /// Get the haplotype number (0 for haploid, 1 or 2 for diploid) of the path-or-thread, /// or NO_HAPLOTYPE if it does not belong to one. virtual size_t get_haplotype(const path_handle_t& handle) const; static const size_t NO_HAPLOTYPE; - /// Get the phase block number (contiguously phased region of a sample, - /// contig, and haplotype) of the path-or-thread, or NO_PHASE_BLOCK if it - /// does not belong to one. - virtual size_t get_phase_block(const path_handle_t& handle) const; - static const size_t NO_PHASE_BLOCK; - /// Get the bounds of the path-or-thread that are actually represented /// here. Should be NO_SUBRANGE if the entirety is represented here, and /// 0-based inclusive start and exclusive end positions of the stored @@ -105,13 +95,21 @@ class PathMetadata { virtual subrange_t get_subrange(const path_handle_t& handle) const; static const subrange_t NO_SUBRANGE; static const offset_t NO_END_POSITION; + + /// Get the name of the scaffold that the path is on. This is the path name + /// without any subrange information. + virtual std::string get_path_scaffold_name(const path_handle_t& handle) const; + + /// Get the region that a path covers on its scaffold. Will compute the end + /// coordinate if not stored. + virtual region_t get_path_region(const path_handle_t& handle) const; //////////////////////////////////////////////////////////////////////////// // Tools for converting back and forth with single-string path names //////////////////////////////////////////////////////////////////////////// /// Extract the sense of a path from the given formatted path name, if - /// possible. If not possible, return SENSE_GENERIC. + /// possible. If not possible, return PathSense::GENERIC. static PathSense parse_sense(const std::string& path_name); /// Get the name of the sample or assembly embedded in the given formatted @@ -122,37 +120,39 @@ class PathMetadata { /// path name, or NO_LOCUS_NAME if it does not belong to one. static std::string parse_locus_name(const std::string& path_name); - /// Get the haplotype number (0 or 1, for diploid) embedded in the given + /// Get the haplotype number (0 for haploid, 1 or 2 for diploid) embedded in the given /// formatted path name, or NO_HAPLOTYPE if it does not belong to one. static size_t parse_haplotype(const std::string& path_name); - /// Get the phase block number (contiguously phased region of a sample, - /// contig, and haplotype) embedded in the given formatted path name, or - /// NO_PHASE_BLOCK if it does not belong to one. - static size_t parse_phase_block(const std::string& path_name); - /// Get the bounds embedded in the given formatted path name, or /// NO_SUBRANGE if they are absent. If no end position is stored, /// NO_END_POSITION may be returned for the end position. static subrange_t parse_subrange(const std::string& path_name); /// Decompose a formatted path name into metadata. + /// Expects 1-based, end-inclusive coordinates in subranges in the name, + /// and emits 0-based, end-exclusive coordinates. static void parse_path_name(const std::string& path_name, PathSense& sense, std::string& sample, std::string& locus, size_t& haplotype, - size_t& phase_block, subrange_t& subrange); + /// Decompose a scaffold name (without range) into metadata (without sense) + static void parse_scaffold_name(const std::string& scaffold_name, + std::string& sample, + std::string& locus, + size_t& haplotype); + /// Compose a formatted path name for the given metadata. Any item can be - /// the corresponding unset sentinel (PathMetadata::NO_LOCUS_NAME, - /// PathMetadata::NO_PHASE_BLOCK, etc.). + /// the corresponding unset sentinel (PathMetadata::NO_LOCUS_NAME, etc.). + /// Expects 0-based, end-exclusive coordinates and procudes 1-based, + /// end-inclusive coordinates in the name. static std::string create_path_name(const PathSense& sense, const std::string& sample, const std::string& locus, const size_t& haplotype, - const size_t& phase_block, const subrange_t& subrange); //////////////////////////////////////////////////////////////////////////// @@ -177,6 +177,16 @@ class PathMetadata { const std::unordered_set* samples, const std::unordered_set* loci, const Iteratee& iteratee) const; + + /// Loop through all the paths matching the given query. Query elements + /// which are null match everything. Returns false and stops if the + /// iteratee returns false. + template + bool for_each_path_matching(const std::unordered_set* senses, + const std::unordered_set* samples, + const std::unordered_set* loci, + const std::unordered_set* haplotypes, + const Iteratee& iteratee) const; /// Loop through all the paths matching the given query. Query elements /// which are empty match everything. Returns false and stops if the @@ -186,6 +196,22 @@ class PathMetadata { const std::unordered_set& samples, const std::unordered_set& loci, const Iteratee& iteratee) const; + + /// Loop through all the paths matching the given query. Query elements + /// which are empty match everything. Returns false and stops if the + /// iteratee returns false. + template + bool for_each_path_matching(const std::unordered_set& senses, + const std::unordered_set& samples, + const std::unordered_set& loci, + const std::unordered_set& haplotypes, + const Iteratee& iteratee) const; + + /// Loop through all the paths on the scaffold with the given name. Paths + /// are not necessarily visited in order. + template + bool for_each_path_on_scaffold(const std::string& scaffold_name, + const Iteratee& iteratee) const; /// Loop through all steps on the given handle for paths with the given /// sense. Returns false and stops if the iteratee returns false. @@ -210,7 +236,13 @@ class PathMetadata { virtual bool for_each_path_matching_impl(const std::unordered_set* senses, const std::unordered_set* samples, const std::unordered_set* loci, + const std::unordered_set* haplotypes, const std::function& iteratee) const; + + /// Loop through the handles of paths that are on the given scaffold. Paths + /// are not necessarily visited in order. Returns false and stops if the + /// iteratee returns false. + virtual bool for_each_path_on_scaffold_impl(const std::string& scaffold, const std::function& iteratee) const; /// Loop through all steps on the given handle for paths with the given /// sense. Returns false and stops if the iteratee returns false. @@ -224,6 +256,9 @@ class PathMetadata { /// Look up the name of a path from a handle to it virtual std::string get_path_name(const path_handle_t& path_handle) const = 0; + + /// Measure the length of a path. + virtual size_t get_path_length(const path_handle_t& path_handle) const = 0; /// Returns a handle to the path that an step is on virtual path_handle_t get_path_handle_of_step(const step_handle_t& step_handle) const = 0; @@ -253,11 +288,11 @@ class PathMetadata { //////////////////////////////////////////////////////////////////////////// static const std::regex FORMAT; + static const std::regex SCAFFOLD_FORMAT; static const size_t ASSEMBLY_OR_NAME_MATCH; static const size_t LOCUS_MATCH_NUMERICAL_WITHOUT_HAPLOTYPE; static const size_t HAPLOTYPE_MATCH; static const size_t LOCUS_MATCH_ANY; - static const size_t PHASE_BLOCK_MATCH; static const size_t RANGE_START_MATCH; static const size_t RANGE_END_MATCH; @@ -266,7 +301,6 @@ class PathMetadata { // Ranges are set off with some additional characters. static const char RANGE_START_SEPARATOR; static const char RANGE_END_SEPARATOR; - static const char RANGE_TERMINATOR; }; //////////////////////////////////////////////////////////////////////////// @@ -276,34 +310,61 @@ class PathMetadata { template bool PathMetadata::for_each_path_of_sense(const PathSense& sense, const Iteratee& iteratee) const { std::unordered_set senses{sense}; - return for_each_path_matching_impl(&senses, nullptr, nullptr, BoolReturningWrapper::wrap(iteratee)); + return for_each_path_matching_impl(&senses, nullptr, nullptr, nullptr, BoolReturningWrapper::wrap(iteratee)); } template bool PathMetadata::for_each_path_of_sample(const std::string& sample, const Iteratee& iteratee) const { std::unordered_set samples{sample}; - return for_each_path_matching_impl(nullptr, &samples, nullptr, BoolReturningWrapper::wrap(iteratee)); + return for_each_path_matching_impl(nullptr, &samples, nullptr, nullptr, BoolReturningWrapper::wrap(iteratee)); +} + +template +bool PathMetadata::for_each_path_matching(const std::unordered_set* senses, + const std::unordered_set* samples, + const std::unordered_set* loci, + const Iteratee& iteratee) const { + return for_each_path_matching_impl(senses, samples, loci, nullptr, BoolReturningWrapper::wrap(iteratee)); } template bool PathMetadata::for_each_path_matching(const std::unordered_set* senses, const std::unordered_set* samples, const std::unordered_set* loci, + const std::unordered_set* haplotypes, + const Iteratee& iteratee) const { + return for_each_path_matching_impl(senses, samples, loci, haplotypes, BoolReturningWrapper::wrap(iteratee)); +} + +template +bool PathMetadata::for_each_path_matching(const std::unordered_set& senses, + const std::unordered_set& samples, + const std::unordered_set& loci, const Iteratee& iteratee) const { - return for_each_path_matching_impl(senses, samples, loci, BoolReturningWrapper::wrap(iteratee)); + return for_each_path_matching(senses.empty() ? nullptr : &senses, + samples.empty() ? nullptr : &samples, + loci.empty() ? nullptr : &loci, + iteratee); } template bool PathMetadata::for_each_path_matching(const std::unordered_set& senses, const std::unordered_set& samples, const std::unordered_set& loci, + const std::unordered_set& haplotypes, const Iteratee& iteratee) const { return for_each_path_matching(senses.empty() ? nullptr : &senses, samples.empty() ? nullptr : &samples, loci.empty() ? nullptr : &loci, + haplotypes.empty() ? nullptr : &haplotypes, iteratee); } +template +bool PathMetadata::for_each_path_on_scaffold(const std::string& scaffold_name, const Iteratee& iteratee) const { + return for_each_path_on_scaffold_impl(scaffold_name, BoolReturningWrapper::wrap(iteratee)); +} + template bool PathMetadata::for_each_step_of_sense(const handle_t& visited, const PathSense& sense, const Iteratee& iteratee) const { return for_each_step_of_sense_impl(visited, sense, BoolReturningWrapper::wrap(iteratee)); diff --git a/src/include/handlegraph/types.hpp b/src/include/handlegraph/types.hpp index 5576a3d..69e25c9 100644 --- a/src/include/handlegraph/types.hpp +++ b/src/include/handlegraph/types.hpp @@ -9,6 +9,8 @@ #include #include #include +#include +#include namespace handlegraph { @@ -29,9 +31,28 @@ typedef std::size_t offset_t; [[deprecated("off_t collides with a POSIX type, use offset_t instead")]] typedef offset_t off_t; -/// Represents a range of offsets, 0-based, end-exclusive +/// Represents a range of offsets, 0-based, end-exclusive. +/// The end may be PathMetadata::NO_END_POSITION. typedef std::pair subrange_t; +/// Represents a position or range on a named scaffold. May partially cover +/// zero or more paths with subranges in a graph. Its subrange must always have +/// a start and an end set. +typedef std::pair region_t; + +/// Parse a region_t from user-facing one-based end-inclusive coordinates. +/// Raises std::invalid_argument if the provided string is not understandable +/// as a region. The region must include an end coordinate. +region_t parse_region(const std::string& region_text); + +/// Turn a region_t into a user-facing one-based end-inclusive coordinate +/// string. The region must include an end coordinate. +std::string to_string(const region_t& region); + +/// Write a region_t to a stream as a user-facing one-based end-inclusive +/// coordinate string. The region must include an end coordinate. +std::ostream& operator<<(std::ostream& out, const region_t region); + /// Represents a position typedef std::tuple pos_t; diff --git a/src/mutable_path_metadata.cpp b/src/mutable_path_metadata.cpp index 7eb6125..9ab593b 100644 --- a/src/mutable_path_metadata.cpp +++ b/src/mutable_path_metadata.cpp @@ -12,11 +12,10 @@ path_handle_t MutablePathMetadata::create_path(const PathSense& sense, const std::string& sample, const std::string& locus, const size_t& haplotype, - const size_t& phase_block, const subrange_t& subrange, bool is_circular) { - return create_path_handle(PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, subrange), is_circular); + return create_path_handle(PathMetadata::create_path_name(sense, sample, locus, haplotype, subrange), is_circular); } } diff --git a/src/path_handle_graph.cpp b/src/path_handle_graph.cpp index c3e9966..1c0c206 100644 --- a/src/path_handle_graph.cpp +++ b/src/path_handle_graph.cpp @@ -36,7 +36,17 @@ bool PathHandleGraph::is_empty(const path_handle_t& path_handle) const { // But some implementations may have an expensive length query and a cheaper emptiness one return get_step_count(path_handle) == 0; } - + +size_t PathHandleGraph::get_path_length(const path_handle_t& path_handle) const { + // By default we scan the path to compute the length. + // PathPositionHandleGraph makes this abstract again so you must provide a real implementation. + size_t length = 0; + for (handle_t node : scan_path(path_handle)) { + length += get_length(node); + } + return length; +} + PathForEachSocket PathHandleGraph::scan_path(const path_handle_t& path) const { return PathForEachSocket(this, path); } diff --git a/src/path_metadata.cpp b/src/path_metadata.cpp index 5af4951..b397df9 100644 --- a/src/path_metadata.cpp +++ b/src/path_metadata.cpp @@ -11,7 +11,6 @@ namespace handlegraph { const std::string PathMetadata::NO_SAMPLE_NAME = ""; const std::string PathMetadata::NO_LOCUS_NAME = ""; const size_t PathMetadata::NO_HAPLOTYPE = std::numeric_limits::max(); -const size_t PathMetadata::NO_PHASE_BLOCK = std::numeric_limits::max(); const offset_t PathMetadata::NO_END_POSITION = std::numeric_limits::max(); const subrange_t PathMetadata::NO_SUBRANGE{PathMetadata::NO_END_POSITION, PathMetadata::NO_END_POSITION}; @@ -27,25 +26,27 @@ const subrange_t PathMetadata::NO_SUBRANGE{PathMetadata::NO_END_POSITION, PathMe // So we match a regex for: // One separator-free name component -// Up to 3 other optional separator-free name components, led by separators tacked on by non-capturing groups. Last one must be a number. -// Haplotype one must also always be a number, or we aren't allowed to match (or we will crash trying to parse the number). -// Possibly a bracket-bounded non-capturing group at the end +// Up to 2 other optional separator-free name components, led by separators tacked on by non-capturing groups. +// Haplotype one must always be a number, or we aren't allowed to match (or we will crash trying to parse the number). +// Last one is lazy and allows colons but only takes as many characters as needed. +// Possibly a colon-delimited non-capturing group at the end // Which has a number, and possibly a dash-led non-capturing group with a number. -// Match number: 1 2 3 4 5 6 -const std::regex PathMetadata::FORMAT(R"(([^[#]*)(?:#(\d+))?(?:#([^[#]*))?(?:#(\d+))?(?:\[(\d+)(?:-(\d+))?\])?)"); +// Match number: 1 2 3 4 5 +const std::regex PathMetadata::FORMAT(R"(([^[#]*)(?:#(\d+))?(?:#([^#]*?))?(?::(\d+)(?:-(\d+))?)?)"); +// We also need to be able to parse scaffold names that we know can't contain subranges we respect. +// This uses the same groups, but the final peice is greedy +const std::regex PathMetadata::SCAFFOLD_FORMAT(R"(([^[#]*)(?:#(\d+))?(?:#([^#]*))?)"); const size_t PathMetadata::ASSEMBLY_OR_NAME_MATCH = 1; const size_t PathMetadata::LOCUS_MATCH_NUMERICAL_WITHOUT_HAPLOTYPE = 2; const size_t PathMetadata::HAPLOTYPE_MATCH = 2; const size_t PathMetadata::LOCUS_MATCH_ANY = 3; -const size_t PathMetadata::PHASE_BLOCK_MATCH = 4; -const size_t PathMetadata::RANGE_START_MATCH = 5; -const size_t PathMetadata::RANGE_END_MATCH = 6; +const size_t PathMetadata::RANGE_START_MATCH = 4; +const size_t PathMetadata::RANGE_END_MATCH = 5; // And these are the constants for composing path names from metadata const char PathMetadata::SEPARATOR = '#'; -const char PathMetadata::RANGE_START_SEPARATOR = '['; +const char PathMetadata::RANGE_START_SEPARATOR = ':'; const char PathMetadata::RANGE_END_SEPARATOR = '-'; -const char PathMetadata::RANGE_TERMINATOR = ']'; PathSense PathMetadata::get_sense(const path_handle_t& handle) const { @@ -64,33 +65,60 @@ size_t PathMetadata::get_haplotype(const path_handle_t& handle) const { return PathMetadata::parse_haplotype(get_path_name(handle)); } -size_t PathMetadata::get_phase_block(const path_handle_t& handle) const { - return PathMetadata::parse_phase_block(get_path_name(handle)); -} - subrange_t PathMetadata::get_subrange(const path_handle_t& handle) const { return PathMetadata::parse_subrange(get_path_name(handle)); } -PathSense PathMetadata::parse_sense(const std::string& path_name) { - // Match the regex - std::smatch result; - if (std::regex_match(path_name, result, FORMAT)) { - // It's something we know. - if (result[PHASE_BLOCK_MATCH].matched) { - // It's a haplotype because it has a phase block - return PathSense::HAPLOTYPE; - } else if (result[LOCUS_MATCH_NUMERICAL_WITHOUT_HAPLOTYPE].matched || result[LOCUS_MATCH_ANY].matched) { - // It's a reference because it has a locus and a sample - return PathSense::REFERENCE; - } else { - // It's just a one-piece generic name - return PathSense::GENERIC; - } - } else { - // We can't parse this at all. - return PathSense::GENERIC; + +std::string PathMetadata::get_path_scaffold_name(const path_handle_t& handle) const { + // TODO: With the default implementations for the get methods this does a + // lot of regex parsing just to split on the last colon (unless it doesn't + // actually parse that way). + // TODO: Write a consolidated get method? + + PathSense sense = get_sense(handle); + std::string sample = get_sample_name(handle); + std::string locus = get_locus_name(handle); + size_t haplotype = get_haplotype(handle); + + // Just make a default style path name without a subrange. + return create_path_name(sense, sample, locus, haplotype, NO_SUBRANGE); +} + +region_t PathMetadata::get_path_region(const path_handle_t& handle) const { + region_t region; + region.first = get_path_scaffold_name(handle); + region.second = get_subrange(handle); + if (region.second == NO_SUBRANGE) { + // We need to occupy the whole space + region.second.first = 0; + region.second.second = NO_END_POSITION; + } + if (region.second.second == NO_END_POSITION) { + // Go ask for the path's length + region.second.second = get_path_length(handle); } + return region; +} + +PathSense PathMetadata::parse_sense(const std::string& path_name) { + // To get the sense we have to parse the whole thing and use its internal + // guessing logic. + PathSense sense; + std::string sample; + std::string locus; + size_t haplotype; + subrange_t subrange; + parse_path_name( + path_name, + sense, + sample, + locus, + haplotype, + subrange + ); + + return sense; } @@ -153,24 +181,6 @@ size_t PathMetadata::parse_haplotype(const std::string& path_name) { } -size_t PathMetadata::parse_phase_block(const std::string& path_name) { - // Match the regex - std::smatch result; - if (std::regex_match(path_name, result, FORMAT)) { - if (result[PHASE_BLOCK_MATCH].matched) { - // There's a phase block. - // We know it is a number. - return std::stoll(result[PHASE_BLOCK_MATCH].str()); - } else { - // No phase block is stored - return NO_PHASE_BLOCK; - } - } else { - // We can't parse this at all. - return NO_PHASE_BLOCK; - } -} - subrange_t PathMetadata::parse_subrange(const std::string& path_name) { auto to_return = NO_SUBRANGE; @@ -195,7 +205,6 @@ void PathMetadata::parse_path_name(const std::string& path_name, std::string& sample, std::string& locus, size_t& haplotype, - size_t& phase_block, subrange_t& subrange) { std::smatch result; @@ -205,17 +214,6 @@ void PathMetadata::parse_path_name(const std::string& path_name, // TODO: can we unify this with the other places we parse out from the // regex? With yet a third set of functions? if (matched) { - if (result[PHASE_BLOCK_MATCH].matched) { - // It's a haplotype because it has a phase block. - sense = PathSense::HAPLOTYPE; - } else if (result[LOCUS_MATCH_ANY].matched || result[LOCUS_MATCH_NUMERICAL_WITHOUT_HAPLOTYPE].matched) { - // It's a reference because it has a locus and a sample - sense = PathSense::REFERENCE; - } else { - // It's just a one-piece generic name - sense = PathSense::GENERIC; - } - if (result[LOCUS_MATCH_ANY].matched && result[HAPLOTYPE_MATCH].matched) { // There's a haplotype and a locus and a sample sample = result[ASSEMBLY_OR_NAME_MATCH].str(); @@ -238,18 +236,14 @@ void PathMetadata::parse_path_name(const std::string& path_name, haplotype = NO_HAPLOTYPE; } - if (result[PHASE_BLOCK_MATCH].matched) { - // There's a phase block. - // We know it is a number. - phase_block = std::stoll(result[PHASE_BLOCK_MATCH].str()); - } else { - // No phase block is stored - phase_block = NO_PHASE_BLOCK; - } - if (result[RANGE_START_MATCH].matched) { - // There is a range start, so pasre it + // There is a range start, so parse it subrange.first = std::stoll(result[RANGE_START_MATCH].str()); + // Make sure to convert it to 0-based, end-exclusive coordinates. + if (subrange.first == 0) { + throw std::invalid_argument("Expected 1-based indexing in " + path_name); + } + subrange.first--; if (result[RANGE_END_MATCH].matched) { // There is also an end, so parse that too subrange.second = std::stoll(result[RANGE_END_MATCH].str()); @@ -259,22 +253,76 @@ void PathMetadata::parse_path_name(const std::string& path_name, } else { subrange = NO_SUBRANGE; } + + if (result[LOCUS_MATCH_ANY].matched || result[LOCUS_MATCH_NUMERICAL_WITHOUT_HAPLOTYPE].matched) { + // It's a reference or haplotype because it has a locus and a sample. + + // TODO: We don't actually have a way to distinguish the sense by + // name anymore without phase blocks. Cheat and abuse the fact that + // references usually use haplotype 0 and haplotypes usually use 1 + // and 2. + if (haplotype == 0 || haplotype == NO_HAPLOTYPE) { + sense = PathSense::REFERENCE; + } else { + sense = PathSense::HAPLOTYPE; + } + } else { + // It's just a one-piece generic name + sense = PathSense::GENERIC; + } } else { // Just a generic path where the locus is all of it. sense = PathSense::GENERIC; sample = NO_SAMPLE_NAME; locus = path_name; haplotype = NO_HAPLOTYPE; - phase_block = NO_PHASE_BLOCK; subrange = NO_SUBRANGE; } } +void PathMetadata::parse_scaffold_name(const std::string& scaffold_name, + std::string& sample, + std::string& locus, + size_t& haplotype) { + + std::smatch result; + auto matched = std::regex_match(scaffold_name, result, SCAFFOLD_FORMAT); + + // Parse out each piece. + if (matched) { + if (result[LOCUS_MATCH_ANY].matched && result[HAPLOTYPE_MATCH].matched) { + // There's a haplotype and a locus and a sample + sample = result[ASSEMBLY_OR_NAME_MATCH].str(); + locus = result[LOCUS_MATCH_ANY].str(); + haplotype = std::stoll(result[HAPLOTYPE_MATCH].str()); + } else if (result[LOCUS_MATCH_NUMERICAL_WITHOUT_HAPLOTYPE].matched) { + // There's a numerical locus but no haplotype, and a sample + sample = result[ASSEMBLY_OR_NAME_MATCH].str(); + locus = result[LOCUS_MATCH_NUMERICAL_WITHOUT_HAPLOTYPE].str(); + haplotype = NO_HAPLOTYPE; + } else if (result[LOCUS_MATCH_ANY].matched) { + // There's a non-numerical locus but no haplotype, and a sample + sample = result[ASSEMBLY_OR_NAME_MATCH].str(); + locus = result[LOCUS_MATCH_ANY].str(); + haplotype = NO_HAPLOTYPE; + } else { + // There's nothing but the locus. + sample = NO_SAMPLE_NAME; + locus = result[ASSEMBLY_OR_NAME_MATCH].str(); + haplotype = NO_HAPLOTYPE; + } + } else { + // Just a generic path where the locus is all of it. + sample = NO_SAMPLE_NAME; + locus = scaffold_name; + haplotype = NO_HAPLOTYPE; + } +} + std::string PathMetadata::create_path_name(const PathSense& sense, const std::string& sample, const std::string& locus, const size_t& haplotype, - const size_t& phase_block, const subrange_t& subrange) { std::stringstream name_builder; @@ -312,25 +360,13 @@ std::string PathMetadata::create_path_name(const PathSense& sense, throw std::runtime_error("Haplotype path must have a locus"); } } - if (phase_block != NO_PHASE_BLOCK) { - if (sense == PathSense::GENERIC) { - throw std::runtime_error("Generic path cannot have a phase block"); - } else if (sense == PathSense::REFERENCE) { - throw std::runtime_error("Reference path cannot have a phase block"); - } - name_builder << SEPARATOR << phase_block; - } else { - if (sense == PathSense::HAPLOTYPE) { - throw std::runtime_error("Haplotype path must have a phase block"); - } - } if (subrange != NO_SUBRANGE) { // Everything can have a subrange. - name_builder << RANGE_START_SEPARATOR << subrange.first; + // Make sure to convert to 1-based, end-inclusive coordinates. + name_builder << RANGE_START_SEPARATOR << subrange.first + 1; if (subrange.second != NO_END_POSITION) { name_builder << RANGE_END_SEPARATOR << subrange.second; } - name_builder << RANGE_TERMINATOR; } return name_builder.str(); @@ -339,6 +375,7 @@ std::string PathMetadata::create_path_name(const PathSense& sense, bool PathMetadata::for_each_path_matching_impl(const std::unordered_set* senses, const std::unordered_set* samples, const std::unordered_set* loci, + const std::unordered_set* haplotypes, const std::function& iteratee) const { return for_each_path_handle_impl([&](const path_handle_t& handle) { if (senses && !senses->count(get_sense(handle))) { @@ -353,11 +390,26 @@ bool PathMetadata::for_each_path_matching_impl(const std::unordered_setcount(get_haplotype(handle))) { + // Wrong haplotype + return true; + } // And emit any matching handles return iteratee(handle); }); } - + +bool PathMetadata::for_each_path_on_scaffold_impl(const std::string& scaffold_name, const std::function& iteratee) const { + // Parse out the region into some structured metadata + std::string sample; + std::string locus; + size_t haplotype; + parse_scaffold_name(scaffold_name, sample, locus, haplotype); + + // Query the matching paths + return for_each_path_matching({}, {sample}, {locus}, {haplotype}, iteratee); +} + bool PathMetadata::for_each_step_of_sense_impl(const handle_t& visited, const PathSense& sense, const std::function& iteratee) const { return for_each_step_on_handle_impl(visited, [&](const step_handle_t& handle) { if (get_sense(get_path_handle_of_step(handle)) != sense) { diff --git a/src/types.cpp b/src/types.cpp index ae7da1b..ecc04f0 100644 --- a/src/types.cpp +++ b/src/types.cpp @@ -1,6 +1,11 @@ #include "handlegraph/types.hpp" #include "handlegraph/util.hpp" +#include "handlegraph/path_metadata.hpp" + +#include +#include +#include /** \file types.cpp * Implement operators for libhandlegraph value types @@ -8,42 +13,100 @@ namespace handlegraph { -/// Define equality on handles +region_t parse_region(const std::string& region_text) { + region_t result; + size_t last_colon = region_text.rfind(":"); + if (last_colon == std::string::npos) { + throw std::invalid_argument("Cannot parse coordinate region: No ':' found in " + region_text); + } + if (last_colon == 0) { + throw std::invalid_argument("Cannot parse coordinate region: No text before ':' in " + region_text); + } + // Sequence name is everything up to the colon + result.first = region_text.substr(0, last_colon); + size_t dash = region_text.find("-", last_colon); + if (dash == std::string::npos) { + throw std::invalid_argument("Cannot parse coordinate region: No '-' found after last ':' in " + region_text); + } + + // Make sure we actually have content on both sides of the dash + if (dash == last_colon + 1) { + throw std::invalid_argument("Cannot parse coordinate region: No text between last ':' and '-' in " + region_text); + } + + if (region_text.size() == dash + 1) { + throw std::invalid_argument("Cannot parse coordinate region: No text after '-' in " + region_text); + } + + const char* start = region_text.c_str() + last_colon + 1; + char* end; + result.second.first = strtoll(start, &end, 10); + if (end != region_text.c_str() + dash) { + // We didn't parse everything before the dash + throw std::invalid_argument("Cannot parse coordinate region: Non-number found before '-' in " + region_text); + } + if (result.second.first == 0) { + throw std::invalid_argument("Cannot parse coordinate region: Expected 1-based indexing in " + region_text); + } + + start = region_text.c_str() + dash + 1; + result.second.second = strtoll(start, &end, 10); + if (end != region_text.c_str() + region_text.size()) { + // We didn't parse everything after the dash + throw std::invalid_argument("Cannot parse coordinate region: Non-number found after '-' in " + region_text); + } + + // Convert the range from 1-based, end-inclusive to 0-based, end-exclusive + result.second.first -= 1; + + return result; +} + +std::string to_string(const region_t& region) { + std::stringstream ss; + ss << region; + return ss.str(); +} + +std::ostream& operator<<(std::ostream& out, const region_t region) { + // Regions are always supposed to actually have start and end positions. + if (region.second == PathMetadata::NO_SUBRANGE) { + throw std::invalid_argument("Region on " + region.first + " does not have a subrange"); + } + if (region.second.second == PathMetadata::NO_END_POSITION) { + throw std::invalid_argument("Region on " + region.first + " starting at " + std::to_string(region.second.first) + " does not have an end position"); + } + return out << region.first << ":" << (region.second.first + 1) << "-" << region.second.second; +} + bool operator==(const handle_t& a, const handle_t& b) { return as_integer(a) == as_integer(b); } -/// Define inequality on handles bool operator!=(const handle_t& a, const handle_t& b) { return as_integer(a) != as_integer(b); } -/// Define equality on path handles bool operator==(const path_handle_t& a, const path_handle_t& b) { return as_integer(a) == as_integer(b); } -/// Define inequality on path handles bool operator!=(const path_handle_t& a, const path_handle_t& b) { return as_integer(a) != as_integer(b); } -/// Define equality on step handles bool operator==(const step_handle_t& a, const step_handle_t& b) { return as_integers(a)[0] == as_integers(b)[0] && as_integers(a)[1] == as_integers(b)[1]; } -/// Define inequality on step handles bool operator!=(const step_handle_t& a, const step_handle_t& b) { return !(a == b); } -/// Define equality on net handles bool operator==(const net_handle_t& a, const net_handle_t& b) { return as_integer(a) == as_integer(b); } -/// Define inequality on net handles bool operator!=(const net_handle_t& a, const net_handle_t& b) { return !(a == b); }