Skip to content

Commit 1f62ee8

Browse files
committed
move newick parsing to separate class
1 parent 1eaaa38 commit 1f62ee8

File tree

4 files changed

+260
-239
lines changed

4 files changed

+260
-239
lines changed

src/commons/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,6 @@ set(commons_source_files
33
commons/FoldmasonParameters.cpp
44
commons/StructureSmithWaterman.cpp
55
commons/StructureSmithWaterman.h
6+
commons/newick.cpp
7+
commons/newick.h
68
PARENT_SCOPE)

src/commons/newick.cpp

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
#include <iostream>
2+
#include <string>
3+
#include <vector>
4+
#include <stack>
5+
#include "Util.h"
6+
#include "newick.h"
7+
8+
/**
 * @brief Parse a Newick string into a tree of heap-allocated nodes.
 *
 * Labels become leaf nodes, except for a label that directly follows a
 * closing parenthesis: that label names the group that was just closed.
 * Branch lengths (everything between ':' and the next ',', ')' or ';') and
 * whitespace are skipped. Parsing stops at the first ';'.
 *
 * Malformed input is handled on a best-effort basis: an unmatched ')' is
 * ignored and groups still open at the end of the input are closed
 * implicitly, so the stack is never accessed while empty and no node leaks.
 *
 * @param newick Tree in Newick format
 * @return Root of the parsed tree (deleting it deletes all descendants),
 *         or nullptr if the input does not describe any node
 */
NewickParser::Node* NewickParser::parse(const std::string& newick) {
    // The dummy root collects the top-level node(s). It is never popped, so
    // stack.top() is valid for the whole parse.
    Node* root = new Node();
    std::stack<Node*> stack;
    stack.push(root);

    std::string token;
    // Group closed by the most recent ')'. A label that follows it belongs to
    // that group and must not become an additional leaf.
    Node* lastClosed = nullptr;
    bool readingBranchLength = false;
    bool finished = false;

    // Consume the pending label: name the group that was just closed, or
    // append a new leaf to the group that is currently open.
    auto flushToken = [&]() {
        if (!token.empty()) {
            if (lastClosed != nullptr) {
                lastClosed->name = token;
            } else {
                stack.top()->children.push_back(new Node(token));
            }
            token.clear();
        }
        lastClosed = nullptr;
    };

    for (char ch : newick) {
        if (readingBranchLength) {
            if (ch == ',' || ch == ')' || ch == ';') {
                // Delimiter ends the branch length and is handled below
                readingBranchLength = false;
            } else {
                continue;
            }
        }
        switch (ch) {
            case '(': // Start new group
                lastClosed = nullptr;
                stack.push(new Node());
                break;
            case ')': // End of a group
                flushToken();
                // size() > 1 keeps the dummy root on the stack, so an
                // unmatched ')' is ignored instead of emptying the stack
                if (stack.size() > 1) {
                    Node* finishedNode = stack.top();
                    stack.pop();
                    stack.top()->children.push_back(finishedNode);
                    lastClosed = finishedNode;
                }
                break;
            case ',': // Another child of the current group follows
                flushToken();
                break;
            case ':': // Branch length starts; ignore it
                readingBranchLength = true;
                break;
            case ';': // End of tree
                flushToken();
                finished = true;
                break;
            default: // Part of a label
                // Cast first: passing a negative char to isspace is undefined
                if (!isspace(static_cast<unsigned char>(ch))) {
                    token += ch;
                }
                break;
        }
        if (finished) {
            break;
        }
    }

    // Input without a terminating ';' may still carry a pending label
    flushToken();

    // Implicitly close groups that were never closed so they are not leaked
    while (stack.size() > 1) {
        Node* unfinishedNode = stack.top();
        stack.pop();
        stack.top()->children.push_back(unfinishedNode);
    }

    Node* actualRoot = nullptr;
    if (!root->children.empty()) {
        actualRoot = root->children.front();
        // Detach only the returned node; stray top-level siblings stay with
        // the dummy root and are freed together with it
        root->children.erase(root->children.begin());
    }
    delete root;
    return actualRoot;
}
66+
67+
/**
 * @brief Serialize a (sub)tree to Newick notation.
 *
 * A node with children is written as its comma-separated children inside
 * parentheses, followed by the node's own name; a node without children is
 * written as its name only. No branch lengths and no terminating ';' are
 * emitted.
 *
 * @param node Root of the (sub)tree to serialize, may be nullptr
 * @return Newick text, or an empty string if node is nullptr
 */
std::string NewickParser::toNewick(const NewickParser::Node* node) {
    std::string out;
    if (node == nullptr) {
        return out;
    }

    const std::vector<Node*>& kids = node->children;
    if (!kids.empty()) {
        out.push_back('(');
        // Separator is empty before the first child and "," afterwards
        const char* separator = "";
        for (const Node* kid : kids) {
            out.append(separator);
            out.append(NewickParser::toNewick(kid));
            separator = ",";
        }
        out.push_back(')');
    }

    // The name is appended for leaves and for nodes with children alike
    out.append(node->name);
    return out;
}
86+
87+
/**
88+
* @brief Post-order traversal of a parsed Tree.
89+
* Generates the merging order for structuremsa
90+
*
91+
* @param node Pointer to root TNode of the tree
92+
*/
93+
void NewickParser::postOrder(NewickParser::Node *node, std::vector<std::string> *linkage) {
94+
for (NewickParser::Node *child : node->children) {
95+
postOrder(child, linkage);
96+
}
97+
if (node->children.size() > 0) {
98+
for (NewickParser::Node *child : node->children) {
99+
linkage->push_back(child->name);
100+
101+
// Propagate child name from leaf to root, so we
102+
// always have a reference during alignment stage
103+
node->name = child->name;
104+
}
105+
}
106+
}
107+
108+
/**
109+
* @brief Update nodeMap to point to newest root for ALL children of a Node
110+
*
111+
* @param node
112+
* @param newParent
113+
* @param nodeMap
114+
*/
115+
void NewickParser::updateDescendants( NewickParser::Node* node, NewickParser::Node* newParent, std::unordered_map<size_t, Node*>& nodeMap) {
116+
nodeMap[node->id] = newParent;
117+
for (NewickParser::Node* child : node->children) {
118+
nodeMap[child->id] = newParent;
119+
updateDescendants(child, newParent, nodeMap);
120+
}
121+
}
122+
123+
/**
124+
* @brief Build a tree from a list of successive merges (i.e. with queryId/targetId)
125+
*
126+
* @param merges
127+
* @return NewickParser::Node*
128+
*/
129+
NewickParser::Node* NewickParser::buildTree(std::vector<AlnSimple> &merges) {
130+
std::unordered_map<size_t, NewickParser::Node*> nodeMap;
131+
NewickParser::Node* root = nullptr;
132+
NewickParser::Node* nodeA = nullptr;
133+
NewickParser::Node* nodeB = nullptr;
134+
for (const AlnSimple& merge : merges) {
135+
root = new NewickParser::Node(merge.queryId);
136+
nodeA = (nodeMap.find(merge.queryId) == nodeMap.end()) ? new NewickParser::Node(merge.queryId) : nodeMap[merge.queryId];
137+
nodeB = (nodeMap.find(merge.targetId) == nodeMap.end()) ? new NewickParser::Node(merge.targetId) : nodeMap[merge.targetId];
138+
root->children.push_back(nodeA);
139+
root->children.push_back(nodeB);
140+
updateDescendants(nodeA, root, nodeMap);
141+
updateDescendants(nodeB, root, nodeMap);
142+
}
143+
return root;
144+
}
145+
146+
147+
/**
 * @brief Fill in the names of all leaves of a tree from a header database
 *
 * For every node without children, the node's id is resolved through
 * headers->sequenceReader and the name is set to the result of
 * Util::parseFastaHeader on the entry's data. Nodes that have children are
 * only traversed; their names are left unchanged.
 *
 * @param root    Root of the (sub)tree to annotate
 * @param headers Reader giving access to the header entries
 */
void NewickParser::addNames(Node* root, IndexReader* headers) {
    if (root->children.empty()) {
        // Leaf: look up the header entry that belongs to this node's id
        const unsigned int headerId = headers->sequenceReader->getId(root->id);
        root->name = Util::parseFastaHeader(headers->sequenceReader->getData(headerId, 0));
        return;
    }
    for (Node* child : root->children) {
        NewickParser::addNames(child, headers);
    }
}

src/commons/newick.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#include <iostream>
2+
#include <string>
3+
#include <vector>
4+
#include <unordered_map>
5+
#include "IndexReader.h"
6+
7+
#ifndef NEWICK_H
8+
#define NEWICK_H
9+
10+
// A single pairwise merge. NewickParser::buildTree reads queryId and
// targetId to decide which two nodes are joined; it does not read score.
struct AlnSimple {
    size_t queryId;   // id of the first merge partner
    size_t targetId;  // id of the second merge partner
    int score;        // score attached to this pair
};
15+
16+
/**
 * @brief Static helpers to parse, build, annotate, traverse and serialize
 *        trees in Newick notation.
 */
class NewickParser {
public:
    /**
     * @brief Tree node that owns its children.
     *
     * Deleting a node deletes every node below it. Because of this raw
     * ownership a node must not be copied (both copies would delete the
     * same children), so the copy operations are disabled.
     */
    struct Node {
        size_t id;                    // numeric id; 0 unless constructed from an id
        std::string name;             // label; empty unless constructed from a name
        std::vector<Node*> children;  // owned child nodes

        // id is initialized explicitly so a node created from a name never
        // carries an indeterminate id
        Node(const std::string& name = "") : id(0), name(name) {}
        Node(size_t id) : id(id) {}

        Node(const Node&) = delete;
        Node& operator=(const Node&) = delete;

        ~Node() {
            for (auto *child : children) {
                delete child;
            }
        }
    };

    // Parse a Newick string; returns the root of the tree or nullptr
    static Node* parse(const std::string& newick);
    // Build a tree from successive merges; returns nullptr for an empty list
    static Node* buildTree(std::vector<AlnSimple> &merges);
    // Point the ids of node and all its descendants to newParent in nodeMap
    static void updateDescendants(Node* node, Node* newParent, std::unordered_map<size_t, Node*>& nodeMap);
    // Set the names of all leaves below root from the header database
    static void addNames(Node* root, IndexReader* headers);
    // Append the child names of every node with children, in post-order, to linkage
    static void postOrder(Node* node, std::vector<std::string> *linkage);
    // Serialize the tree below node to Newick notation
    static std::string toNewick(const Node* node);
};
39+
40+
#endif

0 commit comments

Comments
 (0)