Skip to content

Commit f526186

Browse files
authored
Merge pull request #3 from dpryan79/speedup
Speedup up sequence/base fetching.
2 parents 448a439 + 0550276 commit f526186

File tree

4 files changed

+156
-83
lines changed

4 files changed

+156
-83
lines changed

2bit.c

+146-82
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,41 @@ char byte2base(uint8_t byte, int offset) {
6262
return bases[foo];
6363
}
6464

65+
void bytes2bases(char *seq, uint8_t *byte, uint32_t sz, int offset) {
66+
uint32_t pos = 0, remainder = 0, i = 0;
67+
char bases[4] = "TCAG";
68+
uint8_t foo = byte[0];
69+
70+
// Deal with the first partial byte
71+
if(offset != 0) {
72+
while(offset < 4) {
73+
seq[pos++] = byte2base(foo, offset++);
74+
}
75+
foo = byte[++i];
76+
}
77+
78+
// Deal with everything else, with the possible exception of the last fractional byte
79+
remainder = (sz - pos) % 4;
80+
while(pos < sz - remainder) {
81+
foo = byte[i++];
82+
seq[pos + 3] = bases[foo & 3];
83+
foo >>= 2;
84+
seq[pos + 2] = bases[foo & 3];
85+
foo >>= 2;
86+
seq[pos + 1] = bases[foo & 3];
87+
foo >>= 2;
88+
seq[pos] = bases[foo & 3];
89+
foo >>= 2;
90+
pos += 4;
91+
}
92+
93+
// Deal with the last partial byte
94+
if(remainder > 0) foo = byte[i];
95+
for(offset=0; offset<remainder; offset++) {
96+
seq[pos++] = byte2base(foo, offset);
97+
}
98+
}
99+
65100
/*
66101
Replace Ts (or whatever else is being used) with N as appropriate
67102
*/
@@ -122,25 +157,23 @@ void softMask(char *seq, TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end)
122157
This is the worker function for twobitSequence, which mostly does error checking
123158
*/
124159
char *constructSequence(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end) {
125-
uint32_t sz = end - start + 1, pos = 0;
126-
uint32_t blockStart, offset;
127-
char *seq = malloc(sz * sizeof(char)), byte;
160+
uint32_t sz = end - start + 1;
161+
uint32_t blockStart, blockEnd, offset;
162+
char *seq = malloc(sz * sizeof(char));
163+
uint8_t *bytes = NULL;
128164
if(!seq) return NULL;
129165

130166
//There are 4 bases/byte
131167
blockStart = start/4;
132168
offset = start % 4;
169+
blockEnd = end/4 + ((end % 4) ? 1 : 0);
170+
bytes = malloc(blockEnd - blockStart);
171+
if(!bytes) goto error;
133172

134173
if(twobitSeek(tb, tb->idx->offset[tid] + blockStart) != 0) goto error;
135-
while(pos < sz - 1) {
136-
if(twobitRead(&byte, 1, 1, tb) != 1) goto error;
137-
138-
for(; offset<4; offset++) {
139-
seq[pos++] = byte2base(byte, offset);
140-
if(pos >= sz - 1) break;
141-
}
142-
offset = 0;
143-
}
174+
if(twobitRead(bytes, blockEnd - blockStart, 1, tb) != 1) goto error;
175+
bytes2bases(seq, bytes, sz - 1, offset);
176+
free(bytes);
144177

145178
//Null terminate the output
146179
seq[sz - 1] = '\0';
@@ -155,6 +188,7 @@ char *constructSequence(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end)
155188

156189
error:
157190
if(seq) free(seq);
191+
if(bytes) free(bytes);
158192
return NULL;
159193
}
160194

@@ -227,59 +261,23 @@ void getMask(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, uint32_t *m
227261
}
228262
}
229263

230-
/*
231-
Determine whether "start + pos" overlaps an N block. If so, update pos, return 1, and update the mask stuff (as well as update blockStart and offset). If there's no overlap, return 0.
232-
233-
Returning a value of 1 indicates that pos has been updated (i.e., there was an overlap). A value of 0 indicates otherwise.
234-
235-
A return value of -1 indicates an error (in twobitSeek).
236-
*/
237-
int cycle(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, uint32_t *pos, uint32_t *maskIdx, uint32_t *maskStart, uint32_t *maskEnd, uint32_t *blockStart, uint32_t *offset) {
238-
if(*maskIdx < tb->idx->nBlockCount[tid]) {
239-
if(end < *maskStart) {
240-
getMask(tb, tid, start, end, maskIdx, maskStart, maskEnd);
241-
if(*maskIdx < tb->idx->nBlockCount[tid]) return 0;
242-
}
243-
if((start + *pos >= *maskStart) && (start + *pos < *maskEnd)) {
244-
*pos += (*maskEnd) - (start + *pos);
245-
*blockStart = (start + *pos)/4;
246-
*offset = (start + *pos) % 4;
247-
getMask(tb, tid, start, end, maskIdx, maskStart, maskEnd);
248-
if(twobitSeek(tb, tb->idx->offset[tid] + *blockStart) != 0) return -1;
249-
return 1;
250-
}
251-
}
252-
return 0;
253-
}
254-
255-
/*
256-
Increment the counter for the appropriate base
257-
*/
258-
void increment(char base, uint32_t *A, uint32_t *C, uint32_t *T, uint32_t *G) {
259-
switch(base) {
260-
case 'T':
261-
*T += 1;
262-
break;
263-
case 'C':
264-
*C += 1;
265-
break;
266-
case 'A':
267-
*A += 1;
268-
break;
269-
case 'G':
270-
*G += 1;
271-
break;
264+
uint8_t getByteMaskFromOffset(int offset) {
265+
switch(offset) {
266+
case 0:
267+
return (uint8_t) 15;
268+
case 1:
269+
return (uint8_t) 7;
270+
case 2:
271+
return (uint8_t) 3;
272272
}
273+
return 1;
273274
}
274275

275276
void *twobitBasesWorker(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, int fraction) {
276277
void *out;
277-
uint32_t sz = end - start, pos = 0;
278-
uint32_t A = 0, C = 0, T = 0, G = 0, len = end - start;
279-
uint32_t maskIdx = -1, maskStart = -1, maskEnd = -1;
280-
uint32_t blockStart, offset;
281-
uint8_t byte, base;
282-
int rv = 0;
278+
uint32_t tmp[4] = {0, 0, 0, 0}, len = end - start + (start % 4), i = 0, j = 0;
279+
uint32_t blockStart, blockEnd, maskIdx = (uint32_t) -1, maskStart, maskEnd, foo;
280+
uint8_t *bytes = NULL, mask = 0, offset;
283281

284282
if(fraction) {
285283
out = malloc(4 * sizeof(double));
@@ -288,43 +286,109 @@ void *twobitBasesWorker(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end,
288286
}
289287
if(!out) return NULL;
290288

291-
getMask(tb, tid, start, end, &maskIdx, &maskStart, &maskEnd);
292-
293289
//There are 4 bases/byte
294290
blockStart = start/4;
295291
offset = start % 4;
292+
blockEnd = end/4 + ((end % 4) ? 1 : 0);
293+
bytes = malloc(blockEnd - blockStart);
294+
if(!bytes) goto error;
295+
296+
//Set the initial mask, reset start/offset so we always deal with full bytes
297+
mask = getByteMaskFromOffset(offset);
298+
start = 4 * blockStart;
299+
offset = 0;
296300

297301
if(twobitSeek(tb, tb->idx->offset[tid] + blockStart) != 0) goto error;
298-
while(pos < sz) {
299-
//Iterate over the 4 (or less) bases in each byte
300-
if(twobitRead(&byte, 1, 1, tb) != 1) goto error;
301-
for(; offset<4; offset++) {
302-
rv = cycle(tb, tid, start, end, &pos, &maskIdx, &maskStart, &maskEnd, &blockStart, &offset);
303-
if(rv == -1) goto error;
304-
if(rv == 1) break;
305-
base = byte2base(byte, offset);
306-
increment(base, &A, &C, &T, &G);
307-
if(++pos >= sz) break;
302+
if(twobitRead(bytes, blockEnd - blockStart, 1, tb) != 1) goto error;
303+
304+
//Get the index/start/end of the next N-mask block
305+
getMask(tb, tid, start, end, &maskIdx, &maskStart, &maskEnd);
306+
307+
while(i < len) {
308+
// Check if we need to jump
309+
if(maskIdx != -1 && start + i + 4 >= maskStart) {
310+
if(start + i >= maskStart || start + i + 4 - offset > maskStart) {
311+
//Jump iff the whole byte is inside an N block
312+
if(start + i >= maskStart && start + i + 4 - offset < maskEnd) {
313+
//iff we're fully in an N block then jump
314+
i = maskEnd - start;
315+
getMask(tb, tid, i, end, &maskIdx, &maskStart, &maskEnd);
316+
offset = (start + i) % 4;
317+
j = i / 4;
318+
mask = getByteMaskFromOffset(offset);
319+
i = 4 * j; //Now that the mask has been set, reset i to byte offsets
320+
offset = 0;
321+
continue;
322+
}
323+
324+
//Set the mask, if appropriate
325+
foo = 4*j + 4*blockStart; // The smallest position in the byte
326+
if(mask & 1 && (foo + 3 >= maskStart && foo + 3 < maskEnd)) mask -= 1;
327+
if(mask & 2 && (foo + 2 >= maskStart && foo + 2 < maskEnd)) mask -= 2;
328+
if(mask & 4 && (foo + 1 >= maskStart && foo + 1 < maskEnd)) mask -= 4;
329+
if(mask & 8 && (foo >= maskStart && foo < maskEnd)) mask -= 8;
330+
if(foo + 4 > maskEnd) {
331+
getMask(tb, tid, i, end, &maskIdx, &maskStart, &maskEnd);
332+
continue;
333+
}
334+
}
335+
}
336+
337+
//Ensure that anything after then end is masked
338+
if(i+4>=len) {
339+
if((mask & 1) && i+3>=len) mask -=1;
340+
if((mask & 2) && i+2>=len) mask -=2;
341+
if((mask & 4) && i+1>=len) mask -=4;
342+
if((mask & 8) && i>=len) mask -=8;
343+
}
344+
345+
foo = bytes[j++];
346+
//Offset 3
347+
if(mask & 1) {
348+
tmp[foo & 3]++;
349+
}
350+
foo >>= 2;
351+
mask >>= 1;
352+
//Offset 2
353+
if(mask & 1) {
354+
tmp[foo & 3]++;
355+
}
356+
foo >>= 2;
357+
mask >>= 1;
358+
//Offset 1
359+
if(mask & 1) {
360+
tmp[foo & 3]++;
361+
}
362+
foo >>= 2;
363+
mask >>= 1;
364+
//Offset 0
365+
if(mask & 1) {
366+
tmp[foo & 3]++; // offset 0
308367
}
309-
if(rv != 1) offset = 0;
368+
i += 4;
369+
mask = 15;
310370
}
371+
free(bytes);
311372

373+
//out is in TCAG order, since that's how 2bit is stored.
374+
//However, for whatever reason I went with ACTG in the first release...
312375
if(fraction) {
313-
((double*) out)[0] = ((double) A)/((double) len);
314-
((double*) out)[1] = ((double) C)/((double) len);
315-
((double*) out)[2] = ((double) T)/((double) len);
316-
((double*) out)[3] = ((double) G)/((double) len);
376+
((double*) out)[0] = ((double) tmp[2])/((double) len);
377+
((double*) out)[1] = ((double) tmp[1])/((double) len);
378+
((double*) out)[2] = ((double) tmp[0])/((double) len);
379+
((double*) out)[3] = ((double) tmp[3])/((double) len);
317380
} else {
318-
((uint32_t*) out)[0] = A;
319-
((uint32_t*) out)[1] = C;
320-
((uint32_t*) out)[2] = T;
321-
((uint32_t*) out)[3] = G;
381+
((uint32_t*) out)[0] = tmp[2];
382+
((uint32_t*) out)[1] = tmp[1];
383+
((uint32_t*) out)[2] = tmp[0];
384+
((uint32_t*) out)[3] = tmp[3];
322385
}
323386

324387
return out;
325388

326389
error:
327-
free(out);
390+
if(out) free(out);
391+
if(bytes) free(bytes);
328392
return NULL;
329393
}
330394

CHANGES.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
0.2.0:
2+
- The `twobitBasesWorker()` and `constructSequence()` functions that underly `twobitSequence()` and `twobitBases()` have been rewritten and are now much faster.
3+
4+
0.1.0:
5+
- Initial release

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
CC ?= gcc
22
AR ?= ar
33
RANLIB ?= ranlib
4-
CFLAGS ?= -g -Wall -O0
4+
CFLAGS ?= -g -Wall -O3
55
LIBS =
66
EXTRA_CFLAGS_PIC = -fpic
77
LDFLAGS =

test/test.py

+4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
#!/usr/bin/env python
2+
"""
3+
The expected output is below:
4+
5+
"""
26
from subprocess import Popen, PIPE, check_call
37
from os import remove
48

0 commit comments

Comments
 (0)