Skip to content

Commit f5048d6

Browse files
committed
Speed up bases command as well and update test.
1 parent 4fe9279 commit f5048d6

File tree

2 files changed

+119
-78
lines changed

2 files changed

+119
-78
lines changed

2bit.c

Lines changed: 102 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -261,105 +261,130 @@ void getMask(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, uint32_t *m
261261
}
262262
}
263263

264-
/*
265-
Determine whether "start + pos" overlaps an N block. If so, update pos, return 1, and update the mask stuff (as well as update blockStart and offset). If there's no overlap, return 0.
266-
267-
Returning a value of 1 indicates that pos has been updated (i.e., there was an overlap). A value of 0 indicates otherwise.
268-
269-
A return value of -1 indicates an error (in twobitSeek).
270-
*/
271-
int cycle(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, uint32_t *pos, uint32_t *maskIdx, uint32_t *maskStart, uint32_t *maskEnd, uint32_t *blockStart, uint32_t *offset) {
272-
if(*maskIdx < tb->idx->nBlockCount[tid]) {
273-
if(end < *maskStart) {
274-
getMask(tb, tid, start, end, maskIdx, maskStart, maskEnd);
275-
if(*maskIdx < tb->idx->nBlockCount[tid]) return 0;
276-
}
277-
if((start + *pos >= *maskStart) && (start + *pos < *maskEnd)) {
278-
*pos += (*maskEnd) - (start + *pos);
279-
*blockStart = (start + *pos)/4;
280-
*offset = (start + *pos) % 4;
281-
getMask(tb, tid, start, end, maskIdx, maskStart, maskEnd);
282-
if(twobitSeek(tb, tb->idx->offset[tid] + *blockStart) != 0) return -1;
283-
return 1;
284-
}
285-
}
286-
return 0;
287-
}
288-
289-
/*
290-
Increment the counter for the appropriate base
291-
*/
292-
void increment(char base, uint32_t *A, uint32_t *C, uint32_t *T, uint32_t *G) {
293-
switch(base) {
294-
case 'T':
295-
*T += 1;
296-
break;
297-
case 'C':
298-
*C += 1;
299-
break;
300-
case 'A':
301-
*A += 1;
302-
break;
303-
case 'G':
304-
*G += 1;
305-
break;
264+
uint8_t getByteMaskFromOffset(int offset) {
265+
switch(offset) {
266+
case 0:
267+
return (uint8_t) 15;
268+
case 1:
269+
return (uint8_t) 7;
270+
case 2:
271+
return (uint8_t) 3;
306272
}
273+
return 1;
307274
}
308275

309276
void *twobitBasesWorker(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, int fraction) {
310277
void *out;
311-
uint32_t A = 0, C = 0, T = 0, G = 0, len = end - start, i = 0;
312-
uint32_t blockStart, blockEnd, offset;
313-
char *s = NULL;
278+
uint32_t tmp[4] = {0, 0, 0, 0}, len = end - start, i = 0, j = 0;
279+
uint32_t blockStart, blockEnd, maskIdx = (uint32_t) -1, maskStart, maskEnd, foo;
280+
uint8_t *bytes = NULL, mask = 0, offset;
314281

315282
if(fraction) {
316283
out = malloc(4 * sizeof(double));
317284
} else {
318285
out = malloc(4 * sizeof(uint32_t));
319286
}
320287
if(!out) return NULL;
321-
s = constructSequence(tb, tid, start, end);
322-
if(!s) goto error;
323-
324-
for(i=0; i<len; i++) {
325-
switch(s[i]) {
326-
case 'A':
327-
case 'a':
328-
A++;
329-
break;
330-
case 'C':
331-
case 'c':
332-
C++;
333-
break;
334-
case 'G':
335-
case 'g':
336-
G++;
337-
break;
338-
case 'T':
339-
case 't':
340-
T++;
341-
break;
288+
289+
//There are 4 bases/byte
290+
blockStart = start/4;
291+
offset = start % 4;
292+
blockEnd = end/4 + ((end % 4) ? 1 : 0);
293+
bytes = malloc(blockEnd - blockStart);
294+
if(!bytes) goto error;
295+
296+
//Set the initial mask
297+
mask = getByteMaskFromOffset(offset);
298+
299+
if(twobitSeek(tb, tb->idx->offset[tid] + blockStart) != 0) goto error;
300+
if(twobitRead(bytes, blockEnd - blockStart, 1, tb) != 1) goto error;
301+
302+
//Get the index/start/end of the next N-mask block
303+
getMask(tb, tid, start, end, &maskIdx, &maskStart, &maskEnd);
304+
305+
while(i < len) {
306+
// Check if we need to jump
307+
if(maskIdx != -1 && start + i + 4 >= maskStart) {
308+
if(start + i >= maskStart || start + i + 4 - offset > maskStart) {
309+
//Jump iff the whole byte is inside an N block
310+
if(start + i >= maskStart && start + i + 4 - offset < maskEnd) {
311+
//iff we're fully in an N block then jump
312+
i = maskEnd - start;
313+
getMask(tb, tid, i, end, &maskIdx, &maskStart, &maskEnd);
314+
offset = (start + i) % 4;
315+
j = i / 4;
316+
mask = getByteMaskFromOffset(offset);
317+
i = 4 * j; //Now that the mask has been set, reset i to byte offsets
318+
offset = 0;
319+
continue;
320+
}
321+
322+
//Set the mask, if appropriate
323+
foo = 4*j + 4*blockStart; // The smallest position in the byte
324+
if(mask & 1 && (foo + 3 >= maskStart && foo + 3 < maskEnd)) mask -= 1;
325+
if(mask & 2 && (foo + 2 >= maskStart && foo + 2 < maskEnd)) mask -= 2;
326+
if(mask & 4 && (foo + 1 >= maskStart && foo + 1 < maskEnd)) mask -= 4;
327+
if(mask & 8 && (foo >= maskStart && foo < maskEnd)) mask -= 8;
328+
if(foo + 4 > maskEnd) {
329+
getMask(tb, tid, i, end, &maskIdx, &maskStart, &maskEnd);
330+
continue;
331+
}
332+
}
342333
}
334+
335+
//Ensure that anything after then end is masked
336+
if(i+4>=len) {
337+
if((mask & 1) && i+3>=len) mask -=1;
338+
if((mask & 2) && i+2>=len) mask -=2;
339+
if((mask & 4) && i+1>=len) mask -=4;
340+
if((mask & 8) && i>=len) mask -=8;
341+
}
342+
343+
foo = bytes[j++];
344+
//Offset 3
345+
if(mask & 1) {
346+
tmp[foo & 3]++;
347+
}
348+
foo >>= 2;
349+
mask >>= 1;
350+
//Offset 2
351+
if(mask & 1) {
352+
tmp[foo & 3]++;
353+
}
354+
foo >>= 2;
355+
mask >>= 1;
356+
//Offset 1
357+
if(mask & 1) {
358+
tmp[foo & 3]++;
359+
}
360+
foo >>= 2;
361+
mask >>= 1;
362+
//Offset 0
363+
if(mask & 1) {
364+
tmp[foo & 3]++; // offset 0
365+
}
366+
i += 4;
367+
mask = 15;
343368
}
344-
free(s);
369+
free(bytes);
345370

346371
if(fraction) {
347-
((double*) out)[0] = ((double) A)/((double) len);
348-
((double*) out)[1] = ((double) C)/((double) len);
349-
((double*) out)[2] = ((double) T)/((double) len);
350-
((double*) out)[3] = ((double) G)/((double) len);
372+
((double*) out)[0] = ((double) tmp[0])/((double) len);
373+
((double*) out)[1] = ((double) tmp[1])/((double) len);
374+
((double*) out)[2] = ((double) tmp[2])/((double) len);
375+
((double*) out)[3] = ((double) tmp[3])/((double) len);
351376
} else {
352-
((uint32_t*) out)[0] = A;
353-
((uint32_t*) out)[1] = C;
354-
((uint32_t*) out)[2] = T;
355-
((uint32_t*) out)[3] = G;
377+
((uint32_t*) out)[0] = tmp[0];
378+
((uint32_t*) out)[1] = tmp[1];
379+
((uint32_t*) out)[2] = tmp[2];
380+
((uint32_t*) out)[3] = tmp[3];
356381
}
357382

358383
return out;
359384

360385
error:
361-
free(out);
362-
if(s) free(s);
386+
if(out) free(out);
387+
if(bytes) free(bytes);
363388
return NULL;
364389
}
365390

test/test.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,20 @@
11
#!/usr/bin/env python
2+
"""
3+
The expected output is below:
4+
5+
0 chr1 150 offset 0x4a
6+
1 chr2 100 offset 0x88
7+
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNACGTACGTACGTagctagctGATCGATCGTAGCTAGCTAGCTAGCTGATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
8+
NNNNNNNNNNNNNNNNNNNNNNNNNNACGTACGTACGTagctagctGATC
9+
0 0.086667
10+
1 0.080000
11+
2 0.080000
12+
3 0.086667
13+
0 0.120000
14+
1 0.120000
15+
2 0.120000
16+
3 0.120000
17+
"""
218
from subprocess import Popen, PIPE, check_call
319
from os import remove
420

@@ -11,6 +27,6 @@
1127
except:
1228
p2 = Popen(["md5"], stdin=p1.stdout, stdout=PIPE)
1329
md5sum = p2.communicate()[0].strip().split()[0]
14-
assert(md5sum == "0274c32c7f3dd75e8991f6107dca6a5f")
30+
assert(md5sum == "55e28106e51c37e9191846f0b69a7b52")
1531

1632
print("Passed!")

0 commit comments

Comments
 (0)