@@ -261,105 +261,130 @@ void getMask(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, uint32_t *m
261
261
}
262
262
}
263
263
264
- /*
265
- Determine whether "start + pos" overlaps an N block. If so, update pos, return 1, and update the mask stuff (as well as update blockStart and offset). If there's no overlap, return 0.
266
-
267
- Returning a value of 1 indicates that pos has been updated (i.e., there was an overlap). A value of 0 indicates otherwise.
268
-
269
- A return value of -1 indicates an error (in twobitSeek).
270
- */
271
- int cycle (TwoBit * tb , uint32_t tid , uint32_t start , uint32_t end , uint32_t * pos , uint32_t * maskIdx , uint32_t * maskStart , uint32_t * maskEnd , uint32_t * blockStart , uint32_t * offset ) {
272
- if (* maskIdx < tb -> idx -> nBlockCount [tid ]) {
273
- if (end < * maskStart ) {
274
- getMask (tb , tid , start , end , maskIdx , maskStart , maskEnd );
275
- if (* maskIdx < tb -> idx -> nBlockCount [tid ]) return 0 ;
276
- }
277
- if ((start + * pos >= * maskStart ) && (start + * pos < * maskEnd )) {
278
- * pos += (* maskEnd ) - (start + * pos );
279
- * blockStart = (start + * pos )/4 ;
280
- * offset = (start + * pos ) % 4 ;
281
- getMask (tb , tid , start , end , maskIdx , maskStart , maskEnd );
282
- if (twobitSeek (tb , tb -> idx -> offset [tid ] + * blockStart ) != 0 ) return -1 ;
283
- return 1 ;
284
- }
285
- }
286
- return 0 ;
287
- }
288
-
289
- /*
290
- Increment the counter for the appropriate base
291
- */
292
- void increment (char base , uint32_t * A , uint32_t * C , uint32_t * T , uint32_t * G ) {
293
- switch (base ) {
294
- case 'T' :
295
- * T += 1 ;
296
- break ;
297
- case 'C' :
298
- * C += 1 ;
299
- break ;
300
- case 'A' :
301
- * A += 1 ;
302
- break ;
303
- case 'G' :
304
- * G += 1 ;
305
- break ;
264
+ uint8_t getByteMaskFromOffset (int offset ) {
265
+ switch (offset ) {
266
+ case 0 :
267
+ return (uint8_t ) 15 ;
268
+ case 1 :
269
+ return (uint8_t ) 7 ;
270
+ case 2 :
271
+ return (uint8_t ) 3 ;
306
272
}
273
+ return 1 ;
307
274
}
308
275
309
276
void * twobitBasesWorker (TwoBit * tb , uint32_t tid , uint32_t start , uint32_t end , int fraction ) {
310
277
void * out ;
311
- uint32_t A = 0 , C = 0 , T = 0 , G = 0 , len = end - start , i = 0 ;
312
- uint32_t blockStart , blockEnd , offset ;
313
- char * s = NULL ;
278
+ uint32_t tmp [ 4 ] = { 0 , 0 , 0 , 0 } , len = end - start , i = 0 , j = 0 ;
279
+ uint32_t blockStart , blockEnd , maskIdx = ( uint32_t ) -1 , maskStart , maskEnd , foo ;
280
+ uint8_t * bytes = NULL , mask = 0 , offset ;
314
281
315
282
if (fraction ) {
316
283
out = malloc (4 * sizeof (double ));
317
284
} else {
318
285
out = malloc (4 * sizeof (uint32_t ));
319
286
}
320
287
if (!out ) return NULL ;
321
- s = constructSequence (tb , tid , start , end );
322
- if (!s ) goto error ;
323
-
324
- for (i = 0 ; i < len ; i ++ ) {
325
- switch (s [i ]) {
326
- case 'A' :
327
- case 'a' :
328
- A ++ ;
329
- break ;
330
- case 'C' :
331
- case 'c' :
332
- C ++ ;
333
- break ;
334
- case 'G' :
335
- case 'g' :
336
- G ++ ;
337
- break ;
338
- case 'T' :
339
- case 't' :
340
- T ++ ;
341
- break ;
288
+
289
+ //There are 4 bases/byte
290
+ blockStart = start /4 ;
291
+ offset = start % 4 ;
292
+ blockEnd = end /4 + ((end % 4 ) ? 1 : 0 );
293
+ bytes = malloc (blockEnd - blockStart );
294
+ if (!bytes ) goto error ;
295
+
296
+ //Set the initial mask
297
+ mask = getByteMaskFromOffset (offset );
298
+
299
+ if (twobitSeek (tb , tb -> idx -> offset [tid ] + blockStart ) != 0 ) goto error ;
300
+ if (twobitRead (bytes , blockEnd - blockStart , 1 , tb ) != 1 ) goto error ;
301
+
302
+ //Get the index/start/end of the next N-mask block
303
+ getMask (tb , tid , start , end , & maskIdx , & maskStart , & maskEnd );
304
+
305
+ while (i < len ) {
306
+ // Check if we need to jump
307
+ if (maskIdx != -1 && start + i + 4 >= maskStart ) {
308
+ if (start + i >= maskStart || start + i + 4 - offset > maskStart ) {
309
+ //Jump iff the whole byte is inside an N block
310
+ if (start + i >= maskStart && start + i + 4 - offset < maskEnd ) {
311
+ //iff we're fully in an N block then jump
312
+ i = maskEnd - start ;
313
+ getMask (tb , tid , i , end , & maskIdx , & maskStart , & maskEnd );
314
+ offset = (start + i ) % 4 ;
315
+ j = i / 4 ;
316
+ mask = getByteMaskFromOffset (offset );
317
+ i = 4 * j ; //Now that the mask has been set, reset i to byte offsets
318
+ offset = 0 ;
319
+ continue ;
320
+ }
321
+
322
+ //Set the mask, if appropriate
323
+ foo = 4 * j + 4 * blockStart ; // The smallest position in the byte
324
+ if (mask & 1 && (foo + 3 >= maskStart && foo + 3 < maskEnd )) mask -= 1 ;
325
+ if (mask & 2 && (foo + 2 >= maskStart && foo + 2 < maskEnd )) mask -= 2 ;
326
+ if (mask & 4 && (foo + 1 >= maskStart && foo + 1 < maskEnd )) mask -= 4 ;
327
+ if (mask & 8 && (foo >= maskStart && foo < maskEnd )) mask -= 8 ;
328
+ if (foo + 4 > maskEnd ) {
329
+ getMask (tb , tid , i , end , & maskIdx , & maskStart , & maskEnd );
330
+ continue ;
331
+ }
332
+ }
342
333
}
334
+
335
+ //Ensure that anything after then end is masked
336
+ if (i + 4 >=len ) {
337
+ if ((mask & 1 ) && i + 3 >=len ) mask -= 1 ;
338
+ if ((mask & 2 ) && i + 2 >=len ) mask -= 2 ;
339
+ if ((mask & 4 ) && i + 1 >=len ) mask -= 4 ;
340
+ if ((mask & 8 ) && i >=len ) mask -= 8 ;
341
+ }
342
+
343
+ foo = bytes [j ++ ];
344
+ //Offset 3
345
+ if (mask & 1 ) {
346
+ tmp [foo & 3 ]++ ;
347
+ }
348
+ foo >>= 2 ;
349
+ mask >>= 1 ;
350
+ //Offset 2
351
+ if (mask & 1 ) {
352
+ tmp [foo & 3 ]++ ;
353
+ }
354
+ foo >>= 2 ;
355
+ mask >>= 1 ;
356
+ //Offset 1
357
+ if (mask & 1 ) {
358
+ tmp [foo & 3 ]++ ;
359
+ }
360
+ foo >>= 2 ;
361
+ mask >>= 1 ;
362
+ //Offset 0
363
+ if (mask & 1 ) {
364
+ tmp [foo & 3 ]++ ; // offset 0
365
+ }
366
+ i += 4 ;
367
+ mask = 15 ;
343
368
}
344
- free (s );
369
+ free (bytes );
345
370
346
371
if (fraction ) {
347
- ((double * ) out )[0 ] = ((double ) A )/((double ) len );
348
- ((double * ) out )[1 ] = ((double ) C )/((double ) len );
349
- ((double * ) out )[2 ] = ((double ) T )/((double ) len );
350
- ((double * ) out )[3 ] = ((double ) G )/((double ) len );
372
+ ((double * ) out )[0 ] = ((double ) tmp [ 0 ] )/((double ) len );
373
+ ((double * ) out )[1 ] = ((double ) tmp [ 1 ] )/((double ) len );
374
+ ((double * ) out )[2 ] = ((double ) tmp [ 2 ] )/((double ) len );
375
+ ((double * ) out )[3 ] = ((double ) tmp [ 3 ] )/((double ) len );
351
376
} else {
352
- ((uint32_t * ) out )[0 ] = A ;
353
- ((uint32_t * ) out )[1 ] = C ;
354
- ((uint32_t * ) out )[2 ] = T ;
355
- ((uint32_t * ) out )[3 ] = G ;
377
+ ((uint32_t * ) out )[0 ] = tmp [ 0 ] ;
378
+ ((uint32_t * ) out )[1 ] = tmp [ 1 ] ;
379
+ ((uint32_t * ) out )[2 ] = tmp [ 2 ] ;
380
+ ((uint32_t * ) out )[3 ] = tmp [ 3 ] ;
356
381
}
357
382
358
383
return out ;
359
384
360
385
error :
361
- free (out );
362
- if (s ) free (s );
386
+ if ( out ) free (out );
387
+ if (bytes ) free (bytes );
363
388
return NULL ;
364
389
}
365
390
0 commit comments