@@ -62,6 +62,41 @@ char byte2base(uint8_t byte, int offset) {
62
62
return bases [foo ];
63
63
}
64
64
65
+ void bytes2bases (char * seq , uint8_t * byte , uint32_t sz , int offset ) {
66
+ uint32_t pos = 0 , remainder = 0 , i = 0 ;
67
+ char bases [4 ] = "TCAG" ;
68
+ uint8_t foo = byte [0 ];
69
+
70
+ // Deal with the first partial byte
71
+ if (offset != 0 ) {
72
+ while (offset < 4 ) {
73
+ seq [pos ++ ] = byte2base (foo , offset ++ );
74
+ }
75
+ foo = byte [++ i ];
76
+ }
77
+
78
+ // Deal with everything else, with the possible exception of the last fractional byte
79
+ remainder = (sz - pos ) % 4 ;
80
+ while (pos < sz - remainder ) {
81
+ foo = byte [i ++ ];
82
+ seq [pos + 3 ] = bases [foo & 3 ];
83
+ foo >>= 2 ;
84
+ seq [pos + 2 ] = bases [foo & 3 ];
85
+ foo >>= 2 ;
86
+ seq [pos + 1 ] = bases [foo & 3 ];
87
+ foo >>= 2 ;
88
+ seq [pos ] = bases [foo & 3 ];
89
+ foo >>= 2 ;
90
+ pos += 4 ;
91
+ }
92
+
93
+ // Deal with the last partial byte
94
+ if (remainder > 0 ) foo = byte [i ];
95
+ for (offset = 0 ; offset < remainder ; offset ++ ) {
96
+ seq [pos ++ ] = byte2base (foo , offset );
97
+ }
98
+ }
99
+
65
100
/*
66
101
Replace Ts (or whatever else is being used) with N as appropriate
67
102
*/
@@ -122,25 +157,23 @@ void softMask(char *seq, TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end)
122
157
This is the worker function for twobitSequence, which mostly does error checking
123
158
*/
124
159
char * constructSequence (TwoBit * tb , uint32_t tid , uint32_t start , uint32_t end ) {
125
- uint32_t sz = end - start + 1 , pos = 0 ;
126
- uint32_t blockStart , offset ;
127
- char * seq = malloc (sz * sizeof (char )), byte ;
160
+ uint32_t sz = end - start + 1 ;
161
+ uint32_t blockStart , blockEnd , offset ;
162
+ char * seq = malloc (sz * sizeof (char ));
163
+ uint8_t * bytes = NULL ;
128
164
if (!seq ) return NULL ;
129
165
130
166
//There are 4 bases/byte
131
167
blockStart = start /4 ;
132
168
offset = start % 4 ;
169
+ blockEnd = end /4 + ((end % 4 ) ? 1 : 0 );
170
+ bytes = malloc (blockEnd - blockStart );
171
+ if (!bytes ) goto error ;
133
172
134
173
if (twobitSeek (tb , tb -> idx -> offset [tid ] + blockStart ) != 0 ) goto error ;
135
- while (pos < sz - 1 ) {
136
- if (twobitRead (& byte , 1 , 1 , tb ) != 1 ) goto error ;
137
-
138
- for (; offset < 4 ; offset ++ ) {
139
- seq [pos ++ ] = byte2base (byte , offset );
140
- if (pos >= sz - 1 ) break ;
141
- }
142
- offset = 0 ;
143
- }
174
+ if (twobitRead (bytes , blockEnd - blockStart , 1 , tb ) != 1 ) goto error ;
175
+ bytes2bases (seq , bytes , sz - 1 , offset );
176
+ free (bytes );
144
177
145
178
//Null terminate the output
146
179
seq [sz - 1 ] = '\0' ;
@@ -155,6 +188,7 @@ char *constructSequence(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end)
155
188
156
189
error :
157
190
if (seq ) free (seq );
191
+ if (bytes ) free (bytes );
158
192
return NULL ;
159
193
}
160
194
@@ -227,59 +261,23 @@ void getMask(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, uint32_t *m
227
261
}
228
262
}
229
263
230
- /*
231
- Determine whether "start + pos" overlaps an N block. If so, update pos, return 1, and update the mask stuff (as well as update blockStart and offset). If there's no overlap, return 0.
232
-
233
- Returning a value of 1 indicates that pos has been updated (i.e., there was an overlap). A value of 0 indicates otherwise.
234
-
235
- A return value of -1 indicates an error (in twobitSeek).
236
- */
237
- int cycle (TwoBit * tb , uint32_t tid , uint32_t start , uint32_t end , uint32_t * pos , uint32_t * maskIdx , uint32_t * maskStart , uint32_t * maskEnd , uint32_t * blockStart , uint32_t * offset ) {
238
- if (* maskIdx < tb -> idx -> nBlockCount [tid ]) {
239
- if (end < * maskStart ) {
240
- getMask (tb , tid , start , end , maskIdx , maskStart , maskEnd );
241
- if (* maskIdx < tb -> idx -> nBlockCount [tid ]) return 0 ;
242
- }
243
- if ((start + * pos >= * maskStart ) && (start + * pos < * maskEnd )) {
244
- * pos += (* maskEnd ) - (start + * pos );
245
- * blockStart = (start + * pos )/4 ;
246
- * offset = (start + * pos ) % 4 ;
247
- getMask (tb , tid , start , end , maskIdx , maskStart , maskEnd );
248
- if (twobitSeek (tb , tb -> idx -> offset [tid ] + * blockStart ) != 0 ) return -1 ;
249
- return 1 ;
250
- }
251
- }
252
- return 0 ;
253
- }
254
-
255
- /*
256
- Increment the counter for the appropriate base
257
- */
258
- void increment (char base , uint32_t * A , uint32_t * C , uint32_t * T , uint32_t * G ) {
259
- switch (base ) {
260
- case 'T' :
261
- * T += 1 ;
262
- break ;
263
- case 'C' :
264
- * C += 1 ;
265
- break ;
266
- case 'A' :
267
- * A += 1 ;
268
- break ;
269
- case 'G' :
270
- * G += 1 ;
271
- break ;
264
+ uint8_t getByteMaskFromOffset (int offset ) {
265
+ switch (offset ) {
266
+ case 0 :
267
+ return (uint8_t ) 15 ;
268
+ case 1 :
269
+ return (uint8_t ) 7 ;
270
+ case 2 :
271
+ return (uint8_t ) 3 ;
272
272
}
273
+ return 1 ;
273
274
}
274
275
275
276
void * twobitBasesWorker (TwoBit * tb , uint32_t tid , uint32_t start , uint32_t end , int fraction ) {
276
277
void * out ;
277
- uint32_t sz = end - start , pos = 0 ;
278
- uint32_t A = 0 , C = 0 , T = 0 , G = 0 , len = end - start ;
279
- uint32_t maskIdx = -1 , maskStart = -1 , maskEnd = -1 ;
280
- uint32_t blockStart , offset ;
281
- uint8_t byte , base ;
282
- int rv = 0 ;
278
+ uint32_t tmp [4 ] = {0 , 0 , 0 , 0 }, len = end - start + (start % 4 ), i = 0 , j = 0 ;
279
+ uint32_t blockStart , blockEnd , maskIdx = (uint32_t ) -1 , maskStart , maskEnd , foo ;
280
+ uint8_t * bytes = NULL , mask = 0 , offset ;
283
281
284
282
if (fraction ) {
285
283
out = malloc (4 * sizeof (double ));
@@ -288,43 +286,109 @@ void *twobitBasesWorker(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end,
288
286
}
289
287
if (!out ) return NULL ;
290
288
291
- getMask (tb , tid , start , end , & maskIdx , & maskStart , & maskEnd );
292
-
293
289
//There are 4 bases/byte
294
290
blockStart = start /4 ;
295
291
offset = start % 4 ;
292
+ blockEnd = end /4 + ((end % 4 ) ? 1 : 0 );
293
+ bytes = malloc (blockEnd - blockStart );
294
+ if (!bytes ) goto error ;
295
+
296
+ //Set the initial mask, reset start/offset so we always deal with full bytes
297
+ mask = getByteMaskFromOffset (offset );
298
+ start = 4 * blockStart ;
299
+ offset = 0 ;
296
300
297
301
if (twobitSeek (tb , tb -> idx -> offset [tid ] + blockStart ) != 0 ) goto error ;
298
- while (pos < sz ) {
299
- //Iterate over the 4 (or less) bases in each byte
300
- if (twobitRead (& byte , 1 , 1 , tb ) != 1 ) goto error ;
301
- for (; offset < 4 ; offset ++ ) {
302
- rv = cycle (tb , tid , start , end , & pos , & maskIdx , & maskStart , & maskEnd , & blockStart , & offset );
303
- if (rv == -1 ) goto error ;
304
- if (rv == 1 ) break ;
305
- base = byte2base (byte , offset );
306
- increment (base , & A , & C , & T , & G );
307
- if (++ pos >= sz ) break ;
302
+ if (twobitRead (bytes , blockEnd - blockStart , 1 , tb ) != 1 ) goto error ;
303
+
304
+ //Get the index/start/end of the next N-mask block
305
+ getMask (tb , tid , start , end , & maskIdx , & maskStart , & maskEnd );
306
+
307
+ while (i < len ) {
308
+ // Check if we need to jump
309
+ if (maskIdx != -1 && start + i + 4 >= maskStart ) {
310
+ if (start + i >= maskStart || start + i + 4 - offset > maskStart ) {
311
+ //Jump iff the whole byte is inside an N block
312
+ if (start + i >= maskStart && start + i + 4 - offset < maskEnd ) {
313
+ //iff we're fully in an N block then jump
314
+ i = maskEnd - start ;
315
+ getMask (tb , tid , i , end , & maskIdx , & maskStart , & maskEnd );
316
+ offset = (start + i ) % 4 ;
317
+ j = i / 4 ;
318
+ mask = getByteMaskFromOffset (offset );
319
+ i = 4 * j ; //Now that the mask has been set, reset i to byte offsets
320
+ offset = 0 ;
321
+ continue ;
322
+ }
323
+
324
+ //Set the mask, if appropriate
325
+ foo = 4 * j + 4 * blockStart ; // The smallest position in the byte
326
+ if (mask & 1 && (foo + 3 >= maskStart && foo + 3 < maskEnd )) mask -= 1 ;
327
+ if (mask & 2 && (foo + 2 >= maskStart && foo + 2 < maskEnd )) mask -= 2 ;
328
+ if (mask & 4 && (foo + 1 >= maskStart && foo + 1 < maskEnd )) mask -= 4 ;
329
+ if (mask & 8 && (foo >= maskStart && foo < maskEnd )) mask -= 8 ;
330
+ if (foo + 4 > maskEnd ) {
331
+ getMask (tb , tid , i , end , & maskIdx , & maskStart , & maskEnd );
332
+ continue ;
333
+ }
334
+ }
335
+ }
336
+
337
+ //Ensure that anything after then end is masked
338
+ if (i + 4 >=len ) {
339
+ if ((mask & 1 ) && i + 3 >=len ) mask -= 1 ;
340
+ if ((mask & 2 ) && i + 2 >=len ) mask -= 2 ;
341
+ if ((mask & 4 ) && i + 1 >=len ) mask -= 4 ;
342
+ if ((mask & 8 ) && i >=len ) mask -= 8 ;
343
+ }
344
+
345
+ foo = bytes [j ++ ];
346
+ //Offset 3
347
+ if (mask & 1 ) {
348
+ tmp [foo & 3 ]++ ;
349
+ }
350
+ foo >>= 2 ;
351
+ mask >>= 1 ;
352
+ //Offset 2
353
+ if (mask & 1 ) {
354
+ tmp [foo & 3 ]++ ;
355
+ }
356
+ foo >>= 2 ;
357
+ mask >>= 1 ;
358
+ //Offset 1
359
+ if (mask & 1 ) {
360
+ tmp [foo & 3 ]++ ;
361
+ }
362
+ foo >>= 2 ;
363
+ mask >>= 1 ;
364
+ //Offset 0
365
+ if (mask & 1 ) {
366
+ tmp [foo & 3 ]++ ; // offset 0
308
367
}
309
- if (rv != 1 ) offset = 0 ;
368
+ i += 4 ;
369
+ mask = 15 ;
310
370
}
371
+ free (bytes );
311
372
373
+ //out is in TCAG order, since that's how 2bit is stored.
374
+ //However, for whatever reason I went with ACTG in the first release...
312
375
if (fraction ) {
313
- ((double * ) out )[0 ] = ((double ) A )/((double ) len );
314
- ((double * ) out )[1 ] = ((double ) C )/((double ) len );
315
- ((double * ) out )[2 ] = ((double ) T )/((double ) len );
316
- ((double * ) out )[3 ] = ((double ) G )/((double ) len );
376
+ ((double * ) out )[0 ] = ((double ) tmp [ 2 ] )/((double ) len );
377
+ ((double * ) out )[1 ] = ((double ) tmp [ 1 ] )/((double ) len );
378
+ ((double * ) out )[2 ] = ((double ) tmp [ 0 ] )/((double ) len );
379
+ ((double * ) out )[3 ] = ((double ) tmp [ 3 ] )/((double ) len );
317
380
} else {
318
- ((uint32_t * ) out )[0 ] = A ;
319
- ((uint32_t * ) out )[1 ] = C ;
320
- ((uint32_t * ) out )[2 ] = T ;
321
- ((uint32_t * ) out )[3 ] = G ;
381
+ ((uint32_t * ) out )[0 ] = tmp [ 2 ] ;
382
+ ((uint32_t * ) out )[1 ] = tmp [ 1 ] ;
383
+ ((uint32_t * ) out )[2 ] = tmp [ 0 ] ;
384
+ ((uint32_t * ) out )[3 ] = tmp [ 3 ] ;
322
385
}
323
386
324
387
return out ;
325
388
326
389
error :
327
- free (out );
390
+ if (out ) free (out );
391
+ if (bytes ) free (bytes );
328
392
return NULL ;
329
393
}
330
394
0 commit comments