Skip to content

Commit 6a2c193

Browse files
Anders LeungAnders Leung
authored andcommitted
BCF 2.2 writing WIP
1 parent e917800 commit 6a2c193

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+3913
-2435
lines changed

.travis.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ cache:
1313
env:
1414
global:
1515
- HTSJDK_SAMTOOLS_BIN=/usr/bin/samtools
16+
- HTSJDK_BCFTOOLS_BIN=/usr/bin/bcftools
1617
jdk:
1718
- oraclejdk8
1819
- openjdk8
@@ -32,6 +33,7 @@ matrix:
3233

3334
before_install:
3435
- scripts/install-samtools.sh
36+
- scripts/install-bcftools.sh
3537
- scripts/htsget-scripts/start-htsget-test-server.sh
3638

3739
script:

scripts/install-bcftools.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/bin/sh
# Downloads the bcftools 1.13 release tarball, builds it from source and installs it
# system-wide.
2+
# -e: abort on the first failing command; -x: echo each command as it is executed.
set -ex
3+
wget https://github.com/samtools/bcftools/releases/download/1.13/bcftools-1.13.tar.bz2
4+
tar -xjvf bcftools-1.13.tar.bz2
5+
# Install under the /usr prefix (needs sudo) -- presumably so the binary ends up at
# /usr/bin/bcftools; confirm against the path the build configuration expects.
cd bcftools-1.13 && ./configure --prefix=/usr && make && sudo make install

src/main/java/htsjdk/samtools/util/FileExtensions.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,9 @@ public final class FileExtensions {
6565
public static final String VCF = ".vcf";
6666
public static final String VCF_INDEX = TRIBBLE_INDEX;
6767
public static final String BCF = ".bcf";
68+
// Note that .bcf on its own may be gzip compressed and usually is,
69+
// but files with the extension .bcf.gz do seem to exist in the wild and should be supported
70+
public static final String COMPRESSED_BCF = ".bcf.gz";
6871
public static final String COMPRESSED_VCF = ".vcf.gz";
6972
public static final String COMPRESSED_VCF_INDEX = ".tbi";
7073
public static final List<String> VCF_LIST = Collections.unmodifiableList(Arrays.asList(VCF, COMPRESSED_VCF, BCF));
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
package htsjdk.samtools.util;
2+
3+
import java.io.IOException;
4+
import java.io.OutputStream;
5+
import java.nio.ByteBuffer;
6+
import java.util.ArrayList;
7+
import java.util.Arrays;
8+
9+
/**
 * Growable byte buffer backed by a list of fixed-size byte arrays ("blocks"), which can
 * be used to buffer data without reallocating an underlying array as the buffer grows.
 * Once data is accumulated, it can either be retrieved by converting
 * into a byte[] for interfaces that require a contiguous block of bytes,
 * or written directly to an OutputStream to avoid array copies.
 *
 * <p>Blocks are retained across {@link #reset()} so that they can be reused by later
 * writes; {@link #clear()} additionally releases every block except the first.
 *
 * <p>The number of buffered bytes is tracked as an {@code int}. This class performs no
 * synchronization.
 */
public class ListByteBufferOutputStream extends OutputStream {

    /** Capacity, in bytes, of every block in {@link #blocks}. Always positive. */
    private final int blockSize;
    /** All allocated blocks; blocks after the current one may hold stale data from before a reset. */
    private final ArrayList<byte[]> blocks;
    /** The block currently being written into. */
    private byte[] currentBlock;
    /** Index in {@link #blocks} of the block that will become current on the next advance. */
    private int nextBlockIndex;
    /** Offset within {@link #currentBlock} at which the next byte will be stored. */
    private int nextBytePosition;
    /** Total number of valid bytes written since construction or the last reset. */
    private int size;

    /**
     * Creates an empty buffer and eagerly allocates its first block.
     *
     * @param blockSize capacity in bytes of each backing block; must be positive
     * @throws IllegalArgumentException if {@code blockSize} is zero or negative
     */
    public ListByteBufferOutputStream(final int blockSize) {
        // A non-positive block size can never hold a byte: writes would either fail with an
        // index error or spin forever allocating empty blocks, so reject it up front.
        if (blockSize <= 0) {
            throw new IllegalArgumentException("blockSize must be positive, but was " + blockSize);
        }
        this.blockSize = blockSize;
        blocks = new ArrayList<>();
        nextBlockIndex = 0;
        size = 0;
        advanceBlock();
    }

    /**
     * Appends a single byte (the low-order 8 bits of {@code b}).
     *
     * @param b the byte to append
     */
    @Override
    public void write(final int b) {
        if (nextBytePosition == blockSize) {
            advanceBlock();
        }
        currentBlock[nextBytePosition++] = (byte) b;
        size++;
    }

    /**
     * Appends {@code nCopies} copies of the byte {@code b}.
     *
     * @param b       the byte value to repeat
     * @param nCopies how many copies to append; must not be negative
     * @throws IllegalArgumentException if {@code nCopies} is negative
     */
    public void write(final byte b, final int nCopies) {
        // Checked explicitly (not via assert) because a negative count would otherwise
        // silently decrement the recorded size when assertions are disabled.
        if (nCopies < 0) {
            throw new IllegalArgumentException("nCopies must not be negative, but was " + nCopies);
        }

        int bytesRemaining = nCopies;
        while (bytesRemaining > 0) {
            if (nextBytePosition == blockSize) {
                advanceBlock();
            }
            // Fill as much of the current block as is needed or available, whichever is smaller.
            final int count = Math.min(bytesRemaining, blockSize - nextBytePosition);
            Arrays.fill(currentBlock, nextBytePosition, nextBytePosition + count, b);
            nextBytePosition += count;
            bytesRemaining -= count;
        }
        size += nCopies;
    }

    /**
     * Appends the entire contents of {@code b}.
     *
     * @param b the bytes to append
     * @throws NullPointerException if {@code b} is null
     */
    @Override
    public void write(final byte[] b) {
        write(b, 0, b.length);
    }

    /**
     * Appends {@code len} bytes of {@code b}, starting at offset {@code off}.
     *
     * @param b   the source array
     * @param off index of the first byte to copy
     * @param len number of bytes to copy
     * @throws NullPointerException      if {@code b} is null
     * @throws IndexOutOfBoundsException if {@code off} or {@code len} is negative, or
     *                                   {@code off + len} exceeds {@code b.length}
     */
    @Override
    public void write(final byte[] b, int off, final int len) {
        if (b == null) {
            throw new NullPointerException("b");
        }
        // Written as len > b.length - off so that off + len cannot overflow.
        if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException(
                    "off=" + off + ", len=" + len + ", b.length=" + b.length);
        }

        int bytesRemaining = len;
        while (bytesRemaining > 0) {
            if (nextBytePosition == blockSize) {
                advanceBlock();
            }
            final int lengthToWrite = Math.min(bytesRemaining, blockSize - nextBytePosition);
            System.arraycopy(b, off, currentBlock, nextBytePosition, lengthToWrite);
            nextBytePosition += lengthToWrite;
            off += lengthToWrite;
            bytesRemaining -= lengthToWrite;
        }
        size += len;
    }

    /**
     * @return the number of bytes written since construction or the last
     *         {@link #reset()} / {@link #clear()}
     */
    public int size() {
        return size;
    }

    /**
     * Writes all buffered bytes to {@code out}, block by block, without first copying
     * them into one contiguous array. The buffer itself is left unchanged.
     *
     * @param out destination stream
     * @throws IOException if {@code out} fails to accept the data
     */
    public void writeTo(final OutputStream out) throws IOException {
        for (final byte[] block : blocks) {
            if (block == currentBlock) {
                // The current block is only partially filled, and any blocks after it
                // contain stale data left over from before a reset, so stop here.
                out.write(block, 0, nextBytePosition);
                break;
            }
            out.write(block);
        }
    }

    /**
     * Copies all buffered bytes into a newly allocated contiguous array.
     *
     * @return a fresh array of length {@link #size()} holding the buffered data
     */
    public byte[] toByteArray() {
        final byte[] bytes = new byte[size];
        final ByteBuffer buff = ByteBuffer.wrap(bytes);
        for (final byte[] block : blocks) {
            if (block == currentBlock) {
                // Only the first nextBytePosition bytes of the current block are valid;
                // later blocks (if any) are stale and must not be copied.
                buff.put(block, 0, nextBytePosition);
                break;
            }
            buff.put(block);
        }
        return bytes;
    }

    /**
     * Discards the buffered data but keeps every allocated block for reuse by
     * subsequent writes.
     */
    public void reset() {
        currentBlock = blocks.get(0);
        nextBytePosition = 0;
        nextBlockIndex = 1;
        size = 0;
    }

    /**
     * Discards the buffered data and drops every block except the first from the list.
     */
    public void clear() {
        reset();
        // blocks always has at least 1 element
        blocks.subList(1, blocks.size()).clear();
    }

    /**
     * Makes the next block current, allocating a new one only when no previously
     * allocated block is available for reuse, and rewinds the write position to its start.
     */
    private void advanceBlock() {
        if (nextBlockIndex == blocks.size()) {
            // Need to add a new block
            currentBlock = new byte[blockSize];
            blocks.add(currentBlock);
        } else {
            // Reuse old block
            currentBlock = blocks.get(nextBlockIndex);
        }
        nextBytePosition = 0;
        nextBlockIndex++;
    }
}

src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import htsjdk.tribble.index.IndexFactory;
3434
import htsjdk.tribble.readers.PositionalBufferedStream;
3535
import htsjdk.tribble.util.ParsingUtils;
36+
import htsjdk.variant.vcf.VCFFileReader;
3637

3738
import java.io.BufferedInputStream;
3839
import java.io.IOException;
@@ -252,7 +253,11 @@ private void readHeader() throws IOException {
252253
PositionalBufferedStream pbs = null;
253254
try {
254255
is = ParsingUtils.openInputStream(path, wrapper);
255-
if (IOUtil.hasBlockCompressedExtension(new URI(URLEncoder.encode(path, "UTF-8")))) {
256+
// BCFs are usually gzipped but do not have the .gz extension,
257+
// so we explicitly check for the presence of a gzip header
258+
if (IOUtil.hasBlockCompressedExtension(new URI(URLEncoder.encode(path, "UTF-8")))
259+
|| (VCFFileReader.isBCF(path) && IOUtil.isGZIPInputStream(is))
260+
) {
256261
// TODO: TEST/FIX THIS! https://github.com/samtools/htsjdk/issues/944
257262
// TODO -- warning I don't think this can work, the buffered input stream screws up position
258263
is = new GZIPInputStream(new BufferedInputStream(is));
@@ -326,7 +331,8 @@ public WFIterator() throws IOException {
326331
final InputStream inputStream = ParsingUtils.openInputStream(path, wrapper);
327332

328333
final PositionalBufferedStream pbs;
329-
if (IOUtil.hasBlockCompressedExtension(path)) {
334+
// BCFs can be gzipped but usually do not have a compressed extension, so an extra check is needed
335+
if (IOUtil.hasBlockCompressedExtension(path) || (VCFFileReader.isBCF(path) && IOUtil.isGZIPInputStream(inputStream))) {
330336
// Gzipped -- we need to buffer the GZIPInputStream methods as this class makes read() calls,
331337
// and seekableStream does not support single byte reads
332338
final InputStream is = new GZIPInputStream(new BufferedInputStream(inputStream, 512000));

src/main/java/htsjdk/tribble/util/ParsingUtils.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,8 @@ public static InputStream openInputStream(final String uri, final Function<Seeka
101101
if (URL_SCHEMES.stream().anyMatch(uri::startsWith)) {
102102
inputStream = getURLHelper(new URL(uri)).openInputStream();
103103
} else if (!IOUtil.hasScheme(uri)) {
104-
File file = new File(uri);
105-
inputStream = Files.newInputStream(file.toPath());
104+
final File file = new File(uri);
105+
inputStream = new SeekablePathStream(file.toPath(), wrapper);
106106
} else {
107107
inputStream = new SeekablePathStream(IOUtil.getPath(uri), wrapper);
108108
}

0 commit comments

Comments
 (0)