Skip to content
This repository was archived by the owner on Feb 23, 2024. It is now read-only.

Commit a61bd62

Browse files
committed
Move bundled fasta files to a separate crate.
It's not great to make the linker process 25MB of fasta files unless it's actually required. Also move the binaries, with their dependency on exons (which shouldn't have been published, but too late now...).
1 parent dbc1c77 commit a61bd62

36 files changed

+192
-138
lines changed

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ members = [
2222
"string_utils",
2323
"tables",
2424
"vdj_ann",
25+
"vdj_ann_ref",
2526
"vdj_types",
2627
"vector_utils",
2728
]

vdj_ann/Cargo.toml

-3
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,14 @@ align_tools = "0.1"
1515
amino = "0.1"
1616
bio_edit = "0.1"
1717
debruijn = "0.3.2"
18-
exons = "0.1"
1918
fasta_tools = "0.1"
2019
flate2 = "1.0.20"
2120
hyperbase = "0.1"
2221
io_utils = "0.2"
2322
itertools = ">= 0.8, <= 0.11"
2423
kmer_lookup = "0.1"
25-
pretty_trace = "0.5"
2624
serde = { version = "1.0", features = ["derive"] }
2725
serde_json = "1.0.62"
28-
sha2 = "0.9.3"
2926
stats_utils = "0.1"
3027
string_utils = "0.1"
3128
vector_utils = "0.1"

vdj_ann/src/annotate.rs

+24-59
Original file line numberDiff line numberDiff line change
@@ -482,9 +482,7 @@ pub fn annotate_seq_core(
482482
let mut mis = semi[i].4.clone();
483483
let mut mis_count = 0;
484484
while l > 0 && l + off > 0 {
485-
if b.get((l - 1_i32) as usize)
486-
!= refs[t as usize].get((l + off - 1_i32) as usize)
487-
{
485+
if b.get((l - 1_i32) as usize) != refs[t as usize].get((l + off - 1_i32) as usize) {
488486
mis.push(l - 1);
489487
mis_count += 1;
490488
}
@@ -1208,11 +1206,13 @@ pub fn annotate_seq_core(
12081206
{
12091207
win1 = true;
12101208
} else if zstop1 == 0 && zstop2 > 0 {
1211-
} else if (outside2 <= 10.0 || total2 - share <= 10) && (m1 < m2
1209+
} else if (outside2 <= 10.0 || total2 - share <= 10)
1210+
&& (m1 < m2
12121211
|| (m1 == m2 && err1 < err2 && !c2)
12131212
|| (m1 == m2 && err1 == err2 && outside1 > outside2)
12141213
|| (m1 == m2 && err1 == err2 && outside1 == outside2 && t1 < t2)
1215-
|| c1) {
1214+
|| c1)
1215+
{
12161216
win1 = true;
12171217
}
12181218

@@ -1223,11 +1223,13 @@ pub fn annotate_seq_core(
12231223
{
12241224
win2 = true;
12251225
} else if zstop2 == 0 && zstop1 > 0 {
1226-
} else if (outside1 <= 10.0 || total1 - share <= 10) && (m2 < m1
1226+
} else if (outside1 <= 10.0 || total1 - share <= 10)
1227+
&& (m2 < m1
12271228
|| (m2 == m1 && err2 < err1 && !c1)
12281229
|| (m2 == m1 && err2 == err1 && outside2 > outside1)
12291230
|| (m2 == m1 && err2 == err1 && outside2 == outside1 && t2 < t1)
1230-
|| c2) {
1231+
|| c2)
1232+
{
12311233
win2 = true;
12321234
}
12331235
if win2 {
@@ -1307,14 +1309,10 @@ pub fn annotate_seq_core(
13071309
&& err1 == err2
13081310
&& t1 < t2
13091311
{
1310-
if refdata.name[t1] == *"TRBC1"
1311-
&& refdata.name[t2] == *"TRBC2"
1312-
{
1312+
if refdata.name[t1] == *"TRBC1" && refdata.name[t2] == *"TRBC2" {
13131313
continue;
13141314
}
1315-
if refdata.name[t2] == *"TRBC1"
1316-
&& refdata.name[t1] == *"TRBC2"
1317-
{
1315+
if refdata.name[t2] == *"TRBC1" && refdata.name[t1] == *"TRBC2" {
13181316
continue;
13191317
}
13201318
win1 = true;
@@ -1460,7 +1458,10 @@ pub fn annotate_seq_core(
14601458
let mut have_v = false;
14611459
for i2 in 0..annx.len() {
14621460
let t2 = annx[i2].2 as usize;
1463-
if !rheaders[t2].contains("segment") && refdata.segtype[t2] == *"V" && refdata.rtype[t1] == refdata.rtype[t2] {
1461+
if !rheaders[t2].contains("segment")
1462+
&& refdata.segtype[t2] == *"V"
1463+
&& refdata.rtype[t1] == refdata.rtype[t2]
1464+
{
14641465
have_v = true;
14651466
}
14661467
}
@@ -1652,14 +1653,10 @@ pub fn annotate_seq_core(
16521653
}
16531654

16541655
if mis1 == mis2 {
1655-
if refdata.name[t1] == *"TRBC1"
1656-
&& refdata.name[t2] == *"TRBC2"
1657-
{
1656+
if refdata.name[t1] == *"TRBC1" && refdata.name[t2] == *"TRBC2" {
16581657
continue;
16591658
}
1660-
if refdata.name[t2] == *"TRBC1"
1661-
&& refdata.name[t1] == *"TRBC2"
1662-
{
1659+
if refdata.name[t2] == *"TRBC1" && refdata.name[t1] == *"TRBC2" {
16631660
continue;
16641661
}
16651662
}
@@ -2819,17 +2816,19 @@ pub fn make_annotation_units(
28192816
}
28202817
let mut entries = 1;
28212818
let mut len = ann[j].1;
2822-
if j < ann.len() - 1 && ann[j + 1].2 as usize == t && ((ann[j].0 + ann[j].1 == ann[j + 1].0 && ann[j].3 + ann[j].1 < ann[j + 1].3)
2823-
|| (ann[j].0 + ann[j].1 < ann[j + 1].0 && ann[j].3 + ann[j].1 == ann[j + 1].3)) {
2819+
if j < ann.len() - 1
2820+
&& ann[j + 1].2 as usize == t
2821+
&& ((ann[j].0 + ann[j].1 == ann[j + 1].0 && ann[j].3 + ann[j].1 < ann[j + 1].3)
2822+
|| (ann[j].0 + ann[j].1 < ann[j + 1].0 && ann[j].3 + ann[j].1 == ann[j + 1].3))
2823+
{
28242824
entries = 2;
28252825
len += ann[j + 1].1;
28262826
}
28272827
let mut score = len as usize;
28282828
if refdata.segtype[t] == *"V" && ann[j].3 == 0 {
28292829
score += 1_000_000;
28302830
}
2831-
if refdata.segtype[t] == *"J"
2832-
&& (ann[j].3 + ann[j].1) as usize == refdata.refs[t].len()
2831+
if refdata.segtype[t] == *"J" && (ann[j].3 + ann[j].1) as usize == refdata.refs[t].len()
28332832
{
28342833
score += 1_000_000;
28352834
}
@@ -2852,41 +2851,7 @@ pub fn make_annotation_units(
28522851
#[cfg(test)]
28532852
mod tests {
28542853
use super::*;
2855-
use crate::refx::{human_ref, make_vdj_ref_data_core};
2856-
use crate::{annotate, refx};
2857-
2858-
// The following test checks for alignment of a D region. This example was fixed by code
2859-
// changes in March 2020.
2860-
2861-
#[test]
2862-
fn test_d_region_alignment() {
2863-
use annotate::{annotate_seq, DnaString, RefData};
2864-
let seq = DnaString::from_acgt_bytes(
2865-
b"GGAGGTGCGAATGACTCTGCTCTCTGTCCTGTCTCCTCATCTGCAAAATTAGGAAGCCTGTCTTGATTATCTCCAGGAA\
2866-
CCTCCCACCTCTTCATTCCAGCCTCTGACAAACTCTGCACATTAGGCCAGGAGAAGCCCCCGAGCCAAGTCTCTTTTCTCATTCTC\
2867-
TTCCAACAAGTGCTTGGAGCTCCAAGAAGGCCCCCTTTGCACTATGAGCAACCAGGTGCTCTGCTGTGTGGTCCTTTGTCTCCTGG\
2868-
GAGCAAACACCGTGGATGGTGGAATCACTCAGTCCCCAAAGTACCTGTTCAGAAAGGAAGGACAGAATGTGACCCTGAGTTGTGAA\
2869-
CAGAATTTGAACCACGATGCCATGTACTGGTACCGACAGGACCCAGGGCAAGGGCTGAGATTGATCTACTACTCACAGATAGTAAA\
2870-
TGACTTTCAGAAAGGAGATATAGCTGAAGGGTACAGCGTCTCTCGGGAGAAGAAGGAATCCTTTCCTCTCACTGTGACATCGGCCC\
2871-
AAAAGAACCCGACAGCTTTCTATCTCTGTGCCAGTAGTATTTTTCTTGCCGGGACAGGGGGCTGGAGCGGCACTGAAGCTTTCTTT\
2872-
GGACAAGGCACCAGACTCACAGTTGTAGAGGACCTGAACAAGGTGTTCCCACCCGAGGTCGCTGTGTTTGAGCCATCAGA",
2873-
);
2874-
let (refx, ext_refx) = (human_ref(), String::new());
2875-
let (is_tcr, is_bcr) = (true, false);
2876-
let mut refdata = RefData::new();
2877-
make_vdj_ref_data_core(&mut refdata, &refx, &ext_refx, is_tcr, is_bcr, None);
2878-
let mut ann = Vec::<(i32, i32, i32, i32, i32)>::new();
2879-
annotate_seq(&seq, &refdata, &mut ann, true, false, true);
2880-
let mut have_d = false;
2881-
for i in 0..ann.len() {
2882-
if refdata.is_d(ann[i].2 as usize) {
2883-
have_d = true;
2884-
}
2885-
}
2886-
if !have_d {
2887-
panic!("\nFailed to find alignment of D region.\n");
2888-
}
2889-
}
2854+
use crate::refx;
28902855

28912856
#[test]
28922857
fn test_no_internal_soft_clipping() {

vdj_ann/src/refx.rs

-76
Original file line numberDiff line numberDiff line change
@@ -17,42 +17,6 @@ use std::collections::{HashMap, HashSet};
1717
use string_utils::TextUtils;
1818
use vector_utils::erase_if;
1919

20-
pub fn human_ref() -> String {
21-
include_str!["../vdj_refs/human/fasta/regions.fa"].to_string()
22-
}
23-
24-
pub fn human_supp_ref() -> String {
25-
include_str!["../vdj_refs/human/fasta/supp_regions.fa"].to_string()
26-
}
27-
28-
pub fn human_ref_2_0() -> String {
29-
include_str!["../vdj_refs_2.0/human/fasta/regions.fa"].to_string()
30-
}
31-
32-
pub fn human_ref_3_1() -> String {
33-
include_str!["../vdj_refs_3.1/human/fasta/regions.fa"].to_string()
34-
}
35-
36-
pub fn human_ref_4_0() -> String {
37-
include_str!["../vdj_refs_4.0/human/fasta/regions.fa"].to_string()
38-
}
39-
40-
pub fn mouse_ref() -> String {
41-
include_str!["../vdj_refs/mouse/fasta/regions.fa"].to_string()
42-
}
43-
44-
pub fn mouse_supp_ref() -> String {
45-
include_str!["../vdj_refs/mouse/fasta/supp_regions.fa"].to_string()
46-
}
47-
48-
pub fn mouse_ref_3_1() -> String {
49-
include_str!["../vdj_refs_3.1/mouse/fasta/regions.fa"].to_string()
50-
}
51-
52-
pub fn mouse_ref_4_0() -> String {
53-
include_str!["../vdj_refs_4.0/mouse/fasta/regions.fa"].to_string()
54-
}
55-
5620
// RefData: this is a packaging of reference data appropriate for VDJ analysis.
5721

5822
#[derive(Default)]
@@ -304,43 +268,3 @@ pub fn make_vdj_ref_data_core(
304268
}
305269
}
306270
}
307-
308-
pub fn make_vdj_ref_data(
309-
refdata: &mut RefData,
310-
imgt: bool,
311-
species: &String,
312-
extended: bool,
313-
is_tcr: bool,
314-
is_bcr: bool,
315-
) {
316-
let mut refx = String::new();
317-
let mut ext_refx = String::new();
318-
if !imgt && species == "human" {
319-
refx = human_ref();
320-
if extended {
321-
ext_refx = human_supp_ref();
322-
}
323-
}
324-
if !imgt && species == "mouse" {
325-
refx = mouse_ref();
326-
if extended {
327-
ext_refx = mouse_supp_ref();
328-
}
329-
}
330-
if imgt && species == "human" {
331-
refx = read_to_string_safe(
332-
"/mnt/opt/refdata_cellranger/vdj/\
333-
vdj_IMGT_20170916-2.1.0/fasta/regions.fa",
334-
);
335-
}
336-
if imgt && species == "mouse" {
337-
refx = read_to_string_safe(
338-
"/mnt/opt/refdata_cellranger/vdj/\
339-
vdj_IMGT_mouse_20180723-2.2.0/fasta/regions.fa",
340-
);
341-
}
342-
if refx.is_empty() {
343-
panic!("Reference file has zero length.");
344-
}
345-
make_vdj_ref_data_core(refdata, &refx, &ext_refx, is_tcr, is_bcr, None);
346-
}

vdj_ann_ref/Cargo.toml

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
[package]
2+
name = "vdj_ann_ref"
3+
version = "0.1.6"
4+
authors = ["David Jaffe <[email protected]>"]
5+
edition = "2018"
6+
license = "MIT"
7+
description = "Some tools that are 'internal' for now because they are insufficiently refined and unstable, but which are used by other 'public' crates."
8+
repository = "https://github.com/10XGenomics/rust-toolbox"
9+
10+
# This crate is not published because it is too big.
11+
publish = false
12+
13+
[dependencies]
14+
align_tools = "0.1"
15+
amino = "0.1"
16+
bio_edit = "0.1"
17+
debruijn = "0.3.2"
18+
exons = "0.1"
19+
fasta_tools = "0.1"
20+
flate2 = "1.0.20"
21+
hyperbase = "0.1"
22+
io_utils = "0.2"
23+
itertools = ">= 0.8, <= 0.11"
24+
kmer_lookup = "0.1"
25+
pretty_trace = "0.5"
26+
serde = { version = "1.0", features = ["derive"] }
27+
serde_json = "1.0.62"
28+
sha2 = "0.9.3"
29+
stats_utils = "0.1"
30+
string_utils = "0.1"
31+
vector_utils = "0.1"
32+
strum = ">=0.18.0, <0.22"
33+
strum_macros = ">=0.18.0, <0.22"
34+
vdj_types = "0.1"
35+
vdj_ann = { path = "../vdj_ann" }
File renamed without changes.
File renamed without changes.

0 commit comments

Comments
 (0)