Skip to content

Commit 961a14c

Browse files
authored
Merge pull request #690 from ARTbio/optimize_bowtie
Optimize bowtie2 task in repenrich2 tool
2 parents b3b166a + 9c8ae25 commit 961a14c

5 files changed

+31
-30
lines changed

tools/repenrich2/RepEnrich2.py

+9-8
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,13 @@ def run_bowtie(args):
109109
'''
110110
write to files to save memory
111111
'''
112-
metagenome, fastqfile = args
113-
b_opt = "-k 1 -p 1 --quiet --no-hd --no-unal"
114-
command = shlex.split(f"bowtie2 {b_opt} -x {metagenome} {fastqfile}")
112+
metagenome = args
113+
b_opt = "-k 1 -p 2 --quiet --no-hd --no-unal"
114+
if paired_end is True:
115+
command = shlex.split(f"bowtie2 {b_opt} -x {metagenome}"
116+
f" -1 {fastqfile_1} -2 {fastqfile_1}")
117+
else:
118+
command = shlex.split(f"bowtie2 {b_opt} -x {metagenome} {fastqfile_1}")
115119
bowtie_align = subprocess.run(command, check=True,
116120
capture_output=True, text=True).stdout
117121
bowtie_align = bowtie_align.rstrip('\r\n').split('\n')
@@ -123,17 +127,14 @@ def run_bowtie(args):
123127

124128

125129
# multimapper parsing
126-
args_list = [(metagenome, fastqfile_1) for metagenome in repeat_list]
127-
if paired_end:
128-
args_list.extend([(metagenome, fastqfile_2) for
129-
metagenome in repeat_list])
130+
args_list = [metagenome for metagenome in repeat_list]
130131
with ProcessPoolExecutor(max_workers=cpus) as executor:
131132
results = executor.map(run_bowtie, args_list)
132133

133134
# Aggregate results (avoiding race conditions)
134135
metagenome_reads = defaultdict(list) # metagenome: list of multimap reads
135136

136-
# Now we read .reads file to populate metagnomes_reads
137+
# Now we read .reads files to populate metagenome_reads
137138
for metagenome in repeat_list:
138139
with open(f"{metagenome}.reads") as readfile:
139140
for read in readfile:

tools/repenrich2/macros.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<macros>
22
<token name="@TOOL_VERSION@">2.31.1</token>
3-
<token name="@VERSION_SUFFIX@">8</token>
3+
<token name="@VERSION_SUFFIX@">9</token>
44
<token name="@PROFILE@">23.0</token>
55

66
<xml name="repenrich_requirements">
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
DNA 184.0
22
LINE 227.0
3-
LTR 27179.0
4-
Low_complexity 0.67
3+
LTR 27175.0
4+
Low_complexity 0.0
55
RC 0.0
6-
Simple_repeat 90.33
6+
Simple_repeat 91.0
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
CMC-Transib 30.0
22
CR1 4.0
33
Copia 25880.0
4-
Gypsy 1238.0
4+
Gypsy 1234.0
55
Helitron 0.0
66
Jockey 107.0
77
LOA 0.0
8-
Low_complexity 0.67
8+
Low_complexity 0.0
99
P 60.0
1010
Pao 61.0
1111
R1 116.0
12-
Simple_repeat 90.33
12+
Simple_repeat 91.0
1313
TcMar-Tc1 94.0

tools/repenrich2/test-data/chrY_paired_fraction_counts.tab

+15-15
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ BS LINE Jockey 0.0
88
BS2 LINE Jockey 58.0
99
BURDOCK_I-int LTR Gypsy 0.0
1010
Baggins1 LINE LOA 0.0
11-
Bica_I-int LTR Gypsy 49.0
12-
Bica_LTR LTR Gypsy 1.0
11+
Bica_I-int LTR Gypsy 50.0
12+
Bica_LTR LTR Gypsy 0.0
1313
CIRCE LTR Gypsy 0.0
1414
Chouto_I-int LTR Gypsy 1.5
1515
Copia1-I_DM LTR Copia 0.0
@@ -37,7 +37,7 @@ G3_DM LINE Jockey 0.0
3737
G5A_DM LINE Jockey 0.0
3838
G5_DM LINE Jockey 0.0
3939
G6_DM LINE Jockey 0.0
40-
GA-rich Low_complexity Low_complexity 0.67
40+
GA-rich Low_complexity Low_complexity 0.0
4141
GTWIN_I-int LTR Gypsy 12.5
4242
G_DM LINE Jockey 0.0
4343
Gypsy11_I-int LTR Gypsy 0.0
@@ -48,7 +48,7 @@ Gypsy2-LTR_DM LTR Gypsy 0.0
4848
Gypsy3_LTR LTR Gypsy 0.0
4949
Gypsy4_I-int LTR Gypsy 0.0
5050
Gypsy5_I-int LTR Gypsy 0.0
51-
Gypsy6A_LTR LTR Gypsy 1.0
51+
Gypsy6A_LTR LTR Gypsy 0.0
5252
Gypsy6_I-int LTR Gypsy 31.0
5353
Gypsy8_I-int LTR Gypsy 0.0
5454
Gypsy8_LTR LTR Gypsy 0.0
@@ -73,29 +73,29 @@ MAX_I-int LTR Pao 56.0
7373
MAX_LTR LTR Pao 2.0
7474
MDG1_I-int LTR Gypsy 0.0
7575
MDG1_LTR LTR Gypsy 0.0
76-
MDG3_I-int LTR Gypsy 156.5
77-
MDG3_LTR LTR Gypsy 2.5
76+
MDG3_I-int LTR Gypsy 156.0
77+
MDG3_LTR LTR Gypsy 3.0
7878
MICROPIA_I-int LTR Gypsy 51.0
7979
MICROPIA_LTR LTR Gypsy 2.0
8080
Mariner2_DM DNA TcMar-Tc1 0.0
8181
NINJA_I-int LTR Pao 0.0
8282
NOMAD_I-int LTR Gypsy 0.0
83-
PROTOP_A DNA P 50.0
84-
PROTOP_B DNA P 10.0
83+
PROTOP_A DNA P 55.0
84+
PROTOP_B DNA P 5.0
8585
QUASIMODO2-I_DM LTR Gypsy 43.0
8686
QUASIMODO2-LTR_DM LTR Gypsy 0.0
87-
QUASIMODO_I-int LTR Gypsy 108.0
88-
QUASIMODO_LTR LTR Gypsy 23.0
87+
QUASIMODO_I-int LTR Gypsy 105.0
88+
QUASIMODO_LTR LTR Gypsy 25.0
8989
R1_DM LINE R1 0.0
9090
ROOA_I-int LTR Pao 0.0
9191
ROOA_LTR LTR Pao 0.0
9292
ROVER-I_DM LTR Gypsy 414.0
93-
ROVER-LTR_DM LTR Gypsy 6.0
93+
ROVER-LTR_DM LTR Gypsy 5.0
9494
S2_DM DNA TcMar-Tc1 0.0
95-
STALKER4_I-int LTR Gypsy 143.5
95+
STALKER4_I-int LTR Gypsy 146.5
9696
STALKER4_LTR LTR Gypsy 25.0
9797
S_DM DNA TcMar-Tc1 53.0
98-
Stalker2_I-int LTR Gypsy 103.0
98+
Stalker2_I-int LTR Gypsy 99.0
9999
Stalker2_LTR LTR Gypsy 3.0
100100
TART-A LINE Jockey 4.0
101101
TART_B1 LINE Jockey 21.0
@@ -109,7 +109,7 @@ _AACACA_n Simple_repeat Simple_repeat 0.0
109109
_AAT_n Simple_repeat Simple_repeat 0.0
110110
_ACAATAG_n Simple_repeat Simple_repeat 0.0
111111
_ACC_n Simple_repeat Simple_repeat 0.0
112-
_AGAGAAG_n Simple_repeat Simple_repeat 2.17
112+
_AGAGAAG_n Simple_repeat Simple_repeat 2.5
113113
_AGAGA_n Simple_repeat Simple_repeat 43.0
114114
_ATAAT_n Simple_repeat Simple_repeat 0.0
115115
_ATATATT_n Simple_repeat Simple_repeat 0.0
@@ -120,7 +120,7 @@ _AT_n Simple_repeat Simple_repeat 0.0
120120
_A_n Simple_repeat Simple_repeat 0.0
121121
_CATA_n Simple_repeat Simple_repeat 0.0
122122
_CTTTT_n Simple_repeat Simple_repeat 0.0
123-
_GAGAA_n Simple_repeat Simple_repeat 45.17
123+
_GAGAA_n Simple_repeat Simple_repeat 45.5
124124
_GCCTTT_n Simple_repeat Simple_repeat 0.0
125125
_TAATAT_n Simple_repeat Simple_repeat 0.0
126126
_TAATA_n Simple_repeat Simple_repeat 0.0

0 commit comments

Comments
 (0)