Skip to content

Commit 9612458

Browse files
authored
Merge pull request #79 from proksee-project/fix/v1.0.0a5
Various small fixes for v1.0.0a5
2 parents bc8d3d1 + b184107 commit 9612458

11 files changed

+229
-55
lines changed

proksee/assembly_measurer.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,8 @@ def measure_quality(self):
8080
if not os.path.exists(self.contigs_filename):
8181
raise FileNotFoundError("File not found: " + self.contigs_filename)
8282

83-
quast_command = "quast --contig-thresholds 0," + str(self.minimum_contig_length) + " " + \
83+
quast_command = "quast --contig-thresholds 0," + str(self.minimum_contig_length) + \
84+
" --min-contig " + str(self.minimum_contig_length) + " " + \
8485
self.contigs_filename + " -o " + self.quast_directory
8586
quast_out = open(os.path.join(self.output_directory, self.OUTPUT_FILENAME), "w+")
8687
quast_err = open(os.path.join(self.output_directory, self.ERROR_FILENAME), "w+")

proksee/commands/cmd_assemble.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,10 @@
2929
from shutil import rmtree
3030

3131
from proksee import utilities
32+
from proksee.utilities import InputType
3233
from proksee.utilities import get_time
3334

35+
3436
from proksee.assembly_database import AssemblyDatabase
3537
from proksee.assembly_measurer import AssemblyMeasurer
3638
from proksee.contamination_handler import ContaminationHandler
@@ -206,11 +208,11 @@ def determine_platform(reads, platform_name=None):
206208
help="The species to assemble. This will override species estimation. Must be spelled correctly.")
207209
@click.option('-p', '--platform', required=False, default=None,
208210
help="The sequencing platform used to generate the reads. 'Illumina', 'Ion Torrent', or 'Pac Bio'.")
209-
@click.option('--min-contig-length', required=False, default=1000,
211+
@click.option('--min-contig-length', required=False, default=1000, type=click.IntRange(min=1, max=None),
210212
help="The minimum contig length to include in analysis and output. The default is 1000.")
211-
@click.option('-t', '--threads', required=False, default=4,
213+
@click.option('-t', '--threads', required=False, default=4, type=click.IntRange(min=1, max=None),
212214
help="Specifies the number of threads programs in the pipeline should use. The default is 4.")
213-
@click.option('-m', '--memory', required=False, default=4,
215+
@click.option('-m', '--memory', required=False, default=4, type=click.IntRange(min=1, max=None),
214216
help="Specifies the amount of memory in gigabytes programs in the pipeline should use. The default is 4")
215217
@click.pass_context
216218
def cli(ctx, forward, reverse, output, force, species, platform, min_contig_length, threads, memory):
@@ -299,7 +301,7 @@ def assemble(reads, output_directory, force, mash_database_path, resource_specif
299301
output_directory (string): the location to place all program output and temporary files
300302
force (bool): whether or not to force the assembly to continue, even when it's evaluated as being poor
301303
mash_database_path (string): optional; the file path of the Mash database
302-
resource_specification (ResourceSpecification): the resources that sub-programs should use
304+
resource_specification (ResourceSpecification): the computational resources available
303305
species_name (string): optional; the name of the species being assembled
304306
platform_name (string): optional; the name of the sequencing platform that generated the reads
305307
minimum_contig_length (int): optional; the minimum contig length to use for assembly and analysis
@@ -337,8 +339,9 @@ def assemble(reads, output_directory, force, mash_database_path, resource_specif
337339

338340
# Estimate species
339341
filtered_filenames = filtered_reads.get_file_locations()
340-
species_list = utilities.determine_species(filtered_filenames, assembly_database, output_directory,
341-
mash_database_path, id_mapping_filename, species_name)
342+
species_list = utilities.determine_major_species(filtered_filenames, assembly_database, output_directory,
343+
mash_database_path, id_mapping_filename, InputType.READS,
344+
resource_specification, species_name)
342345
species = species_list[0]
343346
report_species(species_list)
344347

@@ -359,7 +362,8 @@ def assemble(reads, output_directory, force, mash_database_path, resource_specif
359362

360363
# Check for contamination at the contig level:
361364
contamination_handler = ContaminationHandler(species, assembler.contigs_filename, output_directory,
362-
mash_database_path, id_mapping_filename)
365+
mash_database_path, id_mapping_filename,
366+
resource_specification)
363367
evaluation = contamination_handler.estimate_contamination()
364368
report_contamination(evaluation)
365369

proksee/commands/cmd_evaluate.py

+15-5
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,17 @@
2424
from pathlib import Path
2525

2626
from proksee import utilities
27+
from proksee.utilities import InputType
2728
from proksee.utilities import get_time
29+
2830
from proksee.assembly_database import AssemblyDatabase
2931
from proksee.assembly_measurer import AssemblyMeasurer
3032
from proksee.machine_learning_evaluator import MachineLearningEvaluator
3133

3234
import proksee.config as config
3335
from proksee.ncbi_assembly_evaluator import NCBIAssemblyEvaluator
3436
from proksee.species_assembly_evaluator import SpeciesAssemblyEvaluator
37+
from proksee.resource_specification import ResourceSpecification
3538

3639
DATABASE_PATH = os.path.join(Path(__file__).parent.parent.absolute(), "database",
3740
"refseq_short.csv")
@@ -50,8 +53,12 @@
5053
@click.option('-o', '--output', required=True,
5154
type=click.Path(exists=False, file_okay=False,
5255
dir_okay=True, writable=True))
56+
@click.option('-t', '--threads', required=False, default=4, type=click.IntRange(min=1, max=None),
57+
help="Specifies the number of threads programs in the pipeline should use. The default is 4.")
58+
@click.option('-m', '--memory', required=False, default=4, type=click.IntRange(min=1, max=None),
59+
help="Specifies the amount of memory in gigabytes programs in the pipeline should use. The default is 4")
5360
@click.pass_context
54-
def cli(ctx, contigs, species, min_contig_length, output):
61+
def cli(ctx, contigs, species, min_contig_length, output, threads, memory):
5562

5663
# Check Mash database is installed:
5764
mash_database_path = config.get(config.MASH_PATH)
@@ -60,17 +67,19 @@ def cli(ctx, contigs, species, min_contig_length, output):
6067
print("Please run 'proksee updatedb' to install the databases!")
6168
return
6269

63-
evaluate(contigs, output, mash_database_path, species, min_contig_length)
70+
resource_specification = ResourceSpecification(threads, memory)
71+
evaluate(contigs, output, resource_specification, mash_database_path, species, min_contig_length)
6472

6573

66-
def evaluate(contigs_filename, output_directory, mash_database_path,
74+
def evaluate(contigs_filename, output_directory, resource_specification, mash_database_path,
6775
species_name=None, minimum_contig_length=1000, id_mapping_filename=ID_MAPPING_FILENAME):
6876
"""
6977
The main control flow of the program that evaluates the assembly.
7078
7179
ARGUMENTS:
7280
contigs_filename (string): the filename of the contigs to evaluate
7381
output_directory (string): the location to place all program output and temporary files
82+
resource_specification (ResourceSpecification): the computational resources available
7483
mash_database_path (string): optional; the name of the Mash database
7584
species_name (string): optional; the name of the species being assembled
7685
minimum_contig_length (int): optional; the minimum contig length to consider for analysis
@@ -91,8 +100,9 @@ def evaluate(contigs_filename, output_directory, mash_database_path,
91100
assembly_database = AssemblyDatabase(DATABASE_PATH)
92101

93102
# Estimate species
94-
species_list = utilities.determine_species([contigs_filename], assembly_database, output_directory,
95-
mash_database_path, id_mapping_filename, species_name)
103+
species_list = utilities.determine_major_species([contigs_filename], assembly_database, output_directory,
104+
mash_database_path, id_mapping_filename, InputType.ASSEMBLY,
105+
resource_specification, species_name)
96106
species = species_list[0]
97107

98108
click.echo("\n" + get_time())

proksee/commands/cmd_updatedb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def update(directory):
8282

8383
click.echo("Downloading database...")
8484

85-
command = "wget -O " + str(mash_database_path) + " " + MASH_DATABASE_URL
85+
command = "wget -O " + str(mash_database_path) + " " + MASH_DATABASE_URL + " --progress=dot:giga"
8686

8787
try:
8888
subprocess.check_call(command, shell=True)

proksee/contamination_handler.py

+35-2
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,13 @@ class ContaminationHandler:
3636
subdirectory of the program output directory
3737
mash_database_filename (str): the filename of the Mash sketch (database)
3838
id_mapping_filename (str): filename of the NCBI ID-to-taxonomy mapping file
39+
resource_specification (ResourceSpecification): the computational resources available
3940
"""
4041

4142
FASTA_DIRECTORY = "fasta"
4243

43-
def __init__(self, species, contigs_file, output_directory, mash_database_filename, id_mapping_filename):
44+
def __init__(self, species, contigs_file, output_directory, mash_database_filename, id_mapping_filename,
45+
resource_specification):
4446
"""
4547
Initializes the contamination handler.
4648
@@ -50,13 +52,15 @@ def __init__(self, species, contigs_file, output_directory, mash_database_filena
5052
output_directory (str): the output directory for the program
5153
mash_database_filename (str): the filename of the Mash sketch (database)
5254
id_mapping_filename (str): filename of the NCBI ID-to-taxonomy mapping file
55+
resource_specification (ResourceSpecification): the computational resources available
5356
"""
5457

5558
self.species = species
5659
self.contigs_file = contigs_file
5760
self.output_directory = output_directory
5861
self.mash_database_filename = mash_database_filename
5962
self.id_mapping_filename = id_mapping_filename
63+
self.resource_specification = resource_specification
6064

6165
def estimate_contamination(self):
6266
"""
@@ -68,6 +72,34 @@ def estimate_contamination(self):
6872
contains an associated, plain-language report
6973
"""
7074

75+
# The single FASTA file containing multiple contigs (self.contigs_file) is first
76+
# split into multiple single contig FASTA files, which are all placed in the
77+
# newly created `fasta_directory`.
78+
#
79+
# Next, a number of chunks / lists are created. The single contig FASTA files
80+
# are listed in descending order by contig size and this allows for (usually)
81+
# an even distribution of contigs by size into the different chunks / lists.
82+
# Single contig FASTA files are added to each chunk by rotating the lists and
83+
# adding the next largest contig. At the end of this step, there should be a
84+
# number of lists equal to the number of chunks, and each list should contain
85+
# a similar amount of sequence.
86+
#
87+
# Finally, the species of each chunk / list is evaluated independently. This
88+
# will consider all of the single contig FASTA files in the chunk together
89+
# when estimating a species. For example, if there are 5 chunks, each will
90+
# contain approximately 1/5th of the total assembly sequence, and each 1/5th
91+
# will be given a species estimation. These 5 species estimations will then be
92+
# compared with each other and the provided species to see if there's any
93+
# discrepancy. What this accomplishes is distributing the entire collection
94+
# of assembled contigs into 5 different chunks, estimating the species of
95+
# each chunk, and ensuring that each chunk still provides the same species
96+
# estimation.
97+
#
98+
# The motivation is that with the right number of chunks (accounting for
99+
# computational time), we should observe if a large contaminant contig has
100+
# been assembled, because it will disagree with the other chunks and the
101+
# provided species.
102+
71103
CHUNKS = 5
72104

73105
fasta_directory = os.path.join(self.output_directory, self.FASTA_DIRECTORY)
@@ -96,7 +128,8 @@ def estimate_contamination(self):
96128
for i in range(len(contig_filenames)):
97129

98130
species_estimator = SpeciesEstimator(contig_filenames[i], self.output_directory,
99-
self.mash_database_filename, self.id_mapping_filename)
131+
self.mash_database_filename, self.id_mapping_filename,
132+
self.resource_specification)
100133
species_list = species_estimator.estimate_all_species()
101134

102135
contig_species.append(species_list[0]) # Select the estimation with the most evidence

0 commit comments

Comments
 (0)