proksee-project
diff --git a/‎proksee/assembly_measurer.py
+2-1 b/‎proksee/assembly_measurer.py
+2-1
diff --git a/‎proksee/commands/cmd_assemble.py
+11-7 b/‎proksee/commands/cmd_assemble.py
+11-7
diff --git a/‎proksee/commands/cmd_evaluate.py
+15-5 b/‎proksee/commands/cmd_evaluate.py
+15-5
diff --git a/‎proksee/commands/cmd_updatedb.py
+1-1 b/‎proksee/commands/cmd_updatedb.py
+1-1
diff --git a/‎proksee/contamination_handler.py
+35-2 b/‎proksee/contamination_handler.py
+35-2
@@ -80,7 +80,8 @@ def measure_quality(self):
         if not os.path.exists(self.contigs_filename):
             raise FileNotFoundError("File not found: " + self.contigs_filename)
 
-        quast_command = "quast --contig-thresholds 0," + str(self.minimum_contig_length) + " " + \
+        quast_command = "quast --contig-thresholds 0," + str(self.minimum_contig_length) + \
+                        " --min-contig " + str(self.minimum_contig_length) + " " + \
                         self.contigs_filename + " -o " + self.quast_directory
         quast_out = open(os.path.join(self.output_directory, self.OUTPUT_FILENAME), "w+")
         quast_err = open(os.path.join(self.output_directory, self.ERROR_FILENAME), "w+")
 
@@ -29,8 +29,10 @@
 from shutil import rmtree
 
 from proksee import utilities
+from proksee.utilities import InputType
 from proksee.utilities import get_time
 
+
 from proksee.assembly_database import AssemblyDatabase
 from proksee.assembly_measurer import AssemblyMeasurer
 from proksee.contamination_handler import ContaminationHandler
@@ -206,11 +208,11 @@ def determine_platform(reads, platform_name=None):
               help="The species to assemble. This will override species estimation. Must be spelled correctly.")
 @click.option('-p', '--platform', required=False, default=None,
               help="The sequencing platform used to generate the reads. 'Illumina', 'Ion Torrent', or 'Pac Bio'.")
-@click.option('--min-contig-length', required=False, default=1000,
+@click.option('--min-contig-length', required=False, default=1000, type=click.IntRange(min=1, max=None),
               help="The minimum contig length to include in analysis and output. The default is 1000.")
-@click.option('-t', '--threads', required=False, default=4,
+@click.option('-t', '--threads', required=False, default=4, type=click.IntRange(min=1, max=None),
               help="Specifies the number of threads programs in the pipeline should use. The default is 4.")
-@click.option('-m', '--memory', required=False, default=4,
+@click.option('-m', '--memory', required=False, default=4, type=click.IntRange(min=1, max=None),
               help="Specifies the amount of memory in gigabytes programs in the pipeline should use. The default is 4")
 @click.pass_context
 def cli(ctx, forward, reverse, output, force, species, platform, min_contig_length, threads, memory):
@@ -299,7 +301,7 @@ def assemble(reads, output_directory, force, mash_database_path, resource_specif
         output_directory (string): the location to place all program output and temporary files
         force (bool): whether or not to force the assembly to continue, even when it's evaluated as being poor
         mash_database_path (string): optional; the file path of the Mash database
-        resource_specification (ResourceSpecification): the resources that sub-programs should use
+        resource_specification (ResourceSpecification): the computational resources available
         species_name (string): optional; the name of the species being assembled
         platform_name (string): optional; the name of the sequencing platform that generated the reads
         minimum_contig_length (int): optional; the minimum contig length to use for assembly and analysis
@@ -337,8 +339,9 @@ def assemble(reads, output_directory, force, mash_database_path, resource_specif
 
     # Estimate species
     filtered_filenames = filtered_reads.get_file_locations()
-    species_list = utilities.determine_species(filtered_filenames, assembly_database, output_directory,
-                                               mash_database_path, id_mapping_filename, species_name)
+    species_list = utilities.determine_major_species(filtered_filenames, assembly_database, output_directory,
+                                                     mash_database_path, id_mapping_filename, InputType.READS,
+                                                     resource_specification, species_name)
     species = species_list[0]
     report_species(species_list)
 
@@ -359,7 +362,8 @@ def assemble(reads, output_directory, force, mash_database_path, resource_specif
 
     # Check for contamination at the contig level:
     contamination_handler = ContaminationHandler(species, assembler.contigs_filename, output_directory,
-                                                 mash_database_path, id_mapping_filename)
+                                                 mash_database_path, id_mapping_filename,
+                                                 resource_specification)
     evaluation = contamination_handler.estimate_contamination()
     report_contamination(evaluation)
 
 
@@ -24,14 +24,17 @@
 from pathlib import Path
 
 from proksee import utilities
+from proksee.utilities import InputType
 from proksee.utilities import get_time
+
 from proksee.assembly_database import AssemblyDatabase
 from proksee.assembly_measurer import AssemblyMeasurer
 from proksee.machine_learning_evaluator import MachineLearningEvaluator
 
 import proksee.config as config
 from proksee.ncbi_assembly_evaluator import NCBIAssemblyEvaluator
 from proksee.species_assembly_evaluator import SpeciesAssemblyEvaluator
+from proksee.resource_specification import ResourceSpecification
 
 DATABASE_PATH = os.path.join(Path(__file__).parent.parent.absolute(), "database",
                              "refseq_short.csv")
@@ -50,8 +53,12 @@
 @click.option('-o', '--output', required=True,
               type=click.Path(exists=False, file_okay=False,
                               dir_okay=True, writable=True))
+@click.option('-t', '--threads', required=False, default=4, type=click.IntRange(min=1, max=None),
+              help="Specifies the number of threads programs in the pipeline should use. The default is 4.")
+@click.option('-m', '--memory', required=False, default=4, type=click.IntRange(min=1, max=None),
+              help="Specifies the amount of memory in gigabytes programs in the pipeline should use. The default is 4")
 @click.pass_context
-def cli(ctx, contigs, species, min_contig_length, output):
+def cli(ctx, contigs, species, min_contig_length, output, threads, memory):
 
     # Check Mash database is installed:
     mash_database_path = config.get(config.MASH_PATH)
@@ -60,17 +67,19 @@ def cli(ctx, contigs, species, min_contig_length, output):
         print("Please run 'proksee updatedb' to install the databases!")
         return
 
-    evaluate(contigs, output, mash_database_path, species, min_contig_length)
+    resource_specification = ResourceSpecification(threads, memory)
+    evaluate(contigs, output, resource_specification, mash_database_path, species, min_contig_length)
 
 
-def evaluate(contigs_filename, output_directory, mash_database_path,
+def evaluate(contigs_filename, output_directory, resource_specification, mash_database_path,
              species_name=None, minimum_contig_length=1000, id_mapping_filename=ID_MAPPING_FILENAME):
     """
     The main control flow of the program that evaluates the assembly.
 
     ARGUMENTS:
         contigs_filename (string): the filename of the contigs to evaluate
         output_directory (string): the location to place all program output and temporary files
+        resource_specification (ResourceSpecification): the computational resources available
         mash_database_path (string): optional; the name of the Mash database
         species_name (string): optional; the name of the species being assembled
         minimum_contig_length (int): optional; the minimum contig length to consider for analysis
@@ -91,8 +100,9 @@ def evaluate(contigs_filename, output_directory, mash_database_path,
     assembly_database = AssemblyDatabase(DATABASE_PATH)
 
     # Estimate species
-    species_list = utilities.determine_species([contigs_filename], assembly_database, output_directory,
-                                               mash_database_path, id_mapping_filename, species_name)
+    species_list = utilities.determine_major_species([contigs_filename], assembly_database, output_directory,
+                                                     mash_database_path, id_mapping_filename, InputType.ASSEMBLY,
+                                                     resource_specification, species_name)
     species = species_list[0]
 
     click.echo("\n" + get_time())
 
@@ -82,7 +82,7 @@ def update(directory):
 
         click.echo("Downloading database...")
 
-        command = "wget -O " + str(mash_database_path) + " " + MASH_DATABASE_URL
+        command = "wget -O " + str(mash_database_path) + " " + MASH_DATABASE_URL + " --progress=dot:giga"
 
         try:
             subprocess.check_call(command, shell=True)
 
@@ -36,11 +36,13 @@ class ContaminationHandler:
             subdirectory of the program output directory
         mash_database_filename (str): the filename of the Mash sketch (database)
         id_mapping_filename (str): filename of the NCBI ID-to-taxonomy mapping file
+        resource_specification (ResourceSpecification): the computational resources available
     """
 
     FASTA_DIRECTORY = "fasta"
 
-    def __init__(self, species, contigs_file, output_directory, mash_database_filename, id_mapping_filename):
+    def __init__(self, species, contigs_file, output_directory, mash_database_filename, id_mapping_filename,
+                 resource_specification):
         """
         Initializes the contamination handler.
 
@@ -50,13 +52,15 @@ def __init__(self, species, contigs_file, output_directory, mash_database_filena
             output_directory (str): the output directory for the program
             mash_database_filename (str): the filename of the Mash sketch (database)
             id_mapping_filename (str): filename of the NCBI ID-to-taxonomy mapping file
+            resource_specification (ResourceSpecification): the computational resources available
         """
 
         self.species = species
         self.contigs_file = contigs_file
         self.output_directory = output_directory
         self.mash_database_filename = mash_database_filename
         self.id_mapping_filename = id_mapping_filename
+        self.resource_specification = resource_specification
 
     def estimate_contamination(self):
         """
@@ -68,6 +72,34 @@ def estimate_contamination(self):
                 contains an associated, plain-language report
         """
 
+        # The single FASTA file containing multiple contigs (self.contigs_file) is first
+        # split into multiple single contig FASTA files, which are all placed in the
+        # newly created `fasta_directory`.
+        #
+        # Next, a number of chunks / lists are created. The single contig FASTA files
+        # are listed in descending order by contig size and this allows for (usually)
+        # an even distribution of contigs by size into the different chunks / lists.
+        # Single contig FASTA files are added to each chunk by rotating the lists and
+        # adding the next largest contig. At the end of this step, there should be a
+        # number of lists equal to the number of chunks, and each list should contain
+        # a similar amount of sequence.
+        #
+        # Finally, the species of each chunk / list is evaluated independently. This
+        # will consider all of the single contig FASTA files in the chunk together
+        # when estimating a species. For example, if there are 5 chunks, each will
+        # contain approximately 1/5th of the total assembly sequence, and each 1/5th
+        # will be given a species estimation. These 5 species estimations will then be
+        # compared with each other and the provided species to see if there's any
+        # discrepancy. What this accomplishes is distributing the entire collection
+        # of assembled contigs into 5 different chunks, estimating the species of
+        # each chunk, and ensuring that each chunk still provides the same species
+        # estimation.
+        #
+        # The motivation is that with the right number of chunks (accounting for
+        # computational time), we should observe if a large contaminant contig has
+        # been assembled, because it will disagree with the other chunks and the
+        # provided species.
+
         CHUNKS = 5
 
         fasta_directory = os.path.join(self.output_directory, self.FASTA_DIRECTORY)
@@ -96,7 +128,8 @@ def estimate_contamination(self):
         for i in range(len(contig_filenames)):
 
             species_estimator = SpeciesEstimator(contig_filenames[i], self.output_directory,
-                                                 self.mash_database_filename, self.id_mapping_filename)
+                                                 self.mash_database_filename, self.id_mapping_filename,
+                                                 self.resource_specification)
             species_list = species_estimator.estimate_all_species()
 
             contig_species.append(species_list[0])  # Select the estimation with the most evidence