Skip to content

Commit 841df5f

Browse files
committed
Make genome mapping more parallel
The idea behind this is to make genome mapping faster, use less memory, and be more parallelized. It seems some of the new genomes are too large and the hit-selection step fails. We can rework this by selecting the best hits per scanned chunk first. Then, once we merge, we can select the best hits on a much smaller set. This should reduce the memory usage of the final select step (we should only need memory proportional to the number of chunks and the number of sequences, not the total number of hits). It should also be faster since we can run multiple select steps at once.
1 parent 7e57705 commit 841df5f

File tree

3 files changed

+107
-27
lines changed

3 files changed

+107
-27
lines changed

Diff for: main.nf

+24-14
Original file line numberDiff line numberDiff line change
@@ -511,7 +511,7 @@ process species_to_map {
511511
raw_genomes
512512
.splitCsv()
513513
.filter { s, a, t, d -> !params.genome_mapping.species_excluded_from_mapping.contains(s) }
514-
.into { assemblies; genomes_to_fetch; assembly_tracking }
514+
.into { assemblies; genomes_to_fetch }
515515

516516
assemblies
517517
.combine(Channel.fromPath('files/genome-mapping/find-unmapped.sql'))
@@ -543,13 +543,13 @@ process download_genome {
543543
set val(species), val(assembly), val(taxid), val(division) from genomes_to_fetch
544544

545545
output:
546-
set val(species), file('parts/*.{2bit,ooc}') into genomes
546+
set val(species), val(assembly), file('parts/*.{2bit,ooc}') into genomes
547547

548548
"""
549549
set -o pipefail
550550
551551
rnac genome-mapping url-for --host=$division $species $assembly - |\
552-
xargs -I {} fetch generic '{}' ${species}.fasta.gz
552+
xargs -I {} fetch generic '{}' ${species}.fasta.gz
553553
554554
gzip -d ${species}.fasta.gz
555555
split-sequences ${species}.fasta ${params.genome_mapping.download_genome.chunk_size} parts
@@ -569,10 +569,10 @@ process download_genome {
569569

570570
genomes
571571
.join(split_mappable_sequences)
572-
.flatMap { species, genome_chunks, chunks ->
572+
.flatMap { species, assembly, genome_chunks, chunks ->
573573
[genome_chunks.collate(2), chunks]
574574
.combinations()
575-
.inject([]) { acc, it -> acc << [species] + it.flatten() }
575+
.inject([]) { acc, it -> acc << [species, assembly] + it.flatten() }
576576
}
577577
.set { targets }
578578

@@ -581,12 +581,14 @@ process blat {
581581
errorStrategy 'finish'
582582

583583
input:
584-
set val(species), file(genome), file(ooc), file(chunk) from targets
584+
set val(species), val(assembly), file(genome), file(ooc), file(chunk) from targets
585585

586586
output:
587-
set val(species), file('output.psl') into blat_results
587+
set val(species), file('selected.json') into blat_results
588588

589589
"""
590+
set -o pipefail
591+
590592
blat \
591593
-ooc=$ooc \
592594
-noHead \
@@ -596,38 +598,46 @@ process blat {
596598
-minScore=${params.genome_mapping.blat.options.min_score} \
597599
-minIdentity=${params.genome_mapping.blat.options.min_identity} \
598600
$genome $chunk output.psl
601+
602+
sort -k 10 output.psl |\
603+
rnac genome-mapping blat as-json $assembly - - |\
604+
rnac genome-mapping blat select - selected.json
599605
"""
600606
}
601607

602608
blat_results
603609
.groupTuple()
604-
.join(assembly_tracking)
605-
.map { species, psl, assembly_id, taxid, division -> [psl, species, assembly_id] }
606610
.set { species_results }
607611

608612
process select_mapped_locations {
609613
tag { species }
610614
memory { params.genome_mapping.select_mapped.directives.memory }
611615

612616
input:
613-
set file('output*.psl'), val(species), val(assembly_id) from species_results
617+
set val(species), file('selected*.json') from species_results
614618

615619
output:
616-
file 'locations.csv' into selected_locations
620+
file('locations.csv') into selected_locations
617621

618622
"""
619623
set -o pipefail
620624
621-
sort -k 10 output*.psl > sorted.psl
622-
rnac genome-mapping select-hits $assembly_id sorted.psl locations.csv
625+
find . -name 'selected*.json' |\
626+
xargs cat |\
627+
rnac genome-mapping blat select --sort - - |\
628+
rnac genome-mapping blat as-importable - locations.csv
623629
"""
624630
}
625631

632+
selected_locations
633+
.collect()
634+
.set { blat_to_import }
635+
626636
process load_genome_mapping {
627637
maxForks 1
628638

629639
input:
630-
file('raw*.csv') from selected_locations.collect()
640+
file('raw*.csv') from blat_to_import
631641
file(ctl) from Channel.fromPath('files/genome-mapping/load.ctl')
632642

633643
output:

Diff for: rnacentral_pipeline/cli/genome_mapping.py

+49-3
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,49 @@ def cli():
2929
pass
3030

3131

32-
@cli.command('select-hits')
32+
@cli.group('blat')
def hits():
    """A series of commands for working with blat hits."""
38+
39+
40+
@hits.command('as-json')
@click.argument('assembly_id')
@click.argument('hits', default='-', type=click.File('r'))
@click.argument('output', default='-', type=click.File('w'))
def hits_json(assembly_id, hits, output):
    """
    Convert a PSL file into a JSON-line file (one object per line). This is
    lossy, but it keeps everything needed for later selection.
    """
    blat.as_json(assembly_id, hits, output)
50+
51+
52+
# Bug fix: registered on @cli.command although main.nf invokes
# `rnac genome-mapping blat as-importable`, i.e. under the `blat` group.
@hits.command('as-importable')
@click.argument('hits', default='-', type=click.File('r'))
@click.argument('output', default='-', type=click.File('w'))
def as_importable(hits, output):
    """
    Convert a json-line file into a CSV that can be used for import by
    pgloader. This is lossy as it only keeps the things needed for the
    database.
    """
    # Bug fix: the parameter was named `raw`, but click passes the argument
    # by its declared name ('hits'), which would raise a TypeError.
    # NOTE(review): blat.as_importable is not defined in blat.py in this
    # commit — confirm (parse_json + write_importable may be intended).
    blat.as_importable(hits, output)
61+
62+
63+
@hits.command('select')
# Bug fix: main.nf passes bare `--sort` with no value, so this must be a
# boolean flag; without is_flag=True click would consume the next token
# ('-', the hits argument) as the option's value.
@click.option('--sort', is_flag=True, default=False)
@click.argument('hits', default='-', type=click.File('r'))
@click.argument('output', default='-', type=click.File('w'))
def select_hits(hits, output, sort=False):
    """
    Parse a JSON-line file and select the best hits in the file. The best
    hits are written to the output file. This assumes the file is sorted by
    urs_taxid unless --sort is given, in which case the data is sorted in
    memory. That may be very expensive.
    """
    blat.select_json(hits, output, sort=sort)
3875

3976

4077
@cli.command('url-for')
@@ -43,6 +80,10 @@ def select_hits(assembly_id, hits, output):
4380
@click.argument('assembly_id')
4481
@click.argument('output', default='-', type=click.File('w'))
4582
def find_remote_url(species, assembly_id, output, host=None):
83+
"""
84+
Determine the remote URL to fetch the genome for a given species/assembly.
85+
The url is written to the output file and may include '*'.
86+
"""
4687
url = urls.url_for(species, assembly_id, host=host)
4788
output.write(url)
4889

@@ -51,4 +92,9 @@ def find_remote_url(species, assembly_id, output, host=None):
5192
@click.argument('filename', default='-', type=click.File('r'))
5293
@click.argument('output', default='-', type=click.File('w'))
5394
def find_remote_urls(filename, output):
95+
"""
96+
Determine the remote URLs to fetch the genomes for all entries in a file,
where the file is a CSV of species,assembly. The URLs are written to the
output file and may include '*'.
99+
"""
54100
urls.write_urls_for(filename, output)

Diff for: rnacentral_pipeline/rnacentral/genome_mapping/blat.py

+34-10
Original file line numberDiff line numberDiff line change
@@ -84,14 +84,14 @@ def build(cls, assembly_id, raw):
8484

8585
@property
def name(self):
    # Region name keyed by this hit's URS accession (passed as a UPI).
    return self.region.name(self.upi, is_upi=True)
8888

8989
@property
def match_fraction(self):
    """Fraction of the sequence length covered by matching bases."""
    # float() kept deliberately: the fields may arrive as strings from the
    # PSL parse — TODO confirm upstream types.
    return float(self.matches) / float(self.sequence_length)
9292

9393
def writeable(self):
    # Delegate to the region, emitting rows keyed by the URS accession.
    return self.region.writeable(self.upi, is_upi=True)
9595

9696

9797
def select_possible(hit):
@@ -113,7 +113,7 @@ def select_best(hits):
113113
return hits
114114

115115

116-
def parse(assembly_id, handle):
116+
def parse_psl(assembly_id, handle):
117117
to_split = ['blockSizes', 'qStarts', 'tStarts']
118118
for row in csv.reader(handle, delimiter='\t'):
119119
result = dict(zip(FIELDS, row))
@@ -129,8 +129,13 @@ def parse(assembly_id, handle):
129129
yield BlatHit.build(assembly_id, result)
130130

131131

132-
def select_hits(assembly_id, handle):
133-
hits = parse(assembly_id, handle)
132+
def parse_json(handle):
    """Yield BlatHit objects parsed from a JSON-line file handle."""
    for raw_line in handle:
        yield BlatHit(**json.loads(raw_line))
136+
137+
138+
def select_hits(hits):
134139
hits = six.moves.filter(select_possible, hits)
135140
hits = it.groupby(hits, op.attrgetter('upi'))
136141
hits = six.moves.map(op.itemgetter(1), hits)
@@ -139,8 +144,27 @@ def select_hits(assembly_id, handle):
139144
return hits
140145

141146

142-
def write_selected(assembly_id, hits, output):
143-
selected = select_hits(assembly_id, hits)
144-
selected = six.moves.map(op.methodcaller('writeable'), selected)
145-
selected = it.chain.from_iterable(selected)
146-
csv.writer(output).writerows(selected)
147+
def write_json(hits, output):
    """Write each hit as one JSON object per line to `output`."""
    for entry in hits:
        output.write(json.dumps(entry) + '\n')
151+
152+
153+
def write_importable(hits, output):
    """
    Write hits to `output` as CSV rows for pgloader import. Each hit
    contributes the rows produced by its writeable() method; this is lossy,
    keeping only what the database needs.
    """
    rows = (hit.writeable() for hit in hits)
    # Bug fix: the original chained `hits` (the hit objects themselves) and
    # discarded the mapped writeable() rows entirely.
    csv.writer(output).writerows(it.chain.from_iterable(rows))
157+
158+
159+
def as_json(assembly_id, hits, output):
    """
    Parse a PSL file handle (`hits`) and write each hit to `output` as one
    JSON object per line, tagged with the given assembly id.
    """
    parsed = parse_psl(assembly_id, hits)
    # Bug fix: the original mapped attr.asdict over an undefined name
    # (`selected`) and would raise NameError on first use.
    write_json((attr.asdict(hit) for hit in parsed), output)
163+
164+
165+
def select_json(hits, output, sort=False):
    """
    Select the best hits from a JSON-line file and write them back out as
    JSON lines. Assumes the input is sorted by urs_taxid unless sort=True,
    in which case all hits are sorted in memory (potentially expensive).
    """
    parsed = parse_json(hits)
    if sort:
        # Bug fix: BlatHit is an object, not a mapping — attrgetter, not
        # itemgetter (select_hits groups on the same 'upi' attribute).
        parsed = sorted(parsed, key=op.attrgetter('upi'))
    selected = select_hits(parsed)
    # Serialize back to plain dicts: json.dumps cannot handle BlatHit
    # instances directly. NOTE(review): confirm that round-tripping through
    # BlatHit(**data) preserves any nested region structure.
    write_json((attr.asdict(hit) for hit in selected), output)

0 commit comments

Comments
 (0)