MNT bump to v3.0.1

dieterich-lab · Feb 10, 2023 · 3061917 · 3061917
1 parent a944462
commit 3061917
Show file tree

Hide file tree

Showing 25 changed files with 112 additions and 281 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
 
 ## [Unreleased] - started 2023-02
 
+## [3.0.1] 2023-02-10
+
+### Changed
+
+- I/O: ensure bool in config, remove symlinks
+- Documentation
+
+### Fixed
+
+- Missing GTF attributes
+
 ## [3.0.0] 2023-02-06
 
 ### Changed

diff --git a/README.md b/README.md
@@ -5,13 +5,13 @@ Ribosome profiling (Ribo-seq) is an RNA-sequencing-based readout of RNA translat
 **Rp-Bp** comes with two _interactive dashboards_ or _web applications_, one for read and periodicity quality control, the other to facilitate Ribo-seq ORFs discovery.
 
 <p align="center">
-  <a href="https://rp-bp.readthedocs.io/en/latest/"><img alt="Rp-Bp" src="docs/source/_static/logo-rpbp-dark.png"></a>
+  <a href="https://rp-bp.readthedocs.io/en/latest/"><img alt="Rp-Bp" src="https://github.com/dieterich-lab/rp-bp/raw/master/docs/source/_static/logo-rpbp-dark.png"></a>
 </p>
 
 <p align="center">
 <a href="http://bioconda.github.io/recipes/rpbp/README.html"><img alt="Install with bioconda" src="https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat"></a>
 <a href="https://pypi.org/project/rpbp/"><img alt="PyPI" src="https://img.shields.io/pypi/v/rpbp"></a>
-<a href="https://github.com/dieterich-lab/rp-bp/actions/workflows/ci.yml/badge.svg"><img alt="CI" src="https://github.com/dieterich-lab/rp-bp/actions/workflows/ci.yml/badge.svg"></a>
+<a href="https://github.com/dieterich-lab/rp-bp/actions/workflows/ci.yml"><img alt="CI" src="https://github.com/dieterich-lab/rp-bp/actions/workflows/ci.yml/badge.svg"></a>
 <a href="https://rp-bp.readthedocs.io/en/latest/?badge=latest"><img alt="Docs" src="https://readthedocs.org/projects/rp-bp/badge/?version=latest"></a>
 </p>
 

diff --git a/docs/source/api-prepare.rst b/docs/source/api-prepare.rst
@@ -5,58 +5,3 @@ prepare-rpbp-genome
    :module: rpbp.reference_preprocessing.prepare_rpbp_genome
    :func: get_parser
    :prog: prepare-rpbp-genome
-
-   --star-options : @replace
-         A space-delimited list of options to pass to STAR. Each option is quoted separately as in "--starOption value", using soft quotes, where --starOption is the long parameter name from STAR, and value is the value given to this parameter. If specified, STAR options will override
-         default settings.
-
-   --num-cpus : @replace
-         The number of CPUs to use (not only for SLURM). The definition of a "CPU" varies somewhat among the programs. For example, for STAR, these are actually threads. For many of the python scripts, this number is translated into the number of processes to spawn. None of the code parallelizes across machines, so the value should not be greater than the number of cores on the machine on which the programs are executed. When used with SLURM, this will be translated into an sbatch request like: ``--ntasks 1 --cpus-per-task <num-cpus>``.
-
-   --mem : @replace
-         For STAR genome indexing, the amount of RAM to request. The rest of the programs do not use this value. When used with SLURM, this will be translated into an sbatch request like: ``--mem=<mem>``.
-
-   --time : @replace
-         The amount of time to request. This will be translated into an sbatch request like: ``--time <time>``.
-
-   --partitions : @replace
-         The partitions to request. This will be translated into an sbatch request like: ``-p <partitions>``.
-
-   --no-output : @replace
-         Redirect stdout to /dev/null. This will be translated into an sbatch request like: ``--output=/dev/null``. By default, stdout is redirected to a log file with the job number ``--output=slurm-%J.out``.
-
-   --no-error : @replace
-         Redirect stderr to /dev/null. This will be translated into an sbatch request like: ``--output=/dev/null``. By default, stderr is redirected to a log file with the job number ``--output=slurm-%J.err``.
-
-   --stdout-file : @replace
-         Log file (stdout) if not ``--no-output``. This corresponds to ``--output=stdout-file`` in the sbatch call.
-
-   --stderr-file : @replace
-         Log file (stderr) if not ``--no-error``. This corresponds to ``--error=stderr-file`` in the sbatch call.
-
-   --do-not-call : @replace
-         If this flag is present, then the program will not be executed (dry run).
-
-   --log-file : @replace
-         Log file (logging will be redirected to this file, in addition to stdout and stderr, if specified).
-
-   --log-stdout : @replace
-         Log to stdout (in addition to a file and stderr, if specified).
-
-   --no-log-stderr : @replace
-         By default, logging is redirected to stderr (in addition to a file and stdout, if specified). If this flag is present, then no logging will be written to stderr.
-
-   --enable-ext-logging : @replace
-         Enable logging for external programs that may be disabled by default, *e.g.* CmdStanPy.
-
-   --logging-level : @replace
-         Logging level for all logs.
-
-   --file-logging-level : @replace
-         Logging level for the log file. This option overrides ``--logging-level``.
-
-   --stdout-logging-level : @replace
-         Logging level for stdout. This option overrides ``--logging-level``.
-
-   --stderr-logging-level : @replace
-         Logging level for stderr. This option overrides ``--logging-level``.
diff --git a/docs/source/api-rpbp.rst b/docs/source/api-rpbp.rst
@@ -5,61 +5,3 @@ run-all-rpbp-instances
    :module: rpbp.run_all_rpbp_instances
    :func: get_parser
    :prog: run-all-rpbp-instances
-
-   --flexbar-options : @replace
-         A space-delimited list of options to pass to Flexbar. Each option is quoted separately as in "--flexbarOption value", using soft quotes, where --flexbarOption is the long parameter name from Flexbar, and value is the value given to this parameter. If specified, Flexbar options will override default settings.
-
-   --star-options : @replace
-         A space-delimited list of options to pass to STAR. Each option is quoted separately as in "--starOption value", using soft quotes, where --starOption is the long parameter name from STAR, and value is the value given to this parameter. If specified, STAR options will override
-         default settings.
-
-   --num-cpus : @replace
-         The number of CPUs to use (not only for SLURM). The definition of a "CPU" varies somewhat among the programs. For example, for STAR, these are actually threads. For many of the python scripts, this number is translated into the number of processes to spawn. None of the code parallelizes across machines, so the value should not be greater than the number of cores on the machine on which the programs are executed. When used with SLURM, this will be translated into an sbatch request like: ``--ntasks 1 --cpus-per-task <num-cpus>``.
-
-   --mem : @replace
-         For STAR genome indexing, the amount of RAM to request. The rest of the programs do not use this value. When used with SLURM, this will be translated into an sbatch request like: ``--mem=<mem>``.
-
-   --time : @replace
-         The amount of time to request. This will be translated into an sbatch request like: ``--time <time>``.
-
-   --partitions : @replace
-         The partitions to request. This will be translated into an sbatch request like: ``-p <partitions>``.
-
-   --no-output : @replace
-         Redirect stdout to /dev/null. This will be translated into an sbatch request like: ``--output=/dev/null``. By default, stdout is redirected to a log file with the job number ``--output=slurm-%J.out``.
-
-   --no-error : @replace
-         Redirect stderr to /dev/null. This will be translated into an sbatch request like: ``--output=/dev/null``. By default, stderr is redirected to a log file with the job number ``--output=slurm-%J.err``.
-
-   --stdout-file : @replace
-         Log file (stdout) if not ``--no-output``. This corresponds to ``--output=stdout-file`` in the sbatch call.
-
-   --stderr-file : @replace
-         Log file (stderr) if not ``--no-error``. This corresponds to ``--error=stderr-file`` in the sbatch call.
-
-   --do-not-call : @replace
-         If this flag is present, then the program will not be executed (dry run).
-
-   --log-file : @replace
-         Log file (logging will be redirected to this file, in addition to stdout and stderr, if specified).
-
-   --log-stdout : @replace
-         Log to stdout (in addition to a file and stderr, if specified).
-
-   --no-log-stderr : @replace
-         By default, logging is redirected to stderr (in addition to a file and stdout, if specified). If this flag is present, then no logging will be written to stderr.
-
-   --enable-ext-logging : @replace
-         Enable logging for external programs that may be disabled by default, *e.g.* CmdStanPy.
-
-   --logging-level : @replace
-         Logging level for all logs.
-
-   --file-logging-level : @replace
-         Logging level for the log file. This option overrides ``--logging-level``.
-
-   --stdout-logging-level : @replace
-         Logging level for stdout. This option overrides ``--logging-level``.
-
-   --stderr-logging-level : @replace
-         Logging level for stderr. This option overrides ``--logging-level``.
diff --git a/docs/source/api-summarize-profiles.rst b/docs/source/api-summarize-profiles.rst
@@ -5,27 +5,3 @@ summarize-rpbp-profile-construction
    :module: rpbp.analysis.profile_construction.summarize_rpbp_profile_construction
    :func: get_parser
    :prog: summarize-rpbp-profile-construction
-
-   --log-file : @replace
-         Log file (logging will be redirected to this file, in addition to stdout and stderr, if specified).
-
-   --log-stdout : @replace
-         Log to stdout (in addition to a file and stderr, if specified).
-
-   --no-log-stderr : @replace
-         By default, logging is redirected to stderr (in addition to a file and stdout, if specified). If this flag is present, then no logging will be written to stderr.
-
-   --enable-ext-logging : @replace
-         Enable logging for external programs that may be disabled by default, *e.g.* CmdStanPy.
-
-   --logging-level : @replace
-         Logging level for all logs.
-
-   --file-logging-level : @replace
-         Logging level for the log file. This option overrides ``--logging-level``.
-
-   --stdout-logging-level : @replace
-         Logging level for stdout. This option overrides ``--logging-level``.
-
-   --stderr-logging-level : @replace
-         Logging level for stderr. This option overrides ``--logging-level``.
diff --git a/docs/source/api-summarize-rpbp.rst b/docs/source/api-summarize-rpbp.rst
@@ -5,27 +5,3 @@ summarize-rpbp-predictions
    :module: rpbp.analysis.rpbp_predictions.summarize_rpbp_predictions
    :func: get_parser
    :prog: summarize-rpbp-predictions
-
-   --log-file : @replace
-         Log file (logging will be redirected to this file, in addition to stdout and stderr, if specified).
-
-   --log-stdout : @replace
-         Log to stdout (in addition to a file and stderr, if specified).
-
-   --no-log-stderr : @replace
-         By default, logging is redirected to stderr (in addition to a file and stdout, if specified). If this flag is present, then no logging will be written to stderr.
-
-   --enable-ext-logging : @replace
-         Enable logging for external programs that may be disabled by default, *e.g.* CmdStanPy.
-
-   --logging-level : @replace
-         Logging level for all logs.
-
-   --file-logging-level : @replace
-         Logging level for the log file. This option overrides ``--logging-level``.
-
-   --stdout-logging-level : @replace
-         Logging level for stdout. This option overrides ``--logging-level``.
-
-   --stderr-logging-level : @replace
-         Logging level for stderr. This option overrides ``--logging-level``.
diff --git a/docs/source/user-guide.rst b/docs/source/user-guide.rst
@@ -82,7 +82,7 @@ Required input
 Reference annotations
 """""""""""""""""""""
 
-The reference annotations (format GTF2), with *exons* and *CDS* features (*start_codon* and *stop_codon* features are not required). The attribute field must include *transcript_id*, *transcript_biotype*, *gene_id*, *gene_name*, and *gene_biotype*. The annotations must match the version of the reference genome sequence.
+The reference annotations (format GTF2), with *transcript*, *exon*, and *CDS* features (*start_codon* and *stop_codon* features are not required). The attribute field must include *transcript_id* and *gene_id*, and optionally *transcript_biotype*, *gene_name*, and *gene_biotype*. The annotations must match the version of the reference genome sequence.
 
 .. caution::
 
@@ -127,16 +127,16 @@ The base path for the following files is: *<genome_base_path>/transcript-index*
 
 * *<genome_name>.transcripts.annotated.fa* A FASTA file with the annotated transcript sequences.
 
-* *<genome_name>.orfs-genomic.annotated[.orf_note].bed.gz*. A BED12+ with the ORFs extracted from all transcripts. The ORFs are numbered, and their length is also reported. The ORF ids are of the form: *transcript_seqname:start-end:strand*. The start codon is included, but the stop codon is not.
+* *<genome_name>.orfs-genomic[.orf_note].bed.gz*. A BED12+ with the ORFs extracted from all transcripts. The ORFs are numbered, and their length is also reported. The ORF ids are of the form: *transcript_seqname:start-end:strand*. The start codon is included, but the stop codon is not.
 
-* *<genome_name>.orfs-exons.annotated[.orf_note].bed.gz*. A BED6+ file with the ORF exons. The extra columns are *exon_index*, giving the order of the exon in the transcript, and *transcript_start*, giving the start position of that index in transcript coordinates.
+* *<genome_name>.orfs-exons[.orf_note].bed.gz*. A BED6+ file with the ORF exons. The extra columns are *exon_index*, giving the order of the exon in the transcript, and *transcript_start*, giving the start position of that index in transcript coordinates.
 
-* *<genome_name>.orfs-labels.annotated[.orf_note].tab.gz*. A TAB-delimited file with ORF categories and all compatible transcripts. See `More about prepare-rpbp-genome`_ to learn about ORF categories or labels.
+* *<genome_name>.orfs-labels[.orf_note].tab.gz*. A TAB-delimited file with ORF categories and all compatible transcripts. See `More about prepare-rpbp-genome`_ to learn about ORF categories or labels.
 
 
 .. note::
 
-    If a ``de_novo_gtf`` file is provided, additional output files are created using the same convention as described above, with the addition of a *<de-novo>* flag. In this case, the files used by the pipeline are the "concatenation" of the respective *annotated* and *de-novo* files; otherwise, they are symlink to the respective *annotated* files.
+    If a ``de_novo_gtf`` file is provided, intermediate output files are split into *annotated* and *de-novo*. The files used by the pipeline, as described above, are the "concatenation" of the respective *annotated* and *de-novo* files. In addition, a GTF file is created by concatenating ``gtf`` and ``de_novo_gtf``. This new GTF file is written to ``genome_base_path``. Be careful not to overwrite any existing GTF file there!
 
 
 .. _running_rpbp:
@@ -251,14 +251,13 @@ The base path for the following files is: *<riboseq_data>/without-rrna*
 
 The base path for the following files is: *<riboseq_data>/without-rrna-mapping*
 
-* *<sample_name>[.note].Aligned.sortedByCoord.out.bam* A sorted BAM file with genome alignments.
-* *<sample_name>[.note].bam* A symlink to *Aligned.sortedByCoord.out.bam*
+* *<sample_name>[.note].bam* A sorted BAM file with genome alignments (the *Aligned.sortedByCoord.out.bam* STAR output).
 * *<sample_name>[.note]-unique.bam* A sorted BAM file with unique alignments (multimapping reads removed).
 
 
 .. note::
 
-    If the ``keep_riboseq_multimappers`` configuration option is given, then there will be no *-unique* files. In general, we do not recommend to keep multimappers.
+    If ``keep_riboseq_multimappers`` is ``True`` in the configuration file, then there will be no *-unique* files. In general, we do not recommend keeping multimappers.
 
 
 The base path for the following files is: *<riboseq_data>/metagene-profiles*
@@ -365,7 +364,7 @@ STAR
 Rp-Bp parameters
 ^^^^^^^^^^^^^^^^
 
-* ``keep_riboseq_multimappers`` If this key is present in the configuration file with any value (even something like "no" or "null" or "false"), then multimapping riboseq reads *will not* be removed. They will be treated as "normal" reads in every place they map, *i.e.* the weight of the read will not be distributed fractionally, probabilistically, *etc.* We do not in general recommend to use this option.
+* ``keep_riboseq_multimappers`` If ``True`` in the configuration file, then multimapping riboseq reads *will not* be removed. They will be treated as "normal" reads in every place they map, *i.e.* the weight of the read will not be distributed fractionally, probabilistically, *etc.* In general, we do not recommend using this option.
 * ``models_base`` The path to the compiled models, if installed in a different location. The models are included with the source distribution and compiled as part of the installation. *Do not change this, unless you know what you are doing!*
 
 
@@ -401,9 +400,9 @@ Metagene and periodicity estimation parameters
 Fixed lengths and offsets
 """""""""""""""""""""""""
 
-* ``use_fixed_lengths`` If this variable is present in the config file with any value (even something like "no" or "null" or "false"), fixed values given by ``lengths`` and ``offsets`` are used (no periodicity estimation).
-* ``lengths`` A list of read lengths to use for creating the profiles if the ``use_fixed_lengths`` option is given. Presumably, these are lengths that have periodic metagene profiles.
-* ``offsets``  The P-site offset to use for each read length specifed by ``lengths`` if the ``use_fixed_lengths`` option is given. The number of offsets must match the number of lengths, and they are assumed to match. For example ``lengths``:  [26, 29] with ``offsets``: [9, 12] means only reads of lengths 26 bp and 29 bp are used to create the profiles. The 26 bp reads will be shifted by 9 bp in the 5' direction, while reads of length 29 bp will be shifted by 12 bp.
+* ``use_fixed_lengths`` If ``True`` in the configuration file, fixed values given by ``lengths`` and ``offsets`` are used (no periodicity estimation).
+* ``lengths`` A list of read lengths to use for creating the profiles if the ``use_fixed_lengths`` option is ``True``. Presumably, these are lengths that have periodic metagene profiles.
+* ``offsets`` The P-site offset to use for each read length specified by ``lengths`` if the ``use_fixed_lengths`` option is ``True``. The number of offsets must match the number of lengths, and they are paired by position (the first offset applies to the first length, and so on). For example, ``lengths``: [26, 29] with ``offsets``: [9, 12] means only reads of lengths 26 bp and 29 bp are used to create the profiles. The 26 bp reads will be shifted by 9 bp in the 5' direction, while reads of length 29 bp will be shifted by 12 bp.
 
 
 Smoothing parameters

diff --git a/environment.yml b/environment.yml
@@ -16,7 +16,7 @@ dependencies:
   - joblib
   - numpy
   - pandas
-  - pbiotools>=4.0.0
+  - pbiotools>=4.0.1
   - pyyaml
   - samtools
   - scipy

diff --git a/pyproject.toml b/pyproject.toml
@@ -34,7 +34,7 @@ classifiers = [
 dynamic = ["version"]
 requires-python = ">=3.7,<3.11"
 dependencies = [
-  "pbiotools>=4.0.0",
+  "pbiotools>=4.0.1",
   "appdirs",
   "biopython",
   "cmdstanpy",

diff --git a/src/rpbp/__init__.py b/src/rpbp/__init__.py
@@ -1,2 +1,2 @@
-__version_info__ = ("3", "0", "0")
+__version_info__ = ("3", "0", "1")
 __version__ = ".".join(__version_info__)
diff --git a/src/rpbp/analysis/profile_construction/collect_read_length_orf_profiles.py b/src/rpbp/analysis/profile_construction/collect_read_length_orf_profiles.py
@@ -66,7 +66,7 @@ def main():
     config = yaml.load(open(args.config), Loader=yaml.FullLoader)
 
     # pull out what we need from the config file
-    is_unique = not ("keep_riboseq_multimappers" in config)
+    is_unique = not config.get("keep_riboseq_multimappers", False)
     note = config.get("note", None)
 
     if args.add_ids:

diff --git a/src/rpbp/analysis/profile_construction/create_read_length_orf_profiles.py b/src/rpbp/analysis/profile_construction/create_read_length_orf_profiles.py
@@ -70,7 +70,7 @@ def main():
     config = yaml.load(open(args.config), Loader=yaml.FullLoader)
 
     # pull out what we need from the config file
-    is_unique = not ("keep_riboseq_multimappers" in config)
+    is_unique = not config.get("keep_riboseq_multimappers", False)
     seqname_str = utils.get_config_argument(config, "seqname_prefix")
     note = config.get("note", None)
     orf_note = config.get("orf_note", None)
@@ -80,7 +80,7 @@ def main():
     )
 
     exons = filenames.get_exons(
-        config["genome_base_path"], config["genome_name"], note=orf_note, is_orf=True
+        config["genome_base_path"], config["genome_name"], note=orf_note
     )
 
     # make sure the necessary files exist
-Original file line number
+Diff line change
@@ Expand Up / @@ -16,7 +16,7 @@ dependencies: @@
       - joblib
       - numpy
       - pandas
-      - pbiotools>=4.0.0
+      - pbiotools>=4.0.1
       - pyyaml
       - samtools
       - scipy
@@ Expand Down @@