diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6eecdfa..ecd9a00 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
## [Unreleased] - started 2023-02
+## [3.0.1] 2023-02-10
+
+### Changed
+
+- I/O: ensure bool in config, remove symlinks
+- Documentation
+
+### Fixed
+
+- Missing GTF attributes
+
## [3.0.0] 2023-02-06
### Changed
diff --git a/README.md b/README.md
index ad5f2f9..61b0f64 100644
--- a/README.md
+++ b/README.md
@@ -5,13 +5,13 @@ Ribosome profiling (Ribo-seq) is an RNA-sequencing-based readout of RNA translat
**Rp-Bp** comes with two _interactive dashboards_ or _web applications_, one for read and periodicity quality control, the other to facilitate Ribo-seq ORFs discovery.
-
+
-
+
diff --git a/docs/source/api-prepare.rst b/docs/source/api-prepare.rst
index 24ccb48..7fad3c2 100644
--- a/docs/source/api-prepare.rst
+++ b/docs/source/api-prepare.rst
@@ -5,58 +5,3 @@ prepare-rpbp-genome
:module: rpbp.reference_preprocessing.prepare_rpbp_genome
:func: get_parser
:prog: prepare-rpbp-genome
-
- --star-options : @replace
- A space-delimited list of options to pass to STAR. Each option is quoted separately as in "--starOption value", using soft quotes, where --starOption is the long parameter name from STAR, and value is the value given to this parameter. If specified, STAR options will override
- default settings.
-
- --num-cpus : @replace
- The number of CPUs to use (not only for SLURM). The definition of a "CPU" varies somewhat among the programs. For example, for STAR, these are actually threads. For many of the python scripts, this number is translated into the number of processes to spawn. None of the code parallelizes across machines, so the value should not be greater than the number of cores on the machine on which the programs are executed. When used with SLURM, this will be translated into an sbatch request like: ``--ntasks 1 --cpus-per-task ``.
-
- --mem : @replace
- For STAR genome indexing, the amount of RAM to request. The rest of the programs do not use this value. When used with SLURM, this will be translated into an sbatch request like: ``--mem=``.
-
- --time : @replace
- The amount of time to request. This will be translated into an sbatch request like: ``--time ``.
-
- --partitions : @replace
- The partitions to request. This will be translated into an sbatch request like: ``-p ``.
-
- --no-output : @replace
- Redirect stdout to /dev/null. This will be translated into an sbatch request like: ``--output=/dev/null``. By default, stdout is redirected to a log file with the job number ``--output=slurm-%J.out``.
-
- --no-error : @replace
- Redirect stderr to /dev/null. This will be translated into an sbatch request like: ``--output=/dev/null``. By default, stderr is redirected to a log file with the job number ``--output=slurm-%J.err``.
-
- --stdout-file : @replace
- Log file (stdout) if not ``--no-output``. This corresponds to ``--output=stdout-file`` in the sbatch call.
-
- --stderr-file : @replace
- Log file (stderr) if not ``--no-error``. This corresponds to ``--error=stderr-file`` in the sbatch call.
-
- --do-not-call : @replace
- If this flag is present, then the program will not be executed (dry run).
-
- --log-file : @replace
- Log file (logging will be redirected to this file, in addition to stdout and stderr, if specified).
-
- --log-stdout : @replace
- Log to stdout (in addition to a file and stderr, if specified).
-
- --no-log-stderr : @replace
- By default, logging is redirected to stderr (in addition to a file and stdout, if specified). If this flag is present, then no logging will be written to stderr.
-
- --enable-ext-logging : @replace
- Enable logging for external programs that may be disabled by default, *e.g.* CmdStanPy.
-
- --logging-level : @replace
- Logging level for all logs.
-
- --file-logging-level : @replace
- Logging level for the log file. This option overrides ``--logging-level``.
-
- --stdout-logging-level : @replace
- Logging level for stdout. This option overrides ``--logging-level``.
-
- --stderr-logging-level : @replace
- Logging level for stderr. This option overrides ``--logging-level``.
diff --git a/docs/source/api-rpbp.rst b/docs/source/api-rpbp.rst
index 327533c..900fc4b 100644
--- a/docs/source/api-rpbp.rst
+++ b/docs/source/api-rpbp.rst
@@ -5,61 +5,3 @@ run-all-rpbp-instances
:module: rpbp.run_all_rpbp_instances
:func: get_parser
:prog: run-all-rpbp-instances
-
- --flexbar-options : @replace
- A space-delimited list of options to pass to Flexbar. Each option is quoted separately as in "--flexbarOption value", using soft quotes, where --flexbarOption is the long parameter name from Flexbar, and value is the value given to this parameter. If specified, Flexbar options will override default settings.
-
- --star-options : @replace
- A space-delimited list of options to pass to STAR. Each option is quoted separately as in "--starOption value", using soft quotes, where --starOption is the long parameter name from STAR, and value is the value given to this parameter. If specified, STAR options will override
- default settings.
-
- --num-cpus : @replace
- The number of CPUs to use (not only for SLURM). The definition of a "CPU" varies somewhat among the programs. For example, for STAR, these are actually threads. For many of the python scripts, this number is translated into the number of processes to spawn. None of the code parallelizes across machines, so the value should not be greater than the number of cores on the machine on which the programs are executed. When used with SLURM, this will be translated into an sbatch request like: ``--ntasks 1 --cpus-per-task ``.
-
- --mem : @replace
- For STAR genome indexing, the amount of RAM to request. The rest of the programs do not use this value. When used with SLURM, this will be translated into an sbatch request like: ``--mem=``.
-
- --time : @replace
- The amount of time to request. This will be translated into an sbatch request like: ``--time ``.
-
- --partitions : @replace
- The partitions to request. This will be translated into an sbatch request like: ``-p ``.
-
- --no-output : @replace
- Redirect stdout to /dev/null. This will be translated into an sbatch request like: ``--output=/dev/null``. By default, stdout is redirected to a log file with the job number ``--output=slurm-%J.out``.
-
- --no-error : @replace
- Redirect stderr to /dev/null. This will be translated into an sbatch request like: ``--output=/dev/null``. By default, stderr is redirected to a log file with the job number ``--output=slurm-%J.err``.
-
- --stdout-file : @replace
- Log file (stdout) if not ``--no-output``. This corresponds to ``--output=stdout-file`` in the sbatch call.
-
- --stderr-file : @replace
- Log file (stderr) if not ``--no-error``. This corresponds to ``--error=stderr-file`` in the sbatch call.
-
- --do-not-call : @replace
- If this flag is present, then the program will not be executed (dry run).
-
- --log-file : @replace
- Log file (logging will be redirected to this file, in addition to stdout and stderr, if specified).
-
- --log-stdout : @replace
- Log to stdout (in addition to a file and stderr, if specified).
-
- --no-log-stderr : @replace
- By default, logging is redirected to stderr (in addition to a file and stdout, if specified). If this flag is present, then no logging will be written to stderr.
-
- --enable-ext-logging : @replace
- Enable logging for external programs that may be disabled by default, *e.g.* CmdStanPy.
-
- --logging-level : @replace
- Logging level for all logs.
-
- --file-logging-level : @replace
- Logging level for the log file. This option overrides ``--logging-level``.
-
- --stdout-logging-level : @replace
- Logging level for stdout. This option overrides ``--logging-level``.
-
- --stderr-logging-level : @replace
- Logging level for stderr. This option overrides ``--logging-level``.
diff --git a/docs/source/api-summarize-profiles.rst b/docs/source/api-summarize-profiles.rst
index a07a6ed..69e6900 100644
--- a/docs/source/api-summarize-profiles.rst
+++ b/docs/source/api-summarize-profiles.rst
@@ -5,27 +5,3 @@ summarize-rpbp-profile-construction
:module: rpbp.analysis.profile_construction.summarize_rpbp_profile_construction
:func: get_parser
:prog: summarize-rpbp-profile-construction
-
- --log-file : @replace
- Log file (logging will be redirected to this file, in addition to stdout and stderr, if specified).
-
- --log-stdout : @replace
- Log to stdout (in addition to a file and stderr, if specified).
-
- --no-log-stderr : @replace
- By default, logging is redirected to stderr (in addition to a file and stdout, if specified). If this flag is present, then no logging will be written to stderr.
-
- --enable-ext-logging : @replace
- Enable logging for external programs that may be disabled by default, *e.g.* CmdStanPy.
-
- --logging-level : @replace
- Logging level for all logs.
-
- --file-logging-level : @replace
- Logging level for the log file. This option overrides ``--logging-level``.
-
- --stdout-logging-level : @replace
- Logging level for stdout. This option overrides ``--logging-level``.
-
- --stderr-logging-level : @replace
- Logging level for stderr. This option overrides ``--logging-level``.
diff --git a/docs/source/api-summarize-rpbp.rst b/docs/source/api-summarize-rpbp.rst
index d824e2c..028294e 100644
--- a/docs/source/api-summarize-rpbp.rst
+++ b/docs/source/api-summarize-rpbp.rst
@@ -5,27 +5,3 @@ summarize-rpbp-predictions
:module: rpbp.analysis.rpbp_predictions.summarize_rpbp_predictions
:func: get_parser
:prog: summarize-rpbp-predictions
-
- --log-file : @replace
- Log file (logging will be redirected to this file, in addition to stdout and stderr, if specified).
-
- --log-stdout : @replace
- Log to stdout (in addition to a file and stderr, if specified).
-
- --no-log-stderr : @replace
- By default, logging is redirected to stderr (in addition to a file and stdout, if specified). If this flag is present, then no logging will be written to stderr.
-
- --enable-ext-logging : @replace
- Enable logging for external programs that may be disabled by default, *e.g.* CmdStanPy.
-
- --logging-level : @replace
- Logging level for all logs.
-
- --file-logging-level : @replace
- Logging level for the log file. This option overrides ``--logging-level``.
-
- --stdout-logging-level : @replace
- Logging level for stdout. This option overrides ``--logging-level``.
-
- --stderr-logging-level : @replace
- Logging level for stderr. This option overrides ``--logging-level``.
diff --git a/docs/source/user-guide.rst b/docs/source/user-guide.rst
index 38a2dbb..11b1899 100644
--- a/docs/source/user-guide.rst
+++ b/docs/source/user-guide.rst
@@ -82,7 +82,7 @@ Required input
Reference annotations
"""""""""""""""""""""
-The reference annotations (format GTF2), with *exons* and *CDS* features (*start_codon* and *stop_codon* features are not required). The attribute field must include *transcript_id*, *transcript_biotype*, *gene_id*, *gene_name*, and *gene_biotype*. The annotations must match the version of the reference genome sequence.
+The reference annotations (format GTF2), with *transcript*, *exon*, and *CDS* features (*start_codon* and *stop_codon* features are not required). The attribute field must include *transcript_id* and *gene_id*, and optionally *transcript_biotype*, *gene_name*, and *gene_biotype*. The annotations must match the version of the reference genome sequence.
.. caution::
@@ -127,16 +127,16 @@ The base path for the following files is: */transcript-index*
* *.transcripts.annotated.fa* A FASTA file with the annotated transcript sequences.
-* *.orfs-genomic.annotated[.orf_note].bed.gz*. A BED12+ with the ORFs extracted from all transcripts. The ORFs are numbered, and their length is also reported. The ORF ids are of the form: *transcript_seqname:start-end:strand*. The start codon is included, but the stop codon is not.
+* *.orfs-genomic[.orf_note].bed.gz*. A BED12+ with the ORFs extracted from all transcripts. The ORFs are numbered, and their length is also reported. The ORF ids are of the form: *transcript_seqname:start-end:strand*. The start codon is included, but the stop codon is not.
-* *.orfs-exons.annotated[.orf_note].bed.gz*. A BED6+ file with the ORF exons. The extra columns are *exon_index*, giving the order of the exon in the transcript, and *transcript_start*, giving the start position of that index in transcript coordinates.
+* *.orfs-exons[.orf_note].bed.gz*. A BED6+ file with the ORF exons. The extra columns are *exon_index*, giving the order of the exon in the transcript, and *transcript_start*, giving the start position of that index in transcript coordinates.
-* *.orfs-labels.annotated[.orf_note].tab.gz*. A TAB-delimited file with ORF categories and all compatible transcripts. See `More about prepare-rpbp-genome`_ to learn about ORF categories or labels.
+* *.orfs-labels[.orf_note].tab.gz*. A TAB-delimited file with ORF categories and all compatible transcripts. See `More about prepare-rpbp-genome`_ to learn about ORF categories or labels.
.. note::
- If a ``de_novo_gtf`` file is provided, additional output files are created using the same convention as described above, with the addition of a ** flag. In this case, the files used by the pipeline are the "concatenation" of the respective *annotated* and *de-novo* files; otherwise, they are symlink to the respective *annotated* files.
+ If a ``de_novo_gtf`` file is provided, intermediate output files are split into *annotated* and *de-novo*. The files used by the pipeline, as described above, are the "concatenation" of the respective *annotated* and *de-novo* files. In addition, a GTF file is created by concatenating ``gtf`` and ``de_novo_gtf``. This new GTF file is written to ``genome_base_path``. Be careful not to overwrite any existing GTF file there!
.. _running_rpbp:
@@ -251,14 +251,13 @@ The base path for the following files is: */without-rrna*
The base path for the following files is: */without-rrna-mapping*
-* *[.note].Aligned.sortedByCoord.out.bam* A sorted BAM file with genome alignments.
-* *[.note].bam* A symlink to *Aligned.sortedByCoord.out.bam*
+* *[.note].bam* A sorted BAM file with genome alignments (the *Aligned.sortedByCoord.out.bam* STAR output).
* *[.note]-unique.bam* A sorted BAM file with unique alignments (multimapping reads removed).
.. note::
- If the ``keep_riboseq_multimappers`` configuration option is given, then there will be no *-unique* files. In general, we do not recommend to keep multimappers.
+ If ``keep_riboseq_multimappers`` is ``True`` in the configuration file, then there will be no *-unique* files. In general, we do not recommend keeping multimappers.
The base path for the following files is: */metagene-profiles*
@@ -365,7 +364,7 @@ STAR
Rp-Bp parameters
^^^^^^^^^^^^^^^^
-* ``keep_riboseq_multimappers`` If this key is present in the configuration file with any value (even something like "no" or "null" or "false"), then multimapping riboseq reads *will not* be removed. They will be treated as "normal" reads in every place they map, *i.e.* the weight of the read will not be distributed fractionally, probabilistically, *etc.* We do not in general recommend to use this option.
+* ``keep_riboseq_multimappers`` If ``True`` in the configuration file, then multimapping riboseq reads *will not* be removed. They will be treated as "normal" reads in every place they map, *i.e.* the weight of the read will not be distributed fractionally, probabilistically, *etc.* In general, we do not recommend using this option.
* ``models_base`` The path to the compiled models, if installed in a different location. The models are included with the source distribution and compiled as part of the installation. *Do not change this, unless you know what you are doing!*
@@ -401,9 +400,9 @@ Metagene and periodicity estimation parameters
Fixed lengths and offsets
"""""""""""""""""""""""""
-* ``use_fixed_lengths`` If this variable is present in the config file with any value (even something like "no" or "null" or "false"), fixed values given by ``lengths`` and ``offsets`` are used (no periodicity estimation).
-* ``lengths`` A list of read lengths to use for creating the profiles if the ``use_fixed_lengths`` option is given. Presumably, these are lengths that have periodic metagene profiles.
-* ``offsets`` The P-site offset to use for each read length specifed by ``lengths`` if the ``use_fixed_lengths`` option is given. The number of offsets must match the number of lengths, and they are assumed to match. For example ``lengths``: [26, 29] with ``offsets``: [9, 12] means only reads of lengths 26 bp and 29 bp are used to create the profiles. The 26 bp reads will be shifted by 9 bp in the 5' direction, while reads of length 29 bp will be shifted by 12 bp.
+* ``use_fixed_lengths`` If ``True`` in the configuration file, fixed values given by ``lengths`` and ``offsets`` are used (no periodicity estimation).
+* ``lengths`` A list of read lengths to use for creating the profiles if the ``use_fixed_lengths`` option is ``True``. Presumably, these are lengths that have periodic metagene profiles.
+* ``offsets`` The P-site offset to use for each read length specified by ``lengths`` if the ``use_fixed_lengths`` option is ``True``. The number of offsets must match the number of lengths, and they are assumed to match. For example ``lengths``: [26, 29] with ``offsets``: [9, 12] means only reads of lengths 26 bp and 29 bp are used to create the profiles. The 26 bp reads will be shifted by 9 bp in the 5' direction, while reads of length 29 bp will be shifted by 12 bp.
Smoothing parameters
diff --git a/environment.yml b/environment.yml
index 4adf849..1f18d10 100644
--- a/environment.yml
+++ b/environment.yml
@@ -16,7 +16,7 @@ dependencies:
- joblib
- numpy
- pandas
- - pbiotools>=4.0.0
+ - pbiotools>=4.0.1
- pyyaml
- samtools
- scipy
diff --git a/pyproject.toml b/pyproject.toml
index 407802e..4b24966 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,7 @@ classifiers = [
dynamic = ["version"]
requires-python = ">=3.7,<3.11"
dependencies = [
- "pbiotools>=4.0.0",
+ "pbiotools>=4.0.1",
"appdirs",
"biopython",
"cmdstanpy",
diff --git a/src/rpbp/__init__.py b/src/rpbp/__init__.py
index ec5b727..7d4f050 100644
--- a/src/rpbp/__init__.py
+++ b/src/rpbp/__init__.py
@@ -1,2 +1,2 @@
-__version_info__ = ("3", "0", "0")
+__version_info__ = ("3", "0", "1")
__version__ = ".".join(__version_info__)
diff --git a/src/rpbp/analysis/profile_construction/collect_read_length_orf_profiles.py b/src/rpbp/analysis/profile_construction/collect_read_length_orf_profiles.py
index 2c2c748..853cdc5 100644
--- a/src/rpbp/analysis/profile_construction/collect_read_length_orf_profiles.py
+++ b/src/rpbp/analysis/profile_construction/collect_read_length_orf_profiles.py
@@ -66,7 +66,7 @@ def main():
config = yaml.load(open(args.config), Loader=yaml.FullLoader)
# pull out what we need from the config file
- is_unique = not ("keep_riboseq_multimappers" in config)
+ is_unique = not config.get("keep_riboseq_multimappers", False)
note = config.get("note", None)
if args.add_ids:
diff --git a/src/rpbp/analysis/profile_construction/create_read_length_orf_profiles.py b/src/rpbp/analysis/profile_construction/create_read_length_orf_profiles.py
index 2602bd4..cd94246 100644
--- a/src/rpbp/analysis/profile_construction/create_read_length_orf_profiles.py
+++ b/src/rpbp/analysis/profile_construction/create_read_length_orf_profiles.py
@@ -70,7 +70,7 @@ def main():
config = yaml.load(open(args.config), Loader=yaml.FullLoader)
# pull out what we need from the config file
- is_unique = not ("keep_riboseq_multimappers" in config)
+ is_unique = not config.get("keep_riboseq_multimappers", False)
seqname_str = utils.get_config_argument(config, "seqname_prefix")
note = config.get("note", None)
orf_note = config.get("orf_note", None)
@@ -80,7 +80,7 @@ def main():
)
exons = filenames.get_exons(
- config["genome_base_path"], config["genome_name"], note=orf_note, is_orf=True
+ config["genome_base_path"], config["genome_name"], note=orf_note
)
# make sure the necessary files exist
diff --git a/src/rpbp/analysis/profile_construction/dashboard/rpbp_profile_construction_dashboard.py b/src/rpbp/analysis/profile_construction/dashboard/rpbp_profile_construction_dashboard.py
index 257c4ff..8648ef1 100644
--- a/src/rpbp/analysis/profile_construction/dashboard/rpbp_profile_construction_dashboard.py
+++ b/src/rpbp/analysis/profile_construction/dashboard/rpbp_profile_construction_dashboard.py
@@ -407,7 +407,7 @@ def fig_to_uri(in_fig, close_all=True, **save_args):
f"---"
)
-is_unique = not ("keep_riboseq_multimappers" in config)
+is_unique = not config.get("keep_riboseq_multimappers", False)
config_note = config.get("note", None)
# ribo_utils._return_key_dict
diff --git a/src/rpbp/analysis/profile_construction/get_all_read_filtering_counts.py b/src/rpbp/analysis/profile_construction/get_all_read_filtering_counts.py
index 89f89b5..07545d0 100644
--- a/src/rpbp/analysis/profile_construction/get_all_read_filtering_counts.py
+++ b/src/rpbp/analysis/profile_construction/get_all_read_filtering_counts.py
@@ -30,7 +30,7 @@ def get_counts(name_data, config, args):
note = config.get("note", None)
# keep multimappers?
- is_unique = not ("keep_riboseq_multimappers" in config)
+ is_unique = not config.get("keep_riboseq_multimappers", False)
raw_data = data
without_adapters = filenames.get_without_adapters_fastq(
diff --git a/src/rpbp/analysis/profile_construction/summarize_rpbp_profile_construction.py b/src/rpbp/analysis/profile_construction/summarize_rpbp_profile_construction.py
index e245a80..142d597 100644
--- a/src/rpbp/analysis/profile_construction/summarize_rpbp_profile_construction.py
+++ b/src/rpbp/analysis/profile_construction/summarize_rpbp_profile_construction.py
@@ -395,7 +395,7 @@ def main():
# nomenclature
project = config.get("project_name", "rpbp")
note = config.get("note", None)
- is_unique = not ("keep_riboseq_multimappers" in config)
+ is_unique = not config.get("keep_riboseq_multimappers", False)
msg = "Collecting all read filtering counts..."
logger.info(msg)
diff --git a/src/rpbp/analysis/rpbp_predictions/dashboard/rpbp_predictions_dashboard.py b/src/rpbp/analysis/rpbp_predictions/dashboard/rpbp_predictions_dashboard.py
index cfa53f6..06a0859 100644
--- a/src/rpbp/analysis/rpbp_predictions/dashboard/rpbp_predictions_dashboard.py
+++ b/src/rpbp/analysis/rpbp_predictions/dashboard/rpbp_predictions_dashboard.py
@@ -217,6 +217,8 @@ def filter_sort_table(filter_query, sort_by):
orfs[["PHASE I ORFs", "SS ORFs"]] = orfs[["PHASE I ORFs", "SS ORFs"]].fillna(
value="NA"
)
+# missing GTF fields - typically with de novo
+orfs[TABLE_FIELDS[4:]] = orfs[TABLE_FIELDS[4:]].fillna(value="NA")
display_table = pd.merge(display_table, orfs[TABLE_FIELDS], on="id", how="left")
display_table = display_table[TABLE_FIELDS + ["orf_info"]]
display_table.drop_duplicates(inplace=True)
diff --git a/src/rpbp/analysis/rpbp_predictions/summarize_rpbp_predictions.py b/src/rpbp/analysis/rpbp_predictions/summarize_rpbp_predictions.py
index 79d56a4..049326f 100644
--- a/src/rpbp/analysis/rpbp_predictions/summarize_rpbp_predictions.py
+++ b/src/rpbp/analysis/rpbp_predictions/summarize_rpbp_predictions.py
@@ -321,7 +321,7 @@ def _create_figures(name_pretty_name_is_sample, config, args):
name, pretty_name, is_sample = name_pretty_name_is_sample
- is_unique = not ("keep_riboseq_multimappers" in config)
+ is_unique = not config.get("keep_riboseq_multimappers", False)
note = config.get("note", None)
fraction = config.get("smoothing_fraction", None)
reweighting_iterations = config.get("smoothing_reweighting_iterations", None)
@@ -440,7 +440,7 @@ def get_parser():
parser.add_argument(
"--min-samples",
help="An ORF is filtered out if not predicted in at "
- "least [--min-samples] number of samples. By default "
+ "least ``--min-samples`` number of samples. By default "
"all ORFs are kept. This is ignored if merged replicates "
"are included in the output.",
type=int,
@@ -450,14 +450,14 @@ def get_parser():
parser.add_argument(
"-k",
"--keep-other",
- help="Include ORFs labeled as 'other', if present. "
+ help="Include ORFs labeled as *other*, if present. "
"They are discarded by default.",
action="store_true",
)
parser.add_argument(
"--no-replicates",
- help="If Rp-Bp was run with [--merge-replicates], "
+ help="If Rp-Bp was run with ``--merge-replicates``, "
"predictions from merged replicates are included by "
"default, unless this flag is present.",
required="--min-samples" in sys.argv,
@@ -466,17 +466,17 @@ def get_parser():
parser.add_argument(
"--use-unfiltered",
- help="Use the 'unfiltered' ORF predictions. "
- "Unless Rp-Bp was run with [--write-unfiltered], "
+ help="Use the *unfiltered* ORF predictions. "
+ "Unless Rp-Bp was run with ``--write-unfiltered``, "
"these will not be available. By default, the "
- "'filtered' predictions are used.",
+ "*filtered* predictions are used.",
action="store_true",
)
# display
parser.add_argument(
"--use-name-maps",
- help="Use 'riboseq_sample_name_map' and 'riboseq_condition_name_map' "
+ help="Use ``riboseq_sample_name_map`` and ``riboseq_condition_name_map`` "
"from the config. Do not use this flag when preparing results for "
"the dashboard, mapping is done in the app.",
action="store_true",
@@ -533,7 +533,7 @@ def get_parser():
parser.add_argument(
"--image-type",
- help="Format for [--show-orf-periodicity].",
+ help="Format for ``--show-orf-periodicity``.",
default="eps",
)
@@ -568,8 +568,6 @@ def main():
"riboseq_samples",
"genome_base_path",
"genome_name",
- "fasta",
- "gtf",
"star_index",
]
utils.check_keys_exist(config, required_keys)
@@ -581,7 +579,8 @@ def main():
# nomenclature
project = config.get("project_name", "rpbp")
note = config.get("note", None)
- is_unique = not ("keep_riboseq_multimappers" in config)
+ orf_note = config.get("orf_note", None)
+ is_unique = not config.get("keep_riboseq_multimappers", False)
# defaults are unused in file names
fraction = config.get("smoothing_fraction", None)
reweighting_iterations = config.get("smoothing_reweighting_iterations", None)
@@ -683,8 +682,21 @@ def main():
bed_df = bed_utils.read_bed(transcript_bed, low_memory=False)[cols]
bed_df.rename(columns={"id": "transcript_id"}, inplace=True)
+ # add info for de novo
+ if "de_novo_gtf" in config:
+ transcript_bed_dn = filenames.get_bed(
+ config["genome_base_path"],
+ config["genome_name"],
+ is_annotated=False,
+ is_de_novo=True,
+ )
+ cols = ["id", "biotype", "gene_id", "gene_name", "gene_biotype"]
+ bed_df_dn = bed_utils.read_bed(transcript_bed_dn, low_memory=False)[cols]
+ bed_df_dn.rename(columns={"id": "transcript_id"}, inplace=True)
+ bed_df = pd.concat([bed_df, bed_df_dn])
+
labeled_orfs = filenames.get_labels(
- config["genome_base_path"], config["genome_name"], note=note
+ config["genome_base_path"], config["genome_name"], note=orf_note
)
cols = ["id", "orf_type", "transcripts"]
labels_df = bed_utils.read_bed(labeled_orfs)[cols]
diff --git a/src/rpbp/orf_profile_construction/create_base_genome_profile.py b/src/rpbp/orf_profile_construction/create_base_genome_profile.py
index 6b47e54..8311426 100644
--- a/src/rpbp/orf_profile_construction/create_base_genome_profile.py
+++ b/src/rpbp/orf_profile_construction/create_base_genome_profile.py
@@ -5,12 +5,13 @@
"""
-import argparse
-import logging
import os
import sys
-
import yaml
+import logging
+import argparse
+
+from pathlib import Path
import pbiotools.utils.bam_utils as bam_utils
import pbiotools.utils.fastx_utils as fastx_utils
@@ -196,10 +197,15 @@ def main():
# Step 2: Running STAR to align rRNA-depleted reads to genome
+ # STAR standard output
star_output_prefix = filenames.get_riboseq_bam_base(
config["riboseq_data"], args.name, note=note
)
genome_star_bam = "{}{}".format(star_output_prefix, "Aligned.sortedByCoord.out.bam")
+ # Rp-Bp file name
+ genome_sorted_bam = filenames.get_riboseq_bam(
+ config["riboseq_data"], args.name, note=note
+ )
# get all options, command line options override defaults
@@ -213,11 +219,7 @@ def main():
star_option_str = pgrm_utils.get_final_args(star_options, args.star_options)
- gtf_file = filenames.get_gtf(
- config["genome_base_path"],
- config["genome_name"],
- is_star_input=True,
- )
+ gtf_file = filenames.get_gtf(config)
cmd = (
"{} --runThreadN {} --genomeDir {} --sjdbGTFfile {} --readFilesIn {} "
@@ -234,7 +236,9 @@ def main():
in_files = [without_rrna]
in_files.extend(pgrm_utils.get_star_index_files(config["star_index"]))
to_delete = [without_rrna]
- out_files = [genome_star_bam]
+ # run if any of the two doesn't exist, but only validate genome_star_bam
+ # as genome_sorted_bam normally doesn't exist yet
+ out_files = [genome_star_bam, genome_sorted_bam]
file_checkers = {genome_star_bam: bam_utils.check_bam_file}
shell_utils.call_if_not_exists(
cmd,
@@ -247,28 +251,16 @@ def main():
to_delete=to_delete,
)
- # now, we need to symlink the (genome) STAR output to that expected by the rest of the pipeline
- genome_sorted_bam = filenames.get_riboseq_bam(
- config["riboseq_data"], args.name, note=note
- )
-
- if os.path.exists(genome_star_bam):
- shell_utils.create_symlink(
- genome_star_bam, genome_sorted_bam, remove=args.overwrite, call=call
- )
- else:
- msg = (
- "Could not find the STAR genome bam alignment file. Unless "
- "[--do-not-call] was given, this is a problem!"
- )
- logger.critical(msg)
+ # rename STAR output to that expected by the pipeline
+ genome_star_bam = Path(genome_star_bam)
+ genome_star_bam.replace(genome_sorted_bam)
# create the bamtools index
cmd = "samtools index -b {}".format(genome_sorted_bam)
shell_utils.check_call(cmd, call=call)
# check if we want to keep multimappers
- if "keep_riboseq_multimappers" in config:
+ if config.get("keep_riboseq_multimappers", False):
return
# remove multimapping reads from the genome file
@@ -286,7 +278,7 @@ def main():
in_files = [genome_sorted_bam]
out_files = [unique_genome_filename]
- to_delete = [genome_star_bam, genome_sorted_bam]
+ to_delete = [genome_sorted_bam]
file_checkers = {unique_genome_filename: bam_utils.check_bam_file}
shell_utils.call_if_not_exists(
cmd,
diff --git a/src/rpbp/orf_profile_construction/create_orf_profiles.py b/src/rpbp/orf_profile_construction/create_orf_profiles.py
index 69fcb3f..46dcd56 100644
--- a/src/rpbp/orf_profile_construction/create_orf_profiles.py
+++ b/src/rpbp/orf_profile_construction/create_orf_profiles.py
@@ -119,7 +119,6 @@ def main():
required_keys = [
"riboseq_data",
"ribosomal_index",
- "gtf",
"genome_base_path",
"genome_name",
]
@@ -154,7 +153,7 @@ def main():
mem_str = "--mem {}".format(shlex.quote(args.mem))
# check if we want to keep multimappers
- is_unique = not ("keep_riboseq_multimappers" in config)
+ is_unique = not config.get("keep_riboseq_multimappers", False)
riboseq_raw_data = args.raw_data
riboseq_bam_filename = filenames.get_riboseq_bam(
diff --git a/src/rpbp/reference_preprocessing/prepare_rpbp_genome.py b/src/rpbp/reference_preprocessing/prepare_rpbp_genome.py
index ac6e476..258131e 100644
--- a/src/rpbp/reference_preprocessing/prepare_rpbp_genome.py
+++ b/src/rpbp/reference_preprocessing/prepare_rpbp_genome.py
@@ -11,8 +11,10 @@
import os
import sys
import yaml
-import argparse
import logging
+import argparse
+
+from pathlib import Path
import pbiotools.misc.logging_utils as logging_utils
import pbiotools.misc.shell_utils as shell_utils
@@ -307,12 +309,6 @@ def main():
config["genome_base_path"], config["genome_name"], note=config.get("orf_note")
)
- gtf_file = filenames.get_gtf(
- config["genome_base_path"],
- config["genome_name"],
- is_star_input=True,
- )
-
# now, check if we have a de novo assembly
if "de_novo_gtf" in config:
get_orfs(
@@ -394,9 +390,8 @@ def main():
msg = "Skipping concatenation due to --call value"
logger.info(msg)
- # we also need to concat the annotations to inform STAR
- # there is no particular reason to merge and sort the files, so
- # we just concatenate them...
+ # concat the annotations to inform STAR
+ gtf_file = filenames.get_gtf(config)
star_files_str = " ".join([config["gtf"], config["de_novo_gtf"]])
msg = "Concatenating files. Output file: {}; Input files: {}".format(
gtf_file, star_files_str
@@ -413,27 +408,15 @@ def main():
)
else:
- # if we do not have a de novo assembly, symlink the files
-
- if os.path.exists(annotated_orfs):
- shell_utils.create_symlink(
- annotated_orfs, orfs_genomic, remove=args.overwrite, call=call
- )
-
- if os.path.exists(annotated_exons_file):
- shell_utils.create_symlink(
- annotated_exons_file, exons_file, remove=args.overwrite, call=call
- )
-
- if os.path.exists(annotated_labeled_orfs):
- shell_utils.create_symlink(
- annotated_labeled_orfs, labeled_orfs, remove=args.overwrite, call=call
- )
-
- if os.path.exists(config["gtf"]):
- shell_utils.create_symlink(
- config["gtf"], gtf_file, remove=args.overwrite, call=call
- )
+ # if we do not have a de novo assembly, rename the files
+ annotated_orfs = Path(annotated_orfs)
+ annotated_orfs.replace(orfs_genomic)
+
+ annotated_exons_file = Path(annotated_exons_file)
+ annotated_exons_file.replace(exons_file)
+
+ annotated_labeled_orfs = Path(annotated_labeled_orfs)
+ annotated_labeled_orfs.replace(labeled_orfs)
if __name__ == "__main__":
diff --git a/src/rpbp/ribo_utils/filenames.py b/src/rpbp/ribo_utils/filenames.py
index 79fe159..8bf45b7 100644
--- a/src/rpbp/ribo_utils/filenames.py
+++ b/src/rpbp/ribo_utils/filenames.py
@@ -254,23 +254,20 @@ def get_riboseq_frame_counts(riboseq_base, name, **kwargs):
return loc
-### g
+# g
def get_gtf(
- base_path,
- name,
- is_de_novo=False,
- is_annotated=False,
- is_star_input=False,
+ config,
):
- c = get_annotated_string(is_annotated)
- d = get_de_novo_string(is_de_novo)
- s = get_star_input_string(is_star_input)
- ext = "gtf" # GTF only
- fn = "{}{}{}{}.{}".format(name, c, d, s, ext)
- return os.path.join(base_path, fn)
+ if "de_novo_gtf" in config:
+ base_path = config["genome_base_path"]
+ name = config["genome_name"]
+ fn = f"{name}.gtf"
+ return os.path.join(base_path, fn)
+ else:
+ return config["gtf"]
# m
diff --git a/src/rpbp/ribo_utils/utils.py b/src/rpbp/ribo_utils/utils.py
index ddf5c79..3513ced 100644
--- a/src/rpbp/ribo_utils/utils.py
+++ b/src/rpbp/ribo_utils/utils.py
@@ -200,7 +200,7 @@ def get_periodic_lengths_and_offsets(
import scipy.stats
# check if we specified to just use a fixed offset and length
- if "use_fixed_lengths" in config:
+ if config.get("use_fixed_lengths", False):
lengths = [str(l) for l in config["lengths"]]
offsets = [str(o) for o in config["offsets"]]
diff --git a/src/rpbp/run_all_rpbp_instances.py b/src/rpbp/run_all_rpbp_instances.py
index 8821953..192e9ed 100644
--- a/src/rpbp/run_all_rpbp_instances.py
+++ b/src/rpbp/run_all_rpbp_instances.py
@@ -38,15 +38,15 @@ def get_parser():
parser.add_argument(
"--merge-replicates",
- help="Predict Ribo-seq ORFs in merged profiles. If [--merge-replicates], "
- "then use [--run-replicates] to also predict Ribo-seq ORFs in all samples.",
+ help="Predict Ribo-seq ORFs in merged profiles. If ``--merge-replicates``, "
+ "then use ``--run-replicates`` to also predict Ribo-seq ORFs in all samples.",
action="store_true",
)
parser.add_argument(
"--run-replicates",
- help="With [--merge-replicates], predict Ribo-seq ORFs in all samples and "
- "in merged profiles. This has no effect without [--merge-replicates], i.e. "
+ help="With ``--merge-replicates``, predict Ribo-seq ORFs in all samples and "
+ "in merged profiles. This has no effect without ``--merge-replicates``, *i.e.* "
"predictions are made for all samples by default.",
action="store_true",
)
diff --git a/src/rpbp/translation_prediction/predict_translated_orfs.py b/src/rpbp/translation_prediction/predict_translated_orfs.py
index 2c6a305..f24645d 100644
--- a/src/rpbp/translation_prediction/predict_translated_orfs.py
+++ b/src/rpbp/translation_prediction/predict_translated_orfs.py
@@ -30,7 +30,7 @@ def get_profile(name, config, args):
from the given parameters.
"""
# keep multimappers?
- is_unique = not ("keep_riboseq_multimappers" in config)
+ is_unique = not config.get("keep_riboseq_multimappers", False)
# get the lengths and offsets which meet the required criteria from the config file
lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
@@ -157,7 +157,7 @@ def main():
logger.info(msg)
# keep multimappers?
- is_unique = not ("keep_riboseq_multimappers" in config)
+ is_unique = not config.get("keep_riboseq_multimappers", False)
# first, check if we are merging replicates
diff --git a/tests/regression/conftest.py b/tests/regression/conftest.py
index 3478c14..33a6679 100644
--- a/tests/regression/conftest.py
+++ b/tests/regression/conftest.py
@@ -140,19 +140,16 @@ def getf_genome(get_genome):
config["genome_base_path"],
config["genome_name"],
note=config.get("orf_note"),
- is_annotated=True,
),
"exons": filenames.get_exons(
config["genome_base_path"],
config["genome_name"],
note=config.get("orf_note"),
- is_annotated=True,
),
"labels": filenames.get_labels(
config["genome_base_path"],
config["genome_name"],
note=config.get("orf_note"),
- is_annotated=True,
),
}
@@ -164,13 +161,13 @@ def getf_genome(get_genome):
ref_config["genome_base_path"], ref_config["genome_name"], is_annotated=True
),
"orfs": filenames.get_orfs(
- ref_config["genome_base_path"], ref_config["genome_name"], is_annotated=True
+ ref_config["genome_base_path"], ref_config["genome_name"]
),
"exons": filenames.get_exons(
- ref_config["genome_base_path"], ref_config["genome_name"], is_annotated=True
+ ref_config["genome_base_path"], ref_config["genome_name"]
),
"labels": filenames.get_labels(
- ref_config["genome_base_path"], ref_config["genome_name"], is_annotated=True
+ ref_config["genome_base_path"], ref_config["genome_name"]
),
}