From 35d5194023ca51e71dd8a82cac200160ba416b43 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 14:26:49 +0200 Subject: [PATCH 01/19] bin\litscan-create-xml-metadata.py line-too-long Made a readable line shorter. --- bin/litscan-create-xml-metadata.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/litscan-create-xml-metadata.py b/bin/litscan-create-xml-metadata.py index 95e31285..04cf6755 100755 --- a/bin/litscan-create-xml-metadata.py +++ b/bin/litscan-create-xml-metadata.py @@ -79,7 +79,8 @@ def main(conn_string, filename, output): database = line[1] # get hit_count - cursor.execute("SELECT hit_count FROM litscan_job WHERE job_id='{0}'".format(job_id.lower())) + cursor.execute( + "SELECT hit_count FROM litscan_job WHERE job_id='{0}'".format(job_id.lower())) result = cursor.fetchone() hit_count = str(result[0]) if result else "" From cab32fa288a1668ae98da48b8f8530726225c7bd Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 14:42:53 +0200 Subject: [PATCH 02/19] rnacentral_pipeline\rnacentral\genome_mapping\blat.py line-too-long Made two lines shorter. One was not readable. --- rnacentral_pipeline/rnacentral/genome_mapping/blat.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rnacentral_pipeline/rnacentral/genome_mapping/blat.py b/rnacentral_pipeline/rnacentral/genome_mapping/blat.py index aa519d62..e5fd0fd9 100644 --- a/rnacentral_pipeline/rnacentral/genome_mapping/blat.py +++ b/rnacentral_pipeline/rnacentral/genome_mapping/blat.py @@ -40,7 +40,8 @@ "qBaseInsert", # Number of bases inserted in query "tNumInsert", # Number of inserts in target "tBaseInsert", # Number of bases inserted in target - "strand", # "+" or "-" for query strand. For translated alignments, second "+"or "-" is for target genomic strand. + "strand", # "+" or "-" for query strand. For translated alignments, + # second "+"or "-" is for target genomic strand. 
"qName", # Query sequence name "qSize", # Query sequence size. "qStart", # Alignment start position in query @@ -50,7 +51,10 @@ "tStart", # Alignment start position in target "tEnd", # Alignment end position in target "blockCount", # Number of blocks in the alignment (a block contains no gaps) - "blockSizes", # Comma-separated list of sizes of each block. If the query is a protein and the target the genome, blockSizes are in amino acids. See below for more information on protein query PSLs. + "blockSizes", # Comma-separated list of sizes of each block. + # If the query is a protein and the target the genome, + # blockSizes are in amino acids. + # See below for more information on protein query PSLs. "qStarts", # Comma-separated list of starting positions of each block in query "tStarts", # Comma-separated list of starting positions of each block in target ] From 8689b59e72be192971cc3776095cb223be5eb677 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 14:50:33 +0200 Subject: [PATCH 03/19] rnacentral_pipeline\utils.py line-too-long Made a readable line shorter --- rnacentral_pipeline/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rnacentral_pipeline/utils.py b/rnacentral_pipeline/utils.py index 7e1c299a..a98a20f5 100644 --- a/rnacentral_pipeline/utils.py +++ b/rnacentral_pipeline/utils.py @@ -37,7 +37,8 @@ def __init__(self, co): self.done = False self.result = None self.lock = threading.RLock() - ## This needs to be a re-rntrant lock so it is only release by the coroutine that acquired it + ## This needs to be a re-rntrant lock so it is only + ## release by the coroutine that acquired it def __await__(self): with self.lock: From 51a100dd461aeccea49933d93c30afda1ec26963 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 14:54:45 +0200 Subject: [PATCH 04/19] rnacentral_pipeline\databases\evlncrnas\lookup.py line-too-long Made unreadable commented out line shorter. 
--- rnacentral_pipeline/databases/evlncrnas/lookup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rnacentral_pipeline/databases/evlncrnas/lookup.py b/rnacentral_pipeline/databases/evlncrnas/lookup.py index a98927ce..c065b969 100644 --- a/rnacentral_pipeline/databases/evlncrnas/lookup.py +++ b/rnacentral_pipeline/databases/evlncrnas/lookup.py @@ -105,7 +105,8 @@ def mapping(db_url, data): def as_mapping(db_url, data): - # data = data.explode('Aliases').drop_duplicates(subset='Aliases').rename(columns={'Aliases':'external_id'})#.set_index('external_id') + # data = data.explode('Aliases').drop_duplicates(subset='Aliases').rename( + # columns={'Aliases':'external_id'})#.set_index('external_id') print(len(data)) data = data.drop(data[data["Name"] == " "].index) print(data) From 8a05dd2b3f98d8fdc1da22b088eb3f15c3e0e925 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 14:57:56 +0200 Subject: [PATCH 05/19] rnacentral_pipeline\databases\evlncrnas\parser.py line-too-long Made a readable line shorter --- rnacentral_pipeline/databases/evlncrnas/parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rnacentral_pipeline/databases/evlncrnas/parser.py b/rnacentral_pipeline/databases/evlncrnas/parser.py index 8f06b32d..05d5782e 100644 --- a/rnacentral_pipeline/databases/evlncrnas/parser.py +++ b/rnacentral_pipeline/databases/evlncrnas/parser.py @@ -270,7 +270,8 @@ def parse(db_dir: Path, db_dumps: tuple[Path], db_url: str) -> None: ) # ## Match with RNAcentral based on the gene name - ## This is optionally chunked to save memory - split the lookup file and provide a list on the commandline + ## This is optionally chunked to save memory - + ## split the lookup file and provide a list on the commandline matched_frame = pd.concat( [get_db_matches(no_accession_frame, dump_chunk) for dump_chunk in db_dumps] ) From 4edfd129893fddc9b78d521cd13dfb24b5ad4ee8 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 
15:01:53 +0200 Subject: [PATCH 06/19] rnacentral_pipeline\databases\mgnify\prepare.py line-too-long Made a readable line shorter --- rnacentral_pipeline/databases/mgnify/prepare.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rnacentral_pipeline/databases/mgnify/prepare.py b/rnacentral_pipeline/databases/mgnify/prepare.py index a09d2091..4cb5b2b8 100644 --- a/rnacentral_pipeline/databases/mgnify/prepare.py +++ b/rnacentral_pipeline/databases/mgnify/prepare.py @@ -37,7 +37,8 @@ def prepare_mgnify_data(data, conn_str): ## Define fallback taxids of the general metagenome of the environment ## These are used if we can't do any better fallback = { - "zebrafish fecal genome catalogue": 1331678, # zebrafish metagenome - more accurate then generic fish fecal? + "zebrafish fecal genome catalogue": 1331678, # zebrafish metagenome - + # more accurate then generic fish fecal? "human gut genome catalogue": 408170, # human gut metagenome "human oral genome catalogue": 447426, # human oral metagenome "marine genome catalogue": 2994539, # human skin metagenome From 4f15d94ffe9abcb71ef6bd233f8908e83c3deeb4 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 15:05:52 +0200 Subject: [PATCH 07/19] rnacentral_pipeline\databases\plncdb\parser.py line-too-long --- rnacentral_pipeline/databases/plncdb/parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rnacentral_pipeline/databases/plncdb/parser.py b/rnacentral_pipeline/databases/plncdb/parser.py index 7be36f96..819f513c 100644 --- a/rnacentral_pipeline/databases/plncdb/parser.py +++ b/rnacentral_pipeline/databases/plncdb/parser.py @@ -96,7 +96,8 @@ def parse(data:pathlib.Path) -> ty.Iterable[Entry]: species_info["taxid"] = species_info["Species"].apply(phy.taxid) - total_entries = len(gff_db.execute("select DISTINCT(id) from features where featuretype = 'transcript' ").fetchall()) + total_entries = len(gff_db.execute( + "select DISTINCT(id) from features where featuretype = 
'transcript' ").fetchall()) entries = [] for gene_id_q in tqdm(gff_db.execute("select id from features"), total=total_entries): primary_id = gene_id_q["id"] From ac410925572eb92e3cd4de28d990a7996784e032 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 15:15:59 +0200 Subject: [PATCH 08/19] bin\litscan-get-statistics.py line-too-long Made a readable line shorter --- bin/litscan-get-statistics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/litscan-get-statistics.py b/bin/litscan-get-statistics.py index 048cfc67..000a214d 100755 --- a/bin/litscan-get-statistics.py +++ b/bin/litscan-get-statistics.py @@ -47,7 +47,8 @@ def main(database, output): results['ids_in_use'] = cursor.fetchone()[0] # number of urs in the current version - cursor.execute(""" SELECT COUNT(DISTINCT job_id) FROM litscan_database WHERE job_id like 'urs%' """) + cursor.execute( + """ SELECT COUNT(DISTINCT job_id) FROM litscan_database WHERE job_id like 'urs%' """) results['urs'] = cursor.fetchone()[0] # number of expert dbs From cbaeb05b733828c1ca15f211e94538b5cb885c39 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 15:17:28 +0200 Subject: [PATCH 09/19] rnacentral_pipeline\cli\ensembl.py line-too-long Made a readable line shorter --- rnacentral_pipeline/cli/ensembl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rnacentral_pipeline/cli/ensembl.py b/rnacentral_pipeline/cli/ensembl.py index 3ba49c22..96c53e53 100644 --- a/rnacentral_pipeline/cli/ensembl.py +++ b/rnacentral_pipeline/cli/ensembl.py @@ -89,7 +89,8 @@ def parse_data(division, embl_file, gff_file, output, family_file=None): writer.write(entries) except ValueError: print("Empty entries, implies no ncRNAs. You should check that") - message = f"No ncRNA entries found for {embl_file.name}, or {gff_file.name}. 
Empty data supplied for now, but you should check the legitimacy of this result.\n" + message = (f"No ncRNA entries found for {embl_file.name}, or {gff_file.name}. " + + "Empty data supplied for now, but you should check the legitimacy of this result.\n") message += "For reference, the other parameters to the parser were:\n" message += f"division: {division}\n" message += f"embl_file: {embl_file.name}\n" From 6913c5fd73daacebafd724b698eda963f2786e1c Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 15:24:06 +0200 Subject: [PATCH 10/19] rnacentral_pipeline\databases\tmrna\helpers.py line-too-long Made a readable line shorter --- rnacentral_pipeline/databases/tmrna/helpers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rnacentral_pipeline/databases/tmrna/helpers.py b/rnacentral_pipeline/databases/tmrna/helpers.py index b0ca3997..3df14e6f 100644 --- a/rnacentral_pipeline/databases/tmrna/helpers.py +++ b/rnacentral_pipeline/databases/tmrna/helpers.py @@ -15,7 +15,8 @@ async def fetch_records(session, accessions: ty.List[str]): try: accession_str = ",".join(accessions) async with session.get( - f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={accession_str}&rettype=gb&retmode=text" + ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" 
+ + f"db=nuccore&id={accession_str}&rettype=gb&retmode=text") ) as response: records_text = await response.text() handle = io.StringIO(records_text) From 73a5ed651c6f67b6d83f7cebe5ec1dae7962b949 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 15:25:34 +0200 Subject: [PATCH 11/19] Leftover from rnacentral_pipeline\cli\ensembl.py --- rnacentral_pipeline/cli/ensembl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rnacentral_pipeline/cli/ensembl.py b/rnacentral_pipeline/cli/ensembl.py index 96c53e53..3a4845c3 100644 --- a/rnacentral_pipeline/cli/ensembl.py +++ b/rnacentral_pipeline/cli/ensembl.py @@ -90,7 +90,8 @@ def parse_data(division, embl_file, gff_file, output, family_file=None): except ValueError: print("Empty entries, implies no ncRNAs. You should check that") message = (f"No ncRNA entries found for {embl_file.name}, or {gff_file.name}. " - + "Empty data supplied for now, but you should check the legitimacy of this result.\n") + + "Empty data supplied for now" + + ", but you should check the legitimacy of this result.\n") message += "For reference, the other parameters to the parser were:\n" message += f"division: {division}\n" message += f"embl_file: {embl_file.name}\n" From 16dc5b5936495fedd02d9e922bf60142058d4c71 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 15:29:47 +0200 Subject: [PATCH 12/19] rnacentral_pipeline\rnacentral\precompute\utils.py line-too-long Made a readable line shorter --- rnacentral_pipeline/rnacentral/precompute/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rnacentral_pipeline/rnacentral/precompute/utils.py b/rnacentral_pipeline/rnacentral/precompute/utils.py index 1b631d35..8382b1cb 100644 --- a/rnacentral_pipeline/rnacentral/precompute/utils.py +++ b/rnacentral_pipeline/rnacentral/precompute/utils.py @@ -76,7 +76,8 @@ def entropy(data): structures) the name will be very long because it contains the sequence itself. 
For example: - RNA (5'-R(*GP*UP*GP*GP*UP*CP*UP*GP*AP*UP*GP*AP*GP*GP*CP*C)-3') from synthetic construct (PDB 3D0M, chain X) + RNA (5'-R(*GP*UP*GP*GP*UP*CP*UP*GP*AP*UP*GP*AP*GP*GP*CP*C)-3') + from synthetic construct (PDB 3D0M, chain X) This is not a useful name, but it is very long. Thus we do not want it. What we are generally after is something with the most information (to a From b4b385e0a5a39504d91ccb75235167d4179b5e62 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 15:41:26 +0200 Subject: [PATCH 13/19] rnacentral_pipeline\rnacentral\search_export\compare.py line-too-long Made a readable line shorter --- rnacentral_pipeline/rnacentral/search_export/compare.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rnacentral_pipeline/rnacentral/search_export/compare.py b/rnacentral_pipeline/rnacentral/search_export/compare.py index 0a76cedc..38b92587 100644 --- a/rnacentral_pipeline/rnacentral/search_export/compare.py +++ b/rnacentral_pipeline/rnacentral/search_export/compare.py @@ -75,7 +75,8 @@ def compare(output, results1, results2, facet): def write(output: ty.IO): """ """ - index1 = "http://www.ebi.ac.uk/ebisearch/ws/rest/rnacentral?query={query}&format=json&facetfields={facet}&facetcount=30" + index1 = ("http://www.ebi.ac.uk/ebisearch/ws/rest/rnacentral" + + "?query={query}&format=json&facetfields={facet}&facetcount=30") index2 = index1.replace("http://www.", "http://wwwdev.") queries = ["RNA", 'TAXONOMY:"9606"'] + EXPERT_DATABASES facets = ["rna_type", "has_genomic_coordinates"] From 4644e8c5b14daa34db25025ef8606bf5c98b1d06 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 16:44:37 +0200 Subject: [PATCH 14/19] rnacentral_pipeline\databases\psi_mi\tab.py unnecessary-pass unneeded pass in the end of exception handling --- rnacentral_pipeline/databases/psi_mi/tab.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rnacentral_pipeline/databases/psi_mi/tab.py b/rnacentral_pipeline/databases/psi_mi/tab.py index 
38c5c6a2..d6003448 100644 --- a/rnacentral_pipeline/databases/psi_mi/tab.py +++ b/rnacentral_pipeline/databases/psi_mi/tab.py @@ -120,7 +120,6 @@ def as_pubs(value): refs.append(pubs.reference(ident.value)) except data.UnknownPublicationType: LOGGER.warn("Could not handle publication %s", ident) - pass return refs From b31a011fbc644699349524b180d7f85ccbae679a Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 18:28:14 +0200 Subject: [PATCH 15/19] rnacentral_pipeline\databases\hgnc\helpers.py broad-exception-caught MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Catching Exception might hide unexpected exceptions (e.g., due to new code that will be added). More than that, the exception handling does not log it but returns None, making it harder to detect. I changed it to catch requests.exceptions.HTTPError instead. See https://stackoverflow.com/questions/24518944/try-except-when-using-python-requests-module --- rnacentral_pipeline/databases/hgnc/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rnacentral_pipeline/databases/hgnc/helpers.py b/rnacentral_pipeline/databases/hgnc/helpers.py index d777e3c6..4248f8e3 100644 --- a/rnacentral_pipeline/databases/hgnc/helpers.py +++ b/rnacentral_pipeline/databases/hgnc/helpers.py @@ -124,7 +124,7 @@ def ensembl_sequence(context: Context, ensembl_id: str) -> ty.Optional[str]: response = requests.get(url) try: response.raise_for_status() - except Exception: + except requests.exceptions.HTTPError: return None return response.text From 5d1281152774afcd36568e0e86fe0ca015a32d3e Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 19:15:46 +0200 Subject: [PATCH 16/19] rnacentral_pipeline\rnacentral\r2dt\parser.py broad-exception-caught MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Catching Exception might hide unexpected exceptions (e.g., due to new code that will be added). 
The method parse calls R2DTResultInfo.validate (in line 96). validate() is a sequence of assertions, hence the specific AssertionError can be caught instead. See https://stackoverflow.com/questions/1569049/making-pythons-assert-throw-an-exception-that-i-choose --- rnacentral_pipeline/rnacentral/r2dt/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rnacentral_pipeline/rnacentral/r2dt/parser.py b/rnacentral_pipeline/rnacentral/r2dt/parser.py index bec61a02..5b440ca9 100644 --- a/rnacentral_pipeline/rnacentral/r2dt/parser.py +++ b/rnacentral_pipeline/rnacentral/r2dt/parser.py @@ -94,7 +94,7 @@ def parse( try: info.validate() - except Exception as e: + except AssertionError as e: if allow_missing: LOGGER.warn("Did not find all required files for %s", urs) LOGGER.exception(e) From d458b8455225a93d311cccc6c4eb2f9a12a6abba Mon Sep 17 00:00:00 2001 From: evidencebp Date: Mon, 18 Nov 2024 19:30:14 +0200 Subject: [PATCH 17/19] rnacentral_pipeline\rnacentral\genome_mapping\igv.py broad-exception-caught MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Catching Exception might hide unexpected exceptions (e.g., due to new code that will be added). The function ftp calls ftplib.FTP's quit (in line 40). The specific ftplib.all_errors can be used instead. 
See  https://docs.python.org/3/library/ftplib.html --- rnacentral_pipeline/rnacentral/genome_mapping/igv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rnacentral_pipeline/rnacentral/genome_mapping/igv.py b/rnacentral_pipeline/rnacentral/genome_mapping/igv.py index 4a867590..66f3669a 100644 --- a/rnacentral_pipeline/rnacentral/genome_mapping/igv.py +++ b/rnacentral_pipeline/rnacentral/genome_mapping/igv.py @@ -38,7 +38,7 @@ def ftp(host): try: conn.quit() - except Exception as err: + except ftplib.all_errors as err: LOGGER.info("Failed to close FTP connection") LOGGER.exception(err) From ba4a8bff34736af1b2dbd8845246c4bb6aa8e583 Mon Sep 17 00:00:00 2001 From: evidencebp Date: Wed, 20 Nov 2024 14:32:01 +0200 Subject: [PATCH 18/19] rnacentral_pipeline\databases\europepmc\stream.py broad-exception-caught The function fallback calls fetch.lookup (in line 41). I changed to (fetch.UnknownReference, fetch.TooManyPublications) as discussed. Please note that I could not find where TooManyPublications is defined --- rnacentral_pipeline/databases/europepmc/stream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rnacentral_pipeline/databases/europepmc/stream.py b/rnacentral_pipeline/databases/europepmc/stream.py index 148f6687..1a8c07de 100644 --- a/rnacentral_pipeline/databases/europepmc/stream.py +++ b/rnacentral_pipeline/databases/europepmc/stream.py @@ -40,7 +40,7 @@ def fallback(data): try: ref = fetch.lookup(id_ref) yield id_ref, ref, rows - except Exception: + except (fetch.UnknownReference, fetch.TooManyPublications): pass From 00612f7c3395d9b687d2238d346c4f07a1b3090e Mon Sep 17 00:00:00 2001 From: evidencebp Date: Wed, 20 Nov 2024 14:35:22 +0200 Subject: [PATCH 19/19] bin\litscan-retracted-articles.py broad-exception-caught As discussed, narrowed Exception to ValueError --- bin/litscan-retracted-articles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/litscan-retracted-articles.py 
b/bin/litscan-retracted-articles.py index 8c3636d3..6e397f3d 100755 --- a/bin/litscan-retracted-articles.py +++ b/bin/litscan-retracted-articles.py @@ -77,7 +77,7 @@ def main(database, webhook): message = f'{len(retracted_articles)} {"articles have" if len(retracted_articles) > 1 else "article has"} ' \ f'been retracted: {", ".join(retracted_articles)}' requests.post(webhook, json.dumps({"text": message})) - except (Exception, psycopg2.DatabaseError) as error: + except (ValueError, psycopg2.DatabaseError) as error: requests.post(webhook, json.dumps({"text": error})) finally: if conn is not None: