Skip to content

Commit 8a3abb9

Browse files
authored
Merge pull request #208 from evidencebp/master
Pylint alerts corrections as part of an intervention experiment 1853
2 parents 82e2113 + 00612f7 commit 8a3abb9

File tree

18 files changed

+34
-19
lines changed

18 files changed

+34
-19
lines changed

bin/litscan-create-xml-metadata.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@ def main(conn_string, filename, output):
7979
database = line[1]
8080

8181
# get hit_count
82-
cursor.execute("SELECT hit_count FROM litscan_job WHERE job_id='{0}'".format(job_id.lower()))
82+
cursor.execute(
83+
"SELECT hit_count FROM litscan_job WHERE job_id='{0}'".format(job_id.lower()))
8384
result = cursor.fetchone()
8485
hit_count = str(result[0]) if result else ""
8586

bin/litscan-get-statistics.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ def main(database, output):
4747
results['ids_in_use'] = cursor.fetchone()[0]
4848

4949
# number of urs in the current version
50-
cursor.execute(""" SELECT COUNT(DISTINCT job_id) FROM litscan_database WHERE job_id like 'urs%' """)
50+
cursor.execute(
51+
""" SELECT COUNT(DISTINCT job_id) FROM litscan_database WHERE job_id like 'urs%' """)
5152
results['urs'] = cursor.fetchone()[0]
5253

5354
# number of expert dbs

bin/litscan-retracted-articles.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def main(database, webhook):
7777
message = f'{len(retracted_articles)} {"articles have" if len(retracted_articles) > 1 else "article has"} ' \
7878
f'been retracted: {", ".join(retracted_articles)}'
7979
requests.post(webhook, json.dumps({"text": message}))
80-
except (Exception, psycopg2.DatabaseError) as error:
80+
except (ValueError, psycopg2.DatabaseError) as error:
8181
requests.post(webhook, json.dumps({"text": error}))
8282
finally:
8383
if conn is not None:

rnacentral_pipeline/cli/ensembl.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,9 @@ def parse_data(division, embl_file, gff_file, output, family_file=None):
8989
writer.write(entries)
9090
except ValueError:
9191
print("Empty entries, implies no ncRNAs. You should check that")
92-
message = f"No ncRNA entries found for {embl_file.name}, or {gff_file.name}. Empty data supplied for now, but you should check the legitimacy of this result.\n"
92+
message = (f"No ncRNA entries found for {embl_file.name}, or {gff_file.name}. "
93+
+ "Empty data supplied for now"
94+
+ ", but you should check the legitimacy of this result.\n")
9395
message += "For reference, the other parameters to the parser were:\n"
9496
message += f"division: {division}\n"
9597
message += f"embl_file: {embl_file.name}\n"

rnacentral_pipeline/databases/europepmc/stream.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def fallback(data):
4040
try:
4141
ref = fetch.lookup(id_ref)
4242
yield id_ref, ref, rows
43-
except Exception:
43+
except (fetch.UnknownReference, fetch.TooManyPublications):
4444
pass
4545

4646

rnacentral_pipeline/databases/evlncrnas/lookup.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,8 @@ def mapping(db_url, data):
105105

106106

107107
def as_mapping(db_url, data):
108-
# data = data.explode('Aliases').drop_duplicates(subset='Aliases').rename(columns={'Aliases':'external_id'})#.set_index('external_id')
108+
# data = data.explode('Aliases').drop_duplicates(subset='Aliases').rename(
109+
# columns={'Aliases':'external_id'})#.set_index('external_id')
109110
print(len(data))
110111
data = data.drop(data[data["Name"] == " "].index)
111112
print(data)

rnacentral_pipeline/databases/evlncrnas/parser.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,8 @@ def parse(db_dir: Path, db_dumps: tuple[Path], db_url: str) -> None:
270270
) #
271271

272272
## Match with RNAcentral based on the gene name
273-
## This is optionally chunked to save memory - split the lookup file and provide a list on the commandline
273+
## This is optionally chunked to save memory -
274+
## split the lookup file and provide a list on the commandline
274275
matched_frame = pd.concat(
275276
[get_db_matches(no_accession_frame, dump_chunk) for dump_chunk in db_dumps]
276277
)

rnacentral_pipeline/databases/hgnc/helpers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ def ensembl_sequence(context: Context, ensembl_id: str) -> ty.Optional[str]:
124124
response = requests.get(url)
125125
try:
126126
response.raise_for_status()
127-
except Exception:
127+
except requests.exceptions.HTTPError:
128128
return None
129129
return response.text
130130

rnacentral_pipeline/databases/mgnify/prepare.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ def prepare_mgnify_data(data, conn_str):
3737
## Define fallback taxids of the general metagenome of the environment
3838
## These are used if we can't do any better
3939
fallback = {
40-
"zebrafish fecal genome catalogue": 1331678, # zebrafish metagenome - more accurate then generic fish fecal?
40+
"zebrafish fecal genome catalogue": 1331678, # zebrafish metagenome -
41+
# more accurate than generic fish fecal?
4142
"human gut genome catalogue": 408170, # human gut metagenome
4243
"human oral genome catalogue": 447426, # human oral metagenome
4344
"marine genome catalogue": 2994539, # human skin metagenome

rnacentral_pipeline/databases/plncdb/parser.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,8 @@ def parse(data:pathlib.Path) -> ty.Iterable[Entry]:
9696
species_info["taxid"] = species_info["Species"].apply(phy.taxid)
9797

9898

99-
total_entries = len(gff_db.execute("select DISTINCT(id) from features where featuretype = 'transcript' ").fetchall())
99+
total_entries = len(gff_db.execute(
100+
"select DISTINCT(id) from features where featuretype = 'transcript' ").fetchall())
100101
entries = []
101102
for gene_id_q in tqdm(gff_db.execute("select id from features"), total=total_entries):
102103
primary_id = gene_id_q["id"]

rnacentral_pipeline/databases/psi_mi/tab.py

-1
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,6 @@ def as_pubs(value):
120120
refs.append(pubs.reference(ident.value))
121121
except data.UnknownPublicationType:
122122
LOGGER.warn("Could not handle publication %s", ident)
123-
pass
124123
return refs
125124

126125

rnacentral_pipeline/databases/tmrna/helpers.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ async def fetch_records(session, accessions: ty.List[str]):
1515
try:
1616
accession_str = ",".join(accessions)
1717
async with session.get(
18-
f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={accession_str}&rettype=gb&retmode=text"
18+
("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
19+
+ f"db=nuccore&id={accession_str}&rettype=gb&retmode=text")
1920
) as response:
2021
records_text = await response.text()
2122
handle = io.StringIO(records_text)

rnacentral_pipeline/rnacentral/genome_mapping/blat.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@
4040
"qBaseInsert", # Number of bases inserted in query
4141
"tNumInsert", # Number of inserts in target
4242
"tBaseInsert", # Number of bases inserted in target
43-
"strand", # "+" or "-" for query strand. For translated alignments, second "+"or "-" is for target genomic strand.
43+
"strand", # "+" or "-" for query strand. For translated alignments,
44+
# second "+"or "-" is for target genomic strand.
4445
"qName", # Query sequence name
4546
"qSize", # Query sequence size.
4647
"qStart", # Alignment start position in query
@@ -50,7 +51,10 @@
5051
"tStart", # Alignment start position in target
5152
"tEnd", # Alignment end position in target
5253
"blockCount", # Number of blocks in the alignment (a block contains no gaps)
53-
"blockSizes", # Comma-separated list of sizes of each block. If the query is a protein and the target the genome, blockSizes are in amino acids. See below for more information on protein query PSLs.
54+
"blockSizes", # Comma-separated list of sizes of each block.
55+
# If the query is a protein and the target the genome,
56+
# blockSizes are in amino acids.
57+
# See below for more information on protein query PSLs.
5458
"qStarts", # Comma-separated list of starting positions of each block in query
5559
"tStarts", # Comma-separated list of starting positions of each block in target
5660
]

rnacentral_pipeline/rnacentral/genome_mapping/igv.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def ftp(host):
3838

3939
try:
4040
conn.quit()
41-
except Exception as err:
41+
except ftplib.all_errors as err:
4242
LOGGER.info("Failed to close FTP connection")
4343
LOGGER.exception(err)
4444

rnacentral_pipeline/rnacentral/precompute/utils.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ def entropy(data):
7676
structures) the name will be very long because it contains the sequence
7777
itself. For example:
7878
79-
RNA (5'-R(*GP*UP*GP*GP*UP*CP*UP*GP*AP*UP*GP*AP*GP*GP*CP*C)-3') from synthetic construct (PDB 3D0M, chain X)
79+
RNA (5'-R(*GP*UP*GP*GP*UP*CP*UP*GP*AP*UP*GP*AP*GP*GP*CP*C)-3')
80+
from synthetic construct (PDB 3D0M, chain X)
8081
8182
This is not a useful name, but it is very long. Thus we do not want it.
8283
What we are generally after is something with the most information (to a

rnacentral_pipeline/rnacentral/r2dt/parser.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def parse(
9494

9595
try:
9696
info.validate()
97-
except Exception as e:
97+
except AssertionError as e:
9898
if allow_missing:
9999
LOGGER.warn("Did not find all required files for %s", urs)
100100
LOGGER.exception(e)

rnacentral_pipeline/rnacentral/search_export/compare.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ def compare(output, results1, results2, facet):
7575

7676
def write(output: ty.IO):
7777
""" """
78-
index1 = "http://www.ebi.ac.uk/ebisearch/ws/rest/rnacentral?query={query}&format=json&facetfields={facet}&facetcount=30"
78+
index1 = ("http://www.ebi.ac.uk/ebisearch/ws/rest/rnacentral"
79+
+ "?query={query}&format=json&facetfields={facet}&facetcount=30")
7980
index2 = index1.replace("http://www.", "http://wwwdev.")
8081
queries = ["RNA", 'TAXONOMY:"9606"'] + EXPERT_DATABASES
8182
facets = ["rna_type", "has_genomic_coordinates"]

rnacentral_pipeline/utils.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ def __init__(self, co):
3737
self.done = False
3838
self.result = None
3939
self.lock = threading.RLock()
40-
## This needs to be a re-rntrant lock so it is only release by the coroutine that acquired it
40+
## This needs to be a re-entrant lock so it is only
41+
## released by the coroutine that acquired it
4142

4243
def __await__(self):
4344
with self.lock:

0 commit comments

Comments
 (0)