Skip to content

Commit 0746a84

Browse files
authored
Merge branch 'dev' into cleanup-text-mining
2 parents a7f926f + 5b2a7a8 commit 0746a84

File tree

15 files changed

+766
-611
lines changed

15 files changed

+766
-611
lines changed

bin/litscan-get-articles.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,13 @@ def create_xml_file(results, directory):
8181
@click.argument('directory')
8282
def main(database, directory):
8383
"""
84-
Get the data that will be used by the search index
84+
Get the data that will be used by the search index.
85+
I tried to fetch all data in one query (query available at the end of this
86+
file), but that requires a large amount of RAM. A second approach was made
87+
using two queries (the second query was to fetch organism data), but that
88+
still requires 65GB of RAM. This task usually takes about 3 hours and for
89+
now it will stay like this.
90+
8591
:param database: params to connect to the db
8692
:param directory: directory to store xml files
8793
:return: None
@@ -173,3 +179,21 @@ def main(database, directory):
173179

174180
if __name__ == "__main__":
175181
main()
182+
183+
# cursor.execute("""
184+
# SELECT
185+
# a.pmcid, a.title, a.abstract, a.author, a.pmid, a.doi, a.year, a.journal, a.score, a.cited_by, a.type,
186+
# r.id AS result_id, r.job_id, r.id_in_title, r.id_in_abstract, r.id_in_body,
187+
# j.display_id,
188+
# abs_s.sentence AS abstract_sentence,
189+
# bod_s.sentence AS body_sentence,
190+
# ma.urs AS manually_annotated
191+
# FROM litscan_article a
192+
# LEFT JOIN litscan_result r ON r.pmcid = a.pmcid
193+
# LEFT JOIN litscan_job j ON r.job_id = j.job_id
194+
# LEFT JOIN (SELECT result_id, sentence FROM litscan_abstract_sentence ORDER BY length(sentence) DESC LIMIT 1) abs_s ON abs_s.result_id = r.id
195+
# LEFT JOIN (SELECT result_id, sentence FROM litscan_body_sentence ORDER BY length(sentence) DESC LIMIT 1) bod_s ON bod_s.result_id = r.id
196+
# LEFT JOIN litscan_manually_annotated ma ON ma.pmcid = a.pmcid
197+
# WHERE a.retracted IS NOT TRUE;
198+
# """)
199+
# rows = cursor.fetchall()

bin/litscan-get-unique-ids.sh

Lines changed: 13 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,31 +4,17 @@
44
file=$1
55
database=$2
66

7-
# read file line by line
8-
while IFS= read -r line; do
9-
IFS=$"|"
10-
tmp=($line)
11-
if [[ ${#tmp[*]} = 2 ]]; then
12-
job_id="${tmp[0]}"
13-
urs="${tmp[1]}"
14-
else
15-
job_id="${tmp[0]}"
16-
primary_id="${tmp[1]}"
17-
urs="${tmp[2]}"
18-
fi
7+
awk -F'|' '{
8+
if (NF == 2) {
9+
job_id = $1;
10+
urs = $2;
11+
} else {
12+
job_id = $1;
13+
primary_id = $2;
14+
urs = $3;
15+
}
1916
20-
if [[ -n "${job_id}" ]]; then
21-
echo ${job_id} >> ${database}_all_ids.txt
22-
fi
23-
24-
if [[ -n "${primary_id}" ]]; then
25-
echo ${primary_id} >> ${database}_all_ids.txt
26-
fi
27-
28-
if [[ -n "${urs}" ]]; then
29-
echo ${urs} >> ${database}_all_ids.txt
30-
fi
31-
done < ${file}
32-
33-
# create file with unique ids
34-
cat ${database}_all_ids.txt | sort | uniq > ${database}_ids.txt
17+
if (job_id) print job_id;
18+
if (primary_id) print primary_id;
19+
if (urs) print urs;
20+
}' "$file" | LC_ALL=C sort -u > "${database}_ids.txt"

bin/litscan-upload-ids.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ function submitJob
1717
curl -X POST \
1818
-H "Content-Type:application/json" \
1919
-d "{\"id\": \"${job_id}\"}" \
20-
http://45.88.80.122:8080/api/submit-job && echo ${job_id} >> submitted/${file};
20+
http://45.88.80.122:8080/api/submit-job
2121
}
2222

2323
# loop through the file

poetry.lock

Lines changed: 440 additions & 380 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,12 @@ scikit-learn = "^1.1.3"
3030
semver = "^2.13.0"
3131
slack_sdk = "^3.19.4"
3232
sqlitedict = "^2.0.0"
33-
TatSu = "5.8.2"
33+
TatSu = "5.10.6"
34+
textblob = "0.15.3"
3435
throttler = "^1.2.0"
3536
nltk = "^3.8.1"
3637
openpyxl = "^3.0.10"
37-
pybedtools = {git = "https://github.com/afg1/pybedtools.git"}
38+
pybedtools = "^0.10.0"
3839
psycopg2-binary = "^2.9.7"
3940

4041
[tool.poetry.group.dev.dependencies]

rnacentral_pipeline/rnacentral/ftp_export/id_mapping.py

Lines changed: 54 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -14,71 +14,68 @@
1414
"""
1515

1616
import csv
17+
import re
18+
import typing as ty
1719

18-
from rnacentral_pipeline import psql
19-
20-
21-
def gene(result):
22-
"""
23-
Convert the gene name into a standardized format.
24-
"""
25-
26-
if result["database"] == "ENSEMBL":
27-
return result["optional_id"]
20+
from attr import frozen
2821

29-
if result["database"] == "MIRBASE":
30-
return result["optional_id"]
31-
32-
if result["rna_type"] == "piRNA" and result["database"] == "ENA":
33-
return result["product"]
34-
35-
name = result["gene"] or ""
36-
name = name.replace("\t", " ")
37-
return name
38-
39-
40-
def accession(result):
41-
"""
42-
Produce the accession for the result. This will compute the accession
43-
depending on the database.
44-
"""
45-
46-
if result["database"] == "ENA" or result["database"] == "HGNC":
47-
return result["accession"]
48-
if result["database"] == "PDBE":
49-
return "%s_%s" % (result["external_id"], result["optional_id"])
50-
return result["external_id"]
51-
52-
53-
def database(result):
54-
"""
55-
Normalize the database name.
56-
"""
57-
58-
if result["database"] == "PDBE":
59-
return "PDB"
60-
return result["database"]
22+
from rnacentral_pipeline import psql
6123

6224

63-
def as_entry(result):
64-
"""
65-
Produce the final result list for writing.
66-
"""
67-
return [
68-
result["upi"],
69-
database(result),
70-
accession(result),
71-
result["taxid"],
72-
result["rna_type"],
73-
gene(result),
74-
]
25+
@frozen
26+
class IdMapping:
27+
upi: str
28+
accession: str
29+
taxid: int
30+
external_id: str
31+
optional_id: str
32+
rna_type: str
33+
gene: str
34+
product: str
35+
database: str
36+
37+
def entry(self) -> ty.List[str]:
38+
database = self.database
39+
accession = self.external_id
40+
gene = self.gene or ""
41+
gene = gene.replace("\t", " ")
42+
43+
if self.database == "PDBE":
44+
database = "PDB"
45+
accession = "%s_%s" % (self.external_id, self.optional_id)
46+
elif self.database == "HGNC":
47+
accession = self.accession
48+
elif self.database == "ENA":
49+
if self.rna_type == "piRNA":
50+
gene = self.product
51+
accession = self.accession
52+
elif self.database == "MIRBASE":
53+
gene = self.optional_id
54+
elif self.database == "ENSEMBL":
55+
gene = self.optional_id
56+
elif self.database == "RFAM":
57+
if self.rna_type == "pre_miRNA":
58+
acc_range, endpoints, _ = self.accession.split(":", 2)
59+
acc = re.sub(r"\.\d\d+$", "", acc_range)
60+
start, stop = endpoints.split("..", 1)
61+
gene = f"{acc}/{start}-{stop}"
62+
63+
return [
64+
self.upi,
65+
database,
66+
accession,
67+
str(self.taxid),
68+
self.rna_type,
69+
gene,
70+
]
7571

7672

7773
def generate_file(json_file, output):
7874
"""
7975
This will generate a TSV mapping file given the input TSV.
8076
"""
8177

82-
entries = psql.json_handler(json_file)
83-
data = map(as_entry, entries)
84-
csv.writer(output, delimiter="\t").writerows(data)
78+
writer = csv.writer(output, delimiter="\t")
79+
for entry in psql.json_handler(json_file):
80+
data = IdMapping(**entry)
81+
writer.writerow(data.entry())

rnacentral_pipeline/rnacentral/genome_mapping/igv.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def check_url(file_list: List[str], name: str, path: str, assembly_id: str) -> D
7272

7373
def create_json(species: str, assembly_id: str, output: IO[str]) -> None:
7474
with ftp(FTP_SERVER) as conn:
75-
path = "pub/databases/RNAcentral/.genome-browser"
75+
path = "pub/databases/RNAcentral/.genome-browser-dev"
7676
conn.cwd(path)
7777
file_list = conn.nlst()
7878
name = f"{species}.{assembly_id}"

0 commit comments

Comments
 (0)